First, install the required libraries: requests and BeautifulSoup4 for fetching and parsing the pages, and pandas together with openpyxl for writing the Excel file.

pip install requests
pip install beautifulsoup4
pip install pandas
pip install openpyxl
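
To confirm that the libraries are importable, here is a quick sanity check (a minimal sketch; the printed versions depend on your environment):

import requests, bs4, pandas
print(requests.__version__, bs4.__version__, pandas.__version__)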

The following script then scrapes the Douban Movie Top 250 list, extracts the relevant information for each film, and saves the results to an Excel file.

import time

import requests
from bs4 import BeautifulSoup
import pandas as pd

# A common desktop User-Agent; Douban rejects requests that do not send one.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
}

def get_movie_info(url):
    """Parse one page of the Top 250 list (25 films) and return a list of dicts."""
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    movie_list = []
    movie_items = soup.find_all('div', class_='item')
    for item in movie_items:
        rank = item.find('em').text
        link = item.find('div', class_='hd').a['href']   # detail-page URL, used later for hot comments
        poster = item.find('img')['src']
        title = item.find('span', class_='title').text
        rating = item.find('span', class_='rating_num').text

        # The <p> block holds two lines:
        #   "导演: ...   主演: ..."  (separated by non-breaking spaces; 主演 may be missing)
        #   "year / country or region / genres"
        info = item.find('div', class_='bd').p.text.strip().split('\n')
        line1 = info[0].replace('\xa0', ' ').strip()
        if '主演:' in line1:
            director_part, actors_part = line1.split('主演:', 1)
            actors = actors_part.replace('...', '').strip().rstrip('/').strip()
        else:
            director_part, actors = line1, ''
        director = director_part.replace('导演:', '').strip()
        line2 = [s.strip() for s in info[1].replace('\xa0', ' ').split('/')]
        release_date = line2[0]
        release_place = line2[-2] if len(line2) >= 3 else ''
        genres = line2[-1] if len(line2) >= 2 else ''
        rating_count = item.find('div', class_='star').find_all('span')[-1].text   # e.g. "2876543人评价"
        
        movie_list.append({
            '排名': rank,
            '海报': poster,
            '片名': title,
            '评分': rating,
            '导演': director,
            '主演': actors,
            '上映时间': release_date,
            '上映地点': release_place,
            '类型': genres,
            '评价人数': rating_count,
            '链接': link,
            '热门短评': ''   # filled in below for the top 10 films
        })
    
    return movie_list

def get_hot_comments(url):
    """Fetch a film's detail page and return its first five hot short comments."""
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    comments_list = []
    comments_items = soup.find_all('div', class_='comment-item')
    # Number the first five short comments and join them with newlines, e.g. "1.这是一部好电影"
    for i, item in enumerate(comments_items[:5]):
        comments = item.find('span', class_='short').text.strip()
        comments_list.append(f'{i+1}.{comments}')
    
    return '\n'.join(comments_list)

def main():
    base_url = 'https://movie.douban.com/top250'
    movie_list = []

    # The list spans 10 pages of 25 films each (?start=0, 25, ..., 225).
    for page in range(10):
        url = f'{base_url}?start={page*25}&filter='
        movie_list.extend(get_movie_info(url))
        time.sleep(1)   # small delay to avoid hammering the server

    # Fetch the first five hot comments for the top 10 films via their detail-page links.
    for movie in movie_list[:10]:
        movie['热门短评'] = get_hot_comments(movie['链接'])
        time.sleep(1)

    df = pd.DataFrame(movie_list).drop(columns=['链接'])
    df.to_excel('douban_top250.xlsx', index=False)

if __name__ == '__main__':
    main()

After the script finishes, a file named douban_top250.xlsx is created in the current directory. It contains the basic information for all 250 films in the Douban Top 250 list, and for the top 10 films on the first page it also holds their first 5 hot short comments.
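
As a quick check of the result, the file can be read back with pandas (a minimal sketch; the column names are the dictionary keys used above, and the exact shape depends on how many rows parsed successfully):

import pandas as pd

df = pd.read_excel('douban_top250.xlsx')
print(df.shape)                                      # expected to be around (250, 11)
print(df[['排名', '片名', '评分', '热门短评']].head())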

The original task: use Python to write a crawler that automatically scrapes all 250 films in the Douban Movie Top 250 list and uses the BeautifulSoup4 library to extract the relevant information, including the basic details (rank, poster, title, rating, director, cast, release year, country/region, genres, hot short comments) and, for the top 10 films on the first page, their first 5 hot short comments; the 5 comments should be numbered and separated by newlines (e.g. "1.这是一部好电影"), and the result should be saved in Excel format.

