# Python爬取豆瓣电影Top250数据并保存到Excel
import pandas as pd
import requests
from bs4 import BeautifulSoup
# 定义函数:获取影片详情页链接
def get_movie_url(movie):
    """Return the detail-page link of one movie entry.

    Looks up the ``<div class="hd">`` element under *movie* and reads the
    ``href`` attribute of its ``a`` child.

    Args:
        movie: Parsed list entry; presumably a BeautifulSoup tag, as the
            script passes items returned by ``find_all``.

    Returns:
        The value of the anchor's ``href`` attribute.
    """
    title_block = movie.find('div', class_='hd')
    anchor = title_block.a
    return anchor['href']
# 定义函数:获取影片海报链接
def get_movie_poster(movie):
    """Return the poster image link of one movie entry.

    Looks up the ``<div class="pic">`` element under *movie*, follows its
    ``a`` child to the nested ``img`` and reads the ``src`` attribute.

    Args:
        movie: Parsed list entry; presumably a BeautifulSoup tag, as the
            script passes items returned by ``find_all``.

    Returns:
        The value of the image's ``src`` attribute.
    """
    picture_block = movie.find('div', class_='pic')
    image = picture_block.a.img
    return image['src']
# 定义函数:获取影片基本信息
def get_movie_info(movie):
    """Collect the basic information fields of one movie entry.

    The text of the ``p`` element inside the entry's ``<div class="bd">``
    is split on single spaces; every piece is stripped and empty pieces are
    dropped. The poster link and the detail-page link are then inserted
    into the resulting list.

    Args:
        movie: Parsed list entry; presumably a BeautifulSoup tag, as the
            script passes items returned by ``find_all``.

    Returns:
        list[str]: The text pieces with the poster link inserted at index 1
        and the detail-page link at index 2 (``list.insert`` appends when
        the list is shorter than that). The total length depends on how
        many pieces the paragraph text yields.
    """
    raw_text = movie.find('div', class_='bd').p.text.strip()
    # Strip each piece once, then keep only the non-empty ones.
    stripped_pieces = (piece.strip() for piece in raw_text.split(' '))
    movie_info = [piece for piece in stripped_pieces if piece]
    movie_info.insert(1, get_movie_poster(movie))  # poster link
    movie_info.insert(2, get_movie_url(movie))  # detail-page link
    return movie_info
# 定义函数:获取影片热门短评
def get_movie_reviews(url, timeout=10, limit=5):
    """Fetch a movie detail page and return its first few short reviews.

    Args:
        url: Address of the movie detail page.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` may wait indefinitely on a stalled
            connection.
        limit: Maximum number of reviews to return; ``None`` returns every
            review found on the page.

    Returns:
        list[str]: Stripped text of up to *limit*
        ``<span class="comment-content">`` elements, in the order
        ``find_all`` returns them.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    res = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(res.text, 'html.parser')
    reviews = soup.find_all('span', class_='comment-content')
    # Keep only the leading matches.
    return [review.text.strip() for review in reviews[:limit]]
# 发送HTTP请求并解析HTML
# Download the ranking page and parse it.
# NOTE(review): only this single URL is requested; if the ranking is split
# across several pages, the later pages are never visited -- confirm.
url = 'https://movie.douban.com/top250'
# A timeout keeps the script from hanging forever on a stalled connection.
res = requests.get(url, timeout=10)
soup = BeautifulSoup(res.text, 'html.parser')

# Every <div class="item"> element is treated as one movie entry.
movies = soup.find_all('div', class_='item')
# Rows of the output table; filled by the loop over `movies`.
data = []
# 提取影片基本信息与热门短评
# Build the output rows: one row per movie, plus numbered review rows for
# the leading movies.
for position, movie in enumerate(movies, start=1):
    movie_info = get_movie_info(movie)
    data.append(movie_info)
    # Gate on the movie's position rather than len(data): data also holds
    # the review rows appended below, so len(data) made the number of
    # movies that get reviews depend on how many reviews earlier movies
    # returned.
    # NOTE(review): the "< 10" threshold is kept from the original check,
    # so movies 1-9 get reviews; confirm whether 10 movies were intended.
    if position < 10:
        movie_reviews = get_movie_reviews(get_movie_url(movie))
        for i, review in enumerate(movie_reviews):
            # Each review is stored as its own single-cell row.
            data.append([f'{i+1}. {review}'])
# 创建DataFrame对象
# Turn the collected rows into a table with ten named columns.
df = pd.DataFrame(
    data=data,
    columns=[
        '排名', '海报', '片名', '评分', '导演',
        '主演', '上映时间', '上映地点', '类型', '热门短评',
    ],
)
# 保存为Excel文件
# Write the table to an Excel workbook (relative path), without the index column.
df.to_excel(excel_writer='top250_movies.xlsx', index=False)
# 原文地址: http://www.cveoy.top/t/topic/psIv 著作权归作者所有。请勿转载和采集!