Python豆瓣电影TOP250爬虫:自动抓取250部电影信息
以下是使用Python编写的爬取豆瓣电影TOP250排行榜的代码:

```python
import sys
import time

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://movie.douban.com/top250?start='
PAGE_SIZE = 25         # movies listed per page
PAGE_COUNT = 10        # 10 pages * 25 movies = 250 movies
REQUEST_TIMEOUT = 10   # seconds; without a timeout requests.get can block forever
REQUEST_DELAY = 1.0    # seconds to wait between two page requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;Win64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}


def _clean_text(node):
    """Return the stripped text of a tag, or '' when the tag is missing."""
    if node is None:
        return ''
    # &nbsp; comes through as '\xa0'; normalise it so later splitting works.
    return node.get_text().replace('\xa0', ' ').strip()


def _is_credit_line(line):
    """Tell whether a line carries the director / cast labels."""
    return '导演' in line or '主演' in line


def _parse_credits(credit_text):
    """Split a '导演: ... 主演: ...' string into (director, actors).

    Either part is returned as '' when its label or colon is absent,
    instead of raising IndexError.
    """
    director_part, _, actors_part = credit_text.partition('主演')
    director = director_part.partition(':')[2].strip()
    actors = actors_part.lstrip(': ').strip()
    return director, actors


def _parse_details(detail_text):
    """Split a 'year / region / genre' string into its three fields.

    Missing fields are returned as ''. When more than three '/'-separated
    fields are present, the last two are taken as region and genre and
    everything before them is kept together as the year field.
    """
    parts = [part.strip() for part in detail_text.split('/')]
    if len(parts) > 3:
        return ' / '.join(parts[:-2]), parts[-2], parts[-1]
    parts += [''] * (3 - len(parts))
    return parts[0], parts[1], parts[2]


def _parse_movie(movie):
    """Build the info dict for one <div class="item"> element."""
    poster_tag = movie.find('img')
    poster = poster_tag.get('src', '') if poster_tag is not None else ''

    body = movie.find('div', class_='bd')
    paragraph = body.find('p') if body is not None else None
    lines = [line.strip() for line in _clean_text(paragraph).splitlines() if line.strip()]

    # NOTE(review): assumes the summary <p> holds the director/cast line(s)
    # followed by a "year / region / genre" line -- confirm against the live
    # page markup.
    credit_text = ' '.join(line for line in lines if _is_credit_line(line))
    detail_lines = [line for line in lines if not _is_credit_line(line)]
    if detail_lines:
        detail_text = detail_lines[0]
    elif paragraph is not None:
        # Fall back to the element right after the <p>.
        detail_text = _clean_text(paragraph.find_next_sibling())
    else:
        detail_text = ''

    director, actors = _parse_credits(credit_text)
    year, location, genre = _parse_details(detail_text)

    # The short comments are the <span>s of the element following the rating
    # block; either element may be missing for some entries.
    star = movie.find('div', class_='star')
    comment_block = star.find_next_sibling() if star is not None else None
    comment_spans = comment_block.find_all('span')[:5] if comment_block is not None else []
    hot_comments = [span.text.strip() for span in comment_spans]

    return {
        '排名': _clean_text(movie.find('em')),
        '海报': poster,
        '片名': _clean_text(movie.find('span', class_='title')),
        '评分': _clean_text(movie.find('span', class_='rating_num')),
        '导演': director,
        '主演': actors,
        '上映时间': year,
        '上映地点': location,
        '类型': genre,
        '热门短评': hot_comments,
    }


def get_movie_info(url):
    """Fetch one list page and return a list of movie-info dicts.

    Args:
        url: Address of the list page to download.

    Returns:
        One dict per <div class="item"> found on the page, with the keys
        排名, 海报, 片名, 评分, 导演, 主演, 上映时间, 上映地点, 类型 and 热门短评.
        Fields that cannot be found are '' (or [] for 热门短评).

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status (for example when the request is blocked).
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error status instead of silently parsing an error
    # page into an empty result.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    return [_parse_movie(item) for item in soup.find_all('div', class_='item')]


def main():
    """Crawl all list pages and print every movie's fields."""
    movies = []
    for page in range(PAGE_COUNT):
        if page:
            # Pause between pages to avoid hammering the site.
            time.sleep(REQUEST_DELAY)
        url = BASE_URL + str(page * PAGE_SIZE)
        try:
            movies.extend(get_movie_info(url))
        except requests.RequestException as exc:
            # Report the failed page and keep going with the remaining ones.
            print('Failed to fetch ' + url + ': ' + str(exc), file=sys.stderr)

    scalar_fields = ('排名', '海报', '片名', '评分', '导演', '主演', '上映时间', '上映地点', '类型')
    for movie in movies:
        for field in scalar_fields:
            print(field + ':', movie[field])
        print('热门短评:')
        for index, comment in enumerate(movie['热门短评'], start=1):
            print(str(index) + '.' + comment)
        print()


if __name__ == '__main__':
    main()
```

这段代码会爬取豆瓣电影TOP250排行榜的前10页,每页包含25部电影,总共爬取250部电影。然后利用BeautifulSoup库解析页面,提取出电影的基本信息和热门短评,并将结果打印输出。

注意:由于豆瓣网站的反爬机制较强,建议添加适当的延时或使用代理IP来进行爬取。
原文地址: http://www.cveoy.top/t/topic/psI6 著作权归作者所有。请勿转载和采集!