Python爬取豆瓣电影《穿靴子的猫2》影评数据教程
Python爬取豆瓣电影《穿靴子的猫2》影评数据教程
本教程将带领你使用Python的Selenium库,抓取豆瓣电影《穿靴子的猫2》的所有影评数据,并将其存储为JSON格式文件。
步骤1:借助Selenium库,点击进入电影全部影评
- 导入需要的库和驱动
import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
- 配置并启动浏览器驱动,这里以Chrome为例
# Set up the Chrome browser that Selenium will drive.
chrome_options = webdriver.ChromeOptions()
# '--headless' keeps Chrome from opening a visible window.
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(options=chrome_options)
- 进入电影全部影评页面
# Open the movie's main page, then follow its "all reviews" link.
url = 'https://movie.douban.com/subject/25868125/'
driver.get(url)
# Selenium 4 removed the find_element_by_* helpers; the supported form is
# find_element(By.<strategy>, value).
all_comment_btn = driver.find_element(
    By.CSS_SELECTOR,
    '#reviews-wrapper > div > div > div.mod-hd > h2 > span > a',
)
all_comment_btn.click()
步骤2:抓取第一页的评论人名称、评论时间以及评论
- 定位到第一页评论区
# Load the first page of comments (start=0, 20 comments per page).
url = 'https://movie.douban.com/subject/25868125/comments?start=0&limit=20&status=P&sort=new_score'
driver.get(url)
# Collected comments; each entry is a dict with 'name', 'time' and 'content'.
data = []
# Scrape this page only. Later pages are requested explicitly by URL, so also
# following the "next page" link from here would store the same comments twice.
# Selenium 4 removed find_elements_by_css_selector(); use
# find_elements(By.CSS_SELECTOR, ...) instead.
for comment in driver.find_elements(By.CSS_SELECTOR, '#comments > div.comment-item'):
    name = comment.find_element(
        By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > a'
    ).text
    # Named comment_time rather than `time` so the imported `time` module is
    # not overwritten by a string.
    comment_time = comment.find_element(
        By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > span.comment-time'
    ).text
    content = comment.find_element(
        By.CSS_SELECTOR, 'div.comment > p > span.short'
    ).text
    data.append({'name': name, 'time': comment_time, 'content': content})
步骤3:继续抓取2-3页的所有评论人名称、评论时间以及评论
# Pages 2 and 3: the `start` query parameter is the offset of the first
# comment on the page (20 comments per page), so i=1 -> 20 and i=2 -> 40.
for i in range(1, 3):
    url = f'https://movie.douban.com/subject/25868125/comments?start={i*20}&limit=20&status=P&sort=new_score'
    driver.get(url)
    # Each page is requested by its own URL above, so the "next page" link is
    # deliberately not followed here; doing both would scrape page 3 twice.
    # Selenium 4 removed find_elements_by_css_selector(); use
    # find_elements(By.CSS_SELECTOR, ...) instead.
    for comment in driver.find_elements(By.CSS_SELECTOR, '#comments > div.comment-item'):
        name = comment.find_element(
            By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > a'
        ).text
        # Named comment_time rather than `time` so the imported `time` module
        # is not overwritten by a string.
        comment_time = comment.find_element(
            By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > span.comment-time'
        ).text
        content = comment.find_element(
            By.CSS_SELECTOR, 'div.comment > p > span.short'
        ).text
        data.append({'name': name, 'time': comment_time, 'content': content})
步骤4:将抓取到的数据以文件存储的方式,存储为json格式数据
# Serialize first, then write in one call. ensure_ascii=False keeps the
# Chinese text readable in the file instead of \uXXXX escapes.
serialized = json.dumps(data, ensure_ascii=False, indent=4)
with open('comments.json', 'w', encoding='utf-8') as out_file:
    out_file.write(serialized)
完整代码
import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

MOVIE_URL = 'https://movie.douban.com/subject/25868125/'
# `start` is the offset of the first comment on the page (20 per page).
COMMENTS_URL = 'https://movie.douban.com/subject/25868125/comments?start={start}&limit=20&status=P&sort=new_score'
PAGE_SIZE = 20
PAGE_COUNT = 3  # pages 1-3


def scrape_current_page(driver):
    """Return the comments on the page currently loaded in *driver*.

    Each comment is a dict with the keys 'name', 'time' and 'content'.
    Selenium raises NoSuchElementException if a comment item lacks one of
    the expected child elements.
    """
    results = []
    # Selenium 4 removed the find_element(s)_by_css_selector() helpers; the
    # supported form is find_element(s)(By.CSS_SELECTOR, selector).
    for comment in driver.find_elements(By.CSS_SELECTOR, '#comments > div.comment-item'):
        name = comment.find_element(
            By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > a'
        ).text
        # Named comment_time rather than `time` so the `time` module stays
        # usable for time.sleep() below.
        comment_time = comment.find_element(
            By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > span.comment-time'
        ).text
        content = comment.find_element(
            By.CSS_SELECTOR, 'div.comment > p > span.short'
        ).text
        results.append({'name': name, 'time': comment_time, 'content': content})
    return results


# Set up a headless Chrome (no visible browser window).
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

# Collected comments from all scraped pages.
data = []
try:
    # Open the movie's main page and follow its "all reviews" link.
    driver.get(MOVIE_URL)
    all_comment_btn = driver.find_element(
        By.CSS_SELECTOR,
        '#reviews-wrapper > div > div > div.mod-hd > h2 > span > a',
    )
    all_comment_btn.click()

    # Request each comment page by URL exactly once. Combining URL-based
    # paging with following the "next page" link would store the same
    # comments more than once.
    for page_index in range(PAGE_COUNT):
        if page_index:
            time.sleep(1)  # brief pause between requests to go easy on the server
        driver.get(COMMENTS_URL.format(start=page_index * PAGE_SIZE))
        data.extend(scrape_current_page(driver))
finally:
    # Always close the browser, even if scraping raised an exception.
    driver.quit()

# Save the scraped comments as JSON; ensure_ascii=False keeps Chinese text readable.
with open('comments.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
注意:
- 运行代码之前,请确保已安装Selenium库(pip install selenium)以及与本机Chrome版本匹配的ChromeDriver,并将ChromeDriver所在路径添加到系统环境变量PATH中。
- 由于豆瓣电影的网站结构可能会发生变化,代码可能需要根据实际情况进行调整。
- 抓取数据时请遵守网站的robots协议,不要过度抓取,以免造成服务器负担。
原文地址: https://www.cveoy.top/t/topic/oA1R 著作权归作者所有。请勿转载和采集!