Python爬取豆瓣电影《穿靴子的猫2》影评数据教程

本教程将带领你使用Python的Selenium库,抓取豆瓣电影《穿靴子的猫2》的所有影评数据,并将其存储为JSON格式文件。

步骤1:借助Selenium库,点击进入电影全部影评

  1. 导入需要的库和驱动
import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
  2. 配置并启动浏览器驱动,这里以Chrome为例
# Configure and launch the browser driver (Chrome is used as the example).
options = webdriver.ChromeOptions()
# Headless mode: run Chrome without opening a visible browser window.
headless_flag = '--headless'
options.add_argument(headless_flag)
driver = webdriver.Chrome(options=options)
  3. 进入电影全部影评页面
# Open the movie's main page, then follow the "all reviews" link.
url = 'https://movie.douban.com/subject/25868125/'
driver.get(url)
# find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector,
# which was removed in Selenium 4.3; this form also works on Selenium 3.
all_comment_btn = driver.find_element(
    By.CSS_SELECTOR,
    '#reviews-wrapper > div > div > div.mod-hd > h2 > span > a',
)
all_comment_btn.click()

步骤2:抓取第一页的评论人名称、评论时间以及评论

  1. 定位到第一页评论区
# Load the first page of comments.
url = 'https://movie.douban.com/subject/25868125/comments?start=0&limit=20&status=P&sort=new_score'
driver.get(url)

# Accumulates one dict per scraped comment.
data = []

# Extract commenter name, comment time and comment text from every comment
# on this page. Only the first page is handled in this step, so there is no
# "next page" loop here (following it would crawl every page and lead to
# duplicated records once further pages are requested by URL).
comments = driver.find_elements(By.CSS_SELECTOR, '#comments > div.comment-item')
for comment in comments:
    name = comment.find_element(
        By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > a'
    ).text
    # Deliberately not called `time`: that would shadow the imported `time`
    # module and break any later time.sleep() call.
    comment_time = comment.find_element(
        By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > span.comment-time'
    ).text
    content = comment.find_element(
        By.CSS_SELECTOR, 'div.comment > p > span.short'
    ).text
    data.append({'name': name, 'time': comment_time, 'content': content})

步骤3:继续抓取2-3页的所有评论人名称、评论时间以及评论

# Pages 2 and 3 start at offsets 20 and 40 (20 comments per page).
for i in range(1, 3):
    url = f'https://movie.douban.com/subject/25868125/comments?start={i*20}&limit=20&status=P&sort=new_score'
    driver.get(url)
    # Each page is requested explicitly by URL and scraped exactly once;
    # following the "next" link as well would scrape later pages repeatedly.
    comments = driver.find_elements(By.CSS_SELECTOR, '#comments > div.comment-item')
    for comment in comments:
        name = comment.find_element(
            By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > a'
        ).text
        # Deliberately not called `time`: that would shadow the imported
        # `time` module and make time.sleep() below fail.
        comment_time = comment.find_element(
            By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > span.comment-time'
        ).text
        content = comment.find_element(
            By.CSS_SELECTOR, 'div.comment > p > span.short'
        ).text
        data.append({'name': name, 'time': comment_time, 'content': content})
    time.sleep(1)  # brief pause between page requests to limit server load

步骤4:将抓取到的数据以文件存储的方式,存储为json格式数据

# Serialize all collected records as pretty-printed JSON; ensure_ascii=False
# keeps non-ASCII text readable in the output file instead of \u escapes.
serialized = json.dumps(data, ensure_ascii=False, indent=4)
with open('comments.json', 'w', encoding='utf-8') as out_file:
    out_file.write(serialized)

完整代码

import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Number of comment pages to scrape and the page size used in the URL.
PAGE_COUNT = 3
PAGE_SIZE = 20


def scrape_comments(driver):
    """Return the comments on the currently loaded page.

    Args:
        driver: A Selenium WebDriver whose current page is a comments page.

    Returns:
        A list of dicts with the keys 'name', 'time' and 'content', one per
        comment element found on the page (empty if none are found).
    """
    records = []
    for comment in driver.find_elements(By.CSS_SELECTOR, '#comments > div.comment-item'):
        name = comment.find_element(
            By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > a'
        ).text
        # Deliberately not called `time`: that would shadow the imported
        # `time` module and make time.sleep() fail with AttributeError.
        comment_time = comment.find_element(
            By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > span.comment-time'
        ).text
        content = comment.find_element(
            By.CSS_SELECTOR, 'div.comment > p > span.short'
        ).text
        records.append({'name': name, 'time': comment_time, 'content': content})
    return records


# Configure and launch the browser driver (Chrome is used as the example).
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # headless mode: no visible browser window
driver = webdriver.Chrome(options=options)

# Accumulates one dict per scraped comment.
data = []

try:
    # Open the movie's main page, then follow the "all reviews" link.
    # find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector,
    # which was removed in Selenium 4.3.
    url = 'https://movie.douban.com/subject/25868125/'
    driver.get(url)
    all_comment_btn = driver.find_element(
        By.CSS_SELECTOR,
        '#reviews-wrapper > div > div > div.mod-hd > h2 > span > a',
    )
    all_comment_btn.click()

    # Scrape pages 1-3. Each page is requested explicitly by its offset and
    # scraped exactly once, so no record is collected twice.
    for page in range(PAGE_COUNT):
        url = f'https://movie.douban.com/subject/25868125/comments?start={page * PAGE_SIZE}&limit=20&status=P&sort=new_score'
        driver.get(url)
        data.extend(scrape_comments(driver))
        time.sleep(1)  # brief pause between page requests to limit server load
finally:
    # Always close the browser, even if scraping raised an exception.
    driver.quit()

# Store the scraped records as a JSON file.
with open('comments.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

注意:

  • 运行代码之前,请确保已安装Selenium库和Chrome浏览器驱动,并将其路径添加到系统环境变量中。
  • 由于豆瓣电影的网站结构可能会发生变化,代码可能需要根据实际情况进行调整。
  • 抓取数据时请遵守网站的robots协议,不要过度抓取,以免造成服务器负担。

原文地址: https://www.cveoy.top/t/topic/oA1R 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录