Python爬取豆瓣电影《穿靴子的猫2》影评数据教程

本教程将带领你使用Python的Selenium库，抓取豆瓣电影《穿靴子的猫2》的所有影评数据，并将其存储为JSON格式文件。

步骤1：借助Selenium库，点击进入电影全部影评

导入需要的库和驱动

import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

启动浏览器驱动，这里以Chrome为例

# Start a Chrome WebDriver session (Chrome is used as the example browser).
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # headless mode: no browser window is shown
driver = webdriver.Chrome(options=options)

进入电影全部影评页面

# Open the movie's main page, then click the link located in the header of
# the #reviews-wrapper section to reach the full listing.
url = 'https://movie.douban.com/subject/25868125/'
driver.get(url)
# find_element_by_css_selector() was removed in Selenium 4.3; the supported
# form is find_element() with an explicit By locator strategy.
all_comment_btn = driver.find_element(By.CSS_SELECTOR, '#reviews-wrapper > div > div > div.mod-hd > h2 > span > a')
all_comment_btn.click()

步骤2：抓取第一页的评论人名称、评论时间以及评论

定位到第一页评论区

# Load the first page of comments (start=0, 20 comments per page).
url = 'https://movie.douban.com/subject/25868125/comments?start=0&limit=20&status=P&sort=new_score'
driver.get(url)

# Accumulates one dict per scraped comment: {'name', 'time', 'content'}.
data = []

# Collect every comment on this page only. Pages 2-3 are loaded directly by
# URL in the following step, so following the "next" link here as well would
# store those comments twice.
# find_element(s)_by_css_selector() was removed in Selenium 4.3, hence the
# find_element(s)(By.CSS_SELECTOR, ...) form.
comments = driver.find_elements(By.CSS_SELECTOR, '#comments > div.comment-item')
for comment in comments:
    name = comment.find_element(By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > a').text
    # Named comment_time (not `time`) so the imported `time` module is not
    # shadowed by a string.
    comment_time = comment.find_element(By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > span.comment-time').text
    content = comment.find_element(By.CSS_SELECTOR, 'div.comment > p > span.short').text
    data.append({'name': name, 'time': comment_time, 'content': content})

步骤3：继续抓取2-3页的所有评论人名称、评论时间以及评论

# Pages 2 and 3 are addressed directly through the `start` offset
# (20 comments per page), so no "next" link needs to be followed; doing both
# would scrape page 3 more than once.
for i in range(1, 3):
    url = f'https://movie.douban.com/subject/25868125/comments?start={i*20}&limit=20&status=P&sort=new_score'
    driver.get(url)
    # Give the page a moment to finish rendering and keep the request rate low.
    time.sleep(1)
    # find_element(s)_by_css_selector() was removed in Selenium 4.3, hence the
    # find_element(s)(By.CSS_SELECTOR, ...) form.
    comments = driver.find_elements(By.CSS_SELECTOR, '#comments > div.comment-item')
    for comment in comments:
        name = comment.find_element(By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > a').text
        # Named comment_time (not `time`) so the `time` module used by
        # time.sleep() above is not replaced by a string.
        comment_time = comment.find_element(By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > span.comment-time').text
        content = comment.find_element(By.CSS_SELECTOR, 'div.comment > p > span.short').text
        data.append({'name': name, 'time': comment_time, 'content': content})

步骤4：将抓取到的数据以文件存储的方式，存储为json格式数据

# Serialise the collected comments as indented JSON, keeping non-ASCII
# characters readable instead of \u-escaping them, and write it out as UTF-8.
with open('comments.json', mode='w', encoding='utf-8') as json_file:
    json_file.write(json.dumps(data, ensure_ascii=False, indent=4))

完整代码

import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

MOVIE_URL = 'https://movie.douban.com/subject/25868125/'
COMMENTS_PER_PAGE = 20
PAGES_TO_SCRAPE = 3  # page 1 plus pages 2-3


def scrape_comments(driver):
    """Return the comments found on the page currently loaded in *driver*.

    Each comment is returned as a dict with the keys 'name', 'time' and
    'content', taken from the text of the matching elements.
    """
    results = []
    # find_element(s)_by_css_selector() was removed in Selenium 4.3, hence the
    # find_element(s)(By.CSS_SELECTOR, ...) form.
    for comment in driver.find_elements(By.CSS_SELECTOR, '#comments > div.comment-item'):
        name = comment.find_element(By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > a').text
        # Named comment_time (not `time`) so the `time` module stays usable.
        comment_time = comment.find_element(By.CSS_SELECTOR, 'div.comment > h3 > span.comment-info > span.comment-time').text
        content = comment.find_element(By.CSS_SELECTOR, 'div.comment > p > span.short').text
        results.append({'name': name, 'time': comment_time, 'content': content})
    return results


# Start a headless Chrome session (no browser window is shown).
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

# Accumulates one dict per scraped comment.
data = []

try:
    # Step 1: open the movie page and click the link in the header of the
    # #reviews-wrapper section.
    url = MOVIE_URL
    driver.get(url)
    all_comment_btn = driver.find_element(By.CSS_SELECTOR, '#reviews-wrapper > div > div > div.mod-hd > h2 > span > a')
    all_comment_btn.click()

    # Steps 2-3: load each of the first three comment pages directly by its
    # `start` offset. Every page is visited exactly once, so no comment is
    # stored twice.
    for page in range(PAGES_TO_SCRAPE):
        url = f'{MOVIE_URL}comments?start={page * COMMENTS_PER_PAGE}&limit={COMMENTS_PER_PAGE}&status=P&sort=new_score'
        driver.get(url)
        # Give the page a moment to finish rendering and keep the request
        # rate low.
        time.sleep(1)
        data.extend(scrape_comments(driver))
finally:
    # Always close the browser, even if a step above raised.
    driver.quit()

# Step 4: store the scraped data as JSON.
with open('comments.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

注意：

运行代码之前，请确保已安装Selenium库和Chrome浏览器，并将与浏览器版本匹配的ChromeDriver所在路径添加到系统环境变量PATH中。
由于豆瓣电影的网站结构可能会发生变化，代码可能需要根据实际情况进行调整。
抓取数据时请遵守网站的robots协议，不要过度抓取，以免造成服务器负担。