Python爬取豆瓣电影《穿靴子的猫2》影评数据教程

一、使用网络爬虫技术，抓取《穿靴子的猫2》在豆瓣电影上的所有页的影评数据

抓取地址：

https://movie.douban.com/subject/25868125/

步骤1：借助Selenium库，点击进入电影全部影评

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Path to the chromedriver executable (the WebDriver binary, not the Chrome
# browser itself).
driver_path = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'

# Start a Chrome session. Selenium 4 receives the driver location through a
# Service object; the old `executable_path` keyword was removed in 4.10.
driver = webdriver.Chrome(service=Service(driver_path))

# Open the movie's Douban page.
url = 'https://movie.douban.com/subject/25868125/'
driver.get(url)

# Wait (up to 10 s) until the "全部影评" link can actually be clicked, then
# follow it. Waiting only for presence in the DOM may hand back an element
# that is not yet interactable, which makes click() fail intermittently.
btn = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.LINK_TEXT, '全部影评'))
)
btn.click()

步骤2：从'https://movie.douban.com/subject/25868125/comments?start=0&limit=20&status=P&sort=new_score'地址开始，抓取第一页的评论人名称、评论时间以及评论内容。

import time

# URL template for the comment pages; `{}` is filled with the `start` offset.
url = 'https://movie.douban.com/subject/25868125/comments?start={}&limit=20&status=P&sort=new_score'

# Load the first page of comments and give it a moment to render.
driver.get(url.format(0))
time.sleep(1)

# Collect every comment container on the page. normalize-space() makes the
# class comparison tolerant of stray leading/trailing whitespace in the markup.
comments_div = driver.find_elements(
    By.XPATH, '//div[normalize-space(@class)="comment-item"]'
)

# For each comment, print the commenter's name, the comment time and the text.
for comment in comments_div:
    name = comment.find_element(By.XPATH, './/span[@class="comment-info"]/a').text
    # Selenium locators must resolve to elements, not attribute nodes, so
    # locate the <span> and read its `title` attribute explicitly.
    date = comment.find_element(
        By.XPATH, './/span[normalize-space(@class)="comment-time"]'
    ).get_attribute('title')
    content = comment.find_element(By.XPATH, './/p').text
    print(name, date, content)

步骤3：继续抓取第2-3页的所有评论人名称、评论时间以及评论内容。

# Scrape pages 2 and 3. Each page holds 20 comments, so page N starts at
# offset (N - 1) * 20.
for i in range(1, 3):
    driver.get(url.format(i * 20))
    time.sleep(1)
    # normalize-space() makes the class comparison tolerant of stray
    # leading/trailing whitespace in the markup.
    comments_div = driver.find_elements(
        By.XPATH, '//div[normalize-space(@class)="comment-item"]'
    )
    for comment in comments_div:
        name = comment.find_element(By.XPATH, './/span[@class="comment-info"]/a').text
        # Selenium locators must resolve to elements, not attribute nodes, so
        # locate the <span> and read its `title` attribute explicitly.
        date = comment.find_element(
            By.XPATH, './/span[normalize-space(@class)="comment-time"]'
        ).get_attribute('title')
        content = comment.find_element(By.XPATH, './/p').text
        print(name, date, content)