Python爬虫实战：抓取豆瓣电影《穿靴子的猫2》影评数据

本文将使用Python语言和Selenium库，爬取豆瓣电影《穿靴子的猫2》的影评数据，包括评论人名称、评论时间和评论内容。代码示例展示了如何使用Selenium模拟点击操作，抓取多个页面数据，并以JSON格式存储结果。

代码实现

import json
import re
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Path to the ChromeDriver binary; adjust it to match the local installation.
driver_path = 'chromedriver.exe'

# Douban subject page of the movie being scraped.
MOVIE_URL = 'https://movie.douban.com/subject/25868125/'
# Maximum number of entries read from one page; also the paging step.
PAGE_SIZE = 20
# Fixed pause after every navigation/click so the page can finish rendering.
PAGE_LOAD_SECONDS = 3

# Locator tables. 'rows' is an absolute XPath selecting the entry containers;
# the other XPaths are relative to a single row. They always select ELEMENTS:
# Selenium cannot return text or attribute nodes (an XPath ending in
# `/text()` or `/@title` raises InvalidSelectorException), so the text or
# attribute is read from the located element afterwards.
# NOTE(review): the paths mirror the original script and depend on the site's
# current markup -- verify them against the live page.
REVIEW_LAYOUT = {
    'rows': '//*[@id="reviews"]/div',
    'name': './div[1]/a[@class="name"]',
    'time': './div[1]/span[@class="main-meta"]',
    'time_attr': None,  # the timestamp is the element's visible text
    'content': './div[2]/p',
}
COMMENT_LAYOUT = {
    'rows': '//*[@id="comments"]/div',
    'name': './div[2]/h3/span[2]/a',
    'time': './div[2]/h3/span[2]/span[3]',
    'time_attr': 'title',  # the timestamp is stored in the `title` attribute
    'content': './div[2]/p/span',
}


def build_comments_url(start, limit=PAGE_SIZE):
    """Return the URL of the short-comment page that begins at offset *start*."""
    return (
        f'{MOVIE_URL}comments'
        f'?start={start}&limit={limit}&status=P&sort=new_score'
    )


def scrape_rows(driver, layout, limit=PAGE_SIZE):
    """Collect up to *limit* entries from the page currently loaded in *driver*.

    Args:
        driver: A Selenium WebDriver positioned on the page to read.
        layout: One of the locator tables above (REVIEW_LAYOUT/COMMENT_LAYOUT).
        limit: Maximum number of entries to return.

    Returns:
        A list of dicts with the keys 'name', 'time' and 'content'. The list
        is shorter than *limit* (possibly empty) when the page holds fewer
        complete entries.
    """
    results = []
    # Iterate over the rows that actually exist instead of assuming that
    # exactly `limit` of them are present.
    for row in driver.find_elements(By.XPATH, layout['rows']):
        if len(results) >= limit:
            break
        try:
            name = row.find_element(By.XPATH, layout['name']).text
            time_element = row.find_element(By.XPATH, layout['time'])
            content = row.find_element(By.XPATH, layout['content']).text
        except NoSuchElementException:
            # A child <div> that lacks one of the fields is not a complete
            # entry; skip it rather than aborting the whole run.
            continue

        if layout['time_attr']:
            time_str = time_element.get_attribute(layout['time_attr']) or ''
        else:
            time_str = time_element.text

        results.append({
            'name': name.strip(),
            'time': time_str.strip(),
            'content': content.strip(),
        })
    return results


def save_comments(comments, path='comments.json'):
    """Write *comments* to *path* as pretty-printed UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(comments, f, ensure_ascii=False, indent=2)


def main():
    """Scrape three pages of entries for the movie and save them as JSON.

    Requires Selenium 4: the `find_element_by_*` helpers and the
    `executable_path` argument of `webdriver.Chrome` are no longer available
    in the 4.x line, so `By` locators and a `Service` object are used.

    NOTE(review): the first batch is read from the review list on the subject
    page, while the following batches come from the short-comment pages at
    offsets 20 and 40. This mirrors the original script -- confirm that mixing
    the two sources is intended.
    """
    driver = webdriver.Chrome(service=Service(executable_path=driver_path))
    comments = []
    try:
        # Open the subject page.
        driver.get(MOVIE_URL)
        time.sleep(PAGE_LOAD_SECONDS)

        # Click the link that expands the full review list.
        btn = driver.find_element(
            By.XPATH, '//*[@id="reviews-wrapper"]/div[1]/div[2]/div[1]/a'
        )
        btn.click()
        time.sleep(PAGE_LOAD_SECONDS)

        # First page: entries of the review list.
        comments.extend(scrape_rows(driver, REVIEW_LAYOUT))

        # Pages 2-3: short comments, addressed by their start offset.
        for page in range(1, 3):
            driver.get(build_comments_url(page * PAGE_SIZE))
            time.sleep(PAGE_LOAD_SECONDS)
            comments.extend(scrape_rows(driver, COMMENT_LAYOUT))
    finally:
        # Always close the browser, even if a lookup or navigation fails.
        driver.quit()

    save_comments(comments)


if __name__ == '__main__':
    main()

运行结果截图

总结

本文通过一个简单的例子，展示了使用Python和Selenium爬取网页数据的基本步骤。在实际应用中，需要根据具体的网站结构和数据格式进行调整。

注意：爬取网站数据时，请遵守网站的 robots.txt 协议，并控制请求频率，避免对网站造成过大的负载。