Python爬取豆瓣电影影评并保存到JSON文件

本文介绍如何使用Python爬取豆瓣电影影评数据，并将数据保存到JSON文件中。

代码示例

import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Upper bound on clicks of the 'more-btn' element, so a button that never
# disappears cannot keep the script looping forever.
MAX_EXPAND_CLICKS = 20

# Configure Chrome to run headless (no visible browser window).
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument('disable-gpu')

# Start Chrome. Element lookups below use find_element(By...) /
# find_elements(By...) because the find_element_by_* helpers were removed
# in Selenium 4.3.
driver = webdriver.Chrome(options=options)
comments = []
try:
    # Open the Douban movie page.
    driver.get('https://movie.douban.com/subject/25868125/')

    # Click the 'more-btn' element for as long as it is present and visible.
    # The button is looked up again on every pass so a re-rendered page does
    # not leave us holding a stale element, and a missing button is skipped
    # instead of aborting the whole run.
    for _ in range(MAX_EXPAND_CLICKS):
        buttons = driver.find_elements(By.CLASS_NAME, 'more-btn')
        if not buttons or not buttons[0].is_displayed():
            break
        buttons[0].click()
        time.sleep(2)  # give the page time to load the extra content

    # Collect the comments from the first 3 listing pages (20 per page).
    for start in range(0, 60, 20):
        url = f'https://movie.douban.com/subject/25868125/comments?start={start}&limit=20&status=P&sort=new_score'
        driver.get(url)
        for item in driver.find_elements(By.CSS_SELECTOR, '.comment-item'):
            info = item.find_element(By.CLASS_NAME, 'comment-info')
            comments.append({
                'user': info.find_element(By.TAG_NAME, 'a').text,
                # The timestamp is read from the element's title attribute.
                'time': item.find_element(By.CLASS_NAME, 'comment-time').get_attribute('title'),
                'content': item.find_element(By.CLASS_NAME, 'short').text,
            })
finally:
    # Always close the browser, even when scraping fails part-way, so no
    # headless Chrome process is left running.
    driver.quit()

# Save the collected comments; ensure_ascii=False keeps non-ASCII text
# readable in the output file instead of \uXXXX escapes.
with open('comments.json', 'w', encoding='utf-8') as f:
    json.dump(comments, f, ensure_ascii=False)

将爬取结果保存到D盘（并使用D盘下的chromedriver）

import json
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Raw strings keep the backslashes in these Windows paths literal; the
# non-raw form ('D:\c...') relies on an invalid escape sequence.
CHROMEDRIVER_PATH = r'D:\chromedriver.exe'
OUTPUT_PATH = r'D:\comments.json'

# Upper bound on clicks of the 'more-btn' element, so a button that never
# disappears cannot keep the script looping forever.
MAX_EXPAND_CLICKS = 20

# Configure Chrome to run headless (no visible browser window).
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument('disable-gpu')

# Start Chrome with an explicit driver binary. The path is passed through a
# Service object because the executable_path= keyword is no longer accepted
# by webdriver.Chrome in current Selenium 4 releases; likewise the
# find_element_by_* helpers were removed in Selenium 4.3, so lookups below
# use find_element(By...) / find_elements(By...).
driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=options)
comments = []
try:
    # Open the Douban movie page.
    driver.get('https://movie.douban.com/subject/25868125/')

    # Click the 'more-btn' element for as long as it is present and visible.
    # The button is looked up again on every pass so a re-rendered page does
    # not leave us holding a stale element, and a missing button is skipped
    # instead of aborting the whole run.
    for _ in range(MAX_EXPAND_CLICKS):
        buttons = driver.find_elements(By.CLASS_NAME, 'more-btn')
        if not buttons or not buttons[0].is_displayed():
            break
        buttons[0].click()
        time.sleep(2)  # give the page time to load the extra content

    # Collect the comments from the first 3 listing pages (20 per page).
    for start in range(0, 60, 20):
        url = f'https://movie.douban.com/subject/25868125/comments?start={start}&limit=20&status=P&sort=new_score'
        driver.get(url)
        for item in driver.find_elements(By.CSS_SELECTOR, '.comment-item'):
            info = item.find_element(By.CLASS_NAME, 'comment-info')
            comments.append({
                'user': info.find_element(By.TAG_NAME, 'a').text,
                # The timestamp is read from the element's title attribute.
                'time': item.find_element(By.CLASS_NAME, 'comment-time').get_attribute('title'),
                'content': item.find_element(By.CLASS_NAME, 'short').text,
            })
finally:
    # Always close the browser, even when scraping fails part-way, so no
    # headless Chrome process is left running.
    driver.quit()

# Save the collected comments to the D: drive; ensure_ascii=False keeps
# non-ASCII text readable in the output file instead of \uXXXX escapes.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(comments, f, ensure_ascii=False)

注意：

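在代码中将Chrome浏览器驱动的路径修改为你本机的实际路径。Windows路径建议使用原始字符串（r'...'），避免反斜杠被当作转义字符，例如：driver = webdriver.Chrome(executable_path=r'D:\chromedriver.exe', options=options)。注意：Selenium 4.10及以上版本已移除executable_path参数，应改用 driver = webdriver.Chrome(service=Service(r'D:\chromedriver.exe'), options=options)，其中Service来自selenium.webdriver.chrome.service。
要确保chromedriver.exe文件存在于D盘的根目录下或者你指定的路径下。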

希望本文对你有所帮助！