Python爬取豆瓣电影影评并保存到JSON文件
本文介绍如何使用Python爬取豆瓣电影影评数据,并将数据保存到JSON文件中。
代码示例
import time
import json
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Scrape the first three pages of short comments for one Douban movie with
# headless Chrome and save them to comments.json in the working directory.

# Imported here because this script uses the find_element(By.<KIND>, value)
# locator API; the older find_element_by_* helpers were removed in Selenium 4.3.
from selenium.webdriver.common.by import By

# Upper bound on "more" button clicks, so a button that never hides cannot
# keep the script looping forever.
MAX_MORE_CLICKS = 10

# Configure Chrome so that it runs in the background (headless).
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument('disable-gpu')

# Start Chrome. Everything that uses the browser sits inside try/finally so
# the browser process is always shut down, even when scraping fails midway.
driver = webdriver.Chrome(options=options)
try:
    # Open the Douban movie page.
    driver.get('https://movie.douban.com/subject/25868125/')

    # Click the 'more-btn' element for as long as it is displayed.
    # The element is looked up again on every pass: a click may re-render the
    # page and invalidate the previous reference, and the button may not
    # exist at all (in which case there is simply nothing to expand).
    for _ in range(MAX_MORE_CLICKS):
        more_buttons = driver.find_elements(By.CLASS_NAME, 'more-btn')
        if not more_buttons or not more_buttons[0].is_displayed():
            break
        more_buttons[0].click()
        time.sleep(2)  # give the page time to load the extra content

    # Collect the comment data.
    comments = []
    for start in range(0, 60, 20):  # first 3 pages, 20 comments per page
        url = f'https://movie.douban.com/subject/25868125/comments?start={start}&limit=20&status=P&sort=new_score'
        driver.get(url)
        for item in driver.find_elements(By.CSS_SELECTOR, '.comment-item'):
            info = item.find_element(By.CLASS_NAME, 'comment-info')
            comments.append({
                'user': info.find_element(By.TAG_NAME, 'a').text,
                'time': item.find_element(By.CLASS_NAME, 'comment-time').get_attribute('title'),
                'content': item.find_element(By.CLASS_NAME, 'short').text,
            })
finally:
    # Close the browser.
    driver.quit()

# Save the comment data to a file; ensure_ascii=False keeps Chinese text readable.
with open('comments.json', 'w', encoding='utf-8') as f:
    json.dump(comments, f, ensure_ascii=False)
将爬取结果保存到D盘:下面的版本使用位于D盘的chromedriver,并把影评数据写入 D:\comments.json。
import time
import json
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Scrape the first three pages of short comments for one Douban movie with
# headless Chrome, using the chromedriver binary on drive D:, and save the
# result to D:\comments.json.

# Imported here because this script uses the Selenium 4 API:
# - find_element(By.<KIND>, value) replaces the find_element_by_* helpers,
#   which were removed in Selenium 4.3;
# - the driver binary path is passed through a Service object, because the
#   executable_path keyword of webdriver.Chrome was removed in newer
#   Selenium 4 releases.
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Raw strings: in a normal literal '\c' is an invalid escape sequence, which
# newer Python versions warn about.
CHROMEDRIVER_PATH = r'D:\chromedriver.exe'
OUTPUT_PATH = r'D:\comments.json'

# Upper bound on "more" button clicks, so a button that never hides cannot
# keep the script looping forever.
MAX_MORE_CLICKS = 10

# Configure Chrome so that it runs in the background (headless).
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument('disable-gpu')

# Start Chrome. Everything that uses the browser sits inside try/finally so
# the browser process is always shut down, even when scraping fails midway.
driver = webdriver.Chrome(service=Service(executable_path=CHROMEDRIVER_PATH), options=options)
try:
    # Open the Douban movie page.
    driver.get('https://movie.douban.com/subject/25868125/')

    # Click the 'more-btn' element for as long as it is displayed.
    # The element is looked up again on every pass: a click may re-render the
    # page and invalidate the previous reference, and the button may not
    # exist at all (in which case there is simply nothing to expand).
    for _ in range(MAX_MORE_CLICKS):
        more_buttons = driver.find_elements(By.CLASS_NAME, 'more-btn')
        if not more_buttons or not more_buttons[0].is_displayed():
            break
        more_buttons[0].click()
        time.sleep(2)  # give the page time to load the extra content

    # Collect the comment data.
    comments = []
    for start in range(0, 60, 20):  # first 3 pages, 20 comments per page
        url = f'https://movie.douban.com/subject/25868125/comments?start={start}&limit=20&status=P&sort=new_score'
        driver.get(url)
        for item in driver.find_elements(By.CSS_SELECTOR, '.comment-item'):
            info = item.find_element(By.CLASS_NAME, 'comment-info')
            comments.append({
                'user': info.find_element(By.TAG_NAME, 'a').text,
                'time': item.find_element(By.CLASS_NAME, 'comment-time').get_attribute('title'),
                'content': item.find_element(By.CLASS_NAME, 'short').text,
            })
finally:
    # Close the browser.
    driver.quit()

# Save the comment data to a file; ensure_ascii=False keeps Chinese text readable.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(comments, f, ensure_ascii=False)
注意:
- 在代码中将Chrome浏览器驱动(chromedriver.exe)的路径修改为你本机的实际路径,例如 D:\chromedriver.exe(即创建 webdriver.Chrome 时指定的驱动路径)。
- 要确保chromedriver.exe文件存在于D盘的根目录下或者你指定的路径下。
希望本文对你有所帮助!
原文地址: https://www.cveoy.top/t/topic/oA1o 著作权归作者所有。请勿转载和采集!