# Import the required libraries

import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Address of the Douban short-comments page to scrape
url = "https://movie.douban.com/subject/25868125/comments?status=P"

# Start the browser
driver = webdriver.Chrome()

# Collected comments; each entry is a dict with 'name', 'time' and 'comment'
reviews = []

# Number of comment pages to scrape
MAX_PAGES = 3

try:
    # Open the site and give the page time to load
    driver.get(url)
    time.sleep(3)

    # Click the "all reviews" button.
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.
    all_review_button = driver.find_element(
        By.XPATH, '//div[@class="review-hd"]//a[@href="#"]'
    )
    all_review_button.click()
    time.sleep(3)

    # Scrape the first MAX_PAGES pages of comments
    for page_index in range(MAX_PAGES):
        # Parse the HTML of the page currently shown in the browser
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # Every comment lives in a <div class="comment-item">
        comments = soup.find_all('div', {'class': 'comment-item'})

        # Extract the commenter name, comment time and comment text
        for comment in comments:
            info_node = comment.find('span', {'class': 'comment-info'})
            name_node = info_node.find('a') if info_node is not None else None
            time_node = comment.find('span', {'class': 'comment-time'})
            text_node = comment.find('span', {'class': 'short'})

            # Skip items missing any expected node instead of failing with
            # AttributeError on None.
            if name_node is None or time_node is None or text_node is None:
                continue

            reviews.append({
                'name': name_node.text.strip(),
                'time': time_node.text.strip(),
                'comment': text_node.text.strip(),
            })

        # Nothing to navigate to after the last requested page
        if page_index == MAX_PAGES - 1:
            break

        # Click the "next page" link; find_elements returns an empty list
        # (rather than raising) when there is no further page.
        next_page_buttons = driver.find_elements(
            By.XPATH, '//div[@class="center"]//a[@class="next"]'
        )
        if not next_page_buttons:
            break
        next_page_buttons[0].click()
        time.sleep(3)
finally:
    # Close the browser even if scraping failed part-way
    driver.quit()

# Store the scraped data in a file as JSON (ensure_ascii=False keeps the
# Chinese text readable instead of \u-escaped)
with open('reviews.json', 'w', encoding='utf-8') as f:
    json.dump(reviews, f, ensure_ascii=False)

# Print the result
print(reviews)


# Original source: http://www.cveoy.top/t/topic/g7kt  Copyright belongs to the author. Do not repost or scrape.

# Free AI: click here, no registration or login required