import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Configure the browser driver: Chrome is started with the --headless and
# --disable-gpu arguments.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=options)

# Crawl the movie page, collect the short comments from the first pages of the
# comment listing, and store them as JSON in reviews.json.
url = 'https://movie.douban.com/subject/25868125/'
reviews = []

# Everything that drives the browser runs inside try/finally so the browser
# process is shut down even when a page load or element lookup raises.
try:
    driver.get(url)

    # Click through to the full list of comments.
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3; the By-based form also works on Selenium 3.
    btn_all_reviews = driver.find_element(By.XPATH, '//a[@class="more-link"]')
    btn_all_reviews.click()

    # Collect the comment data.
    for page in range(5):  # only the first 5 pages of comments
        start = page * 20  # the listing is requested 20 comments at a time
        url = f'https://movie.douban.com/subject/25868125/comments?start={start}&limit=20&status=P&sort=new_score'
        driver.get(url)
        time.sleep(2)  # fixed pause to let the page finish loading

        # NOTE(review): @class="..." is an exact string match, so these
        # predicates are sensitive to extra classes or whitespace in the
        # page markup — confirm against the live page.
        elems = driver.find_elements(By.XPATH, '//div[@class="comment-item"]')
        for elem in elems:
            name = elem.find_element(By.XPATH, './/span[@class="comment-info"]/a').text
            date = elem.find_element(By.XPATH, './/span[@class="comment-time "]').text
            content = elem.find_element(By.XPATH, './/p').text
            review = {'name': name, 'date': date, 'content': content}
            reviews.append(review)
finally:
    driver.quit()

# Store the collected comments as JSON; ensure_ascii=False keeps non-ASCII
# text readable in the output file instead of \u escapes.
with open('reviews.json', 'w', encoding='utf-8') as f:
    json.dump(reviews, f, ensure_ascii=False)


# Original source: https://www.cveoy.top/t/topic/oBbn — copyright belongs to the author. Do not repost or scrape.

# Free AI: click here, no registration or login required.