Python Web Scraping in Practice: Collecting Douban Review Data for Puss in Boots: The Last Wish (《穿靴子的猫2》)
Import the required modules

import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By  # needed for find_element(By.XPATH, ...) below
Set up the browser in headless mode

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=options)
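If chromedriver is not on the system PATH, Selenium 4 can be pointed at the binary explicitly through a Service object. This is only a sketch in case driver discovery fails; the path below is a placeholder, not something from the original article.

from selenium.webdriver.chrome.service import Service

# Placeholder path: replace with the actual location of chromedriver on your machine
service = Service('/path/to/chromedriver')
driver = webdriver.Chrome(service=service, options=options)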
Open the page

url = 'https://movie.douban.com/subject/25868125/comments?start=0&limit=20&status=P&sort=new_score'
driver.get(url)
Click the "全部影评" (all reviews) link

more_btn = driver.find_element(By.XPATH, '//a[@href="javascript:void(0)"]')
more_btn.click()
time.sleep(1)
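When the comments URL is opened directly, this link may not always be present, and a missing element would raise an exception. A minimal sketch that tolerates the missing link, assuming the comment list is already visible in that case:

from selenium.common.exceptions import NoSuchElementException

try:
    more_btn = driver.find_element(By.XPATH, '//a[@href="javascript:void(0)"]')
    more_btn.click()
    time.sleep(1)
except NoSuchElementException:
    pass  # no expand link found; the comment list is assumed to be visible already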
Collect the review data

comments = []
for i in range(3):  # scrape the first 3 pages of comments (20 per page)
    print('Fetching review data from page {}...'.format(i + 1))
    url = 'https://movie.douban.com/subject/25868125/comments?start={}&limit=20&status=P&sort=new_score'.format(i * 20)
    driver.get(url)
    comment_items = driver.find_elements(By.XPATH, '//div[@class="comment-item"]')
    for item in comment_items:
        comment_dict = {}
        # the matched <a> is expected to carry the username in its title attribute
        comment_dict['name'] = item.find_element(By.XPATH, './/a[@class=""][@href]').get_attribute('title')
        comment_dict['time'] = item.find_element(By.XPATH, './/span[@class="comment-time"]').get_attribute('title')
        comment_dict['content'] = item.find_element(By.XPATH, './/span[@class="short"]').text
        comments.append(comment_dict)
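Instead of reading the DOM immediately after each driver.get, an explicit wait makes the loop less dependent on network speed. A minimal sketch using Selenium's WebDriverWait with the same comment-item XPath, as a drop-in replacement for the find_elements call above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for at least one comment item to appear on the page
wait = WebDriverWait(driver, 10)
comment_items = wait.until(
    EC.presence_of_all_elements_located((By.XPATH, '//div[@class="comment-item"]'))
)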
Save the results as a JSON file

with open('comments.json', 'w', encoding='utf-8') as f:
    json.dump(comments, f, ensure_ascii=False)
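A quick way to verify the output is to read the file back and inspect the first record; this is just a sanity check, not part of the original script:

with open('comments.json', 'r', encoding='utf-8') as f:
    saved = json.load(f)
print('Saved {} comments'.format(len(saved)))
print(saved[0] if saved else 'No comments scraped')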
Close the browser
driver.quit()