Python Web Scraping in Practice: Collecting Douban Review Data for Puss in Boots: The Last Wish (《穿靴子的猫2》)
Import the required modules

import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By  # needed for find_element(By.XPATH, ...) below
Set up the browser in headless mode

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=options)
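If chromedriver is not on the system PATH, Selenium 4 can be pointed at the binary explicitly through a Service object. This is only a sketch in case driver discovery fails; the path below is a placeholder, not something from the original article.

from selenium.webdriver.chrome.service import Service

# Placeholder path: replace with the actual location of chromedriver on your machine
service = Service('/path/to/chromedriver')
driver = webdriver.Chrome(service=service, options=options)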
Open the page

url = 'https://movie.douban.com/subject/25868125/comments?start=0&limit=20&status=P&sort=new_score'
driver.get(url)
Click the "全部影评" (all reviews) link

more_btn = driver.find_element(By.XPATH, '//a[@href="javascript:void(0)"]')
more_btn.click()
time.sleep(1)
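When the comments URL is opened directly, this link may not always be present, and a missing element would raise an exception. A minimal sketch that tolerates the missing link, assuming the comment list is already visible in that case:

from selenium.common.exceptions import NoSuchElementException

try:
    more_btn = driver.find_element(By.XPATH, '//a[@href="javascript:void(0)"]')
    more_btn.click()
    time.sleep(1)
except NoSuchElementException:
    pass  # no expand link found; the comment list is assumed to be visible already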
Collect the review data

comments = []
for i in range(3):  # scrape the first 3 pages of comments (20 per page)
    print('Fetching review data from page {}...'.format(i + 1))
    url = 'https://movie.douban.com/subject/25868125/comments?start={}&limit=20&status=P&sort=new_score'.format(i * 20)
    driver.get(url)
    comment_items = driver.find_elements(By.XPATH, '//div[@class="comment-item"]')
    for item in comment_items:
        comment_dict = {}
        # the matched <a> is expected to carry the username in its title attribute
        comment_dict['name'] = item.find_element(By.XPATH, './/a[@class=""][@href]').get_attribute('title')
        comment_dict['time'] = item.find_element(By.XPATH, './/span[@class="comment-time"]').get_attribute('title')
        comment_dict['content'] = item.find_element(By.XPATH, './/span[@class="short"]').text
        comments.append(comment_dict)
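Instead of reading the DOM immediately after each driver.get, an explicit wait makes the loop less dependent on network speed. A minimal sketch using Selenium's WebDriverWait with the same comment-item XPath, as a drop-in replacement for the find_elements call above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for at least one comment item to appear on the page
wait = WebDriverWait(driver, 10)
comment_items = wait.until(
    EC.presence_of_all_elements_located((By.XPATH, '//div[@class="comment-item"]'))
)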
Save the results as a JSON file

with open('comments.json', 'w', encoding='utf-8') as f:
    json.dump(comments, f, ensure_ascii=False)
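A quick way to verify the output is to read the file back and inspect the first record; this is just a sanity check, not part of the original script:

with open('comments.json', 'r', encoding='utf-8') as f:
    saved = json.load(f)
print('Saved {} comments'.format(len(saved)))
print(saved[0] if saved else 'No comments scraped')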
Close the browser
driver.quit()