# Import the required libraries.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Create the Selenium-driven Chrome browser used for page navigation.
browser = webdriver.Chrome()

# Open the target Douban movie page in the browser.
url = 'https://movie.douban.com/subject/25868125/'
browser.get(url)

# Maximize the browser window.
browser.maximize_window()

# Content scraping: HTTP headers sent with the plain `requests` call.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
    # Placeholder value ("your Cookie"): replace it with a real cookie string
    # before running the script.
    'Cookie': '你的Cookie',
    'Referer': 'https://movie.douban.com/subject/25868125/',
}

# Fetch the first page of comments with a plain HTTP request (independent of
# the Selenium session) and print each comment found in the returned HTML.
# An explicit timeout is passed because requests otherwise waits indefinitely
# for an unresponsive server.
res = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(res.text, 'html.parser')
comments = soup.select('.comment-item')
for comment in comments:
    name = comment.select('.comment-info > a')[0].text.strip()
    # NOTE(review): this takes the first <span> under .comment-info; confirm
    # that it really is the timestamp element on the live page.
    posted_at = comment.select('.comment-info > span')[0].text.strip()
    content = comment.select('.short')[0].text.strip()
    print(name, posted_at, content)

# Crawl the remaining pages: keep clicking the "next page" link and print the
# comments of every page until the link can no longer be found.
while True:
    try:
        # Locate the "next page" link; the wait raises TimeoutException when
        # it does not appear within 10 seconds.
        next_button = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.comment .next a')
            )
        )
    except TimeoutException:
        # No "next page" link: the last page has been reached.
        break
    next_button.click()

    # Wait until at least one comment item is present on the page.
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.comment-item'))
    )

    # Parse the HTML currently rendered by the browser and print its comments.
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    comments = soup.select('.comment-item')
    for comment in comments:
        name = comment.select('.comment-info > a')[0].text.strip()
        posted_at = comment.select('.comment-info > span')[0].text.strip()
        content = comment.select('.short')[0].text.strip()
        print(name, posted_at, content)

# Shut the browser down. quit() ends the whole WebDriver session, whereas
# close() only closes the current window.
browser.quit()

# 豆瓣电影影评爬取 - 使用 Selenium 和 BeautifulSoup 获取所有页面的评论数据

# 豆瓣电影影评爬取 - 使用 Selenium 和 BeautifulSoup 获取所有页面的评论数据