import json
from time import sleep

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Douban short-comment scraper.
#
# Opens the movie page in Chrome, clicks through to the full comment list,
# then downloads the first PAGE_COUNT comment pages with `requests` and
# writes each page to comments_<n>.json.

# Path to the local ChromeDriver executable. This must be a raw string: in a
# regular literal "\U" starts a \UXXXXXXXX escape and the file fails to
# compile with a unicodeescape error.
CHROMEDRIVER_PATH = r'C:\Users\Administrator\Desktop\人工智能2103 2021240190 文铈博\chromedriver.exe'

# Movie page opened in the browser.
SUBJECT_URL = 'https://movie.douban.com/subject/25868125/'

# CSS selector of the link in the comments section that the script clicks.
ALL_COMMENTS_SELECTOR = '#comments-section > div:nth-child(1) > h2 > span > a'

# Request headers sent with every comment-page request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
    'Referer': 'https://movie.douban.com/subject/25868125/comments?status=P',
}

PAGE_COUNT = 3               # number of comment pages to download
PAGE_SIZE = 20               # comments per page (the `limit` query parameter)
PAGE_DELAY_SECONDS = 3       # pause after each page
REQUEST_TIMEOUT_SECONDS = 10  # fail instead of hanging on a stalled connection


def build_page_url(page_index, page_size=PAGE_SIZE):
    """Return the comment-list URL for the zero-based page ``page_index``.

    The ``start`` offset is ``page_index * page_size``, so consecutive page
    indexes address consecutive, non-overlapping windows of comments.
    """
    return (
        'https://movie.douban.com/subject/25868125/comments'
        '?start={}&limit={}&status=P&sort=new_score'
    ).format(page_index * page_size, page_size)


def _stripped_text(node):
    """Return the stripped text of ``node``, or '' when nothing was found."""
    return node.text.strip() if node else ''


def parse_comments(html):
    """Extract commenter name, comment time and comment text from page HTML.

    Returns a list of dicts with the keys ``name``, ``time`` and ``content``,
    one per ``div.comment-item`` element; missing fields become ''.
    """
    soup = BeautifulSoup(html, 'html.parser')
    return [
        {
            'name': _stripped_text(item.find('a', class_='')),
            'time': _stripped_text(item.find('span', class_='comment-time')),
            'content': _stripped_text(item.find('span', class_='short')),
        }
        for item in soup.find_all('div', class_='comment-item')
    ]


def scrape_comment_pages(page_count=PAGE_COUNT):
    """Download ``page_count`` comment pages and write one JSON file per page."""
    for page_index in range(page_count):
        # Build the URL from the current index. Updating it at the end of the
        # iteration from the old index fetched start=0 twice.
        response = requests.get(
            build_page_url(page_index),
            headers=HEADERS,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        data = parse_comments(response.text)

        # Files are numbered from 1: comments_1.json, comments_2.json, ...
        with open(f'comments_{page_index + 1}.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

        # Wait before requesting the next page.
        sleep(PAGE_DELAY_SECONDS)


def main():
    """Open the movie page in Chrome, click the comments link, scrape pages."""
    # NOTE(review): the driver path is passed positionally, as in the original
    # call; newer Selenium releases may expect a Service object instead --
    # confirm against the installed Selenium version.
    browser = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        browser.get(SUBJECT_URL)
        browser.find_element(By.CSS_SELECTOR, ALL_COMMENTS_SELECTOR).click()
        scrape_comment_pages()
    finally:
        # Always close the browser, even if scraping fails part-way through.
        browser.quit()


if __name__ == '__main__':
    main()

# Douban movie comment scraping: Python code example and optimization suggestions (豆瓣电影评论爬取：Python代码示例和优化建议)