豆瓣电影影评爬虫代码优化:使用Selenium和BeautifulSoup抓取数据
# 导入所需的模块
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Address of the movie page whose comments are scraped.
url = 'https://movie.douban.com/subject/25868125/'

# Request headers used for the plain-HTTP fetch of the first page.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
    # Placeholder value ("your Cookie"): replace it before running.
    'Cookie': '你的Cookie',
    'Referer': 'https://movie.douban.com/subject/25868125/',
}


def print_comments(html):
    """Parse *html* and print every comment it contains.

    For each ``.comment-item`` element, prints the text of the first
    ``.comment-info > a``, the first ``.comment-info > span`` and the first
    ``.short`` element, stripped of surrounding whitespace and separated by
    spaces.

    Items missing any of these three sub-elements are skipped, so one
    malformed entry cannot abort the whole run with an ``IndexError``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.select('.comment-item'):
        author = item.select_one('.comment-info > a')
        posted = item.select_one('.comment-info > span')
        body = item.select_one('.short')
        if author is None or posted is None or body is None:
            continue
        print(author.text.strip(), posted.text.strip(), body.text.strip())


# Create the browser object.
browser = webdriver.Chrome()
try:
    # Open the page and maximize the browser window.
    browser.get(url)
    browser.maximize_window()

    # First page: fetch it over HTTP with the headers above and print its
    # comments. The timeout keeps a stalled connection from hanging forever.
    res = requests.get(url, headers=headers, timeout=10)
    print_comments(res.text)

    # Remaining pages: keep following the "next page" link, printing the
    # comments of every page reached, until the link can no longer be used.
    while True:
        try:
            # Wait up to 10 seconds for the "next page" link to be present.
            next_button = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '.comment .next a')
                )
            )
            next_button.click()
        except WebDriverException:
            # Covers the wait timing out (no further page) as well as a
            # failed click; either way there is nothing more to scrape.
            # Unlike a bare ``except``, Ctrl-C and coding errors still
            # propagate.
            break
        # NOTE(review): assumes the next page has finished loading once
        # click() returns -- confirm if the site paginates via AJAX.
        print_comments(browser.page_source)
finally:
    # quit() ends the whole WebDriver session (not just the current window)
    # and runs even when scraping fails part-way through.
    browser.quit()
原文地址: https://www.cveoy.top/t/topic/oA5G 著作权归作者所有。请勿转载和采集!