Python爬虫实战:抓取豆瓣电影《穿靴子的猫2》影评数据
Python爬虫实战:抓取豆瓣电影《穿靴子的猫2》影评数据
本案例将使用Python语言,结合Selenium库,爬取豆瓣电影《穿靴子的猫2》的影评数据,包括评论人名称、评分、评论时间和评论内容。
一、使用网络爬虫技术,抓取《穿靴子的猫2》在豆瓣电影上的影评数据(本案例以前3页为例),抓取地址:
https://movie.douban.com/subject/25868125/
步骤1:借助Selenium库,点击进入电影全部影评。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Step 1: launch the browser and open the movie's main page.
driver = webdriver.Chrome()
driver.get('https://movie.douban.com/subject/25868125/')

# Explicit wait with a 10-second timeout.
wait = WebDriverWait(driver, 10)

# Click the "all" (全部) button.  The find_element_by_* helpers were removed in
# Selenium 4.3, so the element is located with a By locator; waiting until it
# is clickable avoids failing when the page has not finished rendering yet.
all_button = wait.until(EC.element_to_be_clickable(
    (By.XPATH, '//div[@class="reviews mod movie-content"]/div[1]/div[1]/button[2]')
))
all_button.click()

# Block until the review list container is present in the DOM.
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'review-list')))
步骤2:从'https://movie.douban.com/subject/25868125/comments?start=0&limit=20&status=P&sort=new_score'地址开始,抓取第一页的评论人名称、评分、评论时间以及评论内容。
import re
from datetime import datetime
# Step 2: collect every review item inside the review list container.
# The find_element(s)_by_* helpers were removed in Selenium 4.3, so By locators
# are used.  'main.review-item' is a compound class, hence the CSS selector.
review_list = driver.find_element(By.CLASS_NAME, 'review-list').find_elements(
    By.CSS_SELECTOR, '.main.review-item'
)
reviews = []

# Walk over the review items and extract the fields of each one.
for review in review_list:
    name = review.find_element(By.CLASS_NAME, 'name').text
    rating = review.find_element(By.CLASS_NAME, 'rating').get_attribute('title')
    time = review.find_element(By.CLASS_NAME, 'main-meta').text
    content = review.find_element(By.CLASS_NAME, 'short').text
    # Collapse runs of whitespace into a single space (raw string: '\s' is
    # not a valid escape in a normal string literal).
    content = re.sub(r'\s+', ' ', content)
    # Parse the timestamp text into a datetime object.
    # NOTE: datetime objects are not JSON-serializable by default; they must
    # be converted to text when the data is written out as JSON.
    time = datetime.strptime(time.strip(), '%Y-%m-%d %H:%M:%S')
    reviews.append({'name': name, 'rating': rating, 'time': time, 'content': content})
步骤3:继续抓取第2-3页的所有评论人名称、评分、评论时间以及评论内容。
# Step 3: scrape pages 2 and 3.
for page_number in range(2, 4):
    # Re-locate the "next page" button on every iteration: an element
    # reference obtained before a page change becomes stale afterwards.
    # (find_element_by_xpath was removed in Selenium 4.3.)
    next_button = driver.find_element(
        By.XPATH, '//div[@class="reviews mod movie-content"]/div[1]/div[2]/a[2]'
    )
    next_button.click()
    # Wait for the old page to go away before looking for the new list,
    # otherwise the presence check can succeed against the previous page.
    # NOTE(review): assumes the link triggers a full page load — confirm.
    wait.until(EC.staleness_of(next_button))
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'review-list')))

    review_list = driver.find_element(By.CLASS_NAME, 'review-list').find_elements(
        By.CSS_SELECTOR, '.main.review-item'
    )
    for review in review_list:
        name = review.find_element(By.CLASS_NAME, 'name').text
        rating = review.find_element(By.CLASS_NAME, 'rating').get_attribute('title')
        time = review.find_element(By.CLASS_NAME, 'main-meta').text
        content = review.find_element(By.CLASS_NAME, 'short').text
        # Collapse runs of whitespace into a single space.
        content = re.sub(r'\s+', ' ', content)
        # Parse the timestamp text into a datetime object.
        time = datetime.strptime(time.strip(), '%Y-%m-%d %H:%M:%S')
        reviews.append({'name': name, 'rating': rating, 'time': time, 'content': content})
步骤4:将抓取到的数据以文件存储的方式,保存为JSON格式的文件(reviews.json)。
import json
# Step 4: write the collected reviews to a JSON file.
from datetime import datetime


def _encode_datetime(value):
    """Convert a datetime to text for json.dump; reject any other type.

    The 'time' field of each review is a datetime object, which the json
    module cannot serialize on its own (it raises TypeError).
    """
    if isinstance(value, datetime):
        return value.strftime('%Y-%m-%d %H:%M:%S')
    raise TypeError(
        'Object of type {} is not JSON serializable'.format(type(value).__name__)
    )


# ensure_ascii=False keeps the Chinese text readable in the output file.
with open('reviews.json', 'w', encoding='utf-8') as f:
    json.dump(reviews, f, ensure_ascii=False, indent=4, default=_encode_datetime)
完整代码如下:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from datetime import datetime
import json
MOVIE_URL = 'https://movie.douban.com/subject/25868125/'
ALL_BUTTON_XPATH = '//div[@class="reviews mod movie-content"]/div[1]/div[1]/button[2]'
NEXT_BUTTON_XPATH = '//div[@class="reviews mod movie-content"]/div[1]/div[2]/a[2]'
REVIEW_LIST_LOCATOR = (By.CLASS_NAME, 'review-list')
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'


def _extract_reviews(driver):
    """Return the reviews shown on the current page as a list of dicts.

    Each dict has the keys 'name', 'rating', 'time' (a datetime) and
    'content' (whitespace-normalized text).
    """
    # find_element(s)_by_* were removed in Selenium 4.3; use By locators.
    # 'main.review-item' is a compound class, hence the CSS selector.
    items = driver.find_element(*REVIEW_LIST_LOCATOR).find_elements(
        By.CSS_SELECTOR, '.main.review-item'
    )
    page_reviews = []
    for item in items:
        posted_at = item.find_element(By.CLASS_NAME, 'main-meta').text
        content = item.find_element(By.CLASS_NAME, 'short').text
        page_reviews.append({
            'name': item.find_element(By.CLASS_NAME, 'name').text,
            'rating': item.find_element(By.CLASS_NAME, 'rating').get_attribute('title'),
            'time': datetime.strptime(posted_at.strip(), TIME_FORMAT),
            # Collapse runs of whitespace into a single space.
            'content': re.sub(r'\s+', ' ', content),
        })
    return page_reviews


def _encode_datetime(value):
    """Convert a datetime to text for json.dump; reject any other type."""
    if isinstance(value, datetime):
        return value.strftime(TIME_FORMAT)
    raise TypeError(
        'Object of type {} is not JSON serializable'.format(type(value).__name__)
    )


# Launch the browser.
driver = webdriver.Chrome()
try:
    driver.get(MOVIE_URL)
    wait = WebDriverWait(driver, 10)

    # Click the "all" (全部) button once it is clickable, then wait for the
    # review list to be present in the DOM.
    all_button = wait.until(EC.element_to_be_clickable((By.XPATH, ALL_BUTTON_XPATH)))
    all_button.click()
    wait.until(EC.presence_of_element_located(REVIEW_LIST_LOCATOR))

    # Scrape the first page.
    reviews = _extract_reviews(driver)

    # Scrape pages 2 and 3.
    for page_number in range(2, 4):
        # Re-locate the button each time: a reference obtained before a page
        # change becomes stale afterwards.
        next_button = driver.find_element(By.XPATH, NEXT_BUTTON_XPATH)
        next_button.click()
        # Wait for the old page to go away so the presence check below does
        # not succeed against the previous page.
        # NOTE(review): assumes the link triggers a full page load — confirm.
        wait.until(EC.staleness_of(next_button))
        wait.until(EC.presence_of_element_located(REVIEW_LIST_LOCATOR))
        reviews.extend(_extract_reviews(driver))

    # Save the data as a JSON file; datetimes are written as text because the
    # json module cannot serialize them directly.
    with open('reviews.json', 'w', encoding='utf-8') as f:
        json.dump(reviews, f, ensure_ascii=False, indent=4, default=_encode_datetime)
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()
运行结果截图如下:

总结
本案例展示了如何使用Python语言和Selenium库抓取豆瓣电影《穿靴子的猫2》的影评数据。通过该案例,可以学习到如何使用Selenium模拟浏览器操作、如何抓取网页元素信息以及如何将数据存储为JSON格式。
原文地址: https://www.cveoy.top/t/topic/oA0X 著作权归作者所有。请勿转载和采集!