Python Web Scraping in Practice: Scraping Douban Movie Data with requests and BeautifulSoup
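The script below requests one page of entries from Douban's movie-chart API, prints each movie's title and rating, and then fetches every movie's detail page. If a page embeds its stream metadata in a window.__playinfo__= script tag (a convention used by some video sites; pages without such a tag are simply skipped), the script downloads the video and audio streams separately and merges them with moviepy. It depends on requests, beautifulsoup4, and moviepy; the moviepy.editor import path used here is the moviepy 1.x one.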
import json

import requests
from bs4 import BeautifulSoup
from moviepy.editor import VideoFileClip, AudioFileClip
def main():
    url = 'https://movie.douban.com/j/chart/top_list'
    # Query parameters for the chart API: 'type' selects the chart category,
    # 'interval_id' the rating-percentile range, 'start'/'limit' the paging.
    # Note: pass the raw '100:90' here; requests URL-encodes parameter values
    # itself, so a pre-encoded '100%3A90' would end up double-encoded.
    param = {
        'type': '24',
        'interval_id': '100:90',
        'action': '',
        'start': '0',
        'limit': '10',
    }
    # A browser-like User-Agent so the request is not rejected out of hand
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.55'
    }
    response = requests.get(url=url, params=param, headers=headers)
    if response.status_code == 200:
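        # (requests can also raise on HTTP errors instead of this manual
        # check, via response.raise_for_status().)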
        list_data = response.json()
        for movie in list_data:
            title = movie['title']
            rating = movie['rating']
            print('Movie title: {}'.format(title))
            print('Rating: {}'.format(rating))
            # Request the movie's detail page
            res = requests.get(movie['url'], headers=headers)
            # Parse the page source with BeautifulSoup
            soup = BeautifulSoup(res.text, 'html.parser')
            # Use the page's <h1> text as the file name; get_text() is safer
            # than .string, which returns None when <h1> has nested tags
            title = soup.h1.get_text(strip=True)
            # Collect all <script> tags
            script_all = soup.find_all('script')
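            # (An equivalent single lookup would be:
            #   soup.find('script', string=lambda t: t and 'window.__playinfo__=' in t))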
            for script in script_all:
                content = script.text  # text content of the <script> tag
                # Look for the tag that carries the embedded play info
                if 'window.__playinfo__=' in content:
                    # Strip the JS prefix, leaving just the JSON payload
                    content_res = content[len('window.__playinfo__='):]
                    # Parse the JSON string into a dict
                    data = json.loads(content_res)
                    # data['data']['dash'] lists the DASH streams; take the
                    # first video stream's URL
                    video_url = data['data']['dash']['video'][0]['baseUrl']
                    # Many hosts check the Referer header on stream requests,
                    # so point it at the page we just scraped
                    headers['Referer'] = movie['url']
                    # Download the video stream
                    video_res = requests.get(video_url, headers=headers)
                    with open('{}.mp4'.format(title), 'wb') as v_file:
                        v_file.write(video_res.content)
                    print('Video file: {}.mp4'.format(title))
                    # Take the first audio stream's URL
                    audio_url = data['data']['dash']['audio'][0]['baseUrl']
                    # Download the audio stream
                    audio_res = requests.get(audio_url, headers=headers)
                    with open('{}.mp3'.format(title), 'wb') as a_file:
                        a_file.write(audio_res.content)
                    print('Audio file: {}.mp3'.format(title))
                    # Merge the audio and video tracks with moviepy
                    video_clip = VideoFileClip('{}.mp4'.format(title))
                    audio_clip = AudioFileClip('{}.mp3'.format(title))
                    merge_result = video_clip.set_audio(audio_clip)
                    merge_result.write_videofile('merge-{}.mp4'.format(title))
                    # Release the file handles held by the clips
                    video_clip.close()
                    audio_clip.close()
    else:
        print('Request failed, status code: {}'.format(response.status_code))
if __name__ == '__main__':
    main()
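Two refinements are worth noting. First, page titles can contain characters that are illegal in file names (slashes, colons, quotes, and so on), which would make the open() calls above fail. A minimal sanitizer, assuming Windows-style filename rules (safe_filename is a hypothetical helper, not part of the script above):

import re

def safe_filename(name):
    # Replace characters that Windows forbids in file names; the exact
    # character set titles may contain is an assumption
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

Wrapping each '{}.mp4'.format(title) argument as safe_filename(title) is enough to apply it.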
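Second, requests.get(...).content buffers the whole response in memory, which is wasteful for large video files. A streaming variant using requests' documented stream=True and iter_content API (download_stream is a hypothetical helper sketched here, not part of the original script):

def download_stream(url, filename, headers):
    # Write the response to disk in 8 KB chunks instead of buffering it all
    with requests.get(url, headers=headers, stream=True) as r:
        r.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

The two requests.get calls that fetch video_url and audio_url could then be replaced with download_stream(video_url, '{}.mp4'.format(title), headers) and the audio equivalent.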