# Python crawler: scrape Sina News daily top headlines and store them in a database.
# Import the required libraries.
import datetime   # date arithmetic for walking backwards day by day
import json       # parse the ranking JSON payload
import re         # extract the JSON from its JavaScript wrapper
import traceback  # log per-article failures without aborting the crawl

import requests                # HTTP requests
from bs4 import BeautifulSoup  # HTML parsing

import models  # project database models
# Request headers for the Sina top-news ranking endpoint. The Cookie and
# User-Agent mimic a real browser session so the endpoint returns data.
# Fix: the Accept value had been mangled to "/" (markup ate the asterisks);
# restored to the standard wildcard "*/*".
headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,ja;q=0.8,ru;q=0.7",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Cookie": "UOR=www.baidu.com,tousu.sina.com.cn,; SINAGLOBAL=120.230.99.74_1660876060.256283; Apache=120.230.99.28_1663207019.801481; ULV=1663207021426:4:3:2:120.230.99.28_1663207019.801481:1663207018950; rotatecount=2; Hm_lvt_fcf72dc8287d20a78b3dfd301a50cbf8=1663207027; Hm_lpvt_fcf72dc8287d20a78b3dfd301a50cbf8=1663207027",
    "Host": "top.news.sina.com.cn",
    "Pragma": "no-cache",
    "Referer": "http://news.sina.com.cn/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
}
# Most recent date to crawl; the main loop steps backwards from here.
start_datetime = datetime.datetime(2023, 3, 9)
# Headers for the article detail pages. They are constant, so build them once
# here instead of rebuilding the dict for every article (as the old code did).
headers1 = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9,ja;q=0.8,ru;q=0.7",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "sec-ch-ua": "\"Google Chrome\";v=\"105\", \"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"105\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
}

# Walk backwards from the start date, one day at a time, and harvest each
# day's top-20 world-news ranking into the database.
for i in range(30):
    # Date under scrutiny, formatted the way the ranking API expects.
    datetime1 = (start_datetime + datetime.timedelta(days=-i)).strftime("%Y%m%d")
    print(datetime1)
    url = 'http://top.news.sina.com.cn/ws/GetTopDataList.php?top_type=day&top_cat=news_world_suda&top_time={}&top_show_num=20&top_order=DESC&js_var=news_'.format(datetime1)

    # Fetch the ranking page; the payload is a JS assignment wrapping JSON.
    h = requests.get(url=url, headers=headers, timeout=30)

    # Extract the JSON object assigned to the `news_` JavaScript variable.
    # Guard against days with no data: the old unchecked info_json[0] raised
    # IndexError outside any try block and killed the whole crawl.
    info_json = re.findall(r'var news_ = (.*);', h.text)
    if not info_json:
        print('no ranking data for {}'.format(datetime1))
        continue
    info_json1 = json.loads(info_json[0])

    # Visit each ranked article and store it if it is not already known.
    for resu in info_json1['data']:
        try:
            print(resu)
            # Basic metadata straight from the ranking entry.
            title = resu['title']
            media = resu['media']
            url1 = resu['url']
            top_time = resu['top_time']

            # Fetch the article page. Sina serves UTF-8 but requests often
            # mis-guesses the charset, so decode the raw bytes explicitly
            # instead of the old encode(guess).decode('utf-8') round-trip;
            # errors='replace' keeps one bad byte from losing the article.
            h1 = requests.get(url=url1, headers=headers1, timeout=30)
            text = h1.content.decode('utf-8', errors='replace')
            soup1 = BeautifulSoup(text, 'html.parser')
            article = soup1.select_one('#article')
            if article is None:
                print('no #article element at {}'.format(url1))
                continue
            content = article.text
            print(content)

            # Insert only if this URL has not been stored yet (URL is the
            # de-duplication key).
            if not models.XinWen.query.filter(models.XinWen.url == url1).all():
                models.db.session.add(
                    models.XinWen(
                        title=title,
                        media=media,
                        url=url1,
                        top_time=top_time,
                        content=content,
                    )
                )
                models.db.session.commit()
        except Exception:
            # Log the failure and keep crawling the remaining articles.
            print(traceback.format_exc())
# Original source: https://www.cveoy.top/t/topic/jx58 — copyright belongs to the author; do not repost or scrape.