# Import required libraries.
import requests                 # HTTP requests
from bs4 import BeautifulSoup   # HTML parsing
import datetime                 # date/time handling
import re                       # regular expressions
import json                     # JSON parsing
import traceback                # exception formatting

import models                   # local database models (SQLAlchemy-style)

# Request headers for the ranking-list endpoint (top.news.sina.com.cn).
# NOTE(review): the Cookie value is a captured session cookie — presumably
# optional for this public endpoint; confirm before removing.
headers = {
    "Accept": "*/*",  # was garbled to "/" by text extraction; "*/*" is the standard wildcard
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,ja;q=0.8,ru;q=0.7",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Cookie": "UOR=www.baidu.com,tousu.sina.com.cn,; SINAGLOBAL=120.230.99.74_1660876060.256283; Apache=120.230.99.28_1663207019.801481; ULV=1663207021426:4:3:2:120.230.99.28_1663207019.801481:1663207018950; rotatecount=2; Hm_lvt_fcf72dc8287d20a78b3dfd301a50cbf8=1663207027; Hm_lpvt_fcf72dc8287d20a78b3dfd301a50cbf8=1663207027",
    "Host": "top.news.sina.com.cn",
    "Pragma": "no-cache",
    "Referer": "http://news.sina.com.cn/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
}

# Starting date of the crawl: the most recent day to fetch (2023-03-09).
start_datetime = datetime.datetime(2023, 3, 9)

# Crawl the last 30 days of Sina "world news" daily top-20 lists and store
# each article in the database (skipping URLs already present).
#
# Structure note: the original outer loop was collapsed onto one line and its
# body dedented by text extraction; reconstructed here (`datetime1` is used by
# the URL template, so everything below belongs inside the day loop).

# Headers for the article detail pages — loop-invariant, so build them once.
headers1 = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9,ja;q=0.8,ru;q=0.7",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "sec-ch-ua": "\"Google Chrome\";v=\"105\", \"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"105\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
}

for i in range(30):
    # Step back one day per iteration, formatted as YYYYMMDD for the API.
    datetime1 = (start_datetime + datetime.timedelta(days=-i)).strftime("%Y%m%d")
    print(datetime1)
    url = 'http://top.news.sina.com.cn/ws/GetTopDataList.php?top_type=day&top_cat=news_world_suda&top_time={}&top_show_num=20&top_order=DESC&js_var=news_'.format(datetime1)

    # Fetch the ranking endpoint; the response is a JS assignment wrapping JSON.
    h = requests.get(url=url, headers=headers)
    # print(h.text)

    # Extract the JSON payload from "var news_ = {...};".
    info_json = re.findall(r'var news_ = (.*);', h.text)
    info_json1 = json.loads(info_json[0])

    # Visit each article's detail page and scrape its content.
    for resu in info_json1['data']:
        try:
            print(resu)
            # Title, source outlet, article URL, and publish time from the list entry.
            title = resu['title']
            media = resu['media']
            url1 = resu['url']
            top_time = resu['top_time']

            # Fetch the detail page and re-decode: the pages are UTF-8, but
            # requests may guess a different charset from the headers, so we
            # round-trip through the guessed encoding back to UTF-8.
            h1 = requests.get(url=url1, headers=headers1)
            code = h1.encoding
            text = h1.text.encode(code).decode('utf-8')
            soup1 = BeautifulSoup(text, 'html.parser')
            # Article body lives in the element with id="article".
            content = soup1.select('#article')[0].text
            print(content)

            # Insert only if this URL is not already stored (de-duplication).
            if not models.XinWen.query.filter(models.XinWen.url == url1).all():
                models.db.session.add(
                    models.XinWen(
                        title=title,
                        media=media,
                        url=url1,
                        top_time=top_time,
                        content=content,
                    )
                )
                models.db.session.commit()

        except Exception:
            # Best-effort per-article: log the failure and continue with the
            # next entry. Narrowed from a bare `except:` so Ctrl-C still works.
            print(traceback.format_exc())
# Python scraper: fetches Sina News daily top headlines and stores them in a database.
# Original source: https://www.cveoy.top/t/topic/jx58 — copyright belongs to the author.