import requests
import os

def crawl_people_daily(start_date, end_date, keyword):
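    # Search people.com.cn for articles that match `keyword` between start_date and
    # end_date (both yyyy-mm-dd), then save each result as a local text file.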
    url = 'http://search.people.com.cn/api-search/elasticSearch/search'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
        'Referer': 'http://search.people.com.cn/cnpeople/search.do'
    }
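    # Query parameters for the search API: keyword, paging options, and the date range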
    params = {
        'keyword': keyword,
        'pageNum': 1,
        'pageSize': 20,
        'facetFlag': 'false',
        'siteName': '人民网',
        'range': 'PD',
        'nodeType': 'belongsId',
        'nodeId': '0',
        'sortType': '1',
        'startTime': start_date,
        'endTime': end_date
    }
    articles = []
    total_count = 0

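    # Page through the search results until an empty page or a request error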
    while True:
        try:
            response = requests.get(url, headers=headers, params=params, timeout=10)
            data = response.json()
            items = data.get('items', [])
            if not items:
                # An empty page means there are no more results, so stop paging
                break

            for item in items:
                title = item['title']
                content = item['content']
                pub_time = item['pubTime']
                source_name = item['sourceName']
                articles.append({'title': title, 'content': content, 'pub_time': pub_time, 'source_name': source_name})

            total_count += len(items)
            params['pageNum'] += 1
        except Exception as e:
            print('An error occurred while crawling:', e)
            break

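    # Save each article as a numbered .txt file under ./articles in the working directory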
    save_path = os.path.join(os.getcwd(), 'articles')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    
    count = 0
    for article in articles:
        count += 1
        filename = f"{count}.txt"
        file_path = os.path.join(save_path, filename)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(f"标题:{article['title']}

")
            f.write(f"发布时间:{article['pub_time']}

")
            f.write(f"来源:{article['source_name']}

")
            f.write(f"内容:
{article['content']}

")
    
    save_rate = count / total_count if total_count > 0 else 0
    print(f"共爬取到{total_count}篇文章,成功保存{count}篇,保存率:{save_rate * 100:.2f}%")

if __name__ == '__main__':
    start_date = input("请输入开始时间(格式:yyyy-mm-dd):")
    end_date = input("请输入结束时间(格式:yyyy-mm-dd):")
    keyword = input("请输入关键词:")
    crawl_people_daily(start_date, end_date, keyword)

You can change the save path and the filename format as needed. Note that the code uses the requests library to send HTTP requests and the os library for file operations. os is part of the standard library, so only requests needs to be installed:

pip install requests

Save the code as a standalone .py file and run it. Enter the start date, end date, and keyword when prompted; the program will then crawl the matching articles, save them, and print how many articles were fetched and how many were saved.
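If you would rather call the crawler from another script than type the dates at the prompt, a minimal sketch looks like this (it assumes the code above is saved as people_daily_crawler.py in the same directory; the module name and the example arguments are only placeholders):

from people_daily_crawler import crawl_people_daily

# Fetch articles matching the keyword within the date range and save them to ./articles
crawl_people_daily('2023-01-01', '2023-01-31', '人工智能')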

Crawling People's Daily Articles with Python: Filtering by Date and Keyword and Downloading
