以下是一个使用Python爬取人民日报搜索结果并将相关文章下载到本地的示例代码:

import requests
from bs4 import BeautifulSoup
import os

def crawl_people_daily(start_date, end_date, keyword):
    """Crawl People's Daily search results for *keyword* between two dates.

    Every matching article is downloaded into an ``articles`` folder under
    the current working directory. At the end, the number of fetched
    articles and the percentage whose body actually contains the keyword
    are printed.

    Args:
        start_date: Search window start, formatted ``YYYY-MM-DD``.
        end_date: Search window end, formatted ``YYYY-MM-DD``.
        keyword: Keyword to search for.

    Raises:
        requests.HTTPError: If any HTTP request returns an error status.
    """
    url = 'http://search.people.com.cn/cnpeople/news/getNewsResult.jsp'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'http://search.people.com.cn/cnpeople/news/newsList'
    }

    params = {
        'keyword': keyword,
        'pageNum': 1,
        'startDate': start_date,
        'endDate': end_date
    }

    # Fail fast on HTTP errors instead of producing a confusing JSON decode error.
    response = requests.get(url, headers=headers, params=params)
    response.raise_for_status()
    data = response.json()
    total_page = data['totalPage']

    # Collect article URLs across all result pages.
    articles = []
    for page in range(1, total_page + 1):
        params['pageNum'] = page
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()
        for item in data.get('items', []):
            articles.append(item['url'])

    # Create the output folder once, outside the download loop.
    folder_path = os.path.join(os.getcwd(), 'articles')
    os.makedirs(folder_path, exist_ok=True)

    count = 0
    correct = 0

    for article_url in articles:
        response = requests.get(article_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Skip pages that don't match the expected article layout instead of
        # crashing with an AttributeError on a None result from find().
        title_div = soup.find('div', class_='clearfix w1000_320 text_title')
        content_div = soup.find('div', class_='box_con')
        if title_div is None or title_div.h1 is None or content_div is None:
            continue
        title = title_div.h1.text
        content = content_div.text

        if keyword in content:
            correct += 1

        count += 1

        # Strip characters that are illegal in file names (Windows + POSIX),
        # otherwise a title containing e.g. '/' or '?' breaks open().
        safe_title = ''.join(c for c in title if c not in '\\/:*?"<>|').strip()
        file_path = os.path.join(folder_path, f'{safe_title}.txt')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

    # Guard against ZeroDivisionError when no articles were fetched.
    accuracy = correct / count * 100 if count else 0.0

    print(f'共爬取到{count}篇文章')
    print(f'正确率:{accuracy}%')

if __name__ == '__main__':
    # Prompt the user for the search window and keyword, then run the crawler.
    user_inputs = (
        input('请输入开始时间(格式:YYYY-MM-DD):'),
        input('请输入结束时间(格式:YYYY-MM-DD):'),
        input('请输入关键词:'),
    )
    crawl_people_daily(*user_inputs)

在运行.py文件后,您需要按照提示输入开始时间、结束时间和关键词。然后,程序将爬取人民日报中包含关键词的文章,并将相关文章下载到当前路径下名为'articles'的文件夹中。最后,程序会在窗口中显示爬取的文章数量以及正文中确实包含关键词的文章比率。请确保已安装requests和beautifulsoup4库。

Python爬取人民日报文章并下载相关内容

原文地址: https://www.cveoy.top/t/topic/qptB 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录