人民日报文章下载工具 - 自动抓取和分类 - 常规

import requests
import bs4
import os
import time

def fetchUrl(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.
            Defaults to 10.

    Returns:
        The response body decoded with the encoding detected from its content.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status (raised by ``raise_for_status``).
        requests.exceptions.Timeout: If the server does not respond within
            *timeout* seconds.
    """
    # Browser-style User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36'
    }
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would hang the whole download run.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    # Decode using the encoding guessed from the body rather than the one
    # declared in the response headers.
    r.encoding = r.apparent_encoding
    return r.text

def NewsCategories():
    """Return the link texts found in the ``div#page`` list of a fixed page.

    The page address is hard-coded to the 2023-08-19 issue.

    Returns:
        A list with the text of every ``<a>`` inside the ``<ul>`` of the
        ``<div id="page">`` element, or an empty list when that element or
        its ``<ul>`` is missing.
    """
    url = 'http://paper.people.com.cn/rmrb/html/2023-08/19/nbs.D110000renmrb_01.htm'
    html = fetchUrl(url)
    soup = bs4.BeautifulSoup(html, 'html.parser')

    container = soup.find('div', attrs={'id': 'page'})
    # The <div> may exist without a <ul> child; treat that like a missing
    # <div> instead of failing with AttributeError on None.
    listing = container.ul if container is not None else None
    if listing is None:
        return []
    return [a.text for a in listing.find_all('a')]

def getPage(year, month, day, category):
    """Return article links for one date whose list entry mentions *category*.

    Args:
        year: Four-digit year string used in the page address.
        month: Two-digit month string used in the page address.
        day: Two-digit day string used in the page address.
        category: Keyword; only list items whose text contains it are kept.

    Returns:
        A list of ``href`` values taken from the ``<li>`` entries of the
        ``<div id="titleList">`` element. Empty when the element, its
        ``<ul>``, or any matching entry is missing.

    Raises:
        requests.exceptions.HTTPError: Propagated from ``fetchUrl`` when the
            page for that date cannot be retrieved.
    """
    url = f'http://paper.people.com.cn/rmrb/html/{year}-{month}/{day}/nbs.D110000renmrb_01.htm'
    html = fetchUrl(url)
    soup = bs4.BeautifulSoup(html, 'html.parser')

    container = soup.find('div', attrs={'id': 'titleList'})
    # Guard against a <div> that has no <ul> child.
    listing = container.ul if container is not None else None
    if listing is None:
        return []

    links = []
    for item in listing.find_all('li'):
        if category not in item.text:
            continue
        anchor = item.a
        # An <li> without a link (or a link without href) cannot be
        # downloaded, so skip it instead of raising or returning None.
        href = anchor.get('href') if anchor is not None else None
        if href:
            links.append(href)
    return links

def getContent(pageUrl):
    """Fetch one article page and return its text.

    Args:
        pageUrl: Page name appended to the site's ``/rmrb/html/`` prefix.

    Returns:
        The text of the ``<div class="text_c">`` element with leading and
        trailing whitespace removed.

    Raises:
        ValueError: If the page has no ``<div class="text_c">`` element.
        requests.exceptions.HTTPError: Propagated from ``fetchUrl``.
    """
    url = f'http://paper.people.com.cn/rmrb/html/{pageUrl}'
    html = fetchUrl(url)
    soup = bs4.BeautifulSoup(html, 'html.parser')

    body = soup.find('div', attrs={'class': 'text_c'})
    if body is None:
        # Report the real problem instead of an opaque
        # "'NoneType' object has no attribute 'get_text'".
        raise ValueError(f'No article body (div.text_c) found in {url}')
    return body.get_text().strip()

def saveFile(content, path, filename):
    """Write *content* to ``path/filename`` as UTF-8, creating *path* if needed.

    Args:
        content: Text to write.
        path: Target directory. An empty string means the current working
            directory.
        filename: Name of the file inside *path*. An existing file is
            overwritten.
    """
    save_path = os.path.join(path, filename)
    # os.makedirs('') raises FileNotFoundError, so only create a directory
    # when one was actually given. exist_ok avoids the check-then-create race.
    if path:
        os.makedirs(path, exist_ok=True)
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(content)

def calculateAccuracy(category, total_samples, category_count):
    """Return ``category_count`` as a percentage of ``total_samples``.

    Args:
        category: Unused; kept so existing callers keep working.
        total_samples: Number of samples examined.
        category_count: Number of samples counted as hits.

    Returns:
        The ratio multiplied by 100, or ``0.0`` when ``total_samples`` is 0.
    """
    # No samples means no meaningful ratio; report 0% instead of raising
    # ZeroDivisionError.
    if total_samples == 0:
        return 0.0
    accuracy = category_count / total_samples
    return accuracy * 100

def predictCategory(content):
    """Placeholder classifier: ignore *content* and return a fixed label."""
    return 'Example Category'

def downloadArticles(beginDate_str, endDate_str, category, save_path):
    """Download matching articles for every day in a date range.

    For each day from *beginDate_str* to *endDate_str* (inclusive) the
    article links returned by ``getPage`` are fetched with ``getContent``,
    saved with ``saveFile`` and passed to ``predictCategory``. A prediction
    counts as correct when it equals *category*. A summary is printed at the
    end.

    Args:
        beginDate_str: First day, formatted ``YYYYMMDD``.
        endDate_str: Last day, formatted ``YYYYMMDD``.
        category: Keyword handed to ``getPage`` and compared with predictions.
        save_path: Directory the article text files are written to.

    Raises:
        ValueError: If either date string does not match ``YYYYMMDD``.
    """
    # Local import keeps this function self-contained.
    from datetime import datetime, timedelta

    total_samples = 0
    correct_predictions = 0

    # Work with calendar dates: adding fixed 86400-second steps to a local
    # timestamp can repeat or skip a day across a daylight-saving change.
    begin_day = datetime.strptime(beginDate_str, "%Y%m%d").date()
    end_day = datetime.strptime(endDate_str, "%Y%m%d").date()
    day_count = (end_day - begin_day).days + 1

    for offset in range(day_count):
        stamp = (begin_day + timedelta(days=offset)).strftime("%Y%m%d")
        year, month, day = stamp[:4], stamp[4:6], stamp[6:]

        try:
            pageList = getPage(year, month, day, category)
        except requests.exceptions.HTTPError as e:
            print(f'Requested page not found: {str(e)}')
            continue
        except requests.exceptions.RequestException as e:
            # Timeouts and connection errors for one day should not abort
            # the remaining days.
            print(f'Request failed for {stamp}: {str(e)}')
            continue

        for pageUrl in pageList:
            total_samples += 1

            try:
                content = getContent(pageUrl)
                prediction = predictCategory(content)

                if prediction == category:
                    correct_predictions += 1

                # The href comes from a web page; strip path separators so
                # it cannot point outside save_path.
                safe_name = pageUrl.replace('/', '_').replace('\\', '_')
                filename = f'{year}{month}{day}_{safe_name}.txt'
                saveFile(content, save_path, filename)
                print(f'Successfully downloaded {filename}')
            except Exception as e:
                # Best effort: report the failure and move on to the next
                # article.
                print(f'Error occurred while downloading: {str(e)}')

    # With no samples there is nothing to divide by; report 0%.
    if total_samples:
        accuracy = calculateAccuracy(category, total_samples, correct_predictions)
    else:
        accuracy = 0.0
    print(f'Total samples: {total_samples}')
    print(f'Correct predictions: {correct_predictions}')
    print(f'Accuracy: {accuracy}%')

if __name__ == '__main__':
    # Ask for the four arguments in the order downloadArticles expects them:
    # begin date, end date, keyword, save directory.
    prompts = (
        '输入开始时间 (YYYYMMDD): ',
        '输入结束时间 (YYYYMMDD): ',
        '输入关键词: ',
        '输入保存路径: ',
    )
    answers = [input(prompt) for prompt in prompts]
    downloadArticles(*answers)

该代码使用了 requests 库获取网页内容，bs4 库解析 HTML 结构，并利用 os 库保存文件。程序可以根据用户输入的日期范围和关键词，自动下载对应文章并保存到指定目录。

使用方法：

确保已经安装 requests 和 bs4 库，可以使用 pip install requests beautifulsoup4 命令安装。
运行程序，程序会提示用户输入开始时间、结束时间、关键词和保存路径。
程序会自动下载并保存文章到指定目录。

注意事项：

该程序仅供学习使用，请勿用于任何违法行为。
人民日报网站可能随时调整网页结构，程序可能需要修改才能正常运行。
predictCategory 函数目前只是占位实现，始终返回固定的 'Example Category'；除非输入的关键词恰好等于该字符串，否则统计出的准确率恒为 0。用户需要根据实际情况修改 predictCategory 函数，实现真正的分类。

代码解释：

fetchUrl 函数用于获取网页内容，并进行编码处理。
NewsCategories 函数用于获取新闻类别列表（页面地址固定为 2023-08-19 的版面页，且主流程中并未调用该函数）。
getPage 函数用于获取指定日期版面页（nbs.D110000renmrb_01.htm）中，列表项文本包含关键词的文章链接列表。
getContent 函数用于获取文章内容。
saveFile 函数用于保存文章内容到文件。
calculateAccuracy 函数用于计算分类准确率。
predictCategory 函数用于预测文章类别。
downloadArticles 函数用于下载文章并进行分类。

本程序是一个简单示例，用户可以根据自身需求进行扩展和修改。

希望本工具能够帮助您更方便地获取人民日报文章。