# 人民日报新闻文章下载器 - Python爬虫实现 - 常规 (People's Daily news article downloader - Python crawler implementation - general)

import datetime
import os
import time

import bs4
import requests

def fetchUrl(url, timeout=10):
    """Fetch ``url`` with a desktop-browser User-Agent and return the body as text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status.
        requests.exceptions.RequestException: For connection failures or
            when ``timeout`` elapses.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/88.0.4324.96 Safari/537.36'
        )
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    # Decode using the encoding guessed from the body rather than the one
    # declared in the response headers.
    r.encoding = r.apparent_encoding
    return r.text

def NewsCategories():
    """Return the link texts found in the ``<div id="page">`` block.

    The listing is read from one fixed layout page (2023-08-19, ``_01``).

    Returns:
        A list with the text of every ``<a>`` inside the first ``<ul>`` of
        that block, or an empty list when the block or its ``<ul>`` is
        absent.
    """
    url = 'http://paper.people.com.cn/rmrb/html/2023-08/19/nbs.D110000renmrb_01.htm'
    html = fetchUrl(url)
    bsobj = bs4.BeautifulSoup(html, 'html.parser')

    page_div = bsobj.find('div', attrs={'id': 'page'})
    # ``page_div.ul`` is None when the div contains no <ul>; treat that the
    # same as a missing div instead of failing with AttributeError.
    if page_div is None or page_div.ul is None:
        return []
    return [a.text for a in page_div.ul.find_all('a')]

def getPage(year, month, day, category):
    """List the article links of one day's layout page that match ``category``.

    Only the ``_01`` layout page of the given date is fetched.

    Args:
        year: Year component as it appears in the URL (e.g. ``'2023'``).
        month: Month component as it appears in the URL (e.g. ``'08'``).
        day: Day component as it appears in the URL (e.g. ``'19'``).
        category: Substring that must occur in a list entry's text for the
            entry to be kept.

    Returns:
        The ``href`` values of the matching ``<li>`` entries inside
        ``<div id="titleList">``. Empty when the div or its ``<ul>`` is
        missing.

    Raises:
        requests.exceptions.HTTPError: Propagated from ``fetchUrl`` when the
            page cannot be retrieved.
    """
    url = f'http://paper.people.com.cn/rmrb/html/{year}-{month}/{day}/nbs.D110000renmrb_01.htm'
    html = fetchUrl(url)
    bsobj = bs4.BeautifulSoup(html, 'html.parser')

    title_list = bsobj.find('div', attrs={'id': 'titleList'})
    if title_list is None or title_list.ul is None:
        return []

    links = []
    for item in title_list.ul.find_all('li'):
        if category not in item.text:
            continue
        # An <li> without an <a>, or an <a> without an href, carries no
        # usable link; skip it rather than crash or return None.
        anchor = item.a
        href = anchor.get('href') if anchor is not None else None
        if href:
            links.append(href)
    return links

def getContent(pageUrl):
    """Download one article page and return its text.

    Args:
        pageUrl: Path appended to ``http://paper.people.com.cn/rmrb/html/``.

    Returns:
        The text of the page's ``<div class="text_c">`` with leading and
        trailing whitespace removed.

    Raises:
        ValueError: If the page contains no ``<div class="text_c">``.
        requests.exceptions.HTTPError: Propagated from ``fetchUrl``.
    """
    # NOTE(review): the listing page lives under html/<year>-<month>/<day>/;
    # if the hrefs passed in here are relative to that directory this URL
    # lacks the dated part and will 404 -- confirm against the live site.
    url = f'http://paper.people.com.cn/rmrb/html/{pageUrl}'
    html = fetchUrl(url)
    bsobj = bs4.BeautifulSoup(html, 'html.parser')

    body = bsobj.find('div', attrs={'class': 'text_c'})
    if body is None:
        # Fail with a message naming the page instead of an opaque
        # "'NoneType' object has no attribute 'get_text'".
        raise ValueError(f'No <div class="text_c"> found in {url}')
    return body.get_text().strip()

def saveFile(content, path, filename):
    """Write ``content`` to ``path/filename`` as UTF-8, creating directories.

    Args:
        content: Text to write; an existing file is overwritten.
        path: Target directory. An empty string means the current working
            directory.
        filename: Name of the file inside ``path``.
    """
    save_path = os.path.join(path, filename)
    parent = os.path.dirname(save_path)
    # ``exist_ok=True`` avoids the check-then-create race, and the emptiness
    # guard avoids ``os.makedirs('')`` failing when no directory was given.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(content)

def calculateAccuracy(category, total_samples, category_count):
    """Return ``category_count`` as a percentage of ``total_samples``.

    ``category`` is accepted but does not take part in the computation.
    When ``total_samples`` is zero the integer ``0`` is returned instead of
    dividing.
    """
    if total_samples != 0:
        return (category_count / total_samples) * 100
    return 0

def predictCategory(content):
    """Placeholder classifier: ignores ``content`` and returns a fixed label."""
    return 'Example Category'

def downloadArticles(beginDate_str, endDate_str, category, save_path):
    """Download every matching article in a date range and print a summary.

    For each calendar day from ``beginDate_str`` to ``endDate_str``
    (inclusive) the day's listing is fetched with ``getPage``; each returned
    link is downloaded with ``getContent``, passed to ``predictCategory``
    and written to ``save_path`` via ``saveFile``. Totals and the share of
    predictions equal to ``category`` are printed at the end.

    Args:
        beginDate_str: First day, formatted ``YYYYMMDD``.
        endDate_str: Last day, formatted ``YYYYMMDD``. A value earlier than
            ``beginDate_str`` processes no days.
        category: Keyword used to filter the listing and compared against
            each prediction.
        save_path: Directory the article text files are written to.

    Raises:
        ValueError: If either date string does not match ``YYYYMMDD``.
    """
    total_samples = 0
    correct_predictions = 0

    begin_date = datetime.datetime.strptime(beginDate_str, "%Y%m%d").date()
    end_date = datetime.datetime.strptime(endDate_str, "%Y%m%d").date()

    # Work in calendar dates: stepping a local timestamp by 86400 seconds
    # miscounts, repeats or skips a day whenever a DST change falls inside
    # the range.
    day_count = (end_date - begin_date).days + 1

    for offset in range(day_count):
        current = begin_date + datetime.timedelta(days=offset)
        year = f'{current.year:04d}'
        month = f'{current.month:02d}'
        day = f'{current.day:02d}'

        try:
            pageList = getPage(year, month, day, category)
        except requests.exceptions.HTTPError as e:
            print(f'Requested page not found: {str(e)}')
            continue
        except requests.exceptions.RequestException as e:
            # A connection error or timeout on one day should not abort the
            # remaining days or lose the final summary.
            print(f'Request failed for {year}-{month}-{day}: {e}')
            continue

        for pageUrl in pageList:
            # Counted before the download so failed articles still show up
            # in the total.
            total_samples += 1

            try:
                content = getContent(pageUrl)
                prediction = predictCategory(content)

                if prediction == category:
                    correct_predictions += 1

                # Path separators in the link would point the file into a
                # directory that does not exist; flatten them.
                safe_name = pageUrl.replace('/', '_').replace('\\', '_')
                filename = f'{year}{month}{day}_{safe_name}.txt'
                saveFile(content, save_path, filename)
                print(f'Successfully downloaded {filename}')
            except Exception as e:
                # Best effort: report the failure and move on to the next
                # article.
                print(f'Error occurred while downloading: {str(e)}')

    accuracy = calculateAccuracy(category, total_samples, correct_predictions)
    print(f'Total samples: {total_samples}')
    print(f'Correct predictions: {correct_predictions}')
    print(f'Accuracy: {accuracy}%')

# Script entry point: prompt for the date range, keyword and output
# directory, then run the download. The guard must compare ``__name__`` with
# '__main__'; a bare ``name`` is undefined and raises NameError.
if __name__ == '__main__':
    beginDate_str = input('输入开始时间 (YYYYMMDD): ')
    endDate_str = input('输入结束时间 (YYYYMMDD): ')
    category = input('输入关键词: ')
    save_path = input('输入的路径: ')
    downloadArticles(beginDate_str, endDate_str, category, save_path)