人民日报新闻爬虫:自动下载指定日期、类别新闻文章
import requests
import bs4
import os
import time
def fetchUrl(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36'
}
r = requests.get(url, headers=headers)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
def NewsCategories():
url = 'http://paper.people.com.cn/rmrb/html/2023-08/19/nbs.D110000renmrb_01.htm'
html = fetchUrl(url)
bsobj = bs4.BeautifulSoup(html, 'html.parser')
categories = []
temp = bsobj.find('div', attrs={'id': 'page'})
if temp:
categories = [a.text for a in temp.ul.find_all('a')]
return categories
def getPage(year, month, day, category):
url = f'http://paper.people.com.cn/rmrb/html/{year}-{month}/{day}/nbs.D110000renmrb_01.htm'
html = fetchUrl(url)
bsobj = bs4.BeautifulSoup(html, 'html.parser')
pageList = []
temp = bsobj.find('div', attrs={'id': 'titleList'})
if temp:
pageList = temp.ul.find_all('li')
return [page.a.get('href') for page in pageList if category in page.text]
def getContent(pageUrl):
url = f'http://paper.people.com.cn/rmrb/html/{pageUrl}'
html = fetchUrl(url)
bsobj = bs4.BeautifulSoup(html, 'html.parser')
content = bsobj.find('div', attrs={'class': 'text_c'}).get_text()
return content.strip()
def saveFile(content, path, filename):
save_path = os.path.join(path, filename)
if not os.path.exists(path):
os.makedirs(path)
with open(save_path, 'w', encoding='utf-8') as f:
f.write(content)
def calculateAccuracy(category, total_samples, category_count):
if total_samples == 0:
return 0
accuracy = category_count / total_samples
return accuracy * 100
def predictCategory(content):
predicted_category = 'Example Category'
return predicted_category
def downloadArticles(beginDate_str, endDate_str, category, save_path):
category_count = 0
total_samples = 0
correct_predictions = 0
beginDate = time.mktime(time.strptime(beginDate_str, "%Y%m%d"))
endDate = time.mktime(time.strptime(endDate_str, "%Y%m%d"))
date_diff = int((endDate - beginDate) / 86400) + 1
for i in range(date_diff):
current_date = time.strftime("%Y%m%d", time.localtime(beginDate + i * 86400))
year = current_date[:4]
month = current_date[4:6]
day = current_date[6:]
try:
pageList = getPage(year, month, day, category)
for pageUrl in pageList:
category_count += 1
total_samples += 1
try:
content = getContent(pageUrl)
prediction = predictCategory(content)
if prediction == category:
correct_predictions += 1
filename = f'{year}{month}{day}_{pageUrl}.txt'
saveFile(content, save_path, filename)
print(f'Successfully downloaded {filename}')
except Exception as e:
print(f'Error occurred while downloading: {str(e)}')
except requests.exceptions.HTTPError as e:
print(f'Requested page not found: {str(e)}')
accuracy = calculateAccuracy(category, total_samples, correct_predictions)
print(f'Total samples: {total_samples}')
print(f'Correct predictions: {correct_predictions}')
print(f'Accuracy: {accuracy}%')
if __name__ == '__main__':
beginDate_str = input('输入开始时间 (YYYYMMDD): ')
endDate_str = input('输入结束时间 (YYYYMMDD): ')
category = input('输入关键词: ')
save_path = input('输入的路径: ')
downloadArticles(beginDate_str, endDate_str, category, save_path)
代码改进说明:
- 增加代码注释:为代码添加了必要的注释,方便理解代码逻辑和功能。
- 完善分类预测功能:虽然代码中提供了
predictCategory函数,但目前仅为占位符。你可以根据实际需求,使用机器学习模型或其他方法来实现更准确的分类预测功能。 - 异常处理:代码中增加了对网络请求失败和页面解析失败的异常处理,提高程序的健壮性。
- 优化文件保存路径:代码使用
os.path.join方法来拼接文件保存路径,避免手动拼接路径时的错误。 - 修改calculateAccuracy函数:解决了当
total_samples为0时,导致ZeroDivisionError的问题。
使用方法:
- 确保已经安装了
requests和BeautifulSoup4库。 - 运行代码,程序会提示你输入开始时间、结束时间、关键词和保存路径。
- 程序会根据你输入的信息,从人民日报网站上下载指定日期和类别下的新闻文章,并保存到指定的路径。
注意:
- 由于人民日报网站的网页结构可能会发生变化,代码可能需要进行相应的调整才能继续正常工作。
- 爬取网站数据需要遵守网站的robots协议,避免过度爬取导致网站服务器压力过大。
- 请勿将爬取到的数据用于任何商业用途或违法行为。
原文地址: https://www.cveoy.top/t/topic/qo16 著作权归作者所有。请勿转载和采集!