Crawling People's Daily Articles with Python: Filtering by Date and Keyword and Downloading the Results
import requests
import json
import os
def crawl_people_daily(start_date, end_date, keyword):
    url = 'http://search.people.com.cn/api-search/elasticSearch/search'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
        'Referer': 'http://search.people.com.cn/cnpeople/search.do'
    }
    params = {
        'keyword': keyword,
        'pageNum': 1,
        'pageSize': 20,
        'facetFlag': 'false',
        'siteName': '人民网',
        'range': 'PD',
        'nodeType': 'belongsId',
        'nodeId': '0',
        'sortType': '1',
        'startTime': start_date,
        'endTime': end_date
    }
    articles = []
    total_count = 0
    while True:
        try:
            response = requests.get(url, headers=headers, params=params)
            data = response.json()
            # Stop when the response is empty or the current page has no items,
            # otherwise the loop would keep incrementing pageNum forever
            if not data or not data.get('items'):
                break
            for item in data['items']:
                title = item['title']
                content = item['content']
                pub_time = item['pubTime']
                source_name = item['sourceName']
                articles.append({'title': title, 'content': content, 'pub_time': pub_time, 'source_name': source_name})
            total_count += len(data['items'])
            params['pageNum'] += 1
        except Exception as e:
            print('An error occurred while crawling:', e)
            break
    # Save each article as a numbered .txt file under ./articles
    save_path = os.path.join(os.getcwd(), 'articles')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    count = 0
    for article in articles:
        count += 1
        filename = f"{count}.txt"
        file_path = os.path.join(save_path, filename)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(f"标题:{article['title']}\n")
            f.write(f"发布时间:{article['pub_time']}\n")
            f.write(f"来源:{article['source_name']}\n")
            f.write(f"内容:\n{article['content']}\n")
    # Ratio of saved articles to fetched items
    accuracy = count / total_count if total_count > 0 else 0
    print(f"共爬取到{count}篇文章,正确率:{accuracy * 100:.2f}%")
if __name__ == '__main__':
    start_date = input("请输入开始时间(格式:yyyy-mm-dd):")
    end_date = input("请输入结束时间(格式:yyyy-mm-dd):")
    keyword = input("请输入关键词:")
    crawl_people_daily(start_date, end_date, keyword)
You can adjust the save path and the filename format to suit your needs (a sketch of a more descriptive naming scheme follows the install command below). Note that the code uses the requests library to send HTTP requests and the standard-library os module for file operations, so only requests needs to be installed:
pip install requests
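For example, if you want more descriptive filenames, you could derive them from the publication time and title instead of a plain counter. The helper below is a minimal sketch, not part of the original script; the regex, the 50-character cap, and the assumption that pub_time begins with a yyyy-mm-dd date are illustrative choices you may need to adapt to the actual API response.

import re

def make_filename(article, index):
    # Replace characters that are not allowed in filenames and keep the title short
    safe_title = re.sub(r'[\\/:*?"<>|\s]+', '_', article['title'])[:50]
    # Assumes pub_time starts with a date such as 2023-01-01; adjust if the API returns a timestamp
    return f"{index}_{article['pub_time'][:10]}_{safe_title}.txt"

You would then replace filename = f"{count}.txt" with filename = make_filename(article, count).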
Save the code as a standalone .py file and run it. Follow the prompts to enter the start date, end date, and keyword; the program will then crawl and save the matching articles and print the number of articles crawled together with the accuracy rate in the console.
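If you prefer to run the crawler non-interactively (for example from another script or a scheduled task), you can replace the input() calls in the __main__ block with hard-coded arguments; the date range and keyword below are placeholders, not values from the original article.

if __name__ == '__main__':
    # Placeholder values; substitute your own date range and keyword
    crawl_people_daily('2023-01-01', '2023-01-31', '人工智能')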