Python爬取人民日报文章并下载相关内容
以下是一个使用Python爬取人民日报文章并下载相关文章的示例代码:
import os
import re

import requests
from bs4 import BeautifulSoup
def crawl_people_daily(start_date, end_date, keyword):
    """Search People's Daily for articles matching *keyword* and save them locally.

    Queries the site's search endpoint page by page, downloads every result
    article, writes each one to ``./articles/<title>.txt``, and prints how many
    articles were fetched and what fraction actually contain the keyword.

    :param start_date: search window start, ``YYYY-MM-DD``
    :param end_date:   search window end, ``YYYY-MM-DD``
    :param keyword:    query term; also used to compute the hit rate
    :raises requests.RequestException: on network failures
    """
    url = 'http://search.people.com.cn/cnpeople/news/getNewsResult.jsp'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'http://search.people.com.cn/cnpeople/news/newsList'
    }
    params = {
        'keyword': keyword,
        'pageNum': 1,
        'startDate': start_date,
        'endDate': end_date
    }
    # First request only to learn the page count.
    # NOTE(review): assumes the endpoint returns JSON with 'totalPage' and
    # 'items' keys — verify against the live API before relying on this.
    response = requests.get(url, headers=headers, params=params, timeout=10)
    data = response.json()
    total_page = data['totalPage']

    # Collect every article URL across all result pages.
    articles = []
    for page in range(1, total_page + 1):
        params['pageNum'] = page
        response = requests.get(url, headers=headers, params=params, timeout=10)
        data = response.json()
        for item in data['items']:
            articles.append(item['url'])

    # Create the output folder once, up front (was re-checked per article).
    folder_path = os.path.join(os.getcwd(), 'articles')
    os.makedirs(folder_path, exist_ok=True)

    count = 0
    correct = 0
    for article_url in articles:
        response = requests.get(article_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')
        title_node = soup.find('div', class_='clearfix w1000_320 text_title')
        content_node = soup.find('div', class_='box_con')
        if title_node is None or title_node.h1 is None or content_node is None:
            # Page layout differs (or the page failed to load) — skip instead
            # of letting one bad article abort the whole crawl.
            continue
        title = title_node.h1.text
        content = content_node.text
        if keyword in content:
            correct += 1
        count += 1
        # Strip characters that are illegal in filenames on common platforms.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
        file_path = os.path.join(folder_path, f'{safe_title}.txt')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

    # Guard against division by zero when no article was retrieved.
    accuracy = correct / count * 100 if count else 0.0
    print(f'共爬取到{count}篇文章')
    print(f'正确率:{accuracy}%')
if __name__ == '__main__':
    # Interactively gather the search window and query term, then crawl.
    begin = input('请输入开始时间(格式:YYYY-MM-DD):')
    finish = input('请输入结束时间(格式:YYYY-MM-DD):')
    term = input('请输入关键词:')
    crawl_people_daily(begin, finish, term)
运行该 .py 文件后,按照提示依次输入开始时间、结束时间和关键词。程序将爬取人民日报中包含该关键词的文章,并将文章正文保存到当前路径下名为'articles'的文件夹中。最后,程序会输出爬取到的文章数量以及正文中实际包含关键词的比率(正确率)。运行前请确保已安装 requests 和 beautifulsoup4 库。
原文地址: https://www.cveoy.top/t/topic/qptB 著作权归作者所有。请勿转载和采集!