Python爬取人民日报文章 - 自动下载关键词相关内容
# Example: crawl People's Daily (人民日报) articles with the requests and
# BeautifulSoup libraries, save every article that contains a keyword under a
# given directory, and print the number of crawled articles together with the
# ratio of articles that matched the keyword.

import datetime
import logging
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Front-page index of one issue; filled with (year, "YYYY-MM/DD").
# NOTE(review): the template yields ".../html/2023/2023-01/05/..." (the year
# appears twice). Kept exactly as in the original sample -- confirm against
# the live site layout before relying on it.
_INDEX_URL_TEMPLATE = (
    "http://paper.people.com.cn/rmrb/html/{}/{}/nbs.D110000renmrb_01.htm"
)

# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 10

# Characters that are illegal or unsafe in file names on common platforms.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _fetch_soup(url):
    """Download *url* and return it parsed, or ``None`` if the request fails."""
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable page (e.g. a day without an issue) must not abort
        # the whole crawl; report it and let the caller skip it.
        logger.warning("Skipping %s: %s", url, exc)
        return None
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def _safe_filename(title):
    """Turn an article title into a name that is safe to use as a file name."""
    cleaned = _UNSAFE_FILENAME_CHARS.sub("_", title.strip())
    return cleaned or "untitled"


def crawl_articles(start_date, end_date, keyword, save_path):
    """Crawl the issues between two dates and save articles matching a keyword.

    Args:
        start_date: First day to crawl (inclusive); a ``datetime.date`` or
            ``datetime.datetime``.
        end_date: Last day to crawl (inclusive); same type as ``start_date``.
        keyword: Text that must occur in an article body for it to be saved.
        save_path: Directory receiving one ``<title>.txt`` file per matching
            article. It is created if missing; an empty string means the
            current working directory.

    Returns:
        A tuple ``(article_count, correctness_ratio)``: the number of articles
        whose body was successfully retrieved, and the fraction of those that
        contained ``keyword`` (``0.0`` when nothing was retrieved).
    """
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    article_count = 0
    correct_count = 0
    one_day = datetime.timedelta(days=1)

    current_date = start_date
    while current_date <= end_date:
        date_str = current_date.strftime("%Y-%m/%d")
        index_url = _INDEX_URL_TEMPLATE.format(date_str[:4], date_str)
        # Advance first so every `continue` below still moves to the next day.
        current_date += one_day

        index_soup = _fetch_soup(index_url)
        if index_soup is None:
            continue

        for article in index_soup.find_all("div", class_="news_li"):
            title_tag = article.find("h3")
            link_tag = article.find("a")
            # Entries without a title or link cannot be fetched or saved.
            if title_tag is None or link_tag is None or not link_tag.get("href"):
                continue

            # urljoin resolves the href against the index page, matching the
            # original same-directory URL for plain relative links while also
            # coping with absolute ones.
            article_soup = _fetch_soup(urljoin(index_url, link_tag["href"]))
            if article_soup is None:
                continue

            body_tag = article_soup.find("div", class_="text_c")
            if body_tag is None:
                continue
            article_content = body_tag.text

            article_count += 1
            if keyword in article_content:
                correct_count += 1
                file_name = "{}.txt".format(_safe_filename(title_tag.text))
                target = os.path.join(save_path, file_name)
                with open(target, 'w', encoding='utf-8') as f:
                    f.write(article_content)

    # Guard against an empty crawl (no days, no pages, or no matching markup).
    correctness_ratio = correct_count / article_count if article_count else 0.0

    return article_count, correctness_ratio


def main():
    """Prompt for the crawl parameters, run the crawl and print the statistics."""
    start_date_str = input("请输入开始时间(YYYY-MM-DD):")
    end_date_str = input("请输入结束时间(YYYY-MM-DD):")
    keyword = input("请输入关键词:")
    save_path = input("请输入保存路径:")

    start_date = datetime.datetime.strptime(start_date_str, "%Y-%m-%d")
    end_date = datetime.datetime.strptime(end_date_str, "%Y-%m-%d")

    article_count, correctness_ratio = crawl_articles(
        start_date, end_date, keyword, save_path
    )

    print("爬取的文章数量:", article_count)
    print("正确性比率:", correctness_ratio)


if __name__ == "__main__":
    main()

# Note: this sample is for reference only and is not guaranteed to work in
# every situation. It may need to be adjusted as the People's Daily website
# changes.
原文地址: https://www.cveoy.top/t/topic/qptV 著作权归作者所有。请勿转载和采集!