"""People's Daily (人民日报) crawler.

Automatically fetches the news articles published within a given date range,
labels each one with a coarse category and saves it as a UTF-8 text file.

Interactive use (article crawling system): the script asks for a start date
and an end date, both formatted as ``YYYYMMDD`` (for example ``20220706``),
and then downloads every day in that inclusive range.
"""
import datetime
import os
from datetime import date

import requests
from bs4 import BeautifulSoup

# Root of the electronic edition; a day's files live under "<root>YYYY-MM/DD/".
_BASE_URL = 'http://paper.people.com.cn/rmrb/html/'

# Keywords checked in order; the first one found in the page decides the label.
_CATEGORY_KEYWORDS = ('体育', '环境')
_DEFAULT_CATEGORY = '其他'


def _dayUrl(year, month, day):
    """Return the URL prefix shared by all files of one day's issue."""
    return _BASE_URL + year + '-' + month + '/' + day + '/'


def _hrefs(tags):
    """Yield the ``href`` of the first anchor of each tag, skipping tags without one."""
    for tag in tags:
        anchor = tag.a
        if anchor is not None and anchor.get("href"):
            yield anchor["href"]


def _tagText(tag):
    """Return the text of *tag*, or an empty string when the tag is missing."""
    return tag.text if tag is not None else ''


def _classify(content):
    """Return the category label for a page based on a plain substring test."""
    for keyword in _CATEGORY_KEYWORDS:
        if keyword in content:
            return keyword
    return _DEFAULT_CATEGORY


def fetchUrl(url, timeout=30):
    """Download *url* and return the decoded response body.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the whole crawl indefinitely.

    Returns:
        The response text, decoded with the encoding detected from the body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    headers = {
        'accept': (
            'text/html,application/xhtml+xml,application/xml;q=0.9,'
            'image/webp,image/apng,*/*;q=0.8'
        ),
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        ),
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    # The declared charset is not trusted; use the one detected from the body.
    r.encoding = r.apparent_encoding
    return r.text


def getPageList(year, month, day):
    """Return the absolute URLs linked from the index of one day's issue.

    Args:
        year: Four-digit year as a string.
        month: Zero-padded month as a string.
        day: Zero-padded day as a string.

    Returns:
        A list of URLs (presumably one per newspaper page); empty when
        neither known page layout is found.
    """
    base = _dayUrl(year, month, day)
    html = fetchUrl(base + 'nbs.D110000renmrb_01.htm')
    bsobj = BeautifulSoup(html, 'html.parser')

    # Two layouts are handled: a div with id "List", or a swiper container.
    temp = bsobj.find('div', attrs={'id': 'List'})
    if temp is not None:
        pageList = temp.ul.find_all('div', attrs={'class': 'right_title-name'})
    else:
        # NOTE(review): the class name arrived as '-container', which cannot
        # match anything; 'swiper-container' is inferred from the
        # 'swiper-slide' children below -- confirm against the live page.
        temp = bsobj.find('div', attrs={'class': 'swiper-container'})
        if temp is not None:
            pageList = temp.find_all('div', attrs={'class': 'swiper-slide'})
        else:
            pageList = []

    return [base + link for link in _hrefs(pageList)]


def getTitleList(year, month, day, pageUrl, category_count):
    """Collect and classify the articles linked from one page.

    Each article is downloaded once here so it can be classified; the HTML is
    returned alongside the URL so callers do not have to fetch it again.

    Args:
        year: Four-digit year as a string.
        month: Zero-padded month as a string.
        day: Zero-padded day as a string.
        pageUrl: URL of the page whose article list is read.
        category_count: Dict mapping category label to a running count; it is
            updated in place, one increment per article found.

    Returns:
        A list of dicts with the keys ``"url"``, ``"category"`` and ``"html"``.
    """
    html = fetchUrl(pageUrl)
    bsobj = BeautifulSoup(html, 'html.parser')

    temp = bsobj.find('div', attrs={'id': 'titleList'})
    if temp is not None:
        tempList = temp.ul.find_all('li')
    else:
        newsList = bsobj.find('ul', attrs={'class': 'news-list'})
        tempList = newsList.find_all('li') if newsList is not None else []

    base = _dayUrl(year, month, day)
    linkList = []
    for link in _hrefs(tempList):
        # Only links containing this marker are treated as article pages.
        if 'nw.D110000renmrb' not in link:
            continue
        url = base + link
        content = fetchUrl(url)
        category = _classify(content)
        category_count[category] += 1
        linkList.append({"url": url, "category": category, "html": content})
    return linkList


def getContent(html):
    """Extract the headline lines and body paragraphs from an article page.

    Args:
        html: Raw HTML of the article page.

    Returns:
        The text of the first ``h3``, ``h1`` and ``h2`` tags followed by the
        text of every ``p`` inside the ``div`` with id ``ozoom``, each on its
        own line. Missing elements contribute an empty line (headings) or
        nothing (body) instead of raising.
    """
    bsobj = BeautifulSoup(html, 'html.parser')
    parts = [_tagText(bsobj.h3), _tagText(bsobj.h1), _tagText(bsobj.h2)]
    body = bsobj.find('div', attrs={'id': 'ozoom'})
    if body is not None:
        parts.extend(p.text for p in body.find_all('p'))
    return ''.join(part + '\n' for part in parts)


def saveFile(content, path, filename, category, category_count, category_ratio):
    """Write one article, preceded by its category statistics, to disk.

    Args:
        content: Article text to store.
        path: Target directory; created when it does not exist.
        filename: Name of the file inside *path*.
        category: Category label of the article.
        category_count: Dict mapping category label to its current count.
        category_ratio: Dict mapping category label to its share (0..1).
    """
    os.makedirs(path, exist_ok=True)
    target = os.path.join(path, filename)
    with open(target, 'w', encoding='utf-8') as f:
        f.write(f'分类：{category}\n')
        f.write(f'数量：{category_count[category]}\n')
        f.write(f'比：{category_ratio[category] * 100:.2f}%\n')
        f.write(content)
    print(f'文章已保存为：{target}')


def download_rmrb(year, month, day, destdir):
    """Download every article of one day's issue into per-category folders.

    Args:
        year: Four-digit year as a string.
        month: Zero-padded month as a string.
        day: Zero-padded day as a string.
        destdir: Root output directory; one sub-directory per category is
            created below it.
    """
    pageList = getPageList(year, month, day)
    category_count = {"体育": 0, "环境": 0, "其他": 0}
    for page in pageList:
        titleList = getTitleList(year, month, day, page, category_count)

        # Counts only change inside getTitleList, so the shares are constant
        # for all articles of this page and are computed once.
        total_samples = sum(category_count.values())
        category_ratio = {
            k: (v / total_samples if total_samples else 0.0)
            for k, v in category_count.items()
        }

        for item in titleList:
            url = item["url"]
            category = item["category"]
            # Reuse the HTML fetched during classification when available.
            html = item.get("html") or fetchUrl(url)
            content = getContent(html)

            # The file name is built from the last two '_'-separated URL parts.
            temp = url.split('_')
            pageNo = temp[-2]
            titleNo = temp[-1].split('.')[0]
            category_dir = os.path.join(destdir, category)
            filename = f"{year}{month}{day}-{pageNo}-{titleNo}.txt"

            saveFile(content, category_dir, filename, category,
                     category_count, category_ratio)


def gen_dates(b_date, days):
    """Yield *days* consecutive dates starting at *b_date*."""
    one_day = datetime.timedelta(days=1)
    for i in range(days):
        yield b_date + one_day * i


def get_date_list(beginDate, endDate):
    """Return every day from *beginDate* to *endDate*, both inclusive.

    Args:
        beginDate: First day (a ``date`` or ``datetime``).
        endDate: Last day (a ``date`` or ``datetime``).

    Returns:
        A list of ``datetime.datetime`` objects at midnight; empty when
        *endDate* lies before *beginDate*.
    """
    start = datetime.datetime(beginDate.year, beginDate.month, beginDate.day)
    end = datetime.datetime(endDate.year, endDate.month, endDate.day)
    return list(gen_dates(start, (end - start).days + 1))


def _parseDate(text):
    """Parse a ``YYYYMMDD`` string into a ``date``; raises ValueError if malformed."""
    parsed = datetime.datetime.strptime(text.strip(), '%Y%m%d')
    return date(parsed.year, parsed.month, parsed.day)


def main():
    """Prompt for a date range and crawl every day in it."""
    beginDate = _parseDate(input('请输入开始日期(格式如20220706):'))
    endDate = _parseDate(input('请输入结束日期(格式如20220706):'))

    destdir = "./文章"
    for d in get_date_list(beginDate, endDate):
        year = str(d.year)
        month = f'{d.month:02d}'
        day = f'{d.day:02d}'
        download_rmrb(year, month, day, destdir)
        print(f'爬取文章时间为：{year}/{month}/{day}的文章成功写入文件夹中！')
    print("文章爬取完成！")


if __name__ == '__main__':
    main()