# Python 爬取京东书籍信息并输出到 CSV 文件
"""Crawl book listings from a JD.com search page and write them to a CSV file.

For every product card on the search result page the script records the
name, price and comment count, opens the product detail page in a new tab
to read the publisher and publication date, and finally writes all rows to
``book_info.csv``.
"""

import csv
import re
import time
from collections import defaultdict

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

SEARCH_URL = "https://search.jd.com/Search?keyword=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%E5%9B%BE%E4%B9%A6"
MAX_PAGES = 1  # Number of result pages to crawl
HEADLESS = True  # Set to False to watch the browser while crawling

OUTPUT_FILENAME = "book_info.csv"
FIELDS = ["书名", "价格", "评论数", "出版社", "出版年份", "好评数", "中评数", "差评数"]
RATING_LABELS = ("好评", "中评", "差评")
NOT_FOUND = "未找到"

PUBLISHER_CLSTAG = "shangpin|keycount|product|chubanshe_3"
PUBLISH_DATE_LABEL = "出版时间:"
PUBLISH_DATE_RE = re.compile(r"出版时间:")

# Fixed waits, in seconds (the script does not use explicit waits).
PAGE_LOAD_DELAY = 5
DETAIL_LOAD_DELAY = 5
DETAIL_THROTTLE_DELAY = 6
NEXT_PAGE_DELAY = 3


def parse_count(text):
    """Convert a count label such as ``"2万+"`` or ``"(500+)"`` to an int.

    Surrounding whitespace, parentheses and ``+`` are ignored and ``万`` is
    treated as a factor of 10000. Returns 0 when the remaining text is not
    a number, so one odd label cannot abort the whole crawl.
    """
    cleaned = text.strip().strip("()+")
    scale = 1
    if "万" in cleaned:
        cleaned = cleaned.replace("万", "")
        scale = 10000
    try:
        # round() rather than int(): e.g. 1.15 * 10000 may land just below
        # the intended integer in binary floating point.
        return round(float(cleaned) * scale)
    except (ValueError, OverflowError):
        return 0


def node_text(node):
    """Return the stripped text of a BeautifulSoup node, or "" if it is None."""
    return node.get_text().strip() if node is not None else ""


def absolute_link(href):
    """Prefix ``https:`` to an href that does not already carry a scheme."""
    if href.startswith(("http://", "https://")):
        return href
    return "https:" + href


def fetch_publication_info(driver, link):
    """Open ``link`` in a new tab and return ``(publisher, publish_date)``.

    Either value is ``NOT_FOUND`` when the matching ``<li>`` is missing.
    The new tab is always closed and focus is returned to the window that
    was active on entry, even if loading or parsing fails.
    """
    listing_window = driver.current_window_handle
    # Pass the URL as a script argument instead of formatting it into the
    # JavaScript source, so quotes in the href cannot break the script.
    driver.execute_script("window.open(arguments[0], '_blank');", link)
    driver.switch_to.window(driver.window_handles[-1])
    try:
        time.sleep(DETAIL_LOAD_DELAY)
        detail_soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.close()
        driver.switch_to.window(listing_window)

    # Pause kept from the original flow. It runs after the page source has
    # been captured, so it only spaces out successive detail requests.
    time.sleep(DETAIL_THROTTLE_DELAY)

    publisher_li = detail_soup.find("li", title=True, clstag=PUBLISHER_CLSTAG)
    publisher = publisher_li["title"] if publisher_li is not None else NOT_FOUND

    date_li = detail_soup.find("li", string=PUBLISH_DATE_RE)
    if date_li is not None:
        publish_date = date_li.get_text().replace(PUBLISH_DATE_LABEL, "")
    else:
        publish_date = NOT_FOUND

    return publisher, publish_date


def scrape_listing_item(driver, item):
    """Build one CSV row (a dict keyed by ``FIELDS``) from a product card.

    Returns ``None`` when the card has no ``p-name`` block.
    """
    name_div = item.find("div", class_="p-name")
    if name_div is None:
        return None
    commit_div = item.find("div", class_="p-commit")

    # Comment count: a "万" value becomes an int, anything else is kept as
    # the cleaned-up text shown on the page.
    commit_text = node_text(commit_div).replace("条评价", "").replace("+", "").strip()
    commit = parse_count(commit_text) if "万" in commit_text else commit_text

    # Follow the product link to read publisher details from the new page.
    anchor = name_div.find("a")
    href = anchor.get("href") if anchor is not None else None
    if href:
        publisher, publish_date = fetch_publication_info(driver, absolute_link(href))
    else:
        publisher = publish_date = NOT_FOUND

    # Counts are collected per book; a shared accumulator would make every
    # row carry the running total of all previous books.
    # NOTE(review): the original script used this same selector for all
    # three rating categories, so the three columns are always equal. The
    # per-category selectors are not visible from this file - confirm
    # against the live page before relying on these columns.
    hidden_anchors = commit_div.find_all("a", class_="hide") if commit_div is not None else []
    rating_counts = defaultdict(int)
    for label in RATING_LABELS:
        for hidden_anchor in hidden_anchors:
            rating_counts[label] += parse_count(hidden_anchor.get_text())

    return {
        "书名": node_text(name_div),
        "价格": node_text(item.find("div", class_="p-price")),
        "评论数": commit,
        "出版社": publisher,
        "出版年份": publish_date,
        "好评数": rating_counts["好评"],
        "中评数": rating_counts["中评"],
        "差评数": rating_counts["差评"],
    }


def crawl(url=SEARCH_URL, max_pages=MAX_PAGES):
    """Crawl up to ``max_pages`` result pages starting at ``url``.

    Returns a list of row dicts. The browser is always shut down, including
    when an error interrupts the crawl.
    """
    chrome_options = Options()
    if HEADLESS:
        chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)

    rows = []
    try:
        driver.get(url)
        time.sleep(PAGE_LOAD_DELAY)  # Allow the page to load completely

        for page_number in range(1, max_pages + 1):
            print("Crawling Page", page_number)

            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Each "gl-i-wrap" div holds one product card.
            for item in soup.find_all("div", class_="gl-i-wrap"):
                row = scrape_listing_item(driver, item)
                if row is not None:
                    rows.append(row)

            if page_number == max_pages:
                break  # No need to navigate past the last requested page

            try:
                next_button = driver.find_element(By.CLASS_NAME, "pn-next")
            except NoSuchElementException:
                break  # No further result pages
            next_button.click()
            time.sleep(NEXT_PAGE_DELAY)  # Allow the next page to load
    finally:
        driver.quit()

    return rows


def save_to_csv(rows, filename=OUTPUT_FILENAME):
    """Write ``rows`` to ``filename`` as UTF-8 CSV with a header line."""
    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDS)
        writer.writeheader()
        writer.writerows(rows)


def main():
    """Run the crawl and save the result to ``OUTPUT_FILENAME``."""
    rows = crawl()
    save_to_csv(rows)
    print("Data saved to", OUTPUT_FILENAME)


if __name__ == "__main__":
    main()
# 原文地址: https://www.cveoy.top/t/topic/pEvN 著作权归作者所有。请勿转载和采集!