# 京东人工智能书籍爬虫：提取书籍信息及评价数据

"""Scrape JD.com search results for artificial-intelligence books.

For every product on the search page the script prints the name, price and
review count, then opens the product page in a new tab to collect the
publisher, the publication date and the good / medium / bad review counts.
"""

import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

SEARCH_URL = (
    "https://search.jd.com/Search"
    "?keyword=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%E5%9B%BE%E4%B9%A6"
    "&enc=utf-8&suggest=1.his.0.0&wq=&pvid=bc47d80392dd4b19980a4ca7ea7de082"
)

# Fixed waits (seconds) that give the JavaScript-rendered pages time to load.
PAGE_LOAD_DELAY = 5
TAB_CLICK_DELAY = 1

# Placeholder printed when a field is absent from the product page.
NOT_FOUND = "未找到"

PUBLISHER_CLSTAG = "shangpin|keycount|product|chubanshe_3"
PUBLISH_DATE_LABEL = "出版时间："

# Fifth tab of the detail section; the script clicks it before reading the
# review counters (presumably the reviews tab -- confirm against the live page).
DETAIL_TAB_XPATH = "//*[@id='detail']/div[1]/ul/li[5]"
RATING_XPATH = "//*[@id='comment']/div[2]/div[2]/div[1]/ul/li[{index}]/a/em"
# (label printed, position of the <li> inside the rating filter list)
RATING_ITEMS = (("好评", 5), ("中评", 6), ("差评", 7))


def _expand_wan(text):
    """Return *text* with a '万' (x10,000) count expanded to a plain integer string.

    Text without '万', or text whose numeric part cannot be parsed, is
    returned unchanged.
    """
    if "万" not in text:
        return text
    try:
        # round() instead of int() so float noise (e.g. 6999.999...) is not
        # truncated to the wrong integer.
        return str(round(float(text.replace("万", "")) * 10000))
    except ValueError:
        return text


def _child_text(item, css_class):
    """Return the text of the first ``<div class=css_class>`` in *item*, or ''."""
    node = item.find("div", class_=css_class)
    return node.get_text() if node is not None else ""


def _product_link(item):
    """Return the absolute product URL of a search-result *item*, or None."""
    name_div = item.find("div", class_="p-name")
    anchor = name_div.find("a") if name_div is not None else None
    href = anchor.get("href") if anchor is not None else None
    if not href:
        return None
    # Hrefs that do not start with a scheme get "https:" prefixed.
    return href if href.startswith("http") else "https:" + href


def _parse_publication_info(soup):
    """Return ``(publisher, publish_date)`` parsed from a product page *soup*."""
    publisher_tag = soup.find("li", title=True, clstag=PUBLISHER_CLSTAG)
    publisher = publisher_tag["title"] if publisher_tag is not None else NOT_FOUND

    date_tag = soup.find("li", string=re.compile(PUBLISH_DATE_LABEL))
    if date_tag is None:
        publish_date = NOT_FOUND
    else:
        publish_date = date_tag.get_text().replace(PUBLISH_DATE_LABEL, "")
    return publisher, publish_date


def _scrape_detail_page(driver, link):
    """Open *link* in a new tab and collect publication info and rating counts.

    Returns ``(publisher, publish_date, ratings)`` where *ratings* is a list
    of ``(label, count_text)`` pairs. The tab is always closed and focus is
    returned to the search-results window, even if scraping fails.
    """
    main_window = driver.current_window_handle
    # Pass the URL as a script argument rather than interpolating it into the
    # JavaScript source, so quotes in the link cannot break the statement.
    driver.execute_script("window.open(arguments[0], '_blank');", link)
    driver.switch_to.window(driver.window_handles[-1])
    try:
        time.sleep(PAGE_LOAD_DELAY)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        publisher, publish_date = _parse_publication_info(soup)

        # find_elements returns an empty list instead of raising when the
        # tab is missing, so one odd page does not abort the whole run.
        tabs = driver.find_elements(By.XPATH, DETAIL_TAB_XPATH)
        if tabs:
            tabs[0].click()
            time.sleep(TAB_CLICK_DELAY)

        ratings = []
        for label, index in RATING_ITEMS:
            xpath = RATING_XPATH.format(index=index)
            for counter in driver.find_elements(By.XPATH, xpath):
                count_text = counter.text.strip("()+ ")
                ratings.append((label, _expand_wan(count_text)))
        return publisher, publish_date, ratings
    finally:
        driver.close()
        driver.switch_to.window(main_window)


def main():
    """Run the scrape and print the collected fields for every listed book."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(SEARCH_URL)
        time.sleep(PAGE_LOAD_DELAY)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Each "gl-i-wrap" div holds one product of the search results.
        for item in soup.find_all("div", class_="gl-i-wrap"):
            link = _product_link(item)
            if link is None:
                # No product link in this entry: nothing to open.
                continue

            name = _child_text(item, "p-name")
            price = _child_text(item, "p-price")
            commit = _child_text(item, "p-commit")
            commit = _expand_wan(commit.replace("条评价", "").replace("+", ""))

            publisher, publish_date, ratings = _scrape_detail_page(driver, link)

            print("Name:", name)
            print("Price:", price)
            print("Reviews:", commit)
            print("Publishers:", publisher)
            print("Publish Date:", publish_date)
            for label, count_text in ratings:
                print(label + ":" + count_text)
    finally:
        # Always shut the browser down so no headless Chrome process leaks.
        driver.quit()


if __name__ == "__main__":
    main()