# JD.com AI book information scraper (京东人工智能书籍信息爬取)
#
# Crawls JD search results for AI books, opens each product page to collect
# publisher / publish-date details plus review counts, and saves everything
# to a CSV file.
import csv
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


def _normalize_count(raw):
    """Turn a JD count string such as '(1.2万+)' into a plain number string.

    Strips the surrounding parentheses / plus sign and expands the
    Chinese unit 万 (10,000) to digits.
    """
    cleaned = raw.strip("()+")
    if "万" in cleaned:
        cleaned = str(int(float(cleaned.strip("万")) * 10000))
    return cleaned


# Chrome driver in headless mode (the original comment promised headless
# but never added the flag).
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

# Search-results page for the keyword "人工智能图书" (AI books).
url = "https://search.jd.com/Search?keyword=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%E5%9B%BE%E4%B9%A6"
driver.get(url)
time.sleep(5)  # allow the page to load completely

page_number = 1
max_pages = 1  # number of result pages to crawl
data = []

while page_number <= max_pages:
    print("Crawling Page", page_number)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Each product card lives in a div.gl-i-wrap on the results page.
    for div in soup.find_all("div", class_="gl-i-wrap"):
        name = div.find("div", class_="p-name").get_text()
        price = div.find("div", class_="p-price").get_text()
        commit = div.find("div", class_="p-commit").get_text()
        commit = commit.replace('条评价', '').replace('+', '')
        if '万' in commit:
            commit = float(commit.replace('万', '')) * 10000

        # Open the product detail page in a new tab and switch to it.
        link = div.find("div", class_="p-name").find("a").get("href")
        if "http" not in link:
            link = "https:" + link
        driver.execute_script(f'window.open("{link}","_blank");')
        windows = driver.window_handles
        driver.switch_to.window(windows[-1])
        time.sleep(5)
        soup_new = BeautifulSoup(driver.page_source, "html.parser")

        # Publisher / publish date live in the detail page's spec list;
        # fall back to '未找到' ("not found") when the <li> is absent.
        pub_li = soup_new.find(
            "li", title=True, clstag="shangpin|keycount|product|chubanshe_3")
        publisher = pub_li["title"] if pub_li is not None else '未找到'
        date_li = soup_new.find("li", string=re.compile(r"出版时间:"))
        publish_date = (date_li.get_text().replace('出版时间:', '')
                        if date_li is not None else '未找到')

        data.append({
            "书名": name,
            "价格": price,
            "评论数": commit,
            "出版社": publisher,
            "出版年份": publish_date,
        })

        # Click the "商品评价" (product reviews) tab and print the
        # good / medium / bad review counts.  This must happen while the
        # detail tab is still open — the original closed the tab first,
        # which made these lookups fail on the results page.
        review_tabs = driver.find_elements(
            By.XPATH, "//*[@id='detail']/div[1]/ul/li[5]")
        if review_tabs:
            review_tabs[0].click()
            time.sleep(2)
            for label, li_index in (("好评", 5), ("中评", 6), ("差评", 7)):
                xpath = (f"//*[@id='comment']/div[2]/div[2]/div[1]"
                         f"/ul/li[{li_index}]/a/em")
                for em in driver.find_elements(By.XPATH, xpath):
                    print(label + ":" + _normalize_count(em.text))

        # Close the detail tab and return to the search-results tab.
        driver.close()
        driver.switch_to.window(windows[0])

    # Advance to the next results page if a "next" button exists.
    # find_elements returns [] instead of raising, so the break is reachable.
    next_buttons = driver.find_elements(By.CLASS_NAME, "pn-next")
    if next_buttons:
        next_buttons[0].click()
        time.sleep(3)  # allow the next page to load completely
    else:
        break
    page_number += 1

driver.quit()

# Persist the collected records to CSV.
filename = "book_info.csv"
fields = ["书名", "价格", "评论数", "出版社", "出版年份"]
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)
print("Data saved to", filename)  # original was missing the closing paren
# 原文地址: https://www.cveoy.top/t/topic/pEvA 著作权归作者所有。请勿转载和采集!