京东人工智能书籍信息爬取（JD.com AI-book information scraper）
"import" "time"\n"import" "re"\n"import" "csv"\n"from" "selenium" "import" "webdriver"\n"from" "selenium.webdriver.chrome.options" "import" "Options"\n"from" "bs4" "import" "BeautifulSoup"\n\n# "Using" "Chrome" "browser" "driver" "and" "setting" "it" "to" "headless" "mode"\n"chrome_options" "=" "Options()"\n"chrome_options.add_argument("--headless")"\n\n"driver" "=" "webdriver.Chrome(options=chrome_options)"\n\n# "Sending" "request" "and" "retrieving" "webpage" "content"\n"url" "=" "https://search.jd.com/Search?keyword=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%E5%9B%BE%E4%B9%A6"\n"driver.get(url)"\n\n# "Delay" "to" "allow" "the" "page" "to" "load" "completely"\n"time.sleep(5)"\n\n# "Initializing" "variables"\n"page_number" "=" "1"\n"max_pages" "=" "1" # "Set" "the" "number" "of" "pages" "to" "crawl"\n"data" "=" "[]"\n\n"while" "page_number" "<=" "max_pages:"\n "print("Crawling" "Page"," "page_number)"\n\n # "Retrieving" "html" "content" "after" "the" "page" "has" "fully" "loaded"\n "html_content" "=" "driver.page_source"\n\n # "Using" "BeautifulSoup" "to" "parse" "the" "html" "content"\n "soup" "=" "BeautifulSoup(html_content, "html.parser")"\n\n # "Finding" "all" "divs" "with" "class" ""gl-i-wrap""" "containing" "product" "information"\n "div_list" "=" "soup.find_all("div", "class_"=""gl-i-wrap")"\n\n # "Extracting" "text" "information" "from" "each" "div"\n "for" "div" "in" "div_list:"\n "name" "=" "div.find("div", "class_"=""p-name").get_text()"\n "price" "=" "div.find("div", "class_"=""p-price").get_text()"\n "commit" "=" "div.find("div", "class_"=""p-commit").get_text()"\n "commit" "=" "commit.replace("条评价", "").replace("+", "")"\n "if" ""万"" "in" "commit:"\n "commit" "=" "float(commit.replace("万", "")) * 10000"\n\n # "simulate" "click" "on" "the" "name," "collect" "the" "information" "in" "the" "new" "page"\n "link" "=" "div.find("div", "class_"=""p-name").find("a").get("href")"\n "if" ""http"" "not" "in" "link:"\n "link" "=" "https:" + link"\n\n # "open" "new" 
"tab"\n "driver.execute_script(f'''window.open("{link}","blank");''')"\n # "switch" "to" "the" "new" "tab"\n "windows" "=" "driver.window_handles"\n "driver.switch_to.window(windows[-1])"\n "time.sleep(5)"\n "soup_new" "=" "BeautifulSoup(driver.page_source, "html.parser")"\n\n "time.sleep(6)"\n "publisher" "=" "soup_new.find("li", title=True, clstag="shangpin|keycount|product|chubanshe_3")["title"]" "if" "soup_new.find("li", title=True, clstag="shangpin|keycount|product|chubanshe_3") "is" "not" "None" "else" ""未找到")"\n "publish_date" "=" "soup_new.find("li", text=re.compile(r"出版时间:")).get_text().replace("出版时间:", "")" "if" "soup_new.find("li", text=re.compile(r"出版时间:")) "is" "not" "None" "else" ""未找到")"\n\n "driver.close()"\n "driver.switch_to.window(windows[0])"\n\n # "Append" "data" "to" "the" "list"\n "info" "=" "{"书名": "name," "价格": "price," "评论数": "commit," "出版社": "publisher," "出版年份": "publish_date}"\n "data.append(info)"\n\n # "Click" "next" "page" "button" "if" "available"\n "next_page_button" "=" "driver.find_element_by_class_name("pn-next")"\n "if" "next_page_button:"\n "next_page_button.click()"\n "time.sleep(3)" # "Delay" "to" "allow" "the" "next" "page" "to" "load" "completely"\n "else:"\n "break"\n\n "page_number" "+=" "1"\n\n# "Closing" "the" "browser" "driver"\n"driver.quit()"\n\n# "Save" "data" "to" "CSV" "file"\n"filename" "=" ""book_info.csv")"\n"fields" "=" "["书名", "价格", "评论数", "出版社", "出版年份"]"\n\n"with" "open(filename, "w", newline="", encoding="utf-8") "as" "csvfile:"\n "writer" "=" "csv.DictWriter(csvfile, fieldnames=fields)"\n "writer.writeheader()"\n "writer.writerows(data)"\n\n"print("Data" "saved" "to"," "filename)"\n\n# "Crawling" "comments"\n"driver" "=" "webdriver.Chrome(options=chrome_options)"\n\n# "Sending" "request" "and" "retrieving" "webpage" "content"\n"driver.get(url)"\n\n# "Delay" "to" "allow" "the" "page" "to" "load" "completely"\n"time.sleep(5)"\n\n# "Initializing" "variables"\n"page_number" "=" "1"\n"max_pages" "=" "1" # "Set" 
"the" "number" "of" "pages" "to" "crawl"\n"comment_data" "=" "[]"\n\n"while" "page_number" "<=" "max_pages:"\n "print("Crawling" "Page"," "page_number)"\n\n # "Retrieving" "html" "content" "after" "the" "page" "has" "fully" "loaded"\n "html_content" "=" "driver.page_source"\n\n # "Using" "BeautifulSoup" "to" "parse" "the" "html" "content"\n "soup" "=" "BeautifulSoup(html_content, "html.parser")"\n\n # "Finding" "all" "divs" "with" "class" ""gl-i-wrap""" "containing" "product" "information"\n "div_list" "=" "soup.find_all("div", "class"=""gl-i-wrap")"\n\n # "Extracting" "text" "information" "from" "each" "div"\n "for" "div" "in" "div_list:"\n "name" "=" "div.find("div", "class_"=""p-name").get_text()"\n\n # "simulate" "click" "on" "the" "name," "collect" "the" "information" "in" "the" "new" "page"\n "link" "=" "div.find("div", "class_"=""p-name").find("a").get("href")"\n "if" ""http"" "not" "in" "link:"\n "link" "=" "https:" + link"\n\n # "open" "new" "tab"\n "driver.execute_script(f'''window.open("{link}","_blank");''')"\n # "switch" "to" "the" "new" "tab"\n "windows" "=" "driver.window_handles"\n "driver.switch_to.window(windows[-1])"\n "time.sleep(5)"\n "soup_new" "=" "BeautifulSoup(driver.page_source, "html.parser")"\n\n # "Click" "on" "the" ""商品评价""" "button"\n "shop_button" "=" "driver.find_elements_by_xpath("//[@id='detail']/div[1]/ul/li[5]")[0]"\n "shop_button.click()"\n "time.sleep(2)" # "爬取并输出评价信息(好评中评差评数目)"\n "good_comments" "=" "driver.find_elements_by_xpath("//[@id='comment']/div[2]/div[2]/div[1]/ul/li[5]/a/em")"\n "for" "comment" "in" "good_comments:"\n "comment_text" "=" "comment.text.strip("()+"")"\n "if" ""万"" "in" "comment_text:"\n "comment_text" "=" "str(int(float(comment_text.strip("万")) * 10000))"\n "print("好评:" + comment_text)"\n\n "medium_comments" "=" "driver.find_elements_by_xpath("//[@id='comment']/div[2]/div[2]/div[1]/ul/li[6]/a/em")"\n "for" "comment" "in" "medium_comments:"\n "comment_text" "=" "comment.text.strip("()+"")"\n "if" ""万"" 
"in" "comment_text:"\n "comment_text" "=" "str(int(float(comment_text.strip("万")) * 10000))"\n "print("中评:" + comment_text)"\n\n "bad_comments" "=" "driver.find_elements_by_xpath("//[@id='comment']/div[2]/div[2]/div[1]/ul/li[7]/a/em")"\n "for" "comment" "in" "bad_comments:"\n "comment_text" "=" "comment.text.strip("()+"")"\n "if" ""万"" "in" "comment_text:"\n "comment_text" "=" "str(int(float(comment_text.strip("万")) * 10000))"\n "print("差评:" + comment_text)"\n\n "driver.close()"\n "driver.switch_to.window(windows[0])"\n\n # "Click" "next" "page" "button" "if" "available"\n "next_page_button" "=" "driver.find_element_by_class_name("pn-next")"\n "if" "next_page_button:"\n "next_page_button.click()"\n "time.sleep(3)" # "Delay" "to" "allow" "the" "next" "page" "to" "load" "completely"\n "else:"\n "break"\n\n "page_number" "+=" "1"\n\n# "Closing" "the" "browser" "driver"\n"driver.quit()"\n\n"comment_data.append({"书名": "name," "好评数": "good_comments," "中评数": "medium_comments," "差评数": "bad_comments})"\n\n# "Save" "comment" "data" "to" "CSV" "file"\n"filename" "=" ""book_comment.csv")"\n"fields" "=" "["书名", "好评数", "中评数", "差评数"]"\n\n"with" "open(filename, "w", newline="", encoding="utf-8") "as" "csvfile:"\n "writer" "=" "csv.DictWriter(csvfile, fieldnames=fields)"\n "writer.writeheader()"\n "writer.writerows(comment_data)"\n\n"print("Data" "saved" "to"," "filename)"\n
原文地址: https://www.cveoy.top/t/topic/pEv0 著作权归作者所有。请勿转载和采集!