"""JD.com AI-book scraper.

Crawls the JD search results for "人工智能图书" (AI books) and, for each
product, opens the detail page to collect: title, price, review count,
publisher, publish date, and positive/neutral/negative review tallies.
Results are written to book_info.csv (UTF-8).
"""
import csv
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Run Chrome headless so the scraper works without a display.
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

# Search-results page; the keyword is the URL-encoded "人工智能图书".
url = "https://search.jd.com/Search?keyword=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%E5%9B%BE%E4%B9%A6"
driver.get(url)
time.sleep(5)  # crude wait for the page to finish loading

page_number = 1
max_pages = 1  # number of result pages to crawl
data = []

while page_number <= max_pages:
    print("Crawling page", page_number)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Each product card on the results page lives in a div.gl-i-wrap.
    for div in soup.find_all("div", class_="gl-i-wrap"):
        name = div.find("div", class_="p-name").get_text(strip=True)
        price = div.find("div", class_="p-price").get_text(strip=True)
        commit = div.find("div", class_="p-commit").get_text(strip=True)
        commit = commit.replace("条评价", "").replace("+", "")
        if "万" in commit:
            # e.g. "1.2万" means 12,000 reviews.
            commit = float(commit.replace("万", "")) * 10000

        # Follow the product link in a new tab to read the detail page.
        link = div.find("div", class_="p-name").find("a").get("href")
        if not link.startswith("http"):
            # Search results use protocol-relative URLs ("//item.jd.com/...").
            link = "https:" + link

        driver.execute_script(f'window.open("{link}", "_blank");')
        windows = driver.window_handles
        driver.switch_to.window(windows[-1])
        time.sleep(5)  # wait for the detail page to load

        soup_new = BeautifulSoup(driver.page_source, "html.parser")

        # Publisher sits in an <li> tagged with JD's clstag tracking attribute.
        publisher_li = soup_new.find(
            "li", title=True, clstag="shangpin|keycount|product|chubanshe_3"
        )
        publisher = publisher_li["title"] if publisher_li is not None else "未找到"

        # bs4 deprecated the text= keyword; string= is the current spelling.
        date_li = soup_new.find("li", string=re.compile(r"出版时间:"))
        publish_date = (
            date_li.get_text().replace("出版时间:", "")
            if date_li is not None
            else "未找到"
        )

        # Tally review sentiment from the comment section, if present.
        positive_reviews = neutral_reviews = negative_reviews = 0
        for review in soup_new.find_all("div", class_="comment-item"):
            con = review.find("div", class_="comment-con")
            if con is None:
                continue
            review_text = con.get_text()
            if "好评" in review_text:
                positive_reviews += 1
            elif "中评" in review_text:
                neutral_reviews += 1
            elif "差评" in review_text:
                negative_reviews += 1

        # Close the detail tab and return to the results page.
        driver.close()
        driver.switch_to.window(windows[0])

        data.append({
            "书名": name,
            "价格": price,
            "评论数": commit,
            "出版社": publisher,
            "出版年份": publish_date,
            "好评": positive_reviews,
            "中评": neutral_reviews,
            "差评": negative_reviews,
        })

    # Advance to the next results page; find_element raises (never returns a
    # falsy value) when the button is absent, so catch the exception.
    try:
        next_page_button = driver.find_element(By.CLASS_NAME, "pn-next")
    except NoSuchElementException:
        break
    next_page_button.click()
    time.sleep(3)  # wait for the next page to load

    page_number += 1

driver.quit()

# Persist results as UTF-8 CSV (newline="" per the csv module docs).
filename = "book_info.csv"
fields = ["书名", "价格", "评论数", "出版社", "出版年份", "好评", "中评", "差评"]
with open(filename, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)

print("Data saved to", filename)
# 原文地址: https://www.cveoy.top/t/topic/pEuM 著作权归作者所有。请勿转载和采集!