import time
import re
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup


def _parse_count(text):
    """Convert a JD count string like '2万+' or '(1.5万+)' to an int.

    Strips surrounding parentheses / plus signs and expands the
    '万' (x10,000) suffix. Returns 0 for empty or unparseable text.
    """
    cleaned = text.strip("()+ ")
    try:
        if "万" in cleaned:
            return int(float(cleaned.replace("万", "")) * 10000)
        return int(cleaned)
    except ValueError:
        return 0


def _sum_counts(elements):
    """Sum the numeric counts held by a list of Selenium elements."""
    return sum(_parse_count(el.text) for el in elements)


# Chrome driver. NOTE(review): the original comment claimed headless mode
# but never enabled it; enable it so the code matches its stated intent.
chrome_options = Options()
chrome_options.add_argument("--headless=new")

driver = webdriver.Chrome(options=chrome_options)

# Search-results page for "人工智能图书" (AI books) on jd.com.
url = ("https://search.jd.com/Search?"
       "keyword=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%E5%9B%BE%E4%B9%A6")
driver.get(url)

# Delay to allow the page to load completely.
time.sleep(5)

page_number = 1
max_pages = 1  # Number of result pages to crawl.
data = []

while page_number <= max_pages:
    print("Crawling Page", page_number)

    # Parse the fully loaded results page.
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Each product card lives in a div with class "gl-i-wrap".
    for div in soup.find_all("div", class_="gl-i-wrap"):
        name = div.find("div", class_="p-name").get_text()
        price = div.find("div", class_="p-price").get_text()
        commit = div.find("div", class_="p-commit").get_text()
        commit = commit.replace('条评价', '').replace('+', '')
        if '万' in commit:
            commit = float(commit.replace('万', '')) * 10000

        # Follow the product link to scrape detail-page information.
        link = div.find("div", class_="p-name").find("a").get("href")
        if "http" not in link:
            link = "https:" + link

        # Open the detail page in a new tab and switch to it.
        driver.execute_script(f'window.open("{link}","_blank");')
        windows = driver.window_handles
        driver.switch_to.window(windows[-1])
        time.sleep(5)
        soup_new = BeautifulSoup(driver.page_source, "html.parser")

        # Publisher and publication date; fall back to '未找到' (not found).
        publisher_li = soup_new.find(
            "li", title=True,
            clstag="shangpin|keycount|product|chubanshe_3")
        publisher = (publisher_li["title"]
                     if publisher_li is not None else '未找到')

        # NOTE: bs4 deprecated the `text=` keyword in favour of `string=`.
        date_li = soup_new.find("li", string=re.compile(r"出版时间:"))
        publish_date = (date_li.get_text().replace('出版时间:', '')
                        if date_li is not None else '未找到')

        # Good / medium / bad review counts from the DETAIL page.
        # FIX 1: Selenium 4 removed find_elements_by_xpath; use
        #        find_elements(By.XPATH, ...).
        # FIX 2: the good/medium XPaths were missing the '*' node test
        #        ("//[@id=...]"), which is invalid XPath.
        # FIX 3: these lookups originally ran AFTER closing the detail tab
        #        and switching back to the search window, so they queried
        #        the wrong page and always found nothing; they must run
        #        while the detail tab is still active.
        good_comment_count = _sum_counts(driver.find_elements(
            By.XPATH,
            "//*[@id='comment']/div[2]/div[2]/div[1]/ul/li[5]/a/em"))
        medium_comment_count = _sum_counts(driver.find_elements(
            By.XPATH,
            "//*[@id='comment']/div[2]/div[2]/div[1]/ul/li[6]/a/em"))
        bad_comment_count = _sum_counts(driver.find_elements(
            By.XPATH,
            "//*[@id='comment']/div[2]/div[2]/div[1]/ul/li[7]/a/em"))

        # Close the detail tab and return to the search-results window.
        driver.close()
        driver.switch_to.window(windows[0])

        data.append({
            "书名": name,
            "价格": price,
            "评论数": commit,
            "好评数": good_comment_count,
            "中评数": medium_comment_count,
            "差评数": bad_comment_count,
            "出版社": publisher,
            "出版年份": publish_date,
        })

    # FIX: find_element raises NoSuchElementException rather than returning
    # a falsy value, so the original `if next_page_button:` could never take
    # its else branch. find_elements returns a (possibly empty) list,
    # allowing a clean truthiness check.
    next_buttons = driver.find_elements(By.CLASS_NAME, "pn-next")
    if next_buttons:
        next_buttons[0].click()
        time.sleep(3)  # Allow the next page to load.
    else:
        break

    page_number += 1

# Closing the browser driver.
driver.quit()

# Persist the scraped records as UTF-8 CSV.
filename = "book_info.csv"
fields = ["书名", "价格", "评论数", "好评数", "中评数", "差评数", "出版社", "出版年份"]

with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)

print("Data saved to", filename)


# Source: https://www.cveoy.top/t/topic/pEvm — copyright belongs to the original author. Do not repost or scrape.
# (Promotional footer from the original page: "Free AI, no registration or login required.")