Bilibili Article Image Scraper - Automatically Download All Images from Articles
import os
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# [Edit me] Path to the Chrome WebDriver: point this at the chromedriver file next to the script
webdriver_path = '/Users/geeds/Desktop/Work/Python/ZcoolDL/chromedriver'

# [Edit me] Directory where downloaded images are saved
save_path = "test7"

# [Edit me] Maximum number of scrolls - how many times to auto-scroll the page before downloading starts
MAX_SCROLL_TIMES = 20

# [Edit me] Page URL
base_url = "https://search.bilibili.com/article?keyword=%E9%A5%AD%E6%8B%8D%E5%9B%BE&page=2"

# Normalize the path and make sure save_path ends with a separator
save_path = os.path.normpath(save_path) + os.sep
if not os.path.exists(save_path):
    os.makedirs(save_path)

# [Optional] Extra time reserved per scroll for page images to load
# (increase this on a slow connection, at the cost of a longer wait)
DELAY_PER_TIMES = 0.1

# Chrome WebDriver options
chrome_options = Options()
# chrome_options.add_argument('--headless')  # headless mode
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument("--disable-dev-shm-usage")  # disable /dev/shm usage
chrome_options.add_argument("--disable-extensions")     # disable extensions

# Create the Chrome WebDriver
driver = webdriver.Chrome(service=Service(executable_path=webdriver_path), options=chrome_options)

# Load the page
print("Summoning the browser..")
driver.get(base_url)

# Simulate scrolling down to the bottom of the page
print("Browser summoned, starting simulated scrolling..")
for i in range(1, MAX_SCROLL_TIMES + 1):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)
    # Scroll back up a little to trigger lazy loading
    driver.execute_script("window.scrollBy(0, -666);")
    print(f"Working hard on scroll {i}!")
    time.sleep(1)

wait_seconds = max(3, MAX_SCROLL_TIMES * DELAY_PER_TIMES)
print(f"Waiting {wait_seconds} seconds for the page to finish loading..")
time.sleep(wait_seconds)  # adjust the wait as needed

# Second browser instance, used to open the individual article pages
driver2 = webdriver.Chrome(service=Service(executable_path=webdriver_path), options=chrome_options)
print("Collecting all article links on the page, please wait..")
article_urls = driver.find_elements(By.CSS_SELECTOR, 'a')
article_urls = [a.get_attribute('href') for a in article_urls if '_blank' in (a.get_attribute('target') or '')]
article_urls = [url for url in article_urls if url and 'www.bilibili.com/read' in url]
print(f'Found {len(article_urls)} articles in this scan!')

total_url_count = len(article_urls)
current_url_count = 0
for article_url in article_urls:
    current_url_count += 1
    print(f"Loading article {current_url_count}/{total_url_count}...")

    # Open the article page
    driver2.get(article_url)
    time.sleep(1)
    soup = BeautifulSoup(driver2.page_source, "html.parser")
    image_elements = [img['src'] for img in soup.select('img[src]')]

    # Strip query strings from the extracted links
    for i in range(len(image_elements)):
        image_elements[i] = image_elements[i].split("?")[0]
    print(f"Detected {len(image_elements)} images on this page!")

    current_img_count = 0
    total_img_count = len(image_elements)

    # Iterate over the image URLs and download each image
    for image_url in image_elements:
        current_img_count += 1
        filename = image_url.split("/")[-1]
        save_file_path = os.path.join(save_path, filename)  # build the full file path

        if os.path.exists(save_file_path):
            print(f" [article:{current_url_count}] - Image already exists, skipping. Path: {save_file_path}")
            time.sleep(0.2)
        else:
            try:
                # Fetch first, then write, so a failed request does not leave an empty file behind
                response = requests.get(image_url)
                with open(save_file_path, "wb") as f:
                    f.write(response.content)
                print(f" [{current_img_count}/{total_img_count}] in [article:{current_url_count}] - Downloaded {filename}, saved to {save_file_path}")
                time.sleep(0.1)
            except requests.RequestException as e:
                print(f"Request error: {e}")
                print("Waiting ten seconds before moving on to the next image")
                time.sleep(10)
                continue

driver2.quit()
driver.quit()
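Note that the except branch above only waits and then moves on to the next image; it does not actually retry the failed download. If real retries are wanted, the request could be wrapped in a small helper. Below is a minimal sketch, assuming a hypothetical download_with_retry helper; the retry count, wait, and timeout defaults are illustrative, not part of the original script:

def download_with_retry(image_url: str, save_file_path: str,
                        retries: int = 3, wait: float = 10.0, timeout: float = 15.0) -> bool:
    """Hypothetical helper: fetch image_url and write it to save_file_path.

    Retries up to `retries` times, sleeping `wait` seconds between attempts.
    Returns True on success, False if every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(image_url, timeout=timeout)
            response.raise_for_status()  # treat HTTP 4xx/5xx responses as failures too
            with open(save_file_path, "wb") as f:
                f.write(response.content)
            return True
        except requests.RequestException as e:
            print(f"Request error on attempt {attempt}/{retries}: {e}")
            if attempt < retries:
                time.sleep(wait)
    return False

With this helper in place, the try/except block in the inner loop could be replaced by a single download_with_retry(image_url, save_file_path) call, checking the return value to decide whether to report a failure.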
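The script also only processes the single search page hard-coded in base_url (note the page=2 query parameter). To cover several result pages, that parameter could be rewritten per iteration before the scroll-and-collect steps run. A minimal sketch, assuming a hypothetical url_for_page helper; the page range is illustrative:

from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode

def url_for_page(base_url: str, page: int) -> str:
    """Hypothetical helper: return base_url with its 'page' query parameter set to `page`."""
    parts = urlsplit(base_url)
    query = parse_qs(parts.query)
    query['page'] = [str(page)]
    return urlunsplit((parts.scheme, parts.netloc, parts.path,
                       urlencode(query, doseq=True), parts.fragment))

# Example: visit search result pages 1-5 instead of the single hard-coded page
for page in range(1, 6):
    print(url_for_page(base_url, page))
    # driver.get(url_for_page(base_url, page)) would then be followed by
    # the scrolling and link-collection steps shown above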