Hellorf image crawler: batch-download images from a keyword list

The script below works in one of two modes. In mode 1 it searches hellorf.com for the first keyword in KEY_WORDS, collects every matching article link with Selenium, opens each article in a second browser window, and downloads the images it finds. In mode 2 it scrapes the images from a single related-image page. In both modes the page is auto-scrolled several times first so that lazily loaded images are present before extraction.

```python
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

RUNNING_MODE = 1      # [EDIT ME] Run mode: 1 - keyword mode, 2 - related-image page mode
KEY_WORDS = ["自拍"]  # [EDIT ME] Keyword list to search; required when RUNNING_MODE = 1

# [EDIT ME] Directory where downloaded images are saved
save_path = "test1"

# [EDIT ME] Path to the Chrome WebDriver (the chromedriver file next to this script)
webdriver_path = '/Users/geeds/Desktop/Work/Python/ZcoolDL/chromedriver'

# [EDIT ME] How many times to auto-scroll the page before downloading starts
MAX_SCROLL_TIMES = 5

# [OPTIONAL] Extra seconds reserved per scroll for images to load
# (use a larger value on a slow connection, at the cost of a longer wait)
DELAY_PER_TIMES = 0.1

if RUNNING_MODE == 2:
    related_pic_url = "https://www.hellorf.com/image/show/28480515"  # [EDIT ME] target page URL
    base_url = related_pic_url
    print(f"RUNNING_MODE == 2: related-image page mode, page URL: {related_pic_url}")
elif RUNNING_MODE == 1:
    base_url = f"https://www.hellorf.com/image/search?q={KEY_WORDS[0]}"  # (no change needed)
    print(f"RUNNING_MODE == 1: keyword mode, keywords {KEY_WORDS}, page URL: {base_url}")
else:
    # Without a valid mode there is no base_url, so stop here instead of
    # crashing later with a NameError
    raise SystemExit("Please set RUNNING_MODE: 1 - keyword mode, 2 - related-image page mode")

# Normalize the save path and make sure it ends with a separator
save_path = os.path.normpath(save_path) + os.sep
# save_path = save_path + KEY_WORDS[0] + os.sep  # optional: one sub-folder per keyword
if not os.path.exists(save_path):
    os.makedirs(save_path)

# Chrome WebDriver options
chrome_options = Options()
# chrome_options.add_argument('--headless')            # headless mode
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-dev-shm-usage')  # avoid /dev/shm exhaustion
chrome_options.add_argument('--disable-extensions')

# Create the Chrome WebDriver (Selenium 4.6+ locates chromedriver automatically;
# pass an explicit Service with webdriver_path if it cannot)
driver = webdriver.Chrome(options=chrome_options)
# driver = webdriver.Chrome(service=Service(executable_path=webdriver_path), options=chrome_options)

# Load the page
print("Launching the browser..")
driver.get(base_url)

# Scroll to the bottom repeatedly to trigger lazy loading
print("Browser ready, starting simulated scrolling..")
for i in range(1, MAX_SCROLL_TIMES + 1):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)
    # Nudge back up a little so images just above the fold also render
    driver.execute_script("window.scrollBy(0, -666);")
    print(f"Scroll pass {i} done!")
    time.sleep(1)

wait_seconds = max(3, MAX_SCROLL_TIMES * DELAY_PER_TIMES)
print(f"Waiting {wait_seconds} seconds for the page to finish loading..")
time.sleep(wait_seconds)

if RUNNING_MODE == 1:
    driver2 = webdriver.Chrome(options=chrome_options)
    print("Collecting all article links on the page, please wait..")
    link_elements = driver.find_elements(By.CSS_SELECTOR, 'a')
    # get_attribute() may return None, hence the `or ''` and `url and` guards
    article_urls = [a.get_attribute('href') for a in link_elements
                    if '_blank' in (a.get_attribute('target') or '')]
    article_urls = [url for url in article_urls
                    if url and 'www.hellorf.com/image/show/' in url]
    print(f"Found {len(article_urls)} articles this run!")

    total_url_count = len(article_urls)
    for current_url_count, article_url in enumerate(article_urls, start=1):
        print(f"Loading article {current_url_count}/{total_url_count}...")

        # Open the article page in the second browser window
        driver2.get(article_url)
        time.sleep(1)
        soup = BeautifulSoup(driver2.page_source, "html.parser")
        image_urls = [img['src'] for img in soup.select('img[src]')]

        # Strip query strings from the extracted links
        image_urls = [url.split("?")[0] for url in image_urls]
        print(f"Detected {len(image_urls)} images on this page!")

        total_img_count = len(image_urls)
        for current_img_count, image_url in enumerate(image_urls, start=1):
            filename = image_url.split("/")[-1]
            save_file_path = os.path.join(save_path, filename)  # full file path

            if os.path.exists(save_file_path):
                print(f"  [article:{current_url_count}] - image exists, skipping: {save_file_path}")
                time.sleep(0.2)
            else:
                try:
                    # Fetch first, then write, so a failed request
                    # does not leave an empty file behind
                    response = requests.get(image_url, timeout=15)
                    with open(save_file_path, "wb") as f:
                        f.write(response.content)
                    print(f"  [{current_img_count}/{total_img_count}] in "
                          f"[article:{current_url_count}] - downloaded {filename} "
                          f"to {save_file_path}")
                    time.sleep(0.1)
                except requests.RequestException as e:
                    print(f"Request error: {e}")
                    print("Waiting ten seconds before moving on")
                    time.sleep(10)
                    continue

elif RUNNING_MODE == 2:
    # Extract image URLs from the already loaded page
    page_source = driver.page_source
    pattern = r'<img src="(.*?)" class="image">'
    image_urls = re.findall(pattern, page_source)

    # Strip query strings from the extracted links
    image_urls = [url.split("?")[0] for url in image_urls]
    print(f"Detected {len(image_urls)} images on this page!")

    total_img_count = len(image_urls)
    for current_img_count, image_url in enumerate(image_urls, start=1):
        filename = image_url.split("/")[-1]
        save_file_path = os.path.join(save_path, filename)  # full file path

        if os.path.exists(save_file_path):
            print(f"  Image exists, skipping: {save_file_path}")
            time.sleep(0.2)
        else:
            try:
                # Fetch first, then write, so a failed request
                # does not leave an empty file behind
                response = requests.get(image_url, timeout=15)
                with open(save_file_path, "wb") as f:
                    f.write(response.content)
                print(f"  [{current_img_count}/{total_img_count}] - downloaded "
                      f"{filename} to {save_file_path}")
                time.sleep(0.6)
            except requests.RequestException as e:
                print(f"Request error: {e}")
                print("Waiting ten seconds before moving on")
                time.sleep(10)
                continue
```
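One caveat worth noting: the except branch above says it waits ten seconds, but it never re-attempts the failed image; it just sleeps and moves on to the next one. If an actual retry is wanted, the inline requests.get call could be swapped for a small wrapper along these lines. This is a minimal sketch; download_with_retry and its parameters are illustrative names, not part of the original script.

```python
import time
import requests

def download_with_retry(url: str, dest_path: str, max_retries: int = 3,
                        wait_seconds: float = 10.0, timeout: float = 15.0) -> bool:
    """Hypothetical helper: fetch `url` and write it to `dest_path`,
    re-attempting up to `max_retries` times on network or HTTP errors."""
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # treat HTTP 4xx/5xx as failures too
            with open(dest_path, "wb") as f:
                f.write(response.content)
            return True
        except requests.RequestException as e:
            print(f"Request error on attempt {attempt}/{max_retries}: {e}")
            if attempt < max_retries:
                time.sleep(wait_seconds)
    return False  # all attempts failed; caller decides whether to skip or abort
```

In the download loops this would collapse the whole try/except block into a single `if not download_with_retry(image_url, save_file_path): continue`.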
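The fixed MAX_SCROLL_TIMES loop is simple but can under-scroll a long result list or waste time on a short one. Assuming the search page uses infinite scroll, a common alternative is to keep scrolling until document.body.scrollHeight stops growing. The sketch below is hypothetical; scroll_until_stable is not part of the original script.

```python
import time

def scroll_until_stable(driver, max_rounds: int = 30, pause: float = 1.0) -> None:
    """Hypothetical helper: scroll to the bottom repeatedly until the page
    height stops growing, i.e. the infinite scroll has run out of content."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # give lazy-loaded images time to arrive
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # nothing new loaded; we have reached the end
        last_height = new_height
```

With this in place, the fixed-count scroll loop and the trailing wait could both be replaced by a single `scroll_until_stable(driver)` call.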

