import os
import re
import time
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# ---------------------------------------------------------------------------
# User configuration
# ---------------------------------------------------------------------------

# [EDIT ME] Running mode: 1 = keyword search mode, 2 = related-images page mode.
RUNNING_MODE = 1

# [EDIT ME] Keywords to search for. Must be set when RUNNING_MODE == 1.
KEY_WORDS = ["自拍"]

# [EDIT ME] Directory under which downloaded images are saved.
save_path = "test1"

# [EDIT ME] Path to the Chrome WebDriver binary (the chromedriver file that
# sits next to this program).
webdriver_path = '/Users/geeds/Desktop/Work/Python/ZcoolDL/chromedriver'

# [EDIT ME] Maximum number of scroll passes performed after the page opens,
# before downloading starts.
MAX_SCROLL_TIMES = 5
# Choose the page to open according to the configured running mode.
if RUNNING_MODE == 2:
    # [EDIT ME] URL of the image page to open in related-images mode.
    related_pic_url = "https://www.hellorf.com/image/show/28480515"
    base_url = related_pic_url
    print(f"RUNNING_MODE == 2: 相关图片页面模式,相关页面url为{related_pic_url}")
elif RUNNING_MODE == 1:
    # (No need to edit.) Search URL built from the first keyword only.
    # The keyword is percent-encoded so that spaces, "&", "+", "#" and similar
    # characters cannot corrupt the query string.
    _encoded_keyword = quote(KEY_WORDS[0], safe="")
    base_url = f"https://www.hellorf.com/image/search?q={_encoded_keyword}"
    print(f"RUNNING_MODE == 1: 关键词模式,关键词为「{KEY_WORDS}」,相关页面url为{base_url}")
else:
    # Stop here: without a valid mode there is no base_url, and continuing
    # would only fail later with a NameError.
    raise SystemExit("请设置程序运行模式 RUNNING_MODE:1-关键词模式、2-相关图片页面模式")
# Normalize the configured path, append a sub-folder named after the first
# keyword, and make sure the result ends with a path separator.
# (The original referenced an undefined name `KEY_WORD`; the keyword list is
# `KEY_WORDS`, and the search URL is likewise built from its first entry.)
save_path = os.path.normpath(save_path) + os.sep
save_path = save_path + KEY_WORDS[0] + os.sep

# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(save_path, exist_ok=True)
# [Optional] Seconds reserved per scroll pass for the page's images to load.
# Use a larger value on a slow network, at the cost of a longer wait.
DELAY_PER_TIMES = 0.1
# Configure the Chrome WebDriver options.
chrome_options = Options()
chrome_options.add_argument('--headless')  # run without a visible window
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument("--disable-dev-shm-usage")  # do not use /dev/shm
chrome_options.add_argument("--disable-extensions")  # disable extensions

# Create the Chrome WebDriver sessions.
# NOTE(review): `driver` is started with default settings, so it ignores both
# `chrome_options` and `webdriver_path`; only `driver2` uses them. Confirm
# whether the listing browser is meant to be visible.
driver = webdriver.Chrome()
driver2 = webdriver.Chrome(service=Service(executable_path=webdriver_path), options=chrome_options)
# Load the page.
print("正在召唤浏览器..")
driver.get(base_url)

# Simulate scrolling down to the bottom of the page, MAX_SCROLL_TIMES times.
print("浏览器召唤完毕,准备进行模拟下滑操作..")

# The scripts never change, so they are built once outside the loop.
scroll_to_bottom_script = "window.scrollTo(0, document.body.scrollHeight);"
scroll_upper_script = "window.window.scrollBy(0, -666);;"

for i in range(1, MAX_SCROLL_TIMES + 1):
    driver.execute_script(scroll_to_bottom_script)
    time.sleep(0.5)
    # Nudge the viewport back up by 666px after reaching the bottom.
    # NOTE(review): presumably this re-triggers lazy loading - confirm.
    driver.execute_script(scroll_upper_script)
    print(f"正在努力进行第{i}次下滑!")
    time.sleep(1)

# Wait for the page to finish loading: at least 3 seconds, or longer when the
# per-scroll delay budget exceeds that. Adjust DELAY_PER_TIMES as needed.
load_wait = max(3, (MAX_SCROLL_TIMES * DELAY_PER_TIMES))
print(f"等待 {load_wait} 秒,以等待页面加载完成..")
time.sleep(load_wait)
def _strip_query(url):
    """Return *url* with everything from the first "?" onwards removed."""
    return url.split("?")[0]


def _fetch_image(image_url, attempts=2, timeout=30):
    """Download *image_url* and return its bytes, or None if every attempt fails.

    A failed attempt prints the error; if attempts remain, the function waits
    ten seconds and tries again. HTTP error statuses count as failures, so an
    error page is never returned as image data.
    """
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(image_url, timeout=timeout)
            response.raise_for_status()
            return response.content
        except requests.RequestException as e:
            print(f"请求错误: {e}")
            if attempt < attempts:
                print("等待十秒进行重试")
                time.sleep(10)
    return None


def _download_images(image_urls, article_index=None, pause=0.1):
    """Save every URL in *image_urls* into ``save_path``, skipping existing files.

    *article_index* is the 1-based article number shown in progress messages
    (None in related-images mode, where there is no article). *pause* is the
    delay in seconds after each successful download.
    """
    total_img_count = len(image_urls)
    article_tag = "" if article_index is None else f" in [article:{article_index}]"

    for current_img_count, image_url in enumerate(image_urls, start=1):
        filename = image_url.split("/")[-1]
        save_file_path = os.path.join(save_path, filename)  # full destination path

        if os.path.exists(save_file_path):
            if article_index is None:
                print(f" 图片已存在,跳过下载。路径:{save_file_path}")
            else:
                print(f" [article:{article_index}] - 图片已存在,跳过下载。路径:{save_file_path}")
            time.sleep(0.2)
            continue

        # Fetch BEFORE opening the file: a failed request must not leave an
        # empty file behind, because later runs would treat it as downloaded.
        content = _fetch_image(image_url)
        if content is None:
            continue

        with open(save_file_path, "wb") as f:
            f.write(content)
        print(f" [{current_img_count}/{total_img_count}]{article_tag} - 已下载图片:{filename},保存路径:{save_file_path}")
        time.sleep(pause)


try:
    if RUNNING_MODE == 1:
        # The existing driver2 session is replaced by a default-settings one;
        # quit the old session first so its browser process is not leaked.
        driver2.quit()
        driver2 = webdriver.Chrome()

        print("开始抓取页面所有文章,请稍候..")
        article_urls = []
        for anchor in driver.find_elements(By.CSS_SELECTOR, 'a'):
            # get_attribute may return None; treat that as "no value" instead
            # of letting the `in` test raise TypeError.
            target = anchor.get_attribute('target') or ""
            href = anchor.get_attribute('href') or ""
            if '_blank' in target and 'www.hellorf.com/image/show/' in href:
                article_urls.append(href)
        # Drop duplicate links (order preserved) so no article loads twice.
        article_urls = list(dict.fromkeys(article_urls))
        print(f'本次共扫描到 {len(article_urls)} 篇文章!')

        total_url_count = len(article_urls)
        for current_url_count, article_url in enumerate(article_urls, start=1):
            print(f"正在加载第 {current_url_count}/{total_url_count} 篇文章内容...")

            # Open the article page.
            driver2.get(article_url)
            time.sleep(1)
            soup = BeautifulSoup(driver2.page_source, "html.parser")

            # Collect every <img src=...> and cut off the query string.
            image_elements = [_strip_query(img['src']) for img in soup.select('img[src]')]
            print(f"当前页面共检测到 {len(image_elements)} 张图片!")

            # Download each image of this article.
            _download_images(image_elements, article_index=current_url_count, pause=0.1)

    elif RUNNING_MODE == 2:
        # Work on the image page that `driver` already has loaded.
        page_html = driver.page_source
        pattern = r'<img\s+src="(.*?)"\s+class="image">'

        # Extract the matching image links and cut off the query string.
        image_elements = [_strip_query(src) for src in re.findall(pattern, page_html)]
        print(f"当前页面共检测到 {len(image_elements)} 张图片!")

        # Download each image.
        _download_images(image_elements, pause=0.6)
finally:
    # Always close both browser sessions, even when scraping fails.
    driver.quit()
    driver2.quit()
# Original source: https://www.cveoy.top/t/topic/i0tN - copyright belongs to the author; do not repost or scrape.