import os
import re
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# ---------------------------------------------------------------------------
# User configuration
# ---------------------------------------------------------------------------

# [EDIT ME] Running mode: 1 = keyword search mode, 2 = related-image page mode.
RUNNING_MODE = 1

# [EDIT ME] Keywords to search for. Must be set when RUNNING_MODE == 1.
# Only the first entry is used to build the search URL below.
KEY_WORDS = ["自拍"]

# [EDIT ME] Directory the downloaded images are saved into.
save_path = "test1"

# [EDIT ME] Path to the chromedriver executable (the chromedriver file that
# sits next to this program).
webdriver_path = '/Users/geeds/Desktop/Work/Python/ZcoolDL/chromedriver'

# [EDIT ME] Maximum number of scrolls: how many times the opened page is
# scrolled automatically before downloading starts.
MAX_SCROLL_TIMES = 5

if RUNNING_MODE == 2:
    # [EDIT ME] Replace with the URL of the target image page.
    related_pic_url = "https://www.hellorf.com/image/show/28480515"
    base_url = related_pic_url
    print(f"RUNNING_MODE == 2: 相关图片页面模式,相关页面url为{related_pic_url}")
elif RUNNING_MODE == 1:
    if not KEY_WORDS:
        # Without a keyword there is no search URL to open.
        print("RUNNING_MODE == 1 时,请设置 KEY_WORDS!")
        raise SystemExit(1)
    # (No need to edit) search URL built from the first keyword.
    base_url = f"https://www.hellorf.com/image/search?q={KEY_WORDS[0]}"
    print(f"RUNNING_MODE == 1: 关键词模式,关键词为「{KEY_WORDS}」,相关页面url为{base_url}")
else:
    print(f"请设置程序运行模式 RUNNING_MODE:1-关键词模式、2-相关图片页面模式")
    # Stop here: with no valid mode, base_url is undefined and the rest of the
    # script would only fail later with a NameError.
    raise SystemExit(1)

# Normalize the path and make sure save_path ends with a path separator.
save_path = os.path.normpath(save_path) + os.sep

# NOTE(review): a per-keyword sub-folder line used to follow here, but it
# referenced the undefined name KEY_WORD (NameError) and looks like disabled
# code; images are therefore written directly into save_path — confirm.

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(save_path, exist_ok=True)

# [OPTIONAL] Seconds of loading time reserved per scroll for the page images
# (use a larger value on a slow network, at the cost of a longer wait).
DELAY_PER_TIMES = 0.1

# [OPTIONAL] Run Chrome without a visible window. Off by default: previously
# '--headless' was only ever applied to a browser instance that was never
# used for scraping, so the working browsers were always visible.
HEADLESS = False

# Seconds before a single image request is abandoned.
REQUEST_TIMEOUT = 30

# Total tries per image (first attempt plus retries after a 10 s pause).
MAX_DOWNLOAD_ATTEMPTS = 2

# Matches the <img ... class="image"> tags of a related-image page (mode 2).
IMAGE_TAG_PATTERN = r'<img\s+src="(.*?)"\s+class="image">'


def create_browser():
    """Start a Chrome WebDriver with the script's shared options.

    The chromedriver at ``webdriver_path`` is used when that file exists;
    otherwise Selenium's default driver lookup is used.
    """
    chrome_options = Options()
    if HEADLESS:
        chrome_options.add_argument('--headless')  # no-window mode
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument("--disable-dev-shm-usage")  # do not use /dev/shm
    chrome_options.add_argument("--disable-extensions")  # disable extensions

    if os.path.isfile(webdriver_path):
        service = Service(executable_path=webdriver_path)
        return webdriver.Chrome(service=service, options=chrome_options)
    return webdriver.Chrome(options=chrome_options)


def scroll_listing(browser):
    """Scroll the loaded page to the bottom ``MAX_SCROLL_TIMES`` times."""
    scroll_to_bottom_script = "window.scrollTo(0, document.body.scrollHeight);"
    scroll_upper_script = "window.window.scrollBy(0, -666);;"
    for i in range(1, MAX_SCROLL_TIMES + 1):
        browser.execute_script(scroll_to_bottom_script)
        time.sleep(0.5)
        # Step back up a little after hitting the bottom — presumably so the
        # next jump down re-triggers the page's lazy loading (TODO confirm).
        browser.execute_script(scroll_upper_script)
        print(f"正在努力进行第{i}次下滑!")
        time.sleep(1)


def strip_query(url: str) -> str:
    """Return *url* without its ``?query`` part."""
    return url.split("?")[0]


def clean_image_urls(raw_urls):
    """Return the downloadable image URLs from *raw_urls*, in order.

    Query strings are removed, protocol-relative URLs (``//host/...``) are
    given an ``https:`` scheme, and entries that cannot be fetched over
    HTTP(S) or have no file name (``data:`` URIs, relative paths, URLs
    ending in ``/``) are dropped instead of being handed to ``requests``.
    """
    cleaned = []
    for raw in raw_urls:
        url = strip_query(raw.strip())
        if url.startswith("//"):
            url = "https:" + url
        if not url.startswith(("http://", "https://")):
            continue
        if not url.rsplit("/", 1)[-1]:
            continue
        cleaned.append(url)
    return cleaned


def find_related_image_urls(page_html: str):
    """Return the ``src`` values of all ``class="image"`` tags in *page_html*."""
    return re.findall(IMAGE_TAG_PATTERN, page_html)


def collect_article_urls(browser):
    """Return hrefs of image-detail links that open in a new tab.

    Anchors lacking a ``target`` or ``href`` attribute are skipped;
    ``get_attribute`` yields ``None`` for them, which previously made the
    ``in`` test raise ``TypeError``.
    """
    article_urls = []
    for anchor in browser.find_elements(By.CSS_SELECTOR, 'a'):
        target = anchor.get_attribute('target') or ''
        href = anchor.get_attribute('href') or ''
        if '_blank' in target and 'www.hellorf.com/image/show/' in href:
            article_urls.append(href)
    return article_urls


def download_images(image_urls, article_index=None, pause=0.1):
    """Download every URL in *image_urls* into ``save_path``.

    Args:
        image_urls: Absolute image URLs; the last path segment is the file name.
        article_index: 1-based article number shown in progress messages, or
            ``None`` when not downloading from an article list.
        pause: Seconds to sleep after each successful download.

    Files that already exist are skipped. A file is only written after the
    request succeeded, so a failed request never leaves an empty file behind
    that later runs would mistake for a finished download.
    """
    total_img_count = len(image_urls)
    article_tag = "" if article_index is None else f" in [article:{article_index}]"

    for current_img_count, image_url in enumerate(image_urls, start=1):
        filename = image_url.split("/")[-1]
        save_file_path = os.path.join(save_path, filename)  # full target path

        if os.path.exists(save_file_path):
            if article_index is None:
                print(f" 图片已存在,跳过下载。路径:{save_file_path}")
            else:
                print(f" [article:{article_index}] - 图片已存在,跳过下载。路径:{save_file_path}")
            time.sleep(0.2)
            continue

        for attempt in range(1, MAX_DOWNLOAD_ATTEMPTS + 1):
            try:
                response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
                # Treat HTTP error pages as failures rather than saving them.
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"请求错误: {e}")
                if attempt < MAX_DOWNLOAD_ATTEMPTS:
                    print(f"等待十秒进行重试")
                    time.sleep(10)
                continue

            with open(save_file_path, "wb") as f:
                f.write(response.content)
            print(f" [{current_img_count}/{total_img_count}]{article_tag} - 已下载图片:{filename},保存路径:{save_file_path}")
            time.sleep(pause)
            break


def scrape_keyword_mode(listing_browser, article_browser):
    """Mode 1: open every article linked from the listing and save its images."""
    print(f"开始抓取页面所有文章,请稍候..")
    article_urls = collect_article_urls(listing_browser)
    print(f'本次共扫描到 {len(article_urls)} 篇文章!')

    total_url_count = len(article_urls)
    for current_url_count, article_url in enumerate(article_urls, start=1):
        print(f"正在加载第 {current_url_count}/{total_url_count} 篇文章内容...")

        # Open the article page.
        article_browser.get(article_url)
        time.sleep(1)
        soup = BeautifulSoup(article_browser.page_source, "html.parser")
        image_urls = clean_image_urls(img['src'] for img in soup.select('img[src]'))
        print(f"当前页面共检测到 {len(image_urls)} 张图片!")

        download_images(image_urls, article_index=current_url_count, pause=0.1)


def scrape_related_mode(browser):
    """Mode 2: save the ``class="image"`` pictures of the already loaded page."""
    image_urls = clean_image_urls(find_related_image_urls(browser.page_source))
    print(f"当前页面共检测到 {len(image_urls)} 张图片!")
    download_images(image_urls, pause=0.6)


def main():
    """Load ``base_url``, scroll it, then download images per ``RUNNING_MODE``."""
    driver = create_browser()
    driver2 = None  # second browser, only needed for article pages in mode 1
    try:
        # Load the page.
        print(f"正在召唤浏览器..")
        driver.get(base_url)

        # Simulate scrolling down to the bottom of the page.
        print(f"浏览器召唤完毕,准备进行模拟下滑操作..")
        scroll_listing(driver)

        # Wait for the page to finish loading; tune DELAY_PER_TIMES as needed.
        wait_seconds = max(3, (MAX_SCROLL_TIMES * DELAY_PER_TIMES))
        print(f"等待 {wait_seconds} 秒,以等待页面加载完成..")
        time.sleep(wait_seconds)

        if RUNNING_MODE == 1:
            driver2 = create_browser()
            scrape_keyword_mode(driver, driver2)
        elif RUNNING_MODE == 2:
            scrape_related_mode(driver)
    finally:
        # Always shut the browsers down so no Chrome processes are left behind.
        for browser in (driver, driver2):
            if browser is not None:
                browser.quit()


if __name__ == "__main__":
    main()
# 原文地址: https://www.cveoy.top/t/topic/i0tN 著作权归作者所有。请勿转载和采集!

# 免费AI点我,无需注册和登录