import os
import re
import time

import requests
import tqdm

# [Modify as needed] Directory where downloaded images are saved.
save_path = '自拍'

# Seconds to wait between downloading two images. Crawling too fast risks
# being blocked by the site; 2-3 seconds is recommended.
TIME_GAP = 2.0
# Seconds to sleep after exceeding the site's rate limit before resuming;
# 10 is recommended.
TIME_SLEEP = 10.0

# Browser-like User-Agent so requests are not rejected as an obvious bot.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}

def getImg(url, idx, path):
    """Download the image at *url* and save it as ``<path><idx>.jpg``.

    Parameters
    ----------
    url : str
        Direct URL of the image to download.
    idx : str or int
        Filename stem for the saved image (converted with ``str``).
    path : str
        Directory prefix; expected to already end with a path separator.
    """
    # timeout prevents a single stalled request from hanging the crawl forever.
    img = requests.get(url, headers=header, timeout=30)
    # Context manager guarantees the file handle is closed even if the
    # write raises, unlike the original open/write/close sequence.
    with open(path + str(idx) + '.jpg', 'wb') as f:
        f.write(img.content)

# Read three parallel, comma-separated lists from the user: search terms,
# the number of images wanted per term, and the filename prefix per term.
search_list = input("请输入搜索内容,以逗号分隔:").split(',')
number_list = list(map(int, input("请输入每个搜索内容需求数量,以逗号分隔:").split(',')))
prefix_list = input("请输入命名关键词,以逗号分隔:").split(',')

# The three lists must line up one-to-one; abort otherwise.
if not (len(search_list) == len(number_list) == len(prefix_list)):
    print("输入的搜索内容、需求数量和命名关键词数量不一致,请重新输入!")
    exit()

# Normalise the save path so it always ends with exactly one separator.
save_path = os.path.normpath(save_path) + os.sep

# Crawl each search term with its requested count and filename prefix.
# zip replaces the original index loop, whose index variable `i` was
# shadowed (and corrupted) by the inner download loop's `i`.
for search, number, prefix in zip(search_list, number_list, prefix_list):
    # Each search term gets its own sub-directory under save_path.
    path = save_path + search + '/'
    if not os.path.exists(path):
        os.makedirs(path)

    bar = tqdm.tqdm(total=number)
    page = 0
    while number > 0:
        # Baidu's AJAX image-search endpoint; returns one JSON page of results.
        url = 'https://image.baidu.com/search/acjson'
        params = {
            "tn": "resultjson_com",
            "logid": "11555092689241190059",
            "ipn": "rj",
            "ct": "201326592",
            "is": "",
            "fp": "result",
            "queryWord": search,
            "cl": "2",
            "lm": "-1",
            "ie": "utf-8",
            "oe": "utf-8",
            "adpicid": "",
            "st": "-1",
            "z": "",
            "ic": "0",
            "hd": "",
            "latest": "",
            "copyright": "",
            "word": search,
            "s": "",
            "se": "",
            "tab": "",
            "width": "",
            "height": "",
            "face": "0",
            "istype": "2",
            "qc": "",
            "nc": "1",
            "fr": "",
            "expermode": "",
            "force": "",
            # Result offset: the code assumes 60 results per page.
            "pn": str(60 * page),
            # NOTE(review): page size is set to the remaining total, yet `pn`
            # strides by 60 — confirm against the endpoint's actual paging.
            "rn": number,
            "gsm": "1e",
            "1617626956685": ""
        }
        result = requests.get(url, headers=header, params=params).json()

        # Collect every candidate image URL on this page; the last entry of
        # result['data'] is skipped (it is not an image record).
        url_list = [alt['ObjUrl']
                    for item in result['data'][:-1]
                    for alt in item['replaceUrl']]

        for offset, img_url in enumerate(url_list):
            try:
                getImg(img_url, prefix + str(60 * page + offset), path)
                bar.update(1)
                time.sleep(TIME_GAP)  # throttle to avoid being rate-limited
                number -= 1
                if number == 0:
                    break
            except Exception:
                # Likely rate-limited (or a single dead URL): back off using
                # the configured TIME_SLEEP instead of a hard-coded value,
                # then continue with the remaining URLs.
                print('\n请求超出网站限制,歇会儿再继续..')
                time.sleep(TIME_SLEEP)
        page += 1
    bar.close()

print(" finish!")

# Python batch crawler for Baidu Images: fetch images by a list of keywords
# and save them into per-keyword directories.
#
# Original article: https://www.cveoy.top/t/topic/qiqx
# Copyright belongs to the author. Do not repost or scrape.