Python Baidu Image Crawler Script - Supports a Custom Image Filename Prefix
import os
import time

import requests
import tqdm

# [Edit this] directory where downloaded images will be saved
save_path = "中性风"
TIME_GAP = 2.0    # seconds to wait between two downloads; too fast risks getting blocked, 2~3 recommended
TIME_SLEEP = 10.0 # seconds to pause after hitting the site's rate limit before continuing, 10 recommended

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}


def getImg(url, idx, path):
    # Download a single image and save it as <path><idx>.jpg
    img = requests.get(url, headers=header)
    with open(path + str(idx) + '.jpg', 'wb') as file:
        file.write(img.content)


search = input("Search keyword: ")
number = int(input("Number of images: "))
prefix = input("Filename prefix: ")
save_path = os.path.normpath(save_path) + os.sep
path = save_path + search + '/'
# Create the output folder if it does not exist
if not os.path.exists(path):
    os.makedirs(path)

bar = tqdm.tqdm(total=number)
page = 0
while True:
    if number == 0:
        break
    # Baidu Images paginated JSON API: 'pn' is the result offset, 'rn' the page size
    url = 'https://image.baidu.com/search/acjson'
    params = {
        "tn": "resultjson_com", "logid": "11555092689241190059", "ipn": "rj",
        "ct": "201326592", "is": "", "fp": "result",
        "queryWord": search, "cl": "2", "lm": "-1", "ie": "utf-8", "oe": "utf-8",
        "adpicid": "", "st": "-1", "z": "", "ic": "0", "hd": "", "latest": "",
        "copyright": "", "word": search, "s": "", "se": "", "tab": "",
        "width": "", "height": "", "face": "0", "istype": "2", "qc": "",
        "nc": "1", "fr": "", "expermode": "", "force": "",
        "pn": str(60 * page), "rn": number, "gsm": "1e", "1617626956685": ""
    }
    result = requests.get(url, headers=header, params=params).json()
    # Collect the image URLs from the current page of results
    url_list = []
    for data in result['data'][:-1]:
        for data2 in data['replaceUrl']:
            url_list.append(data2['ObjUrl'])
            # print(url_list)
    for i in range(len(url_list)):
        try:
            getImg(url_list[i], prefix + str(60 * page + i), path)
            bar.update(1)
            time.sleep(TIME_GAP)
            number -= 1
            if number == 0:
                break
        except Exception:
            # Rate-limited (or a bad URL): wait a while and keep going
            print('\nRequest limit reached, pausing before continuing...')
            time.sleep(TIME_SLEEP)
    page += 1
bar.close()
print("\nfinish!")
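The custom prefix from the title is applied purely at file-naming time: getImg writes to path + str(idx) + '.jpg', and the caller passes idx = prefix + str(60 * page + i). A tiny sketch with made-up values (the prefix "cat_", page 1, index 7, and the folder "中性风/cat/" below are just placeholders) shows the filename that results:

# Hypothetical values, only to illustrate how the final filename is built
prefix, page, i = "cat_", 1, 7
path = "中性风/cat/"
idx = prefix + str(60 * page + i)
print(path + str(idx) + '.jpg')   # prints: 中性风/cat/cat_67.jpg

So with a prefix of "cat_", the downloaded files are numbered cat_0.jpg, cat_1.jpg, and so on, inside a subfolder named after the search keyword.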