Apply header spoofing and random User-Agent techniques to the following code:

import requests
import time
import os
import urllib3
import sys
import random
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
The modified code is as follows:
import requests
import time
import os
import urllib3
import sys
import random
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
from colorama import Fore, init
now_time = time.strftime('%Y-%m-%d %H-%M')
# Random User-Agent list
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
]
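A hard-coded list like this goes stale as browser versions move on. If that becomes a concern, the third-party fake-useragent package can supply current strings at runtime. A minimal optional sketch, assuming the package is installed (pip install fake-useragent); it falls back to the static list above otherwise:

# Optional: pull fresh User-Agent strings at runtime (assumes fake-useragent is installed)
try:
    from fake_useragent import UserAgent
    _ua = UserAgent()

    def random_user_agent():
        return _ua.random  # a current, real-world User-Agent string
except ImportError:
    def random_user_agent():
        return random.choice(user_agents)  # fall back to the static list above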
# Read dorks from the input file
def work(dorks):
    with open(dorks, mode='r', encoding='utf-8') as file:
        read_content = file.readlines()
        # Strip whitespace and collect the queries into a list
        content = [result.strip() for result in read_content]
        # Return the query count and the query list for the task pool
        return len(read_content), content
# Google search
def google_search(query, locks, filename):
    try:
        # Suppress HTTPS certificate warnings
        urllib3.disable_warnings()
        filename = os.path.join(os.getcwd(), f'{filename}.txt')
        # Pick a random Google country domain to vary the request target
        domains = ['fr', 'it', 'ca', 'co.uk', 'ru', 'co.jp', 'co.kr', 'com.au', 'co.in', 'com.br',
                   'com.ar', 'co.za', 'co.nz', 'es', 'se', 'nl', 'ch', 'at', 'dk', 'be', 'pl',
                   'fi', 'ie', 'pt', 'gr', 'tw', 'com', 'uk', 'de', 'br', 'kr', 'mx', 'au', 'za']
        random_domain = random.choice(domains)
        url = f'https://www.google.{random_domain}/search?q={query}&num=100'
        # Pick a random User-Agent and spoof the remaining browser headers
        headers = {
            'User-Agent': random.choice(user_agents),
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Referer': 'https://www.google.com/',
            'Origin': 'https://www.google.com',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document'
        }
        # Local proxy
        proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}
        response = requests.get(url=url, headers=headers, proxies=proxies, verify=False, timeout=5)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all result divs
        find_div = soup.find_all('div', {'class': 'yuRUbf'})
        # Collect the result URLs
        get_url = [div.find_next('a')['href'] + '\n' for div in find_div]
        # Serialize counter updates and file writes across threads
        with locks:
            global url_num, dork_finish_num
            url_num += len(get_url)
            dork_finish_num += 1
            print(Fore.GREEN + f'\r{now_time}[INFO]{"-" * 10}>URLs collected: {url_num}  Dorks: {dork_finish_num} / {dork_total_num}' + Fore.RESET, end='')
            # Write the results to the output file
            write_info(filename, get_url)
    except requests.exceptions.Timeout:
        pass
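Spoofed headers alone won't help much if requests fire back-to-back from one IP; a randomized pause makes the traffic pattern less mechanical. A minimal sketch of a hypothetical polite_get helper that google_search could call in place of requests.get; the 1-3 second window is an assumed value, not something the original code tunes:

# Hypothetical helper: add jitter before each request to look less mechanical
def polite_get(url, headers, proxies):
    time.sleep(random.uniform(1, 3))  # assumed 1-3 s window; tune as needed
    return requests.get(url=url, headers=headers, proxies=proxies,
                        verify=False, timeout=5)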
# Write results to the output file
def write_info(filename, get_url):
with open(filename, mode='a+', encoding='utf-8') as file:
file.writelines(get_url)
if __name__ == '__main__':
    while True:
        try:
            init()  # initialize the colorama color module
            dorks_file = input(Fore.YELLOW + f'\n{now_time}[INFO]{"-" * 10}>input file:' + Fore.RESET)
            print('')
            filename = input(Fore.YELLOW + f'\n{now_time}[INFO]{"-" * 10}>output file:' + Fore.RESET)
            # Unpack the (count, queries) tuple returned by work()
            dork_total_num, query_list = work(dorks_file)
            # Global counters: finished dorks / collected URLs
            dork_finish_num = url_num = 0
            # Thread pool size
            threads = ThreadPoolExecutor(max_workers=5)
            # Shared global lock
            threads_lock = Lock()
            # Submit one task per dork to the thread pool
            for dork in query_list:
                threads.submit(google_search, dork, threads_lock, filename)
            threads.shutdown()
            input(Fore.YELLOW + f'\n\n{now_time}[INFO]{"-" * 10}>Finished. Press Enter to exit.' + Fore.RESET)
            break
        # Input file does not exist
        except FileNotFoundError:
            print(Fore.RED + f'{now_time}[Error]{"-" * 10}>File not found' + Fore.RESET)
        # Keyboard interrupt
        except KeyboardInterrupt:
            sys.exit(1)
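To try it out, prepare a plain-text dorks file with one query per line and run the script. The file names below are hypothetical, and note the script appends .txt to the output name itself:

# dorks.txt - one query per line, e.g.:
#   inurl:admin.php
#   intitle:"index of" backup
#
# $ python google_dork.py      (hypothetical script name)
#   input file: dorks.txt
#   output file: results       -> URLs are appended to results.txt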