根据优化建议修改代码，给出最终优化并调试过的代码。优化建议如下：1. 可以使用多进程或协程来替换线程池，以提高效率；2. 将请求头、域名列表等常量定义为全局变量，避免重复定义；3. 使用更快的方式写入文件，例如使用文件缓冲区。原始代码（节选，已截断）：import requests / import time / import os / import urllib3 / import sys / import random / from bs4 import Bea…
import os
import random
import sys
import time
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Lock, Manager

import requests
import urllib3
from bs4 import BeautifulSoup
from colorama import Fore, init
# Global constants, defined once at module level so every worker reuses them.

# Google domain suffixes; a search request is sent to
# https://www.google.<suffix>/search.
# NOTE(review): some short suffixes below (e.g. 'uk', 'br', 'kr', 'mx', 'au',
# 'za', 'tw') may not be real Google hosts, and 'ca' is listed twice --
# confirm and prune if requests to them keep failing.
DOMAINS = [
    'fr', 'it', 'ca', 'co.uk', 'ru', 'co.jp', 'co.kr', 'com.au', 'co.in',
    'com.br', 'com.ar', 'co.za', 'co.nz', 'es', 'se', 'nl', 'ch', 'at', 'dk',
    'be', 'pl', 'fi', 'ie', 'pt', 'gr', 'tw', 'com', 'uk', 'de', 'br', 'ca',
    'kr', 'mx', 'au', 'za',
]

# Browser-like headers attached to every search request.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'accept-language': 'zh-CN,zh;q=0.9',
    # The "*/*" wildcard had been reduced to "/" (which is not a valid media
    # range); restored here.
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'referer': 'https://www.google.com/',
    'origin': 'https://www.google.com',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
}

# Timestamp captured once when the module is loaded; used as the prefix of
# console messages.
now_time = time.strftime('%Y-%m-%d %H-%M')
# Read dorks
def read_file(file_path):
    """Load search dorks from a UTF-8 text file, one dork per line.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped so that no empty query is ever scheduled, and the returned count
    always matches the returned list.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(count, dorks)`` tuple: the number of dorks and the list of
        stripped dork strings, in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    with open(file_path, mode='r', encoding='utf-8') as file:
        # Iterate the file lazily instead of materialising readlines() first.
        stripped_lines = (line.strip() for line in file)
        content = [dork for dork in stripped_lines if dork]
    return len(content), content
# Google search
def google_search(query, locks, url_num, dork_finish_num, filename):
    """Run one Google search for ``query`` and append the result URLs to a file.

    The request goes to a randomly chosen Google domain through the local
    proxy at 127.0.0.1:7890. Result links are taken from the anchors that
    follow each ``div.yuRUbf`` element. The shared counters, the progress line
    and the output file are only touched while ``locks`` is held.

    Args:
        query: The dork / search expression.
        locks: Lock shared by all workers (any context-manager lock).
        url_num: Shared counter object whose ``.value`` is the number of URLs
            collected so far.
        dork_finish_num: Shared counter object whose ``.value`` is the number
            of dorks processed so far.
        filename: Output file name without extension; ``.txt`` is appended and
            the file is placed in the current working directory.

    Returns:
        None. A failed request is skipped silently (best effort) and does not
        advance the counters.
    """
    # verify=False below triggers urllib3 warnings; silence them.
    urllib3.disable_warnings()
    filename = os.path.join(os.getcwd(), f'{filename}.txt')
    random_domain = random.choice(DOMAINS)
    url = f'https://www.google.{random_domain}/search'
    # Local proxy
    proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}

    try:
        # Passing the query via ``params`` lets requests URL-encode it; dorks
        # routinely contain '&', '#', '+', quotes and spaces.
        response = requests.get(url=url, params={'q': query, 'num': 100},
                                headers=HEADERS, proxies=proxies,
                                verify=False, timeout=5)
    except requests.exceptions.RequestException:
        # requests raises its own Timeout/ConnectionError types, not the
        # builtin TimeoutError, so catch the requests base class instead.
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    # Build the URL list before taking the lock so parsing problems can never
    # leave the lock held, and skip result blocks without a usable link.
    get_url = []
    for result_div in soup.find_all('div', {'class': 'yuRUbf'}):
        anchor = result_div.find_next('a', href=True)
        if anchor is not None:
            get_url.append(anchor['href'] + '\n')

    # The total is a module global that only exists when the main script set
    # it in this process (fork start method); fall back to '?' otherwise
    # instead of raising NameError inside the worker.
    total = globals().get('dork_total_num', '?')

    # ``with`` guarantees the lock is released even if printing/writing fails.
    with locks:
        url_num.value += len(get_url)
        dork_finish_num.value += 1
        # flush so the '\r' progress line is shown immediately despite end=''.
        print(Fore.GREEN + f'\r{now_time}[INFO]{ "-" * 10}>已获取Url数量:{url_num.value} Dorsk数量:{dork_finish_num.value} / {total}', end=Fore.RESET, flush=True)
        # Append the URLs to the output file
        write_info(filename, get_url)
# Write results to file
def write_info(filename, get_url):
    """Append the collected URL lines to ``filename``.

    Args:
        filename: Path of the output file; it is created if missing.
        get_url: Iterable of strings, each already terminated with a newline.
    """
    # Append-only mode is sufficient (the file is never read back here);
    # writelines hands the whole batch to the buffered writer in one call.
    with open(filename, mode='a', encoding='utf-8') as file:
        file.writelines(get_url)
if __name__ == '__main__':
    # Interactive entry point: ask for the dork file and the output name, fan
    # the dorks out over a process pool, then wait for Enter before exiting.
    init()  # initialise colorama
    while True:
        try:
            dorks_file = input(Fore.YELLOW + f'\n{now_time}[INFO]{ "-" * 10}>input file:' + Fore.RESET)
            print('')
            filename = input(Fore.YELLOW + f'\n{now_time}[INFO]{ "-" * 10}>output file:' + Fore.RESET)
            # read_file returns (number of dorks, list of dorks)
            dork_total_num, query_list = read_file(dorks_file)

            # The manager hosts the shared state; using it as a context
            # manager shuts its helper process down when the run is over.
            with Manager() as manager:
                # Shared counters: finished dorks / collected URLs.
                # The first argument is a typecode ('i' = int), not a name.
                dork_finish_num = manager.Value('i', 0)
                url_num = manager.Value('i', 0)
                # A plain multiprocessing.Lock cannot be pickled into
                # ProcessPoolExecutor.submit(); a manager lock proxy can.
                lock = manager.Lock()

                # Pool of 5 worker processes; leaving the ``with`` block waits
                # for every submitted task to finish.
                with ProcessPoolExecutor(max_workers=5) as processes:
                    for dork in query_list:
                        processes.submit(google_search, dork, lock, url_num,
                                         dork_finish_num, filename)

            input(Fore.YELLOW + f'\n\n{now_time}[INFO]{"-" * 10}>程序运行完毕,按回车退出' + Fore.RESET)
            break
        # Input file does not exist: report it and ask again
        except FileNotFoundError:
            print(Fore.RED + f'{now_time}[Error]{"-" * 10}>文件不存在' + Fore.RESET)
        # Ctrl+C: exit with a non-zero status
        except KeyboardInterrupt:
            sys.exit(1)
原文地址: https://www.cveoy.top/t/topic/g9gp 著作权归作者所有。请勿转载和采集!