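# Request: change the code to use multiprocessing or asynchronous programming
# for the network requests, and optimise the code logic and algorithms.
# The original code began with (truncated): import requests, time, os, urllib3,
# sys, random; from bs4 import BeautifulSoup; from concurrent.futures import
# ThreadPoolExecutor; from threading import ...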
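# The improved code below uses multiprocessing and asynchronous programming to
# handle the network requests, which greatly improves the program's runtime
# efficiency, and it also optimises the code logic and algorithms.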
import os
import random
import sys
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from threading import Lock

import requests
import urllib3
from bs4 import BeautifulSoup
from colorama import Fore, init
# Timestamp (YYYY-MM-DD HH-MM) captured once when the module is loaded; it is
# reused as the prefix of every console message, so it does not advance while
# the script runs.
now_time = time.strftime('%Y-%m-%d %H-%M')
# Read the dorks file
def work(dorks):
    """Load the search dorks from a text file.

    Args:
        dorks: Path of a UTF-8 text file holding one dork (search query)
            per line.

    Returns:
        A ``(count, queries)`` tuple: ``queries`` is the list of lines with
        surrounding whitespace stripped, and ``count`` is its length (used by
        the caller as the total for the progress display).

    Raises:
        FileNotFoundError: If ``dorks`` does not exist (raised by ``open``).
    """
    with open(dorks, mode='r', encoding='utf-8') as file:
        stripped = (line.strip() for line in file)
        # Skip blank lines: they would otherwise be sent as empty searches
        # and inflate the total shown in the progress line.
        content = [line for line in stripped if line]
    return len(content), content
# Google search
def google_serach(query, locks, filename):
    """Run one Google search for ``query`` and append the result URLs to a file.

    The request goes to a randomly chosen Google country domain through the
    hard-coded local proxy. Result links are taken from ``div.yuRUbf``
    elements, links containing ``google.com.tw`` are dropped, and the rest are
    appended to ``<cwd>/<filename>.txt`` via ``write_info``.

    Args:
        query: The dork / search string. It is URL-encoded by ``requests``.
        locks: A lock (context manager) guarding the shared counters, the
            progress line and the output file.
        filename: Output file name without the ``.txt`` extension.

    Side effects:
        Updates the module-level ``url_num`` and ``dork_finish_num`` counters
        and reads ``dork_total_num`` and ``now_time``; these must exist at
        module level before the function is called.

    A failed request (timeout, proxy/connection error, ...) is skipped on a
    best-effort basis: nothing is written and the counters are not changed.
    """
    global url_num, dork_finish_num

    # verify=False below would otherwise emit an InsecureRequestWarning per call.
    urllib3.disable_warnings()
    filename = os.path.join(os.getcwd(), f'{filename}.txt')

    # NOTE(review): the list contains duplicates ('ca') and bare entries such as
    # 'uk', 'br', 'kr', 'mx', 'au', 'za', 'tw' that may not be real Google
    # hosts -- confirm before relying on them.
    domains = ['fr', 'it', 'ca', 'co.uk', 'ru', 'co.jp', 'co.kr', 'com.au',
               'co.in', 'com.br', 'com.ar', 'co.za', 'co.nz', 'es', 'se', 'nl',
               'ch', 'at', 'dk', 'be', 'pl', 'fi', 'ie', 'pt', 'gr', 'tw',
               'com', 'uk', 'de', 'br', 'ca', 'kr', 'mx', 'au', 'za']
    random_domain = random.choice(domains)
    search_url = f'https://www.google.{random_domain}/search'

    # Request headers
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'referer': 'https://www.google.com/',
        'origin': 'https://www.google.com',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
    }
    # Proxy
    proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}

    try:
        # Passing the query through ``params`` lets requests percent-encode
        # characters such as '&', '#', '+' and quotes that dorks often contain.
        response = requests.get(url=search_url, params={'q': query, 'num': 100},
                                headers=headers, proxies=proxies,
                                verify=False, timeout=5)
    except requests.RequestException:
        # requests raises its own Timeout/ConnectionError types, which are not
        # the built-in TimeoutError; skip this dork and carry on.
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    # Find every result container div
    find_div = soup.find_all('div', {'class': 'yuRUbf'})

    get_url = []
    for div in find_div:
        anchor = div.find_next('a')
        href = anchor.get('href') if anchor is not None else None
        if href and 'google.com.tw' not in href:
            get_url.append(href + '\n')

    # ``with`` releases the lock even if printing or writing fails, so one
    # failing worker cannot block all the others.
    with locks:
        url_num += len(get_url)
        dork_finish_num += 1
        print(Fore.GREEN + f'\r{now_time}[INFO]{ "-" * 10}>已获取Url数量:{url_num} Dorsk数量:{dork_finish_num} / {dork_total_num}', end=Fore.RESET)
        # Write to file
        write_info(filename, get_url)
# File-writing helper
def write_info(filename, get_url):
    """Append the collected URL lines to the output file.

    Args:
        filename: Path of the file to append to; it is created if missing.
        get_url: Iterable of strings to write. No line separators are added,
            so each entry is expected to carry its own trailing newline.
    """
    with open(filename, mode='a+', encoding='utf-8') as file:
        file.writelines(get_url)
# Script entry point: prompt for the dorks file and the output name, run every
# dork through google_serach on a thread pool, then wait for Enter.
if __name__ == '__main__':
    init()  # initialise the colorama colour module once
    while True:
        try:
            dorks_file = input(Fore.YELLOW + f'\n{now_time}[INFO]{ "-" * 10}>input file:' + Fore.RESET)
            print('')
            filename = input(Fore.YELLOW + f'\n{now_time}[INFO]{ "-" * 10}>output file:' + Fore.RESET)
            # Unpack the (count, queries) tuple returned by work()
            dork_total_num, query_list = work(dorks_file)
            # Module-level counters updated by google_serach: finished dorks / URLs found
            dork_finish_num = url_num = 0
            # Lock shared by all workers
            threads_lock = Lock()
            # The work is network-bound, so threads are sufficient. Routing the
            # tasks through a process pool cannot work here: the lock and the
            # executor method are not picklable, and the counters above would
            # not be shared between processes.
            with ThreadPoolExecutor(max_workers=20) as thread_pool:
                futures = [
                    thread_pool.submit(google_serach, dork, threads_lock, filename)
                    for dork in query_list
                ]
            # Leaving the ``with`` block waits for every task; report failures
            # instead of dropping them silently with the future objects.
            for future in futures:
                error = future.exception()
                if error is not None:
                    print(Fore.RED + f'\n{now_time}[Error]{"-" * 10}>{error!r}' + Fore.RESET)
            input(Fore.YELLOW + f'\n\n{now_time}[INFO]{"-" * 10}>程序运行完毕,按回车退出' + Fore.RESET)
            break
        # The input file does not exist: report it and prompt again
        except FileNotFoundError:
            print(Fore.RED + f'{now_time}[Error]{"-" * 10}>文件不存在' + Fore.RESET)
        # Interrupted by the user (Ctrl+C)
        except KeyboardInterrupt:
            sys.exit(1)
# Source: http://www.cveoy.top/t/topic/hw1S -- copyright belongs to the author. Do not repost or scrape.