Optimize the code below:

1. Define constants such as the request headers and proxy as globals, so they are not rebuilt on every request.
2. In the google_search function, use requests.Session to make requests more efficient by avoiding a new connection for every request (a minimal sketch of this pattern follows below).
3. Encapsulate the global variables in a class, which makes them easier to manage and maintain.
4. Use the logging module for log output instead of print statements.
5. Use the argparse module to parse command-line arguments, making the program more flexible and easier to use.
6. Split the program into …
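As a minimal sketch of tips 1 and 2 only (note that the refactored code below switches to aiohttp rather than requests.Session): the constants are defined once at module level, and a single Session reuses its connection pool across calls. The header value and proxy address here are placeholders, not taken from the original code.

import requests

HEADERS = {'user-agent': 'Mozilla/5.0 (compatible; example)'}  # placeholder UA, defined once
PROXIES = None  # e.g. {'http': 'http://127.0.0.1:8080', 'https': 'http://127.0.0.1:8080'}

session = requests.Session()      # one shared connection pool for all requests
session.headers.update(HEADERS)   # headers set once instead of per call

def google_search(query):
    # keep-alive on the shared session reuses TCP connections between calls
    resp = session.get('https://www.google.com/search',
                       params={'q': query, 'num': 100},
                       proxies=PROXIES, timeout=5)
    resp.raise_for_status()
    return resp.text

The refactored, class-based asyncio version follows: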
#coding:utf-8
import argparse
import asyncio
import logging
import time
from threading import Lock

import aiohttp  # used by the async requests below; missing from the original imports
import urllib3
from bs4 import BeautifulSoup
from colorama import init
class GoogleSearch:
    def __init__(self, dorks_file, num_threads=5, output_file=None, proxy=None):
        self.dorks_file = dorks_file
        # kept for interface compatibility; the asyncio version does not use a thread pool
        self.num_threads = num_threads
        self.output_file = output_file
        self.proxy = proxy
        self.now_time = time.strftime('%Y-%m-%d %H-%M')
        self.dork_total_num, self.query_list = self.read_dorks()
        self.dork_finish_num = 0
        self.url_num = 0
        self.threads_lock = Lock()
        self.logger = self.setup_logger()
    def read_dorks(self):
        with open(self.dorks_file, mode='r', encoding='utf-8') as file:
            read_content = file.readlines()
        # strip whitespace and collect the dorks into a list
        content = [result.strip() for result in read_content]
        # return the dork count for progress tracking, along with the query list
        return len(read_content), content
    async def google_search(self, query):
        try:
            # silence the InsecureRequestWarning raised for unverified HTTPS
            urllib3.disable_warnings()
            url = f'https://www.google.com/search?q={query}&num=100'
            # browser-like request headers to avoid trivial blocking
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
                'accept-language': 'zh-CN,zh;q=0.9',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'referer': 'https://www.google.com/',
                'origin': 'https://www.google.com',
                'Sec-Fetch-Site': 'same-origin',
                'Sec-Fetch-Mode': 'navigate',
                'Sec-Fetch-User': '?1',
                'Sec-Fetch-Dest': 'document'
            }
            # aiohttp takes the proxy per request, so no requests-style proxies dict is needed
            async with aiohttp.ClientSession(headers=headers) as session:
                async with session.get(url, proxy=self.proxy, ssl=False,
                                       timeout=aiohttp.ClientTimeout(total=5)) as response:
                    soup = BeautifulSoup(await response.text(), 'html.parser')
                    find_div = soup.find_all('div', {'class': 'yuRUbf'})
                    urls = [div.find_next('a')['href'] + '\n' for div in find_div]
                    # the lock is harmless here; with pure asyncio in one thread it is not strictly required
                    with self.threads_lock:
                        self.url_num += len(urls)
                        self.dork_finish_num += 1
                        self.logger.info(f'URLs fetched: {self.url_num}  dorks finished: {self.dork_finish_num}/{self.dork_total_num}')
                        if self.output_file:
                            self.write_info(self.output_file, urls)
        except Exception as e:
            self.logger.error(f'Failed to get results for {query}: {str(e)}')
    def write_info(self, filename, urls):
        with open(filename, mode='a+', encoding='utf-8') as file:
            file.writelines(urls)
    def setup_logger(self):
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
        # only attach a handler once, otherwise repeated instantiation duplicates log lines
        if not logger.handlers:
            ch = logging.StreamHandler()
            ch.setFormatter(formatter)
            logger.addHandler(ch)
        return logger
    def run(self):
        # asyncio.wait no longer accepts bare coroutines (Python 3.11+),
        # so gather the tasks inside asyncio.run, which manages the event loop itself
        async def main():
            await asyncio.gather(*(self.google_search(query) for query in self.query_list))
        asyncio.run(main())
if __name__ == '__main__':
    init()  # initialise the colorama colour module
    parser = argparse.ArgumentParser(description='Google Search Scraper')
    parser.add_argument('dorks_file', help='File containing dorks to search')
    parser.add_argument('-t', '--num_threads', type=int, default=5, help='Number of threads to use')
    parser.add_argument('-o', '--output_file', help='Output file to write results to')
    parser.add_argument('-p', '--proxy', help='HTTP proxy to use')
    args = parser.parse_args()
    scraper = GoogleSearch(args.dorks_file, args.num_threads, args.output_file, args.proxy)
    scraper.run()
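A possible invocation, assuming the script is saved as google_search.py and the dorks sit one per line in dorks.txt (both file names and the proxy address are illustrative):

    python google_search.py dorks.txt -t 10 -o results.txt -p http://127.0.0.1:8080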