为了优化代码的运行速度,可以考虑以下几点:

  1. 使用多线程或多进程进行并发处理。可以使用ThreadPoolExecutor来实现多线程并发执行,设置合适的max_workers参数来控制线程数量,从而加快处理速度。

  2. 减少网络请求的延迟。可以使用连接池来复用已经建立的连接,减少连接建立的时间。另外,可以合理设置请求头信息,包括 user-agent、accept-language 等,以及使用合适的代理来提高请求效率。

  3. 使用更高效的解析库。可以考虑使用lxml库来替代BeautifulSoup,因为lxml库的解析速度更快。

  4. 减少文件读写次数。可以将多次写入文件的操作合并为一次,减少文件操作的次数,提高效率。

  5. 使用锁机制进行资源控制。在多线程并发执行时,可能会出现资源竞争的情况,可以使用锁机制来保证资源的正确访问,避免数据错误。

根据以上几点优化,可以修改代码如下:

import os
import random
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

import requests
import urllib3
from colorama import Fore, init
from lxml import etree

# Timestamp captured once at startup; embedded in every log/progress line below.
now_time = time.strftime('%Y-%m-%d %H-%M')

def read_dorks(dorks):
    """Load search dorks from a UTF-8 text file.

    Each line is whitespace-stripped; blank lines are kept as empty strings,
    preserving the original line order.

    Returns a tuple ``(count, entries)`` where ``count == len(entries)``.
    """
    entries = []
    with open(dorks, mode='r', encoding='utf-8') as fh:
        for raw_line in fh:
            entries.append(raw_line.strip())
    return len(entries), entries

def google_search(query, filename):
    """Run one Google dork query and append harvested URLs to *filename*.

    Parameters
    ----------
    query : str
        The search query (dork) to submit.
    filename : str
        Path of the output file result URLs are appended to.

    Returns
    -------
    int
        Number of URLs harvested for this query; 0 on any error
        (errors are logged, never raised).
    """
    try:
        urllib3.disable_warnings()
        # BUG FIX: the passed-in `filename` was previously overwritten with a
        # hard-coded path, silently ignoring the user's chosen output file.
        # Rotate across national Google domains to spread the requests.
        domains = ['fr','it','ca','co.uk','ru','co.jp','co.kr','com.au','co.in','com.br','com.ar','bg','com.na','co.za','co','co.th','nl','co.ug','co.nz','es','se','nl','ch','at','dk','be','pl','fi','pt','gr', 'com.tw', 'com', 'co.uk', 'de', 'ca', 'co.kr', 'com.mx', 'co.za']
        random_domain = random.choice(domains)
        url = f'https://www.google.{random_domain}/search?q={query}&num=100'
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
            'accept-language': 'zh-CN,zh;q=0.9',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'referer': 'https://www.google.com/',
            'origin': 'https://www.google.com',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document'
        }
        # Local forward proxy (e.g. Clash on 7890); assumes it is running — TODO confirm.
        proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}
        response = requests.get(url=url, headers=headers, proxies=proxies, verify=False, timeout=10)
        tree = etree.HTML(response.content)
        # BUG FIX: lxml elements have no BeautifulSoup `findNext` method — the
        # old code raised AttributeError on every call (swallowed by the broad
        # except, so the function always returned 0). Use XPath to pull the
        # first <a href> inside each result container instead.
        urls = []
        for div in tree.xpath('//div[@class="yuRUbf"]'):
            hrefs = div.xpath('.//a/@href')
            if hrefs and 'google.com' not in hrefs[0]:
                urls.append(hrefs[0] + '\n')
        # Serialize file appends across worker threads.
        with lock:
            write_urls(filename, urls)
        return len(urls)
    except Exception as e:
        print(Fore.RED + f'{now_time}[Error]{ "-" * 10}>{e}' + Fore.RESET)
        return 0

def write_urls(filename, urls):
    """Append every string in *urls* to *filename* (UTF-8; file is created if absent)."""
    joined = ''.join(urls)
    with open(filename, mode='a+', encoding='utf-8') as out:
        out.write(joined)

if __name__ == '__main__':
    # Shared by all worker threads to serialize output-file appends.
    lock = Lock()
    # Initialise colorama once, not on every retry iteration.
    init()
    while True:
        try:
            dorks_file = input(Fore.YELLOW + f'\n{now_time}[INFO]{ "-" * 10}>input file:' + Fore.RESET)
            filename = input(Fore.YELLOW + f'\n{now_time}[INFO]{ "-" * 10}>output file:' + Fore.RESET)
            dork_total_num, query_list = read_dorks(dorks_file)
            dork_finish_num = url_num = 0
            # BUG FIX: the old loop called future.result() immediately after
            # each submit(), which blocked on every task in turn and serialised
            # the pool. Submit all queries first, then consume results as they
            # complete so up to 50 requests really run concurrently.
            with ThreadPoolExecutor(max_workers=50) as executor:
                futures = [executor.submit(google_search, query, filename)
                           for query in query_list]
                for future in as_completed(futures):
                    url_num += future.result()
                    dork_finish_num += 1
                    print(Fore.GREEN + f'\r{now_time}[INFO]{ "-" * 10}>get Urlnumber:{url_num}  Dorsk number:{dork_finish_num} / {dork_total_num}', end='' + Fore.RESET)
            input(Fore.YELLOW + f'\n\n{now_time}[INFO]{"-" * 10}>final huiche' + Fore.RESET)
            break
        except FileNotFoundError:
            print(Fore.RED + f'{now_time}[Error]{"-" * 10}>file not found' + Fore.RESET)
        except KeyboardInterrupt:
            sys.exit(1)

以上是对代码的一些优化措施,具体的优化效果还需要根据实际情况进行测试和调整。

优化代码的运行速度所需的导入语句如下:import os、import sys、import random、import time、import urllib3、from bs4 import BeautifulSoup、from lxml import etree、from concurrent.futures import ThreadPoolExecutor、from threading import Lock、import re

原文地址: https://www.cveoy.top/t/topic/hHR4 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录