使用多进程和异步编程优化代码效率:网络请求加速方案

本文介绍如何使用多进程和异步编程(基于 concurrent.futures 的执行器池并发发送请求)来优化代码逻辑和算法,以提高网络请求效率。网络请求属于 I/O 密集型任务,并发执行可以让多个请求的等待时间相互重叠,因此改进后的代码能够显著提高程序运行效率。

代码改进

改进版代码使用了多进程和异步编程来处理网络请求,大大提高了程序运行效率,同时对代码逻辑和算法进行了优化。

import requests
import time
import os
import urllib3
import sys
import random
from bs4 import BeautifulSoup
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from threading import Lock
from colorama import Fore, init

# Timestamp captured once when the module is loaded; every log/prompt line
# below reuses this value, so it shows the start time, not the print time.
now_time = time.strftime('%Y-%m-%d %H-%M')


# Read the dorks file
def work(dorks):
    """Load search dorks from a UTF-8 text file, one dork per line.

    Surrounding whitespace is stripped from every line, and blank lines are
    dropped so that no empty query is scheduled and the reported total
    matches the number of queries that will actually be run.

    Args:
        dorks: Path of the dorks file.

    Returns:
        A ``(count, queries)`` tuple where ``queries`` is the list of
        non-empty dorks and ``count`` is ``len(queries)``.

    Raises:
        FileNotFoundError: Propagated from ``open`` when the path does not
            exist.
    """
    with open(dorks, mode='r', encoding='utf-8') as file:
        stripped_lines = [line.strip() for line in file]
    # Skip blank lines: an empty dork would only waste a request.
    content = [line for line in stripped_lines if line]
    # The count is handed to the caller for progress reporting.
    return len(content), content


# Google search
def google_serach(query, locks, filename):
    """Run one dork query against Google and append the result URLs to a file.

    Args:
        query: The dork / search expression to submit.
        locks: A lock usable as a context manager (e.g. ``threading.Lock``);
            it guards the shared counters, the progress line and the output
            file.
        filename: Output file name without extension. ``.txt`` is appended
            and the file is placed in the current working directory.

    Side effects:
        Increments the module globals ``url_num`` and ``dork_finish_num``,
        reads ``dork_total_num`` and ``now_time``, prints a progress line and
        appends the collected URLs through ``write_info``.

    A failed request (timeout, connection or proxy error, ...) is skipped
    silently: nothing is written and the counters are left untouched.
    """
    global url_num, dork_finish_num

    # Silence the warning that verify=False would otherwise trigger.
    urllib3.disable_warnings()
    filename = os.path.join(os.getcwd(), f'{filename}.txt')
    # NOTE(review): 'co,jp' was a typo for 'co.jp'. Some bare entries such as
    # 'uk', 'br', 'kr', 'mx', 'au', 'za' and 'tw' may not be real Google
    # hosts -- confirm against Google's supported-domains list.
    domains = ['fr','it','ca','co.uk','ru','co.jp','co.kr','com.au','co.in','com.br','com.ar','co.za','co.nz','es','se','nl','ch','at','dk','be','pl','fi','ie','pt','gr', 'tw', 'com', 'uk', 'de', 'br', 'ca', 'kr', 'mx', 'au', 'za']
    random_domain = random.choice(domains)
    url = f'https://www.google.{random_domain}/search'
    # Request headers
    headers = {
               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
               'accept-language': 'zh-CN,zh;q=0.9',
               'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
               'referer': 'https://www.google.com/',
               'origin': 'https://www.google.com',
               'Sec-Fetch-Site': 'same-origin',
               'Sec-Fetch-Mode': 'navigate',
               'Sec-Fetch-User': '?1',
               'Sec-Fetch-Dest': 'document'
    }
    # Local proxy
    proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}

    try:
        # Pass the query through ``params`` so characters such as '&', '#',
        # '+' or spaces inside a dork are URL-encoded instead of corrupting
        # the query string.
        response = requests.get(url=url, params={'q': query, 'num': 100},
                                headers=headers, proxies=proxies,
                                verify=False, timeout=5)
    except requests.exceptions.RequestException:
        # requests raises its own exception types (not the builtin
        # TimeoutError); skipping the dork is the intended best-effort policy.
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    # Each organic search result is wrapped in a div with this class.
    find_div = soup.find_all('div', {'class': 'yuRUbf'})

    # Collect the result links, one per line, tolerating result blocks that
    # have no anchor or no href.
    get_url = []
    for result_div in find_div:
        anchor = result_div.find_next('a')
        href = anchor.get('href') if anchor is not None else None
        if href and 'google.com.tw' not in href:
            get_url.append(href + '\n')

    dashes = '-' * 10
    # ``with`` guarantees the lock is released even if printing or writing
    # fails, so one bad result cannot deadlock the other workers.
    with locks:
        url_num += len(get_url)
        dork_finish_num += 1
        # Leading carriage return redraws the progress line in place.
        print(Fore.GREEN + f'\r{now_time}[INFO]{dashes}>已获取Url数量:{url_num}  Dorsk数量:{dork_finish_num} / {dork_total_num}', end=Fore.RESET)
        # Append the links to the output file
        write_info(filename, get_url)


# Append collected links to the output file
def write_info(filename, get_url):
    """Append the strings in *get_url* to *filename* (UTF-8).

    The file is created when missing; the strings are written back to back,
    so each one is expected to carry its own line terminator.
    """
    payload = ''.join(get_url)
    with open(filename, mode='a+', encoding='utf-8') as sink:
        sink.write(payload)


if __name__ == '__main__':
    # Interactive entry point: ask for the dorks file and the output name,
    # run every dork concurrently, then wait for Enter before exiting.
    init()  # initialise colorama once, not on every retry
    dashes = '-' * 10
    while True:
        try:
            dorks_file = input(Fore.YELLOW + f'\n{now_time}[INFO]{dashes}>input file:' + Fore.RESET)
            print('')
            filename = input(Fore.YELLOW + f'\n{now_time}[INFO]{dashes}>output file:' + Fore.RESET)
            # Unpack the (count, queries) tuple returned by work()
            dork_total_num, query_list = work(dorks_file)
            # Module-level progress counters updated by google_serach()
            dork_finish_num = url_num = 0

            # Lock shared by all workers (counters, console, output file)
            threads_lock = Lock()

            # The work is I/O-bound, so a thread pool is sufficient. The
            # previous process-pool wrapper had to pickle a ThreadPoolExecutor
            # method and a threading.Lock, which fails, and the failure was
            # hidden inside an unread future -- no search was ever executed.
            # Threads also share the global counters and the lock, which
            # separate processes would not.
            with ThreadPoolExecutor(max_workers=20) as thread_pool:
                futures = [
                    thread_pool.submit(google_serach, dork, threads_lock, filename)
                    for dork in query_list
                ]

            # Leaving the ``with`` block waited for every task; surface
            # unexpected worker errors instead of dropping them silently.
            for future in futures:
                error = future.exception()
                if error is not None:
                    print(Fore.RED + f'\n{now_time}[Error]{dashes}>{error!r}' + Fore.RESET)

            input(Fore.YELLOW + f'\n\n{now_time}[INFO]{dashes}>程序运行完毕,按回车退出' + Fore.RESET)
            break
        # The dorks file does not exist: report it and ask again
        except FileNotFoundError:
            print(Fore.RED + f'{now_time}[Error]{dashes}>文件不存在' + Fore.RESET)
        # Ctrl-C: exit with a non-zero status
        except KeyboardInterrupt:
            sys.exit(1)

总结

通过使用多进程和异步编程,可以显著提高网络请求的效率,加速程序运行速度。该改进方案适用于需要进行大量网络请求的场景,例如爬虫、数据抓取等。

使用多进程和异步编程优化代码效率:网络请求加速方案

原文地址: https://www.cveoy.top/t/topic/oZPc 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录