# coding:utf-8
import requests
import time
import os
import urllib3
import sys
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
from colorama import Fore, init

now_time = time.strftime('%Y-%m-%d %H-%M')

Country-specific Google domains

GOOGLE_DOMAINS = ['google.com', 'google.co.uk', 'google.ca', 'google.fr', 'google.de', 'google.it', 'google.es', 'google.com.br', 'google.co.jp', 'google.co.kr', 'google.com.au', 'google.com.mx', 'google.ru', 'google.com.sg', 'google.co.th', 'google.com.sa', 'google.co.za', 'google.com.tr', 'google.com.ar', 'google.co.id', 'google.com.vn', 'google.com.ph', 'google.com.ua', 'google.co.il', 'google.com.pk', 'google.co.ke', 'google.com.ng', 'google.co.ve', 'google.co.nz']
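For orientation, the script later rotates through this pool using the request counter modulo the pool size; a minimal sketch of that round-robin selection (the helper name next_domain is hypothetical, not part of the script, and it assumes the GOOGLE_DOMAINS list above):

# Hypothetical helper illustrating the round-robin rotation used by the
# request URL below: url_num % len(GOOGLE_DOMAINS).
def next_domain(index):
    return GOOGLE_DOMAINS[index % len(GOOGLE_DOMAINS)]

# next_domain(0) -> 'google.com'; after all 29 domains, next_domain(29) wraps back to 'google.com'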

Reading the dorks file

def work(dorks):
    with open(dorks, mode='r', encoding='utf-8') as file:
        read_content = file.readlines()
    # Strip each line and collect the dorks into a list
    content = [result.strip() for result in read_content]
    # Return the dork count for the task pool together with the list
    return len(read_content), content
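A quick usage sketch, assuming a file named dorks.txt with one dork per line (the file name is illustrative):

# Hypothetical usage: dorks.txt holds one dork per line
dork_total_num, query_list = work('dorks.txt')
print(f'Loaded {dork_total_num} dorks; first: {query_list[0]}')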

Google search

def google_search(query, locks):
    global url_num, dork_finish_num, current_dork_url_num
    try:
        # Silence the HTTPS certificate warnings caused by verify=False
        urllib3.disable_warnings()
        filename = os.path.join(os.getcwd(), 'google.txt')
        url = f'https://{GOOGLE_DOMAINS[url_num % len(GOOGLE_DOMAINS)]}/search?q={query}&num=100'
        # Request headers
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
            'accept-language': 'zh-CN,zh;q=0.9',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'referer': f'https://{GOOGLE_DOMAINS[url_num % len(GOOGLE_DOMAINS)]}/',
            'origin': f'https://{GOOGLE_DOMAINS[url_num % len(GOOGLE_DOMAINS)]}',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document'
        }
        # Route the request through a local proxy
        proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}
        response = requests.get(url=url, headers=headers, proxies=proxies, verify=False, timeout=5)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Each organic result link sits inside a div with class "yuRUbf"
        find_div = soup.find_all('div', {'class': 'yuRUbf'})
        # Acquire the lock before touching the shared counters and the output file
        locks.acquire()
        # Collect the result URLs
        get_url = [div.find_next('a')['href'] + '\n' for div in find_div]
        url_num += 1
        dork_finish_num += 1
        current_dork_url_num += len(get_url)
        print(Fore.GREEN + f' {now_time}[INFO]{"-" * 10}>URLs collected: {url_num}  Dorks done: {dork_finish_num} / {dork_total_num}' + Fore.RESET, end='')
        # Write the URLs to file
        write_info(filename, get_url)
        # Release the lock
        locks.release()
    except requests.exceptions.Timeout:
        pass
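One caveat worth noting: when Google answers with a consent page or CAPTCHA, no yuRUbf divs are present and the dork silently contributes zero URLs, which is exactly what triggers the premature jump discussed below. A hedged heuristic for spotting that case (the function and the 429 check are assumptions, not part of the original script):

def looks_blocked(response, results):
    # Assumption: an HTTP 429 or a results page with zero parsed links
    # usually means Google served a CAPTCHA or consent page instead of results.
    return response.status_code == 429 or len(results) == 0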

The write-to-file function

def write_info(filename, get_url):
    with open(filename, mode='a+', encoding='utf-8') as file:
        file.writelines(get_url)
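Because the file is opened in 'a+' append mode, repeated calls accumulate results rather than overwrite; for example:

# Appends two URLs to google.txt in the current working directory
write_info(os.path.join(os.getcwd(), 'google.txt'),
           ['http://example.com/a\n', 'http://example.com/b\n'])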

if __name__ == '__main__':
    while True:
        try:
            init()  # initialise colorama
            dorks_file = input(Fore.YELLOW + f' {now_time}[INFO]{"-" * 10}>Enter the dorks file path: ' + Fore.RESET)
            print('')
            # Unpack the tuple returned by work()
            dork_total_num, query_list = work(dorks_file)
            # Global counters: finished dorks / collected URLs
            dork_finish_num = url_num = 0
            current_dork_url_num = 0
            # Size of the thread pool
            threads = ThreadPoolExecutor(max_workers=5)

            # Global lock
            threads_lock = Lock()

            # Hand the tasks to the thread pool
            for dork in query_list:
                current_dork_url_num = 0
                threads.submit(google_search, dork, threads_lock)
                # Rotate to another country-specific Google domain every 10 dorks
                if dork_finish_num % 10 == 0:
                    time.sleep(1)
                    # Only jump to the next dork once the current one has yielded enough URLs
                    if current_dork_url_num >= 50:
                        current_dork_url_num = 0
                        url_num += 1

            threads.shutdown()
            if len(sys.argv) == 1:
                pass
            input(Fore.YELLOW + f'\n\n{now_time}[INFO]{"-" * 10}>Finished. Press Enter to exit.' + Fore.RESET)
            break
        # The dorks file does not exist
        except FileNotFoundError:
            print(Fore.RED + f'{now_time}[Error]{"-" * 10}>File not found' + Fore.RESET)
        # User interrupt
        except KeyboardInterrupt:
            sys.exit(1)
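As written, the rotation check in the submit loop runs in the main thread before the workers have had a chance to update current_dork_url_num, so the script can move on to the next dork while too few URLs have been collected. One possible fix, sketched here under stated assumptions (the MIN_URLS threshold and the synchronous future.result() wait are illustrative choices, not the author's code), is to re-query the same dork on successive domains until the threshold is met:

# Sketch: retry the same dork until it has yielded at least MIN_URLS results,
# then move on. Waiting on each future makes the counters reliable, at the
# cost of running requests one at a time.
MIN_URLS = 50  # assumed threshold
for dork in query_list:
    current_dork_url_num = 0
    for attempt in range(len(GOOGLE_DOMAINS)):
        future = threads.submit(google_search, dork, threads_lock)
        future.result()              # block so the counters below are current
        if current_dork_url_num >= MIN_URLS:
            break                    # enough URLs for this dork, move on
        # google_search already advanced url_num, so the retry
        # automatically lands on the next country-specific domain
        time.sleep(1)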

How to fix the jump to the next dork when a Google search yields too few URLs

Original post: https://www.cveoy.top/t/topic/obTc. Copyright belongs to the author. Please do not repost or scrape!
