import os
import random
import re
import time
import tkinter as tk
import webbrowser

import requests
from bs4 import BeautifulSoup

def get_random_user_agent():
    """Return one desktop Chrome User-Agent string chosen at random.

    Returns:
        One of the three hard-coded User-Agent strings below, picked with
        ``random.choice``.
    """
    user_agents = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    )
    return random.choice(user_agents)

def crawl_baidu(keyword, page_limit, timeout=10):
    """Fetch Baidu result pages for *keyword* and collect (title, url) pairs.

    Args:
        keyword: Search term, sent as the ``wd`` query parameter.
        page_limit: Number of result pages to request, starting at the
            first page. Values below 1 request nothing.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``(title, url)`` tuples in the order they were found.
        A page whose request fails is reported with ``print`` and skipped,
        so the list may be partial or empty.
    """
    if page_limit < 1:
        return []

    # One User-Agent is picked per crawl and reused for every page.
    headers = {'User-Agent': get_random_user_agent()}

    results = []
    for page in range(1, page_limit + 1):
        # Random delay before every request.
        time.sleep(random.uniform(0.5, 1.0))

        try:
            response = requests.get(
                'https://www.baidu.com/s',
                # Let requests URL-encode the keyword so characters such as
                # '&', '#' or '=' cannot corrupt the query string.
                params={'wd': keyword, 'pn': (page - 1) * 10},
                headers=headers,
                # Without a timeout a stalled connection blocks forever.
                timeout=timeout,
            )
            response.raise_for_status()  # Treat HTTP error statuses as failures.
        except requests.exceptions.RequestException as e:
            print(f'请求异常: {e}')
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        for result in soup.find_all('div', class_='result'):
            heading = result.find('h3')
            link = result.find('a', href=True)
            # Skip result containers that lack a title or a usable link
            # instead of raising on a missing element.
            if heading is None or link is None:
                continue
            results.append((heading.get_text(), link['href']))

    return results

def open_url(url):
    """Hand *url* to ``webbrowser.open`` so it is shown in a web browser."""
    webbrowser.open(url)

def crawl_and_index():
    """Crawl Baidu for each entered keyword and save the results to disk.

    Reads the keyword list from ``entry_keywords`` and the page count from
    ``entry_pages``. For every keyword that yields results, writes
    ``webpages/<keyword>.html`` containing one title line followed by one
    URL line per result, terminated by a blank line. Invalid input and
    keywords without results are reported with ``print``.
    """
    # Split on commas and whitespace; \uff0c is the full-width comma.
    # Empty pieces (blank input, leading/trailing separators) are dropped so
    # they are neither crawled nor saved as 'webpages/.html'.
    keywords = [
        word
        for word in re.split(r'[,\uff0c\s]+', entry_keywords.get())
        if word
    ]

    raw_pages = entry_pages.get()
    try:
        page_limit = int(raw_pages)
    except ValueError:
        # Report bad input instead of raising inside the button callback.
        print(f'爬取页数无效: {raw_pages!r}')
        return

    # Folder that holds the saved result files.
    os.makedirs('webpages', exist_ok=True)

    for keyword in keywords:
        search_results = crawl_baidu(keyword, page_limit)
        if not search_results:
            print(f'关键词 \'{keyword}\' 没有搜索结果')
            continue

        # Replace path separators and other characters that are invalid in
        # file names so the keyword cannot escape the webpages folder.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', keyword)
        file_name = f'webpages/{safe_name}.html'
        with open(file_name, 'w', encoding='utf-8') as file:
            for title, url in search_results:
                # Records are stored as (title, url) line pairs, so each
                # field must stay on a single line.
                file.write(f"{' '.join(title.split())}\n")
                file.write(f"{''.join(url.split())}\n")
            file.write('\n')

def search_local():
    """Search the saved result files and list matching entries in the UI.

    Reads the search text from ``entry_search``, clears ``result_text``, and
    scans every ``.html`` file under ``webpages/``. Files are read as
    consecutive (title, url) line pairs; a pair matches when the search text
    occurs, case-insensitively, in the title or the URL. Matches are grouped
    per file and each URL is inserted as a clickable link that calls
    ``open_url``. A placeholder message is shown when nothing matches.
    """
    keyword = entry_search.get().lower()
    result_text.delete('1.0', tk.END)

    # The folder does not exist until something has been crawled.
    file_names = sorted(os.listdir('webpages')) if os.path.isdir('webpages') else []

    # Numbered across ALL files: a per-file counter would reuse tag names
    # and each later tag_bind would overwrite an earlier link's click target.
    link_count = 0
    for file_name in file_names:
        path = f'webpages/{file_name}'
        # Only regular .html files are result files; this also keeps the
        # [:-5] suffix strip below correct.
        if not file_name.endswith('.html') or not os.path.isfile(path):
            continue

        with open(path, 'r', encoding='utf-8') as file:
            lines = [line.strip() for line in file]

        # Pair even-indexed lines (titles) with odd-indexed lines (URLs);
        # zip drops a trailing unpaired line such as the final blank one.
        matches = [
            (title, url)
            for title, url in zip(lines[0::2], lines[1::2])
            if keyword in title.lower() or keyword in url.lower()
        ]
        if not matches:
            continue

        result_text.insert(tk.END, f'搜索结果 - {file_name[:-5]}:\n\n', 'title')
        for title, url in matches:
            link_count += 1
            tag = f'link{link_count}'
            result_text.insert(tk.END, f'{title}\n', 'found_title')
            result_text.insert(tk.END, f'{url}\n', tag)
            result_text.tag_configure(tag, foreground='blue', underline=True)
            # Bind url as a default argument so each link keeps its own URL.
            result_text.tag_bind(tag, '<Button-1>', lambda event, url=url: open_url(url))
        result_text.insert(tk.END, '\n')

    if link_count == 0:
        result_text.insert(tk.END, '没有搜索结果\n')

# Build the UI: keyword and page-count inputs, a crawl button, a local
# search input with its button, and a scrollable text area for results.

window = tk.Tk()
window.title('百度搜索')
window.geometry('800x600')

label_keywords = tk.Label(window, text='请输入关键词(用逗号或空格隔开):')
label_keywords.pack()

entry_keywords = tk.Entry(window)
entry_keywords.pack()

label_pages = tk.Label(window, text='请输入爬取页数:')
label_pages.pack()

entry_pages = tk.Entry(window)
entry_pages.pack()

crawl_button = tk.Button(window, text='爬取并索引', command=crawl_and_index)
crawl_button.pack()

label_search = tk.Label(window, text='请输入搜索关键词:')
label_search.pack()

entry_search = tk.Entry(window)
entry_search.pack()

search_button = tk.Button(window, text='搜索', command=search_local)
search_button.pack()

scrollbar = tk.Scrollbar(window)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

result_text = tk.Text(window, yscrollcommand=scrollbar.set)
result_text.pack(fill=tk.BOTH)

# Link the scrollbar back to the text widget so dragging it scrolls the text.
scrollbar.config(command=result_text.yview)

window.mainloop()

# Python打造本地百度搜索工具
#
# 原文地址: https://www.cveoy.top/t/topic/SO8 著作权归作者所有。请勿转载和采集!
#
# 免费AI点我,无需注册和登录