Building a Local Web Search Engine in Python: Automatically Scraping Baidu Search Results

import requests
from bs4 import BeautifulSoup
import time
import tkinter as tk
import webbrowser
import random
import os
import re
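# Third-party dependencies: requests and beautifulsoup4
# ("pip install requests beautifulsoup4"); everything else is standard library.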


def get_random_user_agent():
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    ]
    return random.choice(user_agents)


def crawl_baidu(keyword, page_limit):
    headers = {
        'User-Agent': get_random_user_agent()
    }

    results = []
    for page in range(1, page_limit + 1):
        # Baidu paginates with pn = (page - 1) * 10; passing params lets
        # requests handle URL-encoding of the keyword
        params = {'wd': keyword, 'pn': (page - 1) * 10}

        # Random delay between requests to reduce the chance of being blocked
        delay = random.uniform(0.5, 1.0)
        time.sleep(delay)

        response = requests.get('https://www.baidu.com/s', params=params,
                                headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for result in soup.find_all('div', class_='result'):
            title_tag = result.find('h3')
            link_tag = result.find('a')
            # Skip malformed result blocks that lack a title or a link
            if title_tag is None or link_tag is None or not link_tag.has_attr('href'):
                continue
            results.append((title_tag.get_text(strip=True), link_tag['href']))

    return results


def open_url(url):
    webbrowser.open(url)


def crawl_and_index_manual():
    # Split the input on commas (ASCII or full-width) or whitespace, dropping empty entries
    keywords = [k for k in re.split(r'[,,\s]+', entry_keywords.get()) if k]
    page_limit = int(entry_pages.get())  # Number of result pages to crawl per keyword

    # Create the folder used to store the saved result files
    os.makedirs('webpages', exist_ok=True)

    # Crawl each keyword and save its results to a file
    for keyword in keywords:
        search_results = crawl_baidu(keyword, page_limit)
        if len(search_results) > 0:
            file_name = f'webpages/{keyword}.html'
            with open(file_name, 'w', encoding='utf-8') as file:
                for title, url in search_results:
                    file.write(f'{title}\n')
                    file.write(f'{url}\n')
                file.write('\n')
        else:
            print(f"No search results for keyword '{keyword}'")


def crawl_and_index_auto():
    # Split the input on commas (ASCII or full-width) or whitespace, dropping empty entries
    keywords = [k for k in re.split(r'[,,\s]+', entry_auto_keywords.get()) if k]
    page_limit = int(entry_auto_pages.get())  # Number of result pages to crawl per keyword
    result_limit = int(entry_auto_results.get())  # Upper bound on the number of keywords crawled

    # Create the folder used to store the saved result files
    os.makedirs('webpages', exist_ok=True)

    # Crawl seed keywords first, then keywords discovered from result titles
    crawled_keywords = set()
    queue = keywords.copy()
    while queue and len(crawled_keywords) < result_limit:
        keyword = queue.pop(0)
        if keyword in crawled_keywords:
            continue

        search_results = crawl_baidu(keyword, page_limit)
        if len(search_results) > 0:
            file_name = f'webpages/{keyword}.html'
            with open(file_name, 'w', encoding='utf-8') as file:
                for title, url in search_results:
                    file.write(f'{title}\n')
                    file.write(f'{url}\n')
                    # Queue the full result title as a new candidate keyword
                    queue.append(title)
                file.write('\n')
                crawled_keywords.add(keyword)
        else:
            print(f"No search results for keyword '{keyword}'")

    print(f'Auto-crawl finished: {len(crawled_keywords)} keywords crawled')

def clear_widgets():
    for widget in window.winfo_children():
        widget.destroy()

def load_manual_crawl_page():
    clear_widgets()

    global entry_keywords, entry_pages
    label_keywords = tk.Label(window, text='Enter keywords (separated by commas or spaces):')
    label_keywords.pack()

    entry_keywords = tk.Entry(window)
    entry_keywords.pack()

    label_pages = tk.Label(window, text='Number of pages to crawl:')
    label_pages.pack()

    entry_pages = tk.Entry(window)
    entry_pages.pack()

    crawl_button = tk.Button(window, text='Manual Crawl', command=crawl_and_index_manual)
    crawl_button.pack()

    global entry_search, result_text, scrollbar
    label_search = tk.Label(window, text='Enter a search keyword:')
    label_search.pack()

    entry_search = tk.Entry(window)
    entry_search.pack()

    search_button = tk.Button(window, text='Search', command=search_local)
    search_button.pack()

    scrollbar = tk.Scrollbar(window)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

    result_text = tk.Text(window, yscrollcommand=scrollbar.set)
    result_text.pack(fill=tk.BOTH)

    scrollbar.config(command=result_text.yview)

def load_auto_crawl_page():
    clear_widgets()

    global entry_auto_keywords, entry_auto_pages, entry_auto_results
    label_auto_keywords = tk.Label(window, text='Enter seed keywords (separated by commas or spaces):')
    label_auto_keywords.pack()

    entry_auto_keywords = tk.Entry(window)
    entry_auto_keywords.pack()

    label_auto_pages = tk.Label(window, text='Number of pages to crawl:')
    label_auto_pages.pack()

    entry_auto_pages = tk.Entry(window)
    entry_auto_pages.pack()

    label_auto_results = tk.Label(window, text='Maximum number of keywords to crawl:')
    label_auto_results.pack()

    entry_auto_results = tk.Entry(window)
    entry_auto_results.pack()

    crawl_button = tk.Button(window, text='Auto Crawl', command=crawl_and_index_auto)
    crawl_button.pack()

    global entry_search, result_text, scrollbar
    label_search = tk.Label(window, text='Enter a search keyword:')
    label_search.pack()

    entry_search = tk.Entry(window)
    entry_search.pack()

    search_button = tk.Button(window, text='Search', command=search_local)
    search_button.pack()

    scrollbar = tk.Scrollbar(window)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

    result_text = tk.Text(window, yscrollcommand=scrollbar.set)
    result_text.pack(fill=tk.BOTH)

    scrollbar.config(command=result_text.yview)

def search_local():
    keyword = entry_search.get()
    result_text.delete('1.0', tk.END)

    if not os.path.isdir('webpages'):
        result_text.insert(tk.END, 'No search results\n')
        return

    # Scan every saved file for title/url pairs that match the keyword
    link_count = 0
    for file_name in os.listdir('webpages'):
        with open(f'webpages/{file_name}', 'r', encoding='utf-8') as file:
            lines = file.readlines()

            # Skip files without at least one title/url pair
            if len(lines) < 2:
                continue

            found_results = []
            for i in range(0, len(lines), 2):
                if i + 1 >= len(lines):
                    break

                title = lines[i].strip()
                url = lines[i + 1].strip()
                if keyword.lower() in title.lower() or keyword.lower() in url.lower():
                    found_results.append((title, url))

            if found_results:
                result_text.insert(tk.END, f'Results - {file_name[:-5]}:\n\n', 'title')
                for title, url in found_results:
                    link_count += 1
                    # A globally unique tag per link, so a binding from one file
                    # is not overwritten by a later file's links
                    tag = f'link{link_count}'
                    result_text.insert(tk.END, f'{title}\n', 'found_title')
                    result_text.insert(tk.END, f'{url}\n', tag)
                    result_text.tag_configure(tag, foreground='blue', underline=True)
                    result_text.tag_bind(tag, '<Button-1>', lambda event, url=url: open_url(url))
                result_text.insert(tk.END, '\n')

    if result_text.get('1.0', tk.END) == '\n':
        result_text.insert(tk.END, 'No search results\n')


# Build the UI: a main window with entry points for both crawl modes
window = tk.Tk()
window.title('Web Search Engine')
window.geometry('800x600')

button_manual_crawl = tk.Button(window, text='Manual Crawl', command=load_manual_crawl_page)
button_manual_crawl.pack()

button_auto_crawl = tk.Button(window, text='Auto Crawl', command=load_auto_crawl_page)
button_auto_crawl.pack()

window.mainloop()