Python爬虫实战:打造本地百度搜索引擎
想体验一把自己动手打造搜索引擎的乐趣吗?本文将带你使用Python和Beautiful Soup构建一个简单的本地百度搜索引擎,抓取百度搜索结果并保存到本地,实现离线关键词搜索功能。
项目目标
- 使用Python爬取百度搜索结果
- 将搜索结果保存到本地HTML文件
- 创建GUI界面,实现关键词搜索功能
代码实现pythonimport requestsfrom bs4 import BeautifulSoupimport timeimport tkinter as tkimport webbrowserimport randomimport osimport re
def get_random_user_agent():
    """Return one of a fixed pool of desktop Chrome User-Agent strings, chosen at random."""
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    ]
    return random.choice(user_agents)
def crawl_baidu(keyword, page_limit):
    """Fetch Baidu search results for *keyword* across up to *page_limit* pages.

    Args:
        keyword: The search term to query.
        page_limit: Number of result pages to fetch (10 results per page).

    Returns:
        A list of (title, url) tuples. On a network error, the results
        collected so far are returned instead of raising.
    """
    headers = {'User-Agent': get_random_user_agent()}

    results = []
    for page in range(1, page_limit + 1):
        # Baidu paginates via the `pn` offset parameter, 10 results per page.
        url = f'https://www.baidu.com/s?wd={keyword}&pn={(page - 1) * 10}'

        # Random delay between requests to avoid hammering the server.
        time.sleep(random.uniform(0.5, 1.0))

        try:
            # A timeout keeps a dead connection from blocking the GUI forever.
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # 检查请求是否成功
        except requests.exceptions.RequestException as e:
            print(f'网络请求异常: {e}')
            return results

        soup = BeautifulSoup(response.text, 'html.parser')

        for result in soup.find_all('div', class_='result'):
            title_tag = result.find('h3')
            link_tag = result.find('a')
            # Skip malformed result blocks instead of crashing with
            # AttributeError/KeyError when <h3>, <a>, or href is missing.
            if title_tag is None or link_tag is None or not link_tag.has_attr('href'):
                continue
            results.append((title_tag.get_text(), link_tag['href']))

    return results
def open_url(url):
    """Open *url* in the system's default web browser."""
    webbrowser.open(url)
def crawl_and_index():
    """Crawl Baidu for each keyword in the entry box and save results to disk.

    Reads keywords and page count from the GUI entry widgets, then writes
    one file per keyword under webpages/ in the format "title\\nurl\\n\\n".
    """
    # Split on ASCII/Chinese commas and whitespace; drop the empty tokens
    # re.split() produces for leading/trailing delimiters (an empty keyword
    # would otherwise create a file named 'webpages/.html').
    keywords = [k for k in re.split(r'[,,\s]+', entry_keywords.get()) if k]

    # A non-numeric page count would otherwise raise an uncaught ValueError.
    try:
        page_limit = int(entry_pages.get())
    except ValueError:
        print('爬取页数必须是整数')
        return

    # Folder used to store the saved result pages.
    os.makedirs('webpages', exist_ok=True)

    for keyword in keywords:
        search_results = crawl_baidu(keyword, page_limit)
        if search_results:
            file_name = f'webpages/{keyword}.html'
            with open(file_name, 'w', encoding='utf-8') as file:
                for title, url in search_results:
                    # One entry = title line, url line, blank separator line.
                    file.write(f'{title}\n')
                    file.write(f'{url}\n')
                    file.write('\n')
            print(f'关键词 \'{keyword}\' 的搜索结果已保存至文件: {file_name}')
        else:
            print(f'关键词 \'{keyword}\' 没有搜索结果')
def search_local():
    """Search the locally saved result files for the keyword in the search box.

    Matching (title, url) pairs are rendered in the result text widget, with
    each URL styled as a clickable link that opens in the browser.
    """
    keyword = entry_search.get()
    result_text.delete('1.0', tk.END)

    # No crawl has been run yet — avoid FileNotFoundError from os.listdir.
    if not os.path.isdir('webpages'):
        result_text.insert(tk.END, '没有搜索结果\n')
        return

    # Tag names must be unique across ALL files: a per-file counter would
    # reuse 'link1' and rebind earlier links to later files' URLs.
    link_id = 0

    for file_name in os.listdir('webpages'):
        with open(f'webpages/{file_name}', 'r', encoding='utf-8') as file:
            # Files are written as "title\nurl\n\n" per entry. The blank
            # separator line must be dropped before pairing, otherwise every
            # entry after the first is misaligned (blank/title read as a pair).
            lines = [line.strip() for line in file if line.strip()]

        found_results = []
        for i in range(0, len(lines) - 1, 2):
            title, url = lines[i], lines[i + 1]
            if keyword.lower() in title.lower() or keyword.lower() in url.lower():
                found_results.append((title, url))

        if found_results:
            # file_name[:-5] strips the '.html' suffix, leaving the keyword.
            result_text.insert(tk.END, f'搜索结果 - {file_name[:-5]}:\n\n', 'title')
            for title, url in found_results:
                link_id += 1
                tag = f'link{link_id}'
                result_text.insert(tk.END, f'{title}\n', 'found_title')
                result_text.insert(tk.END, f'{url}\n', tag)
                result_text.tag_configure(tag, foreground='blue', underline=True)
                # Default-arg binding captures the current url (late-binding fix).
                result_text.tag_bind(tag, '<Button-1>', lambda event, url=url: open_url(url))
            result_text.insert(tk.END, '\n')

    # Widget still holds only the trailing newline Text always keeps → no hits.
    if result_text.get('1.0', tk.END) == '\n':
        result_text.insert(tk.END, '没有搜索结果\n')
# Build the UI.
window = tk.Tk()
window.title('百度搜索')
window.geometry('800x600')

# Keyword entry for crawling.
label_keywords = tk.Label(window, text='请输入关键词(用逗号或空格隔开):')
label_keywords.pack()
entry_keywords = tk.Entry(window)
entry_keywords.pack()

# Number of result pages to fetch per keyword.
label_pages = tk.Label(window, text='请输入爬取页数:')
label_pages.pack()
entry_pages = tk.Entry(window)
entry_pages.pack()

crawl_button = tk.Button(window, text='爬取并索引', command=crawl_and_index)
crawl_button.pack()

# Local search over the saved files.
label_search = tk.Label(window, text='请输入搜索关键词:')
label_search.pack()
entry_search = tk.Entry(window)
entry_search.pack()

search_button = tk.Button(window, text='搜索', command=search_local)
search_button.pack()

# Scrollable text widget for displaying search results.
scrollbar = tk.Scrollbar(window)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_text = tk.Text(window, yscrollcommand=scrollbar.set)
result_text.pack(fill=tk.BOTH)
scrollbar.config(command=result_text.yview)

window.mainloop()
代码解析
- 爬虫部分: 使用requests库获取网页内容,BeautifulSoup解析HTML,提取标题和链接。
- 本地存储: 将爬取到的结果按关键词分别保存到本地HTML文件中。
- GUI界面: 使用tkinter创建简单的用户界面,方便输入关键词、爬取页数和进行搜索。
- 搜索功能: 读取本地HTML文件,根据用户输入的关键词进行匹配,并以友好的方式展示搜索结果。
注意事项
- 本项目仅供学习交流使用,请勿用于商业用途。
- 爬取网页时请遵守 robots.txt 协议,合理设置爬取频率,避免对目标网站造成压力。
希望这个简单的项目能帮助你了解Python爬虫和本地搜索引擎的基本原理。你可以尝试扩展功能,例如实现更复杂的搜索逻辑、优化搜索结果排序等。
原文地址: https://www.cveoy.top/t/topic/SKE 著作权归作者所有。请勿转载和采集!