# Python 文件检索系统优化:代码结构重构与性能提升
import bs4
import collections
import docx
import matplotlib.pyplot as plt
import openpyxl
import os
import pandas as pd
import PyPDF2
import re
from collections import Counter
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from matplotlib.backends.backend_tkagg import NavigationToolbar2Tk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tkinter import filedialog
from tkinter import ttk
import tkinter as tk
plt.rcParams['font.family'] = ['Microsoft YaHei'] # Microsoft YaHei font so the chart's CJK labels render (Windows font — TODO confirm availability on other platforms)
def extract_text_from_file(file_path):
    """
    Extract the plain-text content of a file based on its extension.

    Supported formats: .docx, .xlsx, .pptx, .pdf, .html, .txt.
    Any other extension yields an empty string.

    Args:
        file_path (str): Path to the file.

    Returns:
        str: Extracted text content, or '' for unsupported formats.
    """
    # os.path.splitext is robust against dots in directory names; lower()
    # makes the dispatch case-insensitive ('.TXT', '.Docx', ...).
    ext = os.path.splitext(file_path)[1].lower().lstrip('.')
    if ext == 'docx':
        doc = docx.Document(file_path)
        return '\n'.join(p.text for p in doc.paragraphs)
    elif ext == 'xlsx':
        wb = openpyxl.load_workbook(file_path, read_only=True)
        try:
            return ' '.join(
                str(cell.value)
                for sheet in wb.worksheets
                for row in sheet.iter_rows()
                for cell in row
                if cell.value is not None
            )
        finally:
            wb.close()  # read-only workbooks keep the file handle open until closed
    elif ext == 'pptx':
        # Imported lazily: the top of the file never imports python-pptx,
        # so the original bare `Presentation(...)` call raised NameError.
        from pptx import Presentation
        prs = Presentation(file_path)
        return '\n'.join(
            shape.text
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, 'text')
        )
    elif ext == 'pdf':
        with open(file_path, 'rb') as f:
            # PyPDF2 >= 3 removed PdfFileReader; prefer the modern PdfReader
            # and fall back for old installations.
            if hasattr(PyPDF2, 'PdfReader'):
                pdf = PyPDF2.PdfReader(f, strict=False)
            else:
                pdf = PyPDF2.PdfFileReader(f, strict=False)
            # extract_text() may return None for image-only pages.
            return '\n'.join(page.extract_text() or '' for page in pdf.pages)
    elif ext == 'html':
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            # The file imports `bs4`, not `BeautifulSoup` directly, so the
            # original unqualified BeautifulSoup(...) call raised NameError.
            soup = bs4.BeautifulSoup(f.read(), 'html.parser')
        return soup.get_text()
    elif ext == 'txt':
        # errors='ignore' keeps indexing alive on files with mixed encodings.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            return f.read()
    else:
        return ''
def classify_keywords(keywords):
    """
    Split raw keyword tokens into AND / OR / NOT groups.

    A leading '+' marks an optional (OR) keyword, a leading '-' marks an
    excluded (NOT) keyword; anything else is a required (AND) keyword.

    Args:
        keywords (list): Raw keyword strings.

    Returns:
        tuple: (and_keywords, or_keywords, not_keywords) lists.
    """
    groups = {'': [], '+': [], '-': []}
    for token in keywords:
        prefix = token[:1] if token[:1] in ('+', '-') else ''
        groups[prefix].append(token[len(prefix):])
    return groups[''], groups['+'], groups['-']
def search_documents_by_frequency(results, and_keywords, or_keywords, not_keywords):
    """
    Score documents by raw keyword frequency.

    A document containing any NOT keyword scores 0; otherwise its score is
    the total occurrence count of every word of every AND and OR keyword.

    Args:
        results (list): (file_path, text) tuples; mutated in place.
        and_keywords (list): Required keywords.
        or_keywords (list): Optional keywords.
        not_keywords (list): Excluding keywords.

    Returns:
        list: (file_path, text, score) tuples (the same list object).
    """
    for i, (file_path, text) in enumerate(results):
        # re.escape prevents keywords containing regex metacharacters
        # ('+', '.', '(' ...) from being interpreted as patterns or
        # raising re.error — the original interpolated them raw.
        excluded = any(
            re.search(r'\b{}\b'.format(re.escape(query)), text, re.I)
            for query in not_keywords
        )
        if excluded:
            results[i] = (file_path, text, 0)
            continue
        # Counts are non-negative, so the original's `if score > 0` guard
        # on OR keywords was a no-op; AND and OR contribute identically.
        score = sum(
            text.count(word)
            for query in and_keywords + or_keywords
            for word in query.split()
        )
        results[i] = (file_path, text, score)
    return results
def search_documents_by_tfidf(results, and_keywords, or_keywords, not_keywords):
    """
    Score documents with raw AND-keyword counts plus TF-IDF cosine
    similarity against the OR keywords and the de-duplicated query terms.

    A document containing any NOT keyword scores 0.

    Args:
        results (list): (file_path, text) tuples; mutated in place.
        and_keywords (list): Required keywords.
        or_keywords (list): Optional keywords.
        not_keywords (list): Excluding keywords.

    Returns:
        list: (file_path, text, score) tuples (the same list object).
    """
    corpus = [result[1] for result in results]
    vectorizer = TfidfVectorizer()
    doc_term_matrix = vectorizer.fit_transform(corpus)
    # Query vectors do not depend on the document, so build them once here
    # instead of re-transforming inside the per-document loop.
    # Guard against empty query lists: transform([]) yields a 0-row matrix
    # and the original then crashed indexing the similarity result.
    or_matrix = vectorizer.transform(or_keywords) if or_keywords else None
    # De-duplicated words of every query ('a a b' -> 'a b'). The original
    # joined t[0] over Counter keys — i.e. the first *character* of each
    # word — which was clearly unintended.
    dedup_queries = [' '.join(Counter(q.split())) for q in and_keywords + or_keywords]
    dedup_matrix = vectorizer.transform(dedup_queries) if dedup_queries else None
    for i, (file_path, text) in enumerate(results):
        # re.escape keeps keywords with regex metacharacters literal.
        excluded = any(
            re.search(r'\b{}\b'.format(re.escape(query)), text, re.I)
            for query in not_keywords
        )
        if excluded:
            results[i] = (file_path, text, 0)
            continue
        file_score = sum(
            text.count(word) for query in and_keywords for word in query.split()
        )
        doc_vector = doc_term_matrix[i]
        if or_matrix is not None:
            # Similarity of every OR query against this document; the
            # original indexed row [0] and only ever used the first query.
            sims = cosine_similarity(or_matrix, doc_vector)
            file_score += sum(s for s in sims[:, 0] if s > 0)
        if dedup_matrix is not None:
            sims = cosine_similarity(dedup_matrix, doc_vector)
            file_score += sum(s for s in sims[:, 0] if s > 0)
        results[i] = (file_path, text, file_score)
    return results
def search_documents_by_vector_space_model(results, and_keywords, or_keywords, not_keywords):
    """
    Score documents with the vector space model: TF-IDF cosine similarity
    summed over all positive query terms, minus the summed similarity to
    all NOT keywords.

    Args:
        results (list): (file_path, text) tuples; mutated in place.
        and_keywords (list): Required keywords.
        or_keywords (list): Optional keywords.
        not_keywords (list): Excluding keywords.

    Returns:
        list: (file_path, text, score) tuples (the same list object).
    """
    corpus = [result[1] for result in results]
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(corpus)
    queries = and_keywords + or_keywords
    if not queries:
        # No positive query terms: transform([]) would produce a 0-column
        # similarity and crash; every document simply scores 0.
        for i, (file_path, text) in enumerate(results):
            results[i] = (file_path, text, 0)
        return results
    # (n_docs, n_queries) similarity matrix, computed once.
    score_matrix = cosine_similarity(tfidf, vectorizer.transform(queries))
    not_matrix = (
        cosine_similarity(tfidf, vectorizer.transform(not_keywords))
        if not_keywords else None
    )
    for i, doc_scores in enumerate(score_matrix):
        # Sum over *all* query terms; the original read doc_scores[0]
        # (and not_score[0][0]) and so only ever scored against the first
        # keyword of each list.
        score = float(doc_scores.sum())
        if not_matrix is not None:
            score -= float(not_matrix[i].sum())
        results[i] = (results[i][0], results[i][1], score)
    return results
class SearchApp(tk.Tk):
    """Tkinter front-end for the keyword document search engine."""

    def __init__(self):
        super().__init__()
        self.title('文件检索系统')
        self.geometry('1600x900')
        self.resizable(False, False)
        self.create_widgets()

    def create_widgets(self):
        """Build the folder picker, keyword entry, algorithm selector,
        search button, results text box and one embedded matplotlib chart."""
        # Folder selection row
        folder_frame = tk.Frame(self)
        folder_frame.pack(side='top', pady=10)
        self.folder_label = tk.Label(folder_frame, text='选择一个文件夹:')
        self.folder_label.pack(side='left')
        self.folder_entry = tk.Entry(folder_frame, width=50)
        self.folder_entry.pack(side='left')
        self.folder_button = tk.Button(folder_frame, text='选择文件夹', command=self.select_folder)
        self.folder_button.pack(side='left', padx=10)
        # Keyword input row
        keywords_frame = tk.Frame(self)
        keywords_frame.pack(side='top', pady=10)
        self.keywords_label = tk.Label(keywords_frame, text='输入关键词:')
        self.keywords_label.pack(side='left')
        self.keywords_entry = tk.Entry(keywords_frame, width=50)
        self.keywords_entry.pack(side='left')
        # Algorithm selection row
        algorithm_frame = tk.Frame(self)
        algorithm_frame.pack(side='top', pady=10)
        self.algorithm_label = tk.Label(algorithm_frame, text='选择算法:')
        self.algorithm_label.pack(side='left')
        self.algorithm_dropdown = ttk.Combobox(algorithm_frame, width=48)
        self.algorithm_dropdown['values'] = ['词频', 'TF-IDF', '空间向量模型']
        self.algorithm_dropdown.current(0)
        self.algorithm_dropdown.pack(side='left', padx=10)
        # Search button
        search_button_frame = tk.Frame(self)
        search_button_frame.pack(side='top', pady=10)
        self.search_button = tk.Button(search_button_frame, text='检索', command=self.search_documents)
        self.search_button.pack()
        # Results area: text box on the left, chart on the right.
        # The original built *two* FigureCanvasTkAgg widgets plus an unused
        # tk.Canvas for the same figure; a single canvas is kept here and
        # exposed under both attribute names the class uses.
        results_frame = tk.Frame(self)
        results_frame.pack(side='top', pady=10)
        self.results_text_label = tk.Label(results_frame, text='检索结果文本框:')
        self.results_text_label.pack(side='top')
        self.results_text = tk.Text(results_frame, wrap='word', height=20)
        self.results_text.pack(side='left')
        self.results_figure_label = tk.Label(results_frame, text='检索结果图表:')
        self.results_figure_label.pack(side='top', pady=(0, 10))
        self.results_figure = plt.Figure(figsize=(6, 5), dpi=100)
        self.results_plot = self.results_figure.add_subplot(111)
        self.results_canvas = FigureCanvasTkAgg(self.results_figure, results_frame)
        self.results_canvas.get_tk_widget().pack(side='left', padx=(50, 10))
        self.results_figure_canvas = self.results_canvas  # alias used by search_documents
        self.toolbar = NavigationToolbar2Tk(self.results_canvas, self)
        self.toolbar.update()

    def select_folder(self):
        """Open a directory chooser and place the result in the folder entry."""
        folder_path = filedialog.askdirectory()
        self.folder_entry.delete(0, tk.END)
        self.folder_entry.insert(0, folder_path)

    def search_documents(self):
        """Run the selected algorithm over the chosen folder and show the
        ranked results in the text box and the bar chart."""
        self.results_text.delete(1.0, tk.END)
        self.results_plot.clear()
        # NOTE(review): the leading '+' turns the *first* token into an OR
        # keyword — looks deliberate but verify against the intended query
        # syntax.
        keywords = ('+' + self.keywords_entry.get()).split()
        algorithm = self.algorithm_dropdown.get()
        search_path = self.folder_entry.get()
        and_keywords, or_keywords, not_keywords = classify_keywords(keywords)
        results = []
        for foldername, _, filenames in os.walk(search_path):
            for filename in filenames:
                file_path = os.path.join(foldername, filename)
                results.append((file_path, extract_text_from_file(file_path)))
        if not results:
            # pandas raises on an empty frame; give feedback instead.
            self.results_text.insert(tk.END, '未找到任何文件。\n')
            self.results_figure_canvas.draw()
            return
        searchers = {
            '词频': search_documents_by_frequency,
            'TF-IDF': search_documents_by_tfidf,
            '空间向量模型': search_documents_by_vector_space_model,
        }
        searcher = searchers.get(algorithm)
        if searcher is not None:
            results = searcher(results, and_keywords, or_keywords, not_keywords)
        results.sort(key=lambda x: x[2], reverse=True)
        # Fill the text box — the original cleared it but never wrote to it.
        for file_path, _, score in results:
            self.results_text.insert(tk.END, '{}  (score: {})\n'.format(file_path, score))
        graph_results = [(os.path.basename(fp), score) for fp, _, score in results]
        df = pd.DataFrame(graph_results, columns=['File', 'Score'])
        df.plot.bar(x='File', y='Score', ax=self.results_plot, rot=0)
        self.results_plot.set_title('Search Results')
        self.results_figure_canvas.draw()
if __name__ == '__main__':
    # Launch the GUI event loop only when run as a script.
    SearchApp().mainloop()
# 原文地址: https://www.cveoy.top/t/topic/oSRm 著作权归作者所有。请勿转载和采集!