# Spam classifier: a naive Bayes model with a Tkinter interface.
import random
import tkinter as tk
from collections import Counter
from tkinter import filedialog
from tkinter import messagebox

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import font_manager
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from matplotlib.figure import Figure
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud
class SpamClassifier:
    """Naive Bayes spam classifier over bag-of-words features.

    The dataset is a CSV file with a ``messages`` column (raw text) and a
    ``labels`` column. ``count_word_frequency`` treats the label value
    ``'spam'`` as the spam class.
    """

    def __init__(self):
        """Create the CountVectorizer and MultinomialNB instances."""
        self.vectorizer = CountVectorizer()
        self.classifier = MultinomialNB()

    def load_dataset(self, file_path: str) -> None:
        """Load the CSV at *file_path* and vectorize its messages.

        Sets ``self.data`` (the DataFrame), ``self.X`` (the document-term
        matrix produced by fitting the vectorizer) and ``self.y`` (labels).

        Raises:
            KeyError: if the ``messages`` or ``labels`` column is missing.
        """
        data = pd.read_csv(file_path)
        # Rows without a message or a label cannot be vectorized or used for
        # training, so they are dropped up front instead of failing later.
        data = data.dropna(subset=['messages', 'labels']).reset_index(drop=True)
        self.data = data
        self.X = self.vectorizer.fit_transform(data['messages'])
        self.y = data['labels']

    def train_model(self) -> None:
        """Fit the classifier on every row of the loaded dataset."""
        self.classifier.fit(self.X, self.y)

    def predict(self, text: str):
        """Return the predicted label for a single message *text*."""
        features = self.vectorizer.transform([text])
        return self.classifier.predict(features)[0]

    def evaluate(self, X_test, y_test):
        """Return ``(confusion_matrix, classification_report)`` for a test set.

        *X_test* must already be vectorized with ``self.vectorizer``.
        """
        y_pred = self.classifier.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        return (cm, report)

    def split_data(self, test_size: float = 0.2, random_state=None):
        """Split raw messages and labels into train and test parts.

        Returns the ``train_test_split`` result in the order
        ``(messages_train, messages_test, labels_train, labels_test)``.
        Pass *random_state* to make the split reproducible.
        """
        return train_test_split(
            self.data['messages'],
            self.data['labels'],
            test_size=test_size,
            random_state=random_state,
        )

    def count_word_frequency(self):
        """Return a Counter of whitespace-separated words in spam messages."""
        is_spam = self.data['labels'] == 'spam'
        spam_messages = self.data.loc[is_spam, 'messages']
        spam_word_count = Counter()
        for text in spam_messages:
            # str() keeps non-string cells (e.g. numbers) from breaking split().
            spam_word_count.update(str(text).split())
        return spam_word_count
class App:
    """Tkinter front end for SpamClassifier.

    Lets the user load a CSV dataset, train the model, classify typed or
    randomly drawn messages, and view a label pie chart, a confusion matrix
    with classification report, and a spam word cloud.
    """

    # Font handed to WordCloud (a CJK-capable Windows font file name).
    WORDCLOUD_FONT = 'msyh.ttf'

    def __init__(self, master):
        """Build the window on *master* and create the SpamClassifier."""
        self.master = master
        self.master.title('垃圾邮件分类器')
        self.master.geometry('1000x700')
        self.classifier = SpamClassifier()

        # True only after train_model() ran on the currently loaded dataset.
        self.model_trained = False
        # Tk widget of the figure currently embedded in the canvas, if any.
        self.plot = None

        self.load_button = tk.Button(self.master, text='导入数据集', command=self.load_dataset)
        self.load_button.pack()
        self.file_label = tk.Label(self.master, text='未选择文件')
        self.file_label.pack()
        self.train_button = tk.Button(self.master, text='训练模型', command=self.train_model)
        self.train_button.pack()
        self.text_entry = tk.Entry(self.master, width=50)
        self.text_entry.pack()
        self.predict_button = tk.Button(self.master, text='预测', command=self.predict)
        self.predict_button.pack()
        self.random_predict_button = tk.Button(self.master, text='随机预测', command=self.random_predict)
        self.random_predict_button.pack()
        self.result_label = tk.Label(self.master, text='')
        self.result_label.pack()

        # Frame holding the pie-chart and confusion-matrix buttons.
        self.plot_frame = tk.Frame(self.master)
        self.plot_frame.pack()
        self.pie_button = tk.Button(self.plot_frame, text='邮件分类饼图', command=self.show_pie_chart)
        self.pie_button.pack(side=tk.LEFT)
        self.confusion_matrix_button = tk.Button(self.plot_frame, text='混淆矩阵', command=self.show_confusion_matrix)
        self.confusion_matrix_button.pack(side=tk.LEFT)
        self.clear_canvas_button = tk.Button(self.master, text='清空画布', command=self.clear_canvas)
        self.clear_canvas_button.pack()
        self.cm_text = tk.Text(self.master, width=55, height=20)
        self.cm_text.pack(side=tk.RIGHT)
        self.canvas = tk.Canvas(self.master, width=600, height=300)
        self.canvas.pack(side=tk.RIGHT)
        self.wordcloud_button = tk.Button(self.master, text='垃圾邮件词云', command=self.show_wordcloud)
        self.wordcloud_button.pack()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _require_dataset(self):
        """Return True if a dataset is loaded; otherwise show an error."""
        if hasattr(self.classifier, 'data'):
            return True
        messagebox.showerror('错误', '请先导入数据集')
        return False

    def _require_model(self):
        """Return True if the model is trained; otherwise show an error."""
        if self.model_trained:
            return True
        messagebox.showerror('错误', '请先训练模型')
        return False

    def _show_result(self, result):
        """Update the result label from a predicted label value."""
        if result == 'spam':
            self.result_label.config(text='垃圾邮件')
        else:
            self.result_label.config(text='非垃圾邮件')

    def _discard_plot(self):
        """Remove whatever is currently drawn in the canvas area."""
        self.canvas.delete('all')
        if self.plot is not None:
            self.plot.destroy()
            self.plot = None

    def _embed_figure(self, fig):
        """Replace the current plot with *fig* embedded in the canvas."""
        # Destroy the previous widget first; otherwise every new chart is
        # packed below the old ones and they pile up.
        self._discard_plot()
        figure_canvas = FigureCanvasTkAgg(fig, master=self.canvas)
        figure_canvas.draw()
        self.plot = figure_canvas.get_tk_widget()
        self.plot.pack()

    # ------------------------------------------------------------------
    # Button callbacks
    # ------------------------------------------------------------------
    def load_dataset(self):
        """Ask for a CSV file, load it, and display its path."""
        file_path = filedialog.askopenfilename()
        if not file_path:
            return
        # Loading refits the vectorizer, so any earlier model is stale.
        self.model_trained = False
        try:
            self.classifier.load_dataset(file_path)
        except (OSError, KeyError, ValueError) as exc:
            # pandas parser/decoding errors are ValueError subclasses;
            # KeyError covers a missing 'messages' / 'labels' column.
            messagebox.showerror('错误', f'无法加载数据集:{exc}')
            return
        self.file_label.config(text='文件路径:' + file_path)

    def train_model(self):
        """Train the model on the loaded dataset and confirm with a dialog."""
        if not self._require_dataset():
            return
        self.classifier.train_model()
        self.model_trained = True
        messagebox.showinfo('成功', '模型训练完成')

    def predict(self):
        """Classify the text in the entry box and update the result label."""
        text = self.text_entry.get()
        if not text:
            messagebox.showerror('错误', '请输入文本')
            return
        if not self._require_model():
            return
        self._show_result(self.classifier.predict(text))

    def random_predict(self):
        """Classify a message drawn at random from a fresh test split."""
        if not (self._require_dataset() and self._require_model()):
            return
        test_messages = self.classifier.split_data()[1]
        text = test_messages.iloc[random.randrange(len(test_messages))]
        self.text_entry.delete(0, tk.END)
        self.text_entry.insert(0, text)
        self._show_result(self.classifier.predict(text))

    def show_pie_chart(self):
        """Draw a pie chart of the ham/spam label proportions."""
        if not self._require_dataset():
            return
        counts = Counter(self.classifier.y)
        labels = ['ham', 'spam']
        # Counter yields 0 for an absent label instead of raising KeyError.
        values = [counts[label] for label in labels]
        if not any(values):
            messagebox.showerror('错误', '数据集中没有 ham 或 spam 标签')
            return
        # A plain Figure (not pyplot) avoids pyplot's global figure registry,
        # which would otherwise keep every figure alive.
        fig = Figure()
        ax = fig.add_subplot()
        ax.pie(values, labels=labels, autopct='%1.1f%%')
        ax.set_title('Message classification scale')
        self._embed_figure(fig)

    def show_confusion_matrix(self):
        """Show the confusion matrix and classification report."""
        if not (self._require_dataset() and self._require_model()):
            return
        # NOTE(review): SpamClassifier.train_model fits on the whole dataset
        # and this split is drawn from those same rows, so the metrics shown
        # here are optimistic (the test rows were seen during training).
        _, X_test, _, y_test = self.classifier.split_data()
        features = self.classifier.vectorizer.transform(X_test)
        cm, report = self.classifier.evaluate(features, y_test)

        # Clear the previous report so repeated clicks do not append.
        self.cm_text.delete('1.0', tk.END)
        self.cm_text.insert(tk.END, '混淆矩阵:\n')
        self.cm_text.insert(tk.END, str(cm))
        self.cm_text.insert(tk.END, '\n分类报告:\n')
        self.cm_text.insert(tk.END, str(report))

        fig = Figure()
        ax = fig.add_subplot()
        im = ax.imshow(cm, cmap='Blues')
        # NOTE(review): the tick labels assume a 2x2 matrix with ham at
        # index 0 and spam at index 1 — confirm against the dataset labels.
        ax.set_xticks([0, 1])
        ax.set_yticks([0, 1])
        ax.set_xticklabels(['ham', 'spam'], fontsize=14)
        ax.set_yticklabels(['ham', 'spam'], fontsize=14)
        ax.set_xlabel('Predicted labels', fontsize=14)
        ax.set_ylabel('True labels', fontsize=14)

        def format_coord(x, y):
            # imshow centres cell (row, col) on integer coordinates, so the
            # cell under the cursor is found by rounding, not truncating.
            row, col = int(round(y)), int(round(x))
            if 0 <= row < cm.shape[0] and 0 <= col < cm.shape[1]:
                return f'True: {row} Predicted: {col} Value: {cm[row, col]}'
            return f'True: {row} Predicted: {col}'

        ax.format_coord = format_coord
        ax.set_title('Confusion Matrix', fontsize=18)
        cbar = fig.colorbar(im, ax=ax)
        cbar.ax.tick_params(labelsize=14)
        self._embed_figure(fig)

    def clear_canvas(self):
        """Remove the current plot and clear the report text box."""
        self._discard_plot()
        self.cm_text.delete('1.0', tk.END)

    def show_wordcloud(self):
        """Draw a word cloud of the words found in spam messages."""
        if not self._require_dataset():
            return
        spam_word_count = self.classifier.count_word_frequency()
        if not spam_word_count:
            messagebox.showerror('错误', '数据集中没有垃圾邮件')
            return
        try:
            wc = WordCloud(background_color='white', font_path=self.WORDCLOUD_FONT)
            wc.generate_from_frequencies(spam_word_count)
        except OSError:
            # Presumably raised when the font file cannot be opened (e.g. on
            # a non-Windows machine) — fall back to WordCloud's default font.
            wc = WordCloud(background_color='white')
            wc.generate_from_frequencies(spam_word_count)
        # No plt.show(): it would open a second window from inside the Tk
        # callback instead of drawing in this application's canvas.
        fig = Figure()
        ax = fig.add_subplot()
        ax.imshow(wc, interpolation='bilinear')
        ax.axis('off')
        self._embed_figure(fig)
# Launch the GUI only when run as a script, so importing this module
# (e.g. to reuse SpamClassifier) does not open a window.
if __name__ == '__main__':
    root = tk.Tk()
    app = App(root)
    root.mainloop()
# Source: https://www.cveoy.top/t/topic/oqHh
# Copyright belongs to the original author. Do not repost or scrape.