# Spam classification: a naive Bayes implementation in Python (垃圾邮件分类:基于朴素贝叶斯算法的Python实现)
import tkinter as tk
from tkinter import filedialog, messagebox

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
class SpamClassification:
    """Tkinter front end for training and evaluating a naive Bayes spam filter.

    Workflow driven by the buttons, in order: import a data set, optionally
    preprocess it, split it into train/test parts, train a model, then plot
    or evaluate the predictions.  The data set must provide a text column
    named ``messages`` and a target column named ``labels``.

    State kept on the instance (all ``None`` until the matching step ran):
    ``data``, ``X_train``/``X_test``/``y_train``/``y_test``, ``vectorizer``,
    ``test_matrix``, ``clf`` and ``y_pred``.
    """

    # Column names the workflow reads from the imported data set.
    TEXT_COLUMN = 'messages'
    LABEL_COLUMN = 'labels'

    def __init__(self):
        """Build the window, wire the widgets and run the Tk main loop.

        The call blocks until the window is closed, because the main loop is
        started from here.
        """
        # Workflow state must exist before any button callback can fire.
        self.data = None
        self._reset_split()

        self.window = tk.Tk()
        self.window.title('Spam Classification')
        self.window.geometry('1000x600')

        # Data import
        self.import_button = tk.Button(self.window, text='导入数据集', command=self.import_data)
        self.import_button.pack(pady=10)

        # Preprocessing options
        self.preprocessing_label = tk.Label(self.window, text='数据预处理选项:')
        self.preprocessing_label.pack()
        self.missing_value_button = tk.Button(self.window, text='缺失值处理', command=self.missing_value_process)
        self.missing_value_button.pack()
        self.standardization_button = tk.Button(self.window, text='数值型数据标准化', command=self.standardization_process)
        self.standardization_button.pack()
        self.encoding_button = tk.Button(self.window, text='类别型数据编码', command=self.encoding_process)
        self.encoding_button.pack()

        # Train/test split
        self.train_test_label = tk.Label(self.window, text='训练集测试集划分:')
        self.train_test_label.pack()
        self.train_test_ratio_label = tk.Label(self.window, text='请输入训练集测试集比例(如0.8):')
        self.train_test_ratio_label.pack()
        self.train_test_ratio_entry = tk.Entry(self.window)
        self.train_test_ratio_entry.pack()
        self.train_test_button = tk.Button(self.window, text='划分训练集测试集', command=self.train_test_split)
        self.train_test_button.pack()

        # Model selection
        self.model_label = tk.Label(self.window, text='模型选择:')
        self.model_label.pack()
        self.model_choice = tk.StringVar()
        self.model_choice.set('MultinomialNB')
        self.model_optionmenu = tk.OptionMenu(self.window, self.model_choice, 'MultinomialNB', 'BernoulliNB')
        self.model_optionmenu.pack()
        self.train_button = tk.Button(self.window, text='训练模型', command=self.train_model)
        self.train_button.pack()

        # Plot selection
        self.plot_label = tk.Label(self.window, text='图像选择:')
        self.plot_label.pack()
        self.plot_choice = tk.StringVar()
        self.plot_choice.set('散点图')
        self.plot_optionmenu = tk.OptionMenu(self.window, self.plot_choice, '散点图', 'ROC曲线')
        self.plot_optionmenu.pack()
        self.plot_button = tk.Button(self.window, text='显示图像', command=self.show_plot)
        self.plot_button.pack()

        # Model evaluation
        self.evaluate_label = tk.Label(self.window, text='模型评估:')
        self.evaluate_label.pack()
        self.evaluate_choice = tk.StringVar()
        self.evaluate_choice.set('准确率')
        self.evaluate_optionmenu = tk.OptionMenu(
            self.window, self.evaluate_choice, '准确率', '精确率', '召回率', 'F1值', 'ROC曲线')
        self.evaluate_optionmenu.pack()
        self.evaluate_button = tk.Button(self.window, text='评估模型', command=self.evaluate_model)
        self.evaluate_button.pack()

        # Data preview
        self.data_show_label = tk.Label(self.window, text='数据集前10行展示:')
        self.data_show_label.pack()
        self.data_show_text = tk.Text(self.window, height=10)
        self.data_show_text.pack()

        self.window.mainloop()

    # ------------------------------------------------------------------
    # State helpers
    # ------------------------------------------------------------------
    def _reset_model(self) -> None:
        """Forget the fitted model and everything derived from it."""
        self.vectorizer = None
        self.test_matrix = None
        self.clf = None
        self.y_pred = None

    def _reset_split(self) -> None:
        """Forget the train/test split (and the model trained on it)."""
        self.X_train = self.X_test = None
        self.y_train = self.y_test = None
        self._reset_model()

    def _show_data(self) -> None:
        """Replace the preview box content with the first 10 rows of the data."""
        self.data_show_text.delete('1.0', 'end')
        self.data_show_text.insert('end', str(self.data.head(10)))

    # ------------------------------------------------------------------
    # Data import
    # ------------------------------------------------------------------
    def import_data(self) -> None:
        """Ask for a ``.csv`` or ``.xlsx`` file and load it as the data set.

        Any previous split and model are discarded, since they belong to the
        previously loaded data.
        """
        file_path = filedialog.askopenfilename()
        if not file_path:
            messagebox.showerror('错误', '请选择文件!')
            return

        lowered = file_path.lower()
        if lowered.endswith('.csv'):
            reader = pd.read_csv
        elif lowered.endswith('.xlsx'):
            reader = pd.read_excel
        else:
            self.data = None
            self._reset_split()
            messagebox.showerror('错误', '只支持csv和xlsx格式的文件!')
            return

        try:
            loaded = reader(file_path)
        except (OSError, ValueError, ImportError) as exc:
            # Unreadable/malformed file, or a missing Excel engine.
            messagebox.showerror('错误', '读取文件失败:%s' % exc)
            return

        self.data = loaded
        self._reset_split()
        self._show_data()

    # ------------------------------------------------------------------
    # Preprocessing
    # ------------------------------------------------------------------
    @staticmethod
    def _drop_missing(data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *data* without the rows that contain missing values."""
        return data.dropna().copy()

    @classmethod
    def _standardize_numeric(cls, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *data* with numeric feature columns standardized.

        The label column is left alone: scaling it would turn class ids into
        continuous values the classifier cannot be trained on.
        """
        result = data.copy()
        numeric_cols = [
            col for col in result.select_dtypes(include=np.number).columns
            if col != cls.LABEL_COLUMN
        ]
        # The scaler rejects empty input, so skip it when nothing is to scale.
        if numeric_cols and not result.empty:
            result[numeric_cols] = StandardScaler().fit_transform(result[numeric_cols])
        return result

    @classmethod
    def _encode_categorical(cls, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *data* with object columns label-encoded.

        The text column is skipped because the vectorizer needs the raw
        message strings.  Values are compared as strings, so missing values
        become their own category unless they were dropped beforehand.
        """
        result = data.copy()
        for col in result.select_dtypes(include='object').columns:
            if col == cls.TEXT_COLUMN:
                continue
            result[col] = LabelEncoder().fit_transform(result[col].astype(str))
        return result

    def _preprocess_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """Run all preprocessing steps on *data* and return the new frame.

        Steps, in order: drop rows with missing values, standardize numeric
        feature columns, label-encode categorical columns.  *data* itself is
        not modified.
        """
        cleaned = self._drop_missing(data)
        scaled = self._standardize_numeric(cleaned)
        return self._encode_categorical(scaled)

    def _apply_step(self, step) -> None:
        """Apply one preprocessing *step* to the loaded data and refresh the preview."""
        if self.data is None:
            messagebox.showerror('错误', '请先导入数据集!')
            return
        self.data = step(self.data)
        # The old split no longer matches the transformed data.
        self._reset_split()
        self._show_data()

    def missing_value_process(self) -> None:
        """Button handler: drop rows that contain missing values."""
        self._apply_step(self._drop_missing)

    def standardization_process(self) -> None:
        """Button handler: standardize the numeric feature columns."""
        self._apply_step(self._standardize_numeric)

    def encoding_process(self) -> None:
        """Button handler: label-encode the categorical columns."""
        self._apply_step(self._encode_categorical)

    # ------------------------------------------------------------------
    # Train/test split
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_train_ratio(text: str) -> float:
        """Parse the training-set fraction typed by the user.

        Raises:
            ValueError: if *text* is not a number strictly between 0 and 1.
        """
        ratio = float(text)
        if not 0.0 < ratio < 1.0:
            raise ValueError('train ratio must be strictly between 0 and 1')
        return ratio

    def train_test_split(self) -> None:
        """Button handler: split the data into train and test parts."""
        if self.data is None:
            messagebox.showerror('错误', '请先导入数据集!')
            return
        try:
            ratio = self._parse_train_ratio(self.train_test_ratio_entry.get())
        except ValueError:
            messagebox.showerror('错误', '请输入正确的训练集测试集比例!')
            return
        if self.LABEL_COLUMN not in self.data.columns:
            messagebox.showerror('错误', '数据集缺少必需的列:%s' % self.LABEL_COLUMN)
            return

        features = self.data.drop(self.LABEL_COLUMN, axis=1)
        target = self.data[self.LABEL_COLUMN]
        try:
            # Inside a method body this name resolves to the module-level
            # scikit-learn function, not to this method.
            parts = train_test_split(features, target, train_size=ratio, random_state=123)
        except ValueError as exc:
            # e.g. too few rows to honour the requested ratio.
            messagebox.showerror('错误', '无法划分数据集:%s' % exc)
            return

        self.X_train, self.X_test, self.y_train, self.y_test = parts
        # A model trained on the previous split is no longer valid.
        self._reset_model()
        messagebox.showinfo('提示', '训练集测试集划分完成!')

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def train_model(self) -> None:
        """Button handler: vectorize the messages, fit the model, predict the test set."""
        if self.data is None:
            messagebox.showerror('错误', '请先导入数据集!')
            return
        if self.X_train is None:
            messagebox.showerror('错误', '请先划分训练集测试集!')
            return
        if self.TEXT_COLUMN not in self.X_train.columns:
            messagebox.showerror('错误', '数据集缺少必需的列:%s' % self.TEXT_COLUMN)
            return

        vectorizer = CountVectorizer()
        model = MultinomialNB() if self.model_choice.get() == 'MultinomialNB' else BernoulliNB()
        try:
            train_matrix = vectorizer.fit_transform(self.X_train[self.TEXT_COLUMN].astype(str))
            test_matrix = vectorizer.transform(self.X_test[self.TEXT_COLUMN].astype(str))
            fitted = model.fit(train_matrix, self.y_train)
        except ValueError as exc:
            # e.g. empty vocabulary or labels the estimator cannot handle.
            messagebox.showerror('错误', '模型训练失败:%s' % exc)
            return

        # Keep the vectorizer and the test matrix: plotting and evaluation
        # need the same features the model was trained on.
        self.vectorizer = vectorizer
        self.test_matrix = test_matrix
        self.clf = fitted
        self.y_pred = fitted.predict(test_matrix)
        messagebox.showinfo('提示', '模型训练完成!')

    # ------------------------------------------------------------------
    # Plotting and evaluation
    # ------------------------------------------------------------------
    def _roc_metrics(self):
        """Return ``(fpr, tpr, auc)`` for the test set, or ``None`` after reporting an error.

        The second entry of ``clf.classes_`` is treated as the positive class,
        matching column 1 of ``predict_proba``.
        """
        classes = self.clf.classes_
        if len(classes) != 2:
            messagebox.showerror('错误', 'ROC曲线仅支持二分类问题!')
            return None
        y_score = self.clf.predict_proba(self.test_matrix)[:, 1]
        try:
            fpr, tpr, _ = roc_curve(self.y_test, y_score, pos_label=classes[1])
            auc = roc_auc_score(self.y_test, y_score)
        except ValueError as exc:
            # e.g. the test set contains only one class.
            messagebox.showerror('错误', '无法计算ROC曲线:%s' % exc)
            return None
        return fpr, tpr, auc

    def _plot_scatter(self) -> None:
        """Scatter the true test labels by row index, coloured by predicted label."""
        actual = np.asarray(self.y_test)
        # Map true and predicted labels to shared integer codes so that
        # non-numeric labels can be used as positions and colours.
        codes, labels = pd.factorize(np.concatenate([actual, np.asarray(self.y_pred)]), sort=True)
        count = len(actual)
        plt.scatter(self.X_test.index, codes[:count], c=codes[count:])
        plt.yticks(range(len(labels)), labels)
        plt.xlabel('Index')
        plt.ylabel('Label')
        plt.title('Scatter Plot')
        plt.show()

    def show_plot(self) -> None:
        """Button handler: show the scatter plot or the ROC curve."""
        if self.data is None:
            messagebox.showerror('错误', '请先导入数据集!')
            return
        if self.X_train is None:
            messagebox.showerror('错误', '请先划分训练集测试集!')
            return
        if self.clf is None:
            # Both plots need predictions from a trained model.
            messagebox.showerror('错误', '请先训练模型!')
            return

        if self.plot_choice.get() == '散点图':
            self._plot_scatter()
            return

        metrics = self._roc_metrics()
        if metrics is None:
            return
        fpr, tpr, auc = metrics
        plt.plot(fpr, tpr, label='ROC Curve (AUC=%0.2f)' % auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend()
        plt.show()

    def evaluate_model(self) -> None:
        """Button handler: report the selected metric for the test predictions.

        Precision, recall and F1 are reported for the last entry of
        ``clf.classes_`` (assumed to be the spam class -- confirm against the
        data set's label encoding).
        """
        if self.data is None:
            messagebox.showerror('错误', '请先导入数据集!')
            return
        if self.X_train is None:
            messagebox.showerror('错误', '请先划分训练集测试集!')
            return
        if self.y_pred is None:
            messagebox.showerror('错误', '请先进行模型预测!')
            return

        choice = self.evaluate_choice.get()
        if choice == '准确率':
            score = self.clf.score(self.test_matrix, self.y_test)
            messagebox.showinfo('评估结果', '准确率为:%0.2f' % score)
            return
        if choice == 'ROC曲线':
            metrics = self._roc_metrics()
            if metrics is not None:
                messagebox.showinfo('评估结果', 'ROC曲线下的面积为:%0.2f' % metrics[2])
            return

        per_class_metrics = {
            '精确率': ('precision', '垃圾短信精确率为:%0.2f'),
            '召回率': ('recall', '垃圾短信召回率为:%0.2f'),
            'F1值': ('f1-score', '垃圾短信F1值为:%0.2f'),
        }
        metric_key, template = per_class_metrics[choice]
        # Passing the trained classes guarantees a report row for the positive
        # class even if it is absent from the test set.
        report = classification_report(
            self.y_test, self.y_pred, labels=self.clf.classes_, output_dict=True)
        positive = str(self.clf.classes_[-1])
        messagebox.showinfo('评估结果', template % report[positive][metric_key])
# Launch the GUI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    SpamClassification()
# Source (原文地址): https://www.cveoy.top/t/topic/okP3 -- copyright belongs to the author; do not repost or scrape.