# Spam classification: a Python implementation based on the Naive Bayes algorithm

import tkinter as tk
from tkinter import filedialog
from tkinter import messagebox

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
class SpamClassification:
    """Tkinter application that trains and evaluates a Naive Bayes spam filter.

    The buttons drive the workflow from top to bottom: import a CSV/XLSX
    dataset, optionally preprocess it, split it into train/test parts, fit a
    ``MultinomialNB`` or ``BernoulliNB`` model on bag-of-words counts of the
    text column, then plot or evaluate the predictions.

    The dataset must contain a text column named ``messages`` and a target
    column named ``labels``.

    Note: constructing an instance builds the window and enters the Tk main
    loop, so ``SpamClassification()`` blocks until the window is closed.
    """

    TEXT_COLUMN = 'messages'
    LABEL_COLUMN = 'labels'

    # Attributes produced by the split step and by the training step; they
    # are reset to None whenever the data they were derived from changes.
    _SPLIT_ATTRS = ('X_train', 'X_test', 'y_train', 'y_test')
    _MODEL_ATTRS = ('vectorizer', 'test_matrix', 'clf', 'y_pred')

    # Menu entry -> (classification_report key, message template).
    _REPORT_METRICS = {
        '精确率': ('precision', '垃圾短信精确率为：%0.2f'),
        '召回率': ('recall', '垃圾短信召回率为：%0.2f'),
        'F1值': ('f1-score', '垃圾短信F1值为：%0.2f'),
    }

    def __init__(self):
        """Build the window and all widgets, then run the Tk event loop."""
        self.data = None
        self._invalidate_split()

        self.window = tk.Tk()
        self.window.title('Spam Classification')
        self.window.geometry('1000x600')

        # Dataset import
        self.import_button = tk.Button(self.window, text='导入数据集', command=self.import_data)
        self.import_button.pack(pady=10)

        # Preprocessing options
        self.preprocessing_label = tk.Label(self.window, text='数据预处理选项：')
        self.preprocessing_label.pack()
        self.missing_value_button = tk.Button(self.window, text='缺失值处理', command=self.missing_value_process)
        self.missing_value_button.pack()
        self.standardization_button = tk.Button(self.window, text='数值型数据标准化', command=self.standardization_process)
        self.standardization_button.pack()
        self.encoding_button = tk.Button(self.window, text='类别型数据编码', command=self.encoding_process)
        self.encoding_button.pack()

        # Train/test split
        self.train_test_label = tk.Label(self.window, text='训练集测试集划分：')
        self.train_test_label.pack()
        self.train_test_ratio_label = tk.Label(self.window, text='请输入训练集测试集比例（如0.8）：')
        self.train_test_ratio_label.pack()
        self.train_test_ratio_entry = tk.Entry(self.window)
        self.train_test_ratio_entry.pack()
        self.train_test_button = tk.Button(self.window, text='划分训练集测试集', command=self.train_test_split)
        self.train_test_button.pack()

        # Model selection
        self.model_label = tk.Label(self.window, text='模型选择：')
        self.model_label.pack()
        self.model_choice = tk.StringVar()
        self.model_choice.set('MultinomialNB')
        self.model_optionmenu = tk.OptionMenu(self.window, self.model_choice, 'MultinomialNB', 'BernoulliNB')
        self.model_optionmenu.pack()
        self.train_button = tk.Button(self.window, text='训练模型', command=self.train_model)
        self.train_button.pack()

        # Plot selection and display
        self.plot_label = tk.Label(self.window, text='图像选择：')
        self.plot_label.pack()
        self.plot_choice = tk.StringVar()
        self.plot_choice.set('散点图')
        self.plot_optionmenu = tk.OptionMenu(self.window, self.plot_choice, '散点图', 'ROC曲线')
        self.plot_optionmenu.pack()
        self.plot_button = tk.Button(self.window, text='显示图像', command=self.show_plot)
        self.plot_button.pack()

        # Model evaluation
        self.evaluate_label = tk.Label(self.window, text='模型评估：')
        self.evaluate_label.pack()
        self.evaluate_choice = tk.StringVar()
        self.evaluate_choice.set('准确率')
        self.evaluate_optionmenu = tk.OptionMenu(self.window, self.evaluate_choice, '准确率', '精确率', '召回率', 'F1值', 'ROC曲线')
        self.evaluate_optionmenu.pack()
        self.evaluate_button = tk.Button(self.window, text='评估模型', command=self.evaluate_model)
        self.evaluate_button.pack()

        # Preview of the first 10 rows of the dataset
        self.data_show_label = tk.Label(self.window, text='数据集前10行展示：')
        self.data_show_label.pack()
        self.data_show_text = tk.Text(self.window, height=10)
        self.data_show_text.pack()

        self.window.mainloop()

    # ------------------------------------------------------------------
    # State helpers
    # ------------------------------------------------------------------
    def _has(self, attr):
        """Return True when *attr* is set on the instance and is not None."""
        return getattr(self, attr, None) is not None

    def _invalidate_model(self):
        """Forget the fitted model and everything derived from it."""
        for attr in self._MODEL_ATTRS:
            setattr(self, attr, None)

    def _invalidate_split(self):
        """Forget the train/test split and, with it, the fitted model."""
        for attr in self._SPLIT_ATTRS:
            setattr(self, attr, None)
        self._invalidate_model()

    def _ready(self, need_split=False, need_model=False):
        """Check workflow prerequisites, reporting the first missing one.

        Returns True when the dataset (and, if requested, the split and the
        fitted model) is available; otherwise shows an error dialog and
        returns False.
        """
        if not self._has('data'):
            messagebox.showerror('错误', '请先导入数据集！')
            return False
        if need_split and not self._has('X_train'):
            messagebox.showerror('错误', '请先划分训练集测试集！')
            return False
        if need_model and not self._has('clf'):
            messagebox.showerror('错误', '请先训练模型！')
            return False
        return True

    def _show_data(self):
        """Replace the preview box content with the first 10 rows of the data."""
        self.data_show_text.delete('1.0', 'end')
        self.data_show_text.insert('end', self.data.head(10).to_string())

    # ------------------------------------------------------------------
    # Import
    # ------------------------------------------------------------------
    def import_data(self):
        """Ask for a CSV/XLSX file, load it and show its first rows.

        An unsupported extension or an unreadable file is reported in a
        dialog and leaves any previously loaded dataset untouched. A
        successful import discards the old split and model, which no longer
        match the data.
        """
        file_path = filedialog.askopenfilename()
        if not file_path:
            messagebox.showerror('错误', '请选择文件！')
            return

        lowered = file_path.lower()
        if lowered.endswith('.csv'):
            reader = pd.read_csv
        elif lowered.endswith('.xlsx'):
            reader = pd.read_excel
        else:
            messagebox.showerror('错误', '只支持csv和xlsx格式的文件！')
            return

        try:
            loaded = reader(file_path)
        except (OSError, ValueError, ImportError) as exc:
            # ValueError covers pandas parser/decoding errors; ImportError
            # covers a missing Excel engine.
            messagebox.showerror('错误', '文件读取失败：%s' % exc)
            return

        self.data = loaded
        self._invalidate_split()
        self._show_data()

    # ------------------------------------------------------------------
    # Preprocessing
    # ------------------------------------------------------------------
    def _drop_missing(self, data):
        """Return a copy of *data* without rows that contain missing values."""
        return data.dropna().copy()

    def _standardize_numeric(self, data):
        """Return a copy of *data* with numeric feature columns standardised.

        The label column is skipped even when numeric: scaling it would turn
        class labels into continuous values. Nothing is scaled when there is
        no numeric feature column or no row, because ``StandardScaler``
        rejects empty input.
        """
        data = data.copy()
        numeric_cols = [
            col for col in data.select_dtypes(include=np.number).columns
            if col != self.LABEL_COLUMN
        ]
        if numeric_cols and len(data):
            data[numeric_cols] = StandardScaler().fit_transform(data[numeric_cols])
        return data

    def _encode_categorical(self, data):
        """Return a copy of *data* with object columns label-encoded.

        The text column is left as is: it is vectorised at training time and
        must stay textual.
        """
        data = data.copy()
        for col in data.select_dtypes(include='object').columns:
            if col == self.TEXT_COLUMN:
                continue
            data[col] = LabelEncoder().fit_transform(data[col])
        return data

    def _preprocess_data(self, data):
        """Run all preprocessing steps: drop missing, standardise, encode."""
        cleaned = self._drop_missing(data)
        scaled = self._standardize_numeric(cleaned)
        return self._encode_categorical(scaled)

    def _apply_step(self, step):
        """Apply one preprocessing *step* to the dataset and refresh the preview."""
        if not self._ready():
            return
        self.data = step(self.data)
        # The split and model were built from the previous version of the data.
        self._invalidate_split()
        self._show_data()

    def missing_value_process(self):
        """Button handler: drop rows with missing values."""
        self._apply_step(self._drop_missing)

    def standardization_process(self):
        """Button handler: standardise numeric feature columns."""
        self._apply_step(self._standardize_numeric)

    def encoding_process(self):
        """Button handler: label-encode categorical (object) columns."""
        self._apply_step(self._encode_categorical)

    # ------------------------------------------------------------------
    # Train/test split
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_train_ratio(text):
        """Parse the training-set fraction typed by the user.

        Returns the value as a float when it lies strictly between 0 and 1,
        otherwise None (non-numeric text, NaN, or out-of-range numbers).
        """
        try:
            ratio = float(text)
        except (TypeError, ValueError):
            return None
        return ratio if 0.0 < ratio < 1.0 else None

    def train_test_split(self):
        """Button handler: split the dataset using the ratio from the entry box."""
        if not self._ready():
            return

        ratio = self._parse_train_ratio(self.train_test_ratio_entry.get())
        if ratio is None:
            messagebox.showerror('错误', '请输入正确的训练集测试集比例！')
            return

        missing = [
            col for col in (self.TEXT_COLUMN, self.LABEL_COLUMN)
            if col not in self.data.columns
        ]
        if missing:
            messagebox.showerror('错误', '数据集缺少列：%s' % ', '.join(missing))
            return

        X = self.data.drop(self.LABEL_COLUMN, axis=1)
        y = self.data[self.LABEL_COLUMN]
        try:
            # Inside a method this name resolves to the module-level sklearn
            # function, not to this method (class scope is not searched).
            parts = train_test_split(X, y, train_size=ratio, random_state=123)
        except ValueError as exc:
            # e.g. too few rows for the requested ratio
            messagebox.showerror('错误', '划分失败：%s' % exc)
            return

        self.X_train, self.X_test, self.y_train, self.y_test = parts
        self._invalidate_model()
        messagebox.showinfo('提示', '训练集测试集划分完成！')

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def _texts(self, frame):
        """Return the text column of *frame* as strings (missing -> empty)."""
        return frame[self.TEXT_COLUMN].fillna('').astype(str)

    def train_model(self):
        """Button handler: fit the selected Naive Bayes model and predict the test set.

        The fitted vectoriser and the vectorised test set are kept so that
        plotting and evaluation score the same features the model was
        trained on.
        """
        if not self._ready(need_split=True):
            return

        vectorizer = CountVectorizer()
        if self.model_choice.get() == 'MultinomialNB':
            model = MultinomialNB()
        else:
            model = BernoulliNB()

        try:
            train_matrix = vectorizer.fit_transform(self._texts(self.X_train))
            test_matrix = vectorizer.transform(self._texts(self.X_test))
            clf = model.fit(train_matrix, self.y_train)
            y_pred = clf.predict(test_matrix)
        except ValueError as exc:
            # e.g. empty vocabulary or missing values in the labels
            messagebox.showerror('错误', '模型训练失败：%s' % exc)
            return

        self.vectorizer = vectorizer
        self.test_matrix = test_matrix
        self.clf = clf
        self.y_pred = y_pred
        messagebox.showinfo('提示', '模型训练完成！')

    # ------------------------------------------------------------------
    # Plotting and evaluation
    # ------------------------------------------------------------------
    def _positive_label(self):
        """Return the class treated as "spam".

        The label whose text form is ``'1'`` is used when present (the
        encoding the rest of the UI refers to); otherwise the last entry of
        the fitted ``classes_`` is used. NOTE(review): the fallback assumes
        the spam label sorts last - confirm for datasets with other labels.
        """
        classes = list(self.clf.classes_)
        return next((label for label in classes if str(label) == '1'), classes[-1])

    def _roc_inputs(self):
        """Return ``(y_true, y_score)`` for ROC analysis.

        ``y_true`` is 1 for the positive class and 0 otherwise; ``y_score``
        is the predicted probability of the positive class. Returns None,
        after showing an error dialog, when ROC analysis is not applicable.
        """
        classes = list(self.clf.classes_)
        if len(classes) != 2:
            messagebox.showerror('错误', 'ROC曲线仅支持二分类问题！')
            return None

        positive = self._positive_label()
        y_true = (self.y_test == positive).astype(int)
        if y_true.nunique() < 2:
            messagebox.showerror('错误', '测试集中只有一个类别，无法计算ROC曲线！')
            return None

        column = classes.index(positive)
        y_score = self.clf.predict_proba(self.test_matrix)[:, column]
        return y_true, y_score

    def show_plot(self):
        """Button handler: draw the scatter plot or the ROC curve."""
        if not self._ready(need_split=True, need_model=True):
            return

        if self.plot_choice.get() == '散点图':
            # Colour by the index of the predicted class so that non-numeric
            # labels work too; for 0/1 labels this equals the label itself.
            colour_codes = np.searchsorted(self.clf.classes_, self.y_pred)
            plt.scatter(self.X_test.index, self.y_test, c=colour_codes)
            plt.xlabel('Index')
            plt.ylabel('Label')
            plt.title('Scatter Plot')
            plt.show()
            return

        roc = self._roc_inputs()
        if roc is None:
            return
        y_true, y_score = roc
        fpr, tpr, _ = roc_curve(y_true, y_score)
        auc = roc_auc_score(y_true, y_score)
        plt.plot(fpr, tpr, label='ROC Curve (AUC=%0.2f)' % auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend()
        plt.show()

    def evaluate_model(self):
        """Button handler: show the metric selected in the evaluation menu."""
        if not self._ready(need_split=True):
            return
        if not self._has('y_pred'):
            messagebox.showerror('错误', '请先进行模型预测！')
            return

        choice = self.evaluate_choice.get()
        if choice == '准确率':
            # Score on the vectorised test set, i.e. the features the model
            # was fitted on, not on the raw DataFrame.
            score = self.clf.score(self.test_matrix, self.y_test)
            messagebox.showinfo('评估结果', '准确率为：%0.2f' % score)
            return

        if choice in self._REPORT_METRICS:
            key, template = self._REPORT_METRICS[choice]
            # Passing labels guarantees an entry for every trained class.
            report = classification_report(
                self.y_test, self.y_pred,
                labels=self.clf.classes_, output_dict=True,
            )
            value = report[str(self._positive_label())][key]
            messagebox.showinfo('评估结果', template % value)
            return

        roc = self._roc_inputs()
        if roc is None:
            return
        y_true, y_score = roc
        auc = roc_auc_score(y_true, y_score)
        messagebox.showinfo('评估结果', 'ROC曲线下的面积为：%0.2f' % auc)
if __name__ == '__main__':
    # Constructing the application builds the window and runs the Tk main
    # loop, so this call blocks until the window is closed.
    SpamClassification()