Python 数据集处理与机器学习模型训练可视化
Python 数据集处理与机器学习模型训练可视化
本代码使用 Python 的 tkinter 库构建一个 GUI 应用程序,用于处理数据集,包括缺失值检测、填充、数据划分等操作,并使用机器学习模型进行训练和评估,最终将结果可视化呈现。
1. 导入必要的库
import tkinter as tk
import pandas as pd
import numpy as np
from tkinter import *
from tkinter import messagebox # 一个弹窗库
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from tkinter import filedialog
import matplotlib.pyplot as plt
2. 创建窗口
# Build the main application window.
root = Tk()
root.title('数据集处理窗口')  # Window title
root.geometry('2000x1000')  # Initial window size

# Heading label: root is the parent widget, text is the caption shown.
lb1 = Label(root, text='选择数据集')
lb1.place(
    relx=0.1,
    rely=0.1,
    relwidth=0.8,
    relheight=0.1,
)
3. 定义函数
- openfile(): 打开文件并返回文件路径
def openfile():
    """Ask the user to pick a single file and return the selection.

    The dialog offers CSV files as the default filter, which is the
    restriction the original unused ``imgtype`` list only described.

    Returns:
        Whatever ``filedialog.askopenfilename`` returns for the selection.
    """
    # CSV is listed first so it is the default filter; the catch-all entry
    # keeps every previously selectable file selectable.
    csv_filetypes = [('CSV 文件', '*.csv'), ('所有文件', '*')]
    return filedialog.askopenfilename(filetypes=csv_filetypes)
- duru(): 读取数据集并显示基本统计信息
def duru(txt):
    """Read a CSV file and show its summary statistics in the main window.

    Args:
        txt: Path of the CSV file to read. A falsy value (for example the
            result of a dismissed file dialog) makes the call a no-op.
    """
    if not txt:
        # No file was chosen, so there is nothing to read.
        return
    summary = pd.read_csv(txt, engine='python').describe()
    text = Text(root)
    text.place(rely=0.6, relheight=0.4)
    # Convert explicitly so the widget is handed plain text, not a DataFrame.
    text.insert(END, str(summary))
- queshizhi(): 对数据集进行缺失值填充
def queshizhi(txt):
    """Read a CSV file, forward-fill its missing values and display the result.

    Args:
        txt: Path of the CSV file to read. A falsy value (for example the
            result of a dismissed file dialog) makes the call a no-op.
    """
    if not txt:
        # No file was chosen, so there is nothing to read.
        return
    # ffill() propagates the previous non-null value into each gap; it
    # replaces the deprecated fillna(method='ffill') spelling.
    filled = pd.read_csv(txt, engine='python').ffill()
    text = Text(root)
    text.place(rely=0.6, relheight=0.4)
    # Convert explicitly so the widget is handed plain text, not a DataFrame.
    text.insert(END, str(filled))
- qsjc(): 检测数据集中的缺失值数量
def qsjc(txt):
    """Read a CSV file and display the number of missing values per column.

    Args:
        txt: Path of the CSV file to read. A falsy value (for example the
            result of a dismissed file dialog) makes the call a no-op.
    """
    if not txt:
        # No file was chosen, so there is nothing to read.
        return
    # isnull().sum() counts the null cells of every column.
    missing_counts = pd.read_csv(txt, engine='python').isnull().sum()
    text = Text(root)
    text.place(rely=0.6, relheight=0.4)
    # Convert explicitly so the widget is handed plain text, not a Series.
    text.insert(END, str(missing_counts))
4. 创建按钮
- btn1: 选择数据集
# Button that lets the user pick a CSV file and then shows its statistics.
btn1 = Button(
    root,
    text='选择csv数据集',
    command=lambda: duru(openfile()),
)
btn1.place(
    relx=0.3,
    rely=0.2,
    relwidth=0.1,
    relheight=0.1,
)
- btn2: 关闭窗口
# Button that destroys the main window.
btn2 = Button(
    root,
    text='关闭窗口',
    command=root.destroy,
)
btn2.place(
    relx=0.6,
    rely=0.2,
    relwidth=0.1,
    relheight=0.1,
)
- btn3: 缺失值检测
# Button that picks a file and runs the missing-value check (qsjc) on it.
btn3 = Button(
    root,
    text='缺失值检测',
    command=lambda: qsjc(openfile()),
)
btn3.place(
    relx=0.4,
    rely=0.2,
    relwidth=0.1,
    relheight=0.1,
)
- btn4: 缺失值填充
# Button that picks a file and calls queshizhi to fill its missing values.
btn4 = Button(
    root,
    text='缺失值填充',
    command=lambda: queshizhi(openfile()),
)
# Position the button inside the window.
btn4.place(
    relx=0.5,
    rely=0.2,
    relwidth=0.1,
    relheight=0.1,
)
5. 加载数据集并划分训练集和测试集
# Load the training data from 'train.csv' (a relative path).
iris_data = pd.read_csv('train.csv')
# Feature columns used as model inputs.
X = iris_data[['android_id', 'media_id', 'cus_type', 'package']]
# Target column; the double brackets keep it as a one-column DataFrame.
# NOTE(review): scikit-learn classifiers generally expect a 1-D target —
# confirm whether iris_data['label'] was intended.
y = iris_data[['label']]
# Hold out 30% of the rows as the test set; random_state=10 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
6. 定义训练模型并显示结果的函数
- func1: 训练逻辑回归模型
def func1():
    """Train a logistic regression model and display its test accuracy.

    Fits on the module-level ``X_train``/``y_train``, predicts ``X_test``,
    scores the predictions against ``y_test`` and places a label showing the
    accuracy in the main window.
    """
    my_model = LogisticRegression()
    # The target is selected as a one-column frame; flatten it because the
    # classifier expects a 1-D target.
    my_model.fit(X_train, np.ravel(y_train))
    y_pred = my_model.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    # Build the caption as one string instead of handing Tk a tuple, so the
    # displayed text does not depend on Tcl's list formatting.
    a1 = Label(root, text=f'逻辑回归模型的准确率为: {score}', font='微软雅黑 -20')
    a1.place(relx=0.2, rely=0.5, relwidth=0.5, relheight=0.1)
- func2: 训练 K 近邻模型
def func2():
    """Train a k-nearest-neighbours classifier and display its test accuracy.

    Fits on the module-level ``X_train``/``y_train``, predicts ``X_test``,
    scores the predictions against ``y_test`` and places a label showing the
    accuracy in the main window.
    """
    my_model = KNeighborsClassifier()
    # The target is selected as a one-column frame; flatten it because the
    # classifier expects a 1-D target.
    my_model.fit(X_train, np.ravel(y_train))
    y_pred = my_model.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    # Build the caption as one string instead of handing Tk a tuple, so the
    # displayed text does not depend on Tcl's list formatting.
    a2 = Label(root, text=f'K近邻模型的准确率为: {score}', font='微软雅黑 -20')
    a2.place(relx=0.2, rely=0.6, relwidth=0.5, relheight=0.1)
- func3: 训练高斯朴素贝叶斯模型
def func3():
    """Train a Gaussian naive Bayes classifier and display its test accuracy.

    Fits on the module-level ``X_train``/``y_train``, predicts ``X_test``,
    scores the predictions against ``y_test`` and places a label showing the
    accuracy in the main window.
    """
    my_model = GaussianNB()
    # The target is selected as a one-column frame; flatten it because the
    # classifier expects a 1-D target.
    my_model.fit(X_train, np.ravel(y_train))
    y_pred = my_model.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    # Build the caption as one string instead of handing Tk a tuple, so the
    # displayed text does not depend on Tcl's list formatting.
    a3 = Label(root, text=f'高斯模型的准确率为: {score}', font='微软雅黑 -20')
    a3.place(relx=0.2, rely=0.7, relwidth=0.5, relheight=0.1)
7. 创建训练模型的按钮
- btn5: 逻辑回归模型
# Button that trains and scores the logistic regression model.
btn5 = Button(
    root,
    text='逻辑回归模型',
    command=lambda: func1(),
)
btn5.place(
    relx=0.35,
    rely=0.4,
    relwidth=0.1,
    relheight=0.1,
)
- btn6: K 近邻模型
# Button that trains and scores the k-nearest-neighbours model.
btn6 = Button(
    root,
    text='K近邻模型',
    command=lambda: func2(),
)
btn6.place(
    relx=0.45,
    rely=0.4,
    relwidth=0.1,
    relheight=0.1,
)
- btn7: 高斯模型
# Button that trains and scores the Gaussian naive Bayes model.
btn7 = Button(
    root,
    text='高斯模型',
    command=lambda: func3(),
)
btn7.place(
    relx=0.55,
    rely=0.4,
    relwidth=0.1,
    relheight=0.1,
)
8. 数据可视化
# Build the data visualisation as a pie chart
data = [50, 30, 20] # Slice values
labels = ['A', 'B', 'C'] # Slice labels
plt.pie(data, labels=labels, autopct='%.2f%%') # Draw the pie; autopct prints each share with two decimals
# NOTE(review): plt.show() usually blocks until the figure is closed, so the
# statements after it may not run until then — confirm with the backend in use.
plt.show() # Display the figure
9. 运行程序
root.mainloop()  # Enter the Tk event loop so the window reacts to user actions
注意: 这段代码只提供一个基本框架,您可以根据实际需求进行修改和扩展,例如添加数据标准化、特征工程等功能。
代码示例:
import tkinter as tk
import pandas as pd
import numpy as np
from tkinter import *
from tkinter import messagebox
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from tkinter import filedialog
import matplotlib.pyplot as plt
# Build the main application window with its title and initial size.
root = Tk()
root.title('数据集处理窗口')
root.geometry('2000x1000')

# Heading label placed near the top of the window.
lb1 = Label(root, text='选择数据集')
lb1.place(
    relx=0.1,
    rely=0.1,
    relwidth=0.8,
    relheight=0.1,
)
def openfile():
    """Ask the user to pick a single file and return the selection.

    The dialog offers CSV files as the default filter, which is the
    restriction the original unused ``imgtype`` list only described.

    Returns:
        Whatever ``filedialog.askopenfilename`` returns for the selection.
    """
    # CSV is listed first so it is the default filter; the catch-all entry
    # keeps every previously selectable file selectable.
    csv_filetypes = [('CSV 文件', '*.csv'), ('所有文件', '*')]
    return filedialog.askopenfilename(filetypes=csv_filetypes)
def duru(txt):
    """Read a CSV file and show its summary statistics in the main window.

    Args:
        txt: Path of the CSV file to read. A falsy value (for example the
            result of a dismissed file dialog) makes the call a no-op.
    """
    if not txt:
        # No file was chosen, so there is nothing to read.
        return
    summary = pd.read_csv(txt, engine='python').describe()
    text = Text(root)
    text.place(rely=0.6, relheight=0.4)
    # Convert explicitly so the widget is handed plain text, not a DataFrame.
    text.insert(END, str(summary))
def queshizhi(txt):
    """Read a CSV file, forward-fill its missing values and display the result.

    Args:
        txt: Path of the CSV file to read. A falsy value (for example the
            result of a dismissed file dialog) makes the call a no-op.
    """
    if not txt:
        # No file was chosen, so there is nothing to read.
        return
    # ffill() propagates the previous non-null value into each gap; it
    # replaces the deprecated fillna(method='ffill') spelling.
    filled = pd.read_csv(txt, engine='python').ffill()
    text = Text(root)
    text.place(rely=0.6, relheight=0.4)
    # Convert explicitly so the widget is handed plain text, not a DataFrame.
    text.insert(END, str(filled))
def qsjc(txt):
    """Read a CSV file and display the number of missing values per column.

    Args:
        txt: Path of the CSV file to read. A falsy value (for example the
            result of a dismissed file dialog) makes the call a no-op.
    """
    if not txt:
        # No file was chosen, so there is nothing to read.
        return
    # isnull().sum() counts the null cells of every column.
    missing_counts = pd.read_csv(txt, engine='python').isnull().sum()
    text = Text(root)
    text.place(rely=0.6, relheight=0.4)
    # Convert explicitly so the widget is handed plain text, not a Series.
    text.insert(END, str(missing_counts))
# Pick a CSV file and show its summary statistics.
btn1 = Button(
    root,
    text='选择csv数据集',
    command=lambda: duru(openfile()),
)
btn1.place(relx=0.3, rely=0.2, relwidth=0.1, relheight=0.1)

# Close the application window.
btn2 = Button(
    root,
    text='关闭窗口',
    command=root.destroy,
)
btn2.place(relx=0.6, rely=0.2, relwidth=0.1, relheight=0.1)

# Pick a CSV file and show its per-column missing-value counts.
btn3 = Button(
    root,
    text='缺失值检测',
    command=lambda: qsjc(openfile()),
)
btn3.place(relx=0.4, rely=0.2, relwidth=0.1, relheight=0.1)

# Pick a CSV file and show it with missing values filled.
btn4 = Button(
    root,
    text='缺失值填充',
    command=lambda: queshizhi(openfile()),
)
btn4.place(relx=0.5, rely=0.2, relwidth=0.1, relheight=0.1)
# Load the training data from 'train.csv' (a relative path).
iris_data = pd.read_csv('train.csv')
# Feature columns used as model inputs.
X = iris_data[['android_id', 'media_id', 'cus_type', 'package']]
# Target column; the double brackets keep it as a one-column DataFrame.
# NOTE(review): scikit-learn classifiers generally expect a 1-D target —
# confirm whether iris_data['label'] was intended.
y = iris_data[['label']]
# Hold out 30% of the rows as the test set; random_state=10 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
def func1():
    """Train a logistic regression model and display its test accuracy.

    Fits on the module-level ``X_train``/``y_train``, predicts ``X_test``,
    scores the predictions against ``y_test`` and places a label showing the
    accuracy in the main window.
    """
    my_model = LogisticRegression()
    # The target is selected as a one-column frame; flatten it because the
    # classifier expects a 1-D target.
    my_model.fit(X_train, np.ravel(y_train))
    y_pred = my_model.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    # Build the caption as one string instead of handing Tk a tuple, so the
    # displayed text does not depend on Tcl's list formatting.
    a1 = Label(root, text=f'逻辑回归模型的准确率为: {score}', font='微软雅黑 -20')
    a1.place(relx=0.2, rely=0.5, relwidth=0.5, relheight=0.1)
def func2():
    """Train a k-nearest-neighbours classifier and display its test accuracy.

    Fits on the module-level ``X_train``/``y_train``, predicts ``X_test``,
    scores the predictions against ``y_test`` and places a label showing the
    accuracy in the main window.
    """
    my_model = KNeighborsClassifier()
    # The target is selected as a one-column frame; flatten it because the
    # classifier expects a 1-D target.
    my_model.fit(X_train, np.ravel(y_train))
    y_pred = my_model.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    # Build the caption as one string instead of handing Tk a tuple, so the
    # displayed text does not depend on Tcl's list formatting.
    a2 = Label(root, text=f'K近邻模型的准确率为: {score}', font='微软雅黑 -20')
    a2.place(relx=0.2, rely=0.6, relwidth=0.5, relheight=0.1)
def func3():
    """Train a Gaussian naive Bayes classifier and display its test accuracy.

    Fits on the module-level ``X_train``/``y_train``, predicts ``X_test``,
    scores the predictions against ``y_test`` and places a label showing the
    accuracy in the main window.
    """
    my_model = GaussianNB()
    # The target is selected as a one-column frame; flatten it because the
    # classifier expects a 1-D target.
    my_model.fit(X_train, np.ravel(y_train))
    y_pred = my_model.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    # Build the caption as one string instead of handing Tk a tuple, so the
    # displayed text does not depend on Tcl's list formatting.
    a3 = Label(root, text=f'高斯模型的准确率为: {score}', font='微软雅黑 -20')
    a3.place(relx=0.2, rely=0.7, relwidth=0.5, relheight=0.1)
# One button per model; each trains the model and shows its accuracy.
btn5 = Button(
    root,
    text='逻辑回归模型',
    command=lambda: func1(),
)
btn5.place(relx=0.35, rely=0.4, relwidth=0.1, relheight=0.1)

btn6 = Button(
    root,
    text='K近邻模型',
    command=lambda: func2(),
)
btn6.place(relx=0.45, rely=0.4, relwidth=0.1, relheight=0.1)

btn7 = Button(
    root,
    text='高斯模型',
    command=lambda: func3(),
)
btn7.place(relx=0.55, rely=0.4, relwidth=0.1, relheight=0.1)
# Build the data visualisation as a pie chart
data = [50, 30, 20] # Slice values
labels = ['A', 'B', 'C'] # Slice labels
plt.pie(data, labels=labels, autopct='%.2f%%') # Draw the pie; autopct prints each share with two decimals
# NOTE(review): plt.show() usually blocks until the figure is closed, so the
# statements after it may not run until then — confirm with the backend in use.
plt.show() # Display the figure
root.mainloop()  # Enter the Tk event loop so the window reacts to user actions
可视化示例:
import matplotlib.pyplot as plt
data = [50, 30, 20] # Slice values
labels = ['A', 'B', 'C'] # Slice labels
plt.pie(data, labels=labels, autopct='%.2f%%') # Draw the pie; autopct prints each share with two decimals
plt.show() # Display the figure
这将生成一个简单的饼图,其中三个部分分别代表数据 A、B 和 C。
总结:
这段代码提供了一个简单的数据集处理和机器学习模型训练的 GUI 应用程序示例。您可以根据实际需求进行修改和扩展,以实现更复杂的功能。
原文地址: https://www.cveoy.top/t/topic/okQa 著作权归作者所有。请勿转载和采集!