乳腺癌分类:使用特征筛选和XGBoost模型实现高精度预测
# Import the required packages.
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Read the breast cancer dataset.
data = pd.read_csv('./breast_cancer.csv')

# Separate the input features from the labels: every column except the last
# is treated as a feature, the last column as the label.
# NOTE(review): Lasso needs numeric inputs and targets; presumably the CSV is
# already numerically encoded -- confirm against the data file.
X = data.iloc[:, 0:-1].values
y = data.iloc[:, -1].values

# Five-fold cross-validation split (shuffled, fixed seed for repeatability).
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Feature selector: an L1-penalised linear model; features whose coefficient
# stays non-zero after fitting are considered selected.
lasso = Lasso(alpha=0.001)

# Column indices of the features selected in at least one fold.
important_features = []

# Points of the "feature performance" curve: fold index on the x axis, the
# Lasso's score on that fold's training data on the y axis.
x_axis = []
y_axis = []

# Run the feature selection on every fold.
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    # Training portion of this fold.
    X_train = X[train_index]
    y_train = y[train_index]
    # Held-out portion of this fold.
    X_test = X[test_index]
    y_test = y[test_index]

    # Feature selection: fit the Lasso on this fold's training data.
    lasso.fit(X_train, y_train)

    # Record the *column indices* with a non-zero coefficient. Indices (not
    # data rows) are required so that they can be merged through a set and
    # later used to slice the feature axis of X.
    fold_features = np.flatnonzero(lasso.coef_).tolist()

    x_axis.append(fold)
    y_axis.append(lasso.score(X_train, y_train))

    # Merge this fold's selection into the overall list; sorting keeps the
    # column order stable from run to run.
    important_features = sorted(set(important_features).union(fold_features))

# Fail with a clear message instead of handing XGBoost a zero-column matrix.
if not important_features:
    raise ValueError(
        'Lasso selected no features in any fold; lower alpha or check the data.'
    )

# Train the XGBoost model on the selected feature columns.
# X_train / y_train / X_test / y_test are the arrays left over from the last
# fold of the loop above, so the model is fitted and scored on that one split.
# NOTE(review): important_features is the union over all folds, and the other
# folds' training sets contain this split's test rows, so the selection step
# has seen the evaluation data. Consider selecting and scoring per fold.
model = XGBClassifier()
model.fit(X_train[:, important_features], y_train)

# Compute the model's accuracy on the held-out rows of the last fold.
acc = accuracy_score(y_test, model.predict(X_test[:, important_features]))

# Plot the feature performance curve.
plt.plot(x_axis, y_axis)
plt.title('Feature Performance Curve')
plt.show()

print('Accuracy', acc)
原文地址: https://www.cveoy.top/t/topic/liMP 著作权归作者所有。请勿转载和采集!