基于KNN的AdaBoost算法实现:蘑菇分类数据集实战
以下给出以KNN算法为基分类器的AdaBoost算法的完整Python实现,并在蘑菇分类数据集上进行训练与测试:
- 导入必要的库
import copy

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
- 加载数据集
# Load the mushroom dataset from CSV into a DataFrame.
# NOTE(review): the file is assumed to contain a 'class' target column and
# categorical string features (both are encoded/split below) — confirm
# against the actual mushrooms.csv.
data = pd.read_csv('mushrooms.csv')
- 数据预处理
# Encode every categorical column as integer codes (re-fit per column).
encoder = LabelEncoder()
for column in data.columns:
    data[column] = encoder.fit_transform(data[column])

# Hold out 30% of the rows as a test set; fixed seed for reproducibility.
features = data.drop('class', axis=1)
target = data['class']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42
)
- 定义AdaBoost分类器
class AdaBoostClassifier:
    """Binary AdaBoost over an arbitrary base classifier.

    The base classifier must implement ``fit(X, y)`` (optionally accepting a
    ``sample_weight`` keyword) and ``predict(X)``.  Labels may be any two
    distinct values (e.g. 0/1 from a LabelEncoder): they are mapped to -1/+1
    internally for the exponential weight update, and ``predict`` returns the
    original label values, not raw signs.
    """

    def __init__(self, base_classifier, n_estimators=50):
        self.base_classifier = base_classifier  # template; copied each round
        self.n_estimators = n_estimators
        self.sample_weights = None
        self.classifiers = []   # fitted base learners, one per boosting round
        self.alpha = []         # per-learner vote weights
        self.classes_ = None    # the two original label values seen in fit()

    def fit(self, X, y):
        """Fit ``n_estimators`` weighted rounds of the base classifier.

        Raises ValueError if ``y`` does not contain exactly two classes.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        if len(self.classes_) != 2:
            raise ValueError("AdaBoostClassifier supports binary labels only")
        # Map arbitrary binary labels to -1/+1 so exp(-alpha*y*pred) is a
        # valid weight update (the original code assumed +-1 implicitly,
        # which silently breaks with 0/1 encoded labels).
        y_signed = np.where(y == self.classes_[1], 1.0, -1.0)

        n_samples = X.shape[0]
        self.sample_weights = np.ones(n_samples) / n_samples
        self.classifiers = []
        self.alpha = []
        eps = 1e-10  # guards log()/division when the round error is 0 or 1

        for _ in range(self.n_estimators):
            # Fit a *fresh copy* each round; the original refit one shared
            # object, so every stored "classifier" was the same final model.
            clf = copy.deepcopy(self.base_classifier)
            try:
                clf.fit(X, y, sample_weight=self.sample_weights)
            except TypeError:
                # Base learner's fit() has no sample_weight parameter
                # (e.g. sklearn KNN): fall back to an unweighted fit.
                clf.fit(X, y)
            pred_signed = np.where(
                np.asarray(clf.predict(X)) == self.classes_[1], 1.0, -1.0
            )

            error = np.sum((pred_signed != y_signed) * self.sample_weights)
            error = np.clip(error, eps, 1 - eps)
            alpha = 0.5 * np.log((1 - error) / error)

            self.sample_weights *= np.exp(-alpha * y_signed * pred_signed)
            self.sample_weights /= np.sum(self.sample_weights)

            self.classifiers.append(clf)
            self.alpha.append(alpha)

    def predict(self, X):
        """Return class labels (the values seen in fit) via weighted vote."""
        X = np.asarray(X)
        votes = np.zeros((X.shape[0], len(self.classifiers)))
        for i, clf in enumerate(self.classifiers):
            votes[:, i] = np.where(
                np.asarray(clf.predict(X)) == self.classes_[1], 1.0, -1.0
            )
        score = np.dot(votes, self.alpha)
        # Non-negative weighted sum -> positive class; map back to labels so
        # callers can compare directly against encoded targets (0/1).
        return np.where(score >= 0, self.classes_[1], self.classes_[0])
- 定义KNN分类器
class KNNClassifier:
    """Minimal k-nearest-neighbour classifier with optional sample weights.

    Labels must be non-negative integers (np.bincount is used for voting).
    When ``sample_weight`` is given to fit(), neighbours vote with their
    training weight instead of one-vote-each, which is what makes this
    usable as an AdaBoost base learner.
    """

    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors
        self.X = None              # training features, (n_samples, n_features)
        self.y = None              # training labels, non-negative ints
        self.sample_weight = None  # optional per-sample vote weights

    def fit(self, X, y, sample_weight=None):
        """Memorize the training data; returns self so callers can chain."""
        # np.asarray so pandas DataFrames/Series work too: the original
        # relied on positional indexing that breaks on a pandas Series.
        self.X = np.asarray(X, dtype=float)
        self.y = np.asarray(y)
        if sample_weight is None:
            self.sample_weight = None  # unweighted majority vote (old behavior)
        else:
            self.sample_weight = np.asarray(sample_weight, dtype=float)
        # The original returned None, breaking `clf = base.fit(...)` call
        # sites such as the AdaBoost loop in this file.
        return self

    def predict(self, X):
        """Predict each row's label by (weighted) vote of its k nearest points."""
        X = np.asarray(X, dtype=float)
        n_samples = X.shape[0]
        y_pred = np.zeros(n_samples, dtype=self.y.dtype)
        for i in range(n_samples):
            # Squared Euclidean distance: argsort order equals sqrt's order.
            distances = np.sum(np.square(self.X - X[i, :]), axis=1)
            indices = np.argsort(distances)[:self.n_neighbors]
            votes = self.y[indices]
            if self.sample_weight is None:
                counts = np.bincount(votes)
            else:
                # Each neighbour votes with its training sample weight.
                counts = np.bincount(votes, weights=self.sample_weight[indices])
            y_pred[i] = np.argmax(counts)
        return y_pred
- 训练和测试模型
# Train and evaluate AdaBoost with the hand-rolled KNN as base learner.
# BUG FIX: the original passed sklearn's KNeighborsClassifier, whose fit()
# does not accept a sample_weight keyword, so AdaBoostClassifier.fit would
# raise TypeError. The custom KNNClassifier defined above (which accepts
# sample_weight) is the intended base learner of this article.
knn = KNNClassifier(n_neighbors=5)
ada = AdaBoostClassifier(base_classifier=knn, n_estimators=50)
ada.fit(X_train.values, y_train.values)
ada_pred = ada.predict(X_test.values)
# Fraction of test rows predicted correctly.
ada_accuracy = np.sum(ada_pred == y_test.values) / len(y_test)
print('AdaBoost Accuracy:', ada_accuracy)
输出结果如下:
AdaBoost Accuracy: 1.0
AdaBoost通过在每一轮提高被错分样本的权重并加权组合多个基分类器来提升整体性能。需要说明的是,AdaBoost并不保证100%的准确率;这里能达到1.0,主要是因为蘑菇数据集特征区分度很高、本身极易分类。
原文地址: https://www.cveoy.top/t/topic/ohPF 著作权归作者所有。请勿转载和采集!