基于聚类的欠采样方法 Python 代码

以下是基于聚类的欠采样方法的Python代码示例：

import numpy as np
from sklearn.cluster import KMeans

def cluster_based_undersampling(X, y, k=2):
    """
    Undersample the negative class by keeping one representative per cluster.

    The samples labelled 0 are grouped into ``k`` K-Means clusters; from each
    cluster the sample with the smallest Euclidean distance to the cluster
    centre is kept. Every sample labelled 1 is kept unchanged.

    :param X: feature matrix, one row per sample (converted with ``np.asarray``)
    :param y: label vector aligned with the rows of ``X``; 1 marks positive
              samples and 0 marks negative samples
    :param k: number of clusters, i.e. the maximum number of negative samples kept
    :return: ``(X_resampled, y_resampled)`` - the representative negative
             samples followed by all positive samples
    :raises ValueError: if ``k`` is smaller than 1 or larger than the number
                        of negative samples
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # Remember where each class lives in the ORIGINAL arrays, so positions
    # found inside the negative subset can be translated back later.
    negative_indices = np.flatnonzero(y == 0)
    positive_indices = np.flatnonzero(y == 1)

    if k < 1 or k > negative_indices.size:
        raise ValueError(
            f"k must be between 1 and the number of negative samples "
            f"({negative_indices.size}), got {k}"
        )

    # Cluster the negative samples only
    negatives = X[negative_indices]
    kmeans = KMeans(n_clusters=k, random_state=0).fit(negatives)

    # For each cluster keep the member closest to the cluster centre
    chosen_indices = []
    for cluster_id in range(k):
        members = np.flatnonzero(kmeans.labels_ == cluster_id)
        if members.size == 0:
            # No sample was assigned to this cluster (possible e.g. when the
            # data has fewer distinct points than k); nothing to pick.
            continue
        center = kmeans.cluster_centers_[cluster_id]
        distances = np.linalg.norm(negatives[members] - center, axis=1)
        nearest_in_subset = members[np.argmin(distances)]
        # `nearest_in_subset` is a row of `negatives`, not of `X`: map it back
        # to the row of the original arrays before storing it.
        chosen_indices.append(negative_indices[nearest_in_subset])

    # Representative negatives first, then all positives
    indices = np.concatenate(
        [np.asarray(chosen_indices, dtype=np.intp), positive_indices]
    )
    X_resampled = X[indices]
    y_resampled = y[indices]

    return X_resampled, y_resampled

使用示例：

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Generate an imbalanced binary dataset (roughly 90% class 0, 10% class 1);
# undersampling the negative class only makes sense when it is the majority.
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5,
                           weights=[0.9, 0.1], random_state=0)

# Split into training and test sets, keeping the class ratio in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0)

# Cluster-based undersampling: request one cluster per positive training
# sample, so the number of representatives matches the positive count
# instead of collapsing the negative class to just two samples.
n_positive = int((y_train == 1).sum())
X_resampled, y_resampled = cluster_based_undersampling(X_train, y_train, k=n_positive)

# Train the model on the resampled training set
clf = LogisticRegression(random_state=0).fit(X_resampled, y_resampled)

# Predict on the untouched test set and report accuracy
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))