以下是使用 numpy 和 scipy.stats 实现聚类性能指标 DBI、DI、CHI、SI 的代码:

import numpy as np
from scipy.stats import entropy

def calc_centroids(X, labels):
    """Return the centroid (per-feature mean) of each cluster.

    Assumes cluster labels are the integers 0 .. k-1; row i of the
    result is the centroid of cluster i.
    """
    cluster_count = len(np.unique(labels))
    return np.vstack(
        [X[labels == cluster].mean(axis=0) for cluster in range(cluster_count)]
    )

def calc_distance(X, centroids):
    """Return the (n_samples, n_clusters) matrix of Euclidean distances
    from every sample to every centroid.
    """
    # Broadcast (n, 1, d) - (1, k, d) -> (n, k, d), then reduce the
    # feature axis with the Euclidean norm; equivalent to the per-cluster
    # column-by-column computation, done in one vectorized expression.
    return np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)

def calc_intra_cluster_distance(X, labels, centroids):
    """Return, per cluster, the mean Euclidean distance of its members
    to their own centroid (a scatter measure, one value per cluster).
    """
    scatter = [
        np.linalg.norm(X[labels == cluster] - centroid, axis=1).mean()
        for cluster, centroid in enumerate(centroids)
    ]
    return np.asarray(scatter)

def calc_inter_cluster_distance(centroids):
    """Return the symmetric (k, k) matrix of pairwise Euclidean distances
    between centroids; the diagonal is zero.
    """
    # Pairwise differences via broadcasting, then norm over the feature
    # axis — one pass instead of the explicit upper-triangle double loop.
    pairwise_diff = centroids[:, None, :] - centroids[None, :, :]
    return np.linalg.norm(pairwise_diff, axis=2)

def calc_silhouette_index(X, labels):
    """Simplified (centroid-based) silhouette coefficient, averaged over samples.

    For each sample: a_i is the mean distance of its cluster's members to
    their centroid, b_i the distance from that centroid to the nearest
    *other* centroid; s_i = (b_i - a_i) / max(a_i, b_i).  Returns the mean
    s_i in [-1, 1].  NOTE(review): this is the centroid approximation, not
    the full pairwise silhouette definition.

    Bug fixed: the inter-centroid distance matrix has a zero diagonal, so
    taking the plain row minimum always yielded b_i = 0 and the score was
    identically -1.  The sample's own cluster is now excluded from the min.
    """
    uniq = np.unique(labels)
    centroids = np.vstack([X[labels == c].mean(axis=0) for c in uniq])
    intra = np.array([
        np.linalg.norm(X[labels == c] - centroids[i], axis=1).mean()
        for i, c in enumerate(uniq)
    ])
    inter = np.linalg.norm(centroids[:, None, :] - centroids[None, :, :], axis=2)
    # Mask the zero diagonal so the row minimum picks the nearest OTHER cluster.
    np.fill_diagonal(inter, np.inf)
    cluster_index = {c: i for i, c in enumerate(uniq)}
    scores = np.empty(X.shape[0])
    for i in range(X.shape[0]):
        ci = cluster_index[labels[i]]
        a_i = intra[ci]
        b_i = inter[ci].min()
        denom = max(a_i, b_i)
        # Degenerate cluster (all points identical, single other centroid
        # coincident): define the score as 0 instead of dividing by zero.
        scores[i] = 0.0 if denom == 0 else (b_i - a_i) / denom
    return np.mean(scores)

def calc_davies_bouldin_index(X, labels):
    """Davies-Bouldin index (lower is better).

    DB = (1/k) * sum_i max_{j != i} (s_i + s_j) / d_ij, where s_i is the
    mean distance of cluster i's members to their centroid and d_ij the
    distance between centroids i and j.

    Bugs fixed:
    - the original divided by the full distance-matrix row, whose diagonal
      entry is zero, so every per-cluster value was inf;
    - DBI averages the per-cluster worst-case ratio over j != i; the
      original took the max of per-cluster means (including j == i).
    """
    uniq = np.unique(labels)
    k = len(uniq)
    centroids = np.vstack([X[labels == c].mean(axis=0) for c in uniq])
    s = np.array([
        np.linalg.norm(X[labels == c] - centroids[i], axis=1).mean()
        for i, c in enumerate(uniq)
    ])
    d = np.linalg.norm(centroids[:, None, :] - centroids[None, :, :], axis=2)
    worst = np.empty(k)
    for i in range(k):
        # Worst (largest) similarity ratio against every OTHER cluster.
        worst[i] = max((s[i] + s[j]) / d[i, j] for j in range(k) if j != i)
    return np.mean(worst)

def calc_dunn_index(X, labels):
    """Dunn index (higher is better): smallest distance between two
    different centroids divided by the largest intra-cluster scatter
    (mean member-to-centroid distance).  NOTE(review): this is the
    centroid-based variant; the classical Dunn uses pairwise point
    distances and cluster diameters.

    Bug fixed: the original computed min(intra) / max(inter) — the
    ratio was inverted (separation belongs in the numerator, scatter
    in the denominator).
    """
    uniq = np.unique(labels)
    centroids = np.vstack([X[labels == c].mean(axis=0) for c in uniq])
    intra = np.array([
        np.linalg.norm(X[labels == c] - centroids[i], axis=1).mean()
        for i, c in enumerate(uniq)
    ])
    inter = np.linalg.norm(centroids[:, None, :] - centroids[None, :, :], axis=2)
    # Exclude the zero diagonal so min() is the nearest pair of distinct clusters.
    np.fill_diagonal(inter, np.inf)
    max_scatter = intra.max()
    if max_scatter == 0:
        # Every cluster collapsed to a point: perfectly compact.
        return np.inf
    return inter.min() / max_scatter

def calc_calinski_harabasz_index(X, labels):
    """Calinski-Harabasz index (higher is better).

    CH = (B / (k - 1)) / (W / (n - k)), where
      B = sum_i n_i * ||c_i - c||^2   (between-cluster dispersion about the
                                       overall mean c),
      W = sum_i sum_{x in i} ||x - c_i||^2  (within-cluster dispersion).

    Bug fixed: the original used sums of (non-squared) mean intra distances
    and pairwise centroid distances, which is not the CH statistic.
    """
    uniq = np.unique(labels)
    k, n = len(uniq), X.shape[0]
    overall_mean = X.mean(axis=0)
    between = 0.0
    within = 0.0
    for c in uniq:
        members = X[labels == c]
        centroid = members.mean(axis=0)
        between += len(members) * np.sum((centroid - overall_mean) ** 2)
        within += np.sum((members - centroid) ** 2)
    if within == 0:
        # All clusters are single points / duplicates: follow scikit-learn's
        # convention and return 1.0 instead of dividing by zero.
        return 1.0
    return (between / (k - 1)) * ((n - k) / within)

接下来是可视化代码,使用KMeans进行聚类,分别计算不同k值下的4种性能指标并绘制曲线:

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Bug fixed: `X = # load data here` was a syntax error — the published file
# did not parse.  Seeded random points stand in so the script runs end to
# end; replace with your real dataset.
rng = np.random.default_rng(0)
X = rng.random((200, 2))  # TODO: load real data here

# Run KMeans for each k and collect the four metrics defined above.
k_range = range(2, 11)
dbi_scores = []
di_scores = []
chi_scores = []
si_scores = []

for k in k_range:
    labels = KMeans(n_clusters=k, random_state=0).fit(X).labels_
    dbi_scores.append(calc_davies_bouldin_index(X, labels))
    di_scores.append(calc_dunn_index(X, labels))
    chi_scores.append(calc_calinski_harabasz_index(X, labels))
    si_scores.append(calc_silhouette_index(X, labels))

# One curve per metric, k on the x-axis.
fig, ax = plt.subplots()
for scores, name in [(dbi_scores, 'DBI'), (di_scores, 'DI'),
                     (chi_scores, 'CHI'), (si_scores, 'SI')]:
    ax.plot(k_range, scores, label=name)
ax.legend()
plt.show()

最后使用肘部法选择最佳k值:

# Elbow method: plot KMeans inertia (within-cluster sum of squares) for
# each k; the bend ("elbow") in the curve marks a good choice of k.
inertias = [
    KMeans(n_clusters=k, random_state=0).fit(X).inertia_ for k in k_range
]

fig, ax = plt.subplots()
ax.plot(k_range, inertias, 'bx-')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method For Optimal k')
plt.show()

根据肘部法选择最佳k值即可

任务要求:实现聚类性能指标 DBI、DI、CHI、SI(只能使用 numpy 和 scipy.stats),并将超参数调优过程可视化——以不同的 k 值为横坐标、性能指标为纵坐标,画出聚类模型性能曲线(2 ≤ k ≤ 10,4 种性能指标共 4 条曲线),最后利用肘部法选择最佳 k 值。

原文地址: https://www.cveoy.top/t/topic/hlnk 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录