实现聚类性能指标 DBI、DI、CHI、SI(只能使用 numpy 和 scipy.stats),并将超参数调优过程可视化:以不同的 k 值为横坐标、性能指标为纵坐标,绘制聚类模型性能曲线(2≤k≤10,4 种性能指标共 4 条曲线),并利用肘部法选择最佳 k 值。
以下是使用 numpy 和 scipy.stats 实现聚类性能指标 DBI、DI、CHI、SI 的代码:
import numpy as np
from scipy.stats import entropy
def calc_centroids(X, labels):
    """Compute the centroid (feature-wise mean) of every cluster.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data matrix.
    labels : ndarray of shape (n_samples,)
        Integer cluster assignments, assumed to be the consecutive
        values 0 .. k-1 (centroids are indexed by label value).

    Returns
    -------
    ndarray of shape (n_clusters, n_features)
        Row ``c`` is the mean of all samples with label ``c``.
    """
    k = len(np.unique(labels))
    return np.array([X[labels == c].mean(axis=0) for c in range(k)])
def calc_distance(X, centroids):
    """Euclidean distance from every sample to every centroid.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    centroids : ndarray of shape (n_clusters, n_features)

    Returns
    -------
    ndarray of shape (n_samples, n_clusters)
        Entry ``(i, c)`` is ``||X[i] - centroids[c]||``.
    """
    # broadcast to (n_samples, n_clusters, n_features), then norm over features
    diffs = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    return np.linalg.norm(diffs, axis=2)
def calc_intra_cluster_distance(X, labels, centroids):
    """Mean distance from each cluster's members to that cluster's centroid.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    labels : ndarray of shape (n_samples,)
        Integer labels indexing rows of ``centroids``.
    centroids : ndarray of shape (n_clusters, n_features)

    Returns
    -------
    ndarray of shape (n_clusters,)
        Entry ``c`` is the average member-to-centroid distance of cluster ``c``.
    """
    return np.array([
        np.linalg.norm(X[labels == c] - centroids[c], axis=1).mean()
        for c in range(centroids.shape[0])
    ])
def calc_inter_cluster_distance(centroids):
    """Symmetric matrix of pairwise centroid distances.

    Parameters
    ----------
    centroids : ndarray of shape (n_clusters, n_features)

    Returns
    -------
    ndarray of shape (n_clusters, n_clusters)
        Entry ``(i, j)`` is ``||centroids[i] - centroids[j]||``;
        the diagonal is zero.
    """
    # pairwise differences via broadcasting, then norm over the feature axis
    diffs = centroids[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    return np.linalg.norm(diffs, axis=2)
def calc_silhouette_index(X, labels):
    """Simplified (centroid-based) silhouette score, averaged over samples.

    For sample i: a_i is the mean intra-cluster distance of i's own cluster,
    b_i is the distance from i's centroid to the nearest *other* centroid,
    and s_i = (b_i - a_i) / max(a_i, b_i).

    Bug fixed: ``np.min`` over the row of the inter-cluster distance matrix
    previously included the zero self-distance on the diagonal, so b_i was
    always 0 and the score was always -1. The diagonal is now masked out.
    A 0/0 guard is also added for degenerate single-point clusters.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    labels : ndarray of shape (n_samples,) with labels 0 .. k-1

    Returns
    -------
    float
        Mean silhouette value in [-1, 1]; higher is better.
    """
    n_samples = X.shape[0]
    centroids = calc_centroids(X, labels)
    intra = calc_intra_cluster_distance(X, labels, centroids)
    inter = calc_inter_cluster_distance(centroids)
    # mask the zero diagonal so the min picks the nearest *different* cluster
    np.fill_diagonal(inter, np.inf)
    scores = np.empty(n_samples)
    for i in range(n_samples):
        a_i = intra[labels[i]]
        b_i = np.min(inter[labels[i]])
        denom = max(a_i, b_i)
        scores[i] = 0.0 if denom == 0 else (b_i - a_i) / denom
    return np.mean(scores)
def calc_davies_bouldin_index(X, labels):
    """Davies-Bouldin index (lower is better).

    For each cluster i, take the worst-case similarity to any other
    cluster, R_i = max_{j != i} (s_i + s_j) / d_ij, where s is the mean
    intra-cluster distance and d_ij the centroid separation; the index
    is the mean of R_i over all clusters.

    Bugs fixed: the original averaged over *all* j including j == i,
    which divides by the zero diagonal distance (division by zero /
    inf), and it returned the max of the per-cluster values instead of
    the mean required by the DBI definition.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    labels : ndarray of shape (n_samples,) with labels 0 .. k-1

    Returns
    -------
    float
        Davies-Bouldin index; requires at least 2 clusters.
    """
    n_clusters = len(np.unique(labels))
    centroids = calc_centroids(X, labels)
    s = calc_intra_cluster_distance(X, labels, centroids)
    d = calc_inter_cluster_distance(centroids)
    worst = np.zeros(n_clusters)
    for i in range(n_clusters):
        # similarity to every other cluster; exclude the zero self-distance
        ratios = [(s[i] + s[j]) / d[i, j] for j in range(n_clusters) if j != i]
        worst[i] = max(ratios)
    return float(np.mean(worst))
def calc_dunn_index(X, labels):
    """Dunn index (higher is better).

    Defined as the minimum inter-cluster separation divided by the
    maximum intra-cluster diameter (here approximated by centroid
    distances and mean member-to-centroid distances respectively).

    Bugs fixed: the original computed the *inverse* ratio
    (min intra / max inter), and a naive min over the full distance
    matrix would also pick up the zero diagonal, so the minimum is
    taken over off-diagonal entries only.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    labels : ndarray of shape (n_samples,) with labels 0 .. k-1

    Returns
    -------
    float
        Dunn index; requires at least 2 clusters.
    """
    centroids = calc_centroids(X, labels)
    intra = calc_intra_cluster_distance(X, labels, centroids)
    inter = calc_inter_cluster_distance(centroids)
    k = centroids.shape[0]
    # off-diagonal minimum: the diagonal is zero by construction
    min_separation = np.min(inter[~np.eye(k, dtype=bool)])
    return float(min_separation / np.max(intra))
def calc_calinski_harabasz_index(X, labels):
    """Calinski-Harabasz index (higher is better).

    CH = (BGSS / (k - 1)) / (WGSS / (n - k)), where BGSS is the
    between-cluster sum of squared deviations of centroids from the
    overall mean (weighted by cluster size) and WGSS is the
    within-cluster sum of squared deviations of samples from their
    centroid.

    Bugs fixed: the original used mean *distances* (not squared
    dispersions) and placed the within term in the numerator, inverting
    the ratio relative to the standard definition.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    labels : ndarray of shape (n_samples,) with labels 0 .. k-1

    Returns
    -------
    float
        Calinski-Harabasz score; requires at least 2 clusters.
    """
    n_samples = X.shape[0]
    cluster_ids = np.unique(labels)
    n_clusters = len(cluster_ids)
    centroids = calc_centroids(X, labels)
    overall_mean = X.mean(axis=0)
    # between-cluster dispersion, weighted by cluster size
    bgss = sum(
        np.sum(labels == c) * np.sum((centroids[c] - overall_mean) ** 2)
        for c in cluster_ids
    )
    # within-cluster dispersion
    wgss = sum(
        np.sum((X[labels == c] - centroids[c]) ** 2) for c in cluster_ids
    )
    return float(bgss / wgss * (n_samples - n_clusters) / (n_clusters - 1))
接下来是可视化代码,使用KMeans进行聚类,分别计算不同k值下的4种性能指标并绘制曲线:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# --- hyperparameter sweep: compute the 4 metrics for k = 2..10 ---
# NOTE(review): the original line `X = # load data here` was a syntax
# error; synthetic blob data is generated so the script runs end to end.
# Replace X with your own dataset as needed.
rng = np.random.default_rng(0)
X = np.vstack([
    rng.normal(loc=center, scale=0.5, size=(50, 2))
    for center in ([0, 0], [5, 5], [0, 5])
])
k_range = range(2, 11)
dbi_scores, di_scores, chi_scores, si_scores = [], [], [], []
for k in k_range:
    labels = KMeans(n_clusters=k, random_state=0).fit(X).labels_
    dbi_scores.append(calc_davies_bouldin_index(X, labels))
    di_scores.append(calc_dunn_index(X, labels))
    chi_scores.append(calc_calinski_harabasz_index(X, labels))
    si_scores.append(calc_silhouette_index(X, labels))
# one curve per metric, k on the x axis
fig, ax = plt.subplots()
ax.plot(k_range, dbi_scores, label='DBI')
ax.plot(k_range, di_scores, label='DI')
ax.plot(k_range, chi_scores, label='CHI')
ax.plot(k_range, si_scores, label='SI')
ax.set_xlabel('Number of clusters k')
ax.set_ylabel('Score')
ax.legend()
plt.show()
最后使用肘部法选择最佳k值:
# --- elbow method: plot KMeans inertia (within-cluster SSE) for each k ---
inertias = [
    KMeans(n_clusters=k, random_state=0).fit(X).inertia_
    for k in k_range
]
fig, ax = plt.subplots()
ax.plot(k_range, inertias, 'bx-')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method For Optimal k')
plt.show()
根据肘部法选择最佳k值即可
原文地址: https://www.cveoy.top/t/topic/hlnk 著作权归作者所有。请勿转载和采集!