Python手写实现聚类性能指标DBI、DI、CHI和SI的计算(只能调用numpy和scipy.stats)
以下是使用numpy和scipy.stats库实现聚类性能指标DBI、DI、CHI和SI的代码:
import numpy as np
from scipy.stats import f
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two points.

    Args:
        a: First point, as a NumPy array or any array-like of numbers.
        b: Second point, with the same shape as ``a``.

    Returns:
        The L2 norm of ``a - b``.
    """
    # Convert up front so plain lists/tuples work and unsigned-integer
    # arrays cannot wrap around during the subtraction.
    return np.linalg.norm(np.asarray(a, dtype=float) - np.asarray(b, dtype=float))
def calculate_centroid(points):
    """Return the centroid (coordinate-wise mean) of a set of points.

    Args:
        points: Array-like with one point per row.

    Returns:
        A NumPy array holding the mean taken along axis 0.
    """
    return np.mean(np.asarray(points, dtype=float), axis=0)
def calculate_within_cluster_sum_of_squares(cluster):
    """Return the sum of squared distances from each point to the cluster centroid.

    Args:
        cluster: Array-like with one point per row.

    Returns:
        The within-cluster sum of squares as a float.
    """
    points = np.asarray(cluster, dtype=float)
    # Squared Euclidean distance is the sum of squared coordinate
    # deviations, so one vectorized sum covers every point at once.
    deviations = points - points.mean(axis=0)
    return float(np.sum(deviations ** 2))
def calculate_between_cluster_sum_of_squares(clusters):
    """Return the between-cluster sum of squares.

    This is the sum over clusters of ``n_k * ||c_k - c||**2``, where
    ``n_k`` is the cluster size, ``c_k`` the cluster centroid and ``c``
    the centroid of all points pooled together.

    Args:
        clusters: Sequence of clusters, each an array-like with one point
            per row.

    Returns:
        The between-cluster sum of squares as a float.
    """
    arrays = [np.asarray(cluster, dtype=float) for cluster in clusters]
    overall_centroid = np.concatenate(arrays).mean(axis=0)
    # Weight each centroid's squared offset by the cluster size so that
    # total SS == within SS + between SS holds.
    return float(
        sum(
            len(points) * np.sum((points.mean(axis=0) - overall_centroid) ** 2)
            for points in arrays
        )
    )
def calculate_dunn_index(clusters):
    """Return the Dunn index of a clustering (higher is better).

    The index is the smallest separation between two clusters divided by
    the largest cluster diameter. Separation is measured between cluster
    centroids (a centroid-linkage variant of the index); the diameter is
    the largest pairwise distance between points of one cluster.

    Args:
        clusters: Sequence of at least two non-empty clusters, each an
            array-like with one point per row.

    Returns:
        The Dunn index as a float. ``inf`` when every cluster has zero
        diameter but the centroids differ.

    Raises:
        ValueError: If fewer than two clusters are given.
    """
    arrays = [np.asarray(cluster, dtype=float) for cluster in clusters]
    # Force an (n_points, n_features) layout so scalar points also work.
    arrays = [points.reshape(len(points), -1) for points in arrays]
    if len(arrays) < 2:
        raise ValueError("Dunn index requires at least two clusters")

    # A singleton (or all-duplicate) cluster simply has diameter 0.
    max_diameter = max(
        np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1).max()
        for points in arrays
    )

    centroids = np.array([points.mean(axis=0) for points in arrays])
    centroid_gaps = np.linalg.norm(
        centroids[:, None, :] - centroids[None, :, :], axis=-1
    )
    # Only the strict upper triangle holds distinct cluster pairs.
    min_separation = centroid_gaps[np.triu_indices(len(arrays), k=1)].min()

    with np.errstate(divide="ignore", invalid="ignore"):
        return float(np.float64(min_separation) / np.float64(max_diameter))
def calculate_davies_bouldin_index(clusters):
    """Return the Davies-Bouldin index of a clustering (lower is better).

    For every cluster the scatter ``S_i`` is the mean distance of its
    points to its centroid. The index is the average, over clusters, of
    ``max_{j != i} (S_i + S_j) / d(c_i, c_j)`` where ``d`` is the distance
    between centroids.

    Args:
        clusters: Sequence of at least two non-empty clusters, each an
            array-like with one point per row.

    Returns:
        The Davies-Bouldin index as a float. ``inf`` when two clusters
        with non-zero scatter share the same centroid.

    Raises:
        ValueError: If fewer than two clusters are given.
    """
    arrays = [np.asarray(cluster, dtype=float) for cluster in clusters]
    # Force an (n_points, n_features) layout so scalar points also work.
    arrays = [points.reshape(len(points), -1) for points in arrays]
    if len(arrays) < 2:
        raise ValueError("Davies-Bouldin index requires at least two clusters")

    centroids = np.array([points.mean(axis=0) for points in arrays])
    # Scatter is the MEAN point-to-centroid distance, not the maximum.
    scatter = np.array(
        [
            np.linalg.norm(points - centroid, axis=1).mean()
            for points, centroid in zip(arrays, centroids)
        ]
    )
    separation = np.linalg.norm(
        centroids[:, None, :] - centroids[None, :, :], axis=-1
    )

    with np.errstate(divide="ignore", invalid="ignore"):
        ratios = (scatter[:, None] + scatter[None, :]) / separation
    # 0 / 0 (zero scatter on coincident centroids) contributes nothing.
    ratios[np.isnan(ratios)] = 0.0
    # A cluster is never compared with itself.
    np.fill_diagonal(ratios, -np.inf)
    return float(ratios.max(axis=1).mean())
def calculate_calinski_harabasz_index(clusters):
    """Return the Calinski-Harabasz index of a clustering (higher is better).

    Computed as ``(BSS / (k - 1)) / (WSS / (n - k))`` where ``BSS`` is the
    size-weighted sum of squared distances from each cluster centroid to
    the overall centroid, ``WSS`` is the sum of squared distances from
    each point to its own cluster centroid, ``k`` is the number of
    clusters and ``n`` the total number of points.

    Args:
        clusters: Sequence of at least two clusters, each an array-like
            with one point per row.

    Returns:
        The Calinski-Harabasz index as a float. ``inf`` when the clusters
        have no internal dispersion but distinct centroids.

    Raises:
        ValueError: If fewer than two clusters are given, or if the total
            number of points does not exceed the number of clusters.
    """
    arrays = [np.asarray(cluster, dtype=float) for cluster in clusters]
    n_clusters = len(arrays)
    n_points = sum(len(points) for points in arrays)
    if n_clusters < 2:
        raise ValueError("Calinski-Harabasz index requires at least two clusters")
    if n_points <= n_clusters:
        raise ValueError(
            "Calinski-Harabasz index requires more points than clusters"
        )

    overall_centroid = np.concatenate(arrays).mean(axis=0)
    within = np.float64(0.0)
    between = np.float64(0.0)
    for points in arrays:
        centroid = points.mean(axis=0)
        within += np.sum((points - centroid) ** 2)
        between += len(points) * np.sum((centroid - overall_centroid) ** 2)

    with np.errstate(divide="ignore", invalid="ignore"):
        return float(
            (between / (n_clusters - 1)) / (within / (n_points - n_clusters))
        )
def calculate_silhouette_index(clusters):
    """Return the mean silhouette coefficient and the per-point coefficients.

    For a point ``p`` in cluster ``C``: ``a`` is the mean distance from
    ``p`` to the other points of ``C``, ``b`` is the smallest mean
    distance from ``p`` to the points of any other cluster, and the
    silhouette is ``(b - a) / max(a, b)``. Points in singleton clusters
    get a silhouette of 0.

    Args:
        clusters: Sequence of at least two non-empty clusters, each an
            array-like with one point per row.

    Returns:
        A tuple ``(mean_silhouette, silhouettes)`` where ``silhouettes``
        is a list with one value per point, ordered cluster by cluster in
        the order the points were given.

    Raises:
        ValueError: If fewer than two clusters are given.
    """
    arrays = [np.asarray(cluster, dtype=float) for cluster in clusters]
    # Force an (n_points, n_features) layout so scalar points also work.
    arrays = [points.reshape(len(points), -1) for points in arrays]
    if len(arrays) < 2:
        raise ValueError("Silhouette index requires at least two clusters")

    silhouettes = []
    for i, points in enumerate(arrays):
        size = len(points)
        # b: for every point, mean distance to each other cluster, then
        # keep the nearest such cluster.
        mean_to_others = [
            np.linalg.norm(
                points[:, None, :] - other[None, :, :], axis=-1
            ).mean(axis=1)
            for j, other in enumerate(arrays)
            if j != i
        ]
        nearest = np.min(mean_to_others, axis=0)

        if size > 1:
            own = np.linalg.norm(
                points[:, None, :] - points[None, :, :], axis=-1
            )
            # The self-distance is 0, so dividing the row sum by
            # (size - 1) averages over the OTHER members only.
            cohesion = own.sum(axis=1) / (size - 1)
            scale = np.maximum(cohesion, nearest)
            scores = np.divide(
                nearest - cohesion,
                scale,
                out=np.zeros_like(scale),
                where=scale > 0,
            )
        else:
            # Silhouette of a singleton cluster is defined as 0.
            scores = np.zeros(size)
        silhouettes.extend(scores.tolist())

    return float(np.mean(silhouettes)), silhouettes
其中,euclidean_distance函数用于计算两个点之间的欧几里得距离;calculate_centroid函数用于计算一个簇的中心点;calculate_within_cluster_sum_of_squares函数用于计算一个簇内各点到该簇中心点的距离平方和;calculate_between_cluster_sum_of_squares函数用于计算所有簇之间点到整体中心点的平方和的平均值;calculate_dunn_index函数用于计算Dunn指数(DI);calculate_davies_bouldin_index函数用于计算Davies-Bouldin指数(DBI);calculate_calinski_harabasz_index函数用于计算Calinski-Harabasz指数(CHI);calculate_silhouette_index函数用于计算平均轮廓系数(SI)以及每个点的轮廓系数。
原文地址: https://www.cveoy.top/t/topic/hk84 著作权归作者所有。请勿转载和采集!