以下是使用numpy和scipy.stats库实现聚类性能指标DBI、DI、CHI和SI的代码:

import numpy as np
from scipy.stats import f

def euclidean_distance(a, b):
    """Return the Euclidean distance between two points.

    Args:
        a: First point; must support elementwise subtraction with ``b``
            (e.g. a NumPy array).
        b: Second point, same shape as ``a``.

    Returns:
        The norm of ``a - b`` as computed by ``np.linalg.norm`` with its
        default settings.
    """
    difference = a - b
    return np.linalg.norm(difference)

def calculate_centroid(points):
    """Return the centroid of a collection of points.

    Args:
        points: Array-like whose first axis enumerates the points.

    Returns:
        The arithmetic mean of ``points`` taken along axis 0.
    """
    centroid = np.mean(points, axis=0)
    return centroid

def calculate_within_cluster_sum_of_squares(cluster):
    """Return the sum of squared distances from each point to the centroid.

    Args:
        cluster: Iterable of points belonging to a single cluster.

    Returns:
        The accumulated squared Euclidean distance between every point in
        ``cluster`` and the cluster's centroid.
    """
    center = calculate_centroid(cluster)
    total = 0
    for member in cluster:
        total += euclidean_distance(member, center) ** 2
    return total

def calculate_between_cluster_sum_of_squares(clusters):
    """Return the between-cluster sum of squares (between-group dispersion).

    For every cluster the squared Euclidean distance between its centroid
    and the centroid of the whole data set is weighted by the number of
    points in the cluster, and the weighted terms are summed:

        B = sum_k  n_k * ||c_k - c||^2

    Args:
        clusters: Sequence of clusters; each cluster is an array-like of
            points with the same dimensionality.

    Returns:
        The between-cluster sum of squares as a float.

    Raises:
        ValueError: If ``clusters`` is empty (raised by ``np.concatenate``).
    """
    arrays = [np.asarray(cluster, dtype=float) for cluster in clusters]
    # The overall centroid is the mean of all points, not the mean of the
    # cluster centroids, so unbalanced cluster sizes are weighted correctly.
    overall_centroid = np.mean(np.concatenate(arrays), axis=0)
    return sum(
        len(points) * np.sum((np.mean(points, axis=0) - overall_centroid) ** 2)
        for points in arrays
    )

def calculate_dunn_index(clusters):
    """Return a Dunn index for the given partition (higher is better).

    The index is the smallest separation between two clusters divided by
    the largest cluster diameter. Separation is measured between cluster
    centroids (the centroid-linkage variant this implementation has always
    used), and the diameter of a cluster is the largest pairwise distance
    between its points.

    Args:
        clusters: Sequence of at least two clusters; each cluster is an
            array-like of points with the same dimensionality.

    Returns:
        The ratio ``min centroid separation / max cluster diameter``.

    Raises:
        ValueError: If fewer than two clusters are supplied.
    """
    arrays = [np.asarray(cluster, dtype=float) for cluster in clusters]
    if len(arrays) < 2:
        raise ValueError("Dunn index requires at least two clusters")
    # Flatten each point to a vector so broadcasting below works for any
    # point dimensionality.
    arrays = [points.reshape(len(points), -1) for points in arrays]

    diameters = []
    for points in arrays:
        # Full pairwise distance matrix; the zero diagonal makes a
        # single-point (or all-duplicates) cluster yield diameter 0
        # instead of failing on an empty reduction.
        pairwise = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1)
        diameters.append(pairwise.max())
    max_intra_cluster_distance = max(diameters)

    centroids = [points.mean(axis=0) for points in arrays]
    min_inter_cluster_distance = min(
        np.linalg.norm(centroids[i] - centroids[j])
        for i in range(len(centroids))
        for j in range(i + 1, len(centroids))
    )
    return min_inter_cluster_distance / max_intra_cluster_distance

def calculate_davies_bouldin_index(clusters):
    """Return the Davies-Bouldin index of the partition (lower is better).

    For each cluster ``i`` the scatter ``s_i`` is the mean Euclidean
    distance of its points to its centroid. The index is the average over
    clusters of the worst-case similarity

        max_{j != i} (s_i + s_j) / ||c_i - c_j||

    Args:
        clusters: Sequence of at least two clusters; each cluster is an
            array-like of points with the same dimensionality.

    Returns:
        The Davies-Bouldin index as a float.

    Raises:
        ValueError: If fewer than two clusters are supplied.
    """
    arrays = [np.asarray(cluster, dtype=float) for cluster in clusters]
    if len(arrays) < 2:
        raise ValueError("Davies-Bouldin index requires at least two clusters")
    # Flatten each point to a vector so the per-point norm is well defined
    # for any point dimensionality.
    arrays = [points.reshape(len(points), -1) for points in arrays]

    centroids = [points.mean(axis=0) for points in arrays]
    # Scatter is the *mean* distance to the centroid; using the maximum
    # would overstate the spread of any cluster that contains an outlier.
    scatters = [
        np.linalg.norm(points - centroid, axis=1).mean()
        for points, centroid in zip(arrays, centroids)
    ]

    worst_ratios = []
    for i, centroid_i in enumerate(centroids):
        worst_ratios.append(
            max(
                (scatters[i] + scatters[j]) / np.linalg.norm(centroid_i - centroid_j)
                for j, centroid_j in enumerate(centroids)
                if j != i
            )
        )
    return np.mean(worst_ratios)

def calculate_calinski_harabasz_index(clusters):
    within_cluster_sum_of_squares = sum([calculate_within_cluster_sum_of_squares(cluster) for cluster in clusters])
    between_cluster_sum_of_squares = calculate_between_cluster_sum_of_squares(clusters)
    return (between_cluster_sum_of_squares / (len(clusters) - 1)) / (within_cluster_sum_of_squares / (len(np.concatenate(clusters)) - len(clusters)))

def calculate_silhouette_index(clusters):
    """Return the mean silhouette coefficient and the per-point values.

    For every point ``p``:

    * ``a`` is the mean distance from ``p`` to the other points of its own
      cluster,
    * ``b`` is the smallest mean distance from ``p`` to the points of any
      other cluster,
    * the silhouette is ``(b - a) / max(a, b)``.

    A point that is alone in its cluster gets a silhouette of 0, as does a
    point for which both ``a`` and ``b`` are 0.

    Args:
        clusters: Sequence of at least two clusters; each cluster is an
            array-like of points with the same dimensionality.

    Returns:
        A tuple ``(mean_silhouette, silhouettes)`` where ``silhouettes`` is
        a list with one value per point, ordered cluster by cluster in the
        order the points appear in ``clusters``.

    Raises:
        ValueError: If fewer than two clusters are supplied.
    """
    arrays = [np.asarray(cluster, dtype=float) for cluster in clusters]
    if len(arrays) < 2:
        raise ValueError("Silhouette index requires at least two clusters")
    # Flatten each point to a vector so the per-point norm is well defined
    # for any point dimensionality.
    arrays = [points.reshape(len(points), -1) for points in arrays]

    silhouettes = []
    for i, own_cluster in enumerate(arrays):
        other_clusters = [points for j, points in enumerate(arrays) if j != i]
        for point in own_cluster:
            if len(own_cluster) == 1:
                # Cohesion is undefined for a singleton; by convention s = 0.
                silhouettes.append(0.0)
                continue
            # The point's zero distance to itself is in the sum, so divide
            # by n - 1 to average over the *other* members only. Duplicate
            # points legitimately contribute a distance of 0.
            cohesion = np.linalg.norm(own_cluster - point, axis=1).sum() / (len(own_cluster) - 1)
            separation = min(
                np.linalg.norm(points - point, axis=1).mean() for points in other_clusters
            )
            scale = max(cohesion, separation)
            silhouettes.append(float((separation - cohesion) / scale) if scale > 0 else 0.0)
    return np.mean(silhouettes), silhouettes

其中,euclidean_distance函数用于计算两个点之间的欧几里得距离,calculate_centroid函数用于计算一个簇的中心点,calculate_within_cluster_sum_of_squares函数用于计算一个簇内各点到该簇中心点的距离平方和,calculate_between_cluster_sum_of_squares函数用于计算簇间平方和(衡量各簇相对于整体中心点的离散程度),calculate_dunn_index函数用于计算Dunn指数(DI),calculate_davies_bouldin_index函数用于计算Davies-Bouldin指数(DBI),calculate_calinski_harabasz_index函数用于计算Calinski-Harabasz指数(CHI),calculate_silhouette_index函数用于计算平均轮廓系数(SI)以及每个点的轮廓系数。

Python手写实现聚类性能指标DBI、DI、CHI和SI的计算,只能调用numpy和scipy.stats

原文地址: https://www.cveoy.top/t/topic/hk84 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录