K-Means 聚类算法实现:Python 代码详解
import scipy.io as sio # load mat import matplotlib.pyplot as plt import seaborn as sns import pandas as pd import numpy as np from sklearn.cluster import KMeans
# Generate k random initial centroids using DataFrame.sample(k)
def random_init(data, k):
    """Randomly pick k rows of `data` as the initial cluster centroids.

    Parameters:
        data: pandas.DataFrame of samples (one row per sample).
        k: number of centroids to draw.

    Returns:
        numpy array of shape (k, n_features) with the sampled rows.
    """
    # Bug fix: in the original, the return statement was trapped inside
    # the Begin/End comment, so the function silently returned None.
    return data.sample(k).values
# Find the cluster for a single point
def find_cluster(x, centroids):
    """Return the index of the centroid nearest to point x.

    Parameters:
        x: 1-D array, coordinates of one sample.
        centroids: 2-D array (k, n_features) of centroid coordinates.

    Returns:
        int index of the closest centroid (Euclidean distance).
    """
    # Bug fix: the original read (centroids - x)2, which is a syntax
    # error — squaring requires the ** operator.
    distances = np.sqrt(np.sum((centroids - x) ** 2, axis=1))
    return np.argmin(distances)
为整个 data 数据集生成聚类标签
def assign_cluster(data, centroids):
    """Assign each sample in `data` the index of its nearest centroid.

    Parameters:
        data: pandas.DataFrame of samples.
        centroids: 2-D array of centroid coordinates.

    Returns:
        1-D numpy array of cluster indices, one per row of `data`.
    """
    def nearest(point):
        # Delegate the per-point search to find_cluster.
        return find_cluster(point, centroids)

    return np.apply_along_axis(nearest, axis=1, arr=data.values)
data中增加一列聚类标签C
def combineDataC(data, C):
    """Return a copy of `data` with the cluster labels attached as column 'C'.

    The original frame is left untouched.
    """
    labeled = data.copy()
    labeled['C'] = C
    return labeled
计算新的中心点，同时去掉标签列 C，再转换成 array 数组
def newCentroids(data, C):
    """Recompute centroids as the per-cluster mean of `data`.

    Parameters:
        data: pandas.DataFrame of samples.
        C: cluster label per row.

    Returns:
        2-D numpy array of centroid coordinates, ordered by cluster label.
        NOTE(review): a cluster that received no points is silently
        dropped by groupby, shrinking the centroid count — confirm
        callers tolerate this.
    """
    labeled = combineDataC(data, C)
    means = labeled.groupby('C', as_index=False).mean()
    return means.sort_values(by='C').drop('C', axis=1).values
损失函数
def cost(data, centroids, C):
    """Mean Euclidean distance of every sample to its assigned centroid.

    Parameters:
        data: pandas.DataFrame (m rows) of samples.
        centroids: 2-D array of centroid coordinates.
        C: 1-D integer array of cluster labels, one per row.

    Returns:
        float, the average point-to-centroid distance (the k-means loss).
    """
    m = data.shape[0]  # number of samples
    dataCentroids = centroids[C]  # centroid coordinates matched to each row
    # Bug fix: the original read (data - dataCentroids)2, a syntax
    # error — squaring requires the ** operator.
    distances = np.sqrt(np.sum((data - dataCentroids) ** 2, axis=1))
    return distances.sum() / m
kmeans通道,运行一次
def kMeansIter(data, k, epoch=100, tol=0.0001):
    """Run one k-means fit: random init, then iterate until convergence.

    Parameters:
        data: pandas.DataFrame of samples.
        k: number of clusters.
        epoch: maximum number of assign/update iterations.
        tol: relative loss-improvement threshold for early stopping.

    Returns:
        (C, centroids, final_cost): per-row labels, final centroid
        array, and the last recorded loss.
    """
    # Draw the initial centroids at random from the data.
    centroids = random_init(data, k)
    costProgress = []  # loss recorded after every iteration

    for _ in range(epoch):
        C = assign_cluster(data, centroids)
        centroids = newCentroids(data, C)
        costProgress.append(cost(data, centroids, C))
        # Stop when the relative improvement drops below tol.
        if len(costProgress) > 1:
            last = costProgress[-1]
            # Bug fix: guard the division — a perfect fit (loss 0)
            # would otherwise raise ZeroDivisionError.
            if last == 0 or abs(last - costProgress[-2]) / last < tol:
                break
    return C, centroids, costProgress[-1]
对每个 k 运行 n_init 次 kMeans 通道，选取损失最小的一次
def kMeans(data, k, epoch=100, n_init=10):
    """Run k-means n_init times and return the run with the lowest loss.

    Parameters:
        data: pandas.DataFrame of samples.
        k: number of clusters.
        epoch: max iterations per run (forwarded to kMeansIter).
        n_init: number of independent random restarts.

    Returns:
        (C, centroids, cost) of the best (lowest-loss) run.
    """
    # Bug fixes vs. the original: the line was truncated (missing
    # closing bracket), the index variable was misspelled
    # (leasrCostIndex), `epoch` was silently ignored, and wrapping the
    # ragged (labels, centroids, cost) tuples in np.array fails on
    # modern NumPy. Keep plain tuples and pick the minimum-loss run.
    tries = [kMeansIter(data, k, epoch) for _ in range(n_init)]
    return min(tries, key=lambda run: run[-1])
原文地址: https://www.cveoy.top/t/topic/nv0Z 著作权归作者所有。请勿转载和采集!