# KMeans clustering implemented with the numpy library -- source code:
import random
import time

import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:  # plotting is optional; only the demo in main() uses it
    plt = None


class KMeans:
    """K-means clustering of row-vector data, implemented with numpy.

    Initial centres are drawn at random from the distinct rows of the
    training data; the centres are then refined by alternating between
    assigning every point to its nearest centre and moving every centre to
    the mean of the points assigned to it.
    """

    def __init__(self, k=1):
        """
        :param k: number of clusters, must be at least 1
        :raises ValueError: if ``k`` is smaller than 1
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.__k = k
        self.__data = None  # training data handed to the last fit() call
        self.__pointCenter = None  # current centres, one row per cluster
        self.__result = [[] for _ in range(k)]  # points grouped per cluster

    def fit(self, data, threshold, times=50000):
        """Train the model.

        :param data: training data, a 2-D array-like with one point per row
        :param threshold: exit condition; iteration stops once the mean
            Euclidean distance moved by the centres is not above this value
        :param times: maximum number of refinement rounds performed after
            the initial assignment
        :return: tuple ``(centres, clusters)`` where ``centres`` is a
            ``(k, d)`` array and ``clusters`` is a list of ``k`` lists
            holding the rows of ``data`` assigned to each centre
        :raises ValueError: if ``data`` is not 2-D or contains fewer than
            ``k`` distinct points
        """
        data = np.asarray(data)
        if data.ndim != 2:
            raise ValueError("data must be a 2-D array with one point per row")
        self.__data = data
        # Draw fresh centres on every call so the model can be refitted.
        self.randomCenter()
        print(self.__pointCenter)

        oldCenterPoint = self.__pointCenter
        result = self._assignPoints(oldCenterPoint)
        newCenterPoint = self.calNewPointCenter(result, oldCenterPoint)
        # `times` bounds the loop so a threshold that is never reached
        # cannot keep fit() spinning forever.
        while times > 0 and self._meanShift(oldCenterPoint, newCenterPoint) > threshold:
            times -= 1
            oldCenterPoint = newCenterPoint  # remember the previous centres
            result = self._assignPoints(oldCenterPoint)
            newCenterPoint = self.calNewPointCenter(result, oldCenterPoint)

        self.__pointCenter = newCenterPoint
        self.__result = result
        return newCenterPoint, self.__result

    def calPointCenterDistance(self, center, data):
        """Compute the Euclidean distance between every point and every centre.

        :param center: ``(k, d)`` array of centres
        :param data: ``(n, d)`` array of points
        :return: array of shape ``(n, 1, k)``; entry ``[i, 0, j]`` is the
            distance from point ``i`` to centre ``j`` (the singleton middle
            axis is kept for compatibility with existing callers)
        """
        return self._pairwiseDistance(center, data)[:, np.newaxis, :]

    def calNewPointCenter(self, result, fallback=None):
        """Compute the new centres as the mean of each cluster.

        :param result: list of clusters, each a list of points
        :param fallback: optional ``(k, d)`` array of previous centres; an
            empty cluster keeps its row from ``fallback`` instead of
            producing a NaN centre
        :return: ``(k, d)`` float array of centres, or ``None`` when
            ``result`` contains no clusters
        :raises ValueError: if a cluster is empty and no fallback is given
        """
        centers = []
        for idx, members in enumerate(result):
            if len(members) > 0:
                centers.append(np.mean(np.asarray(members, dtype=float), axis=0))
            elif fallback is not None:
                centers.append(np.asarray(fallback[idx], dtype=float))
            else:
                raise ValueError(
                    "cluster %d is empty and no fallback centre was given" % idx
                )
        if not centers:
            return None
        return np.array(centers)

    def randomCenter(self):
        """Pick the k initial centres at random from the distinct data rows.

        :raises ValueError: if no data has been set yet or the data holds
            fewer than ``k`` distinct points
        """
        if self.__data is None:
            raise ValueError("no data available; call fit() first")
        # Whole rows are compared (np.unique with axis=0), so two points that
        # merely share one coordinate are still treated as different points.
        distinct = np.unique(np.asarray(self.__data), axis=0)
        if len(distinct) < self.__k:
            raise ValueError(
                "need at least %d distinct points, got %d"
                % (self.__k, len(distinct))
            )
        chosen = random.sample(range(len(distinct)), self.__k)
        self.__pointCenter = distinct[chosen]

    @staticmethod
    def _pairwiseDistance(center, data):
        """Return the ``(n, k)`` matrix of point-to-centre distances."""
        center = np.asarray(center, dtype=float)
        data = np.asarray(data, dtype=float)
        diff = data[:, np.newaxis, :] - center[np.newaxis, :, :]
        return np.sqrt(np.sum(diff ** 2, axis=2))

    def _assignPoints(self, center):
        """Group the training rows by the index of their nearest centre."""
        nearest = np.argmin(self._pairwiseDistance(center, self.__data), axis=1)
        return [list(self.__data[nearest == j]) for j in range(self.__k)]

    def _meanShift(self, oldCenter, newCenter):
        """Return the mean Euclidean distance between old and new centres."""
        return np.sum(np.sum((oldCenter - newCenter) ** 2, axis=1) ** 0.5) / self.__k


def main():
    """Cluster 10000 random 2-D points into 5 groups, time it and plot it."""
    # Raw data as a numpy structure: integer coordinates in [0, 100).
    data = np.random.randint(0, 100, 20000).reshape(10000, 2)
    startTime = time.time()
    kmeans = KMeans(k=5)
    centerPoint, result = kmeans.fit(data, 0.0001)
    print(time.time() - startTime)
    print(centerPoint)

    if plt is None:
        print("matplotlib is not installed; skipping the plot")
        return

    tempx = []
    tempy = []
    color = []
    for label, members in enumerate(result):
        if not members:
            continue  # nothing to draw for an empty cluster
        points = np.asarray(members)
        tempx.extend(points[:, 0].tolist())
        tempy.extend(points[:, 1].tolist())
        color.extend([label] * len(points))  # one colour value per cluster
    plt.title("KMeans Classification")
    plt.scatter(tempx, tempy, c=color, s=30)
    plt.show()


if __name__ == "__main__":
    main()
结果图片展示: