一、K-medoids 算法步驟:
1、隨機選取k個樣本作為中心點
2、一層遍歷每一個樣本,二層遍歷每一個中心樣本,找出離該樣本最近的中心樣本
3、遍歷每個中心樣本,取出划分到該簇的所有樣本;在簇內逐一計算每個樣本到簇中其他所有樣本的距離之和,選出距離和最小的那個樣本,代替舊中心
4、直到達到指定訓練次數或者樣本分類結果不再變化,結束訓練
import math
import random


def load_data(path):
    """Read comma-separated samples from *path*.

    Each row keeps the first three columns as floats (presumably
    id, x, y -- TODO confirm against xigua.txt) plus a trailing -1
    placeholder meaning "not assigned to any cluster yet".
    """
    rows = []
    with open(path) as file:
        for line in file:
            row = [float(v) for v in line.strip().split(",")[0:3]]
            row.append(-1)  # label column, filled in by k_center
            rows.append(row)
    return rows


def choice_center(data, k):
    """Pick k *distinct* samples as the initial medoids.

    BUG FIX: the original used np.random.choice(len(data), k), which
    samples WITH replacement and could return duplicate centers.
    The chosen rows are copied so later medoid swaps cannot mutate the
    original samples in *data*.
    """
    centers = [list(data[i]) for i in random.sample(range(len(data)), k)]
    print("隨機選取的中心點(第一次):\n", centers)
    return centers


def distance(a, b):
    """Euclidean distance between two equal-length coordinate sequences."""
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))


def _closest_center(sample, center):
    """Index of the medoid nearest to *sample* (columns 1..2 are coords)."""
    best_index = -1
    best_dis = math.inf
    for j in range(len(center)):
        dis = distance(sample[1:3], center[j][1:3])
        if dis < best_dis:
            best_dis = dis
            best_index = j
    return best_index


def k_center(data_list, center):
    """K-medoids (PAM-style) clustering.

    Mutates the last (label) column of every row of *data_list* in place
    and updates *center* to the final medoids.  Loops until one full
    assignment pass changes no labels.
    """
    changed = True
    while changed:
        changed = False
        # Assignment step: label every sample with its nearest medoid.
        for sample in data_list:
            idx = _closest_center(sample, center)
            if sample[-1] != idx:
                changed = True
                sample[-1] = idx
        print("分類結果111:", data_list)
        # Update step: within each cluster, promote the member whose total
        # distance to all cluster members is minimal to be the new medoid.
        for k in range(len(center)):
            cluster = [s for s in data_list if s[-1] == k]
            best_cost = sum(distance(s[1:3], center[k][1:3]) for s in cluster)
            for cand in cluster:
                cost = sum(distance(cand[1:3], other[1:3]) for other in cluster)
                if cost < best_cost:
                    best_cost = cost
                    # BUG FIX: copy the candidate. The original slice-assigned
                    # into center[k], which aliased a data row and silently
                    # overwrote that sample's coordinates.
                    center[k] = list(cand)
    print("選中的最終中心點", center)
    # Final assignment pass against the converged medoids.
    for sample in data_list:
        sample[-1] = _closest_center(sample, center)
    print("分類結果222:", data_list)


if __name__ == "__main__":
    # BUG FIX: guard the script body so importing this module no longer
    # tries to open xigua.txt as a side effect.
    data_list = load_data("xigua.txt")
    centers = choice_center(data_list, 3)
    k_center(data_list, centers)
二、k-means算法:
1、隨機選取k個樣本作為中心向量
2、遍歷每一個樣本,以及每一個中心向量,對每一個樣本進行類別的更新
3、取出同一類別的所有樣本,求每一列的平均值,得到新的中心向量
4、直到達到指定訓練次數,或者中心向量不再改變
import numpy as np


def kmeans(x, k, maxIt):
    """Cluster the rows of *x* into *k* groups with Lloyd's algorithm.

    Returns a copy of *x* with one extra column holding each row's
    cluster label (1..k).  Stops after *maxIt* iterations or when the
    centroids stop moving.
    """
    numPoints, numDim = x.shape
    dataset = np.zeros((numPoints, numDim + 1))  # extra column = label
    dataset[:, :-1] = x
    # BUG FIX: np.random.randint samples with replacement, so two initial
    # centroids could be the very same point; replace=False forbids that.
    centroids = dataset[np.random.choice(numPoints, size=k, replace=False)]
    centroids[:, -1] = np.arange(1, k + 1)  # labels are 1-based
    iteration = 0
    oldCentroids = None
    while not shouldStop(oldCentroids, centroids, iteration, maxIt):
        oldCentroids = np.copy(centroids)
        iteration += 1
        updataLable(dataset, centroids)                   # assignment step
        centroids = getCentriods(dataset, k, centroids)   # update step
    return dataset


def shouldStop(oldCentroids, centroids, iteration, maxIt):
    """True once maxIt iterations have run or the centroids converged."""
    # BUG FIX: the original tested 'iteration > maxIt', which off-by-one
    # allowed maxIt + 1 update steps.
    if iteration >= maxIt:
        return True
    return np.array_equal(oldCentroids, centroids)


def updataLable(dataset, centroids):
    """Relabel every row of *dataset* with its nearest centroid, in place."""
    numPoints = dataset.shape[0]
    for i in range(numPoints):
        dataset[i, -1] = getLableFromClosestCentriod(dataset[i, :-1], centroids)


def getLableFromClosestCentriod(dataSetRow, centroids):
    """Label of the centroid with the smallest Euclidean distance to the row."""
    lable = centroids[0, -1]
    minDist = np.linalg.norm(dataSetRow - centroids[0, :-1])
    for i in range(1, centroids.shape[0]):
        dist = np.linalg.norm(dataSetRow - centroids[i, :-1])
        if dist < minDist:
            minDist = dist
            lable = centroids[i, -1]
    return lable


def getCentriods(dataset, k, old=None):
    """New centroid matrix: per-cluster column means.

    *old* (optional, backward compatible) supplies the previous centroids;
    a cluster that lost all of its points keeps its old centroid instead
    of producing a NaN mean, which is what the original code silently did.
    """
    result = np.zeros((k, dataset.shape[1]))
    for i in range(1, k + 1):
        oneCluster = dataset[dataset[:, -1] == i, :-1]
        if len(oneCluster) == 0 and old is not None:
            result[i - 1, :-1] = old[i - 1, :-1]  # empty cluster: keep old
        else:
            result[i - 1, :-1] = np.mean(oneCluster, axis=0)
        result[i - 1, -1] = i
    return result


x1 = np.array([1, 2])
x2 = np.array([2, 1])
x3 = np.array([4, 3])
x4 = np.array([5, 4])
x = np.vstack((x1, x2, x3, x4))
result = kmeans(x, 2, 10)
print("result", result)