K-means算法與K-medoids算法 python實現


一、K-medoids 算法步驟:

1、隨機選取k個樣本作為中心點
2、一層遍歷每一個樣本,二層遍歷每一個中心樣本,找出離該樣本最近的中心樣本
3、遍歷中心樣本,該中心樣本划分出來的該簇樣本,遍歷該簇樣本,找出離所有樣本距離最小的樣本,代替舊中心
4、直到達到指定訓練次數或者樣本分類結果不再變化,結束訓練
 1 import  numpy as np
 2 from numpy import *
 3 
 4 
 5 with open("xigua.txt") as file:
 6     data_lines = file.readlines()
 7     data_list = [[] for i in data_lines]
 8     for item in range(len(data_lines)):
 9         data_list[item][:] = (float(i) for i in data_lines[item].strip().split(",")[0:3])
10         data_list[item].append(-1)
11  #   print(data_list)
12 
13 
def choice_center(data, k):
    """Randomly pick k *distinct* samples from `data` as initial medoids.

    FIX: the original call `np.random.choice(len(data), k)` samples with
    replacement, so the same sample could be chosen as two "different"
    centers, collapsing clusters. `replace=False` guarantees k distinct
    indices.

    Returns a list of references to the chosen rows of `data`.
    """
    centers = []
    for i in np.random.choice(len(data), k, replace=False):
        centers.append(data[i])
    print("隨機選取的中心點(第一次):\n", centers)
    return centers
20 
21 
def distance(a, b):
    """Return the Euclidean distance between equal-length vectors a and b."""
    squared_diffs = [(a[idx] - b[idx]) ** 2 for idx in range(len(a))]
    return sqrt(sum(squared_diffs))
28 
29 
def k_center(data_list, center):
    """K-medoids clustering; labels `data_list` rows in place.

    data_list: list of rows [id, x1, x2, label]; the last column receives
    the index of the nearest medoid.
    center: list of k rows in the same format (initial medoids); after the
    call it holds the final medoids.

    FIX: rows returned by choice_center alias rows of data_list, so the
    original `center[k][:] = current_k[m][:]` silently overwrote a data
    sample. We replace each center with a private copy on entry, so the
    dataset is never corrupted while the caller's list still sees the
    final medoids.
    """
    # Work on private copies so medoid updates cannot clobber data rows.
    for k in range(len(center)):
        center[k] = center[k][:]

    changed = True
    while changed:
        changed = False
        # Assignment step: label every sample with its nearest medoid
        # (only features at columns 1:3 are compared; column 0 is the id).
        for i in range(len(data_list)):
            min_index = -2
            min_dis = inf
            for j in range(len(center)):
                dis = distance(data_list[i][1:3], center[j][1:3])
                if dis < min_dis:
                    min_dis = dis
                    min_index = j
            if data_list[i][-1] != min_index:
                changed = True  # a label moved -> run another iteration
            data_list[i][-1] = min_index
        print("分類結果111:", data_list)

        # Update step: within each cluster, pick the member whose total
        # distance to all members is smaller than the current medoid's.
        for k in range(len(center)):
            current_k = [row for row in data_list if row[-1] == k]
            # Cost of keeping the current medoid.
            old_dis = 0.0
            for row in current_k:
                old_dis += distance(row[1:3], center[k][1:3])
            # Greedily try every cluster member as a replacement medoid.
            for m in range(len(current_k)):
                new_dis = 0.0
                for n in range(len(current_k)):
                    new_dis += distance(current_k[m][1:3], current_k[n][1:3])
                if new_dis < old_dis:
                    old_dis = new_dis
                    center[k][:] = current_k[m][:]

    print("選中的最終中心點", center)
    # Final assignment pass against the converged medoids.
    for i in range(len(data_list)):
        min_index = -2
        min_dis = inf
        for j in range(len(center)):
            dis = distance(data_list[i][1:3], center[j][1:3])
            if dis < min_dis:
                min_dis = dis
                min_index = j
        data_list[i][-1] = min_index
    print("分類結果222:", data_list)
79 
80 
81 
# Run K-medoids with k=3 on the watermelon data loaded above;
# k_center relabels data_list in place and prints the final medoids.
centers = choice_center(data_list,3)
k_center(data_list,centers)

二、k-means算法:

1、隨機選取k個樣本作為中心向量
2、遍歷每一個樣本,以及每一個中心向量,對每一個樣本進行類別的更新
3、取出同一類別的所有樣本,求每一列的平均值,得到新的中心向量
4、直到達到指定訓練次數,或者中心向量不再改變
 1 import numpy as np
 2 
 3 
 4 def kmeans(x,k,maxIt):
 5     numPoints,numDim = x.shape
 6     dataset = np.zeros((numPoints,numDim+1))  # 多加一列存儲類別
 7     dataset[:,:-1] = x
 8     centroids = dataset[np.random.randint(numPoints,size=k)]     # 隨機選取k個中心點
 9     centroids[:,-1] = range(1,k+1)
10     iteration = 0
11     oldCentroids = None
12     while not shouldStop(oldCentroids,centroids,iteration,maxIt):
13         oldCentroids=np.copy(centroids)
14         iteration +=1
15         updataLable(dataset,centroids)    # 重新分類
16         centroids = getCentriods(dataset,k)  # 得到新的中心點
17     return dataset
18 
19 
def shouldStop(oldCentroids, centroids, iteration, maxIt):
    """Stop when the iteration budget is exhausted or centroids stopped moving."""
    out_of_budget = iteration > maxIt
    # np.array_equal also covers the first pass, where oldCentroids is None.
    converged = np.array_equal(oldCentroids, centroids)
    return out_of_budget or converged
24 
def updataLable(dataset, centroids):
    """Write the label of each row's closest centroid into its last column."""
    for row_idx in range(dataset.shape[0]):
        features = dataset[row_idx, :-1]
        dataset[row_idx, -1] = getLableFromClosestCentriod(features, centroids)
29 
def getLableFromClosestCentriod(dataSetRow, centroids):
    """Return the label (last column) of the centroid nearest to dataSetRow."""
    # Vectorized: Euclidean distance from the row to every centroid at once.
    # argmin returns the FIRST minimum, matching the original strict-< scan.
    dists = np.linalg.norm(centroids[:, :-1] - dataSetRow, axis=1)
    return centroids[np.argmin(dists), -1]
39 
40 
def getCentriods(dataset, k):
    """Recompute each centroid as the mean of its cluster's feature columns.

    dataset: (numPoints, numDim+1) array whose last column is the label (1..k).
    Returns a (k, numDim+1) array; row i-1 is the mean of cluster i with
    label i in the last column.
    """
    result = np.zeros((k, dataset.shape[1]))
    for i in range(1, k + 1):
        oneCluster = dataset[dataset[:, -1] == i, :-1]  # rows labelled i, features only
        # FIX: guard the empty-cluster case — np.mean of an empty slice
        # yields NaN plus a RuntimeWarning; leave zero features instead.
        if len(oneCluster) > 0:
            result[i - 1, :-1] = np.mean(oneCluster, axis=0)
        result[i - 1, -1] = i
    return result
48 
# Demo: cluster four 2-D points into k=2 groups; the two obvious pairs
# (1,2)/(2,1) and (4,3)/(5,4) should end up with matching labels.
x1 = np.array([1,2])
x2 = np.array([2,1])
x3 = np.array([4,3])
x4 = np.array([5,4])
x = np.vstack((x1,x2,x3,x4))
result = kmeans(x,2,10)
print("result",result)

 

 
 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM