1、用戶界面
1)點擊讀取文件按鈕,讀取到的文件如下圖所示:
數據聚類系統讀取文件
數據聚類系統導入文件
2)設置簇的個數,這裡設置成2,並選擇K-means聚類算法,顯示的結果如下圖:
數據聚類系統運行K-means聚類算法
3)設置簇的個數,這裡設置成2,並選擇K-中心點聚類算法,顯示的結果如下圖:
數據聚類系統運行K-中心點聚類算法
4)清屏,顯示的結果如下圖:
數據聚類系統清屏
2、實驗源碼
開發環境為Spyder,所用語言及版本為Python 3.7,GUI庫為tkinter。
1)主運行界面 kmedgui.py
# -*- coding: utf-8 -*-
import sys
import random
import kmeans
import k_medoids_2d as k2d
import numpy as np
import tkinter as tk
from tkinter import filedialog
from tkinter import messagebox
from tkinter import scrolledtext
from PIL import Image,ImageTk
import matplotlib.pyplot as plt
import sklearn
from sklearn.manifold import TSNE


class GUI(object):
    """Tkinter front end for the K-means / K-medoids clustering demo.

    Constructing the object builds the window and enters the Tk main loop,
    so ``GUI()`` only returns after the window has been closed.
    """

    # (width, height) every preview picture is scaled to.
    IMAGE_SIZE = (500, 480)
    # Blank placeholder shown at start-up and after "clear".
    BLANK_IMAGE = "white.png"
    # The K-medoids module sizes its initial centres from this same file
    # (see its lendata()), so the GUI has to cluster exactly this data.
    KMEDOIDS_DATA_FILE = "a_data_set.txt"
    # Chinese numerals used to label clusters 1..10 in the result box.
    _ORDINALS = '一二三四五六七八九十'

    def __init__(self):
        """Create all widgets and run the Tk main loop."""
        # Main window
        self.window = tk.Tk()
        self.window.title('數據聚類系統')
        self.window.geometry('1150x580')

        # "Load data set" button
        self.botton1 = tk.Button(self.window, text='加載數據集', bg='green', fg='white',
                                 font=('楷體', 12, 'bold'), width=12, height=1,
                                 command=self.openfile)
        self.botton1.place(x=60, y=60)

        # Caption labels.  Creation and place() are separate statements so the
        # attributes hold the widgets (place() itself returns None).
        self.label2 = tk.Label(self.window, text='簇個數', bg='light blue', fg='white',
                               font=('楷體', 16, 'bold'), width=10, height=1)
        self.label2.place(x=10, y=160)
        self.label4 = tk.Label(self.window, text='導入文件內容如下',
                               font=('楷體', 16, 'bold'), width=16, height=1)
        self.label4.place(x=280, y=20)
        self.label5 = tk.Label(self.window, text='聚類實現',
                               font=('楷體', 16, 'bold'), width=20, height=1)
        self.label5.place(x=255, y=290)
        self.label6 = tk.Label(self.window, text='聚類可視化',
                               font=('楷體', 16, 'bold'), width=20, height=1)
        self.label6.place(x=700, y=20)

        # Box showing the content of the loaded file (a click clears the page)
        self.text1 = scrolledtext.ScrolledText(self.window, height=10, width=30,
                                               font=('楷體', 12))
        self.text1.place(x=250, y=60)
        self.text1.bind("<Button-1>", self.clear)

        # Box showing the clustering result (a click clears the page)
        self.text2 = scrolledtext.ScrolledText(self.window, height=10, width=30,
                                               font=('楷體', 12))
        self.text2.place(x=250, y=330)
        self.text2.bind("<Button-1>", self.clear)

        # Entry showing the path of the loaded file
        self.var0 = tk.StringVar()
        self.entry1 = tk.Entry(self.window, show=None, width='25',
                               font=('Arial', 10), textvariable=self.var0)
        self.entry1.place(x=10, y=100)

        # Entry for the number of clusters, pre-set to 2
        self.var1 = tk.StringVar()
        self.var1.set('2')
        self.entry2 = tk.Entry(self.window, show=None, width='3',
                               font=('Arial', 16), textvariable=self.var1)
        self.entry2.place(x=180, y=160)

        # Algorithm selection; picking a radio button runs that algorithm
        self.btnlist = tk.IntVar()
        self.radiobtn1 = tk.Radiobutton(self.window, variable=self.btnlist, value=0,
                                        text='K-means聚類算法', font=('bold'),
                                        command=self.runkmeans)
        self.radiobtn1.place(x=30, y=240)
        self.radiobtn2 = tk.Radiobutton(self.window, variable=self.btnlist, value=1,
                                        text='K-中心點聚類算法', font=('bold'),
                                        command=self.runkmid)
        self.radiobtn2.place(x=30, y=300)
        self.btnlist.set(0)

        # "Clear" and "Quit" buttons.  They use command= instead of a
        # <Button-1> binding so the window is not destroyed while Tk is still
        # delivering the press event to the button.
        self.btn2 = tk.Button(self.window, bg='green', fg='white', text='清屏',
                              font=('楷體', 12, 'bold'), width=6, height=1,
                              command=self.clear)
        self.btn2.place(x=80, y=380)
        self.btn3 = tk.Button(self.window, bg='green', fg='white', text='退出',
                              font=('楷體', 12, 'bold'), width=6, height=1,
                              command=self.close)
        self.btn3.place(x=80, y=450)

        # One label is created once and re-used for every picture.
        self.pilImage = None
        self.tkImage = None
        self.label = tk.Label(self.window)
        self.label.place(x=600, y=60)
        self._show_image(self.BLANK_IMAGE)

        # Enter the main loop
        self.window.mainloop()

    def _show_image(self, path):
        """Load *path*, scale it to IMAGE_SIZE and show it in the picture label."""
        with Image.open(path) as picture:
            resized = picture.resize(self.IMAGE_SIZE)
        self.pilImage = resized
        # Keep a reference on self: Tk does not own the PhotoImage, and the
        # picture disappears as soon as the Python object is collected.
        self.tkImage = ImageTk.PhotoImage(image=resized)
        self.label.configure(image=self.tkImage)
        self.window.update_idletasks()

    def _cluster_label(self, number):
        """Return the caption for cluster *number* (1-based), e.g. '第一類:'."""
        if number <= len(self._ORDINALS):
            ordinal = self._ORDINALS[number - 1]
        else:
            ordinal = str(number)
        return '第{}類:'.format(ordinal)

    def _read_cluster_count(self):
        """Return the validated cluster count, or None after showing an error."""
        try:
            return self.getCNUM()
        except ValueError as err:
            messagebox.showerror('錯誤', str(err))
            return None

    def clear(self, event=None):
        """Empty both text boxes and show the blank picture again.

        *event* is accepted so the method works both as a ``bind`` handler and
        as a button ``command``.
        """
        self.text1.delete("1.0", tk.END)
        self.text2.delete("1.0", tk.END)
        self._show_image(self.BLANK_IMAGE)

    def close(self, event=None):
        """Ask for confirmation and close the window when the user agrees."""
        if messagebox.askokcancel('詢問', '確定退出系統嗎?'):
            # destroy() ends mainloop(); the interpreter itself is left alone
            # so an IDE console hosting the program keeps running.
            self.window.destroy()

    def __del__(self):
        """Restore the standard output streams."""
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__

    def getCNUM(self):
        """Return the number of clusters typed by the user.

        Raises:
            ValueError: if the entry is not a positive integer.
        """
        text = self.var1.get().strip()
        try:
            count = int(text)
        except ValueError:
            raise ValueError('簇個數必須是正整數')
        if count < 1:
            raise ValueError('簇個數必須是正整數')
        return count

    def openfile(self):
        """Let the user pick a .txt file and show its path in the entry."""
        nameFile = filedialog.askopenfilename(title='打開文件',
                                              filetypes=[('txt', '*.txt')])
        # An empty result means the dialog was cancelled; keep the old path.
        if nameFile:
            # Replace (not append to) whatever path was shown before.
            self.var0.set(nameFile)

    def getnamefile(self):
        """Return the file path currently shown in the entry."""
        return self.var0.get()

    def loadDataSet1(self):
        """Load the selected tab-separated file for K-means and display it.

        Returns:
            A 2-D numpy array (``ndmin=2`` keeps a one-row file 2-D as well).
        """
        nameFile = self.getnamefile()
        data = np.loadtxt(nameFile, delimiter='\t', ndmin=2)
        self.text1.insert("0.0", str(data))
        return data

    def loadDataSet2(self):
        """Return the shuffled integers 0..99 and 1000..1099 as a list."""
        data = list(range(100)) + list(range(1000, 1100))
        random.shuffle(data)
        return data

    def runkmeans(self):
        """Run K-means on the loaded file and show centroids, clusters and plot."""
        k = self._read_cluster_count()
        if k is None:
            return
        if not self.getnamefile():
            messagebox.showwarning('提示', '請先加載數據集')
            return
        try:
            dataSet = self.loadDataSet1()
        except (OSError, ValueError) as err:
            messagebox.showerror('錯誤', '無法讀取數據文件:' + str(err))
            return
        if k > len(dataSet):
            messagebox.showerror('錯誤', '簇個數不能大於樣本數')
            return

        # The kmeans module draws on pyplot's current figure; start from an
        # empty one so earlier runs do not bleed into this picture.
        plt.clf()
        centroids, clusterAssment = kmeans.KMeans(dataSet, k)
        clusters = kmeans.showCluster(dataSet, k, centroids, clusterAssment)
        if not isinstance(clusters, tuple):
            # showCluster signals "not 2-D" / "k too large" by returning 1.
            self.text2.insert('insert', '無法顯示聚類結果(數據須為二維,且簇個數不超過10)\n')
            return

        lines = ['質心:', str(centroids)]
        for number, members in enumerate(clusters, start=1):
            lines.append('c{}: {}'.format(number, ', '.join(str(v) for v in members)))
        self.text2.insert('insert', '\n'.join(lines) + '\n')
        self._show_image("kpic.png")

    def runkmid(self):
        """Run K-medoids on KMEDOIDS_DATA_FILE and show centres, clusters and plot."""
        k = self._read_cluster_count()
        if k is None:
            return
        data = k2d.im_txt(self.KMEDOIDS_DATA_FILE)
        self.text1.insert("0.0", str(data))
        # 2-D embedding used only for drawing; clustering runs on the raw data.
        data_TSNE = TSNE(learning_rate=100, n_iter=5000).fit_transform(data)

        centers, result_clusters = k2d.KMedoids(k, data, 10)
        self.text2.insert('insert', '簇中心:\n' + str(centers) + '\n')

        palette = k2d.randomcolor(k)
        colors = [palette[cluster] for cluster in result_clusters]
        plt.clf()
        plt.scatter(data_TSNE[:, 0], data_TSNE[:, 1], s=10, c=colors)
        plt.title('K-medoids Result of k={}'.format(k))
        plt.savefig("kpic1.png")

        # One output line per cluster, for any k (not only the first four).
        members = [[] for _ in range(k)]
        for index, cluster in enumerate(result_clusters):
            members[cluster].append(str(data[index]))
        lines = [self._cluster_label(number) + ','.join(rows)
                 for number, rows in enumerate(members, start=1)]
        self.text2.insert('insert', '\n'.join(lines) + '\n')
        self._show_image("kpic1.png")


if __name__ == '__main__':
    GUI()
2)導入的kmeans.py
import numpy as np
import matplotlib.pyplot as plt


def loadDataSet(fileName):
    """Load a tab-separated numeric text file.

    Returns:
        A 2-D float array; ``ndmin=2`` keeps a one-row file 2-D so callers can
        always unpack ``rows, cols = data.shape``.
    """
    return np.loadtxt(fileName, delimiter='\t', ndmin=2)


def distEclud(x, y):
    """Return the Euclidean distance between the vectors *x* and *y*."""
    return np.sqrt(np.sum((x - y) ** 2))


def randCent(dataSet, k):
    """Return the initial centroids: a float copy of the first *k* rows.

    Despite the name the choice is deterministic, so repeated calls on the
    same data give the same seeds.  The centroids are also printed.

    Raises:
        ValueError: if *k* is not between 1 and the number of rows.
    """
    m, n = dataSet.shape
    if not 1 <= k <= m:
        raise ValueError("k must be between 1 and the number of samples (%d)" % m)
    centroids = np.array(dataSet[:k, :], dtype=float)
    print('質心:')
    print(centroids, end=" ")
    return centroids


def KMeans(dataSet, k):
    """Cluster the rows of *dataSet* into *k* groups with Lloyd's algorithm.

    Returns:
        ``(centroids, clusterAssment)`` where ``centroids`` is a ``(k, n)``
        array and ``clusterAssment`` is an ``(m, 2)`` matrix whose first column
        is the cluster index of each sample and whose second column is its
        squared distance to that cluster's centroid.
    """
    points = np.asarray(dataSet, dtype=float)
    m = points.shape[0]
    centroids = randCent(dataSet, k)

    # -1 is not a valid cluster, so the first pass always counts as a change
    # (seeding with 0 would hide every sample that belongs to cluster 0).
    labels = np.full(m, -1, dtype=int)
    sq_errors = np.zeros(m)
    clusterChange = True
    while clusterChange:
        # (m, k) table of distances from every sample to every centroid.
        offsets = points[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        distances = np.sqrt(np.sum(offsets ** 2, axis=2))
        nearest = np.argmin(distances, axis=1)

        clusterChange = bool(np.any(nearest != labels))
        labels = nearest
        # Refresh the error of every sample, not only of those that moved:
        # the centroids themselves shift between passes.
        sq_errors = distances[np.arange(m), nearest] ** 2

        for j in range(k):
            members = points[labels == j]
            # An empty cluster keeps its old centroid instead of becoming NaN.
            if len(members):
                centroids[j, :] = members.mean(axis=0)

    # np.asmatrix keeps the historical matrix return type; the np.mat alias
    # used before no longer exists in NumPy 2.
    clusterAssment = np.asmatrix(np.column_stack((labels, sq_errors)))
    return centroids, clusterAssment


def showCluster(dataSet, k, centroids, clusterAssment):
    """Plot the clustered 2-D samples and centroids and save them to kpic.png.

    The current pyplot figure is cleared first so repeated calls do not draw
    on top of each other.

    Returns:
        A 4-tuple of lists holding the first coordinate of every sample in
        clusters 0..3 (clusters beyond the fourth are plotted but not
        returned), or the integer ``1`` when the data is not two-dimensional
        or *k* exceeds the number of available marker styles.
    """
    m, n = dataSet.shape
    if n != 2:
        print("數據不是二維的")
        return 1
    sample_marks = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(sample_marks):
        print("k值太大了")
        return 1

    plt.clf()
    first_four = ([], [], [], [])
    # Single pass: draw each sample once and collect it for the return value.
    for i in range(m):
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], sample_marks[markIndex])
        if 0 <= markIndex < len(first_four):
            first_four[markIndex].append(dataSet[i, 0])

    print("")
    for number, members in enumerate(first_four, start=1):
        print("c%d:" % number, members)

    # Centroids use diamond / blue variants of the markers.
    centroid_marks = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], centroid_marks[i])
    plt.savefig("kpic.png")
    return first_four


# 3)導入的k_medoid.py
# NOTE: kmedgui.py imports this module as ``k_medoids_2d``; save it under that
# file name (k_medoids_2d.py) so the import resolves.
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import random


def im_txt(file):
    """Read a whitespace-separated numeric text file as a float32 array."""
    return np.loadtxt(file, dtype=np.float32)


def initianlize_centers(n_clusters, n_data=None):
    """Pick *n_clusters* distinct random sample indices as initial medoids.

    Args:
        n_clusters: number of medoids wanted.
        n_data: number of samples to choose from.  When omitted it is read
            from the default data file via ``lendata()`` (historical
            behaviour).

    Raises:
        ValueError: if more medoids are requested than samples exist.
    """
    if n_data is None:
        n_data = lendata()
    if n_clusters > n_data:
        raise ValueError("cannot pick %d distinct centres from %d samples"
                         % (n_clusters, n_data))
    # random.sample draws without replacement, so the centres are distinct.
    return random.sample(range(n_data), max(n_clusters, 0))


def clus_process(centers, data):
    """Assign every sample to its nearest medoid.

    Args:
        centers: indices into *data* of the current medoids.
        data: the samples.

    Returns:
        A list with, for each sample, the position in *centers* of the closest
        medoid (ties go to the earliest medoid).
    """
    medoid_rows = [data[index] for index in np.array(centers)]
    result_clusters = []
    for sample in data:
        distances = [np.sqrt(np.sum(np.square(sample - medoid)))
                     for medoid in medoid_rows]
        result_clusters.append(distances.index(min(distances)))
    return result_clusters


def chose_centers(result_clusters, n_clusters):
    """Draw a random member of every cluster as its candidate new medoid.

    A cluster without members (possible when two medoids are identical
    points) receives a random sample that is not already a medoid, so no
    stale or undefined centre is ever reused.

    Raises:
        ValueError: if there are not enough samples to give every cluster a
            centre.
    """
    members = [[] for _ in range(n_clusters)]
    for position, cluster in enumerate(result_clusters):
        if 0 <= cluster < n_clusters:
            members[cluster].append(position)

    centers = [random.choice(group) if group else None for group in members]
    for slot, center in enumerate(centers):
        if center is None:
            taken = set(c for c in centers if c is not None)
            spare = [p for p in range(len(result_clusters)) if p not in taken]
            if not spare:
                raise ValueError("not enough samples to give every cluster a centre")
            centers[slot] = random.choice(spare)
    return centers


def count_E(centers_new, data, result_clusters_new):
    """Return the cost: summed distance of every sample to its own medoid."""
    E = 0
    for position, cluster in enumerate(result_clusters_new):
        if 0 <= cluster < len(centers_new):
            medoid = data[centers_new[cluster]]
            E += float(np.sqrt(np.sum(np.square(data[position] - medoid))))
    return E


def KMedoids(n_clusters, data, max_iter):
    """Randomised K-medoids search.

    Starting from random medoids, repeatedly proposes a random member of each
    cluster as the new medoid and keeps the proposal whenever it lowers the
    cost.  The search stops after more than *max_iter* consecutive proposals
    without improvement.  Each accepted improvement is printed.

    Returns:
        ``(centers, result_clusters)``: the medoid indices into *data* and, for
        each sample, the position of its medoid within ``centers``.

    Raises:
        ValueError: if *n_clusters* is not between 1 and ``len(data)``.
    """
    if not 1 <= n_clusters <= len(data):
        raise ValueError("n_clusters must be between 1 and the number of samples")

    # Size the initial draw from the data actually being clustered.
    centers = initianlize_centers(n_clusters, len(data))
    result_clusters = clus_process(centers, data)
    # Start from the real cost of the initial solution rather than a magic
    # upper bound that large data sets could exceed.
    E = count_E(centers, data, result_clusters)

    xie = 0  # proposals since the last improvement
    while xie <= max_iter:
        centers_new = chose_centers(result_clusters, n_clusters)
        # Re-cluster around the PROPOSED centres so cost and assignment match.
        result_clusters_new = clus_process(centers_new, data)
        E_new = count_E(centers_new, data, result_clusters_new)
        if E_new < E:
            centers = centers_new
            result_clusters = result_clusters_new
            E = E_new
            print("價值函數為:" + str(E) + "\n")
            print("聚類中心:" + str(centers) + "\n")
            xie = 0
        xie = xie + 1
    return centers, result_clusters


def randomcolor(x):
    """Return *x* distinct random colours as '#RRGGBB' strings.

    Every hex digit is drawn from 1, 7, A, F, so at most 4**6 different
    colours exist.

    Raises:
        ValueError: if more colours are requested than can exist.
    """
    digits = ['1', '7', 'A', 'F']
    if x > len(digits) ** 6:
        raise ValueError("at most %d distinct colours are available" % len(digits) ** 6)
    colors = []
    while len(colors) < x:
        color = "#" + "".join(random.choice(digits) for _ in range(6))
        if color not in colors:
            colors.append(color)
    return colors


def lendata(file="a_data_set.txt"):
    """Return the number of samples stored in *file*."""
    return len(im_txt(file))