基於python3的可視化數據聚類系統(k-means算法和k-中心點算法)


1、用戶界面

1)點擊讀取文件按鈕,讀取到的文件如下圖所示:

 

數據聚類系統讀取文件

 

數據聚類系統導入文件

2)設置簇的個數,這里設置成2,並選擇K-means聚類算法,顯示的結果如下圖:

 

數據聚類系統運行K-means聚類算法

3)設置簇的個數,這里設置成2,並選擇K-中心點聚類算法,顯示的結果如下圖:

數據聚類系統運行K-中心點聚類算法

4)清屏,顯示的結果如下圖:

 

數據聚類系統清屏

2、實驗源碼

開發環境為Spyder,所用語言及版本為python3.7,GUI環境為tkinter;另需安裝numpy、matplotlib、Pillow和scikit-learn。

1)主運行界面 kmedgui.py

# -*- coding: utf-8 -*-
import sys
import random
import kmeans
import k_medoids_2d as k2d
import numpy as np
import tkinter as tk 
from tkinter import filedialog
from tkinter import scrolledtext
from PIL import Image,ImageTk
import matplotlib.pyplot as plt
import sklearn

class GUI(object):
    """Tkinter front end for the clustering demo.

    The window lets the user pick a tab-separated text file, choose the
    number of clusters, run either K-means (``kmeans`` module) or K-medoids
    (``k_medoids_2d`` module) and shows the textual result together with the
    picture the algorithm saved to disk.

    Constructing an instance builds the window and enters the Tk main loop,
    so ``GUI()`` only returns once the window has been closed.
    """

    # Size (width, height) every picture is scaled to before it is displayed.
    IMAGE_SIZE = (500, 480)

    def __init__(self):
        """Build all widgets, show the blank picture and start the main loop."""
        # Main window.
        self.window=tk.Tk()
        self.window.title('數據聚類系統')
        self.window.geometry('1150x580')
        # "Load data set" button.
        self.botton1=tk.Button(self.window, text='加載數據集',bg='green',fg='white',  font=('楷體', 12, 'bold'), width=12, height=1,command=self.openfile)
        self.botton1.place(x=60,y=60)
        # Caption for the cluster-count entry.  The widget is created and
        # placed in two steps because place() returns None; chaining the
        # calls would store None in the attribute.
        self.label2=tk.Label(self.window, text='簇個數',bg='light blue',fg='white', font=('楷體', 16, 'bold'), width=10, height=1)
        self.label2.place(x=10,y=160)
        # Caption for the box that shows the loaded file content.
        self.label4=tk.Label(self.window, text='導入文件內容如下',font=('楷體', 16, 'bold'), width=16, height=1)
        self.label4.place(x=280,y=20)
        # Box that shows the loaded data.
        # NOTE(review): a left click inside either text box triggers clear();
        # kept as in the original, confirm that this is intended.
        self.text1=scrolledtext.ScrolledText(self.window, height=10, width=30,font=('楷體', 12))
        self.text1.place(x=250,y=60)
        self.text1.bind("<Button-1>",self.clear)
        # Captions for the clustering result box and the picture area.
        self.label5=tk.Label(self.window, text='聚類實現',font=('楷體', 16, 'bold'), width=20, height=1)
        self.label5.place(x=255,y=290)
        self.label6=tk.Label(self.window, text='聚類可視化',font=('楷體', 16, 'bold'), width=20, height=1)
        self.label6.place(x=700,y=20)
        # Box that shows the clustering result.
        self.text2=scrolledtext.ScrolledText(self.window, height=10, width=30,font=('楷體', 12))
        self.text2.place(x=250,y=330)
        self.text2.bind("<Button-1>",self.clear)
        # Entry showing the path of the selected file.
        self.var0=tk.StringVar()
        self.entry1=tk.Entry(self.window, show=None, width='25', font=('Arial', 10), textvariable=self.var0)
        self.entry1.place(x=10,y=100)
        # Entry for the number of clusters, preset to 2.
        self.var1=tk.StringVar()
        self.var1.set('2')
        self.entry2=tk.Entry(self.window, show=None, width='3', font=('Arial', 16), textvariable=self.var1)
        self.entry2.place(x=180,y=160)
        # Algorithm selection; choosing a radio button runs that algorithm.
        self.btnlist=tk.IntVar()
        self.radiobtn1=tk.Radiobutton(self.window, variable=self.btnlist, value=0, text='K-means聚類算法', font=('bold'),command=self.runkmeans)
        self.radiobtn1.place(x=30,y=240)
        self.radiobtn2=tk.Radiobutton(self.window, variable=self.btnlist, value=1,text='K-中心點聚類算法', font=('bold'), command=self.runkmid)
        self.radiobtn2.place(x=30,y=300)
        self.btnlist.set(0)
        # "Clear" button.
        self.btn2=tk.Button(self.window, bg='green',fg='white', text='清屏', font=('楷體', 12,'bold'), width=6, height=1)
        self.btn2.place(x=80,y=380)
        self.btn2.bind("<Button-1>",self.clear)
        # "Quit" button.
        self.btn3=tk.Button(self.window, bg='green',fg='white', text='退出', font=('楷體', 12,'bold'), width=6, height=1)
        self.btn3.place(x=80,y=450)
        self.btn3.bind("<Button-1>",self.close)
        # A single picture label is created once and only re-configured
        # afterwards, instead of stacking a new label on every redraw.
        self.label = tk.Label(self.window)
        self.label.place(x=600,y=60)
        self._show_image("white.png")
        # Enter the Tk event loop.
        self.window.mainloop()

    def _show_image(self, path):
        """Load the picture at *path*, scale it and display it in the picture label."""
        with Image.open(path) as picture:
            resized = picture.resize(self.IMAGE_SIZE)
        self.pilImage = resized
        # The PhotoImage has to stay referenced from the instance, and it is
        # the PhotoImage (not the PIL image) that Tk must be given.
        self.tkImage = ImageTk.PhotoImage(image=resized)
        self.label.configure(image=self.tkImage)
        self.window.update_idletasks()

    def clear(self,event=None):
        """Empty both text boxes and show the blank picture again.

        *event* is the Tk event of the binding and is not used.
        """
        self.text1.delete("1.0",tk.END)
        self.text2.delete("1.0",tk.END)
        self._show_image("white.png")

    def close(self,event=None):
        """Ask for confirmation and close the window when the user agrees.

        *event* is the Tk event of the binding and is not used.
        """
        # ``import tkinter`` does not load the messagebox submodule, so it is
        # imported explicitly here.
        from tkinter import messagebox
        if messagebox.askokcancel('詢問','確定退出系統嗎?'):
            # Destroying the window ends mainloop(), which lets GUI() return.
            self.window.destroy()

    def __del__(self):
        """Point sys.stdout and sys.stderr back at the interpreter defaults."""
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__

    def getCNUM(self):
        """Return the cluster count typed into the entry as an int.

        Raises ValueError when the entry does not contain an integer.
        """
        return int(self.var1.get())

    def openfile(self):
        """Let the user pick a .txt file and show its path in the path entry."""
        nameFile = filedialog.askopenfilename(title='打開文件', filetypes=[('txt', '*.txt')])
        # A cancelled dialog returns an empty value; keep the previous path
        # then.  Setting the variable replaces the old path instead of
        # appending the new one to it.
        if nameFile:
            self.var0.set(nameFile)

    def getnamefile(self):
        """Return the path currently shown in the path entry."""
        return self.var0.get()

    def loadDataSet1(self):
        """Load the selected tab-separated file, show it in the first text box and return it."""
        data = np.loadtxt(self.getnamefile(),delimiter='\t')
        # Replace the previous content instead of prepending to it.
        self.text1.delete("1.0",tk.END)
        self.text1.insert("1.0",str(data))
        return data

    def loadDataSet2(self):
        """Return the integers 0..99 and 1000..1099 as one shuffled list."""
        data = list(range(100)) + list(range(1000, 1100))
        random.shuffle(data)
        return data

    def runkmeans(self):
        """Run K-means on the selected file and show centroids, clusters and picture."""
        dataSet = self.loadDataSet1()
        k = self.getCNUM()
        centroids,clusterAssment = kmeans.KMeans(dataSet,k)
        # Show the centroids the algorithm converged to.
        self.text2.insert('insert',centroids)
        # Start from an empty figure so earlier runs do not leak into the
        # saved picture, and plot only once.
        plt.clf()
        clusters = kmeans.showCluster(dataSet,k,centroids,clusterAssment)
        if not isinstance(clusters, tuple):
            # showCluster returns 1 when it refuses to plot (it prints the
            # reason itself); there is nothing to display in that case.
            return
        for position, members in enumerate(clusters):
            if position:
                self.text2.insert('insert','\n')
            self.text2.insert('insert',members)
        self._show_image("kpic.png")

    def runkmid(self):
        """Run K-medoids on a_data_set.txt and show medoids, clusters and picture."""
        # Imported here so the submodule is loaded regardless of what other
        # modules happened to import before.
        from sklearn.manifold import TSNE
        data=k2d.im_txt("a_data_set.txt")
        self.text1.delete("1.0",tk.END)
        self.text1.insert("1.0",str(data))
        # Two-dimensional embedding that is used for the scatter plot only.
        data_TSNE = TSNE(learning_rate=100,n_iter=5000).fit_transform(data)
        k=self.getCNUM()
        self.text2.insert('insert','簇中心:\n')
        centers,result_clusters = k2d.KMedoids(k,data,10)
        self.text2.insert('insert',centers)
        self.text2.insert('insert','\n')
        palette=k2d.randomcolor(k)
        point_colors = [palette[cluster] for cluster in result_clusters]
        plt.clf()
        plt.scatter(data_TSNE[:,0],data_TSNE[:,1],s=10,c=point_colors)
        plt.title('K-medoids Result of {}'.format(k))
        plt.savefig("kpic1.png")
        # One line per cluster; only the first four clusters are listed.
        captions = ["第一類:", "第二類:", "第三類:", "第四類:"]
        members = [[] for _ in captions]
        for position, cluster in enumerate(result_clusters):
            if 0 <= cluster < len(captions):
                members[cluster].append(str(data[position])+",")
        lines = [caption + "".join(parts) for caption, parts in zip(captions, members)]
        self.text2.insert('insert',"\n".join(lines))
        self._show_image("kpic1.png")
           
if __name__ == "__main__":
    # Constructing the GUI builds the window and runs the Tk main loop.
    GUI()

  

2)導入的kmeans.py

import numpy as np
import matplotlib.pyplot as plt
 
# 加載數據
def loadDataSet(fileName):
    """Read the tab-separated numeric text file *fileName* and return it as an array."""
    return np.loadtxt(fileName, delimiter='\t')
 
# 歐氏距離計算
def distEclud(x,y):
    """Return the Euclidean distance between the vectors *x* and *y*."""
    difference = x - y
    squared_sum = np.sum(difference * difference)
    return np.sqrt(squared_sum)
 
# 為給定數據集構建一個包含K個隨機質心的集合 
def randCent(dataSet, k):
    """Return the first *k* rows of *dataSet* as the initial centroids.

    Despite the name nothing is drawn at random: row ``i`` of the result is a
    copy of row ``i`` of *dataSet*.  The centroids are also printed.
    """
    column_count = dataSet.shape[1]
    centroids = np.zeros((k, column_count))
    for row in range(k):
        centroids[row] = dataSet[row]
    print('質心:')
    print(centroids, end=" ")
    return centroids
# k均值聚類
def KMeans(dataSet,k):
    """Cluster the rows of *dataSet* into *k* groups with k-means.

    Returns ``(centroids, clusterAssment)``.  ``centroids`` is the ``(k, n)``
    array of cluster centres.  ``clusterAssment`` is an ``(m, 2)`` matrix
    whose first column holds the cluster index of every sample and whose
    second column holds the squared distance of the sample to that centre.
    """
    m = np.shape(dataSet)[0]  # number of samples (rows)
    # np.asmatrix is used instead of np.mat, which NumPy 2.0 removed; the
    # result is still a matrix, so callers see the same type as before.
    clusterAssment = np.asmatrix(np.zeros((m,2)))
    clusterChange = True

    # Step 1: initial centroids.
    centroids = randCent(dataSet,k)
    while clusterChange:
        clusterChange = False

        for i in range(m):
            # Start from infinity so that samples far away from every
            # centroid are still assigned to one.
            minDist = np.inf
            minIndex = -1

            # Step 2: find the closest centroid.
            for j in range(k):
                distance = distEclud(centroids[j,:],dataSet[i,:])
                if distance < minDist:
                    minDist = distance
                    minIndex = j
            # Step 3: note whether the assignment changed, and always store
            # the current distance because the centroids move between
            # passes even when the label stays the same.
            if clusterAssment[i,0] != minIndex:
                clusterChange = True
            clusterAssment[i,:] = minIndex,minDist**2
        # Step 4: move every centroid to the mean of its members.
        for j in range(k):
            pointsInCluster = dataSet[np.nonzero(clusterAssment[:,0].A == j)[0]]
            # A cluster without members keeps its centroid; the mean of an
            # empty selection would turn the centroid into NaN.
            if len(pointsInCluster) > 0:
                centroids[j,:] = np.mean(pointsInCluster,axis=0)
    return centroids,clusterAssment
 
def showCluster(dataSet,k,centroids,clusterAssment):
    """Plot a two-dimensional clustering and save it as ``kpic.png``.

    Every sample is drawn with the marker of its cluster (read from the first
    column of *clusterAssment*) and every centroid with a diamond or
    alternative marker.  The function also prints and returns, for the first
    four clusters, the first coordinate of every member.

    Returns the tuple ``(clu_1, clu_2, clu_3, clu_4)``, or ``1`` when the data
    is not two-dimensional or *k* exceeds the number of available markers.
    """
    m,n = dataSet.shape
    if n != 2:
        print("數據不是二維的")
        return 1

    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        print("k值太大了")
        return 1

    clu_1 = []
    clu_2 = []
    clu_3 = []
    clu_4 = []
    listed = (clu_1, clu_2, clu_3, clu_4)
    # Draw every sample exactly once and collect the members of the first
    # four clusters on the way.
    for i in range(m):
        markIndex = int(clusterAssment[i, 0])
        if 0 <= markIndex < len(listed):
            listed[markIndex].append(dataSet[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])
    print("")
    print("c1:",clu_1)
    print("c2:",clu_2)
    print("c3:",clu_3)
    print("c4:",clu_4)
    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    # Draw the centroids.
    for i in range(k):
        plt.plot(centroids[i,0],centroids[i,1],mark[i])
    plt.savefig("kpic.png")
    return clu_1,clu_2,clu_3,clu_4

3)導入的k_medoids_2d.py

import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import random

def im_txt(file):
    """Read the whitespace-separated numeric text file *file* as a float32 array."""
    data=np.loadtxt(file,dtype=np.float32)
    return data

def initianlize_centers(n_clusters,n_data=None):
    """Pick *n_clusters* distinct random sample positions as initial centres.

    *n_data* is the number of samples to choose from; when it is omitted the
    count reported by ``lendata()`` is used, as before.  Returns a list of
    distinct indices in ``range(n_data)``, e.g. ``[101, 205, 5, 3, 7]``.

    Raises ValueError when more centres are requested than samples exist
    (the previous retry loop never terminated in that case).
    """
    if n_data is None:
        n_data=lendata()
    if n_clusters>n_data:
        raise ValueError(
            "cannot pick %d distinct centres from %d samples" % (n_clusters, n_data))
    return random.sample(range(n_data),n_clusters)

def clus_process(centers,data):
    """Assign every sample to its nearest centre.

    *centers* holds the positions of the centre samples inside *data*.  The
    result has one entry per sample: the position, within *centers*, of the
    centre with the smallest Euclidean distance (the first one on ties).
    """
    medoid_rows=np.array(centers)
    labels=[]
    for sample in range(0,len(data)):
        # Distance from this sample to each centre, in centre order.
        gaps=[np.sqrt(np.sum(np.square(data[sample]-data[row]))) for row in medoid_rows]
        labels.append(gaps.index(min(gaps)))
    return labels

def chose_centers(result_clusters,n_clusters):
    """Draw one random member of every cluster as its new centre.

    *result_clusters* holds the cluster index of every sample.  The result is
    a list with one sample position per cluster, in cluster order.

    A cluster without members gets a random sample that is not a centre yet,
    so the returned positions are always distinct.  Raises ValueError when no
    unused sample is left for an empty cluster.
    """
    # Positions of the samples that belong to each cluster.
    members=[[] for _ in range(n_clusters)]
    for position,label in enumerate(result_clusters):
        if 0<=label<n_clusters:
            members[label].append(position)

    # Populated clusters are served first so that the replacement drawn for
    # an empty cluster can never collide with one of their centres.
    centers=[None]*n_clusters
    for cluster,candidates in enumerate(members):
        if candidates:
            centers[cluster]=random.choice(candidates)

    taken={position for position in centers if position is not None}
    for cluster in range(n_clusters):
        if centers[cluster] is not None:
            continue
        print("sample bug")
        print(members[cluster])
        spare=[p for p in range(len(result_clusters)) if p not in taken]
        if not spare:
            raise ValueError("no unused sample left for empty cluster %d" % cluster)
        centers[cluster]=random.choice(spare)
        taken.add(centers[cluster])

    return centers

def count_E(centers_new,data,result_clusters_new):
    """Return the clustering cost.

    The cost is the sum, over all clusters in order, of the Euclidean
    distances between every member of the cluster and the cluster's centre
    sample.  *centers_new* holds the centre positions inside *data* and
    *result_clusters_new* the cluster index of every sample.
    """
    total=0
    for cluster_id,medoid in enumerate(centers_new):
        for sample in range(0,len(data)):
            if result_clusters_new[sample]!=cluster_id:
                continue
            total+=np.sqrt(np.sum(np.square(data[sample]-data[medoid])))
    return total

def KMedoids(n_clusters,data,max_iter):
    """Cluster *data* into *n_clusters* groups with a randomised k-medoids search.

    Starting from random centres, the function repeatedly draws a new random
    member of every cluster as candidate centre, re-clusters the data around
    the candidates and keeps them when the cost (see ``count_E``) decreases.
    The search stops after more than *max_iter* consecutive attempts without
    improvement.  Every improvement is printed.

    Returns ``(centers, result_clusters)``: the positions of the centre
    samples inside *data* and the cluster index of every sample.

    Raises ValueError when *n_clusters* exceeds the number of samples.
    """
    # Draw the initial centres from the data that was actually passed in, so
    # the positions are always valid for *data*.
    centers=random.sample(range(len(data)),n_clusters)
    result_clusters=clus_process(centers,data)
    # The cost of the starting point is the baseline; a fixed constant would
    # reject every candidate on data whose cost is larger than it.
    E=count_E(centers,data,result_clusters)
    xie=0  # attempts since the last improvement
    while xie<=max_iter:
        centers_new=chose_centers(result_clusters,n_clusters)  # candidate centres
        # Cluster around the candidates, not around the current centres.
        result_clusters_new=clus_process(centers_new,data)
        E_new=count_E(centers_new,data,result_clusters_new)
        # Keep the candidates only when they lower the cost.
        if E_new<E:
            centers=centers_new
            result_clusters=result_clusters_new
            E=E_new
            print("價值函數為:"+str(E)+"\n")
            print("聚類中心:"+str(centers)+"\n")
            xie=0
        xie=xie+1

    return centers,result_clusters


def randomcolor(x):
    """Return *x* distinct random colours as ``#RRGGBB`` strings.

    Every hexadecimal digit is drawn from ``1``, ``7``, ``A`` and ``F``; a
    colour that was produced before is discarded and drawn again.
    """
    digits_pool=['1','7','A','F']
    chosen=[]
    while len(chosen)<x:
        digits=[digits_pool[random.randint(0,3)] for _ in range(6)]
        candidate="#"+"".join(digits)
        if candidate not in chosen:
            chosen.append(candidate)
    return chosen

def lendata(file="a_data_set.txt"):
    """Return the number of samples (rows) stored in the data file *file*.

    *file* defaults to ``a_data_set.txt``, the file that was always used
    before the parameter existed.
    """
    data=im_txt(file)
    n_data=len(data)
    return n_data

  


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM