#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""K-Means clustering demo.

Reads a comma-separated data file whose first column is an integer label and
whose remaining columns are numeric features, rescales the features with
min-max scaling, draws an elbow plot of the mean distortion for k = 1..9,
then fits a 3-cluster K-Means model, prints clustering quality metrics and
shows a scatter plot of the clusters.
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import preprocessing
from scipy.spatial.distance import cdist
from sklearn import metrics


def _load_dataset(path):
    """Parse the data file at *path* into ``(features, labels)`` arrays.

    Each non-blank line must look like ``label,f1,f2,...`` where ``label`` is
    an integer and every ``f`` is a float.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a field cannot be parsed as a number.
    """
    features = []
    labels = []
    # ``with`` guarantees the file handle is closed even if parsing fails.
    with open(path) as data_file:
        for line in data_file:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing empty lines instead of crashing.
                continue
            fields = line.split(",")
            labels.append(int(fields[0]))
            features.append([float(item) for item in fields[1:]])
    return np.array(features), np.array(labels)


# Read the raw data: X holds the features, y_true the ground-truth labels.
X, y_true = _load_dataset('data/wina.data')

# Rescale every feature into [0, 1] so no single feature dominates the
# Euclidean distances used by K-Means.
min_max_scaler = preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(X)

# Elbow method: mean distance of each sample to its nearest centre, per k.
K = range(1, 10)
meandistortions = []
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    nearest_center_dist = np.min(
        cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)
    meandistortions.append(nearest_center_dist.sum() / X.shape[0])

plt.plot(K, meandistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('meandistortions')
plt.title('best K of the model')
plt.show()

# Final model with the chosen number of clusters.
n_clusters = 3
cls = KMeans(n_clusters=n_clusters).fit(X)
y_pre = cls.predict(X)

n_samples, n_features = X.shape  # total number of samples / features
# Sum of squared distances of samples to their closest cluster centre.
inertias = cls.inertia_
# Adjusted Rand index between true labels and predicted clusters.
adjusted_rand_s = metrics.adjusted_rand_score(y_true, y_pre)
# Homogeneity score.
homogeneity_s = metrics.homogeneity_score(y_true, y_pre)
# Mean silhouette coefficient.
silhouette_s = metrics.silhouette_score(X, y_pre, metric='euclidean')
print("蘭德指數ART", adjusted_rand_s)
print("同質化得分homo", homogeneity_s)
print("平均輪廓系數", silhouette_s)

centers = cls.cluster_centers_  # centre of each cluster
colors = ['#ff0000', '#00ff00', '#0000ff']  # one colour per cluster
plt.figure()
for i in range(n_clusters):
    # Samples assigned to cluster i.
    cluster = X[y_pre == i]
    # Plot feature 0 against feature 1. (Plotting feature 0 against itself
    # would put every point on the diagonal and hide the cluster structure.)
    plt.scatter(cluster[:, 0], cluster[:, 1], c=colors[i], marker='.')
    plt.plot(centers[i][0], centers[i][1], '*',
             markerfacecolor=colors[i], markeredgecolor='k', markersize=6)
plt.show()