class sklearn.cluster.KMeans(n_clusters=8, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm='auto')
1 Important parameter: n_clusters
n_clusters is the k in KMeans: it tells the model how many clusters we want. It is effectively the only parameter in KMeans that we must decide on ourselves. It defaults to 8 clusters, but the best number of clusters is usually smaller than that. In general, we do not know the true n_clusters before clustering begins, so we have to explore it.
1.1 Let's run a first clustering
When we get a dataset, if at all possible we want to plot it first and look at how the data are distributed, to give ourselves a reference for the n_clusters we feed into the clustering.
First, let's create a dataset ourselves. Since we create it, it comes with labels.
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Create a labeled dataset: 500 samples, 2 features, 4 centers
X, y = make_blobs(n_samples=500, n_features=2, centers=4, random_state=1)

fig, ax1 = plt.subplots(1)
ax1.scatter(X[:, 0], X[:, 1]
            ,marker='o'  # marker shape
            ,s=8         # marker size
            )
plt.show()

# What if we want to see the distribution of the points by their true labels?
color = ["red","pink","orange","gray"]
fig, ax1 = plt.subplots(1)
for i in range(4):
    ax1.scatter(X[y==i, 0], X[y==i, 1]
                ,marker='o'  # marker shape
                ,s=8         # marker size
                ,c=color[i]
                )
plt.show()
Based on this distribution, let's cluster with KMeans. First, take a guess: how many clusters does this data contain?
from sklearn.cluster import KMeans

n_clusters = 3
cluster = KMeans(n_clusters=n_clusters, random_state=0).fit(X)

y_pred = cluster.labels_  # labels assigned during fitting
y_pred

pre = cluster.fit_predict(X)  # fit and predict in one step
pre == y_pred                 # element by element, the labels are identical
# Fit on only the first 200 samples, then predict labels for all of X
cluster_smallsub = KMeans(n_clusters=n_clusters, random_state=0).fit(X[:200])
y_pred_ = cluster_smallsub.predict(X)
y_pred == y_pred_
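You will typically find that most, but not all, of these comparisons come back True: the centroids learned from the first 200 samples are close to, but not identical with, those learned from the full data. On very large datasets, fitting on a subset and predicting on the rest trades a little accuracy for a lot of speed.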
centroid = cluster.cluster_centers_  # coordinates of the centroids
centroid

centroid.shape

inertia = cluster.inertia_
inertia
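Inertia is the sum of squared distances between every sample and the centroid of the cluster it was assigned to; the smaller the inertia, the tighter the clusters.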
color = ["red","pink","orange","gray"] fig, ax1 = plt.subplots(1)
for i in range(n_clusters): ax1.scatter(X[y_pred==i, 0], X[y_pred==i, 1] ,marker='o' ,s=8 ,c=color[i] ) ax1.scatter(centroid[:,0],centroid[:,1] ,marker="x" ,s=15 ,c="black") plt.show()
n_clusters = 4
cluster_ = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
inertia_ = cluster_.inertia_
inertia_

n_clusters = 5
cluster_ = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
inertia_ = cluster_.inertia_
inertia_

n_clusters = 6
cluster_ = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
inertia_ = cluster_.inertia_
inertia_
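Notice that inertia keeps shrinking as n_clusters grows, so inertia alone cannot tell us the best k. As a minimal sketch (reusing the X created above), we can plot inertia against n_clusters and look for an "elbow" where the curve flattens out:

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Fit one model per candidate k and record its inertia
inertias = []
for k in range(2, 10):
    inertias.append(KMeans(n_clusters=k, random_state=0).fit(X).inertia_)

fig, ax1 = plt.subplots(1)
ax1.plot(range(2, 10), inertias, "o-")
ax1.set_xlabel("n_clusters")
ax1.set_ylabel("inertia")
plt.show()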
1.2 Model evaluation metrics for clustering
1.2.1 When the ground-truth labels are known
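When ground-truth labels happen to be available (rare in real clustering problems), sklearn.metrics offers measures such as the adjusted Rand index (adjusted_rand_score) and mutual-information-based scores (for example adjusted_mutual_info_score) that compare the predicted clusters against the true labels.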
1.2.2 When the ground-truth labels are unknown: the silhouette coefficient
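For a single sample, let a be the mean distance to the other samples in its own cluster and b the mean distance to the samples in the nearest other cluster. The silhouette coefficient is s = (b - a) / max(a, b), which lies in [-1, 1]: values close to 1 mean the sample sits well inside its cluster, values near 0 mean it lies on a cluster boundary, and negative values suggest it was assigned to the wrong cluster. The score for a whole clustering is the mean over all samples.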
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples

X
y_pred  # labels from the n_clusters=3 model above

silhouette_score(X, y_pred)            # average silhouette for 3 clusters
silhouette_score(X, cluster_.labels_)  # average silhouette for the last fit (n_clusters=6)
silhouette_samples(X, y_pred)          # one silhouette value per sample
# Note: in sklearn < 0.20 this function is spelled calinski_harabaz_score
from sklearn.metrics import calinski_harabasz_score

X
y_pred
calinski_harabasz_score(X, y_pred)
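The Calinski-Harabasz index (also called the variance ratio criterion) is the ratio of between-cluster dispersion to within-cluster dispersion, so higher values indicate denser, better-separated clusters.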
Although the Calinski-Harabasz index is unbounded, and it too reads deceptively high when clustering convex data, it has one huge advantage over the silhouette coefficient: it is very fast to compute. We have previously used the magic command %%timeit to time a statement; this time let's try another method and time the run with timestamps.
from time import time

t0 = time()
calinski_harabasz_score(X, y_pred)
time() - t0  # elapsed seconds

t0 = time()
silhouette_score(X, y_pred)
time() - t0

import datetime
# time() returns a timestamp (a float); it can be converted back into a readable date
datetime.datetime.fromtimestamp(t0).strftime("%Y-%m-%d %H:%M:%S")
As you can see, the Calinski-Harabasz index computes more than twice as fast as the silhouette coefficient. Considering how little data we used here, on a dataset of tens of thousands of samples the silhouette coefficient would slow our model down considerably.
1.3 Case study: choosing n_clusters with the silhouette coefficient
We usually draw the silhouette plot together with a scatter plot of the clustered data to choose the best n_clusters.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

n_clusters = 4

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)

# Silhouette values lie in [-1, 1]; restrict the x-axis to the useful range
ax1.set_xlim([-0.1, 1])
# Leave a 10-unit vertical gap between neighbouring clusters' silhouette bars
ax1.set_ylim([0, X.shape[0] + (n_clusters + 1) * 10])

clusterer = KMeans(n_clusters=n_clusters, random_state=10).fit(X)
cluster_labels = clusterer.labels_

silhouette_avg = silhouette_score(X, cluster_labels)
print("For n_clusters =", n_clusters,
      "The average silhouette_score is :", silhouette_avg)

sample_silhouette_values = silhouette_samples(X, cluster_labels)

y_lower = 10
for i in range(n_clusters):
    ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
    ith_cluster_silhouette_values.sort()
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    color = cm.nipy_spectral(float(i) / n_clusters)
    ax1.fill_betweenx(np.arange(y_lower, y_upper)
                      ,ith_cluster_silhouette_values
                      ,facecolor=color
                      ,alpha=0.7
                      )
    # Label each silhouette bar with its cluster number
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    y_lower = y_upper + 10

ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")
# Dashed vertical line marking the average silhouette score
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
ax1.set_yticks([])
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
ax2.scatter(X[:, 0], X[:, 1]
            ,marker='o'
            ,s=8
            ,c=colors
            )
centers = clusterer.cluster_centers_
# Mark the cluster centers with red crosses
ax2.scatter(centers[:, 0], centers[:, 1], marker='x',
            c="red", alpha=1, s=200)
ax2.set_title("The visualization of the clustered data.")
ax2.set_xlabel("Feature space for the 1st feature")
ax2.set_ylabel("Feature space for the 2nd feature")

plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
              "with n_clusters = %d" % n_clusters),
             fontsize=14, fontweight='bold')
plt.show()
Wrapping the process above in a loop, we get:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

for n_clusters in [2,3,4,5,6,7]:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    ax1.set_ylim([0, X.shape[0] + (n_clusters + 1) * 10])
    clusterer = KMeans(n_clusters=n_clusters, random_state=10).fit(X)
    cluster_labels = clusterer.labels_
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    y_lower = 10
    for i in range(n_clusters):
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper)
                          ,ith_cluster_silhouette_values
                          ,facecolor=color
                          ,alpha=0.7
                          )
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0], X[:, 1]
                ,marker='o'
                ,s=8
                ,c=colors
                )
    centers = clusterer.cluster_centers_
    # Mark the cluster centers with red crosses
    ax2.scatter(centers[:, 0], centers[:, 1], marker='x',
                c="red", alpha=1, s=200)
    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")
    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')
    plt.show()
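When reading these plots, prefer the n_clusters whose average silhouette score is highest, and also check that every cluster's bars reach past the red dashed average line and that the clusters are of roughly comparable width; a high average driven by one dominant cluster is a warning sign.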
2 Important parameters init & random_state & n_init: where should the initial centroids be placed?
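In KMeans, init controls how the initial centroids are placed: the default "k-means++" spreads the first centroids far apart from each other, while "random" samples them at random. random_state fixes the random seed, and n_init is how many times the whole algorithm is run with different initializations, keeping the run with the lowest inertia. The attribute n_iter_ then tells us how many iterations the chosen run needed: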
X
y

plus = KMeans(n_clusters=10).fit(X)  # default init="k-means++"
plus.n_iter_

random = KMeans(n_clusters=10, init="random", random_state=420).fit(X)
random.n_iter_
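To see why n_init matters, here is a minimal sketch (assuming the X created above): with n_init=1 a single random initialization decides the outcome, while n_init=10 keeps the best of ten runs, judged by inertia:

from sklearn.cluster import KMeans

# n_init=1: one random initialization decides the result;
# n_init=10: k-means is run 10 times and the lowest-inertia run is kept
one = KMeans(n_clusters=10, init="random", n_init=1, random_state=420).fit(X)
ten = KMeans(n_clusters=10, init="random", n_init=10, random_state=420).fit(X)
one.inertia_, ten.inertia_  # the ten-run model's inertia is usually lower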
3 Important parameters max_iter & tol: making the iterations stop
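Roughly speaking, max_iter caps how many iterations a single run may take, and tol sets the convergence threshold: once the change between two consecutive iterations falls below tol, the run is declared converged and stops early. Lowering max_iter forces the model to stop sooner; we can check what that does to the silhouette score: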
random = KMeans(n_clusters=10, init="random", max_iter=10, random_state=420).fit(X)
y_pred_max10 = random.labels_
silhouette_score(X, y_pred_max10)

random = KMeans(n_clusters=10, init="random", max_iter=20, random_state=420).fit(X)
y_pred_max20 = random.labels_
silhouette_score(X, y_pred_max20)
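A quick way to tell whether a run was cut off by max_iter rather than converging within tol is to check n_iter_ on the fitted model above:

random.n_iter_  # if this equals max_iter, the run hit the cap before converging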
4 Important attributes and interfaces
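The attributes and interfaces of KMeans have all appeared above; as a recap, here is a sketch using the 3-cluster model cluster fitted in section 1.1:

cluster.cluster_centers_  # centroid coordinates, shape (n_clusters, n_features)
cluster.labels_           # cluster label of every training sample
cluster.inertia_          # total within-cluster sum of squared distances
cluster.n_iter_           # number of iterations the best run needed

cluster.predict(X)        # assign samples to the nearest learned centroid
cluster.fit_predict(X)    # fit and return labels in one step
cluster.transform(X)      # distances to every centroid, shape (n_samples, n_clusters)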
5 The function cluster.k_means
sklearn.cluster.k_means(X, n_clusters, sample_weight=None, init='k-means++', precompute_distances='auto', n_init=10, max_iter=300, verbose=False, tol=0.0001, random_state=None, copy_x=True, n_jobs=None, algorithm='auto', return_n_iter=False)
The function k_means is used very much like the class, except that everything is passed in at once and the results are returned directly. In a single call, k_means returns, in order: the centroids, the cluster label of each sample, the inertia, and, when return_n_iter=True, the best number of iterations.
from sklearn.cluster import k_means

k_means(X, 4, return_n_iter=True)
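Since the results come back as a tuple, they can be unpacked directly, for example:

centroid, label, inertia, best_n_iter = k_means(X, 4, return_n_iter=True)

With return_n_iter left at its default of False, only the first three values are returned.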