import numpy as np import pandas as pd import matplotlib from matplotlib import pyplot as plt %matplotlib inline matplotlib.rcParams['font.sans-serif'] = ['SimHei'] data = pd.read_csv('./010-data_multivar.csv',header=None) #拆分数据 dataset_X,dataset_y = data.iloc[:,:-1],data.iloc[:,-1] # print(dataset_X.head()) dataset_X = dataset_X.values dataset_y = dataset_y.values
无标签数据集可视化:将第一列 feature 作为 X,第二列 feature 作为 Y
def visual_2D_dataset_dist(dataset):
    """Draw a scatter plot of a two-feature dataset.

    Column 0 of ``dataset`` is plotted on the x axis and column 1 on the
    y axis. Each axis range is padded by one unit beyond the data extremes.

    Args:
        dataset: 2-D array indexable as ``dataset[:, i]`` with exactly two
            columns.

    Raises:
        AssertionError: if ``dataset.shape[1]`` is not 2.
    """
    assert dataset.shape[1]==2,'only support dataset with 2 features'

    plt.figure()
    feature_0, feature_1 = dataset[:, 0], dataset[:, 1]
    plt.scatter(feature_0, feature_1, c='g', marker='v', label='dataset')

    plt.title('dataset distribution')
    plt.xlabel('feature_0')
    plt.ylabel('feature_1')

    # Leave a one-unit margin around the data on both axes.
    plt.xlim(np.min(feature_0) - 1, np.max(feature_0) + 1)
    plt.ylim(np.min(feature_1) - 1, np.max(feature_1) + 1)
    plt.legend()


visual_2D_dataset_dist(dataset_X)
构造 kmeans
from sklearn.cluster import KMeans

# Build a K-Means estimator with 4 clusters, k-means++ initialisation and
# n_init=5, then fit it on the feature matrix.
kmeans = KMeans(
    n_clusters=4,
    init='k-means++',
    n_init=5,
)
kmeans.fit(dataset_X)

将dataset_X聚类效果可视化
def visual_kmeans_effect(k_means, dataset, mesh_step=0.01):
    """Plot the cluster regions, data points and centroids of a fitted model.

    A regular grid covering the data range (padded by one unit on each side)
    is classified with ``k_means.predict``; the predicted labels are drawn as
    a coloured background image, the dataset is scattered on top of it, and
    the entries of ``k_means.cluster_centers_`` are drawn as large markers.

    Args:
        k_means: fitted clustering model exposing ``predict`` and
            ``cluster_centers_``.
        dataset: 2-D array indexable as ``dataset[:, i]`` with exactly two
            columns.
        mesh_step: spacing of the background grid along both axes. The number
            of grid points grows quadratically as this shrinks, so use a
            larger value for datasets that span a wide range.

    Raises:
        AssertionError: if ``dataset.shape[1]`` is not 2.
        ValueError: if ``mesh_step`` is not positive.
    """
    assert dataset.shape[1]==2,'only support dataset with 2 features'
    if mesh_step <= 0:
        raise ValueError('mesh_step must be positive')

    X = dataset[:, 0]
    Y = dataset[:, 1]
    X_min, X_max = np.min(X) - 1, np.max(X) + 1
    Y_min, Y_max = np.min(Y) - 1, np.max(Y) + 1

    # meshgrid builds the coordinate matrices of the grid points.
    X_values, Y_values = np.meshgrid(np.arange(X_min, X_max, mesh_step),
                                     np.arange(Y_min, Y_max, mesh_step))

    # Predict a cluster label for every grid point, then restore the grid
    # shape so the labels can be shown as an image.
    predict_labels = k_means.predict(np.c_[X_values.ravel(), Y_values.ravel()])
    predict_labels = predict_labels.reshape(X_values.shape)

    plt.figure()
    plt.imshow(predict_labels, interpolation='nearest',
               extent=(X_values.min(), X_values.max(),
                       Y_values.min(), Y_values.max()),
               cmap=plt.cm.Paired,
               aspect='auto',
               origin='lower')

    # Draw the dataset on top of the label image.
    plt.scatter(X, Y, marker='v', facecolors='none', edgecolors='k', s=30)

    # Draw the cluster centres; zorder keeps them above the data points.
    centroids = k_means.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='o',
                s=100, linewidths=2, color='k', zorder=5, facecolors='b')

    plt.title('K-Means effect graph')
    plt.xlim(X_min, X_max)
    plt.ylim(Y_min, Y_max)
    plt.xlabel('feature_0')
    plt.ylabel('feature_1')
    plt.show()


visual_kmeans_effect(kmeans, dataset_X)

# 鸢尾花聚类
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans

# Load the iris data and keep only its `.data` attribute for clustering.
datairis = load_iris()
dataset = datairis.data

# Fit K-Means on the iris features; fit() returns the estimator itself.
# NOTE(review): n_clusters=4 matches the earlier cell -- confirm it is the
# intended cluster count for this dataset.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=5).fit(dataset)
print(kmeans.labels_)

# Remove labels
# NOTE(review): this prints the whole object returned by load_iris(), not a
# label-free copy -- confirm intent.
print(datairis)