數據形式如下:
前期數據整合:
"""Cluster grid cells into two groups with k-means and compare to reference labels.

Reads per-cell DEM, NDVI and population statistics from three CSV files,
joins them on ``GRIDCODE``, runs a 2-cluster k-means on the whitened
features, and reports how well the clusters agree with a hand-entered
reference labelling (``Headwater_label``).
"""

import pandas as pd
import scipy
import scipy.cluster.hierarchy as sch
from scipy.cluster.vq import vq, kmeans, whiten
import numpy as np


def count_zero_nonzero(values):
    """Count the zero and non-zero entries of *values*.

    Args:
        values: One-dimensional array-like of numbers.

    Returns:
        A two-element list ``[zeros, non_zeros]`` of Python ints.
    """
    arr = np.asarray(values)
    non_zeros = int(np.count_nonzero(arr))
    return [int(arr.size) - non_zeros, non_zeros]


def align_binary_clusters(centroids, labels, reference):
    """Renumber a 2-cluster result so its ids agree best with *reference*.

    k-means numbers its clusters arbitrarily (the initial centroids are
    picked at random), so a direct comparison against 0/1 reference labels
    yields either the agreement ``p`` or ``1 - p`` depending on the run.
    When fewer than half of the labels match, the two ids are swapped and
    the centroid rows are reversed so that ``centroids[i]`` still belongs
    to cluster ``i``.

    Args:
        centroids: Array-like of shape (k, n_features) from ``kmeans``.
        labels: Array-like of integer cluster ids (0 or 1) from ``vq``.
        reference: Array-like of 0/1 reference labels, same length as
            *labels*.

    Returns:
        Tuple ``(centroids, labels)`` as NumPy arrays, possibly swapped.
        Returned unchanged when there are not exactly two centroids.
    """
    centroids = np.asarray(centroids)
    labels = np.asarray(labels)
    reference = np.asarray(reference)
    if len(centroids) != 2:
        # Nothing meaningful to swap (e.g. k-means dropped an empty cluster).
        return centroids, labels
    matches = np.count_nonzero(labels == reference)
    if 2 * matches < labels.size:
        return centroids[::-1], 1 - labels
    return centroids, labels


# Everything below stays at module level (an ``if`` does not open a new
# scope), so df, data, centroid, label, ... remain globals when this file is
# run as a script or pasted into a notebook.
if __name__ == "__main__":
    # Imported here rather than at the top so the helpers above remain
    # importable on machines without a plotting stack; ``plt`` is still a
    # module-level name when the script runs.
    import matplotlib.pylab as plt

    # ---- Data preparation -------------------------------------------------
    df1 = pd.read_csv(r"D:\01RiverPro\01DATA\01Headwater\CSV\dem.csv")
    df2 = pd.read_csv(r"D:\01RiverPro\01DATA\01Headwater\CSV\ndvi_mean.csv")
    df3 = pd.read_csv(r"D:\01RiverPro\01DATA\01Headwater\CSV\pop_mean.csv")

    # Inner joins: keep only the grid cells present in all three tables.
    result = pd.merge(df1, df2, how='inner', on=['GRIDCODE'])
    result = pd.merge(result, df3, how='inner', on=['GRIDCODE'])

    # .copy() makes df an independent frame, so adding a column below does
    # not write into a slice of ``result`` (SettingWithCopyWarning).
    df = result[['GRIDCODE', 'dem_mean', 'ndvi_mean', 'pop_mean']].copy()

    # Reference classification produced by another method, one entry per
    # merged row (presumably 1 = headwater, 0 = not - confirm with the data
    # owner). pandas raises ValueError if the length does not match df.
    ishw = [
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ]
    df['Headwater_label'] = ishw

    # Convert to a plain array: columns 1..3 are the three feature columns,
    # the last column is the reference label.
    dataset = df.values
    points = dataset[:, 1:4]
    ishw_label = dataset[:, -1]

    # ---- k-means clustering -----------------------------------------------
    # Rescale each feature to unit variance before clustering.
    data = whiten(points)

    # kmeans(data, k) returns (centroids, distortion); only the centroids are
    # needed. k is fixed at 2 here; if the number of clusters is unknown it
    # could instead be taken from a hierarchical clustering, e.g.
    # kmeans(data, max(cluster)).
    centroid = kmeans(data, 2)[0]

    # vq(data, centroids) returns (labels, distances); keep the labels.
    label = vq(data, centroid)[0]

    # Make the cluster ids comparable with the reference labels.
    centroid, label = align_binary_clusters(centroid, label, ishw_label)

    print(centroid)

    # Size of each of the two clusters.
    num = count_zero_nonzero(label)
    print('num =', num)

    # ---- Agreement with the reference labelling ---------------------------
    print("Final clustering by k-means:\n", label)
    # 0 where the cluster id equals the reference label, non-zero otherwise.
    result = np.subtract(label, ishw_label)
    print("result:\n", result)
    count = count_zero_nonzero(result)
    print(count)
    # Fraction of grid cells on which both classifications agree.
    print(count[0] / sum(count))