KNN Python Implementation
'''
The working mechanism of k-nearest neighbors (kNN) is simple: given a test
sample, find the k training samples closest to it under some distance
measure, then predict from those k samples.
Classification: the most frequent class among the k neighbors is taken as
the class of the test sample.
Regression: the mean of the k neighbors' target values is usually taken as
the prediction.
The three elements of a kNN model: the distance measure, the choice of k,
and the classification/regression decision rule.
'''
import numpy as np

class KNNClassfier(object):

    def __init__(self, k=5, distance='euc'):
        self.k = k
        self.distance = distance
        self.x = None
        self.y = None

    def fit(self, X, Y):
        '''
        X : array-like [n_samples, n_features]
        Y : array-like [n_samples, 1]
        '''
        self.x = X
        self.y = Y

    def predict(self, X_test):
        '''
        X_test : array-like [n_samples, n_features]
        output : array-like [n_samples, 1]
        '''
        output = np.zeros((X_test.shape[0], 1))
        for i in range(X_test.shape[0]):
            dis = []
            for j in range(self.x.shape[0]):
                if self.distance == 'euc':  # Euclidean distance
                    dis.append(np.linalg.norm(X_test[i] - self.x[j, :]))
            # indices of training samples, sorted by distance to X_test[i]
            index = sorted(range(len(dis)), key=dis.__getitem__)
            labels = []
            for j in range(self.k):
                labels.append(self.y[index[j]])
            # majority vote among the k nearest labels
            counts = []
            for label in labels:
                counts.append(labels.count(label))
            output[i] = labels[np.argmax(counts)]
        return output

    def score(self, x, y):
        pred = self.predict(x)
        err = 0.0
        for i in range(x.shape[0]):
            if pred[i] != y[i]:
                err = err + 1
        return 1 - float(err / x.shape[0])

if __name__ == '__main__':
    from sklearn import datasets
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    # x = np.array([[0.5,0.4],[0.1,0.2],[0.7,0.8],[0.2,0.1],[0.4,0.6],[0.9,0.9],[1,1]]).reshape(-1,2)
    # y = np.array([0,1,0,1,0,1,1]).reshape(-1,1)
    clf = KNNClassfier(k=3)
    clf.fit(x, y)
    print('myknn score:', clf.score(x, y))
    from sklearn.neighbors import KNeighborsClassifier
    clf_sklearn = KNeighborsClassifier(n_neighbors=3)
    clf_sklearn.fit(x, y)
    print('sklearn score:', clf_sklearn.score(x, y))
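One reason the predict loop above is slow is that it computes one distance at a time in pure Python. As a side note, all pairwise distances can be obtained in a single vectorized NumPy expression using the expansion ||a-b||^2 = ||a||^2 - 2*a.b + ||b||^2. The sketch below is my own addition, not part of the original code, and the function name euclidean_distances_vectorized is hypothetical:

    import numpy as np

    def euclidean_distances_vectorized(X_test, X_train):
        # result[i, j] is the Euclidean distance between X_test[i] and X_train[j],
        # computed for all pairs at once via broadcasting
        sq_test = np.sum(X_test ** 2, axis=1)[:, np.newaxis]    # shape (n_test, 1)
        sq_train = np.sum(X_train ** 2, axis=1)[np.newaxis, :]  # shape (1, n_train)
        cross = X_test @ X_train.T                              # shape (n_test, n_train)
        sq = np.maximum(sq_test - 2 * cross + sq_train, 0)      # clamp tiny negative round-off
        return np.sqrt(sq)

Each row i of the returned matrix can then be passed to np.argsort to get the indices of the k nearest training samples, replacing the inner j-loop and the sorted(...) call.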
Handwritten Digit Recognition
from sklearn import datasets
from KNN import KNNClassfier
import matplotlib.pyplot as plt
import numpy as np
import time

digits = datasets.load_digits()
x = digits.data
y = digits.target

myknn_start_time = time.time()
clf = KNNClassfier(k=5)
clf.fit(x, y)
print('myknn score:', clf.score(x, y))
myknn_end_time = time.time()

from sklearn.neighbors import KNeighborsClassifier
sklearnknn_start_time = time.time()
clf_sklearn = KNeighborsClassifier(n_neighbors=5)
clf_sklearn.fit(x, y)
print('sklearn score:', clf_sklearn.score(x, y))
sklearnknn_end_time = time.time()

print('myknn uses time:', myknn_end_time - myknn_start_time)
print('sklearn uses time:', sklearnknn_end_time - sklearnknn_start_time)
As the results show, on a larger dataset my kNN implementation has a very high time cost: every query for the k nearest neighbors scans the entire training set and computes a distance to each point, so the amount of computation is large.
A practical kNN implementation therefore also needs to consider how to find the k nearest neighbors quickly. To reduce the number of distance computations, one can build a kd-tree, which prunes the search so that most points are never examined or computed against. For the construction of kd-trees, see 《統計學習方法》 (Statistical Learning Methods) by Li Hang.
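As an illustration only, here is a minimal sketch of the same 5-nearest-neighbor prediction using scikit-learn's KDTree. This swaps in the library's tree rather than implementing the construction from the book; the majority-vote step mirrors KNNClassfier.predict:

    import numpy as np
    from sklearn import datasets
    from sklearn.neighbors import KDTree

    digits = datasets.load_digits()
    x, y = digits.data, digits.target

    tree = KDTree(x)                # build the kd-tree once over the training set
    dist, ind = tree.query(x, k=5)  # indices of the 5 nearest neighbors per query point
    # majority vote over the neighbor labels (digit targets are 0..9)
    pred = np.array([np.bincount(y[row]).argmax() for row in ind])
    print('kd-tree knn score:', (pred == y).mean())

Note that the digits data is 64-dimensional, where kd-tree pruning is less effective than in low dimensions; sklearn's KNeighborsClassifier picks a suitable search structure automatically via its default algorithm='auto'.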