KNN Implementation in Python

'''
The k-nearest-neighbor (kNN) algorithm works in a fairly simple way: given a query sample, find the k training samples closest to it under some distance metric, then predict from those k samples.
Classification: the query sample is assigned the class that occurs most frequently among the k neighbors.
Regression: the prediction is usually the mean of the k training samples' target values.
The three elements of a kNN model: the distance metric, the choice of k, and the decision rule for classification or regression.
'''
import numpy as np
class KNNClassfier(object):

    def __init__(self, k=5, distance='euc'):
        self.k = k                 # number of neighbors
        self.distance = distance   # distance metric; only 'euc' (Euclidean) is implemented
        self.x = None
        self.y = None
    def fit(self, X, Y):
        '''
        X : array-like [n_samples, n_features]
        Y : array-like [n_samples, 1]
        '''
        self.x = X
        self.y = Y
    def predict(self, X_test):
        '''
        X_test : array-like [n_samples, n_features]
        output : array-like [n_samples, 1]
        '''
        output = np.zeros((X_test.shape[0], 1))
        for i in range(X_test.shape[0]):
            dis = []
            for j in range(self.x.shape[0]):
                if self.distance == 'euc':  # Euclidean distance
                    dis.append(np.linalg.norm(X_test[i] - self.x[j, :]))
                else:
                    raise ValueError('unsupported distance: %s' % self.distance)
            # indices of training samples sorted by ascending distance
            index = np.argsort(dis)
            labels = []
            for j in range(self.k):
                labels.append(self.y[index[j]])
            # majority vote: pick the label that occurs most often among the k neighbors
            counts = []
            for label in labels:
                counts.append(labels.count(label))
            output[i] = labels[np.argmax(counts)]
        return output
    def score(self, x, y):
        # accuracy: fraction of samples whose predicted label matches y
        pred = self.predict(x)
        err = 0.0
        for i in range(x.shape[0]):
            if pred[i] != y[i]:
                err += 1
        return 1 - err / x.shape[0]


if __name__ == '__main__':
    from sklearn import datasets
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    # x = np.array([[0.5,0.4],[0.1,0.2],[0.7,0.8],[0.2,0.1],[0.4,0.6],[0.9,0.9],[1,1]]).reshape(-1,2)
    # y = np.array([0,1,0,1,0,1,1]).reshape(-1,1)
    clf = KNNClassfier(k=3)
    clf.fit(x,y)
    print('myknn score:',clf.score(x,y))
    from sklearn.neighbors import KNeighborsClassifier
    clf_sklearn = KNeighborsClassifier(n_neighbors=3)
    clf_sklearn.fit(x,y)
    print('sklearn score:',clf_sklearn.score(x,y))
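
The module docstring above also mentions the regression variant (averaging the k neighbors' targets), which the class does not implement. Below is a minimal sketch of that variant under the same brute-force search; the class name KNNRegressor is hypothetical and not part of the original code:

import numpy as np

class KNNRegressor(object):
    # hypothetical regression counterpart of KNNClassfier:
    # predicts the mean of the k nearest neighbors' targets instead of voting
    def __init__(self, k=5):
        self.k = k
        self.x = None
        self.y = None
    def fit(self, X, Y):
        self.x = X
        self.y = Y
    def predict(self, X_test):
        output = np.zeros(X_test.shape[0])
        for i in range(X_test.shape[0]):
            # Euclidean distance to every training sample at once
            dis = np.linalg.norm(self.x - X_test[i], axis=1)
            index = np.argsort(dis)[:self.k]
            output[i] = np.mean(self.y[index])  # average, not vote
        return output

Replacing the majority vote with np.mean is the only substantive change between the two decision rules.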

Handwritten Digit Recognition

from sklearn import datasets
from KNN import KNNClassfier  # the class defined above, saved as KNN.py
import time

digits = datasets.load_digits()
x = digits.data
y = digits.target

myknn_start_time = time.time()
clf = KNNClassfier(k=5)
clf.fit(x,y)
print('myknn score:',clf.score(x,y))
myknn_end_time = time.time()

from sklearn.neighbors import KNeighborsClassifier
sklearnknn_start_time = time.time()
clf_sklearn = KNeighborsClassifier(n_neighbors=5)
clf_sklearn.fit(x,y)
print('sklearn score:',clf_sklearn.score(x,y))
sklearnknn_end_time = time.time()

print('myknn uses time:',myknn_end_time-myknn_start_time)
print('sklearn uses time:',sklearnknn_end_time-sklearnknn_start_time)
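
As an aside (this sketch is an addition, not part of the original post), much of the gap above also comes from the pure-Python double loop in predict(); the same brute-force search can be written without Python-level loops, e.g. with scipy.spatial.distance.cdist:

import numpy as np
from scipy.spatial.distance import cdist

def knn_predict_vectorized(x_train, y_train, x_test, k=5):
    # all pairwise Euclidean distances in one call, no Python loops
    dis = cdist(x_test, x_train)           # shape (n_test, n_train)
    idx = np.argsort(dis, axis=1)[:, :k]   # k nearest training samples per row
    # majority vote over the neighbors' labels, as in predict() above
    return np.array([np.bincount(y_train[row]).argmax() for row in idx])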

As the timings show, on a larger dataset this hand-written kNN is very expensive: every query scans the entire training set to find the k nearest neighbors, so the amount of computation is large. A practical kNN implementation therefore also has to consider how to find the k nearest neighbors quickly. To reduce the number of distance computations, one can build a kd-tree, which prunes the search and skips the distance computation for most points; for the kd-tree construction, see Statistical Learning Methods (《統計學習方法》) by Li Hang.
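
A minimal sketch of kd-tree-based neighbor search, using sklearn.neighbors.KDTree as a stand-in for the hand-built tree the text refers to (the library call, not the book's construction algorithm):

from sklearn import datasets
from sklearn.neighbors import KDTree
import numpy as np

digits = datasets.load_digits()
x = digits.data
y = digits.target

tree = KDTree(x)                    # build the tree once
dist, ind = tree.query(x[:5], k=5)  # k nearest neighbors of each query row
# majority vote over the neighbors' labels, as in predict() above
pred = [np.bincount(y[row]).argmax() for row in ind]
print(pred, list(y[:5]))

Note that kd-tree pruning loses effectiveness as dimensionality grows (the digits data has 64 features), which is why libraries also offer ball trees and brute-force backends.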

 

