kNN Algorithm -- Hyperparameters


I. What is the difference, and the relationship, between parameters and hyperparameters?

A hyperparameter is a value we choose before training starts (for example, k in kNN), while a model parameter is a value the algorithm learns from the data during training. kNN is a lazy, non-parametric method with no learned parameters, so tuning it is almost entirely a matter of choosing good hyperparameters.
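As a quick illustration (a minimal sketch; the estimator choices here are just examples, not part of the original tutorial), hyperparameters are passed to the constructor before fit, while learned parameters show up as trailing-underscore attributes after fit:

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameter: chosen by us before training
knn = KNeighborsClassifier(n_neighbors=5)

# Learned parameters: produced by fit() and stored with a trailing underscore
lr = LinearRegression().fit([[0], [1], [2]], [0, 1, 2])
print(lr.coef_, lr.intercept_)  # values learned from the data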

II. Hyperparameters in the kNN algorithm

1. Finding the best value of k

In kNN, k is a hyperparameter. How do we choose the best value of k?

Sample code:

import numpy as np
from sklearn import datasets

# Load the handwritten-digits dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)  # random_state fixes the shuffle so the split is reproducible

from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
knn_clf.score(X_test, y_test)  # accuracy on the test set: 0.9888888888888889


#======= Searching for the best k ===========

best_score = 0.0  # initial value for the best accuracy
best_k = -1       # initial value for the best k
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    score = knn_clf.score(X_test, y_test)
    if score > best_score:
        best_score = score
        best_k = k

print('best_score=%s' % (best_score))  # best_score=0.9916666666666667
print('best_k=%s' % (best_k))          # best_k=4

# Note: if best_k lands on the boundary of the search range (here, 10),
# widen the range and search again, since the true optimum may lie outside it.
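It can also help to visualize how accuracy changes with k. A minimal sketch (assuming matplotlib is available; it is not used anywhere in the original code):

import matplotlib.pyplot as plt

# Re-run the search, this time keeping every score for plotting
scores = []
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    scores.append(knn_clf.score(X_test, y_test))

plt.plot(range(1, 11), scores)
plt.xlabel('k (n_neighbors)')
plt.ylabel('test accuracy')
plt.show()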

2. Weighting neighbors by distance

# Try both distance-weighted and unweighted voting while searching for the best k
best_method = ''
best_score = 0.0
best_k = -1
for method in ['uniform', 'distance']:
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score:
            best_score = score
            best_k = k
            best_method = method
print('best_method=%s' % (best_method))
print('best_k=%s' % (best_k))
print('best_score=%s' % (best_score))


#best_method=uniform
#best_k=4
#best_score=0.9916666666666667

# Comparing the two searches, we can see that for this dataset the unweighted vote ('uniform') with k=4 gives the highest accuracy. The sketch below shows how the two voting schemes differ.
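To see what weights='distance' actually does, here is a hand-computed sketch (the distances and labels are made up for illustration): each neighbor's vote is weighted by the inverse of its distance, so closer neighbors count for more.

import numpy as np
from collections import Counter

# Hypothetical distances and labels of the 3 nearest neighbors
distances = np.array([0.5, 1.0, 2.0])
labels = np.array([0, 1, 1])

# weights='uniform': every neighbor gets one vote
print(Counter(labels).most_common(1))  # label 1 wins 2 votes to 1

# weights='distance': each vote is weighted by 1/distance
votes = {}
for d, label in zip(distances, labels):
    votes[label] = votes.get(label, 0) + 1.0 / d
print(max(votes, key=votes.get), votes)  # label 0 wins: 2.0 vs 1.5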

3. When distance matters, which distance metric should we use?

To choose a metric, search over the Minkowski distance parameter p.

Minkowski distance: d(x, y) = (∑ |x_i - y_i|^p)^(1/p), summing over i = 1, 2, ..., n. With p = 1 this reduces to the Manhattan distance, and with p = 2 to the Euclidean distance (sklearn's default).
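A quick numeric check (a minimal sketch, not part of the original tutorial) that p = 1 gives the Manhattan distance and p = 2 the Euclidean distance:

import numpy as np

def minkowski(x, y, p):
    # (sum of |x_i - y_i|^p) raised to the power 1/p
    return np.sum(np.abs(x - y) ** p) ** (1.0 / p)

x = np.array([0.0, 0.0])
y = np.array([3.0, 4.0])

print(minkowski(x, y, 1))  # 7.0 -> Manhattan distance
print(minkowski(x, y, 2))  # 5.0 -> Euclidean distance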

best_p = -1
best_score = 0.0
best_k = -1
for k in range(1, 11):
    for p in range(1, 5):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights='distance', p=p)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score:
            best_score = score
            best_k = k
            best_p = p

print('best_p=%s' % (best_p))
print('best_k=%s' % (best_k))
print('best_score=%s' % (best_score))


#best_p=2
#best_k=3
#best_score=0.9888888888888889

4. Hyperparameter search in sklearn

1. Grid search

import numpy as np
from sklearn import datasets

digits = datasets.load_digits()
X = digits.data
y = digits.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

#=== Grid Search ===

param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors':[i for i in range(1,11)]
    },
    {
        'weights':['distance'],
        'n_neighbors':[i for i in range(1,11)],
        'p':[i for i in range(1,6)]
    }
]

from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()

from sklearn.model_selection import GridSearchCV  # import the grid search module

grid_search = GridSearchCV(knn_clf, param_grid)  # replaces the two nested for loops above

grid_search.fit(X_train, y_train)  # fitting runs the whole search over the grid

'''
GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
'''


grid_search.best_estimator_  # the estimator refit with the best parameter combination found by the search

'''
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=3,
           weights='distance')
'''

grid_search.best_score_  # the best mean cross-validation accuracy found during the search

'''
0.9853862212943633
'''

grid_search.best_params_  # the winning parameter combination itself

'''
{'n_neighbors': 3, 'p': 3, 'weights': 'distance'}
'''
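If you want to inspect every combination the search tried, not just the winner, cv_results_ holds the scores for all candidates. A short sketch, assuming grid_search has already been fitted as above and that pandas is available (pandas is an assumption; it is only used here for display):

import pandas as pd

results = pd.DataFrame(grid_search.cv_results_)
# Mean cross-validation accuracy for each parameter combination, best first
print(results[['params', 'mean_test_score']]
      .sort_values('mean_test_score', ascending=False)
      .head())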


#===============
# With the search finished, take the best estimator as our classifier

knn_clf = grid_search.best_estimator_

knn_clf.score(X_test, y_test)  # accuracy on the held-out test set: 0.9833333333333333
# Slightly lower than best_score_ above: best_score_ is a cross-validation
# score computed on the training folds, while this is measured on unseen test data.

#%%time  (Jupyter cell magic for measuring how long the cell takes to run)
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2)

# n_jobs: how many CPU cores to use; -1 means all available cores
# verbose: print progress information while fitting; 2 is typical, larger values print more
grid_search.fit(X_train, y_train)

'''
Fitting 3 folds for each of 60 candidates, totalling 180 fits
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.6min
Wall time: 1min 58s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  2.0min finished
'''

 

