knn算法--超参数问题


一.参数和超参数之间的区别以及联系?

参数是模型在训练过程中从数据里学习得到的量;超参数则是在训练开始之前人为设定、用来控制学习过程的量(例如 knn 中的 k 值),需要通过搜索/调参来确定。

二.knn算法的超参数问题

1.寻找到最好的k值

k值在knn中是一个超参数的问题,我们如何选取一个最好的k值呢?

示例代码如下:

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the handwritten-digits dataset that ships with scikit-learn.
digits = datasets.load_digits()
X, y = digits.data, digits.target

# Hold out 20% of the samples for testing; fixing random_state makes the
# shuffle reproducible, so every run sees the same split.
X_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

# Baseline classifier with k = 3 neighbours.
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
knn_clf.score(x_test, y_test)  # accuracy on the held-out set: 0.9888888888888889


# ======= searching for the best k =======

best_score = 0.0  # best accuracy seen so far
best_k = -1       # k value that achieved it
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    accuracy = knn_clf.score(x_test, y_test)
    if accuracy > best_score:
        best_score, best_k = accuracy, k

print('best_score=%s'%(best_score)) #best_score=0.9916666666666667
print('best_k=%s'%(best_k)) #best_k=4

2.考虑距离的权重问题

# Pick the best k while also comparing the two weighting schemes:
# 'uniform' (ignore distances) vs 'distance' (closer neighbours count more).
best_methon = ''
best_score = 0.0
best_k = -1
for weighting in ['uniform', 'distance']:
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=weighting)
        knn_clf.fit(X_train, y_train)
        accuracy = knn_clf.score(x_test, y_test)
        if accuracy > best_score:
            best_score = accuracy
            best_k = k
            best_methon = weighting
print('best_methon=%s'%(best_methon))
print('best_k=%s'%(best_k))
print('best_score=%s'%(best_score))


#best_methon=uniform
#best_k=4
#best_score=0.9916666666666667

#通过比较上述问题,我们可以知道,该数据集适合不考虑距离,k值为4的情况下的精确度最高

3.当需要考虑到距离问题的时候,选择哪种距离公式

考虑搜索闵可夫斯基距离(Minkowski distance)中的指数 p

闵可夫斯基距离公式: $\left(\sum_{i=1}^{n} |x_i - y_i|^p\right)^{1/p}$ (p=1 为曼哈顿距离,p=2 为欧氏距离)

best_p = -1       # best Minkowski exponent found so far
best_score = 0.0  # best accuracy found so far
best_k = -1       # k that produced it
# Jointly search k and the Minkowski exponent p. Distance weighting is
# required here — otherwise the metric exponent would not matter.
for k in range(1, 11):
    for p in range(1, 5):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights='distance', p=p)
        knn_clf.fit(X_train, y_train)
        accuracy = knn_clf.score(x_test, y_test)
        if accuracy > best_score:
            best_score = accuracy
            best_k = k
            best_p = p

print('best_p=%s'%(best_p))
print('best_k=%s'%(best_k))
print('best_score=%s'%(best_score))


#best_p=2
#best_k=3
#best_score=0.9888888888888889

 4.使用sklearn进行超参数设置问题

1.网格搜索

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# FIX: this snippet used KNeighborsClassifier below without importing it,
# so it would raise NameError when run on its own.
from sklearn.neighbors import KNeighborsClassifier

digits=datasets.load_digits()
X=digits.data
y=digits.target

X_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=666)

# === Grid Search ===
# Each dict in param_grid describes one family of candidate combinations;
# GridSearchCV tries the cartesian product within each dict.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors':[i for i in range(1,11)]
    },
    {
        'weights':['distance'],
        'n_neighbors':[i for i in range(1,11)],
        'p':[i for i in range(1,6)]
    }
]

knn_clf=KNeighborsClassifier()

# GridSearchCV replaces the hand-written nested for-loops above, and it
# additionally evaluates each combination with cross-validation on the
# training set instead of the single held-out test set.
grid_search=GridSearchCV(knn_clf,param_grid)

grid_search.fit(X_train,y_train)  # fit one model per candidate combination

'''
GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
'''


grid_search.best_estimator_  # the estimator refit with the best parameter combination found by the search

'''
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=3,
           weights='distance')
'''

grid_search.best_score_  # best mean cross-validated accuracy over all candidates

'''
0.9853862212943633
'''

grid_search.best_params_  # the winning hyper-parameter combination

'''
{'n_neighbors': 3, 'p': 3, 'weights': 'distance'}
'''


#===============
# With the search finished, reuse the refit best estimator as our classifier.

knn_clf=grid_search.best_estimator_

knn_clf.score(x_test,y_test)  # accuracy on the held-out test set: 0.9833333333333333

# %%time  -- Jupyter cell magic for measuring how long the cell takes to run.
grid_search=GridSearchCV(knn_clf,param_grid,n_jobs=-1,verbose=2)

# FIX: the two explanatory lines below were partly missing the leading '#',
# and the captured output string was fused onto the fit() line — both were
# syntax errors in the original snippet.
# n_jobs: how many CPU cores to use for the search; -1 means all cores.
# verbose: print progress information while fitting; 2 is typical, a larger
# value prints more detail.
grid_search.fit(X_train,y_train)
'''
Fitting 3 folds for each of 60 candidates, totalling 180 fits
[Parallel(n_jobs=-1)]: Done  33 tasks | elapsed: 28.3s
[Parallel(n_jobs=-1)]: Done 154 tasks | elapsed: 1.6min
Wall time: 1min 58s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 2.0min finished
'''

 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM