一.参数和超参数之间的区别以及联系?
===点击这里===
二.knn算法的超参数问题
1.寻找到最好的k值
k值是knn算法中的一个超参数,我们如何选取一个最好的k值呢?
示例代码如下:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the handwritten-digit recognition dataset.
digits = datasets.load_digits()
X, y = digits.data, digits.target

# Hold out 20% of the samples for evaluation. Fixing random_state makes the
# random shuffle (and therefore the split) identical on every run.
X_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

# Baseline classifier with k = 3.
knn_clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
knn_clf.score(x_test, y_test)
# Accuracy recorded by the author: 0.9888888888888889

# ======= Search for the best k =======
# Start from a score that any real result beats and a sentinel k of -1.
best_score, best_k = 0.0, -1
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    score = knn_clf.score(x_test, y_test)
    # Strict comparison: on ties the smaller k found first is kept.
    if score > best_score:
        best_score, best_k = score, k

print('best_score=%s' % best_score)
# best_score=0.9916666666666667
print('best_k=%s' % best_k)
# best_k=4
2.考虑距离的权重问题
# Search k jointly with the voting scheme: 'uniform' ignores the neighbours'
# distances, 'distance' weights each neighbour's vote by its distance.
best_methon, best_score, best_k = '', 0.0, -1
for methon in ['uniform', 'distance']:
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=methon).fit(
            X_train, y_train
        )
        score = knn_clf.score(x_test, y_test)
        # Strict comparison: the first combination reaching a score wins ties.
        if score > best_score:
            best_score, best_k, best_methon = score, k, methon

print('best_methon=%s' % best_methon)
print('best_k=%s' % best_k)
print('best_score=%s' % best_score)
# Output recorded by the author:
# best_methon=uniform
# best_k=4
# best_score=0.9916666666666667
# Comparing the results, this dataset scores highest when distance is not
# taken into account and k is 4.
3.当需要考虑到距离问题的时候,选择哪种距离公式
考虑搜索闵可夫斯基(Minkowski)距离中的参数p
闵可夫斯基距离公式:$d(x,y)=\left(\sum_{i=1}^{n}|x_i-y_i|^{p}\right)^{1/p}$,其中p=1时为曼哈顿距离,p=2时为欧几里得距离
# With distance-weighted voting fixed, search k together with the Minkowski
# exponent p.
best_p, best_score, best_k = -1, 0.0, -1
for k in range(1, 11):
    for p in range(1, 5):
        knn_clf = KNeighborsClassifier(
            n_neighbors=k, weights='distance', p=p
        ).fit(X_train, y_train)
        score = knn_clf.score(x_test, y_test)
        # Strict comparison: the first (k, p) pair reaching a score wins ties.
        if score > best_score:
            best_score, best_k, best_p = score, k, p

print('best_p=%s' % best_p)
print('best_k=%s' % best_k)
print('best_score=%s' % best_score)
# Output recorded by the author:
# best_p=2
# best_k=3
# best_score=0.9888888888888889
4.使用sklearn进行超参数设置问题
1.网格搜索
import numpy as np
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, train_test_split
# Imported here so the snippet runs on its own; it uses KNeighborsClassifier
# below and must not rely on an earlier cell having imported it.
from sklearn.neighbors import KNeighborsClassifier

digits = datasets.load_digits()
X = digits.data
y = digits.target

X_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

# ===== Grid Search =====
# Each dict is an independent sub-grid: 'uniform' voting only varies k, while
# 'distance' voting varies both k and the Minkowski exponent p.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]

knn_clf = KNeighborsClassifier()

# GridSearchCV replaces the hand-written nested for-loops over the parameters.
grid_search = GridSearchCV(knn_clf, param_grid)
grid_search.fit(X_train, y_train)  # fit every candidate with cross-validation
# Output recorded by the author (from an older scikit-learn release, as the
# `iid` parameter and the 3-fold default seen further down show):
# GridSearchCV(cv=None, error_score='raise',
#     estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
#         metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5,
#         p=2, weights='uniform'),
#     fit_params=None, iid=True, n_jobs=1,
#     param_grid=[{'weights': ['uniform'],
#                  'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
#                 {'weights': ['distance'],
#                  'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#                  'p': [1, 2, 3, 4, 5]}],
#     pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
#     scoring=None, verbose=0)

grid_search.best_estimator_  # the estimator configured with the best parameters
# KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
#     metric_params=None, n_jobs=1, n_neighbors=3, p=3, weights='distance')

# Mean cross-validated score of the best candidate on the training folds.
# This is not the accuracy on the held-out test set.
grid_search.best_score_
# 0.9853862212943633

grid_search.best_params_  # the winning parameter combination
# {'n_neighbors': 3, 'p': 3, 'weights': 'distance'}

# ===============
# With the best parameters known, take the refitted best estimator as the
# classifier and evaluate it on the held-out test set.
knn_clf = grid_search.best_estimator_
knn_clf.score(x_test, y_test)
# Test accuracy recorded by the author: 0.9833333333333333

# %%time  -- Jupyter cell magic for measuring how long the cell takes to run
# n_jobs is the number of CPU cores to use (-1 means all cores). verbose makes
# the search print progress information while it runs; 2 is a common choice
# and larger values print more.
# A fresh KNeighborsClassifier() is used as the base estimator. Reusing the
# tuned knn_clf would carry its p=3 into the 'uniform' candidates, because
# that sub-grid does not set p.
grid_search = GridSearchCV(
    KNeighborsClassifier(), param_grid, n_jobs=-1, verbose=2
)
grid_search.fit(X_train, y_train)
# Output recorded by the author:
# Fitting 3 folds for each of 60 candidates, totalling 180 fits
# [Parallel(n_jobs=-1)]: Done 33 tasks | elapsed: 28.3s
# [Parallel(n_jobs=-1)]: Done 154 tasks | elapsed: 1.6min
# Wall time: 1min 58s
# [Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 2.0min finished