一.參數和超參數之間的區別以及聯系?
===點擊這里===
二.knn算法的超參數問題
1.尋找到最好的k值
k值在knn中是一個超參數的問題,我們如何選取一個最好的k值呢?
示例代碼如下:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the handwritten-digits dataset.
digits = datasets.load_digits()
X = digits.data
y = digits.target

# random_state pins the shuffle so every run yields the same split.
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

# Baseline classifier with k = 3.
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
knn_clf.score(x_test, y_test)  # accuracy: 0.9888888888888889

# ======= search for the best k =======
best_score = 0.0  # best accuracy seen so far
best_k = -1       # the k that produced it
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    score = knn_clf.score(x_test, y_test)
    if score > best_score:
        best_score, best_k = score, k
print('best_score=%s' % (best_score))  # best_score=0.9916666666666667
print('best_k=%s' % (best_k))          # best_k=4
2.考慮距離的權重問題
# Jointly search the weighting scheme ('uniform' vs 'distance') and k to pick
# the best combination.
# Fix: the original misspelled "method" as "methon" both in the variable name
# and in the printed labels; corrected throughout.
best_method = ''
best_score = 0.0
best_k = -1
for method in ['uniform', 'distance']:
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(x_test, y_test)
        if score > best_score:
            best_score = score
            best_k = k
            best_method = method
print('best_method=%s' % (best_method))
print('best_k=%s' % (best_k))
print('best_score=%s' % (best_score))
# best_method=uniform
# best_k=4
# best_score=0.9916666666666667
# Conclusion: for this dataset, accuracy is highest when distance is ignored
# (uniform weights) and k = 4.
3.當需要考慮到距離問題的時候,選擇哪種距離公式
當使用 weights='distance' 時,還需要搜索明科夫斯基距離中的超參數 p
明科夫斯基距離公式:$D(x, y) = \left( \sum_{i=1}^{n} |x_i - y_i|^p \right)^{1/p}$(p=1 為曼哈頓距離,p=2 為歐氏距離)
# Search the Minkowski exponent p (with distance weighting) together with k.
best_p = -1
best_score = 0.0
best_k = -1
for k in range(1, 11):       # candidate neighbor counts
    for p in range(1, 5):    # candidate Minkowski exponents
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights='distance', p=p)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(x_test, y_test)
        if score <= best_score:
            continue  # not an improvement; keep the current best
        best_score = score
        best_k = k
        best_p = p
print('best_p=%s' % (best_p))
print('best_k=%s' % (best_k))
print('best_score=%s' % (best_score))
# best_p=2
# best_k=3
# best_score=0.9888888888888889
4.使用sklearn進行超參數設置問題
1.網格搜索
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Load the digits dataset and make a reproducible train/test split.
digits = datasets.load_digits()
X = digits.data
y = digits.target
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

# ===== Grid Search =====
# Two sub-grids: 'uniform' weights search k only; 'distance' weights also
# search the Minkowski exponent p. This replaces the hand-written nested
# for-loops used earlier.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)],
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)],
    },
]

knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid)  # stands in for the two for-loops
grid_search.fit(X_train, y_train)                # fit on the training set
'''
GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
           metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5,
           p=2, weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'weights': ['uniform'],
                    'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                   {'weights': ['distance'],
                    'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                    'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
'''
grid_search.best_estimator_  # the best parameter combination, as a fitted classifier
'''
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=3,
           weights='distance')
'''
grid_search.best_score_  # cross-validated accuracy of that combination
'''
0.9853862212943633
'''
grid_search.best_params_  # the best hyper-parameter values
'''
{'n_neighbors': 3, 'p': 3, 'weights': 'distance'}
'''
# ===============
# With those parameters known, rebuild the classifier from the best estimator
# and evaluate it on the held-out test set.
knn_clf = grid_search.best_estimator_
knn_clf.score(x_test, y_test)  # accuracy: 0.9833333333333333

# %%time  (Jupyter cell magic for measuring run time)
# n_jobs: how many CPU cores to use; -1 means all cores.
# verbose: print progress while fitting; 2 is typical, larger prints more.
# Fix: in the original notes this explanation was a bare prose line with no
# leading '#', which made the snippet a SyntaxError when run.
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
'''
Fitting 3 folds for each of 60 candidates, totalling 180 fits
[Parallel(n_jobs=-1)]: Done  33 tasks | elapsed: 28.3s
[Parallel(n_jobs=-1)]: Done 154 tasks | elapsed: 1.6min
Wall time: 1min 58s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 2.0min finished
'''