1 Boosting ensembles: the key parameter n_estimators
1. Import the required libraries, modules, and data
from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime

data = load_boston()  # the Boston housing dataset is very simple, yet it raises many of the questions we care about
X = data.data
y = data.target
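A compatibility note: load_boston was removed from scikit-learn in version 1.2. If you are on a newer release, the same X and y can be rebuilt from the original CMU source that sklearn's deprecation notice points to; a minimal sketch following that notice:

import numpy as np
import pandas as pd

# load_boston is gone in scikit-learn >= 1.2; rebuild the arrays from the raw source instead
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])  # the 13 feature columns
y = raw_df.values[1::2, 2]                                       # median house value (target)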
2. Build the model and inspect the other interfaces and attributes
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)
reg.predict(Xtest)        # the traditional predict interface
reg.score(Xtest, Ytest)   # can you guess which evaluation metric is returned here?
MSE(Ytest, reg.predict(Xtest))
reg.feature_importances_  # one advantage of tree models: importance scores can be inspected and used for embedded feature selection
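To answer the question above: XGBRegressor follows sklearn's regressor convention, so .score() returns R². A quick sanity check (reusing the reg fitted just above):

from sklearn.metrics import r2_score

# .score() on a regressor is R^2, so these two numbers should match
print(reg.score(Xtest, Ytest))
print(r2_score(Ytest, reg.predict(Xtest)))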
3. Cross-validation, with a comparison against linear regression & random forest regression
reg = XGBR(n_estimators=100)
CVS(reg, Xtrain, Ytrain, cv=5).mean()  # which evaluation metric is returned here, do you remember?
# strict vs. loose cross-validation: should CV be run on the training set only, or on the full data?

CVS(reg, Xtrain, Ytrain, cv=5, scoring='neg_mean_squared_error').mean()

# list all of sklearn's model-evaluation metrics
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

# compare against a random forest and a linear regression
rfr = RFR(n_estimators=100)
CVS(rfr, Xtrain, Ytrain, cv=5).mean()
CVS(rfr, Xtrain, Ytrain, cv=5, scoring='neg_mean_squared_error').mean()

lr = LinearR()
CVS(lr, Xtrain, Ytrain, cv=5).mean()
CVS(lr, Xtrain, Ytrain, cv=5, scoring='neg_mean_squared_error').mean()

# set silent=False to print training logs: useful for monitoring progress when the data is huge
# and training is expected to be slow (in xgboost >= 1.0 this parameter was replaced by verbosity)
reg = XGBR(n_estimators=10, silent=False)
CVS(reg, Xtrain, Ytrain, cv=5, scoring='neg_mean_squared_error').mean()
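Two notes on the scoring above: the plain CVS call uses the estimator's default .score, i.e. R² for regressors, and scoring='neg_mean_squared_error' returns the negative MSE (so that greater is always better). To read it as an ordinary MSE, just flip the sign:

# neg_mean_squared_error returns -(MSE); negate it to recover the usual MSE
reg = XGBR(n_estimators=100)
neg_mse = CVS(reg, Xtrain, Ytrain, cv=5, scoring='neg_mean_squared_error').mean()
print(-neg_mse)  # average MSE across the 5 folds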
4. Define a function that plots a learning curve with the number of training samples on the x-axis
def plot_learning_curve(estimator, title, X, y,
                        ax=None,      # the subplot to draw on
                        ylim=None,    # range of the y-axis
                        cv=None,      # cross-validation strategy
                        n_jobs=None   # number of workers to use
                        ):
    from sklearn.model_selection import learning_curve
    import matplotlib.pyplot as plt
    import numpy as np

    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y
                                                            , shuffle=True
                                                            , cv=cv
                                                            # , random_state=420
                                                            , n_jobs=n_jobs)
    if ax is None:
        ax = plt.gca()  # draw on the current axes when no subplot is passed in
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.grid()  # draw a grid; optional
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-'
            , color="r", label="Training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-'
            , color="g", label="Test score")
    ax.legend(loc="best")
    return ax
5. Use the learning curve to see how much potential XGB has on the Boston dataset
cv = KFold(n_splits=5, shuffle=True, random_state=42)
plot_learning_curve(XGBR(n_estimators=100, random_state=420)
                    , "XGB", Xtrain, Ytrain, ax=None, cv=cv)
plt.show()

# run this several times and watch the results change — what causes that?
# given what the curve shows now, how much potential is left? can the score still be pushed up?
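The run-to-run variation comes from shuffle=True inside plot_learning_curve: learning_curve reshuffles the training data each call and no random_state is passed, so every run sees slightly different subsets. Uncommenting the random_state line in the helper fixes this; a minimal sketch of the same call made directly (reusing cv, Xtrain, Ytrain from above):

from sklearn.model_selection import learning_curve

# with the shuffle seed fixed, the curve is identical on every run
train_sizes, train_scores, test_scores = learning_curve(
    XGBR(n_estimators=100, random_state=420), Xtrain, Ytrain,
    shuffle=True, random_state=420, cv=cv)
print(np.mean(test_scores, axis=1))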
6. Use a parameter learning curve to see how n_estimators affects the model
#=====[TIME WARNING: 25 seconds]=====#

axisx = range(10, 1010, 50)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=420)
    rs.append(CVS(reg, Xtrain, Ytrain, cv=cv).mean())
print(axisx[rs.index(max(rs))], max(rs))
plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c="red", label="XGB")
plt.legend()
plt.show()

# the n_estimators selected this way is unusually large — should we really pick the value with the highest accuracy?
7. An improved learning curve: variance and generalization error
#======[TIME WARNING: 20s]=======#

axisx = range(50, 1050, 50)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=420)
    cvresult = CVS(reg, Xtrain, Ytrain, cv=cv)
    # record 1 - bias (the mean R^2 across the folds)
    rs.append(cvresult.mean())
    # record the variance across the folds
    var.append(cvresult.var())
    # compute the controllable part of the generalization error
    ge.append((1 - cvresult.mean())**2 + cvresult.var())
# print the parameter value with the highest R^2, together with its variance
print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
# print the parameter value with the lowest variance, together with its R^2
print(axisx[var.index(min(var))], rs[var.index(min(var))], min(var))
# print the parameter value that minimises the controllable part of the generalization error,
# together with its R^2, variance, and that controllable part
print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))], min(ge))
plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c="red", label="XGB")
plt.legend()
plt.show()
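The quantity stored in ge is the "controllable" part of the usual decomposition of the generalization error. With the cross-validated R² standing in for accuracy, 1 − R² plays the role of the bias, the spread of R² across folds plays the role of the variance, and the irreducible noise is dropped because nothing we tune can change it:

$$
E(f;D) = \mathrm{bias}^2 + \mathrm{var} + \varepsilon^2
\qquad\Longrightarrow\qquad
\text{controllable part} \approx \bigl(1-\overline{R^2}\bigr)^2 + \operatorname{Var}\bigl(R^2_{\text{folds}}\bigr)
$$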
8. Refine the learning curve to find the best n_estimators
axisx = range(100, 300, 10)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=420)
    cvresult = CVS(reg, Xtrain, Ytrain, cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2 + cvresult.var())
print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
print(axisx[var.index(min(var))], rs[var.index(min(var))], min(var))
print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))], min(ge))

rs = np.array(rs)
var = np.array(var) * 0.01  # rescale the variance before drawing it as a band
plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c="black", label="XGB")
# add the variance band around the mean R^2
plt.plot(axisx, rs + var, c="red", linestyle='-.')
plt.plot(axisx, rs - var, c="red", linestyle='-.')
plt.legend()
plt.show()

# and how does the controllable part of the generalization error look?
plt.figure(figsize=(20, 5))
plt.plot(axisx, ge, c="gray", linestyle='-.')
plt.show()
9. Check the model's performance
# has the model actually improved?
time0 = time()
print(XGBR(n_estimators=100, random_state=420).fit(Xtrain, Ytrain).score(Xtest, Ytest))
print(time() - time0)

time0 = time()
print(XGBR(n_estimators=660, random_state=420).fit(Xtrain, Ytrain).score(Xtest, Ytest))
print(time() - time0)

time0 = time()
print(XGBR(n_estimators=180, random_state=420).fit(Xtrain, Ytrain).score(Xtest, Ytest))
print(time() - time0)