Insurance Claims Prediction with XGBoost
import xgboost as xgb
import pandas as pd
import numpy as np
import pickle
import sys
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from scipy.sparse import csr_matrix, hstack
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
# This may raise an exception in earlier versions of Jupyter
%config InlineBackend.figure_format = 'retina'
Data Preprocessing
train = pd.read_csv('train.csv')
train['log_loss'] = np.log(train['loss'])   # model log(loss); predictions are mapped back with np.exp
Split the features into continuous (numerical) and categorical ones.
features = [x for x in train.columns if x not in ['id', 'loss', 'log_loss']]
cat_features = [x for x in train.select_dtypes(include=['object']).columns
                if x not in ['id', 'loss', 'log_loss']]
num_features = [x for x in train.select_dtypes(exclude=['object']).columns
                if x not in ['id', 'loss', 'log_loss']]

print("Categorical features:", len(cat_features))
print("Numerical features:", len(num_features))
Then use a label encoder for the categorical features:
ntrain = train.shape[0]

train_x = train[features].copy()
train_y = train['log_loss']

# Encode each categorical column as integer codes
for c in range(len(cat_features)):
    train_x[cat_features[c]] = train_x[cat_features[c]].astype('category').cat.codes

print("Xtrain:", train_x.shape)
print("ytrain:", train_y.shape)
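Equivalently, a sketch (not the notebook's original approach) using scikit-learn's LabelEncoder produces the same kind of integer codes for the categorical columns:

# Sketch: label encoding with sklearn's LabelEncoder, as an alternative to .cat.codes above
from sklearn.preprocessing import LabelEncoder

train_x_le = train[features].copy()
for c in cat_features:
    train_x_le[c] = LabelEncoder().fit_transform(train_x_le[c])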
A Simple XGBoost Model
First we train a basic xgboost model, then tune its parameters and observe how the cross-validated results change. Because the target was log-transformed, performance is measured with the mean absolute error on the original scale: mean_absolute_error(np.exp(y), np.exp(yhat)).
xgboost defines its own data matrix class, DMatrix, which performs a preprocessing pass when training starts so that every subsequent iteration runs faster.
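A small illustration (using train_x and train_y from above) of building a DMatrix, inspecting it, and caching the preprocessed matrix for reuse:

# Illustration only: build a DMatrix, inspect its shape, and cache it as a binary buffer
dm = xgb.DMatrix(train_x, label=train_y)
print(dm.num_row(), dm.num_col())         # rows and feature columns seen by xgboost
dm.save_binary('dtrain.buffer')           # save the preprocessed matrix ...
dm_cached = xgb.DMatrix('dtrain.buffer')  # ... and reload it without re-parsing the DataFrame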
def xg_eval_mae(yhat, dtrain):
    # Custom evaluation: MAE on the original loss scale (undo the log transform)
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y), np.exp(yhat))
Model
# Convert the data into xgboost's DMatrix format (for efficiency)
dtrain = xgb.DMatrix(train_x, train['log_loss'])
XGBoost Parameters
- 'booster': 'gbtree'
- 'objective': 'multi:softmax', for multi-class classification problems
- 'num_class': 10, the number of classes, used together with multi:softmax
- 'gamma': the minimum loss reduction required to make a further split
- 'max_depth': 12, the maximum tree depth; larger values make overfitting more likely
- 'lambda': 2, the L2 regularization term on the weights; larger values make the model less prone to overfitting
- 'subsample': 0.7, the fraction of training samples randomly drawn for each tree
- 'colsample_bytree': 0.7, the fraction of columns sampled when building each tree
- 'min_child_weight': 3, the minimum sum of instance weights in a child node; if a leaf's sum of instance weights falls below this value, splitting stops
- 'silent': 0, set to 1 to suppress training messages; 0 is usually preferable
- 'eta': 0.007, similar to a learning rate
- 'seed': 1000, the random seed
- 'nthread': 7, the number of CPU threads (a generic usage example follows this list)
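For illustration only (not the regression setup used in this notebook), a parameter dict like the one above would be passed to xgb.train roughly as follows; the dataset and parameter values here are hypothetical. The regression parameters actually used in this notebook come next.

# Hypothetical multi-class example showing how such a parameter dict maps onto xgb.train
rng = np.random.RandomState(0)
X_demo = rng.rand(200, 5)                  # synthetic features
y_demo = rng.randint(0, 10, size=200)      # synthetic labels for 10 classes
dtrain_demo = xgb.DMatrix(X_demo, label=y_demo)

demo_params = {'booster': 'gbtree', 'objective': 'multi:softmax', 'num_class': 10,
               'gamma': 0.1, 'max_depth': 12, 'lambda': 2, 'subsample': 0.7,
               'colsample_bytree': 0.7, 'min_child_weight': 3, 'silent': 0,
               'eta': 0.007, 'seed': 1000, 'nthread': 7}
bst_demo = xgb.train(demo_params, dtrain_demo, num_boost_round=10)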
xgb_params = {
    'seed': 0,
    'eta': 0.1,
    'colsample_bytree': 0.5,
    'silent': 1,
    'subsample': 0.5,
    'objective': 'reg:linear',
    'max_depth': 5,
    'min_child_weight': 3
}
Cross-validation with xgb.cv
%%time
bst_cv1 = xgb.cv(xgb_params, dtrain, num_boost_round=50, nfold=3, seed=0,
                 feval=xg_eval_mae, maximize=False, early_stopping_rounds=10)

print('CV score:', bst_cv1.iloc[-1, :]['test-mae-mean'])
This gives us our first baseline: MAE = 1218.9.
plt.figure()
bst_cv1[['train-mae-mean', 'test-mae-mean']].plot()
Our first baseline model:

- shows no overfitting
- uses only 50 trees
%%time
# Build 100 trees
bst_cv2 = xgb.cv(xgb_params, dtrain, num_boost_round=100, nfold=3, seed=0,
                 feval=xg_eval_mae, maximize=False, early_stopping_rounds=10)

print('CV score:', bst_cv2.iloc[-1, :]['test-mae-mean'])
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(16, 4)

ax1.set_title('100 rounds of training')
ax1.set_xlabel('Rounds')
ax1.set_ylabel('Loss')
ax1.grid(True)
ax1.plot(bst_cv2[['train-mae-mean', 'test-mae-mean']])
ax1.legend(['Training Loss', 'Test Loss'])

ax2.set_title('60 last rounds of training')
ax2.set_xlabel('Rounds')
ax2.set_ylabel('Loss')
ax2.grid(True)
ax2.plot(bst_cv2.iloc[40:][['train-mae-mean', 'test-mae-mean']])
ax2.legend(['Training Loss', 'Test Loss'])
There is a little overfitting, but nothing serious yet.
We get a new best score of MAE = 1171.77, an improvement over the first run (1218.9). Next we tune the remaining parameters.
XGBoost Parameter Tuning
- Step 1: Choose an initial set of parameters.
- Step 2: Tune max_depth and min_child_weight.
- Step 3: Tune gamma to reduce the risk of overfitting.
- Step 4: Tune subsample and colsample_bytree to change the data sampling strategy.
- Step 5: Tune the learning rate eta.
class XGBoostRegressor(object):
    """Thin scikit-learn-style wrapper around xgb.train for use with GridSearchCV."""

    def __init__(self, **kwargs):
        self.params = kwargs
        # Default to 50 boosting rounds if none is given
        self.num_boost_round = self.params.get('num_boost_round', 50)
        self.params.update({'silent': 1, 'objective': 'reg:linear', 'seed': 0})

    def fit(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, y_train)
        self.bst = xgb.train(params=self.params, dtrain=dtrain,
                             num_boost_round=self.num_boost_round,
                             feval=xg_eval_mae, maximize=False)

    def predict(self, x_pred):
        dpred = xgb.DMatrix(x_pred)
        return self.bst.predict(dpred)

    def kfold(self, x_train, y_train, nfold=5):
        dtrain = xgb.DMatrix(x_train, y_train)
        cv_rounds = xgb.cv(params=self.params, dtrain=dtrain,
                           num_boost_round=self.num_boost_round,
                           nfold=nfold, feval=xg_eval_mae, maximize=False,
                           early_stopping_rounds=10)
        return cv_rounds.iloc[-1, :]

    def plot_feature_importances(self):
        feat_imp = pd.Series(self.bst.get_fscore()).sort_values(ascending=False)
        feat_imp.plot(title='Feature Importances')
        plt.ylabel('Feature Importance Score')

    def get_params(self, deep=True):
        return self.params

    def set_params(self, **params):
        self.params.update(params)
        return self
def mae_score(y_true, y_pred):
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))

mae_scorer = make_scorer(mae_score, greater_is_better=False)
bst = XGBoostRegressor(eta=0.1, colsample_bytree=0.5, subsample=0.5,
max_depth=5, min_child_weight=3, num_boost_round=50)
bst.kfold(train_x, train_y, nfold=5)
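As a side note (a sketch, not part of the original notebook), the same kind of grid search can be run directly with xgboost's built-in scikit-learn wrapper XGBRegressor, which was already imported above; the parameter names differ slightly (learning_rate, n_estimators):

# Sketch: the built-in sklearn API as an alternative to the custom wrapper above
alt_model = XGBRegressor(learning_rate=0.1, n_estimators=50,
                         colsample_bytree=0.5, subsample=0.5,
                         max_depth=5, min_child_weight=3, seed=0)
alt_grid = GridSearchCV(alt_model,
                        param_grid={'max_depth': [4, 6, 8]},
                        cv=5, scoring=mae_scorer)
alt_grid.fit(train_x, train_y.values)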
Step 1: Learning rate and number of trees
Step 2: Tree depth and minimum child weight
These parameters have the greatest impact on XGBoost performance, so they should be tuned first. Briefly:
- max_depth: the maximum depth of a tree. Increasing it makes the model more complex and more prone to overfitting; depths of 3-10 are reasonable.
- min_child_weight: a regularization parameter. If the sum of instance weights in a tree partition falls below this value, the tree-building process stops splitting there.
xgb_param_grid = {'max_depth': list(range(4, 9)), 'min_child_weight': list((1, 3, 6))}
xgb_param_grid['max_depth']
%%time
grid = GridSearchCV(XGBoostRegressor(eta=0.1, num_boost_round=50,
                                     colsample_bytree=0.5, subsample=0.5),
                    param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(train_x, train_y.values)
grid.grid_scores_, grid.best_params_, grid.best_score_
The best result found by the grid search:

{'max_depth': 8, 'min_child_weight': 6}, -1187.9597499123447
The score is negative because the scorer was created with greater_is_better=False: scikit-learn always maximizes the scorer, so the MAE is negated and the lowest MAE corresponds to the largest (least negative) value.
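A tiny self-contained illustration of that sign flip; the DummyRegressor and the numbers here are for demonstration only, not part of the original notebook:

# Illustration of the negation done by make_scorer(greater_is_better=False)
from sklearn.dummy import DummyRegressor

demo_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
X_demo = np.zeros((3, 1))
y_demo = np.array([1.0, 2.0, 4.0])
dummy = DummyRegressor(strategy='constant', constant=2.0).fit(X_demo, y_demo)
# raw MAE of always predicting 2.0 is (1 + 0 + 2) / 3 = 1.0, reported by the scorer as -1.0
print(mean_absolute_error(y_demo, dummy.predict(X_demo)), demo_scorer(dummy, X_demo, y_demo))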
def convert_grid_scores(scores):
    _params = []
    _params_mae = []
    for i in scores:
        _params.append(i[0].values())
        _params_mae.append(i[1])
    params = np.array(_params)
    grid_res = np.column_stack((_params, _params_mae))
    return [grid_res[:, i] for i in range(grid_res.shape[1])]
# Two tuned parameters plus the score give three columns to unpack
_, _, scores = convert_grid_scores(grid.grid_scores_)
scores = scores.reshape(5, 3)
plt.figure(figsize=(10, 5))
cp = plt.contourf(xgb_param_grid['min_child_weight'], xgb_param_grid['max_depth'],
                  scores, cmap='BrBG')
plt.colorbar(cp)
plt.title('Depth / min_child_weight optimization')
plt.annotate('We use this', xy=(5.95, 7.95), xytext=(4, 7.5),
             arrowprops=dict(facecolor='white'), color='white')
plt.annotate('Good for depth=7', xy=(5.98, 7.05), xytext=(4, 6.5),
             arrowprops=dict(facecolor='white'), color='white')
plt.xlabel('min_child_weight')
plt.ylabel('max_depth')
plt.grid(True)
plt.show()
The grid-search results show that the score improvement comes mainly from increasing max_depth. min_child_weight has only a small effect, but min_child_weight = 6 works slightly better.
Step 3: Tune gamma to reduce the risk of overfitting
%%time
xgb_param_grid = {'gamma': [0.1 * i for i in range(0, 5)]}

grid = GridSearchCV(XGBoostRegressor(eta=0.1, num_boost_round=50, max_depth=8,
                                     min_child_weight=6, colsample_bytree=0.5,
                                     subsample=0.5),
                    param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(train_x, train_y.values)
grid.grid_scores_, grid.best_params_, grid.best_score_
We choose a relatively small gamma.
Step 4: Tune the sampling strategy: subsample and colsample_bytree
%%time
xgb_param_grid = {'subsample': [0.1 * i for i in range(6, 9)],
                  'colsample_bytree': [0.1 * i for i in range(6, 9)]}

grid = GridSearchCV(XGBoostRegressor(eta=0.1, gamma=0.2, num_boost_round=50,
                                     max_depth=8, min_child_weight=6),
                    param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(train_x, train_y.values)
Wall time: 28min 26s
grid.grid_scores_, grid.best_params_, grid.best_score_
# Two tuned parameters plus the score give three columns to unpack
_, _, scores = convert_grid_scores(grid.grid_scores_)
scores = scores.reshape(3, 3)

plt.figure(figsize=(10, 5))
cp = plt.contourf(xgb_param_grid['subsample'], xgb_param_grid['colsample_bytree'],
                  scores, cmap='BrBG')
plt.colorbar(cp)
plt.title('Subsampling params tuning')
plt.annotate('Optimum', xy=(0.895, 0.6), xytext=(0.8, 0.695),
             arrowprops=dict(facecolor='black'))
plt.xlabel('subsample')
plt.ylabel('colsample_bytree')
plt.grid(True)
plt.show()
For this particular model configuration, I got the following result:

{'colsample_bytree': 0.8, 'subsample': 0.8}, -1182.9309918891634
Step 5: Reduce the learning rate and increase the number of trees
The final step of parameter tuning is to lower the learning rate while adding more estimators (trees).
First, we plot different learning rates for a simpler model (50 trees):
%%time
xgb_param_grid = {'eta': [0.5, 0.4, 0.3, 0.2, 0.1, 0.075, 0.05, 0.04, 0.03]}

grid = GridSearchCV(XGBoostRegressor(num_boost_round=50, gamma=0.2, max_depth=8,
                                     min_child_weight=6, colsample_bytree=0.6,
                                     subsample=0.9),
                    param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(train_x, train_y.values)
grid.grid_scores_, grid.best_params_, grid.best_score_
eta, y = convert_grid_scores(grid.grid_scores_)

plt.figure(figsize=(10, 4))
plt.title('MAE and ETA, 50 trees')
plt.xlabel('eta')
plt.ylabel('score')
plt.plot(eta, -y)
plt.grid(True)
plt.show()
{'eta': 0.2}, -1160.9736284869114 is the best result so far.
Now let's increase the number of trees to 100.
xgb_param_grid = {'eta': [0.5, 0.4, 0.3, 0.2, 0.1, 0.075, 0.05, 0.04, 0.03]}

grid = GridSearchCV(XGBoostRegressor(num_boost_round=100, gamma=0.2, max_depth=8,
                                     min_child_weight=6, colsample_bytree=0.6,
                                     subsample=0.9),
                    param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(train_x, train_y.values)
grid.grid_scores_, grid.best_params_, grid.best_score_
eta, y = convert_grid_scores(grid.grid_scores_)

plt.figure(figsize=(10, 4))
plt.title('MAE and ETA, 100 trees')
plt.xlabel('eta')
plt.ylabel('score')
plt.plot(eta, -y)
plt.grid(True)
plt.show()
A lower learning rate gives better results here.
What if we increase the number of trees further?

%%time
xgb_param_grid = {'eta': [0.09, 0.08, 0.07, 0.06, 0.05, 0.04]}

grid = GridSearchCV(XGBoostRegressor(num_boost_round=200, gamma=0.2, max_depth=8,
                                     min_child_weight=6, colsample_bytree=0.6,
                                     subsample=0.9),
                    param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(train_x, train_y.values)
grid.grid_scores_, grid.best_params_, grid.best_score_
eta, y = convert_grid_scores(grid.grid_scores_)

plt.figure(figsize=(10, 4))
plt.title('MAE and ETA, 200 trees')
plt.xlabel('eta')
plt.ylabel('score')
plt.plot(eta, -y)
plt.grid(True)
plt.show()
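As an aside (a sketch, not part of the original tuning), instead of grid-searching eta at fixed tree counts, one could fix a small eta and let xgb.cv choose the number of boosting rounds via early stopping:

# Sketch: fix a small eta and let early stopping pick the rounds (uses dtrain and xg_eval_mae from above)
low_eta_params = {'eta': 0.05, 'gamma': 0.2, 'max_depth': 8, 'min_child_weight': 6,
                  'colsample_bytree': 0.6, 'subsample': 0.9,
                  'objective': 'reg:linear', 'silent': 1, 'seed': 0}
cv_low_eta = xgb.cv(low_eta_params, dtrain, num_boost_round=1000, nfold=5,
                    feval=xg_eval_mae, maximize=False, early_stopping_rounds=20)
print('rounds kept:', len(cv_low_eta))
print('best test MAE:', cv_low_eta['test-mae-mean'].min())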
%%time
# Final XGBoost model
bst = XGBoostRegressor(num_boost_round=200, eta=0.07, gamma=0.2, max_depth=8,
                       min_child_weight=6, colsample_bytree=0.6, subsample=0.9)
cv = bst.kfold(train_x, train_y, nfold=5)
cv
Summary
We can see that with 200 trees the best eta is 0.07. As expected, the relationship between eta and num_boost_round is not linear, but the two are related.
Optimizing xgboost took quite a while: starting from the initial MAE of 1219.57, tuning brought the cross-validated score down to MAE = 1171.77.
We also explored the relationship between eta and num_boost_round:

- 100 trees, eta=0.1: MAE=1152.247
- 200 trees, eta=0.07: MAE=1145.92
The final model:

XGBoostRegressor(num_boost_round=200, gamma=0.2, max_depth=8, min_child_weight=6,
                 colsample_bytree=0.6, subsample=0.9, eta=0.07)
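To finish, a minimal sketch (not in the original notebook) of training this final model on all of the training data and producing predictions, assuming a test.csv with the same feature columns exists. np.exp undoes the log transform, and the categorical codes are derived from the training categories so they match the encoding used for train_x:

# Sketch only: 'test.csv' and its columns are assumed to mirror train.csv
test = pd.read_csv('test.csv')
test_x = test[features].copy()
for c in cat_features:
    train_categories = train[c].astype('category').cat.categories
    # reuse the training categories so codes match; unseen test categories become -1
    test_x[c] = pd.Categorical(test_x[c], categories=train_categories).codes

final_model = XGBoostRegressor(num_boost_round=200, eta=0.07, gamma=0.2, max_depth=8,
                               min_child_weight=6, colsample_bytree=0.6, subsample=0.9)
final_model.fit(train_x, train_y)
predictions = np.exp(final_model.predict(test_x))   # back to the original loss scale

submission = pd.DataFrame({'id': test['id'], 'loss': predictions})
submission.to_csv('submission.csv', index=False)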
XGBoost is one of the most popular algorithms in data-science competitions such as Kaggle and Tianchi. As this project shows, tuning its parameters is straightforward rather than complicated, and the library is easy to use and highly accurate, which is why it is so widely adopted.