集成學習-Adaboost 參數選擇


先看下adaboost和決策樹效果對比

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator,title,X,y,ylim=None,cv=None,
                        n_jobs=None,train_sizes=np.linspace(.1,1.0,10)):
    """Plot training and cross-validation score curves for *estimator*.

    Evaluates the estimator at each requested training-set size via
    sklearn's ``learning_curve`` and draws mean scores with +/- one
    standard-deviation shaded bands.  Returns the ``plt`` module so the
    caller can further tweak or show the figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    # Run the cross-validated evaluation over the size grid.
    sizes, tr_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    tr_mean = np.mean(tr_scores, axis=1)
    tr_std = np.std(tr_scores, axis=1)
    cv_mean = np.mean(cv_scores, axis=1)
    cv_std = np.std(cv_scores, axis=1)
    plt.grid()

    # Shaded variability bands first, mean curves drawn on top of them.
    plt.fill_between(sizes, tr_mean - tr_std, tr_mean + tr_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, cv_mean - cv_std, cv_mean + cv_std,
                     alpha=0.1, color="g")
    plt.plot(sizes, tr_mean, 'o-', color="r",
             label="Training score")
    plt.plot(sizes, cv_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

from sklearn.datasets import  make_gaussian_quantiles
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
# ##########################
# Gaussian-quantile dataset: 5000 samples, 50 features, split into
# 2 classes by quantiles of a 2-component Gaussian.
X,y = make_gaussian_quantiles(cov=2.0,n_samples=5000,n_features=50,n_classes=2,random_state=1)
# 10 random 80/20 train/test splits used as the learning-curve CV strategy.
cv = ShuffleSplit(n_splits=10,test_size=0.2,random_state=1)
# Compare a single depth-1 CART stump against an AdaBoost ensemble of stumps.
estimatorCart = DecisionTreeClassifier(max_depth=1)
estimatorBoost = AdaBoostClassifier(base_estimator=estimatorCart,n_estimators=270)
# Draw one learning curve per model; zip pairs each estimator with its title
# (replaces the index loop and the dead pre-assignment of `title`).
for estimator, title in zip((estimatorCart, estimatorBoost),
                            ("decision learning curve", "adaBoost learning curve")):
    plot_learning_curve(estimator, title, X, y, cv=cv)
    plt.show()

輸出學習曲線

分析:隨着樣本數的增加,單決策樹的預測精度穩定在0.5左右,是個弱分類器,而adaboost預測精度在0.85左右,明顯高於單決策樹,是個強分類器。

 

參數選擇

上面的模型使用的是默認參數,其實還有優化的空間。

在集成學習中,參數調優一般是先選擇框架的參數,再選擇基學習器的參數

 

框架參數調優

以基學習器個數為例

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_gaussian_quantiles


estimatorCart = DecisionTreeClassifier(max_depth=1)
X,y = make_gaussian_quantiles(cov=2.0,n_samples=5000,n_features=50,n_classes=2,random_state=1)


def _search_n_estimators(grid):
    """Grid-search AdaBoost (depth-1 stump base learner) over *grid*
    with 5-fold ROC-AUC scoring; print and return the fitted search."""
    search = GridSearchCV(estimator=AdaBoostClassifier(estimatorCart),
                          param_grid=grid, scoring="roc_auc", cv=5)
    search.fit(X, y)
    print(search.best_params_, search.best_score_)
    return search


### Round 1: coarse sweep of the ensemble size (framework parameter).
param_test1 = {"n_estimators":range(150,300,50)}
gsearch1 = _search_n_estimators(param_test1)        # ({'n_estimators': 250}, 0.9360103999999999)


### Round 2: refine around the round-1 optimum with a finer step.
n_estimator1 = 250
param_test2 = {"n_estimators":range(n_estimator1-30,n_estimator1+30,10)}
gsearch2 = _search_n_estimators(param_test2)        # ({'n_estimators': 270}, 0.9387719999999999)

 

基學習器參數調優

以max_depth和min_samples_split為例

import numpy as np
from sklearn.model_selection import cross_validate

# Manually grid-search the base learner's max_depth and min_samples_split,
# keeping the combination with the best mean 5-fold CV test score.
n_estimators2 = 270
score = 0
for depth in range(1, 3):            # candidate tree depths
    print(depth)
    for min_split in range(18, 22):  # candidate min_samples_split values
        print(min_split)
        candidate = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=depth, min_samples_split=min_split),
            n_estimators=n_estimators2)
        fold_scores = cross_validate(candidate, X, y,
                                     return_train_score=False, cv=5)["test_score"]
        mean_score = np.mean(fold_scores)
        print(mean_score)
        # `>=` keeps the most recent tie, matching the original selection rule.
        if mean_score >= score:
            score = mean_score
            tree_depth = depth
            samples_split = min_split

 

用最優參數構建模型

from sklearn.model_selection import train_test_split
tree_depth = 1
# BUG FIX: train_test_split returns (X_train, X_test, y_train, y_test);
# the original unpacked it as (X_train, y_train, X_test, y_test), so the
# model would be fit on a 2-D "label" array (the feature test split) and
# scoring used mismatched data.
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Rebuild AdaBoost with the depth found in the base-learner search.
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=tree_depth),
                         n_estimators=n_estimators2)
bdt.fit(X_train,y_train)
print(bdt.score(X_test,y_test))

85.6%,略有提高

 

學習率與基學習器個數的探索

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_gaussian_quantiles
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import zero_one_loss
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


n_estimators = 200
# Gaussian-quantile dataset: 5000 samples, 50 features, 2 classes.
X,y = make_gaussian_quantiles(cov=2.0,n_samples=5000,n_features=50,n_classes=2,random_state=1)
# Hold out 20% of the data as a fixed test set.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)
# Depth-1 stump base learner (the optimum found in the previous section);
# two ensembles that differ only in learning rate.
estimatorCart = DecisionTreeClassifier(max_depth=1)
dt_stump1 = AdaBoostClassifier(base_estimator=estimatorCart,n_estimators=n_estimators,learning_rate=0.8)
dt_stump2 = AdaBoostClassifier(base_estimator=estimatorCart,n_estimators=n_estimators,learning_rate=0.1)
dt_stump1.fit(X_train,y_train)
dt_stump_err1 = 1.0 - dt_stump1.score(X_test,y_test)   # final test error, lr=0.8
#
dt_stump2.fit(X_train,y_train)
dt_stump_err2 = 1.0 - dt_stump2.score(X_test,y_test)   # final test error, lr=0.1

############
# Test error after each boosting stage: staged_predict yields one
# prediction array per added estimator.
# (Removed the unused `test_errors1` list; fixed zero_one_loss argument
# order to the documented (y_true, y_pred) convention — the metric is
# symmetric, so the plotted values are unchanged.)
ada_discrete_err1 = np.zeros((n_estimators,))
ada_discrete_err2 = np.zeros((n_estimators,))
for i,ypred in enumerate(dt_stump1.staged_predict(X_test)):
    ada_discrete_err1[i] = zero_one_loss(y_test,ypred)

for i,ypred in enumerate(dt_stump2.staged_predict(X_test)):
    ada_discrete_err2[i] = zero_one_loss(y_test,ypred)

# Plot error rate against the number of boosting iterations.
fig = plt.figure()
ax = fig.add_subplot(111)

ax.plot(np.arange(n_estimators) + 1, ada_discrete_err1,label='learning rate = 0.8',color='red')
ax.plot(np.arange(n_estimators) + 1, ada_discrete_err2,label='learning rate = 0.1',color='green')
ax.set_ylim((0.0, 1))
ax.set_xlabel('n_estimators')
ax.set_ylabel('error rate')
leg = ax.legend(loc='upper right', fancybox=True)
leg.get_frame().set_alpha(0.7)
plt.show()

輸出

針對當前數據,學習率大,錯誤率低

 

總結

基學習器的復雜度盡量低,可以通過增加學習器個數提高泛化能力,

但是當數據噪聲較大或者基學習器復雜度較高時,增加基學習器個數很難提高泛化能力

這只是大致方向,不絕對。

 

 

參考資料:

https://zhuanlan.zhihu.com/p/57319411


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM