A Worked Example of Tuning XGBoost Parameters


I later noticed that in the parameter settings further down, native-interface and sklearn parameters got mixed together, so here is the corrected version:

 

def run_xgboost(data_x,data_y,random_state_num):
    train_x,valid_x,train_y,valid_y = train_test_split(data_x.values,data_y.values,test_size=0.25,random_state=random_state_num)
    print('Start training the model')
    start = time.time()
    #convert to xgboost's DMatrix format
    d_train = xgb.DMatrix(train_x,train_y)
    d_valid = xgb.DMatrix(valid_x,valid_y)
    watchlist = [(d_train,'train'),(d_valid,'valid')]
    #parameter settings (before tuning)
    params={
        'eta':0.2,                        #learning rate (shrinkage), range 0-1; usually 0.01-0.2 in the final model
        'max_depth':3,                    #tree depth, usually 3-10; too deep overfits, too shallow underfits
        'min_child_weight':1,             #minimum child weight; increasing it helps prevent overfitting
        'gamma':0.4,                      #post-pruning control; larger is more conservative, typically around 0.1-0.2
        'subsample':0.8,                  #row subsampling ratio
        'colsample_bytree':0.8 ,          #column subsampling ratio per tree, range 0-1, default 1
        'lambda':0.8,                     #L2 regularization (native-interface name)
        'alpha':0.6,                      #L1 regularization (native-interface name)
        'n_estimators':500,               #sklearn-style name; the native train() ignores it and uses num_boost_round instead
        'booster':'gbtree',               #tree booster
        'objective':'binary:logistic',    #logistic-regression objective, outputs probabilities
        'nthread':6,                      #maximum number of threads; uses all cores if unset
        'scale_pos_weight':1,             #default 1; raise it to up-weight the positive class on imbalanced data
        'seed':1234,                      #random seed
        'silent':1,                       #0 prints training messages
        'eval_metric':'auc'               #evaluation metric
    }
    bst = xgb.train(params, d_train,1000,watchlist,early_stopping_rounds=100, verbose_eval=5)   #at most 1000 boosting rounds
    print(time.time()-start)
    tree_nums = bst.best_ntree_limit
    print('best number of trees: %s, best iteration: %s, auc: %s' %(bst.best_ntree_limit,bst.best_iteration,bst.best_score))
    bst = xgb.train(params, d_train,tree_nums,watchlist,early_stopping_rounds=100, verbose_eval=10) #retrain for exactly the optimal number of rounds
    
#     feat_imp = pd.Series(clf.booster().get_fscore()).sort_values(ascending=False)
#     #newer versions need the result converted to a dict or list first
#     #feat_imp = pd.Series(dict(clf.get_booster().get_fscore())).sort_values(ascending=False)
#     #plt.bar(feat_imp.index, feat_imp)
#     feat_imp.plot(kind='bar', title='Feature Importances')
    #show the feature-importance ranking
    feat_imp = bst.get_fscore(fmap='xgb.txt')
    feat_imp = sorted(feat_imp.items(),key=operator.itemgetter(1))
    df = pd.DataFrame(feat_imp,columns=['feature','fscore'])
    #times each feature was split on, divided by total splits over all features
    df['fscore'] = df['fscore']/df['fscore'].sum()
    #highest scores first; show the top 40 features
    df = df.sort_values(by='fscore',ascending=False)
    df = df.iloc[:40]
    plt.figure()
    df.plot(kind='bar',x='feature',y='fscore',legend=True,figsize=(32,10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')
    plt.show()
    return bst
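Once trained, scoring new data should respect the early-stopped round count. A minimal sketch (not from the original post), using the bst and valid_x from inside run_xgboost and assuming xgboost >= 1.4, where Booster.predict accepts iteration_range (older versions passed ntree_limit instead):

# hypothetical usage: score held-out rows with only the trees up to the best round
d_test = xgb.DMatrix(valid_x)
preds = bst.predict(d_test, iteration_range=(0, bst.best_iteration + 1))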
XGBoost is in fact a variant of GBDT; this post focuses on the code.

Import the modules

 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import time
import xgboost as xgb
from xgboost import plot_importance  #function for plotting feature importance
#from imblearn.ensemble import EasyEnsemble  #module not installed yet
from sklearn.model_selection import train_test_split
#from sklearn.externals import joblib has been replaced by the direct import below
import joblib
from sklearn.metrics import auc,roc_curve  #classification metrics
plt.rc('font',family='SimHei',size=13)   #so Chinese characters render correctly in plots
%matplotlib inline

EDA: exploratory data analysis

#training data, online data (no label), validation data
train_data = pd.read_csv('F:\\win10 升級桌面數據備份\\3.學習模型\\train_user_model_feat.csv')
print(len(train_data[train_data['label']==1]),len(train_data[train_data['label']==0]))  # 1: 815  0:42688
online_data = pd.read_csv('F:\\win10 升級桌面數據備份\\3.學習模型\\online_user_model_feat.csv')
valid_data = pd.read_csv('F:\\win10 升級桌面數據備份\\3.學習模型\\valid_user_model_feat.csv')
print(len(valid_data[valid_data['label']==1]),len(valid_data[valid_data['label']==0]))  # 1:892   0:39302

Split features and labels

train_y = train_data[['label']]
train_y.columns = ['y']
train_x = train_data.drop(['label','user_id'],axis=1)

valid_y = valid_data[['label']]
valid_y.columns = ['y']
valid_x = valid_data.drop(['label','user_id'],axis=1)
# 
file_xgboost_model='./xgboost_model' #model file
file_xgboost_columns='./columns.csv' #final feature list
file_xgboost_model_auc_ks='./xgboost_model_auc_ks.png' #model AUC and K-S plots
file_xgboost_model_score='./xgboost_model_score.png' #score distribution of the model's predictions
file_xgboost_model_prob='./xgboost_model_prob.png' #probability distribution of the model's predictions

Parameter tuning with grid search

(Note: the iid argument passed to GridSearchCV below was deprecated in scikit-learn 0.22 and removed in 0.24; drop it on newer versions.)

#coding=utf-8
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV  #網格搜索法
import xgboost as xgb
def xgbpa(trainX, trainY):
    # initial classifier
    xgb1 = XGBClassifier(  
        learning_rate=0.3,
        n_estimators=200,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=6
    )

    # Tune max_depth and min_child_weight; these are the grid-search parameters, with tree depth ranging over 3-6
    param1 = {'max_depth': list(range(3, 7)), 'min_child_weight': list(range(1, 5, 2))}

    gsearch1 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=5,
            min_child_weight=1,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch1.fit(trainX, trainY)
    print(gsearch1.scorer_)
    print(gsearch1.best_params_, gsearch1.best_score_)  #best parameters (a dict) and the best score (a single value)
    best_max_depth = gsearch1.best_params_['max_depth'] #the chosen max_depth value
    best_min_child_weight = gsearch1.best_params_['min_child_weight']  #likewise

    # Tune gamma
    param2 = {'gamma': [i / 10.0 for i in range(0, 5, 2)]}
    gsearch2 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,  # step-size shrinkage
            n_estimators=150,  # number of trees
            max_depth=best_max_depth,  # plug in the best value found above
            min_child_weight=best_min_child_weight,  # likewise
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param2, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch2.fit(trainX, trainY)
    print(gsearch2.scorer_)
    print(gsearch2.best_params_, gsearch2.best_score_)
    best_gamma = gsearch2.best_params_['gamma']

    # Tune subsample and colsample_bytree
    param3 = {'subsample': [i / 10.0 for i in range(6, 9)], 'colsample_bytree': [i / 10.0 for i in range(6, 9)]}
    gsearch3 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=best_max_depth,
            min_child_weight=best_min_child_weight,
            gamma=best_gamma,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param3, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch3.fit(trainX, trainY)
    print(gsearch3.scorer_)
    print(gsearch3.best_params_, gsearch3.best_score_)
    best_subsample = gsearch3.best_params_['subsample']
    best_colsample_bytree = gsearch3.best_params_['colsample_bytree']

    # Tune the regularization parameters
    param4 = {'reg_alpha': [i / 10.0 for i in range(2, 10, 2)], 'reg_lambda': [i / 10.0 for i in range(2, 10, 2)]}
    gsearch4 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=best_max_depth,
            min_child_weight=best_min_child_weight,
            gamma=best_gamma,
            subsample=best_subsample,
            colsample_bytree=best_colsample_bytree,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param4, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch4.fit(trainX, trainY)
    print(gsearch4.scorer_)
    print(gsearch4.best_params_, gsearch4.best_score_)
    best_reg_alpha = gsearch4.best_params_['reg_alpha']
    best_reg_lambda = gsearch4.best_params_['reg_lambda']


    param5= {'scale_pos_weight': [i for i in [0.5, 1, 2]]}

    gsearch5 = GridSearchCV(
        estimator = XGBClassifier(
            learning_rate = 0.3,
            n_estimators = 150,
            max_depth = best_max_depth,
            min_child_weight = best_min_child_weight,
            gamma = best_gamma,
            subsample = best_subsample,
            colsample_bytree = best_colsample_bytree,
            reg_alpha = best_reg_alpha,
            reg_lambda = best_reg_lambda,
            objective = 'binary:logistic',
            nthread = 4,
            scale_pos_weight = 1,
            seed = 6
            ),
        param_grid = param5, scoring = 'roc_auc', n_jobs = 4, iid = False, cv = 5)
    gsearch5.fit(trainX, trainY)
    print(gsearch5.best_params_, gsearch5.best_score_)
    best_scale_pos_weight = gsearch5.best_params_['scale_pos_weight']

    # Lower the learning rate and tune the number of trees
    param6 = [{'learning_rate': [0.01, 0.05, 0.1, 0.2], 'n_estimators': [800, 1000, 1200]}]

    gsearch6 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=best_max_depth,
            min_child_weight=best_min_child_weight,
            gamma=best_gamma,
            subsample=best_subsample,
            colsample_bytree=best_colsample_bytree,
            reg_alpha=best_reg_alpha,
            reg_lambda = best_reg_lambda,
            objective = 'binary:logistic',
            nthread = 4,
            scale_pos_weight = best_scale_pos_weight,
            seed = 6
    ),
    param_grid = param6, scoring = 'roc_auc', n_jobs = 4, iid = False, cv = 5)
    gsearch6.fit(trainX, trainY)
    print(gsearch6.scorer_)
    print(gsearch6.best_params_, gsearch6.best_score_)
    best_learning_rate = gsearch6.best_params_['learning_rate']
    best_n_estimators = gsearch6.best_params_['n_estimators']
    print('Best parameter sets:')
    print(gsearch1.best_params_, gsearch1.best_score_)
    print(gsearch2.best_params_, gsearch2.best_score_)
    print(gsearch3.best_params_, gsearch3.best_score_)
    print(gsearch4.best_params_, gsearch4.best_score_)
    print(gsearch5.best_params_, gsearch5.best_score_)
    print(gsearch6.best_params_, gsearch6.best_score_)


if __name__ == '__main__':
    # user_model cv
    # the tuning sample must be consistent with the sample used to train the model
    print('-------------- start tuning ---------------')
    start = time.time()
    data_x,temp_x,data_y,temp_y = train_test_split(train_x,train_y,test_size=0.25,random_state=1234)
    xgbpa(data_x,data_y.y)  # labels must be array-like rather than a DataFrame, hence .y
    print('Tuning took: %s'%(time.time()-start))

This run takes quite a while (>0.5 h), so leave enough time for it.

-------------- start tuning ---------------
make_scorer(roc_auc_score, needs_threshold=True)
{'max_depth': 3, 'min_child_weight': 3} 0.8169763045780181
make_scorer(roc_auc_score, needs_threshold=True)
{'gamma': 0.0} 0.8169763045780181
make_scorer(roc_auc_score, needs_threshold=True)
{'colsample_bytree': 0.8, 'subsample': 0.8} 0.8169763045780181
make_scorer(roc_auc_score, needs_threshold=True)
{'reg_alpha': 0.6, 'reg_lambda': 0.8} 0.8148521719194484
{'scale_pos_weight': 0.5} 0.8155242908735241
make_scorer(roc_auc_score, needs_threshold=True)
{'learning_rate': 0.01, 'n_estimators': 1200} 0.8467294278425243
Best parameter sets:
{'max_depth': 3, 'min_child_weight': 3} 0.8169763045780181
{'gamma': 0.0} 0.8169763045780181
{'colsample_bytree': 0.8, 'subsample': 0.8} 0.8169763045780181
{'reg_alpha': 0.6, 'reg_lambda': 0.8} 0.8148521719194484
{'scale_pos_weight': 0.5} 0.8155242908735241
{'learning_rate': 0.01, 'n_estimators': 1200} 0.8467294278425243
Tuning took: 1126.5513534545898
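Putting the winning values together, the final classifier can be refit roughly as follows (a sketch assembled from the log above, not code from the original post):

final_clf = XGBClassifier(
    learning_rate=0.01, n_estimators=1200,       # from the last search round
    max_depth=3, min_child_weight=3, gamma=0.0,  # from rounds 1 and 2
    subsample=0.8, colsample_bytree=0.8,         # from round 3
    reg_alpha=0.6, reg_lambda=0.8,               # from round 4
    scale_pos_weight=0.5,                        # from round 5
    objective='binary:logistic', nthread=4, seed=6)
final_clf.fit(data_x, data_y.y)                  # same tuning sample as above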

Building the feature-map index file

def create_feature_map(features):
    outfile = open('xgb.txt', 'w')  #create a file named xgb.txt for writing
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))   #tab-separated: index, feature name, feature type; 'q' marks a quantitative (continuous) feature ('i' = indicator, 'int' = integer)
        i = i + 1
    outfile.close()
create_feature_map(train_x.columns)

For more detail on why 'q' is used, see https://blog.csdn.net/ai_XX/article/details/102778684?utm_medium=distribute.pc_aggpage_search_result.none-task-blog-2~all~first_rank_v2~rank_v25-3-102778684.nonecase&utm_term=python%E7%9A%84fmap%E6%96%87%E4%BB%B6
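For reference, the generated xgb.txt is a tab-separated file along these lines (feature names here are invented for illustration):

0	user_act_cnt	q
1	last_click_gap	q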

Training the model with XGBoost

Only part of the tuned values are applied here; a few parameters are not set to their optima, but most have been carried over.

#run XGBoost and output the feature-importance ranking
def run_xgboost(data_x,data_y,random_state_num):
    train_x,valid_x,train_y,valid_y = train_test_split(data_x.values,data_y.values,test_size=0.25,random_state=random_state_num)
    print('Start training the model')
    start = time.time()
    #convert to xgboost's DMatrix format
    d_train = xgb.DMatrix(train_x,train_y)
    d_valid = xgb.DMatrix(valid_x,valid_y)
    watchlist = [(d_train,'train'),(d_valid,'valid')]
    #parameter settings (before tuning)
    #NOTE: the sklearn-style names below (reg_lambda, reg_alpha, learning_rate, n_estimators)
    #are ignored by the native xgb.train interface -- see the corrected version at the top of this post
    params={
        'eta':0.2,                        #learning rate (shrinkage), range 0-1; usually 0.01-0.2 in the final model
        'max_depth':3,                    #tree depth, usually 3-10; too deep overfits, too shallow underfits
        'min_child_weight':1,             #minimum child weight; increasing it helps prevent overfitting
        'gamma':0.4,                      #post-pruning control; larger is more conservative, typically around 0.1-0.2
        'subsample':0.8,                  #row subsampling ratio
        'colsample_bytree':0.8 ,          #column subsampling ratio per tree, range 0-1, default 1
        'reg_lambda':0.8,
        'reg_alpha':0.6,
        'learning_rate':0.1,
        'n_estimators':1000,
        'booster':'gbtree',               #tree booster
        'objective':'binary:logistic',    #logistic-regression objective, outputs probabilities
        'nthread':6,                      #maximum number of threads; uses all cores if unset
        'scale_pos_weight':1,             #default 1; raise it to up-weight the positive class on imbalanced data
        'lambda':1,                       #default 1; L2 regularization term, helps avoid overfitting
        'seed':1234,                      #random seed
        'silent':1,                       #0 prints training messages
        'eval_metric':'auc'               #evaluation metric
    }
    bst = xgb.train(params, d_train,1000,watchlist,early_stopping_rounds=100, verbose_eval=5)   #at most 1000 boosting rounds
    print(time.time()-start)
    tree_nums = bst.best_ntree_limit
    print('best number of trees: %s, best iteration: %s, auc: %s' %(bst.best_ntree_limit,bst.best_iteration,bst.best_score))
    bst = xgb.train(params, d_train,tree_nums,watchlist,early_stopping_rounds=100, verbose_eval=10) #retrain for exactly the optimal number of rounds
    
#     feat_imp = pd.Series(clf.booster().get_fscore()).sort_values(ascending=False)
#     #newer versions need the result converted to a dict or list first
#     #feat_imp = pd.Series(dict(clf.get_booster().get_fscore())).sort_values(ascending=False)
#     #plt.bar(feat_imp.index, feat_imp)
#     feat_imp.plot(kind='bar', title='Feature Importances')
    #show the feature-importance ranking
    feat_imp = bst.get_fscore(fmap='xgb.txt')
    feat_imp = sorted(feat_imp.items(),key=operator.itemgetter(1))
    df = pd.DataFrame(feat_imp,columns=['feature','fscore'])
    #times each feature was split on, divided by total splits over all features
    df['fscore'] = df['fscore']/df['fscore'].sum()
    #highest scores first; show the top 40 features
    df = df.sort_values(by='fscore',ascending=False)
    df = df.iloc[:40]
    plt.figure()
    df.plot(kind='bar',x='feature',y='fscore',legend=True,figsize=(32,10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')
    plt.show()
    return bst 

Function for plotting the ROC curve

# plot the ROC curve
def plot_roc(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)  # ROC inputs
    roc_auc = auc(false_positive_rate, true_positive_rate)  # compute AUC directly
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate,true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r.')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')

# plot the K-S curve: sort predictions in descending order and split into 10 equal groups
def plot_ks(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions, drop_intermediate=False)
    pre = sorted(predictions, reverse=True)  # descending order, as the K-S plot requires
    num = []
    for i in range(10):
        num.append((i) * int(len(pre) / 10))
    num.append(len(pre) - 1)
    df = pd.DataFrame()
    df['false_positive_rate'] = false_positive_rate
    df['true_positive_rate'] = true_positive_rate
    df['thresholds'] = thresholds
    data_ks = []
    for i in num:
        data_ks.append(list(df[df['thresholds'] == pre[i]].values[0]))
    data_ks = pd.DataFrame(data_ks)
    data_ks.columns = ['fpr', 'tpr', 'thresholds']
    ks = max(data_ks['tpr'] - data_ks['fpr'])
    plt.title('K-S curve')
    plt.plot(np.array(range(len(num))), data_ks['tpr'])
    plt.plot(np.array(range(len(num))), data_ks['fpr'])
    plt.plot(np.array(range(len(num))), data_ks['tpr'] - data_ks['fpr'], label='K-S = %0.4f' % ks)
    plt.legend(loc='lower right')
    plt.xlim([0, 10])
    plt.ylim([0.0, 1.0])
    plt.ylabel('cumulative proportion')
    plt.xlabel('group index')
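The decile plot above only approximates the K-S statistic; as a sanity check, the exact value can be read straight off the full ROC arrays, given the same test_x and test_y (a one-liner alternative, not in the original post):

fpr, tpr, _ = roc_curve(test_y, bst.predict(xgb.DMatrix(test_x)), drop_intermediate=False)
ks_exact = np.max(tpr - fpr)   # K-S is the largest gap between TPR and FPR over all thresholds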

# one figure with ROC/AUC and K-S panels for both the training and test sets
def auc_ks(train_x, test_x, train_y, test_y):
    plt.figure(figsize=(15, 15))
    plt.subplot(221)
    plot_roc(train_x, train_y)
    plt.subplot(222)
    plot_roc(test_x, test_y)
    plt.subplot(223)
    plot_ks(train_x, train_y)
    plt.subplot(224)
    plot_ks(test_x, test_y)
    plt.savefig(file_xgboost_model_auc_ks)
    plt.show()

Saving the model, evaluation metrics, and selected variables

#save the model, evaluation metrics, and selected variables
def run_main(data_x,data_y):
    global bst
    start=time.time()
    bst=run_xgboost(data_x,data_y,random_state_num=1234)  #1234 because tuning also used random_state=1234
    joblib.dump(bst, file_xgboost_model)  #save the model; joblib usage: https://www.cnblogs.com/wzdLY/p/9630671.html
    print('Model saved to %s'%(file_xgboost_model))
    train_x, test_x, train_y, test_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=1234)
    auc_ks(train_x, test_x, train_y, test_y)
    print('Model evaluation plots saved to: %s'%(file_xgboost_model_auc_ks))
    print('Total running time: %s'%(time.time()-start))

if __name__=='__main__':
    run_main(train_x, train_y)

The figures above show the AUC and K-S for the training and test sets, plus the feature-importance ranking.

Validating the model on the validation set

# plot the ROC curve on the validation set
def plot_test_roc(test_x, test_y,filename):
    bst = joblib.load(filename)
    predictions = bst.predict(xgb.DMatrix(test_x.values))
    false_positive_rate,true_positive_rate, thresholds = roc_curve(test_y, predictions)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate,true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('Recall')
    plt.xlabel('Fall-out')
    plt.show()

if __name__=='__main__':
    plot_test_roc(valid_x,valid_y,file_xgboost_model)
    

The full code is attached below:

# -*- coding: utf-8 -*-
"""
Created on Wed Mar 10 19:01:07 2021

@author: Administrator
"""



#%%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import time
import xgboost as xgb
from xgboost import plot_importance  #function for plotting feature importance
#from imblearn.ensemble import EasyEnsemble  #module not installed yet
from sklearn.model_selection import train_test_split
#from sklearn.externals import joblib has been replaced by the direct import below
import joblib
from sklearn.metrics import auc,roc_curve  #classification metrics
plt.rc('font',family='SimHei',size=13)   #so Chinese characters render correctly in plots
%matplotlib inline



#%%
#training data, online data (no label), validation data
train_data = pd.read_csv('D:/迅雷下載/3.學習模型/3.學習模型/train_user_model_feat.csv')
print(len(train_data[train_data['label']==1]),len(train_data[train_data['label']==0]))  # 1: 815  0:42688
online_data = pd.read_csv('D:/迅雷下載/3.學習模型/3.學習模型/online_user_model_feat.csv')
valid_data = pd.read_csv('D:/迅雷下載/3.學習模型/3.學習模型/valid_user_model_feat.csv')
print(len(valid_data[valid_data['label']==1]),len(valid_data[valid_data['label']==0]))  # 1:892   0:39302



#%%
train_y = train_data[['label']]
train_y.columns = ['y']
train_x = train_data.drop(['label','user_id'],axis=1)

valid_y = valid_data[['label']]
valid_y.columns = ['y']
valid_x = valid_data.drop(['label','user_id'],axis=1)
# 
file_xgboost_model='./xgboost_model' #model file
file_xgboost_columns='./columns.csv' #final feature list
file_xgboost_model_auc_ks='./xgboost_model_auc_ks.png' #model AUC and K-S plots
file_xgboost_model_score='./xgboost_model_score.png' #score distribution of the model's predictions
file_xgboost_model_prob='./xgboost_model_prob.png' #probability distribution of the model's predictions


#%%
#coding=utf-8
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV  #網格搜索法
import xgboost as xgb
def xgbpa(trainX, trainY):
    # initial classifier
    xgb1 = XGBClassifier(  
        learning_rate=0.3,
        n_estimators=150,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=6
    )

    # Tune max_depth and min_child_weight; these are the grid-search parameters, with tree depth ranging over 3-6
    param1 = {'max_depth': list(range(3, 7)), 'min_child_weight': list(range(1, 5, 2))}

    gsearch1 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=5,
            min_child_weight=1,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch1.fit(trainX, trainY)
    print(gsearch1.scorer_)
    print(gsearch1.best_params_, gsearch1.best_score_)  #best parameters (a dict) and the best score (a single value)
    best_max_depth = gsearch1.best_params_['max_depth'] #the chosen max_depth value
    best_min_child_weight = gsearch1.best_params_['min_child_weight']  #likewise

    # Tune gamma
    param2 = {'gamma': [i / 10.0 for i in range(0, 5, 2)]}
    gsearch2 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,  # step-size shrinkage
            n_estimators=150,  # number of trees
            max_depth=best_max_depth,  # plug in the best value found above
            min_child_weight=best_min_child_weight,  # likewise
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param2, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch2.fit(trainX, trainY)
    print(gsearch2.scorer_)
    print(gsearch2.best_params_, gsearch2.best_score_)
    best_gamma = gsearch2.best_params_['gamma']

    # Tune subsample and colsample_bytree
    param3 = {'subsample': [i / 10.0 for i in range(6, 9)], 'colsample_bytree': [i / 10.0 for i in range(6, 9)]}
    gsearch3 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=best_max_depth,
            min_child_weight=best_min_child_weight,
            gamma=best_gamma,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param3, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch3.fit(trainX, trainY)
    print(gsearch3.scorer_)
    print(gsearch3.best_params_, gsearch3.best_score_)
    best_subsample = gsearch3.best_params_['subsample']
    best_colsample_bytree = gsearch3.best_params_['colsample_bytree']

    # Tune the regularization parameters
    param4 = {'reg_alpha': [i / 10.0 for i in range(2, 10, 2)], 'reg_lambda': [i / 10.0 for i in range(2, 10, 2)]}
    gsearch4 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=best_max_depth,
            min_child_weight=best_min_child_weight,
            gamma=best_gamma,
            subsample=best_subsample,
            colsample_bytree=best_colsample_bytree,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param4, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch4.fit(trainX, trainY)
    print(gsearch4.scorer_)
    print(gsearch4.best_params_, gsearch4.best_score_)
    best_reg_alpha = gsearch4.best_params_['reg_alpha']
    best_reg_lambda = gsearch4.best_params_['reg_lambda']


    param5= {'scale_pos_weight': [i for i in [0.5, 1, 2]]}

    gsearch5 = GridSearchCV(
        estimator = XGBClassifier(
            learning_rate = 0.3,
            n_estimators = 150,
            max_depth = best_max_depth,
            min_child_weight = best_min_child_weight,
            gamma = best_gamma,
            subsample = best_subsample,
            colsample_bytree = best_colsample_bytree,
            reg_alpha = best_reg_alpha,
            reg_lambda = best_reg_lambda,
            objective = 'binary:logistic',
            nthread = 4,
            scale_pos_weight = 1,
            seed = 6
            ),
        param_grid = param5, scoring = 'roc_auc', n_jobs = 4, iid = False, cv = 5)
    gsearch5.fit(trainX, trainY)
    print(gsearch5.best_params_, gsearch5.best_score_)
    best_scale_pos_weight = gsearch5.best_params_['scale_pos_weight']

    # Lower the learning rate and tune the number of trees
    param6 = [{'learning_rate': [0.01, 0.05, 0.1, 0.2], 'n_estimators': [800, 1000, 1200]}]

    gsearch6 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=best_max_depth,
            min_child_weight=best_min_child_weight,
            gamma=best_gamma,
            subsample=best_subsample,
            colsample_bytree=best_colsample_bytree,
            reg_alpha=best_reg_alpha,
            reg_lambda = best_reg_lambda,
            objective = 'binary:logistic',
            nthread = 4,
            scale_pos_weight = best_scale_pos_weight,
            seed = 6
    ),
    param_grid = param6, scoring = 'roc_auc', n_jobs = 4, iid = False, cv = 5)
    gsearch6.fit(trainX, trainY)
    print(gsearch6.scorer_)
    print(gsearch6.best_params_, gsearch6.best_score_)
    best_learning_rate = gsearch6.best_params_['learning_rate']
    best_n_estimators = gsearch6.best_params_['n_estimators']
    print('Best parameter sets:')
    print(gsearch1.best_params_, gsearch1.best_score_)
    print(gsearch2.best_params_, gsearch2.best_score_)
    print(gsearch3.best_params_, gsearch3.best_score_)
    print(gsearch4.best_params_, gsearch4.best_score_)
    print(gsearch5.best_params_, gsearch5.best_score_)
    print(gsearch6.best_params_, gsearch6.best_score_)


if __name__ == '__main__':
    # user_model cv
    # the tuning sample must be consistent with the sample used to train the model
    print('-------------- start tuning ---------------')
    start = time.time()
    data_x,temp_x,data_y,temp_y = train_test_split(train_x,train_y,test_size=0.25,random_state=1234)
    xgbpa(data_x,data_y.y)  # labels must be array-like rather than a DataFrame, hence .y
    print('Tuning took: %s'%(time.time()-start))

#%%
def create_feature_map(features):
    outfile = open('xgb.txt', 'w')  #create a file named xgb.txt for writing
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))   #tab-separated: index, feature name, feature type; 'q' marks a quantitative (continuous) feature ('i' = indicator, 'int' = integer)
        i = i + 1
    outfile.close()
create_feature_map(train_x.columns)

#%%
#run XGBoost and output the feature-importance ranking
def run_xgboost(data_x,data_y,random_state_num):
    train_x,valid_x,train_y,valid_y = train_test_split(data_x.values,data_y.values,test_size=0.25,random_state=random_state_num)
    print('Start training the model')
    start = time.time()
    #convert to xgboost's DMatrix format
    d_train = xgb.DMatrix(train_x,train_y)
    d_valid = xgb.DMatrix(valid_x,valid_y)
    watchlist = [(d_train,'train'),(d_valid,'valid')]
    #parameter settings (before tuning)
    #NOTE: the sklearn-style names below (reg_lambda, reg_alpha, learning_rate, n_estimators)
    #are ignored by the native xgb.train interface -- see the corrected version at the top of this post
    params={
        'eta':0.2,                        #learning rate (shrinkage), range 0-1; usually 0.01-0.2 in the final model
        'max_depth':3,                    #tree depth, usually 3-10; too deep overfits, too shallow underfits
        'min_child_weight':1,             #minimum child weight; increasing it helps prevent overfitting
        'gamma':0.4,                      #post-pruning control; larger is more conservative, typically around 0.1-0.2
        'subsample':0.8,                  #row subsampling ratio
        'colsample_bytree':0.8 ,          #column subsampling ratio per tree, range 0-1, default 1
        'reg_lambda':0.8,
        'reg_alpha':0.6,
        'learning_rate':0.1,
        'n_estimators':1000,
        'booster':'gbtree',               #tree booster
        'objective':'binary:logistic',    #logistic-regression objective, outputs probabilities
        'nthread':6,                      #maximum number of threads; uses all cores if unset
        'scale_pos_weight':1,             #default 1; raise it to up-weight the positive class on imbalanced data
        'lambda':1,                       #default 1; L2 regularization term, helps avoid overfitting
        'seed':1234,                      #random seed
        'silent':1,                       #0 prints training messages
        'eval_metric':'auc'               #evaluation metric
    }
    bst = xgb.train(params, d_train,1000,watchlist,early_stopping_rounds=100, verbose_eval=5)   #at most 1000 boosting rounds
    print(time.time()-start)
    tree_nums = bst.best_ntree_limit
    print('best number of trees: %s, best iteration: %s, auc: %s' %(bst.best_ntree_limit,bst.best_iteration,bst.best_score))
    bst = xgb.train(params, d_train,tree_nums,watchlist,early_stopping_rounds=100, verbose_eval=10) #retrain for exactly the optimal number of rounds
    
#     feat_imp = pd.Series(clf.booster().get_fscore()).sort_values(ascending=False)
#     #newer versions need the result converted to a dict or list first
#     #feat_imp = pd.Series(dict(clf.get_booster().get_fscore())).sort_values(ascending=False)
#     #plt.bar(feat_imp.index, feat_imp)
#     feat_imp.plot(kind='bar', title='Feature Importances')
    #show the feature-importance ranking
    feat_imp = bst.get_fscore(fmap='xgb.txt')
    feat_imp = sorted(feat_imp.items(),key=operator.itemgetter(1))
    df = pd.DataFrame(feat_imp,columns=['feature','fscore'])
    #times each feature was split on, divided by total splits over all features
    df['fscore'] = df['fscore']/df['fscore'].sum()
    #highest scores first; show the top 40 features
    df = df.sort_values(by='fscore',ascending=False)
    df = df.iloc[:40]
    plt.figure()
    df.plot(kind='bar',x='feature',y='fscore',legend=True,figsize=(32,10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')
    plt.show()
    return bst

#%%
# plot the ROC curve
def plot_roc(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)  # ROC inputs
    roc_auc = auc(false_positive_rate, true_positive_rate)  # compute AUC directly
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate,true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r.')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')

# plot the K-S curve: sort predictions in descending order and split into 10 equal groups
def plot_ks(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions, drop_intermediate=False)
    pre = sorted(predictions, reverse=True)  # descending order, as the K-S plot requires
    num = []
    for i in range(10):
        num.append((i) * int(len(pre) / 10))
    num.append(len(pre) - 1)
    df = pd.DataFrame()
    df['false_positive_rate'] = false_positive_rate
    df['true_positive_rate'] = true_positive_rate
    df['thresholds'] = thresholds
    data_ks = []
    for i in num:
        data_ks.append(list(df[df['thresholds'] == pre[i]].values[0]))
    data_ks = pd.DataFrame(data_ks)
    data_ks.columns = ['fpr', 'tpr', 'thresholds']
    ks = max(data_ks['tpr'] - data_ks['fpr'])
    plt.title('K-S curve')
    plt.plot(np.array(range(len(num))), data_ks['tpr'])
    plt.plot(np.array(range(len(num))), data_ks['fpr'])
    plt.plot(np.array(range(len(num))), data_ks['tpr'] - data_ks['fpr'], label='K-S = %0.4f' % ks)
    plt.legend(loc='lower right')
    plt.xlim([0, 10])
    plt.ylim([0.0, 1.0])
    plt.ylabel('cumulative proportion')
    plt.xlabel('group index')

# one figure with ROC/AUC and K-S panels for both the training and test sets
def auc_ks(train_x, test_x, train_y, test_y):
    plt.figure(figsize=(15, 15))
    plt.subplot(221)
    plot_roc(train_x, train_y)
    plt.subplot(222)
    plot_roc(test_x, test_y)
    plt.subplot(223)
    plot_ks(train_x, train_y)
    plt.subplot(224)
    plot_ks(test_x, test_y)
    plt.savefig(file_xgboost_model_auc_ks)
    plt.show()

#%%
#save the model, evaluation metrics, and selected variables
def run_main(data_x,data_y):
    global bst
    start=time.time()
    bst=run_xgboost(data_x,data_y,random_state_num=1234)  #1234 because tuning also used random_state=1234
    joblib.dump(bst, file_xgboost_model)  #save the model; joblib usage: https://www.cnblogs.com/wzdLY/p/9630671.html
    print('Model saved to %s'%(file_xgboost_model))
    train_x, test_x, train_y, test_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=1234)
    auc_ks(train_x, test_x, train_y, test_y)
    print('Model evaluation plots saved to: %s'%(file_xgboost_model_auc_ks))
    print('Total running time: %s'%(time.time()-start))

if __name__=='__main__':
    run_main(train_x, train_y)
    
# plot the ROC curve on the validation set
def plot_test_roc(test_x, test_y,filename):
    bst = joblib.load(filename)
    predictions = bst.predict(xgb.DMatrix(test_x.values))
    false_positive_rate,true_positive_rate, thresholds = roc_curve(test_y, predictions)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate,true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('Recall')
    plt.xlabel('Fall-out')
    plt.show()

if __name__=='__main__':
    plot_test_roc(valid_x,valid_y,file_xgboost_model)

 

A few words on the shortcomings of this tuning run:

1. Tuning used the whole sample, which is not really appropriate.

2. Grid search is far too time-consuming (a cheaper randomized-search sketch follows this list).

3. The tuning order was wrong, so the final result is not good.
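On point 2, a randomized search explores the same space at a fraction of the cost. A minimal sketch reusing the tuning sample data_x, data_y from above; the parameter ranges here are illustrative, not tuned values:

from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

rnd_search = RandomizedSearchCV(
    XGBClassifier(objective='binary:logistic', nthread=4, seed=6),
    param_distributions={
        'max_depth': randint(3, 8),
        'min_child_weight': randint(1, 6),
        'gamma': uniform(0, 0.5),
        'subsample': uniform(0.6, 0.3),          # draws from [0.6, 0.9]
        'colsample_bytree': uniform(0.6, 0.3),
        'learning_rate': uniform(0.01, 0.2),
        'n_estimators': randint(200, 1200),
    },
    n_iter=50, scoring='roc_auc', cv=5, n_jobs=4, random_state=1234)
rnd_search.fit(data_x, data_y.y)                 # same tuning sample as before
print(rnd_search.best_params_, rnd_search.best_score_)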

Afterwards I built another model with logistic regression; the code is below:

# -*- coding: utf-8 -*-
"""
Created on Wed Mar 10 15:58:41 2021

@author: Administrator
"""

#%%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import time
import xgboost as xgb
from xgboost import plot_importance  #function for plotting feature importance
#from imblearn.ensemble import EasyEnsemble  #module not installed yet
from sklearn.model_selection import train_test_split
#from sklearn.externals import joblib has been replaced by the direct import below
import joblib
from sklearn.metrics import auc,roc_curve  #classification metrics
plt.rc('font',family='SimHei',size=13)   #so Chinese characters render correctly in plots
%matplotlib inline



#%%
#training data, online data (no label), validation data
train_data = pd.read_csv('D:/迅雷下載/3.學習模型/3.學習模型/train_user_model_feat.csv')
print(len(train_data[train_data['label']==1]),len(train_data[train_data['label']==0]))  # 1: 815  0:42688
online_data = pd.read_csv('D:/迅雷下載/3.學習模型/3.學習模型/online_user_model_feat.csv')
valid_data = pd.read_csv('D:/迅雷下載/3.學習模型/3.學習模型/valid_user_model_feat.csv')
print(len(valid_data[valid_data['label']==1]),len(valid_data[valid_data['label']==0]))  # 1:892   0:39302


#%%
for i in train_data.columns :
    print(i,train_data[i].nunique())
a=[]
for i in train_data.columns :
    if train_data[i].nunique()<10:
        a.append(i)
# hand-picked subset of the low-cardinality columns found above
a=['rule_uid',
 'user_lv_cd',
 'user_date_cnt_b7day',
 'uc_date_cnt_b7day',
 'uc_act_4',
 'uc_buy_bool_day7']

train_data.pop('user_id')  
train_data['y'] = train_data['label']
train_data.pop('label') 

#%% IV of the categorical variables
import pycard as pc
cate_iv_woedf = pc.WoeDf()
for i in a:
    cate_iv_woedf.append(pc.cross_woe(train_data[i] ,train_data.y))
#cate_iv_woedf.to_excel('tmp1')    

#%% IV of the numeric variables
num_col = [i for i in train_data.columns if i not in a]
num_col.remove('y')
clf = pc.NumBin(min_bin_samples=200, min_impurity_decrease=4e-5)
for i in num_col:
    clf.fit(train_data[i] ,train_data.y)
    #clf.generate_transform_fun()
    cate_iv_woedf.append(clf.woe_df_)
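
#%% (added sketch, not in the original post)
# pycard is a niche scorecard helper; if it is unavailable, the WOE and IV of an
# already-binned column can be computed with plain pandas along these lines
# (function name and smoothing constant are my own):
def woe_iv(bins, y):
    """WOE per bin and total IV for one binned feature against a 0/1 target."""
    tab = pd.crosstab(bins, y)                    # rows: bins; columns: counts of y==0 and y==1
    good = tab[0] / tab[0].sum()                  # share of negatives in each bin
    bad = tab[1] / tab[1].sum()                   # share of positives in each bin
    woe = np.log((bad + 1e-6) / (good + 1e-6))    # small constant guards against log(0)
    iv = ((bad - good) * woe).sum()
    return woe, iv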


#%% correlation analysis
train_data.corr_tri().abs().to_excel('tmp1.xlsx')

def argmax(x):
    """計算 df 的最大值所對應的行、列索引,返回 (row, cols) 元組"""
    m0 = x.max()
    max_value = m0.max()
    col_label = m0.idxmax()
    row_label = x[col_label].idxmax()
    return row_label, col_label


def corr_filter(detail_df, vars_iv, corr_tol=0.9, iv_diff=0.01):
    """For any two columns whose correlation >= corr_tol, with var1 the one with the higher IV:
        if var1_iv - var2_iv > iv_diff, drop the column with the lower IV \n

    Parameters:
    ----------
    detail_df: dataframe, detail data on which correlations are computed \n
    vars_iv: dataframe with two columns, colName and IV, giving each variable's IV;
        usually produced by the woedf.var_ivs method. \n
    corr_tol: float, correlation threshold; a pair above it is considered too correlated, and the IV gap is then checked \n
    iv_diff: float, threshold on the IV gap; the column with the lower IV is dropped automatically

    Returns:
    ----------
    corr_df: dataframe, the correlation matrix with over-correlated columns removed \n
    dropped_col: list, the dropped columns"""
    corr_df = detail_df.corr_tri().abs()
    vars_iv = vars_iv.set_index('colName')
    corr_df = corr_df.fillna(0)
    dropped_col = []
    while True:
        row, col = argmax(corr_df)
        if corr_df.loc[row, col] >= corr_tol:
            drop_label = row if vars_iv.loc[row,'IV'] < vars_iv.loc[col,'IV'] else col
            dropped_col.append(drop_label)
            corr_df = corr_df.drop(drop_label).drop(drop_label, axis=1)
            vars_iv = vars_iv.drop(drop_label)
            if len(corr_df) == 1:
                break
        else:
            break
    return corr_df, dropped_col
t=cate_iv_woedf.var_ivs().iloc[:,0:-1].reset_index()
t.columns = ['colName','IV']
corr_df, dropped_col = corr_filter(train_data[a+num_col],t,corr_tol=0.75, iv_diff=0.00001)

data_drop_corr = train_data.drop(columns = dropped_col)

#%%
cate_iv_woedf = pc.WoeDf()
clf = pc.NumBin(min_bin_samples=200, min_impurity_decrease=4e-5)
for i in data_drop_corr.columns:
    clf.fit(data_drop_corr[i] ,data_drop_corr.y)
    #clf.generate_transform_fun()
    cate_iv_woedf.append(clf.woe_df_)

cate_iv_woedf.to_excel('tmp1')


#%% drop the following columns
drop_2 = ["uc_date_cnt_b7day",
"user_act_totalCnt_15day",
"max_click",
"freq_click",
"uc_act_decay_3",
"uc_act_4",
"uc_act_time_zone_0",
"uc_act_time_zone_1",
"ratio_1_6",
"uc_last_tm_dist",
"user_date_cnt_b15day",
"uc_date_cnt_b15day",
"uc_date_ratio_15",
"uc_act_totalCnt",
"uc_act_ratio_60day",
"uc_act_ratio_15day",
"ratio_act_time_1day",
"user_act_time_5day",
"ratio_act_time_5day",
"mean_uc_act",
"uc_act_time_zone_2",
"uc_act_time_zone_3",
"uc_buy_bool_day7"]

# drop these: V13  V15  V22  V24  V25  V26
num_col = [i for i in data_drop_corr.columns if i not in drop_2]
num_col.remove('y')

num_iv_woedf = pc.WoeDf()
clf = pc.NumBin(min_bin_samples=200, min_impurity_decrease=4e-5)
for i in num_col:
    clf.fit(data_drop_corr[i] ,data_drop_corr.y)
    data_drop_corr[i+'_bin'] = clf.transform(data_drop_corr[i])  #this saves the later step of converting to _bin columns
    num_iv_woedf.append(clf.woe_df_)
    

#%% WOE transformation
bin_col = [i for i in list(data_drop_corr.columns) if i[-4:]=='_bin']

cate_iv_woedf = pc.WoeDf()
for i in bin_col:
    cate_iv_woedf.append(pc.cross_woe(data_drop_corr[i] ,data_drop_corr.y))
#cate_iv_woedf.to_excel('tmp1')
cate_iv_woedf.bin2woe(data_drop_corr,bin_col)
cate_iv_woedf.to_excel('tmp.xlsx')

#%% modeling
model_col = [i for i in list(data_drop_corr.columns) if i[-4:]=='_woe']

import pandas as pd
import matplotlib.pyplot as plt #plotting library
import matplotlib
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

X = data_drop_corr[model_col]
Y = data_drop_corr['y']


x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=100)


X1=sm.add_constant(x_train)   #prepend a constant column of 1s so the regression has an intercept term
logit=sm.Logit(y_train.astype(float),X1.astype(float))
result=logit.fit()
result.summary()
result.params

resu_1 = result.predict(X1.astype(float))
fpr, tpr, threshold = roc_curve(y_train, resu_1)
rocauc = auc(fpr, tpr)  #0.9693313248601317
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('true positive rate')
plt.xlabel('false positive rate')
plt.show()




#%% test set
X3 = sm.add_constant(x_test)
resu = result.predict(X3.astype(float))
fpr, tpr, threshold = roc_curve(y_test, resu)
rocauc = auc(fpr, tpr)
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('true positive rate')
plt.xlabel('false positive rate')
plt.show()

#%% validation set

num_iv_woedf_1 = pc.WoeDf()
clf = pc.NumBin(min_bin_samples=200, min_impurity_decrease=4e-5)
for i in num_col:
    clf.fit(data_drop_corr[i] ,data_drop_corr.y)
    valid_data[i+'_bin'] = pc.binning(valid_data[i],clf.bins_) #this saves the later step of converting to _bin columns
    #num_iv_woedf_1.append(clf.woe_df_)
    
#%% WOE transformation
bin_col_1 = [i for i in list(valid_data.columns) if i[-4:]=='_bin']

cate_iv_woedf.bin2woe(valid_data,bin_col)
model_col_1 = [i for i in list(valid_data.columns) if i[-4:]=='_woe']

valid_data = valid_data.rename(columns={'label':'y'})
X_test = valid_data[model_col_1]
Y_test = valid_data['y']


X4 = sm.add_constant(X_test)
resu = result.predict(X4.astype(float))
fpr, tpr, threshold = roc_curve(Y_test, resu)
rocauc = auc(fpr, tpr)  #0.7931891609482327
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('true positive rate')
plt.xlabel('false positive rate')
plt.show()

Above are the AUCs on the test and validation sets: slightly worse than XGBoost on the validation set, but slightly better on the test set, which again reflects XGBoost's resistance to overfitting.
