I later noticed that the parameter settings further down mixed native-interface and sklearn-wrapper names, so the code has been revised to:
def run_xgboost(data_x, data_y, random_state_num):
    train_x, valid_x, train_y, valid_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=random_state_num)
    print('Start training the model')
    start = time.time()
    # Convert to XGBoost's DMatrix format
    d_train = xgb.DMatrix(train_x, train_y)
    d_valid = xgb.DMatrix(valid_x, valid_y)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # Parameter settings (before tuning), now using native-interface names only
    params = {
        'eta': 0.2,                      # shrinkage, range 0~1; usually ends up at 0.01~0.2
        'max_depth': 3,                  # tree depth, usually 3-10; too large overfits, too small underfits
        'min_child_weight': 1,           # minimum child weight; increasing it helps prevent overfitting
        'gamma': 0.4,                    # post-pruning control; larger is more conservative, typically around 0.1-0.2
        'subsample': 0.8,                # row subsampling ratio
        'colsample_bytree': 0.8,         # column subsampling ratio per tree, range 0~1, default 1
        'lambda': 0.8,                   # L2 regularization (native name for reg_lambda)
        'alpha': 0.6,                    # L1 regularization (native name for reg_alpha)
        'booster': 'gbtree',             # tree booster
        'objective': 'binary:logistic',  # logistic loss, outputs probabilities
        'nthread': 6,                    # maximum number of threads; uses all cores if unset
        'scale_pos_weight': 1,           # default 1; raise it to handle class imbalance
        'seed': 1234,                    # random seed
        'silent': 1,                     # 0 prints running messages
        'eval_metric': 'auc'             # evaluation metric
    }
    bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=100, verbose_eval=5)   # at most 1000 boosting rounds
    print(time.time() - start)
    tree_nums = bst.best_ntree_limit
    print('Best number of trees: %s, best iteration: %s, auc: %s' % (bst.best_ntree_limit, bst.best_iteration, bst.best_score))
    bst = xgb.train(params, d_train, tree_nums, watchlist, early_stopping_rounds=100, verbose_eval=10)   # retrain with the optimal number of rounds
    # feat_imp = pd.Series(clf.booster().get_fscore()).sort_values(ascending=False)
    # # newer versions need a dict or list:
    # # feat_imp = pd.Series(dict(clf.get_booster().get_fscore())).sort_values(ascending=False)
    # # plt.bar(feat_imp.index, feat_imp)
    # feat_imp.plot(kind='bar', title='Feature Importances')
    # Show the feature-importance ranking
    feat_imp = bst.get_fscore(fmap='xgb.txt')
    feat_imp = sorted(feat_imp.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(feat_imp, columns=['feature', 'fscore'])
    # times each feature is used to split / total splits over all features
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    # highest scores first; show the top-40 features
    df = df.sort_values(by='fscore', ascending=False)
    df = df.iloc[:40]
    plt.figure()
    df.plot(kind='bar', x='feature', y='fscore', legend=True, figsize=(32, 10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')
    plt.show()
    return bst
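For quick reference (my own summary, not part of the original post), these are the name pairs that differ between the native interface and the sklearn wrapper; everything else ('max_depth', 'subsample', ...) is shared:

# native xgb.train() param        sklearn XGBClassifier param
# 'eta'                      <->  learning_rate
# 'lambda'                   <->  reg_lambda
# 'alpha'                    <->  reg_alpha
# num_boost_round argument   <->  n_estimators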
XGBoost is essentially a variant of GBDT; this post focuses on the code.
Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import time
import xgboost as xgb
from xgboost import plot_importance   # function for plotting feature importance
# from imblearn.ensemble import EasyEnsemble   # module not installed yet
from sklearn.model_selection import train_test_split
# from sklearn.externals import joblib has been replaced by:
import joblib
from sklearn.metrics import auc, roc_curve   # classification metrics
plt.rc('font', family='SimHei', size=13)   # so Chinese characters display correctly in plots
%matplotlib inline
EDA: exploratory data analysis
# Training data, online data (no label), validation data
train_data = pd.read_csv('F:\\win10 升級桌面數據備份\\3.學習模型\\train_user_model_feat.csv')
print(len(train_data[train_data['label']==1]), len(train_data[train_data['label']==0]))   # 1: 815, 0: 42688
online_data = pd.read_csv('F:\\win10 升級桌面數據備份\\3.學習模型\\online_user_model_feat.csv')
valid_data = pd.read_csv('F:\\win10 升級桌面數據備份\\3.學習模型\\valid_user_model_feat.csv')
print(len(valid_data[valid_data['label']==1]), len(valid_data[valid_data['label']==0]))   # 1: 892, 0: 39302
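A side note of mine (not in the original post): with roughly 815 positives against 42,688 negatives the labels are heavily imbalanced, and the usual starting value for XGBoost's scale_pos_weight is the negative-to-positive ratio:

neg, pos = (train_data['label'] == 0).sum(), (train_data['label'] == 1).sum()
print(neg / pos)   # about 52 here; a candidate starting value for scale_pos_weight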
Split features and labels
train_y = train_data[['label']]
train_y.columns = ['y']
train_x = train_data.drop(['label','user_id'], axis=1)
valid_y = valid_data[['label']]
valid_y.columns = ['y']
valid_x = valid_data.drop(['label','user_id'], axis=1)

file_xgboost_model = './xgboost_model'                      # model file
file_xgboost_columns = './columns.csv'                      # features finally used
file_xgboost_model_auc_ks = './xgboost_model_auc_ks.png'    # model AUC and KS
file_xgboost_model_score = './xgboost_model_score.png'      # distribution of predicted user scores
file_xgboost_model_prob = './xgboost_model_prob.png'        # distribution of predicted user probabilities
Hyperparameter tuning with grid search
#coding=utf-8
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV   # grid search
import xgboost as xgb

def xgbpa(trainX, trainY):
    # initial classifier
    xgb1 = XGBClassifier(
        learning_rate=0.3, n_estimators=200, max_depth=5, min_child_weight=1,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=6)

    # Step 1: tune max_depth and min_child_weight; the depth grid runs from 3 to 6
    param1 = {'max_depth': list(range(3, 7)), 'min_child_weight': list(range(1, 5, 2))}
    gsearch1 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3, n_estimators=150, max_depth=5, min_child_weight=1,
            gamma=0, subsample=0.8, colsample_bytree=0.8,
            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=6),
        param_grid=param1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch1.fit(trainX, trainY)
    print(gsearch1.scorer_)
    print(gsearch1.best_params_, gsearch1.best_score_)   # best params (a dict) and best score (a single value)
    best_max_depth = gsearch1.best_params_['max_depth']
    best_min_child_weight = gsearch1.best_params_['min_child_weight']

    # Step 2: tune gamma, plugging in the best max_depth and min_child_weight from above
    param2 = {'gamma': [i / 10.0 for i in range(0, 5, 2)]}
    gsearch2 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3, n_estimators=150,
            max_depth=best_max_depth, min_child_weight=best_min_child_weight,
            gamma=0, subsample=0.8, colsample_bytree=0.8,
            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=6),
        param_grid=param2, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch2.fit(trainX, trainY)
    print(gsearch2.scorer_)
    print(gsearch2.best_params_, gsearch2.best_score_)
    best_gamma = gsearch2.best_params_['gamma']

    # Step 3: tune subsample and colsample_bytree
    param3 = {'subsample': [i / 10.0 for i in range(6, 9)], 'colsample_bytree': [i / 10.0 for i in range(6, 9)]}
    gsearch3 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3, n_estimators=150,
            max_depth=best_max_depth, min_child_weight=best_min_child_weight,
            gamma=best_gamma, subsample=0.8, colsample_bytree=0.8,
            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=6),
        param_grid=param3, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch3.fit(trainX, trainY)
    print(gsearch3.scorer_)
    print(gsearch3.best_params_, gsearch3.best_score_)
    best_subsample = gsearch3.best_params_['subsample']
    best_colsample_bytree = gsearch3.best_params_['colsample_bytree']

    # Step 4: tune the regularization parameters
    param4 = {'reg_alpha': [i / 10.0 for i in range(2, 10, 2)], 'reg_lambda': [i / 10.0 for i in range(2, 10, 2)]}
    gsearch4 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3, n_estimators=150,
            max_depth=best_max_depth, min_child_weight=best_min_child_weight,
            gamma=best_gamma, subsample=best_subsample, colsample_bytree=best_colsample_bytree,
            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=6),
        param_grid=param4, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch4.fit(trainX, trainY)
    print(gsearch4.scorer_)
    print(gsearch4.best_params_, gsearch4.best_score_)
    best_reg_alpha = gsearch4.best_params_['reg_alpha']
    best_reg_lambda = gsearch4.best_params_['reg_lambda']

    # Step 5: tune scale_pos_weight
    param5 = {'scale_pos_weight': [i for i in [0.5, 1, 2]]}
    gsearch5 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3, n_estimators=150,
            max_depth=best_max_depth, min_child_weight=best_min_child_weight,
            gamma=best_gamma, subsample=best_subsample, colsample_bytree=best_colsample_bytree,
            reg_alpha=best_reg_alpha, reg_lambda=best_reg_lambda,
            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=6),
        param_grid=param5, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch5.fit(trainX, trainY)
    print(gsearch5.best_params_, gsearch5.best_score_)
    best_scale_pos_weight = gsearch5.best_params_['scale_pos_weight']

    # Step 6: lower the learning rate and raise the number of trees
    param6 = [{'learning_rate': [0.01, 0.05, 0.1, 0.2], 'n_estimators': [800, 1000, 1200]}]
    gsearch6 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3, n_estimators=150,
            max_depth=best_max_depth, min_child_weight=best_min_child_weight,
            gamma=best_gamma, subsample=best_subsample, colsample_bytree=best_colsample_bytree,
            reg_alpha=best_reg_alpha, reg_lambda=best_reg_lambda,
            objective='binary:logistic', nthread=4, scale_pos_weight=best_scale_pos_weight, seed=6),
        param_grid=param6, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch6.fit(trainX, trainY)
    print(gsearch6.scorer_)
    print(gsearch6.best_params_, gsearch6.best_score_)
    best_learning_rate = gsearch6.best_params_['learning_rate']
    best_n_estimators = gsearch6.best_params_['n_estimators']

    print('Best parameter sets:')
    print(gsearch1.best_params_, gsearch1.best_score_)
    print(gsearch2.best_params_, gsearch2.best_score_)
    print(gsearch3.best_params_, gsearch3.best_score_)
    print(gsearch4.best_params_, gsearch4.best_score_)
    print(gsearch5.best_params_, gsearch5.best_score_)
    print(gsearch6.best_params_, gsearch6.best_score_)

if __name__ == '__main__':
    # user_model cv: keep the tuning sample consistent with the training sample used later
    print('-------------- Start tuning ---------------')
    start = time.time()
    data_x, temp_x, data_y, temp_y = train_test_split(train_x, train_y, test_size=0.25, random_state=1234)
    xgbpa(data_x, data_y.y)   # the labels must be array-like, not a DataFrame, hence .y
    print('Tuning took: %s' % (time.time() - start))
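A version note of mine: the iid argument of GridSearchCV was deprecated in scikit-learn 0.22 and removed in 0.24, so on a recent install simply drop iid=False from the calls above.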
This run takes quite a while (>0.5 h), so allow enough time.
-------------- Start tuning ---------------
make_scorer(roc_auc_score, needs_threshold=True)
{'max_depth': 3, 'min_child_weight': 3} 0.8169763045780181
make_scorer(roc_auc_score, needs_threshold=True)
{'gamma': 0.0} 0.8169763045780181
make_scorer(roc_auc_score, needs_threshold=True)
{'colsample_bytree': 0.8, 'subsample': 0.8} 0.8169763045780181
make_scorer(roc_auc_score, needs_threshold=True)
{'reg_alpha': 0.6, 'reg_lambda': 0.8} 0.8148521719194484
{'scale_pos_weight': 0.5} 0.8155242908735241
make_scorer(roc_auc_score, needs_threshold=True)
{'learning_rate': 0.01, 'n_estimators': 1200} 0.8467294278425243
Best parameter sets:
{'max_depth': 3, 'min_child_weight': 3} 0.8169763045780181
{'gamma': 0.0} 0.8169763045780181
{'colsample_bytree': 0.8, 'subsample': 0.8} 0.8169763045780181
{'reg_alpha': 0.6, 'reg_lambda': 0.8} 0.8148521719194484
{'scale_pos_weight': 0.5} 0.8155242908735241
{'learning_rate': 0.01, 'n_estimators': 1200} 0.8467294278425243
Tuning took: 1126.5513534545898
Building the feature map (feature index file)
def create_feature_map(features):
    outfile = open('xgb.txt', 'w')   # create a file named xgb.txt for writing
    i = 0
    for feat in features:
        # Format: "<index>\t<feature name>\t<type>", tab-separated;
        # the 'q' in the third column marks the feature as quantitative in XGBoost's fmap format
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i = i + 1
    outfile.close()
create_feature_map(train_x.columns)
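For illustration (the ordering is hypothetical; the names are feature columns that appear later in this post), the generated xgb.txt pairs a running index with each feature name and the type flag:

0	rule_uid	q
1	user_lv_cd	q
2	max_click	q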
Training the model with XGBoost
Only part of the tuning results are used here; a few parameters are not set to their optimal values, but most of them have been carried over.
# Run XGBoost and output the feature-importance ranking
def run_xgboost(data_x, data_y, random_state_num):
    train_x, valid_x, train_y, valid_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=random_state_num)
    print('Start training the model')
    start = time.time()
    # Convert to XGBoost's DMatrix format
    d_train = xgb.DMatrix(train_x, train_y)
    d_valid = xgb.DMatrix(valid_x, valid_y)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # Parameter settings (before tuning; these still mix sklearn-style names, which the note at the top of this post corrects)
    params = {
        'eta': 0.2,                      # shrinkage, range 0~1; usually ends up at 0.01~0.2
        'max_depth': 3,                  # tree depth, usually 3-10; too large overfits, too small underfits
        'min_child_weight': 1,           # minimum child weight; increasing it helps prevent overfitting
        'gamma': 0.4,                    # post-pruning control; larger is more conservative, typically around 0.1-0.2
        'subsample': 0.8,                # row subsampling ratio
        'colsample_bytree': 0.8,         # column subsampling ratio per tree, range 0~1, default 1
        'reg_lambda': 0.8,
        'reg_alpha': 0.6,
        'learning_rate': 0.1,
        'n_estimators': 1000,
        'booster': 'gbtree',             # tree booster
        'objective': 'binary:logistic',  # logistic loss, outputs probabilities
        'nthread': 6,                    # maximum number of threads; uses all cores if unset
        'scale_pos_weight': 1,           # default 1; raise it to handle class imbalance
        'lambda': 1,                     # default 1; L2 regularization, helps avoid overfitting
        'seed': 1234,                    # random seed
        'silent': 1,                     # 0 prints running messages
        'eval_metric': 'auc'             # evaluation metric
    }
    bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=100, verbose_eval=5)   # at most 1000 boosting rounds
    print(time.time() - start)
    tree_nums = bst.best_ntree_limit
    print('Best number of trees: %s, best iteration: %s, auc: %s' % (bst.best_ntree_limit, bst.best_iteration, bst.best_score))
    bst = xgb.train(params, d_train, tree_nums, watchlist, early_stopping_rounds=100, verbose_eval=10)   # retrain with the optimal number of rounds
    # feat_imp = pd.Series(clf.booster().get_fscore()).sort_values(ascending=False)
    # # newer versions need a dict or list:
    # # feat_imp = pd.Series(dict(clf.get_booster().get_fscore())).sort_values(ascending=False)
    # # plt.bar(feat_imp.index, feat_imp)
    # feat_imp.plot(kind='bar', title='Feature Importances')
    # Show the feature-importance ranking
    feat_imp = bst.get_fscore(fmap='xgb.txt')
    feat_imp = sorted(feat_imp.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(feat_imp, columns=['feature', 'fscore'])
    # times each feature is used to split / total splits over all features
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    # highest scores first; show the top-40 features
    df = df.sort_values(by='fscore', ascending=False)
    df = df.iloc[:40]
    plt.figure()
    df.plot(kind='bar', x='feature', y='fscore', legend=True, figsize=(32, 10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')
    plt.show()
    return bst
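A compatibility note of mine (not from the original): best_ntree_limit was deprecated and later removed in newer XGBoost releases. On a recent version you would read bst.best_iteration instead and pass an explicit iteration_range when predicting, roughly:

n_rounds = bst.best_iteration + 1   # boosting rounds up to and including the best one
preds = bst.predict(d_valid, iteration_range=(0, n_rounds))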
Plotting the ROC and K-S curves
# Plot the ROC curve
def plot_roc(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)   # ROC components
    roc_auc = auc(false_positive_rate, true_positive_rate)   # compute the AUC directly
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r.')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')

# Plot the K-S curve: sort predictions in descending order and split into 10 equal groups
def plot_ks(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions, drop_intermediate=False)
    pre = sorted(predictions, reverse=True)   # descending order, as the K-S plot requires
    num = []
    for i in range(10):
        num.append((i) * int(len(pre) / 10))
    num.append(len(pre) - 1)
    df = pd.DataFrame()
    df['false_positive_rate'] = false_positive_rate
    df['true_positive_rate'] = true_positive_rate
    df['thresholds'] = thresholds
    data_ks = []
    for i in num:
        data_ks.append(list(df[df['thresholds'] == pre[i]].values[0]))
    data_ks = pd.DataFrame(data_ks)
    data_ks.columns = ['fpr', 'tpr', 'thresholds']
    ks = max(data_ks['tpr'] - data_ks['fpr'])
    plt.title('K-S curve')
    plt.plot(np.array(range(len(num))), data_ks['tpr'])
    plt.plot(np.array(range(len(num))), data_ks['fpr'])
    plt.plot(np.array(range(len(num))), data_ks['tpr'] - data_ks['fpr'], label='K-S = %0.4f' % ks)
    plt.legend(loc='lower right')
    plt.xlim([0, 10])
    plt.ylim([0.0, 1.0])
    plt.ylabel('cumulative proportion')
    plt.xlabel('group index')

# One figure containing the ROC/AUC and K-S plots for both the training and test sets
def auc_ks(train_x, test_x, train_y, test_y):
    plt.figure(figsize=(15, 15))
    plt.subplot(221)
    plot_roc(train_x, train_y)
    plt.subplot(222)
    plot_roc(test_x, test_y)
    plt.subplot(223)
    plot_ks(train_x, train_y)
    plt.subplot(224)
    plot_ks(test_x, test_y)
    plt.savefig(file_xgboost_model_auc_ks)
    plt.show()
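An aside of mine: the K-S statistic itself needs no 10-group binning. Since roc_curve already returns fpr and tpr for every threshold, K-S is simply their maximum gap; the binning above is only needed for the plot:

fpr, tpr, _ = roc_curve(test_y, bst.predict(xgb.DMatrix(test_x)))
ks = (tpr - fpr).max()   # K-S statistic over all thresholds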
Saving the model, evaluation metrics, selected variables, etc.
# Save the model, evaluation metrics and selected variables to disk
def run_main(data_x, data_y):
    global bst
    start = time.time()
    bst = run_xgboost(data_x, data_y, random_state_num=1234)   # 1234 to match the random state used during tuning
    joblib.dump(bst, file_xgboost_model)   # save the model; on joblib see https://www.cnblogs.com/wzdLY/p/9630671.html
    print('Model saved to %s' % (file_xgboost_model))
    train_x, test_x, train_y, test_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=1234)
    auc_ks(train_x, test_x, train_y, test_y)
    print('Model evaluation metrics saved to: %s' % (file_xgboost_model_auc_ks))
    print('Total running time: %s' % (time.time() - start))

if __name__ == '__main__':
    run_main(train_x, train_y)
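One more portability note of mine: joblib ties the saved file to the xgboost version that pickled it, while the Booster's own save_model/load_model JSON format is stable across versions:

bst.save_model('xgboost_model.json')   # version-stable serialization
bst2 = xgb.Booster()
bst2.load_model('xgboost_model.json')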
The outputs are the AUC and K-S plots for the training and test sets, plus the feature-importance ranking.
Evaluating the model on the validation set
# Plot the ROC curve on held-out data
def plot_test_roc(test_x, test_y, filename):
    bst = joblib.load(filename)
    predictions = bst.predict(xgb.DMatrix(test_x.values))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('Recall')
    plt.xlabel('Fall-out')
    plt.show()

if __name__ == '__main__':
    plot_test_roc(valid_x, valid_y, file_xgboost_model)
The full script is attached below.

# -*- coding: utf-8 -*-
"""
Created on Wed Mar 10 19:01:07 2021

@author: Administrator
"""
#%%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import time
import xgboost as xgb
from xgboost import plot_importance   # function for plotting feature importance
# from imblearn.ensemble import EasyEnsemble   # module not installed yet
from sklearn.model_selection import train_test_split
# from sklearn.externals import joblib has been replaced by:
import joblib
from sklearn.metrics import auc, roc_curve   # classification metrics
plt.rc('font', family='SimHei', size=13)   # so Chinese characters display correctly in plots
%matplotlib inline

#%%
# Training data, online data (no label), validation data
train_data = pd.read_csv('D:/迅雷下載/3.學習模型/3.學習模型/train_user_model_feat.csv')
print(len(train_data[train_data['label']==1]), len(train_data[train_data['label']==0]))   # 1: 815, 0: 42688
online_data = pd.read_csv('D:/迅雷下載/3.學習模型/3.學習模型/online_user_model_feat.csv')
valid_data = pd.read_csv('D:/迅雷下載/3.學習模型/3.學習模型/valid_user_model_feat.csv')
print(len(valid_data[valid_data['label']==1]), len(valid_data[valid_data['label']==0]))   # 1: 892, 0: 39302

#%%
train_y = train_data[['label']]
train_y.columns = ['y']
train_x = train_data.drop(['label','user_id'], axis=1)
valid_y = valid_data[['label']]
valid_y.columns = ['y']
valid_x = valid_data.drop(['label','user_id'], axis=1)

file_xgboost_model = './xgboost_model'                      # model file
file_xgboost_columns = './columns.csv'                      # features finally used
file_xgboost_model_auc_ks = './xgboost_model_auc_ks.png'    # model AUC and KS
file_xgboost_model_score = './xgboost_model_score.png'      # distribution of predicted user scores
file_xgboost_model_prob = './xgboost_model_prob.png'        # distribution of predicted user probabilities

#%%
#coding=utf-8
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV   # grid search
import xgboost as xgb

def xgbpa(trainX, trainY):
    # initial classifier
    xgb1 = XGBClassifier(
        learning_rate=0.3, n_estimators=150, max_depth=5, min_child_weight=1,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=6)

    # Step 1: tune max_depth and min_child_weight; the depth grid runs from 3 to 6
    param1 = {'max_depth': list(range(3, 7)), 'min_child_weight': list(range(1, 5, 2))}
    gsearch1 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3, n_estimators=150, max_depth=5, min_child_weight=1,
            gamma=0, subsample=0.8, colsample_bytree=0.8,
            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=6),
        param_grid=param1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch1.fit(trainX, trainY)
    print(gsearch1.scorer_)
    print(gsearch1.best_params_, gsearch1.best_score_)   # best params (a dict) and best score (a single value)
    best_max_depth = gsearch1.best_params_['max_depth']
    best_min_child_weight = gsearch1.best_params_['min_child_weight']

    # Step 2: tune gamma, plugging in the best max_depth and min_child_weight from above
    param2 = {'gamma': [i / 10.0 for i in range(0, 5, 2)]}
    gsearch2 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3, n_estimators=150,
            max_depth=best_max_depth, min_child_weight=best_min_child_weight,
            gamma=0, subsample=0.8, colsample_bytree=0.8,
            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=6),
        param_grid=param2, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch2.fit(trainX, trainY)
    print(gsearch2.scorer_)
    print(gsearch2.best_params_, gsearch2.best_score_)
    best_gamma = gsearch2.best_params_['gamma']

    # Step 3: tune subsample and colsample_bytree
    param3 = {'subsample': [i / 10.0 for i in range(6, 9)], 'colsample_bytree': [i / 10.0 for i in range(6, 9)]}
    gsearch3 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3, n_estimators=150,
            max_depth=best_max_depth, min_child_weight=best_min_child_weight,
            gamma=best_gamma, subsample=0.8, colsample_bytree=0.8,
            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=6),
        param_grid=param3, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch3.fit(trainX, trainY)
    print(gsearch3.scorer_)
    print(gsearch3.best_params_, gsearch3.best_score_)
    best_subsample = gsearch3.best_params_['subsample']
    best_colsample_bytree = gsearch3.best_params_['colsample_bytree']

    # Step 4: tune the regularization parameters
    param4 = {'reg_alpha': [i / 10.0 for i in range(2, 10, 2)], 'reg_lambda': [i / 10.0 for i in range(2, 10, 2)]}
    gsearch4 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3, n_estimators=150,
            max_depth=best_max_depth, min_child_weight=best_min_child_weight,
            gamma=best_gamma, subsample=best_subsample, colsample_bytree=best_colsample_bytree,
            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=6),
        param_grid=param4, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch4.fit(trainX, trainY)
    print(gsearch4.scorer_)
    print(gsearch4.best_params_, gsearch4.best_score_)
    best_reg_alpha = gsearch4.best_params_['reg_alpha']
    best_reg_lambda = gsearch4.best_params_['reg_lambda']

    # Step 5: tune scale_pos_weight
    param5 = {'scale_pos_weight': [i for i in [0.5, 1, 2]]}
    gsearch5 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3, n_estimators=150,
            max_depth=best_max_depth, min_child_weight=best_min_child_weight,
            gamma=best_gamma, subsample=best_subsample, colsample_bytree=best_colsample_bytree,
            reg_alpha=best_reg_alpha, reg_lambda=best_reg_lambda,
            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=6),
        param_grid=param5, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch5.fit(trainX, trainY)
    print(gsearch5.best_params_, gsearch5.best_score_)
    best_scale_pos_weight = gsearch5.best_params_['scale_pos_weight']

    # Step 6: lower the learning rate and raise the number of trees
    param6 = [{'learning_rate': [0.01, 0.05, 0.1, 0.2], 'n_estimators': [800, 1000, 1200]}]
    gsearch6 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3, n_estimators=150,
            max_depth=best_max_depth, min_child_weight=best_min_child_weight,
            gamma=best_gamma, subsample=best_subsample, colsample_bytree=best_colsample_bytree,
            reg_alpha=best_reg_alpha, reg_lambda=best_reg_lambda,
            objective='binary:logistic', nthread=4, scale_pos_weight=best_scale_pos_weight, seed=6),
        param_grid=param6, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch6.fit(trainX, trainY)
    print(gsearch6.scorer_)
    print(gsearch6.best_params_, gsearch6.best_score_)
    best_learning_rate = gsearch6.best_params_['learning_rate']
    best_n_estimators = gsearch6.best_params_['n_estimators']

    print('Best parameter sets:')
    print(gsearch1.best_params_, gsearch1.best_score_)
    print(gsearch2.best_params_, gsearch2.best_score_)
    print(gsearch3.best_params_, gsearch3.best_score_)
    print(gsearch4.best_params_, gsearch4.best_score_)
    print(gsearch5.best_params_, gsearch5.best_score_)
    print(gsearch6.best_params_, gsearch6.best_score_)

if __name__ == '__main__':
    # user_model cv: keep the tuning sample consistent with the training sample used later
    print('-------------- Start tuning ---------------')
    start = time.time()
    data_x, temp_x, data_y, temp_y = train_test_split(train_x, train_y, test_size=0.25, random_state=1234)
    xgbpa(data_x, data_y.y)   # the labels must be array-like, not a DataFrame, hence .y
    print('Tuning took: %s' % (time.time() - start))

#%%
def create_feature_map(features):
    outfile = open('xgb.txt', 'w')   # create a file named xgb.txt for writing
    i = 0
    for feat in features:
        # Format: "<index>\t<feature name>\t<type>", tab-separated; 'q' marks a quantitative feature
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i = i + 1
    outfile.close()

create_feature_map(train_x.columns)

#%%
# Run XGBoost and output the feature-importance ranking
def run_xgboost(data_x, data_y, random_state_num):
    train_x, valid_x, train_y, valid_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=random_state_num)
    print('Start training the model')
    start = time.time()
    # Convert to XGBoost's DMatrix format
    d_train = xgb.DMatrix(train_x, train_y)
    d_valid = xgb.DMatrix(valid_x, valid_y)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # Parameter settings (before tuning; these still mix sklearn-style names, see the note at the top of the post)
    params = {
        'eta': 0.2,                      # shrinkage, range 0~1; usually ends up at 0.01~0.2
        'max_depth': 3,                  # tree depth, usually 3-10; too large overfits, too small underfits
        'min_child_weight': 1,           # minimum child weight; increasing it helps prevent overfitting
        'gamma': 0.4,                    # post-pruning control; larger is more conservative, typically around 0.1-0.2
        'subsample': 0.8,                # row subsampling ratio
        'colsample_bytree': 0.8,         # column subsampling ratio per tree, range 0~1, default 1
        'reg_lambda': 0.8,
        'reg_alpha': 0.6,
        'learning_rate': 0.1,
        'n_estimators': 1000,
        'booster': 'gbtree',             # tree booster
        'objective': 'binary:logistic',  # logistic loss, outputs probabilities
        'nthread': 6,                    # maximum number of threads; uses all cores if unset
        'scale_pos_weight': 1,           # default 1; raise it to handle class imbalance
        'lambda': 1,                     # default 1; L2 regularization, helps avoid overfitting
        'seed': 1234,                    # random seed
        'silent': 1,                     # 0 prints running messages
        'eval_metric': 'auc'             # evaluation metric
    }
    bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=100, verbose_eval=5)   # at most 1000 boosting rounds
    print(time.time() - start)
    tree_nums = bst.best_ntree_limit
    print('Best number of trees: %s, best iteration: %s, auc: %s' % (bst.best_ntree_limit, bst.best_iteration, bst.best_score))
    bst = xgb.train(params, d_train, tree_nums, watchlist, early_stopping_rounds=100, verbose_eval=10)   # retrain with the optimal number of rounds
    # feat_imp = pd.Series(clf.booster().get_fscore()).sort_values(ascending=False)
    # # newer versions need a dict or list:
    # # feat_imp = pd.Series(dict(clf.get_booster().get_fscore())).sort_values(ascending=False)
    # # plt.bar(feat_imp.index, feat_imp)
    # feat_imp.plot(kind='bar', title='Feature Importances')
    # Show the feature-importance ranking
    feat_imp = bst.get_fscore(fmap='xgb.txt')
    feat_imp = sorted(feat_imp.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(feat_imp, columns=['feature', 'fscore'])
    # times each feature is used to split / total splits over all features
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    # highest scores first; show the top-40 features
    df = df.sort_values(by='fscore', ascending=False)
    df = df.iloc[:40]
    plt.figure()
    df.plot(kind='bar', x='feature', y='fscore', legend=True, figsize=(32, 10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')
    plt.show()
    return bst

#%%
# Plot the ROC curve
def plot_roc(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)   # ROC components
    roc_auc = auc(false_positive_rate, true_positive_rate)   # compute the AUC directly
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r.')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')

# Plot the K-S curve: sort predictions in descending order and split into 10 equal groups
def plot_ks(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions, drop_intermediate=False)
    pre = sorted(predictions, reverse=True)   # descending order, as the K-S plot requires
    num = []
    for i in range(10):
        num.append((i) * int(len(pre) / 10))
    num.append(len(pre) - 1)
    df = pd.DataFrame()
    df['false_positive_rate'] = false_positive_rate
    df['true_positive_rate'] = true_positive_rate
    df['thresholds'] = thresholds
    data_ks = []
    for i in num:
        data_ks.append(list(df[df['thresholds'] == pre[i]].values[0]))
    data_ks = pd.DataFrame(data_ks)
    data_ks.columns = ['fpr', 'tpr', 'thresholds']
    ks = max(data_ks['tpr'] - data_ks['fpr'])
    plt.title('K-S curve')
    plt.plot(np.array(range(len(num))), data_ks['tpr'])
    plt.plot(np.array(range(len(num))), data_ks['fpr'])
    plt.plot(np.array(range(len(num))), data_ks['tpr'] - data_ks['fpr'], label='K-S = %0.4f' % ks)
    plt.legend(loc='lower right')
    plt.xlim([0, 10])
    plt.ylim([0.0, 1.0])
    plt.ylabel('cumulative proportion')
    plt.xlabel('group index')

# One figure containing the ROC/AUC and K-S plots for both the training and test sets
def auc_ks(train_x, test_x, train_y, test_y):
    plt.figure(figsize=(15, 15))
    plt.subplot(221)
    plot_roc(train_x, train_y)
    plt.subplot(222)
    plot_roc(test_x, test_y)
    plt.subplot(223)
    plot_ks(train_x, train_y)
    plt.subplot(224)
    plot_ks(test_x, test_y)
    plt.savefig(file_xgboost_model_auc_ks)
    plt.show()

#%%
# Save the model, evaluation metrics and selected variables to disk
def run_main(data_x, data_y):
    global bst
    start = time.time()
    bst = run_xgboost(data_x, data_y, random_state_num=1234)   # 1234 to match the random state used during tuning
    joblib.dump(bst, file_xgboost_model)   # save the model; on joblib see https://www.cnblogs.com/wzdLY/p/9630671.html
    print('Model saved to %s' % (file_xgboost_model))
    train_x, test_x, train_y, test_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=1234)
    auc_ks(train_x, test_x, train_y, test_y)
    print('Model evaluation metrics saved to: %s' % (file_xgboost_model_auc_ks))
    print('Total running time: %s' % (time.time() - start))

if __name__ == '__main__':
    run_main(train_x, train_y)

# Plot the ROC curve on held-out data
def plot_test_roc(test_x, test_y, filename):
    bst = joblib.load(filename)
    predictions = bst.predict(xgb.DMatrix(test_x.values))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('Recall')
    plt.xlabel('Fall-out')
    plt.show()

if __name__ == '__main__':
    plot_test_roc(valid_x, valid_y, file_xgboost_model)
Finally, a few shortcomings of this tuning procedure:
1. Tuning was done on the full sample set, which is not very appropriate.
2. Grid search is too time-consuming (see the randomized-search sketch after this list).
3. The tuning order was not right, so the later steps did not perform well.
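On point 2, a hedged alternative (my suggestion, not part of the original code): RandomizedSearchCV samples a fixed number of candidate settings instead of exhausting the grid, which usually cuts the runtime dramatically. A minimal sketch, assuming the same data split as above:

from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

rsearch = RandomizedSearchCV(
    estimator=XGBClassifier(objective='binary:logistic', nthread=4, seed=6),
    param_distributions={
        'max_depth': randint(3, 7),             # samples integers 3..6
        'min_child_weight': randint(1, 5),
        'subsample': uniform(0.6, 0.3),         # samples floats in [0.6, 0.9]
        'colsample_bytree': uniform(0.6, 0.3),
        'learning_rate': uniform(0.01, 0.19),   # samples floats in [0.01, 0.2]
    },
    n_iter=30,   # 30 random candidates instead of the full cartesian grid
    scoring='roc_auc', cv=5, n_jobs=4, random_state=1234)
# rsearch.fit(data_x, data_y.y); rsearch.best_params_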
Later I also built a logistic regression model; the code is below:

# -*- coding: utf-8 -*-
"""
Created on Wed Mar 10 15:58:41 2021

@author: Administrator
"""
#%%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import time
import xgboost as xgb
from xgboost import plot_importance   # function for plotting feature importance
# from imblearn.ensemble import EasyEnsemble   # module not installed yet
from sklearn.model_selection import train_test_split
# from sklearn.externals import joblib has been replaced by:
import joblib
from sklearn.metrics import auc, roc_curve   # classification metrics
plt.rc('font', family='SimHei', size=13)   # so Chinese characters display correctly in plots
%matplotlib inline

#%%
# Training data, online data (no label), validation data
train_data = pd.read_csv('D:/迅雷下載/3.學習模型/3.學習模型/train_user_model_feat.csv')
print(len(train_data[train_data['label']==1]), len(train_data[train_data['label']==0]))   # 1: 815, 0: 42688
online_data = pd.read_csv('D:/迅雷下載/3.學習模型/3.學習模型/online_user_model_feat.csv')
valid_data = pd.read_csv('D:/迅雷下載/3.學習模型/3.學習模型/valid_user_model_feat.csv')
print(len(valid_data[valid_data['label']==1]), len(valid_data[valid_data['label']==0]))   # 1: 892, 0: 39302

#%%
for i in train_data.columns:
    print(i, train_data[i].nunique())
a = []
for i in train_data.columns:
    if train_data[i].nunique() < 10:
        a.append(i)
a = ['rule_uid', 'user_lv_cd', 'user_date_cnt_b7day', 'uc_date_cnt_b7day', 'uc_act_4', 'uc_buy_bool_day7']
train_data.pop('user_id')
train_data['y'] = train_data['label']
train_data.pop('label')

#%% IV of the categorical variables
import pycard as pc
cate_iv_woedf = pc.WoeDf()
for i in a:
    cate_iv_woedf.append(pc.cross_woe(train_data[i], train_data.y))
#cate_iv_woedf.to_excel('tmp1')

#%% IV of the numerical variables
num_col = [i for i in train_data.columns if i not in a]
num_col.remove('y')
clf = pc.NumBin(min_bin_samples=200, min_impurity_decrease=4e-5)
for i in num_col:
    clf.fit(train_data[i], train_data.y)
    #clf.generate_transform_fun()
    cate_iv_woedf.append(clf.woe_df_)

#%% Correlation analysis
train_data.corr_tri().abs().to_excel('tmp1.xlsx')

def argmax(x):
    """Return the (row, col) index of the maximum value in df x."""
    m0 = x.max()
    max_value = m0.max()
    col_label = m0.idxmax()
    row_label = x[col_label].idxmax()
    return row_label, col_label

def corr_filter(detail_df, vars_iv, corr_tol=0.9, iv_diff=0.01):
    """For any pair of columns whose correlation >= corr_tol, say var1 has the higher IV:
    if var1_iv - var2_iv > iv_diff, drop the column with the lower IV. \n

    Parameters:
    ----------
    detail_df: dataframe, the detail data on which correlations are computed \n
    vars_iv: dataframe with two columns, colName and IV, holding each variable's IV;
        usually returned by the woedf.var_ivs method. \n
    corr_tol: float, correlation threshold; pairs above it are checked for a large enough IV gap \n
    iv_diff: float, IV-difference threshold; the lower-IV column is dropped automatically

    Returns:
    ----------
    corr_df: dataframe, the correlation matrix with over-correlated columns removed \n
    dropped_col: list, the dropped columns"""
    corr_df = detail_df.corr_tri().abs()
    vars_iv = vars_iv.set_index('colName')
    corr_df = corr_df.fillna(0)
    dropped_col = []
    while True:
        row, col = argmax(corr_df)
        if corr_df.loc[row, col] >= corr_tol:
            drop_label = row if vars_iv.loc[row, 'IV'] < vars_iv.loc[col, 'IV'] else col
            dropped_col.append(drop_label)
            corr_df = corr_df.drop(drop_label).drop(drop_label, axis=1)
            vars_iv = vars_iv.drop(drop_label)
            if len(corr_df) == 1:
                break
        else:
            break
    return corr_df, dropped_col

t = cate_iv_woedf.var_ivs().iloc[:, 0:-1].reset_index()
t.columns = ['colName', 'IV']
corr_df, dropped_col = corr_filter(train_data[a + num_col], t, corr_tol=0.75, iv_diff=0.00001)
data_drop_corr = train_data.drop(columns=dropped_col)

#%%
cate_iv_woedf = pc.WoeDf()
clf = pc.NumBin(min_bin_samples=200, min_impurity_decrease=4e-5)
for i in data_drop_corr.columns:
    clf.fit(data_drop_corr[i], data_drop_corr.y)
    #clf.generate_transform_fun()
    cate_iv_woedf.append(clf.woe_df_)
cate_iv_woedf.to_excel('tmp1')

#%% Drop the following fields
drop_2 = ["uc_date_cnt_b7day",
          "user_act_totalCnt_15day",
          "max_click",
          "freq_click",
          "uc_act_decay_3",
          "uc_act_4",
          "uc_act_time_zone_0",
          "uc_act_time_zone_1",
          "ratio_1_6",
          "uc_last_tm_dist",
          "user_date_cnt_b15day",
          "uc_date_cnt_b15day",
          "uc_date_ratio_15",
          "uc_act_totalCnt",
          "uc_act_ratio_60day",
          "uc_act_ratio_15day",
          "ratio_act_time_1day",
          "user_act_time_5day",
          "ratio_act_time_5day",
          "mean_uc_act",
          "uc_act_time_zone_2",
          "uc_act_time_zone_3",
          "uc_buy_bool_day7"]
# Also drop V13 V15 V22 V24 V25 V26
num_col = [i for i in data_drop_corr.columns if i not in drop_2]
num_col.remove('y')
num_iv_woedf = pc.WoeDf()
clf = pc.NumBin(min_bin_samples=200, min_impurity_decrease=4e-5)
for i in num_col:
    clf.fit(data_drop_corr[i], data_drop_corr.y)
    data_drop_corr[i + '_bin'] = clf.transform(data_drop_corr[i])   # this skips the separate _bin conversion step later
    num_iv_woedf.append(clf.woe_df_)

#%% WOE transformation
bin_col = [i for i in list(data_drop_corr.columns) if i[-4:] == '_bin']
cate_iv_woedf = pc.WoeDf()
for i in bin_col:
    cate_iv_woedf.append(pc.cross_woe(data_drop_corr[i], data_drop_corr.y))
#cate_iv_woedf.to_excel('tmp1')
cate_iv_woedf.bin2woe(data_drop_corr, bin_col)
cate_iv_woedf.to_excel('tmp.xlsx')

#%% Modeling
model_col = [i for i in list(data_drop_corr.columns) if i[-4:] == '_woe']
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

X = data_drop_corr[model_col]
Y = data_drop_corr['y']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)
X1 = sm.add_constant(x_train)   # prepend a constant column of 1s so the regression has an intercept
logit = sm.Logit(y_train.astype(float), X1.astype(float))
result = logit.fit()
result.summary()
result.params

resu_1 = result.predict(X1.astype(float))
fpr, tpr, threshold = roc_curve(y_train, resu_1)
rocauc = auc(fpr, tpr)   # 0.9693313248601317
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('true positive rate')
plt.xlabel('false positive rate')
plt.show()

#%% Test set
X3 = sm.add_constant(x_test)
resu = result.predict(X3.astype(float))
fpr, tpr, threshold = roc_curve(y_test, resu)
rocauc = auc(fpr, tpr)
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('true positive rate')
plt.xlabel('false positive rate')
plt.show()

#%% Validation set
num_iv_woedf_1 = pc.WoeDf()
clf = pc.NumBin(min_bin_samples=200, min_impurity_decrease=4e-5)
for i in num_col:
    clf.fit(data_drop_corr[i], data_drop_corr.y)
    valid_data[i + '_bin'] = pc.binning(valid_data[i], clf.bins_)   # this skips the separate _bin conversion step later
    #num_iv_woedf_1.append(clf.woe_df_)

#%% WOE transformation
bin_col_1 = [i for i in list(valid_data.columns) if i[-4:] == '_bin']
cate_iv_woedf.bin2woe(valid_data, bin_col)
model_col_1 = [i for i in list(valid_data.columns) if i[-4:] == '_woe']
valid_data = valid_data.rename(columns={'label': 'y'})
X_test = valid_data[model_col_1]
Y_test = valid_data['y']
X4 = sm.add_constant(X_test)
resu = result.predict(X4.astype(float))
fpr, tpr, threshold = roc_curve(Y_test, resu)
rocauc = auc(fpr, tpr)   # 0.7931891609482327
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('true positive rate')
plt.xlabel('false positive rate')
plt.show()
The plots above show the AUC on the test set and on the validation set. The logistic regression is slightly worse than XGBoost on the validation set but slightly better on the test set, which is consistent with XGBoost's reputation for being relatively resistant to overfitting.