Dataset: https://www.kaggle.com/mlg-ulb/creditcardfraud
Data overview
The dataset contains transactions made by European cardholders via credit card in September 2013.
It covers transactions from two days, with 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.
It contains only numerical input variables, which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, the original features and further background information cannot be provided. Features V1, V2, …, V28 are the principal components obtained with PCA; the only features not transformed with PCA are "Time" and "Amount". "Time" holds the seconds elapsed between each transaction and the first transaction in the dataset. "Amount" is the transaction amount, which can be used for example-dependent cost-sensitive learning. "Class" is the response variable: it takes the value 1 in case of fraud and 0 otherwise.
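Both figures are easy to verify once the CSV from the Kaggle link above has been downloaded. A quick check (the file location here is an assumption, adjust it to wherever you saved the data):

import pandas as pd

df = pd.read_csv('creditcard.csv')   # assumed local path
print(df.shape)                      # (284807, 31): Time, V1-V28, Amount, Class
print(df['Class'].value_counts())    # 284315 normal vs 492 fraud
print(df['Class'].mean())            # ~0.00173, i.e. the 0.172% quoted above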
The task: identify fraudulent credit card transactions.
Given the class imbalance ratio, we recommend measuring performance with the Area Under the Precision-Recall Curve (AUPRC); for such unbalanced classification, plain confusion-matrix accuracy is not meaningful.
There is a good deal more to say about why the PR curve is the right tool here; that write-up will be added later. For now, the code.
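Until then, here is a minimal sketch of how AUPRC can be computed with scikit-learn. The two arrays are toy stand-ins: in the scripts below, y_true would be the held-out labels and y_score the predicted fraud probabilities.

import numpy as np
from sklearn.metrics import precision_recall_curve, auc, average_precision_score

y_true = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])                                 # toy labels
y_score = np.array([0.02, 0.10, 0.05, 0.01, 0.30, 0.08, 0.03, 0.20, 0.85, 0.40])  # toy scores

precision, recall, _ = precision_recall_curve(y_true, y_score)
print('AUPRC:', auc(recall, precision))                 # area under the PR curve
print('AP:', average_precision_score(y_true, y_score))  # a step-wise estimate of the same area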
First, logistic regression.

# -*- coding: utf-8 -*-
"""
Created on Thu Feb 18 17:22:54 2021
@author: Administrator
"""
#%% Imports
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, precision_score, recall_score, f1_score, confusion_matrix
import pycard as pc  # assumption: a self-written scorecard helper providing NumBin, WoeDf and cross_woe
%matplotlib inline
plt.rc("font", family="SimHei", size="12")  # so Chinese characters display correctly in plots

#%% Load data
creditcard = pd.read_csv('D:/信用卡欺詐檢測/creditcard.csv/creditcard.csv')
creditcard.info()
creditcard.isnull().sum()
creditcard.corr().to_excel('tmp1.xlsx')

#%% A first look at the target
creditcard.Class.mean()  # extremely skewed target, ~280k rows

#%% IV values of the numeric variables
num_col = list(creditcard.columns)[1:-1]
num_iv_woedf = pc.WoeDf()
clf = pc.NumBin()
for i in num_col:
    clf.fit(creditcard[i], creditcard.Class)
    #clf.generate_transform_fun()
    num_iv_woedf.append(clf.woe_df_)
num_iv_woedf.to_excel('tmp2')

# Drop the weak variables: V13 V15 V22 V24 V25 V26
num_col = [i for i in num_col if i not in ['V13', 'V15', 'V22', 'V24', 'V25', 'V26']]
num_iv_woedf = pc.WoeDf()
clf = pc.NumBin()
for i in num_col:
    clf.fit(creditcard[i], creditcard.Class)
    creditcard[i + '_bin'] = clf.transform(creditcard[i])  # bin in the same pass, saving a separate step later
    num_iv_woedf.append(clf.woe_df_)

#%% WoE transform
bin_col = [i for i in list(creditcard.columns) if i[-4:] == '_bin']
cate_iv_woedf = pc.WoeDf()
for i in bin_col:
    cate_iv_woedf.append(pc.cross_woe(creditcard[i], creditcard.Class))
cate_iv_woedf.to_excel('tmp1')
cate_iv_woedf.bin2woe(creditcard, bin_col)

#%% Modeling
model_col = [i for i in list(creditcard.columns) if i[-4:] == '_woe']
X = creditcard[model_col]
Y = creditcard['Class']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

X1 = sm.add_constant(x_train)  # prepend a column of ones so the regression has an intercept
logit = sm.Logit(y_train.astype(float), X1.astype(float))
result = logit.fit()
result.summary()
result.params

# Training-set ROC
resu_1 = result.predict(X1.astype(float))
fpr, tpr, threshold = roc_curve(y_train, resu_1)
rocauc = auc(fpr, tpr)  # 0.9693313248601317
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True positive rate')
plt.xlabel('False positive rate')
plt.show()

# Training-set confusion matrix at a 0.5 cutoff
resu_1 = resu_1.apply(lambda x: 1 if x >= 0.5 else 0)
matrix = confusion_matrix(y_train, resu_1)
print("Confusion matrix:\n", matrix)
print("Precision:", precision_score(y_train, resu_1))
print("Recall:", recall_score(y_train, resu_1))
print("F1:", f1_score(y_train, resu_1))
'''
Confusion matrix:
 [[198985     29]
 [    73    277]]
Precision: 0.9052287581699346
Recall: 0.7914285714285715
F1: 0.8445121951219513
'''

#%% Test set
X3 = sm.add_constant(x_test)
resu = result.predict(X3.astype(float))
fpr, tpr, threshold = roc_curve(y_test, resu)
rocauc = auc(fpr, tpr)
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True positive rate')
plt.xlabel('False positive rate')
plt.show()

# Test-set confusion matrix at the same cutoff
resu = resu.apply(lambda x: 1 if x >= 0.5 else 0)
matrix = confusion_matrix(y_test, resu)
print("Confusion matrix:\n", matrix)
print("Precision:", precision_score(y_test, resu))
print("Recall:", recall_score(y_test, resu))
print("F1:", f1_score(y_test, resu))
'''
Confusion matrix:
 [[85275    26]
 [   40   102]]
Precision: 0.796875
Recall: 0.7183098591549296
F1: 0.7555555555555555
'''

#%% Try a weighted-TPR metric: TPR at FPR = 0.001, 0.005 and 0.01, combined with weights 0.4/0.3/0.3
def tpr_weight_funtion(y_true, y_predict):
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    TR1 = pCumsumPer[abs(nCumsumPer - 0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer - 0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer - 0.01).idxmin()]
    return 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3

tpr_weight_funtion(y_train, resu_1)  # 0.8754285714285714 (note: at this point resu_1 holds the 0/1 labels from above)
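One caveat on the code above: pc is not a standard package (it appears to be a self-written scorecard helper providing NumBin, WoeDf and cross_woe). For readers without it, the binning/WoE step it performs boils down to something like the following plain-pandas sketch on hypothetical equal-frequency bins; the sign convention WoE = ln(%bad / %good) is one of the two common ones.

import numpy as np
import pandas as pd

def woe_table(x, y, bins=10):
    # equal-frequency binning; in practice, bins with zero events would need merging or smoothing
    df = pd.DataFrame({'bin': pd.qcut(x, q=bins, duplicates='drop'), 'y': y})
    grp = df.groupby('bin')['y'].agg(['sum', 'count'])
    grp['bad'] = grp['sum']                   # events (fraud) per bin
    grp['good'] = grp['count'] - grp['sum']   # non-events per bin
    bad_share = grp['bad'] / grp['bad'].sum()
    good_share = grp['good'] / grp['good'].sum()
    grp['woe'] = np.log(bad_share / good_share)
    grp['iv'] = (bad_share - good_share) * grp['woe']  # the IV used above is this column summed
    return grp[['good', 'bad', 'woe', 'iv']]

# usage sketch: woe_table(creditcard['V1'], creditcard['Class'])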
Next, the code for the xgboost model.

# -*- coding: utf-8 -*-
"""
Created on Wed Mar 10 19:47:40 2021
@author: Administrator
"""
#%% Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import time
import xgboost as xgb
from xgboost import plot_importance  # feature-importance plot
#from imblearn.ensemble import EasyEnsemble  # module not installed yet
from sklearn.model_selection import train_test_split
#from sklearn.externals import joblib  # moved out of sklearn; import joblib directly now
import joblib
from sklearn.metrics import auc, roc_curve  # classification metrics
plt.rc('font', family='SimHei', size=13)  # so Chinese characters display correctly in plots
%matplotlib inline

#%% Load data
creditcard = pd.read_csv('D:/信用卡欺詐檢測/creditcard.csv/creditcard.csv')

#%% Features, target and output files
train_y = creditcard[['Class']]
train_y.columns = ['y']
train_x = creditcard.drop(['Class', 'Time'], axis=1)

file_xgboost_model = './xgboost_model'                    # model file
file_xgboost_columns = './columns.csv'                    # final feature list
file_xgboost_model_auc_ks = './xgboost_model_auc_ks.png'  # AUC and K-S plots
file_xgboost_model_score = './xgboost_model_score.png'    # score distribution plot
file_xgboost_model_prob = './xgboost_model_prob.png'      # probability distribution plot

#%% Feature map for get_fscore: one line per feature, "index<TAB>name<TAB>q" ('q' marks a quantitative feature in xgboost's fmap format)
def create_feature_map(features):
    outfile = open('xgb.txt', 'w')
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i = i + 1
    outfile.close()

create_feature_map(train_x.columns)

#%% Train XGBoost and plot the feature-importance ranking
def run_xgboost(data_x, data_y, random_state_num):
    train_x, valid_x, train_y, valid_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=random_state_num)
    print('Training started')
    start = time.time()
    # convert to xgb's DMatrix format
    d_train = xgb.DMatrix(train_x, train_y)
    d_valid = xgb.DMatrix(valid_x, valid_y)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # parameter settings (before tuning)
    params = {
        'eta': 0.2,                      # shrinkage, 0~1; usually ends up around 0.01~0.2
        'max_depth': 3,                  # tree depth, usually 3-10; too deep overfits, too shallow underfits
        'min_child_weight': 1,           # minimum child weight; raising it helps prevent overfitting
        'gamma': 0.4,                    # post-pruning control; larger is more conservative, typically 0.1-0.2
        'subsample': 0.8,                # row subsampling ratio
        'colsample_bytree': 0.8,         # column subsampling ratio per tree, 0~1 (default 1)
        'reg_lambda': 0.8,
        'reg_alpha': 0.6,
        'learning_rate': 0.1,            # alias of eta; setting both is redundant
        'n_estimators': 500,
        'booster': 'gbtree',             # tree booster
        'objective': 'binary:logistic',  # logistic loss, outputs probabilities
        'nthread': 6,                    # thread cap; defaults to all cores if unset
        'scale_pos_weight': 1,           # weight of the positive class (default 1); raising it helps with imbalance
        'lambda': 1,                     # L2 smoothing term; alias of reg_lambda, overrides the 0.8 above
        'seed': 1234,
        'silent': 1,                     # accepted by older xgboost; newer versions use 'verbosity'
        'eval_metric': 'auc'
    }
    bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=100, verbose_eval=5)  # at most 1000 rounds
    print(time.time() - start)
    tree_nums = bst.best_ntree_limit
    print('best tree count: %s, best iteration: %s, auc: %s' % (bst.best_ntree_limit, bst.best_iteration, bst.best_score))
    bst = xgb.train(params, d_train, tree_nums, watchlist, early_stopping_rounds=100, verbose_eval=10)  # retrain with the optimal round count
    # feature importance: each feature's split count / total split count
    # alternative with the sklearn wrapper: pd.Series(dict(clf.get_booster().get_fscore())).sort_values(ascending=False)
    feat_imp = bst.get_fscore(fmap='xgb.txt')
    feat_imp = sorted(feat_imp.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(feat_imp, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    # highest scores first; show the top 40 features
    df = df.sort_values(by='fscore', ascending=False)
    df = df.iloc[:40]
    plt.figure()
    df.plot(kind='bar', x='feature', y='fscore', legend=True, figsize=(32, 10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')
    plt.show()
    return bst

#%% ROC curve
def plot_roc(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r.')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')

# K-S plot: sort the scores in descending order and cut them into 10 equal groups
def plot_ks(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions, drop_intermediate=False)
    pre = sorted(predictions, reverse=True)  # descending order, as the K-S plot requires
    num = []
    for i in range(10):
        num.append((i) * int(len(pre) / 10))
    num.append(len(pre) - 1)
    df = pd.DataFrame()
    df['false_positive_rate'] = false_positive_rate
    df['true_positive_rate'] = true_positive_rate
    df['thresholds'] = thresholds
    data_ks = []
    for i in num:
        data_ks.append(list(df[df['thresholds'] == pre[i]].values[0]))
    data_ks = pd.DataFrame(data_ks)
    data_ks.columns = ['fpr', 'tpr', 'thresholds']
    ks = max(data_ks['tpr'] - data_ks['fpr'])  # K-S = the maximum gap between the two cumulative curves
    plt.title('K-S curve')
    plt.plot(np.array(range(len(num))), data_ks['tpr'])
    plt.plot(np.array(range(len(num))), data_ks['fpr'])
    plt.plot(np.array(range(len(num))), data_ks['tpr'] - data_ks['fpr'], label='K-S = %0.4f' % ks)
    plt.legend(loc='lower right')
    plt.xlim([0, 10])
    plt.ylim([0.0, 1.0])
    plt.ylabel('cumulative share')
    plt.xlabel('group index')

# One figure combining ROC/AUC and K-S for the training and test sets
def auc_ks(train_x, test_x, train_y, test_y):
    plt.figure(figsize=(15, 15))
    plt.subplot(221)
    plot_roc(train_x, train_y)
    plt.subplot(222)
    plot_roc(test_x, test_y)
    plt.subplot(223)
    plot_ks(train_x, train_y)
    plt.subplot(224)
    plot_ks(test_x, test_y)
    plt.savefig(file_xgboost_model_auc_ks)
    plt.show()

#%% Train, then save the model and the evaluation plots
def run_main(data_x, data_y):
    global bst
    start = time.time()
    bst = run_xgboost(data_x, data_y, random_state_num=1234)  # 1234 because tuning used random_state=1234
    joblib.dump(bst, file_xgboost_model)  # persist the model; on joblib see https://www.cnblogs.com/wzdLY/p/9630671.html
    print('Model saved to %s' % (file_xgboost_model))
    train_x, test_x, train_y, test_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=1234)
    auc_ks(train_x, test_x, train_y, test_y)
    print('Evaluation plots saved to: %s' % (file_xgboost_model_auc_ks))
    print('Total running time: %s' % (time.time() - start))
    resu = bst.predict(xgb.DMatrix(test_x))

if __name__ == '__main__':
    run_main(train_x, train_y)

#%% Run this cell alone to get the confusion matrix
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
bst = run_xgboost(train_x, train_y, random_state_num=1234)
train_x, test_x, train_y, test_y = train_test_split(train_x.values, train_y.values, test_size=0.25, random_state=1234)
resu = bst.predict(xgb.DMatrix(test_x))
resu = pd.DataFrame(resu)
resu.columns = ['y']
resu = resu['y'].apply(lambda x: 1 if x > 0.5 else 0)
resu = resu.values
matrix = confusion_matrix(test_y, resu)
print("Confusion matrix:\n", matrix)
print("Precision:", precision_score(test_y, resu))
print("Recall:", recall_score(test_y, resu))
print("F1:", f1_score(test_y, resu))
'''
Confusion matrix:
 [[71078     6]
 [   32    86]]
Precision: 0.9347826086956522
Recall: 0.7288135593220338
F1: 0.819047619047619
'''
Confusion matrix:
[[71078     6]
 [   32    86]]
Precision: 0.9347826086956522
Recall: 0.7288135593220338
F1: 0.819047619047619
As we can see, the xgboost model already beats the logistic regression model, and that is without any parameter tuning.
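Since run_main persists the booster with joblib.dump, scoring new data later only needs a reload. A minimal sketch using the file name from the script; new_data is a hypothetical DataFrame with the same columns, in the same order, as train_x.

import joblib
import xgboost as xgb

bst = joblib.load('./xgboost_model')                # the file written by run_main above
scores = bst.predict(xgb.DMatrix(new_data.values))  # new_data: hypothetical fresh transactions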
2021.03.12 update: LightGBM
The code is as follows:

# -*- coding: utf-8 -*-
"""
Created on Fri Mar 12 14:43:16 2021
@author: Administrator
"""
#%% Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, train_test_split, cross_val_score
%matplotlib inline
plt.rc("font", family="SimHei", size="12")  # so Chinese characters display correctly in plots

#%% Load data
creditcard = pd.read_csv('D:/信用卡欺詐檢測/creditcard.csv/creditcard.csv')
creditcard.info()  # 284807 rows
creditcard.isnull().sum()
creditcard.head(3)
creditcard.rename(columns={'Class': 'y'}, inplace=True)

# separate features and target for cross-validation
X_train = creditcard.iloc[:, 0:-1]
y_train = creditcard.y

# 5-fold cross-validation
folds = 5
seed = 2021
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

#%% Split the training data into a training and a validation part
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.3, stratify=y_train)
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(X_val, label=y_val)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.1,
    'metric': 'auc',
    'min_child_weight': 1,
    'num_leaves': 10,
    'max_depth': 7,
    'reg_lambda': 0,
    'reg_alpha': 0,
    'feature_fraction': 1,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'seed': 2020,
    'nthread': 8,
    'silent': True,
    'verbose': -1,
}

# train on the training split
# (verbose_eval/early_stopping_rounds are the pre-4.0 LightGBM API; newer versions use callbacks)
model = lgb.train(params, train_set=train_matrix, valid_sets=valid_matrix,
                  num_boost_round=20000, verbose_eval=1000, early_stopping_rounds=200)
# [847] valid_0's auc: 0.94372

# predict and compute the ROC metrics
val_pre_lgb = model.predict(X_val, num_iteration=model.best_iteration)
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lgb)
roc_auc = metrics.auc(fpr, tpr)
print('Untuned lightgbm AUC on the validation set: {}'.format(roc_auc))

# plot the ROC curve
plt.figure(figsize=(8, 8))
plt.title('Validation ROC')
plt.plot(fpr, tpr, 'b', label='Val AUC = %0.4f' % roc_auc)
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.legend(loc='best')
plt.title('ROC')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.show()

#%% 5-fold cross-validated lightgbm
cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    print('************************************ {} ************************************'.format(str(i + 1)))
    X_train_split, y_train_split, X_val, y_val = X_train.iloc[train_index], y_train[train_index], X_train.iloc[valid_index], y_train[valid_index]
    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.1,
        'metric': 'auc',
        'min_child_weight': 1e-3,
        'num_leaves': 10,
        'max_depth': -1,
        'reg_lambda': 0,
        'reg_alpha': 0,
        'feature_fraction': 1,
        'bagging_fraction': 1,
        'bagging_freq': 0,
        'seed': 2021,
        'nthread': 8,
        'silent': True,
        'verbose': -1,
    }
    model = lgb.train(params, train_set=train_matrix, num_boost_round=20000,
                      valid_sets=valid_matrix, verbose_eval=1000, early_stopping_rounds=200)
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    cv_scores.append(roc_auc_score(y_val, val_pred))
    print(cv_scores)

print("lgb_scotrainre_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))

#%% Bayesian tuning
# objective function for the optimizer
def rf_cv_lgb(num_leaves, max_depth, bagging_fraction, feature_fraction, bagging_freq,
              min_data_in_leaf, min_child_weight, min_split_gain, reg_lambda, reg_alpha):
    # build the model ('objective' was misspelled 'bjective' in the original)
    model_lgb = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metric='auc',
                                   learning_rate=0.1, n_estimators=5000,
                                   num_leaves=int(num_leaves), max_depth=int(max_depth),
                                   bagging_fraction=round(bagging_fraction, 2),
                                   feature_fraction=round(feature_fraction, 2),
                                   bagging_freq=int(bagging_freq),
                                   min_data_in_leaf=int(min_data_in_leaf),
                                   min_child_weight=min_child_weight,
                                   min_split_gain=min_split_gain,
                                   reg_lambda=reg_lambda, reg_alpha=reg_alpha,
                                   n_jobs=8)
    val = cross_val_score(model_lgb, X_train_split, y_train_split, cv=5, scoring='roc_auc').mean()
    return val

from bayes_opt import BayesianOptimization
# search space
bayes_lgb = BayesianOptimization(
    rf_cv_lgb,
    {
        'num_leaves': (10, 200),
        'max_depth': (3, 20),
        'bagging_fraction': (0.5, 1.0),
        'feature_fraction': (0.5, 1.0),
        'bagging_freq': (0, 100),
        'min_data_in_leaf': (10, 100),
        'min_child_weight': (0, 10),
        'min_split_gain': (0.0, 1.0),
        'reg_alpha': (0.0, 10),
        'reg_lambda': (0.0, 10),
    }
)
# run the optimization
bayes_lgb.maximize(n_iter=10)
bayes_lgb.max
'''
{'target': 0.978984093218777,
 'params': {'bagging_fraction': 0.7852426281123215,
  'bagging_freq': 42.927767267031435,
  'feature_fraction': 0.8729234124911952,
  'max_depth': 18.80072510809031,
  'min_child_weight': 8.29481722055312,
  'min_data_in_leaf': 13.261838180182071,
  'min_split_gain': 0.45972976507462127,
  'num_leaves': 154.4793280962274,
  'reg_alpha': 7.018060276190158,
  'reg_lambda': 2.1475557765094413}}
'''

#%% Lower the learning rate and let lgb.cv determine the best iteration count
base_params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 154,
    'max_depth': 18,
    'min_data_in_leaf': 21,
    'min_child_weight': 8.3,
    'bagging_fraction': 0.78,
    'feature_fraction': 0.87,
    'bagging_freq': 43,
    'reg_lambda': 2,
    'reg_alpha': 7,
    'min_split_gain': 0.5,
    'nthread': 8,
    'seed': 2021,
    'silent': True,
    'verbose': -1
}
cv_result_lgb = lgb.cv(
    train_set=train_matrix,
    early_stopping_rounds=1000,
    num_boost_round=20000,
    nfold=5,
    stratified=True,
    shuffle=True,
    params=base_params_lgb,
    metrics='auc',
    seed=0
)
print('Iterations: {}'.format(len(cv_result_lgb['auc-mean'])))
print('Final model AUC: {}'.format(max(cv_result_lgb['auc-mean'])))
'''
Iterations: 855
Final model AUC: 0.9821581751610478
'''

#%% Parameters fixed: build the final model and validate with 5-fold CV
cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    print('************************************ {} ************************************'.format(str(i + 1)))
    X_train_split, y_train_split, X_val, y_val = X_train.iloc[train_index], y_train[train_index], X_train.iloc[valid_index], y_train[valid_index]
    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.01,
        'num_leaves': 154,
        'max_depth': 18,
        'min_data_in_leaf': 20,
        'min_child_weight': 8.3,
        'bagging_fraction': 0.78,
        'feature_fraction': 0.87,
        'bagging_freq': 43,
        'reg_lambda': 2,
        'reg_alpha': 7,
        'min_split_gain': 0.5,
        'nthread': 8,
        'seed': 2021,
        'silent': True,
        'verbose': -1
    }
    model = lgb.train(params, train_set=train_matrix, num_boost_round=855,
                      valid_sets=valid_matrix, verbose_eval=1000, early_stopping_rounds=200)
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    cv_scores.append(roc_auc_score(y_val, val_pred))
    print(cv_scores)

print("lgb_scotrainre_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))

#%% The 5-fold CV shows training stops around 750 iterations, so train one final model with a fixed round count and evaluate on the validation split
base_params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 154,
    'max_depth': 18,
    'min_data_in_leaf': 20,
    'min_child_weight': 8.3,
    'bagging_fraction': 0.78,
    'feature_fraction': 0.87,
    'bagging_freq': 43,
    'reg_lambda': 2,
    'reg_alpha': 7,
    'min_split_gain': 0.5,
    'nthread': 8,
    'seed': 2021,
    'silent': True
}
# train on the training split
final_model_lgb = lgb.train(base_params_lgb, train_set=train_matrix, valid_sets=valid_matrix,
                            num_boost_round=855, verbose_eval=1000, early_stopping_rounds=200)

# predict and compute the ROC metrics
val_pre_lgb = final_model_lgb.predict(X_val)
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lgb)
roc_auc = metrics.auc(fpr, tpr)
print('Tuned lightgbm AUC on the validation set: {}'.format(roc_auc))  # 0.9765762181212846

# plot the ROC curve
plt.figure(figsize=(8, 8))
plt.title('Validation ROC')
plt.plot(fpr, tpr, 'b', label='Val AUC = %0.4f' % roc_auc)
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.legend(loc='best')
plt.title('ROC')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.show()
Even with Bayesian tuning, the final result is still not as good as xgboost, but it does beat logistic regression, and it needs no variable preprocessing at all: the raw features are fed straight to the algorithm.
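For a comparison on the same footing as the confusion matrices above, the tuned model's validation predictions can also be cut at 0.5. A short sketch reusing val_pre_lgb and y_val from the script:

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

pred_label = (val_pre_lgb >= 0.5).astype(int)  # same cutoff as the earlier models
print("Confusion matrix:\n", confusion_matrix(y_val, pred_label))
print("Precision:", precision_score(y_val, pred_label))
print("Recall:", recall_score(y_val, pred_label))
print("F1:", f1_score(y_val, pred_label))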
2021.03.15 update: another way to tune xgboost (stepwise grid search)

# -*- coding: utf-8 -*-
"""
Created on Tue Mar 9 16:16:56 2021
@author: Administrator
"""
#%% Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import operator
import time
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance  # feature-importance plot
from xgboost import plot_tree
import joblib  # sklearn.externals.joblib moved; import joblib directly now
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import auc, roc_curve, confusion_matrix
%matplotlib inline
plt.rc('font', family='SimHei', size=13)  # so Chinese characters display correctly in plots

#%% Load data
creditcard = pd.read_csv('D:/信用卡欺詐檢測/creditcard.csv/creditcard.csv')
creditcard.info()
train_y = creditcard[['Class']]
train_y.columns = ['y']
train_x = creditcard.drop(['Class', 'Time'], axis=1)

# Feature map for get_fscore: one line per feature, "index<TAB>name<TAB>q" ('q' marks a quantitative feature)
def create_feature_map(features):
    outfile = open('xgb.txt', 'w')
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i = i + 1
    outfile.close()

create_feature_map(train_x.columns)

file_xgboost_model = './xgboost_model'                    # model file
file_xgboost_columns = './columns.csv'                    # final feature list
file_xgboost_model_auc_ks = './xgboost_model_auc_ks.png'  # AUC and K-S plots
file_xgboost_model_score = './xgboost_model_score.png'    # score distribution plot
file_xgboost_model_prob = './xgboost_model_prob.png'      # probability distribution plot

X = creditcard.iloc[:, 0:-1]
y = creditcard.Class
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#%% Step 1: fix the other parameters and determine the tree count
def tun_parameters(train_x, train_y):
    xgb1 = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5, min_child_weight=1,
                         gamma=0, subsample=0.8, colsample_bytree=0.8,
                         objective='binary:logistic', scale_pos_weight=1, seed=27)
    modelfit(xgb1, train_x, train_y)

def modelfit(alg, X, y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X, label=y)
        # note: these callback helpers exist in older xgboost releases; newer versions use
        # xgb.callback.EvaluationMonitor / xgb.callback.EarlyStopping instead
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds,
                          callbacks=[xgb.callback.print_evaluation(show_stdv=False),
                                     xgb.callback.early_stop(early_stopping_rounds)])
        alg.set_params(n_estimators=cvresult.shape[0])
    # fit the algorithm on the data
    alg.fit(X, y, eval_metric='auc')
    # predict the training set
    dtrain_predictions = alg.predict(X)
    dtrain_predprob = alg.predict_proba(X)[:, 1]
    # print the model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y, dtrain_predprob))
    print('n_estimators=', cvresult.shape[0])

tun_parameters(X_train, y_train)
'''
Accuracy : 0.9998
AUC Score (Train): 0.999886
n_estimators= 100
'''

#%% Step 2: tune max_depth and min_child_weight
param_test1 = {
    'max_depth': range(3, 10, 1),
    'min_child_weight': range(2, 9, 1)
}
# note: iid=False was removed in scikit-learn >= 0.24; drop it on newer versions
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5, min_child_weight=1,
                            gamma=0, subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=8, scale_pos_weight=1, seed=27),
    param_grid=param_test1, scoring='roc_auc', n_jobs=-1, iid=False, cv=5)
gsearch1.fit(X_train, y_train)
gsearch1.best_params_, gsearch1.best_score_
#({'max_depth': 3, 'min_child_weight': 5}, 0.9851612149724902)
#({'max_depth': 5, 'min_child_weight': 8}, 0.9860796809303931)

#%% Step 3: tune gamma
param_test3 = {'gamma': [i / 10.0 for i in range(0, 5)]}
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5, min_child_weight=8,
                            gamma=0, subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=8, scale_pos_weight=1, seed=27),
    param_grid=param_test3, scoring='roc_auc', n_jobs=-1, iid=False, cv=5)
gsearch3.fit(X_train, y_train)
gsearch3.best_params_, gsearch3.best_score_
#({'gamma': 0.0}, 0.9860796809303931)

#%% Step 4: tune subsample and colsample_bytree
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)]
}
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5, min_child_weight=8,
                            gamma=0, subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=8, scale_pos_weight=1, seed=27),
    param_grid=param_test4, scoring='roc_auc', n_jobs=-1, iid=False, cv=5)
gsearch4.fit(X_train, y_train)
gsearch4.best_params_, gsearch4.best_score_

#%% Step 5: regularization (only reg_alpha is tuned here), then re-determine the tree count
def tun_parameters2(train_x, train_y):
    xgb1 = XGBClassifier(learning_rate=0.1, n_estimators=5000, max_depth=5, min_child_weight=8,
                         gamma=0, subsample=0.8, colsample_bytree=0.8,
                         objective='binary:logistic', nthread=8, booster='gbtree',
                         reg_alpha=0.6, reg_lambda=0.8, scale_pos_weight=1, seed=2021)
    modelfit(xgb1, train_x, train_y)

tun_parameters2(X_train, y_train)
'''
Model Report
Accuracy : 0.9997
AUC Score (Train): 0.998747
n_estimators= 134
'''

#%% Final model with the tuned parameters
model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                      colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
                      importance_type='gain', interaction_constraints='',
                      learning_rate=0.1, max_delta_step=0, max_depth=5,
                      min_child_weight=8, missing=np.nan, monotone_constraints='()',
                      n_estimators=134, n_jobs=8, nthread=8, num_parallel_tree=1,
                      random_state=27, reg_alpha=0.6, reg_lambda=0.8, scale_pos_weight=1,
                      seed=27, subsample=0.8, tree_method='exact', validate_parameters=1,
                      verbosity=None)
model.fit(X_train, y_train)

#%% Test-set ROC
def plot_roc(test_x, test_y):
    # note: predict returns hard 0/1 labels; predict_proba(test_x)[:, 1] would give a probability-based ROC
    predictions = model.predict(test_x)
    false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(test_y, predictions)
    roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r.')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')

plot_roc(X_test, y_test)

#%% Train a Booster with the tuned parameters and plot the feature-importance ranking
def run_xgboost(data_x, data_y, random_state_num):
    train_x, valid_x, train_y, valid_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=random_state_num)
    print('Training started')
    start = time.time()
    # convert to xgb's DMatrix format
    d_train = xgb.DMatrix(train_x, train_y)
    d_valid = xgb.DMatrix(valid_x, valid_y)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # tuned parameters
    params = {
        'eta': 0.1,                      # shrinkage, 0~1
        'max_depth': 5,                  # from step 2
        'min_child_weight': 8,           # from step 2
        'gamma': 0.0,                    # from step 3
        'subsample': 0.8,                # row subsampling ratio
        'colsample_bytree': 0.8,         # column subsampling ratio per tree (default 1)
        'lambda': 0.8,                   # L2 term; the original dict also set 'lambda': 1 further down, which would silently override this
        'alpha': 0.6,                    # L1 term, from step 5
        'n_estimators': 500,
        'booster': 'gbtree',             # tree booster
        'objective': 'binary:logistic',  # logistic loss, outputs probabilities
        'nthread': 6,                    # thread cap; defaults to all cores if unset
        'scale_pos_weight': 10,          # upweight the positive class to handle imbalance
        'seed': 1234,
        'silent': 1,                     # accepted by older xgboost; newer versions use 'verbosity'
        'eval_metric': 'auc'
    }
    bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=100, verbose_eval=5)  # at most 1000 rounds
    print(time.time() - start)
    tree_nums = bst.best_ntree_limit
    print('best tree count: %s, best iteration: %s, auc: %s' % (bst.best_ntree_limit, bst.best_iteration, bst.best_score))
    bst = xgb.train(params, d_train, tree_nums, watchlist, early_stopping_rounds=100, verbose_eval=10)  # retrain with the optimal round count
    # feature importance: each feature's split count / total split count
    feat_imp = bst.get_fscore(fmap='xgb.txt')
    feat_imp = sorted(feat_imp.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(feat_imp, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    # highest scores first; show the top 40 features
    df = df.sort_values(by='fscore', ascending=False)
    df = df.iloc[:40]
    plt.figure()
    df.plot(kind='bar', x='feature', y='fscore', legend=True, figsize=(32, 10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')
    plt.show()
    return bst

#%% ROC curve (redefined for the Booster-based workflow; overrides the sklearn-wrapper version above)
def plot_roc(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r.')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')

# K-S plot: sort the scores in descending order and cut them into 10 equal groups
def plot_ks(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions, drop_intermediate=False)
    pre = sorted(predictions, reverse=True)  # descending order, as the K-S plot requires
    num = []
    for i in range(10):
        num.append((i) * int(len(pre) / 10))
    num.append(len(pre) - 1)
    df = pd.DataFrame()
    df['false_positive_rate'] = false_positive_rate
    df['true_positive_rate'] = true_positive_rate
    df['thresholds'] = thresholds
    data_ks = []
    for i in num:
        data_ks.append(list(df[df['thresholds'] == pre[i]].values[0]))
    data_ks = pd.DataFrame(data_ks)
    data_ks.columns = ['fpr', 'tpr', 'thresholds']
    ks = max(data_ks['tpr'] - data_ks['fpr'])  # K-S = the maximum gap between the two cumulative curves
    plt.title('K-S curve')
    plt.plot(np.array(range(len(num))), data_ks['tpr'])
    plt.plot(np.array(range(len(num))), data_ks['fpr'])
    plt.plot(np.array(range(len(num))), data_ks['tpr'] - data_ks['fpr'], label='K-S = %0.4f' % ks)
    plt.legend(loc='lower right')
    plt.xlim([0, 10])
    plt.ylim([0.0, 1.0])
    plt.ylabel('cumulative share')
    plt.xlabel('group index')

# One figure combining ROC/AUC and K-S for the training and test sets
def auc_ks(train_x, test_x, train_y, test_y):
    plt.figure(figsize=(15, 15))
    plt.subplot(221)
    plot_roc(train_x, train_y)
    plt.subplot(222)
    plot_roc(test_x, test_y)
    plt.subplot(223)
    plot_ks(train_x, train_y)
    plt.subplot(224)
    plot_ks(test_x, test_y)
    plt.savefig(file_xgboost_model_auc_ks)
    plt.show()

#%% Train, then save the model and the evaluation plots
def run_main(data_x, data_y):
    global bst
    start = time.time()
    bst = run_xgboost(data_x, data_y, random_state_num=1234)  # 1234 because tuning used random_state=1234
    joblib.dump(bst, file_xgboost_model)  # persist the model; on joblib see https://www.cnblogs.com/wzdLY/p/9630671.html
    print('Model saved to %s' % (file_xgboost_model))
    train_x, test_x, train_y, test_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=1234)
    auc_ks(train_x, test_x, train_y, test_y)
    print('Evaluation plots saved to: %s' % (file_xgboost_model_auc_ks))
    print('Total running time: %s' % (time.time() - start))
    resu = bst.predict(xgb.DMatrix(test_x))

if __name__ == '__main__':
    run_main(train_x, train_y)
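To close this section the same way as the earlier ones, the tuned sklearn-style model fitted above can be scored on the held-out split with the usual report. A sketch reusing model, X_test and y_test from the script:

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

pred = model.predict(X_test)  # hard labels at the default 0.5 cutoff
print("Confusion matrix:\n", confusion_matrix(y_test, pred))
print("Precision:", precision_score(y_test, pred))
print("Recall:", recall_score(y_test, pred))
print("F1:", f1_score(y_test, pred))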