針對銀行客戶流失預測,主要流程分為:特征預處理、特征選擇,分類模型選擇與訓練。主要工作如下:
1:特征預處理與選擇
對性別進行啞變量處理;
對布爾型字段(如「是否有××信息」類字段)將布爾值轉換為 0/1 表示;
畫出年齡直方圖可以看出大致呈正態分布,對年齡分段處理后缺失值采用插補方式;
由於「資產當前總額」「存儲類資產當前總額」「本幣存儲當前總金額」三列數值完全相同,「月日均余額」「存儲類資產月日均余額」「本幣存儲月日均余額」三列亦相同,故每組僅保留一列、刪除其余兩列;
針對*NUM,*DUR,*AMT,*BAL字段分別進行特征提取(SelectKBest)達到降維效果;
最后整合數據,特征標准化處理最終為44個特征(StandardScaler)。
2:分類模型選擇與訓練
數據集划分:采用K折交叉驗證,train_test_split自主切分數據集
模型選擇:采用了決策樹,提升樹(GBDT/XGBoost),SVM(libsvm)神經網絡(多層感知器算法)分別訓練模型
3:對應python主要代碼:
-
decisiontree.py
# decisiontree.py -- train a decision-tree churn classifier and report metrics.
# NOTE(review): `StS` (standardized feature matrix) and `y` (labels) are
# produced by the preprocessing step shown elsewhere -- confirm they are in
# scope before running this script standalone.
from sklearn import tree  # fix: original used `tree` without importing it
from sklearn.model_selection import train_test_split  # fix: was never imported here
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hold out 40% of the samples for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(StS, y, test_size=0.4, random_state=0)

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
pre_labels = clf.predict(X_test)

print('accuracy score:', accuracy_score(y_test, pre_labels, normalize=True))
print('recall score:', recall_score(y_test, pre_labels))
print('precision score:', precision_score(y_test, pre_labels))
print('f1 score:', f1_score(y_test, pre_labels))
- XGBoost.py
# XGBoost.py -- train an XGBoost churn classifier and report metrics.
import time

import pandas as pd  # fix: original called pd.read_csv without importing pandas
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, classification_report, roc_auc_score)

# Record program start time so total runtime can be reported at the end.
start_time = time.time()

bankChurn = pd.read_csv('D:/work/lost data and dictionary/test/bankChurn.csv')            # raw data
bankChurn_data = pd.read_csv('D:/work/lost data and dictionary/test/bankChurn_data.csv')  # preprocessed features
Y_train = bankChurn['CHUR0_CUST_I0D']  # labels
StS = StandardScaler().fit_transform(bankChurn_data)

X_train, X_test, y_train, y_test = train_test_split(StS, Y_train, test_size=0.4, random_state=None)
print(X_train.shape, X_test.shape)

# Model hyper-parameters.
xlf = xgb.XGBClassifier(max_depth=10,
                        learning_rate=0.1,
                        n_estimators=10,
                        silent=True,
                        objective='binary:logistic',
                        nthread=-1,
                        gamma=0,
                        min_child_weight=1,
                        max_delta_step=0,
                        subsample=0.85,
                        colsample_bytree=0.7,
                        colsample_bylevel=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        scale_pos_weight=1,  # kept at 1 here; intended to counter class imbalance
                        seed=1440)

# Early stopping watches the held-out error; stops after 100 rounds without improvement.
xlf.fit(X_train, y_train,
        eval_metric='error',
        verbose=True,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=100)

# Predict labels and positive-class probabilities, then report the scores.
preds = xlf.predict(X_test)
pre_pro = xlf.predict_proba(X_test)[:, 1]
print('accuracy score:', accuracy_score(y_test, preds, normalize=True))
print('classification report:', classification_report(y_test, preds))
print('precision score:', precision_score(y_test, preds))
print('roc_auc_score:%f' % roc_auc_score(y_test, pre_pro))

# Report total runtime.
cost_time = time.time() - start_time
print("xgboost success!", '\n', "cost time:", cost_time, "(s)......")
-
libsvm.py
# libsvm.py -- train/evaluate an SVM via the libsvm bindings with stratified 4-fold CV.
import os

# Raw string avoids invalid escape sequences in the Windows path (same value).
os.chdir(r'C:\libsvm-2.81\python')
from svmutil import *

import numpy as np  # fix: original used np without importing numpy
from sklearn.model_selection import StratifiedKFold  # fix: was never imported
from sklearn.metrics import accuracy_score, classification_report

# Load the data already converted to libsvm sparse format.
y, x = svm_read_problem('bankchurnLibsvm.txt')
x = np.array(x)
y = np.array(y)

# Stratified folds keep the churn/non-churn ratio consistent in every split.
stratified_folder = StratifiedKFold(n_splits=4, random_state=0, shuffle=True)
for train_index, test_index in stratified_folder.split(x, y):
    print('shuffled train index:', train_index)
    print('shuffled test index:', test_index)
    print('shuffled x_train:', x[train_index])
    print('shuffled x_test:', x[test_index])
    print('shuffled y_train:', y[train_index])
    print('shuffled y_test:', y[test_index])
    print('.......')
    # svm_train/svm_predict expect plain Python lists, not numpy arrays.
    y_train = list(y[train_index])
    y_test = list(y[test_index])
    x_train = list(x[train_index])
    x_test = list(x[test_index])
    m = svm_train(y_train, x_train, '-c 4 -g 2')
    p_label, p_acc, p_val = svm_predict(y_test, x_test, m)
    print('accuracy score:', accuracy_score(y_test, p_label, normalize=True))
    print('classification report:', classification_report(y_test, p_label))
-
BPtest
# BPtest -- multi-layer perceptron (BP network) churn classifier.
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
# fix: metrics were imported twice with overlapping names; merged into one import.
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, classification_report, roc_auc_score)

bankChurn = pd.read_csv('D:/work/lost data and dictionary/test/bankChurn.csv')
X_data = pd.read_csv('D:/work/lost data and dictionary/test/bankChurn_data.csv')
X_data = X_data.values[:, :]
Y_label = bankChurn['CHUR0_CUST_I0D']
Y_label = Y_label.values[:]

# Join features and labels so a single shuffle keeps them aligned.
data = np.hstack((X_data, Y_label.reshape(Y_label.size, 1)))
np.random.shuffle(data)
X = data[:, :-1]
Y = data[:, -1]

# Hold out the last 8620 rows as the test set (roughly a 50:50 split).
train_x = X[:-8620]
test_x = X[-8620:]
train_y = Y[:-8620]
test_y = Y[-8620:]

# Multi-layer perceptron (BP algorithm): one hidden layer of 30 logistic units.
classifier = MLPClassifier(hidden_layer_sizes=(30,), activation='logistic', max_iter=1000)
clf = classifier.fit(train_x, train_y)
train_score = classifier.score(train_x, train_y)
test_score = classifier.score(test_x, test_y)
print('train_score:', train_score)
print('test_score:', test_score)

# Additional classification metrics on the held-out set.
pre_labels = clf.predict(test_x)
pre_pro = clf.predict_proba(test_x)[:, 1]
print('accuracy score:', accuracy_score(test_y, pre_labels, normalize=True))
print('recall score:', recall_score(test_y, pre_labels))
print('classification report:', classification_report(test_y, pre_labels))
print('precision score:', precision_score(test_y, pre_labels))
print('f1 score:', f1_score(test_y, pre_labels))
print('roc_auc_score:%f' % roc_auc_score(test_y, pre_pro))
| 指標 | DT | XGBoost | Libsvm | BP |
|-----------|-------|---------|--------|------|
| Accuracy  | 0.856 | 0.91    | 0.894  | 0.90 |
| Precision | 0.86  | 0.89    | 0.84   | 0.88 |
| Recall    | 0.86  | 0.91    | 0.89   | 0.90 |
| F1 score  | 0.86  | 0.89    | 0.85   | 0.87 |