2.7.py xgboost版評分映射


主要修改點有2處:

1.xgboost的參數,有些參數現版本的xgboost是沒有的,需要注釋掉或者使用現在的替換

2.xgboost版評分映射的問題:由於預測的是逾期(壞)的概率,分數必須隨違約概率升高而降低,即 score = 600 + 50*log2((1-pred)/pred)(等價於 600 - 50*log2(pred/(1-pred))),而不是基礎分減去 log2((1-pred)/pred)

#%%
import pandas as pd  
from sklearn.metrics import roc_auc_score,roc_curve,auc  
from sklearn import metrics  
from sklearn.linear_model import LogisticRegression  
import numpy as np  
# Load the behavioural scorecard (A-card) sample; obs_mth is the observation
# month, bad_ind the binary default label.
data = pd.read_csv('xxx/Acard.txt')  
data.head()  
data.obs_mth.unique()
'''
array(['2018-10-31', '2018-07-31', '2018-09-30', '2018-06-30',
       '2018-11-30'], dtype=object)

'''
# row count per observation month
data.obs_mth.value_counts()
'''
Out[233]: 
2018-07-31    34030
2018-06-30    20565
2018-11-30    15975
2018-10-31    14527
2018-09-30    10709
Name: obs_mth, dtype: int64
'''

train = data[data.obs_mth != '2018-11-30'].reset_index().copy()   # training set: every month but the latest
val = data[data.obs_mth == '2018-11-30'].reset_index().copy()  # out-of-time test set: the latest month
  
feature_lst = ['person_info','finance_info','credit_info','act_info']  
x = train[feature_lst]  
y = train['bad_ind']  
  
val_x =  val[feature_lst]  
val_y = val['bad_ind']  # 0.0205320813771518 — author's note, presumably the val bad rate; verify


#%%
# Logistic-regression scorecard baseline: balanced class weights, C=0.1.
lr_model = LogisticRegression(C=0.1, class_weight='balanced')
lr_model.fit(x, y)

# KS on the training months: max vertical gap between the TPR and FPR curves.
y_pred = lr_model.predict_proba(x)[:, 1]
fpr_lr_train, tpr_lr_train, _ = roc_curve(y, y_pred)
train_ks = np.abs(fpr_lr_train - tpr_lr_train).max()
print('train_ks : ', train_ks)  # ~0.4482325608488951

# KS on the out-of-time validation month.
y_pred = lr_model.predict_proba(val_x)[:, 1]
fpr_lr, tpr_lr, _ = roc_curve(val_y, y_pred)
val_ks = np.abs(fpr_lr - tpr_lr).max()
print('val_ks : ', val_ks)  # ~0.4198642457760936

# ROC curves for both samples plus the chance diagonal.
from matplotlib import pyplot as plt
plt.plot(fpr_lr_train, tpr_lr_train, label='train LR')
plt.plot(fpr_lr, tpr_lr, label='evl LR')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()

#%% ks
import math
# KS / gains table on the out-of-time validation set: sort by predicted
# bad-probability (highest risk first), cut into `bins` equal buckets, and
# accumulate bad/good counts per bucket.
model = lr_model  
row_num, col_num = 0, 0  
bins = 20  
Y_predict = [s[1] for s in model.predict_proba(val_x)]  
Y = val_y  
nrows = Y.shape[0]  
# (probability, label) pairs sorted by risk, descending
lis = [(Y_predict[i], Y[i]) for i in range(nrows)]  
ks_lis = sorted(lis, key=lambda x: x[0], reverse=True)  
bin_num = int(nrows/bins+1)  
# overall bad/good totals (labels are 0/1, so 0.5 is the split point)
bad = sum([1 for (p, y) in ks_lis if y > 0.5])  
good = sum([1 for (p, y) in ks_lis if y <= 0.5])  
bad_cnt, good_cnt = 0, 0  
KS = []  
BAD = []  
GOOD = []  
BAD_CNT = []  
GOOD_CNT = []  
BAD_PCTG = []  
BADRATE = []  
dct_report = {}  
for j in range(bins):  
    # slice for bucket j (the last bucket may be shorter)
    ds = ks_lis[j*bin_num: min((j+1)*bin_num, nrows)]  
    bad1 = sum([1 for (p, y) in ds if y > 0.5])  
    good1 = sum([1 for (p, y) in ds if y <= 0.5])  
    bad_cnt += bad1  
    good_cnt += good1  
    # capture rate: cumulative share of all bads caught so far
    # (sum(val_y) equals the total bad count for a 0/1 label)
    bad_pctg = round(bad_cnt/sum(val_y),3)  
    # within-bucket bad rate
    badrate = round(bad1/(bad1+good1),3)  
    # KS at this depth: |cumulative bad share - cumulative good share|
    ks = round(math.fabs((bad_cnt / bad) - (good_cnt / good)),3)  
    KS.append(ks)  
    BAD.append(bad1)  
    GOOD.append(good1)  
    BAD_CNT.append(bad_cnt)  
    GOOD_CNT.append(good_cnt)  
    BAD_PCTG.append(bad_pctg)  
    BADRATE.append(badrate)  
    dct_report['KS'] = KS  
    dct_report['負樣本個數'] = BAD  
    dct_report['正樣本個數'] = GOOD  
    dct_report['負樣本累計個數'] = BAD_CNT  
    dct_report['正樣本累計個數'] = GOOD_CNT  
    dct_report['捕獲率'] = BAD_PCTG  
    dct_report['負樣本占比'] = BADRATE  
val_repot = pd.DataFrame(dct_report)  
print(val_repot)  

#%% 利用pyecharts來畫圖 

#其實就每個分區逾期率和ks的曲線圖
# Dual-axis pyecharts line chart: per-bucket bad rate (left axis) vs the
# cumulative KS curve (right axis) from the report table above.
from pyecharts.charts import *  
from pyecharts import options as opts  
from pylab import *  
mpl.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font so labels render
np.set_printoptions(suppress=True)  
pd.set_option('display.unicode.ambiguous_as_wide', True)  
pd.set_option('display.unicode.east_asian_width', True)  
line = (  
  
    Line()  
    .add_xaxis(list(val_repot.index))  
    # series 1: per-bucket bad-sample share, on the default (left) y-axis
    .add_yaxis(  
        "分組壞人占比",  
        list(val_repot.負樣本占比),  
        yaxis_index=0,  
        color="red",  
    )  
    .set_global_opts(  
        title_opts=opts.TitleOpts(title="行為評分卡模型表現"),  
    )  
    # secondary y-axis (index 1) on the right for the KS series
    .extend_axis(  
        yaxis=opts.AxisOpts(  
            name="累計壞人占比",  
            type_="value",  
            min_=0,  
            max_=0.5,  
            position="right",  
            axisline_opts=opts.AxisLineOpts(  
                linestyle_opts=opts.LineStyleOpts(color="red")  
            ),  
            axislabel_opts=opts.LabelOpts(formatter="{value}"),  
        )  
  
    )  
    .add_xaxis(list(val_repot.index))  
    # series 2: KS per depth, bound to the secondary axis
    .add_yaxis(  
        "KS",  
        list(val_repot['KS']),  
        yaxis_index=1,  
        color="blue",  
        label_opts=opts.LabelOpts(is_show=False),  
    )  
)  
line.render_notebook()  


#還有最好在jupyter 上面跑,Spyder不展示該圖

#%%
# Dump the fitted LR coefficients/intercept so the hand-written score()
# below can embed them as literals.
print('變量名單:',feature_lst)  
print('系數:',lr_model.coef_)  
print('截距:',lr_model.intercept_)  
'''
變量名單: ['person_info', 'finance_info', 'credit_info', 'act_info']
系數: [[ 3.4946237  11.40440098  2.45601882 -1.6844742 ]]
截距: [-0.34578469]
'''

import math
# One-shot scorecard formula assembled from the LR coefficients printed
# above: score = 650 - 34 * logit / ln(2); riskier (higher logit) -> lower.
def score(person_info, finance_info, credit_info, act_info):
    """Return the credit score for one applicant from its four features."""
    logit = (
        person_info * (3.49460978)
        + finance_info * (11.40051582)
        + credit_info * (2.45541981)
        + act_info * (-1.68676079)
        - 0.34484897
    )
    return 650 - 34 * logit / math.log(2)
# Score every validation row; KS is unchanged by the monotonic mapping.
val['score'] = val.apply(
    lambda row: score(row.person_info, row.finance_info,
                      row.credit_info, row.act_info),
    axis=1,
)
fpr_lr, tpr_lr, _ = roc_curve(val_y, val['score'])
val_ks = np.abs(fpr_lr - tpr_lr).max()
print('val_ks : ', val_ks)

#%%

#對應評級區間  
def level(score):
    """Map a credit score to a rating bucket: D (<=600), C (600,640],
    B (640,680], A (>680). Falls through to 0 if no bucket matches."""
    grade = 0
    if score <= 600:
        grade = "D"
    elif 600 < score <= 640:
        grade = "C"
    elif 640 < score <= 680:
        grade = "B"
    elif score > 680:
        grade = "A"
    return grade
# Attach the rating and print the share of applicants in each bucket.
val['level'] = val['score'].map(level)
print(val.level.groupby(val.level).count() / len(val))



#%% xgb

import xgboost as xgb  
# Reload the raw data for the XGB section and redo the out-of-time split.
data = pd.read_csv('xxx/Acard.txt')  
# NOTE(review): df_train/val here are immediately duplicated as train/evl
# below and are not used by the XGB code that follows — apparent leftovers.
df_train = data[data.obs_mth != '2018-11-30'].reset_index().copy()  
val = data[data.obs_mth == '2018-11-30'].reset_index().copy()  
lst = ['person_info','finance_info','credit_info','act_info']  
  
train = data[data.obs_mth != '2018-11-30'].reset_index().copy()  
evl = data[data.obs_mth == '2018-11-30'].reset_index().copy()  
  
x = train[lst]  
y = train['bad_ind']  
  
evl_x =  evl[lst]  
evl_y = evl['bad_ind']  


#%% 
#定義XGB函數  
def XGB_test(train_x, train_y, test_x, test_y):
    """Fit an XGBoost binary classifier with AUC-based early stopping.

    Parameters are the train/test feature frames and their 0/1 labels;
    returns the fitted ``xgb.XGBClassifier``.

    Fixes vs the original: the parameter list mixed in LightGBM-only
    arguments (``boosting_type``, ``num_leaves``, ``subsample_freq``,
    ``max_features``, ``num_iterations``) plus the misspelled ``reg_Ap``;
    modern xgboost rejects unknown kwargs, so only valid XGBoost parameters
    remain, and ``eval_metric``/``early_stopping_rounds`` are passed to the
    constructor as required by xgboost >= 2.0.
    """
    from multiprocessing import cpu_count
    clf = xgb.XGBClassifier(
        max_depth=2,                  # shallow trees for a 4-feature scorecard
        n_estimators=800,             # upper bound; early stopping trims it
        learning_rate=0.05,
        objective='binary:logistic',
        subsample=0.7,
        colsample_bytree=0.7,
        min_child_weight=50,          # strong leaf regularisation
        reg_alpha=0.0,                # was the misspelled 'reg_Ap'
        reg_lambda=1,
        random_state=None,
        n_jobs=max(cpu_count() - 1, 1),
        eval_metric='auc',
        early_stopping_rounds=100,
    )
    # Monitor train and test AUC; stop when the last eval set plateaus.
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (test_x, test_y)])
    return clf

#模型訓練
# Train the XGB model.
model = XGB_test(x, y, evl_x, evl_y)

# KS on the training months.
y_pred = model.predict_proba(x)[:, 1]
fpr_xgb_train, tpr_xgb_train, _ = roc_curve(y, y_pred)
train_ks = abs(fpr_xgb_train - tpr_xgb_train).max()
print('train_ks : ', train_ks)  # ~0.45953542070724995

# KS on the out-of-time validation month.
y_pred = model.predict_proba(evl_x)[:, 1]
fpr_xgb, tpr_xgb, _ = roc_curve(evl_y, y_pred)
evl_ks = abs(fpr_xgb - tpr_xgb).max()
print('evl_ks : ', evl_ks)  # ~0.4368715190475225

# ROC curves. Fix: the legend labels said 'train LR'/'evl LR' — copy-pasted
# from the LR section — although the curves plotted are the XGB model's.
from matplotlib import pyplot as plt
plt.plot(fpr_xgb_train, tpr_xgb_train, label='train XGB')
plt.plot(fpr_xgb, tpr_xgb, label='evl XGB')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()


# Score mapping for the XGB model. pred is P(bad), so log2((1-pred)/pred)
# is the log-odds of the GOOD outcome; a riskier user (higher pred) must get
# a LOWER score. The original used 600 - 50*log2((1-pred)/pred), which makes
# scores INCREASE with risk — the sign is flipped to '+' (equivalently
# 600 - 50*log2(pred/(1-pred))), matching the LR mapping 650 - 34*logit/ln2.
def score(pred):
    """Map a predicted bad-probability to a credit score (base 600, factor 50)."""
    score = 600 + 50 * (math.log2((1 - pred) / pred))
    return score
# Attach model probabilities and mapped scores to the validation frame;
# KS is invariant under the monotonic score transform.
evl['xbeta'] = model.predict_proba(evl_x)[:, 1]
evl['score'] = evl['xbeta'].map(score)
fpr_lr, tpr_lr, _ = roc_curve(evl_y, evl['score'])
evl_ks = abs(fpr_lr - tpr_lr).max()
print('val_ks : ', evl_ks)  # ~0.4368715190475225




#%%生成模型報告
# Model report for the XGB scores: same KS/gains table as the LR section,
# but keyed on the mapped score and with English column names.
row_num, col_num = 0, 0  
bins = 20  
Y_predict = evl['score']  
Y = evl_y  
nrows = Y.shape[0]  
# (score, label) pairs sorted by score, descending
lis = [(Y_predict[i], Y[i]) for i in range(nrows)]  
ks_lis = sorted(lis, key=lambda x: x[0], reverse=True)  
bin_num = int(nrows/bins+1)  
# overall bad/good totals (labels are 0/1)
bad = sum([1 for (p, y) in ks_lis if y > 0.5])  
good = sum([1 for (p, y) in ks_lis if y <= 0.5])  
bad_cnt, good_cnt = 0, 0  
KS = []  
BAD = []  
GOOD = []  
BAD_CNT = []  
GOOD_CNT = []  
BAD_PCTG = []  
BADRATE = []  
dct_report = {}  
for j in range(bins):  
    # slice for bucket j (the last bucket may be shorter)
    ds = ks_lis[j*bin_num: min((j+1)*bin_num, nrows)]  
    bad1 = sum([1 for (p, y) in ds if y > 0.5])  
    good1 = sum([1 for (p, y) in ds if y <= 0.5])  
    bad_cnt += bad1  
    good_cnt += good1  
    # cumulative capture rate of bads (sum(evl_y) == total bad count)
    bad_pctg = round(bad_cnt/sum(evl_y),3)  
    # within-bucket bad rate
    badrate = round(bad1/(bad1+good1),3)  
    # KS at this depth: |cumulative bad share - cumulative good share|
    ks = round(math.fabs((bad_cnt / bad) - (good_cnt / good)),3)  
    KS.append(ks)  
    BAD.append(bad1)  
    GOOD.append(good1)  
    BAD_CNT.append(bad_cnt)  
    GOOD_CNT.append(good_cnt)  
    BAD_PCTG.append(bad_pctg)  
    BADRATE.append(badrate)  
    dct_report['KS'] = KS  
    dct_report['BAD'] = BAD  
    dct_report['GOOD'] = GOOD  
    dct_report['BAD_CNT'] = BAD_CNT  
    dct_report['GOOD_CNT'] = GOOD_CNT  
    dct_report['BAD_PCTG'] = BAD_PCTG  
    dct_report['BADRATE'] = BADRATE  
val_repot = pd.DataFrame(dct_report)  
print(val_repot)



#%% 自定義損失函數,需要提供損失函數的一階導和二階導  
def loglikelood(preds, dtrain):
    """Custom binary log-loss objective for xgb.train.

    ``preds`` are raw margins; returns the (gradient, hessian) of the log
    loss w.r.t. the margin, evaluated at sigmoid(preds). (Name keeps the
    original's spelling because the training call refers to it.)
    """
    labels = dtrain.get_label()
    prob = 1.0 / (1.0 + np.exp(-preds))
    return prob - labels, prob * (1.0 - prob)
  
# Custom eval metric: blend of the positive rate inside the bottom-20%
# prediction bucket and a Brier-style squared-error term, equal weights.
def binary_error(preds, train_data):
    """Return ('error', value) for xgb.train's custom-metric hook."""
    labels = train_data.get_label()
    frame = pd.DataFrame({'pred': preds, 'percent': preds, 'labels': labels})
    # threshold at the 20th percentile of the predictions
    threshold = frame['percent'].quantile(0.2)
    # binarise: 1 for predictions at or below the threshold
    frame['percent'] = (frame['percent'] <= threshold).astype(int)
    # 0.5 weight each; adjust the weights to taste
    hit_rate = np.mean(frame[frame.percent == 1]['labels'] == 1)
    sq_err = np.mean((frame.labels - frame.pred) ** 2)
    return 'error', hit_rate * 0.5 + sq_err * 0.5
  
# Train with the custom objective/metric via the low-level API.
# Fix: dtrain/dtest were never defined anywhere in the original script —
# build the DMatrix objects from the train/validation frames above. The
# deprecated 'silent' parameter is replaced by 'verbosity'.
dtrain = xgb.DMatrix(x, label=y)
dtest = xgb.DMatrix(evl_x, label=evl_y)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
param = {'max_depth': 3, 'eta': 0.1, 'verbosity': 0}
num_round = 100
# obj/feval were the 5th/6th positional args in old xgboost; pass them by
# keyword (feval is deprecated in favour of custom_metric since 1.6).
bst = xgb.train(param, dtrain, num_round, watchlist,
                obj=loglikelood, custom_metric=binary_error)

展示一些過程圖片

 

 

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM