目錄
- 導入數據
- 缺失值和異常值處理
- 特征可視化
- 特征選擇
- 模型訓練
- 模型評估
- 模型結果轉評分
- 計算用戶總分
一、導入數據
#導入模塊 import pandas as pd import numpy as np from scipy import stats import seaborn as sns import matplotlib.pyplot as plt %matplotlib inline plt.rc("font",family="SimHei",size="12") #解決中文無法顯示的問題 #導入數據 train=pd.read_csv('F:\\python\\Give-me-some-credit-master\\data\\cs-training.csv')
數據信息簡單查看
#簡單查看數據 train.info() ''' train.info() <class 'pandas.core.frame.DataFrame'> RangeIndex: 150000 entries, 0 to 149999 Data columns (total 12 columns): Unnamed: 0 150000 non-null int64 SeriousDlqin2yrs 150000 non-null int64 RevolvingUtilizationOfUnsecuredLines 150000 non-null float64 age 150000 non-null int64 NumberOfTime30-59DaysPastDueNotWorse 150000 non-null int64 DebtRatio 150000 non-null float64 MonthlyIncome 120269 non-null float64 NumberOfOpenCreditLinesAndLoans 150000 non-null int64 NumberOfTimes90DaysLate 150000 non-null int64 NumberRealEstateLoansOrLines 150000 non-null int64 NumberOfTime60-89DaysPastDueNotWorse 150000 non-null int64 NumberOfDependents 146076 non-null float64 dtypes: float64(4), int64(8) memory usage: 13.7 MB '''
頭三行和末尾三行數據查看
# Preview the first three and the last three rows in one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# result on every pandas version.
pd.concat([train.head(3), train.tail(3)])
shape查看
#shape train.shape #(150000, 11)
將各英文字段轉為中文字段名方便理解
# Map the original English column names to Chinese display names
states = {
    'Unnamed: 0': 'id',
    'SeriousDlqin2yrs': '好壞客戶',
    'RevolvingUtilizationOfUnsecuredLines': '可用額度比值',
    'age': '年齡',
    'NumberOfTime30-59DaysPastDueNotWorse': '逾期30-59天筆數',
    'DebtRatio': '負債率',
    'MonthlyIncome': '月收入',
    'NumberOfOpenCreditLinesAndLoans': '信貸數量',
    'NumberOfTimes90DaysLate': '逾期90天筆數',
    'NumberRealEstateLoansOrLines': '固定資產貸款量',
    'NumberOfTime60-89DaysPastDueNotWorse': '逾期60-89天筆數',
    'NumberOfDependents': '家屬數量',
}
# Apply the new names, then promote the id column to the index
train = train.rename(columns=states).set_index('id', drop=True)
描述性統計
#描述性統計 train.describe()
二、缺失值和異常值處理
1.缺失值處理
查看缺失值
# Missing-value count per column
missing = train.isnull().sum()
missing
# Missing-value ratio per column
missing / len(train)
# Bar chart of only the columns that have missing values, sorted ascending
missing[missing > 0].sort_values().plot.bar()
可知
月收入缺失值是:29731,缺失比例是:0.198207
家屬數量缺失值:3924,缺失比例是:0.026160
先copy一份數據,保留原數據,然后對缺失值進行處理
# Work on a copy so the raw frame stays untouched
train_cp = train.copy()
# Impute missing monthly income with the column mean
train_cp['月收入'] = train_cp['月收入'].fillna(train_cp['月收入'].mean())
train_cp.isnull().sum()
# Drop every row that still contains a missing value (the dependents column)
train_cp = train_cp.dropna()
train_cp.shape  # (146076, 11)
2.異常值處理
查看異常值
# Inspect outliers: one box plot per column
for name, values in train_cp.items():
    plt.boxplot(values)
    plt.title(name)
    plt.show()
可用額度比率大於1的數據是異常的
年齡為0的數據也是異常,其實小於18歲的都可以認定為異常,逾期30-59天筆數的有一個超級離群數據
異常值處理消除不合邏輯的數據和超級離群的數據,可用額度比值應該小於1,年齡為0的是異常值,逾期天數筆數大於80的是超級離群數據,將這些離群值過濾掉,篩選出剩余部分數據
# Keep only rows that pass every sanity threshold; the thresholds are the same
# ones the original applied one filter at a time.
keep = (
    (train_cp['可用額度比值'] < 1)
    & (train_cp['年齡'] > 0)
    & (train_cp['逾期30-59天筆數'] < 80)
    & (train_cp['逾期60-89天筆數'] < 80)
    & (train_cp['逾期90天筆數'] < 80)
    & (train_cp['固定資產貸款量'] < 50)
    & (train_cp['負債率'] < 5000)
)
train_cp = train_cp[keep]
train_cp.shape  # (141180, 11)
三、特征可視化
1.單變量可視化
好壞用戶
# Class balance of the target column (1 = bad customer, as used by the WOE code later)
train_cp.info()
# Absolute counts per class
train_cp['好壞客戶'].value_counts()
# Share of each class
train_cp['好壞客戶'].value_counts()/len(train_cp)
train_cp['好壞客戶'].value_counts().plot.bar()
# The string below is pasted output of the calls above: the classes are heavily imbalanced
''' 0 132787 1 8393 Name: 好壞客戶, dtype: int64
數據嚴重傾斜 0 0.940551 1 0.059449 Name: 好壞客戶, dtype: float64 '''
可知y值嚴重傾斜
可用額度比值和負債率
# Histograms of revolving utilisation and debt ratio
for col in ('可用額度比值', '負債率'):
    train_cp[col].plot.hist()
# Debt ratios above 1 dominate the axis, so plot only the values <= 1
a = train_cp['負債率']
a[a.le(1)].plot.hist()
逾期30-59天筆數,逾期90天筆數,逾期60-89天筆數
# Value-count bar charts for the three past-due counters, side by side
overdue_cols = ['逾期30-59天筆數', '逾期90天筆數', '逾期60-89天筆數']
for i, col in enumerate(overdue_cols):
    plt.subplot(1, 3, i + 1)
    train_cp[col].value_counts().plot.bar()
    plt.title(col)
# The same three charts drawn again outside the subplot loop.
# NOTE(review): the flattened source does not show whether these were separate
# notebook cells; run in one cell they all draw onto the current axes.
for col in overdue_cols:
    train_cp[col].value_counts().plot.bar()
年齡:基本符合正態分布
#年齡 train_cp['年齡'].plot.hist()
月收入
# Monthly income: raw histogram plus density plot
a = train_cp['月收入']
a.plot.hist()
sns.distplot(a)
# Extreme outliers flatten the chart, so look only at incomes up to 50,000
a[a <= 50000].plot.hist()
# Few rows lie between 20,000 and 50,000, so narrow further to 20,000
a[a <= 20000].plot.hist()
信貸數量
#信貸數量 train_cp['信貸數量'].value_counts().plot.bar() sns.distplot(train_cp['信貸數量'])
固定資產貸款量
#固定資產貸款量 train_cp['固定資產貸款量'].value_counts().plot.bar() sns.distplot(train_cp['固定資產貸款量'])
家屬數量
#家屬數量 train_cp['家屬數量'].value_counts().plot.bar() sns.distplot(train_cp['家屬數量'])
2.單變量與y值可視化
可用額度比值
# Single feature vs. target.
# Utilisation, debt ratio, age and monthly income are continuous and need binning.
# Revolving utilisation: 5 equal-width bins
train_cp['可用額度比值_cut'] = pd.cut(train_cp['可用額度比值'], 5)
a = pd.crosstab(train_cp['可用額度比值_cut'], train_cp['好壞客戶'])
a.plot.bar()
# Share of bad customers within each bin, drawn as a line
a['壞用戶占比'] = a[1] / (a[0] + a[1])
a['壞用戶占比'].plot()
可見分箱之后,各箱之間的逾期率最多相差約6倍,說明該特征的區分能力還是不錯的
負債率
# Debt ratio: hand-picked bin edges
cut = [-1, 0.2, 0.4, 0.6, 0.8, 1, 1.5, 2, 5, 10, 5000]
train_cp['負債率_cut'] = pd.cut(train_cp['負債率'], bins=cut)
a = pd.crosstab(train_cp['負債率_cut'], train_cp['好壞客戶'])
a.plot.bar()
# Share of bad customers within each bin
a['壞用戶占比'] = a[1] / (a[0] + a[1])
a['壞用戶占比'].plot()
年齡
# Age: hand-picked bin edges
cut = [0, 30, 40, 50, 60, 100]
train_cp['年齡_cut'] = pd.cut(train_cp['年齡'], bins=cut)
a = pd.crosstab(train_cp['年齡_cut'], train_cp['好壞客戶'])
a.plot.bar()
# Share of bad customers within each age band
a['壞用戶占比'] = a[1] / (a[0] + a[1])
a['壞用戶占比'].plot()
為什么老年人這么多,不大現實吧,難道產品主要針對老年用戶
月收入
# Monthly income: hand-picked bin edges
cut = [0, 3000, 5000, 7000, 10000, 15000, 30000, 1000000]
# pd.cut bins are right-closed and left-open, so without include_lowest a
# monthly income of exactly 0 falls outside the first bin and becomes NaN.
train_cp['月收入_cut'] = pd.cut(train_cp['月收入'], bins=cut, include_lowest=True)
a = pd.crosstab(train_cp['月收入_cut'], train_cp['好壞客戶'])
a.plot(kind="bar")
# Share of bad customers within each income band
a['壞用戶占比'] = a[1] / (a[0] + a[1])
a['壞用戶占比'].plot()
逾期30-59天筆數,逾期90天筆數,逾期60-89天筆數 \信貸數量\固定資產貸款量\家屬數量這些暫時不需要分箱:
逾期30-59天筆數
# The past-due counters, credit lines, real-estate loans and dependents are
# plotted per raw value.
# Times 30-59 days past due
a = pd.crosstab(train_cp['逾期30-59天筆數'], train_cp['好壞客戶'])
a.plot.bar()
# Share of bad customers for each count
a['壞用戶占比'] = a[1] / (a[0] + a[1])
a['壞用戶占比'].plot()
逾期90天筆數
# Times 90+ days late
a = pd.crosstab(train_cp['逾期90天筆數'], train_cp['好壞客戶'])
a.plot.bar()
# Share of bad customers for each count
a['壞用戶占比'] = a[1] / (a[0] + a[1])
a['壞用戶占比'].plot()
逾期60-89天筆數
# Times 60-89 days past due
a = pd.crosstab(train_cp['逾期60-89天筆數'], train_cp['好壞客戶'])
a.plot.bar()
# Share of bad customers for each count
a['壞用戶占比'] = a[1] / (a[0] + a[1])
a['壞用戶占比'].plot()
信貸數量
# Number of open credit lines: hand-picked bin edges
cut = [-1, 0, 1, 2, 3, 4, 5, 10, 15, 100]
# Bin the credit-line count itself. The original binned monthly income here,
# so nearly every row fell outside the 0-100 edges and the chart was wrong.
train_cp['信貸數量_cut'] = pd.cut(train_cp['信貸數量'], bins=cut)
a = pd.crosstab(train_cp['信貸數量_cut'], train_cp['好壞客戶'])
a.plot(kind="bar")
# Share of bad customers within each bin
a['壞用戶占比'] = a[1] / (a[0] + a[1])
a['壞用戶占比'].plot()
固定資產貸款量
# Number of real-estate loans or lines
a = pd.crosstab(train_cp['固定資產貸款量'], train_cp['好壞客戶'])
a.plot.bar()
# Share of bad customers for each count
a['壞用戶占比'] = a[1] / (a[0] + a[1])
a['壞用戶占比'].plot()
家屬數量
# Number of dependents
a = pd.crosstab(train_cp['家屬數量'], train_cp['好壞客戶'])
a.plot.bar()
# Share of bad customers for each count
a['壞用戶占比'] = a[1] / (a[0] + a[1])
a['壞用戶占比'].plot()
3.變量之間的相關性:
# Correlation between variables.
# The *_cut helper columns added during exploration are categorical; since
# pandas 2.0 DataFrame.corr() raises on non-numeric columns instead of silently
# skipping them, so restrict to numeric columns explicitly. The matrix is also
# computed once and reused for both charts.
corr = train_cp.select_dtypes(include='number').corr()
# Correlation of every feature with the target, strongest positive first
corr['好壞客戶'].sort_values(ascending=False).plot(kind='bar')
# Full correlation heat map
plt.figure(figsize=(20, 16))
sns.heatmap(corr,
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            linewidths=0.2,
            cmap="YlGnBu",
            annot=True)
四、特征選擇
1.woe分箱
# WOE binning; labels=False makes every cut a Series of integer bin indices.
# Equal-frequency (quantile) bins for the continuous features
cut1 = pd.qcut(train_cp["可用額度比值"], q=4, labels=False)
cut2 = pd.qcut(train_cp["年齡"], q=8, labels=False)
cut4 = pd.qcut(train_cp["負債率"], q=3, labels=False)
cut5 = pd.qcut(train_cp["月收入"], q=4, labels=False)
cut6 = pd.qcut(train_cp["信貸數量"], q=4, labels=False)

# Hand-picked edges for the count features
bins3 = [-1, 0, 1, 3, 5, 13]
bins7 = [-1, 0, 1, 3, 5, 20]
bins8 = [-1, 0, 1, 2, 3, 33]
bins9 = [-1, 0, 1, 3, 12]
bins10 = [-1, 0, 1, 2, 3, 5, 21]
cut3 = pd.cut(train_cp["逾期30-59天筆數"], bins=bins3, labels=False)
cut7 = pd.cut(train_cp["逾期90天筆數"], bins=bins7, labels=False)
cut8 = pd.cut(train_cp["固定資產貸款量"], bins=bins8, labels=False)
cut9 = pd.cut(train_cp["逾期60-89天筆數"], bins=bins9, labels=False)
cut10 = pd.cut(train_cp["家屬數量"], bins=bins10, labels=False)
2.WOE值計算
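WOE 衡量的是:當前這個組中壞客戶和好客戶的比值,和所有樣本中這個比值的差異,即 $\mathrm{WOE}_i=\ln\left(\dfrac{B_i/G_i}{B/G}\right)$,其中 $B_i$、$G_i$ 是第 $i$ 組的壞、好客戶數,$B$、$G$ 是全體樣本的壞、好客戶數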
# WOE computation.
# Overall bad-to-good ratio: bad / (total - bad)
rate = train_cp["好壞客戶"].sum() / (train_cp["好壞客戶"].count() - train_cp["好壞客戶"].sum())


def get_woe_data(cut):
    """Return the WOE of every bin in *cut*.

    For each bin the WOE is ln((bad / good) / rate), where bad and good are the
    counts of target 1 and 0 inside the bin and ``rate`` is the module-level
    overall bad-to-good ratio.

    Args:
        cut: Series of bin labels aligned with ``train_cp``.

    Returns:
        Series of WOE values indexed by bin label.
    """
    counts = train_cp["好壞客戶"].groupby(cut, as_index=True).value_counts().unstack()
    good = counts.iloc[:, 0]
    bad = counts.iloc[:, 1]
    return np.log(bad / good / rate)


cut1_woe = get_woe_data(cut1)
cut2_woe = get_woe_data(cut2)
cut3_woe = get_woe_data(cut3)
cut4_woe = get_woe_data(cut4)
cut5_woe = get_woe_data(cut5)
cut6_woe = get_woe_data(cut6)
cut7_woe = get_woe_data(cut7)
cut8_woe = get_woe_data(cut8)
cut9_woe = get_woe_data(cut9)
cut10_woe = get_woe_data(cut10)
可視化一下:
# Draw the WOE curve of every feature
l = [
    cut1_woe, cut2_woe, cut3_woe, cut4_woe, cut5_woe,
    cut6_woe, cut7_woe, cut8_woe, cut9_woe, cut10_woe,
]
for woe_series in l:
    woe_series.plot()
3.iv值計算
iv值其實就等於各分組的 woe*(當前分組中壞客戶占所有壞客戶的比例 - 當前分組中好客戶占所有好客戶的比例) 之和,即 $\mathrm{IV}=\sum_i\left(\dfrac{B_i}{B}-\dfrac{G_i}{G}\right)\mathrm{WOE}_i$
# IV computation
def get_IV_data(cut, cut_woe):
    """Return the information value (IV) of one binned feature.

    IV = sum over bins of (bad_i / bad_total - good_i / good_total) * WOE_i.

    Args:
        cut: Series of bin labels aligned with ``train_cp``.
        cut_woe: Series of WOE values indexed by bin label.

    Returns:
        The IV as a float.
    """
    target = train_cp["好壞客戶"]
    counts = target.groupby(cut, as_index=True).value_counts().unstack()
    bad_share = counts.iloc[:, 1] / target.sum()
    good_share = counts.iloc[:, 0] / (target.count() - target.sum())
    return ((bad_share - good_share) * cut_woe).sum()


# IV of every feature
cut1_IV = get_IV_data(cut1, cut1_woe)
cut2_IV = get_IV_data(cut2, cut2_woe)
cut3_IV = get_IV_data(cut3, cut3_woe)
cut4_IV = get_IV_data(cut4, cut4_woe)
cut5_IV = get_IV_data(cut5, cut5_woe)
cut6_IV = get_IV_data(cut6, cut6_woe)
cut7_IV = get_IV_data(cut7, cut7_woe)
cut8_IV = get_IV_data(cut8, cut8_woe)
cut9_IV = get_IV_data(cut9, cut9_woe)
cut10_IV = get_IV_data(cut10, cut10_woe)

IV = pd.DataFrame(
    [cut1_IV, cut2_IV, cut3_IV, cut4_IV, cut5_IV,
     cut6_IV, cut7_IV, cut8_IV, cut9_IV, cut10_IV],
    index=['可用額度比值', '年齡', '逾期30-59天筆數', '負債率', '月收入',
           '信貸數量', '逾期90天筆數', '固定資產貸款量', '逾期60-89天筆數', '家屬數量'],
    columns=['IV'],
)
# Bar chart of IV per feature
iv = IV.plot.bar(color='b', alpha=0.3, rot=30, figsize=(10, 5), fontsize=10)
iv.set_title('特征變量與IV值分布圖', fontsize=15)
iv.set_xlabel('特征變量', fontsize=15)
iv.set_ylabel('IV', fontsize=15)
一般選取IV大於0.02的特征變量進行后續訓練,從以上可以看出所有變量均滿足,所以選取全部的
4.woe轉換
df_new = pd.DataFrame()  # holds the WOE-encoded features


def replace_data(cut, cut_woe):
    """Return *cut* with every bin label replaced by that bin's WOE value.

    The lookup goes through the index of ``cut_woe`` (the bin label), so it
    does not depend on positional ordering, cannot confuse an already-written
    WOE value with a later bin label, and leaves missing labels as NaN. Unlike
    the previous in-place ``replace`` loop, the caller's ``cut`` Series is not
    mutated, so the bin labels stay usable afterwards.

    Args:
        cut: Series of bin labels.
        cut_woe: Series of WOE values indexed by bin label.

    Returns:
        A new Series, aligned with ``cut``, holding WOE values.
    """
    return cut.map(cut_woe)


df_new["好壞客戶"] = train_cp["好壞客戶"]
df_new["可用額度比值"] = replace_data(cut1, cut1_woe)
df_new["年齡"] = replace_data(cut2, cut2_woe)
df_new["逾期30-59天筆數"] = replace_data(cut3, cut3_woe)
df_new["負債率"] = replace_data(cut4, cut4_woe)
df_new["月收入"] = replace_data(cut5, cut5_woe)
df_new["信貸數量"] = replace_data(cut6, cut6_woe)
df_new["逾期90天筆數"] = replace_data(cut7, cut7_woe)
df_new["固定資產貸款量"] = replace_data(cut8, cut8_woe)
df_new["逾期60-89天筆數"] = replace_data(cut9, cut9_woe)
df_new["家屬數量"] = replace_data(cut10, cut10_woe)
df_new.head()
五、模型訓練
信用評分卡主要使用的算法模型是邏輯回歸。logistic模型客群變化的敏感度不如其他高復雜度模型,因此穩健更好,魯棒性更強。另外,模型直觀,系數含義好闡述、易理解,使用邏輯回歸優點是可以得到一個變量之間的線性關系式和對應的特征權值,方便后面將其轉成一一對應的分數形式
模型訓練
# Model training
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Features are every column after the target
x = df_new.iloc[:, 1:]
# Pass the target as a 1-D Series. The previous one-column DataFrame slice
# handed scikit-learn a column vector, which it has to flatten with a warning.
y = df_new["好壞客戶"]
# NOTE(review): test_size=0.6 trains on only 40% of the rows; kept as in the
# original so the reported accuracy stays comparable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.6, random_state=0)
model = LogisticRegression()
clf = model.fit(x_train, y_train)
# Plain accuracy on the held-out rows
print('測試成績:{}'.format(clf.score(x_test, y_test)))
測試成績:0.9427326816829579,看似很高,其實是由於數據傾斜太嚴重導致,最終結果還要看auc
求特征權值系數coe,后面訓練結果轉分值時會用到:
coe=clf.coef_ #特征權值系數,后面轉換為打分規則時會用到 coe ''' array([[0.62805638, 0.46284749, 0.54319513, 1.14645109, 0.42744108, 0.2503357 , 0.59564263, 0.81828033, 0.4433141 , 0.23788103]]) '''
六、模型評估
模型評估主要看AUC和K-S值
# Model evaluation: ROC curve and AUC
from sklearn.metrics import roc_curve, auc

# roc_curve needs a continuous score to rank samples. Use the predicted
# probability of the positive class (column 1), not hard 0/1 labels: with
# labels the curve has a single operating point and the AUC is badly
# understated.
y_score = clf.predict_proba(x_test)[:, 1]
fpr, tpr, threshold = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC_curve')
plt.legend(loc="lower right")
plt.show()
# The 0.5757 reported with the original code came from hard 0/1 predictions;
# the value computed from probabilities will differ.
roc_auc
ks
# KS curve
# To change the figure size pass figsize= to plt.subplots; calling plt.figure
# after plotting (as before) only opens a second, empty figure.
fig, ax = plt.subplots()
# roc_curve prepends a sentinel threshold (above every score, infinite in
# recent scikit-learn) where tpr = fpr = 0; skip it so the x-axis stays finite.
# The curve is ordered by descending score, hence the 1 - threshold mirroring.
score_axis = 1 - threshold[1:]
ax.plot(score_axis, tpr[1:], label='tpr')
ax.plot(score_axis, fpr[1:], label='fpr')
ax.plot(score_axis, (tpr - fpr)[1:], label='KS')
ax.set_xlabel('score')
ax.set_title('KS Curve')
ax.set_ylim([0.0, 1.0])
legend = ax.legend(loc='upper left')
plt.show()
# KS statistic: largest gap between tpr and fpr
max(tpr - fpr)
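AUC 約 0.58,K-S 值約 0.15,建模效果一般
為什么準確率這么高但是 AUC 和 K-S 很低?一方面是樣本嚴重不均衡,全部預測為好客戶也有約 94% 的準確率;另一方面,文中的 0.58 和 0.15 是用 `clf.predict` 輸出的 0/1 標簽計算的,ROC 曲線只有一個有效閾值,改用 `clf.predict_proba(x_test)[:,1]` 的概率才能得到模型真實的 AUC 和 K-S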
七、模型結果轉評分
假設好壞比為20的時候分數為600分,每高20分好壞比翻一倍,即 $\mathrm{factor}=20/\ln 2$,$\mathrm{offset}=600-\mathrm{factor}\cdot\ln 20$
現在我們求每個變量不同woe值對應的分數刻度可得:
# Convert model output to score points
factor = 20 / np.log(2)
# NOTE(review): offset is computed but not used in the visible code; the final
# total adds a flat 600 instead.
offset = 600 - 20 * np.log(20) / np.log(2)


def get_score(coe, woe, factor):
    """Return the score points of every bin of one feature.

    Args:
        coe: the feature's model coefficient.
        woe: iterable of the feature's per-bin WOE values.
        factor: scaling factor applied to coefficient * WOE.

    Returns:
        List of scores, one per bin, each rounded to a whole number.
    """
    return [round(coe * w * factor, 0) for w in woe]


x1 = get_score(coe[0][0], cut1_woe, factor)
x2 = get_score(coe[0][1], cut2_woe, factor)
x3 = get_score(coe[0][2], cut3_woe, factor)
x4 = get_score(coe[0][3], cut4_woe, factor)
x5 = get_score(coe[0][4], cut5_woe, factor)
x6 = get_score(coe[0][5], cut6_woe, factor)
x7 = get_score(coe[0][6], cut7_woe, factor)
x8 = get_score(coe[0][7], cut8_woe, factor)
x9 = get_score(coe[0][8], cut9_woe, factor)
x10 = get_score(coe[0][9], cut10_woe, factor)

# Print the per-bin scores of every feature
print("可用額度比值對應的分數:{}".format(x1))
print("年齡對應的分數:{}".format(x2))
print("逾期30-59天筆數對應的分數:{}".format(x3))
print("負債率對應的分數:{}".format(x4))
print("月收入對應的分數:{}".format(x5))
print("信貸數量對應的分數:{}".format(x6))
print("逾期90天筆數對應的分數:{}".format(x7))
print("固定資產貸款量對應的分數:{}".format(x8))
print("逾期60-89天筆數對應的分數:{}".format(x9))
print("家屬數量對應的分數:{}".format(x10))
可用額度比值對應的分數:[-22.0, -21.0, -5.0, 19.0]
年齡對應的分數:[7.0, 5.0, 3.0, 2.0, -0.0, -5.0, -11.0, -14.0]
逾期30-59天筆數對應的分數:[-7.0, 14.0, 27.0, 37.0, 41.0]
負債率對應的分數:[-5.0, -2.0, 6.0]
月收入對應的分數:[4.0, 1.0, -2.0, -4.0]
信貸數量對應的分數:[2.0, -2.0, -1.0, 0.0]
逾期90天筆數對應的分數:[-6.0, 34.0, 48.0, 56.0, 57.0]
固定資產貸款量對應的分數:[5.0, -6.0, -3.0, 2.0, 16.0]
逾期60-89天筆數對應的分數:[-3.0, 23.0, 35.0, 38.0]
家屬數量對應的分數:[-1.0, 1.0, 1.0, 2.0, 3.0, 5.0]
可以看出分數越高,成為壞客戶的可能性越大。像年齡越大壞客率越低,可用額度比值、逾期筆數這幾個變量的分數跨度較大對最后的總分有更大的影響,這些都印證了前面探索分析的結果。
八、計算用戶總得分
1.取自動分箱的邊界分割點
# Recover the edges of the quantile bins; retbins=True makes qcut return a
# (labels, edges) pair.
cu1 = pd.qcut(train_cp["可用額度比值"], q=4, labels=False, retbins=True)
_, bins1 = cu1
cu2 = pd.qcut(train_cp["年齡"], q=8, labels=False, retbins=True)
_, bins2 = cu2
# bins3 and cut3 are not recomputed here (the original kept them commented out);
# the hand-picked edge lists from the WOE binning step are reused.
cu4 = pd.qcut(train_cp["負債率"], q=3, labels=False, retbins=True)
_, bins4 = cu4
cu5 = pd.qcut(train_cp["月收入"], q=4, labels=False, retbins=True)
_, bins5 = cu5
cu6 = pd.qcut(train_cp["信貸數量"], q=4, labels=False, retbins=True)
_, bins6 = cu6
2.各變量對應的分數求和,算出每個用戶的總分
# Look up each feature's score and sum them into a total score per user
def compute_score(series, bins, score):
    """Return the score of the bin each value of *series* falls into.

    Bin membership follows the convention pd.cut / pd.qcut used when the bins
    were built: intervals are right-closed, (bins[k], bins[k + 1]], and the
    first one also includes its left edge. The previous ``>=`` scan pushed
    every value equal to an edge into the next bin and wrapped values below
    ``bins[0]`` around to the last bin.

    Values below the first edge use the first bin. Values above the last edge,
    and NaN, use the last bin (for NaN this matches the earlier behaviour).

    Args:
        series: sequence of raw feature values.
        bins: ascending bin edges (n + 1 edges for n bins).
        score: sequence with one score per bin.

    Returns:
        List with one score per value of *series*.
    """
    edges = np.asarray(bins, dtype=float)
    values = np.asarray(series, dtype=float)
    # side="left" yields i with edges[i - 1] < v <= edges[i], i.e. bin i - 1
    idx = np.searchsorted(edges, values, side="left") - 1
    idx = np.clip(idx, 0, len(edges) - 2)
    return [score[i] for i in idx]


path2 = r'F:\\python\\Give-me-some-credit-master\\data\\cs-test.csv'
test1 = pd.read_csv(path2)

# Training imputed missing income with the training mean; do the same for
# scoring so missing incomes are not scored as the highest income bin. The
# MonthlyIncome column written to the output file is left untouched.
income_for_scoring = test1['MonthlyIncome'].fillna(train['月收入'].mean())
# NOTE(review): rows with missing NumberOfDependents were dropped in training;
# here they still receive the last bin's score - confirm the intended rule.

test1['x1'] = pd.Series(compute_score(test1['RevolvingUtilizationOfUnsecuredLines'], bins1, x1))
test1['x2'] = pd.Series(compute_score(test1['age'], bins2, x2))
test1['x3'] = pd.Series(compute_score(test1['NumberOfTime30-59DaysPastDueNotWorse'], bins3, x3))
test1['x4'] = pd.Series(compute_score(test1['DebtRatio'], bins4, x4))
test1['x5'] = pd.Series(compute_score(income_for_scoring, bins5, x5))
test1['x6'] = pd.Series(compute_score(test1['NumberOfOpenCreditLinesAndLoans'], bins6, x6))
test1['x7'] = pd.Series(compute_score(test1['NumberOfTimes90DaysLate'], bins7, x7))
test1['x8'] = pd.Series(compute_score(test1['NumberRealEstateLoansOrLines'], bins8, x8))
test1['x9'] = pd.Series(compute_score(test1['NumberOfTime60-89DaysPastDueNotWorse'], bins9, x9))
test1['x10'] = pd.Series(compute_score(test1['NumberOfDependents'], bins10, x10))

# Total score = base 600 plus the ten feature scores
score_cols = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
test1['Score'] = test1[score_cols].sum(axis=1, skipna=False) + 600
test1.to_csv(r'F:\\python\\Give-me-some-credit-master\\data\\ScoreData.csv', index=False)
文章轉載:https://www.cnblogs.com/daliner/p/10268350.html
全部代碼:

# -*- coding: utf-8 -*-
"""
Created on Tue Aug 11 14:09:20 2020

@author: Admin

Credit scorecard script: load the training data, clean it, explore it, bin the
features and compute WOE / IV, fit a logistic regression, evaluate it with
ROC / KS, turn the coefficients into per-bin scores and score the test file.
"""
# Import libraries
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

# "%matplotlib inline" is IPython-only syntax and a SyntaxError in a plain .py
# file, so enable inline plotting through the API when IPython is present.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass  # running under plain Python

plt.rc("font", family="SimHei", size="12")  # SimHei so Chinese labels render

# ---------------------------------------------------------------- load data
train = pd.read_csv('F:\\python\\Give-me-some-credit-master\\data\\cs-training.csv')
# Quick look at dtypes and non-null counts
train.info()
# First and last three rows (DataFrame.append was removed in pandas 2.0)
b = pd.concat([train.head(3), train.tail(3)])
train.shape  # 12 columns here; 11 once the id column becomes the index below
# Map the English column names to Chinese display names
states = {
    'Unnamed: 0': 'id',
    'SeriousDlqin2yrs': '好壞客戶',
    'RevolvingUtilizationOfUnsecuredLines': '可用額度比值',
    'age': '年齡',
    'NumberOfTime30-59DaysPastDueNotWorse': '逾期30-59天筆數',
    'DebtRatio': '負債率',
    'MonthlyIncome': '月收入',
    'NumberOfOpenCreditLinesAndLoans': '信貸數量',
    'NumberOfTimes90DaysLate': '逾期90天筆數',
    'NumberRealEstateLoansOrLines': '固定資產貸款量',
    'NumberOfTime60-89DaysPastDueNotWorse': '逾期60-89天筆數',
    'NumberOfDependents': '家屬數量',
}
train.rename(columns=states, inplace=True)
# Use the id column as the index
train = train.set_index('id', drop=True)
# Descriptive statistics
train.describe()

# ------------------------------------------------- missing values / outliers
# Missing count and ratio per column
train.isnull().sum()
train.isnull().sum() / len(train)
# Bar chart of the columns that have missing values, sorted ascending
missing = train.isnull().sum()
missing[missing > 0].sort_values().plot.bar()
# Keep the raw data; clean a copy
train_cp = train.copy()
# Impute monthly income with the mean
train_cp.fillna({'月收入': train_cp['月收入'].mean()}, inplace=True)
train_cp.isnull().sum()
# Drop the rows that still have missing values (dependents)
train_cp = train_cp.dropna()
train_cp.shape  # (146076, 11)
# Box plot per column to spot outliers
for col in train_cp.columns:
    plt.boxplot(train_cp[col])
    plt.title(col)
    plt.show()
# Remove illogical values and extreme outliers
train_cp = train_cp[train_cp['可用額度比值'] < 1]
train_cp = train_cp[train_cp['年齡'] > 0]
train_cp = train_cp[train_cp['逾期30-59天筆數'] < 80]
train_cp = train_cp[train_cp['逾期60-89天筆數'] < 80]
train_cp = train_cp[train_cp['逾期90天筆數'] < 80]
train_cp = train_cp[train_cp['固定資產貸款量'] < 50]
train_cp = train_cp[train_cp['負債率'] < 5000]
train_cp.shape  # (141180, 11)

# ------------------------------------------------------ univariate analysis
# Target balance
train_cp.info()
train_cp['好壞客戶'].value_counts()
train_cp['好壞客戶'].value_counts() / len(train_cp)
train_cp['好壞客戶'].value_counts().plot.bar()
# Revolving utilisation and debt ratio
train_cp['可用額度比值'].plot.hist()
train_cp['負債率'].plot.hist()
# Debt ratios above 1 dominate the axis, so plot only values <= 1
a = train_cp['負債率']
a[a <= 1].plot.hist()
# The three past-due counters, side by side
for i, col in enumerate(['逾期30-59天筆數', '逾期90天筆數', '逾期60-89天筆數']):
    plt.subplot(1, 3, i + 1)
    train_cp[col].value_counts().plot.bar()
    plt.title(col)
train_cp['逾期30-59天筆數'].value_counts().plot.bar()
train_cp['逾期90天筆數'].value_counts().plot.bar()
train_cp['逾期60-89天筆數'].value_counts().plot.bar()
# Age
train_cp['年齡'].plot.hist()
# Monthly income
train_cp['月收入'].plot.hist()
sns.distplot(train_cp['月收入'])
# Extreme outliers flatten the chart: restrict to <= 50,000, then <= 20,000
a = train_cp['月收入']
a[a <= 50000].plot.hist()
a = train_cp['月收入']
a[a <= 20000].plot.hist()
# Open credit lines
train_cp['信貸數量'].value_counts().plot.bar()
sns.distplot(train_cp['信貸數量'])
# Real-estate loans
train_cp['固定資產貸款量'].value_counts().plot.bar()
sns.distplot(train_cp['固定資產貸款量'])
# Dependents
train_cp['家屬數量'].value_counts().plot.bar()
sns.distplot(train_cp['家屬數量'])


# ------------------------------------------------------- feature vs. target
def _plot_bad_rate(groups):
    """Plot good/bad counts per group and overlay the bad-customer share.

    Returns the crosstab with the extra share column.
    """
    table = pd.crosstab(groups, train_cp['好壞客戶'])
    table.plot(kind="bar")
    table['壞用戶占比'] = table[1] / (table[0] + table[1])
    table['壞用戶占比'].plot()
    return table


# Continuous features are binned first
train_cp['可用額度比值_cut'] = pd.cut(train_cp['可用額度比值'], 5)
a = _plot_bad_rate(train_cp['可用額度比值_cut'])
# Debt ratio
cut = [-1, 0.2, 0.4, 0.6, 0.8, 1, 1.5, 2, 5, 10, 5000]
train_cp['負債率_cut'] = pd.cut(train_cp['負債率'], bins=cut)
a = _plot_bad_rate(train_cp['負債率_cut'])
# Age
cut = [0, 30, 40, 50, 60, 100]
train_cp['年齡_cut'] = pd.cut(train_cp['年齡'], bins=cut)
a = _plot_bad_rate(train_cp['年齡_cut'])
# Monthly income; include_lowest keeps an income of exactly 0 in the first bin
cut = [0, 3000, 5000, 7000, 10000, 15000, 30000, 1000000]
train_cp['月收入_cut'] = pd.cut(train_cp['月收入'], bins=cut, include_lowest=True)
a = _plot_bad_rate(train_cp['月收入_cut'])
# Count features are plotted per raw value
a = _plot_bad_rate(train_cp['逾期30-59天筆數'])
a = _plot_bad_rate(train_cp['逾期90天筆數'])
a = _plot_bad_rate(train_cp['逾期60-89天筆數'])
# Open credit lines: bin the credit-line count itself (the original binned
# monthly income here by mistake)
cut = [-1, 0, 1, 2, 3, 4, 5, 10, 15, 100]
train_cp['信貸數量_cut'] = pd.cut(train_cp['信貸數量'], bins=cut)
a = _plot_bad_rate(train_cp['信貸數量_cut'])
a = _plot_bad_rate(train_cp['固定資產貸款量'])
a = _plot_bad_rate(train_cp['家屬數量'])

# Correlation between variables. The *_cut columns are categorical and make
# DataFrame.corr() raise on pandas >= 2.0, so use numeric columns only.
corr = train_cp.select_dtypes(include='number').corr()
corr['好壞客戶'].sort_values(ascending=False).plot(kind='bar')
plt.figure(figsize=(20, 16))
sns.heatmap(corr,
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            linewidths=0.2,
            cmap="YlGnBu",
            annot=True)

# --------------------------------------------------------------- WOE binning
cut1 = pd.qcut(train_cp["可用額度比值"], 4, labels=False)
cut2 = pd.qcut(train_cp["年齡"], 8, labels=False)
bins3 = [-1, 0, 1, 3, 5, 13]
cut3 = pd.cut(train_cp["逾期30-59天筆數"], bins3, labels=False)
cut4 = pd.qcut(train_cp["負債率"], 3, labels=False)
cut5 = pd.qcut(train_cp["月收入"], 4, labels=False)
cut6 = pd.qcut(train_cp["信貸數量"], 4, labels=False)
bins7 = [-1, 0, 1, 3, 5, 20]
cut7 = pd.cut(train_cp["逾期90天筆數"], bins7, labels=False)
bins8 = [-1, 0, 1, 2, 3, 33]
cut8 = pd.cut(train_cp["固定資產貸款量"], bins8, labels=False)
bins9 = [-1, 0, 1, 3, 12]
cut9 = pd.cut(train_cp["逾期60-89天筆數"], bins9, labels=False)
bins10 = [-1, 0, 1, 2, 3, 5, 21]
cut10 = pd.cut(train_cp["家屬數量"], bins10, labels=False)

# Overall bad-to-good ratio: bad / (total - bad)
rate = train_cp["好壞客戶"].sum() / (train_cp["好壞客戶"].count() - train_cp["好壞客戶"].sum())


def get_woe_data(cut):
    """Return the per-bin WOE, ln((bad / good) / rate), indexed by bin label."""
    grouped = train_cp["好壞客戶"].groupby(cut, as_index=True).value_counts()
    counts = grouped.unstack()
    woe = np.log(counts.iloc[:, 1] / counts.iloc[:, 0] / rate)
    return woe


cut1_woe = get_woe_data(cut1)
cut2_woe = get_woe_data(cut2)
cut3_woe = get_woe_data(cut3)
cut4_woe = get_woe_data(cut4)
cut5_woe = get_woe_data(cut5)
cut6_woe = get_woe_data(cut6)
cut7_woe = get_woe_data(cut7)
cut8_woe = get_woe_data(cut8)
cut9_woe = get_woe_data(cut9)
cut10_woe = get_woe_data(cut10)

# WOE curve of every feature
l = [cut1_woe, cut2_woe, cut3_woe, cut4_woe, cut5_woe,
     cut6_woe, cut7_woe, cut8_woe, cut9_woe, cut10_woe]
for i, col in enumerate(l):
    col.plot()


def get_IV_data(cut, cut_woe):
    """Return the IV: sum over bins of (bad share - good share) * WOE."""
    grouped = train_cp["好壞客戶"].groupby(cut, as_index=True).value_counts()
    counts = grouped.unstack()
    bad_total = train_cp["好壞客戶"].sum()
    good_total = train_cp["好壞客戶"].count() - bad_total
    cut_IV = ((counts.iloc[:, 1] / bad_total - counts.iloc[:, 0] / good_total) * cut_woe).sum()
    return cut_IV


# IV of every feature
cut1_IV = get_IV_data(cut1, cut1_woe)
cut2_IV = get_IV_data(cut2, cut2_woe)
cut3_IV = get_IV_data(cut3, cut3_woe)
cut4_IV = get_IV_data(cut4, cut4_woe)
cut5_IV = get_IV_data(cut5, cut5_woe)
cut6_IV = get_IV_data(cut6, cut6_woe)
cut7_IV = get_IV_data(cut7, cut7_woe)
cut8_IV = get_IV_data(cut8, cut8_woe)
cut9_IV = get_IV_data(cut9, cut9_woe)
cut10_IV = get_IV_data(cut10, cut10_woe)
IV = pd.DataFrame(
    [cut1_IV, cut2_IV, cut3_IV, cut4_IV, cut5_IV,
     cut6_IV, cut7_IV, cut8_IV, cut9_IV, cut10_IV],
    index=['可用額度比值', '年齡', '逾期30-59天筆數', '負債率', '月收入',
           '信貸數量', '逾期90天筆數', '固定資產貸款量', '逾期60-89天筆數', '家屬數量'],
    columns=['IV'])
iv = IV.plot.bar(color='b', alpha=0.3, rot=30, figsize=(10, 5), fontsize=(10))
iv.set_title('特征變量與IV值分布圖', fontsize=(15))
iv.set_xlabel('特征變量', fontsize=(15))
iv.set_ylabel('IV', fontsize=(15))

# ------------------------------------------------------------- WOE encoding
df_new = pd.DataFrame()  # holds the WOE-encoded features


def replace_data(cut, cut_woe):
    """Return *cut* with each bin label replaced by its WOE value.

    The lookup uses the index of ``cut_woe`` and does not mutate ``cut``.
    """
    return cut.map(cut_woe)


df_new["好壞客戶"] = train_cp["好壞客戶"]
df_new["可用額度比值"] = replace_data(cut1, cut1_woe)
df_new["年齡"] = replace_data(cut2, cut2_woe)
df_new["逾期30-59天筆數"] = replace_data(cut3, cut3_woe)
df_new["負債率"] = replace_data(cut4, cut4_woe)
df_new["月收入"] = replace_data(cut5, cut5_woe)
df_new["信貸數量"] = replace_data(cut6, cut6_woe)
df_new["逾期90天筆數"] = replace_data(cut7, cut7_woe)
df_new["固定資產貸款量"] = replace_data(cut8, cut8_woe)
df_new["逾期60-89天筆數"] = replace_data(cut9, cut9_woe)
df_new["家屬數量"] = replace_data(cut10, cut10_woe)
df_new.head()

# ------------------------------------------------------------ model training
x = df_new.iloc[:, 1:]
y = df_new["好壞客戶"]  # 1-D target; a one-column DataFrame triggers a warning
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.6, random_state=0)
model = LogisticRegression()
clf = model.fit(x_train, y_train)
print('測試成績:{}'.format(clf.score(x_test, y_test)))
# Coefficients, reused when building the scoring rules
coe = clf.coef_
coe
# Hard class predictions on the test split (kept for reference)
y_pred = clf.predict(x_test)
# Predicted probability of the bad class; this is what ROC / KS must rank by
y_score = clf.predict_proba(x_test)[:, 1]

# ---------------------------------------------------------- model evaluation
fpr, tpr, threshold = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC_curve')
plt.legend(loc="lower right")
plt.show()
# The 0.5757 reported originally was computed from hard 0/1 predictions
roc_auc

# KS curve. The first threshold is a sentinel above every score; skip it so the
# x-axis stays finite. To resize the chart pass figsize= to plt.subplots.
fig, ax = plt.subplots()
score_axis = 1 - threshold[1:]  # mirror so the axis runs with descending score
ax.plot(score_axis, tpr[1:], label='tpr')
ax.plot(score_axis, fpr[1:], label='fpr')
ax.plot(score_axis, (tpr - fpr)[1:], label='KS')
ax.set_xlabel('score')
ax.set_title('KS Curve')
ax.set_ylim([0.0, 1.0])
legend = ax.legend(loc='upper left')
plt.show()
max(tpr - fpr)

# ------------------------------------------------------- model -> score points
factor = 20 / np.log(2)
# NOTE(review): offset is computed but never used; the total below adds 600.
offset = 600 - 20 * np.log(20) / np.log(2)


def get_score(coe, woe, factor):
    """Return round(coe * w * factor) for every WOE value w, as a list."""
    scores = []
    for w in woe:
        score = round(coe * w * factor, 0)
        scores.append(score)
    return scores


x1 = get_score(coe[0][0], cut1_woe, factor)
x2 = get_score(coe[0][1], cut2_woe, factor)
x3 = get_score(coe[0][2], cut3_woe, factor)
x4 = get_score(coe[0][3], cut4_woe, factor)
x5 = get_score(coe[0][4], cut5_woe, factor)
x6 = get_score(coe[0][5], cut6_woe, factor)
x7 = get_score(coe[0][6], cut7_woe, factor)
x8 = get_score(coe[0][7], cut8_woe, factor)
x9 = get_score(coe[0][8], cut9_woe, factor)
x10 = get_score(coe[0][9], cut10_woe, factor)
print("可用額度比值對應的分數:{}".format(x1))
print("年齡對應的分數:{}".format(x2))
print("逾期30-59天筆數對應的分數:{}".format(x3))
print("負債率對應的分數:{}".format(x4))
print("月收入對應的分數:{}".format(x5))
print("信貸數量對應的分數:{}".format(x6))
print("逾期90天筆數對應的分數:{}".format(x7))
print("固定資產貸款量對應的分數:{}".format(x8))
print("逾期60-89天筆數對應的分數:{}".format(x9))
print("家屬數量對應的分數:{}".format(x10))

# ------------------------------------------------------- score the test file
# Edges of the quantile bins (the manual bins reuse bins3 / bins7..bins10)
cu1 = pd.qcut(train_cp["可用額度比值"], 4, labels=False, retbins=True)
bins1 = cu1[1]
cu2 = pd.qcut(train_cp["年齡"], 8, labels=False, retbins=True)
bins2 = cu2[1]
cu4 = pd.qcut(train_cp["負債率"], 3, labels=False, retbins=True)
bins4 = cu4[1]
cu5 = pd.qcut(train_cp["月收入"], 4, labels=False, retbins=True)
bins5 = cu5[1]
cu6 = pd.qcut(train_cp["信貸數量"], 4, labels=False, retbins=True)
bins6 = cu6[1]


def compute_score(series, bins, score):
    """Return the score of the bin each value of *series* falls into.

    Uses the same right-closed intervals as pd.cut / pd.qcut,
    (bins[k], bins[k + 1]], with the first bin also including its left edge.
    Values below the first edge use the first bin; values above the last edge,
    and NaN, use the last bin.
    """
    edges = np.asarray(bins, dtype=float)
    values = np.asarray(series, dtype=float)
    idx = np.searchsorted(edges, values, side="left") - 1
    idx = np.clip(idx, 0, len(edges) - 2)
    return [score[i] for i in idx]


path2 = r'F:\\python\\Give-me-some-credit-master\\data\\cs-test.csv'
test1 = pd.read_csv(path2)
# Impute missing income with the training mean, as training did, so missing
# incomes are not scored as the highest income bin. The output column is
# left untouched.
income_for_scoring = test1['MonthlyIncome'].fillna(train['月收入'].mean())
test1['x1'] = pd.Series(compute_score(test1['RevolvingUtilizationOfUnsecuredLines'], bins1, x1))
test1['x2'] = pd.Series(compute_score(test1['age'], bins2, x2))
test1['x3'] = pd.Series(compute_score(test1['NumberOfTime30-59DaysPastDueNotWorse'], bins3, x3))
test1['x4'] = pd.Series(compute_score(test1['DebtRatio'], bins4, x4))
test1['x5'] = pd.Series(compute_score(income_for_scoring, bins5, x5))
test1['x6'] = pd.Series(compute_score(test1['NumberOfOpenCreditLinesAndLoans'], bins6, x6))
test1['x7'] = pd.Series(compute_score(test1['NumberOfTimes90DaysLate'], bins7, x7))
test1['x8'] = pd.Series(compute_score(test1['NumberRealEstateLoansOrLines'], bins8, x8))
test1['x9'] = pd.Series(compute_score(test1['NumberOfTime60-89DaysPastDueNotWorse'], bins9, x9))
test1['x10'] = pd.Series(compute_score(test1['NumberOfDependents'], bins10, x10))
# Total score = base 600 plus the ten feature scores
test1['Score'] = test1['x1']+test1['x2']+test1['x3']+test1['x4']+test1['x5']+test1['x6']+test1['x7']+test1['x8']+test1['x9']+test1['x10']+600
test1.to_csv(r'F:\\python\\Give-me-some-credit-master\\data\\ScoreData.csv', index=False)