(Credit Risk Control IX) Implementing a Behavior Scorecard Model in Python



Reposted from: https://blog.csdn.net/LuYi_WeiLin/article/details/87968830

A Brief Look at Behavior Scorecards
As we know, behavior scorecards are used mainly in the loan-servicing stage of lending, i.e. the period after a loan is disbursed and before it matures. In implementation, a behavior scorecard differs little from an application scorecard; the main difference is the dataset. Since the previous post already introduced behavior scorecards, we will not dwell on the theory here and will focus on the code.

 

Behavior Scorecard Modeling Steps
Data preprocessing
Feature derivation
Feature processing and screening
Variable binning
Model parameter estimation
Feature selection
Model performance testing
Converting probabilities to scores
 

The code is as follows:

One difference from the application scorecard code: because the logistic regression for the behavior model yielded some positive coefficients and some insignificant variables, variable selection here additionally uses GBDT feature importance and LASSO. Since the dataset differs, some of the later variable-selection details may not match exactly, but the overall code is sound, and the approach and workflow are fully reflected in it. The dataset can be downloaded from my blog resources.

 

 

import pandas as pd
import numpy as np
import pickle
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn import ensemble
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
 
 
#################################
# The data has already been partially cleaned (it is not raw first-hand data), so some
# preprocessing steps are skipped and we move straight to variable derivation.
#   1. Read the data and derive the initial variables   #
'''
Loan_Amount: total credit limit
OS: outstanding (unpaid) balance
Payment: repayment amount
Spend: amount spent in the month
Delq: delinquency status
'''
#################################
folderOfData = 'H:'
trainData = pd.read_csv(folderOfData+'/訓練集.csv',header = 0,engine ='python')  # training set
testData = pd.read_csv(folderOfData+'/測試集.csv',header = 0,engine ='python')   # test set
 
 
#Function to derive delinquency-type features
def DelqFeatures(event,window,type):
    '''
    :param event: a row of the dataframe (one account)
    :param window: look-back window, in months
    :param type: the type of feature to derive
    '''
    current = 12
    start = 12 - window + 1
    #delq1/delq2/delq3 pull the Delq columns that fall inside the look-back window
    delq1 = [event[a] for a in ['Delq1_' + str(t) for t in range(current, start - 1, -1)]]
    delq2 = [event[a] for a in ['Delq2_' + str(t) for t in range(current, start - 1, -1)]]
    delq3 = [event[a] for a in ['Delq3_' + str(t) for t in range(current, start - 1, -1)]]
    if type == 'max delq':
        if max(delq3) == 1:
            return 3
        elif max(delq2) == 1:
            return 2
        elif max(delq1) == 1:
            return 1
        else:
            return 0
    if type in ['M0 times','M1 times', 'M2 times']:
        if type.find('M0')>-1:
            return sum(delq1)
        elif type.find('M1')>-1:
            return sum(delq2)
        else:
            return sum(delq3)
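
# Quick illustration (a sketch added for this write-up; the toy values are made up):
# with a 2-month window, the function reads the Delq*_12 and Delq*_11 columns.
toyDelq = {'Delq1_12': 1, 'Delq1_11': 0, 'Delq2_12': 0, 'Delq2_11': 1, 'Delq3_12': 0, 'Delq3_11': 0}
print(DelqFeatures(toyDelq, 2, 'max delq'))   # 2 : the worst status in the window was M1
print(DelqFeatures(toyDelq, 2, 'M0 times'))   # 1 : the sum of the Delq1 flags in the window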
 
allFeatures = []
 
'''
Delinquency-type features are usually highly significant in a behavior scorecard
(which predicts default behavior). By varying the time window, the following types
of delinquency variables can be derived:
'''
 
# look back over the past 1, 3, 6 and 12 months
for t in [1,3,6,12]:
    # 1. maximum delinquency status within the past t-month window
    allFeatures.append('maxDelqL'+str(t)+"M")
    trainData['maxDelqL'+str(t)+"M"] = trainData.apply(lambda x: DelqFeatures(x,t,'max delq'),axis=1)
 
    # 2. number of M0, M1 and M2 events within the past t-month window
    allFeatures.append('M0FreqL' + str(t) + "M")
    trainData['M0FreqL' + str(t) + "M"] = trainData.apply(lambda x: DelqFeatures(x,t,'M0 times'),axis=1)
 
    allFeatures.append('M1FreqL' + str(t) + "M")
    trainData['M1FreqL' + str(t) + "M"] = trainData.apply(lambda x: DelqFeatures(x, t, 'M1 times'), axis=1)
 
    allFeatures.append('M2FreqL' + str(t) + "M")
    trainData['M2FreqL' + str(t) + "M"] = trainData.apply(lambda x: DelqFeatures(x, t, 'M2 times'), axis=1)
 
 
 
#Function to derive utilization-rate features
def UrateFeatures(event, window, type):
    '''
    :param event: a row of the dataframe (one account)
    :param window: look-back window, in months
    :param type: the type of feature to derive
    '''
    current = 12
    start = 12 - window + 1
    #spend columns that fall inside the look-back window
    monthlySpend = [event[a] for a in ['Spend_' + str(t) for t in range(current, start - 1, -1)]]
    #total credit limit
    limit = event['Loan_Amount']
    #monthly utilization rate
    monthlyUrate = [x / limit for x in monthlySpend]
    if type == 'mean utilization rate':
        return np.mean(monthlyUrate)
    if type == 'max utilization rate':
        return max(monthlyUrate)
    #number of months in which the monthly utilization rate increased
    if type == 'increase utilization rate':
        #monthlyUrate is ordered newest-first; [0:-1] drops the oldest month
        currentUrate = monthlyUrate[0:-1]
        #[1:] drops the newest month, so each pair is (month m, month m-1)
        previousUrate = monthlyUrate[1:]
        compareUrate = [int(x[0]>x[1]) for x in zip(currentUrate,previousUrate)]
        return sum(compareUrate)
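
# Quick illustration (a sketch added for this write-up; the numbers are made up):
# monthlyUrate is ordered newest-first. With a 3-month window, Loan_Amount=1000 and
# Spend_12=500, Spend_11=300, Spend_10=400, monthlyUrate = [0.5, 0.3, 0.4]; zip pairs
# each month with the month before it, giving [int(0.5>0.3), int(0.3>0.4)] = [1, 0],
# so one month (month 12 vs month 11) saw utilization increase.
toyUrate = {'Loan_Amount': 1000, 'Spend_12': 500, 'Spend_11': 300, 'Spend_10': 400}
print(UrateFeatures(toyUrate, 3, 'increase utilization rate'))   # 1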
 
 
'''
Utilization-rate features are usually highly correlated with default
in a behavior scorecard model
'''
# look back over the past 1, 3, 6 and 12 months
for t in [1,3,6,12]:
    # 1. maximum monthly utilization rate within the past t-month window
    allFeatures.append('maxUrateL' + str(t) + "M")
    trainData['maxUrateL' + str(t) + "M"] = trainData.apply(lambda x: UrateFeatures(x,t,'max utilization rate'),axis = 1)
 
    # 2. average monthly utilization rate within the past t-month window
    allFeatures.append('avgUrateL' + str(t) + "M")
    trainData['avgUrateL' + str(t) + "M"] = trainData.apply(lambda x: UrateFeatures(x, t, 'mean utilization rate'),axis=1)
 
    # 3. number of months with increasing utilization within the past t-month window; requires t > 1
    if t > 1:
        allFeatures.append('increaseUrateL' + str(t) + "M")
        trainData['increaseUrateL' + str(t) + "M"] = trainData.apply(lambda x: UrateFeatures(x, t, 'increase utilization rate'),axis=1)
 
 
#Function to derive repayment-type features
def PaymentFeatures(event, window, type):
    '''
    :param event: a row of the dataframe (one account)
    :param window: look-back window, in months
    :param type: the type of feature to derive
    '''
    current = 12
    start = 12 - window + 1
    #monthly repayment amounts within the window
    currentPayment = [event[a] for a in ['Payment_' + str(t) for t in range(current, start - 1, -1)]]
    #outstanding balances shifted back one month: the payment in month t repays the balance of month t-1
    previousOS = [event[a] for a in ['OS_' + str(t) for t in range(current-1, start - 2, -1)]]
    monthlyPayRatio = []
    for Pay_OS in zip(currentPayment,previousOS):
        #a repayment ratio only arises if there was a balance the previous month
        if Pay_OS[1]>0:
            payRatio = Pay_OS[0]*1.0 / Pay_OS[1]
            monthlyPayRatio.append(payRatio)
        #no balance the previous month: treat it as 100% repaid
        else:
            monthlyPayRatio.append(1)
    if type == 'min payment ratio':
        return min(monthlyPayRatio)
    if type == 'max payment ratio':
        return max(monthlyPayRatio)
    if type == 'mean payment ratio':
        total_payment = sum(currentPayment)
        total_OS = sum(previousOS)
        if total_OS > 0:
            return total_payment / total_OS
        else:
            return 1
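
# Quick illustration (a sketch added for this write-up; the numbers are made up):
# with a 2-month window, Payment_12 and Payment_11 are divided by OS_11 and OS_10:
# each month's payment is measured against the PREVIOUS month's outstanding balance.
toyPay = {'Payment_12': 50, 'Payment_11': 0, 'OS_11': 100, 'OS_10': 0}
print(PaymentFeatures(toyPay, 2, 'min payment ratio'))    # 0.5 : month 12 repaid half of OS_11
print(PaymentFeatures(toyPay, 2, 'mean payment ratio'))   # 0.5 : total payments / total prior balances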
 
'''
Repayment-type features are also commonly used in behavior scorecard models
'''
# look back over the past 1, 3, 6 and 12 months
for t in [1,3,6,12]:
    # 1. maximum monthly repayment ratio within the past t-month window
    allFeatures.append('maxPayL' + str(t) + "M")
    trainData['maxPayL' + str(t) + "M"] = trainData.apply(lambda x: PaymentFeatures(x, t, 'max payment ratio'),axis=1)
    # 2. minimum monthly repayment ratio within the past t-month window
    allFeatures.append('minPayL' + str(t) + "M")
    trainData['minPayL' + str(t) + "M"] = trainData.apply(lambda x: PaymentFeatures(x, t, 'min payment ratio'),axis=1)
    # 3. average monthly repayment ratio within the past t-month window
    allFeatures.append('avgPayL' + str(t) + "M")
    trainData['avgPayL' + str(t) + "M"] = trainData.apply(lambda x: PaymentFeatures(x, t, 'mean payment ratio'),axis=1)
 
 
 
###Helper functions########
#Compute each bin's bad-sample rate after binning
def BinBadRate(df, col, target, grantRateIndicator=0):
    '''
    :param df: dataset on which to compute good/bad rates
    :param col: the feature for which to compute good/bad rates
    :param target: the good/bad label
    :param grantRateIndicator: 1 to also return the overall bad rate, 0 otherwise
    :return: the per-bin bad rates, plus the overall bad rate (when grantRateIndicator==1)
    '''
    total = df.groupby([col])[target].count()
    total = pd.DataFrame({'total': total})
    bad = df.groupby([col])[target].sum()
    bad = pd.DataFrame({'bad': bad})
    #merge the total counts and the bad counts
    regroup = total.merge(bad, left_index=True, right_index=True, how='left')
    regroup.reset_index(level=0, inplace=True)
    #bad rate per bin
    regroup['bad_rate'] = regroup.apply(lambda x: x.bad * 1.0 / x.total, axis=1)
    #dict mapping each value of col to its bad rate
    dicts = dict(zip(regroup[col],regroup['bad_rate']))
    if grantRateIndicator==0:
        return (dicts, regroup)
    N = sum(regroup['total'])
    B = sum(regroup['bad'])
    #overall bad rate
    overallRate = B * 1.0 / N
    return (dicts, regroup, overallRate)
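
# Quick illustration (a sketch added for this write-up):
toyBins = pd.DataFrame({'grade': ['A','A','B','B'], 'label': [1, 0, 0, 0]})
print(BinBadRate(toyBins, 'grade', 'label')[0])      # {'A': 0.5, 'B': 0.0}
print(BinBadRate(toyBins, 'grade', 'label', 1)[2])   # 0.25, the overall bad rate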
 
 
## Check whether a variable's bad rate is monotone across bins
def BadRateMonotone(df, sortByVar, target,special_attribute = []):
    '''
    :param df: dataframe containing the variable to check and the target
    :param sortByVar: the variable whose bad rate is checked
    :param target: target variable; 0 = good, 1 = bad
    :param special_attribute: special values excluded from the check
    :return: whether the bad rate is monotone
    '''
    df2 = df.loc[~df[sortByVar].isin(special_attribute)]
    if len(set(df2[sortByVar])) <= 2:
        return True
    regroup = BinBadRate(df2, sortByVar, target)[1]
    combined = zip(regroup['total'],regroup['bad'])
    badRate = [x[1]*1.0/x[0] for x in combined]
    badRateNotMonotone = [(badRate[i] < badRate[i+1] and badRate[i] < badRate[i-1]) or
                          (badRate[i] > badRate[i+1] and badRate[i] > badRate[i-1])
                          for i in range(1,len(badRate)-1)]
    if True in badRateNotMonotone:
        return False
    else:
        return True
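
# Quick illustration (a sketch added for this write-up): bad rates of 0.1 / 0.3 / 0.2
# across three bins have an interior peak, so the monotonicity check fails.
toyMono = pd.DataFrame({'bin': [0]*10 + [1]*10 + [2]*10,
                        'label': [1] + [0]*9 + [1]*3 + [0]*7 + [1]*2 + [0]*8})
print(BadRateMonotone(toyMono, 'bin', 'label'))   # False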
 
 
############################
#   2. Binning, WOE computation and encoding   #
############################

'''
Binning and WOE computation for categorical variables.
Whether a variable is categorical can be judged by counting its distinct values.
'''
#categorical variables
categoricalFeatures = []
#numerical variables
numericalFeatures = []
WOE_IV_dict = {}
for var in allFeatures:
    if len(set(trainData[var])) > 5:
        numericalFeatures.append(var)
    else:
        categoricalFeatures.append(var)
 
not_monotone = []
for var in categoricalFeatures:
    #check whether the bad rate is monotone across the bins
    if not BadRateMonotone(trainData, var, 'label'):
        not_monotone.append(var)

#print("categorical variables (<=5 distinct values) with a non-monotone bad rate: {}".format(not_monotone))

# 'M1FreqL3M', 'M2FreqL3M' and 'maxDelqL12M' are not monotone; some of their categories need to be merged
trainData.groupby(['M2FreqL3M'])['label'].mean()  #check monotonicity
trainData.groupby(['M2FreqL3M'])['label'].count()   #M2FreqL3M == 3 has only 3 samples in total, so it must be merged
 
 
 
# merge all M2FreqL3M >= 1 into one group, then compute WOE and IV
trainData['M2FreqL3M_Bin'] = trainData['M2FreqL3M'].apply(lambda x: int(x>=1))
trainData.groupby(['M2FreqL3M_Bin'])['label'].mean()
 
 
 
#Compute WOE values
def CalcWOE(df, col, target):
    '''
    :param df: dataframe containing the variable and the target
    :param col: the variable for which to compute WOE and IV; must already be binned, or be a categorical variable that needs no binning
    :param target: target variable; 0 = good, 1 = bad
    :return: the WOE dict and the IV
    '''
    total = df.groupby([col])[target].count()
    total = pd.DataFrame({'total': total})
    bad = df.groupby([col])[target].sum()
    bad = pd.DataFrame({'bad': bad})
    regroup = total.merge(bad, left_index=True, right_index=True, how='left')
    regroup.reset_index(level=0, inplace=True)
    N = sum(regroup['total'])
    B = sum(regroup['bad'])
    regroup['good'] = regroup['total'] - regroup['bad']
    G = N - B
    regroup['bad_pcnt'] = regroup['bad'].map(lambda x: x*1.0/B)
    regroup['good_pcnt'] = regroup['good'].map(lambda x: x * 1.0 / G)
    regroup['WOE'] = regroup.apply(lambda x: np.log(x.good_pcnt*1.0/x.bad_pcnt),axis = 1)
    WOE_dict = regroup[[col,'WOE']].set_index(col).to_dict(orient='index')
    for k, v in WOE_dict.items():
        WOE_dict[k] = v['WOE']
    IV = regroup.apply(lambda x: (x.good_pcnt-x.bad_pcnt)*np.log(x.good_pcnt*1.0/x.bad_pcnt),axis = 1)
    IV = sum(IV)
    return {"WOE": WOE_dict, 'IV':IV}
WOE_IV_dict['M2FreqL3M_Bin'] = CalcWOE(trainData, 'M2FreqL3M_Bin', 'label')
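
# Worked example (a sketch added for this write-up; the counts are made up):
# with 100 goods and 100 bads overall, a bin holding 40 goods and 10 bads gets
# WOE = ln(0.4/0.1) ≈ 1.386 (under this convention, positive WOE = better than average),
# and the complementary bin (60 goods, 90 bads) gets WOE = ln(0.6/0.9) ≈ -0.405.
# IV sums (good_pcnt - bad_pcnt) * WOE over the bins: 0.3*1.386 + (-0.3)*(-0.405) ≈ 0.537.
toyWoe = pd.DataFrame({'bin': [0]*50 + [1]*150,
                       'label': [1]*10 + [0]*40 + [1]*90 + [0]*60})
print(CalcWOE(toyWoe, 'bin', 'label'))   # WOE ≈ {0: 1.386, 1: -0.405}, IV ≈ 0.537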
 
trainData.groupby(['M1FreqL3M'])['label'].mean()  #check monotonicity
trainData.groupby(['M1FreqL3M'])['label'].count()
 
# Except for M1FreqL3M = 3, the bad rate is monotone across the groups.
# Also, M1FreqL3M = 0 accounts for a very large share, so merge all M1FreqL3M >= 1 into one group
trainData['M1FreqL3M_Bin'] = trainData['M1FreqL3M'].apply(lambda x: int(x>=1))
trainData.groupby(['M1FreqL3M_Bin'])['label'].mean()
WOE_IV_dict['M1FreqL3M_Bin'] = CalcWOE(trainData, 'M1FreqL3M_Bin', 'label')
 
'''
For the other monotone categorical variables, check whether any bin holds less than 5%
of the samples. If so, merge bins for that variable.
'''
small_bin_var = []
large_bin_var = []
N = trainData.shape[0]
for var in categoricalFeatures:
    if var not in not_monotone:
        total = trainData.groupby([var])[var].count()
        pcnt = total * 1.0 / N
        if min(pcnt)<0.05:
            small_bin_var.append({var:pcnt.to_dict()})
        else:
            large_bin_var.append(var)
 
 
#For M2FreqL1M, M2FreqL6M and M2FreqL12M, a single bin dominates (about 99% of samples look the same), so these variables have no discriminating power and are dropped
allFeatures.remove('M2FreqL1M')
allFeatures.remove('M2FreqL6M')
allFeatures.remove('M2FreqL12M')
 
def MergeByCondition(x,condition_list):
    #condition_list is an ordered list of condition strings; return the index of the first condition that x satisfies
    s = 0
    for condition in condition_list:
        if eval(str(x)+condition):
            return s
        else:
            s+=1
    return s
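
# Quick illustration (a sketch added for this write-up): conditions are tried in order
# and the index of the first one that x satisfies is returned.
print(MergeByCondition(0, ['==0','==1','>=2']))   # 0
print(MergeByCondition(5, ['==0','==1','>=2']))   # 2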
 
#For the remaining variables in small_bin_var, merge the smallest bin into its neighbour and compute WOE
trainData['maxDelqL1M_Bin'] = trainData['maxDelqL1M'].apply(lambda x: MergeByCondition(x,['==0','==1','>=2']))
trainData['maxDelqL3M_Bin'] = trainData['maxDelqL3M'].apply(lambda x: MergeByCondition(x,['==0','==1','>=2']))
trainData['maxDelqL6M_Bin'] = trainData['maxDelqL6M'].apply(lambda x: MergeByCondition(x,['==0','==1','>=2']))
for var in ['maxDelqL1M_Bin','maxDelqL3M_Bin','maxDelqL6M_Bin']:
    WOE_IV_dict[var] = CalcWOE(trainData, var, 'label')
 
 
'''
For features that need no merging and whose original bins already have a monotone
bad rate, compute WOE and IV directly
'''
for var in large_bin_var:
    WOE_IV_dict[var] = CalcWOE(trainData, var, 'label')
 
 
def AssignBin(x, cutOffPoints,special_attribute=[]):
    '''
    :param x: a value of some variable
    :param cutOffPoints: the binning result for that variable, expressed as cut-off points
    :param special_attribute: special values that do not take part in binning
    :return: the bin that the value falls into, counted from 0
    for example, if cutOffPoints = [10,20,30], then x = 7 returns Bin 0 and x = 35 returns Bin 3
    '''
    numBin = len(cutOffPoints) + 1 + len(special_attribute)
    if x in special_attribute:
        i = special_attribute.index(x)+1
        return 'Bin {}'.format(0-i)
    if x<=cutOffPoints[0]:
        return 'Bin 0'
    elif x > cutOffPoints[-1]:
        return 'Bin {}'.format(numBin-1)
    else:
        for i in range(0,numBin-1):
            if cutOffPoints[i] < x <=  cutOffPoints[i+1]:
                return 'Bin {}'.format(i+1)
 
 
 
def AssignGroup(x, bin):
    '''
    :param x: a value of some variable
    :param bin: the split points for that variable
    :return: the group value that x maps to under those split points
    '''
    N = len(bin)
    if x<=min(bin):
        return min(bin)
    elif x>max(bin):
        return 10e10
    else:
        for i in range(N-1):
            if bin[i] < x <= bin[i+1]:
                return bin[i+1]
 
 
def SplitData(df, col, numOfSplit, special_attribute=[]):
    '''
    :param df: the dataset (sorted internally by col)
    :param col: the variable to be split
    :param numOfSplit: the number of groups to split into
    :param special_attribute: special values to exclude when splitting
    :return: the split points that coarsen the fine-grained values of col, which makes the later bin-merging easier
    '''
    df2 = df.copy()
    if special_attribute != []:
        df2 = df.loc[~df[col].isin(special_attribute)]
    N = df2.shape[0]  #number of rows
    #'/' is float division and returns a float; '//' is integer division
    n = N//numOfSplit  #samples per group
    splitPointIndex = [i*n for i in range(1,numOfSplit)]  #indices of the split points
    '''
    e.g. [i*2 for i in range(1,100)] gives [2, 4, 6, 8, 10, ..., 198]
    '''
    rawValues = sorted(list(df2[col]))  #sort the values
    #coarse split points used later by the chi-square merge
    splitPoint = [rawValues[i] for i in splitPointIndex]  #values at the split points
    splitPoint = sorted(list(set(splitPoint)))
    return splitPoint
 
 
#Function to compute the chi-square statistic
def Chi2(df, total_col, bad_col, overallRate):
    '''
    :param df: dataframe containing each group's total count and bad count
    :param total_col: column with the total number of samples
    :param bad_col: column with the number of bad samples
    :param overallRate: the overall bad rate
    :return: the chi-square statistic
    '''
    df2 = df.copy()
    # expected bad count = total count * overall bad rate
    df2['expected'] = df[total_col].apply(lambda x: x*overallRate)
    combined = zip(df2['expected'], df2[bad_col])
    chi = [(i[0]-i[1])**2/i[0] for i in combined]
    chi2 = sum(chi)
    return chi2
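
# Worked example (a sketch added for this write-up): two groups of 50 samples with
# 10 and 20 bads, under an overall bad rate of 0.3, both expect 15 bads, so
# chi2 = (15-10)**2/15 + (15-20)**2/15 = 50/15 ≈ 3.33.
toyChi = pd.DataFrame({'total': [50, 50], 'bad': [10, 20]})
print(Chi2(toyChi, 'total', 'bad', 0.3))   # ≈ 3.33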
 
 
##ChiMerge: bin a continuous variable by merging on chi-square values, up to a given maximum number of intervals
def ChiMerge(df, col, target, max_interval=5,special_attribute=[],minBinPcnt=0):
    '''
    :param df: dataframe containing the target variable and the attribute to bin
    :param col: the attribute to be binned
    :param target: target variable, taking values 0 or 1
    :param max_interval: maximum number of bins; if the attribute has no more distinct values than this, the merge is skipped
    :param special_attribute: attribute values excluded from binning, e.g. codes for missing values
    :param minBinPcnt: minimum share of samples per bin; defaults to 0
    :return: the binning result (a list of cut-off points)
    '''
    colLevels = sorted(list(set(df[col])))
    N_distinct = len(colLevels)  #number of distinct values
    if N_distinct <= max_interval:  #if the attribute has no more distinct values than max_interval, skip the merge
        print ("The attribute {} has no more distinct values than max_interval".format(col))
        #use the distinct values themselves as cut-off points; the largest value is dropped,
        #since AssignBin puts everything above the last cut-off into the top bin
        return colLevels[:-1]
    else:
        if len(special_attribute)>=1:
            print('{} has special (e.g. missing) values'.format(col))
            #drop the rows whose value of col is a special value
            df2 = df.loc[~df[col].isin(special_attribute)]
        else:
            df2 = df.copy()
        N_distinct = len(list(set(df2[col])))  #distinct values of this feature
 
        # Step 1: group the dataset by col and compute each group's total and bad counts
        if N_distinct > 100:
            '''
            example split_x: [2, 8, 9.3, 10, 30, ..., 1800]
            '''
            split_x = SplitData(df2, col, 100)
            #map each value onto its group's split-point value
            df2['temp'] = df2[col].map(lambda x: AssignGroup(x, split_x))
        else:
            #with at most 100 distinct values, no coarsening is needed
            df2['temp'] = df2[col]
        # the overall bad rate will be used to compute the expected bad counts
        (binBadRate, regroup, overallRate) = BinBadRate(df2, 'temp', target, grantRateIndicator=1)

        # initially, each distinct value forms its own group;
        # sort the values, then merge adjacent groups pairwise
        colLevels = sorted(list(set(df2['temp'])))
        groupIntervals = [[i] for i in colLevels]
 
        # Step 2: loop, repeatedly merging the best adjacent pair of groups, until:
        # 1. the number of bins is <= the preset maximum
        # 2. each bin's share of samples is not below the preset minimum (optional)
        # 3. each bin contains both good and bad samples
        # with special attributes, the final number of bins = preset maximum - number of special attributes
        split_intervals = max_interval - len(special_attribute)
        while (len(groupIntervals) > split_intervals):  # stop when the current number of bins equals the preset number
            # in each iteration, compute the chi-square of merging each pair of adjacent groups;
            # the merge with the smallest chi-square is the best one
            chisqList = []
            for k in range(len(groupIntervals)-1):
                temp_group = groupIntervals[k] + groupIntervals[k+1]
                df2b = regroup.loc[regroup['temp'].isin(temp_group)]
                chisq = Chi2(df2b, 'total', 'bad', overallRate)
                chisqList.append(chisq)
            best_combined = chisqList.index(min(chisqList))
            groupIntervals[best_combined] = groupIntervals[best_combined] + groupIntervals[best_combined+1]
            # after combining two intervals, we need to remove one of them
            groupIntervals.remove(groupIntervals[best_combined+1])
        groupIntervals = [sorted(i) for i in groupIntervals]
        cutOffPoints = [max(i) for i in groupIntervals[:-1]]
 
        # check whether any bin has no good or no bad samples; if so, merge it with a neighbouring bin until every bin contains both
        groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints))
        #chi-square binning is done, but monotonicity has not yet been considered
        df2['temp_Bin'] = groupedvalues
        (binBadRate,regroup) = BinBadRate(df2, 'temp_Bin', target)
        [minBadRate, maxBadRate] = [min(binBadRate.values()),max(binBadRate.values())]
        while minBadRate ==0 or maxBadRate == 1:
            # find the bins that are entirely good / entirely bad
            indexForBad01 = regroup[regroup['bad_rate'].isin([0,1])].temp_Bin.tolist()
            bin=indexForBad01[0]
            # if it is the last bin, merge it with the previous one, i.e. drop the last cut-off point
            if bin == max(regroup.temp_Bin):
                cutOffPoints = cutOffPoints[:-1]
            # if it is the first bin, merge it with the next one, i.e. drop the first cut-off point
            elif bin == min(regroup.temp_Bin):
                cutOffPoints = cutOffPoints[1:]
            # if it is a middle bin, merge it with whichever neighbour gives the smaller chi-square
            else:
                # merge with the previous bin and compute the chi-square
                currentIndex = list(regroup.temp_Bin).index(bin)
                prevIndex = list(regroup.temp_Bin)[currentIndex - 1]
                df3 = df2.loc[df2['temp_Bin'].isin([prevIndex, bin])]
                (binBadRate, df2b) = BinBadRate(df3, 'temp_Bin', target)
                chisq1 = Chi2(df2b, 'total', 'bad', overallRate)
                # merge with the next bin and compute the chi-square
                laterIndex = list(regroup.temp_Bin)[currentIndex + 1]
                df3b = df2.loc[df2['temp_Bin'].isin([laterIndex, bin])]
                (binBadRate, df2b) = BinBadRate(df3b, 'temp_Bin', target)
                chisq2 = Chi2(df2b, 'total', 'bad', overallRate)
                if chisq1 < chisq2:
                    cutOffPoints.remove(cutOffPoints[currentIndex - 1])
                else:
                    cutOffPoints.remove(cutOffPoints[currentIndex])
            # after the merge, recheck whether every bin under the new cut-offs contains both good and bad samples
            groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints))
            df2['temp_Bin'] = groupedvalues
            (binBadRate, regroup) = BinBadRate(df2, 'temp_Bin', target)
            [minBadRate, maxBadRate] = [min(binBadRate.values()), max(binBadRate.values())]
 
        # check the minimum bin share after binning
        if minBinPcnt > 0:
            groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints))
            df2['temp_Bin'] = groupedvalues
            #value_counts: how many samples fall into each bin
            valueCounts = groupedvalues.value_counts().to_frame()
            N=sum(valueCounts['temp'])
            valueCounts['pcnt'] = valueCounts['temp'].apply(lambda x: x * 1.0 / N)
            valueCounts = valueCounts.sort_index()
            minPcnt = min(valueCounts['pcnt'])
            #only merge while more than 2 cut-off points remain, otherwise no further merging is possible
            while minPcnt < minBinPcnt and len(cutOffPoints) > 2:
                # find the bin with the smallest share
                indexForMinPcnt = valueCounts[valueCounts['pcnt'] == minPcnt].index.tolist()[0]
                # if the smallest bin is the last bin, merge it with the previous one, i.e. drop the last cut-off point
                if indexForMinPcnt == max(valueCounts.index):
                    cutOffPoints = cutOffPoints[:-1]
                # if the smallest bin is the first bin, merge it with the next one, i.e. drop the first cut-off point
                elif indexForMinPcnt == min(valueCounts.index):
                    cutOffPoints = cutOffPoints[1:]
                # if the smallest bin is a middle bin, merge it with whichever neighbour gives the smaller chi-square
                else:
                    # merge with the previous bin and compute the chi-square
                    currentIndex = list(valueCounts.index).index(indexForMinPcnt)
                    prevIndex = list(valueCounts.index)[currentIndex - 1]
                    df3 = df2.loc[df2['temp_Bin'].isin([prevIndex, indexForMinPcnt])]
                    (binBadRate, df2b) = BinBadRate(df3, 'temp_Bin', target)
                    chisq1 = Chi2(df2b, 'total', 'bad', overallRate)
                    # merge with the next bin and compute the chi-square
                    laterIndex = list(valueCounts.index)[currentIndex + 1]
                    df3b = df2.loc[df2['temp_Bin'].isin([laterIndex, indexForMinPcnt])]
                    (binBadRate, df2b) = BinBadRate(df3b, 'temp_Bin', target)
                    chisq2 = Chi2(df2b, 'total', 'bad', overallRate)
                    if chisq1 < chisq2:
                        cutOffPoints.remove(cutOffPoints[currentIndex - 1])
                    else:
                        cutOffPoints.remove(cutOffPoints[currentIndex])
                # recompute the bin shares under the new cut-off points; without this the loop never terminates
                groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints))
                df2['temp_Bin'] = groupedvalues
                valueCounts = groupedvalues.value_counts().to_frame()
                valueCounts['pcnt'] = valueCounts['temp'].apply(lambda x: x * 1.0 / sum(valueCounts['temp']))
                valueCounts = valueCounts.sort_index()
                minPcnt = min(valueCounts['pcnt'])
        cutOffPoints = special_attribute + cutOffPoints
        return cutOffPoints
 
 
'''
Numerical variables must be binned first, then WOE and IV are computed.
The binning result must satisfy:
1. no more than 5 bins
2. a monotone bad rate
3. every bin holds at least 5% of the samples
'''
bin_dict = []
for var in numericalFeatures:
    binNum = 5
    newBin = var + '_Bin'
    bin = ChiMerge(trainData, var, 'label',max_interval=binNum,minBinPcnt = 0.05)
    trainData[newBin] = trainData[var].apply(lambda x: AssignBin(x,bin))
    # if the bad rate is not monotone, reduce the number of bins
    while not BadRateMonotone(trainData, newBin, 'label'):
        binNum -= 1
        bin = ChiMerge(trainData, var, 'label', max_interval=binNum, minBinPcnt=0.05)
        trainData[newBin] = trainData[var].apply(lambda x: AssignBin(x, bin))
    WOE_IV_dict[newBin] = CalcWOE(trainData, newBin, 'label')
    bin_dict.append({var:bin})
 
 
 
##############################
#   3. Univariate and multivariate analysis   #
##############################
#  keep the variables with an IV of at least 0.02
high_IV = [(k,v['IV']) for k,v in WOE_IV_dict.items() if v['IV'] >= 0.02]
high_IV_sorted = sorted(high_IV, key=lambda k: k[1],reverse=True)
IV_values = [i[1] for i in high_IV_sorted]
IV_name = [i[0] for i in high_IV_sorted]
plt.title('High feature IV')
plt.bar(range(len(IV_values)),IV_values)
 
 
for (var,iv) in high_IV:
    newVar = var+"_WOE"
    trainData[newVar] = trainData[var].map(lambda x: WOE_IV_dict[var]['WOE'][x])
 
 
saveFile = open(folderOfData+'/trainData.pkl','wb+')
pickle.dump(trainData,saveFile)
saveFile.close()
 
 
saveFile = open(folderOfData+'/trainData.pkl','rb+')
trainData = pickle.load(saveFile)
saveFile.close()
 
 
 
'''
Multivariate analysis: pairwise linear correlation. If the absolute correlation of a pair exceeds the threshold, drop the one with the lower IV
'''
deleted_index = []
cnt_vars = len(high_IV_sorted)
for i in range(cnt_vars):
    if i in deleted_index:
        continue
    x1 = high_IV_sorted[i][0]+"_WOE"
    for j in range(cnt_vars):
        if i == j or j in deleted_index:
            continue
        y1 = high_IV_sorted[j][0]+"_WOE"
        roh = np.corrcoef(trainData[x1],trainData[y1])[0,1]
        if abs(roh)>0.7:
            x1_IV = high_IV_sorted[i][1]
            y1_IV = high_IV_sorted[j][1]
            if x1_IV > y1_IV:
                deleted_index.append(j)
            else:
                deleted_index.append(i)
 
single_analysis_vars = [high_IV_sorted[i][0]+"_WOE" for i in range(cnt_vars) if i not in deleted_index]
 
 
X = trainData[single_analysis_vars]
f, ax = plt.subplots(figsize=(10, 8))
corr = X.corr()
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),square=True, ax=ax)  # np.bool is deprecated; use the builtin bool
 
 
 
'''
Multivariate analysis: VIF
'''
X = np.matrix(trainData[single_analysis_vars])
VIF_list = [variance_inflation_factor(X, i) for i in range(X.shape[1])]
print(max(VIF_list))
# the largest VIF is 3.429, below 10, so we conclude there is no serious multicollinearity at this step
multi_analysis = single_analysis_vars
 
 
################################
#   4. Fit a logistic regression to predict default   #
################################
X = trainData[multi_analysis]
#intercept term
X['intercept'] = [1] * X.shape[0]
y = trainData['label']
logit = sm.Logit(y, X)
logit_result = logit.fit()
pvalues = logit_result.pvalues
params = logit_result.params
fit_result = pd.concat([params,pvalues],axis=1)
fit_result.columns = ['coef','p-value']
fit_result = fit_result.sort_values(by = 'coef')
 
'''
                               coef        p-value
intercept                 -1.812690   0.000000e+00
increaseUrateL6M_Bin_WOE  -1.220508   2.620858e-62
maxDelqL3M_Bin_WOE        -0.735785  3.600473e-163
M2FreqL3M_Bin_WOE         -0.681009   1.284840e-63
avgUrateL1M_Bin_WOE       -0.548608   2.350785e-07
avgUrateL3M_Bin_WOE       -0.467298   8.870679e-05
M0FreqL3M_WOE             -0.392261   2.386403e-26
avgUrateL6M_Bin_WOE       -0.309831   3.028939e-02
increaseUrateL3M_WOE      -0.300805   1.713878e-03
maxUrateL3M_Bin_WOE       -0.213742   1.412028e-01
avgPayL6M_Bin_WOE         -0.208924   4.241600e-07
maxDelqL1M_Bin_WOE        -0.162785   1.835990e-07
M1FreqL12M_Bin_WOE        -0.125595   2.576692e-03
M1FreqL6M_Bin_WOE         -0.067979   8.572653e-02
maxPayL6M_Bin_WOE         -0.063942   3.807461e-01
maxUrateL6M_Bin_WOE       -0.056266   7.120434e-01
avgPayL12M_Bin_WOE        -0.039538   4.487068e-01
maxPayL12M_Bin_WOE         0.030780   8.135143e-01
M0FreqL12M_Bin_WOE         0.077365   1.826047e-01
minPayL6M_Bin_WOE          0.107868   3.441998e-01
increaseUrateL12M_Bin_WOE  0.115845   4.292397e-01
M0FreqL6M_Bin_WOE          0.145630   1.869349e-03
minPayL3M_Bin_WOE          0.151294   4.293344e-02
avgPayL1M_Bin_WOE          0.260946   6.606818e-04
The following variables have positive coefficients and need to be checked individually:
maxPayL12M_Bin_WOE         0.030780   8.135143e-01
M0FreqL12M_Bin_WOE         0.077365   1.826047e-01
minPayL6M_Bin_WOE          0.107868   3.441998e-01
increaseUrateL12M_Bin_WOE  0.115845   4.292397e-01
M0FreqL6M_Bin_WOE          0.145630   1.869349e-03
minPayL3M_Bin_WOE          0.151294   4.293344e-02
avgPayL1M_Bin_WOE          0.260946   6.606818e-04
'''
 
sm.Logit(y, trainData['maxPayL12M_Bin_WOE']).fit().params  # -0.980206
sm.Logit(y, trainData['M0FreqL12M_Bin_WOE']).fit().params  # -1.050918
sm.Logit(y, trainData['minPayL6M_Bin_WOE']).fit().params  # -0.812302
sm.Logit(y, trainData['increaseUrateL12M_Bin_WOE']).fit().params  #  -0.914707
sm.Logit(y, trainData['M0FreqL6M_Bin_WOE']).fit().params  # -1.065785
sm.Logit(y, trainData['minPayL3M_Bin_WOE']).fit().params  #  -0.819148
sm.Logit(y, trainData['avgPayL1M_Bin_WOE']).fit().params  #  -1.007179
 
# In single-variable regressions every coefficient is negative, as expected,
# so multicollinearity is still present in the full model.
# Next, rank variable importance with GBDT and pick a suitable subset of variables
clf = ensemble.GradientBoostingClassifier()
gbdt_model = clf.fit(X, y)
importance = gbdt_model.feature_importances_.tolist()
featureImportance = zip(multi_analysis,importance)
featureImportanceSorted = sorted(featureImportance, key=lambda k: k[1],reverse=True)
 
 
 
# start by assuming the model can hold 5 features, then add features one at a time, until a coefficient turns positive or a p-value exceeds 0.1
n = 5
featureSelected = [i[0] for i in featureImportanceSorted[:n]]
X_train = X[featureSelected+['intercept']]
logit = sm.Logit(y, X_train)
logit_result = logit.fit()
pvalues = logit_result.pvalues
params = logit_result.params
fit_result = pd.concat([params,pvalues],axis=1)
fit_result.columns = ['coef','p-value']
 
while(n<len(featureImportanceSorted)):
    nextVar = featureImportanceSorted[n][0]
    featureSelected = featureSelected + [nextVar]
    X_train = X[featureSelected+['intercept']]
    logit = sm.Logit(y, X_train)
    logit_result = logit.fit()
    params = logit_result.params
    #print("current var is ",nextVar,'   ', params[nextVar])
    if max(params) < 0:
        n += 1
    else:
        featureSelected.remove(nextVar)
        n += 1
 
X_train = X[featureSelected+['intercept']]
logit = sm.Logit(y, X_train)
logit_result = logit.fit()
pvalues = logit_result.pvalues
params = logit_result.params
fit_result = pd.concat([params,pvalues],axis=1)
fit_result.columns = ['coef','p-value']
fit_result = fit_result.sort_values(by  = 'p-value')
'''
                              coef        p-value
intercept                -1.809479   0.000000e+00
maxDelqL3M_Bin_WOE       -0.762903  2.603323e-192
increaseUrateL6M_Bin_WOE -1.194299   4.259502e-68
M2FreqL3M_Bin_WOE        -0.684674   1.067350e-64
M0FreqL3M_WOE            -0.266852   6.912786e-18
avgPayL6M_Bin_WOE        -0.191338   5.979102e-08
avgUrateL1M_Bin_WOE      -0.555628   1.473557e-07
maxDelqL1M_Bin_WOE       -0.129355   1.536173e-06
avgUrateL3M_Bin_WOE      -0.453340   1.364483e-04
increaseUrateL3M_WOE     -0.281940   3.123852e-03
M1FreqL12M_Bin_WOE       -0.104303   5.702452e-03
avgUrateL6M_Bin_WOE      -0.280308   4.784200e-02
maxUrateL3M_Bin_WOE      -0.221817   1.254597e-01
M1FreqL6M_Bin_WOE        -0.024903   5.002232e-01
maxUrateL6M_Bin_WOE      -0.060720   6.897626e-01
the p-values of maxPayL6M_Bin_WOE, maxUrateL6M_Bin_WOE, avgUrateL6M_Bin_WOE, avgPayL12M_Bin_WOE, increaseUrateL12M_Bin_WOE and maxPayL12M_Bin_WOE are above 0.1;
check their significance individually
'''
 
largePValueVars = pvalues[pvalues>0.1].index
for var in largePValueVars:
    X_temp = X[[var, 'intercept']]
    logit = sm.Logit(y, X_temp)
    logit_result = logit.fit()
    pvalues = logit_result.pvalues
    print("The p-value of {0} is {1} ".format(var, str(pvalues[var])))
'''
The p-value of maxPayL6M_Bin_WOE is 3.94466107162e-137 
The p-value of maxUrateL6M_Bin_WOE is 5.83590695685e-35 
The p-value of avgUrateL6M_Bin_WOE is 8.17633724544e-37 
The p-value of avgPayL12M_Bin_WOE is 1.10614470149e-295 
The p-value of increaseUrateL12M_Bin_WOE is 1.9777915301e-57 
The p-value of maxPayL12M_Bin_WOE is 1.04348079207e-45 
Clearly, each variable is significant on its own, so multicollinearity still exists.
'''
'''
Apply an L1 penalty, scanning the penalty strength until all remaining variables are significant
'''
 
 
X2 = X[featureSelected+['intercept']]
for alpha in range(100,0,-1):
    l1_logit = sm.Logit.fit_regularized(sm.Logit(y, X2), start_params=None, method='l1', alpha=alpha)
    pvalues = l1_logit.pvalues
    params = l1_logit.params
    if max(pvalues)>=0.1 or max(params)>0:
        break
 
bestAlpha = alpha + 1  # the last penalty strength at which all coefficients were negative and significant
l1_logit = sm.Logit.fit_regularized(sm.Logit(y, X2), start_params=None, method='l1', alpha=bestAlpha)
params = l1_logit.params
params2 = params.to_dict()
featuresInModel = [k for k, v in params2.items() if k!='intercept' and v < -0.0000001]
 
 
X_train = X[featuresInModel + ['intercept']]
logit = sm.Logit(y, X_train)
logit_result = logit.fit()
trainData['pred'] = logit_result.predict(X_train)
 
### Compute the KS statistic
def KS(df, score, target):
    '''
    :param df: dataframe containing the target and the predicted value
    :param score: column name of the score or probability, str
    :param target: column name of the target variable, str
    :return: the KS statistic
    '''
    total = df.groupby([score])[target].count()
    bad = df.groupby([score])[target].sum()
    all = pd.DataFrame({'total':total, 'bad':bad})
    all['good'] = all['total'] - all['bad']
    all[score] = all.index
    all = all.sort_values(by=score,ascending=False)
    all.index = range(len(all))
    all['badCumRate'] = all['bad'].cumsum() / all['bad'].sum()
    all['goodCumRate'] = all['good'].cumsum() / all['good'].sum()
    KS = all.apply(lambda x: x.badCumRate - x.goodCumRate, axis=1)
    return max(KS)
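
# Cross-check (a sketch added for this write-up, using sklearn, which is imported above):
# KS is the maximum gap between the cumulative bad and good capture curves,
# which is exactly max(TPR - FPR) along the ROC curve.
from sklearn.metrics import roc_curve
fpr, tpr, _ = roc_curve(trainData['label'], trainData['pred'])
print(max(tpr - fpr))   # should closely match KS(trainData, 'pred', 'label')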
 
 
 
###################################
#   5. Evaluate the logistic regression on the test set   #
###################################
# prepare the WOE-encoded variables
modelFeatures = [i.replace('_Bin','').replace('_WOE','') for i in featuresInModel]
'''
['maxDelqL3M',
 'increaseUrateL6M',
 'M0FreqL3M',
 'avgUrateL1M',
 'M2FreqL3M',
 'M1FreqL6M',
 'avgUrateL3M',
 'maxDelqL1M',
 'avgPayL6M',
 'M1FreqL12M']
'''
numFeatures = [i for i in modelFeatures if i in numericalFeatures]
charFeatures = [i for i in modelFeatures if i in categoricalFeatures]
 
#derive on the test set the raw variables that the model needs
testData['maxDelqL1M'] = testData.apply(lambda x: DelqFeatures(x,1,'max delq'),axis=1)
testData['maxDelqL3M'] = testData.apply(lambda x: DelqFeatures(x,3,'max delq'),axis=1)
# testData['M2FreqL3M'] = testData.apply(lambda x: DelqFeatures(x, 3, 'M2 times'), axis=1)
testData['M0FreqL3M'] = testData.apply(lambda x: DelqFeatures(x,3,'M0 times'),axis=1)
testData['M1FreqL6M'] = testData.apply(lambda x: DelqFeatures(x, 6, 'M1 times'), axis=1)
testData['M2FreqL3M'] = testData.apply(lambda x: DelqFeatures(x, 3, 'M2 times'), axis=1)
testData['M1FreqL12M'] = testData.apply(lambda x: DelqFeatures(x, 12, 'M1 times'), axis=1)
# testData['maxUrateL6M'] = testData.apply(lambda x: UrateFeatures(x,6,'max utilization rate'),axis = 1)
testData['avgUrateL1M'] = testData.apply(lambda x: UrateFeatures(x,1, 'mean utilization rate'),axis=1)
testData['avgUrateL3M'] = testData.apply(lambda x: UrateFeatures(x,3, 'mean utilization rate'),axis=1)
# testData['avgUrateL6M'] = testData.apply(lambda x: UrateFeatures(x,6, 'mean utilization rate'),axis=1)
testData['increaseUrateL6M'] = testData.apply(lambda x: UrateFeatures(x, 6, 'increase utilization rate'),axis=1)
# testData['avgPayL3M'] = testData.apply(lambda x: PaymentFeatures(x, 3, 'mean payment ratio'),axis=1)
testData['avgPayL6M'] = testData.apply(lambda x: PaymentFeatures(x, 6, 'mean payment ratio'),axis=1)
 
#apply the same bin merges as on the training set
testData['M2FreqL3M_Bin'] = testData['M2FreqL3M'].apply(lambda x: int(x>=1))
testData['maxDelqL1M_Bin'] = testData['maxDelqL1M'].apply(lambda x: MergeByCondition(x,['==0','==1','>=2']))
testData['maxDelqL3M_Bin'] = testData['maxDelqL3M'].apply(lambda x: MergeByCondition(x,['==0','==1','>=2']))
 
for var in numFeatures:
    newBin = var+"_Bin"
    bin = [list(i.values()) for i in bin_dict if var in i][0][0]
    testData[newBin] = testData[var].apply(lambda x: AssignBin(x,bin))
 
finalFeatures = [i+'_Bin' for i in numFeatures] + ['M2FreqL3M_Bin','maxDelqL1M_Bin','maxDelqL3M_Bin','M0FreqL3M']
 
for var in finalFeatures:
    var2 = var+"_WOE"
    testData[var2] = testData[var].apply(lambda x: WOE_IV_dict[var]['WOE'][x])
 
X_test = testData[featuresInModel]
X_test['intercept'] = [1]*X_test.shape[0]
testData['pred'] = logit_result.predict(X_test)
 
 
ks = KS(testData, 'pred', 'label')
auc = roc_auc_score(testData['label'],testData['pred'])
# KS = 64.94% and AUC = 84.43%; the KS is well above the common 30% acceptance threshold, so the model is usable
 
##########################
#   6. Compute scores on the test set   #
##########################
def Prob2Score(prob, basePoint, PDO):
    #convert a (bad) probability into a positive integer score
    y = np.log(prob/(1-prob))
    return int(basePoint+PDO/np.log(2)*(-y))
 
BasePoint, PDO = 500,50
testData['score'] = testData['pred'].apply(lambda x: Prob2Score(x, BasePoint, PDO))
plt.hist(testData['score'],bins=100)
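
# Worked example (a sketch added for this write-up): with basePoint=500 and PDO=50,
# p = 0.5 means log-odds 0, so the score is exactly the base point, 500;
# p = 0.2 means log-odds ln(0.25) = -2*ln(2), so the score is 500 + 2*PDO = 600.
# Every halving of the bad:good odds adds PDO = 50 points.
print(Prob2Score(0.5, BasePoint, PDO), Prob2Score(0.2, BasePoint, PDO))   # 500 600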

  
