變量的KS值
KS(Kolmogorov-Smirnov)用於評估模型的風險區分能力,該指標衡量的是好、壞樣本累計分布之間的差距。KS值越大,表示該變量對正、負客戶的區分能力越強。通常來說,KS>0.2即表示特征有較好的區分能力。強調一下,這里的KS值是變量的KS值,而不是模型的KS值(后面的模型評估里會重點講解模型的KS值)。
KS的計算方式:
- 計算每個評分區間的好壞賬戶數。
- 計算每個評分區間的累計好賬戶數占總好賬戶數的比率(good%)和累計壞賬戶數占總壞賬戶數的比率(bad%)。
- 計算每個評分區間累計好賬戶占比與累計壞賬戶占比之差的絕對值(|累計good% − 累計bad%|),然后對這些絕對值取最大值,即得到KS值:$KS = \max_i \left| \text{累計good\%}_i - \text{累計bad\%}_i \right|$。
Best-KS分箱
Best-KS分箱的算法執行過程是一個逐步拆分的過程:
- 將特征值進行從小到大的排序。
- 計算出KS最大的那個值,即為切點,記為D。然后把數據切分成兩部分。
- 重復步驟2,進行遞歸,對D左右兩側的數據進一步切割,直到分箱的箱體數達到我們的預設閾值即可。
Best-KS分箱的特點:
- 連續型變量:分箱后的KS值<=分箱前的KS值
- 分箱過程中,決定分箱后的KS值是某一個切點,而不是多個切點的共同作用。這個切點的位置是原始KS值最大的位置。
1.簡單版
# -*- coding: utf-8 -*-
"""Best-KS binning experiment (simple version)."""
import pandas as pd


def _zone_mask(values, lower, upper):
    """Return a boolean mask for rows whose value lies in (lower, upper]; None means unbounded."""
    mask = pd.Series(True, index=values.index)
    if lower is not None:
        mask &= values > lower
    if upper is not None:
        mask &= values <= upper
    return mask


def _ks_cut_point(frame, limit):
    """Return the best KS cut value of a two-column frame, or None when no cut is valid.

    Column 0 holds the feature, column 1 the 0/1 label.  Candidates are tried
    in descending order of KS gap; the first one that leaves at least
    ``limit`` rows on both sides wins.
    """
    values = frame.iloc[:, 0]
    labels = frame.iloc[:, 1]
    class_counts = labels.value_counts()
    good_total = class_counts.get(0, 0)
    bad_total = class_counts.get(1, 0)
    if good_total == 0 or bad_total == 0:
        # KS is undefined when one class is missing from the zone.
        return None
    table = pd.crosstab(values, labels)
    gap = ((table[1] / bad_total).cumsum() - (table[0] / good_total).cumsum()).abs()
    for candidate in gap.nlargest(len(gap)).index.tolist():
        left_rows = int((values <= candidate).sum())
        if left_rows >= limit and len(values) - left_rows >= limit:
            return candidate
    return None


def _widest_zone(values, cuts):
    """Return the (lower, upper] bounds of the zone that currently holds the most rows."""
    bounds = [None] + list(cuts) + [None]
    best_bounds, best_rows = (None, None), -1
    for lower, upper in zip(bounds[:-1], bounds[1:]):
        rows = int(_zone_mask(values, lower, upper).sum())
        if rows > best_rows:
            best_bounds, best_rows = (lower, upper), rows
    return best_bounds


def best_ks_box(data, var_name, box_num, target_name='是否違約', min_rows=None):
    """Bin ``var_name`` into at most ``box_num`` boxes with the Best-KS procedure.

    :param data: DataFrame containing the feature and the 0/1 label column.
    :param var_name: name of the feature column to bin.
    :param box_num: maximum number of bins to produce.
    :param target_name: name of the 0 (good) / 1 (bad) label column.
    :param min_rows: minimum rows allowed on each side of a cut;
        defaults to 5% of the data.
    :return: DataFrame indexed by ``'<min>-<max>'`` bin labels with the
        columns ``0`` (good count) and ``1`` (bad count).
    """
    data = data[[var_name, target_name]]
    values = data.iloc[:, 0]
    limit = data.shape[0] / 20 if min_rows is None else min_rows

    cuts = []
    subset = data
    for _ in range(box_num - 1):
        cut = _ks_cut_point(subset, limit)
        if cut is None:
            # The widest zone cannot be split any further; stop early
            # instead of recording a meaningless cut point.
            break
        cuts.append(cut)
        cuts.sort()
        # Measure zones on the real data (the original assumed 50000 rows).
        lower, upper = _widest_zone(values, cuts)
        subset = data[_zone_mask(values, lower, upper).to_numpy()]

    # Build the per-bin detail table; bins are left-open, right-closed and
    # the first bin is unbounded below so the minimum value is kept.
    bounds = [None] + cuts + [None]
    labels, rows = [], []
    for lower, upper in zip(bounds[:-1], bounds[1:]):
        part = data[_zone_mask(values, lower, upper).to_numpy()]
        if part.empty:
            continue
        counts = part.iloc[:, 1].value_counts()
        labels.append('{0}-{1}'.format(part.iloc[:, 0].min(), part.iloc[:, 0].max()))
        rows.append([int(counts.get(0, 0)), int(counts.get(1, 0))])
    return pd.DataFrame(rows, index=labels, columns=[0, 1])


if __name__ == '__main__':
    data = pd.read_excel('測試1.xlsx')
    var_name = '年齡'
    print(best_ks_box(data, var_name, 5))
2.復雜版
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
try:
    # The plotting stack is not used by the binning logic below, so a
    # missing installation must not prevent the module from importing.
    import matplotlib.pyplot as plt
    import seaborn as sns
except ImportError:
    plt = None
    sns = None
else:
    plt.style.use('fivethirtyeight')
#import missingno as msno
import warnings
import datetime
warnings.filterwarnings('ignore')
#%matplotlib inline
#from tqdm import tqdm
import re
import math
import time
import itertools
import random
import logging
from logging import Logger
from logging.handlers import TimedRotatingFileHandler
import os


####################################################### Core KS binning logic ##############################################
def init_logger(logger_name, logging_path):
    '''
    Create (once) and return a logger writing to rotating files and the console.

    :param logger_name: name of the logger; also the prefix of the log files
    :param logging_path: directory for the log files (created when missing)
    :return: the configured logger
    '''
    os.makedirs(logging_path, exist_ok=True)
    if logger_name not in Logger.manager.loggerDict:
        logger = logging.getLogger(logger_name)
        logger.setLevel(logging.DEBUG)
        datefmt = '%Y-%m-%d %H:%M:%S'
        format_str = '[%(asctime)s]: %(name)s %(filename)s[line:%(lineno)s] %(levelname)s %(message)s'
        formatter = logging.Formatter(format_str, datefmt)

        # INFO and above -> "<name>All.log", rotated daily, 7 backups.
        handler = TimedRotatingFileHandler(filename=logging_path + "/%sAll.log" % logger_name, when='D', backupCount=7)
        handler.setFormatter(formatter)
        handler.setLevel(logging.INFO)
        logger.addHandler(handler)

        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        console.setFormatter(formatter)
        logger.addHandler(console)

        # ERROR and above -> "<name>Error.log", rotated daily, 7 backups.
        handler = TimedRotatingFileHandler(filename=logging_path + "/%sError.log" % logger_name, when='D', backupCount=7)
        handler.setFormatter(formatter)
        handler.setLevel(logging.ERROR)
        logger.addHandler(handler)
    logger = logging.getLogger(logger_name)
    return logger


def _sort_by_factor(df, factor_name):
    '''
    Order rows by the factor column: non-numeric (str) values first, then numbers ascending.

    Comparing str with float raises TypeError on Python 3, so the two kinds
    are never compared with each other.
    '''
    values = list(df[factor_name])
    order = sorted(range(len(values)), key=lambda i: (not isinstance(values[i], str), values[i]))
    return df.iloc[order]


def get_max_ks(date_df, start, end, rate, factor_name, bad_name, good_name, total_name, total_all):
    '''
    Find the row index with the largest KS gap inside rows start..end.

    :param date_df: aggregated data, one row per factor value, indexed 0..n-1
    :param start: index of the first row of the zone (inclusive)
    :param end: index of the last row of the zone (inclusive)
    :param rate: minimum share of total_all required on each side of the cut
    :param factor_name: name of the factor column (unused here)
    :param bad_name: name of the bad-count column
    :param good_name: name of the good-count column
    :param total_name: name of the total-count column
    :param total_all: total number of samples
    :return: index of the cut row, or '' when no valid cut exists
    '''
    ks = ''
    bad = date_df.loc[start:end, bad_name]
    good = date_df.loc[start:end, good_name]
    bad_sum = bad.sum()
    good_sum = good.sum()
    if len(bad) == 0 or bad_sum == 0 or good_sum == 0:
        # KS is undefined for an empty or single-class zone.
        return ks
    bad_good_cum = list(abs(np.cumsum(bad / bad_sum) - np.cumsum(good / good_sum)))
    index_max = bad_good_cum.index(max(bad_good_cum))
    t = start + index_max
    len1 = date_df.loc[start:t, total_name].sum()
    len2 = date_df.loc[t + 1:end, total_name].sum()
    # Both sides of the cut must keep at least rate * total_all samples.
    if len1 >= rate * total_all and len2 >= rate * total_all:
        ks = t
    return ks


def cut_fun(x, date_df, types, rate, factor_name, bad_name, good_name, total_name, total_all):
    '''
    Try to split one zone at its best KS point.

    :param x: [first_index, last_index] of the zone; modified in place
    :param date_df: aggregated data
    :param types: 'upper' when x[0] itself belongs to the zone; any other
        value means the zone starts at x[0] + 1
    :param rate: minimum share of total_all required on each side of the cut
    :param factor_name: name of the factor column
    :param bad_name: name of the bad-count column
    :param good_name: name of the good-count column
    :param total_name: name of the total-count column
    :param total_all: total number of samples
    :return: x, with the cut index inserted in sorted position when a cut was found
    '''
    start = x[0] if types == 'upper' else x[0] + 1
    end = x[1]
    t = ''
    if start != end:
        t = get_max_ks(date_df, start, end, rate, factor_name, bad_name, good_name, total_name, total_all)
    if t != '':
        # '' means "no cut"; 0 is a legitimate cut index and must be kept.
        x.append(t)
        x.sort()
    return x


def cut_while_fun(t_list, date_df, rate, factor_name, bad_name, good_name, total_name, total_all):
    '''
    Split both halves of an already-cut zone.

    :param t_list: [start_index, cut_index, end_index]
    :param date_df: aggregated data
    :param rate: minimum share of total_all a half needs to be split again
    :param factor_name: name of the factor column
    :param bad_name: name of the bad-count column
    :param good_name: name of the good-count column
    :param total_name: name of the total-count column
    :param total_all: total number of samples
    :return: (t_up, t_down); each is [start, cut, end] when the half was
        split, a 2-element list when no cut was found, or [] when the half
        was too small to try
    '''
    if len(t_list) == 2:
        return [], []
    min_total = rate * date_df[total_name].sum()

    # NOTE(review): the left half is always treated as 'upper', i.e. its
    # first index is included even when that index is a previous cut that
    # belongs to the neighbouring bin. Kept as in the original; confirm.
    t_up = [t_list[0], t_list[1]]
    if t_list[1] - t_list[0] > 1 and date_df.loc[t_up[0]:t_up[1], total_name].sum() >= min_total:
        t_up = cut_fun(t_up, date_df, 'upper', rate, factor_name, bad_name, good_name, total_name, total_all)
    else:
        t_up = []

    t_down = [t_list[1], t_list[2]]
    if t_list[2] - t_list[1] > 1 and date_df.loc[t_down[0] + 1:t_down[1], total_name].sum() >= min_total:
        t_down = cut_fun(t_down, date_df, 'down', rate, factor_name, bad_name, good_name, total_name, total_all)
    else:
        t_down = []
    return t_up, t_down


def ks_auto(date_df, piece, rate, factor_name, bad_name, good_name, total_name, total_all):
    '''
    Collect candidate cut points by repeatedly splitting at the best KS point.

    :param date_df: aggregated data, indexed 0..n-1
    :param piece: requested number of bins
    :param rate: minimum share of samples on each side of a cut
    :param factor_name: name of the factor column
    :param bad_name: name of the bad-count column
    :param good_name: name of the good-count column
    :param total_name: name of the total-count column
    :param total_all: total number of samples
    :return: sorted list [0, cut_1, ..., cut_k, n-1]. The first and last
        entries are boundary sentinels; every entry in between is the last
        row index of a bin (0 appears twice when row 0 is itself a cut).
    '''
    last = len(date_df) - 1
    if len(date_df) <= 1:
        return sorted([0, last])
    max_bins = min(len(date_df), pow(2, piece - 1))

    first = cut_fun([0, last], date_df, 'upper', rate, factor_name, bad_name, good_name, total_name, total_all)
    cuts = set()
    if len(first) > 2:
        cuts.add(first[1])
    queue = [first]
    steps = 1
    pos = 0
    while pos < len(queue):
        zone = queue[pos]
        pos += 1
        if len(zone) <= 2:
            continue
        for sub in cut_while_fun(zone, date_df, rate, factor_name, bad_name, good_name, total_name, total_all):
            if len(sub) > 2:
                cuts.add(sub[1])
                queue.append(sub)
        steps += 1
        if len(cuts) + 1 > max_bins:
            break
        if steps >= piece:
            break
    # The sentinels are always kept: get_combine() strips exactly one entry
    # from each end, so removing the leading 0 (as the original did when the
    # first row was small) silently discarded the first real cut point.
    return [0] + sorted(cuts) + [last]


def get_combine(t_list, date_df, piece):
    '''
    Enumerate the candidate binnings for a given number of bins.

    :param t_list: output of ks_auto (sentinels at both ends)
    :param date_df: aggregated data
    :param piece: number of bins wanted
    :return: list of sorted boundary lists, each starting with -1 and ending
        with the last row index
    '''
    t1 = 0
    t2 = len(date_df) - 1
    list0 = t_list[1:len(t_list) - 1]
    if len(t_list) - 2 < piece:
        c = len(t_list) - 2
    else:
        c = piece - 1
    # Every way of choosing c cut points out of the candidates.
    return [sorted(x + (t1 - 1, t2)) for x in itertools.combinations(list0, c)]


def cal_iv(date_df, items, bad_name, good_name, total_name):
    '''
    Compute the IV of one binning; non-monotonic binnings score 0.

    :param date_df: aggregated data
    :param items: list of (first_index, last_index) pairs, one per bin
    :param bad_name: name of the bad-count column
    :param good_name: name of the good-count column
    :param total_name: name of the total-count column
    :return: the IV, or 0 when a bin has no good / no bad samples or when
        WOE and bad rate are not monotonic across the bins
    '''
    iv0 = 0
    bad0 = np.array([date_df.loc[lo:hi, bad_name].sum() for lo, hi in items])
    good0 = np.array([date_df.loc[lo:hi, good_name].sum() for lo, hi in items])
    if (bad0 == 0).any() or (good0 == 0).any():
        return iv0
    total0 = np.array([date_df.loc[lo:hi, total_name].sum() for lo, hi in items])
    bad_rate0 = bad0 * 1.0 / total0
    good_per0 = good0 * 1.0 / date_df[good_name].sum()
    bad_per0 = bad0 * 1.0 / date_df[bad_name].sum()
    woe0 = np.log(good_per0 / bad_per0)
    woe_step = np.diff(woe0)
    rate_step = np.diff(bad_rate0)
    rising = np.all(woe_step >= 0) and np.all(rate_step <= 0)
    falling = np.all(woe_step <= 0) and np.all(rate_step >= 0)
    if rising or falling:
        iv0 = float(np.sum(woe0 * (good_per0 - bad_per0)))
    return iv0


def choose_best_combine(date_df, combine, bad_name, good_name, total_name):
    '''
    Pick the binning with the largest IV.

    :param date_df: aggregated data
    :param combine: candidate binnings from get_combine
    :param bad_name: name of the bad-count column
    :param good_name: name of the good-count column
    :param total_name: name of the total-count column
    :return: list of (first_index, last_index) pairs such as
        [(0, 180), (181, 268), (269, 605)], or '' when no candidate is valid
    '''
    # Turn each boundary list into inclusive (first, last) index pairs.
    z = [list(zip([x + 1 for x in item[:-1]], item[1:])) for item in combine]
    iv_list = [cal_iv(date_df, x, bad_name, good_name, total_name) for x in z]
    if not iv_list:
        return ''
    iv_max = max(iv_list)
    if iv_max == 0:
        return ''
    return z[iv_list.index(iv_max)]


def verify_woe(x):
    '''
    Return x when it is a finite number, otherwise 0.

    Negative values are finite and are returned unchanged.
    '''
    try:
        finite = math.isfinite(float(x))
    except (TypeError, ValueError):
        return 0
    return x if finite else 0


def best_df(date_df, items, na_df, rate, factor_name, total_name, bad_name, good_name, total_all, good_all, bad_all):
    '''
    Build the per-bin report for a chosen binning.

    :param date_df: aggregated data
    :param items: list of (first_index, last_index) pairs; '' or [] for none
    :param na_df: rows that each get a separate bin (may be empty)
    :param rate: unused here
    :param factor_name: name of the factor column
    :param total_name: name of the total-count column
    :param bad_name: name of the bad-count column
    :param good_name: name of the good-count column
    :param total_all: total number of samples
    :param good_all: total number of good samples
    :param bad_all: total number of bad samples
    :return: DataFrame sorted by Bad_Rate (descending) with one row per bin,
        or an empty DataFrame when items is empty
    '''
    if not items:
        return pd.DataFrame()
    piece0 = ['[' + str(date_df.loc[lo, factor_name]) + ',' + str(date_df.loc[hi, factor_name]) + ']' for lo, hi in items]
    bad0 = [date_df.loc[lo:hi, bad_name].sum() for lo, hi in items]
    good0 = [date_df.loc[lo:hi, good_name].sum() for lo, hi in items]
    if len(na_df) > 0:
        piece0 = piece0 + ['[' + str(x) + ',' + str(x) + ']' for x in list(na_df[factor_name])]
        bad0 = bad0 + list(na_df[bad_name])
        good0 = good0 + list(na_df[good_name])
    bad0 = np.array(bad0)
    good0 = np.array(good0)
    total0 = bad0 + good0
    bad_rate0 = bad0 * 1.0 / total0
    df0 = pd.DataFrame({
        'Bin': piece0,
        'Total_Num': total0,
        'Bad_Num': bad0,
        'Good_Num': good0,
        'Total_Pcnt': total0 * 1.0 / total_all,
        'Bad_Rate': bad_rate0,
        'Good_Rate': 1 - bad_rate0,
        'Good_Pcnt': good0 * 1.0 / good_all,
        'Bad_Pcnt': bad0 * 1.0 / bad_all,
    })
    df0 = df0.sort_values(by='Bad_Rate', ascending=False).reset_index(drop=True)
    bad_per0 = df0['Bad_Pcnt'].to_numpy(dtype=float)
    good_per0 = df0['Good_Pcnt'].to_numpy(dtype=float)
    bad_rate0 = df0['Bad_Rate'].to_numpy(dtype=float)
    good_rate0 = df0['Good_Rate'].to_numpy(dtype=float)
    # A bin without good or without bad samples yields an infinite / NaN
    # WOE; such entries are replaced by 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        raw_woe = np.log(good_per0 / bad_per0)
    woe0 = np.array([verify_woe(w) for w in raw_woe], dtype=float)
    df0['Bad_Cum'] = np.cumsum(bad_per0)
    df0['Good_Cum'] = np.cumsum(good_per0)
    df0["Woe"] = woe0
    df0["IV"] = woe0 * (good_per0 - bad_per0)
    df0['Gini'] = 1 - pow(good_rate0, 2) - pow(bad_rate0, 2)
    df0['KS'] = abs(df0['Good_Cum'] - df0['Bad_Cum'])
    return df0


def all_information(date_df, na_df, piece, rate, factor_name, total_name, bad_name, good_name, total_all, good_all, bad_all):
    '''
    Run the whole binning and return the report of the best binning found.

    :param date_df: aggregated data (good / bad / total per factor value), indexed 0..n-1
    :param na_df: rows that each get a separate bin (may be empty)
    :param piece: maximum number of bins
    :param rate: minimum share of samples on each side of a cut
    :param factor_name: name of the factor column
    :param total_name: name of the total-count column
    :param bad_name: name of the bad-count column
    :param good_name: name of the good-count column
    :param total_all: total number of samples
    :param good_all: total number of good samples
    :param bad_all: total number of bad samples
    :return: the report from best_df, or an empty DataFrame when no
        monotonic binning exists
    '''
    t_list = ks_auto(date_df, piece, rate, factor_name, bad_name, good_name, total_name, total_all)
    if len(t_list) < 3:
        # No cut point at all.
        print('Warning: this data cannot get bins or the bins does not satisfy monotonicity')
        return pd.DataFrame()
    df1 = pd.DataFrame()
    # Try the largest bin count first and fall back to fewer bins.
    for c in range(piece, 1, -1):
        combine = get_combine(t_list, date_df, c)
        best_combine = choose_best_combine(date_df, combine, bad_name, good_name, total_name)
        df1 = best_df(date_df, best_combine, na_df, rate, factor_name, total_name, bad_name, good_name, total_all, good_all, bad_all)
        if len(df1) != 0:
            gini = sum(df1['Gini'] * df1['Total_Num'] / sum(df1['Total_Num']))
            print('piece_count:', str(len(df1)))
            print('IV_All_Max:', str(sum(df1['IV'])))
            print('Best_KS:', str(max(df1['KS'])))
            print('Gini_index:', str(gini))
            print(df1)
            return df1
    if len(df1) == 0:
        logging.getLogger(__name__).warning('Warning: this data cannot get bins or the bins does not satisfy monotonicity')
    return df1


def fun_group_by(date_df, factor_name, bad_name, good_name, total_name=None):
    '''
    Sum the bad / good counts of rows sharing the same factor value.

    :param date_df: data with one count row per (possibly repeated) factor value
    :param factor_name: name of the factor column
    :param bad_name: name of the bad-count column
    :param good_name: name of the good-count column
    :param total_name: when given, a column of this name holding bad + good is added
    :return: grouped data ordered by factor value, factor stored as str
    '''
    grouped = date_df.groupby(factor_name, as_index=False)[[bad_name, good_name]].sum()
    grouped[factor_name] = grouped[factor_name].apply(lambda x: verify_factor(x))
    grouped = _sort_by_factor(grouped, factor_name)
    grouped[factor_name] = grouped[factor_name].astype(str)
    if total_name is not None:
        grouped[total_name] = grouped[bad_name] + grouped[good_name]
    return grouped


def verify_factor(x):
    '''
    Convert a numeric-looking string to float; return anything else unchanged.

    :param x: a factor value
    :return: float(x) for strings such as '12', '-1.5' or '1E+16', otherwise x
    '''
    if isinstance(x, str) and re.match(r'^-?\d*\.?\d+(?:[eE][+-]?\d+)?$', x):
        x = float(x)
    return x


def path_df(path, sep, factor_name):
    '''
    Load a delimited file and normalise the factor column to upper-case strings.

    :param path: path of the file
    :param sep: field separator
    :param factor_name: name of the factor column
    :return: the loaded DataFrame; whitespace-only factor values become 'MISSING'
    '''
    data = pd.read_csv(path, sep=sep)
    data[factor_name] = data[factor_name].astype(str).map(lambda x: x.upper())
    # Only blank values are replaced; spaces inside a value are left alone.
    data[factor_name] = data[factor_name].apply(lambda x: re.sub(r'^\s+$', 'MISSING', x))
    return data


def verify_df_multiple(date_df, factor_name, total_name, bad_name, good_name):
    '''
    Validate pre-aggregated count data and complete the missing count column.

    :param date_df: data holding at least two of the total / bad / good columns
    :param factor_name: name of the factor column
    :param total_name: name of the total-count column
    :param bad_name: name of the bad-count column
    :param good_name: name of the good-count column
    :return: cleaned data ordered by factor value (factor stored as str), or
        an empty DataFrame when the counts are missing or inconsistent
    '''
    date_df = date_df.fillna(0)
    has_bad = bad_name in date_df.columns
    has_good = good_name in date_df.columns
    if has_bad and has_good and total_name not in date_df.columns:
        date_df[good_name] = date_df[good_name].astype(float)
        date_df[bad_name] = date_df[bad_name].astype(float)
        date_df[total_name] = date_df[bad_name] + date_df[good_name]
    if total_name in date_df.columns:
        # Rows without any sample carry no information.
        date_df = date_df[date_df[total_name] != 0].copy()
        if has_bad and has_good:
            date_df['check'] = date_df[good_name] + date_df[bad_name] - date_df[total_name]
            date_df_check = date_df[date_df['check'] != 0]
            if len(date_df_check) > 0:
                date_df = pd.DataFrame()
                print('Error: total amounts is not equal to the sum of bad & good amounts')
                print(date_df_check)
        elif has_bad:
            date_df['check'] = date_df[total_name] - date_df[bad_name]
            date_df_check = date_df[date_df['check'] < 0]
            if len(date_df_check) > 0:
                date_df = pd.DataFrame()
                print('Error: total amounts is smaller than bad amounts')
                print(date_df_check)
            else:
                date_df[good_name] = date_df[total_name] - date_df[bad_name]
        elif has_good:
            date_df['check'] = date_df[total_name] - date_df[good_name]
            date_df_check = date_df[date_df['check'] < 0]
            if len(date_df_check) > 0:
                date_df = pd.DataFrame()
                print('Error: total amounts is smaller than good amounts')
                print(date_df_check)
            else:
                date_df[bad_name] = date_df[total_name] - date_df[good_name]
        else:
            print('Error: lack of bad or good data')
            date_df = pd.DataFrame()
    elif not has_bad:
        print('Error: lack of bad data')
        date_df = pd.DataFrame()
    elif not has_good:
        print('Error: lack of good data')
        date_df = pd.DataFrame()
    if len(date_df) != 0:
        date_df[good_name] = date_df[good_name].astype(int)
        date_df[bad_name] = date_df[bad_name].astype(int)
        date_df[factor_name] = date_df[factor_name].apply(lambda x: verify_factor(x))
        date_df = _sort_by_factor(date_df, factor_name)
        date_df[factor_name] = date_df[factor_name].astype(str)
        date_df = date_df.drop(columns=['check'], errors='ignore')
    return date_df


def verify_df_two(date_df, flag_name, factor_name):
    '''
    Validate row-level data carrying a binary flag.

    :param date_df: data holding the factor and flag columns
    :param flag_name: name of the flag column
    :param factor_name: name of the factor column (unused here)
    :return: the rows with a non-null flag, the flag cast to int; an empty
        DataFrame when a flag is larger than 1 or no row is left
    '''
    date_df = date_df.drop(date_df[date_df[flag_name].isnull()].index)
    if len(date_df[date_df[flag_name] > 1]) != 0:
        print('Error: there exits the number bigger than one in the data')
        return pd.DataFrame()
    if len(date_df) != 0:
        date_df[flag_name] = date_df[flag_name].astype(int)
        return date_df
    print('Error: the data is wrong')
    return pd.DataFrame()


def universal_df(data, flag_name, factor_name, total_name, bad_name, good_name):
    '''
    Aggregate the input into one row per factor value with good / bad / total counts.

    :param data: row-level data (when flag_name is set) or pre-aggregated counts
    :param flag_name: name of the flag column (0 is counted as good, 1 as
        bad); '' when data already holds counts
    :param factor_name: name of the factor column
    :param total_name: name of the total-count column
    :param bad_name: name of the bad-count column
    :param good_name: name of the good-count column
    :return: aggregated data ordered by factor value (factor stored as str),
        or an empty DataFrame when validation fails
    '''
    if flag_name != '':
        data = data[[factor_name, flag_name]]
        data = verify_df_two(data, flag_name, factor_name)
        if len(data) != 0:
            # reindex() guarantees both class columns exist even when one
            # class never occurs in the data.
            counts = (data.groupby([factor_name, flag_name]).size()
                      .unstack(fill_value=0)
                      .reindex(columns=[0, 1], fill_value=0))
            data = pd.DataFrame({
                factor_name: [verify_factor(v) for v in counts.index],
                good_name: counts[0].to_numpy(),
                bad_name: counts[1].to_numpy(),
            })
            data = _sort_by_factor(data, factor_name)
            data[total_name] = data[good_name] + data[bad_name]
            data[factor_name] = data[factor_name].astype(str)
    else:
        data = data.copy()
        data[factor_name] = [x.upper() for x in data[factor_name].astype(str)]
        data = verify_df_multiple(data, factor_name, total_name, bad_name, good_name)
        if len(data) != 0 and data[factor_name].duplicated().any():
            data = fun_group_by(data, factor_name, bad_name, good_name, total_name)
    print('universal_df')
    return data


def Best_KS_Bin(path='', data=None, sep=',', flag_name='', factor_name='name', total_name='total', bad_name='bad', good_name='good', piece=5, rate=0.05, not_in_list=None):
    '''
    Best-KS binning of one factor.

    :param path: file to load when data is not given
    :param data: input DataFrame; it is not modified
    :param sep: field separator used when loading from path
    :param flag_name: name of the 0/1 flag column; '' when the input already
        holds per-value counts
    :param factor_name: name of the factor column to bin
    :param total_name: name of the total-count column
    :param bad_name: name of the bad-count column
    :param good_name: name of the good-count column
    :param piece: maximum number of bins
    :param rate: minimum share of samples on each side of a cut
    :param not_in_list: factor values that get a separate bin each
    :return: the per-bin report; when the input is missing or invalid, the
        (empty) data instead
    '''
    time0 = time.time()
    not_in_list = [] if not_in_list is None else list(not_in_list)
    if data is not None and len(data) != 0:
        # Work on a copy of the relevant columns so the caller's frame is
        # not converted to upper-case strings behind its back.
        wanted = [factor_name, flag_name, total_name, bad_name, good_name]
        used = [c for c in dict.fromkeys(wanted) if c in data.columns]
        data = data[used].copy()
        data[factor_name] = [x.upper() for x in data[factor_name].astype(str)]
    elif path != '':
        data = path_df(path, sep, factor_name)
        data[factor_name] = [x.upper() for x in data[factor_name].astype(str)]
    else:
        data = pd.DataFrame()
        print('Error: there is no data')
        time1 = time.time()
        print('spend time(s):', round(time1 - time0, 0))
        return data
    # One row per factor value with its good / bad / total counts.
    data = universal_df(data, flag_name, factor_name, total_name, bad_name, good_name)
    if len(data) == 0:
        time1 = time.time()
        print('spend time(s):', round(time1 - time0, 0))
        return data
    total_all = data[total_name].sum()
    good_all = data[good_name].sum()
    bad_all = data[bad_name].sum()
    if not_in_list:
        not_list = [x.upper() for x in not_in_list]
        not_name = not_list
        if 'NA' in not_list or 'NAN' in not_list or '' in not_list:
            not_name = not_list + ['NAN']
        elif ' ' in not_list:
            not_name = not_list + ['MISSING']
        is_special = data[factor_name].isin(not_name)
        na_df = data[is_special]
        date_df = data[~is_special]
        zero_count = (na_df[good_name] == 0) | (na_df[bad_name] == 0)
        if zero_count.any():
            # A special value without good or without bad samples has no
            # defined WOE, so it is binned together with the regular values.
            not_value = list(set(na_df.loc[zero_count, factor_name]))
            print("Warning: the count of good or bad for the value in 'not_in_list' is 0. The value ("+str(not_value)+") will not get the separate bin. ")
            na_df_new = na_df[zero_count].copy()
            na_df = na_df[~zero_count].reset_index(drop=True)
            na_df_new[factor_name] = na_df_new[factor_name].map(lambda x: verify_factor(x))
            date_df = date_df.copy()
            date_df[factor_name] = date_df[factor_name].map(lambda x: verify_factor(x))
            date_df = _sort_by_factor(pd.concat([na_df_new, date_df]), factor_name)
    else:
        na_df = pd.DataFrame()
        date_df = data
    date_df = date_df.reset_index(drop=True)
    if len(date_df) == 0:
        time1 = time.time()
        print('spend time(s):', round(time1 - time0, 0))
        return data
    bin_df = all_information(date_df, na_df, piece, rate, factor_name, total_name, bad_name, good_name, total_all, good_all, bad_all)
    time1 = time.time()
    print('spend time(s):', round(time1 - time0, 0))
    return bin_df
############################################### Rank the binned features by IV #########################################
def sort_band_by_iv(band_dir='/home/liuweitang/yellow_model/eda/band_result',
                    out_path='/home/liuweitang/yellow_model/eda/IVSort/IV.csv'):
    '''
    Rank every bin-result file by its total IV and save the ranking.

    :param band_dir: directory holding the per-feature bin-result CSV files
    :param out_path: CSV file the ranking is written to
    :return: the ranking (columns band, ks, iv_sum), sorted by iv_sum descending
    '''
    rows = []
    for filename in os.listdir(band_dir):
        if 'csv' in filename:
            print(filename)
            try:
                band_result = pd.read_csv(os.path.join(band_dir, filename))
                rows.append({
                    'band': filename,
                    'ks': band_result['KS'].max(),
                    'iv_sum': band_result['IV'].sum(),
                })
            except (OSError, KeyError, ValueError) as err:
                # Unreadable files and files without KS / IV columns are
                # skipped, but no longer silently.
                print('Warning: skip %s (%s)' % (filename, err))
    tmp_df = pd.DataFrame(rows, columns=['band', 'ks', 'iv_sum'])
    tmp_df.info()
    tmp_df.sort_values(by=['iv_sum'], ascending=False, inplace=True)
    print(tmp_df)
    tmp_df.to_csv(out_path, index=False)
    return tmp_df


#################################################### Merge the data #####################################################
def merge_data(lgzsPath, yxtzPath, out_dir='/home/liuweitang/yellow_model/data/input/new'):
    '''
    Inner-join the two feature tables on 'gmsfhm_rzsj' and save the result.

    :param lgzsPath: path of the first feature CSV
    :param yxtzPath: path of the second feature CSV
    :param out_dir: directory for 'yxtz_lgzs_merge_<YYYYMMDD>.csv'
    :return: the merged DataFrame
    '''
    lgzs_data = pd.read_csv(lgzsPath)
    yxtz_data = pd.read_csv(yxtzPath)
    result_data = pd.merge(yxtz_data, lgzs_data, how='inner', left_on='gmsfhm_rzsj', right_on='gmsfhm_rzsj')
    result_data.rename(columns={'label_x': 'label'}, inplace=True)
    now_time = time.strftime('%Y%m%d', time.localtime(time.time()))
    result_data.to_csv(os.path.join(out_dir, 'yxtz_lgzs_merge_%s.csv' % now_time), index=False)
    return result_data


################################################### Main class of the KS binning ##########################################
class KS_Bin():
    def __init__(self, path, flag, notBandColList):
        '''
        :param path: path of the data file; '$' is used as separator when the
            header line contains it, ',' otherwise
        :param flag: name of the target column ('bad' / other, or 1 / 0)
        :param notBandColList: columns that must not be binned
        '''
        # Read the header line directly instead of shelling out to `head`.
        with open(path, encoding='utf-8', errors='replace') as handle:
            header = handle.readline()
        sep = '$' if '$' in header else ','
        self.df = pd.read_csv(path, sep=sep, engine='c')
        if (self.df[flag] == 'bad').any():
            self.df[flag] = self.df[flag].map(lambda x: 1 if x == 'bad' else 0)
        self.flag = flag
        self.path = path
        self.colList = [col for col in self.df.columns.tolist() if col not in notBandColList]
        print(self.colList)

    def to_band(self, out_dir='/home/liuweitang/yellow_model/eda/band_result'):
        '''
        Bin every column in colList and add a '<col>_band' column holding the WOE.

        :param out_dir: directory the per-column bin results are written to
        '''
        for col in self.colList:
            ks_data = Best_KS_Bin(data=self.df, flag_name=self.flag, factor_name=col)
            # Export the bin result.
            self.binData_csv(ks_data, os.path.join(out_dir, '%s_binResult.csv' % col))
            if len(ks_data) == 0 or 'Bin' not in ks_data.columns:
                continue
            # Values that are not numbers become NaN and match no bin.
            self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
            for row in ks_data.index:
                label = str(ks_data.loc[row, 'Bin'])
                woe = ks_data.loc[row, 'Woe']
                parts = label.split(',')
                try:
                    binStart = float(parts[0][1:])
                    binEnd = float(parts[1][:-1])
                except (ValueError, IndexError):
                    # Bins with non-numeric bounds cannot be mapped by range.
                    continue
                # Replace the original values by the WOE of their bin.
                self.df.loc[(self.df[col] >= binStart) & (self.df[col] <= binEnd), '%s_band' % col] = woe
        print('save data')
        self.save_band_data()

    def binData_csv(self, df, csvPath):
        '''Write df to csvPath without the index.'''
        df.to_csv(csvPath, index=False)

    def save_band_data(self, out_dir='/home/liuweitang/yellow_model/data/input/new',
                       key_cols=('gmsfhm_rzsj', 'label')):
        '''
        Save the key columns plus every column whose name contains 'band'.

        :param out_dir: directory for '<input file stem>_band.csv'
        :param key_cols: columns always kept next to the band columns
        '''
        band_list = list(key_cols)
        band_list.extend(col for col in self.df.columns.tolist() if 'band' in col)
        band_data = self.df[band_list]
        filename = os.path.basename(self.path).split('.')[0] + '_band'
        band_data.to_csv(os.path.join(out_dir, '%s.csv' % filename), index=False)


if __name__ == "__main__":
    # Example of the full pipeline (disabled):
    #
    # print('start band lgzs')
    # # Bin the lgzs data.
    # lgzs_not_band_col=[
    #     'gmsfhm_rzsj',
    #     'label'
    # ]
    # lgzs_data_path='/home/liuweitang/yellow_model/feature/raw/train_openning_feature_20180508.txt'
    # lgzs_ks_bin=KS_Bin(lgzs_data_path,flag='label', notBandColList=lgzs_not_band_col)
    # lgzs_ks_bin.to_band()
    # print('band lgzs finished')
    #
    # print('band yxtz start')
    # # Bin the yxtz data.
    # yxtz_col_list=[
    #     'gmsfhm_rzsj',
    #     'label'
    # ]
    # yxtz_data_path='/home/liuweitang/yellow_model/data/mk/tmp_good_people_in_yxtz_lwt2.txt'
    # yxtz_ks_bin=KS_Bin(yxtz_data_path,flag='label', notBandColList=yxtz_col_list)
    # yxtz_ks_bin.to_band()
    # print('band yxtz finished')
    #
    # print('start iv rank')
    # # Rank all binned features by IV and save the ranking.
    # sort_band_by_iv()
    #
    # print('start merge band_data')
    # # Merge the data.
    # lgzs_band='/home/liuweitang/yellow_model/data/input/new/'+lgzs_data_path.split(".")[0]+'_band.csv'
    # yxtz_band='/home/liuweitang/yellow_model/data/input/new/'+yxtz_data_path.split(".")[0]+'_band.csv'
    # merge_data(lgzs_band,yxtz_band)
    data = pd.read_csv('application_test.csv')
    data['FLAG_OWN_CAR'] = data['FLAG_OWN_CAR'].map(lambda x: 1 if x == 'Y' else 0)
    Best_KS_Bin(data=data, factor_name='AMT_INCOME_TOTAL', flag_name='FLAG_OWN_CAR')
    print(data[['FLAG_OWN_CAR', 'AMT_INCOME_TOTAL']].head())