
醫葯統計項目聯系QQ:231469242
如果樣本量太小,數據必須先做分段化處理,否則會出現很多空缺數據,WOE 的效果不能有效發揮。

隨機森林結果

iv>0.02 的因子在隨機森林結果里都屬於有效因子;但是隨機森林重要性最強的因子沒有出現在有效 iv 參數里,說明這些缺失的重要變量沒有做分段處理,是數據過於離散造成的。
數據文件

腳本備份
step1_customers_split_goodOrBad.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 14 21:45:43 2018
@author QQ:231469242

Split the source data into two Excel workbooks: one holding the "good"
customers (diagnosis == "B") and one holding the "bad" customers
(diagnosis == "M").
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Input workbook
readFileName = "breast_cancer_總.xlsx"
# Output workbooks
saveFileName_good = "result_good.xlsx"
saveFileName_bad = "result_bad.xlsx"

# Load the full data set
df = pd.read_excel(readFileName)

# Filter the rows by diagnosis label
df_good = df[df.diagnosis == "B"]
df_bad = df[df.diagnosis == "M"]

# Save each subset to its own workbook.
# NOTE(review): to_excel writes the DataFrame index as an extra first column
# by default, so a later read_excel will see it as a data column -- confirm
# that this is intended before adding index=False.
df_good.to_excel(saveFileName_good, sheet_name='Sheet1')
df_bad.to_excel(saveFileName_bad, sheet_name='Sheet1')
step2_automate_find_informative_variables.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 14 22:13:30 2018
@author: QQ:231469242
woe負數,好客戶<壞客戶
woe正數,好客戶>壞客戶
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
# Create the "save/" output directory for the per-variable workbooks.
# exist_ok=True keeps a second run from failing with FileExistsError when the
# directory is already there (os.mkdir raised in that case).
os.makedirs("save/", exist_ok=True)
# os.mkdir always returned None; the name is kept so existing references
# to it keep working.
newFile = None
# Input workbooks (good / bad customers)
FileName_good = "result_good.xlsx"
FileName_bad = "result_bad.xlsx"
# Output workbook name
saveFileName = "result_woe_iv.xlsx"
# Load both workbooks
df_good = pd.read_excel(FileName_good)
df_bad = pd.read_excel(FileName_bad)
# Variables to evaluate: every column of the good-customer sheet except the
# last one
list_columns = list(df_good.columns[:-1])
index = 0
def Ratio_goodDevideBad(index):
    """Compute per-value good/bad proportions for one column.

    The column is selected by position from ``df_good.columns`` and read
    from both module-level frames ``df_good`` and ``df_bad``.

    Parameters
    ----------
    index : int
        Position of the column in ``df_good.columns``.

    Returns
    -------
    tuple
        ``(columnName, num_goodCustomers, num_badCustomers,
        frenquency_goodCustomers, frenquency_badCustomers,
        ratio_goodCustomers, ratio_badCustomers, ratio_goodDevideBad)``
        where the ``num_*`` entries are non-NaN row counts, the
        ``frenquency_*`` entries are ``value_counts()`` Series, the
        ``ratio_*`` entries are those counts divided by the non-NaN row
        count, and ``ratio_goodDevideBad`` is the good ratio divided by the
        bad ratio.
    """
    # Name of the column at the requested position
    columnName = list(df_good.columns)[index]

    # Same column taken from the good and the bad customers
    column_goodCustomers = df_good[columnName]
    column_badCustomers = df_bad[columnName]

    # Number of non-NaN observations in each group
    num_goodCustomers = column_goodCustomers.dropna().size
    num_badCustomers = column_badCustomers.dropna().size

    # Occurrence count of every distinct value in each group
    frenquency_goodCustomers = column_goodCustomers.value_counts()
    frenquency_badCustomers = column_badCustomers.value_counts()

    # Share of each distinct value within its own group
    ratio_goodCustomers = frenquency_goodCustomers / num_goodCustomers
    ratio_badCustomers = frenquency_badCustomers / num_badCustomers

    # Good share divided by bad share. The division aligns on the value
    # labels, so a value present in only one group yields NaN.
    ratio_goodDevideBad = ratio_goodCustomers / ratio_badCustomers

    return (
        columnName,
        num_goodCustomers,
        num_badCustomers,
        frenquency_goodCustomers,
        frenquency_badCustomers,
        ratio_goodCustomers,
        ratio_badCustomers,
        ratio_goodDevideBad,
    )
# WOE function, element-wise on arrays
def Woe(ratio_goodDevideBad):
    """Return the weight of evidence for a good/bad ratio.

    The result is the natural logarithm of ``ratio_goodDevideBad`` computed
    with ``np.log``, so it works element-wise on arrays and Series. It is
    positive where the ratio is above 1 (good share > bad share) and
    negative where it is below 1 (good share < bad share).
    """
    return np.log(ratio_goodDevideBad)


# Disabled draft of an element-wise IV helper, kept for reference only.
# It reads ratio_goodCustomers / ratio_badCustomers, which are not defined in
# its scope; the same expression is evaluated inline in
# Write_singleVariable_to_Excel.
#
# def Iv(woe):
#     iv = (ratio_goodCustomers - ratio_badCustomers) * woe
#     return iv
# IV assessment; iv_sum is the total IV of one variable
def Iv_estimate(iv_sum, threshold=0.02):
    """Classify a variable by its total information value.

    Parameters
    ----------
    iv_sum : float
        Total IV of the variable.
    threshold : float, optional
        Cut-off above which the variable counts as informative.
        Defaults to 0.02.

    Returns
    -------
    str
        ``"A"`` when ``iv_sum`` is strictly greater than ``threshold``,
        otherwise ``"B"``. The verdict is also printed.
    """
    if iv_sum > threshold:
        print("informative")
        return "A"
    # Anything at or below the threshold (or not comparable, e.g. NaN)
    # is rated as not informative.
    print("not informative")
    return "B"


# Disabled draft of a detailed-output helper, kept for reference only.
# It reads columnName and iv_sum, which are not defined in its scope.
#
# def Print():
#     print ("columnName:",columnName)
#     Iv_estimate(iv_sum)
#     print("iv_sum",iv_sum)
# Save the detailed figures of one variable to an Excel file under save/
def Write_singleVariable_to_Excel(index):
    """Compute WOE/IV for one variable, save the table and rate the variable.

    Parameters
    ----------
    index : int
        Position of the variable, as accepted by ``Ratio_goodDevideBad``
        (the first variable has ``index=0``).

    Returns
    -------
    tuple
        ``(iv_estimate, columnName)`` where ``iv_estimate`` is the rating
        returned by ``Iv_estimate`` for the variable's summed IV.

    Side effects: writes ``save/<columnName>.xlsx`` and prints the variable
    name, the rating and the summed IV.
    """
    (columnName,
     num_goodCustomers,
     num_badCustomers,
     frenquency_goodCustomers,
     frenquency_badCustomers,
     ratio_goodCustomers,
     ratio_badCustomers,
     ratio_goodDevideBad) = Ratio_goodDevideBad(index)

    woe = Woe(ratio_goodDevideBad)
    # Per-value information value
    iv = (ratio_goodCustomers - ratio_badCustomers) * woe

    column_order = [
        "num_goodCustomers", "num_badCustomers",
        "frenquency_goodCustomers", "frenquency_badCustomers",
        "ratio_goodCustomers", "ratio_badCustomers",
        "ratio_goodDevideBad", "woe", "iv",
    ]
    df_woe_iv = pd.DataFrame(
        {
            "num_goodCustomers": num_goodCustomers,
            "num_badCustomers": num_badCustomers,
            "frenquency_goodCustomers": frenquency_goodCustomers,
            "frenquency_badCustomers": frenquency_badCustomers,
            "ratio_goodCustomers": ratio_goodCustomers,
            "ratio_badCustomers": ratio_badCustomers,
            "ratio_goodDevideBad": ratio_goodDevideBad,
            "woe": woe,
            "iv": iv,
        },
        columns=column_order,
    )

    # Rows ordered by descending IV
    df_sort = df_woe_iv.sort_values(by='iv', ascending=False)
    # One workbook per variable; str() lets non-string column labels
    # be used in the file name.
    df_sort.to_excel("save/" + str(columnName) + ".xlsx")

    # Total IV of the variable, skipping NaN entries
    iv_sum = sum(i for i in iv if not np.isnan(i))
    print ("變量:",columnName)
    # Rate the variable from its total IV
    iv_estimate = Iv_estimate(iv_sum)
    print("iv_sum",iv_sum)
    return iv_estimate, columnName
# Collects the names of the variables rated informative
list_Informative_variables = []
# Process every variable and save its figures to an Excel file under save/.
# Write_singleVariable_to_Excel is called once per variable and its result is
# unpacked; calling it twice wrote every workbook and printed every message
# twice.
for i in range(len(list_columns)):
    status, columnName = Write_singleVariable_to_Excel(i)
    if status == "A":
        list_Informative_variables.append(columnName)
最終得到一部分有效因子,共12個,經過數據分段化處理,會得到更多有效因子。


