離散特征信息增益計算
數據來自《統計學習方法》(李航)5.2.1節中的貸款申請樣本數據表
利用pandas的value_counts(),快速計算
import numpy as np
import pandas as pd
def ent(data):
    """Compute the Shannon entropy (base 2) of a discrete sample.

    Probabilities are the value counts divided by ``len(data)``.

    :param data: 1-D collection of labels (list, ndarray or pandas Series).
    :return: entropy in bits as a float; 0.0 when there are no observed values.
    """
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    prob = pd.Series(data).value_counts() / len(data)
    # Zero-probability entries (e.g. unobserved categorical levels) would give
    # 0 * log2(0) = NaN; they contribute nothing to the entropy, so drop them.
    prob = prob[prob > 0]
    entropy = -(prob * np.log2(prob)).sum()
    # Adding 0.0 normalizes a possible -0.0 (single-class input) to 0.0.
    return float(entropy) + 0.0


def get_info_gain(data, feat, label):
    """Compute the information gain of a discrete feature w.r.t. a target.

    The gain is ``ent(data[label])`` minus the conditional entropy of the
    label given the feature, i.e. the per-group label entropies weighted by
    each feature value's relative frequency.

    :param data: DataFrame holding both the feature and the target column.
    :param feat: name of the feature column.
    :param label: name of the target column.
    :return: information gain in bits as a float.
    """
    # Entropy of the label inside each group of equal feature value. Selecting
    # the label column first keeps apply() away from the grouping column.
    group_entropy = data.groupby(feat)[label].apply(ent)
    # Relative frequency of each feature value.
    weights = data[feat].value_counts() / len(data)
    # The product aligns both Series on the feature values (their index).
    cond_entropy = float((group_entropy * weights).sum())
    return ent(data[label]) - cond_entropy


if __name__ == '__main__':
    data = pd.DataFrame({
        '年齡': ['青年', '青年', '青年', '青年', '青年',
               '中年', '中年', '中年', '中年', '中年',
               '老年', '老年', '老年', '老年', '老年'],
        '有工作': ['否', '否', '是', '是', '否',
                '否', '否', '是', '否', '否',
                '否', '否', '是', '是', '否'],
        '有自己的房子': ['否', '否', '否', '是', '否',
                   '否', '否', '是', '是', '是',
                   '是', '是', '否', '否', '否'],
        '貸款情況': ['一般', '好', '好', '一般', '一般',
                 '一般', '好', '好', '非常好', '非常好',
                 '非常好', '好', '好', '非常好', '一般'],
        '類別': ['否', '否', '是', '是', '否',
               '否', '否', '是', '是', '是',
               '是', '是', '是', '是', '否'],
    })

    # Entropy of the target column.
    print(ent(data['類別']))
    # 0.9709505944546686

    # Information gain of every feature with respect to the target column.
    label = '類別'
    for feat in ['年齡', '有工作', '有自己的房子', '貸款情況']:
        print(get_info_gain(data, feat, label))
    # 0.08300749985576883
    # 0.32365019815155627
    # 0.4199730940219749
    # 0.36298956253708536
reference:Python 詳細步驟計算信息增益