缺失值算是決策樹裡處理起來比較麻煩的部分了,其他比較簡單的內容我就不發布了。
# encoding:utf-8
from __future__ import division, print_function

__author__ = 'HP'

import copy
import math
from collections import Counter

import numpy as np
import pandas as pd

try:
    from sklearn.preprocessing import LabelEncoder
except ImportError:  # only mydata() needs scikit-learn
    LabelEncoder = None

################################
# Author's design notes for the surrounding ID3 implementation (the tree
# builder itself is not part of this file):
#   - ID3, discrete attributes, multi-class
#   - learned rules are recorded in nested dictionaries
#   - non-recursive, depth-first, pre-pruning
#
# Missing-value handling answers two questions:
#
# 1. How is the split attribute chosen when values are missing?
#    a. On the first split every sample has weight 1.  Take the samples
#       whose attribute is present, compute the information gain on that
#       subset and multiply it by the subset's share of all samples.
#       With unit weights both the share and the probabilities inside the
#       entropy are plain counts.
#    b. On later splits the weights are no longer all 1.  The procedure is
#       the same, but the share and the probabilities are computed from
#       sums of weights instead of counts.
#
# 2. How are samples distributed to the child nodes?
#    a. Samples whose attribute is present go to the matching child with
#       weight 1.
#    b. Samples whose attribute is missing go to every child, with a
#       weight equal to the fraction of present samples that have the
#       child's attribute value.
################################


def _missing_mask(values):
    """Return a boolean array that is True where *values* is NaN/None."""
    return pd.isnull(np.asarray(values))


def mydata():
    """Load 'xg3.txt' and label-encode its string columns.

    The file is read as GBK-encoded CSV with its first column as the
    index; the remaining nine columns are renamed 0..8.  Columns
    0-5 and 8 are label-encoded, the others are left untouched.

    Returns:
        The table as a numpy array (``DataFrame.values``).

    Raises:
        ImportError: if scikit-learn is not installed.
    """
    if LabelEncoder is None:
        raise ImportError(
            'mydata() requires scikit-learn '
            '(sklearn.preprocessing.LabelEncoder)')
    data = pd.read_csv('xg3.txt', index_col=[0], encoding='gbk')
    data.columns = range(9)
    # The last column carries stray whitespace in the raw file.
    data[8] = data[8].str.strip()
    encoder = LabelEncoder()
    for col in (0, 1, 2, 3, 4, 5, 8):
        # LabelEncoder expects a 1-D input, so pass the Series.
        data[col] = encoder.fit_transform(data[col])
    return data.values


def get_label(labels):
    """Return the most frequent label, or None when *labels* is empty."""
    counts = Counter(labels)
    if not counts:
        return None
    return counts.most_common(1)[0][0]


def entropy(attr):
    """Return the Shannon entropy (base 2) of the values in *attr*.

    Every sample counts equally.  An empty input yields 0.
    """
    total = len(attr)
    return sum(-(count / total) * math.log(count / total, 2)
               for count in Counter(attr).values())


def gain_queshi_equal_weight(attr, label):
    """Information gain of an attribute with missing values, unit weights.

    Used for the first split, where every sample has weight 1.  The gain
    is computed on the samples whose attribute is present and then scaled
    by the fraction of samples that are present.

    Args:
        attr: 1-D attribute values; NaN/None marks a missing value.
        label: 1-D class labels, same length as *attr*.

    Returns:
        The scaled information gain; 0.0 when there are no samples or
        every value is missing.
    """
    attr = np.asarray(attr)
    label = np.asarray(label)
    present = ~_missing_mask(attr)
    attr_present = attr[present]
    label_present = label[present]
    n_present = attr_present.shape[0]
    if n_present == 0:
        return 0.0

    coverage = n_present / attr.shape[0]
    base_entropy = entropy(label_present)
    cond_entropy = 0.0
    for value in np.unique(attr_present):
        chosen = attr_present == value
        # share of this value among present samples * entropy of its labels
        cond_entropy += (np.sum(chosen) / n_present
                         * entropy(label_present[chosen]))
    return coverage * (base_entropy - cond_entropy)


def split_node_queshi(node, attr_split):
    """Split the samples of a node on an attribute that has missing values.

    Rows whose attribute is present go to the child of their value with
    weight 1.  Rows whose attribute is missing are appended to every
    child, weighted by that child's share of the present rows (rounded to
    three decimals, so the weights of one missing row may not sum to
    exactly 1 across children).

    Args:
        node: 2-D array of samples (rows) by columns.
        attr_split: index of the column to split on; NaN/None is missing.

    Returns:
        A list with one ``[value, rows, weights]`` entry per distinct
        present value, in ascending order of value.  ``rows`` holds the
        present rows followed by the missing rows; ``weights`` is the
        matching 1-D weight array.
    """
    node = np.asarray(node)
    column = node[:, attr_split]
    missing = _missing_mask(column)
    present_rows = node[~missing]
    missing_rows = node[missing]
    present_values = column[~missing]
    n_present = present_rows.shape[0]
    n_missing = missing_rows.shape[0]

    split = []
    for value in np.unique(present_values):
        child = present_rows[present_values == value]
        share = round(len(child) / n_present, 3)
        weights = [1] * len(child) + [share] * n_missing
        split.append([value,
                      np.vstack((child, missing_rows)),
                      np.array(weights)])
    return split


def entropy_no_equal_weight(attr, weight):
    """Return the base-2 entropy of *attr* with per-sample weights.

    The probability of a value is the sum of the weights of its samples
    divided by the total weight.  Values with zero total weight
    contribute nothing; a non-positive total weight yields 0.0.
    """
    attr = np.asarray(attr)
    weight = np.asarray(weight, dtype=float)
    total_weight = np.sum(weight)
    if total_weight <= 0:
        return 0.0

    result = 0.0
    for value in np.unique(attr):
        share = np.sum(weight[attr == value]) / total_weight
        if share > 0:  # log(0) is undefined; the limit p*log(p) is 0
            result -= share * math.log(share, 2)
    return result


def gain_queshi_no_equal_weight(attr, weight, label):
    """Information gain of an attribute with missing values, weighted.

    Used after the first split, when samples carry unequal weights.  All
    shares are sums of weights, not sample counts: the fraction of
    present samples, the class probabilities, and the share of each
    attribute value in the conditional entropy.

    Args:
        attr: 1-D attribute values; NaN/None marks a missing value.
        weight: 1-D sample weights, same length as *attr*.
        label: 1-D class labels, same length as *attr*.

    Returns:
        The scaled information gain; 0.0 when the total weight or the
        weight of the present samples is not positive.
    """
    attr = np.asarray(attr)
    label = np.asarray(label)
    weight = np.asarray(weight, dtype=float)

    present = ~_missing_mask(attr)
    attr_present = attr[present]
    label_present = label[present]
    weight_present = weight[present]

    total_weight = np.sum(weight)
    present_weight = np.sum(weight_present)
    if total_weight <= 0 or present_weight <= 0:
        return 0.0

    coverage = present_weight / total_weight
    base_entropy = entropy_no_equal_weight(label_present, weight_present)
    cond_entropy = 0.0
    for value in np.unique(attr_present):
        chosen = attr_present == value
        weight_value = weight_present[chosen]
        cond_entropy += (np.sum(weight_value) / present_weight
                         * entropy_no_equal_weight(label_present[chosen],
                                                   weight_value))
    return coverage * (base_entropy - cond_entropy)


def main():
    """Demo: inject missing values, then print gains and a node split."""
    data = mydata()
    # Keep the discrete columns only; float dtype so NaN can be stored.
    data = data[:, [0, 1, 2, 3, 4, 5, 8]].astype(float)
    for row, col in ((0, 0), (4, 0), (12, 0), (7, 3), (9, 3)):
        data[row, col] = np.nan
    print(data)

    # Information gain with missing values, unit weights.
    for i in range(data.shape[1]):
        print(gain_queshi_equal_weight(data[:, i], data[:, -1]))

    # Sample split on an attribute with missing values.
    split = split_node_queshi(data, 3)
    print(split)

    # Information gain with missing values, unequal weights, using
    # column 0 of the third child as the example.
    gain = gain_queshi_no_equal_weight(
        split[2][1][:, 0], split[2][2], split[2][1][:, -1])
    print(gain)


if __name__ == '__main__':
    main()