Machine Learning: The ID3 Decision Tree Classification Algorithm, with Code and Examples


1. A Recursive Implementation of the ID3 Decision Tree
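
ID3 grows the tree greedily: at each node it selects the feature with the largest information gain, partitions the data on that feature's values, and recurses on each subset. The three quantities the code below computes are the empirical entropy, the empirical conditional entropy, and their difference, the information gain:

H(Y) = -\sum_i p_i \log_2 p_i
H(Y \mid X) = \sum_v \frac{|D_v|}{|D|} H(Y \mid X = v)
g(Y, X) = H(Y) - H(Y \mid X)

Recursion stops when the labels at a node are all identical, when no remaining feature's gain exceeds the threshold, or when the feature list is exhausted.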

import numpy as np

class DecisionTree:
    class Node:
        def __init__(self):
            self.value = None

            # internal-node attributes (unused on leaf nodes)
            self.feature_index = None
            self.children = {}

        def __str__(self):
            if self.children:
                s = 'internal node <%s>:\n' % self.feature_index
                for fv, node in self.children.items():
                    ss = '[%s]-> %s' % (fv, node)
                    s += '\t' + ss.replace('\n', '\n\t') + '\n'
                s = s[:-1]
            else:
                s = 'leaf (%s)' % self.value
            return s

    def __init__(self, gain_threshold=1e-2):
        # Information-gain threshold: features whose gain falls below
        # this value are not used for splitting.
        self.gain_threshold = gain_threshold

    def _entropy(self, y):
        # Entropy: -sum(p_i * log2(p_i)) over the nonzero class frequencies.
        # e.g. y = [1, 1, 2, 2] gives p = [0.5, 0.5] and entropy 1.0.
        c = np.bincount(y)
        p = c[np.nonzero(c)] / y.size
        return -np.sum(p * np.log2(p))

    def _conditional_entropy(self, feature, y):
        # Conditional entropy: the subset entropies, one per distinct
        # feature value, weighted by each subset's relative size.
        feature_values = np.unique(feature)
        h = 0.
        for v in feature_values:
            y_sub = y[feature == v]
            p = y_sub.size / y.size
            h += p * self._entropy(y_sub)
        return h

    def _information_gain(self, feature, y):
        # Information gain = empirical entropy - empirical conditional entropy.
        return self._entropy(y) - self._conditional_entropy(feature, y)

    def _select_feature(self, X, y, features_list):
        # Select the feature with the largest information gain.

        # Normal case: return the position (within features_list) of the
        # feature with the largest information gain.
        if features_list:
            gains = np.apply_along_axis(self._information_gain, 0, X[:, features_list], y)
            index = np.argmax(gains)
            if gains[index] > self.gain_threshold:
                return index

        # If features_list is empty, or no feature's gain exceeds the
        # threshold, return None.
        return None

    def _create_tree(self, X, y, features_list):
        # Create a node.
        node = DecisionTree.Node()
        # Count the occurrences of each class label in this subset.
        labels_count = np.bincount(y)
        # The node's value is always the majority class label of the subset.
        node.value = np.argmax(labels_count)

        # Proceed only if the labels are not all identical.
        if np.count_nonzero(labels_count) != 1:
            # Select the feature with the largest information gain.
            index = self._select_feature(X, y, features_list)

            # If a suitable feature was found, build an internal node;
            # otherwise the node stays a leaf.
            if index is not None:
                # Remove the chosen feature from the feature list.
                node.feature_index = features_list.pop(index)

                # Partition the data by the chosen feature's values and
                # build one subtree per value.
                feature_values = np.unique(X[:, node.feature_index])
                for v in feature_values:
                    # Select the data subset for this value.
                    idx = X[:, node.feature_index] == v
                    X_sub, y_sub = X[idx], y[idx]
                    # Build the subtree.
                    node.children[v] = self._create_tree(X_sub, y_sub, features_list.copy())

        return node

    def _predict_one(self, x_test):
        # Walk the tree to classify a single sample.

        # Descend until a leaf is reached, then return its value.
        node = self.tree_
        while node.children:
            child = node.children.get(x_test[node.feature_index])
            if child is None:
                # No subtree matches this sample's feature value (which can
                # happen on unseen data); stop searching and treat this
                # internal node as a leaf (return its majority value).
                break
            node = child

        return node.value

    def train(self, X_train, y_train):
        # Train the decision tree; initially every feature index is available.
        _, n = X_train.shape
        self.tree_ = self._create_tree(X_train, y_train, list(range(n)))

    def predict(self, X_test):
        # Call _predict_one on each test sample and return the collected results.
        return np.apply_along_axis(self._predict_one, axis=1, arr=X_test)

    def __str__(self):
        if hasattr(self, 'tree_'):
            return str(self.tree_)
        return ''
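
Before moving to a real dataset, here is a minimal sanity check on a hand-made toy dataset (the values below are made up purely for illustration):

import numpy as np

# Toy data: two nominal features; feature 0 perfectly determines the label.
X_toy = np.array([[1, 1],
                  [1, 2],
                  [2, 1],
                  [2, 2]])
y_toy = np.array([1, 1, 2, 2])

dt = DecisionTree()
dt.train(X_toy, y_toy)
print(dt)                 # a single split on feature 0 with two pure leaves
print(dt.predict(X_toy))  # reproduces y_toy on this noise-free data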

2. Loading the Dataset

The dataset can be downloaded from:

http://archive.ics.uci.edu/ml/machine-learning-databases/lenses/

Dataset description:

1. Title: Database for fitting contact lenses

2. Sources:
(a) Cendrowska, J. "PRISM: An algorithm for inducing modular rules",
International Journal of Man-Machine Studies, 1987, 27, 349-370
(b) Donor: Benoit Julien (Julien@ce.cmu.edu)
(c) Date: 1 August 1990

3. Past Usage:
1. See above.
2. Witten, I. H. & MacDonald, B. A. (1988). Using concept
learning for knowledge acquisition. International Journal of
Man-Machine Studies, 27, (pp. 349-370).

Notes: This database is complete (all possible combinations of
attribute-value pairs are represented).

Each instance is complete and correct.

9 rules cover the training set.

4. Relevant Information Paragraph:
The examples are complete and noise free.
The examples highly simplified the problem. The attributes do not
fully describe all the factors affecting the decision as to which type,
if any, to fit.

5. Number of Instances: 24

6. Number of Attributes: 4 (all nominal)

7. Attribute Information:
-- 3 Classes
1 : the patient should be fitted with hard contact lenses,
2 : the patient should be fitted with soft contact lenses,
3 : the patient should not be fitted with contact lenses.

1. age of the patient: (1) young, (2) pre-presbyopic, (3) presbyopic
2. spectacle prescription: (1) myope, (2) hypermetrope
3. astigmatic: (1) no, (2) yes
4. tear production rate: (1) reduced, (2) normal

8. Number of Missing Attribute Values: 0

9. Class Distribution:
1. hard contact lenses: 4
2. soft contact lenses: 5
3. no contact lenses: 15

Loading the data:

import numpy as np
dataset = np.genfromtxt('F:/python_test/data/lenses.data')
print(dataset)
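
The file is whitespace-separated; per the description above, the array should contain 24 rows and 6 columns (sample id, four attributes, class label). A quick sanity check:

print(dataset.shape)  # expected: (24, 6)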

Strip off the sample id column, separate the attributes from the class labels, and train the model:

X = dataset[:, 1:-1].astype('int64')
y = dataset[:, -1].astype('int64')
dt = DecisionTree()
dt.train(X, y)

3. Printing the Tree Structure

When the tree is small, its structure can be printed directly via the __str__ methods defined above:

print(dt)

(printed tree output omitted)

In the printout, "internal node <...>" shows the index of the feature being split on; the description file numbers the attributes from 1, but here the indices run from 0 to 3.

A leaf's parenthesized value is the class label, one of 1, 2, or 3.

The bracketed value on each branch is the attribute value that leads to that subtree (not an index; each internal node has one branch per attribute value), and the indentation marks the subtree rooted at the node above it.

Next we display the decision tree another way, using Graphviz. Download it (the official site is slow, so a mirror download is fine), add its bin directory to the PATH environment variable, and run `dot -version` on the command line to confirm the installation. Once that succeeds, the program below visualizes the tree we just trained.
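
A quick way to confirm that the Python binding can also reach the Graphviz binaries (this assumes the `graphviz` package has been installed, e.g. with pip) is a tiny throwaway graph:

from graphviz import Digraph

g = Digraph('smoke-test')
g.node('a', label='parent', shape='box')
g.node('b', label='child')
g.edge('a', 'b', label='ok')
g.view()  # opens the rendered file; fails if the dot executable is missing

With that working, here is the visualization code: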

from graphviz import Digraph

class DecisionTreePlotter:
    def __init__(self, tree, feature_names=None, label_names=None):
        # Keep references to the trained tree and the name mappings.
        self.tree = tree
        self.feature_names = feature_names
        self.label_names = label_names
        self.graph = Digraph('Decision Tree')

    def _build(self, dt_node):
        if dt_node.children:
            # dt_node is an internal node.

            # Get the feature's display name (fall back to its index).
            if self.feature_names:
                d = self.feature_names[dt_node.feature_index]
                label = d['name']
            else:
                d = None
                label = str(dt_node.feature_index)

            # Create a box-shaped internal node (graphviz).
            self.graph.node(str(id(dt_node)), label=label, shape='box')

            for feature_value, dt_child in dt_node.children.items():
                # Recursively build the child's subgraph.
                self._build(dt_child)

                # Get the display name of the feature value.
                d_value = d.get('value_names') if d else None
                if d_value:
                    label = d_value[feature_value]
                else:
                    label = str(feature_value)

                # Create the edge connecting parent and child (graphviz).
                self.graph.edge(str(id(dt_node)), str(id(dt_child)), label=label, fontsize='10')
        else:
            # dt_node is a leaf node.

            # Get the class label's display name.
            if self.label_names:
                label = self.label_names[dt_node.value]
            else:
                label = str(dt_node.value)

            # Create the leaf node with Graphviz's default (oval) shape.
            self.graph.node(str(id(dt_node)), label=label)

    def plot(self):
        # Build the graphviz graph, then display it.
        self._build(self.tree)
        self.graph.view()

Now set up the display-name mappings for the attributes and the class labels. These follow the order in which the attributes appear in the data file: the feature indices start at 0, while the class labels keep their original values and need not start at 0.

import numpy as np

D = np.genfromtxt('F:/python_test/data/lenses.data')
X = D[:, 1:-1].astype('int64')
y = D[:, -1].astype('int64')
dt = DecisionTree()
dt.train(X, y)

feature_dict = {
    0:{
        'name':'age',
        'value_names':{1:'young',2:'pre-presbyopic',3:'presbyopic'}
    },
    
    1:{
       'name':'prescript',
        'value_names':{1:'myope',2:'hypermetrope'}
    },
    
    2:{
        'name':'astigmatic',
        'value_names':{1:'no',2:'yes'}
    },
    
    3:{
        'name':'tear rate',
        'value_names':{1:'reduced',2:'normal'}
    },
}

label_dict = {
    1:'hard',
    2:'soft',
    3:'no_lenses',
}

dtp=DecisionTreePlotter(dt.tree_,feature_names=feature_dict,label_names=label_dict)
dtp.plot()

Graphviz produces a .gv source file plus a rendered PDF; the .gv file can be opened with graphviz/bin/gvedit.exe and saved as a PNG.
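
Alternatively, the Python binding can write the PNG directly; a minimal sketch, where 'lenses_tree' is just a hypothetical output filename:

dtp.graph.format = 'png'
dtp.graph.render('lenses_tree')  # writes lenses_tree (the .gv source) and lenses_tree.png

The final generated decision tree is shown below.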

(Figure: the generated decision tree for the lenses dataset)

To make a decision, we simply match the attributes and their values down the tree until we reach a leaf node. ID3 is a simple and efficient algorithm.
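
For example, classifying a single hypothetical patient (the attribute coding follows the mapping above; the values are illustrative only):

import numpy as np

# age=1 (young), prescript=1 (myope), astigmatic=2 (yes), tear rate=2 (normal)
x = np.array([[1, 1, 2, 2]])
print(dt.predict(x))  # prints the predicted class: 1=hard, 2=soft, 3=no lenses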

4. A Hands-On Project

4.1 Getting the dataset: car.data

http://archive.ics.uci.edu/ml/machine-learning-databases/car/

4.2 Inspecting the data

import numpy as np
dataset = np.genfromtxt('F:/python_test/data/car.data', delimiter=',', dtype=str)
print(dataset)
print(dataset.shape)

4.3 Converting the labels to integers

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
col = dataset[:, 0]
print('First column, raw:\n', col)
le.fit(col)
print('First column, encoded:\n', le.transform(col))
print('Classes stored for the first column:\n', le.classes_)
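
Note that LabelEncoder assigns the integer codes in sorted order of the distinct strings, and the mapping can be inverted with inverse_transform:

print(le.inverse_transform([0, 1, 2]))  # recovers the original strings for codes 0-2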

4.4 Converting every column

# Encode one column and record its original value names.
def convert(col, value_name_list):
    le = LabelEncoder()
    res = le.fit_transform(col)
    value_name_list.append(le.classes_)
    return res

value_name_list = []
dataset = np.apply_along_axis(convert, axis=0, arr=dataset, value_name_list=value_name_list)
print(dataset)

print(value_name_list)

4.5 Splitting the dataset and mapping the feature and label names

X = dataset[:, :-1]
y = dataset[:, -1]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
dt = DecisionTree()
dt.train(X_train, y_train)
feature_names = ['buying','maint','doors','persons','lug_boot','safety']

# Attach the saved value names to each integer-coded feature.
feature_dict={
    i:{
        'name':v,
        'value_names':dict(enumerate(value_name_list[i]))
    }for i,v in enumerate(feature_names)
}

# The last entry of value_name_list holds the class label names.
label_dict = dict(enumerate(value_name_list[-1]))

plotter=DecisionTreePlotter(dt.tree_,feature_names=feature_dict,label_names=label_dict)
plotter.plot()

The resulting decision tree, shown below, is a very bushy one.

(Figure: the generated decision tree for the car dataset)

4.6 Measuring generalization for different test-set sizes

A single run gives a model accuracy of about 0.93:

from sklearn.metrics import accuracy_score
y_predict=dt.predict(X_test)
print(y_predict)
score=accuracy_score(y_test,y_predict)
print('accuracy_score:',score)

To observe the accuracy under different test-set sizes, we step the test fraction from 0.1 to 0.5 in increments of 0.1, run 100 trials at each setting, and average the accuracies.

The corresponding training-set fractions are 0.9, 0.8, 0.7, 0.6, and 0.5.

def test(test_size,times):
    def test_one():
        X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=test_size)
        dt=DecisionTree()
        dt.train(X_train,y_train)
        y_predict = dt.predict(X_test)
        score = accuracy_score(y_test,y_predict)
        return score
    return np.mean([test_one() for _ in range(times)])

TEST_SIZE=np.arange(0.1,0.51,0.1)
SCORE=np.array([test(test_size,100) for test_size in TEST_SIZE])
print(SCORE)

Finally, we plot the accuracy curve.

Even with a 50/50 split between training and test sets, the accuracy still reaches roughly 90%, only a slight drop.

import matplotlib.pyplot as plt

plt.scatter(TEST_SIZE, SCORE)
plt.plot(TEST_SIZE, SCORE, '--', color='red')
plt.ylim([0.75, 1.0])
plt.xlabel('test/(test+train)')
plt.ylabel('accuracy')
plt.show()

(Figure: accuracy versus test-set fraction)
