Machine Learning: Decision Trees with the CART Algorithm (Classification and Regression Trees)


1. Building the Classification Tree (a recursively constructed binary tree; the underlying theory is not covered here)

import numpy as np

class CartClassificationTree:
    class Node:
        '''Tree node class'''

        def __init__(self):
            self.value = None

            # Internal (split) node attributes
            self.feature_index = None
            self.feature_value = None
            self.left = None
            self.right = None

        def __str__(self):
            if self.left:
                s = 'Internal node <%s>:\n' % self.feature_index
                ss = '[ >%s]-> %s' % (self.feature_value, self.left)
                s += '\t' + ss.replace('\n', '\n\t') + '\n'
                ss = '[<=%s]-> %s' % (self.feature_value, self.right)
                s += '\t' + ss.replace('\n', '\n\t')
            else:
                s = 'Leaf node (%s)' % self.value
            return s

    def __init__(self, gini_threshold=0.01, gini_dec_threshold=0., min_samples_split=2):
        '''Constructor'''
        # Gini index threshold: stop splitting when a node's Gini index falls below this
        self.gini_threshold = gini_threshold
        # Minimum decrease in the Gini index required to accept a split
        self.gini_dec_threshold = gini_dec_threshold
        # Minimum number of samples required to keep splitting a node
        self.min_samples_split = min_samples_split

    def _gini(self, y):
        '''Compute the Gini index of a label set: gini(y) = 1 - sum_k p_k^2'''
        values = np.unique(y)

        s = 0.
        for v in values:
            y_sub = y[y == v]
            s += (y_sub.size / y.size) ** 2

        return 1 - s

    def _gini_split(self, y, feature, value):
        '''Compute the Gini index after splitting on a feature'''
        # Split the dataset into two subsets based on the feature value
        indices = feature > value
        y1 = y[indices]
        y2 = y[~indices]

        # Compute the Gini index of each subset
        gini1 = self._gini(y1)
        gini2 = self._gini(y2)

        # Compute the weighted Gini index after the split
        # gini(y, feature) = (|y1| * gini(y1) + |y2| * gini(y2)) / |y|
        gini = (y1.size * gini1 + y2.size * gini2) / y.size

        return gini

    def _get_split_points(self, feature):
        '''Get all candidate split points of a continuous feature'''
        # Get all distinct values of the feature, sorted.
        values = np.unique(feature)
        # Split points are the midpoints of adjacent values.
        split_points = [(v1 + v2) / 2 for v1, v2 in zip(values[:-1], values[1:])]

        return split_points

    def _select_feature(self, X, y):
        '''Select the split feature'''
        # Index of the best split feature
        best_feature_index = None
        # Best split point
        best_split_value = None

        min_gini = np.inf
        _, n = X.shape
        for feature_index in range(n):
            # Iterate over each feature
            feature = X[:, feature_index]
            # Get all candidate split points for this feature
            split_points = self._get_split_points(feature)
            for value in split_points:
                # For each candidate split value, compute the Gini index of the dataset after splitting on it.
                gini = self._gini_split(y, feature, value)
                # If a smaller Gini index is found, update the best split feature and split value.
                if gini < min_gini:
                    min_gini = gini 
                    best_feature_index = feature_index
                    best_split_value = value

        # If the decrease in the Gini index does not exceed the threshold, reject the split.
        if self._gini(y) - min_gini < self.gini_dec_threshold:
            best_feature_index = None
            best_split_value = None

        return best_feature_index, best_split_value, min_gini

    def _node_value(self, y):
        '''Compute the node value'''
        # Count how many samples of each class label appear in the dataset.
        labels_count = np.bincount(y)
        # The node value is the most frequent class label.
        return np.argmax(labels_count)

    def _create_tree(self, X, y):
        '''Recursively build the tree'''
        # Create a node
        node = self.Node()
        # Compute the node value (the majority class of y).
        node.value = self._node_value(y)

        # If the number of samples is below min_samples_split, return a leaf node.
        if y.size < self.min_samples_split:
            return node

        # If the Gini index of the current dataset is below gini_threshold, return a leaf node.
        if self._gini(y) < self.gini_threshold:
            return node

        # Select the best split feature
        feature_index, feature_value, min_gini = self._select_feature(X, y)
        if feature_index is not None:
            # If a suitable split feature exists, the current node becomes an internal node.
            node.feature_index = feature_index
            node.feature_value = feature_value

            # Split the dataset into two subsets using the selected feature and split point.
            feature = X[:, feature_index]
            indices = feature > feature_value
            X1, y1 = X[indices], y[indices]
            X2, y2 = X[~indices], y[~indices]

            # Recursively build the left and right subtrees from the two subsets.
            node.left = self._create_tree(X1, y1)
            node.right = self._create_tree(X2, y2)

        return node

    def _predict_one(self, x_test):
        '''Predict a single sample'''
        # Walk down the tree until a leaf node is reached, then return its value.
        node = self.tree_
        while node.left:
            if x_test[node.feature_index] > node.feature_value:
                node = node.left
            else:
                node = node.right

        return node.value

    def train(self, X_train, y_train):
        '''Train the decision tree'''
        self.tree_ = self._create_tree(X_train, y_train)

    def predict(self, X_test):
        '''Predict on the test set'''
        # Call _predict_one on each test sample and return the collected results as an array.
        return np.apply_along_axis(self._predict_one, axis=1, arr=X_test)
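
Before moving to the Iris experiment, the class can be smoke-tested on a tiny hand-made dataset. This is only an illustrative sketch and assumes the CartClassificationTree definition above has been run; the toy data and expected output are not from the original post.

import numpy as np

# Toy dataset: column 0 separates the two classes cleanly.
X_toy = np.array([[1.0, 0.2],
                  [1.2, 0.1],
                  [0.9, 0.3],
                  [3.0, 0.2],
                  [3.2, 0.1],
                  [2.9, 0.3]])
y_toy = np.array([0, 0, 0, 1, 1, 1])

toy_tree = CartClassificationTree()
toy_tree.train(X_toy, y_toy)
print(toy_tree.tree_)                                        # prints the tree via Node.__str__
print(toy_tree.predict(np.array([[1.1, 0.2], [3.1, 0.2]])))  # should give [0 1]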

2. Classification Tree in Practice

2.1 Obtaining the dataset (the classic Iris dataset)

http://archive.ics.uci.edu/ml/machine-learning-databases/iris/

Description:

Attribute Information:
1. sepal length in cm
2. sepal width in cm
3. petal length in cm
4. petal width in cm
5. class:
-- Iris Setosa
-- Iris Versicolour
-- Iris Virginica
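
Each record in iris.data is a comma-separated line with the four numeric features followed by the class string, e.g. a line of the form

5.1,3.5,1.4,0.2,Iris-setosa

which is why the loading code below reads columns 0-3 as floats and column 4 as a string.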

2.2 Loading the data

import numpy as np
X=np.genfromtxt('F:/python_test/data/iris.data',delimiter=',',usecols=range(4),dtype=float)
print(X)
y=np.genfromtxt('F:/python_test/data/iris.data',delimiter=',',usecols=4,dtype=str)
print(y)
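
A quick shape check (not in the original post) helps confirm the file parsed correctly; the standard UCI file contains 150 samples:

print(X.shape, y.shape)   # expected: (150, 4) (150,)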

 

2.3 Transforming the class labels

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y=le.fit_transform(y)
print('y after transformation:\n',y)
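
LabelEncoder assigns integer codes in sorted order of the class names, which can be inspected through its classes_ attribute (an optional check, not in the original post):

print(le.classes_)   # typically ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica'] -> codes 0, 1, 2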

 

2.4 Training the model and computing accuracy: the classification accuracy reached 0.9777777777777777

 

cct = CartClassificationTree()
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
cct.train(X_train,y_train)
from sklearn.metrics import accuracy_score
y_pred=cct.predict(X_test)
score=accuracy_score(y_test,y_pred)
print(score)
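
Since Node defines __str__, the learned tree can also be printed for inspection:

print(cct.tree_)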

2.5 Varying the test set size: no matter how large the test split is, the final accuracy stays around 94%
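
The original post does not show the loop that produces the TEST_SIZE and SCORE values used by the plot below; a minimal sketch, assuming the X, y and CartClassificationTree objects defined above (the 0.1-0.5 range is an assumption), might look like this:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

TEST_SIZE = np.linspace(0.1, 0.5, 5)   # candidate test-set fractions (assumed range)
SCORE = []
for size in TEST_SIZE:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size)
    cct = CartClassificationTree()
    cct.train(X_train, y_train)
    SCORE.append(accuracy_score(y_test, cct.predict(X_test)))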

import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
plt.scatter(TEST_SIZE,SCORE)
plt.plot(TEST_SIZE,SCORE,'--',color='red')
plt.ylim([0.90,1.0])
plt.xlabel('test/(test+train)')
plt.ylabel('accuracy')
plt.show()

 

3. Regression Tree in Practice

3.1 Regression tree code (a binary tree built recursively with the CART algorithm)

import numpy as np

class CartRegressionTree:
    class Node:
        '''Tree node class'''

        def __init__(self):
            self.value = None

            # Internal (split) node attributes
            self.feature_index = None
            self.feature_value = None
            self.left = None
            self.right = None

        def __str__(self):
            if self.left:
                s = 'Internal node <%s>:\n' % self.feature_index
                ss = '[ >%s]-> %s' % (self.feature_value, self.left)
                s += '\t' + ss.replace('\n', '\n\t') + '\n'
                ss = '[<=%s]-> %s' % (self.feature_value, self.right)
                s += '\t' + ss.replace('\n', '\n\t')
            else:
                s = 'Leaf node (%s)' % self.value
            return s

    def __init__(self, mse_threshold=0.01, mse_dec_threshold=0., min_samples_split=2):
        '''Constructor'''
        # MSE threshold: stop splitting when a node's MSE falls below this
        self.mse_threshold = mse_threshold
        # Minimum decrease in MSE required to accept a split
        self.mse_dec_threshold = mse_dec_threshold
        # Minimum number of samples required to keep splitting a node
        self.min_samples_split = min_samples_split

    def _mse(self, y):
        '''Compute the MSE'''
        # The predicted value is the mean of y, so the mean squared error is simply the variance.
        return np.var(y)

    def _mse_split(self, y, feature, value):
        '''Compute the MSE after splitting on a feature'''
        # Split the dataset into two subsets based on the feature value
        indices = feature > value
        y1 = y[indices]
        y2 = y[~indices]

        # Compute the MSE of each subset
        mse1 = self._mse(y1)
        mse2 = self._mse(y2)

        # Compute the weighted MSE after the split
        return (y1.size * mse1 + y2.size * mse2) / y.size

    def _get_split_points(self, feature):
        '''Get all candidate split points of a continuous feature'''
        # Get all distinct values of the feature, sorted.
        values = np.unique(feature)
        # Split points are the midpoints of adjacent values.
        split_points = [(v1 + v2) / 2 for v1, v2 in zip(values[:-1], values[1:])]

        return split_points

    def _select_feature(self, X, y):
        '''Select the split feature'''
        # Index of the best split feature
        best_feature_index = None
        # Best split point
        best_split_value = None

        min_mse = np.inf
        _, n = X.shape
        for feature_index in range(n):
            # Iterate over each feature
            feature = X[:, feature_index]
            # Get all candidate split points for this feature
            split_points = self._get_split_points(feature)
            for value in split_points:
                # For each candidate split value, compute the MSE of the dataset after splitting on it.
                mse = self._mse_split(y, feature, value)
                # If a smaller MSE is found, update the best split feature and split value.
                if mse < min_mse:
                    min_mse = mse 
                    best_feature_index = feature_index
                    best_split_value = value

        # If the decrease in MSE does not exceed the threshold, no suitable split feature is found.
        if self._mse(y) - min_mse < self.mse_dec_threshold:
            best_feature_index = None
            best_split_value = None

        return best_feature_index, best_split_value, min_mse

    def _node_value(self, y):
        '''Compute the node value'''
        # The node value is the mean of the target values in the dataset
        return np.mean(y)

    def _create_tree(self, X, y):
        '''Recursively build the tree'''
        # Create a node
        node = self.Node()
        # Compute the node value (the mean of y).
        node.value = self._node_value(y)

        # If the number of samples is below min_samples_split, return a leaf node.
        if y.size < self.min_samples_split:
            return node

        # If the MSE of the current dataset is below mse_threshold, return a leaf node.
        if self._mse(y) < self.mse_threshold:
            return node

        # Select the best split feature
        feature_index, feature_value, min_mse = self._select_feature(X, y)
        if feature_index is not None:
            # If a suitable split feature exists, the current node becomes an internal node.
            node.feature_index = feature_index
            node.feature_value = feature_value

            # Split the dataset into two subsets using the selected feature and split point.
            feature = X[:, feature_index]
            indices = feature > feature_value
            X1, y1 = X[indices], y[indices]
            X2, y2 = X[~indices], y[~indices]

            # Recursively build the left and right subtrees from the two subsets.
            node.left = self._create_tree(X1, y1)
            node.right = self._create_tree(X2, y2)

        return node

    def _predict_one(self, x_test):
        '''Predict a single sample'''
        # Walk down the tree until a leaf node is reached, then return its value.
        node = self.tree_
        while node.left:
            if x_test[node.feature_index] > node.feature_value:
                node = node.left
            else:
                node = node.right

        return node.value

    def train(self, X_train, y_train):
        '''Train the decision tree'''
        self.tree_ = self._create_tree(X_train, y_train)

    def predict(self, X_test):
        '''Predict on the test set'''
        # Call _predict_one on each test sample and return the collected results as an array.
        return np.apply_along_axis(self._predict_one, axis=1, arr=X_test)
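
As with the classification tree, a tiny synthetic check (illustrative only; the toy data is not from the original post) confirms that the regression tree recovers a simple piecewise-constant target:

import numpy as np

X_toy = np.arange(10, dtype=float).reshape(-1, 1)   # a single feature, values 0..9
y_toy = np.where(X_toy.ravel() < 5, 1.0, 10.0)      # step-function target

toy_crt = CartRegressionTree()
toy_crt.train(X_toy, y_toy)
print(toy_crt.predict(np.array([[2.0], [8.0]])))    # should give approximately [ 1. 10.]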

3.2 Obtaining the dataset

http://archive.ics.uci.edu/ml/machine-learning-databases/housing/

3.3 Loading the dataset

import numpy as np
dataset=np.genfromtxt('F:/python_test/data/housing.data',dtype=float)
print(dataset)
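
housing.data is whitespace-delimited with 13 feature columns plus the target MEDV (median home value in units of $1,000), so a shape check is a useful sanity test; the standard UCI file has 506 rows:

print(dataset.shape)   # expected: (506, 14)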

 

3.4 Splitting the dataset, training the model, and making predictions

X=dataset[:,:-1]
y=dataset[:,-1]
crt=CartRegressionTree()
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
crt.train(X_train,y_train)
y_predict = crt.predict(X_test)
print(y_predict)

 

3.5 Quantifying the model's prediction error. MAE is actually the better metric to look at here: since the target (MEDV) is in units of $1,000, the MAE below means the price predictions are off by roughly $3,036 on average

from sklearn.metrics import mean_squared_error,mean_absolute_error
mse=mean_squared_error(y_test,y_predict)
mae=mean_absolute_error(y_test,y_predict)
print('MSE:',mse)
print('MAE:',mae)

 

3.6 Comparing sklearn's linear regression and decision tree regressor with our CartRegressionTree: the evaluation uses 5 groups (5 values of test_size), each repeated ten times, and the resulting MAE and MSE values for the three models are plotted below

We can see that our decision tree regressor and sklearn's DecisionTreeRegressor reach essentially the same accuracy, and both outperform LinearRegression. Looking at the MSE results, our implementation even comes out slightly ahead of the other two, while linear regression gives the worst quality of the three.

 

import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
dataset=np.genfromtxt('F:/python_test/data/housing.data',dtype=float)
X=dataset[:,:-1]
y=dataset[:,-1]

# Use 3-D arrays indexed by (test_size, run, model) to store the errors
mae_array=np.empty((5,10,3))
mse_array=np.empty((5,10,3))
# Generate five test_size values from 0.1 to 0.5 (inclusive), step 0.1
test_size=np.linspace(0.1,0.5,5)
for i,size in enumerate(test_size):
    for j in range(10):
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=size)
        
        crt = CartRegressionTree()
        crt.train(X_train,y_train)
        y_pred=crt.predict(X_test)
        mae_array[i,j,0] = mean_absolute_error(y_test,y_pred)
        mse_array[i,j,0] = mean_squared_error(y_test,y_pred)
        
        dtr = DecisionTreeRegressor()
        dtr.fit(X_train,y_train)
        y_pred=dtr.predict(X_test)
        mae_array[i,j,1] = mean_absolute_error(y_test,y_pred)
        mse_array[i,j,1] = mean_squared_error(y_test,y_pred)
        
        lr=LinearRegression()
        lr.fit(X_train,y_train)
        y_pred=lr.predict(X_test)
        mae_array[i,j,2] = mean_absolute_error(y_test,y_pred)
        mse_array[i,j,2] = mean_squared_error(y_test,y_pred)

# Average over the 10 runs to get a 5x3 matrix, then transpose so each row corresponds to one model
Y=mae_array.mean(axis=1).T
plt.plot(test_size,Y[0],'o:',label='CartRegressionTree')
plt.plot(test_size,Y[1],'^:',label='DecisionTreeRegression')
plt.plot(test_size,Y[2],'s:',label='LinearRegression')
plt.xlabel('test_size')
plt.ylabel('MAE')
plt.xticks(test_size)
plt.ylim([0.0,6.0])
plt.yticks(np.arange(0.0,6.1,1.0))
plt.grid(linestyle='--')
plt.legend()
plt.show()

The corresponding comparison of the averaged MSE values:

Y=mse_array.mean(axis=1).T
plt.plot(test_size,Y[0],'o:',label='CartRegressionTree')
plt.plot(test_size,Y[1],'^:',label='DecisionTreeRegression')
plt.plot(test_size,Y[2],'s:',label='LinearRegression')
plt.xlabel('test_size')
plt.ylabel('MSE')
plt.xticks(test_size)
# plt.ylim([0.0,6.0])
# plt.yticks(np.arange(0.0,6.1,1.0))
plt.grid(linestyle='--')
plt.legend()
plt.show()
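
To see the numbers behind the two plots, the averaged arrays can also be printed directly (rows correspond to CartRegressionTree, DecisionTreeRegressor and LinearRegression; columns to the five test_size values):

print('mean MAE per model:\n', np.round(mae_array.mean(axis=1).T, 3))
print('mean MSE per model:\n', np.round(mse_array.mean(axis=1).T, 3))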

 

