1. Building the classification tree (in essence a recursively built binary tree; the underlying theory is not covered here)
import numpy as np

class CartClassificationTree:

    class Node:
        '''Tree node class'''

        def __init__(self):
            self.value = None
            # Attributes used only by internal (non-leaf) nodes
            self.feature_index = None
            self.feature_value = None
            self.left = None
            self.right = None

        def __str__(self):
            if self.left:
                s = 'Internal node <%s>:\n' % self.feature_index
                ss = '[ >%s]-> %s' % (self.feature_value, self.left)
                s += '\t' + ss.replace('\n', '\n\t') + '\n'
                ss = '[<=%s]-> %s' % (self.feature_value, self.right)
                s += '\t' + ss.replace('\n', '\n\t')
            else:
                s = 'Leaf node (%s)' % self.value
            return s

    def __init__(self, gini_threshold=0.01, gini_dec_threshold=0., min_samples_split=2):
        '''Constructor'''
        # Gini index below which a node becomes a leaf
        self.gini_threshold = gini_threshold
        # Minimum decrease in Gini index required for a split to be kept
        self.gini_dec_threshold = gini_dec_threshold
        # Minimum number of samples a data set needs to remain splittable
        self.min_samples_split = min_samples_split

    def _gini(self, y):
        '''Compute the Gini index'''
        values = np.unique(y)
        s = 0.
        for v in values:
            y_sub = y[y == v]
            s += (y_sub.size / y.size) ** 2
        return 1 - s

    def _gini_split(self, y, feature, value):
        '''Compute the Gini index after splitting on a feature'''
        # Split the data set into two subsets by the feature value
        indices = feature > value
        y1 = y[indices]
        y2 = y[~indices]
        # Compute the Gini index of each subset
        gini1 = self._gini(y1)
        gini2 = self._gini(y2)
        # Weighted Gini index of the split:
        # gini(y, feature) = (|y1| * gini(y1) + |y2| * gini(y2)) / |y|
        gini = (y1.size * gini1 + y2.size * gini2) / y.size
        return gini

    def _get_split_points(self, feature):
        '''Get all candidate split points of a continuous feature'''
        # Collect all distinct values of the feature, sorted.
        values = np.unique(feature)
        # The split points are the midpoints of adjacent values.
        split_points = [(v1 + v2) / 2 for v1, v2 in zip(values[:-1], values[1:])]
        return split_points

    def _select_feature(self, X, y):
        '''Select the feature to split on'''
        # Index of the best splitting feature
        best_feature_index = None
        # Best split point
        best_split_value = None
        min_gini = np.inf
        _, n = X.shape
        for feature_index in range(n):
            # Iterate over every feature
            feature = X[:, feature_index]
            # Get all candidate split points of this feature
            split_points = self._get_split_points(feature)
            for value in split_points:
                # For each split point, compute the Gini index of the split data set.
                gini = self._gini_split(y, feature, value)
                # If a smaller Gini index is found, update the best feature and split point.
                if gini < min_gini:
                    min_gini = gini
                    best_feature_index = feature_index
                    best_split_value = value
        # If the decrease in Gini index does not reach the threshold, report no suitable split.
        if self._gini(y) - min_gini < self.gini_dec_threshold:
            best_feature_index = None
            best_split_value = None
        return best_feature_index, best_split_value, min_gini

    def _node_value(self, y):
        '''Compute the value of a node'''
        # Count the occurrences of each class label in the data set
        labels_count = np.bincount(y)
        # The node value is the most frequent class label.
        return np.argmax(labels_count)

    def _create_tree(self, X, y):
        '''Recursive tree-growing algorithm'''
        # Create a node
        node = self.Node()
        # Compute the node value: the majority class of y.
        node.value = self._node_value(y)
        # If the data set is smaller than min_samples_split, return a leaf.
        if y.size < self.min_samples_split:
            return node
        # If the Gini index of the data set is below gini_threshold, return a leaf.
        if self._gini(y) < self.gini_threshold:
            return node
        # Select the best splitting feature
        feature_index, feature_value, min_gini = self._select_feature(X, y)
        if feature_index is not None:
            # A suitable splitting feature exists, so this becomes an internal node.
            node.feature_index = feature_index
            node.feature_value = feature_value
            # Partition the data set into two subsets by the chosen feature and split point.
            feature = X[:, feature_index]
            indices = feature > feature_value
            X1, y1 = X[indices], y[indices]
            X2, y2 = X[~indices], y[~indices]
            # Build the left and right subtrees from the two subsets.
            node.left = self._create_tree(X1, y1)
            node.right = self._create_tree(X2, y2)
        return node

    def _predict_one(self, x_test):
        '''Predict a single sample'''
        # Walk down the tree until a leaf is reached, then return the leaf's value.
        node = self.tree_
        while node.left:
            if x_test[node.feature_index] > node.feature_value:
                node = node.left
            else:
                node = node.right
        return node.value

    def train(self, X_train, y_train):
        '''Train the decision tree'''
        self.tree_ = self._create_tree(X_train, y_train)

    def predict(self, X_test):
        '''Predict the test set'''
        # Apply _predict_one to every test sample and return the collected results.
        return np.apply_along_axis(self._predict_one, axis=1, arr=X_test)
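As a quick smoke test (not part of the original post), the class can be exercised on a tiny hand-made data set; the toy arrays below are illustrative only. Column 0 separates the two classes cleanly, so the tree should split on it once:

import numpy as np

# Toy data: one informative feature (column 0), one noisy feature (column 1).
X_toy = np.array([[1.0, 5.0], [1.2, 3.0], [0.9, 4.0],
                  [3.1, 5.0], [3.3, 3.5], [2.9, 4.2]])
y_toy = np.array([0, 0, 0, 1, 1, 1])

cct = CartClassificationTree()
cct.train(X_toy, y_toy)
print(cct.tree_)  # renders the tree via Node.__str__
print(cct.predict(np.array([[1.1, 4.0], [3.0, 4.0]])))  # expected: [0 1]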
2. Classification tree in practice
2.1 Getting the data set (the classic iris data set)
http://archive.ics.uci.edu/ml/machine-learning-databases/iris/
Description:
Attribute Information:
1. sepal length in cm
2. sepal width in cm
3. petal length in cm
4. petal width in cm
5. class:
-- Iris Setosa
-- Iris Versicolour
-- Iris Virginica
2.2 Loading the data
import numpy as np

# The first four columns are the features; the fifth is the class label.
X = np.genfromtxt('F:/python_test/data/iris.data', delimiter=',', usecols=range(4), dtype=float)
print(X)
y = np.genfromtxt('F:/python_test/data/iris.data', delimiter=',', usecols=4, dtype=str)
print(y)
2.3 Encoding the class labels
from sklearn.preprocessing import LabelEncoder

# Map the string labels to the integers 0..2.
le = LabelEncoder()
y = le.fit_transform(y)
print('y after encoding:\n', y)
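LabelEncoder assigns integer codes in sorted order of the class names; classes_ and inverse_transform recover the mapping, which is useful when reading predictions later:

print(le.classes_)
# ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica'] -> codes 0, 1, 2
print(le.inverse_transform([0, 1, 2]))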
2.4 Training the model and computing its accuracy; the classification accuracy reached 0.9777777777777777
cct = CartClassificationTree()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

cct.train(X_train, y_train)

from sklearn.metrics import accuracy_score
y_pred = cct.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)
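Because Node.__str__ renders a node's subtree recursively, the fitted tree can also be printed directly to inspect which features and thresholds were chosen:

print(cct.tree_)
# Feature indices 0-3 correspond to sepal length/width and petal length/width.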
2.5 Varying the test-set size: no matter how large the test split is made, the final accuracy stays at roughly 94%
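The code that produced the TEST_SIZE and SCORE arrays plotted below is not shown; a minimal reconstruction, assuming one train/test split per ratio, might look like this:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hypothetical reconstruction: measure accuracy over a range of test-set ratios.
TEST_SIZE = np.linspace(0.1, 0.5, 9)
SCORE = []
for size in TEST_SIZE:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size)
    cct = CartClassificationTree()
    cct.train(X_train, y_train)
    SCORE.append(accuracy_score(y_test, cct.predict(X_test)))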
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

plt.scatter(TEST_SIZE, SCORE)
plt.plot(TEST_SIZE, SCORE, '--', color='red')
plt.ylim([0.90, 1.0])
plt.xlabel('test/(test+train)')
plt.ylabel('accuracy')
plt.show()
3. Regression tree in practice
3.1 Regression tree code (a recursively built binary tree, using the CART algorithm)
import numpy as np

class CartRegressionTree:

    class Node:
        '''Tree node class'''

        def __init__(self):
            self.value = None
            # Attributes used only by internal (non-leaf) nodes
            self.feature_index = None
            self.feature_value = None
            self.left = None
            self.right = None

        def __str__(self):
            if self.left:
                s = 'Internal node <%s>:\n' % self.feature_index
                ss = '[ >%s]-> %s' % (self.feature_value, self.left)
                s += '\t' + ss.replace('\n', '\n\t') + '\n'
                ss = '[<=%s]-> %s' % (self.feature_value, self.right)
                s += '\t' + ss.replace('\n', '\n\t')
            else:
                s = 'Leaf node (%s)' % self.value
            return s

    def __init__(self, mse_threshold=0.01, mse_dec_threshold=0., min_samples_split=2):
        '''Constructor'''
        # MSE below which a node becomes a leaf
        self.mse_threshold = mse_threshold
        # Minimum decrease in MSE required for a split to be kept
        self.mse_dec_threshold = mse_dec_threshold
        # Minimum number of samples a data set needs to remain splittable
        self.min_samples_split = min_samples_split

    def _mse(self, y):
        '''Compute the MSE'''
        # The estimate is the mean of y, so the mean squared error equals the variance.
        return np.var(y)

    def _mse_split(self, y, feature, value):
        '''Compute the MSE after splitting on a feature'''
        # Split the data set into two subsets by the feature value
        indices = feature > value
        y1 = y[indices]
        y2 = y[~indices]
        # Compute the mean squared error of each subset
        mse1 = self._mse(y1)
        mse2 = self._mse(y2)
        # Weighted MSE of the split
        return (y1.size * mse1 + y2.size * mse2) / y.size

    def _get_split_points(self, feature):
        '''Get all candidate split points of a continuous feature'''
        # Collect all distinct values of the feature, sorted.
        values = np.unique(feature)
        # The split points are the midpoints of adjacent values.
        split_points = [(v1 + v2) / 2 for v1, v2 in zip(values[:-1], values[1:])]
        return split_points

    def _select_feature(self, X, y):
        '''Select the feature to split on'''
        # Index of the best splitting feature
        best_feature_index = None
        # Best split point
        best_split_value = None
        min_mse = np.inf
        _, n = X.shape
        for feature_index in range(n):
            # Iterate over every feature
            feature = X[:, feature_index]
            # Get all candidate split points of this feature
            split_points = self._get_split_points(feature)
            for value in split_points:
                # For each split point, compute the MSE of the split data set.
                mse = self._mse_split(y, feature, value)
                # If a smaller MSE is found, update the best feature and split point.
                if mse < min_mse:
                    min_mse = mse
                    best_feature_index = feature_index
                    best_split_value = value
        # If the decrease in MSE does not reach the threshold, report no suitable split.
        if self._mse(y) - min_mse < self.mse_dec_threshold:
            best_feature_index = None
            best_split_value = None
        return best_feature_index, best_split_value, min_mse

    def _node_value(self, y):
        '''Compute the value of a node'''
        # The node value is the mean of the samples.
        return np.mean(y)

    def _create_tree(self, X, y):
        '''Recursive tree-growing algorithm'''
        # Create a node
        node = self.Node()
        # Compute the node value: the mean of y.
        node.value = self._node_value(y)
        # If the data set is smaller than min_samples_split, return a leaf.
        if y.size < self.min_samples_split:
            return node
        # If the MSE of the data set is below mse_threshold, return a leaf.
        if self._mse(y) < self.mse_threshold:
            return node
        # Select the best splitting feature
        feature_index, feature_value, min_mse = self._select_feature(X, y)
        if feature_index is not None:
            # A suitable splitting feature exists, so this becomes an internal node.
            node.feature_index = feature_index
            node.feature_value = feature_value
            # Partition the data set into two subsets by the chosen feature and split point.
            feature = X[:, feature_index]
            indices = feature > feature_value
            X1, y1 = X[indices], y[indices]
            X2, y2 = X[~indices], y[~indices]
            # Build the left and right subtrees from the two subsets.
            node.left = self._create_tree(X1, y1)
            node.right = self._create_tree(X2, y2)
        return node

    def _predict_one(self, x_test):
        '''Predict a single sample'''
        # Walk down the tree until a leaf is reached, then return the leaf's value.
        node = self.tree_
        while node.left:
            if x_test[node.feature_index] > node.feature_value:
                node = node.left
            else:
                node = node.right
        return node.value

    def train(self, X_train, y_train):
        '''Train the decision tree'''
        self.tree_ = self._create_tree(X_train, y_train)

    def predict(self, X_test):
        '''Predict the test set'''
        # Apply _predict_one to every test sample and return the collected results.
        return np.apply_along_axis(self._predict_one, axis=1, arr=X_test)
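Again as a quick smoke test (the toy data below is illustrative, not from the original post), the tree is fitted to a noisy step function; each half of the data is nearly constant, so a single split at the boundary between 3.0 and 10.0 should suffice:

import numpy as np

# Toy data: y is roughly 1.0 for small x and roughly 5.0 for large x.
X_toy = np.array([[1.0], [2.0], [3.0], [10.0], [11.0], [12.0]])
y_toy = np.array([1.0, 1.1, 0.9, 5.0, 5.1, 4.9])

crt = CartRegressionTree()
crt.train(X_toy, y_toy)
print(crt.tree_)
print(crt.predict(np.array([[2.5], [11.5]])))  # roughly [1.0, 5.0]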
3.2 Getting the data set
http://archive.ics.uci.edu/ml/machine-learning-databases/housing/
3.3 Loading the data set
import numpy as np

# The file is whitespace-delimited, which genfromtxt handles by default.
dataset = np.genfromtxt('F:/python_test/data/housing.data', dtype=float)
print(dataset)
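A quick shape check is worthwhile before splitting: the Boston housing file should contain 506 rows, each with 13 features plus the MEDV target (the median home price in units of $1000):

print(dataset.shape)  # expected: (506, 14)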
3.4 Splitting the data, training the model, and predicting
X = dataset[:, :-1]
y = dataset[:, -1]

crt = CartRegressionTree()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

crt.train(X_train, y_train)
y_predict = crt.predict(X_test)
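A quick spot check of a few predictions against the actual prices (the exact values will vary with the random split):

for pred, actual in zip(y_predict[:5], y_test[:5]):
    print('predicted: %.1f, actual: %.1f' % (pred, actual))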
3.5 Quantifying the prediction error. MAE is actually the better metric to read here: since prices are in units of $1000, it shows that the price predictions are off by roughly $3,036 on average
from sklearn.metrics import mean_squared_error, mean_absolute_error

mse = mean_squared_error(y_test, y_predict)
mae = mean_absolute_error(y_test, y_predict)
print('MSE:', mse)
print('MAE:', mae)
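One caveat when comparing the two metrics: MSE is in squared units, so its square root (the RMSE) is what lives on the same scale as MAE. A one-line sketch:

import numpy as np

rmse = np.sqrt(mse)
print('RMSE:', rmse)  # same units as the target, directly comparable to MAE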
3.6 Comparing sklearn's LinearRegression and DecisionTreeRegressor against our CartRegressionTree: the runs are divided into 5 groups (5 values of test_size), each group is repeated 10 times, and the resulting MAE and MSE values for the three models are plotted
We find that our regression tree matches sklearn's decision tree regressor in accuracy, and both outperform LinearRegression. In the MSE test our implementation even comes out slightly ahead of the other two; linear regression is the worst of the three.
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

dataset = np.genfromtxt('F:/python_test/data/housing.data', dtype=float)
X = dataset[:, :-1]
y = dataset[:, -1]

# 3-D arrays to store the errors: (test_size, run, model).
mae_array = np.empty((5, 10, 3))
mse_array = np.empty((5, 10, 3))

# Five test_size values from 0.1 to 0.5 inclusive, step 0.1.
test_size = np.linspace(0.1, 0.5, 5)
for i, size in enumerate(test_size):
    for j in range(10):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size)

        crt = CartRegressionTree()
        crt.train(X_train, y_train)
        y_pred = crt.predict(X_test)
        mae_array[i, j, 0] = mean_absolute_error(y_test, y_pred)
        mse_array[i, j, 0] = mean_squared_error(y_test, y_pred)

        dtr = DecisionTreeRegressor()
        dtr.fit(X_train, y_train)
        y_pred = dtr.predict(X_test)
        mae_array[i, j, 1] = mean_absolute_error(y_test, y_pred)
        mse_array[i, j, 1] = mean_squared_error(y_test, y_pred)

        lr = LinearRegression()
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)
        mae_array[i, j, 2] = mean_absolute_error(y_test, y_pred)
        mse_array[i, j, 2] = mean_squared_error(y_test, y_pred)

# Average over the 10 runs, giving a 5x3 matrix; transpose so each row is one model.
Y = mae_array.mean(axis=1).T
plt.plot(test_size, Y[0], 'o:', label='CartRegressionTree')
plt.plot(test_size, Y[1], '^:', label='DecisionTreeRegressor')
plt.plot(test_size, Y[2], 's:', label='LinearRegression')
plt.xlabel('test_size')
plt.ylabel('MAE')
plt.xticks(test_size)
plt.ylim([0.0, 6.0])
plt.yticks(np.arange(0.0, 6.1, 1.0))
plt.grid(linestyle='--')
plt.legend()
plt.show()
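Before drawing conclusions from the mean curves, it can help to check how much the errors fluctuate across the 10 runs; a small addition (not in the original) prints the per-model standard deviations:

mae_std = mae_array.std(axis=1).T  # 3 x 5: per-model spread across the 10 runs
for name, std in zip(['CartRegressionTree', 'DecisionTreeRegressor', 'LinearRegression'], mae_std):
    print(name, std.round(2))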
The same plot for MSE:

Y = mse_array.mean(axis=1).T
plt.plot(test_size, Y[0], 'o:', label='CartRegressionTree')
plt.plot(test_size, Y[1], '^:', label='DecisionTreeRegressor')
plt.plot(test_size, Y[2], 's:', label='LinearRegression')
plt.xlabel('test_size')
plt.ylabel('MSE')
plt.xticks(test_size)
# plt.ylim([0.0, 6.0])
# plt.yticks(np.arange(0.0, 6.1, 1.0))
plt.grid(linestyle='--')
plt.legend()
plt.show()