Assignment: Given the training set and test set below, and with reference to Chapter 3 of Machine Learning (Tom Mitchell) and Chapter 4 of Machine Learning (Zhou Zhihua), first read up on the ID3, C4.5, and CART algorithms and study the ID3/C4.5 Python programs provided in the attachment. Then implement a Python program that constructs a CART decision tree, selecting the optimal splitting attribute (feature) by the Gini index. Submit a lab report containing the complete Python implementation and the experimental results.
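For reference, the splitting criterion follows Chapter 4 of Zhou's book: for a data set $D$ with class proportions $p_k$ and a discrete attribute $a$ taking values $\{a^1,\dots,a^V\}$,

$$\mathrm{Gini}(D) = 1 - \sum_{k=1}^{|\mathcal{Y}|} p_k^{2}, \qquad \mathrm{Gini\_index}(D,a) = \sum_{v=1}^{V} \frac{|D^{v}|}{|D|}\,\mathrm{Gini}(D^{v}),$$

and CART splits on the attribute $a_* = \arg\min_a \mathrm{Gini\_index}(D,a)$.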
Training set:
outlook     temperature  humidity  windy  class
-----------------------------------------------
sunny       hot          high      false  N
sunny       hot          high      true   N
overcast    hot          high      false  Y
rain        mild         high      false  Y
rain        cool         normal    false  Y
rain        cool         normal    true   N
overcast    cool         normal    true   Y
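As a quick sanity check for the program below, the Gini value of the whole training set (4 samples labeled Y, 3 labeled N, out of 7) works out to

$$\mathrm{Gini}(D) = 1 - \left(\tfrac{4}{7}\right)^{2} - \left(\tfrac{3}{7}\right)^{2} = \tfrac{24}{49} \approx 0.490,$$

which is the value calcGiniIndex should return on the full data set.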
Test set:
outlook     temperature  humidity  windy
-----------------------------------------
sunny       mild         high      false
sunny       cool         normal    false
rain        mild         normal    false
sunny       mild         normal    true
overcast    mild         high      true
overcast    hot          normal    false
rain        mild         high      true
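To see why outlook ends up at the root, the weighted Gini index of each candidate split on the training set can be computed by hand:

$$\mathrm{Gini\_index}(D,\text{outlook}) = \tfrac{2}{7}\cdot 0 + \tfrac{2}{7}\cdot 0 + \tfrac{3}{7}\cdot\tfrac{4}{9} = \tfrac{4}{21} \approx 0.190,$$

against roughly 0.381 for temperature, 0.476 for humidity, and 0.405 for windy; outlook gives the smallest value and is therefore chosen first.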
Python programs

1. CART.py

# -*- coding: utf-8 -*-
## References: Machine Learning (Tom M. Mitchell), Chapter 3, Decision Tree Learning;
## Machine Learning (Zhou Zhihua), Chapter 4, Decision Trees

import operator

import treePlotter


def calcGiniIndex(dataSet):
    """
    Input: data set
    Output: Gini value of the data set
    Description: computes Gini(D) = 1 - sum_k p_k^2, written equivalently
                 as sum_k p_k * (1 - p_k)
    """
    numEntries = len(dataSet)               # number of samples
    labelCounts = {}                        # occurrences of each class label
    for featVec in dataSet:
        currentLabel = featVec[-1]          # the label is the last column
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    gini = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries  # p_k
        gini += prob * (1.0 - prob)
    return gini


def splitDataSet(dataSet, axis, value):
    """
    Input: data set, feature index (axis), feature value
    Output: the subset of samples whose feature `axis` equals `value`,
            with that feature column removed
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]            # features 0 .. axis-1
            reducedFeatVec.extend(featVec[axis + 1:])  # features axis+1 .. end
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """
    Input: data set
    Output: index of the best feature to split on
    Description: chooses the feature whose split minimizes the weighted
                 Gini index
    """
    numFeatures = len(dataSet[0]) - 1
    bestGini = calcGiniIndex(dataSet)   # Gini of the unsplit data set
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]  # column i
        uniqueVals = set(featList)                      # distinct values of feature i
        newGini = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))  # weight |D^v| / |D|
            newGini += prob * calcGiniIndex(subDataSet)
        if newGini < bestGini:
            bestGini = newGini
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """
    Input: list of class labels
    Output: the majority class
    Description: when every feature has been used but the labels are still
                 mixed, the leaf's class is decided by majority vote
    """
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """
    Input: data set, feature labels
    Output: decision tree as a nested dict
    Description: recursively builds the CART tree
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]             # all samples share one class: stop splitting
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)   # no features left: majority vote
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    # all values taken by the chosen feature at this node
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]           # copy, so recursion does not clobber siblings
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def classify(inputTree, featLabels, testVec):
    """
    Input: decision tree, feature labels, one test vector
    Output: predicted class (None if a feature value was never seen in training)
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel


def classifyAll(inputTree, featLabels, testDataSet):
    """
    Input: decision tree, feature labels, test data set
    Output: list of predicted classes
    """
    classLabelAll = []
    for testVec in testDataSet:
        classLabelAll.append(classify(inputTree, featLabels, testVec))
    return classLabelAll


def storeTree(inputTree, filename):
    """Save the decision tree to a file with pickle."""
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load a decision tree previously saved with storeTree."""
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)


def createDataSet():
    """
    outlook->     0: sunny | 1: overcast | 2: rain
    temperature-> 0: hot   | 1: mild     | 2: cool
    humidity->    0: high  | 1: normal
    windy->       0: false | 1: true
    """
    dataSet = [[0, 0, 0, 0, 'N'],
               [0, 0, 0, 1, 'N'],
               [1, 0, 0, 0, 'Y'],
               [2, 1, 0, 0, 'Y'],
               [2, 2, 1, 0, 'Y'],
               [2, 2, 1, 1, 'N'],
               [1, 2, 1, 1, 'Y']]
    labels = ['outlook', 'temperature', 'humidity', 'windy']
    return dataSet, labels


def createTestSet():
    """Encoded with the same scheme as createDataSet, without class labels."""
    testSet = [[0, 1, 0, 0],
               [0, 2, 1, 0],
               [2, 1, 1, 0],
               [0, 1, 1, 1],
               [1, 1, 0, 1],
               [1, 0, 1, 0],
               [2, 1, 0, 1]]
    return testSet


def main():
    dataSet, labels = createDataSet()
    labels_tmp = labels[:]  # copy: createTree mutates the label list it is given
    decisionTree = createTree(dataSet, labels_tmp)
    # storeTree(decisionTree, 'classifierStorage.txt')
    # decisionTree = grabTree('classifierStorage.txt')
    print('decisionTree:\n', decisionTree)
    treePlotter.createPlot(decisionTree)
    testSet = createTestSet()
    print('classifyResult:\n', classifyAll(decisionTree, labels, testSet))


if __name__ == '__main__':
    main()

2. treePlotter.py

import matplotlib.pyplot as plt

decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")


def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one node, with an arrow pointing to it from its parent."""
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)


def getNumLeafs(myTree):
    """Return the number of leaves (used for horizontal spacing)."""
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs


def getTreeDepth(myTree):
    """Return the depth of the tree (used for vertical spacing)."""
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            thisDepth = getTreeDepth(secondDict[key]) + 1
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth


def plotMidText(cntrPt, parentPt, txtString):
    """Write the feature value on the edge between parent and child."""
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)


def plotTree(myTree, parentPt, nodeTxt):
    """Recursively plot the subtree rooted at myTree below parentPt."""
    numLeafs = getNumLeafs(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff),
                     cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD


def createPlot(inTree):
    """Set up the figure and draw the whole decision tree."""
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
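As a quick cross-check tying the program to the hand calculations above, the following throwaway snippet (hypothetical, not one of the submitted files; it assumes CART.py is importable from the working directory and matplotlib is installed) prints the root Gini value and the chosen root feature:

# sanity_check.py -- hypothetical helper, not part of the submission
from CART import createDataSet, calcGiniIndex, chooseBestFeatureToSplit

dataSet, labels = createDataSet()
print(calcGiniIndex(dataSet))                     # 24/49 ~ 0.4898 (4 Y vs 3 N)
print(labels[chooseBestFeatureToSplit(dataSet)])  # 'outlook', matching ~0.190 above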
Run results:
decisionTree:
{'outlook': {0: 'N', 1: 'Y', 2: {'windy': {0: 'Y', 1: 'N'}}}}
classifyResult:
['N', 'N', 'Y', 'N', 'Y', 'Y', 'N']
Process finished with exit code 0
The resulting decision tree plot: