This assignment implements a decision tree model for weather prediction. The plotting part is not implemented, but the core framework is.
Operating system: Windows 10
Development environment: Anaconda
Python version: 3.6
Here is the code:
from math import log
import operator

def calcShannonEnt(dataSet):                     # compute the Shannon entropy of the dataset
    numEntries = len(dataSet)                    # number of samples
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]               # the last column of each row is the class label
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1           # count the classes and how many samples each has
    shannonEnt = 0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries    # probability of a single class
        shannonEnt -= prob*log(prob, 2)              # accumulate each class's entropy contribution
    return shannonEnt

def createDataSet1():    # create the sample data (features: weather, temperature, humidity, wind; label: suitable/unsuitable)
    dataSet = [['晴天','高溫','中濕','無風','不宜'],
               ['晴天','高溫','中濕','有風','不宜'],
               ['多雲','高溫','低濕','無風','適宜'],
               ['雨天','低溫','高濕','無風','適宜'],
               ['雨天','低溫','低濕','無風','適宜'],
               ['雨天','低溫','低濕','有風','不宜'],
               ['多雲','低溫','低濕','有風','適宜'],
               ['晴天','中溫','高濕','無風','不宜'],
               ['晴天','低溫','低濕','無風','適宜'],
               ['雨天','中溫','低濕','無風','適宜'],
               ['晴天','中溫','低濕','有風','適宜'],
               ['多雲','中溫','中濕','有風','適宜'],
               ['多雲','高溫','低濕','無風','適宜'],
               ['雨天','中溫','低濕','有風','不宜']]
    labels = ['天氣','溫度','濕度','風況']    # the four feature names (weather, temperature, humidity, wind)
    return dataSet, labels

def splitDataSet(dataSet, axis, value):    # subset of samples whose feature `axis` equals `value`, with that feature removed
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

def chooseBestFeatureToSplit(dataSet):    # choose the best feature to split on
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)    # entropy before the split
    bestInfoGain = 0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob*calcShannonEnt(subDataSet)    # weighted entropy after splitting on this feature
        infoGain = baseEntropy - newEntropy    # information gain: entropy before minus entropy after the split
        if infoGain > bestInfoGain:    # the feature that reduces entropy the most is the best split feature
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

def majorityCnt(classList):    # return the majority class, sorting classes by count
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]    # class labels
    if classList.count(classList[0]) == len(classList):    # all samples share one class: stop
        return classList[0]
    if len(dataSet[0]) == 1:    # no features left: return the majority class
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)    # pick the best feature
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}    # the tree is stored as a nested dictionary
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

if __name__ == '__main__':
    dataSet, labels = createDataSet1()    # build the sample data
    print(createTree(dataSet, labels))    # print the decision tree model
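For clarity, the two quantities computed by calcShannonEnt and chooseBestFeatureToSplit are the standard ID3 entropy and information gain; in textbook notation (this is the general formulation, not anything specific to this assignment):

H(D) = -\sum_{k=1}^{K} p_k \log_2 p_k

\mathrm{Gain}(D, A) = H(D) - \sum_{v \in \mathrm{Values}(A)} \frac{|D_v|}{|D|} H(D_v)

Here p_k is the proportion of samples in class k, and D_v is the subset of D whose feature A takes value v; chooseBestFeatureToSplit simply picks the feature A with the largest gain.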
The result of running it is:
Drawing the model by hand gives:
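With the tree in hand, a natural follow-up (not part of the original assignment) is to classify a new day by walking the nested dictionary from the root to a leaf. Below is a minimal sketch; the function name classify and the test sample are my own additions, and it assumes the tree was produced by createTree above:

def classify(inputTree, featLabels, testVec):     # walk the nested dict and return the predicted class
    firstStr = list(inputTree.keys())[0]          # feature tested at the current node
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)        # column of testVec holding that feature
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict): # internal node: keep descending
                return classify(secondDict[key], featLabels, testVec)
            return secondDict[key]                # leaf node: class label
    return None                                   # feature value never seen in training

# Usage (hypothetical sample):
# dataSet, labels = createDataSet1()
# featLabels = labels[:]                 # keep a copy, since createTree mutates labels
# myTree = createTree(dataSet, labels)
# print(classify(myTree, featLabels, ['晴天', '中溫', '低濕', '無風']))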
In addition, I came across an implementation that uses a library's built-in functions. I have not got it working yet, so I hope everyone can pool their ideas (a rough sketch of one possible approach follows the link):
https://zhuanlan.zhihu.com/p/25428390
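One common library-based route (my own sketch of the general idea, not the code from the article above) is scikit-learn: one-hot encode the categorical features with DictVectorizer and fit a DecisionTreeClassifier with the entropy criterion. This assumes scikit-learn is installed and reuses createDataSet1 from the code above:

from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier

dataSet, labels = createDataSet1()                           # reuse the sample data defined above
X_dicts = [dict(zip(labels, row[:-1])) for row in dataSet]   # features as dicts, e.g. {'天氣': '晴天', ...}
y = [row[-1] for row in dataSet]                             # class labels

vec = DictVectorizer(sparse=False)                           # one-hot encode the categorical features
X = vec.fit_transform(X_dicts)

clf = DecisionTreeClassifier(criterion='entropy')            # use entropy (information gain) as the split criterion
clf.fit(X, y)

sample = vec.transform([{'天氣': '晴天', '溫度': '中溫', '濕度': '低濕', '風況': '無風'}])
print(clf.predict(sample))                                   # predicted class for a new day

Note that scikit-learn builds binary trees over the one-hot columns, so the resulting tree will not look exactly like the multiway ID3 tree above, even though the split criterion is the same.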
Reference links for this article:
http://blog.csdn.net/csqazwsxedc/article/details/65697652
http://blog.csdn.net/liz_zhong/article/details/51448218