1. The AdaBoost Idea
Note: boosting applies when the weak classifiers have fairly strong dependence on one another; for base classifiers that are only weakly dependent, the Bagging method is generally used instead.
A weak classifier is one whose accuracy is only slightly better than random guessing. We can build many weak classifiers and let them make the final decision together (as the saying goes, "three cobblers together beat a Zhuge Liang"). First we initialize a weight for each training example, forming a vector D, and then update D each round by the following rules:
When an example is correctly classified, we decrease its weight;
otherwise, we increase its weight (a sketch of this update follows).
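For concreteness, here is a minimal sketch of that re-weighting step; the function name update_weights is my own, but the body mirrors the update inside adaBoostTrainDS in the listing below:

import numpy as np

def update_weights(D, labels, classEst, alpha):
    """One AdaBoost re-weighting round for the example-weight vector D."""
    # labels and classEst are +1/-1; their product is +1 exactly when the
    # example was classified correctly, so correct examples get scaled by
    # exp(-alpha) (weight down) and mistakes by exp(+alpha) (weight up),
    # after which the weights are renormalized to sum to 1.
    expon = np.multiply(-alpha * np.mat(labels).T, classEst)
    D = np.multiply(D, np.exp(expon))
    return D / D.sum()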
For each weak classifier, we set its weight alpha according to its classification error rate on the weighted examples: the higher the error rate, the smaller the corresponding alpha, as shown below.
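Concretely, for a weak classifier with weighted error rate epsilon, the code in the next section computes

alpha = 1/2 * ln((1 - epsilon) / epsilon)

so an error rate near 0 yields a large alpha, while an error rate near 1/2 (barely better than guessing) yields an alpha near 0; the max(error, 1e-16) in the code only guards against dividing by zero when epsilon is 0.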
Finally, we end up with a collection of weak classifiers; the ensemble outputs the final classification by taking a weighted vote of their individual results, as summarized below.
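In formula form, the combined classifier is

H(x) = sign( sum over t of alpha_t * h_t(x) )

which in the code below is exactly the running sum aggClassEst passed through np.sign in adaClassify.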
2. Experiment
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 29 16:57:37 2017

@author: MyHome
"""
import numpy as np

def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Return the classification vector of a single decision stump."""
    retArray = np.ones((np.shape(dataMatrix)[0], 1))
    if threshIneq == "lt":
        retArray[dataMatrix[:, dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:, dimen] > threshVal] = -1.0
    return retArray

def buildStump(dataArr, classLabels, D):
    """Build the best decision stump under weights D; return it as a dict."""
    dataMatrix = np.mat(dataArr)
    labelMat = np.mat(classLabels).T
    m, n = dataMatrix.shape
    numSteps = 10.0
    bestStump = {}
    bestClassEst = np.mat(np.zeros((m, 1)))
    minError = np.inf

    for i in range(n):                          # try every feature
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):  # sweep thresholds over the range
            for inequal in ["lt", "gt"]:        # try both inequality directions
                threshVal = rangeMin + float(j) * stepSize
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = np.mat(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = float(D.T * errArr)  # error rate weighted by D

                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predictedVals.copy()
                    bestStump["dim"] = i
                    bestStump["thresh"] = threshVal
                    bestStump["ineq"] = inequal

    return bestStump, minError, bestClassEst

def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an array of weighted decision stumps (AdaBoost with stumps)."""
    weakClassArr = []
    m = np.shape(dataArr)[0]
    D = np.mat(np.ones((m, 1)) / m)          # start with uniform example weights
    aggClassEst = np.mat(np.zeros((m, 1)))
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # alpha = 0.5 * ln((1 - error) / error); max() guards against error == 0
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        bestStump["alpha"] = alpha
        weakClassArr.append(bestStump)
        # decrease weights of correct examples, increase weights of mistakes
        expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T,
                                np.ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error:", errorRate, "\n")
        if errorRate == 0.0:
            break
    return weakClassArr

def adaClassify(datToClass, classifierArr):
    """Classify new data with the weighted vote of all trained stumps."""
    dataMatrix = np.mat(datToClass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.mat(np.zeros((m, 1)))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix, classifierArr[i]["dim"],
                                 classifierArr[i]["thresh"],
                                 classifierArr[i]["ineq"])
        aggClassEst += classifierArr[i]["alpha"] * classEst
    return np.sign(aggClassEst)

def loadDataSet(fileName):
    """Load a tab-separated data file; the last column is the label."""
    numFeat = len(open(fileName).readline().split("\t"))
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split("\t")
        for i in range(numFeat - 1):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat

if __name__ == "__main__":
    datArr, labelArr = loadDataSet("horseColicTraining2.txt")
    classifierArray = adaBoostTrainDS(datArr, labelArr, 10)
    testData, testY = loadDataSet("horseColicTest2.txt")
    predictionArr = adaClassify(testData, classifierArray)
    errorArr = np.mat(np.ones((len(testData), 1)))
    FinalerrorRate = errorArr[predictionArr != np.mat(testY).T].sum() / float(errorArr.shape[0])
    print("FinalerrorRate:", FinalerrorRate)
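As a quick sanity check, separate from the horse-colic experiment, the functions above can also be exercised on a tiny hand-made dataset; the five points below are illustrative values of my own, not data from the files:

datMat = np.mat([[1.0, 2.1], [2.0, 1.1], [1.3, 1.0], [1.0, 1.0], [2.0, 1.0]])
classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
D0 = np.mat(np.ones((5, 1)) / 5.0)       # uniform initial weights
stump, err, est = buildStump(datMat, classLabels, D0)
print(stump, err)                        # the single best stump and its weighted error
classifiers = adaBoostTrainDS(datMat, classLabels, 9)
print(adaClassify([[5.0, 5.0], [0.0, 0.0]], classifiers))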
3. Results
total error: 0.284280936455
total error: 0.284280936455
total error: 0.247491638796
total error: 0.247491638796
total error: 0.254180602007
total error: 0.240802675585
total error: 0.240802675585
total error: 0.220735785953
total error: 0.247491638796
total error: 0.230769230769
FinalerrorRate: 0.238805970149
4. Summary
We build a number of weak classifiers and then combine their outputs by a weighted vote, where each classifier's weight (its alpha) reflects how well it performs; the sign of the weighted sum gives the final result. As long as the dataset is reasonably complete, this method is quite powerful, and as a follow-up one could try ensembling other kinds of base classifiers, as sketched below.
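For comparison, here is a minimal sketch using scikit-learn's off-the-shelf AdaBoostClassifier with a deeper tree as the base learner; it assumes scikit-learn >= 1.2 is installed (older versions spell the keyword base_estimator instead of estimator):

import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# reuse loadDataSet from the listing above to read the same files
trainX, trainY = loadDataSet("horseColicTraining2.txt")
testX, testY = loadDataSet("horseColicTest2.txt")

# depth-3 trees instead of depth-1 stumps as the base learner
clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=3),
                         n_estimators=10)
clf.fit(np.array(trainX), np.array(trainY))
print("test error:", 1.0 - clf.score(np.array(testX), np.array(testY)))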