July has already explained the steps of the AdaBoost algorithm very clearly, so I won't repeat them here; what follows is a Python implementation.
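For reference, the two update rules the code below relies on are the standard AdaBoost formulas. With $e_m$ the weighted error rate of the $m$-th weak classifier $G_m$, its vote weight is

$$\alpha_m = \frac{1}{2}\ln\frac{1 - e_m}{e_m},$$

and each sample weight is then updated and renormalized as

$$w_i \leftarrow \frac{w_i \, \exp(-\alpha_m \, y_i \, G_m(x_i))}{Z_m},$$

where $Z_m$ is the constant that makes the weights sum to 1. The final prediction of the ensemble is $\mathrm{sign}\bigl(\sum_m \alpha_m G_m(x)\bigr)$.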
# coding=utf-8
__author__ = "orisun"

import sys

import numpy as np


class OneDimClassifier(object):
    """Decision stump on a single feature dimension."""

    def __init__(self, x, y):
        self.x = x
        self.y = y
        self.split = 0
        # direct = True: samples below the split are positive,
        # samples above it are negative
        self.direct = True

    def train(self, w):
        """Given the samples' labels and weights, choose the split
        that minimizes the total weighted error."""
        # Bind x, y and w together, one row per sample
        sx = np.concatenate((self.x,
                             self.y.reshape((self.y.shape[0], 1)),
                             w.reshape((w.shape[0], 1))), axis=1)
        # Sort the samples by feature value
        sx = sx[np.argsort(sx[:, 0])]
        prevY = sx[0][1]
        minErr = sys.float_info.max
        for i in range(1, len(sx)):
            # Try a split at every point where the label changes
            if sx[i][1] != prevY:
                err = 0.0
                d = True
                # Left of the split is predicted +1, right is predicted -1;
                # accumulate the weights of the misclassified samples
                for j in range(i):
                    if sx[j][1] != 1:
                        err += sx[j][2]
                for j in range(i, len(sx)):
                    if sx[j][1] != -1:
                        err += sx[j][2]
                # If the error exceeds 0.5, flipping the direction does better
                if err > 0.5:
                    err = 1.0 - err
                    d = False
                if err < minErr:
                    self.split = (sx[i][0] + sx[i - 1][0]) / 2
                    minErr = err
                    self.direct = d
            prevY = sx[i][1]
        print('split=', self.split)
        return minErr

    def predict(self, x):
        pre_y = np.zeros(x.shape[0])
        factor = 1 if self.direct else -1
        for i in range(x.shape[0]):
            if x[i][0] <= self.split:
                pre_y[i] = 1 * factor
            else:
                pre_y[i] = -1 * factor
        return pre_y


class Adboost(object):

    def __init__(self, x, y, WeakClassifier, M):
        # Feature vectors of the input samples
        self.x = np.array(x)
        # Class labels of the input samples, encoded as 1 or -1
        self.y = np.array(y)
        # Initialize every sample's weight uniformly
        self.w = np.full(self.x.shape[0], 1.0 / self.x.shape[0])
        # Constructor for the weak classifiers
        self.WeakClassifier = WeakClassifier
        # Upper bound on the number of weak classifiers
        self.M = M
        # Number of weak classifiers actually used
        self.Q = 0
        # The weak classifiers themselves
        self.G = []
        # Weight of each weak classifier
        self.alpha = []

    def predict(self, x):
        """Predict class labels with the weighted ensemble."""
        if self.Q <= 0:
            raise Exception("must train before predict")
        pre_y = np.zeros(x.shape[0])
        for i in range(self.Q):
            pre_y += self.G[i].predict(x) * self.alpha[i]
        return np.sign(pre_y)

    def train(self):
        """Train the weak classifiers and their weights."""
        for i in range(self.M):
            # Construct the i-th weak classifier with WeakClassifier
            self.G.append(self.WeakClassifier(self.x, self.y))
            # Train it under the current sample weights; e is its
            # weighted error rate
            e = self.G[i].train(self.w)
            # Keep e away from 0 and 1 so the log below stays finite
            e = min(max(e, 1e-10), 1.0 - 1e-10)
            # e must not equal 0.5, otherwise alpha would be 0
            while e == 0.5:
                e += np.random.uniform(-0.1, 0.1)
            # Compute this classifier's weight
            a = 1.0 / 2.0 * np.log((1 - e) / e)
            self.alpha.append(a)
            # Predict every training sample with the current classifier
            pre_y = self.G[i].predict(self.x)
            # Compute next round's sample weights
            self.w *= np.exp(-a * self.y * pre_y)
            # Normalize the weights into a probability distribution
            self.w /= self.w.sum()
            self.Q = i + 1
            errnum = (self.y != self.predict(self.x)).sum()
            if errnum == 0:
                print(self.Q, "weak classifiers are enough to drive the "
                              "training error to zero")
                break
        # Return the error rate on the training set
        return 1.0 * (self.y != self.predict(self.x)).sum() / self.x.shape[0]


if __name__ == '__main__':
    x = [[0], [2], [4], [6], [8], [1], [3], [5], [7], [9]]
    y = [1, 1, -1, 1, 1, 1, -1, -1, 1, -1]
    boost = Adboost(x, y, OneDimClassifier, 5)
    errratio = boost.train()
    print('weak classifier weights:', boost.alpha)
    print('error ratio:', errratio)
    print('tag of 4.3 is', boost.predict(np.array([[4.3]])))
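Two design choices in this sketch are worth calling out. First, the weak learner is a one-dimensional decision stump, and it only tries thresholds at points where the sorted labels change: moving a threshold across a run of identically labeled points changes the weighted error monotonically, so the minimum is always attained at one of these change points. Second, the special handling of e == 0.5 is needed because a stump no better than random guessing would get alpha = 0 and leave the sample weights untouched, stalling training; the random perturbation is a crude but effective way to break out of that state.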