『科學計算』通過代碼理解SoftMax多分類

本文轉載自查看原文 2017-07-22 10:35 3610 ML/DL論文及原理

SoftMax實際上是Logistic的推廣，當分類數為2的時候會退化為Logistic分類

其計算公式和損失函數如下，

梯度如下，

1{條件} 表示True為1，False為0，在下圖中亦即對於每個樣本只有正確的分類才取1，對於損失函數實際上只有m個表達式（m個樣本每個有一個正確的分類）相加，

對於梯度實際上是把我們以前的最后一層和分類層合並了：

第一步則和之前的求法類似，1-概率 & 0-概率組成向量，作為分類層的梯度，對batch數據實現的話就是建立一個(m,k)的01矩陣，直接點乘控制開關，最后求np.sum
x的轉置乘分類層梯度
全batch數據求和，實際上這在代碼實現中和上一步放在了一塊

對於單個數據梯度：x.T.dot(y_pred-y)，維度是這樣的(k,1)*(1,c)=(k,c)

對於成批數據梯度：X.T.dot(y_pred-y)，維度是這樣的(k,m)*(1,c)=(k,c)，只不過結果矩陣的對應位置由x(i,1)*er(1,j)變換為x0(i,1)*er(1,j)+x1(i,1)*er(1,j)... ...正好是對全batch求了個和，所以后面需要除下去

X.T.dot(grad_next)結果是batch梯度累加和，所以需要除以樣本數m，這個結論對全部使用本公式的梯度均成立（1，這句話是廢話；2，但是幾乎全部機器or深度學習算法都需要矩陣乘法，亦即梯度必須使用本公式，所以是很重要的廢話）。

L2正則化：lamda*np.sum(W*W)或者lamda*np.sum(W.T.dot(W))均可，實際上就是W各個項的平方和

#計算Error，Cost，Grad
y_dash = self.softmax(X.dot(theta_n))   # 向前傳播結果
        
Y = np.zeros((m,10))                    # one-hot編碼label矩陣
for i in range(m):
    Y[i,y[i]]=1

error = np.sum(Y * np.log(y_dash), axis=1)   # 注意，這里是點乘
cost = -np.sum(error, axis=0)
grad = X.T.dot(y_dash-Y)
        
grad_n = grad.ravel()

代碼實現：

import numpy as np
import matplotlib.pyplot as plt
import math

def scale_n(x):
    return x
    #return (x-x.mean(axis=0))/(x.std(axis=0)+1e-10)


class SoftMaxModel(object):
    def __init__(self,alpha=0.06,threhold=0.0005):
        self.alpha = alpha         # 學習率
        self.threhold = threhold   # 循環終止閾值
        self.num_classes = 10      # 分類數
    
    def setup(self,X):
        # 初始化權重矩陣，注意，由於是多分類，所以權重由向量變化為矩陣
        # 而且這里面初始化的是flat為1維的矩陣
        m, n = X.shape  # 400,15   
        s = math.sqrt(6) / math.sqrt(n+self.num_classes)
        theta = np.random.rand(n*(self.num_classes))*2*s-s  #[15,1]
        return theta

    def softmax(self,x):
        # 先前傳播softmax多分類
        # 注意輸入的x是[batch數目n，類數目m]，輸出是[batch數目n，類數目m]
        e = np.exp(x)
        temp = np.sum(e, axis=1,keepdims=True)
        return e/temp
        
    def get_cost_grad(self,theta,X,y):
        m, n = X.shape
        theta_n = theta.reshape(n, self.num_classes)

        #計算Error，Cost，Grad
        y_dash = self.softmax(X.dot(theta_n))   # 向前傳播結果
        
        Y = np.zeros((m,10))                    # one-hot編碼label矩陣
        for i in range(m):
            Y[i,y[i]]=1

        error = np.sum(Y * np.log(y_dash), axis=1)
        cost = -np.sum(error, axis=0)
        grad = X.T.dot(y_dash-Y)
        
        grad_n = grad.ravel()           
        return cost,grad_n
        
        
 
    def train(self,X,y,max_iter=50, batch_size=200):
        m, n = X.shape  # 400,15 
        theta = self.setup(X)

        #our intial prediction
        prev_cost = None
        loop_num = 0
        n_samples = y.shape[0]
        n_batches = n_samples // batch_size
        # Stochastic gradient descent with mini-batches
        while loop_num < max_iter:
            for b in range(n_batches):
                batch_begin = b*batch_size
                batch_end = batch_begin+batch_size
                X_batch = X[batch_begin:batch_end]
                Y_batch = y[batch_begin:batch_end]
                      

                #intial cost
                cost,grad = self.get_cost_grad(theta,X_batch,Y_batch)
                
                theta = theta- self.alpha * grad/float(batch_size)
                
            loop_num+=1
            if loop_num%10==0:
                print (cost,loop_num)
            if prev_cost:
                if prev_cost - cost <= self.threhold:
                    break

            prev_cost = cost
                       
            
        self.theta = theta
        print (theta,loop_num)

    def train_scipy(self,X,y):
        m,n = X.shape
        import scipy.optimize
        options = {'maxiter': 50, 'disp': True}
        J = lambda x: self.get_cost_grad(x, X, y)
        theta = self.setup(X)

        result = scipy.optimize.minimize(J, theta, method='L-BFGS-B', jac=True, options=options)
        self.theta = result.x

    def predict(self,X):
        m,n = X.shape
        theta_n = self.theta.reshape(n, self.num_classes)
        a = np.argmax(self.softmax(X.dot(theta_n)),axis=1)
        return a


    def grad_check(self,X,y):
        epsilon = 10**-4
        m, n = X.shape   
        
        sum_error=0
        N=300
        
        for i in range(N):
            theta = self.setup(X)
            j = np.random.randint(1,len(theta))
            theta1=theta.copy()
            theta2=theta.copy()
            theta1[j]+=epsilon
            theta2[j]-=epsilon

            cost1,grad1 = self.get_cost_grad(theta1,X,y)
            cost2,grad2 = self.get_cost_grad(theta2,X,y)
            cost3,grad3 = self.get_cost_grad(theta,X,y)

            sum_error += np.abs(grad3[j]-(cost1-cost2)/float(2*epsilon))
        print ("grad check error is %e\n"%(sum_error/float(N)))
        

if __name__=="__main__":

    
    import cPickle, gzip
    # Load the dataset
    f = gzip.open('mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()    
    train_X = scale_n(train_set[0])
    train_y = train_set[1]
    test_X = scale_n(test_set[0])
    test_y = test_set[1]



    l_model = SoftMaxModel()

    l_model.grad_check(test_X[0:200,:],test_y[0:200])

    l_model.train_scipy(train_X,train_y)


    predict_train_y = l_model.predict(train_X)
    b = predict_train_y!=train_y

    error_train = np.sum(b, axis=0)/float(b.size)    

    predict_test_y = l_model.predict(test_X)
    b = predict_test_y!=test_y

    error_test = np.sum(b, axis=0)/float(b.size)

    print ("Train Error rate = %.4f, \nTest Error rate = %.4f\n"%(error_train,error_test))

這里面有scipy的優化器應用，因為不是重點（暫時沒有學習這個庫的日程），所以標注出來，需要用優化器優化函數的時候記得有這么回事再深入學習即可：

def train_scipy(self,X,y):
        m,n = X.shape
        import scipy.optimize
        options = {'maxiter': 50, 'disp': True}
        J = lambda x: self.get_cost_grad(x, X, y)
        theta = self.setup(X)

        result = scipy.optimize.minimize(J, theta, method='L-BFGS-B', jac=True, options=options)
        self.theta = result.x

主要是提供了一些比較復雜的優化算法，而且是一個優化自建目標函數的demo，以后可能有所應用。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 softmax與多分類 tensorflow2知識總結---5、softmax多分類邏輯回歸，多分類推廣算法softmax回歸中《轉》Logistic回歸多分類問題的推廣算法--Softmax回歸機器學習(三)：理解邏輯回歸及二分類、多分類代碼實踐多分類問題中查全率和查准率的理解多分類問題的交叉熵計算多標簽分類和多分類機器學習-一對多（多分類）代碼實現（matlab） softmax和分類模型