Softmax is in fact a generalization of logistic regression: when the number of classes is 2, it reduces to logistic classification.
Its computation formula and loss function are as follows.
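In standard notation (assuming k classes, parameters θ_1, …, θ_k and m training samples), the softmax probability and the cross-entropy loss can be written as:

P(y^{(i)} = j \mid x^{(i)}; \theta) = \frac{e^{\theta_j^T x^{(i)}}}{\sum_{l=1}^{k} e^{\theta_l^T x^{(i)}}}

J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} \sum_{j=1}^{k} 1\{y^{(i)} = j\} \, \log P(y^{(i)} = j \mid x^{(i)}; \theta)

For k = 2,

P(y^{(i)} = 1 \mid x^{(i)}) = \frac{e^{\theta_1^T x^{(i)}}}{e^{\theta_1^T x^{(i)}} + e^{\theta_2^T x^{(i)}}} = \frac{1}{1 + e^{-(\theta_1 - \theta_2)^T x^{(i)}}}

which is exactly the logistic sigmoid with parameter θ_1 - θ_2.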

The gradient is as follows.
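In the same notation, the gradient of J(θ) with respect to θ_j is:

\nabla_{\theta_j} J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} x^{(i)} \left( 1\{y^{(i)} = j\} - P(y^{(i)} = j \mid x^{(i)}; \theta) \right)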
1{condition} is 1 when the condition is true and 0 when it is false. In the formulas above this means that for each sample only the correct class contributes, so the loss function is really the sum of just m terms (m samples, each with exactly one correct class).
For the gradient, we have effectively merged our previous last layer with the classification layer:
- The first step is similar to the earlier derivation: the (1 - probability) and (0 - probability) terms form a vector that serves as the gradient of the classification layer. For batch data this amounts to building an (m, c) 0/1 matrix (the one-hot labels) and using element-wise multiplication as a switch, followed by np.sum.
- Multiply the transpose of x by the classification-layer gradient.
- Sum over the whole batch; in the code this is actually folded into the previous step.
For a single sample the gradient is x.T.dot(y_pred-y), with dimensions (k,1)*(1,c)=(k,c).
For a batch the gradient is X.T.dot(y_pred-y), with dimensions (k,m)*(m,c)=(k,c). The entry at position (i,j) changes from x(i)*er(j) for one sample to x_0(i)*er_0(j) + x_1(i)*er_1(j) + ..., which is exactly a sum over the whole batch, so it has to be divided out afterwards.
X.T.dot(grad_next) therefore gives the gradient summed over the batch, and it must be divided by the number of samples m (see the sketch below). This holds for every gradient computed with this formula (1. which is a tautology; 2. but almost every machine-learning or deep-learning algorithm involves this kind of matrix multiplication, i.e. the gradient has to use this formula, so it is a very important tautology).
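A minimal NumPy sketch of the batch computation described above (the shapes and variable names here are illustrative assumptions; in the real code below X comes from MNIST and theta starts out random):

import numpy as np

m, k, c = 200, 784, 10                        # batch size, feature count, class count (illustrative)
X = np.random.rand(m, k)                      # input batch, shape (m, k)
theta = np.zeros((k, c))                      # weight matrix, shape (k, c)
y = np.random.randint(0, c, size=m)           # integer class labels

Y = np.zeros((m, c))                          # one-hot 0/1 "switch" matrix, shape (m, c)
Y[np.arange(m), y] = 1

scores = X.dot(theta)                                                # (m, c)
scores = scores - scores.max(axis=1, keepdims=True)                  # subtract the row max for numerical safety (extra step, not in the code below)
y_pred = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)  # softmax probabilities, (m, c)

grad = X.T.dot(y_pred - Y) / m                # (k, m).dot((m, c)) = (k, c); the division averages over the batch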
L2 regularization: lamda*np.sum(W*W), equivalently lamda*np.trace(W.T.dot(W)); in other words, the sum of the squares of all entries of W.
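A sketch of how the penalty would enter the cost and gradient computed in the snippet below (lamda is an assumed hyperparameter; the original code does not actually apply regularization):

lamda = 1e-3                                      # regularization strength (assumed value)
cost = cost + lamda * np.sum(theta_n * theta_n)   # add the sum of squared weights to the loss
grad = grad + 2 * lamda * theta_n                 # matching gradient term: d/dW of lamda*||W||^2 is 2*lamda*W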
# compute error, cost, grad
y_dash = self.softmax(X.dot(theta_n))       # forward-pass result
Y = np.zeros((m, 10))                       # one-hot encoded label matrix
for i in range(m):
    Y[i, y[i]] = 1
error = np.sum(Y * np.log(y_dash), axis=1)  # note: element-wise multiplication here
cost = -np.sum(error, axis=0)
grad = X.T.dot(y_dash - Y)
grad_n = grad.ravel()
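The explicit loop that builds the one-hot matrix can also be written in vectorized form (an equivalent alternative, not what the code below uses):

Y = np.eye(10)[y]                             # same (m, 10) one-hot matrix as the loop above
# or: Y = np.zeros((m, 10)); Y[np.arange(m), y] = 1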

Code implementation:
import numpy as np
import matplotlib.pyplot as plt
import math

def scale_n(x):
    return x
    # return (x - x.mean(axis=0)) / (x.std(axis=0) + 1e-10)
class SoftMaxModel(object):
    def __init__(self, alpha=0.06, threhold=0.0005):
        self.alpha = alpha          # learning rate
        self.threhold = threhold    # stopping threshold for the training loop
        self.num_classes = 10       # number of classes

    def setup(self, X):
        # Initialize the weight matrix. Since this is multi-class classification,
        # the weights become a matrix instead of a vector; it is initialized here
        # already flattened to one dimension.
        m, n = X.shape
        s = math.sqrt(6) / math.sqrt(n + self.num_classes)
        theta = np.random.rand(n * self.num_classes) * 2 * s - s  # shape (n * num_classes,)
        return theta

    def softmax(self, x):
        # Forward pass of the softmax classifier.
        # Input x has shape [batch size, number of classes]; the output has the same shape.
        e = np.exp(x)
        temp = np.sum(e, axis=1, keepdims=True)
        return e / temp
    def get_cost_grad(self, theta, X, y):
        m, n = X.shape
        theta_n = theta.reshape(n, self.num_classes)
        # compute error, cost, grad
        y_dash = self.softmax(X.dot(theta_n))       # forward-pass result
        Y = np.zeros((m, 10))                       # one-hot encoded label matrix
        for i in range(m):
            Y[i, y[i]] = 1
        error = np.sum(Y * np.log(y_dash), axis=1)  # element-wise multiplication
        cost = -np.sum(error, axis=0)
        grad = X.T.dot(y_dash - Y)
        grad_n = grad.ravel()
        return cost, grad_n
    def train(self, X, y, max_iter=50, batch_size=200):
        m, n = X.shape
        theta = self.setup(X)
        # initial parameters and bookkeeping
        prev_cost = None
        loop_num = 0
        n_samples = y.shape[0]
        n_batches = n_samples // batch_size
        # Stochastic gradient descent with mini-batches
        while loop_num < max_iter:
            for b in range(n_batches):
                batch_begin = b * batch_size
                batch_end = batch_begin + batch_size
                X_batch = X[batch_begin:batch_end]
                Y_batch = y[batch_begin:batch_end]
                # cost and gradient on the current mini-batch
                cost, grad = self.get_cost_grad(theta, X_batch, Y_batch)
                theta = theta - self.alpha * grad / float(batch_size)
            loop_num += 1
            if loop_num % 10 == 0:
                print(cost, loop_num)
            if prev_cost:
                if prev_cost - cost <= self.threhold:
                    break
            prev_cost = cost
        self.theta = theta
        print(theta, loop_num)
    def train_scipy(self, X, y):
        m, n = X.shape
        import scipy.optimize
        options = {'maxiter': 50, 'disp': True}
        J = lambda x: self.get_cost_grad(x, X, y)
        theta = self.setup(X)
        result = scipy.optimize.minimize(J, theta, method='L-BFGS-B', jac=True, options=options)
        self.theta = result.x
    def predict(self, X):
        m, n = X.shape
        theta_n = self.theta.reshape(n, self.num_classes)
        a = np.argmax(self.softmax(X.dot(theta_n)), axis=1)
        return a
    def grad_check(self, X, y):
        epsilon = 10 ** -4
        m, n = X.shape
        sum_error = 0
        N = 300
        for i in range(N):
            theta = self.setup(X)
            j = np.random.randint(1, len(theta))
            theta1 = theta.copy()
            theta2 = theta.copy()
            theta1[j] += epsilon
            theta2[j] -= epsilon
            cost1, grad1 = self.get_cost_grad(theta1, X, y)
            cost2, grad2 = self.get_cost_grad(theta2, X, y)
            cost3, grad3 = self.get_cost_grad(theta, X, y)
            sum_error += np.abs(grad3[j] - (cost1 - cost2) / float(2 * epsilon))
        print("grad check error is %e\n" % (sum_error / float(N)))
if __name__ == "__main__":
    import cPickle, gzip

    # Load the dataset
    f = gzip.open('mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    train_X = scale_n(train_set[0])
    train_y = train_set[1]
    test_X = scale_n(test_set[0])
    test_y = test_set[1]

    l_model = SoftMaxModel()

    l_model.grad_check(test_X[0:200, :], test_y[0:200])

    l_model.train_scipy(train_X, train_y)

    predict_train_y = l_model.predict(train_X)
    b = predict_train_y != train_y
    error_train = np.sum(b, axis=0) / float(b.size)

    predict_test_y = l_model.predict(test_X)
    b = predict_test_y != test_y
    error_test = np.sum(b, axis=0) / float(b.size)

    print("Train Error rate = %.4f, \nTest Error rate = %.4f\n" % (error_train, error_test))
The scipy optimizer is used here. Since it is not the focus (learning this library is not on the schedule for now), it is simply called out: when you need an optimizer to minimize a function, remember that this exists and dig into it then:
def train_scipy(self, X, y):
    m, n = X.shape
    import scipy.optimize
    options = {'maxiter': 50, 'disp': True}
    J = lambda x: self.get_cost_grad(x, X, y)
    theta = self.setup(X)
    result = scipy.optimize.minimize(J, theta, method='L-BFGS-B', jac=True, options=options)
    self.theta = result.x
It mainly provides some fairly sophisticated optimization algorithms, and this serves as a demo of optimizing a self-defined objective function, which may come in handy later.
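A minimal self-contained version of the same pattern, with a toy quadratic objective in place of the softmax cost (the objective and its gradient here are assumptions purely for illustration):

import numpy as np
import scipy.optimize

def cost_grad(x):
    # toy objective f(x) = ||x - 3||^2 together with its analytic gradient
    cost = np.sum((x - 3.0) ** 2)
    grad = 2.0 * (x - 3.0)
    return cost, grad

x0 = np.zeros(5)
result = scipy.optimize.minimize(cost_grad, x0, method='L-BFGS-B',
                                 jac=True, options={'maxiter': 50, 'disp': True})
print(result.x)   # converges to values close to 3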
