Requirement: implement a neural network with an arbitrary number of layers.
Each layer consists of:
1. a forward-pass function and a backward-pass function; 2. the intermediate values (the cache) that each layer's computation needs.
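Every layer in this assignment follows the same modular interface: the forward function returns (out, cache), where cache holds whatever the backward pass will need, and the backward function takes the upstream gradient dout together with that cache and returns the gradients. A toy sketch of the pattern (this scaling "layer" is made up purely to illustrate the interface; it is not part of the assignment code):

import numpy as np

def scale_forward(x, a):
  # toy layer: multiply the input by a scalar a
  out = a * x
  cache = (x, a)            # stash what the backward pass needs
  return out, cache

def scale_backward(dout, cache):
  # chain rule: d(a*x)/dx = a, d(a*x)/da = x
  x, a = cache
  dx = a * dout             # gradient w.r.t. the input
  da = np.sum(x * dout)     # gradient w.r.t. the parameter
  return dx, da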
cell 1 The usual initial setup
# As usual, a bit of setup

import time
import numpy as np
import matplotlib.pyplot as plt
from cs231n.classifiers.fc_net import *
from cs231n.data_utils import get_CIFAR10_data
from cs231n.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array
from cs231n.solver import Solver

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)  # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
  """ returns relative error """
  return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))
cell 2 Load the CIFAR-10 data and print the shape of each array
# Load the (preprocessed) CIFAR10 data.

data = get_CIFAR10_data()
for k, v in data.iteritems():
  print '%s: ' % k, v.shape
cell 3 Test the affine forward-pass function on small generated data
# Test the affine_forward function

num_inputs = 2
input_shape = (4, 5, 6)
output_dim = 3

input_size = num_inputs * np.prod(input_shape)
# input_size = 240
weight_size = output_dim * np.prod(input_shape)
# weight_size = 360
x = np.linspace(-0.1, 0.5, num=input_size).reshape(num_inputs, *input_shape)
# x has shape (2, 4, 5, 6), values from -0.1 to 0.5
w = np.linspace(-0.2, 0.3, num=weight_size).reshape(np.prod(input_shape), output_dim)
# w has shape (120, 3), values from -0.2 to 0.3
b = np.linspace(-0.3, 0.1, num=output_dim)
# b has shape (3,), values from -0.3 to 0.1
# (2, 120) dot (120, 3) -> (2, 3)
out, _ = affine_forward(x, w, b)
correct_out = np.array([[ 1.49834967,  1.70660132,  1.91485297],
                        [ 3.25553199,  3.5141327,   3.77273342]])

# Compare your output with ours. The error should be around 1e-9.
print 'Testing affine_forward function:'
print 'difference: ', rel_error(out, correct_out)
Result:
Contents of the affine_forward(x, w, b) function:
def affine_forward(x, w, b):
  """
  Computes the forward pass for an affine (fully-connected) layer.

  The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N
  examples, where each example x[i] has shape (d_1, ..., d_k). We will
  reshape each input into a vector of dimension D = d_1 * ... * d_k, and
  then transform it to an output vector of dimension M.

  Inputs:
  - x: A numpy array containing input data, of shape (N, d_1, ..., d_k)
  - w: A numpy array of weights, of shape (D, M)
  - b: A numpy array of biases, of shape (M,)

  Returns a tuple of:
  - out: output, of shape (N, M)
  - cache: (x, w, b)
  """
  out = None
  #############################################################################
  # TODO: Implement the affine forward pass. Store the result in out. You     #
  # will need to reshape the input into rows.                                 #
  #############################################################################
  N = x.shape[0]
  D = x.size / N
  x = x.reshape(N, D)
  # (N, D) dot (D, M) -> (N, M), e.g. (2, 120) dot (120, 3) -> (2, 3)
  out = np.dot(x, w) + b
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################
  # note: the cache stores the reshaped (N, D) version of x, so the caller
  # has to reshape dx back to the original input shape after the backward pass
  cache = (x, w, b)
  return out, cache
cell 4 Backward pass: check that the computed gradients are correct
# Test the affine_backward function

x = np.random.randn(10, 2, 3)
w = np.random.randn(6, 5)
b = np.random.randn(5)
dout = np.random.randn(10, 5)
# x (10, 2, 3)   w (6, 5)   b (5,)   dout (10, 5)
dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout)

_, cache = affine_forward(x, w, b)
# shapes stored in cache: x (10, 6), w (6, 5), b (5,)
dx, dw, db = affine_backward(dout, cache)
print dx.shape
# affine_forward cached the flattened (N, D) version of x, so dx comes back
# as (10, 6); reshape it to the original input shape before comparing.
dx = dx.reshape(10, 2, 3)

# The error should be around 1e-10
print 'Testing affine_backward function:'
print 'dx error: ', rel_error(dx_num, dx)
print 'dw error: ', rel_error(dw_num, dw)
print 'db error: ', rel_error(db_num, db)
Result:
Contents of affine_backward(dout, cache):
def affine_backward(dout, cache):
  """
  Computes the backward pass for an affine layer.

  Inputs:
  - dout: Upstream derivative, of shape (N, M)
  - cache: Tuple of:
    - x: Input data, of shape (N, d_1, ... d_k)
    - w: Weights, of shape (D, M)

  Returns a tuple of:
  - dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
  - dw: Gradient with respect to w, of shape (D, M)
  - db: Gradient with respect to b, of shape (M,)
  """
  x, w, b = cache
  dx, dw, db = None, None, None
  # cached shapes in the test above: x (10, 6), w (6, 5), b (5,)
  #############################################################################
  # TODO: Implement the affine backward pass.                                 #
  #############################################################################
  # dout is the upstream gradient of the loss, shape (N, M), e.g. (10, 5)
  # dx: (10, 5) dot (5, 6) -> (10, 6)
  dx = np.dot(dout, w.T)
  # dw: (6, 10) dot (10, 5) -> (6, 5)
  dw = np.dot(x.T, dout)
  # db: sum over the batch dimension -> (5,)
  db = np.sum(dout, axis=0)
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################
  return dx, dw, db
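For reference, writing the affine layer as y = xW + b with x of shape (N, D), the chain rule gives exactly the three lines above:

$$\frac{\partial L}{\partial x} = \frac{\partial L}{\partial y}\,W^{\top},\qquad \frac{\partial L}{\partial W} = x^{\top}\,\frac{\partial L}{\partial y},\qquad \frac{\partial L}{\partial b_j} = \sum_{i=1}^{N}\left(\frac{\partial L}{\partial y}\right)_{ij}$$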
cell 5 ReLU forward pass
# Test the relu_forward function

x = np.linspace(-0.5, 0.5, num=12).reshape(3, 4)

out, _ = relu_forward(x)
correct_out = np.array([[ 0.,          0.,          0.,          0.,        ],
                        [ 0.,          0.,          0.04545455,  0.13636364,],
                        [ 0.22727273,  0.31818182,  0.40909091,  0.5,       ]])

# Compare your output with ours. The error should be around 1e-8
print 'Testing relu_forward function:'
print 'difference: ', rel_error(out, correct_out)
Result:
Contents of relu_forward(x):
def relu_forward(x):
  """
  Computes the forward pass for a layer of rectified linear units (ReLUs).

  Input:
  - x: Inputs, of any shape

  Returns a tuple of:
  - out: Output, of the same shape as x
  - cache: x
  """
  out = None
  #############################################################################
  # TODO: Implement the ReLU forward pass.                                    #
  #############################################################################
  out = x * (x > 0)
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################
  cache = x
  return out, cache
cell 6 ReLU backward pass
x = np.random.randn(10, 10)
dout = np.random.randn(*x.shape)

dx_num = eval_numerical_gradient_array(lambda x: relu_forward(x)[0], x, dout)

_, cache = relu_forward(x)
dx = relu_backward(dout, cache)

# The error should be around 1e-12
print 'Testing relu_backward function:'
print 'dx error: ', rel_error(dx_num, dx)
Result:
Contents of relu_backward(dout, cache):
def relu_backward(dout, cache):
  """
  Computes the backward pass for a layer of rectified linear units (ReLUs).

  Input:
  - dout: Upstream derivatives, of any shape
  - cache: Input x, of same shape as dout

  Returns:
  - dx: Gradient with respect to x
  """
  dx, x = None, cache
  #############################################################################
  # TODO: Implement the ReLU backward pass.                                   #
  #############################################################################
  dx = dout * (x >= 0)
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################
  return dx
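As an equation, the ReLU backward pass simply masks the upstream gradient by where the input was positive:

$$\frac{\partial L}{\partial x_i} = \frac{\partial L}{\partial y_i}\,\mathbf{1}[x_i > 0]$$

One detail: the forward pass above uses the mask x > 0 while the backward pass uses x >= 0. The two only differ at exactly x = 0 (where the subgradient is conventionally taken to be 0), so the numerical gradient check on random inputs passes either way.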
cell 7 The combined affine + ReLU layer:
from cs231n.layer_utils import affine_relu_forward, affine_relu_backward

x = np.random.randn(2, 3, 4)
w = np.random.randn(12, 10)
b = np.random.randn(10)
dout = np.random.randn(2, 10)

out, cache = affine_relu_forward(x, w, b)
dx, dw, db = affine_relu_backward(dout, cache)

dx_num = eval_numerical_gradient_array(lambda x: affine_relu_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: affine_relu_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: affine_relu_forward(x, w, b)[0], b, dout)

dx = dx.reshape(2, 3, 4)
print 'Testing affine_relu_forward:'
print 'dx error: ', rel_error(dx_num, dx)
print 'dw error: ', rel_error(dw_num, dw)
print 'db error: ', rel_error(db_num, db)
Result:
affine_relu_forward(x, w, b):
def affine_relu_forward(x, w, b):
  """
  Convenience layer that performs an affine transform followed by a ReLU

  Inputs:
  - x: Input to the affine layer
  - w, b: Weights for the affine layer

  Returns a tuple of:
  - out: Output from the ReLU
  - cache: Object to give to the backward pass
  """
  a, fc_cache = affine_forward(x, w, b)
  out, relu_cache = relu_forward(a)
  cache = (fc_cache, relu_cache)
  return out, cache
affine_relu_backward(dout, cache):
def affine_relu_backward(dout, cache):
  """
  Backward pass for the affine-relu convenience layer
  """
  fc_cache, relu_cache = cache
  da = relu_backward(dout, relu_cache)
  dx, dw, db = affine_backward(da, fc_cache)
  return dx, dw, db
cell 8 Softmax and SVM loss layers
The code for these two loss layers was implemented in an earlier assignment and is provided in the original files, so it is not explained again here; the idea is the same as above. A sketch of the softmax loss follows.
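For completeness, here is a minimal sketch of a vectorized softmax loss in the style this assignment uses (the actual softmax_loss in cs231n/layers.py may differ in details, but it has the same (loss, dx) interface):

def softmax_loss_sketch(x, y):
  """ x: scores of shape (N, C); y: integer labels of shape (N,). """
  # shift the scores for numerical stability before exponentiating
  shifted = x - np.max(x, axis=1, keepdims=True)
  probs = np.exp(shifted) / np.sum(np.exp(shifted), axis=1, keepdims=True)
  N = x.shape[0]
  # mean cross-entropy loss over the minibatch
  loss = -np.sum(np.log(probs[np.arange(N), y])) / N
  # gradient of that loss with respect to the scores
  dx = probs.copy()
  dx[np.arange(N), y] -= 1
  dx /= N
  return loss, dx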
cell 9 Two-layer network
Implement a network whose architecture is affine - relu - affine - softmax.
The underlying principle is still the chain rule:
first run the forward pass, caching the intermediate values that the later partial derivatives will need, then run the backward pass.
N, D, H, C = 3, 5, 50, 7
X = np.random.randn(N, D)
y = np.random.randint(C, size=N)

std = 1e-2
model = TwoLayerNet(input_dim=D, hidden_dim=H, num_classes=C, weight_scale=std)
# 3 examples, 5 input dims, 50 hidden units, 7 classes
# W1 (5, 50)   b1 (50,)   W2 (50, 7)   b2 (7,)

print 'Testing initialization ... '
W1_std = abs(model.params['W1'].std() - std)
b1 = model.params['b1']
W2_std = abs(model.params['W2'].std() - std)
b2 = model.params['b2']
assert W1_std < std / 10, 'First layer weights do not seem right'
assert np.all(b1 == 0), 'First layer biases do not seem right'
assert W2_std < std / 10, 'Second layer weights do not seem right'
assert np.all(b2 == 0), 'Second layer biases do not seem right'

print 'Testing test-time forward pass ... '
model.params['W1'] = np.linspace(-0.7, 0.3, num=D*H).reshape(D, H)
model.params['b1'] = np.linspace(-0.1, 0.9, num=H)
model.params['W2'] = np.linspace(-0.3, 0.4, num=H*C).reshape(H, C)
model.params['b2'] = np.linspace(-0.9, 0.1, num=C)
X = np.linspace(-5.5, 4.5, num=N*D).reshape(D, N).T
scores = model.loss(X)
correct_scores = np.asarray(
  [[11.53165108, 12.2917344,  13.05181771, 13.81190102, 14.57198434, 15.33206765, 16.09215096],
   [12.05769098, 12.74614105, 13.43459113, 14.1230412,  14.81149128, 15.49994135, 16.18839143],
   [12.58373087, 13.20054771, 13.81736455, 14.43418138, 15.05099822, 15.66781506, 16.2846319 ]])
scores_diff = np.abs(scores - correct_scores).sum()
assert scores_diff < 1e-6, 'Problem with test-time forward pass'

print 'Testing training loss (no regularization)'
y = np.asarray([0, 5, 1])
loss, grads = model.loss(X, y)
correct_loss = 3.4702243556
assert abs(loss - correct_loss) < 1e-10, 'Problem with training-time loss'

model.reg = 1.0
loss, grads = model.loss(X, y)
correct_loss = 26.5948426952
assert abs(loss - correct_loss) < 1e-10, 'Problem with regularization loss'

for reg in [0.0, 0.7]:
  print 'Running numeric gradient check with reg = ', reg
  model.reg = reg
  loss, grads = model.loss(X, y)

  for name in sorted(grads):
    f = lambda _: model.loss(X, y)[0]
    grad_num = eval_numerical_gradient(f, model.params[name], verbose=False)
    print '%s relative error: %.2e' % (name, rel_error(grad_num, grads[name]))
Result:
The TwoLayerNet class involved:
class TwoLayerNet(object):
  """
  A two-layer fully-connected neural network with ReLU nonlinearity and
  softmax loss that uses a modular layer design. We assume an input dimension
  of D, a hidden dimension of H, and perform classification over C classes.

  The architecture should be affine - relu - affine - softmax.

  Note that this class does not implement gradient descent; instead, it
  will interact with a separate Solver object that is responsible for running
  optimization.

  The learnable parameters of the model are stored in the dictionary
  self.params that maps parameter names to numpy arrays.
  """

  def __init__(self, input_dim=3 * 32 * 32, hidden_dim=100, num_classes=10,
               weight_scale=1e-3, reg=0.0):
    """
    Initialize a new network.

    Inputs:
    - input_dim: An integer giving the size of the input
    - hidden_dim: An integer giving the size of the hidden layer
    - num_classes: An integer giving the number of classes to classify
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - reg: Scalar giving L2 regularization strength.
    """
    self.params = {}
    self.reg = reg
    self.D = input_dim
    self.M = hidden_dim
    self.C = num_classes

    w1 = weight_scale * np.random.randn(self.D, self.M)
    b1 = np.zeros(hidden_dim)
    w2 = weight_scale * np.random.randn(self.M, self.C)
    b2 = np.zeros(self.C)

    self.params.update({'W1': w1,
                        'W2': w2,
                        'b1': b1,
                        'b2': b2})

  def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """

    ###########################################################################
    # TODO: Implement the forward and backward pass for the two-layer net.    #
    # Store the loss in the loss variable and gradients in the grads          #
    # dictionary. Compute data loss using softmax, and make sure that         #
    # grads[k] holds the gradients for self.params[k]. Don't forget to add    #
    # L2 regularization!                                                      #
    #                                                                         #
    # NOTE: To ensure that your implementation matches ours and you pass the  #
    # automated tests, make sure that your L2 regularization includes a       #
    # factor of 0.5 to simplify the expression for the gradient.              #
    ###########################################################################

    W1, b1, W2, b2 = self.params['W1'], self.params['b1'], \
        self.params['W2'], self.params['b2']

    X = X.reshape(X.shape[0], self.D)
    # Forward into first layer
    hidden_layer, cache_hidden_layer = affine_relu_forward(X, W1, b1)
    # Forward into second layer
    scores, cache_scores = affine_forward(hidden_layer, W2, b2)

    # If y is None then we are in test mode so just return scores
    if y is None:
      return scores

    data_loss, dscores = softmax_loss(scores, y)
    reg_loss = 0.5 * self.reg * np.sum(W1**2)
    reg_loss += 0.5 * self.reg * np.sum(W2**2)
    loss = data_loss + reg_loss

    # Backpropagation
    grads = {}
    # Backprop into second layer
    dx1, dW2, db2 = affine_backward(dscores, cache_scores)
    dW2 += self.reg * W2

    # Backprop into first layer
    dx, dW1, db1 = affine_relu_backward(dx1, cache_hidden_layer)
    dW1 += self.reg * W1

    grads.update({'W1': dW1,
                  'b1': db1,
                  'W2': dW2,
                  'b2': db2})

    return loss, grads
cell 10 Train the model with a standalone Solver.
In earlier assignments the training loop was a method of the model class itself; a separate Solver makes it easy to adjust hyperparameters such as the batch size and learning-rate decay, and it keeps the logic cleaner.
The resulting loss and accuracy curves are then displayed as plots; a rough sketch of the whole setup follows.
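A sketch, assuming the standard cs231n Solver interface (update_rule, optim_config, lr_decay, num_epochs, batch_size); the hyperparameter values here are illustrative rather than the ones actually used:

model = TwoLayerNet(reg=1e-2)
solver = Solver(model, data,
                update_rule='sgd',
                optim_config={'learning_rate': 1e-3},
                lr_decay=0.95,
                num_epochs=10,
                batch_size=100,
                print_every=100)
solver.train()

# the Solver records histories that can be plotted afterwards
plt.subplot(2, 1, 1)
plt.plot(solver.loss_history, 'o')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.subplot(2, 1, 2)
plt.plot(solver.train_acc_history, '-o', label='train')
plt.plot(solver.val_acc_history, '-o', label='val')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()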
cell 13 Build a model with a configurable number of hidden layers
N, D, H1, H2, C = 2, 15, 20, 30, 10
X = np.random.randn(N, D)
y = np.random.randint(C, size=(N,))

for reg in [0, 3.14]:
  print 'Running check with reg = ', reg
  model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C,
                            reg=reg, weight_scale=5e-2, dtype=np.float64)

  loss, grads = model.loss(X, y)
  print 'Initial loss: ', loss

  for name in sorted(grads):
    f = lambda _: model.loss(X, y)[0]
    grad_num = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-5)
    print '%s relative error: %.2e' % (name, rel_error(grad_num, grads[name]))
Since the FullyConnectedNet class is fairly long, it is not pasted here in full.
The main steps:
For a variable number of layers, build the corresponding parameters:
# here dims is assumed to be [input_dim] + hidden_dims + [num_classes]
Ws = {'W' + str(i + 1): weight_scale * np.random.randn(dims[i], dims[i + 1])
      for i in range(len(dims) - 1)}
b = {'b' + str(i + 1): np.zeros(dims[i + 1])
     for i in range(len(dims) - 1)}
之后便是使用這些參數,原理是一致的。
cell 16 SGD+Momentum
def sgd_momentum(w, dw, config=None):
  """
  Performs stochastic gradient descent with momentum.

  config format:
  - learning_rate: Scalar learning rate.
  - momentum: Scalar between 0 and 1 giving the momentum value.
    Setting momentum = 0 reduces to sgd.
  - velocity: A numpy array of the same shape as w and dw used to store a
    moving average of the gradients.
  """
  if config is None: config = {}
  config.setdefault('learning_rate', 1e-2)
  config.setdefault('momentum', 0.9)
  v = config.get('velocity', np.zeros_like(w))

  next_w = None
  #############################################################################
  # TODO: Implement the momentum update formula. Store the updated value in   #
  # the next_w variable. You should also use and update the velocity v.       #
  #############################################################################
  v = config['momentum'] * v - config['learning_rate'] * dw
  next_w = v + w
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################
  config['velocity'] = v

  return next_w, config
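In formula form, with momentum coefficient μ and learning rate η, the update implemented above is:

$$v_{t+1} = \mu\,v_t - \eta\,\nabla_w L,\qquad w_{t+1} = w_t + v_{t+1}$$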
Compared with plain SGD, sgd_momentum converges faster.
cell 18 rmsprop
def rmsprop(x, dx, config=None):
  """
  Uses the RMSProp update rule, which uses a moving average of squared gradient
  values to set adaptive per-parameter learning rates.

  config format:
  - learning_rate: Scalar learning rate.
  - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
    gradient cache.
  - epsilon: Small scalar used for smoothing to avoid dividing by zero.
  - cache: Moving average of second moments of gradients.
  """
  if config is None: config = {}
  config.setdefault('learning_rate', 1e-2)
  config.setdefault('decay_rate', 0.99)
  config.setdefault('epsilon', 1e-8)
  config.setdefault('cache', np.zeros_like(x))

  next_x = None
  #############################################################################
  # TODO: Implement the RMSprop update formula, storing the next value of x   #
  # in the next_x variable. Don't forget to update cache value stored in      #
  # config['cache'].                                                          #
  #############################################################################
  config['cache'] = config['decay_rate'] * config['cache'] + (1 - config['decay_rate']) * dx**2
  next_x = x - config['learning_rate'] * dx / (np.sqrt(config['cache']) + config['epsilon'])
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################

  return next_x, config
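The corresponding update rule, with decay rate ρ and a small ε for numerical stability:

$$c_{t+1} = \rho\,c_t + (1-\rho)\,(\nabla_x L)^2,\qquad x_{t+1} = x_t - \frac{\eta\,\nabla_x L}{\sqrt{c_{t+1}} + \epsilon}$$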
cell 19 adam
def adam(x, dx, config=None):
  """
  Uses the Adam update rule, which incorporates moving averages of both the
  gradient and its square and a bias correction term.

  config format:
  - learning_rate: Scalar learning rate.
  - beta1: Decay rate for moving average of first moment of gradient.
  - beta2: Decay rate for moving average of second moment of gradient.
  - epsilon: Small scalar used for smoothing to avoid dividing by zero.
  - m: Moving average of gradient.
  - v: Moving average of squared gradient.
  - t: Iteration number.
  """
  if config is None: config = {}
  config.setdefault('learning_rate', 1e-3)
  config.setdefault('beta1', 0.9)
  config.setdefault('beta2', 0.999)
  config.setdefault('epsilon', 1e-8)
  config.setdefault('m', np.zeros_like(x))
  config.setdefault('v', np.zeros_like(x))
  config.setdefault('t', 0)

  next_x = None
  beta_1 = config['beta1']
  beta_2 = config['beta2']
  #############################################################################
  # TODO: Implement the Adam update formula, storing the next value of x in   #
  # the next_x variable. Don't forget to update the m, v, and t variables     #
  # stored in config.                                                         #
  #############################################################################
  config['t'] = config['t'] + 1
  config['m'] = config['m'] * config['beta1'] + (1 - config['beta1']) * dx
  config['v'] = config['v'] * config['beta2'] + (1 - config['beta2']) * (dx ** 2)
  # fold the bias corrections into an effective step size; compute it locally
  # so the learning rate stored in config is not permanently decayed
  bias_1 = 1 - (beta_1 ** config['t'])
  bias_2 = np.sqrt(1 - (beta_2 ** config['t']))
  lr_t = config['learning_rate'] * (bias_2 / bias_1)
  next_x = x - lr_t * config['m'] / (np.sqrt(config['v'] + config['epsilon']))
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################

  return next_x, config
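The code above implements the standard Adam rule with the two bias corrections folded into an effective step size (and with ε placed inside the square root, a harmless variant of the usual √v_t + ε):

$$m_t = \beta_1 m_{t-1} + (1-\beta_1)\,\nabla_x L,\qquad v_t = \beta_2 v_{t-1} + (1-\beta_2)\,(\nabla_x L)^2$$
$$x_{t+1} = x_t - \eta\,\frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}}\cdot\frac{m_t}{\sqrt{v_t + \epsilon}}$$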
A comparison of the convergence speed of the four update rules:
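A sketch of how such a comparison can be run: train one small FullyConnectedNet per update rule with the Solver and overlay the loss histories (hyperparameters here are illustrative; in practice each rule typically needs its own learning rate):

solvers = {}
for update_rule in ['sgd', 'sgd_momentum', 'rmsprop', 'adam']:
  model = FullyConnectedNet([100, 100], weight_scale=5e-2)
  solver = Solver(model, data,
                  num_epochs=5, batch_size=100,
                  update_rule=update_rule,
                  optim_config={'learning_rate': 1e-3},
                  verbose=False)
  solver.train()
  solvers[update_rule] = solver

for update_rule, solver in solvers.items():
  plt.plot(solver.loss_history, 'o', label=update_rule)
plt.title('Training loss')
plt.xlabel('Iteration')
plt.legend(loc='upper right')
plt.show()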
The complete code will be provided at the end.
Appendix: QQ group for working through CS231n: 578975100 (verification message: DL-CS231n).