Below is an introductory neural network implementation I put together from cs231n. Small as it is, it is complete: essentially every basic concept of neural networks shows up somewhere in this code.
Reading the theory a thousand times is not as useful as reading the source once and running it.
I have already added plenty of comments to the source, so going through it alongside the code makes it easy to understand.
At the end, the learned weights are visualized as an image grid:
The main script, used for training and hyperparameter tuning
two_layer_net.py

# coding: utf-8

# Implement a simple neural network and test its performance on CIFAR-10

import numpy as np
import matplotlib.pyplot as plt
from neural_net import TwoLayerNet
from data_utils import load_CIFAR10
from vis_utils import visualize_grid

def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
    cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Subsample the data
    mask = list(range(num_training, num_training + num_validation))
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = list(range(num_training))
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = list(range(num_test))
    X_test = X_test[mask]
    y_test = y_test[mask]

    # Normalization: subtract the mean image so the data is zero-centered
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image

    # Flatten each image into a row vector
    X_train = X_train.reshape(num_training, -1)
    X_val = X_val.reshape(num_validation, -1)
    X_test = X_test.reshape(num_test, -1)

    return X_train, y_train, X_val, y_val, X_test, y_test


X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)


# First training run
input_size = 32 * 32 * 3
hidden_size = 50
num_classes = 10
net = TwoLayerNet(input_size, hidden_size, num_classes)
stats = net.train(X_train, y_train, X_val, y_val,
                  num_iters=1000, batch_size=200,
                  learning_rate=1e-4, learning_rate_decay=0.95,
                  reg=0.25, verbose=True)
val_acc = (net.predict(X_val) == y_val).mean()
print('Validation accuracy: ', val_acc)

# The result is not great, so let's debug

# First, plot the loss and accuracy curves and take a look
plt.subplot(2, 1, 1)
plt.plot(stats['loss_history'])
plt.title('Loss history')
plt.xlabel('Iteration')
plt.ylabel('Loss')

plt.subplot(2, 1, 2)
plt.plot(stats['train_acc_history'], label='train')
plt.plot(stats['val_acc_history'], label='val')
plt.title('Classification accuracy history')
plt.xlabel('Epoch')
plt.ylabel('Classification accuracy')
plt.legend()
plt.show()


# Visualize the first-layer weights
def show_net_weights(net):
    W1 = net.params['W1']
    W1 = W1.reshape(32, 32, 3, -1).transpose(3, 0, 1, 2)
    plt.imshow(visualize_grid(W1, padding=3).astype('uint8'))
    plt.gca().axis('off')
    plt.show()

show_net_weights(net)


# From the curves above we can see the loss is still falling roughly linearly,
# which means it has not decreased enough. On one hand we can raise the learning
# rate so the loss drops faster; on the other hand we can also increase the number
# of iterations and let the loss keep falling.
# In addition, there is no clear gap between training and validation accuracy,
# which suggests the network capacity may be too small; we can try increasing the
# complexity of the network so it has more expressive power.


# Below are the hyperparameters I ended up with (it actually took a long search);
# the accuracy on the test set is around 55%.
hidden_size = [150]               # searched over [50, 70, 100, 130]
learning_rates = [1e-3]           # searched over np.array([0.5, 1, 1.5]) * 1e-3
regularization_strengths = [0.2]  # searched over [0.1, 0.2, 0.3]
best_net = None
results = {}
best_val_acc = 0


for hs in hidden_size:
    for lr in learning_rates:
        for reg in regularization_strengths:

            net = TwoLayerNet(input_size, hs, num_classes)
            # Train the network
            stats = net.train(X_train, y_train, X_val, y_val,
                              num_iters=3000, batch_size=200,
                              learning_rate=lr, learning_rate_decay=0.95,
                              reg=reg, verbose=False)
            val_acc = (net.predict(X_val) == y_val).mean()
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_net = net
            results[(hs, lr, reg)] = val_acc

            plt.subplot(2, 1, 1)
            plt.plot(stats['loss_history'])
            plt.title('Loss history')
            plt.xlabel('Iteration')
            plt.ylabel('Loss')

            plt.subplot(2, 1, 2)
            plt.plot(stats['train_acc_history'], label='train')
            plt.plot(stats['val_acc_history'], label='val')
            plt.title('Classification accuracy history')
            plt.xlabel('Epoch')
            plt.ylabel('Classification accuracy')
            plt.legend()
            plt.show()


for hs, lr, reg in sorted(results):
    val_acc = results[(hs, lr, reg)]
    print('hs %d lr %e reg %e val accuracy: %f' % (hs, lr, reg, val_acc))

print('best validation accuracy achieved during cross-validation: %f' % best_val_acc)


show_net_weights(best_net)
test_acc = (best_net.predict(X_test) == y_test).mean()
print('Test accuracy: ', test_acc)
The class that defines the neural network: forward and backward passes, the loss function, and the training loop
neural_net.py

import numpy as np
import matplotlib.pyplot as plt

class TwoLayerNet(object):
    """
    A two-layer fully connected network with a softmax loss, L2 regularization,
    and a ReLU non-linearity.
    Architecture: input - fully connected layer - ReLU - fully connected layer - softmax
    """

    def __init__(self, input_size, hidden_size, output_size, std=1e-4):
        """
        Initialize the model: the weight matrices W and bias vectors b.
        The biases are set to zero here, although the AlexNet paper notes that
        initializing b to 1 for ReLU units can speed up convergence.
        All parameters are stored in the self.params dict with keys:
        W1 (D, H)
        b1 (H,)
        W2 (H, C)
        b2 (C,)
        where D, H, C are the input dimension, the hidden layer size, and the
        number of output classes.
        """
        self.params = {}
        self.params['W1'] = std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def loss(self, X, y=None, reg=0.0):
        """
        During training, compute the loss and gradients; during testing, return
        the input to the last layer, i.e. the score of each class.

        Inputs:
        - X (N, D): X[i] is one training sample.
        - y: labels. If None we are in test mode, otherwise training mode.
        - reg: regularization strength.

        Returns:
        If y is None, return a matrix of shape (N, C), where scores[i, c] is the
        score of input i on class c.

        If y is not None, return a tuple:
        - loss: the data loss plus the regularization loss.
        - grads: the gradient of each parameter.
        """

        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        N, D = X.shape
        C = b2.shape[0]

        # Forward pass
        h1 = np.maximum(0, np.dot(X, W1) + b1)
        h2 = np.dot(h1, W2) + b2
        scores = h2

        if y is None:
            return scores

        # Compute the loss
        shift_scores = scores - np.max(scores, axis=1).reshape(-1, 1)
        exp_scores = np.exp(shift_scores)
        softmax_out = exp_scores / np.sum(exp_scores, axis=1).reshape(-1, 1)
        loss = np.sum(-np.log(softmax_out[range(N), y])) / N \
               + reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
        # (debug) print the data loss and the regularization loss separately:
        # print(np.sum(-np.log(softmax_out[range(N), y])) / N,
        #       reg * (np.sum(W1 * W1) + np.sum(W2 * W2)))

        # Backward pass: compute the gradients; this is just the chain rule
        grads = {}

        dscores = softmax_out.copy()
        dscores[range(N), y] -= 1
        dscores /= N

        grads['W2'] = np.dot(h1.T, dscores) + 2 * reg * W2
        grads['b2'] = np.sum(dscores, axis=0)

        dh = np.dot(dscores, W2.T)
        d_max = (h1 > 0) * dh

        grads['W1'] = X.T.dot(d_max) + 2 * reg * W1
        grads['b1'] = np.sum(d_max, axis=0)

        return loss, grads

    def train(self, X, y, X_val, y_val,
              learning_rate=1e-3, learning_rate_decay=0.95,
              reg=5e-6, num_iters=100,
              batch_size=200, verbose=False):
        """
        Automated training loop, optimized with SGD.

        Inputs:
        - X (N, D): training inputs.
        - y (N,): labels. y[i] = c means the class index of X[i] is c.
        - X_val (N_val, D): validation inputs.
        - y_val (N_val,): validation labels.
        - learning_rate: step size.
        - learning_rate_decay: decay factor applied to the learning rate per epoch.
        - reg: regularization strength.
        - num_iters: number of iterations.
        - batch_size: mini-batch size per iteration.
        - verbose: whether to print training progress.
        """
        num_train = X.shape[0]
        iterations_per_epoch = max(num_train / batch_size, 1)

        loss_history = []
        train_acc_history = []
        val_acc_history = []

        for it in range(num_iters):
            # Randomly sample a mini-batch
            idx = np.random.choice(num_train, batch_size, replace=True)
            X_batch = X[idx]
            y_batch = y[idx]
            # Compute the loss and gradients
            loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
            loss_history.append(loss)
            # Update the parameters
            self.params['W2'] += -learning_rate * grads['W2']
            self.params['b2'] += -learning_rate * grads['b2']
            self.params['W1'] += -learning_rate * grads['W1']
            self.params['b1'] += -learning_rate * grads['b1']
            # Show progress
            if verbose and it % 100 == 0:
                print('iteration %d / %d: loss %f' % (it, num_iters, loss))

            # Record accuracies once per epoch
            if it % iterations_per_epoch == 0:
                train_acc = (self.predict(X_batch) == y_batch).mean()
                val_acc = (self.predict(X_val) == y_val).mean()
                train_acc_history.append(train_acc)
                val_acc_history.append(val_acc)
                # Decay the learning rate
                learning_rate *= learning_rate_decay

        return {
            'loss_history': loss_history,
            'train_acc_history': train_acc_history,
            'val_acc_history': val_acc_history,
        }

    def predict(self, X):
        """
        Predict labels for the inputs using the trained parameters.

        Inputs:
        - X (N, D): inputs to classify.

        Returns:
        - y_pred (N,): predicted class index for each input.
        """

        h = np.maximum(0, X.dot(self.params['W1']) + self.params['b1'])
        scores = h.dot(self.params['W2']) + self.params['b2']
        y_pred = np.argmax(scores, axis=1)

        return y_pred
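The trickiest part of TwoLayerNet.loss is the backward pass. For the softmax loss, the gradient of the per-sample loss with respect to the scores is simply the softmax output minus a one-hot encoding of the true label, which is what the dscores[range(N), y] -= 1 line implements. Even so, analytic gradients are easy to get subtly wrong, so it is worth comparing them against numerical gradients on a tiny random problem. The snippet below is a minimal sketch of such a check; it is not part of the original files, and the numerical_gradient helper and the toy sizes are my own choices.

import numpy as np
from neural_net import TwoLayerNet

def numerical_gradient(f, x, h=1e-5):
    """Centered finite-difference gradient of the scalar function f, with respect
    to the array x, which is perturbed in place."""
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        old = x[ix]
        x[ix] = old + h
        fxph = f()            # f(x + h)
        x[ix] = old - h
        fxmh = f()            # f(x - h)
        x[ix] = old           # restore the original value
        grad[ix] = (fxph - fxmh) / (2 * h)
        it.iternext()
    return grad

# A tiny random problem: 5 samples, 4 input dims, 10 hidden units, 3 classes
np.random.seed(0)
net = TwoLayerNet(4, 10, 3, std=1e-1)
X = np.random.randn(5, 4)
y = np.random.randint(3, size=5)

loss, grads = net.loss(X, y, reg=0.05)
for name in ['W1', 'b1', 'W2', 'b2']:
    param = net.params[name]
    num_grad = numerical_gradient(lambda: net.loss(X, y, reg=0.05)[0], param)
    rel_err = np.max(np.abs(num_grad - grads[name]) /
                     np.maximum(np.abs(num_grad) + np.abs(grads[name]), 1e-8))
    # If the backward pass is correct, the relative error should be tiny (roughly 1e-7 or less)
    print('%s max relative error: %e' % (name, rel_err))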
Functions for loading the CIFAR-10 data
data_utils.py

from six.moves import cPickle as pickle
import numpy as np
import os
import platform

def load_pickle(f):
    version = platform.python_version_tuple()
    if version[0] == '2':
        return pickle.load(f)
    elif version[0] == '3':
        return pickle.load(f, encoding='latin1')
    raise ValueError("invalid python version: {}".format(version))

def load_CIFAR_batch(filename):
    """ CIFAR-10 is stored in batches; this function loads a single batch """
    with open(filename, 'rb') as f:  # open the file in binary mode
        datadict = load_pickle(f)
        X = datadict['data']
        Y = datadict['labels']
        X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
        Y = np.array(Y)
        return X, Y

def load_CIFAR10(ROOT):
    """ Load all of the CIFAR-10 data """
    xs = []
    ys = []
    for b in range(1, 6):
        f = os.path.join(ROOT, 'data_batch_%d' % (b,))
        X, Y = load_CIFAR_batch(f)
        xs.append(X)
        ys.append(Y)
    Xtr = np.concatenate(xs)
    Ytr = np.concatenate(ys)
    del X, Y
    Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
    return Xtr, Ytr, Xte, Yte
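For reference, each data_batch_* file unpickles to a dict whose 'data' entry is a (10000, 3072) uint8 array (each row is the red, green and blue planes of one image concatenated) and whose 'labels' entry is a list of 10000 integers in [0, 9]; load_CIFAR_batch reshapes this into (10000, 32, 32, 3). A quick sanity check might look like the sketch below (the dataset path is the same assumption used in two_layer_net.py; adjust it to wherever your copy lives).

import os
from data_utils import load_CIFAR_batch, load_CIFAR10

# Assumed path, matching the layout used in two_layer_net.py
cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'

# One raw batch: 10000 images of 32x32x3, labels in [0, 9]
X, Y = load_CIFAR_batch(os.path.join(cifar10_dir, 'data_batch_1'))
assert X.shape == (10000, 32, 32, 3)
assert Y.shape == (10000,) and Y.min() >= 0 and Y.max() <= 9

# The full dataset: 50000 training and 10000 test images
Xtr, Ytr, Xte, Yte = load_CIFAR10(cifar10_dir)
print(Xtr.shape, Ytr.shape, Xte.shape, Yte.shape)
# expected: (50000, 32, 32, 3) (50000,) (10000, 32, 32, 3) (10000,)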
The helper function used for visualization
vis_utils.py

from math import sqrt, ceil
import numpy as np

def visualize_grid(Xs, ubound=255.0, padding=1):
    """
    Lay 4-D data out on a flat grid, i.e. display N three-channel images
    of shape (N, H, W, C) at the same time.

    Inputs:
    - Xs: data of shape (N, H, W, C)
    - ubound: pixels are rescaled into the range [0, ubound]
    - padding: blank padding between grid cells
    """
    (N, H, W, C) = Xs.shape
    grid_size = int(ceil(sqrt(N)))
    grid_height = H * grid_size + padding * (grid_size - 1)
    grid_width = W * grid_size + padding * (grid_size - 1)
    grid = np.zeros((grid_height, grid_width, C))
    next_idx = 0
    y0, y1 = 0, H
    for y in range(grid_size):
        x0, x1 = 0, W
        for x in range(grid_size):
            if next_idx < N:
                img = Xs[next_idx]
                low, high = np.min(img), np.max(img)
                grid[y0:y1, x0:x1] = ubound * (img - low) / (high - low)
                next_idx += 1
            x0 += W + padding
            x1 += W + padding
        y0 += H + padding
        y1 += H + padding
    return grid
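To see what visualize_grid produces independently of a trained network, you can feed it a few random images. The following is a small usage sketch I added for illustration; it is not part of the original post.

import numpy as np
import matplotlib.pyplot as plt
from vis_utils import visualize_grid

# 10 random 32x32 RGB "images"; the grid has 4x4 cells, the last 6 stay black
Xs = np.random.randn(10, 32, 32, 3)
grid = visualize_grid(Xs, ubound=255.0, padding=2)
print(grid.shape)  # (4*32 + 3*2, 4*32 + 3*2, 3) = (134, 134, 3)

plt.imshow(grid.astype('uint8'))
plt.gca().axis('off')
plt.show()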