Reposted from 麥子學院.
1 """ 2 network.py 3 ~~~~~~~~~~ 4 5 A module to implement the stochastic gradient descent learning 6 algorithm for a feedforward neural network. Gradients are calculated 7 using backpropagation. Note that I have focused on making the code 8 simple, easily readable, and easily modifiable. It is not optimized, 9 and omits many desirable features. 10 """ 11 12 #### Libraries 13 # Standard library 14 import random 15 16 # Third-party libraries 17 import numpy as np 18 19 class Network(object): 20 21 def __init__(self, sizes): 22 """The list ``sizes`` contains the number of neurons in the 23 respective layers of the network. For example, if the list 24 was [2, 3, 1] then it would be a three-layer network, with the 25 first layer containing 2 neurons, the second layer 3 neurons, 26 and the third layer 1 neuron. The biases and weights for the 27 network are initialized randomly, using a Gaussian 28 distribution with mean 0, and variance 1. Note that the first 29 layer is assumed to be an input layer, and by convention we 30 won't set any biases for those neurons, since biases are only 31 ever used in computing the outputs from later layers.""" 32 self.num_layers = len(sizes) 33 self.sizes = sizes 34 self.biases = [np.random.randn(y, 1) for y in sizes[1:]] 35 self.weights = [np.random.randn(y, x) 36 for x, y in zip(sizes[:-1], sizes[1:])] 37 38 def feedforward(self, a): 39 """Return the output of the network if ``a`` is input.""" 40 for b, w in zip(self.biases, self.weights): 41 a = sigmoid(np.dot(w, a)+b) 42 return a 43 44 def SGD(self, training_data, epochs, mini_batch_size, eta, 45 test_data=None): 46 """Train the neural network using mini-batch stochastic 47 gradient descent. The ``training_data`` is a list of tuples 48 ``(x, y)`` representing the training inputs and the desired 49 outputs. The other non-optional parameters are 50 self-explanatory. If ``test_data`` is provided then the 51 network will be evaluated against the test data after each 52 epoch, and partial progress printed out. This is useful for 53 tracking progress, but slows things down substantially.""" 54 if test_data: n_test = len(test_data) 55 n = len(training_data) 56 for j in range(epochs): 57 random.shuffle(training_data) 58 mini_batches = [ 59 training_data[k:k+mini_batch_size] 60 for k in range(0, n, mini_batch_size)] 61 for mini_batch in mini_batches: 62 self.update_mini_batch(mini_batch, eta) 63 if test_data: 64 print ("Epoch {0}: {1} / {2}".format( 65 j, self.evaluate(test_data), n_test)) 66 else: 67 print ("Epoch {0} complete".format(j)) 68 69 def update_mini_batch(self, mini_batch, eta): 70 """Update the network's weights and biases by applying 71 gradient descent using backpropagation to a single mini batch. 72 The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta`` 73 is the learning rate.""" 74 nabla_b = [np.zeros(b.shape) for b in self.biases] 75 nabla_w = [np.zeros(w.shape) for w in self.weights] 76 #一個一個的進行訓練 跟吳恩達的Mini-Batch 不一樣 77 for x, y in mini_batch: 78 delta_nabla_b, delta_nabla_w = self.backprop(x, y) 79 nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)] 80 nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)] 81 self.weights = [w-(eta/len(mini_batch))*nw 82 for w, nw in zip(self.weights, nabla_w)] 83 self.biases = [b-(eta/len(mini_batch))*nb 84 for b, nb in zip(self.biases, nabla_b)] 85 86 def backprop(self, x, y): 87 """Return a tuple ``(nabla_b, nabla_w)`` representing the 88 gradient for the cost function C_x. 
``nabla_b`` and 89 ``nabla_w`` are layer-by-layer lists of numpy arrays, similar 90 to ``self.biases`` and ``self.weights``.""" 91 nabla_b = [np.zeros(b.shape) for b in self.biases] 92 nabla_w = [np.zeros(w.shape) for w in self.weights] 93 # feedforward 94 activation = x 95 activations = [x] # list to store all the activations, layer by layer 96 zs = [] # list to store all the z vectors, layer by layer 97 for b, w in zip(self.biases, self.weights): 98 z = np.dot(w, activation)+b 99 zs.append(z) 100 activation = sigmoid(z) 101 activations.append(activation) 102 # backward pass 103 delta = self.cost_derivative(activations[-1], y) * \ 104 sigmoid_prime(zs[-1]) 105 nabla_b[-1] = delta 106 nabla_w[-1] = np.dot(delta, activations[-2].transpose()) 107 # Note that the variable l in the loop below is used a little 108 # differently to the notation in Chapter 2 of the book. Here, 109 # l = 1 means the last layer of neurons, l = 2 is the 110 # second-last layer, and so on. It's a renumbering of the 111 # scheme in the book, used here to take advantage of the fact 112 # that Python can use negative indices in lists. 113 for l in range(2, self.num_layers): 114 z = zs[-l] 115 sp = sigmoid_prime(z) 116 delta = np.dot(self.weights[-l+1].transpose(), delta) * sp 117 nabla_b[-l] = delta 118 nabla_w[-l] = np.dot(delta, activations[-l-1].transpose()) 119 return (nabla_b, nabla_w) 120 121 def evaluate(self, test_data): 122 """Return the number of test inputs for which the neural 123 network outputs the correct result. Note that the neural 124 network's output is assumed to be the index of whichever 125 neuron in the final layer has the highest activation.""" 126 test_results = [(np.argmax(self.feedforward(x)), y) 127 for (x, y) in test_data] 128 return sum(int(x == y) for (x, y) in test_results) 129 130 def cost_derivative(self, output_activations, y): 131 """Return the vector of partial derivatives \partial C_x / 132 \partial a for the output activations.""" 133 return (output_activations-y) 134 135 #### Miscellaneous functions 136 def sigmoid(z): 137 """The sigmoid function.""" 138 return 1.0/(1.0+np.exp(-z)) 139 140 def sigmoid_prime(z): 141 """Derivative of the sigmoid function.""" 142 return sigmoid(z)*(1-sigmoid(z))
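As a quick illustration of how this class is meant to be used, here is a minimal sketch with toy data I made up (not part of the original module; it assumes the code above is saved as network.py). Each input ``x`` and each target ``y`` must be a column vector of shape (n, 1):

import numpy as np
from network import Network  # assumes the code above is saved as network.py

# Toy XOR-style data: (2, 1) input vectors paired with (1, 1) targets.
training_data = [(np.array([[0.], [0.]]), np.array([[0.]])),
                 (np.array([[0.], [1.]]), np.array([[1.]])),
                 (np.array([[1.], [0.]]), np.array([[1.]])),
                 (np.array([[1.], [1.]]), np.array([[0.]]))]

net = Network([2, 3, 1])  # 2 inputs, one hidden layer of 3 neurons, 1 output
net.SGD(training_data, epochs=100, mini_batch_size=4, eta=3.0)  # smoke test only
print(net.feedforward(np.array([[1.], [0.]])))  # output activation for one input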
This algorithm achieves higher accuracy than the neural network I had written before, but I found errors while testing it, and the comments scattered through the code never quite made sense to me; they do not connect well with the theory. I made improvements on top of it, making the algorithm more extensible, and tested the improved code myself; it works very well.
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 18 15:27:24 2018

@author: markli
"""

import numpy as np
import random

def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return 1.0 - np.tanh(x)*np.tanh(x)

def logistic(x):
    return 1/(1 + np.exp(-x))

def logistic_derivative(x):
    return logistic(x)*(1-logistic(x))

def ReLU(x, a=1):
    # np.maximum works element-wise on arrays, unlike the built-in max()
    return np.maximum(0, a * x)

def ReLU_derivative(x, a=1):
    return np.where(x < 0, 0, a)

class NeuralNetwork:
    '''
    Z = W * x + b
    A = sigmoid(Z)
    Z: net input
    x: sample matrix, n * m (n features, m samples)
    b: bias
    W: weights
    A: output activation
    '''
    def __init__(self, layers, active_function=[logistic],
                 active_function_der=[logistic_derivative], learn_rate=0.9):
        """
        Initialize the neural network.
        layers holds the number of neurons in each layer; its length is the number of layers.
        active_function specifies one activation function per layer; if it has length 1,
        the same activation is used in every layer.
        active_function_der holds the derivatives of the activation functions.
        learn_rate is the learning rate.
        """
        self.weights = [np.random.randn(x, y) for x, y in zip(layers[1:], layers[:-1])]
        self.biases = [np.random.randn(x, 1) for x in layers[1:]]
        self.size = len(layers)
        self.rate = learn_rate
        self.sigmoids = []
        self.sigmoids_der = []
        for i in range(len(layers)-1):
            if len(active_function) == self.size-1:
                self.sigmoids = active_function
            else:
                self.sigmoids.append(active_function[0])
            if len(active_function_der) == self.size-1:
                self.sigmoids_der = active_function_der
            else:
                self.sigmoids_der.append(active_function_der[0])

    def fit(self, TrainData, epochs=1000, mini_batch_size=32):
        """
        Learn the network parameters with mini-batch backpropagation.
        TrainData is a list of (X, Y) pairs:
        X is a feature vector with n features,
        Y is the target vector with t class labels.
        epochs is the number of iterations.
        mini_batch_size is the size of one mini batch; set it to 1 to disable mini batches.
        """
        n = len(TrainData)
        for i in range(epochs):
            random.shuffle(TrainData)
            mini_batches = [
                TrainData[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.BP(mini_batch, self.rate)

    def predict(self, x):
        """Forward propagation."""
        i = 0
        for b, w in zip(self.biases, self.weights):
            x = self.sigmoids[i](np.dot(w, x)+b)
            i = i + 1
        return x

    def BP(self, mini_batch, rate):
        """
        Update the parameters from one mini batch (BP algorithm).
        """
        size = len(mini_batch)

        nabla_b = [np.zeros(b.shape) for b in self.biases]   # accumulated change of b over this batch
        nabla_w = [np.zeros(w.shape) for w in self.weights]  # accumulated change of w over this batch
        # train one example at a time
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]  # accumulate the change of b
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]  # accumulate the change of w
        self.weights = [w-(rate/size)*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(rate/size)*nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """
        x is a 1-D row vector
        y is a 1-D row vector
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = np.atleast_2d(x).reshape((len(x), 1))  # convert to a column vector
        activations = [activation]  # store the activations a of every layer
        zs = []  # store the z value of every layer
        i = 0
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = self.sigmoids[i](z)
            activations.append(activation)
            i = i + 1
        # backward pass
        y = np.atleast_2d(y).reshape((len(y), 1))  # convert y to a column vector
        # delta is the partial derivative of the cost with respect to z
        delta = self.cost_der(activations[-1], y) * \
            self.sigmoids_der[-1](zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, np.transpose(activations[-2]))
        # walk backwards through the layers, starting from the second-to-last layer
        for l in range(2, self.size):
            z = zs[-l]  # z of the current layer
            sp = self.sigmoids_der[-l](z)  # derivative with respect to z
            delta = np.multiply(np.dot(np.transpose(self.weights[-l+1]), delta), sp)  # error of the current layer
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, np.transpose(activations[-l-1]))
        return (nabla_b, nabla_w)

    """
    Cost functions
    cost_der: derivative of the squared-error cost with respect to a
    cost_cross_entropy_der: derivative of the cross-entropy cost with respect to a
    """
    def cost_der(self, a, y):
        return a - y

    def cost_cross_entropy_der(self, a, y):
        return (a-y)/(a * (1-a))
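One concrete gain from the rewrite is that the activation function can be chosen per layer. A minimal sketch of how that looks (it assumes the code above is saved as network_mark.py, which is also what the test script further down imports from; the layer sizes here are only for illustration):

from network_mark import (NeuralNetwork, tanh, tanh_derivative,
                          logistic, logistic_derivative)

# Default: the logistic activation is reused in every layer.
nn_default = NeuralNetwork([64, 100, 10])

# Per-layer choice: tanh in the hidden layer, logistic at the output.
# One (function, derivative) pair is supplied per weighted layer.
nn_mixed = NeuralNetwork([64, 100, 10],
                         active_function=[tanh, logistic],
                         active_function_der=[tanh_derivative, logistic_derivative],
                         learn_rate=0.5)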
The above is the source code of the BP neural network. Below is a digit-recognition program used to check that the code works correctly.
import numpy as np
from sklearn.datasets import load_digits
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelBinarizer
from network_mark import NeuralNetwork
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer versions


digits = load_digits()
X = digits.data
y = digits.target
X -= X.min()  # normalize the values to bring them into the range 0-1
X /= X.max()

nn = NeuralNetwork([64, 100, 10])
X_train, X_test, y_train, y_test = train_test_split(X, y)
labels_train = LabelBinarizer().fit_transform(y_train)
labels_test = LabelBinarizer().fit_transform(y_test)

# X_train.shape (1347, 64)
# y_train.shape (1347,)
# labels_train.shape (1347, 10)
# labels_test.shape (450, 10)

print("start fitting")
Data = [(x, y) for x, y in zip(X_train, labels_train)]
# print(Data)
nn.fit(Data, epochs=500, mini_batch_size=32)
result = nn.predict(X_test.T)  # shape (10, n_test); each column is the output for one sample
predictions = [np.argmax(result[:, i]) for i in range(result.shape[1])]

print(predictions)
# for i in range(result.shape[1]):
#     y = result[:, i]
#     predictions.append(np.argmax(y))
# print(np.atleast_2d(predictions).shape)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
Finally, the test results: the accuracy is quite respectable.
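For a single overall accuracy figure, the predictions from the script above can be compared against y_test directly (a small sketch of mine, not output reported in the original post):

accuracy = np.mean(np.array(predictions) == y_test)
print("accuracy: {:.3f}".format(accuracy))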