BP算法在MNIST上的簡單實現
數據:http://yann.lecun.com/exdb/mnist/
參考:blog,blog2,blog3,tensorflow
基本實現
import struct
import random
import numpy as np
from math import sqrt
class Data:
    """Three-layer sigmoid network trained on MNIST with per-sample back-propagation.

    The 60000-image training file is split into ``num_train`` training samples
    followed by ``num_confirm`` confirmation (validation) samples.  Weights are
    named after the layers they connect: ``wij``/``wi0`` are the input->hidden
    weights and biases, ``wjk``/``wj0`` the hidden->output weights and biases.

    Typical use: ``Data()`` -> ``init_network()`` -> ``training()`` -> ``testing()``.
    Hyper-parameters may be overwritten on the instance before ``init_network()``.
    """

    def __init__(self):
        """Set dataset sizes, layer sizes and training hyper-parameters."""
        print('parameter initializing...')
        self.num_train = 50000
        self.num_confirm = 10000
        self.num_test = 10000
        self.node_in = 28 * 28
        self.node_out = 10
        # Hyper-parameters that need tuning.  Accuracies seen in earlier runs:
        #   epoch:8  hide_node:39  accuracy:0.9613
        #   epoch:8  hide_node:44  accuracy:0.9612
        #   epoch:8  hide_node:48  accuracy:0.9624
        #   epoch:9  hide_node:48  accuracy:0.9648
        #   epoch:10 hide_node:200 accuracy:0.9724
        self.epoch = 15
        self.node_hide = 30
        self.study_rate = 0.05
        # Kept for callers that set/read it; no method of this class reads it.
        self.error_limit = 1e-2

    @staticmethod
    def _load_idx_images(filename, count):
        """Return the first ``count`` images of an IDX3 file, scaled to [0, 1].

        The result has shape ``(count, rows * cols)`` with ``rows``/``cols``
        taken from the file header.

        Raises:
            ValueError: if the header announces fewer than ``count`` images.
        """
        # The context manager closes the file even if reading fails.
        with open(filename, 'rb') as binfile:
            raw = binfile.read()
        # Header: four big-endian unsigned ints (magic, image count, rows, cols).
        _magic, num, rows, colums = struct.unpack_from('>IIII', raw, 0)
        if num < count:
            raise ValueError('%s holds %d images, %d requested'
                             % (filename, num, count))
        pixels = rows * colums  # 28*28 = 784 for MNIST
        flat = np.frombuffer(raw, dtype=np.uint8, count=count * pixels,
                             offset=struct.calcsize('>IIII'))
        # Unsigned bytes 0..255 -> floats 0..1.
        return flat.reshape(count, pixels) / 255.0

    @staticmethod
    def _load_idx_labels(filename, count):
        """Return the first ``count`` labels of an IDX1 file as a uint8 vector.

        Raises:
            ValueError: if the header announces fewer than ``count`` labels.
        """
        with open(filename, 'rb') as binfile:
            raw = binfile.read()
        # Header: two big-endian unsigned ints (magic, label count).
        _magic, num = struct.unpack_from('>II', raw, 0)
        if num < count:
            raise ValueError('%s holds %d labels, %d requested'
                             % (filename, num, count))
        return np.frombuffer(raw, dtype=np.uint8, count=count,
                             offset=struct.calcsize('>II'))

    def read_train_image(self, filename):
        """Fill the training and confirmation image arrays from ``filename``."""
        print('reading train-image data...')
        images = self._load_idx_images(filename,
                                       self.num_train + self.num_confirm)
        self.train_imag_list[:self.num_train, :] = images[:self.num_train]
        self.confirm_imag_list[:self.num_confirm, :] = images[self.num_train:]

    def read_train_label(self, filename):
        """Fill the training and confirmation label arrays from ``filename``."""
        print('reading train-label data...')
        labels = self._load_idx_labels(filename,
                                       self.num_train + self.num_confirm)
        self.train_label_list[:self.num_train, 0] = labels[:self.num_train]
        self.confirm_label_list[:self.num_confirm, 0] = labels[self.num_train:]

    def read_test_image(self, filename):
        """Fill the test image array from ``filename``."""
        print('reading test-image data...')
        # Same 1/255 scaling as the training images so both sets share a range.
        self.test_imag_list[:self.num_test, :] = self._load_idx_images(
            filename, self.num_test)

    def read_test_label(self, filename):
        """Fill the test label array from ``filename``."""
        print('reading test-label data...')
        self.test_label_list[:self.num_test, 0] = self._load_idx_labels(
            filename, self.num_test)

    def init_network(self):
        """Allocate the data arrays, load the four MNIST files and draw weights."""
        print('network initializing...')
        self.train_imag_list = np.zeros((self.num_train, self.node_in))
        self.train_label_list = np.zeros((self.num_train, 1))
        self.confirm_imag_list = np.zeros((self.num_confirm, self.node_in))
        self.confirm_label_list = np.zeros((self.num_confirm, 1))
        self.test_imag_list = np.zeros((self.num_test, self.node_in))
        self.test_label_list = np.zeros((self.num_test, 1))
        self.read_train_image('train-images.idx3-ubyte')
        self.read_train_label('train-labels.idx1-ubyte')
        self.read_test_image('t10k-images.idx3-ubyte')
        self.read_test_label('t10k-labels.idx1-ubyte')
        # Uniform in [-1/sqrt(fan_in), 1/sqrt(fan_in)) for every layer.
        self.wjk = (np.random.rand(self.node_hide, self.node_out) - 0.5) * 2 / sqrt(self.node_hide)
        self.wj0 = (np.random.rand(self.node_out) - 0.5) * 2 / sqrt(self.node_hide)
        self.wij = (np.random.rand(self.node_in, self.node_hide) - 0.5) * 2 / sqrt(self.node_in)
        self.wi0 = (np.random.rand(self.node_hide) - 0.5) * 2 / sqrt(self.node_in)

    def sigmode(self, x):
        """Return the logistic sigmoid of ``x`` (element-wise for arrays)."""
        return 1.0 / (1.0 + np.exp(-x))

    def calc_yjzk(self, sample_i, imag_list):
        """Forward pass for row ``sample_i`` of ``imag_list``.

        Stores the hidden activations in ``self.yj`` and the output
        activations in ``self.zk`` (pre-activations in ``netj``/``netk``).
        """
        self.netj = np.dot(imag_list[sample_i], self.wij) + self.wi0
        self.yj = self.sigmode(self.netj)
        self.netk = np.dot(self.yj, self.wjk) + self.wj0
        self.zk = self.sigmode(self.netk)

    def calc_error(self):
        """Return the summed half squared error over the confirmation set."""
        ans = 0.0
        for sample_i in range(self.num_confirm):
            self.calc_yjzk(sample_i, self.confirm_imag_list)
            label_tmp = np.zeros(self.node_out)
            # Index the single label column explicitly; int() on a 1-element
            # array is deprecated in recent NumPy.
            label_tmp[int(self.confirm_label_list[sample_i, 0])] = 1
            ans = ans + np.sum(np.square(label_tmp - self.zk)) / 2.0
        return ans

    def training(self):
        """Run ``epoch * num_train`` stochastic back-propagation updates.

        Each step draws one training sample uniformly at random (with
        replacement) and applies one gradient step of the half squared error
        to all weights and biases.
        """
        print('training model...')
        for epoch_i in range(self.epoch):
            for circle in range(self.num_train):
                sample_i = np.random.randint(0, self.num_train)
                self.calc_yjzk(sample_i, self.train_imag_list)

                # One-hot target for this sample.
                tmp_label = np.zeros(self.node_out)
                tmp_label[int(self.train_label_list[sample_i, 0])] = 1

                # Output-layer error signal: dE/dnet_k.
                delta_k = (self.zk - tmp_label) * self.zk * (1 - self.zk)
                # Hidden-layer error signal.  Computed BEFORE wjk is changed so
                # the gradient uses the weights that produced this forward pass.
                delta_j = np.dot(self.wjk, delta_k) * self.yj * (1 - self.yj)

                # hidden -> output weights and biases
                self.wjk = self.wjk - self.study_rate * np.outer(self.yj, delta_k)
                self.wj0 = self.wj0 - self.study_rate * delta_k
                # input -> hidden weights and biases
                tmp_imag = self.train_imag_list[sample_i]
                self.wij = self.wij - self.study_rate * np.outer(tmp_imag, delta_j)
                self.wi0 = self.wi0 - self.study_rate * delta_j

    def testing(self):
        """Classify the test set, store the hit ratio in ``self.accuracy`` and print it."""
        print('testing...')
        num_right = 0.0
        for sample_i in range(self.num_test):
            self.calc_yjzk(sample_i, self.test_imag_list)
            ans = self.zk.argmax()
            if ans == int(self.test_label_list[sample_i, 0]):
                num_right = num_right + 1
        self.accuracy = num_right / self.num_test
        print('accuracy: %.4f' % (self.accuracy * 100) + '%')
def main():
    """Build the network, train it, then report accuracy on the test set."""
    network = Data()
    network.init_network()
    network.training()
    network.testing()


if __name__ == '__main__':
    main()
注意
- 注意數據的編碼格式,在數據來源網站最底下有指出,上面還展示了一些機器學習的經典模型在MNIST數據集上的錯誤率可供參考
- 權值合理的初始化,及迭代次數,學習速率,隱層節點數的設置可參考經驗值
- 數據的歸一化(防止sigmoid函數溢出)
- 矩陣乘法時注意行列條件的滿足
- 合理的epoch(即迭代次數,學習速率小的時候可以大一點的迭代次數,學習速率大的時候迭代次數取較小值)
- 確認合適的迭代次數後可去掉確認集,用全部的樣本數據訓練模型
- 隱層節點在一定範圍內基本上越多越好(但過多時準確率反而下降,例如下面試驗中1000個隱層節點的準確率低於200個)
調參腳本
import ann
# Grid search over epoch count and hidden-layer size.  Every combination
# trains a fresh network and appends one result line to 'best_parameter'.
# The context manager closes the file even if a run raises.
with open('best_parameter', 'a+') as f:
    for e in range(10, 40):
        for node in range(10, 50):
            data = ann.Data()
            # Override the defaults before init_network() sizes the weights.
            data.node_hide = node
            data.epoch = e
            data.init_network()
            data.training()
            data.testing()
            ans = 'circling to get best parameter----->epoch:%d hide_node:%d accuracy:%.4f\n' % (e, node, data.accuracy)
            print(ans)
            f.write(ans)
            # The sweep is long; flush so finished results survive an interruption.
            f.flush()
可迭代計算迭代次數和隱層節點的數目對準確率的影響,大致規律是在學習速率0.05時,迭代次數在10-15為宜,隱層節點30以上。
一些試驗的結果如下:
circling to get best parameter----->epoch:14 hide_node:43 accuracy:0.9656
circling to get best parameter----->epoch:14 hide_node:44 accuracy:0.9651
circling to get best parameter----->epoch:14 hide_node:45 accuracy:0.9638
circling to get best parameter----->epoch:14 hide_node:46 accuracy:0.9641
circling to get best parameter----->epoch:14 hide_node:47 accuracy:0.9649
circling to get best parameter----->epoch:14 hide_node:48 accuracy:0.9651
circling to get best parameter----->epoch:14 hide_node:49 accuracy:0.9671
circling to get best parameter----->epoch:15 hide_node:46 accuracy:0.9661
circling to get best parameter----->epoch:15 hide_node:47 accuracy:0.9660
circling to get best parameter----->epoch:15 hide_node:48 accuracy:0.9650
circling to get best parameter----->epoch:15 hide_node:49 accuracy:0.9655
circling to get best parameter----->epoch:10 hide_node:100 accuracy:0.9685
circling to get best parameter----->epoch:10 hide_node:200 accuracy:0.9724
circling to get best parameter----->epoch:10 hide_node:300 accuracy:0.9718
circling to get best parameter----->epoch:10 hide_node:1000 accuracy:0.9568
Tensorflow實現
import argparse
# Import data
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
# Parsed command-line flags; assigned in the __main__ guard before main()
# reads FLAGS.data_dir.
FLAGS = None
def weight_variable(shape):
    """Return a tf.Variable of ``shape`` drawn from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
def bias_variable(shape):
    """Return a tf.Variable of ``shape`` with every element set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))
def conv2d(x, W):
    """Convolve ``x`` with filter ``W`` using stride 1 and 'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')
def max_pool_2x2(x):
    """Max-pool ``x`` over 2x2 windows with stride 2 and 'SAME' padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')
def add_layer(inputs, in_size, out_size, activation_function=None):
    """Append a fully connected layer to ``inputs`` and return its output.

    Creates an ``[in_size, out_size]`` weight matrix and an ``[out_size]``
    bias, computes ``inputs @ W + b`` and, if ``activation_function`` is
    given, applies it to that result.
    """
    layer_weights = weight_variable([in_size, out_size])
    layer_biases = bias_variable([out_size])
    pre_activation = tf.matmul(inputs, layer_weights) + layer_biases
    # Linear layer when no activation is requested.
    if activation_function is not None:
        return activation_function(pre_activation)
    return pre_activation
def main(_):
    """Build a two-conv-layer CNN, train it on MNIST and print progress.

    The positional argument is ignored (presumably the argv list handed over
    by ``tf.app.run`` -- confirm against the TensorFlow version in use).
    The data directory is read from the module-level ``FLAGS.data_dir``.
    """
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Reshape the flat input to [batch size, height, width, channels].
    x = tf.placeholder(tf.float32, [None, 784])
    x_image = tf.reshape(x, [-1, 28, 28, 1])

    # First convolutional layer: 5x5 patches, 1 input channel, 32 output
    # channels, bias of the same size as the output channels, then max pooling.
    W_conv1 = weight_variable([5, 5, 1, 32])
    b_conv1 = bias_variable([32])
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)
    # After pooling the tensor has shape [-1, 14, 14, 32].

    # Second convolutional layer: 64 output channels, then max pooling.
    W_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)
    # After pooling the tensor has shape [-1, 7, 7, 64].

    # Fully connected layer with 1024 neurons and ReLU activation.
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    h_fc1 = add_layer(h_pool2_flat, 7 * 7 * 64, 1024, tf.nn.relu)

    # Dropout on the fully connected layer to reduce overfitting.
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    # Output layer (logits; no activation).
    y_conv = add_layer(h_fc1_drop, 1024, 10, None)

    # Loss, optimizer and accuracy metric.
    y_ = tf.placeholder(tf.float32, [None, 10])
    cross_entropy = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits(logits=y_conv, labels=y_))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Train for 10000 steps, reporting accuracy every 100 steps.
    sess = tf.InteractiveSession()
    try:
        # tf.initialize_all_variables() is deprecated; this is its replacement.
        sess.run(tf.global_variables_initializer())
        for i in range(10000):
            batch = mnist.train.next_batch(100)
            if i % 100 == 0:
                # Dropout is disabled (keep_prob 1.0) while evaluating.
                train_accuracy = accuracy.eval(
                    feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
                test_accuracy = accuracy.eval(
                    feed_dict={x: mnist.test.images, y_: mnist.test.labels,
                               keep_prob: 1.0})
                print("step %d, training accuracy %g, test accuracy %g" % (i, train_accuracy, test_accuracy))
            train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
    finally:
        # Release the session's resources even if training is interrupted.
        sess.close()
if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser()
    # Change the default to the directory that holds your own copy of the dataset.
    arg_parser.add_argument(
        '--data_dir',
        type=str,
        default='/tmp/mnist',
        help='Directory for storing data',
    )
    FLAGS = arg_parser.parse_args()
    tf.app.run()
需要配置 TensorFlow 和 Python 3.x 的運行環境
結果如下
step 0, training accuracy 0.06, test accuracy 0.0892
step 100, training accuracy 0.86, test accuracy 0.8692
step 200, training accuracy 0.97, test accuracy 0.9207
step 300, training accuracy 0.92, test accuracy 0.9403
step 400, training accuracy 0.95, test accuracy 0.9485
step 500, training accuracy 0.91, test accuracy 0.9522
step 600, training accuracy 0.97, test accuracy 0.9565
step 700, training accuracy 0.97, test accuracy 0.9622
step 800, training accuracy 0.96, test accuracy 0.9638
step 900, training accuracy 0.98, test accuracy 0.9687
step 1000, training accuracy 0.97, test accuracy 0.9703
有任何環境配置的問題請聯繫,歡迎指出錯誤