Kaggle系列1:手把手教你用tensorflow建立卷積神經網絡實現貓狗圖像分類


去年研一的時候想做kaggle上的一道題目:貓狗分類,但是苦於對卷積神經網絡一直沒有很好的認識,現在把這篇文章的內容補上去。(部分代碼參考網上的,我改變了卷積神經網絡的網絡結構,其實主要部分我加了一層1X1的卷積層,至於作用,我會在后文詳細介紹)

題目地址:貓狗大戰

同時數據集也可以在上面下載到。

既然是手把手,那么就要從前期的導入數據開始:

  1. 導入數據
  2. #import sys, io
  3. #sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf8') # Change default encoding to utf8
  4. #coding=utf-8
  5. import tensorflow as tf
  6. import numpy as np
  7. import os
  8. train_dir='G:/data/CatVSdogtrain/train/' # Folder that holds the training images; replace with your own path
  9. file_dir=train_dir
  10. # Helper that gathers the image paths found in a folder together with their labels
  def get_files(file_dir):
      """Collect the cat/dog training images in ``file_dir`` and label them.

      A file is classified by the part of its name before the first ``.``:
      ``cat.*`` gets label 0 and ``dog.*`` gets label 1. Entries with any
      other prefix (for example ``Thumbs.db``) are skipped instead of being
      silently labelled as dogs.

      Args:
          file_dir: Directory containing the training pictures. A trailing
              path separator is optional.

      Returns:
          A tuple ``(image_list, label_list)``: the image paths in random
          order and the int labels (0 = cat, 1 = dog) aligned with them.
      """
      cats = []
      dogs = []
      for file in os.listdir(file_dir):
          prefix = file.split(sep='.')[0]
          # os.path.join keeps the path valid whether or not file_dir ends
          # with a separator.
          path = os.path.join(file_dir, file)
          if prefix == 'cat':
              cats.append(path)
          elif prefix == 'dog':
              dogs.append(path)
      print('there is %d cats and %d dogs' % (len(cats), len(dogs)))

      # Shuffle paths and labels together so they stay aligned. Shuffling
      # could also be left to the batching step; it is done here for
      # convenience.
      samples = [(path, 0) for path in cats] + [(path, 1) for path in dogs]
      np.random.shuffle(samples)
      image_list = [path for path, _ in samples]
      label_list = [label for _, label in samples]

      return image_list, label_list

    其實這一段沒什么好說的,無非就是做好訓練樣本,和標簽。。。。代碼僅供參考。

    2 get_batch

    def get_batch(image, label, image_W, image_H, batch_size, capacity):
        """Build the queue-based input pipeline that yields training batches.

        Args:
            image: list of image file paths.
            label: list of integer labels aligned with ``image``.
            image_W: target image width in pixels.
            image_H: target image height in pixels.
            batch_size: number of images per batch.
            capacity: capacity of the batching queue.

        Returns:
            ``(image_batch, label_batch)``: a float32 image batch of shape
            ``[batch_size, image_H, image_W, 3]`` and an int32 label batch of
            shape ``[batch_size]``.
        """
        # Convert the Python lists into tensors the input queue can slice.
        image = tf.cast(image, tf.string)
        label = tf.cast(label, tf.int32)

        # Queue producing one (path, label) pair at a time.
        input_queue = tf.train.slice_input_producer([image, label])
        label = input_queue[1]
        image_contents = tf.read_file(input_queue[0])
        image = tf.image.decode_jpeg(image_contents, channels=3)

        # Data augmentation would go here.
        # resize_image_with_crop_or_pad expects (image, target_height,
        # target_width); the original call passed width first, which only
        # worked because the images used are square.
        image = tf.image.resize_image_with_crop_or_pad(image, image_H, image_W)
        image = tf.image.per_image_standardization(image)

        image_batch, label_batch = tf.train.batch(
            [image, label],
            batch_size=batch_size,
            num_threads=64,
            capacity=capacity)
        label_batch = tf.reshape(label_batch, [batch_size])
        image_batch = tf.cast(image_batch, tf.float32)
        return image_batch, label_batch

    為什么要設置一個batch,一個batch 呢?

    如果損失函數是非凸的話,整個樣本就算在超級計算機上可以算的動,也會卡在局部最優上,分批訓練表示全樣本的抽樣實現,也就相當於人為引入修正梯度上的采樣噪聲,使'一路不通找別路'更有可能搜索最優值。

    其中ICLR 2017上有一篇文章專門討論了這個問題:On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima

    3建立卷積神經網絡

    import tensorflow as tf

     

    def inference(images, batch_size, n_classes):
        """Build the CNN and return the un-normalised class scores.

        Layout: conv1 (1x1, 16 filters) -> max-pool + LRN -> conv2 (3x3, 16)
        -> max-pool + LRN -> conv3 (3x3, 16) -> LRN + max-pool (stride 1)
        -> local3 (128 units) -> local4 (128 units) -> softmax_linear.

        Args:
            images: 4-D float32 image batch with 3 channels (the first kernel
                is ``[1, 1, 3, 16]``).
            batch_size: number of images in the batch; used to flatten the
                feature maps before the fully connected layers.
            n_classes: number of output classes.

        Returns:
            Logits tensor of shape ``[batch_size, n_classes]``. No softmax is
            applied here.
        """
        # Kernel shapes are [kernel size, kernel size, channels, kernel numbers].
        net = _conv_relu(images, 'conv1', [1, 1, 3, 16])

        # pool1 and norm1
        with tf.variable_scope('pooling1_lrn'):
            net = _max_pool(net, [1, 2, 2, 1], 'pooling1')
            net = _lrn(net, 'norm1')

        net = _conv_relu(net, 'conv2', [3, 3, 16, 16])

        # pool2 and norm2 (scope name kept exactly as in the existing graph)
        with tf.variable_scope('pooling1_2rn'):
            net = _max_pool(net, [1, 2, 2, 1], 'pooling2')
            net = _lrn(net, 'norm2')

        net = _conv_relu(net, 'conv3', [3, 3, 16, 16])

        # norm3 and pool3: normalisation comes first here and the pooling
        # uses stride 1, so the spatial size is unchanged.
        with tf.variable_scope('pooling3_lrn'):
            net = _lrn(net, 'norm3')
            net = _max_pool(net, [1, 1, 1, 1], 'pooling3')

        # local3: flatten the feature maps, then a 128-unit dense layer.
        with tf.variable_scope('local3') as scope:
            flat = tf.reshape(net, shape=[batch_size, -1])
            dim = flat.get_shape()[1].value
            weights, biases = _dense_params('weights', dim, 128)
            local3 = tf.nn.relu(tf.matmul(flat, weights) + biases, name=scope.name)

        # local4
        with tf.variable_scope('local4'):
            weights, biases = _dense_params('weights', 128, 128)
            local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name='local4')

        # Final linear layer; its weight variable is named 'softmax_linear'.
        with tf.variable_scope('softmax_linear'):
            weights, biases = _dense_params('softmax_linear', 128, n_classes)
            softmax_linear = tf.add(tf.matmul(local4, weights), biases, name='softmax_linear')

        return softmax_linear


    def _conv_relu(inputs, scope_name, kernel_shape):
        """Stride-1 SAME convolution + bias + ReLU in its own variable scope."""
        with tf.variable_scope(scope_name) as scope:
            kernel = tf.get_variable(
                'weights', shape=kernel_shape, dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32))
            bias = tf.get_variable(
                'biases', shape=[kernel_shape[-1]], dtype=tf.float32,
                initializer=tf.constant_initializer(0.1))
            conv = tf.nn.conv2d(inputs, kernel, strides=[1, 1, 1, 1], padding='SAME')
            pre_activation = tf.nn.bias_add(conv, bias)
            return tf.nn.relu(pre_activation, name=scope.name)


    def _max_pool(inputs, strides, name):
        """3x3 SAME max-pooling with the given strides."""
        return tf.nn.max_pool(inputs, ksize=[1, 3, 3, 1], strides=strides,
                              padding='SAME', name=name)


    def _lrn(inputs, name):
        """Local response normalisation with the settings shared by all layers."""
        return tf.nn.lrn(inputs, depth_radius=4, bias=1.0, alpha=0.001/9.0,
                         beta=0.75, name=name)


    def _dense_params(weights_name, in_dim, out_dim):
        """Create the weight matrix and bias vector of a fully connected layer."""
        weights = tf.get_variable(
            weights_name, shape=[in_dim, out_dim], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.005, dtype=tf.float32))
        biases = tf.get_variable(
            'biases', shape=[out_dim], dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        return weights, biases

    這里面,我建立了一個1X1的卷積核,建立這個卷積核的作用主要有以下幾個方面考慮:

    假設這個1X1卷積層的輸入與輸出都只是一個平面(單通道),那么1X1卷積僅僅是對每個像素做一次縮放(再經過激活函數得到非線性變化),完全不考慮像素與周邊其他像素的關系。但卷積層的輸入輸出通常是長方體(多通道),所以1X1卷積實際上是對每個像素點在不同的channels上進行線性組合(信息整合),同時保留了圖片原有的平面結構,通過調節depth,從而完成升維或者降維的功能。

    如下圖,如果選擇2個filters 的1X1 卷積層,那么數據就從原本的depth3 降到2.若用4個filters ,那么就起到了升維的作用。

    我的整個網絡包括三個卷積層,三個全連接層。

    4損失函數部分

    def losses(logits, labels):
        """Return the mean sparse softmax cross-entropy of a batch.

        Args:
            logits: un-normalised class scores produced by the model.
            labels: integer class labels aligned with ``logits``.

        Returns:
            Scalar loss tensor; it is also recorded as a scalar summary.
        """
        with tf.variable_scope('loss') as loss_scope:
            per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits,
                labels=labels,
                name='xentropy_per_example')
            mean_loss = tf.reduce_mean(per_example, name='loss')
            summary_tag = loss_scope.name + '/loss'
            tf.summary.scalar(summary_tag, mean_loss)
        return mean_loss

    def training(loss, learning_rate):
        """Create the op that performs one Adam optimisation step on ``loss``.

        Args:
            loss: scalar loss tensor to minimise.
            learning_rate: learning rate handed to the Adam optimizer.

        Returns:
            The training op; running it also increments the non-trainable
            ``global_step`` counter created here.
        """
        with tf.name_scope('optimizer'):
            adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
            step_counter = tf.Variable(0, name='global_step', trainable=False)
            minimize_op = adam.minimize(loss, global_step=step_counter)
        return minimize_op

     

    def evaluation(logits, labels):
        """Return the fraction of examples whose top-1 prediction is correct.

        Args:
            logits: class scores of shape ``[batch_size, n_classes]``.
            labels: integer class labels aligned with ``logits``.

        Returns:
            Scalar float32 accuracy tensor in ``[0, 1]``; it is also recorded
            as a scalar summary.
        """
        with tf.variable_scope('accuracy') as scope:
            correct = tf.nn.in_top_k(logits, labels, 1)
            # Average in float32: float16 cannot represent the mean exactly
            # once the batch grows beyond a couple of thousand examples.
            correct = tf.cast(correct, tf.float32)
            accuracy = tf.reduce_mean(correct)
            tf.summary.scalar(scope.name + '/accuracy', accuracy)
        return accuracy

     

    這部分沒什么好講的,從tensorflow官網上有相似的例程,就是按照那個編寫的。損失函數就是最常用的softmax損失函數。優化方法是AdamOptimizer。。。感覺tensorflow最讓我爽的點就是這里不用自己求梯度。。。曾經因為求梯度,頭發掉了一地。。。。

    5training

    # -*- coding: utf-8 -*-

    """

    Created on Fri Oct 13 08:42:54 2017

     

    @author: Administrator

    """

     

    import os

    import numpy as np

    import tensorflow as tf

    import myinput_data

    import mymodel

     

    ##

     

    N_CLASSES=2  # number of output classes passed to mymodel.inference

    IMAGE_W=208  # image width passed to myinput_data.get_batch

    IMAGE_H=208  # image height passed to myinput_data.get_batch

    BATCH_SIZE=16  # images per training batch

    CAPACITY=2000  # capacity of the batching queue built by get_batch

     

    MAX_STEP=10000  # number of training steps run by run_training

    learning_rate=0.0001  # learning rate passed to mymodel.training

     

    ##

    def run_training():
        """Train the cat/dog classifier, logging summaries and checkpoints.

        Builds the input pipeline and model from ``myinput_data`` and
        ``mymodel`` using the module-level hyper-parameters, then runs
        ``MAX_STEP`` optimisation steps. Every 50 steps the loss and accuracy
        are printed and a summary is written. Every 2000 steps, and on the
        last step, a checkpoint is saved to the log directory.
        """
        train_dir = 'G:/data/CatVSdogtrain/train/'
        logs_train_dir = 'G:/data/CatVSdogtrain/logits/train/'

        train, train_label = myinput_data.get_files(train_dir)
        train_batch, train_label_batch = myinput_data.get_batch(
            train, train_label, IMAGE_W, IMAGE_H, BATCH_SIZE, CAPACITY)
        train_logits = mymodel.inference(train_batch, BATCH_SIZE, N_CLASSES)
        train_loss = mymodel.losses(train_logits, train_label_batch)
        train_op = mymodel.training(train_loss, learning_rate)
        train_acc = mymodel.evaluation(train_logits, train_label_batch)

        summary_op = tf.summary.merge_all()
        sess = tf.Session()
        train_writer = tf.summary.FileWriter(logs_train_dir, sess.graph)
        saver = tf.train.Saver()

        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            for step in range(MAX_STEP):
                if coord.should_stop():
                    break

                if step % 50 == 0:
                    # Fetch the summary in the same run call as the training
                    # step: a separate sess.run would dequeue another batch
                    # and log values for different data than those printed.
                    _, tra_loss, tra_acc, summary_str = sess.run(
                        [train_op, train_loss, train_acc, summary_op])
                    print('Step %d,train loss=%.2f, train accuracy=%.2f%%' % (step, tra_loss, tra_acc*100.0))
                    train_writer.add_summary(summary_str, step)
                else:
                    sess.run(train_op)

                if step % 2000 == 0 or (step + 1) == MAX_STEP:
                    checkpoint_path = os.path.join(logs_train_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            # Always stop the queue threads and release the writer and the
            # session, even when training aborts with an unexpected error.
            coord.request_stop()
            try:
                coord.join(threads)
            finally:
                train_writer.close()
                sess.close()

     

    這一部分就是保存訓練結果,然后把損失函數調到最小。。。識別率就會高,編寫可以參照tensorflow的例程。

    6 mytest

    from PIL import Image

    import tensorflow as tf

    import matplotlib.pyplot as plt

    import numpy as np

    import myinput_data

    import mytraining

    import mymodel

     

    def get_one_image(train):
        """Pick one image at random and return it as a 208x208 RGB array.

        The chosen picture is also handed to ``plt.imshow`` at its original
        size.

        Args:
            train: non-empty list of image file paths.

        Returns:
            ``numpy.ndarray`` of shape ``(208, 208, 3)``.

        Raises:
            ValueError: if ``train`` is empty (raised by
                ``np.random.randint``).
        """
        n = len(train)
        ind = np.random.randint(0, n)
        img_dir = train[ind]

        # The context manager closes the underlying file handle once the
        # pixel data has been read.
        with Image.open(img_dir) as source:
            plt.imshow(source)
            # Force three channels so grayscale / CMYK / RGBA files still
            # yield the (208, 208, 3) array the model expects.
            resized = source.convert('RGB').resize([208, 208])
        return np.array(resized)

     

    def evaluate_one_image():
        """Classify one random training image with the latest checkpoint.

        Restores the most recent checkpoint from the training log directory,
        runs the model on a single image and prints the softmax probability
        of each class. Prints a message and returns without predicting when
        no checkpoint is available.
        """
        train_dir = 'G:/data/CatVSdogtrain/train/'
        logs_train_dir = 'G:/data/CatVSdogtrain/logits/train/'

        train, _ = myinput_data.get_files(train_dir)
        image_array = get_one_image(train)

        with tf.Graph().as_default():
            BATCH_SIZE = 1
            N_CLASSES = 2

            # Build the graph on the placeholder so the fed image is what the
            # network actually sees (previously the placeholder was unused).
            x = tf.placeholder(tf.float32, shape=[208, 208, 3])
            image = tf.image.per_image_standardization(x)
            image = tf.reshape(image, [1, 208, 208, 3])
            logit = mymodel.inference(image, BATCH_SIZE, N_CLASSES)
            # inference() returns raw scores; softmax turns them into the
            # probabilities reported below.
            logit = tf.nn.softmax(logit)

            saver = tf.train.Saver()
            with tf.Session() as sess:
                print("Reading checkpoints...")
                ckpt = tf.train.get_checkpoint_state(logs_train_dir)
                if not (ckpt and ckpt.model_checkpoint_path):
                    # Without restored weights the variables are
                    # uninitialised, so running the model would fail.
                    print('No checkpoint file found')
                    return

                global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('Loading success, global_step is %s' % global_step)

                prediction = sess.run(logit, feed_dict={x: image_array})
                max_index = np.argmax(prediction)
                # Index scalars explicitly; formatting a 1-element array with
                # %f relies on deprecated NumPy behaviour.
                cat_line = 'This is a cat with possibility %.6f' % prediction[0, 0]
                dog_line = 'This is a dog with possibility %.6f' % prediction[0, 1]
                if max_index == 0:
                    print(cat_line)
                    print(dog_line)
                else:
                    print(dog_line)
                    print(cat_line)

     

    運行這一段代碼,然后在命令行執行evaluate_one_image()

    結果如下:

    這個只是最簡單的卷積神經網絡,所以說整個實現過程很簡單,但是追求遠遠不止這些,如果大家有什么對卷積的想法可以一起交流。


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM