SSD算法详解及其 keras实现

本文转载自查看原文 2019-07-22 19:27 1526 目标检测

在上一篇的博客讲述了SSD的原理，这一篇主要是讲解keras的实现。

keras代码的github地址为：点击打开链接

model 的框架实现（ssd.py）：

先给出了改变后的VGG16的实现：


    
    
   
  
   
    
     
      
     
     
     
       def SSD300(input_shape, num_classes=21):
           """Build the modified VGG16 backbone of the SSD300 network.

           This excerpt covers the backbone only (input through ``pool6``);
           the prediction heads that consume these feature maps are shown
           in later excerpts of the article.

           # Arguments
               input_shape: Shape of the input image, e.g. ``(300, 300, 3)``.
               num_classes: Number of classes to detect. It is not used in
                   this excerpt; the prediction-head excerpts read it.
           """
           # Every layer output is stored by name so that later stages can
           # attach prediction heads to intermediate feature maps.
           net = {}
           # Block 1
           input_tensor = Input(shape=input_shape)
           # PriorBox expects the image size as (w, h), hence the swap of the
           # first two entries of input_shape.
           img_size = (input_shape[1], input_shape[0])
           net['input'] = input_tensor
           net['conv1_1'] = Convolution2D(64, 3, 3,
                                          activation='relu',
                                          border_mode='same',
                                          name='conv1_1')(net['input'])
           net['conv1_2'] = Convolution2D(64, 3, 3,
                                          activation='relu',
                                          border_mode='same',
                                          name='conv1_2')(net['conv1_1'])
           net['pool1'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                       name='pool1')(net['conv1_2'])
           # Block 2
           net['conv2_1'] = Convolution2D(128, 3, 3,
                                          activation='relu',
                                          border_mode='same',
                                          name='conv2_1')(net['pool1'])
           net['conv2_2'] = Convolution2D(128, 3, 3,
                                          activation='relu',
                                          border_mode='same',
                                          name='conv2_2')(net['conv2_1'])
           net['pool2'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                       name='pool2')(net['conv2_2'])
           # Block 3
           net['conv3_1'] = Convolution2D(256, 3, 3,
                                          activation='relu',
                                          border_mode='same',
                                          name='conv3_1')(net['pool2'])
           net['conv3_2'] = Convolution2D(256, 3, 3,
                                          activation='relu',
                                          border_mode='same',
                                          name='conv3_2')(net['conv3_1'])
           net['conv3_3'] = Convolution2D(256, 3, 3,
                                          activation='relu',
                                          border_mode='same',
                                          name='conv3_3')(net['conv3_2'])
           net['pool3'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                       name='pool3')(net['conv3_3'])
           # Block 4
           net['conv4_1'] = Convolution2D(512, 3, 3,
                                          activation='relu',
                                          border_mode='same',
                                          name='conv4_1')(net['pool3'])
           net['conv4_2'] = Convolution2D(512, 3, 3,
                                          activation='relu',
                                          border_mode='same',
                                          name='conv4_2')(net['conv4_1'])
           net['conv4_3'] = Convolution2D(512, 3, 3,
                                          activation='relu',
                                          border_mode='same',
                                          name='conv4_3')(net['conv4_2'])
           net['pool4'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                       name='pool4')(net['conv4_3'])
           # Block 5
           net['conv5_1'] = Convolution2D(512, 3, 3,
                                          activation='relu',
                                          border_mode='same',
                                          name='conv5_1')(net['pool4'])
           net['conv5_2'] = Convolution2D(512, 3, 3,
                                          activation='relu',
                                          border_mode='same',
                                          name='conv5_2')(net['conv5_1'])
           net['conv5_3'] = Convolution2D(512, 3, 3,
                                          activation='relu',
                                          border_mode='same',
                                          name='conv5_3')(net['conv5_2'])
           # Unlike pool1-pool4 this pooling uses a 3x3 window with stride 1,
           # so it does not reduce the spatial size.
           net['pool5'] = MaxPooling2D((3, 3), strides=(1, 1), border_mode='same',
                                       name='pool5')(net['conv5_3'])
           # FC6: the fully connected layer is replaced by an atrous
           # (dilated) convolution with rate 6.
           net['fc6'] = AtrousConvolution2D(1024, 3, 3, atrous_rate=(6, 6),
                                            activation='relu', border_mode='same',
                                            name='fc6')(net['pool5'])
           # FC7: the fully connected layer is replaced by a 1x1 convolution.
           net['fc7'] = Convolution2D(1024, 1, 1, activation='relu',
                                      border_mode='same', name='fc7')(net['fc6'])
           # Block 6
           net['conv6_1'] = Convolution2D(256, 1, 1, activation='relu',
                                          border_mode='same',
                                          name='conv6_1')(net['fc7'])
           net['conv6_2'] = Convolution2D(512, 3, 3, subsample=(2, 2),
                                          activation='relu', border_mode='same',
                                          name='conv6_2')(net['conv6_1'])
           # Block 7
           net['conv7_1'] = Convolution2D(128, 1, 1, activation='relu',
                                          border_mode='same',
                                          name='conv7_1')(net['conv6_2'])
           # Explicit zero padding followed by a 'valid' convolution; the
           # padded tensor is stored under 'conv7_2' and then overwritten by
           # the convolution output.
           net['conv7_2'] = ZeroPadding2D()(net['conv7_1'])
           net['conv7_2'] = Convolution2D(256, 3, 3, subsample=(2, 2),
                                          activation='relu', border_mode='valid',
                                          name='conv7_2')(net['conv7_2'])
           # Block 8
           net['conv8_1'] = Convolution2D(128, 1, 1, activation='relu',
                                          border_mode='same',
                                          name='conv8_1')(net['conv7_2'])
           net['conv8_2'] = Convolution2D(256, 3, 3, subsample=(2, 2),
                                          activation='relu', border_mode='same',
                                          name='conv8_2')(net['conv8_1'])
           # Last Pool
           net['pool6'] = GlobalAveragePooling2D(name='pool6')(net['conv8_2'])

标红部分就是进行改变的部分，可以看出把FC6换成了空洞卷积，和普通卷积差不多，就是把一次卷积的感受域扩大了。FC7换成了普通卷积，之后再添加了几个卷积块。

接下来就是通过改变后的VGG16得到的多层feature map来预测location 和 confidence。使用到的feature map 有：conv4_3、fc7、conv6_2、conv7_2、conv8_2、pool6。总共6层的feature map。因为对于每层的处理步骤差不多，所以就贴出conv4_3处理的代码：


    
    
   
  
   
    
     
      
     
     
     
       # Prediction from conv4_3
           # Normalize is defined elsewhere in the project; presumably an L2
           # normalisation layer with an initial scale of 20 -- TODO confirm.
           net['conv4_3_norm'] = Normalize(20, name='conv4_3_norm')(net['conv4_3'])
           # Number of prior boxes per feature-map cell for this layer. It
           # matches the PriorBox below: aspect_ratios=[2] with the default
           # flip yields the ratios [1.0, 2, 0.5].
           num_priors = 3
           # Localisation head: 4 values per prior box.
           x = Convolution2D(num_priors * 4, 3, 3, border_mode='same',
                             name='conv4_3_norm_mbox_loc')(net['conv4_3_norm'])
           net['conv4_3_norm_mbox_loc'] = x
           # Flatten so that the outputs of all source layers can later be
           # concatenated along a single axis.
           flatten = Flatten(name='conv4_3_norm_mbox_loc_flat')
           net['conv4_3_norm_mbox_loc_flat'] = flatten(net['conv4_3_norm_mbox_loc'])
           # Confidence head: num_classes scores per prior box. The layer
           # name gets a suffix when num_classes differs from 21
           # (presumably so weights trained for 21 classes are not loaded
           # by name into a differently shaped layer -- verify).
           name = 'conv4_3_norm_mbox_conf'
           if num_classes != 21:
               name += '_{}'.format(num_classes)
           x = Convolution2D(num_priors * num_classes, 3, 3, border_mode='same',
                             name=name)(net['conv4_3_norm'])
           net['conv4_3_norm_mbox_conf'] = x
           flatten = Flatten(name='conv4_3_norm_mbox_conf_flat')
           net['conv4_3_norm_mbox_conf_flat'] = flatten(net['conv4_3_norm_mbox_conf'])
           # Prior boxes for this feature map: min_size 30.0, no max_size.
           priorbox = PriorBox(img_size, 30.0, aspect_ratios=[2],
                               variances=[0.1, 0.1, 0.2, 0.2],
                               name='conv4_3_norm_mbox_priorbox')
           net['conv4_3_norm_mbox_priorbox'] = priorbox(net['conv4_3_norm'])

可以看出对于conv4_3这层的feature map，采用的default box 的个数为3。所以location预测这个卷积层使用的卷积核个数为：3*4=12个。卷积完之后进行flatten，因为最后的输出是多层feature map预测的concatenate。同理，对于confidence预测采用的卷积核个数为：21*3=63（对于voc数据集而言）。对于PriorBox这一层，目前只需要知道它是对feature map 进行相应的操作，来得到default box的，而且对于特定的一层feature map而言，它是固定不变的，不随train或者predict的过程改变的。

对于pool6产生的feature map处理有一些不一样，这里单独拿出来说一下。因为pool6层使用的是global average pooling，所以它输出的大小为1*1*256，比较小，不太适合用卷积处理了，就直接用Dense层来处理：


    
    
   
  
   
    
     
      
     
     
     
       # Prediction from pool6
           # Number of prior boxes for this layer. It matches the PriorBox
           # below: a max_size plus aspect_ratios=[2, 3] with the default
           # flip yields the ratios [1.0, 1.0, 2, 1/2, 3, 1/3].
           num_priors = 6
           # pool6 comes from GlobalAveragePooling2D, so Dense layers are
           # used for the heads and no Flatten is needed.
           x = Dense(num_priors * 4, name='pool6_mbox_loc_flat')(net['pool6'])
           net['pool6_mbox_loc_flat'] = x
           # The confidence layer name gets a suffix when num_classes
           # differs from 21; the dictionary key stays the same.
           name = 'pool6_mbox_conf_flat'
           if num_classes != 21:
               name += '_{}'.format(num_classes)
           x = Dense(num_priors * num_classes, name=name)(net['pool6'])
           net['pool6_mbox_conf_flat'] = x
           priorbox = PriorBox(img_size, 276.0, max_size=330.0, aspect_ratios=[2, 3],
                               variances=[0.1, 0.1, 0.2, 0.2],
                               name='pool6_mbox_priorbox')
           # PriorBox reads the feature-map height/width from its input
           # shape, so the pooled vector is reshaped into a feature map with
           # a 1x1 spatial extent, laid out per the backend's dim ordering.
           if K.image_dim_ordering() == 'tf':
               target_shape = (1, 1, 256)
           else:
               target_shape = (256, 1, 1)
           net['pool6_reshaped'] = Reshape(target_shape,
                                           name='pool6_reshaped')(net['pool6'])
           net['pool6_mbox_priorbox'] = priorbox(net['pool6_reshaped'])

每层预测完事之后呢，当然是把他们都给concatenate起来，就贴location的实现，其他两个类似：


    
    
   
  
   
    
     
      
     
     
     
       # Concatenate the flattened localisation predictions of all six
       # source layers along axis 1, in this fixed order.
       loc_sources = ('conv4_3_norm', 'fc7', 'conv6_2',
                      'conv7_2', 'conv8_2', 'pool6')
       net['mbox_loc'] = merge(
           [net[source + '_mbox_loc_flat'] for source in loc_sources],
           mode='concat', concat_axis=1, name='mbox_loc')

因为之前进行了flatten，所以concatenate得到的是一个batch中每个sample所有的location位置，并且是一个一维的形式存在，需要把它给重新reshape成[batch, number of default box, 4 ]的形式；预测的class分类也是类似的：[batch, number of default box, 21 ]。最后再将location、class、default box三者进行merge得到最终的预测结果。


    
    
   
  
   
    
     
      
     
     
     
           #计算default box 的个数 
      
     
    
     
      
     
     
     
          if hasattr(net['mbox_loc'], '_keras_shape'): 
      
     
    
     
      
     
     
     
       num_boxes = net['mbox_loc']._keras_shape[-1] // 4 
      
     
    
     
      
     
     
     
       elif hasattr(net['mbox_loc'], 'int_shape'): 
      
     
    
     
      
     
     
     
       num_boxes = K.int_shape(net['mbox_loc'])[-1] // 4 
      
     
    
     
      
     
     
     
       net['mbox_loc'] = Reshape((num_boxes, 4), 
      
     
    
     
      
     
     
     
       name='mbox_loc_final')(net['mbox_loc']) 
      
     
    
     
      
     
     
     
       net['mbox_conf'] = Reshape((num_boxes, num_classes), 
      
     
    
     
      
     
     
     
       name='mbox_conf_logits')(net['mbox_conf']) 
      
     
    
     
      
     
     
     
       net['mbox_conf'] = Activation('softmax', 
      
     
    
     
      
     
     
     
       name='mbox_conf_final')(net['mbox_conf']) 
      
     
    
     
      
     
     
     
       net['predictions'] = merge([net['mbox_loc'], 
      
     
    
     
      
     
     
     
       net['mbox_conf'], 
      
     
    
     
      
     
     
     
       net['mbox_priorbox']], 
      
     
    
     
      
     
     
     
       mode='concat', concat_axis=2, 
      
     
    
     
      
     
     
     
       name='predictions')

我们来计算一下这六层feature map总共拥有的default box的数量：38*38*3+19*19*6+10*10*6+5*5*6+3*3*6+1*1*6=7308。和论文中的数量（SSD300为8732个）还是存在一定的差别的。

接下来介绍一下model中使用到的PriorBox层的作用。它是作用在每一层的feature map上的，根据输入的不同aspect ratio 和 scale 以及 num_prior来返回特定的default box，default box 的数目是feature map的height*width*num_prior。具体看代码：


    
    
   
  
   
    
     
      
     
     
     
       class PriorBox(Layer):
           """Generate the SSD prior (default) boxes for one feature map.

           The boxes depend only on the static shape of the input feature
           map and on the constructor arguments. They are computed with
           NumPy when the layer is called and emitted as a constant tensor.

           # Arguments
               img_size: Size of the network input image as (w, h).
               min_size: Minimum box size in pixels (not normalised).
               max_size: Maximum box size in pixels (not normalised). When
                   given, a second square box with side
                   sqrt(min_size * max_size) is added.
               aspect_ratios: Additional aspect ratios.
               flip: Whether to also add the reciprocal of each aspect ratio.
               variances: One or four variance values (x, y, w, h) attached
                   to every box.
               clip: Whether to clip the box coordinates to [0, 1].

           # Input shape
               4D tensor: (samples, rows, cols, channels).

           # Output shape
               3D tensor: (samples, num_boxes, 8), where the last axis is
               (xmin, ymin, xmax, ymax, variance[0], variance[1],
               variance[2], variance[3]).
           """

           # NOTE: the list default for ``variances`` is only read, never
           # mutated, so sharing it between calls is harmless.
           def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
                        flip=True, variances=[0.1], clip=True, **kwargs):
               # Axes of width and height in (samples, rows, cols, channels).
               self.waxis = 2
               self.haxis = 1
               self.img_size = img_size
               if min_size <= 0:
                   raise Exception('min_size must be positive.')
               self.min_size = min_size
               self.max_size = max_size
               # The first 1.0 stands for the square box of side min_size.
               self.aspect_ratios = [1.0]
               if max_size:
                   if max_size < min_size:
                       raise Exception('max_size must be greater than min_size.')
                   # A second 1.0 stands for the larger square box.
                   self.aspect_ratios.append(1.0)
               # Expand the given aspect ratios into the full list, skipping
               # any ratio that is already present.
               if aspect_ratios:
                   for ar in aspect_ratios:
                       if ar in self.aspect_ratios:
                           continue
                       self.aspect_ratios.append(ar)
                       if flip:
                           self.aspect_ratios.append(1.0 / ar)
               self.variances = np.array(variances)
               # Honour the caller's choice; this was hard-coded to True.
               self.clip = clip
               super(PriorBox, self).__init__(**kwargs)

           def compute_output_shape(self, input_shape):
               """Return the output shape (samples, num_boxes, 8)."""
               num_priors_ = len(self.aspect_ratios)
               layer_width = input_shape[self.waxis]
               layer_height = input_shape[self.haxis]
               num_boxes = num_priors_ * layer_width * layer_height
               return (input_shape[0], num_boxes, 8)

           def call(self, x, mask=None):
               """Build the prior-box tensor for the feature map ``x``.

               Only the static shape of ``x`` is used, plus its dynamic
               batch size on the TensorFlow backend.

               # Raises
                   Exception: if ``variances`` does not hold one or four
                       values.
               """
               if hasattr(x, '_keras_shape'):
                   input_shape = x._keras_shape
               else:
                   # Fall back to the backend's static shape so that
                   # input_shape is always bound.
                   input_shape = K.int_shape(x)
               layer_width = input_shape[self.waxis]
               layer_height = input_shape[self.haxis]
               img_width = self.img_size[0]
               img_height = self.img_size[1]
               # define prior boxes shapes
               box_widths = []
               box_heights = []
               for ar in self.aspect_ratios:
                   if ar == 1 and len(box_widths) == 0:
                       # First square box: side min_size.
                       box_widths.append(self.min_size)
                       box_heights.append(self.min_size)
                   elif ar == 1 and len(box_widths) > 0:
                       # Second square box: side sqrt(min_size * max_size).
                       box_widths.append(np.sqrt(self.min_size * self.max_size))
                       box_heights.append(np.sqrt(self.min_size * self.max_size))
                   elif ar != 1:
                       box_widths.append(self.min_size * np.sqrt(ar))
                       box_heights.append(self.min_size / np.sqrt(ar))
               # Half extents, later subtracted from / added to the centers.
               box_widths = 0.5 * np.array(box_widths)
               box_heights = 0.5 * np.array(box_heights)
               # define centers of prior boxes
               step_x = img_width / layer_width
               step_y = img_height / layer_height
               # One center per feature-map cell, in image pixels.
               linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
                                  layer_width)
               liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
                                  layer_height)
               centers_x, centers_y = np.meshgrid(linx, liny)
               centers_x = centers_x.reshape(-1, 1)
               centers_y = centers_y.reshape(-1, 1)
               # define xmin, ymin, xmax, ymax of prior boxes
               num_priors_ = len(self.aspect_ratios)
               # One row per cell holding (center_x, center_y).
               prior_boxes = np.concatenate((centers_x, centers_y), axis=1)
               # Repeat to (cx, cy, cx, cy) once per prior box of the cell.
               prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))
               prior_boxes[:, ::4] -= box_widths
               prior_boxes[:, 1::4] -= box_heights
               prior_boxes[:, 2::4] += box_widths
               prior_boxes[:, 3::4] += box_heights
               # Normalise: even columns are x values, odd columns y values.
               prior_boxes[:, ::2] /= img_width
               prior_boxes[:, 1::2] /= img_height
               # Reshape to (num_boxes, 4): normalised
               # (xmin, ymin, xmax, ymax) per box.
               prior_boxes = prior_boxes.reshape(-1, 4)
               if self.clip:
                   prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)
               # define variances
               num_boxes = len(prior_boxes)
               if len(self.variances) == 1:
                   variances = np.ones((num_boxes, 4)) * self.variances[0]
               elif len(self.variances) == 4:
                   variances = np.tile(self.variances, (num_boxes, 1))
               else:
                   raise Exception('Must provide one or four variances.')
               # Append the variances to each box: (num_boxes, 8).
               prior_boxes = np.concatenate((prior_boxes, variances), axis=1)
               prior_boxes_tensor = K.expand_dims(K.variable(prior_boxes), 0)
               # Repeat the boxes for every sample in the batch. On other
               # backends the tensor keeps a leading dimension of 1.
               if K.backend() == 'tensorflow':
                   pattern = [tf.shape(x)[0], 1, 1]
                   prior_boxes_tensor = tf.tile(prior_boxes_tensor, pattern)
               return prior_boxes_tensor

综合上面对model的分析，最后预测输出的shape为：[batch_size, num_box, 4+num_classes+8]，即每个default box对应4个location值、num_classes个置信度，以及PriorBox输出的8个值（4个坐标加4个variance）。

整体的架构完事之后，就需要准备好数据和loss function了，先看看如何预处理数据吧。

model的数据准备：

代码中编写了一个处理VOC数据集的py文件：


    
    
   
  
   
    
     
      
     
     
     
       import numpy as np 
      
     
    
     
      
     
     
     
       import os 
      
     
    
     
      
     
     
     
       from xml.etree import ElementTree 
      
     
    
     
      
     
     
      
      
     
    
     
      
     
     
     
       class XML_preprocessor(object): 
      
     
    
     
      
     
     
     
       #输出为：{image_name: [num_image, num_object_per_image, location+num_class]} 
      
     
    
     
      
     
     
     
       def __init__(self, data_path): 
      
     
    
     
      
     
     
     
       self.path_prefix = data_path 
      
     
    
     
      
     
     
     
       self.num_classes = 20 
      
     
    
     
      
     
     
     
       self.data = dict() 
      
     
    
     
      
     
     
     
       self._preprocess_XML() 
      
     
    
     
      
     
     
      
      
     
    
     
      
     
     
     
       def _preprocess_XML(self): 
      
     
    
     
      
     
     
     
       filenames = os.listdir(self.path_prefix) 
      
     
    
     
      
     
     
     
       for filename in filenames: 
      
     
    
     
      
     
     
     
       tree = ElementTree.parse(self.path_prefix + filename) 
      
     
    
     
      
     
     
     
       root = tree.getroot() 
      
     
    
     
      
     
     
     
       bounding_boxes = [] 
      
     
    
     
      
     
     
     
       one_hot_classes = [] 
      
     
    
     
      
     
     
     
       size_tree = root.find('size') 
      
     
    
     
      
     
     
     
       width = float(size_tree.find('width').text) 
      
     
    
     
      
     
     
     
       height = float(size_tree.find('height').text) 
      
     
    
     
      
     
     
     
       for object_tree in root.findall('object'): 
      
     
    
     
      
     
     
     
       for bounding_box in object_tree.iter('bndbox'): 
      
     
    
     
      
     
     
     
       xmin = float(bounding_box.find('xmin').text)/width 
      
     
    
     
      
     
     
     
       ymin = float(bounding_box.find('ymin').text)/height 
      
     
    
     
      
     
     
     
       xmax = float(bounding_box.find('xmax').text)/width 
      
     
    
     
      
     
     
     
       ymax = float(bounding_box.find('ymax').text)/height 
      
     
    
     
      
     
     
     
       bounding_box = [xmin,ymin,xmax,ymax] 
      
     
    
     
      
     
     
     
       bounding_boxes.append(bounding_box) 
      
     
    
     
      
     
     
     
       class_name = object_tree.find('name').text 
      
     
    
     
      
     
     
     
       one_hot_class = self._to_one_hot(class_name) 
      
     
    
     
      
     
     
     
       one_hot_classes.append(one_hot_class) 
      
     
    
     
      
     
     
     
       image_name = root.find('filename').text 
      
     
    
     
      
     
     
     
       bounding_boxes = np.asarray(bounding_boxes) 
      
     
    
     
      
     
     
     
       one_hot_classes = np.asarray(one_hot_classes) 
      
     
    
     
      
     
     
     
       image_data = np.hstack((bounding_boxes, one_hot_classes)) 
      
     
    
     
      
     
     
     
       self.data[image_name] = image_data 
      
     
    
     
      
     
     
      
      
     
    
     
      
     
     
     
       def _to_one_hot(self, name):
           """Return a one-hot list for a PASCAL VOC class name.

           Args:
               name: Class label string read from the annotation file.

           Returns:
               A list of ``self.num_classes`` integers. The slot matching
               ``name`` is 1 and every other slot is 0. For an unrecognised
               label a message is printed and the all-zero list is returned.
           """
           # Position in this tuple is the index set in the one-hot vector.
           voc_classes = (
               'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
               'bus', 'car', 'cat', 'chair', 'cow',
               'diningtable', 'dog', 'horse', 'motorbike', 'person',
               'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
           )
           one_hot_vector = [0] * self.num_classes
           if name in voc_classes:
               one_hot_vector[voc_classes.index(name)] = 1
           else:
               print('unknown label: %s' % name)
           return one_hot_vector
      
     
    
     
      
     
     
     
       ## Write the parsed annotations to a pickle file.
       import pickle
       data = XML_preprocessor('VOC2007/Annotations/').data
       # Use a context manager so the output file is flushed and closed
       # deterministically instead of relying on garbage collection.
       with open('VOC2007.p', 'wb') as pkl_file:
           pickle.dump(data, pkl_file)

把标注写入到pkl文件中后，再定义一个Generator类来产生x_batch和y_batch用于训练。直接看重点，即类中的generate函数：


    
    
   
  
   
    
     
      
     
     
     
       def generate(self, train=True):
           """Yield (inputs, targets) batches forever.

           Args:
               train: When true, iterate over ``self.train_keys`` and apply the
                   augmentation steps; otherwise iterate over ``self.val_keys``
                   without augmentation.

           Yields:
               A tuple ``(preprocess_input(images), targets)`` where both arrays
               hold ``self.batch_size`` samples. Samples left over at the end of
               a pass (fewer than a full batch) are discarded, because the
               accumulators are reset at the start of every pass.
           """
           while True:
               # Reshuffle the key list in place at the start of every pass.
               if train:
                   shuffle(self.train_keys)
                   keys = self.train_keys
               else:
                   shuffle(self.val_keys)
                   keys = self.val_keys
               inputs = []
               targets = []
               for key in keys:
                   img_path = self.path_prefix + key
                   img = imread(img_path).astype('float32')
                   # Ground truth loaded from the pickle file. Per the original
                   # author's note its shape is (num_box, coordinates + num_class).
                   y = self.gt[key].copy()
                   if train and self.do_crop:
                       img, y = self.random_sized_crop(img, y)
                   img = imresize(img, self.image_size).astype('float32')
                   if train:
                       # Data augmentation: colour jitter in random order, then
                       # optional lighting noise and flips.
                       shuffle(self.color_jitter)
                       for jitter in self.color_jitter:
                           img = jitter(img)
                       if self.lighting_std:
                           img = self.lighting(img)
                       if self.hflip_prob > 0:
                           img, y = self.horizontal_flip(img, y)
                       if self.vflip_prob > 0:
                           img, y = self.vertical_flip(img, y)
                   # Assign default (prior) boxes to the ground truth.
                   y = self.bbox_util.assign_boxes(y)
                   inputs.append(img)
                   targets.append(y)
                   if len(targets) == self.batch_size:
                       tmp_inp = np.array(inputs)
                       tmp_targets = np.array(targets)
                       inputs = []
                       targets = []
                       # One batch of network inputs and their target labels.
                       yield preprocess_input(tmp_inp), tmp_targets

在给ground truth分配default box时用到了BBoxUtility类中的assign_boxes函数，这个类是写在ssd_utils.py文件中的，其中的assign_boxes函数的代码如下：


    
    
   
  
   
    
     
      
     
     
     
       #用于给label分配高分的default box 
      
     
    
     
      
     
     
     
       def assign_boxes(self, boxes):
               """Build the per-prior training target for one image.

               Args:
                   boxes: Ground-truth array read as ``boxes[:, :4]`` (box
                       coordinates) and ``boxes[:, 4:]`` (class columns). The
                       original author documents its shape as
                       (num_boxes, 4 + num_classes) with the background class
                       not included.

               Returns:
                   Array of shape (self.num_priors, 4 + self.num_classes + 8).
                   Columns ``:4`` hold the encoded box of the matched object,
                   column 4 is 1 for unmatched priors and 0 for matched ones
                   (presumably the background class -- TODO confirm), columns
                   ``5:-8`` hold the matched object's class columns, and
                   column ``-8`` is 1 for matched priors. The remaining seven
                   trailing columns are left at zero by this method.
               """
               assignment = np.zeros((self.num_priors, 4 + self.num_classes + 8))
               # Every prior starts with column 4 set; matched priors clear it below.
               assignment[:, 4] = 1.0
               if len(boxes) == 0:
                   return assignment
               encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])
               # One row of 5 values per (object, prior) pair; the last value is
               # used below as the IoU (judging by the variable names).
               encoded_boxes = encoded_boxes.reshape(-1, self.num_priors, 5)
               # For every prior box, the largest last-column value over all objects.
               best_iou = encoded_boxes[:, :, -1].max(axis=0)
               # Index of the object that achieves that maximum; length is num_priors.
               best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)
               # Priors that overlap at least one ground-truth object.
               best_iou_mask = best_iou > 0
               best_iou_idx = best_iou_idx[best_iou_mask]
               assign_num = len(best_iou_idx)
               # Keep only the priors selected by the mask.
               encoded_boxes = encoded_boxes[:, best_iou_mask, :]
               # Pick, for each kept prior, the encoding of its best object.
               # Values of best_iou_idx lie in range(number of objects).
               assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx, np.arange(assign_num),:4]
               assignment[:, 4][best_iou_mask] = 0
               assignment[:, 5:-8][best_iou_mask] = boxes[best_iou_idx, 4:]
               # Flag the prior as having been assigned a ground-truth object.
               assignment[:, -8][best_iou_mask] = 1
               return assignment

返回了最终的assignment，用于作为训练时候的标准输出。

值得注意的是，在这个类里面用到self.prior,即default box都是作者先写入到了pkl文件中的，方便于使用，而且对于特定大小的feature map而言，default box是保持不变的，所以提前给出是不会影响训练的。

输入的数据和标准的输出都知道了，接下来就是定义loss function 了

model 的 loss function：

model 的loss function定义在了ssd_training.py文件中了，里面定义了一些有用的功能函数，来帮助最终loss计算的，我们就直接看最终计算那个loss的函数：


    
    
   
  
   
    
     
      
     
     
     
           def compute_loss(self, y_true, y_pred): 
      
     
    
     
      
     
     
     
              # 在keras中自定义loss函数，它的两个输入必须为预测的输出和标准的输出 
      
     
    
     
      
     
     
     
       # 变量： 
      
     
    
     
      
     
     
     
       # y_pred: 它的shape为： (?, num_boxes, 4 + num_classes + 8). 就是在model框架部分介绍的输出。 
      
     
    
     
      
     
     
     
               # y_truth：它的shape和y_pred的shape是一样的，就是上一节我们介绍assignment那一块的输出，具体参考上一节。 
      
     
    
     
      
     
     
     
               # 返回最终的所有loss总和 
      
     
    
     
      
     
     
     
               batch_size = tf.shape(y_true)[0] 
      
     
    
     
      
     
     
     
               num_boxes = tf.to_float(tf.shape(y_true)[1]) 
      
     
    
     
      
     
     
     
               # 计算出所有default box的loss 
      
     
    
     
      
     
     
     
               conf_loss = self._softmax_loss(y_true[:, :, 4:-8], 
      
     
    
     
      
     
     
     
                                              y_pred[:, :, 4:-8]) 
      
     
    
     
      
     
     
     
               loc_loss = self._l1_smooth_loss(y_true[:, :, :4], 
      
     
    
     
      
     
     
     
                                               y_pred[:, :, :4]) 
      
     
    
     
      
     
     
     
               #计算positive 样本的loss 
      
     
    
     
      
     
     
     
               #num_pos 为一个一维的array：len(num_pos)=batch 
      
     
    
     
      
     
     
     
               num_pos = tf.reduce_sum(y_true[:, :, -8], axis=-1) 
      
     
    
     
      
     
     
     
               ##只需计算存在gt_box与其对应的loss 
      
     
    
     
      
     
     
     
               pos_loc_loss = tf.reduce_sum(loc_loss * y_true[:, :, -8], 
      
     
    
     
      
     
     
     
                                            axis=1) 
      
     
    
     
      
     
     
     
               pos_conf_loss = tf.reduce_sum(conf_loss * y_true[:, :, -8], 
      
     
    
     
      
     
     
     
                                             axis=1) 
      
     
    
     
      
     
     
     
               #计算negative sample的loss，只计算了confidence loss 
      
     
    
     
      
     
     
     
               num_neg = tf.minimum(self.neg_pos_ratio * num_pos, 
      
     
    
     
      
     
     
     
                                    num_boxes - num_pos) 
      
     
    
     
      
     
     
     
               pos_num_neg_mask = tf.greater(num_neg, 0) 
      
     
    
     
      
     
     
     
               has_min = tf.to_float(tf.reduce_any(pos_num_neg_mask)) 
      
     
    
     
      
     
     
     
               num_neg = tf.concat(axis=0, values=[num_neg, 
      
     
    
     
      
     
     
     
                                       [(1 - has_min) * self.negatives_for_hard]]) 
      
     
    
     
      
     
     
     
               #tf.boolen_mask(a,b)，例如b=[true, false],a=[[[2,2],[2,3]]],则输出为[2,2]。 
      
     
    
     
      
     
     
     
               #实际上就是取num_neg为正数的那些元素，然后再在其中取num_neg中的最小的元素作为num_neg_batch。 
      
     
    
     
      
     
     
     
               num_neg_batch = tf.reduce_min(tf.boolean_mask(num_neg, 
      
     
    
     
      
     
     
     
                                                             tf.greater(num_neg, 0))) 
      
     
    
     
      
     
     
     
               num_neg_batch = tf.to_int32(num_neg_batch) 
      
     
    
     
      
     
     
     
               confs_start = 4 + self.background_label_id + 1 
      
     
    
     
      
     
     
     
               confs_end = confs_start + self.num_classes - 1 
      
     
    
     
      
     
     
     
               #max_confs的shape为：(batch, num_prior) 
      
     
    
     
      
     
     
     
               max_confs = tf.reduce_max(y_pred[:, :, confs_start:confs_end], 
      
     
    
     
      
     
     
     
                                         axis=2) 
      
     
    
     
      
     
     
     
               #返回负样本的top-K个元素,最终返回的indices的shape为(batch, K=num_neg_batch) 
      
     
    
     
      
     
     
     
               _, indices = tf.nn.top_k(max_confs * (1 - y_true[:, :, -8]), 
      
     
    
     
      
     
     
     
                                        k=num_neg_batch) 
      
     
    
     
      
     
     
     
               #创建一个shape也为(batch,num_neg_batch)的indices 
      
     
    
     
      
     
     
     
               batch_idx = tf.expand_dims(tf.range(0, batch_size), 1) 
      
     
    
     
      
     
     
     
               batch_idx = tf.tile(batch_idx, (1, num_neg_batch)) 
      
     
    
     
      
     
     
     
               #乘以num_boxes后得到batch中每一个sample的index的起始值，再加上top_k得到的index就得到了一个一维的full_indices。 
      
     
    
     
      
     
     
     
               full_indices = (tf.reshape(batch_idx, [-1]) * tf.to_int32(num_boxes) + 
      
     
    
     
      
     
     
     
                               tf.reshape(indices, [-1])) 
      
     
    
     
      
     
     
     
               #把得到的conf_loss也reshape成一维，然后用full_indices对其进行取值 
      
     
    
     
      
     
     
     
               neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]), 
      
     
    
     
      
     
     
     
                                         full_indices) 
      
     
    
     
      
     
     
     
               #最终把负样本的confidence loss reshape 成(batch, num_neg_batch),再对每个sample上的loss求和。 
      
     
    
     
      
     
     
     
               neg_conf_loss = tf.reshape(neg_conf_loss, 
      
     
    
     
      
     
     
     
                                          [batch_size, num_neg_batch]) 
      
     
    
     
      
     
     
     
               neg_conf_loss = tf.reduce_sum(neg_conf_loss, axis=1) 
      
     
    
     
      
     
     
     
               #整合所有的loss：positive loss 和 negative loss 
      
     
    
     
      
     
     
     
               total_loss = pos_conf_loss + neg_conf_loss 
      
     
    
     
      
     
     
     
               total_loss /= (num_pos + tf.to_float(num_neg_batch)) 
      
     
    
     
      
     
     
     
               num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos, 
      
     
    
     
      
     
     
     
                                   tf.ones_like(num_pos)) 
      
     
    
     
      
     
     
     
               total_loss += (self.alpha * pos_loc_loss) / num_pos 
      
     
    
     
      
     
     
     
               return total_loss

这时候loss function也准备好了，一切都准备就绪，接下来当然就是进行训练了。其实在写这篇blog之前，我对loss function这块还没有完全看明白，写完之后顿时就恍然大悟了，写blog确实是一个很好的自我学习过程。

model 进行 training

training这一块是写在SSD_training.ipynb的jupyter notebook文件中的，上面那些model 的部件准备好了之后，training就按照keras的流程照搬就好了。

不过需要注意一下，作者给的这个训练并不是voc数据集的训练，而是对3种瓶子的检测。

1.必要的库和自己编写的模块的导入：


    
    
   
  
   
    
     
      
     
     
     
       import cv2 
      
     
    
     
      
     
     
     
       import keras 
      
     
    
     
      
     
     
     
       from keras.applications.imagenet_utils import preprocess_input 
      
     
    
     
      
     
     
     
       from keras.backend.tensorflow_backend import set_session 
      
     
    
     
      
     
     
     
       from keras.models import Model 
      
     
    
     
      
     
     
     
       from keras.preprocessing import image 
      
     
    
     
      
     
     
     
       import matplotlib.pyplot as plt 
      
     
    
     
      
     
     
     
       import numpy as np 
      
     
    
     
      
     
     
     
       import pickle 
      
     
    
     
      
     
     
     
       from random import shuffle 
      
     
    
     
      
     
     
     
       from scipy.misc import imread 
      
     
    
     
      
     
     
     
       from scipy.misc import imresize 
      
     
    
     
      
     
     
     
       import tensorflow as tf 
      
     
    
     
      
     
     
     
       from ssd import SSD300 
      
     
    
     
      
     
     
     
       from ssd_training import MultiboxLoss 
      
     
    
     
      
     
     
     
       from ssd_utils import BBoxUtility 
      
     
    
     
      
     
     
      
      
     
    
     
      
     
     
     
       # IPython/Jupyter magic (not plain Python): show matplotlib figures
       # inline in the notebook.
       %matplotlib inline
       # Default figure size and image interpolation for every plot below.
       plt.rcParams['figure.figsize'] = (8, 8)
       plt.rcParams['image.interpolation'] = 'nearest'

       # Print small floats in fixed-point form rather than scientific notation.
       np.set_printoptions(suppress=True)

2.必要的初始化参数和prior box 的读取，以及输入数据的读取：


    
    
   
  
   
    
     
      
     
     
     
       NUM_CLASSES = 4
       input_shape = (300, 300, 3)
       # prior_boxes_ssd300.pkl stores every prior box as
       # [xmin, ymin, xmax, ymax, var[0], var[1], var[2], var[3]].
       # NOTE: pickle.load can execute arbitrary code; only load trusted files.
       # Context managers make sure the file handles are closed.
       with open('prior_boxes_ssd300.pkl', 'rb') as prior_file:
           priors = pickle.load(prior_file)
       bbox_util = BBoxUtility(NUM_CLASSES, priors)
       # Ground truth: file name -> bounding boxes and labels.
       with open('gt_pascal.pkl', 'rb') as gt_file:
           gt = pickle.load(gt_file)
       # Sorted keys give a deterministic 80/20 train/validation split.
       keys = sorted(gt.keys())
       num_train = int(round(0.8 * len(keys)))
       train_keys = keys[:num_train]
       val_keys = keys[num_train:]
       num_val = len(val_keys)

3.输入数据和label的generator类定义，有点长，就把generate 那个函数贴出来：


    
    
   
  
   
    
     
      
     
     
     
       class Generator(object):
           def generate(self, train=True):
               """Endlessly produce (images, targets) batches.

               With ``train`` true the training keys are used and the images
               go through the augmentation steps; otherwise the validation
               keys are used without augmentation. Each yielded pair holds
               ``self.batch_size`` samples; an incomplete batch at the end of
               a pass is dropped.
               """
               while True:
                   keys = self.train_keys if train else self.val_keys
                   shuffle(keys)
                   batch_images, batch_targets = [], []
                   for key in keys:
                       img = imread(self.path_prefix + key).astype('float32')
                       # Per the original note, y has shape
                       # (num_box, coordinates + num_class).
                       y = self.gt[key].copy()
                       if train and self.do_crop:
                           img, y = self.random_sized_crop(img, y)
                       img = imresize(img, self.image_size).astype('float32')
                       if train:
                           shuffle(self.color_jitter)
                           for jitter in self.color_jitter:
                               img = jitter(img)
                           if self.lighting_std:
                               img = self.lighting(img)
                           if self.hflip_prob > 0:
                               img, y = self.horizontal_flip(img, y)
                           if self.vflip_prob > 0:
                               img, y = self.vertical_flip(img, y)
                       encoded = self.bbox_util.assign_boxes(y)
                       batch_images.append(img)
                       batch_targets.append(encoded)
                       if len(batch_targets) != self.batch_size:
                           continue
                       x_batch = np.array(batch_images)
                       y_batch = np.array(batch_targets)
                       batch_images, batch_targets = [], []
                       # Batch generator output.
                       yield preprocess_input(x_batch), y_batch

4.必要的初始化


    
    
   
  
   
    
     
      
     
     
     
       # Root directory of the input images.
       path_prefix = '../../frames/'
       gen = Generator(gt, bbox_util, 16, path_prefix,
                       train_keys, val_keys,
                       (input_shape[0], input_shape[1]), do_crop=False)
       # Build the SSD300 model and load weights, matching layers by name.
       model = SSD300(input_shape, num_classes=NUM_CLASSES)
       model.load_weights('weights_SSD300.hdf5', by_name=True)
       # The original author notes they were unsure why these layers are frozen.
       # NOTE(review): presumably this is ordinary fine-tuning, keeping the
       # early layers at the values loaded above -- confirm.
       freeze = ['input_1', 'conv1_1', 'conv1_2', 'pool1',
                 'conv2_1', 'conv2_2', 'pool2',
                 'conv3_1', 'conv3_2', 'conv3_3', 'pool3']
       for L in model.layers:
           if L.name in freeze:
               L.trainable = False

5.keras的一些callback function的定义以及model的compile and training：


    
    
   
  
   
    
     
      
     
     
     
       def schedule(epoch, decay=0.9):
           """Return the learning rate for ``epoch`` under exponential decay.

           The rate is ``base_lr * decay ** epoch``. ``base_lr`` is a
           module-level name looked up when the function is called, so it
           must be defined before training starts.
           """
           return base_lr * decay**(epoch)
      
     
    
     
      
     
     
      
      
     
    
     
      
     
     
     
       # Checkpoint callback (weights only; file name carries the epoch and
       # val_loss) plus the learning-rate schedule defined above.
       callbacks = [keras.callbacks.ModelCheckpoint('./checkpoints/weights.{epoch:02d}-{val_loss:.2f}.hdf5',
                                                    verbose=1,
                                                    save_weights_only=True),
                    keras.callbacks.LearningRateScheduler(schedule)]
       # base_lr is read by schedule() at call time, so defining it here,
       # after the callbacks list, is fine.
       base_lr = 3e-4
       optim = keras.optimizers.Adam(lr=base_lr)
       # Alternative optimizers left by the original author:
       # optim = keras.optimizers.RMSprop(lr=base_lr)
       # optim = keras.optimizers.SGD(lr=base_lr, momentum=0.9, decay=decay, nesterov=True)
       model.compile(optimizer=optim,
                     loss=MultiboxLoss(NUM_CLASSES, neg_pos_ratio=2.0).compute_loss)
       nb_epoch = 30
       # NOTE(review): nb_epoch / nb_val_samples / nb_worker look like Keras 1.x
       # argument names -- confirm against the installed Keras version.
       history = model.fit_generator(gen.generate(True), gen.train_batches,
                                     nb_epoch, verbose=1,
                                     callbacks=callbacks,
                                     validation_data=gen.generate(False),
                                     nb_val_samples=gen.val_batches,
                                     nb_worker=1)

6.train完了之后，当然是检测了：


    
    
   
  
   
    
     
      
     
     
     
       # Load one validation image: a 300x300 copy for the network and the
       # original-size image for display.
       inputs = []
       images = []
       img_path = path_prefix + sorted(val_keys)[0]
       img = image.load_img(img_path, target_size=(300, 300))
       img = image.img_to_array(img)
       images.append(imread(img_path))
       inputs.append(img.copy())
       inputs = preprocess_input(np.array(inputs))
       # Run the model and decode its raw output into detections.
       preds = model.predict(inputs, batch_size=1, verbose=1)
       results = bbox_util.detection_out(preds)
       # Visualise the detections.
       for i, img in enumerate(images):
           # Parse the outputs.
           det_label = results[i][:, 0]
           det_conf = results[i][:, 1]
           det_xmin = results[i][:, 2]
           det_ymin = results[i][:, 3]
           det_xmax = results[i][:, 4]
           det_ymax = results[i][:, 5]
           # Keep detections with confidence of at least 0.6.
           top_indices = [idx for idx, conf in enumerate(det_conf) if conf >= 0.6]
           top_conf = det_conf[top_indices]
           top_label_indices = det_label[top_indices].tolist()
           top_xmin = det_xmin[top_indices]
           top_ymin = det_ymin[top_indices]
           top_xmax = det_xmax[top_indices]
           top_ymax = det_ymax[top_indices]

           colors = plt.cm.hsv(np.linspace(0, 1, 4)).tolist()

           plt.imshow(img / 255.)
           currentAxis = plt.gca()

           # A separate index name keeps the image index ``i`` intact.
           for j in range(top_conf.shape[0]):
               # Box coordinates are fractions of the image size; scale to pixels.
               xmin = int(round(top_xmin[j] * img.shape[1]))
               ymin = int(round(top_ymin[j] * img.shape[0]))
               xmax = int(round(top_xmax[j] * img.shape[1]))
               ymax = int(round(top_ymax[j] * img.shape[0]))
               score = top_conf[j]
               label = int(top_label_indices[j])
               # The numeric label is shown as-is: the training set here is
               # not VOC but a few kinds of bottles.
               display_txt = '{:0.2f}, {}'.format(score, label)
               coords = (xmin, ymin), xmax-xmin+1, ymax-ymin+1
               color = colors[label]
               currentAxis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2))
               currentAxis.text(xmin, ymin, display_txt, bbox={'facecolor':color, 'alpha':0.5})

           plt.show()

7.predict 的结果：

整个过程基本上也就结束了。SSD的keras实现还是比较简单的，没有mask r-cnn那么费劲。不知道为啥我先看的yolo的原理和实现，但是不太想写yolo的实现和原理（手动白眼），直接跳到了SSD，大概是觉得SSD比较好理解吧，yolo等有时间再写吧。

之后我再把生成prior box pkl文件的代码贴上来，自己写的代码有点乱。希望看到了最后你对SSD的模型架构和具体实现都有了一个很好的认识。因为我也是一个新手，所以其中有什么理解不到位，或者写错的，欢迎指出。

添加：prior box 的 pkl文件生成代码：其实也很简单，就是稍微修改了一下PriorBox这个自定义的keras layer，把输出用来产生对于特定feature map 大小的 default box：


    
    
   
  
   
    
     
      
     
     
     
       import numpy as np 
      
     
    
     
      
     
     
     
       class PriorBox(object):
           """Generate SSD default (prior) boxes for one feature map with NumPy.

           This is a plain-Python adaptation of a Keras ``PriorBox`` layer: instead
           of reading the feature-map size from an input tensor it takes it from
           ``layer_shape`` and returns the boxes as an array.

           Args:
               img_size: ``(width, height)`` of the network input image.
               min_size: Side length of the smallest square box; must be positive.
               max_size: Optional larger size. When truthy, an extra square box of
                   side ``sqrt(min_size * max_size)`` is generated. Any falsy value
                   (``None``, ``0``, ``[]``) disables it.
               aspect_ratios: Extra width/height ratios to generate boxes for.
                   Ratios already present (such as ``1``) are skipped.
               flip: If true, also add the reciprocal of every extra ratio.
               variances: One or four variance values appended to every box.
               clip: If true, clamp the normalised coordinates to ``[0, 1]``.
               layer_shape: ``(height, width)`` of the feature map.
               **kwargs: Not supported; kept only for signature compatibility.

           Raises:
               Exception: If ``min_size`` is not positive or ``max_size`` is
                   smaller than ``min_size``.
               TypeError: If unexpected keyword arguments are passed.
           """

           def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
                        flip=True, variances=(0.1, 0.1, 0.2, 0.2), clip=True,
                        layer_shape=(8, 8), **kwargs):
               self.input_shape = layer_shape
               self.img_size = img_size
               if min_size <= 0:
                   raise Exception('min_size must be positive.')
               self.min_size = min_size
               self.max_size = max_size

               # The ratio list always starts with the min_size square; a second
               # 1.0 entry stands for the sqrt(min_size * max_size) square.
               self.aspect_ratios = [1.0]
               if max_size:
                   if max_size < min_size:
                       raise Exception('max_size must be greater than min_size.')
                   self.aspect_ratios.append(1.0)
               if aspect_ratios:
                   for ar in aspect_ratios:
                       if ar in self.aspect_ratios:
                           continue
                       self.aspect_ratios.append(ar)
                       if flip:
                           self.aspect_ratios.append(1.0 / ar)

               self.variances = np.array(variances)
               # Honour the caller's choice (it used to be hard-coded to True).
               self.clip = clip

               # There is no Keras base class any more, so extra keyword
               # arguments cannot be forwarded anywhere; reject them explicitly.
               if kwargs:
                   raise TypeError('PriorBox got unexpected keyword arguments: %s'
                                   % ', '.join(sorted(kwargs)))

           def compute_default_box(self):
               """Return the default boxes of this feature map.

               Returns:
                   A float array of shape ``(height * width * num_priors, 8)``.
                   Each row holds ``xmin, ymin, xmax, ymax`` divided by the image
                   width/height, followed by the four variance values.

               Raises:
                   Exception: If ``variances`` has neither one nor four entries.
               """
               layer_height = self.input_shape[0]
               layer_width = self.input_shape[1]
               img_width = self.img_size[0]
               img_height = self.img_size[1]

               # Width and height of the box generated for every aspect ratio.
               box_widths = []
               box_heights = []
               for ar in self.aspect_ratios:
                   if ar == 1 and not box_widths:
                       # First square box: side equals min_size.
                       box_widths.append(self.min_size)
                       box_heights.append(self.min_size)
                   elif ar == 1:
                       # Second square box: geometric mean of min and max size.
                       side = np.sqrt(self.min_size * self.max_size)
                       box_widths.append(side)
                       box_heights.append(side)
                   else:
                       box_widths.append(self.min_size * np.sqrt(ar))
                       box_heights.append(self.min_size / np.sqrt(ar))
               # Keep half extents so they can be added to / subtracted from centers.
               box_widths = 0.5 * np.array(box_widths)
               box_heights = 0.5 * np.array(box_heights)

               # One box center per feature-map cell, placed at the cell's middle.
               step_x = img_width / layer_width
               step_y = img_height / layer_height
               linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
                                  layer_width)
               liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
                                  layer_height)
               centers_x, centers_y = np.meshgrid(linx, liny)
               centers_x = centers_x.reshape(-1, 1)
               centers_y = centers_y.reshape(-1, 1)

               # Repeat (cx, cy) so each prior owns four consecutive columns, then
               # turn them into (xmin, ymin, xmax, ymax) with the half extents.
               num_priors_ = len(self.aspect_ratios)
               prior_boxes = np.concatenate((centers_x, centers_y), axis=1)
               prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))
               prior_boxes[:, ::4] -= box_widths
               prior_boxes[:, 1::4] -= box_heights
               prior_boxes[:, 2::4] += box_widths
               prior_boxes[:, 3::4] += box_heights
               # Normalise: even columns are x values, odd columns are y values.
               prior_boxes[:, ::2] /= img_width
               prior_boxes[:, 1::2] /= img_height
               prior_boxes = prior_boxes.reshape(-1, 4)
               if self.clip:
                   prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)

               # Append the variances to every box.
               num_boxes = len(prior_boxes)
               if len(self.variances) == 1:
                   variances = np.ones((num_boxes, 4)) * self.variances[0]
               elif len(self.variances) == 4:
                   variances = np.tile(self.variances, (num_boxes, 1))
               else:
                   raise Exception('Must provide one or four variances.')
               prior_boxes = np.concatenate((prior_boxes, variances), axis=1)
               return prior_boxes
      
     
    
     
      
     
     
      
      
     
    
     
      
     
     
     
       # Build the default boxes of the six SSD300 feature maps with the modified
       # PriorBox class, stack them, and store the result in a pickle file.
       import pickle

       img_size = (300, 300)
       # The first feature map has no max_size, so it gets no second square box.
       default_box_layer1 = PriorBox(img_size, 30, None, aspect_ratios=[2], layer_shape=(38, 38)).compute_default_box()
       default_box_layer2 = PriorBox(img_size, 60, 114, aspect_ratios=[2, 3], layer_shape=(19, 19)).compute_default_box()
       default_box_layer3 = PriorBox(img_size, 114, 168, aspect_ratios=[2, 3], layer_shape=(10, 10)).compute_default_box()
       default_box_layer4 = PriorBox(img_size, 168, 222, aspect_ratios=[2, 3], layer_shape=(5, 5)).compute_default_box()
       default_box_layer5 = PriorBox(img_size, 222, 276, aspect_ratios=[2, 3], layer_shape=(3, 3)).compute_default_box()
       default_box_layer6 = PriorBox(img_size, 276, 330, aspect_ratios=[2, 3], layer_shape=(1, 1)).compute_default_box()

       # Concatenate the per-layer outputs along the box axis.
       default_box = np.concatenate(
           (default_box_layer1, default_box_layer2, default_box_layer3,
            default_box_layer4, default_box_layer5, default_box_layer6),
           axis=0)

       # Write the boxes to the pkl file; the with-block closes the file handle.
       with open("default_box_information", "wb") as pkl_file:
           pickle.dump(default_box, pkl_file)

免责声明！

本站转载的文章为个人学习借鉴使用，本站对版权不负任何法律责任。如果侵犯了您的隐私权益，请联系本站邮箱yoyou2525@163.com删除。

猜您在找 SSD算法的实现目标检测：SSD算法详解 SSD算法思想和结构详解 FM算法keras实现 ssd算法的pytorch实现与解读 ssd原理及代码实现详解 ssd原理及代码实现详解 SSD详解 SSD（single shot multibox detector）算法及Caffe代码详解[转] FM算法原理、细节问答、keras实现