在上一篇的博客講述了SSD的原理,這一篇主要是講解keras的實現。
keras代碼的github地址為:點擊打開鏈接
model 的框架實現(ssd.py):
先給出了改變后的VGG16的實現:
-
def SSD300(input_shape, num_classes=21):
    """Build the modified VGG16 backbone of the SSD300 network.

    Args:
        input_shape: Shape of the input image, e.g. ``(300, 300, 3)``.
        num_classes: Number of classes to detect (21 by default).

    NOTE: this listing is only the first part of the function; the
    prediction heads and the final merge are shown in the listings that
    follow it in the article.
    """
    # Every layer output is stored by name so the prediction heads can
    # later pick up the feature maps they need.
    net = {}
    # Block 1
    input_tensor = Input(shape=input_shape)
    img_size = (input_shape[1], input_shape[0])
    net['input'] = input_tensor
    net['conv1_1'] = Convolution2D(64, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv1_1')(net['input'])
    net['conv1_2'] = Convolution2D(64, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv1_2')(net['conv1_1'])
    net['pool1'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                name='pool1')(net['conv1_2'])
    # Block 2
    net['conv2_1'] = Convolution2D(128, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv2_1')(net['pool1'])
    net['conv2_2'] = Convolution2D(128, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv2_2')(net['conv2_1'])
    net['pool2'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                name='pool2')(net['conv2_2'])
    # Block 3
    net['conv3_1'] = Convolution2D(256, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv3_1')(net['pool2'])
    net['conv3_2'] = Convolution2D(256, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv3_2')(net['conv3_1'])
    net['conv3_3'] = Convolution2D(256, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv3_3')(net['conv3_2'])
    net['pool3'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                name='pool3')(net['conv3_3'])
    # Block 4
    net['conv4_1'] = Convolution2D(512, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv4_1')(net['pool3'])
    net['conv4_2'] = Convolution2D(512, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv4_2')(net['conv4_1'])
    net['conv4_3'] = Convolution2D(512, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv4_3')(net['conv4_2'])
    net['pool4'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                name='pool4')(net['conv4_3'])
    # Block 5
    net['conv5_1'] = Convolution2D(512, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv5_1')(net['pool4'])
    net['conv5_2'] = Convolution2D(512, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv5_2')(net['conv5_1'])
    net['conv5_3'] = Convolution2D(512, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv5_3')(net['conv5_2'])
    # Unlike pool1-pool4 this pooling uses a 3x3 window with stride 1.
    net['pool5'] = MaxPooling2D((3, 3), strides=(1, 1), border_mode='same',
                                name='pool5')(net['conv5_3'])
    # FC6: implemented as an atrous (dilated) convolution.
    net['fc6'] = AtrousConvolution2D(1024, 3, 3, atrous_rate=(6, 6),
                                     activation='relu', border_mode='same',
                                     name='fc6')(net['pool5'])
    # FC7: implemented as an ordinary 1x1 convolution.
    net['fc7'] = Convolution2D(1024, 1, 1, activation='relu',
                               border_mode='same', name='fc7')(net['fc6'])
    # Block 6
    net['conv6_1'] = Convolution2D(256, 1, 1, activation='relu',
                                   border_mode='same',
                                   name='conv6_1')(net['fc7'])
    net['conv6_2'] = Convolution2D(512, 3, 3, subsample=(2, 2),
                                   activation='relu', border_mode='same',
                                   name='conv6_2')(net['conv6_1'])
    # Block 7
    net['conv7_1'] = Convolution2D(128, 1, 1, activation='relu',
                                   border_mode='same',
                                   name='conv7_1')(net['conv6_2'])
    # Explicit zero padding followed by a 'valid' strided convolution.
    net['conv7_2'] = ZeroPadding2D()(net['conv7_1'])
    net['conv7_2'] = Convolution2D(256, 3, 3, subsample=(2, 2),
                                   activation='relu', border_mode='valid',
                                   name='conv7_2')(net['conv7_2'])
    # Block 8
    net['conv8_1'] = Convolution2D(128, 1, 1, activation='relu',
                                   border_mode='same',
                                   name='conv8_1')(net['conv7_2'])
    net['conv8_2'] = Convolution2D(256, 3, 3, subsample=(2, 2),
                                   activation='relu', border_mode='same',
                                   name='conv8_2')(net['conv8_1'])
    # Last Pool
    net['pool6'] = GlobalAveragePooling2D(name='pool6')(net['conv8_2'])
標紅部分就是進行改變的部分,可以看出把FC6換成了空洞卷積,和普通卷積差不多,就是把一次卷積的感受域擴大了。FC7換成了普通卷積,之后再添加了幾個卷積塊。
接下來就是通過改變后的VGG16得到的多層feature map來預測location 和 confidence。使用到的feature map 有:conv4_3、fc7、conv6_2、conv7_2、conv8_2、pool6。總共6層的feature map。因為對於每層的處理步驟差不多,所以就貼出conv4_3處理的代碼:
-
# Prediction from conv4_3
net['conv4_3_norm'] = Normalize(20, name='conv4_3_norm')(net['conv4_3'])
num_priors = 3
# Location head: 4 values per default box.
x = Convolution2D(num_priors * 4, 3, 3, border_mode='same',
                  name='conv4_3_norm_mbox_loc')(net['conv4_3_norm'])
net['conv4_3_norm_mbox_loc'] = x
flatten = Flatten(name='conv4_3_norm_mbox_loc_flat')
net['conv4_3_norm_mbox_loc_flat'] = flatten(net['conv4_3_norm_mbox_loc'])
# Confidence head: num_classes values per default box. The layer name gets
# a suffix when num_classes is not 21.
name = 'conv4_3_norm_mbox_conf'
if num_classes != 21:
    name += '_{}'.format(num_classes)
x = Convolution2D(num_priors * num_classes, 3, 3, border_mode='same',
                  name=name)(net['conv4_3_norm'])
net['conv4_3_norm_mbox_conf'] = x
flatten = Flatten(name='conv4_3_norm_mbox_conf_flat')
net['conv4_3_norm_mbox_conf_flat'] = flatten(net['conv4_3_norm_mbox_conf'])
# Default boxes for this feature map.
priorbox = PriorBox(img_size, 30.0, aspect_ratios=[2],
                    variances=[0.1, 0.1, 0.2, 0.2],
                    name='conv4_3_norm_mbox_priorbox')
net['conv4_3_norm_mbox_priorbox'] = priorbox(net['conv4_3_norm'])
可以看出對於conv4_3這層的feature map,采用的default box 的個數為3。所以location預測這個卷積層使用的卷積核個數為:3*4=12個。卷積完之后進行flatten,因為最后的輸出是多層feature map預測的concatenate。同理,對於confidence預測采用的卷積核個數為:21*3=63(對於voc數據集而言)。對於PriorBox這一層,目前只需要知道它是對feature map 進行相應的操作,來得到default box的,而且對於特定的一層feature map而言,它是固定不變的,不隨train或者predict的過程改變的。
對於pool6產生的feature map處理有一些不一樣,這里單獨的拿出來說一下,因為pool6層使用的是global average pool,所以它輸出的大小為1*1*256,比較小,不太適合用卷積處理了,就直接用Dense層來處理了:
-
# Prediction from pool6
num_priors = 6
# pool6 is globally pooled, so Dense layers are used instead of convolutions.
x = Dense(num_priors * 4, name='pool6_mbox_loc_flat')(net['pool6'])
net['pool6_mbox_loc_flat'] = x
name = 'pool6_mbox_conf_flat'
if num_classes != 21:
    name += '_{}'.format(num_classes)
x = Dense(num_priors * num_classes, name=name)(net['pool6'])
net['pool6_mbox_conf_flat'] = x
priorbox = PriorBox(img_size, 276.0, max_size=330.0, aspect_ratios=[2, 3],
                    variances=[0.1, 0.1, 0.2, 0.2],
                    name='pool6_mbox_priorbox')
# PriorBox reads the width/height axes of its input, so the pooled vector
# is reshaped into a 1x1 feature map in the backend's dimension ordering.
if K.image_dim_ordering() == 'tf':
    target_shape = (1, 1, 256)
else:
    target_shape = (256, 1, 1)
net['pool6_reshaped'] = Reshape(target_shape,
                                name='pool6_reshaped')(net['pool6'])
net['pool6_mbox_priorbox'] = priorbox(net['pool6_reshaped'])
每層預測完事之后呢,當然是把他們都給concatenate起來,就貼location的實現,其他兩個類似:
-
# Concatenate the flattened location predictions of all six feature maps
# along axis 1.
net['mbox_loc'] = merge([net['conv4_3_norm_mbox_loc_flat'],
                         net['fc7_mbox_loc_flat'],
                         net['conv6_2_mbox_loc_flat'],
                         net['conv7_2_mbox_loc_flat'],
                         net['conv8_2_mbox_loc_flat'],
                         net['pool6_mbox_loc_flat']],
                        mode='concat', concat_axis=1, name='mbox_loc')
因為之前進行了flatten,所以concatenate得到的是一個batch中每個sample所有的location位置,並且是一個一維的形式存在,需要把它給重新reshape成[batch, number of default box, 4 ]的形式;預測的class分類也是類似的:[batch, number of default box, 21 ]。最后再將location、class、default box三者進行merge得到最終的預測結果。
-
# Compute the number of default boxes: the flattened location vector holds
# 4 values per box.
if hasattr(net['mbox_loc'], '_keras_shape'):
    num_boxes = net['mbox_loc']._keras_shape[-1] // 4
elif hasattr(K, 'int_shape'):
    # Fix: the capability check belongs on the backend module K, which is
    # where int_shape is called from, not on the tensor.
    num_boxes = K.int_shape(net['mbox_loc'])[-1] // 4
net['mbox_loc'] = Reshape((num_boxes, 4),
                          name='mbox_loc_final')(net['mbox_loc'])
net['mbox_conf'] = Reshape((num_boxes, num_classes),
                           name='mbox_conf_logits')(net['mbox_conf'])
net['mbox_conf'] = Activation('softmax',
                              name='mbox_conf_final')(net['mbox_conf'])
# Final prediction: locations, class confidences and prior boxes are
# concatenated along the last axis.
net['predictions'] = merge([net['mbox_loc'],
                            net['mbox_conf'],
                            net['mbox_priorbox']],
                           mode='concat', concat_axis=2,
                           name='predictions')
我們來計算一下這六層feature map總共擁有的default box的數量:38*38*3+19*19*6+10*10*6+5*5*6+3*3*6+1*1*6=7308。和論文中還是存在一定的差別的。
接一下就是介紹一下model中使用到的PriorBox層的作用。它是作用在每一層的feature map上的,根據輸入的不同aspect ratio 和 scale 以及 num_prior來返回特定的default box,default box 的數目是feature map的height*width*num_prior。具體看代碼:
-
class PriorBox(Layer):
    """Generate the default (prior) boxes for one feature map.

    Arguments:
        img_size: Size of the input image as (w, h).
        min_size: Smallest box scale per feature-map cell, in pixels
            (not normalized).
        max_size: Largest box scale per feature-map cell, in pixels
            (not normalized).
        aspect_ratios: Aspect ratios of the boxes.
        flip: Whether to also add the reciprocal of every aspect ratio.
        variances: Variances for x, y, w, h (one or four values).
        clip: Whether to clip the box coordinates to [0, 1].

    Input shape:
        4D tensor ``(samples, rows, cols, channels)``.

    Output shape:
        3D tensor ``(samples, num_boxes, 8)``; the last axis holds
        ``(xmin, ymin, xmax, ymax, variance[0], variance[1], variance[2],
        variance[3])``.
    """

    def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
                 flip=True, variances=[0.1], clip=True, **kwargs):
        # Axis indices of width/height in the input tensor.
        self.waxis = 2
        self.haxis = 1
        self.img_size = img_size
        if min_size <= 0:
            raise Exception('min_size must be positive.')
        self.min_size = min_size
        self.max_size = max_size
        self.aspect_ratios = [1.0]
        if max_size:
            if max_size < min_size:
                raise Exception('max_size must be greater than min_size.')
            # A second ratio-1 entry stands for the extra square box of
            # size sqrt(min_size * max_size) built in call().
            self.aspect_ratios.append(1.0)
        # Expand the given aspect ratios into the full list.
        if aspect_ratios:
            for ar in aspect_ratios:
                if ar in self.aspect_ratios:
                    continue
                self.aspect_ratios.append(ar)
                if flip:
                    self.aspect_ratios.append(1.0 / ar)
        # np.array copies, so the shared default list is never mutated.
        self.variances = np.array(variances)
        # Fix: honour the ``clip`` argument instead of forcing True.
        self.clip = clip
        super(PriorBox, self).__init__(**kwargs)

    def compute_output_shape(self, input_shape):
        """Return the output shape of this custom layer."""
        num_priors_ = len(self.aspect_ratios)
        layer_width = input_shape[self.waxis]
        layer_height = input_shape[self.haxis]
        num_boxes = num_priors_ * layer_width * layer_height
        return (input_shape[0], num_boxes, 8)

    def call(self, x, mask=None):
        """Return the prior-box tensor for the feature map ``x``."""
        if hasattr(x, '_keras_shape'):
            input_shape = x._keras_shape
        elif hasattr(K, 'int_shape'):
            input_shape = K.int_shape(x)
        layer_width = input_shape[self.waxis]
        layer_height = input_shape[self.haxis]
        img_width = self.img_size[0]
        img_height = self.img_size[1]
        # define prior boxes shapes
        box_widths = []
        box_heights = []
        for ar in self.aspect_ratios:
            if ar == 1 and len(box_widths) == 0:
                box_widths.append(self.min_size)
                box_heights.append(self.min_size)
            elif ar == 1 and len(box_widths) > 0:
                box_widths.append(np.sqrt(self.min_size * self.max_size))
                box_heights.append(np.sqrt(self.min_size * self.max_size))
            elif ar != 1:
                box_widths.append(self.min_size * np.sqrt(ar))
                box_heights.append(self.min_size / np.sqrt(ar))
        # Half extents, added to / subtracted from the centers below.
        box_widths = 0.5 * np.array(box_widths)
        box_heights = 0.5 * np.array(box_heights)
        # define centers of prior boxes
        step_x = img_width / layer_width
        step_y = img_height / layer_height
        # Center coordinates of the default boxes (one per cell).
        linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
                           layer_width)
        liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
                           layer_height)
        centers_x, centers_y = np.meshgrid(linx, liny)
        centers_x = centers_x.reshape(-1, 1)
        centers_y = centers_y.reshape(-1, 1)
        # define xmin, ymin, xmax, ymax of prior boxes
        num_priors_ = len(self.aspect_ratios)
        # One (center_x, center_y) pair per row ...
        prior_boxes = np.concatenate((centers_x, centers_y), axis=1)
        # ... tiled to (cx, cy, cx, cy) once per prior.
        prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))
        prior_boxes[:, ::4] -= box_widths
        prior_boxes[:, 1::4] -= box_heights
        prior_boxes[:, 2::4] += box_widths
        prior_boxes[:, 3::4] += box_heights
        # Normalize: even columns are x values, odd columns are y values.
        prior_boxes[:, ::2] /= img_width
        prior_boxes[:, 1::2] /= img_height
        # Reshape to (num_boxes, 4): normalized (xmin, ymin, xmax, ymax).
        prior_boxes = prior_boxes.reshape(-1, 4)
        if self.clip:
            prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)
        # define variances
        num_boxes = len(prior_boxes)
        if len(self.variances) == 1:
            variances = np.ones((num_boxes, 4)) * self.variances[0]
        elif len(self.variances) == 4:
            variances = np.tile(self.variances, (num_boxes, 1))
        else:
            raise Exception('Must provide one or four variances.')
        # Append the variances to every box.
        prior_boxes = np.concatenate((prior_boxes, variances), axis=1)
        prior_boxes_tensor = K.expand_dims(K.variable(prior_boxes), 0)
        if K.backend() == 'tensorflow':
            # Repeat the boxes once per sample of the batch.
            pattern = [tf.shape(x)[0], 1, 1]
            prior_boxes_tensor = tf.tile(prior_boxes_tensor, pattern)
        return prior_boxes_tensor
綜合上面對model的分析,最后預測輸出的shape為:[batch_size, num_box, location+num_class+8]
整體的架構完事之后,就需要准備好數據和loss function了,先看看如何預處理數據吧。
model的數據准備:
代碼中編寫了一個處理VOC數據集的py文件:
-
import numpy as np
-
import os
-
from xml.etree import ElementTree
-
-
class XML_preprocessor(object):
    """Parse PASCAL VOC XML annotations into per-image ground-truth arrays.

    After construction ``self.data`` maps every image file name to an array
    of shape ``(num_objects, 4 + num_classes)``: the box corners
    ``xmin, ymin, xmax, ymax`` divided by the image width/height, followed
    by a one-hot class vector.
    """

    # Position in this tuple == index of the class in the one-hot vector.
    CLASS_NAMES = (
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    )

    def __init__(self, data_path):
        """Parse every annotation file found in directory ``data_path``."""
        self.path_prefix = data_path
        self.num_classes = 20
        self.data = dict()
        self._preprocess_XML()

    def _preprocess_XML(self):
        """Fill ``self.data`` from the files in ``self.path_prefix``."""
        for filename in os.listdir(self.path_prefix):
            # os.path.join works with or without a trailing separator.
            tree = ElementTree.parse(os.path.join(self.path_prefix, filename))
            root = tree.getroot()
            bounding_boxes = []
            one_hot_classes = []
            size_tree = root.find('size')
            width = float(size_tree.find('width').text)
            height = float(size_tree.find('height').text)
            for object_tree in root.findall('object'):
                # Only the object's own <bndbox> child is used. Searching
                # all descendants would also return boxes nested deeper
                # (e.g. under <part>), giving more boxes than labels.
                bounding_box = object_tree.find('bndbox')
                if bounding_box is None:
                    continue
                xmin = float(bounding_box.find('xmin').text) / width
                ymin = float(bounding_box.find('ymin').text) / height
                xmax = float(bounding_box.find('xmax').text) / width
                ymax = float(bounding_box.find('ymax').text) / height
                bounding_boxes.append([xmin, ymin, xmax, ymax])
                class_name = object_tree.find('name').text
                one_hot_classes.append(self._to_one_hot(class_name))
            image_name = root.find('filename').text
            # The reshape keeps the result 2-D even when there are no objects.
            bounding_boxes = np.asarray(bounding_boxes,
                                        dtype=float).reshape(-1, 4)
            one_hot_classes = np.asarray(one_hot_classes).reshape(
                -1, self.num_classes)
            image_data = np.hstack((bounding_boxes, one_hot_classes))
            self.data[image_name] = image_data

    def _to_one_hot(self, name):
        """Return the one-hot list for class ``name``.

        An unknown name is reported on stdout and yields an all-zero vector.
        """
        one_hot_vector = [0] * self.num_classes
        try:
            index = self.CLASS_NAMES.index(name)
        except ValueError:
            print('unknown label: %s' % name)
        else:
            one_hot_vector[index] = 1
        return one_hot_vector
-
# Write the parsed annotations to a pkl file.
import pickle

data = XML_preprocessor('VOC2007/Annotations/').data
# The context manager makes sure the file is flushed and closed.
with open('VOC2007.p', 'wb') as f:
    pickle.dump(data, f)
把標注寫入到pkl文件中后,再利用定義一個Generator類來產生x_batch和 y_batch用於訓練,直接看重點,類中的generate函數:
-
def generate(self, train=True):
    """Yield ``(x_batch, y_batch)`` pairs forever.

    Args:
        train: If True, iterate over the training keys and apply data
            augmentation; otherwise iterate over the validation keys.

    A batch is yielded each time ``self.batch_size`` samples have been
    collected. Samples left over at the end of a pass are discarded, since
    the buffers are reset when the next pass starts.
    """
    while True:
        if train:
            shuffle(self.train_keys)
            keys = self.train_keys
        else:
            shuffle(self.val_keys)
            keys = self.val_keys
        inputs = []
        targets = []
        for key in keys:
            img_path = self.path_prefix + key
            img = imread(img_path).astype('float32')
            # Ground truth read from the pkl file: one row per box,
            # (num_box, coordinates + num_class).
            y = self.gt[key].copy()
            if train and self.do_crop:
                img, y = self.random_sized_crop(img, y)
            img = imresize(img, self.image_size).astype('float32')
            if train:
                # Data augmentation.
                shuffle(self.color_jitter)
                for jitter in self.color_jitter:
                    img = jitter(img)
                if self.lighting_std:
                    img = self.lighting(img)
                if self.hflip_prob > 0:
                    img, y = self.horizontal_flip(img, y)
                if self.vflip_prob > 0:
                    img, y = self.vertical_flip(img, y)
            # Assign the ground-truth boxes to the default boxes.
            y = self.bbox_util.assign_boxes(y)
            inputs.append(img)
            targets.append(y)
            if len(targets) == self.batch_size:
                tmp_inp = np.array(inputs)
                tmp_targets = np.array(targets)
                inputs = []
                targets = []
                # One batch of preprocessed inputs and their target labels.
                yield preprocess_input(tmp_inp), tmp_targets
在給groud truth 分配 default box 時用到了BBoxUtility類中的assign_boxes函數,這個類是寫在ssd_utils.py文件中的,其中的assign_boxes函數的代碼如下:
-
def assign_boxes(self, boxes):
    """Assign ground-truth boxes to the default (prior) boxes.

    Args:
        boxes: Array with one row per ground-truth box. Columns 0-3 are
            passed to ``self.encode_box``; the remaining columns are the
            class indicators and are copied into columns ``5:-8`` of the
            result.

    Returns:
        Array of shape ``(num_priors, 4 + num_classes + 8)``. Column 4
        starts at 1 for every prior and is set to 0 for priors that
        received a ground-truth box (presumably the background indicator).
        Of the last 8 columns only column ``-8`` is written: it is 1 for
        priors that received a ground-truth box.
    """
    assignment = np.zeros((self.num_priors, 4 + self.num_classes + 8))
    assignment[:, 4] = 1.0
    if len(boxes) == 0:
        return assignment
    # encode_box returns num_priors * 5 values per ground-truth box; the
    # last value of each group of 5 is used as the IoU below.
    encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])
    encoded_boxes = encoded_boxes.reshape(-1, self.num_priors, 5)
    # Best IoU over all objects for every prior box ...
    best_iou = encoded_boxes[:, :, -1].max(axis=0)
    # ... and the index of the object that achieves it
    # (len(best_iou_idx) == num_priors at this point).
    best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)
    # Keep only the priors whose best IoU is positive.
    best_iou_mask = best_iou > 0
    best_iou_idx = best_iou_idx[best_iou_mask]
    assign_num = len(best_iou_idx)
    encoded_boxes = encoded_boxes[:, best_iou_mask, :]
    # For every kept prior take the encoding of its best object;
    # best_iou_idx values lie in range(num_objects).
    assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx,
                                                     np.arange(assign_num),
                                                     :4]
    assignment[:, 4][best_iou_mask] = 0
    assignment[:, 5:-8][best_iou_mask] = boxes[best_iou_idx, 4:]
    assignment[:, -8][best_iou_mask] = 1
    return assignment
-
返回了最終的assignment,用於作為訓練時候的標准輸出。
值得注意的是,在這個類里面用到self.prior,即default box都是作者先寫入到了pkl文件中的,方便於使用,而且對於特定大小的feature map而言,default box是保持不變的,所以提前給出是不會影響訓練的。
輸入的數據和標准的輸出都知道了,接下來就是定義loss function 了
model 的 loss function:
model 的loss function定義在了ssd_training.py文件中了,里面定義了一些有用的功能函數,來幫助最終loss計算的,我們就直接看最終計算那個loss的函數:
-
def compute_loss(self, y_true, y_pred):
    """Compute the SSD multibox loss.

    A custom Keras loss takes exactly these two arguments: the target and
    the prediction.

    Args:
        y_true: Target tensor of shape
            ``(?, num_boxes, 4 + num_classes + 8)``, i.e. the assignment
            built by ``assign_boxes``.
        y_pred: Prediction tensor of the same shape (the model output).

    Returns:
        The total loss, one value per sample of the batch.
    """
    batch_size = tf.shape(y_true)[0]
    num_boxes = tf.to_float(tf.shape(y_true)[1])
    # Loss of every default box.
    conf_loss = self._softmax_loss(y_true[:, :, 4:-8],
                                   y_pred[:, :, 4:-8])
    loc_loss = self._l1_smooth_loss(y_true[:, :, :4],
                                    y_pred[:, :, :4])
    # Positive samples: y_true[:, :, -8] is 1 for boxes that were assigned
    # a ground-truth box. num_pos has one entry per sample.
    num_pos = tf.reduce_sum(y_true[:, :, -8], axis=-1)
    pos_loc_loss = tf.reduce_sum(loc_loss * y_true[:, :, -8],
                                 axis=1)
    pos_conf_loss = tf.reduce_sum(conf_loss * y_true[:, :, -8],
                                  axis=1)
    # Negative samples: only the confidence loss is used.
    num_neg = tf.minimum(self.neg_pos_ratio * num_pos,
                         num_boxes - num_pos)
    pos_num_neg_mask = tf.greater(num_neg, 0)
    has_min = tf.to_float(tf.reduce_any(pos_num_neg_mask))
    # If no sample has a positive num_neg, the appended entry equals
    # negatives_for_hard; otherwise it is 0 and is filtered out below.
    num_neg = tf.concat(axis=0, values=[num_neg,
                                        [(1 - has_min) * self.negatives_for_hard]])
    # Keep only the positive entries and use the smallest one as the
    # number of negatives for the whole batch.
    num_neg_batch = tf.reduce_min(tf.boolean_mask(num_neg,
                                                  tf.greater(num_neg, 0)))
    num_neg_batch = tf.to_int32(num_neg_batch)
    confs_start = 4 + self.background_label_id + 1
    confs_end = confs_start + self.num_classes - 1
    # max_confs has shape (batch, num_prior).
    max_confs = tf.reduce_max(y_pred[:, :, confs_start:confs_end],
                              axis=2)
    # Top-k boxes among the negatives (positives are zeroed out);
    # indices has shape (batch, num_neg_batch).
    _, indices = tf.nn.top_k(max_confs * (1 - y_true[:, :, -8]),
                             k=num_neg_batch)
    # Batch index of the same shape (batch, num_neg_batch).
    batch_idx = tf.expand_dims(tf.range(0, batch_size), 1)
    batch_idx = tf.tile(batch_idx, (1, num_neg_batch))
    # batch index * num_boxes is the offset of each sample in the flattened
    # loss; adding the top-k index gives a flat index.
    full_indices = (tf.reshape(batch_idx, [-1]) * tf.to_int32(num_boxes) +
                    tf.reshape(indices, [-1]))
    # Flatten conf_loss and pick the selected negatives.
    neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]),
                              full_indices)
    # Back to (batch, num_neg_batch), then sum per sample.
    neg_conf_loss = tf.reshape(neg_conf_loss,
                               [batch_size, num_neg_batch])
    neg_conf_loss = tf.reduce_sum(neg_conf_loss, axis=1)
    # Combine the positive and negative losses.
    total_loss = pos_conf_loss + neg_conf_loss
    total_loss /= (num_pos + tf.to_float(num_neg_batch))
    # Avoid dividing by zero for samples without positives.
    num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos,
                       tf.ones_like(num_pos))
    total_loss += (self.alpha * pos_loc_loss) / num_pos
    return total_loss
這時候loss function 也准備好了,屬於一切都准備就緒了。當然就是進行訓練了。其實在寫這篇blog之前我還是對loss function 這塊沒有太細看明白,寫完之后頓時就恍然大悟的,寫blog確實是一個自我學習的一個很好過程。
model 進行 training
training這一塊是寫在SSD_training.ipynb的jupyter notebook文件中的,上面那些model 的部件准備好了之后,training就按照keras的流程照搬就好了。
不過需要注意一下,作者給的這個訓練並不是voc數據集的訓練,而是對3種瓶子的檢測。
1.必要的庫和自己編寫的模塊的導入:
-
import cv2
-
import keras
-
from keras.applications.imagenet_utils import preprocess_input
-
from keras.backend.tensorflow_backend import set_session
-
from keras.models import Model
-
from keras.preprocessing import image
-
import matplotlib.pyplot as plt
-
import numpy as np
-
import pickle
-
from random import shuffle
-
from scipy.misc import imread
-
from scipy.misc import imresize
-
import tensorflow as tf
-
from ssd import SSD300
-
from ssd_training import MultiboxLoss
-
from ssd_utils import BBoxUtility
-
-
%matplotlib inline
-
plt.rcParams['figure.figsize'] = (8, 8)
-
plt.rcParams['image.interpolation'] = 'nearest'
-
-
np.set_printoptions(suppress=True)
2.必要的初始化參數和prior box 的讀取,以及輸入數據的讀取:
-
NUM_CLASSES = 4
input_shape = (300, 300, 3)
# prior_boxes_ssd300.pkl stores all priors as
# [xmin, ymin, xmax, ymax, var[0], var[1], var[2], var[3]].
# NOTE(review): pickle.load must only be used on trusted files.
with open('prior_boxes_ssd300.pkl', 'rb') as f:
    priors = pickle.load(f)
bbox_util = BBoxUtility(NUM_CLASSES, priors)
# File names, bounding boxes and labels of the input data.
with open('gt_pascal.pkl', 'rb') as f:
    gt = pickle.load(f)
keys = sorted(gt.keys())
# The first 80% of the sorted keys are used for training, the rest for
# validation.
num_train = int(round(0.8 * len(keys)))
train_keys = keys[:num_train]
val_keys = keys[num_train:]
num_val = len(val_keys)
3.輸入數據和label的generator類定義,有點長,就把generate 那個函數貼出來:
-
class Generator(object):
    """Batch generator for training and validation data.

    NOTE: only ``generate`` is listed here; the constructor and the
    augmentation helpers it calls are omitted from the article.
    """

    def generate(self, train=True):
        """Yield ``(x_batch, y_batch)`` pairs forever.

        Args:
            train: If True, iterate over the training keys and apply data
                augmentation; otherwise iterate over the validation keys.
        """
        while True:
            if train:
                shuffle(self.train_keys)
                keys = self.train_keys
            else:
                shuffle(self.val_keys)
                keys = self.val_keys
            inputs = []
            targets = []
            for key in keys:
                img_path = self.path_prefix + key
                img = imread(img_path).astype('float32')
                # One row per box: (num_box, coordinates + num_class).
                y = self.gt[key].copy()
                if train and self.do_crop:
                    img, y = self.random_sized_crop(img, y)
                img = imresize(img, self.image_size).astype('float32')
                if train:
                    shuffle(self.color_jitter)
                    for jitter in self.color_jitter:
                        img = jitter(img)
                    if self.lighting_std:
                        img = self.lighting(img)
                    if self.hflip_prob > 0:
                        img, y = self.horizontal_flip(img, y)
                    if self.vflip_prob > 0:
                        img, y = self.vertical_flip(img, y)
                y = self.bbox_util.assign_boxes(y)
                inputs.append(img)
                targets.append(y)
                if len(targets) == self.batch_size:
                    tmp_inp = np.array(inputs)
                    tmp_targets = np.array(targets)
                    inputs = []
                    targets = []
                    # Emit one batch.
                    yield preprocess_input(tmp_inp), tmp_targets
4.必要的初始化
-
# Root directory of the input images.
path_prefix = '../../frames/'
gen = Generator(gt, bbox_util, 16, path_prefix,
                train_keys, val_keys,
                (input_shape[0], input_shape[1]), do_crop=False)
# Build the SSD300 model.
model = SSD300(input_shape, num_classes=NUM_CLASSES)
# by_name=True loads weights only into layers whose names match.
model.load_weights('weights_SSD300.hdf5', by_name=True)
# Layers listed here are excluded from training.
freeze = ['input_1', 'conv1_1', 'conv1_2', 'pool1',
          'conv2_1', 'conv2_2', 'pool2',
          'conv3_1', 'conv3_2', 'conv3_3', 'pool3']
for layer in model.layers:
    if layer.name in freeze:
        layer.trainable = False
5.keras的一些callback function的定義以及model的compile and training:
-
# Defined before ``schedule`` because the function reads it.
base_lr = 3e-4


def schedule(epoch, decay=0.9):
    """Return the learning rate for ``epoch``: ``base_lr * decay ** epoch``."""
    return base_lr * decay**(epoch)


callbacks = [keras.callbacks.ModelCheckpoint('./checkpoints/weights.{epoch:02d}-{val_loss:.2f}.hdf5',
                                             verbose=1,
                                             save_weights_only=True),
             keras.callbacks.LearningRateScheduler(schedule)]
optim = keras.optimizers.Adam(lr=base_lr)
# optim = keras.optimizers.RMSprop(lr=base_lr)
# optim = keras.optimizers.SGD(lr=base_lr, momentum=0.9, decay=decay, nesterov=True)
model.compile(optimizer=optim,
              loss=MultiboxLoss(NUM_CLASSES, neg_pos_ratio=2.0).compute_loss)
nb_epoch = 30
history = model.fit_generator(gen.generate(True), gen.train_batches,
                              nb_epoch, verbose=1,
                              callbacks=callbacks,
                              validation_data=gen.generate(False),
                              nb_val_samples=gen.val_batches,
                              nb_worker=1)
6.train完了之后,當然是檢測了:
-
# Read the data.
inputs = []
images = []
img_path = path_prefix + sorted(val_keys)[0]
img = image.load_img(img_path, target_size=(300, 300))
img = image.img_to_array(img)
images.append(imread(img_path))
inputs.append(img.copy())
inputs = preprocess_input(np.array(inputs))
# Predict, then decode the raw predictions.
preds = model.predict(inputs, batch_size=1, verbose=1)
results = bbox_util.detection_out(preds)
# One color per label; computed once, outside the loop.
colors = plt.cm.hsv(np.linspace(0, 1, 4)).tolist()
# Visualize the predictions.
for img_idx, img in enumerate(images):
    # Parse the outputs.
    det_label = results[img_idx][:, 0]
    det_conf = results[img_idx][:, 1]
    det_xmin = results[img_idx][:, 2]
    det_ymin = results[img_idx][:, 3]
    det_xmax = results[img_idx][:, 4]
    det_ymax = results[img_idx][:, 5]

    # Get detections with confidence higher than 0.6.
    top_indices = [k for k, conf in enumerate(det_conf) if conf >= 0.6]
    top_conf = det_conf[top_indices]
    top_label_indices = det_label[top_indices].tolist()
    top_xmin = det_xmin[top_indices]
    top_ymin = det_ymin[top_indices]
    top_xmax = det_xmax[top_indices]
    top_ymax = det_ymax[top_indices]

    plt.imshow(img / 255.)
    currentAxis = plt.gca()

    # A separate index name keeps the image index above from being clobbered.
    for det_idx in range(top_conf.shape[0]):
        # Scale the normalized corners back to pixels.
        xmin = int(round(top_xmin[det_idx] * img.shape[1]))
        ymin = int(round(top_ymin[det_idx] * img.shape[0]))
        xmax = int(round(top_xmax[det_idx] * img.shape[1]))
        ymax = int(round(top_ymax[det_idx] * img.shape[0]))
        score = top_conf[det_idx]
        label = int(top_label_indices[det_idx])
        # The label is shown as a number because the training set here is
        # not VOC but a few kinds of bottles.
        display_txt = '{:0.2f}, {}'.format(score, label)
        coords = (xmin, ymin), xmax-xmin+1, ymax-ymin+1
        color = colors[label]
        currentAxis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2))
        currentAxis.text(xmin, ymin, display_txt, bbox={'facecolor':color, 'alpha':0.5})

    plt.show()
7.predict 的結果:
整個過程也就基本上的結束了。SSD的keras實現還是比較簡單的,沒有mask r-cnn那么費勁。不知道為啥我先看的yolo的原理和實現,但是不太想寫yolo的實現和原理(手動白眼),直接跳到了SSD,大概是覺得SSD比較好理解吧,yolo等有時間再寫吧。
之后我再把生成prior box pkl文件的代碼貼上來,自己寫的代碼有點亂。希望看到了最后你對SSD的模型架構和具體實現都有了一個很好的認識。因為也是一個新手,所以其中有什么理解不到位,或者寫錯的,歡迎指出。
添加:prior box 的 pkl文件生成代碼:其實也很簡單,就是稍微修改了一下PriorBox這個自定義的keras layer,把輸出用來產生對於特定feature map 大小的 default box:
-
import numpy as np
-
class PriorBox():
    """Compute the default (prior) boxes of one feature map with NumPy only.

    Arguments:
        img_size: Size of the input image as (w, h).
        min_size: Smallest box scale, in pixels.
        max_size: Largest box scale, in pixels; a falsy value disables the
            extra square box of size ``sqrt(min_size * max_size)``.
        aspect_ratios: Aspect ratios of the boxes.
        flip: Whether to also add the reciprocal of every aspect ratio.
        variances: One or four variance values appended to every box.
        clip: Whether to clip the box coordinates to [0, 1].
        layer_shape: ``(height, width)`` of the feature map.
    """

    def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
                 flip=True, variances=[0.1, 0.1, 0.2, 0.2], clip=True,
                 layer_shape=[8, 8], **kwargs):
        self.input_shape = layer_shape
        self.img_size = img_size
        if min_size <= 0:
            raise Exception('min_size must be positive.')
        self.min_size = min_size
        self.max_size = max_size
        self.aspect_ratios = [1.0]
        if max_size:
            if max_size < min_size:
                raise Exception('max_size must be greater than min_size.')
            self.aspect_ratios.append(1.0)
        if aspect_ratios:
            for ar in aspect_ratios:
                if ar in self.aspect_ratios:
                    continue
                self.aspect_ratios.append(ar)
                if flip:
                    self.aspect_ratios.append(1.0 / ar)
        # np.array copies, so the shared default list is never mutated.
        self.variances = np.array(variances)
        # Fix: honour the ``clip`` argument instead of forcing True.
        self.clip = clip
        super(PriorBox, self).__init__(**kwargs)

    def compute_default_box(self):
        """Return an array of shape ``(num_boxes, 8)``.

        Each row is ``(xmin, ymin, xmax, ymax)`` normalized by the image
        size, followed by the four variances.
        """
        layer_height = self.input_shape[0]
        layer_width = self.input_shape[1]
        img_width = self.img_size[0]
        img_height = self.img_size[1]
        # define prior boxes shapes
        box_widths = []
        box_heights = []
        for ar in self.aspect_ratios:
            if ar == 1 and len(box_widths) == 0:
                box_widths.append(self.min_size)
                box_heights.append(self.min_size)
            elif ar == 1 and len(box_widths) > 0:
                box_widths.append(np.sqrt(self.min_size * self.max_size))
                box_heights.append(np.sqrt(self.min_size * self.max_size))
            elif ar != 1:
                box_widths.append(self.min_size * np.sqrt(ar))
                box_heights.append(self.min_size / np.sqrt(ar))
        # Half extents, added to / subtracted from the centers below.
        box_widths = 0.5 * np.array(box_widths)
        box_heights = 0.5 * np.array(box_heights)
        # define centers of prior boxes
        step_x = img_width / layer_width
        step_y = img_height / layer_height
        # One center per feature-map cell along each axis.
        linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
                           layer_width)
        liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
                           layer_height)
        # meshgrid expands the two axes into all center coordinates.
        centers_x, centers_y = np.meshgrid(linx, liny)
        centers_x = centers_x.reshape(-1, 1)
        centers_y = centers_y.reshape(-1, 1)
        # define xmin, ymin, xmax, ymax of prior boxes
        num_priors_ = len(self.aspect_ratios)
        prior_boxes = np.concatenate((centers_x, centers_y), axis=1)
        # (cx, cy, cx, cy) repeated once per prior.
        prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))
        prior_boxes[:, ::4] -= box_widths
        prior_boxes[:, 1::4] -= box_heights
        prior_boxes[:, 2::4] += box_widths
        prior_boxes[:, 3::4] += box_heights
        # Even columns are x values, odd columns are y values.
        prior_boxes[:, ::2] /= img_width
        prior_boxes[:, 1::2] /= img_height
        prior_boxes = prior_boxes.reshape(-1, 4)
        if self.clip:
            prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)
        # define variances
        num_boxes = len(prior_boxes)
        if len(self.variances) == 1:
            variances = np.ones((num_boxes, 4)) * self.variances[0]
        elif len(self.variances) == 4:
            variances = np.tile(self.variances, (num_boxes, 1))
        else:
            raise Exception('Must provide one or four variances.')
        prior_boxes = np.concatenate((prior_boxes, variances), axis=1)
        return prior_boxes
-
-
# Use the modified PriorBox class: one call per feature map.
img_size = (300, 300)
# The first feature map has no max_size.
default_box_layer1 = PriorBox(img_size, 30, None, aspect_ratios=[2], layer_shape=(38, 38)).compute_default_box()
default_box_layer2 = PriorBox(img_size, 60, 114, aspect_ratios=[2, 3], layer_shape=(19, 19)).compute_default_box()
default_box_layer3 = PriorBox(img_size, 114, 168, aspect_ratios=[2, 3], layer_shape=(10, 10)).compute_default_box()
default_box_layer4 = PriorBox(img_size, 168, 222, aspect_ratios=[2, 3], layer_shape=(5, 5)).compute_default_box()
default_box_layer5 = PriorBox(img_size, 222, 276, aspect_ratios=[2, 3], layer_shape=(3, 3)).compute_default_box()
default_box_layer6 = PriorBox(img_size, 276, 330, aspect_ratios=[2, 3], layer_shape=(1, 1)).compute_default_box()
# Concatenate the boxes of all layers.
default_box = np.concatenate((default_box_layer1, default_box_layer2, default_box_layer3,
                              default_box_layer4, default_box_layer5, default_box_layer6), axis=0)
# Write them to the pkl file.
import pickle

# The context manager makes sure the file is flushed and closed.
with open("default_box_information", "wb") as f:
    pickle.dump(default_box, f)
