Single Shot MultiBox Detector (SSD)
一、SSD重要參數設置
在ssd_vgg_300.py文件中初始化重要的網絡參數,主要有用於生成默認框的特征層,每層默認框的默認尺寸以及長寬比例:
# Copyright 2016 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Definition of 300 VGG-based SSD network.

This model was initially introduced in:
SSD: Single Shot MultiBox Detector
Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
Cheng-Yang Fu, Alexander C. Berg
https://arxiv.org/abs/1512.02325

Two variants of the model are defined: the 300x300 and 512x512 models, the
latter obtaining a slightly better accuracy on Pascal VOC.

Usage:
    with slim.arg_scope(ssd_vgg.ssd_vgg()):
        outputs, end_points = ssd_vgg.ssd_vgg(inputs)

This network is a port of the original Caffe model. The padding in TF and
Caffe is slightly different, and can lead to a severe accuracy drop if not
taken care of in a correct way!

In Caffe, the output size of convolution and pooling layers are computed as
following: h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1

Nevertheless, there is a subtle difference between both for stride > 1. In
the case of convolution:
    top_size = floor((bottom_size + 2*pad - kernel_size) / stride) + 1
whereas for pooling:
    top_size = ceil((bottom_size + 2*pad - kernel_size) / stride) + 1
Hence implicitly allowing some additional padding even if pad = 0. This
behaviour explains why pooling with stride and kernel of size 2 are behaving
the same way in TensorFlow and Caffe.

Nevertheless, this is not the case anymore for other kernel sizes, hence
motivating the use of special padding layer for controlling these side-effects.

@@ssd_vgg_300
"""
import math
from collections import namedtuple

import numpy as np
import tensorflow as tf

import tf_extended as tfe
from nets import custom_layers
from nets import ssd_common

slim = tf.contrib.slim


# =========================================================================== #
# SSD class definition.
# =========================================================================== #
# A namedtuple lets every field be read by index or by name (similar to a C
# struct): namedtuple(type_name, field_names).
SSDParams = namedtuple('SSDParameters', [
    'img_shape',            # Input image size.
    'num_classes',          # Number of classes + 1 (background).
    'no_annotation_label',  # Label forwarded to the box encoder for
                            # un-annotated objects (see bboxes_encode).
    'feat_layers',          # Names of the end points used for prediction.
    'feat_shapes',          # Spatial shape of each of these feature layers.
    'anchor_size_bounds',   # Lower/upper anchor size bounds, relative to the
                            # image size.
    'anchor_sizes',         # Reference anchor sizes of each layer, in pixels.
    'anchor_ratios',        # Extra aspect ratios of the anchors of each layer.
    'anchor_steps',         # Pixel stride of each feature layer in the image.
    'anchor_offset',        # Offset of the anchor centre inside a cell.
    'normalizations',       # Per layer: > 0 enables L2 normalisation.
    'prior_scaling',        # Scaling of the encoded box offsets, used when
                            # encoding / decoding boxes.
])


class SSDNet(object):
    """Implementation of the SSD VGG-based 300 network.

    The default features layers with 300x300 image input are:
      conv4 ==> 38 x 38
      conv7 ==> 19 x 19
      conv8 ==> 10 x 10
      conv9 ==> 5 x 5
      conv10 ==> 3 x 3
      conv11 ==> 1 x 1
    The default image size used to train this network is 300x300.
    """
    default_params = SSDParams(
        img_shape=(300, 300),
        num_classes=21,          # Number of classes + 1 (background).
        no_annotation_label=21,
        feat_layers=['block4', 'block7', 'block8',
                     'block9', 'block10', 'block11'],
        feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
        # Relative scale bounds of the default boxes. The author of the
        # original annotations notes that the paper uses [0.20, 0.90].
        anchor_size_bounds=[0.15, 0.90],
        # anchor_size_bounds=[0.20, 0.90],

        # Absolute anchor sizes in pixels of the input image, one pair per
        # feature layer; sizes grow with the depth of the layer. Together
        # with `anchor_ratios` this gives 4, 6, 6, 6, 4, 4 default boxes per
        # feature map cell (see `ssd_anchor_one_layer`).
        anchor_sizes=[(21., 45.),
                      (45., 99.),
                      (99., 153.),
                      (153., 207.),
                      (207., 261.),
                      (261., 315.)],
        # anchor_sizes=[(30., 60.),
        #               (60., 111.),
        #               (111., 162.),
        #               (162., 213.),
        #               (213., 264.),
        #               (264., 315.)],

        # Aspect ratios added on top of the two square (1:1) boxes built from
        # the size pair above: 2 + 2 = 4 or 2 + 4 = 6 boxes per cell.
        anchor_ratios=[[2, .5],
                       [2, .5, 3, 1./3],
                       [2, .5, 3, 1./3],
                       [2, .5, 3, 1./3],
                       [2, .5],
                       [2, .5]],
        # 8x38=304, 16x19=304, 32x10=320, 64x5=320, 100x3=300, 300x1=300.
        anchor_steps=[8, 16, 32, 64, 100, 300],
        anchor_offset=0.5,
        # Only the first feature layer (block4) has a positive value, hence
        # only that layer is L2-normalised in `ssd_multibox_layer`.
        normalizations=[20, -1, -1, -1, -1, -1],
        prior_scaling=[0.1, 0.1, 0.2, 0.2]
        )

    def __init__(self, params=None):
        """Init the SSD net with some parameters. Use the default ones
        if none provided (or if `params` is not an `SSDParams` instance).
        """
        if isinstance(params, SSDParams):
            self.params = params
        else:
            self.params = SSDNet.default_params

    # ======================================================================= #
    def net(self, inputs,
            is_training=True,
            update_feat_shapes=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
        """SSD network definition.

        Builds the graph with `ssd_net` using the stored parameters and
        returns its output tuple
        `(predictions, localisations, logits, end_points)`. When
        `update_feat_shapes` is true, `self.params.feat_shapes` is refreshed
        from the shapes of the prediction tensors.
        """
        r = ssd_net(inputs,
                    num_classes=self.params.num_classes,
                    feat_layers=self.params.feat_layers,
                    anchor_sizes=self.params.anchor_sizes,
                    anchor_ratios=self.params.anchor_ratios,
                    normalizations=self.params.normalizations,
                    is_training=is_training,
                    dropout_keep_prob=dropout_keep_prob,
                    prediction_fn=prediction_fn,
                    reuse=reuse,
                    scope=scope)
        # Update feature shapes (try at least!): r[0] is the list of
        # prediction tensors, whose static shapes give the real grid sizes.
        if update_feat_shapes:
            shapes = ssd_feat_shapes_from_net(r[0], self.params.feat_shapes)
            self.params = self.params._replace(feat_shapes=shapes)
        return r

    def arg_scope(self, weight_decay=0.0005, data_format='NHWC'):
        """Network arg_scope.

        `weight_decay` is the L2 regularisation coefficient; see
        `ssd_arg_scope`.
        """
        return ssd_arg_scope(weight_decay, data_format=data_format)

    def arg_scope_caffe(self, caffe_scope):
        """Caffe arg_scope used for weights importing.
        """
        return ssd_arg_scope_caffe(caffe_scope)

    # ======================================================================= #
    def update_feature_shapes(self, predictions):
        """Update feature shapes from predictions collection (Tensor or Numpy
        array).
        """
        shapes = ssd_feat_shapes_from_net(predictions, self.params.feat_shapes)
        self.params = self.params._replace(feat_shapes=shapes)

    def anchors(self, img_shape, dtype=np.float32):
        """Compute the default anchor boxes, given an image shape.

        Returns one `(y, x, h, w)` tuple per feature layer, as produced by
        `ssd_anchors_all_layers`.
        """
        return ssd_anchors_all_layers(img_shape,
                                      self.params.feat_shapes,
                                      self.params.anchor_sizes,
                                      self.params.anchor_ratios,
                                      self.params.anchor_steps,
                                      self.params.anchor_offset,
                                      dtype)

    def bboxes_encode(self, labels, bboxes, anchors,
                      scope=None):
        """Encode labels and bounding boxes.

        Delegates to `ssd_common.tf_ssd_bboxes_encode` with the stored class
        count, no-annotation label and prior scaling.
        """
        return ssd_common.tf_ssd_bboxes_encode(
            labels, bboxes, anchors,
            self.params.num_classes,
            self.params.no_annotation_label,
            ignore_threshold=0.5,
            prior_scaling=self.params.prior_scaling,
            scope=scope)

    def bboxes_decode(self, feat_localizations, anchors,
                      scope='ssd_bboxes_decode'):
        """Decode the predicted localisations, relatively to the anchors.

        Delegates to `ssd_common.tf_ssd_bboxes_decode` with the stored prior
        scaling.
        """
        return ssd_common.tf_ssd_bboxes_decode(
            feat_localizations, anchors,
            prior_scaling=self.params.prior_scaling,
            scope=scope)

    def detected_bboxes(self, predictions, localisations,
                        select_threshold=None, nms_threshold=0.5,
                        clipping_bbox=None, top_k=400, keep_top_k=200):
        """Get the detected bounding boxes from the SSD network output.

        Pipeline: per-class selection, sort keeping `top_k` entries, batched
        NMS keeping `keep_top_k` entries, then optional clipping to
        `clipping_bbox`. Returns `(rscores, rbboxes)`.
        """
        # Select the scores and boxes of every class.
        rscores, rbboxes = \
            ssd_common.tf_ssd_bboxes_select(predictions, localisations,
                                            select_threshold=select_threshold,
                                            num_classes=self.params.num_classes)
        # Sort and keep the top_k entries.
        rscores, rbboxes = \
            tfe.bboxes_sort(rscores, rbboxes, top_k=top_k)
        # Apply NMS algorithm.
        rscores, rbboxes = \
            tfe.bboxes_nms_batch(rscores, rbboxes,
                                 nms_threshold=nms_threshold,
                                 keep_top_k=keep_top_k)
        if clipping_bbox is not None:
            rbboxes = tfe.bboxes_clip(clipping_bbox, rbboxes)
        return rscores, rbboxes

    def losses(self, logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               scope='ssd_losses'):
        """Define the SSD network losses.

        Thin wrapper around `ssd_losses`, which performs hard negative
        mining with roughly `negative_ratio` negatives per positive anchor.
        """
        return ssd_losses(logits, localisations,
                          gclasses, glocalisations, gscores,
                          match_threshold=match_threshold,
                          negative_ratio=negative_ratio,
                          alpha=alpha,
                          label_smoothing=label_smoothing,
                          scope=scope)


# =========================================================================== #
# SSD tools...
# =========================================================================== #
def ssd_size_bounds_to_values(size_bounds,
                              n_feat_layers,
                              img_shape=(300, 300)):
    """Compute the reference sizes of the anchor boxes from relative bounds.
    The absolute values are measured in pixels, based on the network
    default size (300 pixels).

    This function follows the computation performed in the original
    implementation of SSD in Caffe.

    Return:
      list of list containing the absolute sizes at each scale. For each scale,
      the ratios only apply to the first value.
    """
    assert img_shape[0] == img_shape[1]

    img_size = img_shape[0]
    # Work in integer percents of the image size.
    min_ratio = int(size_bounds[0] * 100)
    max_ratio = int(size_bounds[1] * 100)
    step = int(math.floor((max_ratio - min_ratio) / (n_feat_layers - 2)))
    # Start with the following smallest sizes.
    sizes = [[img_size * size_bounds[0] / 2, img_size * size_bounds[0]]]
    for ratio in range(min_ratio, max_ratio + 1, step):
        sizes.append((img_size * ratio / 100.,
                      img_size * (ratio + step) / 100.))
    return sizes


def ssd_feat_shapes_from_net(predictions, default_shapes=None):
    """Try to obtain the feature shapes from the prediction layers. The latter
    can be either a Tensor or Numpy ndarray.

    Return:
      list of feature shapes. Default values if predictions shape not fully
      determined.
    """
    feat_shapes = []
    for pred in predictions:
        # Get the shape, from either a np array or a tensor.
        if isinstance(pred, np.ndarray):
            shape = pred.shape
        else:
            shape = pred.get_shape().as_list()
        # Drop the leading (batch) dimension and keep the next three.
        shape = shape[1:4]
        # Problem: undetermined shape... fall back on the defaults.
        if None in shape:
            return default_shapes
        feat_shapes.append(shape)
    return feat_shapes


def ssd_anchor_one_layer(img_shape,
                         feat_shape,
                         sizes,
                         ratios,
                         step,
                         offset=0.5,
                         dtype=np.float32):
    """Compute SSD default anchor boxes for one feature layer.

    Determine the relative position grid of the centers, and the relative
    width and height.

    Arguments:
      img_shape: Image shape, used for computing height, width relatively to
        the former;
      feat_shape: Feature shape, used for computing relative position grids;
      sizes: Absolute reference sizes (one or two values);
      ratios: Ratios to use on these features;
      step: Stride, in image pixels, between two feature map cells;
      offset: Grid offset.

    Return:
      y, x, h, w: Relative x and y grids, and height and width.
    """
    # Compute the position grid: simple way.
    # y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
    # y = (y.astype(dtype) + offset) / feat_shape[0]
    # x = (x.astype(dtype) + offset) / feat_shape[1]
    # Weird SSD-Caffe computation using steps values...
    # y[i, j] = i and x[i, j] = j: integer cell indices of the feature map.
    y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
    # Move to the cell centre, convert to image pixels with `step`, then
    # divide by the image size to get centre coordinates relative to it.
    y = (y.astype(dtype) + offset) * step / img_shape[0]
    x = (x.astype(dtype) + offset) * step / img_shape[1]

    # Expand dims to support easy broadcasting: (H, W) -> (H, W, 1).
    y = np.expand_dims(y, axis=-1)
    x = np.expand_dims(x, axis=-1)

    # Compute relative height and width.
    # Tries to follow the original implementation of SSD for the order.
    # E.g. sizes=(21., 45.) and ratios=[2, .5] give 2 + 2 = 4 anchors.
    num_anchors = len(sizes) + len(ratios)
    h = np.zeros((num_anchors, ), dtype=dtype)
    w = np.zeros((num_anchors, ), dtype=dtype)
    # Add first anchor boxes with ratio=1, e.g. h[0] = w[0] = 21 / 300.
    h[0] = sizes[0] / img_shape[0]
    w[0] = sizes[0] / img_shape[1]
    di = 1  # Index of the next free anchor slot.
    if len(sizes) > 1:
        # Extra square box of side sqrt(s_k * s_k+1),
        # e.g. h[1] = w[1] = sqrt(21 * 45) / 300.
        h[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[0]
        w[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[1]
        di += 1
    # Boxes with the other aspect ratios, all built from sizes[0],
    # e.g. r=2: h = 21 / 300 / sqrt(2) and w = 21 / 300 * sqrt(2).
    for i, r in enumerate(ratios):
        h[i+di] = sizes[0] / img_shape[0] / math.sqrt(r)
        w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r)
    return y, x, h, w


def ssd_anchors_all_layers(img_shape,
                           layers_shape,
                           anchor_sizes,
                           anchor_ratios,
                           anchor_steps,
                           offset=0.5,
                           dtype=np.float32):
    """Compute anchor boxes for all feature layers.

    Calls `ssd_anchor_one_layer` for every entry of `layers_shape` with the
    sizes, ratios and step of the same index, and returns the list of the
    resulting `(y, x, h, w)` tuples.
    """
    layers_anchors = []
    for i, s in enumerate(layers_shape):
        anchor_bboxes = ssd_anchor_one_layer(img_shape, s,
                                             anchor_sizes[i],
                                             anchor_ratios[i],
                                             anchor_steps[i],
                                             offset=offset, dtype=dtype)
        layers_anchors.append(anchor_bboxes)
    return layers_anchors


# =========================================================================== #
# Functional definition of VGG-based SSD 300.
# =========================================================================== #
def tensor_shape(x, rank=3):
    """Returns the dimensions of a tensor.
    Args:
      x: A N-D Tensor of shape.
      rank: Expected rank of `x`, used when its shape is not fully defined.
    Returns:
      A list of dimensions. Dimensions that are statically known are python
        integers, otherwise they are integer scalar tensors.
    """
    if x.get_shape().is_fully_defined():
        return x.get_shape().as_list()
    else:
        static_shape = x.get_shape().with_rank(rank).as_list()
        dynamic_shape = tf.unstack(tf.shape(x), rank)
        # Prefer the static value; fall back on the dynamic scalar tensor.
        return [s if s is not None else d
                for s, d in zip(static_shape, dynamic_shape)]


def ssd_multibox_layer(inputs,
                       num_classes,
                       sizes,
                       ratios=(1,),
                       normalization=-1,
                       bn_normalization=False):
    """Construct a multibox layer, return a class and localization predictions.

    Arguments:
      inputs: Feature layer the predictions are computed from;
      num_classes: Number of classes predicted for every anchor;
      sizes: Reference anchor sizes of this layer (only its length is used);
      ratios: Anchor aspect ratios of this layer (only its length is used);
      normalization: L2-normalise `inputs` first when > 0;
      bn_normalization: Unused.

    Return:
      cls_pred, loc_pred: Tensors whose last two dimensions are
        [num_anchors, num_classes] and [num_anchors, 4] respectively.
    """
    net = inputs
    if normalization > 0:
        net = custom_layers.l2_normalization(net, scaling=True)
    # Number of anchors.
    num_anchors = len(sizes) + len(ratios)

    # Location: 4 values per anchor, from a 3x3 convolution.
    num_loc_pred = num_anchors * 4
    loc_pred = slim.conv2d(net, num_loc_pred, [3, 3], activation_fn=None,
                           scope='conv_loc')
    loc_pred = custom_layers.channel_to_last(loc_pred)
    # Split the channel dimension into [num_anchors, 4].
    loc_pred = tf.reshape(loc_pred,
                          tensor_shape(loc_pred, 4)[:-1]+[num_anchors, 4])
    # Class prediction: num_classes values per anchor.
    num_cls_pred = num_anchors * num_classes
    cls_pred = slim.conv2d(net, num_cls_pred, [3, 3], activation_fn=None,
                           scope='conv_cls')
    cls_pred = custom_layers.channel_to_last(cls_pred)
    # Split the channel dimension into [num_anchors, num_classes].
    cls_pred = tf.reshape(cls_pred,
                          tensor_shape(cls_pred, 4)[:-1]+[num_anchors, num_classes])
    return cls_pred, loc_pred


def ssd_net(inputs,
            num_classes=SSDNet.default_params.num_classes,
            feat_layers=SSDNet.default_params.feat_layers,
            anchor_sizes=SSDNet.default_params.anchor_sizes,
            anchor_ratios=SSDNet.default_params.anchor_ratios,
            normalizations=SSDNet.default_params.normalizations,
            is_training=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
    """SSD net definition.

    Arguments:
      inputs: Input image batch;
      num_classes: Number of classes predicted for every anchor;
      feat_layers: Names of the end points fed to `ssd_multibox_layer`;
      anchor_sizes, anchor_ratios, normalizations: Per feature layer
        settings forwarded to `ssd_multibox_layer`;
      is_training: Enables dropout;
      dropout_keep_prob: Probability of keeping a unit in the dropout layers;
      prediction_fn: Function applied to the class logits;
      reuse, scope: Variable scope settings.

    Return:
      predictions, localisations, logits: Lists with one tensor per feature
        layer;
      end_points: Dict of the intermediate activations.
    """
    # if data_format == 'NCHW':
    #     inputs = tf.transpose(inputs, perm=(0, 3, 1, 2))

    # End_points collect relevant activations for external use.
    end_points = {}
    with tf.variable_scope(scope, 'ssd_300_vgg', [inputs], reuse=reuse):
        # Original VGG-16 blocks.
        # Block 1: 2 x (3x3 conv, 64 filters).
        net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
        end_points['block1'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool1')
        # Block 2: 2 x (3x3 conv, 128 filters).
        net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
        end_points['block2'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool2')
        # Block 3: 3 x (3x3 conv, 256 filters).
        net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
        end_points['block3'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool3')
        # Block 4: 3 x (3x3 conv, 512 filters).
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
        end_points['block4'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool4')
        # Block 5: 3 x (3x3 conv, 512 filters), then a 3x3 stride-1 pooling.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
        end_points['block5'] = net
        net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5')

        # Additional SSD blocks (no VGG fully connected layers here).
        # `tf.layers.dropout` expects the probability of DROPPING a unit,
        # hence the conversion from the keep probability.
        dropout_rate = 1. - dropout_keep_prob
        # Block 6: 3x3 dilated (atrous) convolution, rate 6.
        net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')
        end_points['block6'] = net
        net = tf.layers.dropout(net, rate=dropout_rate, training=is_training)
        # Block 7: 1x1 conv.
        net = slim.conv2d(net, 1024, [1, 1], scope='conv7')
        end_points['block7'] = net
        net = tf.layers.dropout(net, rate=dropout_rate, training=is_training)

        # Block 8/9/10/11: 1x1 and 3x3 convolutions stride 2 (except lasts).
        # Explicit padding + 'VALID' convolution is used for the strided
        # layers (see the module docstring about Caffe / TF padding).
        end_point = 'block8'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3',
                              padding='VALID')
        end_points[end_point] = net
        end_point = 'block9'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3',
                              padding='VALID')
        end_points[end_point] = net
        end_point = 'block10'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3',
                              padding='VALID')
        end_points[end_point] = net
        end_point = 'block11'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3',
                              padding='VALID')
        end_points[end_point] = net

        # Prediction and localisations layers.
        predictions = []
        logits = []
        localisations = []
        for i, layer in enumerate(feat_layers):
            with tf.variable_scope(layer + '_box'):
                # p: class logits, loc: box offsets, for every anchor of
                # every cell of this feature layer.
                p, loc = ssd_multibox_layer(end_points[layer],
                                            num_classes,
                                            anchor_sizes[i],
                                            anchor_ratios[i],
                                            normalizations[i])
            predictions.append(prediction_fn(p))
            logits.append(p)
            localisations.append(loc)

        return predictions, localisations, logits, end_points
ssd_net.default_image_size = 300


def ssd_arg_scope(weight_decay=0.0005, data_format='NHWC'):
    """Defines the VGG arg scope.

    Args:
      weight_decay: The l2 regularization coefficient.
      data_format: Data format given to the convolution, pooling and custom
        layers.

    Returns:
      An arg_scope.
    """
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        weights_regularizer=slim.l2_regularizer(weight_decay),
                        weights_initializer=tf.contrib.layers.xavier_initializer(),
                        biases_initializer=tf.zeros_initializer()):
        with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                            padding='SAME',
                            data_format=data_format):
            with slim.arg_scope([custom_layers.pad2d,
                                 custom_layers.l2_normalization,
                                 custom_layers.channel_to_last],
                                data_format=data_format) as sc:
                return sc


# =========================================================================== #
# Caffe scope: importing weights at initialization.
# =========================================================================== #
def ssd_arg_scope_caffe(caffe_scope):
    """Caffe scope definition.

    Args:
      caffe_scope: Caffe scope object with loaded weights.

    Returns:
      An arg_scope.
    """
    # Default network arg scope.
    with slim.arg_scope([slim.conv2d],
                        activation_fn=tf.nn.relu,
                        weights_initializer=caffe_scope.conv_weights_init(),
                        biases_initializer=caffe_scope.conv_biases_init()):
        with slim.arg_scope([slim.fully_connected],
                            activation_fn=tf.nn.relu):
            with slim.arg_scope([custom_layers.l2_normalization],
                                scale_initializer=caffe_scope.l2_norm_scale_init()):
                with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                                    padding='SAME') as sc:
                    return sc


# =========================================================================== #
# SSD loss function.
# =========================================================================== #
def ssd_losses(logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               device='/cpu:0',
               scope=None):
    """Loss functions for training the SSD 300 VGG network.

    Adds three terms to the TF losses collection: cross-entropy on the
    positive anchors, cross-entropy on the hard negative anchors and a
    smooth L1 localisation loss on the positive anchors (weighted by
    `alpha`). Each term is summed over the anchors and divided by the batch
    size. Nothing is returned.

    Arguments:
      logits: (list of) predictions logits Tensors;
      localisations: (list of) localisations Tensors;
      gclasses: (list of) groundtruth labels Tensors;
      glocalisations: (list of) groundtruth localisations Tensors;
      gscores: (list of) groundtruth score Tensors;
      match_threshold: Anchors with a score above it are positives;
      negative_ratio: Number of negatives kept per positive;
      alpha: Weight of the localisation loss;
      label_smoothing: Unused;
      device: Unused;
      scope: Optional name scope.
    """
    with tf.name_scope(scope, 'ssd_losses'):
        lshape = tfe.get_shape(logits[0], 5)
        num_classes = lshape[-1]
        batch_size = lshape[0]

        # Flatten out all vectors: one row per anchor, for every layer.
        flogits = []
        fgclasses = []
        fgscores = []
        flocalisations = []
        fglocalisations = []
        for i, layer_logits in enumerate(logits):
            flogits.append(tf.reshape(layer_logits, [-1, num_classes]))
            fgclasses.append(tf.reshape(gclasses[i], [-1]))
            fgscores.append(tf.reshape(gscores[i], [-1]))
            flocalisations.append(tf.reshape(localisations[i], [-1, 4]))
            fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4]))
        # And concatenate the layers together.
        logits = tf.concat(flogits, axis=0)
        gclasses = tf.concat(fgclasses, axis=0)
        gscores = tf.concat(fgscores, axis=0)
        localisations = tf.concat(flocalisations, axis=0)
        glocalisations = tf.concat(fglocalisations, axis=0)
        dtype = logits.dtype

        # Compute positive matching mask...
        pmask = gscores > match_threshold
        fpmask = tf.cast(pmask, dtype)
        n_positives = tf.reduce_sum(fpmask)

        # Hard negative mining: among the non-positive anchors, keep those
        # with the lowest predicted background probability (class 0), i.e.
        # the ones the classifier gets most wrong.
        no_classes = tf.cast(pmask, tf.int32)
        predictions = slim.softmax(logits)
        nmask = tf.logical_and(tf.logical_not(pmask),
                               gscores > -0.5)
        fnmask = tf.cast(nmask, dtype)
        # Background probability for the negatives, 1. for everything else
        # (so that other anchors rank last in the selection below).
        nvalues = tf.where(nmask,
                           predictions[:, 0],
                           1. - fnmask)
        nvalues_flat = tf.reshape(nvalues, [-1])
        # Number of negative entries to select.
        max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)
        n_neg = tf.cast(negative_ratio * n_positives, tf.int32) + batch_size
        n_neg = tf.minimum(n_neg, max_neg_entries)

        # top_k on the negated values yields the n_neg smallest background
        # probabilities; the last one is the selection threshold.
        val, _ = tf.nn.top_k(-nvalues_flat, k=n_neg)
        max_hard_pred = -val[-1]
        # Final negative mask.
        nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
        fnmask = tf.cast(nmask, dtype)

        # Add cross-entropy loss.
        with tf.name_scope('cross_entropy_pos'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=gclasses)
            loss = tf.div(tf.reduce_sum(loss * fpmask), batch_size, name='value')
            tf.losses.add_loss(loss)

        with tf.name_scope('cross_entropy_neg'):
            # `no_classes` is 0 (background) on every negative anchor.
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=no_classes)
            loss = tf.div(tf.reduce_sum(loss * fnmask), batch_size, name='value')
            tf.losses.add_loss(loss)

        # Add localization loss: smooth L1, L2, ...
        with tf.name_scope('localization'):
            # Weights Tensor: alpha on the positive anchors, 0 elsewhere.
            weights = tf.expand_dims(alpha * fpmask, axis=-1)
            loss = custom_layers.abs_smooth(localisations - glocalisations)
            loss = tf.div(tf.reduce_sum(loss * weights), batch_size, name='value')
            tf.losses.add_loss(loss)


def ssd_losses_old(logits, localisations,
                   gclasses, glocalisations, gscores,
                   match_threshold=0.5,
                   negative_ratio=3.,
                   alpha=1.,
                   label_smoothing=0.,
                   device='/cpu:0',
                   scope=None):
    """Loss functions for training the SSD 300 VGG network.

    This function defines the different loss components of the SSD, and
    adds them to the TF loss collection.

    Arguments:
      logits: (list of) predictions logits Tensors;
      localisations: (list of) localisations Tensors;
      gclasses: (list of) groundtruth labels Tensors;
      glocalisations: (list of) groundtruth localisations Tensors;
      gscores: (list of) groundtruth score Tensors;
    """
    with tf.device(device):
        with tf.name_scope(scope, 'ssd_losses'):
            l_cross_pos = []
            l_cross_neg = []
            l_loc = []
            for i in range(len(logits)):
                dtype = logits[i].dtype
                with tf.name_scope('block_%i' % i):
                    # Sizing weight...
                    wsize = tfe.get_shape(logits[i], rank=5)
                    wsize = wsize[1] * wsize[2] * wsize[3]

                    # Positive mask.
                    pmask = gscores[i] > match_threshold
                    fpmask = tf.cast(pmask, dtype)
                    n_positives = tf.reduce_sum(fpmask)

                    # Select some random negative entries.
                    # n_entries = np.prod(gclasses[i].get_shape().as_list())
                    # r_positive = n_positives / n_entries
                    # r_negative = negative_ratio * n_positives / (n_entries - n_positives)

                    # Negative mask.
                    no_classes = tf.cast(pmask, tf.int32)
                    predictions = slim.softmax(logits[i])
                    nmask = tf.logical_and(tf.logical_not(pmask),
                                           gscores[i] > -0.5)
                    fnmask = tf.cast(nmask, dtype)
                    nvalues = tf.where(nmask,
                                       predictions[:, :, :, :, 0],
                                       1. - fnmask)
                    nvalues_flat = tf.reshape(nvalues, [-1])
                    # Number of negative entries to select.
                    n_neg = tf.cast(negative_ratio * n_positives, tf.int32)
                    n_neg = tf.maximum(n_neg, tf.size(nvalues_flat) // 8)
                    n_neg = tf.maximum(n_neg, tf.shape(nvalues)[0] * 4)
                    max_neg_entries = 1 + tf.cast(tf.reduce_sum(fnmask), tf.int32)
                    n_neg = tf.minimum(n_neg, max_neg_entries)

                    val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
                    max_hard_pred = -val[-1]
                    # Final negative mask.
                    nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
                    fnmask = tf.cast(nmask, dtype)

                    # Add cross-entropy loss.
                    with tf.name_scope('cross_entropy_pos'):
                        fpmask = wsize * fpmask
                        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[i],
                                                                              labels=gclasses[i])
                        loss = tf.losses.compute_weighted_loss(loss, fpmask)
                        l_cross_pos.append(loss)

                    with tf.name_scope('cross_entropy_neg'):
                        fnmask = wsize * fnmask
                        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[i],
                                                                              labels=no_classes)
                        loss = tf.losses.compute_weighted_loss(loss, fnmask)
                        l_cross_neg.append(loss)

                    # Add localization loss: smooth L1, L2, ...
                    with tf.name_scope('localization'):
                        # Weights Tensor: positive mask + random negative.
                        # NOTE: `fpmask` has already been scaled by `wsize`
                        # in the 'cross_entropy_pos' scope above.
                        weights = tf.expand_dims(alpha * fpmask, axis=-1)
                        loss = custom_layers.abs_smooth(localisations[i] - glocalisations[i])
                        loss = tf.losses.compute_weighted_loss(loss, weights)
                        l_loc.append(loss)

            # Additional total losses...
            with tf.name_scope('total'):
                total_cross_pos = tf.add_n(l_cross_pos, 'cross_entropy_pos')
                total_cross_neg = tf.add_n(l_cross_neg, 'cross_entropy_neg')
                total_cross = tf.add(total_cross_pos, total_cross_neg, 'cross_entropy')
                total_loc = tf.add_n(l_loc, 'localization')

                # Add to EXTRA LOSSES TF.collection
                tf.add_to_collection('EXTRA_LOSSES', total_cross_pos)
                tf.add_to_collection('EXTRA_LOSSES', total_cross_neg)
                tf.add_to_collection('EXTRA_LOSSES', total_cross)
                tf.add_to_collection('EXTRA_LOSSES', total_loc)
custom_layers.py的代碼解析如下:
# Copyright 2015 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implement some custom layers, not provided by TensorFlow.

Trying to follow as much as possible the style/standards used in
tf.contrib.layers
"""
import tensorflow as tf

from tensorflow.contrib.framework.python.ops import add_arg_scope
from tensorflow.contrib.layers.python.layers import initializers
from tensorflow.contrib.framework.python.ops import variables
from tensorflow.contrib.layers.python.layers import utils
from tensorflow.python.ops import nn
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import variable_scope


def abs_smooth(x):
    """Smoothed absolute function. Useful to compute an L1 smooth error.

    Define as:
        x^2 / 2         if abs(x) < 1
        abs(x) - 0.5    if abs(x) >= 1
    The quadratic part keeps the function smooth around zero, while the
    linear part grows only linearly for large errors.

    We use here a differentiable definition using min(x) and abs(x). Clearly
    not optimal, but good enough for our purpose!

    Args:
      x: input Tensor.
    Returns:
      Tensor with the smooth L1 function applied element-wise.
    """
    absx = tf.abs(x)
    minx = tf.minimum(absx, 1)
    # abs(x) < 1:  minx == absx -> 0.5 * (absx^2 - absx + absx) = x^2 / 2
    # abs(x) >= 1: minx == 1    -> 0.5 * (absx - 1 + absx)      = abs(x) - 0.5
    r = 0.5 * ((absx - 1) * minx + absx)
    return r


# L2 normalization over the channel axis, with an optional learned scale.
@add_arg_scope
def l2_normalization(
        inputs,
        scaling=False,
        scale_initializer=init_ops.ones_initializer(),
        reuse=None,
        variables_collections=None,
        outputs_collections=None,
        data_format='NHWC',
        trainable=True,
        scope=None):
    """Implement L2 normalization on every feature (i.e. spatial normalization).

    Should be extended in some near future to other dimensions, providing a more
    flexible normalization framework.

    Args:
      inputs: a 4-D tensor with dimensions [batch_size, height, width, channels].
      scaling: whether or not to add a post scaling operation along the dimensions
        which have been normalized.
      scale_initializer: An initializer for the weights (defaults to ones).
      reuse: whether or not the layer and its variables should be reused. To be
        able to reuse the layer scope must be given.
      variables_collections: optional list of collections for all the variables or
        a dictionary containing a different list of collection per variable.
      outputs_collections: collection to add the outputs.
      data_format:  NHWC or NCHW data format.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for `variable_scope`.
    Returns:
      A `Tensor` representing the output of the operation.
    Raises:
      ValueError: if `data_format` is neither 'NHWC' nor 'NCHW'.
    """

    with variable_scope.variable_scope(
            scope, 'L2Normalization', [inputs], reuse=reuse) as sc:
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        dtype = inputs.dtype.base_dtype
        if data_format == 'NHWC':
            # norm_dim = tf.range(1, inputs_rank-1)
            # Normalize over the last axis, i.e. the channels.
            norm_dim = tf.range(inputs_rank-1, inputs_rank)
            params_shape = inputs_shape[-1:]
        elif data_format == 'NCHW':
            # norm_dim = tf.range(2, inputs_rank)
            # Normalize over axis 1, i.e. the channels.
            norm_dim = tf.range(1, 2)
            # NOTE(review): parenthesised single dimension, not a 1-tuple.
            params_shape = (inputs_shape[1])
        else:
            # Without this guard an unsupported format only surfaced later
            # as a NameError on `norm_dim`.
            raise ValueError('Unknown data_format: %r (expected NHWC or NCHW).'
                             % (data_format,))

        # Normalize along the channel axis; epsilon guards against a
        # division by zero.
        outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12)
        # Additional scaling: multiply by a per-channel variable 'gamma'.
        if scaling:
            scale_collections = utils.get_variable_collections(
                variables_collections, 'scale')
            scale = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=scale_initializer,
                                             collections=scale_collections,
                                             trainable=trainable)
            if data_format == 'NHWC':
                outputs = tf.multiply(outputs, scale)
            elif data_format == 'NCHW':
                # Append two trailing axes so the scale broadcasts over the
                # two dimensions that follow the channel axis.
                scale = tf.expand_dims(scale, axis=-1)
                scale = tf.expand_dims(scale, axis=-1)
                outputs = tf.multiply(outputs, scale)
                # outputs = tf.transpose(outputs, perm=(0, 2, 3, 1))

        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)


@add_arg_scope
def pad2d(inputs,
          pad=(0, 0),
          mode='CONSTANT',
          data_format='NHWC',
          trainable=True,
          scope=None):
    """2D Padding layer, adding a symmetric padding to H and W dimensions.

    Aims to mimic padding in Caffe and MXNet, helping the port of models to
    TensorFlow. Tries to follow the naming convention of `tf.contrib.layers`.

    Args:
      inputs: 4D input Tensor;
      pad: 2-Tuple with padding values for H and W dimensions;
      mode: Padding mode. C.f. `tf.pad`
      data_format:  NHWC or NCHW data format.
      trainable: not used by this layer;
      scope: optional name scope (defaults to 'pad2d').
    Returns:
      The padded Tensor.
    Raises:
      ValueError: if `data_format` is neither 'NHWC' nor 'NCHW'.
    """
    with tf.name_scope(scope, 'pad2d', [inputs]):
        # Padding shape: the same amount before and after on H and on W.
        if data_format == 'NHWC':
            paddings = [[0, 0], [pad[0], pad[0]], [pad[1], pad[1]], [0, 0]]
        elif data_format == 'NCHW':
            paddings = [[0, 0], [0, 0], [pad[0], pad[0]], [pad[1], pad[1]]]
        else:
            # Previously failed later with an UnboundLocalError on `paddings`.
            raise ValueError('Unknown data_format: %r (expected NHWC or NCHW).'
                             % (data_format,))
        net = tf.pad(inputs, paddings, mode=mode)
        return net


# Returns the input with the channel axis moved to the last position.
@add_arg_scope
def channel_to_last(inputs,
                    data_format='NHWC',
                    scope=None):
    """Move the channel axis to the last dimension. Allows to
    provide a single output format whatever the input data format.

    Args:
      inputs: Input Tensor;
      data_format: NHWC or NCHW.
      scope: optional name scope (defaults to 'channel_to_last').
    Return:
      Input in NHWC format.
    Raises:
      ValueError: if `data_format` is neither 'NHWC' nor 'NCHW'.
    """
    with tf.name_scope(scope, 'channel_to_last', [inputs]):
        if data_format == 'NHWC':
            net = inputs
        elif data_format == 'NCHW':
            net = tf.transpose(inputs, perm=(0, 2, 3, 1))
        else:
            # Previously failed later with an UnboundLocalError on `net`.
            raise ValueError('Unknown data_format: %r (expected NHWC or NCHW).'
                             % (data_format,))
        return net
ssd_common.py
# Copyright 2015 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Shared function between different SSD implementations.
"""
import numpy as np
import tensorflow as tf
import tf_extended as tfe


# =========================================================================== #
# TensorFlow implementation of boxes SSD encoding / decoding.
# =========================================================================== #
def tf_ssd_bboxes_encode_layer(labels,
                               bboxes,
                               anchors_layer,
                               num_classes,
                               no_annotation_label,
                               ignore_threshold=0.5,
                               prior_scaling=(0.1, 0.1, 0.2, 0.2),
                               dtype=tf.float32):
    """Encode groundtruth labels and bounding boxes using SSD anchors from
    one layer.

    Every groundtruth box is compared with all anchors of the layer; an
    anchor takes the label and coordinates of the box with which it has the
    highest Jaccard score seen so far. The assigned boxes are then expressed
    as offsets relative to the anchors, so the regression targets are
    transforms of the anchors rather than absolute boxes.

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates, read as
        (ymin, xmin, ymax, xmax);
      anchors_layer: layer anchors, unpacked as (y, x, h, w) where y and x
        are the anchor centers;
      num_classes: boxes whose label is not below this value are ignored;
      no_annotation_label: only referenced by commented-out code;
      ignore_threshold: only referenced by commented-out code;
      prior_scaling: scaling of the encoded coordinates, in the order
        (cx, cy, w, h) -- the same order `tf_ssd_bboxes_decode_layer` uses;
      dtype: float type of the score and coordinate tensors.
    Return:
      (target_labels, target_localizations, target_scores): Target Tensors.
    """
    # Anchors coordinates and volume.
    # NOTE(review): per the original annotation, for a 38x38 layer with four
    # anchors per cell yref/xref are (38, 38, 1) and href/wref are (4,).
    yref, xref, href, wref = anchors_layer
    ymin = yref - href / 2.
    xmin = xref - wref / 2.
    ymax = yref + href / 2.
    xmax = xref + wref / 2.
    vol_anchors = (xmax - xmin) * (ymax - ymin)

    # Initialize tensors: one entry per feature-map cell and per anchor.
    shape = (yref.shape[0], yref.shape[1], href.size)
    feat_labels = tf.zeros(shape, dtype=tf.int64)
    feat_scores = tf.zeros(shape, dtype=dtype)

    # Assigned box corners start as (0, 0, 1, 1).
    feat_ymin = tf.zeros(shape, dtype=dtype)
    feat_xmin = tf.zeros(shape, dtype=dtype)
    feat_ymax = tf.ones(shape, dtype=dtype)
    feat_xmax = tf.ones(shape, dtype=dtype)

    def jaccard_with_anchors(bbox):
        """Compute jaccard score between a box and the anchors.
        """
        # Corners of the intersection rectangle.
        int_ymin = tf.maximum(ymin, bbox[0])
        int_xmin = tf.maximum(xmin, bbox[1])
        int_ymax = tf.minimum(ymax, bbox[2])
        int_xmax = tf.minimum(xmax, bbox[3])
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)
        # Volumes.
        inter_vol = h * w
        # Parenthesised instead of a backslash continuation: a comment after
        # a trailing backslash is a syntax error.
        union_vol = (vol_anchors - inter_vol
                     + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]))
        jaccard = tf.div(inter_vol, union_vol)
        return jaccard

    def intersection_with_anchors(bbox):
        """Compute intersection between score a box and the anchors.

        Returns the intersection area divided by the anchor area. Only
        referenced by the commented-out code in `body`.
        """
        int_ymin = tf.maximum(ymin, bbox[0])
        int_xmin = tf.maximum(xmin, bbox[1])
        int_ymax = tf.minimum(ymax, bbox[2])
        int_xmax = tf.minimum(xmax, bbox[3])
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)
        inter_vol = h * w
        scores = tf.div(inter_vol, vol_anchors)
        return scores

    def condition(i, feat_labels, feat_scores,
                  feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Condition: check label index.
        """
        # Loop while i is below the number of labels; `body` increments i.
        r = tf.less(i, tf.shape(labels))
        return r[0]

    def body(i, feat_labels, feat_scores,
             feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Body: update feature labels, scores and bboxes.
        Follow the original SSD paper for that purpose:
          - assign values when jaccard > 0.5;
          - only update if beat the score of other bboxes.

        NOTE: the `jaccard > threshold` test is commented out below, so an
        anchor is updated whenever the new score beats its current one.
        """
        # Jaccard score.
        label = labels[i]
        bbox = bboxes[i]
        jaccard = jaccard_with_anchors(bbox)
        # Mask: check threshold + scores + no annotations + num_classes.
        mask = tf.greater(jaccard, feat_scores)
        # mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold))
        mask = tf.logical_and(mask, feat_scores > -0.5)
        mask = tf.logical_and(mask, label < num_classes)
        imask = tf.cast(mask, tf.int64)
        fmask = tf.cast(mask, dtype)
        # Update values using mask: masked anchors take this box's label,
        # score and corners; the others keep their previous values.
        feat_labels = imask * label + (1 - imask) * feat_labels
        feat_scores = tf.where(mask, jaccard, feat_scores)

        feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin
        feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
        feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
        feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax

        # Check no annotation label: ignore these anchors...
        # interscts = intersection_with_anchors(bbox)
        # mask = tf.logical_and(interscts > ignore_threshold,
        #                       label == no_annotation_label)
        # # Replace scores by -1.
        # feat_scores = tf.where(mask, -tf.cast(mask, dtype), feat_scores)

        return [i+1, feat_labels, feat_scores,
                feat_ymin, feat_xmin, feat_ymax, feat_xmax]
    # Main loop definition.
    i = 0
    [i, feat_labels, feat_scores,
     feat_ymin, feat_xmin,
     feat_ymax, feat_xmax] = tf.while_loop(condition, body,
                                           [i, feat_labels, feat_scores,
                                            feat_ymin, feat_xmin,
                                            feat_ymax, feat_xmax])
    # Transform to center / size.
    feat_cy = (feat_ymax + feat_ymin) / 2.
    feat_cx = (feat_xmax + feat_xmin) / 2.
    feat_h = feat_ymax - feat_ymin
    feat_w = feat_xmax - feat_xmin
    # Encode features: center offsets divided by the anchor size, log of the
    # size ratios, each divided by its scaling factor. The scaling indices
    # follow the output order (cx, cy, w, h) so that
    # `tf_ssd_bboxes_decode_layer` is the exact inverse; the results are
    # unchanged for the default scaling.
    feat_cy = (feat_cy - yref) / href / prior_scaling[1]
    feat_cx = (feat_cx - xref) / wref / prior_scaling[0]
    feat_h = tf.log(feat_h / href) / prior_scaling[3]
    feat_w = tf.log(feat_w / wref) / prior_scaling[2]
    # Use SSD ordering: x / y / w / h instead of ours.
    feat_localizations = tf.stack([feat_cx, feat_cy, feat_w, feat_h], axis=-1)
    return feat_labels, feat_localizations, feat_scores


def tf_ssd_bboxes_encode(labels,
                         bboxes,
                         anchors,
                         num_classes,
                         no_annotation_label,
                         ignore_threshold=0.5,
                         prior_scaling=(0.1, 0.1, 0.2, 0.2),
                         dtype=tf.float32,
                         scope='ssd_bboxes_encode'):
    """Encode groundtruth labels and bounding boxes using SSD net anchors.
    Encoding boxes for all feature layers.

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates;
      anchors: List of layer anchors, one (y, x, h, w) entry per layer;
      num_classes, no_annotation_label, ignore_threshold, prior_scaling,
        dtype: forwarded unchanged to `tf_ssd_bboxes_encode_layer`;
      scope: name scope.
    Return:
      (target_labels, target_localizations, target_scores):
        Each element is a list of target Tensors, one per layer.
    """
    with tf.name_scope(scope):
        target_labels = []
        target_localizations = []
        target_scores = []
        for i, anchors_layer in enumerate(anchors):
            with tf.name_scope('bboxes_encode_block_%i' % i):
                t_labels, t_loc, t_scores = \
                    tf_ssd_bboxes_encode_layer(labels, bboxes, anchors_layer,
                                               num_classes, no_annotation_label,
                                               ignore_threshold,
                                               prior_scaling, dtype)
                target_labels.append(t_labels)
                target_localizations.append(t_loc)
                target_scores.append(t_scores)
        return target_labels, target_localizations, target_scores


def tf_ssd_bboxes_decode_layer(feat_localizations,
                               anchors_layer,
                               prior_scaling=(0.1, 0.1, 0.2, 0.2)):
    """Compute the relative bounding boxes from the layer features and
    reference anchor bounding boxes.

    Inverse of the encoding: turns offsets relative to the anchors back
    into box corners.

    Arguments:
      feat_localizations: rank-5 Tensor containing localization features,
        the last axis ordered (cx, cy, w, h).
      anchors_layer: layer anchors, unpacked as (y, x, h, w).
      prior_scaling: scaling of the encoded coordinates, in the order
        (cx, cy, w, h).
    Return:
      Tensor whose last axis is (ymin, xmin, ymax, xmax).
    """
    yref, xref, href, wref = anchors_layer

    # Compute center, height and width
    cx = feat_localizations[:, :, :, :, 0] * wref * prior_scaling[0] + xref
    cy = feat_localizations[:, :, :, :, 1] * href * prior_scaling[1] + yref
    w = wref * tf.exp(feat_localizations[:, :, :, :, 2] * prior_scaling[2])
    h = href * tf.exp(feat_localizations[:, :, :, :, 3] * prior_scaling[3])
    # Boxes coordinates.
    ymin = cy - h / 2.
    xmin = cx - w / 2.
    ymax = cy + h / 2.
    xmax = cx + w / 2.
    bboxes = tf.stack([ymin, xmin, ymax, xmax], axis=-1)
    return bboxes


def tf_ssd_bboxes_decode(feat_localizations,
                         anchors,
                         prior_scaling=(0.1, 0.1, 0.2, 0.2),
                         scope='ssd_bboxes_decode'):
    """Compute the relative bounding boxes from the SSD net features and
    reference anchors bounding boxes.

    Arguments:
      feat_localizations: List of Tensors containing localization features.
      anchors: List of numpy array containing anchor boxes.
      prior_scaling: scaling forwarded to `tf_ssd_bboxes_decode_layer`.
      scope: name scope.
    Return:
      List of Tensors Nx4: ymin, xmin, ymax, xmax
    """
    with tf.name_scope(scope):
        bboxes = []
        for i, anchors_layer in enumerate(anchors):
            bboxes.append(
                tf_ssd_bboxes_decode_layer(feat_localizations[i],
                                           anchors_layer,
                                           prior_scaling))
        return bboxes


# =========================================================================== #
# SSD boxes selection.
# =========================================================================== #
def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer,
                               select_threshold=None,
                               num_classes=21,
                               ignore_class=0,
                               scope=None):
    """Extract classes, scores and bounding boxes from features in one layer.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_layer: A SSD prediction layer;
      localizations_layer: A SSD localization layer;
      select_threshold: Classification threshold for selecting a box. All boxes
        under the threshold are set to 'zero'. If None, no threshold applied.
      num_classes: number of classes iterated over (0 .. num_classes - 1);
      ignore_class: class index that is skipped;
      scope: optional name scope.
    Return:
      d_scores, d_bboxes: Dictionary of scores and bboxes Tensors of
        size Batches X N x 1 | 4. Each key corresponding to a class.
    """
    select_threshold = 0.0 if select_threshold is None else select_threshold
    with tf.name_scope(scope, 'ssd_bboxes_select_layer',
                       [predictions_layer, localizations_layer]):
        # Reshape features: Batches x N x N_labels | 4
        p_shape = tfe.get_shape(predictions_layer)
        predictions_layer = tf.reshape(predictions_layer,
                                       tf.stack([p_shape[0], -1, p_shape[-1]]))
        l_shape = tfe.get_shape(localizations_layer)
        localizations_layer = tf.reshape(localizations_layer,
                                         tf.stack([l_shape[0], -1, l_shape[-1]]))

        d_scores = {}
        d_bboxes = {}
        for c in range(0, num_classes):
            if c != ignore_class:
                # Remove boxes under the threshold: scores and boxes below
                # it are multiplied by zero rather than dropped.
                scores = predictions_layer[:, :, c]
                fmask = tf.cast(tf.greater_equal(scores, select_threshold), scores.dtype)
                scores = scores * fmask
                bboxes = localizations_layer * tf.expand_dims(fmask, axis=-1)
                # Append to dictionary.
                d_scores[c] = scores
                d_bboxes[c] = bboxes

        return d_scores, d_bboxes


def tf_ssd_bboxes_select(predictions_net, localizations_net,
                         select_threshold=None,
                         num_classes=21,
                         ignore_class=0,
                         scope=None):
    """Extract classes, scores and bounding boxes from network output layers.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_net: List of SSD prediction layers;
      localizations_net: List of localization layers;
      select_threshold: Classification threshold for selecting a box. All boxes
        under the threshold are set to 'zero'. If None, no threshold applied.
      num_classes, ignore_class: forwarded to `tf_ssd_bboxes_select_layer`;
      scope: optional name scope.
    Return:
      d_scores, d_bboxes: Dictionary of scores and bboxes Tensors of
        size Batches X N x 1 | 4. Each key corresponding to a class.
    """
    with tf.name_scope(scope, 'ssd_bboxes_select',
                       [predictions_net, localizations_net]):
        l_scores = []
        l_bboxes = []
        for i, predictions_layer in enumerate(predictions_net):
            scores, bboxes = tf_ssd_bboxes_select_layer(predictions_layer,
                                                        localizations_net[i],
                                                        select_threshold,
                                                        num_classes,
                                                        ignore_class)
            l_scores.append(scores)
            l_bboxes.append(bboxes)
        # Concat results: per class, join the layers along axis 1.
        d_scores = {}
        d_bboxes = {}
        for c in l_scores[0]:
            ls = [s[c] for s in l_scores]
            lb = [b[c] for b in l_bboxes]
            d_scores[c] = tf.concat(ls, axis=1)
            d_bboxes[c] = tf.concat(lb, axis=1)
        return d_scores, d_bboxes


def tf_ssd_bboxes_select_layer_all_classes(predictions_layer, localizations_layer,
                                           select_threshold=None):
    """Extract classes, scores and bounding boxes from features in one layer.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_layer: A SSD prediction layer;
      localizations_layer: A SSD localization layer;
      select_threshold: Classification threshold for selecting a box. If None,
        select boxes whose classification score is higher than 'no class'.
    Return:
      classes, scores, bboxes: Input Tensors.
    """
    # Reshape features: Batches x N x N_labels | 4
    p_shape = tfe.get_shape(predictions_layer)
    predictions_layer = tf.reshape(predictions_layer,
                                   tf.stack([p_shape[0], -1, p_shape[-1]]))
    l_shape = tfe.get_shape(localizations_layer)
    localizations_layer = tf.reshape(localizations_layer,
                                     tf.stack([l_shape[0], -1, l_shape[-1]]))
    # Boxes selection: use threshold or score > no-label criteria.
    if select_threshold is None or select_threshold == 0:
        # Class prediction and scores: assign 0. to 0-class
        classes = tf.argmax(predictions_layer, axis=2)
        scores = tf.reduce_max(predictions_layer, axis=2)
        scores = scores * tf.cast(classes > 0, scores.dtype)
    else:
        # Best class among indices 1..; the +1 restores the original index.
        sub_predictions = predictions_layer[:, :, 1:]
        classes = tf.argmax(sub_predictions, axis=2) + 1
        scores = tf.reduce_max(sub_predictions, axis=2)
        # Only keep predictions higher than threshold.
        mask = tf.greater(scores, select_threshold)
        classes = classes * tf.cast(mask, classes.dtype)
        scores = scores * tf.cast(mask, scores.dtype)
    # Assume localization layer already decoded.
    bboxes = localizations_layer
    return classes, scores, bboxes


def tf_ssd_bboxes_select_all_classes(predictions_net, localizations_net,
                                     select_threshold=None,
                                     scope=None):
    """Extract classes, scores and bounding boxes from network output layers.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_net: List of SSD prediction layers;
      localizations_net: List of localization layers;
      select_threshold: Classification threshold for selecting a box. If None,
        select boxes whose classification score is higher than 'no class'.
      scope: optional name scope.
    Return:
      classes, scores, bboxes: Tensors, the per-layer results concatenated
        along axis 1.
    """
    with tf.name_scope(scope, 'ssd_bboxes_select',
                       [predictions_net, localizations_net]):
        l_classes = []
        l_scores = []
        l_bboxes = []
        for i, predictions_layer in enumerate(predictions_net):
            classes, scores, bboxes = \
                tf_ssd_bboxes_select_layer_all_classes(predictions_layer,
                                                       localizations_net[i],
                                                       select_threshold)
            l_classes.append(classes)
            l_scores.append(scores)
            l_bboxes.append(bboxes)

        classes = tf.concat(l_classes, axis=1)
        scores = tf.concat(l_scores, axis=1)
        bboxes = tf.concat(l_bboxes, axis=1)
        return classes, scores, bboxes