(一)test_single_image.py
默認輸入圖片尺寸為[416,416]。
# coding: utf-8
from __future__ import division, print_function
import tensorflow as tf
import numpy as np
import argparse
import cv2
from utils.misc_utils import parse_anchors, read_class_names
from utils.nms_utils import gpu_nms
from utils.plot_utils import get_color_table, plot_one_box
from model import yolov3
# Command-line interface: one positional image path plus optional overrides
# for the anchor file, network input size, class-name file and checkpoint.
parser = argparse.ArgumentParser(description="YOLO-V3 test single image test procedure.")
parser.add_argument("input_image", type=str,
                    help="The path of the input image.")
parser.add_argument("--anchor_path", type=str, default="./data/yolo_anchors.txt",
                    help="The path of the anchor txt file.")
parser.add_argument("--new_size", nargs='*', type=int, default=[416, 416],
                    help="Resize the input image with `new_size`, size format: [width, height]")
parser.add_argument("--class_name_path", type=str, default="./data/coco.names",
                    help="The path of the class names.")
parser.add_argument("--restore_path", type=str, default="./data/darknet_weights/yolov3.ckpt",
                    help="The path of the weights to restore.")
args = parser.parse_args()

# `nargs='*'` accepts any number of values, but everything below indexes
# new_size[0] (width) and new_size[1] (height), so reject other counts early.
if len(args.new_size) != 2:
    parser.error("--new_size expects exactly two integers: width height")

# Anchors come from the anchor file; the last dimension is [width, height].
args.anchors = parse_anchors(args.anchor_path)
# List of class names, and the number of classes derived from it.
args.classes = read_class_names(args.class_name_path)
args.num_class = len(args.classes)
# One drawing colour per class.
color_table = get_color_table(args.num_class)

# Load the image. cv2.imread signals failure by returning None instead of raising,
# so check explicitly rather than crashing later on `.shape`.
img_ori = cv2.imread(args.input_image)
if img_ori is None:
    raise IOError("Failed to read image: {}".format(args.input_image))
height_ori, width_ori = img_ori.shape[:2]

# Pre-process: resize to the network input size ([width, height] order),
# BGR -> RGB, float32, add a batch axis and scale pixel values to [0, 1].
img = cv2.resize(img_ori, tuple(args.new_size))
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = np.asarray(img, np.float32)
img = img[np.newaxis, :] / 255.

with tf.Session() as sess:
    # Placeholder for a single image, laid out as [1, height, width, 3].
    input_data = tf.placeholder(tf.float32, [1, args.new_size[1], args.new_size[0], 3], name='input_data')
    yolo_model = yolov3(args.num_class, args.anchors)
    with tf.variable_scope('yolov3'):
        # Forward pass (is_training=False) producing the raw feature maps.
        pred_feature_maps = yolo_model.forward(input_data, False)
    # Decode the feature maps into boxes, objectness confidences and class probabilities.
    pred_boxes, pred_confs, pred_probs = yolo_model.predict(pred_feature_maps)

    # Final per-class score = objectness confidence * class probability.
    pred_scores = pred_confs * pred_probs

    # Non-maximum suppression yields the final boxes with their scores and labels.
    boxes, scores, labels = gpu_nms(pred_boxes, pred_scores, args.num_class, max_boxes=30, score_thresh=0.4, nms_thresh=0.5)

    # Restore the trained weights from the checkpoint.
    saver = tf.train.Saver()
    saver.restore(sess, args.restore_path)

    # Evaluate the graph to get concrete arrays for the NMS results.
    boxes_, scores_, labels_ = sess.run([boxes, scores, labels], feed_dict={input_data: img})

# Boxes were predicted on the resized image; map them back to the original
# image. Columns 0 and 2 are x coordinates, columns 1 and 3 are y coordinates.
scale_x = width_ori / float(args.new_size[0])
scale_y = height_ori / float(args.new_size[1])
boxes_[:, [0, 2]] *= scale_x
boxes_[:, [1, 3]] *= scale_y

print("box coords:")
print(boxes_)
print('*' * 30)
print("scores:")
print(scores_)
print('*' * 30)
print("labels:")
print(labels_)

# Draw every detection on the original image, then show and save the result.
for box, label in zip(boxes_, labels_):
    x0, y0, x1, y1 = box
    plot_one_box(img_ori, [x0, y0, x1, y1], label=args.classes[label], color=color_table[label])
cv2.imshow('Detection result', img_ori)
cv2.imwrite('detection_result.jpg', img_ori)
cv2.waitKey(0)
(二)get_kmeans.py
這里函數的主要作用是使用kmeans聚類產生若干個anchors中心,在訓練的時候使用這些作為一種先驗條件。這里的聚類主要是對目標檢測框的尺寸進行聚類。
# coding: utf-8
# This script is modified from https://github.com/lars76/kmeans-anchor-boxes
from __future__ import division, print_function
import numpy as np
# IoU between one box size and several cluster-centre sizes. Only sizes are
# compared, i.e. every rectangle is treated as if anchored at the origin.
def iou(box, clusters):
    """Calculate the Intersection over Union (IoU) between a box and k clusters.

    param:
        box: tuple or array holding (width, height) of a box shifted to the origin
        clusters: numpy array of shape (k, 2), one (width, height) row per cluster
    return:
        numpy array of shape (k,) with the IoU of `box` against every cluster
    raises:
        ValueError: if any overlap width or height is exactly zero
    """
    overlap_w = np.minimum(clusters[:, 0], box[0])
    overlap_h = np.minimum(clusters[:, 1], box[1])
    if (overlap_w == 0).any() or (overlap_h == 0).any():
        raise ValueError("Box has no area")

    overlap_area = overlap_w * overlap_h
    area_of_box = box[0] * box[1]
    area_of_clusters = clusters[:, 0] * clusters[:, 1]

    # The small constant keeps the denominator strictly positive.
    union_area = area_of_box + area_of_clusters - overlap_area + 1e-10
    return overlap_area / union_area
def avg_iou(boxes, clusters):
    """Calculate the average best IoU between a set of boxes and k clusters.

    param:
        boxes: numpy array of shape (r, 2), where r is the number of rows
        clusters: numpy array of shape (k, 2), where k is the number of clusters
    return:
        the mean, over all boxes, of each box's highest IoU with any cluster
    """
    best_per_box = []
    for single_box in boxes:
        best_per_box.append(np.max(iou(single_box, clusters)))
    return np.mean(best_per_box)
# NOTE: this helper is not called by any code visible in this file.
def translate_boxes(boxes):
    """Translate all the boxes to the origin.

    param:
        boxes: numpy array of shape (r, 4); columns 0/1 and 2/3 are two corner points
    return:
        numpy array of shape (r, 2) holding |col2 - col0| and |col3 - col1| per row
    """
    shifted = boxes.copy()  # work on a copy so the caller's array is untouched
    shifted[:, 2] = np.abs(shifted[:, 2] - shifted[:, 0])
    shifted[:, 3] = np.abs(shifted[:, 3] - shifted[:, 1])
    # Drop the two corner columns, leaving only the extents.
    return np.delete(shifted, [0, 1], axis=1)
def kmeans(boxes, k, dist=np.median, seed=None):
    """Run k-means clustering on box sizes with (1 - IoU) as the distance.

    param:
        boxes: numpy array of shape (r, 2) holding (width, height) rows
        k: number of clusters
        dist: reduction used to recompute a cluster centre from its members;
            called as dist(members, axis=0). Defaults to the median.
        seed: optional seed for the initial centre selection. None (default)
            draws fresh entropy; an int makes the run reproducible.
    return:
        numpy array of shape (k, 2) with the cluster centres
    raises:
        ValueError: "Box has no area" if any box or centre has a zero width or
            height; also raised by the sampler when k exceeds the number of boxes.
    """
    rows = boxes.shape[0]

    # A private generator is used so the process-wide numpy RNG is left alone.
    rng = np.random.RandomState(seed)

    # Forgy initialisation: pick k distinct boxes as the starting centres.
    # (It will fail to separate anything if the whole array contains the same rows.)
    clusters = boxes[rng.choice(rows, k, replace=False)]

    # Column vectors so they broadcast against the (k,) cluster arrays to (r, k).
    box_w = boxes[:, 0:1]
    box_h = boxes[:, 1:2]
    box_area = box_w * box_h

    # -1 is never a valid cluster index, so the first pass can never be mistaken
    # for convergence and the centres are always updated at least once.
    last_clusters = np.full(rows, -1)

    while True:
        # Distance of every box to every centre is (1 - IoU): the larger the
        # IoU, the closer the box is to that centre.
        inter_w = np.minimum(clusters[:, 0], box_w)
        inter_h = np.minimum(clusters[:, 1], box_h)
        if np.any(inter_w == 0) or np.any(inter_h == 0):
            raise ValueError("Box has no area")
        intersection = inter_w * inter_h
        cluster_area = clusters[:, 0] * clusters[:, 1]
        distances = 1 - intersection / (box_area + cluster_area - intersection + 1e-10)

        # Assign each box to its nearest centre.
        nearest_clusters = np.argmin(distances, axis=1)

        # Converged once no box changes cluster between two consecutive passes.
        if (last_clusters == nearest_clusters).all():
            break

        # Recompute each centre from its members. A cluster that lost all of its
        # members keeps its previous centre; reducing an empty set would yield NaN.
        for cluster in range(k):
            members = boxes[nearest_clusters == cluster]
            if len(members) > 0:
                clusters[cluster] = dist(members, axis=0)

        last_clusters = nearest_clusters

    return clusters
def parse_anno(annotation_path):
    """Collect the (width, height) of every labelled box in an annotation file.

    Each line is whitespace separated: the first two fields (described by the
    original author as an index and an image path) are skipped, followed by
    groups of five fields per box: label, x_min, y_min, x_max, y_max.

    param:
        annotation_path: path of the annotation text file
    return:
        numpy array of shape (n, 2) with one [width, height] row per box;
        shape (0, 2) when the file contains no boxes
    """
    result = []
    # The context manager guarantees the file is closed even if parsing fails.
    with open(annotation_path, 'r') as anno:
        for line in anno:
            # split() without an argument tolerates repeated spaces and tabs.
            fields = line.split()[2:]
            box_cnt = len(fields) // 5
            for i in range(box_cnt):
                # fields[i * 5] is the class label; the four coordinates follow it.
                x_min, y_min, x_max, y_max = (float(v) for v in fields[i * 5 + 1:i * 5 + 5])
                width = x_max - x_min
                height = y_max - y_min
                # Kept as assertions so the exception type seen by callers is unchanged.
                assert width > 0
                assert height > 0
                result.append([width, height])
    # reshape keeps the documented (n, 2) layout even when no box was found.
    return np.asarray(result, dtype=np.float64).reshape(-1, 2)
def get_kmeans(anno, cluster_num=9):
    """Cluster the box sizes into anchors and report how well they fit.

    param:
        anno: numpy array of shape (n, 2) with [width, height] rows
        cluster_num: number of anchors to produce
    return:
        (anchors, ave_iou): anchors is a list of [width, height] integer pairs
        sorted by ascending area; ave_iou is the average IoU measured against
        the centres before they were truncated to integers.
    """
    centres = kmeans(anno, cluster_num)
    mean_iou = avg_iou(anno, centres)

    # Truncate to integers, then order the anchors from smallest to largest area.
    int_centres = centres.astype('int').tolist()
    int_centres.sort(key=lambda wh: wh[0] * wh[1])
    return int_centres, mean_iou
if __name__ == '__main__':
    annotation_path = "./data/my_data/train.txt"
    anno_result = parse_anno(annotation_path)
    anchors, ave_iou = get_kmeans(anno_result, 9)

    # Render the anchors as "w,h, w,h, ..." in one pass; join() avoids both the
    # repeated string concatenation and trimming a trailing separator afterwards.
    anchor_string = ', '.join('{},{}'.format(anchor[0], anchor[1]) for anchor in anchors)

    print('anchors are:')
    print(anchor_string)
    print('the average iou is:')
    print(ave_iou)
(三)model.py
這里函數和類的主要作用是對YOLO模型進行封裝,類中的函數主要包括:
- 模型的建立
- 特征圖信息和anchors的聯合使用
- loss的計算
# coding=utf-8
# for better understanding about yolov3 architecture, refer to this website (in Chinese):
# https://blog.csdn.net/leviopku/article/details/82660381
from __future__ import division, print_function
import tensorflow as tf
slim = tf.contrib.slim
from utils.layer_utils import conv2d, darknet53_body, yolo_block, upsample_layer
class yolov3(object):
    """YOLOv3 model wrapper: network construction, prediction decoding and loss."""

    def __init__(self,
                 class_num,
                 anchors,
                 use_label_smooth=False,
                 use_focal_loss=False,
                 batch_norm_decay=0.999,
                 weight_decay=5e-4):
        """Store the model hyper-parameters.

        :param class_num: number of object classes
        :param anchors: anchor sizes in [width, height] order. They are sliced as
            [0:3], [3:6] and [6:9], so nine anchors are expected, e.g.
            [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119],
             [116, 90], [156, 198], [373, 326]]
        :param use_label_smooth: whether to smooth the class labels in the loss
        :param use_focal_loss: whether to apply a focal weighting to the confidence loss
        :param batch_norm_decay: decay passed to batch normalisation
        :param weight_decay: L2 regularisation scale for the conv weights
        """
        self.class_num = class_num
        self.anchors = anchors
        self.batch_norm_decay = batch_norm_decay
        self.use_label_smooth = use_label_smooth
        self.use_focal_loss = use_focal_loss
        self.weight_decay = weight_decay

    def _anchor_groups(self):
        """Return the anchor triplets per scale, ordered like the feature maps.

        feature_map_1 (coarsest grid) is paired with the last three anchors and
        feature_map_3 (finest grid) with the first three.
        """
        return [self.anchors[6:9], self.anchors[3:6], self.anchors[0:3]]

    def _detection_head(self, net, name):
        """Apply the final 1x1 conv that emits 3 * (5 + class_num) channels.

        No normalisation and no activation are applied; a zero-initialised bias
        is used instead. The result is wrapped in an identity op called `name`.
        """
        feature_map = slim.conv2d(net, 3 * (5 + self.class_num), 1,
                                  stride=1, normalizer_fn=None,
                                  activation_fn=None, biases_initializer=tf.zeros_initializer())
        return tf.identity(feature_map, name=name)

    def forward(self, inputs, is_training=False, reuse=False):
        """Build the network and return the three raw detection feature maps.

        :param inputs: image batch, shape [N, height, width, channel]
        :param is_training: forwarded to batch normalisation
        :param reuse: forwarded to the slim conv2d / batch_norm arg scope
        :return: (feature_map_1, feature_map_2, feature_map_3). For a 416x416
            input with 80 classes their shapes are [N, 13, 13, 255],
            [N, 26, 26, 255] and [N, 52, 52, 255].
        """
        # The input size in [height, width] form; reorg_layer and loss_layer read it later.
        self.img_size = tf.shape(inputs)[1:3]

        # Batch-norm settings shared by every conv layer in the arg scope below.
        batch_norm_params = {
            'decay': self.batch_norm_decay,
            'epsilon': 1e-05,
            'scale': True,
            'is_training': is_training,
            'fused': None,  # Use fused batch norm if possible.
        }

        # The arg scopes supply common defaults so the layer calls stay short.
        with slim.arg_scope([slim.conv2d, slim.batch_norm], reuse=reuse):
            with slim.arg_scope([slim.conv2d],
                                normalizer_fn=slim.batch_norm,
                                normalizer_params=batch_norm_params,
                                biases_initializer=None,
                                activation_fn=lambda x: tf.nn.leaky_relu(x, alpha=0.1),
                                weights_regularizer=slim.l2_regularizer(self.weight_decay)):
                # Backbone: three feature maps taken from three stages of Darknet-53.
                # For a 416x416 input:
                #   route_1: [N, 52, 52, 256]
                #   route_2: [N, 26, 26, 512]
                #   route_3: [N, 13, 13, 1024]
                with tf.variable_scope('darknet53_body'):
                    route_1, route_2, route_3 = darknet53_body(inputs)

                # Head: fuse the three scales from coarse to fine.
                with tf.variable_scope('yolov3_head'):
                    # yolo_block returns (branch used for fusion, branch used for detection).
                    inter1, net = yolo_block(route_3, 512)
                    feature_map_1 = self._detection_head(net, 'feature_map_1')

                    # Reduce channels, upsample to route_2's size, then concatenate on channels.
                    inter1 = conv2d(inter1, 256, 1)
                    inter1 = upsample_layer(inter1, tf.shape(route_2))
                    concat1 = tf.concat([inter1, route_2], axis=3)

                    inter2, net = yolo_block(concat1, 256)
                    feature_map_2 = self._detection_head(net, 'feature_map_2')

                    inter2 = conv2d(inter2, 128, 1)
                    inter2 = upsample_layer(inter2, tf.shape(route_1))
                    concat2 = tf.concat([inter2, route_1], axis=3)

                    _, feature_map_3 = yolo_block(concat2, 128)
                    feature_map_3 = self._detection_head(feature_map_3, 'feature_map_3')

        return feature_map_1, feature_map_2, feature_map_3

    def reorg_layer(self, feature_map, anchors):
        """Decode one raw feature map into boxes and logits.

        Height is the vertical axis (y) and width the horizontal axis (x).
        Requires `forward` to have been called first, because it reads
        `self.img_size`.

        :param feature_map: one of the feature maps returned by `forward`
        :param anchors: the three anchors of this scale, shape [3, 2], [width, height] order
        :return: (x_y_offset, boxes, conf_logits, prob_logits)
            x_y_offset:  [height, width, 1, 2] grid-cell coordinates (x, y)
            boxes:       [N, height, width, 3, 4] as (center_x, center_y, w, h),
                         in input-image pixels
            conf_logits: [N, height, width, 3, 1]
            prob_logits: [N, height, width, 3, class_num]
        """
        # NOTE: size in [h, w] format! don't get messed up!
        grid_size = tf.shape(feature_map)[1:3]  # e.g. [13, 13]
        # Downscale ratio between input image and this grid, [height, width] order.
        ratio = tf.cast(self.img_size / grid_size, tf.float32)
        # Rescale the anchors to grid units. Anchors are [w, h] while ratio is
        # [h, w], hence the swapped indices; the result stays in [w, h] order.
        rescaled_anchors = [(anchor[0] / ratio[1], anchor[1] / ratio[0]) for anchor in anchors]

        # Separate the channel axis into (3 anchors, 5 + class_num values).
        feature_map = tf.reshape(feature_map, [-1, grid_size[0], grid_size[1], 3, 5 + self.class_num])

        # Split the last dimension, e.g. for the 13x13 map:
        #   box_centers: [N, 13, 13, 3, 2]  (center_x, center_y)
        #   box_sizes:   [N, 13, 13, 3, 2]  (width, height)
        #   conf_logits: [N, 13, 13, 3, 1]
        #   prob_logits: [N, 13, 13, 3, class_num]
        box_centers, box_sizes, conf_logits, prob_logits = tf.split(
            feature_map, [2, 2, 1, self.class_num], axis=-1)
        # Squash the centres into (0, 1): an offset from the cell's top-left
        # corner, measured in cell units.
        box_centers = tf.nn.sigmoid(box_centers)

        # Build the mesh of cell coordinates.
        # grid_x: [0, 1, ..., width - 1], grid_y: [0, 1, ..., height - 1]
        grid_x = tf.range(grid_size[1], dtype=tf.int32)
        grid_y = tf.range(grid_size[0], dtype=tf.int32)
        # After meshgrid both are [height, width]: grid_x repeats 0..width-1 on
        # every row, grid_y holds the row index on every row.
        grid_x, grid_y = tf.meshgrid(grid_x, grid_y)
        x_offset = tf.reshape(grid_x, (-1, 1))
        y_offset = tf.reshape(grid_y, (-1, 1))
        # Rows of (x, y): [0, 0], [1, 0], ..., [width - 1, 0], [0, 1], ...
        x_y_offset = tf.concat([x_offset, y_offset], axis=-1)
        # shape: [height, width, 1, 2]
        x_y_offset = tf.cast(tf.reshape(x_y_offset, [grid_size[0], grid_size[1], 1, 2]), tf.float32)

        # Absolute centre on the grid; broadcasts
        # [N, height, width, 3, 2] + [height, width, 1, 2].
        box_centers = box_centers + x_y_offset
        # Rescale to input-image pixels. The last axis is (x, y) while ratio is
        # [h, w], so ratio is reversed.
        box_centers = box_centers * ratio[::-1]

        # Sizes: exp(prediction) * anchor, still in (w, h) order.
        # NOTE: the original left a clipped variant disabled here:
        #   tf.clip_by_value(tf.exp(box_sizes), 1e-9, 100) * rescaled_anchors
        box_sizes = tf.exp(box_sizes) * rescaled_anchors
        # Rescale to input-image pixels.
        box_sizes = box_sizes * ratio[::-1]

        # shape: [N, height, width, 3, 4]; last dimension: (center_x, center_y, w, h)
        boxes = tf.concat([box_centers, box_sizes], axis=-1)

        return x_y_offset, boxes, conf_logits, prob_logits

    def predict(self, feature_maps):
        """Turn the feature maps from `forward` into test-time predictions.

        :param feature_maps: the three feature maps returned by `forward`
        :return: (boxes, confs, probs), concatenated over the three scales
            boxes: [N, box_num, 4] as (x_min, y_min, x_max, y_max)
            confs: [N, box_num, 1] objectness after sigmoid
            probs: [N, box_num, class_num] class probabilities after sigmoid
            For a 416x416 input, box_num = (13*13 + 26*26 + 52*52) * 3.
        """
        # Unpacking also enforces that exactly three feature maps were passed.
        feature_map_1, feature_map_2, feature_map_3 = feature_maps

        # The coarsest map is paired with the largest anchors, the finest map
        # with the smallest ones.
        feature_map_anchors = list(zip([feature_map_1, feature_map_2, feature_map_3],
                                       self._anchor_groups()))
        reorg_results = [self.reorg_layer(feature_map, anchors) for (feature_map, anchors) in feature_map_anchors]

        def _reshape(result):
            """Flatten the grid and anchor axes of one scale into a single box axis."""
            x_y_offset, boxes, conf_logits, prob_logits = result
            # Grid size as [height, width].
            grid_size = tf.shape(x_y_offset)[:2]
            boxes = tf.reshape(boxes, [-1, grid_size[0] * grid_size[1] * 3, 4])
            conf_logits = tf.reshape(conf_logits, [-1, grid_size[0] * grid_size[1] * 3, 1])
            prob_logits = tf.reshape(prob_logits, [-1, grid_size[0] * grid_size[1] * 3, self.class_num])
            # shapes: [N, height * width * 3, 4], [N, height * width * 3, 1]
            # and [N, height * width * 3, class_num]
            return boxes, conf_logits, prob_logits

        boxes_list, confs_list, probs_list = [], [], []
        for result in reorg_results:
            boxes, conf_logits, prob_logits = _reshape(result)
            # Sigmoid maps the logits into the range 0..1.
            confs = tf.sigmoid(conf_logits)
            probs = tf.sigmoid(prob_logits)
            boxes_list.append(boxes)
            confs_list.append(confs)
            probs_list.append(probs)

        # Collect the results of the three scales along the box axis.
        boxes = tf.concat(boxes_list, axis=1)
        confs = tf.concat(confs_list, axis=1)
        probs = tf.concat(probs_list, axis=1)

        # Convert (center, size) boxes into (top-left, bottom-right) corners.
        center_x, center_y, width, height = tf.split(boxes, [1, 1, 1, 1], axis=-1)
        x_min = center_x - width / 2
        y_min = center_y - height / 2
        x_max = center_x + width / 2
        y_max = center_y + height / 2

        boxes = tf.concat([x_min, y_min, x_max, y_max], axis=-1)

        return boxes, confs, probs

    def loss_layer(self, feature_map_i, y_true, anchors):
        """Compute the four loss terms for one scale.

        Requires `forward` to have been called first (reads `self.img_size`).

        :param feature_map_i: feature map of one scale,
            shape [N, 13, 13, 3 * (5 + num_class)] etc.
        :param y_true: ground truth of the same scale,
            shape [N, 13, 13, 3, 5 + num_class + 1] etc. The last axis is read as
            4 box values (x, y, w, h), 1 object flag, num_class class labels and
            1 mix-up weight.
        :param anchors: the three anchors of this scale, shape [3, 2]
        :return: (xy_loss, wh_loss, conf_loss, class_loss) scalars
        """
        # size in [h, w] format! don't get messed up!
        grid_size = tf.shape(feature_map_i)[1:3]
        # Downscale ratio in [height, width] order.
        ratio = tf.cast(self.img_size / grid_size, tf.float32)
        # N: batch size, as a float so it can divide the summed losses.
        N = tf.cast(tf.shape(feature_map_i)[0], tf.float32)

        # x_y_offset:       [height, width, 1, 2]
        # pred_boxes:       [N, height, width, 3, 4]
        # pred_conf_logits: [N, height, width, 3, 1]
        # pred_prob_logits: [N, height, width, 3, num_class]
        x_y_offset, pred_boxes, pred_conf_logits, pred_prob_logits = self.reorg_layer(feature_map_i, anchors)

        ###########
        # get mask
        ###########
        # Index 4 of the last axis flags whether a ground-truth object is
        # assigned to this cell/anchor (1) or not (0).
        # shape: [N, height, width, 3, 1]
        object_mask = y_true[..., 4:5]
        # shape: [N, 13, 13, 3, 4] & [N, 13, 13, 3] ==> [V, 4]
        # V: number of valid ground-truth boxes
        valid_true_boxes = tf.boolean_mask(y_true[..., 0:4], tf.cast(object_mask[..., 0], 'bool'))

        # shape: [V, 2] each -- centres and sizes of the ground-truth boxes.
        valid_true_box_xy = valid_true_boxes[:, 0:2]
        valid_true_box_wh = valid_true_boxes[:, 2:4]
        # shape: [N, 13, 13, 3, 2] each -- centres and sizes of the predictions.
        pred_box_xy = pred_boxes[..., 0:2]
        pred_box_wh = pred_boxes[..., 2:4]

        # IoU of every predicted box with every valid ground-truth box.
        # shape: [N, 13, 13, 3, V]
        iou = self.broadcast_iou(valid_true_box_xy, valid_true_box_wh, pred_box_xy, pred_box_wh)

        # Best IoU per predicted box. shape: [N, 13, 13, 3]
        best_iou = tf.reduce_max(iou, axis=-1)

        # get_ignore_mask
        # 1 for predictions whose best IoU with any ground truth is below 0.5;
        # only those can count as confidence negatives further down.
        ignore_mask = tf.cast(best_iou < 0.5, tf.float32)
        # shape: [N, 13, 13, 3, 1]
        ignore_mask = tf.expand_dims(ignore_mask, -1)

        # Offsets of ground-truth and predicted centres inside their grid cell.
        # numerical range: 0 ~ 1, shape: [N, 13, 13, 3, 2]
        true_xy = y_true[..., 0:2] / ratio[::-1] - x_y_offset
        pred_xy = pred_box_xy / ratio[::-1] - x_y_offset

        # get_tw_th: sizes relative to the anchors.
        # shape: [N, 13, 13, 3, 2]
        true_tw_th = y_true[..., 2:4] / anchors
        pred_tw_th = pred_box_wh / anchors
        # For numerical stability: log(0) is -inf, so zeros are replaced by ones
        # (log(1) = 0, which leaves those entries without influence).
        true_tw_th = tf.where(condition=tf.equal(true_tw_th, 0),
                              x=tf.ones_like(true_tw_th), y=true_tw_th)
        pred_tw_th = tf.where(condition=tf.equal(pred_tw_th, 0),
                              x=tf.ones_like(pred_tw_th), y=pred_tw_th)
        # Take the log after clipping the values into [1e-9, 1e9].
        true_tw_th = tf.log(tf.clip_by_value(true_tw_th, 1e-9, 1e9))
        pred_tw_th = tf.log(tf.clip_by_value(pred_tw_th, 1e-9, 1e9))

        # box size punishment:
        # box with smaller area has bigger weight. This is taken from the yolo darknet C source code.
        # shape: [N, 13, 13, 3, 1]
        box_loss_scale = 2. - (y_true[..., 2:3] / tf.cast(self.img_size[1], tf.float32)) * (
            y_true[..., 3:4] / tf.cast(self.img_size[0], tf.float32))

        ############
        # loss_part
        ############
        # mix_up weight, shape: [N, 13, 13, 3, 1]
        mix_w = y_true[..., -1:]
        # Squared-error losses for centre offset and size. Only positions with
        # object_mask == 1 contribute.
        xy_loss = tf.reduce_sum(tf.square(true_xy - pred_xy) * object_mask * box_loss_scale * mix_w) / N
        wh_loss = tf.reduce_sum(tf.square(true_tw_th - pred_tw_th) * object_mask * box_loss_scale * mix_w) / N

        # Confidence loss. Positives are the object positions; negatives are the
        # non-object positions whose best IoU is below 0.5 (via ignore_mask).
        # shape: [N, 13, 13, 3, 1]
        conf_pos_mask = object_mask
        conf_neg_mask = (1 - object_mask) * ignore_mask
        conf_loss_pos = conf_pos_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask,
                                                                                logits=pred_conf_logits)
        conf_loss_neg = conf_neg_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask,
                                                                                logits=pred_conf_logits)
        # TODO: may need to balance the pos-neg by multiplying some weights
        conf_loss = conf_loss_pos + conf_loss_neg
        if self.use_focal_loss:
            alpha = 1.0
            gamma = 2.0
            # TODO: alpha should be a mask array if needed
            focal_mask = alpha * tf.pow(tf.abs(object_mask - tf.sigmoid(pred_conf_logits)), gamma)
            conf_loss *= focal_mask
        # Weight by the mix-up weight and average over the batch.
        conf_loss = tf.reduce_sum(conf_loss * mix_w) / N

        # Class targets, optionally label-smoothed.
        if self.use_label_smooth:
            delta = 0.01
            label_target = (1 - delta) * y_true[..., 5:-1] + delta * 1. / self.class_num
        else:
            label_target = y_true[..., 5:-1]
        # Classification loss: cross entropy on object positions only, weighted
        # by the mix-up weight and averaged over the batch.
        class_loss = object_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_target,
                                                                           logits=pred_prob_logits) * mix_w
        class_loss = tf.reduce_sum(class_loss) / N

        return xy_loss, wh_loss, conf_loss, class_loss

    def compute_loss(self, y_pred, y_true):
        """Sum the loss terms over all scales.

        :param y_pred: feature maps returned by `forward`:
            [feature_map_1, feature_map_2, feature_map_3]
        :param y_true: per-scale ground truth from the tf.data pipeline, indexed
            in the same order as y_pred
        :return: [total_loss, loss_xy, loss_wh, loss_conf, loss_class]
        """
        loss_xy, loss_wh, loss_conf, loss_class = 0., 0., 0., 0.

        # calc loss in 3 scales: every feature map is paired with its anchors.
        # y_true is indexed rather than iterated so any indexable container works.
        for i, (feature_map, anchors) in enumerate(zip(y_pred, self._anchor_groups())):
            result = self.loss_layer(feature_map, y_true[i], anchors)
            loss_xy += result[0]
            loss_wh += result[1]
            loss_conf += result[2]
            loss_class += result[3]
        total_loss = loss_xy + loss_wh + loss_conf + loss_class
        return [total_loss, loss_xy, loss_wh, loss_conf, loss_class]

    def broadcast_iou(self, true_box_xy, true_box_wh, pred_box_xy, pred_box_wh):
        """Compute the IoU matrix between ground-truth and predicted boxes by broadcasting.

        :param true_box_xy: [V, 2] ground-truth centres
        :param true_box_wh: [V, 2] ground-truth sizes
        :param pred_box_xy: [N, 13, 13, 3, 2] predicted centres
        :param pred_box_wh: [N, 13, 13, 3, 2] predicted sizes
        :return: IoU tensor of shape [N, 13, 13, 3, V]
        """
        # shape: [N, 13, 13, 3, 1, 2]
        pred_box_xy = tf.expand_dims(pred_box_xy, -2)
        pred_box_wh = tf.expand_dims(pred_box_wh, -2)

        # shape: [1, V, 2]
        true_box_xy = tf.expand_dims(true_box_xy, 0)
        true_box_wh = tf.expand_dims(true_box_wh, 0)

        # [N, 13, 13, 3, 1, 2] & [1, V, 2] ==> [N, 13, 13, 3, V, 2]
        intersect_mins = tf.maximum(pred_box_xy - pred_box_wh / 2.,
                                    true_box_xy - true_box_wh / 2.)
        intersect_maxs = tf.minimum(pred_box_xy + pred_box_wh / 2.,
                                    true_box_xy + true_box_wh / 2.)
        # Negative extents mean the boxes do not overlap; clamp them to zero.
        intersect_wh = tf.maximum(intersect_maxs - intersect_mins, 0.)

        # shape: [N, 13, 13, 3, V]
        intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
        # shape: [N, 13, 13, 3, 1]
        pred_box_area = pred_box_wh[..., 0] * pred_box_wh[..., 1]
        # shape: [1, V]
        true_box_area = true_box_wh[..., 0] * true_box_wh[..., 1]

        # [N, 13, 13, 3, V]; the small constant guards against division by zero.
        iou = intersect_area / (pred_box_area + true_box_area - intersect_area + 1e-10)

        return iou
(四)layer_utils.py
這里函數的主要作用是對卷積等操作做出一定的個性化封裝,方便代碼的編寫。主要包括:
- 卷積的封裝
- darknet網絡結構的定義
- resize的定義,默認是最近鄰方法
- 在主體網絡的基礎上做的YOLO的附加的卷積操作,為后面的特征融合做准備
# coding: utf-8
from __future__ import division, print_function
import numpy as np
import tensorflow as tf
slim = tf.contrib.slim
def conv2d(inputs, filters, kernel_size, strides=1):
    """Thin wrapper around slim.conv2d with explicit padding for strided convs.

    With strides == 1 the convolution uses 'SAME' padding. With strides > 1
    the input is first zero-padded by (kernel_size - 1) in total on each
    spatial axis and the convolution then runs with 'VALID' padding.

    :param inputs: input tensor; axes 1 and 2 are the ones that get padded
    :param filters: number of output channels
    :param kernel_size: convolution kernel size
    :param strides: convolution stride
    :return: the output tensor of slim.conv2d
    """
    if strides > 1:
        # Split the total padding as evenly as possible; any odd remainder
        # goes to the end of the axis.
        total_pad = kernel_size - 1
        lead = total_pad // 2
        trail = total_pad - lead
        inputs = tf.pad(inputs, [[0, 0], [lead, trail],
                                 [lead, trail], [0, 0]], mode='CONSTANT')

    if strides == 1:
        padding_mode = 'SAME'
    else:
        padding_mode = 'VALID'
    return slim.conv2d(inputs, filters, kernel_size, stride=strides,
                       padding=padding_mode)
def darknet53_body(inputs):
    """Build the Darknet-53 backbone.

    :param inputs: input image tensor
    :return: (route_1, route_2, route_3), the outputs of the last three
        stages, taken after 3, 4 and 5 stride-2 convolutions respectively
    """
    def _residual(x, filters):
        # 1x1 conv to `filters` channels, 3x3 conv to 2 * filters, plus skip connection.
        y = conv2d(x, filters, 1)
        y = conv2d(y, filters * 2, 3)
        return y + x

    # Stem convolution.
    net = conv2d(inputs, 32, 3, strides=1)

    # Each stage is a stride-2 3x3 conv to `channels` followed by `repeats`
    # residual blocks whose bottleneck width is channels // 2.
    stages = ((64, 1), (128, 2), (256, 8), (512, 8), (1024, 4))
    stage_outputs = []
    for channels, repeats in stages:
        net = conv2d(net, channels, 3, strides=2)
        for _ in range(repeats):
            net = _residual(net, channels // 2)
        stage_outputs.append(net)

    route_1, route_2, route_3 = stage_outputs[-3:]
    return route_1, route_2, route_3
def yolo_block(inputs, filters):
    """Stack of convolutions applied on top of the backbone features.

    Five convs alternate 1x1 (`filters` channels) and 3x3 (2 * filters
    channels); their output is returned as `route`. One more 3x3 conv with
    2 * filters channels produces `net`.

    :param inputs: input feature map
    :param filters: base channel count
    :return: (route, net)
    """
    net = inputs
    for channels, kernel in ((filters, 1), (filters * 2, 3), (filters, 1),
                             (filters * 2, 3), (filters, 1)):
        net = conv2d(net, channels, kernel)
    route = net
    net = conv2d(route, filters * 2, 3)
    return route, net
def upsample_layer(inputs, out_shape):
    """Resize a feature map with nearest-neighbour interpolation.

    :param inputs: feature map to resize
    :param out_shape: shape whose entries 1 and 2 give the target height and width
    :return: the resized tensor (op name 'upsampled')
    """
    # NOTE: here height is the first
    # TODO: Do we need to set `align_corners` as True?
    target_size = (out_shape[1], out_shape[2])
    return tf.image.resize_nearest_neighbor(inputs, target_size, name='upsampled')
(五)nms_utils.py
這一部分代碼主要是非最大值抑制(NMS)的實現,原理都是相同,過程大致如下:
- 首先按照目標的置信度從大到小排序
- 取出當前最大的置信度的目標框
- 計算剩下的目標框和取出的目標框的iou
- 依次檢查iou的大小,如果iou高於一定的閾值,則說明對應的目標框被取出的目標框抑制了,因此只留下iou小於一定閾值的框。
- 重復2~4步驟,直至處理完所有的目標框
- 返回所有取出的目標框,就是NMS的結果
需要注意的是,NMS只針對於一類類別的數據,如果有多個類別,則需要分別處理。
# coding: utf-8
from __future__ import division, print_function
import numpy as np
import tensorflow as tf
def gpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, nms_thresh=0.5):
    """Per-class non-maximum suppression built from TensorFlow ops.

    params:
        boxes: tensor of shape [1, 10647, 4]  # 10647=(13*13+26*26+52*52)*3, for input 416*416 image
        scores: tensor of shape [1, 10647, num_classes], score=conf*prob
        num_classes: total number of classes
        max_boxes: integer, maximum number of boxes kept per class, default is 50
        score_thresh: boxes whose score for a class is below this value are dropped for that class
        nms_thresh: real value, "intersection over union" threshold used for NMS filtering
    return:
        (boxes, score, label): the kept boxes of all classes concatenated along
        axis 0, with their scores and int32 class indices
    """
    kept_boxes, kept_labels, kept_scores = [], [], []
    max_boxes = tf.constant(max_boxes, dtype='int32')

    # NMS is done for a single image, so flatten the batch axis away.
    boxes = tf.reshape(boxes, [-1, 4])  # '-1': the exact number of boxes is unknown
    score = tf.reshape(scores, [-1, num_classes])

    # Step 1: boolean mask of the (box, class) pairs that reach the score threshold.
    mask = tf.greater_equal(score, tf.constant(score_thresh))

    # Step 2: run non_max_suppression independently for every class.
    for class_id in range(num_classes):
        # Step 3: pick out the boxes and scores that pass the mask for this class.
        class_boxes = tf.boolean_mask(boxes, mask[:, class_id])
        class_score = tf.boolean_mask(score[:, class_id], mask[:, class_id])
        selected = tf.image.non_max_suppression(boxes=class_boxes,
                                                scores=class_score,
                                                max_output_size=max_boxes,
                                                iou_threshold=nms_thresh, name='nms_indices')
        kept_labels.append(tf.ones_like(tf.gather(class_score, selected), 'int32') * class_id)
        kept_boxes.append(tf.gather(class_boxes, selected))
        kept_scores.append(tf.gather(class_score, selected))

    boxes = tf.concat(kept_boxes, axis=0)
    score = tf.concat(kept_scores, axis=0)
    label = tf.concat(kept_labels, axis=0)
    return boxes, score, label
def py_nms(boxes, scores, max_boxes=50, iou_thresh=0.5):
    """Pure Python (NumPy) non-maximum suppression for a single class.

    Boxes are treated as continuous-coordinate rectangles: both the areas and
    the intersection are computed as plain coordinate differences.

    Arguments:
        boxes: array of shape [-1, 4] as (x1, y1, x2, y2); '-1' means the exact
            number of boxes is unknown
        scores: array of shape [-1,]
        max_boxes: maximum number of boxes to select
        iou_thresh: a box is suppressed when its IoU with an already selected
            box is greater than this value
    Returns:
        list with the indices of the kept boxes, highest score first
    """
    assert boxes.shape[1] == 4 and len(scores.shape) == 1

    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)

    # Indices of the boxes still to be processed, best score first.
    order = scores.argsort()[::-1]

    keep = []
    # Stop as soon as enough boxes are selected: later rounds cannot change
    # the boxes that were already picked.
    while order.size > 0 and len(keep) < max_boxes:
        # The first remaining index always has the highest score.
        i = order[0]
        keep.append(i)
        rest = order[1:]

        # Intersection of the selected box with every remaining box.
        xx1 = np.maximum(x1[i], x1[rest])
        yy1 = np.maximum(y1[i], y1[rest])
        xx2 = np.minimum(x2[i], x2[rest])
        yy2 = np.minimum(y2[i], y2[rest])

        # No "+ 1" here, matching how `areas` is computed above. Mixing the two
        # conventions gave identical boxes a negative IoU and left them unsuppressed.
        w = np.maximum(0.0, xx2 - xx1)
        h = np.maximum(0.0, yy2 - yy1)
        inter = w * h
        # The small constant avoids 0/0 for degenerate (zero-area) boxes.
        ovr = inter / (areas[i] + areas[rest] - inter + 1e-10)

        # Keep only the boxes that the selected box does not suppress.
        order = rest[ovr <= iou_thresh]

    return keep
def cpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, iou_thresh=0.5):
    """Run class-wise NMS on the CPU.

    Args:
        boxes: array reshaped internally to [-1, 4].
        scores: array reshaped internally to [-1, num_classes].
        num_classes: number of classes (size of the last score axis).
        max_boxes: per-class cap forwarded to py_nms.
        score_thresh: boxes scoring below this for a class are ignored.
        iou_thresh: IoU threshold forwarded to py_nms.

    Returns:
        (boxes, score, label) concatenated over all classes, or
        (None, None, None) when no box passes the score threshold.
    """
    flat_boxes = boxes.reshape(-1, 4)
    flat_scores = scores.reshape(-1, num_classes)

    kept_boxes, kept_scores, kept_labels = [], [], []
    for cls_id in range(num_classes):
        cls_scores = flat_scores[:, cls_id]
        selected = np.where(cls_scores >= score_thresh)
        cand_boxes = flat_boxes[selected]
        cand_scores = cls_scores[selected]
        if len(cand_boxes) == 0:
            continue
        # Suppress overlapping candidates of this class.
        survivors = py_nms(cand_boxes, cand_scores,
                           max_boxes=max_boxes, iou_thresh=iou_thresh)
        kept_boxes.append(cand_boxes[survivors])
        kept_scores.append(cand_scores[survivors])
        kept_labels.append(np.ones(len(survivors), dtype='int32') * cls_id)

    if len(kept_boxes) == 0:
        return None, None, None

    return (np.concatenate(kept_boxes, axis=0),
            np.concatenate(kept_scores, axis=0),
            np.concatenate(kept_labels, axis=0))
(六)train.py
這一部分代碼主要是訓練模型的入口,按照要求准備好訓練數據之后,就可以從這里開始訓練了。
# coding: utf-8
from __future__ import division, print_function
import tensorflow as tf
import numpy as np
import logging
from tqdm import trange
import args
from utils.data_utils import get_batch_data
from utils.misc_utils import shuffle_and_overwrite, make_summary, config_learning_rate, config_optimizer, AverageMeter
from utils.eval_utils import evaluate_on_cpu, evaluate_on_gpu, get_preds_gpu, voc_eval, parse_gt_rec
from utils.nms_utils import gpu_nms
from model import yolov3
# setting loggers
# Log at DEBUG level to the file args.progress_log_path; filemode='w' truncates
# the file each time the script starts.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S', filename=args.progress_log_path, filemode='w')
# setting placeholders
# Entry points for data fed into the graph.
# Boolean phase switch, passed to yolo_model.forward(...) further down
# (presumably consumed by batch norm and similar ops -- confirm in model.py).
is_training = tf.placeholder(tf.bool, name="phase_train")
# NOTE(review): this placeholder is not referenced anywhere else in the visible script.
handle_flag = tf.placeholder(tf.string, [], name='iterator_handle_flag')
# register the gpu nms operation here for the following evaluation scheme
# The NMS op is built once on these two placeholders so the evaluation code
# can run it by feeding predicted boxes and scores.
pred_boxes_flag = tf.placeholder(tf.float32, [1, None, None])
pred_scores_flag = tf.placeholder(tf.float32, [1, None, None])
gpu_nms_op = gpu_nms(pred_boxes_flag, pred_scores_flag, args.class_num, args.nms_topk, args.score_threshold, args.nms_threshold)
##################
# tf.data pipeline
##################
# Training input: every line of args.train_file describes one sample.
train_dataset = tf.data.TextLineDataset(args.train_file)
# Shuffle with a buffer of args.train_img_cnt lines.
train_dataset = train_dataset.shuffle(args.train_img_cnt)
# Batch the raw text lines first: get_batch_data receives a whole batch of
# lines and does the actual image loading / target generation in Python.
train_dataset = train_dataset.batch(args.batch_size)
train_dataset = train_dataset.map(
    lambda x: tf.py_func(get_batch_data,
                         inp=[x, args.class_num, args.img_size, args.anchors, 'train', args.multi_scale_train, args.use_mix_up],
                         Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
    num_parallel_calls=args.num_threads
)
# Prepare upcoming batches while the current one is being consumed.
train_dataset = train_dataset.prefetch(args.prefetech_buffer)

# Validation input: same construction, batch size 1, no multi-scale, no mix-up.
val_dataset = tf.data.TextLineDataset(args.val_file)
val_dataset = val_dataset.batch(1)
val_dataset = val_dataset.map(
    lambda x: tf.py_func(get_batch_data,
                         inp=[x, args.class_num, args.img_size, args.anchors, 'val', False, False],
                         Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
    num_parallel_calls=args.num_threads
)
# Dataset transformations return a new dataset; the result must be assigned,
# otherwise the prefetch is silently dropped.
val_dataset = val_dataset.prefetch(args.prefetech_buffer)

# One re-initializable iterator shared by both datasets; running
# train_init_op / val_init_op selects which dataset it reads from.
iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
train_init_op = iterator.make_initializer(train_dataset)
val_init_op = iterator.make_initializer(val_dataset)

# get an element from the chosen dataset iterator
# The tuple layout matches what get_batch_data returns.
image_ids, image, y_true_13, y_true_26, y_true_52 = iterator.get_next()
y_true = [y_true_13, y_true_26, y_true_52]

# tf.data pipeline will lose the data `static` shape, so we need to set it manually
image_ids.set_shape([None])
image.set_shape([None, None, None, 3])
for y in y_true:
    y.set_shape([None, None, None, None, None])
##################
# Model definition
##################
# Build the model object; the same construction is used at prediction time.
yolo_model = yolov3(args.class_num, args.anchors, args.use_label_smooth, args.use_focal_loss, args.batch_norm_decay, args.weight_decay)
with tf.variable_scope('yolov3'):
    pred_feature_maps = yolo_model.forward(image, is_training=is_training)
# Loss terms; indexed below as loss[0] (total), loss[1] (xy), loss[2] (wh),
# loss[3] (conf), loss[4] (class), following the summary names.
loss = yolo_model.compute_loss(pred_feature_maps, y_true)
# Decoded predictions, used for the evaluation steps.
y_pred = yolo_model.predict(pred_feature_maps)
# Sum of the regularization losses registered in the graph.
l2_loss = tf.losses.get_regularization_loss()
# setting restore parts and vars to update
# saver_to_restore only covers the variables selected by args.restore_part;
# update_vars (selected by args.update_part) are the ones the optimizer trains.
saver_to_restore = tf.train.Saver(var_list=tf.contrib.framework.get_variables_to_restore(include=args.restore_part))
update_vars = tf.contrib.framework.get_variables_to_restore(include=args.update_part)
# Scalar summaries for TensorBoard: per-batch loss curves.
tf.summary.scalar('train_batch_statistics/total_loss', loss[0])
tf.summary.scalar('train_batch_statistics/loss_xy', loss[1])
tf.summary.scalar('train_batch_statistics/loss_wh', loss[2])
tf.summary.scalar('train_batch_statistics/loss_conf', loss[3])
tf.summary.scalar('train_batch_statistics/loss_class', loss[4])
tf.summary.scalar('train_batch_statistics/loss_l2', l2_loss)
tf.summary.scalar('train_batch_statistics/loss_ratio', l2_loss / loss[0])
# global step
# Stored as a float local variable, so it is initialized by
# tf.local_variables_initializer() and kept out of the global-variable collection.
global_step = tf.Variable(float(args.global_step), trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])
# With warm-up, the learning rate grows linearly from 0 to
# args.learning_rate_init over the first warm_up_epoch epochs, then follows
# config_learning_rate with the step counter shifted by the warm-up length.
if args.use_warm_up:
    learning_rate = tf.cond(tf.less(global_step, args.train_batch_num * args.warm_up_epoch),
                            lambda: args.learning_rate_init * global_step / (args.train_batch_num * args.warm_up_epoch),
                            lambda: config_learning_rate(args, global_step - args.train_batch_num * args.warm_up_epoch))
else:
    learning_rate = config_learning_rate(args, global_step)
tf.summary.scalar('learning_rate', learning_rate)
# The savers are created either before or after the optimizer depending on
# args.save_optimizer. A Saver built without var_list captures the variables
# existing at that moment, so creating it before the optimizer presumably
# leaves optimizer slot variables out of the checkpoints -- confirm.
if not args.save_optimizer:
    saver_to_save = tf.train.Saver()
    saver_best = tf.train.Saver()
# Optimizer
optimizer = config_optimizer(args.optimizer_name, learning_rate)
if args.save_optimizer:
    saver_to_save = tf.train.Saver()
    saver_best = tf.train.Saver()
# set dependencies for BN ops
# Ops in UPDATE_OPS must run together with each training step.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(loss[0] + l2_loss, var_list=update_vars, global_step=global_step)
# Open the session and run the training / validation loop.
# NOTE(review): the indentation of this script was lost in the source; the
# nesting below is reconstructed from the statements themselves. In particular
# the placement of the meter-reset block (after the batch loop) should be
# confirmed against the upstream train.py.
with tf.Session() as sess:
    # Initialize global and local variables, then restore the selected
    # variables from the checkpoint at args.restore_path.
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    saver_to_restore.restore(sess, args.restore_path)
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter(args.log_dir, sess.graph)
    print('\n----------- start to train -----------\n')
    best_mAP = -np.Inf
    # Epoch loop
    for epoch in range(args.total_epoches):
        sess.run(train_init_op)
        # Running-average meters for each loss term.
        loss_total, loss_xy, loss_wh, loss_conf, loss_class = AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter()
        # One iteration per training batch
        for i in trange(args.train_batch_num):
            _, summary, __y_pred, __y_true, __loss, __global_step, __lr = sess.run(
                [train_op, merged, y_pred, y_true, loss, global_step, learning_rate],
                feed_dict={is_training: True})
            writer.add_summary(summary, global_step=__global_step)
            # Update the running averages, weighted by len(__y_pred[0]).
            loss_total.update(__loss[0], len(__y_pred[0]))
            loss_xy.update(__loss[1], len(__y_pred[0]))
            loss_wh.update(__loss[2], len(__y_pred[0]))
            loss_conf.update(__loss[3], len(__y_pred[0]))
            loss_class.update(__loss[4], len(__y_pred[0]))
            # Every train_evaluation_step steps, compute recall / precision
            # on the current training batch and log them.
            if __global_step % args.train_evaluation_step == 0 and __global_step > 0:
                # recall, precision = evaluate_on_cpu(__y_pred, __y_true, args.class_num, args.nms_topk, args.score_threshold, args.eval_threshold)
                recall, precision = evaluate_on_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, __y_pred, __y_true, args.class_num, args.eval_threshold)
                info = "Epoch: {}, global_step: {} | loss: total: {:.2f}, xy: {:.2f}, wh: {:.2f}, conf: {:.2f}, class: {:.2f} | ".format(
                    epoch, int(__global_step), loss_total.average, loss_xy.average, loss_wh.average, loss_conf.average, loss_class.average)
                info += 'Last batch: rec: {:.3f}, prec: {:.3f} | lr: {:.5g}'.format(recall, precision, __lr)
                print(info)
                logging.info(info)
                writer.add_summary(make_summary('evaluation/train_batch_recall', recall), global_step=__global_step)
                writer.add_summary(make_summary('evaluation/train_batch_precision', precision), global_step=__global_step)
                # Abort when the averaged loss has become NaN.
                if np.isnan(loss_total.average):
                    print('****' * 10)
                    raise ArithmeticError(
                        'Gradient exploded! Please train again and you may need modify some parameters.')
        # Remember the averaged total loss, then reset the meters.
        tmp_total_loss = loss_total.average
        loss_total.reset()
        loss_xy.reset()
        loss_wh.reset()
        loss_conf.reset()
        loss_class.reset()
        # Save a checkpoint
        # NOTE: this is just demo. You can set the conditions when to save the weights.
        if epoch % args.save_epoch == 0 and epoch > 0:
            if tmp_total_loss <= 2.:
                saver_to_save.save(sess, args.save_dir + 'model-epoch_{}_step_{}_loss_{:.4f}_lr_{:.5g}'.format(epoch, int(__global_step), loss_total.last_avg, __lr))
        # Evaluate on the validation set; structure mirrors the training part.
        # switch to validation dataset for evaluation
        if epoch % args.val_evaluation_epoch == 0 and epoch > 0:
            sess.run(val_init_op)
            val_loss_total, val_loss_xy, val_loss_wh, val_loss_conf, val_loss_class = \
                AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter()
            val_preds = []
            # The validation dataset is batched with size 1, so this runs
            # once per validation image.
            for j in trange(args.val_img_cnt):
                __image_ids, __y_pred, __loss = sess.run([image_ids, y_pred, loss],
                                                         feed_dict={is_training: False})
                pred_content = get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, __image_ids, __y_pred)
                val_preds.extend(pred_content)
                val_loss_total.update(__loss[0])
                val_loss_xy.update(__loss[1])
                val_loss_wh.update(__loss[2])
                val_loss_conf.update(__loss[3])
                val_loss_class.update(__loss[4])
            # calc mAP
            rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter()
            gt_dict = parse_gt_rec(args.val_file, args.img_size)
            info = '======> Epoch: {}, global_step: {}, lr: {:.6g} <======\n'.format(epoch, __global_step, __lr)
            # Per-class evaluation; recall is weighted by npos, precision by nd,
            # and AP with weight 1 so that ap_total averages to the mAP.
            for ii in range(args.class_num):
                npos, nd, rec, prec, ap = voc_eval(gt_dict, val_preds, ii, iou_thres=args.eval_threshold, use_07_metric=False)
                info += 'EVAL: Class {}: Recall: {:.4f}, Precision: {:.4f}, AP: {:.4f}\n'.format(ii, rec, prec, ap)
                rec_total.update(rec, npos)
                prec_total.update(prec, nd)
                ap_total.update(ap, 1)
            mAP = ap_total.avg
            info += 'EVAL: Recall: {:.4f}, Precison: {:.4f}, mAP: {:.4f}\n'.format(rec_total.avg, prec_total.avg, mAP)
            info += 'EVAL: loss: total: {:.2f}, xy: {:.2f}, wh: {:.2f}, conf: {:.2f}, class: {:.2f}\n'.format(
                val_loss_total.avg, val_loss_xy.avg, val_loss_wh.avg, val_loss_conf.avg, val_loss_class.avg)
            print(info)
            logging.info(info)
            # Keep a separate checkpoint whenever the validation mAP improves.
            if mAP > best_mAP:
                best_mAP = mAP
                saver_best.save(sess, args.save_dir + 'best_model_Epoch_{}_step_{}_mAP_{:.4f}_loss_{:.4f}_lr_{:.7g}'.format(
                    epoch, __global_step, best_mAP, val_loss_total.last_avg, __lr))
            # Validation summaries are indexed by epoch rather than global step.
            writer.add_summary(make_summary('evaluation/val_mAP', mAP), global_step=epoch)
            writer.add_summary(make_summary('evaluation/val_recall', rec_total.last_avg), global_step=epoch)
            writer.add_summary(make_summary('evaluation/val_precision', prec_total.last_avg), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/total_loss', val_loss_total.last_avg), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/loss_xy', val_loss_xy.last_avg), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/loss_wh', val_loss_wh.last_avg), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/loss_conf', val_loss_conf.last_avg), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/loss_class', val_loss_class.last_avg), global_step=epoch)
(七)data_utils.py
這一部分代碼主要是准備訓練用的數據。算得上是YOLO模型中另一個十分重要的部分。
# coding: utf-8
from __future__ import division, print_function
import numpy as np
import cv2
import sys
from utils.data_aug import *
import random
# Major version number of the running Python interpreter.
PY_VERSION = sys.version_info[0]
# Module-level batch counter read and incremented by get_batch_data; it seeds
# the choice of input size for multi-scale training.
iter_cnt = 0
def parse_line(line):
    """Parse one line of the training/test annotation file.

    Expected layout, whitespace separated:
        line_idx pic_path [label x_min y_min x_max y_max] ...

    Args:
        line: the annotation line as str; any other type (e.g. bytes) is
            decoded with its `.decode()` method first.

    Returns:
        line_idx: int, the image index.
        pic_path: str, the image path.
        boxes: float32 array of shape [N, 4] with rows
            [x_min, y_min, x_max, y_max]; shape [0, 4] when the line has no boxes.
        labels: int64 array of shape [N] with the class index of each box.
    """
    if not isinstance(line, str):
        line = line.decode()

    # split() without an argument tolerates repeated blanks, tabs and the
    # trailing newline, none of which may produce empty fields.
    fields = line.split()
    line_idx = int(fields[0])
    pic_path = fields[1]

    # Everything after the first two fields describes boxes, five values each.
    box_fields = fields[2:]
    box_cnt = len(box_fields) // 5

    boxes = []
    labels = []
    for i in range(box_cnt):
        label, x_min, y_min, x_max, y_max = box_fields[i * 5: i * 5 + 5]
        boxes.append([float(x_min), float(y_min), float(x_max), float(y_max)])
        labels.append(int(label))

    # reshape keeps the second dimension at 4 even when there are no boxes,
    # so callers can always slice columns and concatenate along axis -1.
    boxes = np.asarray(boxes, np.float32).reshape(-1, 4)
    labels = np.asarray(labels, np.int64)
    return line_idx, pic_path, boxes, labels
def process_box(boxes, labels, img_size, class_num, anchors):
    """Build the ground-truth tensors for the three detection scales.

    Args:
        boxes: [N, 5] array; columns 0-3 are x_min, y_min, x_max, y_max and
            the last column is the mix-up weight.
        labels: [N] integer class indices.
        img_size: [width, height] of the network input.
        class_num: number of classes.
        anchors: [9, 2] array of anchor (width, height). Rows 0-2 belong to
            the stride-8 map, rows 3-5 to stride 16, rows 6-8 to stride 32.

    Returns:
        (y_true_13, y_true_26, y_true_52): float32 arrays of shape
        [height // s, width // s, 3, 6 + class_num] for s = 32, 16, 8.
        Last-axis layout: center xy (2), size wh (2), object flag (1),
        one-hot class (class_num), mix-up weight (1).
    """
    # Box centers and sizes, both of shape [N, 2].
    centers = (boxes[:, 0:2] + boxes[:, 2:4]) / 2
    sizes = boxes[:, 2:4] - boxes[:, 0:2]

    width, height = img_size[0], img_size[1]
    depth = 6 + class_num
    y_true_13 = np.zeros((height // 32, width // 32, 3, depth), np.float32)
    y_true_26 = np.zeros((height // 16, width // 16, 3, depth), np.float32)
    y_true_52 = np.zeros((height // 8, width // 8, 3, depth), np.float32)
    # Index 0 is the coarsest map (stride 32), index 2 the finest (stride 8).
    targets = [y_true_13, y_true_26, y_true_52]
    for target in targets:
        # mix up weight default to 1.
        target[..., -1] = 1.

    # IoU of every box against every anchor with both centered at the origin:
    # the intersection extent along each axis is then the smaller of the two
    # extents. [N, 1, 2] broadcast against [9, 2] gives [N, 9, 2].
    sizes_b = sizes[:, np.newaxis, :]
    overlap = np.minimum(sizes_b, anchors)
    inter_area = overlap[:, :, 0] * overlap[:, :, 1]
    box_area = sizes_b[:, :, 0] * sizes_b[:, :, 1]
    anchor_area = anchors[:, 0] * anchors[:, 1]
    iou = inter_area / (box_area + anchor_area - inter_area + 1e-10)

    # For each box, the index (0..8) of the anchor with the highest IoU.
    best_match_idx = np.argmax(iou, axis=1)

    strides = (8., 16., 32.)
    for box_i, anchor_i in enumerate(best_match_idx):
        # Anchors come in groups of three per scale: 0,1,2 -> stride 8,
        # 3,4,5 -> stride 16, 6,7,8 -> stride 32.
        level = anchor_i // 3
        target = targets[2 - level]
        stride = strides[level]

        # Grid cell containing the box center on the chosen feature map.
        col = int(np.floor(centers[box_i, 0] / stride))
        row = int(np.floor(centers[box_i, 1] / stride))
        # Position of the matched anchor inside its group of three.
        slot = anchor_i % 3
        cls = labels[box_i]

        target[row, col, slot, :2] = centers[box_i]
        target[row, col, slot, 2:4] = sizes[box_i]
        target[row, col, slot, 4] = 1.
        target[row, col, slot, 5 + cls] = 1.
        target[row, col, slot, -1] = boxes[box_i, -1]

    return y_true_13, y_true_26, y_true_52
def parse_data(line, class_num, img_size, anchors, mode):
    """Load one sample (or a mix-up pair) and build the network input and targets.

    Args:
        line: one annotation line, or a two-element list of lines when the
            sample is a mix-up of two images.
        class_num: total number of classes.
        img_size: [width, height] the image is resized to.
        anchors: anchors, forwarded to process_box.
        mode: 'train' or 'val', as str or bytes. Data augmentation is applied
            only in 'train' mode.

    Returns:
        (img_idx, img, y_true_13, y_true_26, y_true_52), where img is the
        RGB image as float32 scaled by 1/255.

    Raises:
        IOError: if an image file cannot be read.
    """
    # A string passed through tf.py_func reaches Python 3 code as bytes, and
    # bytes never compare equal to 'train'; normalize before comparing.
    if isinstance(mode, bytes):
        mode = mode.decode()

    if not isinstance(line, list):
        # Plain sample: a single annotation line.
        img_idx, pic_path, boxes, labels = parse_line(line)
        img = _read_image(pic_path)
        # expand the 2nd dimension, mix up weight default to 1.
        boxes = np.concatenate((boxes, np.full(shape=(boxes.shape[0], 1), fill_value=1., dtype=np.float32)), axis=-1)
    else:
        # the mix up case: blend two images and merge their boxes and labels.
        _, pic_path1, boxes1, labels1 = parse_line(line[0])
        img1 = _read_image(pic_path1)
        img_idx, pic_path2, boxes2, labels2 = parse_line(line[1])
        img2 = _read_image(pic_path2)
        img, boxes = mix_up(img1, img2, boxes1, boxes2)
        labels = np.concatenate((labels1, labels2))

    if mode == 'train':
        # random color jittering
        # NOTE: applying color distort may lead to bad performance sometimes
        # img = random_color_distort(img)

        # random expansion with prob 0.5
        if np.random.uniform(0, 1) > 0.5:
            img, boxes = random_expand(img, boxes, 2)

        # random cropping
        h, w, _ = img.shape
        boxes, crop = random_crop_with_constraints(boxes, (w, h))
        x0, y0, w, h = crop
        img = img[y0: y0+h, x0: x0+w]

        # resize with random interpolation
        interp = np.random.randint(0, 5)
        img, boxes = resize_with_bbox(img, boxes, img_size[0], img_size[1], interp)

        # random horizontal flip
        img, boxes = random_flip(img, boxes, px=0.5)
    else:
        img, boxes = resize_with_bbox(img, boxes, img_size[0], img_size[1], interp=1)

    # OpenCV loads images as BGR; the network expects RGB.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
    # the input of yolo_v3 should be in range 0~1
    img = img / 255.

    # Turn the ground-truth boxes into the per-scale target tensors.
    y_true_13, y_true_26, y_true_52 = process_box(boxes, labels, img_size, class_num, anchors)
    return img_idx, img, y_true_13, y_true_26, y_true_52


def _read_image(pic_path):
    """Read an image with OpenCV, raising IOError when it cannot be loaded."""
    img = cv2.imread(pic_path)
    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise IOError('Failed to read image: {}'.format(pic_path))
    return img
def get_batch_data(batch_line, class_num, img_size, anchors, mode, multi_scale=False, mix_up=False, interval=10):
    """Generate one batch of images and labels.

    Args:
        batch_line: a batch of lines from the train/val txt file.
        class_num: total number of classes.
        img_size: [width, height] the images are resized to.
        anchors: anchors, shape [9, 2].
        mode: 'train' or 'val', as str or bytes. Augmentation, multi-scale
            and mix-up only take effect in 'train' mode.
        multi_scale: whether to pick the input size at random from
            [320, 320] .. [608, 608] in steps of 32.
        mix_up: whether to apply the mix-up strategy. (The parameter shadows
            the imported mix_up function inside this function only.)
        interval: the input size changes every `interval` calls. Note that
            this is not strictly deterministic when calls run in parallel.

    Returns:
        (img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch,
        y_true_52_batch) as NumPy arrays; img_idx_batch is int64.
    """
    global iter_cnt

    # A string passed through tf.py_func reaches Python 3 code as bytes, and
    # bytes never compare equal to 'train'; normalize before comparing.
    if isinstance(mode, bytes):
        mode = mode.decode()
    is_train = mode == 'train'

    # multi_scale training
    if multi_scale and is_train:
        # A private generator yields the same size schedule as seeding the
        # global one, without resetting the global random state that the
        # mix-up pairing below relies on.
        size_rng = random.Random(iter_cnt // interval)
        random_img_size = [[x * 32, x * 32] for x in range(10, 20)]
        img_size = size_rng.sample(random_img_size, 1)[0]
    iter_cnt += 1

    img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch = [], [], [], [], []

    # mix up strategy: with probability 0.5 pair a line with another line of
    # the same batch; parse_data treats a two-element list as a mix-up sample.
    if mix_up and is_train:
        mix_lines = []
        batch_line = batch_line.tolist()
        for idx, line in enumerate(batch_line):
            # A batch with a single line has no partner to mix with.
            if np.random.uniform(0, 1) < 0.5 and len(batch_line) > 1:
                others = batch_line[:idx] + batch_line[idx + 1:]
                mix_lines.append([line, random.sample(others, 1)[0]])
            else:
                mix_lines.append(line)
        batch_line = mix_lines

    for line in batch_line:
        img_idx, img, y_true_13, y_true_26, y_true_52 = parse_data(line, class_num, img_size, anchors, mode)
        img_idx_batch.append(img_idx)
        img_batch.append(img)
        y_true_13_batch.append(y_true_13)
        y_true_26_batch.append(y_true_26)
        y_true_52_batch.append(y_true_52)

    img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch = np.asarray(img_idx_batch, np.int64), np.asarray(img_batch), np.asarray(y_true_13_batch), np.asarray(y_true_26_batch), np.asarray(y_true_52_batch)
    return img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch