YOLOX: Overall Structure and Data Augmentation
Overview
Paper: [2107.08430] YOLOX: Exceeding YOLO Series in 2021 (arxiv.org)
With roughly the same parameter count as YOLOv4-CSP and YOLOv5-L, YOLOX-L reaches 50.0% AP on COCO (1.8% AP higher than YOLOv5-L) and runs at 68.9 FPS on a single Tesla V100.
Main Structure
① Input: two data augmentation methods
② Backbone: Darknet53
③ Neck: FPN structure
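To make this three-stage layout concrete, here is a minimal composition sketch in PyTorch. The class and module names are illustrative placeholders, not the actual YOLOX classes:

import torch.nn as nn

# Hypothetical composition sketch: input -> backbone -> neck (FPN) -> head.
class YOLOXLike(nn.Module):
    def __init__(self, backbone, neck, head):
        super().__init__()
        self.backbone = backbone  # e.g. a Darknet53 returning multi-scale features
        self.neck = neck          # an FPN that fuses the feature maps
        self.head = head          # the prediction head

    def forward(self, x):
        feats = self.backbone(x)  # feature maps at several strides
        fused = self.neck(feats)  # FPN-fused features
        return self.head(fused)   # per-scale predictions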
Base Network
YOLOv3 + DarkNet53 serves as the baseline.
The YOLOv3 baseline uses a DarkNet53 backbone plus an SPP layer (the so-called YOLOv3-SPP).
This baseline model reaches 38.5% AP on COCO val.
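For reference, the SPP layer mentioned above can be sketched as follows. This is a minimal version: the 5/9/13 kernel sizes follow the common YOLOv3-SPP convention, and the 1x1 convolutions around the block are omitted:

import torch
import torch.nn as nn

class SPP(nn.Module):
    # Spatial Pyramid Pooling: concatenate the input with three max-pooled
    # views of itself, enlarging the receptive field at a fixed spatial size.
    def __init__(self, kernel_sizes=(5, 9, 13)):
        super().__init__()
        self.pools = nn.ModuleList(
            nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
            for k in kernel_sizes
        )

    def forward(self, x):
        return torch.cat([x] + [pool(x) for pool in self.pools], dim=1)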
Data Augmentation
Strong data augmentation: YOLOX uses Mosaic and MixUp to improve performance. Mosaic is an effective augmentation strategy proposed in a YOLOv3 variant and since widely used in YOLOv4, YOLOv5, and other detectors. MixUp was originally designed for image classification, but the Bag-of-Freebies (BoF) work adapted it for detection training. YOLOX enables both MixUp and Mosaic and turns them off for the last 15 epochs; according to the paper, keeping these two augmentations on from start to finish actually brings little improvement.
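A minimal sketch of this schedule with hypothetical names (in the official repo the same switch is driven by the exp.no_aug_epochs setting):

MAX_EPOCH = 300
NO_AUG_EPOCHS = 15  # the last 15 epochs run without Mosaic/MixUp

def strong_aug_enabled(epoch: int) -> bool:
    # True while Mosaic/MixUp should stay on (hypothetical helper)
    return epoch < MAX_EPOCH - NO_AUG_EPOCHS

for epoch in range(MAX_EPOCH):
    enable_mosaic = enable_mixup = strong_aug_enabled(epoch)
    # ... pass these flags to the dataset before building this epoch's loader ...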
Input: Strong Augmentation
At the input stage, YOLOX mainly applies two augmentation methods: Mosaic and MixUp.
Mosaic stitches four images together through random scaling, random cropping, and random arrangement. It gives a solid boost to small-object detection and is also widely used in YOLOv4 and YOLOv5; it is a very effective augmentation.
MixUp is an additional augmentation strategy applied on top of Mosaic.
Mosaic Data Augmentation
Code location: yolox/data/datasets/mosaicdetection.py
Core code:
def get_mosaic_coordinate(mosaic_image, mosaic_index, xc, yc, w, h, input_h, input_w):
    # Returns (canvas region on the 2*input mosaic image, source region of the tile).
    # index0: top-left part of image
    if mosaic_index == 0:
        x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc
        small_coord = w - (x2 - x1), h - (y2 - y1), w, h
    # index1: top-right part of image
    elif mosaic_index == 1:
        x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc
        small_coord = 0, h - (y2 - y1), min(w, x2 - x1), h
    # index2: bottom-left part of image
    elif mosaic_index == 2:
        x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h)
        small_coord = w - (x2 - x1), 0, w, min(y2 - y1, h)
    # index3: bottom-right part of image
    elif mosaic_index == 3:
        x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2, yc + h)  # noqa
        small_coord = 0, 0, min(w, x2 - x1), min(y2 - y1, h)
    return (x1, y1, x2, y2), small_coord
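As a quick numeric sanity check on the quadrant math (made-up values): with a 640x640 input, the mosaic canvas is 1280x1280, and for the top-left tile (index 0) the source image is cropped so that its bottom-right corner lands exactly on the center (xc, yc):

# Top-left quadrant with xc=300, yc=700 and a resized tile of w=640, h=480.
(x1, y1, x2, y2), (s_x1, s_y1, s_x2, s_y2) = get_mosaic_coordinate(
    None, 0, xc=300, yc=700, w=640, h=480, input_h=640, input_w=640
)
print((x1, y1, x2, y2))          # (0, 220, 300, 700) - region on the canvas
print((s_x1, s_y1, s_x2, s_y2))  # (340, 0, 640, 480) - region of the source tile
# Only a 300x480 window fits to the left of xc, so the right 300 columns
# of the 640x480 tile are used.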
The companion code that drives it lives in the class MosaicDetection(Dataset):
# random mosaic center point
yc = int(random.uniform(0.5 * input_h, 1.5 * input_h))
xc = int(random.uniform(0.5 * input_w, 1.5 * input_w))

# 3 additional image indices: the current image plus 3 randomly sampled ones
indices = [idx] + [random.randint(0, len(self._dataset) - 1) for _ in range(3)]

for i_mosaic, index in enumerate(indices):
    img, _labels, _, _ = self._dataset.pull_item(index)
    h0, w0 = img.shape[:2]  # orig hw
    scale = min(1. * input_h / h0, 1. * input_w / w0)
    img = cv2.resize(
        img, (int(w0 * scale), int(h0 * scale)), interpolation=cv2.INTER_LINEAR
    )
    # generate output mosaic image
    (h, w, c) = img.shape[:3]
    if i_mosaic == 0:
        mosaic_img = np.full((input_h * 2, input_w * 2, c), 114, dtype=np.uint8)

    # suffix l means large image, while s means small image in mosaic aug.
    (l_x1, l_y1, l_x2, l_y2), (s_x1, s_y1, s_x2, s_y2) = get_mosaic_coordinate(
        mosaic_img, i_mosaic, xc, yc, w, h, input_h, input_w
    )

    # paste the resized tile into its quadrant of the mosaic canvas
    mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2]
    padw, padh = l_x1 - s_x1, l_y1 - s_y1

    labels = _labels.copy()
    # scale the pixel xyxy boxes and shift them into the mosaic canvas
    if _labels.size > 0:
        labels[:, 0] = scale * _labels[:, 0] + padw
        labels[:, 1] = scale * _labels[:, 1] + padh
        labels[:, 2] = scale * _labels[:, 2] + padw
        labels[:, 3] = scale * _labels[:, 3] + padh
    mosaic_labels.append(labels)

# concatenate the labels of all four tiles and clip them to the canvas
if len(mosaic_labels):
    mosaic_labels = np.concatenate(mosaic_labels, 0)
    np.clip(mosaic_labels[:, 0], 0, 2 * input_w, out=mosaic_labels[:, 0])
    np.clip(mosaic_labels[:, 1], 0, 2 * input_h, out=mosaic_labels[:, 1])
    np.clip(mosaic_labels[:, 2], 0, 2 * input_w, out=mosaic_labels[:, 2])
    np.clip(mosaic_labels[:, 3], 0, 2 * input_h, out=mosaic_labels[:, 3])

# random rotation/translation/scale/shear applied to image and labels
mosaic_img, mosaic_labels = random_perspective(
    mosaic_img,
    mosaic_labels,
    degrees=self.degrees,
    translate=self.translate,
    scale=self.scale,
    shear=self.shear,
    perspective=self.perspective,
    border=[-input_h // 2, -input_w // 2],
)  # border to remove

# MixUp augmentation applied on top of the mosaic result
if (
    self.enable_mixup
    and not len(mosaic_labels) == 0
    and random.random() < self.mixup_prob
):
    mosaic_img, mosaic_labels = self.mixup(mosaic_img, mosaic_labels, self.input_dim)
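To see what the box mapping above does, here is a small numeric check with made-up values: scale = 0.5, and a tile pasted with offsets padw = 160 and padh = 220:

import numpy as np

_labels = np.array([[10.0, 20.0, 110.0, 220.0]])  # one xyxy box in the source image
scale, padw, padh = 0.5, 160, 220

labels = _labels.copy()
labels[:, 0::2] = scale * _labels[:, 0::2] + padw  # x1, x2 shifted into the canvas
labels[:, 1::2] = scale * _labels[:, 1::2] + padh  # y1, y2 shifted into the canvas
print(labels)  # [[165. 230. 215. 330.]]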
MixUp Data Augmentation
MixUp is an additional augmentation strategy applied on top of Mosaic.
The augmentation principle comes from this paper: https://arxiv.org/pdf/2012.07177.pdf
Blog explanation (Chinese): https://zhuanlan.zhihu.com/p/397993315
Main idea: copy-paste data between images to augment the training set.
Main steps:
- Read one image, pad it on the left and right, and scale it into 640x640; it contains a detection box.
- Randomly pick another image, pad it on the top and bottom, and scale it into 640x640; it also contains a detection box.
- Set a fusion coefficient and blend the two images, producing a new image that carries both detection boxes (see the sketch below).
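Before walking through the full implementation, the core fusion step can be isolated like this (a sketch with dummy data; YOLOX blends with a fixed 0.5/0.5 weight and simply concatenates the two box sets):

import numpy as np

img_a = np.full((640, 640, 3), 50, dtype=np.uint8)   # dummy first image
img_b = np.full((640, 640, 3), 200, dtype=np.uint8)  # dummy second image
boxes_a = np.array([[100.0, 100.0, 200.0, 200.0]])
boxes_b = np.array([[300.0, 300.0, 400.0, 450.0]])

# blend the pixels, keep the boxes of both images
mixed = (0.5 * img_a.astype(np.float32) + 0.5 * img_b.astype(np.float32)).astype(np.uint8)
boxes = np.vstack([boxes_a, boxes_b])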
The self.mixup implementation:
def mixup(self, origin_img, origin_labels, input_dim):
    # random scale jitter and random horizontal flip for the pasted image
    jit_factor = random.uniform(*self.mixup_scale)
    FLIP = random.uniform(0, 1) > 0.5
    # keep sampling until we draw an image that has at least one annotation
    cp_labels = []
    while len(cp_labels) == 0:
        cp_index = random.randint(0, self.__len__() - 1)
        cp_labels = self._dataset.load_anno(cp_index)
    img, cp_labels, _, _ = self._dataset.pull_item(cp_index)

    # letterbox the sampled image onto a gray (value 114) canvas of input_dim
    if len(img.shape) == 3:
        cp_img = np.ones((input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114
    else:
        cp_img = np.ones(input_dim, dtype=np.uint8) * 114
    cp_scale_ratio = min(input_dim[0] / img.shape[0], input_dim[1] / img.shape[1])
    resized_img = cv2.resize(
        img,
        (int(img.shape[1] * cp_scale_ratio), int(img.shape[0] * cp_scale_ratio)),
        interpolation=cv2.INTER_LINEAR,
    )
    cp_img[: int(img.shape[0] * cp_scale_ratio), : int(img.shape[1] * cp_scale_ratio)] = resized_img

    # apply the scale jitter, then the optional horizontal flip
    cp_img = cv2.resize(
        cp_img,
        (int(cp_img.shape[1] * jit_factor), int(cp_img.shape[0] * jit_factor)),
    )
    cp_scale_ratio *= jit_factor
    if FLIP:
        cp_img = cp_img[:, ::-1, :]

    # pad (and randomly crop) the jittered image to the size of origin_img
    origin_h, origin_w = cp_img.shape[:2]
    target_h, target_w = origin_img.shape[:2]
    padded_img = np.zeros((max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8)
    padded_img[:origin_h, :origin_w] = cp_img
    x_offset, y_offset = 0, 0
    if padded_img.shape[0] > target_h:
        y_offset = random.randint(0, padded_img.shape[0] - target_h - 1)
    if padded_img.shape[1] > target_w:
        x_offset = random.randint(0, padded_img.shape[1] - target_w - 1)
    padded_cropped_img = padded_img[y_offset: y_offset + target_h, x_offset: x_offset + target_w]

    # transform the boxes: scale, flip, then shift by the crop offset and clip
    cp_bboxes_origin_np = adjust_box_anns(cp_labels[:, :4].copy(), cp_scale_ratio, 0, 0, origin_w, origin_h)
    if FLIP:
        cp_bboxes_origin_np[:, 0::2] = (origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1])
    cp_bboxes_transformed_np = cp_bboxes_origin_np.copy()
    cp_bboxes_transformed_np[:, 0::2] = np.clip(cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w)
    cp_bboxes_transformed_np[:, 1::2] = np.clip(cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h)

    # keep only boxes that survive the crop, then blend the two images 50/50
    keep_list = box_candidates(cp_bboxes_origin_np.T, cp_bboxes_transformed_np.T, 5)
    if keep_list.sum() >= 1.0:
        cls_labels = cp_labels[keep_list, 4:5].copy()
        box_labels = cp_bboxes_transformed_np[keep_list]
        labels = np.hstack((box_labels, cls_labels))
        origin_labels = np.vstack((origin_labels, labels))
        origin_img = origin_img.astype(np.float32)
        origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(np.float32)

    return origin_img.astype(np.uint8), origin_labels
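Note that the final blend uses a fixed 0.5/0.5 weight rather than sampling the mixing coefficient from a Beta distribution as in the original MixUp paper; together with the scale jitter (jit_factor) and random flip, this variant behaves more like the copy-paste augmentation referenced above.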
The paper highlights two points:
(1) In the last 15 epochs of training, these two augmentations are switched off.
Before that point, both Mosaic and MixUp stay on; this detail is worth noting.
(2) Because of the stronger augmentation, the authors found that ImageNet pre-training gave no benefit, so all models are trained from scratch.
My understanding of this part of the code is still not entirely clear, so I will analyze other content first.