yolox 主體結構-Darknet53
概要:
詳細解讀可參考:https://jishuin.proginn.com/p/763bfbd628ce
源碼對應的文件是 yolo_fpn 中的類 YOLOFPN,該模型結構中沒有 Focus 結構。
除此之外還有一個模型文件 yolo_pafpn,其中的 YOLOPAFPN 包含的是一個 CSPDarknet 的模型結構。
Yolox-Darknet53整體的改進思路
以Yolov3_spp結構為基准,並添加一些常用的改進方式:
- 基准模型:Yolov3_spp

- Yolox模型結構:在基准模型上添加一些常用的改進方式

本文只介紹 backbone 模塊
- 該模型結構中沒有 Focus 結構
backbone模塊
主網絡為Darknet21 或者 Darknet53
參考文獻:https://blog.csdn.net/jizhidexiaoming/article/details/119760198
激活函數
代碼位置:network_blocks.py
def get_activation(name="silu", inplace=True):
    """Build the activation module selected by ``name``.

    Args:
        name: One of ``"silu"``, ``"relu"`` or ``"lrelu"``.
        inplace: Forwarded to the activation's ``inplace`` argument.

    Returns:
        ``nn.SiLU``, ``nn.ReLU`` or ``nn.LeakyReLU`` (negative slope 0.1).

    Raises:
        AttributeError: If ``name`` is not one of the supported keys.
    """
    if name == "silu":
        return nn.SiLU(inplace=inplace)
    if name == "relu":
        return nn.ReLU(inplace=inplace)
    if name == "lrelu":
        return nn.LeakyReLU(0.1, inplace=inplace)
    raise AttributeError("Unsupported act type: {}".format(name))
CBL模塊
代碼位置:network_blocks.py
即Conv + BN + LeakyReLU的縮寫,在BaseConv中實現(BaseConv默認的激活函數是silu,Darknet中調用時傳入act="lrelu")。
stride=1時不改變特征圖的HW,即圖像大小不變,只有圖像深度(通道數)變化;stride=2時HW減半。
- Conv
- BN
- LeakyReLU
class BaseConv(nn.Module):
    """A Conv2d -> BatchNorm2d -> activation block.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        ksize: Convolution kernel size.
        stride: Convolution stride.
        groups: Number of convolution groups.
        bias: Whether the convolution has a bias term.
        act: Activation key passed to ``get_activation``.
    """

    def __init__(
        self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"
    ):
        super().__init__()
        # "Same" padding: for an odd kernel and stride 1 the output keeps H and W.
        pad = (ksize - 1) // 2
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=ksize,
            stride=stride,
            padding=pad,
            groups=groups,
            bias=bias,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = get_activation(act, inplace=True)

    def forward(self, x):
        """Apply convolution, batch norm and activation in that order."""
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        """Apply convolution and activation only, skipping batch norm.

        NOTE(review): presumably intended for use after the batch norm has
        been folded into the convolution weights -- confirm against callers.
        """
        return self.act(self.conv(x))
ResUnit單個殘差塊
代碼位置:network_blocks.py
典型的沙漏型殘差塊,通道數2C -> C -> 2C
在類ResLayer中實現,不改變特征圖的CHW。圖像深度不變
- 先用1x1的卷積,將通道減半;
- 然后接一個3x3的BaseConv,將通道數復原,得到一個結果;
- 把上面的結果和輸入逐元素相加(是add,不是concat),即可。
class ResLayer(nn.Module):
    """Residual layer with `in_channels` inputs.

    A 1x1 conv halves the channels, a 3x3 conv restores them, and the result
    is added element-wise to the input, so the output has the same channel
    count as the input.
    """

    def __init__(self, in_channels: int):
        super().__init__()
        mid_channels = in_channels // 2
        self.layer1 = BaseConv(
            in_channels, mid_channels, ksize=1, stride=1, act="lrelu"
        )
        self.layer2 = BaseConv(
            mid_channels, in_channels, ksize=3, stride=1, act="lrelu"
        )

    def forward(self, x):
        """Return ``x`` plus the output of the two stacked convolutions."""
        residual = self.layer2(self.layer1(x))
        return x + residual
Resx模塊
代碼位置:darknet.py
多個殘差塊組成。殘差塊的個數分別是1,2,8,8,4。后面5個模塊都可以作為輸出層。這里是后面三個作為輸出層。
實現上,這里把Darknet53分成了5部分,如下。
Res1,Res2,Res8,Res8,Res4
通用組件
## num_blocks 為殘差重復的次數
## depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]}
## Darknet21 重復的次數分別為[1,2,2,1,]
## Darknet53 重復的次數分別為[2,8,8,4,]
def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1):
    """Start with a conv layer, then append `num_blocks` `ResLayer`s.

    Args:
        in_channels: Channels entering the group; the leading 3x3 conv
            doubles this to ``in_channels * 2``.
        num_blocks: How many residual layers follow the leading conv.
        stride: Stride of the leading conv.

    Returns:
        A list of modules: one ``BaseConv`` followed by ``num_blocks``
        ``ResLayer`` instances, each operating on ``in_channels * 2`` channels.
    """
    out_channels = in_channels * 2
    head = BaseConv(in_channels, out_channels, ksize=3, stride=stride, act="lrelu")
    return [head, *[ResLayer(out_channels) for _ in range(num_blocks)]]
核心 Resx
# Darknet 類中
def forward(self, x):
    """Run the backbone stage by stage and return the requested features.

    Every stage output is recorded under its stage name ("stem", "dark2",
    "dark3", "dark4", "dark5"); only the entries whose name appears in
    ``self.out_features`` are returned.

    Args:
        x: Input passed to ``self.stem``.

    Returns:
        Dict mapping stage name to that stage's output, in stage order.
    """
    outputs = {}
    x = self.stem(x)
    outputs["stem"] = x
    x = self.dark2(x)
    outputs["dark2"] = x
    x = self.dark3(x)
    outputs["dark3"] = x
    x = self.dark4(x)
    outputs["dark4"] = x
    x = self.dark5(x)
    outputs["dark5"] = x
    # Keep only the stages the caller asked for.
    return {k: v for k, v in outputs.items() if k in self.out_features}
self.stem(x)
self.stem(x) 包含兩個部分:CBL 和 Res1
# Stem: a stride-1 3x3 conv (in_channels -> stem_out_channels) followed by
# one group layer, i.e. a stride-2 3x3 conv that doubles the channels plus a
# single ResLayer.
self.stem = nn.Sequential(
BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"),
*self.make_group_layer(stem_out_channels, num_blocks=1, stride=2),)
self.dark2
self.dark2 包含了 Res2
# Residual-block counts for the dark2..dark5 stages, looked up by depth.
num_blocks = Darknet.depth2blocks[depth]
# dark2: stride-2 group layer with num_blocks[0] residual layers.
self.dark2 = nn.Sequential(
*self.make_group_layer(in_channels, num_blocks[0], stride=2),)
self.dark3 / self.dark4
self.dark3 包含了第一個 Res8 的部分
self.dark4 包含了第二個 Res8 的部分
# Each group layer doubles its input channels (its leading conv maps
# in_channels -> in_channels * 2), so in_channels is doubled before the next
# stage is built. Statement order matters here.
in_channels *= 2 # 128
# dark3: stride-2 group layer with num_blocks[1] residual layers.
self.dark3 = nn.Sequential(
*self.make_group_layer(in_channels, num_blocks[1], stride=2))
in_channels *= 2 # 256
# dark4: stride-2 group layer with num_blocks[2] residual layers.
self.dark4 = nn.Sequential(
*self.make_group_layer(in_channels, num_blocks[2], stride=2))
self.dark5
self.dark5 包含了 Res4, 2 * CBL + SPP + 2 * CBL.
in_channels *= 2 # 512
# dark5: stride-2 group layer with num_blocks[3] residual layers (output has
# in_channels * 2 channels), followed by the SPP block, which takes those
# in_channels * 2 channels and ends on filters_list[0] == in_channels.
self.dark5 = nn.Sequential(
*self.make_group_layer(in_channels, num_blocks[3], stride=2),
*self.make_spp_block([in_channels, in_channels * 2], in_channels * 2),
)
# SPP block implementation
def make_spp_block(self, filters_list, in_filters):
    """Build the conv / SPP / conv stack appended to the last stage.

    Layout: 1x1 conv -> 3x3 conv -> SPPBottleneck -> 3x3 conv -> 1x1 conv,
    all with leaky-ReLU activations.

    Args:
        filters_list: Two channel counts ``[narrow, wide]`` that the layers
            alternate between.
        in_filters: Channel count of the incoming feature map.

    Returns:
        An ``nn.Sequential`` whose output has ``filters_list[0]`` channels.
    """
    narrow, wide = filters_list[0], filters_list[1]
    return nn.Sequential(
        BaseConv(in_filters, narrow, 1, stride=1, act="lrelu"),
        BaseConv(narrow, wide, 3, stride=1, act="lrelu"),
        SPPBottleneck(
            in_channels=wide,
            out_channels=narrow,
            activation="lrelu",
        ),
        BaseConv(narrow, wide, 3, stride=1, act="lrelu"),
        BaseConv(wide, narrow, 1, stride=1, act="lrelu"),
    )
SPPBottleneck 模塊
代碼位置:network_blocks.py
給定三種尺度的最大池化(5,9,13),核心的函數nn.MaxPool2d。
class SPPBottleneck(nn.Module):
    """Spatial pyramid pooling layer used in YOLOv3-SPP.

    A 1x1 conv halves the channels, the result is max-pooled once per entry
    in ``kernel_sizes``, the un-pooled and pooled maps are concatenated along
    the channel axis, and a final 1x1 conv maps them to ``out_channels``.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the final 1x1 conv.
        kernel_sizes: Max-pool kernel sizes, one pooling branch per entry.
        activation: Activation key forwarded to both ``BaseConv`` layers.
    """

    def __init__(
        self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu"
    ):
        super().__init__()
        hidden_channels = in_channels // 2
        self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation)
        # stride 1 with padding ks // 2 keeps H and W unchanged for odd ks,
        # which is what allows the branches to be concatenated.
        self.m = nn.ModuleList(
            [
                nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
                for ks in kernel_sizes
            ]
        )
        # One un-pooled branch plus one branch per pooling kernel.
        conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
        self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation)

    def forward(self, x):
        """Reduce channels, pool at every scale, concatenate and project."""
        x = self.conv1(x)
        x = torch.cat([x] + [m(x) for m in self.m], dim=1)
        x = self.conv2(x)
        return x
Neck模塊
在Neck結構中,Yolox-Darknet53和Yolov3 baseline的Neck結構,也是一樣的,都是采用FPN的結構進行融合。
FPN自頂向下,將高層的特征信息,通過上采樣的方式進行傳遞融合,得到進行預測的特征圖。

代碼位置: yolox/models/yolo_fpn.py
# Names of the backbone outputs the FPN consumes; forward() unpacks them in
# this order as (x2, x1, x0). Must be a plain list of strings, because each
# entry is used as a key into the backbone's output dict.
self.in_features = ["dark3", "dark4", "dark5"]
self.backbone = Darknet(depth)
def forward(self, inputs):
    """Fuse backbone features top-down.

    Args:
        inputs (Tensor): input image.

    Returns:
        Tuple[Tensor]: FPN output features, ordered
        ``(out_dark3, out_dark4, x0)`` where ``x0`` is the last entry of
        ``self.in_features`` taken from the backbone unchanged.
    """
    # Run the backbone first to obtain the image features.
    out_features = self.backbone(inputs)
    # Pick the requested levels, e.g. ["dark3", "dark4", "dark5"].
    x2, x1, x0 = [out_features[f] for f in self.in_features]

    # yolo branch 1: upsample the deepest feature and merge it with x1
    # along the channel axis.
    x1_in = self.out1_cbl(x0)
    x1_in = self.upsample(x1_in)
    x1_in = torch.cat([x1_in, x1], 1)
    out_dark4 = self.out1(x1_in)

    # yolo branch 2: upsample the fused middle feature and merge it with x2.
    x2_in = self.out2_cbl(out_dark4)
    x2_in = self.upsample(x2_in)
    x2_in = torch.cat([x2_in, x2], 1)
    out_dark3 = self.out2(x2_in)

    # Return the two fused maps together with the untouched deepest feature.
    outputs = (out_dark3, out_dark4, x0)
    return outputs
細節問題
# out 1: 1x1 conv reducing the deepest feature from 512 to 256 channels
# before it is upsampled.
self.out1_cbl = self._make_cbl(512, 256, 1)
# The embedding input is 512 + 256 channels: the 256 upsampled channels
# concatenated with a 512-channel feature (presumably dark4 -- confirm
# against the backbone). Its output has filters_list[0] == 256 channels.
self.out1 = self._make_embedding([256, 512], 512 + 256)
# out 2
self.out2_cbl = self._make_cbl(256, 128, 1)
# Same pattern one level up: 128 upsampled channels concatenated with a
# 256-channel feature (presumably dark3); output has 128 channels.
self.out2 = self._make_embedding([128, 256], 256 + 128)
# upsample
self.upsample = nn.Upsample(scale_factor=2, mode="nearest")
def _make_cbl(self, _in, _out, ks):
    """Return a stride-1 ``BaseConv`` with leaky-ReLU activation.

    Args:
        _in: Input channels.
        _out: Output channels.
        ks: Kernel size.
    """
    return BaseConv(_in, _out, ks, stride=1, act="lrelu")
def _make_embedding(self, filters_list, in_filters):
m = nn.Sequential(
*[
self._make_cbl(in_filters, filters_list[0], 1),
self._make_cbl(filters_list[0], filters_list[1], 3),
self._make_cbl(filters_list[1], filters_list[0], 1),
self._make_cbl(filters_list[0], filters_list[1], 3),
self._make_cbl(filters_list[1], filters_list[0], 1),
]
)
return m