ResNet 主干特征提取
殘差網絡根據堆疊的層數不同,采用兩個不同的單元。
ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) //18
ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) // 34
ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) //50
ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) //101
ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) //152
為了保證可以相加,如果經過卷積后輸出的h、w發生改變 ,在分支上會加上一個下采樣的單元
if stride != 1 or self.inplanes != planes * block.expansion: # 1x1卷積,保證維度相同,可以相加 downsample = nn.Sequential( nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), norm_layer(planes * block.expansion) )
相比於圖像分類采用的殘差網絡,在PSPnet中后兩個layer中strides=1,即不對圖像進行下采樣,最終得到 /8的feature_map
self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer)
self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2, norm_layer=norm_layer)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4, norm_layer=norm_layer)
為什么只進行了8倍的下采樣呢?因為下采樣越多對於圖像的細節信息損失的就越多,為了避免太多細節信息的損失
所以相對於以前的分類問題,只進行了8倍的下采樣。但這會帶來一個新的問題,卷積操作的視野域減小
輸出的feature_map上損失了全局信息。如何解決視野域的問題呢?答案就是空洞卷積。即在3*3卷積核中間填充0。

空洞卷積有一個參數可以設置dilation rate,具體含義就是在卷積核中填充dilation rate-1個0,
因此,當設置不同dilation rate時,感受野就會不一樣,也即獲取了多尺度信息。其計算輸出的方式與普通卷積一樣
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation, bias=False)
當kernel_size = 3 , strides=1時,設置dilation=2, kernel_size 被擴充為5 , 當padding=2 時,輸出的分辨率和輸入保持一致
而視野域相當於kernel_size = 5 的視野域。
還有一個好處,相比於分類采用的下采樣32倍的殘差網絡,參數量並沒有增加,所以可以采用遷移學習,直接用預訓練網絡的權重
Pyramid parsing module(PPM)
在feature map上做不同尺度的自適應平均池化(AdaptiveAvgPool2d)
prior = nn.AdaptiveAvgPool2d(output_size=bin_sz)
通過指定輸出的大小,自適應調節kernel_size 和 strides 來進行平均池化
在PSPNet 中分別進行了四種不同尺度的均值池化,分別是(1, 2, 3, 6)
最后對這四個特征層進行上采樣到與原feature_map同樣大小,並與feature_map進行堆疊
pyramids = [features]
pyramids.extend([F.interpolate(stage(features), size=(h,w), mode='bilinear', align_corners=True)
for stage in self.stages])
torch.cat(pyramids, dim=1)
class _PSPModule(nn.Module): def __init__(self, in_channels, bin_sizes, norm_layer): super(_PSPModule, self).__init__() out_channels = in_channels // len(bin_sizes) # 不同大小均值池化,獲得不同大小視野域 self.stages = nn.ModuleList([self._make_stages(in_channels, out_channels, b_s, norm_layer) for b_s in bin_sizes]) self.bottleneck = nn.Sequential(nn.Conv2d(in_channels+out_channels*len(bin_sizes), out_channels, kernel_size=3, padding=1, bias=False), norm_layer(out_channels), nn.ReLU(inplace=True), nn.Dropout2d(0.1)) def _make_stages(self, in_channels, out_channels, bin_sz, norm_layer): # 采用池化下采樣 prior = nn.AdaptiveAvgPool2d(output_size=bin_sz) # 1X1 壓縮維度 conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False) bn = norm_layer(out_channels) relu = nn.ReLU(inplace=True) return nn.Sequential(prior, conv, bn, relu) def forward(self, features): h, w = features.size(2), features.size(3) pyramids = [features] # 組合不同視野域信息和淺層信息 # 雙線性插值 上采樣 pyramids.extend([F.interpolate(stage(features), size=(h,w), mode='bilinear', align_corners=True) for stage in self.stages ]) output = self.bottleneck(torch.cat(pyramids, dim=1)) return output
PSPNet總體結構
class PSPNet(nn.Module):
    """PSPNet semantic segmentation network.

    A dilated ResNet backbone (output stride 8) feeds a Pyramid Pooling
    Module (master branch). During training an auxiliary head on the
    layer3 output provides an extra supervision signal.

    Args:
        num_classes: number of output segmentation classes.
        backbone: backbone constructor (torchvision-style ResNet).
        in_channels: number of input image channels.
        pretrained: forwarded to the backbone constructor.
        use_aux: if True, return (output, aux) while training.
        freeze_bn: freeze all BatchNorm layers after construction.
        freeze_backbone: make the backbone non-trainable.
    """

    def __init__(self, num_classes, backbone=resnet152, in_channels=3, pretrained=True,
                 use_aux=True, freeze_bn=False, freeze_backbone=False):
        super(PSPNet, self).__init__()
        norm_layer = nn.BatchNorm2d
        model = backbone(pretrained, norm_layer=norm_layer)
        # Channel count of the backbone's last stage
        # (512 * block.expansion for torchvision-style ResNets).
        m_out_sz = model.fc.in_features
        print("全連接層輸入", m_out_sz)
        self.use_aux = use_aux

        # Stem: conv1, bn1, relu, maxpool (overall /4 downsampling).
        self.initial = nn.Sequential(*list(model.children())[:4])
        if in_channels != 3:
            # Swap the first conv so the stem accepts arbitrary input channels.
            # (The original code also re-wrapped self.initial in a fresh
            # nn.Sequential here, which was a no-op and has been removed.)
            self.initial[0] = nn.Conv2d(in_channels, 64, kernel_size=7,
                                        stride=2, padding=3, bias=False)
        self.layer1 = model.layer1
        self.layer2 = model.layer2
        self.layer3 = model.layer3
        self.layer4 = model.layer4

        # Master branch: PPM fuses multi-scale context, then a 1x1 conv
        # maps to class scores.
        #   2048*n*n -> pooled to 1/2/3/6 -> 512 each -> upsampled to n*n
        #   (2048 + 512*4)*n*n -> 512*n*n -> num_classes*n*n
        self.master_branch = nn.Sequential(
            _PSPModule(m_out_sz, bin_sizes=[1, 2, 3, 6], norm_layer=norm_layer),
            nn.Conv2d(m_out_sz // 4, num_classes, kernel_size=1)
        )

        # Auxiliary head on the layer3 output (training-time only):
        #   1024*2n*2n -> 512*2n*2n -> num_classes*2n*2n
        self.auxiliary_branch = nn.Sequential(
            nn.Conv2d(m_out_sz // 2, m_out_sz // 4, kernel_size=3,
                      padding=1, bias=False),
            norm_layer(m_out_sz // 4),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.1),
            nn.Conv2d(m_out_sz // 4, num_classes, kernel_size=1)
        )

        # Initialize only the newly added heads; backbone keeps its
        # (possibly pretrained) weights.
        initialize_weights(self.master_branch, self.auxiliary_branch)
        if freeze_bn:
            self.freeze_bn()
        if freeze_backbone:
            set_trainable([self.initial, self.layer1, self.layer2,
                           self.layer3, self.layer4], False)

    def forward(self, x):
        input_size = (x.size(2), x.size(3))
        x = self.initial(x)   # /4
        x = self.layer1(x)    # /4
        x = self.layer2(x)    # /8
        x = self.layer3(x)    # /8 (dilated, stride 1)
        x_aux = x             # tap for the auxiliary head
        x = self.layer4(x)    # /8 (dilated, stride 1)

        output = self.master_branch(x)  # num_classes at /8 resolution
        # Upsample class scores to the original input resolution.
        # NOTE(review): the PPM uses align_corners=True but this resize
        # relies on the default (False) — looks unintentional; confirm
        # before unifying, as changing it alters the output values.
        output = F.interpolate(output, size=input_size, mode='bilinear')

        if self.training and self.use_aux:
            aux = self.auxiliary_branch(x_aux)
            aux = F.interpolate(aux, size=input_size, mode='bilinear')
            return output, aux
        return output