Accuracy on CIFAR-10
These are the best results I obtained for each model; there are more detailed comparison experiments for the same models in the Experimental comparison section below.
MODEL | ACCURACY |
---|---|
VGG16 | 90.06% |
BN-VGG16 | 92.31% |
BN-Inception | 92.41% |
Inception-v3 | 92.94% |
ResNet-v1 | 93.54% |
ResNet-v2 | 95.35% |
Only ResNet is explained here; more code can be found on __my GitHub__.
PyTorch implementation of ResNet
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
#the ResNet variants implemented here and the download URLs of their pretrained weights
__all__=['ResNet','resnet18','resnet34','resnet50','resnet101','resnet152']
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes,out_planes,stride=1):
'''3x3 convolution with padding'''
return nn.Conv2d(in_planes,out_planes,kernel_size=3,stride=stride,padding=1,bias=False)
def conv1x1(in_planes,out_planes,stride=1):
'''1x1 convolution'''
return nn.Conv2d(in_planes,out_planes,kernel_size=1,stride=stride,bias=False)
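As a quick shape check (a small sketch I am adding, not part of the original code): with padding=1 the 3x3 convolution keeps the spatial size at stride=1 and halves it at stride=2, while the 1x1 convolution only changes the number of channels.
import torch

x = torch.randn(1, 64, 56, 56)                  # dummy feature map
print(conv3x3(64, 64)(x).shape)                 # torch.Size([1, 64, 56, 56])
print(conv3x3(64, 128, stride=2)(x).shape)      # torch.Size([1, 128, 28, 28])
print(conv1x1(64, 256)(x).shape)                # torch.Size([1, 256, 56, 56])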
Implementation of the basic residual block
class BasicBlock(nn.Module):
expansion=1
def __init__(self,inplanes,planes,stride=1,downsample=None):
super(BasicBlock,self).__init__()
self.conv1=conv3x3(inplanes,planes,stride)
self.bn1=nn.BatchNorm2d(planes)
self.relu=nn.ReLU(inplace=True)
self.conv2=conv3x3(planes,planes)
self.bn2=nn.BatchNorm2d(planes)
self.downsample=downsample
self.stride=stride
def forward(self,x):
residual=x
out=self.conv1(x)
out=self.bn1(out)
out=self.relu(out)
out=self.conv2(out)
out=self.bn2(out)
if self.downsample is not None:
residual=self.downsample(x)
        #the spatial resolution is only halved when the number of channels doubles,
        #i.e. only the first block of each stage uses stride=2
out+=residual
out=self.relu(out)
return out
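A minimal usage sketch (my own example, not from the original post): when the stride or the channel count changes, the identity branch needs the 1x1 downsample passed in, so that the two tensors being added have the same shape.
import torch

x = torch.randn(1, 64, 56, 56)
#same number of channels, stride 1: the input can be added directly
block = BasicBlock(64, 64)
print(block(x).shape)                        # torch.Size([1, 64, 56, 56])
#first block of a new stage: channels double, resolution halves
downsample = nn.Sequential(conv1x1(64, 128, stride=2), nn.BatchNorm2d(128))
block = BasicBlock(64, 128, stride=2, downsample=downsample)
print(block(x).shape)                        # torch.Size([1, 128, 28, 28])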
Implementation of the bottleneck residual block
The bottleneck block reduces the parameter count; in deeper networks, replacing the basic block with the bottleneck block cuts the number of parameters substantially (a rough comparison is sketched after the class below).
class Bottleneck(nn.Module):
expansion=4
def __init__(self,inplanes,planes,stride=1,downsample=None):
super(Bottleneck,self).__init__()
self.conv1=conv1x1(inplanes,planes)
self.bn1=nn.BatchNorm2d(planes)
self.conv2=conv3x3(planes,planes,stride)
        self.bn2=nn.BatchNorm2d(planes)
        self.conv3=conv1x1(planes,planes*self.expansion)
        self.bn3=nn.BatchNorm2d(planes*self.expansion)
        #the first 1x1 conv shrinks the channels (typically by 4x); the last 1x1 conv expands them back
self.relu=nn.ReLU(inplace=True)
self.downsample=downsample
self.stride=stride
def forward(self,x):
residual=x
out=self.conv1(x)
out=self.bn1(out)
out=self.relu(out)
out=self.conv2(out)
out=self.bn2(out)
out=self.relu(out)
out=self.conv3(out)
        out=self.bn3(out)
if self.downsample is not None:
residual=self.downsample(x)
out+=residual
out=self.relu(out)
return out
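To back up the parameter-saving claim above, here is a rough count I added (the 256-channel setting is just an illustrative choice): a basic block operating directly on 256 channels needs two 3x3x256x256 convolutions, whereas the bottleneck first shrinks to 64 channels with a 1x1 convolution.
def count_params(module):
    return sum(p.numel() for p in module.parameters())

print(count_params(BasicBlock(256, 256)))   # ~1.18M parameters (two 3x3 convs on 256 channels)
print(count_params(Bottleneck(256, 64)))    # ~0.07M parameters (1x1 -> 3x3 -> 1x1, 256->64->64->256)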
The overall structure of ResNet:
Detailed parameters of ResNets at different depths:
class ResNet(nn.Module):
    def __init__(self,block,layers,num_classes=1000):
        '''
        block: which residual block to use (BasicBlock or Bottleneck)
        layers: how many times the block is repeated in each of the four stages
        num_classes: number of outputs of the fc layer, i.e. the number of classes
        '''
super(ResNet, self).__init__()
self.inplanes=64
self.conv1=nn.Conv2d(3,64,kernel_size=7,stride=2,padding=3,bias=False)
self.bn1=nn.BatchNorm2d(64)
self.relu=nn.ReLU(inplace=True)
self.maxpool=nn.MaxPool2d(kernel_size=3,stride=2,padding=1)
        #the stem is identical for ResNets of every depth
        #input: 224x224, output: 56x56; both conv1 and the maxpool downsample
        #this first step discards a lot of information, presumably to save memory and computation
self.layer1=self._make_layer(block,64,layers[0])
self.layer2=self._make_layer(block,128,layers[1],stride=2)
self.layer3=self._make_layer(block,256,layers[2],stride=2)
self.layer4=self._make_layer(block,512,layers[3],stride=2)
        #each stage doubles the channels and halves the spatial resolution
self.avgpool=nn.AdaptiveAvgPool2d((1,1))
self.fc=nn.Linear(512*block.expansion,num_classes)
for m in self.modules():
if isinstance(m,nn.Conv2d):
nn.init.kaiming_normal_(m.weight,mode='fan_out',nonlinearity='relu')
elif isinstance(m,nn.BatchNorm2d):
nn.init.constant_(m.weight,1)
nn.init.constant_(m.bias,0)
    def _make_layer(self,block,planes,blocks,stride=1):
downsample=None
if stride!=1 or self.inplanes!=planes*block.expansion:
downsample=nn.Sequential(
conv1x1(self.inplanes,planes*block.expansion,stride),
nn.BatchNorm2d(planes*block.expansion),
)
        #if stride!=1, x itself also has to be downsampled;
        #the 1x1 conv here downsamples spatially and remixes the channel information at the same time
layers=[]
layers.append(block(self.inplanes,planes,stride,downsample))
self.inplanes=planes*block.expansion
for _ in range(1,blocks):
layers.append(block(self.inplanes,planes))
return nn.Sequential(*layers)
def forward(self,x):
x=self.conv1(x)
x=self.bn1(x)
x=self.relu(x)
x=self.maxpool(x)
x=self.layer1(x)
x=self.layer2(x)
x=self.layer3(x)
x=self.layer4(x)
x=self.avgpool(x)
x=x.view(x.size(0),-1)
x=self.fc(x)
return x
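To make the comments about resolutions concrete, here is a small trace I added for a resnet18-style configuration with a standard 224x224 input (assuming the corrected class above):
import torch

net = ResNet(BasicBlock, [2, 2, 2, 2])
x = torch.randn(1, 3, 224, 224)
x = net.maxpool(net.relu(net.bn1(net.conv1(x))))   # stem: 224 -> 112 -> 56
for stage in (net.layer1, net.layer2, net.layer3, net.layer4):
    x = stage(x)
    print(x.shape)
# [1, 64, 56, 56] -> [1, 128, 28, 28] -> [1, 256, 14, 14] -> [1, 512, 7, 7]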
def resnet18(pretrained=False,**kwargs):
model=ResNet(BasicBlock,[2,2,2,2],**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
return model
def resnet34(pretrained=False,**kwargs):
model=ResNet(BasicBlock,[3,4,6,3],**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
return model
def resnet50(pretrained=False,**kwargs):
model=ResNet(Bottleneck,[3,4,6,3],**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
return model
def resnet101(pretrained=False,**kwargs):
model=ResNet(Bottleneck,[3,4,23,3],**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
return model
def resnet152(pretrained=False,**kwargs):
model=ResNet(Bottleneck,[3,8,36,3],**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
return model
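A minimal end-to-end usage sketch (the dummy input and eval mode are my own choices; with pretrained=True the weights are fetched from model_urls via model_zoo):
import torch

model = resnet18(pretrained=False, num_classes=1000)
model.eval()
with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)    # torch.Size([1, 1000])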
The improved residual structure
The original residual structure described above works very well, but the 1202-layer network turned out to perform worse than the 110-layer one. According to the authors' analysis, deeper networks should perform better, so the degradation must come from the objective function being hard to optimize.
The authors first write the residual unit in a unified form, \(y_l=h(x_l)+F(x_l,W_l)\) and \(x_{l+1}=f(y_l)\):
The original shortcut already uses an identity mapping, so \(h(x_l)=x_l\); but the addition is followed by a ReLU, \(x_{l+1}=\mathrm{ReLU}(x_l+F(x_l,W_l))\), which is not an identity mapping. The authors argue that this information flow is what makes optimization hard, so they turn both operations into identity mappings, giving \(x_{l+1}=x_l+F(x_l,W_l)\).
The features of any deeper layer \(L\) can then be written as
\[x_L=x_l+\sum_{i=l}^{L-1}F(x_i,W_i)\]
and the gradient at any layer \(l\) becomes
\[\frac{\partial\mathcal{E}}{\partial x_l}=\frac{\partial\mathcal{E}}{\partial x_L}\Big(1+\frac{\partial}{\partial x_l}\sum_{i=l}^{L-1}F(x_i,W_i)\Big)\]
Thanks to the constant term 1, the gradient propagates directly to arbitrarily shallow layers and cannot vanish. The second term can be positive or negative, and with reasonable initialization the backward gradients are roughly Gaussian-distributed, so they do not explode either.
class BasicBlock(nn.Module):
expansion=1
def __init__(self,inplanes,planes,stride=1,downsample=None):
super(BasicBlock,self).__init__()
self.bn1=nn.BatchNorm2d(inplanes)
self.relu=nn.ReLU(inplace=True)
self.conv1=conv3x3(inplanes,planes,stride)
self.bn2=nn.BatchNorm2d(planes)
self.conv2=conv3x3(planes,planes)
self.downsample=downsample
self.stride=stride
def forward(self,x):
residual=x
out=self.bn1(x)
out=self.relu(out)
out=self.conv1(out)
out=self.bn2(out)
out=self.relu(out)
out=self.conv2(out)
if self.downsample is not None:
residual=self.downsample(x)
out+=residual
return out
class Bottleneck(nn.Module):
expansion=4
def __init__(self,inplanes,planes,stride=1,downsample=None):
super(Bottleneck,self).__init__()
self.bn1=nn.BatchNorm2d(inplanes)
self.conv1=conv1x1(inplanes,planes)
self.bn2=nn.BatchNorm2d(planes)
self.conv2=conv3x3(planes,planes,stride)
self.bn3 = nn.BatchNorm2d(planes)
self.conv3=conv1x1(planes,planes*self.expansion)
self.relu=nn.ReLU(inplace=True)
self.downsample=downsample
self.stride=stride
def forward(self,x):
residual=x
out=self.bn1(x)
out=self.relu(out)
out=self.conv1(out)
out=self.bn2(out)
out=self.relu(out)
out=self.conv2(out)
out=self.bn3(out)
out=self.relu(out)
out=self.conv3(out)
if self.downsample is not None:
residual=self.downsample(x)
out+=residual
return out
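One point these blocks do not cover (my addition, following the pre-activation paper rather than the original post): because every block now ends with a bare convolution, the network needs one extra BN + ReLU after the last residual block before pooling. Below is a minimal sketch of how the blocks could be plugged into the ResNet class above; the names PreActResNet and bn_final are my own.
class PreActResNet(ResNet):
    '''ResNet skeleton from above, reused with the pre-activation blocks.'''
    def __init__(self, block, layers, num_classes=1000):
        super(PreActResNet, self).__init__(block, layers, num_classes)
        #extra BN (followed by ReLU in forward) because the blocks end with a bare conv
        self.bn_final = nn.BatchNorm2d(512 * block.expansion)
    def forward(self, x):
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        x = self.layer4(self.layer3(self.layer2(self.layer1(x))))
        x = self.relu(self.bn_final(x))
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

#example: a pre-activation resnet50-style network
net = PreActResNet(Bottleneck, [3, 4, 6, 3])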
Experimental comparison
Test results on __CIFAR10__:
ResNet:
These are the results of the 18-, 34-, 50-, 101- and 152-layer ResNet-v1 described in the original paper (I forgot to add legends): the left plot shows the 18- and 34-layer networks, the right plot the 50-, 101- and 152-layer ones. The deeper the network, the lower the accuracy.


To cope with CIFAR-10's low resolution, I changed the first convolution from 7x7 to 3x3 and kept the spatial resolution (a code sketch of this change is given after the plots); everything else follows the settings of the original paper. To compare v1 and v2 and reproduce the claim that v1 gets worse as depth increases, I went up to 301 layers at most, but saw no obvious effect; a network of 1000+ layers simply would not run, and the 302-layer model already took 10 hours on a GTX 2080Ti. See the experiment plots:


As the network gets deeper, the accuracy of the v1 version essentially stops improving, while the v2 version still gains a little. Comparing at the same depth, ResNet-v2 also comes out ahead.
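For reference, the CIFAR-10 stem change described above could be written like this (a sketch under my own assumptions: the max-pooling is dropped to keep the 32x32 resolution, and resnet_cifar is my own name):
def resnet_cifar(block, layers, num_classes=10):
    model = ResNet(block, layers, num_classes=num_classes)
    #replace the 7x7/stride-2 stem with a 3x3/stride-1 conv and drop the max-pooling
    model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
    model.maxpool = nn.Identity()
    return model

#example: an 18-layer network for CIFAR-10 (BasicBlock as defined earlier)
net = resnet_cifar(BasicBlock, [2, 2, 2, 2])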
VGG16:

vgg16_bn uses an initial learning rate of 0.1, vgg uses 0.01.
If vgg also started with a learning rate of 0.1, the whole network diverged and never converged afterwards; the BN version of VGG is clearly less sensitive to the learning rate and also performs noticeably better. Note that before epoch 100 the BN version is actually behind the original VGG, presumably because its learning rate is still too large; once the learning rate is decayed by 10x it overtakes the original VGG.
Inception:
BN-Inception:
I tested learning rates of 0.01 and 0.001. With the weight decay of 0.00001 from the original paper I could not get above 90% accuracy no matter what, so I ran weight decay at 1e-5 (left plot) and 5e-5 (right plot). The result is interesting: in the left plot the final accuracy barely depends on the learning rate, which suggests that BN really does make training more tolerant of the learning rate. In the right plot the gap is large; my reading is that BN by itself does not act as a regularizer, so an extra weight-decay term is still needed to prevent overfitting, but then the learning rate matters a lot again. I cannot fully explain this yet; my guess is that with the small learning rate the model is constrained by the regularization term and underfits, but more experiments are needed to confirm that.


Inception-v3:
Slightly better than BN-Inception.