Accuracy on CIFAR10
These are the best results I obtained for each model; more detailed comparison experiments for the same models are described in the Experiments section below.
MODEL | ACCURACY |
---|---|
VGG16 | 90.06% |
BN-VGG16 | 92.31% |
BN-Inception | 92.41% |
Inception-v3 | 92.94% |
ResNet-v1 | 93.54% |
ResNet-v2 | 95.35% |
Only ResNet is walked through here; for more code see __my GitHub__.
PyTorch implementation of ResNet
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
# The ResNet variants implemented here and the URLs of their pretrained weights
__all__=['ResNet','resnet18','resnet34','resnet50','resnet101','resnet152']
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes,out_planes,stride=1):
    '''3x3 convolution with padding'''
    return nn.Conv2d(in_planes,out_planes,kernel_size=3,stride=stride,padding=1,bias=False)

def conv1x1(in_planes,out_planes,stride=1):
    '''1x1 convolution'''
    return nn.Conv2d(in_planes,out_planes,kernel_size=1,stride=stride,bias=False)
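A quick check of the two helpers (just a sketch with arbitrary shapes): conv3x3 keeps the spatial size when stride=1 thanks to padding=1, while conv1x1 only mixes channels.
import torch

x = torch.randn(1, 64, 32, 32)
print(conv3x3(64, 128)(x).shape)            # torch.Size([1, 128, 32, 32])
print(conv3x3(64, 128, stride=2)(x).shape)  # torch.Size([1, 128, 16, 16])
print(conv1x1(64, 256)(x).shape)            # torch.Size([1, 256, 32, 32])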
Implementation of the basic residual block
class BasicBlock(nn.Module):
    expansion=1
    def __init__(self,inplanes,planes,stride=1,downsample=None):
        super(BasicBlock,self).__init__()
        self.conv1=conv3x3(inplanes,planes,stride)
        self.bn1=nn.BatchNorm2d(planes)
        self.relu=nn.ReLU(inplace=True)
        self.conv2=conv3x3(planes,planes)
        self.bn2=nn.BatchNorm2d(planes)
        self.downsample=downsample
        self.stride=stride

    def forward(self,x):
        residual=x
        out=self.conv1(x)
        out=self.bn1(out)
        out=self.relu(out)
        out=self.conv2(out)
        out=self.bn2(out)
        # The spatial resolution shrinks only when the number of channels doubles,
        # i.e. only the first convolution of each stage uses stride=2.
        if self.downsample is not None:
            residual=self.downsample(x)
        out+=residual
        out=self.relu(out)
        return out
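A quick sanity check of the block (a minimal sketch; the shapes and the hand-built downsample are only illustrative and mirror what the ResNet class constructs later):
import torch

# Identity case: same channels, stride 1, no downsample needed.
block = BasicBlock(64, 64)
x = torch.randn(2, 64, 56, 56)
print(block(x).shape)          # torch.Size([2, 64, 56, 56])

# Downsampling case: channels double and stride=2, so the shortcut
# needs a 1x1 projection to match the residual branch.
downsample = nn.Sequential(conv1x1(64, 128, stride=2), nn.BatchNorm2d(128))
block = BasicBlock(64, 128, stride=2, downsample=downsample)
print(block(x).shape)          # torch.Size([2, 128, 28, 28])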
Implementation of the bottleneck residual block
It reduces the number of parameters: in deeper networks, replacing the basic block with the bottleneck block cuts the parameter count substantially (see the sketch after the class for a concrete comparison).
class Bottleneck(nn.Module):
    expansion=4
    def __init__(self,inplanes,planes,stride=1,downsample=None):
        super(Bottleneck,self).__init__()
        self.conv1=conv1x1(inplanes,planes)
        self.bn1=nn.BatchNorm2d(planes)
        self.conv2=conv3x3(planes,planes,stride)
        self.bn2=nn.BatchNorm2d(planes)
        self.conv3=conv1x1(planes,planes*self.expansion)
        # The first 1x1 conv shrinks the channels (typically by 4x); this one restores them.
        self.bn3=nn.BatchNorm2d(planes*self.expansion)
        self.relu=nn.ReLU(inplace=True)
        self.downsample=downsample
        self.stride=stride

    def forward(self,x):
        residual=x
        out=self.conv1(x)
        out=self.bn1(out)
        out=self.relu(out)
        out=self.conv2(out)
        out=self.bn2(out)
        out=self.relu(out)
        out=self.conv3(out)
        out=self.bn3(out)
        if self.downsample is not None:
            residual=self.downsample(x)
        out+=residual
        out=self.relu(out)
        return out
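To make the parameter saving mentioned above concrete (a minimal sketch; the 256-channel setting and the helper n_params are just for illustration):
def n_params(m):
    return sum(p.numel() for p in m.parameters())

# A bottleneck block working on 256 channels (squeezed to 64 inside) versus a
# basic block built directly on 256 channels:
print(n_params(Bottleneck(256, 64)))    # about 70k parameters
print(n_params(BasicBlock(256, 256)))   # about 1.18M parameters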
The overall ResNet architecture:
Detailed configurations of ResNets of different depths
class ResNet(nn.Module):
    def __init__(self,block,layers,num_classes=1000):
        '''
        block: which residual module to use, basic or bottleneck
        layers: how many times the block is repeated in each stage
        num_classes: number of outputs of the fc layer, i.e. the number of classes
        '''
        super(ResNet, self).__init__()
        self.inplanes=64
        self.conv1=nn.Conv2d(3,64,kernel_size=7,stride=2,padding=3,bias=False)
        self.bn1=nn.BatchNorm2d(64)
        self.relu=nn.ReLU(inplace=True)
        self.maxpool=nn.MaxPool2d(kernel_size=3,stride=2,padding=1)
        # The stem is the same for every ResNet depth.
        # input: 224x224, output: 56x56 -- both conv1 and maxpool downsample.
        # This first step throws away a lot of information; the point is probably to save memory.
        self.layer1=self._make_layer(block,64,layers[0])
        self.layer2=self._make_layer(block,128,layers[1],stride=2)
        self.layer3=self._make_layer(block,256,layers[2],stride=2)
        self.layer4=self._make_layer(block,512,layers[3],stride=2)
        # Each stage doubles the channels and halves the spatial resolution.
        self.avgpool=nn.AdaptiveAvgPool2d((1,1))
        self.fc=nn.Linear(512*block.expansion,num_classes)
        for m in self.modules():
            if isinstance(m,nn.Conv2d):
                nn.init.kaiming_normal_(m.weight,mode='fan_out',nonlinearity='relu')
            elif isinstance(m,nn.BatchNorm2d):
                nn.init.constant_(m.weight,1)
                nn.init.constant_(m.bias,0)

    def _make_layer(self,block,planes,blocks,stride=1):
        downsample=None
        if stride!=1 or self.inplanes!=planes*block.expansion:
            downsample=nn.Sequential(
                conv1x1(self.inplanes,planes*block.expansion,stride),
                nn.BatchNorm2d(planes*block.expansion),
            )
            # If stride!=1, x also needs spatial downsampling.
            # Here the spatial downsampling also remixes the channel information.
        layers=[]
        layers.append(block(self.inplanes,planes,stride,downsample))
        self.inplanes=planes*block.expansion
        for _ in range(1,blocks):
            layers.append(block(self.inplanes,planes))
        return nn.Sequential(*layers)

    def forward(self,x):
        x=self.conv1(x)
        x=self.bn1(x)
        x=self.relu(x)
        x=self.maxpool(x)
        x=self.layer1(x)
        x=self.layer2(x)
        x=self.layer3(x)
        x=self.layer4(x)
        x=self.avgpool(x)
        x=x.view(x.size(0),-1)
        x=self.fc(x)
        return x
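A quick check of the resolutions noted in the comments above (a throwaway sketch; the 18-layer configuration and the 224x224 input are just for illustration):
import torch

net = ResNet(BasicBlock, [2, 2, 2, 2], num_classes=10)    # the 18-layer configuration
x = torch.randn(1, 3, 224, 224)
x = net.maxpool(net.relu(net.bn1(net.conv1(x))))
print(x.shape)                # torch.Size([1, 64, 56, 56]) -- the stem already divides by 4
for stage in (net.layer1, net.layer2, net.layer3, net.layer4):
    x = stage(x)
    print(x.shape)            # 64x56x56 -> 128x28x28 -> 256x14x14 -> 512x7x7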
def resnet18(pretrained=False,**kwargs):
    model=ResNet(BasicBlock,[2,2,2,2],**kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return model

def resnet34(pretrained=False,**kwargs):
    model=ResNet(BasicBlock,[3,4,6,3],**kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return model

def resnet50(pretrained=False,**kwargs):
    model=ResNet(Bottleneck,[3,4,6,3],**kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return model

def resnet101(pretrained=False,**kwargs):
    model=ResNet(Bottleneck,[3,4,23,3],**kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return model

def resnet152(pretrained=False,**kwargs):
    model=ResNet(Bottleneck,[3,8,36,3],**kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return model
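Typical usage (just a sketch: pretrained=True downloads the ImageNet weights from the URLs above, and overriding num_classes only makes sense when training from scratch, since the published fc weights have 1000 outputs):
import torch

model = resnet18(pretrained=False, num_classes=10)    # e.g. for a 10-class problem
out = model(torch.randn(2, 3, 224, 224))
print(out.shape)                                      # torch.Size([2, 10])

# With the default 1000 ImageNet classes the published weights can be loaded directly
# (this downloads the checkpoint the first time):
model = resnet50(pretrained=True)
model.eval()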
The improved residual structure
The original residual structure above works very well, but once the network reached 1202 layers it performed worse than the 110-layer one. According to the authors' analysis, a deeper network should in principle do better, and the reason it does not is that the objective is hard to optimize.
The authors first write the residual unit in a unified form:
\[ y_l = h(x_l) + F(x_l, W_l), \qquad x_{l+1} = f(y_l) \]
The original shortcut is an identity mapping, so \(h(x_l)=x_l\), but \(x_{l+1}=\mathrm{ReLU}(x_l+F(x_l,W_l))\) is not an identity mapping. The authors argue that this information flow is what makes optimization difficult, so they turn both operations into identities:
\[ x_{l+1} = x_l + F(x_l, W_l) \]
The feature of any deeper layer \(L\) can then be written as
\[ x_L = x_l + \sum_{i=l}^{L-1} F(x_i, W_i) \]
and the gradient at any layer becomes
\[ \frac{\partial \mathcal{E}}{\partial x_l} = \frac{\partial \mathcal{E}}{\partial x_L}\left(1 + \frac{\partial}{\partial x_l}\sum_{i=l}^{L-1} F(x_i, W_i)\right) \]
So the gradient propagates directly to any shallow layer and never vanishes. Moreover, the second term can be positive or negative; with proper initialization the backward gradients are roughly Gaussian-distributed, so they do not explode either.
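A toy numerical illustration of the identity term in the gradient (just a sketch; the 1-D chain and the made-up residual function 0.1*tanh(x) are not part of the model):
import torch

x0 = torch.randn(10, requires_grad=True)
x = x0
for _ in range(50):                  # 50 stacked "residual blocks" with identity shortcuts
    x = x + 0.1 * torch.tanh(x)      # x_{l+1} = x_l + F(x_l), F made up for illustration
loss = x.sum()                       # so dE/dx_L is a vector of ones
loss.backward()
print(x0.grad.min())                 # every entry is >= 1: the identity path never vanishes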
class BasicBlock(nn.Module):
    expansion=1
    def __init__(self,inplanes,planes,stride=1,downsample=None):
        super(BasicBlock,self).__init__()
        # Pre-activation: BN and ReLU come before each convolution.
        self.bn1=nn.BatchNorm2d(inplanes)
        self.relu=nn.ReLU(inplace=True)
        self.conv1=conv3x3(inplanes,planes,stride)
        self.bn2=nn.BatchNorm2d(planes)
        self.conv2=conv3x3(planes,planes)
        self.downsample=downsample
        self.stride=stride

    def forward(self,x):
        residual=x
        out=self.bn1(x)
        out=self.relu(out)
        out=self.conv1(out)
        out=self.bn2(out)
        out=self.relu(out)
        out=self.conv2(out)
        if self.downsample is not None:
            residual=self.downsample(x)
        out+=residual
        # No ReLU after the addition: the shortcut path stays a pure identity.
        return out
class Bottleneck(nn.Module):
    expansion=4
    def __init__(self,inplanes,planes,stride=1,downsample=None):
        super(Bottleneck,self).__init__()
        self.bn1=nn.BatchNorm2d(inplanes)
        self.conv1=conv1x1(inplanes,planes)
        self.bn2=nn.BatchNorm2d(planes)
        self.conv2=conv3x3(planes,planes,stride)
        self.bn3=nn.BatchNorm2d(planes)
        self.conv3=conv1x1(planes,planes*self.expansion)
        self.relu=nn.ReLU(inplace=True)
        self.downsample=downsample
        self.stride=stride

    def forward(self,x):
        residual=x
        out=self.bn1(x)
        out=self.relu(out)
        out=self.conv1(out)
        out=self.bn2(out)
        out=self.relu(out)
        out=self.conv2(out)
        out=self.bn3(out)
        out=self.relu(out)
        out=self.conv3(out)
        if self.downsample is not None:
            residual=self.downsample(x)
        out+=residual
        return out
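These pre-activation blocks can be dropped into the same ResNet class above. One detail I should flag as an assumption (it is not shown in the code here): in the full pre-activation design of the v2 paper there is an extra BN + ReLU after the last residual stage, before average pooling, because the blocks themselves no longer end with an activation. A minimal sketch of that wiring, reusing the class above (the name PreActResNet and bn_final are mine):
import torch

class PreActResNet(ResNet):
    '''Same body as ResNet, but with a final BN + ReLU before pooling.'''
    def __init__(self,block,layers,num_classes=1000):
        super(PreActResNet,self).__init__(block,layers,num_classes)
        self.bn_final=nn.BatchNorm2d(512*block.expansion)

    def forward(self,x):
        x=self.maxpool(self.relu(self.bn1(self.conv1(x))))
        x=self.layer4(self.layer3(self.layer2(self.layer1(x))))
        x=self.relu(self.bn_final(x))        # extra pre-activation "tail"
        x=self.avgpool(x)
        return self.fc(x.view(x.size(0),-1))

# Bottleneck here refers to the pre-activation version defined above.
model = PreActResNet(Bottleneck, [3, 4, 6, 3], num_classes=10)
print(model(torch.randn(2, 3, 224, 224)).shape)   # torch.Size([2, 10])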
Experiments
Test results on __CIFAR10__:
ResNet:
These are the results of my implementation of the 18-, 34-, 50-, 101- and 152-layer ResNet-v1 described in the original paper. I forgot to add a legend: the left plot shows the 18- and 34-layer networks, the right one the 50-, 101- and 152-layer networks. The deeper the network, the lower the accuracy.


To cope with CIFAR10's low resolution, I changed the first convolution from 7x7 to 3x3 and kept the spatial resolution (a code sketch of this change follows the figure discussion below); everything after that follows the settings of the original paper. To compare v1 and v2 and to reproduce the reported effect that deeper v1 networks actually get worse, the deepest I went was a 302-layer network, but there was no obvious effect; anything above 1000 layers is simply beyond my compute, and the 302-layer run already took 10 hours on an RTX 2080 Ti. See the figures:


As the networks get deeper, the accuracy of v1 essentially stops improving, while v2 still gains a little. Comparing across depths, ResNet-v2 also has the edge.
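For reference, the CIFAR10 stem modification described above is roughly the following (a sketch only; the name resnet_for_cifar10 is mine, and the exact recipe I used in the experiments may differ in details):
import torch

def resnet_for_cifar10(block=BasicBlock, layers=(2, 2, 2, 2)):
    model = ResNet(block, layers, num_classes=10)
    # Keep the 32x32 resolution in the stem: 3x3 conv, stride 1, no max-pooling.
    model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
    model.maxpool = nn.Identity()
    return model

net = resnet_for_cifar10()
print(net(torch.randn(2, 3, 32, 32)).shape)   # torch.Size([2, 10])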
VGG16:

vgg16_bn uses an initial learning rate of 0.1; plain vgg uses 0.01.
If plain vgg also started at 0.1, the whole network diverged and never recovered, so the BN version of VGG is clearly less sensitive to the learning rate, and it also ends up noticeably better. Note that before epoch 100 the BN version is actually worse than the original VGG; the reason should be that the learning rate is still too large, and once it is decayed by 10x the BN version overtakes the original VGG.
Inception:
BN-Inception:
I tested learning rates of 0.01 and 0.001 in both settings. With weight_decay=0.00001 as in the original paper I could never get above 90% accuracy, so I ran weight decay of 1e-5 (left plot) and 5e-5 (right plot). The result is interesting: in the left plot the different learning rates end up at about the same accuracy, which suggests that BN really does give some tolerance to the learning rate. In the right plot the gap is large. My own reading is that BN does not act as a regularizer, so an extra regularization term is still needed to prevent overfitting; but then the learning rate matters a lot again, which I cannot fully explain yet. My guess is that with the smaller learning rate the model ends up under-fitting once it is constrained by the regularizer, but this needs more experiments to confirm.


Inception-v3:
Slightly better than BN-Inception.