1. LeNet
class LeNet(nn.Module):
    """LeNet-style CNN: two conv/max-pool stages followed by three linear layers.

    The layer sizes assume a single-channel 32x32 input (the flattened
    feature map must contain 16 * 5 * 5 = 400 values). As in the original
    listing, no activation functions are inserted between the layers.

    Args:
        num_classes: Size of the final linear layer's output (default 10).
    """

    def __init__(self, num_classes=10):
        super(LeNet, self).__init__()
        # Input: (1, 32, 32)
        layer1 = nn.Sequential()
        # Stride and padding are passed by keyword: the 4th positional
        # argument of nn.Conv2d is ``stride``, so a positional 0 there would
        # request an invalid zero stride instead of zero padding.
        layer1.add_module(
            'conv1', nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0)
        )  # (6, 28, 28)
        layer1.add_module('pool1', nn.MaxPool2d(2, 2))  # (6, 14, 14)
        self.layer1 = layer1

        layer2 = nn.Sequential()
        layer2.add_module(
            'conv2', nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)
        )  # (16, 10, 10)
        layer2.add_module('pool2', nn.MaxPool2d(2, 2))  # (16, 5, 5)
        self.layer2 = layer2

        layer3 = nn.Sequential()
        layer3.add_module('fc1', nn.Linear(16 * 5 * 5, 120))
        layer3.add_module('fc2', nn.Linear(120, 84))
        layer3.add_module('fc3', nn.Linear(84, num_classes))
        self.layer3 = layer3

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for input (batch, 1, 32, 32)."""
        x = self.layer1(x)
        x = self.layer2(x)
        # Flatten everything except the batch dimension for the linear layers.
        x = x.view(x.size(0), -1)
        out = self.layer3(x)
        return out
2. AlexNet
2012年,Alex等人提出的AlexNet網絡在ImageNet大賽上以遠超第二名的成績奪冠。AlexNet的特點:
- 更深的網絡結構
- 使用層疊的卷積層,即卷積層+卷積層+池化層來提取圖像的特征
- 使用Dropout抑制過擬合
- 使用數據增強Data Augmentation抑制過擬合
- 使用ReLU替換之前的sigmoid作為激活函數
class AlexNet(nn.Module):
    """AlexNet: five conv layers (three followed by max-pooling) plus a 3-layer classifier.

    The classifier's first linear layer expects a flattened 256 x 6 x 6
    feature map, which is what a 3 x 224 x 224 input produces.

    Args:
        num_classes: Size of the final linear layer's output (default 1000).
    """

    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()
        # Shapes below are (channels, height, width) for a (3, 224, 224) input.
        self.features = nn.Sequential(
            nn.Conv2d(3, 96, 11, 4, padding=2),     # (96, 55, 55)
            nn.ReLU(True),
            nn.MaxPool2d(3, 2),                     # (96, 27, 27)
            nn.Conv2d(96, 256, 5, 1, padding=2),    # (256, 27, 27)
            nn.ReLU(True),
            nn.MaxPool2d(3, 2),                     # (256, 13, 13)
            nn.Conv2d(256, 384, 3, 1, padding=1),   # (384, 13, 13)
            nn.ReLU(True),
            nn.Conv2d(384, 384, 3, 1, padding=1),   # (384, 13, 13)
            nn.ReLU(True),
            # padding=1 keeps the map at 13x13; with padding=0 it would shrink
            # to 11x11 and pool down to 5x5, which does not match the
            # 6*6*256 input size of the classifier.
            nn.Conv2d(384, 256, 3, 1, padding=1),   # (256, 13, 13)
            nn.ReLU(True),
            nn.MaxPool2d(3, 2),                     # (256, 6, 6)
        )
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(6 * 6 * 256, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for input (batch, 3, 224, 224)."""
        x = self.features(x)
        # Flatten everything except the batch dimension for the classifier.
        x = x.view(x.size(0), -1)
        out = self.classifier(x)
        return out
3. VGGNet
VGGNet獲得了2014年ImageNet比賽的亞軍和定位項目的冠軍,VGG16包含13個卷積層和3個全連接層,VGG19包含16個卷積層和3個全連接層。VGG網絡的結構非常一致,從頭到尾全部使用的是3x3的卷積和2x2的max pooling。下面是VGG16的代碼。
class VGGNet(nn.Module):
    """VGG16: 13 conv layers (3x3, padding 1) with five 2x2 max-pools, plus a 3-layer classifier.

    The classifier's first linear layer expects a flattened 512 x 7 x 7
    feature map, which is what a 3 x 224 x 224 input produces.

    Args:
        num_classes: Size of the final linear layer's output (default 1000).
    """

    # Feature-extractor layout: an integer is the output channel count of a
    # 3x3 conv (followed by ReLU); 'M' is a 2x2 max-pool with stride 2.
    _CFG = (
        64, 64, 'M',            # 224 -> 112
        128, 128, 'M',          # 112 -> 56
        256, 256, 256, 'M',     # 56 -> 28
        512, 512, 512, 'M',     # 28 -> 14
        512, 512, 512, 'M',     # 14 -> 7
    )

    def __init__(self, num_classes=1000):
        super(VGGNet, self).__init__()
        self.features = self._make_features(self._CFG)
        self.classifier = nn.Sequential(
            nn.Linear(7 * 7 * 512, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )
        self._initialize_weights()

    @staticmethod
    def _make_features(cfg, in_channels=3):
        """Build the conv/ReLU/max-pool stack described by ``cfg``."""
        layers = []
        for item in cfg:
            if item == 'M':
                layers.append(nn.MaxPool2d(2, 2))
            else:
                layers.append(nn.Conv2d(in_channels, item, 3, 1, padding=1))
                layers.append(nn.ReLU(True))
                in_channels = item
        return nn.Sequential(*layers)

    def _initialize_weights(self):
        """Initialize conv weights with Kaiming-normal and linear weights with N(0, 0.01); zero all biases."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(
                    module.weight, mode='fan_out', nonlinearity='relu'
                )
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for input (batch, 3, 224, 224)."""
        x = self.features(x)
        # Flatten everything except the batch dimension for the classifier.
        x = x.view(x.size(0), -1)
        out = self.classifier(x)
        return out
4. GoogLeNet
GoogLeNet提出Inception模塊設計,原始Inception模塊的基本結構如下圖(a),但是該模塊的參數量很大,計算量也很大,為此,Google團隊提出如下圖(b)的Inception結構。
(a)
(b)
下圖摘自:Inception網絡模型
1x1的卷積核有什么用呢?[5]
1x1卷積的主要目的是為了減少維度,還用於修正線性激活(ReLU)。比如,上一層的輸出為100x100x128,經過具有256個通道的5x5卷積層之后(stride=1,pad=2),輸出數據為100x100x256,其中,卷積層的參數為128x5x5x256= 819200。而假如上一層輸出先經過具有32個通道的1x1卷積層,再經過具有256個輸出的5x5卷積層,那么輸出數據仍為100x100x256,但卷積參數量已經減少為128x1x1x32 + 32x5x5x256= 4096 + 204800 = 208896,大約減少為原來的四分之一。
GoogLeNet結構:
GoogLeNet網絡結構明細表解析如下:[5]
0、輸入
原始輸入圖像為224x224x3,且都進行了零均值化的預處理操作(圖像每個像素減去均值)。
1、第一層(卷積層)
使用7x7的卷積核(滑動步長2,padding為3),64通道,輸出為112x112x64,卷積后進行ReLU操作
經過3x3的max pooling(步長為2),輸出為((112 - 3+1)/2)+1=56,即56x56x64,再進行ReLU操作
2、第二層(卷積層)
使用3x3的卷積核(滑動步長為1,padding為1),192通道,輸出為56x56x192,卷積后進行ReLU操作
經過3x3的max pooling(步長為2),輸出為((56 - 3+1)/2)+1=28,即28x28x192,再進行ReLU操作
3a、第三層(Inception 3a層)
分為四個分支,采用不同尺度的卷積核來進行處理
(1)64個1x1的卷積核,然后ReLU,輸出28x28x64
(2)96個1x1的卷積核,作為3x3卷積核之前的降維,變成28x28x96,然后進行ReLU計算,再進行128個3x3的卷積(padding為1),輸出28x28x128
(3)16個1x1的卷積核,作為5x5卷積核之前的降維,變成28x28x16,進行ReLU計算后,再進行32個5x5的卷積(padding為2),輸出28x28x32
(4)pool層,使用3x3的核(padding為1),輸出28x28x192,然后進行32個1x1的卷積,輸出28x28x32。
將四個結果進行連接,對這四部分輸出結果的第三維並聯,即64+128+32+32=256,最終輸出28x28x256
3b、第三層(Inception 3b層)
(1)128個1x1的卷積核,然后ReLU,輸出28x28x128
(2)128個1x1的卷積核,作為3x3卷積核之前的降維,變成28x28x128,進行ReLU,再進行192個3x3的卷積(padding為1),輸出28x28x192
(3)32個1x1的卷積核,作為5x5卷積核之前的降維,變成28x28x32,進行ReLU計算后,再進行96個5x5的卷積(padding為2),輸出28x28x96
(4)pool層,使用3x3的核(padding為1),輸出28x28x256,然后進行64個1x1的卷積,輸出28x28x64。
將四個結果進行連接,對這四部分輸出結果的第三維並聯,即128+192+96+64=480,最終輸出為28x28x480
第四層(4a,4b,4c,4d,4e)、第五層(5a,5b)……,與3a、3b類似
以上是基於Inception version 1的,關於Inception version 2、3、4的詳情可參考 大話CNN經典模型:GoogLeNet(從Inception v1到v4的演進) 或者GoogLeNet團隊的論文。其中Inception version 2有一個基本結構是這樣的,用兩個3*3卷積代替5*5卷積:
基於上圖的Inception構建GoogLeNet的代碼如下[6]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicConv2d(nn.Module):
    """Basic block: convolution -> batch normalization -> ReLU.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        **kwargs: Forwarded unchanged to ``nn.Conv2d`` (kernel_size, padding, ...).
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super(BasicConv2d, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, **kwargs)
        self.bn = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return F.relu(x)


class Inception(nn.Module):
    """Inception module with four parallel branches concatenated along channels.

    The "5x5" branch is implemented as two stacked 3x3 convolutions.
    Output channel count is ``n1x1 + n3x3 + n5x5 + pool_planes``; every
    branch preserves the spatial size.

    Args:
        in_planes: Number of input channels.
        n1x1: Output channels of the 1x1 branch.
        n3x3red: Channels of the 1x1 reduction before the 3x3 conv.
        n3x3: Output channels of the 3x3 branch.
        n5x5red: Channels of the 1x1 reduction before the double-3x3 branch.
        n5x5: Output channels of the double-3x3 branch.
        pool_planes: Output channels of the 1x1 conv after max-pooling.
    """

    def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):
        super(Inception, self).__init__()
        # Branch 1: 1x1 conv
        self.b1 = BasicConv2d(in_planes, n1x1, kernel_size=1)

        # Branch 2: 1x1 conv -> 3x3 conv
        self.b2_1x1 = BasicConv2d(in_planes, n3x3red, kernel_size=1)
        self.b2_3x3 = BasicConv2d(n3x3red, n3x3, kernel_size=3, padding=1)

        # Branch 3: 1x1 conv -> 3x3 conv -> 3x3 conv
        self.b3_1x1 = BasicConv2d(in_planes, n5x5red, kernel_size=1)
        self.b3_3x3_a = BasicConv2d(n5x5red, n5x5, kernel_size=3, padding=1)
        self.b3_3x3_b = BasicConv2d(n5x5, n5x5, kernel_size=3, padding=1)

        # Branch 4: 3x3 max-pool (stride 1) -> 1x1 conv
        self.b4_pool = nn.MaxPool2d(3, stride=1, padding=1)
        self.b4_1x1 = BasicConv2d(in_planes, pool_planes, kernel_size=1)

    def forward(self, x):
        y1 = self.b1(x)
        y2 = self.b2_3x3(self.b2_1x1(x))
        y3 = self.b3_3x3_b(self.b3_3x3_a(self.b3_1x1(x)))
        y4 = self.b4_1x1(self.b4_pool(x))
        # Concatenate along the channel dimension.
        return torch.cat([y1, y2, y3, y4], 1)


class GoogLeNet(nn.Module):
    """GoogLeNet variant built from ``Inception`` modules with a single 3x3 stem conv.

    Args:
        num_classes: Size of the final linear layer's output (default 10).
    """

    def __init__(self, num_classes=10):
        super(GoogLeNet, self).__init__()
        self.pre_layers = BasicConv2d(3, 192, kernel_size=3, padding=1)

        self.a3 = Inception(192, 64, 96, 128, 16, 32, 32)      # -> 256 channels
        self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)    # -> 480 channels

        # Shared (parameter-free) down-sampling layer, used twice in forward().
        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)

        self.a4 = Inception(480, 192, 96, 208, 16, 48, 64)     # -> 512 channels
        self.b4 = Inception(512, 160, 112, 224, 24, 64, 64)    # -> 512 channels
        self.c4 = Inception(512, 128, 128, 256, 24, 64, 64)    # -> 512 channels
        self.d4 = Inception(512, 112, 144, 288, 32, 64, 64)    # -> 528 channels
        self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)  # -> 832 channels

        self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)  # -> 832 channels
        self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)  # -> 1024 channels

        # Global average pooling. For the 8x8 map produced by a 32x32 input
        # this equals AvgPool2d(8), but it also handles other input sizes.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.linear = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for input (batch, 3, H, W)."""
        out = self.pre_layers(x)
        out = self.a3(out)
        out = self.b3(out)
        out = self.maxpool(out)
        out = self.a4(out)
        out = self.b4(out)
        out = self.c4(out)
        out = self.d4(out)
        out = self.e4(out)
        out = self.maxpool(out)
        out = self.a5(out)
        out = self.b5(out)
        out = self.avgpool(out)
        # Flatten (batch, 1024, 1, 1) to (batch, 1024) for the linear layer.
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def test():
    """Smoke test: run one random 32x32 RGB image through the net and print the output shape."""
    net = GoogLeNet()
    x = torch.randn(1, 3, 32, 32)
    y = net(x)
    print(y.shape)


# Run the smoke test only when executed as a script, not on import.
if __name__ == "__main__":
    test()
5. ResNet
ResNet在2015年被提出,在ImageNet比賽classification任務上獲得第一名。ResNet提出殘差模塊,如下圖所示,假設某神經網絡的輸入是x,期望輸出是H(x), 如果直接把輸入x傳到輸出作為初始結果,那么此時需要學習的目標是F(x)=H(x)-x,也就是只需要學習殘差。
殘差學習為什么更容易[7]
各個ResNet網絡結構
ResNet18和ResNet50的詳細結構[8]
ResNet的PyTorch代碼實現和解釋: 解讀PyTorch對ResNet的官方實現
參考文獻:
[1]. 經典卷積神經網絡結構——LeNet-5、AlexNet、VGG-16
[2]. 初探Alexnet網絡結構
[3]. VGGNet網絡結構
[4]. VGGNet
[5]. 大話CNN經典模型:GoogLeNet(從Inception v1到v4的演進)
[6]. inception v2 代碼實現
[7]. Resnet結構及代碼解析
[8]. resnet18 50網絡結構以及pytorch實現代碼