Residual Neural Networks (ResNet)
Why neural networks keep getting deeper
As the figure shows, when the number of layers increases, both the training error and the test error of the plain network become higher than those of a shallower network. This contradicts the first intuition that a deeper network should achieve lower training and test error.
Under the same number of training iterations, the error of a deeper network also plateaus earlier.
The error rate rises rather than falls as the number of layers increases
This happens because, as the signal passes through many plain layers, backpropagation multiplies the per-layer gradients together via the chain rule; when these factors are small the gradient vanishes, and the convergence of the network slows down dramatically.
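As a rough numerical illustration (my own sketch, not taken from the original figure): if every layer contributes a gradient factor smaller than 1, the product shrinks exponentially with depth.

# minimal sketch: a product of per-layer gradient factors below 1 decays exponentially with depth
factor = 0.8                        # assumed magnitude of each layer's local derivative
for depth in (5, 20, 50):
    print(depth, factor ** depth)   # 5 -> ~0.33, 20 -> ~0.012, 50 -> ~1.4e-5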
Plain Net
In a plain network, the input passes directly through the weight layers and a nonlinear activation to produce the output \(H(x)\).
Residual Net
In a residual network, the input is added directly to the block's output, so the output equals the transformed input plus the original input, \(H(x)=F(x)+x\). Differentiating this sum yields an extra term:
\(\frac{\partial{H(x)}}{\partial x}=\frac{\partial{F(x)}}{\partial{x}}+1\)
so a constant 1 is added to the gradient, which keeps it from vanishing even when \(\frac{\partial F(x)}{\partial x}\) becomes very small.
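A minimal PyTorch sketch (my own illustration with a scalar input and a toy linear F) makes the extra +1 visible through autograd:

import torch

x = torch.tensor(2.0, requires_grad=True)
w = torch.tensor(0.1)   # toy "weight layer": F(x) = w * x, so dF/dx = 0.1
H = w * x + x           # residual form: H(x) = F(x) + x
H.backward()
print(x.grad)           # tensor(1.1000), i.e. dF/dx + 1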
Below, the MNIST dataset is used to build a residual neural network model.
Residual Block
class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super(ResidualBlock, self).__init__()
        self.channels = channels
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)

    def forward(self, x):
        y = F.relu(self.conv1(x))
        y = self.conv2(y)        # no ReLU here: the activation is applied after adding the shortcut
        return F.relu(x + y)     # skip connection: add the original input x, then apply ReLU
# Two convolutions keep the number of channels unchanged, and the padding keeps the spatial size unchanged;
# the transformed output y is then added to the original input x, and ReLU(x + y) is returned.
# With stride 1, a convolution shrinks the image by kernel_size - 1 pixels, so padding = (kernel_size - 1) / 2
# keeps the image size unchanged (here padding = 1 for kernel_size = 3).
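As a quick sanity check (my own sketch, reusing the ResidualBlock above and the imports from the full code further down), the block leaves the tensor shape unchanged:

# minimal sketch: a ResidualBlock preserves (batch, channels, height, width)
block = ResidualBlock(16)
dummy = torch.randn(1, 16, 12, 12)
print(block(dummy).shape)   # torch.Size([1, 16, 12, 12])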
Code for the whole network
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5)    # convolution layer 1
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5)   # convolution layer 2
        self.mp = nn.MaxPool2d(2)                       # max-pooling layer
        self.rblock1 = ResidualBlock(16)                # residual block 1
        self.rblock2 = ResidualBlock(32)                # residual block 2
        self.fc = nn.Linear(512, 10)                    # fully connected layer

    def forward(self, x):                    # forward pass of the network
        in_size = x.size(0)                  # batch size, used below to flatten (batch_size, channel, height, width)
        x = self.mp(F.relu(self.conv1(x)))   # (1,28,28) -> (16,24,24) -> (16,12,12)
        x = self.rblock1(x)                  # (16,12,12) -> (16,12,12)
        x = self.mp(F.relu(self.conv2(x)))   # (16,12,12) -> (32,8,8) -> (32,4,4)
        x = self.rblock2(x)                  # (32,4,4) -> (32,4,4)
        x = x.view(in_size, -1)              # flatten before the fully connected layer
        x = self.fc(x)                       # 32 * 4 * 4 = 512 input features
        return x
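To double-check the 512 input features of the fully connected layer, a dummy 28x28 MNIST-sized tensor can be traced through the network (again my own sketch, assuming the same imports as the full code below):

# minimal sketch: trace a dummy MNIST image through Net
net = Net()
dummy = torch.randn(1, 1, 28, 28)
print(net(dummy).shape)   # torch.Size([1, 10]); the flattened tensor before fc has 32 * 4 * 4 = 512 features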
Full code:
import torch
import torch.nn as nn
from torchvision import transforms
from torchvision import datasets
from torch.utils.data import DataLoader
import torch.nn.functional as F

batch_size = 64
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))   # MNIST mean and std
])

train_dataset = datasets.MNIST(root='./dataset/mnist/',
                               train=True,
                               download=True,
                               transform=transform)
train_loader = DataLoader(dataset=train_dataset,
                          shuffle=True,
                          batch_size=batch_size)
test_dataset = datasets.MNIST(root='./dataset/mnist/',
                              train=False,
                              download=True,
                              transform=transform)
test_loader = DataLoader(dataset=test_dataset,
                         shuffle=False,
                         batch_size=batch_size)


class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super(ResidualBlock, self).__init__()
        self.channels = channels
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)

    def forward(self, x):
        y = F.relu(self.conv1(x))
        y = self.conv2(y)
        return F.relu(x + y)


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5)
        self.mp = nn.MaxPool2d(2)
        self.rblock1 = ResidualBlock(16)
        self.rblock2 = ResidualBlock(32)
        self.fc = nn.Linear(512, 10)

    def forward(self, x):
        in_size = x.size(0)
        x = self.mp(F.relu(self.conv1(x)))
        x = self.rblock1(x)
        x = self.mp(F.relu(self.conv2(x)))
        x = self.rblock2(x)
        x = x.view(in_size, -1)
        x = self.fc(x)
        return x


device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
loss_l = []   # stores the test-set accuracy after each epoch


def train(epoch):
    running_loss = 0
    for batch_idx, (inputs, target) in enumerate(train_loader):
        inputs, target = inputs.to(device), target.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()   # .item() keeps only the scalar value, not the computation graph
        if batch_idx % 300 == 299:
            print('[{},{}] loss:{:.3f}'.format(epoch + 1, batch_idx + 1, running_loss / 300))
            running_loss = 0


def test():
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, prediction = torch.max(outputs, dim=1)
            total += labels.size(0)
            correct += (prediction == labels).sum().item()
    loss_l.append(100 * correct / total)
    print('Accuracy on test set: %.2f %%' % (100 * correct / total))


if __name__ == '__main__':
    for epoch in range(3):
        train(epoch)
        test()
Results:
The accuracy on the test set reached 98.64%.
Model structure:
Net(
(conv1): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1))
(conv2): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
(mp): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(rblock1): ResidualBlock(
(conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(rblock2): ResidualBlock(
(conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(fc): Linear(in_features=512, out_features=10, bias=True)
)