目錄:
- 分類模型訓練代碼
- 分類模型測試代碼
- 自定義損失函數
- 標籤平滑
- mixup訓練
- L1正則化
- 不對偏置項進行權重衰減
- 梯度裁剪
- 得到當前學習率
- 學習率衰減
- 優化器鏈式更新
- 模型訓練可視化
- 保存和加載斷點
- 提取ImageNet預訓練模型的某層特徵
- 提取ImageNet預訓練模型的多層特徵
- 微調全連接層
- 以較大學習率微調全連接層,較小學習率微調卷積層
1、分類模型訓練代碼
# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model.
total_step = len(train_loader)
for epoch in range(num_epochs):
    # Steps are counted from 1 so they can be printed directly.
    for step, (images, labels) in enumerate(train_loader, start=1):
        # Move the batch to `device`.
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass.
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the loss of the current batch every 100 steps.
        if step % 100 == 0:
            print('Epoch: [{}/{}], Step: [{}/{}], Loss: {}'
                  .format(epoch + 1, num_epochs, step, total_step, loss.item()))
2、分類模型測試代碼
# Test the model.
# eval() puts the model in evaluation mode (batch norm uses moving
# mean/variance instead of mini-batch mean/variance).
model.eval()

correct = 0
total = 0
# No gradients are needed for evaluation.
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        # torch.max over dim 1 returns (values, indices); the indices are
        # the predicted class ids.
        predicted = torch.max(outputs.data, 1)[1]

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Test accuracy of the model on the 10000 test images: {} %'
          .format(100 * correct / total))
3、自定義損失函數
繼承torch.nn.Module類寫自己的loss。
class MyLoss(torch.nn.Module):
    """Custom loss module computing the mean squared difference of two tensors."""

    def __init__(self):
        super().__init__()

    def forward(self, x, y):
        """Return the mean over all elements of ``(x - y) ** 2``."""
        squared_error = (x - y).pow(2)
        return squared_error.mean()
4、標籤平滑
寫一個label_smoothing.py的文件,然後在訓練代碼裡引用,用LSR代替交叉熵損失即可。label_smoothing.py內容如下:
import torch
import torch.nn as nn


class LSR(nn.Module):
    """Cross-entropy loss with label smoothing regularization (LSR).

    The true class of every sample receives target probability ``1 - e`` and
    the remaining mass ``e`` is shared equally by the other classes, so each
    smoothed target row sums to 1.

    Args:
        e: smoothing factor, the probability mass moved away from the true
            class.
        reduction: how per-sample losses are combined: ``'none'`` returns one
            loss per sample, ``'sum'`` their sum, ``'mean'`` their mean.
    """

    def __init__(self, e=0.1, reduction='mean'):
        super().__init__()
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.e = e
        self.reduction = reduction

    def _one_hot(self, labels, classes, value=1):
        """Convert labels to one-hot vectors.

        Args:
            labels: torch tensor in format [label1, label2, label3, ...]
            classes: int, number of classes
            value: value written at the label position, defaults to 1

        Returns:
            Tensor of shape [batchsize, classes] on the same device as
            ``labels``.
        """
        batch_size = labels.size(0)
        one_hot = torch.zeros(batch_size, classes, device=labels.device)

        # scatter_add_ needs index and source of matching shape: (batch, 1).
        labels = labels.view(batch_size, -1)
        value_added = torch.full((batch_size, 1), float(value),
                                 device=labels.device)

        one_hot.scatter_add_(1, labels, value_added)
        return one_hot

    def _smooth_label(self, target, length, smooth_factor):
        """Convert targets to one-hot format and smooth them.

        Args:
            target: targets in the form [label1, label2, ..., label_batchsize]
            length: length of the one-hot format (number of classes)
            smooth_factor: probability mass moved off the true class

        Returns:
            Smoothed labels of shape [batchsize, length]; every row sums to 1.
        """
        if length < 2:
            # With a single class there is nothing to smooth towards (and
            # length - 1 would be a division by zero).
            return self._one_hot(target, length, value=1)

        off_value = smooth_factor / (length - 1)
        # off_value is added to every entry below, so the true class is
        # seeded with (1 - smooth_factor - off_value) to end up at exactly
        # 1 - smooth_factor rather than overshooting it.
        one_hot = self._one_hot(target, length,
                                value=1 - smooth_factor - off_value)
        one_hot += off_value
        return one_hot

    def forward(self, x, target):
        """Compute the label-smoothed cross-entropy loss.

        Args:
            x: 2-D tensor of raw scores with shape [batchsize, classes].
            target: tensor of class indices whose first dimension is batchsize.

        Returns:
            Per-sample losses for ``reduction='none'``, otherwise a scalar.

        Raises:
            ValueError: if ``x`` is not 2-D, if the batch sizes of ``x`` and
                ``target`` differ, or if ``reduction`` is not recognized.
        """
        # Check dimensionality first so a 0-d input reports a ValueError
        # instead of failing on x.size(0).
        if x.dim() < 2:
            raise ValueError('Expected input tensor to have least 2 dimensions(got {})'
                             .format(x.dim()))

        if x.dim() != 2:
            raise ValueError('Only 2 dimension tensor are implemented, (got {})'
                             .format(x.size()))

        if x.size(0) != target.size(0):
            raise ValueError('Expected input batchsize ({}) to match target batch_size({})'
                             .format(x.size(0), target.size(0)))

        smoothed_target = self._smooth_label(target, x.size(1), self.e)
        x = self.log_softmax(x)
        # Cross entropy between the smoothed target and the predicted
        # distribution, one value per sample.
        loss = torch.sum(- x * smoothed_target, dim=1)

        if self.reduction == 'none':
            return loss
        elif self.reduction == 'sum':
            return torch.sum(loss)
        elif self.reduction == 'mean':
            return torch.mean(loss)
        else:
            raise ValueError('unrecognized option, expect reduction to be one of none, mean, sum')
或者直接在訓練文件裡做label smoothing
for images, labels in train_loader:
    images = images.cuda()
    labels = labels.cuda()
    batch_size = labels.size(0)

    # C is the number of classes.
    # Start with 0.1 / (C - 1) everywhere, then overwrite the true-class
    # entry of every row with 0.9.
    smoothed_labels = torch.full(size=(batch_size, C),
                                 fill_value=0.1 / (C - 1)).cuda()
    true_class_index = labels.unsqueeze(1)
    smoothed_labels.scatter_(dim=1, index=true_class_index, value=0.9)

    score = model(images)
    log_prob = torch.nn.functional.log_softmax(score, dim=1)
    # Cross entropy against the smoothed targets, averaged over the batch.
    loss = -(log_prob * smoothed_labels).sum() / batch_size

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
5、mixup訓練
# The mixing coefficient is drawn from Beta(alpha, alpha).
beta_distribution = torch.distributions.beta.Beta(alpha, alpha)
for images, labels in train_loader:
    images = images.cuda()
    labels = labels.cuda()

    # Mixup images and labels: every sample is blended with another sample
    # of the same batch chosen by a random permutation.
    lambda_ = beta_distribution.sample().item()
    index = torch.randperm(images.size(0)).cuda()
    shuffled_images = images[index]
    mixed_images = lambda_ * images + (1 - lambda_) * shuffled_images
    label_a = labels
    label_b = labels[index]

    # Mixup loss: the two label sets are weighted like the images.
    scores = model(mixed_images)
    loss_a = loss_function(scores, label_a)
    loss_b = loss_function(scores, label_b)
    loss = lambda_ * loss_a + (1 - lambda_) * loss_b

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
6、L1正則化
# NOTE: this L1Loss instance is created but not used by the code below.
l1_regularization = torch.nn.L1Loss(reduction='sum')

loss = ...  # Standard cross-entropy loss

# Add the sum of absolute values of every parameter to the loss.
for param in model.parameters():
    loss += param.abs().sum()

loss.backward()
7、不對偏置進行權重衰減
PyTorch裡的weight decay相當於L2正則化
# Split the parameters by name: those whose name ends in 'bias' go into a
# group with weight decay disabled, everything else keeps the optimizer
# default.
bias_list = [param for name, param in model.named_parameters()
             if name.endswith('bias')]
others_list = [param for name, param in model.named_parameters()
               if not name.endswith('bias')]

# Every parameter group must store its tensors under the key 'params';
# options set in a group override the optimizer-level defaults.
parameters = [
    {'params': bias_list, 'weight_decay': 0},
    {'params': others_list},
]
optimizer = torch.optim.SGD(parameters, lr=1e-2, momentum=0.9, weight_decay=1e-4)
8、梯度裁剪
# Clip the gradients of all model parameters in place so that their combined
# norm does not exceed max_norm.
torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=20)
9、得到當前學習率
# If there is one global learning rate (which is the common case), read it
# from the first parameter group.
lr = optimizer.param_groups[0]['lr']

# If there are multiple learning rates for different layers, collect one
# value per parameter group.
all_lr = [group['lr'] for group in optimizer.param_groups]
另一種方法,在一個batch訓練代碼裡,當前的lr是optimizer.param_groups[0]['lr']
10、學習率衰減
# NOTE: since PyTorch 1.1.0 a scheduler must be stepped AFTER the epoch's
# optimizer updates. Stepping before training skips the first value of the
# schedule, so every loop below calls scheduler.step() at the end.

# Reduce learning rate when validation accuracy plateaus.
# `verbose=True` is deprecated in recent PyTorch releases and is left out;
# read optimizer.param_groups[0]['lr'] to log the current rate.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', patience=5)
for t in range(0, 80):
    train(...)
    val(...)
    # val_acc is the monitored validation accuracy (mode='max').
    scheduler.step(val_acc)

# Cosine annealing learning rate.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=80)

# Reduce learning rate by a factor of 10 at given epochs.
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[50, 70], gamma=0.1)
for t in range(0, 80):
    train(...)
    val(...)
    scheduler.step()

# Learning rate warmup by 10 epochs.
# The scheduler applies lr_lambda(0) on construction, so (t + 1) / 10 makes
# the first epoch train at 1/10 of the base rate (not at 0) and the tenth
# epoch at the full base rate.
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lr_lambda=lambda t: (t + 1) / 10)
for t in range(0, 10):
    train(...)
    val(...)
    scheduler.step()
11、優化器鏈式更新
從1.4版本開始,torch.optim.lr_scheduler 支持鏈式更新(chaining),即用戶可以定義兩個 schedulers,並交替在訓練中使用。
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import ExponentialLR, StepLR

# A single 2x2 parameter stands in for a real model.
model = [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))]
optimizer = SGD(model, lr=0.1)

# Both schedulers are attached to the same optimizer.
scheduler1 = ExponentialLR(optimizer, gamma=0.9)
scheduler2 = StepLR(optimizer, step_size=3, gamma=0.1)

for epoch in range(4):
    # Show the learning rate in effect for this epoch.
    current_lr = scheduler2.get_last_lr()[0]
    print(epoch, current_lr)

    optimizer.step()
    # Step the schedulers one after the other, in a fixed order.
    for scheduler in (scheduler1, scheduler2):
        scheduler.step()
12、模型訓練可視化
pip install tensorboard
tensorboard --logdir=runs
使用SummaryWriter類來收集和可視化相應的數據,為了方便查看,可以使用不同的文件夾,比如'Loss/train'和'Loss/test'。
from torch.utils.tensorboard import SummaryWriter
import numpy as np

# Created without arguments, so the writer picks its default log directory.
writer = SummaryWriter()

for n_iter in range(100):
    # Tags of the form 'Group/name' are grouped by their prefix in
    # TensorBoard; random values are logged here as dummy data.
    writer.add_scalar('Loss/train', np.random.random(), n_iter)
    writer.add_scalar('Loss/test', np.random.random(), n_iter)
    writer.add_scalar('Accuracy/train', np.random.random(), n_iter)
    writer.add_scalar('Accuracy/test', np.random.random(), n_iter)

# Flush any buffered events to disk and release the event file; without this
# the last scalars may never be written.
writer.close()
13、保存和加載斷點
# Defaults for a fresh run; both are overwritten when resuming.
start_epoch = 0
best_acc = 0

# Load checkpoint.
if resume:  # `resume` is a flag: 0 for the first run, 1 to continue an interrupted run
    model_path = os.path.join('model', 'best_checkpoint.pth.tar')
    assert os.path.isfile(model_path)
    checkpoint = torch.load(model_path)
    best_acc = checkpoint['best_acc']
    start_epoch = checkpoint['epoch']
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    print('Load checkpoint at epoch {}.'.format(start_epoch))
    print('Best accuracy so far {}.'.format(best_acc))

# Train the model
for epoch in range(start_epoch, num_epochs):
    ...

    # Test the model
    ...

    # save checkpoint
    # current_acc is the accuracy measured by the test step of this epoch.
    is_best = current_acc > best_acc
    best_acc = max(current_acc, best_acc)
    checkpoint = {
        'best_acc': best_acc,
        # Store the NEXT epoch index so a resumed run continues after the
        # epoch that has just finished.
        'epoch': epoch + 1,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    model_path = os.path.join('model', 'checkpoint.pth.tar')
    best_model_path = os.path.join('model', 'best_checkpoint.pth.tar')
    torch.save(checkpoint, model_path)
    # Keep a separate copy of the best-performing checkpoint.
    if is_best:
        shutil.copy(model_path, best_model_path)
14、提取ImageNet預訓練模型某層的特徵
# The four recipes below are alternatives; each one rebinds `model`.

# VGG-16 relu5-3 feature: the `features` stack without its last layer.
model = torchvision.models.vgg16(pretrained=True).features[:-1]

# VGG-16 pool5 feature: the complete `features` stack.
model = torchvision.models.vgg16(pretrained=True).features

# VGG-16 fc7 feature: drop the last three layers of the classifier.
model = torchvision.models.vgg16(pretrained=True)
fc7_layers = list(model.classifier.children())[:-3]
model.classifier = torch.nn.Sequential(*fc7_layers)

# ResNet GAP feature: keep every named child except the last one.
model = torchvision.models.resnet18(pretrained=True)
kept_children = list(model.named_children())[:-1]
model = torch.nn.Sequential(collections.OrderedDict(kept_children))

# Run the selected model in evaluation mode without tracking gradients.
model.eval()
with torch.no_grad():
    conv_representation = model(image)
15、提取ImageNet預訓練模型多層卷積特徵
class FeatureExtractor(torch.nn.Module):
    """Collect the outputs of selected child layers of a pre-trained model.

    The wrapped model's direct children are applied one after another, each
    receiving the previous child's output; the outputs of the children whose
    names were requested are returned.

    Attributes:
        _model, torch.nn.Module. The wrapped model, switched to eval mode.
        _layers_to_extract, set<str>. Names of the children to collect.

    Example:
        >>> model = torchvision.models.resnet152(pretrained=True)
        >>> model = torch.nn.Sequential(collections.OrderedDict(
                list(model.named_children())[:-1]))
        >>> conv_representation = FeatureExtractor(
                pretrained_model=model,
                layers_to_extract={'layer1', 'layer2', 'layer3', 'layer4'})(image)
    """

    def __init__(self, pretrained_model, layers_to_extract):
        super().__init__()
        self._layers_to_extract = set(layers_to_extract)
        self._model = pretrained_model
        self._model.eval()

    def forward(self, x):
        """Return the list of requested layer outputs, in child order."""
        collected = []
        with torch.no_grad():
            for layer_name, child in self._model.named_children():
                x = child(x)
                if layer_name not in self._layers_to_extract:
                    continue
                collected.append(x)
        return collected
16、微調全連接層
model = torchvision.models.resnet18(pretrained=True)

# Freeze every pre-trained parameter.
for param in model.parameters():
    param.requires_grad = False

# Replace the last fc layer with a new 100-way classifier. The input width
# is read from the existing layer instead of being hard-coded.
model.fc = torch.nn.Linear(model.fc.in_features, 100)

# Only the parameters of the new fc layer are handed to the optimizer.
optimizer = torch.optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9, weight_decay=1e-4)
17、以較大學習率微調全連接層,較小學習率微調卷積層
model = torchvision.models.resnet18(pretrained=True)

# Identify the fc parameters by object identity so that every other
# parameter can be selected as the complement.
finetuned_parameters = [id(p) for p in model.fc.parameters()]
conv_parameters = filter(lambda p: id(p) not in finetuned_parameters,
                         model.parameters())

# The first group overrides the learning rate with 1e-3; the fc group falls
# back to the optimizer-level lr of 1e-2.
conv_group = {'params': conv_parameters, 'lr': 1e-3}
fc_group = {'params': model.fc.parameters()}
parameters = [conv_group, fc_group]
optimizer = torch.optim.SGD(parameters, lr=1e-2, momentum=0.9, weight_decay=1e-4)