Mount Google Drive so the dataset doesn't have to be downloaded again every session
from google.colab import drive
drive.mount('/content/drive')
Import packages, set up the GPU, and fix the random seeds
import numpy as np
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
import torchvision
from torchvision import models, transforms, datasets
import time
import json
import shutil
from PIL import Image
import csv
# Check whether a GPU is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Using gpu: %s ' % torch.cuda.is_available())
# Set random seeds so results can be reproduced
torch.manual_seed(10000)           # seed the CPU RNG
torch.cuda.manual_seed(10000)      # seed the current GPU
torch.cuda.manual_seed_all(10000)  # seed all GPUs
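As noted under "Problems encountered" below, seeding PyTorch alone may not make runs exactly reproducible. A minimal sketch of a fuller setup, which also seeds Python and NumPy and pins cuDNN to deterministic kernels (the exact flags needed can vary by PyTorch version):

import random
random.seed(10000)                          # Python's built-in RNG
np.random.seed(10000)                       # NumPy's RNG
torch.backends.cudnn.deterministic = True   # force deterministic cuDNN kernels
torch.backends.cudnn.benchmark = False      # disable nondeterministic autotuning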
Download the dataset and sort the images into class folders
#! wget https://static.leiphone.com/cat_dog.rar
!unrar x "/content/drive/My Drive/catdog/cat_dog.rar" "/content/sample_data"
%cd sample_data/
# Put the cat and dog images of the train and val sets into separate subfolders so that ImageFolder can read them
for x in ['train', 'val']:
    imgPath = "cat_dog/" + x
    pathlist = os.listdir(imgPath)
    cat_destination = 'cat_dog/' + x + '/cat/'
    dog_destination = 'cat_dog/' + x + '/dog/'
    # exist_ok avoids a crash when only one of the two folders already exists
    os.makedirs(cat_destination, exist_ok=True)
    os.makedirs(dog_destination, exist_ok=True)
    # Classify by the class name embedded in the filename and move each file into its class folder
    for item in pathlist:
        name, ext = os.path.splitext(item)
        if ext == '.jpg' and 'cat' in name:
            print(os.path.join(imgPath, item))
            shutil.move(os.path.join(imgPath, item), cat_destination)
        elif ext == '.jpg' and 'dog' in name:
            print(os.path.join(imgPath, item))
            shutil.move(os.path.join(imgPath, item), dog_destination)
Load the dataset and preprocess the images
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
resnet_format = transforms.Compose([
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
])
data_dir = './cat_dog'
dsets = {x: datasets.ImageFolder(os.path.join(data_dir, x), resnet_format)
         for x in ['train', 'val']}
dset_sizes = {x: len(dsets[x]) for x in ['train', 'val']}
dset_classes = dsets['train'].classes
# ResNet-152 needs too much GPU memory at larger sizes, so the batch size is reduced to 48
loader_train = torch.utils.data.DataLoader(dsets['train'], batch_size=48, shuffle=True, num_workers=6)
loader_valid = torch.utils.data.DataLoader(dsets['val'], batch_size=5, shuffle=False, num_workers=6)
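A quick optional sanity check that ImageFolder picked up both classes and that batches come out in the expected shape:

print(dset_sizes)       # number of images found in train and val
print(dset_classes)     # ['cat', 'dog'] -- subfolder names in alphabetical order
inputs, classes = next(iter(loader_train))
print(inputs.shape)     # expected: torch.Size([48, 3, 224, 224])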
Load ResNet-152 and replace its fully connected layer
model = models.resnet152(pretrained=True)
model_new = model  # note: this is an alias, not a copy; both names refer to the same model
model_new.fc = nn.Linear(2048, 2, bias=True)
model_new = model_new.to(device)
print(model_new)
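The notes below mention an experiment that froze everything before the FC layer. A minimal sketch of how that variant could be set up (not the configuration used for the results here): freeze the backbone first, then replace fc so only the new layer stays trainable, and hand the optimizer just those parameters.

for param in model_new.parameters():
    param.requires_grad = False        # freeze the pretrained backbone
model_new.fc = nn.Linear(2048, 2)      # a freshly created layer is trainable by default
optimizer = torch.optim.SGD(model_new.fc.parameters(), lr=0.001, momentum=0.9)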
Training hyperparameters
# Cross-entropy loss
criterion = nn.CrossEntropyLoss()
# Learning rate 0.001, multiplied by 0.1 every 10 epochs (see the StepLR scheduler below)
lr = 0.001
# SGD with momentum to speed up convergence; weight decay to curb overfitting
optimizer = torch.optim.SGD(model_new.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
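For comparison, the Adam variant discussed in the notes below (better for VGG16, worse than SGD for ResNet here) would be a one-line swap; the learning rate shown is just a typical starting point for Adam, not a tuned value:

# optimizer = torch.optim.Adam(model_new.parameters(), lr=1e-4, weight_decay=5e-4)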
Model training
def val_model(model, dataloader, size):
    model.eval()
    predictions = np.zeros(size)
    all_classes = np.zeros(size)
    all_proba = np.zeros((size, 2))
    i = 0
    running_loss = 0.0
    running_corrects = 0
    with torch.no_grad():
        for inputs, classes in dataloader:
            inputs = inputs.to(device)
            classes = classes.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, classes)
            _, preds = torch.max(outputs, 1)
            # statistics
            running_loss += loss.item()
            running_corrects += torch.sum(preds == classes)
            # predictions[i:i+len(classes)] = preds.to('cpu').numpy()
            # all_classes[i:i+len(classes)] = classes.to('cpu').numpy()
            # all_proba[i:i+len(classes), :] = outputs.to('cpu').numpy()
            i += len(classes)
    epoch_loss = running_loss / size
    epoch_acc = running_corrects.item() / size
    return epoch_loss, epoch_acc
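If the commented-out bookkeeping arrays above are re-enabled, they make it easy to see where the model goes wrong. For example, a 2x2 confusion matrix in plain NumPy (a sketch, assuming predictions and all_classes have been filled in):

conf = np.zeros((2, 2), dtype=int)
for t, p in zip(all_classes.astype(int), predictions.astype(int)):
    conf[t, p] += 1        # rows: true class, columns: predicted class
print(conf)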
def train_model(model, dataloader, size, epochs=1, optimizer=None):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        running_corrects = 0
        count = 0
        for inputs, classes in dataloader:
            inputs = inputs.to(device)
            classes = classes.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, classes)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            _, preds = torch.max(outputs, 1)
            # statistics
            running_loss += loss.item()
            running_corrects += torch.sum(preds == classes)
            count += len(inputs)
        epoch_loss = running_loss / size
        epoch_acc = running_corrects.item() / size
        epoch_Valloss, epoch_Valacc = val_model(model, loader_valid, dset_sizes['val'])
        print('epoch: ', epoch, ' Loss: {:.5f} Acc: {:.5f} ValLoss: {:.5f} ValAcc: {:.5f}'.format(
            epoch_loss, epoch_acc, epoch_Valloss, epoch_Valacc))
        # scheduler is the global StepLR defined below; step once per epoch
        scheduler.step()
# Learning rate decay: multiply the LR by 0.1 every 10 epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
# Train the model
train_model(model_new, loader_train, size=dset_sizes['train'], epochs=20,
            optimizer=optimizer)
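To confirm that StepLR really decays the learning rate every 10 epochs, the current value can be read off the optimizer, e.g. by printing it once per epoch inside train_model:

print('lr:', optimizer.param_groups[0]['lr'])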
Test the model and write the predictions to a CSV file
model_new.eval()
# newline='' keeps the csv module from writing blank lines on some platforms
csvfile = open('csv.csv', 'w', newline='')
writer = csv.writer(csvfile)
test_root = './cat_dog/test/'
img_test = os.listdir(test_root)
img_test.sort(key=lambda x: int(x[:-4]))   # filenames are numeric, e.g. 0.jpg, 1.jpg, ...
with torch.no_grad():                      # no gradients needed at test time
    for i in range(len(img_test)):
        img = Image.open(test_root + img_test[i]).convert('RGB')
        img_tensor = resnet_format(img).unsqueeze(0).to(device)
        output = model_new(img_tensor)
        _, pred = torch.max(output, 1)
        print(i, pred.item())
        writer.writerow([i, pred.item()])
csvfile.close()
The training and validation results are as follows:
The test results are as follows:
I started with VGG16, freezing all parameters before the FC layer and switching the optimizer from SGD to Adam; a single epoch already scored 98.1 on the test set. More epochs brought little further improvement, so I switched to ResNet.
Problems encountered:
- When training on Colab, don't keep the dataset in Google Drive! Drive storage and the compute servers are separate, so every read is a network request to Drive, and training speed gets throttled by network speed, especially when transferring a large number of small image files.
- To make results reproducible I tried setting PyTorch's random seeds, but that alone doesn't guarantee exactly identical runs; cuDNN, Python, and NumPy probably need seeding as well (see the fuller seeding sketch after the seed-setting code above). Reference: PyTorch的可重復性問題 (如何使實驗結果可復現)
- When training ResNet I also tried freezing all parameters before the FC layer, but the results were unsatisfactory (the freezing sketch after the FC-replacement code above shows one way to set this up).
- In most cases Adam works better than SGD, yet with ResNet here SGD beat Adam.
- The network structure can also be modified by subclassing, as the example below shows:
class Net(nn.Module):
    def __init__(self, model):
        super(Net, self).__init__()
        # Keep everything except model's last layer (the original fc)
        self.resnet_layer = nn.Sequential(*list(model.children())[:-1])
        # Append a new fully connected layer with the desired output size
        self.Linear_layer = nn.Linear(2048, 2)

    def forward(self, x):
        x = self.resnet_layer(x)
        x = x.view(x.size(0), -1)
        x = self.Linear_layer(x)
        return x

model_new = Net(model)
model_new = model_new.to(device)
- The learning rate can be adjusted dynamically. Too small and convergence is slow; too large and the parameters oscillate around the optimum. A common strategy is to start with a relatively large learning rate and keep decaying it during training. Reference: 動態調整Learning Rate:TORCH.OPTIM.LR_SCHEDULER
- model.eval() and with torch.no_grad() can be used together to save even more computation. Reference: 深入理解model.eval()與torch.no_grad()
To be resolved
The training loss and the validation loss differ by roughly a factor of ten; did I write something wrong somewhere?
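One likely explanation, rather than a modeling bug: CrossEntropyLoss returns the mean loss over a batch by default, so running_loss accumulates one batch-mean per batch; dividing that sum by the dataset size gives roughly mean_loss / batch_size. The reported value therefore depends on batch size, and with 48 for training versus 5 for validation the ratio is about 9.6, i.e. the observed ~10x gap. A sketch of the fix, weighting each batch by its size in both train_model and val_model:

running_loss += loss.item() * inputs.size(0)   # undo the per-batch averaging
# epoch_loss = running_loss / size then becomes a true per-sample average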
Gripes
The Google Drive mount feature was under maintenance for a day, then Colab's usage quota locked me out for another day; free compute really isn't that easy to milk. - -