在上一篇博客中我們介紹並實現了自動編碼器,本文將用PyTorch實現變分自動編碼器(Variational AutoEncoder, VAE)。變分自動編碼器與一般自動編碼器的區別在於:需要在編碼過程中增加一點限制,迫使它生成的隱含向量能夠粗略地遵循標準正態分布。這樣一來,當需要生成一張新圖片時,只需要給解碼器一個服從標準正態分布的隨機隱含向量就可以了。
在實際操作中,編碼器並不是直接生成一個隱含向量,而是生成兩個向量:一個表示均值,一個表示標準差(下面的代碼中實際輸出的是對數方差logvar,標準差由exp(0.5*logvar)得到),然後通過這兩個統計量合成隱含向量:從標準正態分布中採樣一個隨機向量,先乘以標準差,再加上均值即可。具體關於變分自動編碼器的內容,可參考廖星宇的《深度學習之PyTorch》的第六章,下面的代碼也是來自這個資料,但本文對原代碼做了一點改動。
import os
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision import transforms as tfs
from torchvision.utils import save_image
# Training hyper-parameters.
EPOCH = 1
LR = 1e-3
BATCHSIZE = 128

# Preprocessing pipeline: ToTensor converts a PIL.Image or numpy.ndarray to a
# float tensor of shape (C x H x W) scaled to [0.0, 1.0]; Normalize with mean
# 0.5 and std 0.5 then stretches that range to [-1.0, 1.0].
im_tfs = tfs.Compose([tfs.ToTensor(), tfs.Normalize(mean=[0.5], std=[0.5])])

# MNIST is expected to be present under this root already; no download is
# requested here.
train_set = MNIST(
    '/Users/wangpeng/Desktop/all/CS/Courses/Deep Learning/mofan_PyTorch/mnist/',
    train=True,
    transform=im_tfs,
)
train_loader = DataLoader(dataset=train_set, batch_size=BATCHSIZE, shuffle=True)
class VAE(nn.Module):
    """Fully connected variational autoencoder for flattened images.

    The encoder maps an input vector to the mean and log-variance of a
    diagonal Gaussian over the latent code. The decoder maps a latent code
    back to input space through a tanh, so reconstructions lie in [-1, 1].

    Args:
        input_dim: Length of each flattened input vector (default 784).
        hidden_dim: Width of the hidden layer in encoder and decoder
            (default 400).
        latent_dim: Length of the latent code (default 20).
    """

    def __init__(self, input_dim=784, hidden_dim=400, latent_dim=20):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc21 = nn.Linear(hidden_dim, latent_dim)  # latent mean
        self.fc22 = nn.Linear(hidden_dim, latent_dim)  # latent log-variance
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        """Return ``(mu, logvar)`` of the latent Gaussian for input ``x``."""
        h1 = F.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)

    def reparametrize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps`` drawn from N(0, I).

        The noise is created with ``randn_like`` so it always shares the
        device and dtype of ``logvar``; deciding the device from
        ``torch.cuda.is_available()`` breaks when the model itself is kept on
        the CPU of a CUDA-capable machine.
        """
        std = torch.exp(0.5 * logvar)  # sigma = exp(log(sigma^2) / 2)
        eps = torch.randn_like(std)
        return eps * std + mu

    def decode(self, z):
        """Map latent code ``z`` to a reconstruction with values in [-1, 1]."""
        h3 = F.relu(self.fc3(z))
        return torch.tanh(self.fc4(h3))

    def forward(self, x):
        """Encode ``x``, sample a latent code, and decode it.

        Returns:
            A tuple ``(reconstruction, mu, logvar)``.
        """
        mu, logvar = self.encode(x)
        z = self.reparametrize(mu, logvar)
        return self.decode(z), mu, logvar
# Instantiate the network, placing it on the GPU when CUDA is available.
net = VAE().cuda() if torch.cuda.is_available() else VAE()
# Summed (not averaged) squared error. ``reduction='sum'`` replaces the
# deprecated ``size_average=False`` argument and yields the same value.
reconstruction_function = nn.MSELoss(reduction='sum')


def loss_function(recon_x, x, mu, logvar):
    """Return the VAE loss: summed reconstruction error plus KL divergence.

    Args:
        recon_x: Reconstructed images produced by the decoder.
        x: Original images, same shape as ``recon_x``.
        mu: Latent mean.
        logvar: Latent log-variance.

    Returns:
        A scalar tensor, summed over every element of the batch (the caller
        is responsible for any averaging).
    """
    recon_loss = reconstruction_function(recon_x, x)
    # KL(N(mu, sigma^2) || N(0, 1))
    #   = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss + kld
# Adam optimizer over all parameters of the network, with learning rate LR.
optimizer = torch.optim.Adam(params=net.parameters(), lr=LR)
def to_img(x):
    """Convert flattened network outputs back into image tensors.

    ``x`` has shape (batch_size, 28*28) with pixel values nominally in
    [-1., 1.]. Values are rescaled to [0, 1], clamped to that range, and
    reshaped to (batch_size, 1, 28, 28).
    """
    batch = x.size(0)
    return x.add(1.).mul(0.5).clamp(min=0, max=1).view(batch, 1, 28, 28)
# Train the VAE. Every 100 iterations, log the per-sample loss and save the
# current batch of reconstructions as an image grid.
# The output directory is created once up front instead of being checked on
# every logging step.
os.makedirs('./vae_img', exist_ok=True)
for epoch in range(EPOCH):
    for iteration, (im, _) in enumerate(train_loader):
        im = im.view(im.shape[0], -1)  # flatten each image to a vector
        if torch.cuda.is_available():
            im = im.cuda()
        recon_im, mu, logvar = net(im)
        # loss_function returns a batch sum; divide to get the per-sample mean.
        loss = loss_function(recon_im, im, mu, logvar) / im.shape[0]
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if iteration % 100 == 0:
            # .item() works for both CPU and CUDA tensors, whereas
            # .data.numpy() raises when the loss lives on the GPU.
            print('epoch: {:2d} | iteration: {:4d} | Loss: {:.4f}'.format(epoch, iteration, loss.item()))
            save = to_img(recon_im.detach().cpu())
            save_image(save, './vae_img/image_{}_{}.png'.format(epoch, iteration))
# Sampling test: decode one latent vector drawn from a standard normal
# distribution and save the generated image.
os.makedirs('./vae_img', exist_ok=True)  # may not exist if no training step saved an image
# Create the latent code on the same device as the network so decoding does
# not fail with a CPU/CUDA mismatch.
code = torch.randn(1, 20, device=next(net.parameters()).device)
with torch.no_grad():  # inference only; no autograd graph is needed
    out = net.decode(code)
img = to_img(out.cpu())
save_image(img, './vae_img/test_img.png')
