Image Classification Case Study 2
Dog Breed Identification on Kaggle (ImageNet Dogs)
In this section, we tackle the dog breed identification challenge from a Kaggle competition, hosted at https://www.kaggle.com/c/dog-breed-identification . In this competition, we try to identify 120 different breeds of dogs. The dataset used in the competition is actually a subset of the famous ImageNet dataset.
# In this notebook, training the model on the full training set with the settings used below takes roughly 40-50 minutes
# Please budget your GPU time accordingly and switch to a GPU runtime only for training
# You can also access this notebook on Kaggle:
# https://www.kaggle.com/boyuai/boyu-d2l-dog-breed-identification-imagenet-dogs
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
import os
import shutil
import time
import pandas as pd
import random
# Set random seeds for reproducibility
random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
Organizing the Dataset
We can download the dataset from the competition page; its directory structure is:
| Dog Breed Identification
| train
| | 000bec180eb18c7604dcecc8fe0dba07.jpg
| | 00a338a92e4e7bf543340dc849230e75.jpg
| | ...
| test
| | 00a3edd22dc7859c487a64777fc8d093.jpg
| | 00a6892e5c7f92c1f465e213fd904582.jpg
| | ...
| labels.csv
| sample_submission.csv
The train and test directories contain the images of the training set and the test set, respectively. The training set has 10,222 images and the test set has 10,357 images; all images are in JPEG format, and each image's file name is a unique id. labels.csv holds the labels of the training images: it has 10,222 rows, each with two columns, the first being the image id and the second the dog breed. There are 120 breeds in total.
We want to reorganize the data to make it easier to read later. Our main goals are:
Split a validation set out of the training set for tuning hyperparameters. After the split, the dataset consists of 4 parts: the split training set, the split validation set, the full training set, and the full test set.
For these 4 parts, create 4 folders: train, valid, train_valid, and test. In each of these folders, create one subfolder per class and store the images belonging to that class in it. The labels of the first three parts are known, so each has 120 subfolders, while the labels of the test set are unknown, so it gets only a single subfolder named unknown that holds all test images.
We want the reorganized dataset to have the following directory structure:
| train_valid_test
| train
| | affenpinscher
| | | 00ca18751837cd6a22813f8e221f7819.jpg
| | | ...
| | afghan_hound
| | | 0a4f1e17d720cdff35814651402b7cf4.jpg
| | | ...
| | ...
| valid
| | affenpinscher
| | | 56af8255b46eb1fa5722f37729525405.jpg
| | | ...
| | afghan_hound
| | | 0df400016a7e7ab4abff824bf2743f02.jpg
| | | ...
| | ...
| train_valid
| | affenpinscher
| | | 00ca18751837cd6a22813f8e221f7819.jpg
| | | ...
| | afghan_hound
| | | 0a4f1e17d720cdff35814651402b7cf4.jpg
| | | ...
| | ...
| test
| | unknown
| | | 00a3edd22dc7859c487a64777fc8d093.jpg
| | | ...
data_dir = '/home/kesci/input/Kaggle_Dog6357/dog-breed-identification'  # dataset directory
label_file, train_dir, test_dir = 'labels.csv', 'train', 'test'  # files and folders inside data_dir
new_data_dir = './train_valid_test'  # directory for the reorganized data
valid_ratio = 0.1  # fraction of the training set used for validation
def mkdir_if_not_exist(path):
    # Create the directory specified by path (a list of path components) if it does not exist yet
    if not os.path.exists(os.path.join(*path)):
        os.makedirs(os.path.join(*path))
def reorg_dog_data(data_dir, label_file, train_dir, test_dir, new_data_dir, valid_ratio):
    # Read the labels of the training data
    labels = pd.read_csv(os.path.join(data_dir, label_file))
    id2label = {Id: label for Id, label in labels.values}  # maps image id to label
    # Randomly shuffle the training data
    train_files = os.listdir(os.path.join(data_dir, train_dir))
    random.shuffle(train_files)
    # Original training set
    valid_ds_size = int(len(train_files) * valid_ratio)  # size of the validation set
    for i, file in enumerate(train_files):
        img_id = file.split('.')[0]  # file is a string of the form id.jpg
        img_label = id2label[img_id]
        if i < valid_ds_size:
            mkdir_if_not_exist([new_data_dir, 'valid', img_label])
            shutil.copy(os.path.join(data_dir, train_dir, file),
                        os.path.join(new_data_dir, 'valid', img_label))
        else:
            mkdir_if_not_exist([new_data_dir, 'train', img_label])
            shutil.copy(os.path.join(data_dir, train_dir, file),
                        os.path.join(new_data_dir, 'train', img_label))
        mkdir_if_not_exist([new_data_dir, 'train_valid', img_label])
        shutil.copy(os.path.join(data_dir, train_dir, file),
                    os.path.join(new_data_dir, 'train_valid', img_label))
    # Test set
    mkdir_if_not_exist([new_data_dir, 'test', 'unknown'])
    for test_file in os.listdir(os.path.join(data_dir, test_dir)):
        shutil.copy(os.path.join(data_dir, test_dir, test_file),
                    os.path.join(new_data_dir, 'test', 'unknown'))
reorg_dog_data(data_dir, label_file, train_dir, test_dir, new_data_dir, valid_ratio)
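As a quick sanity check (a small snippet added here, not part of the original notebook), we can count the images in each split; train and valid together should contain 10,222 images, train_valid should contain all 10,222, and test should contain 10,357.
def count_images(split):
    # Sum the number of files over all class subfolders of new_data_dir/split
    split_dir = os.path.join(new_data_dir, split)
    return sum(len(files) for _, _, files in os.walk(split_dir))

for split in ['train', 'valid', 'train_valid', 'test']:
    print(split, count_images(split))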
Image Augmentation
transform_train = transforms.Compose([
    # Randomly crop a region whose area is 0.08 to 1 times that of the original image and whose
    # aspect ratio is between 3/4 and 4/3, then resize it to 224 x 224 pixels
    transforms.RandomResizedCrop(224, scale=(0.08, 1.0),
                                 ratio=(3.0/4.0, 4.0/3.0)),
    # Randomly flip horizontally with probability 0.5
    transforms.RandomHorizontalFlip(),
    # Randomly jitter brightness, contrast, and saturation
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
    transforms.ToTensor(),
    # Normalize each channel; (0.485, 0.456, 0.406) and (0.229, 0.224, 0.225) are the
    # per-channel means and standard deviations computed on ImageNet
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
# Image augmentation on the test set only uses deterministic operations
transform_test = transforms.Compose([
    transforms.Resize(256),
    # Crop out the central square region with height and width 224
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
Reading the Data
# new_data_dir contains four directories: train, valid, train_valid, and test
# In each of them, every subdirectory corresponds to one class and holds all images of that class
train_ds = torchvision.datasets.ImageFolder(root=os.path.join(new_data_dir, 'train'),
transform=transform_train)
valid_ds = torchvision.datasets.ImageFolder(root=os.path.join(new_data_dir, 'valid'),
transform=transform_test)
train_valid_ds = torchvision.datasets.ImageFolder(root=os.path.join(new_data_dir, 'train_valid'),
transform=transform_train)
test_ds = torchvision.datasets.ImageFolder(root=os.path.join(new_data_dir, 'test'),
transform=transform_test)
batch_size = 128
train_iter = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_iter = torch.utils.data.DataLoader(valid_ds, batch_size=batch_size, shuffle=True)
train_valid_iter = torch.utils.data.DataLoader(train_valid_ds, batch_size=batch_size, shuffle=True)
test_iter = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=False) # shuffle=False
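To confirm the data pipeline works end to end, one quick check (a sketch added here, not in the original notebook) is to pull a single batch from train_iter and look at its shape; with the transforms above it should be a batch of 3x224x224 images.
X, y = next(iter(train_iter))
print(X.shape, y.shape)  # expected: torch.Size([128, 3, 224, 224]) torch.Size([128])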
Defining the Model
The data for this competition is a subset of the ImageNet dataset, so we use fine-tuning: we select a model pre-trained on the full ImageNet dataset and use it to extract image features, which then serve as the input of a small custom output network.
Here we use a pre-trained ResNet-34 model and directly reuse the input to its output layer, i.e. the extracted features. We then replace the output layer with one of our own. Only the parameters of this redefined output layer are trained; the feature-extraction part keeps the parameters of the pre-trained model.
def get_net(device):
    finetune_net = models.resnet34(pretrained=False)  # ResNet-34; the pre-trained weights are loaded from a local file below
    finetune_net.load_state_dict(torch.load('/home/kesci/input/resnet347742/resnet34-333f7ec4.pth'))
    for param in finetune_net.parameters():  # freeze the pre-trained parameters
        param.requires_grad = False
    # The original finetune_net.fc is a fully connected layer with 512 inputs and 1000 outputs
    # We replace it; the parameters of the new finetune_net.fc will record gradients
    finetune_net.fc = nn.Sequential(
        nn.Linear(in_features=512, out_features=256),
        nn.ReLU(),
        nn.Linear(in_features=256, out_features=120)  # 120 is the number of output classes
    )
    return finetune_net
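Since only the new output head is supposed to be trainable, a small check (a sketch added here, not part of the original notebook) is to count trainable versus frozen parameters once the network has been built with net = get_net(device):
n_trainable = sum(p.numel() for p in net.parameters() if p.requires_grad)
n_frozen = sum(p.numel() for p in net.parameters() if not p.requires_grad)
print('trainable:', n_trainable, 'frozen:', n_frozen)  # only the new fc head should be trainable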
Defining the Training Functions
def evaluate_loss_acc(data_iter, net, device):
    # Compute the average loss and accuracy on data_iter
    loss = nn.CrossEntropyLoss()
    is_training = net.training  # bool: whether net was in training mode
    net.eval()
    l_sum, acc_sum, n = 0, 0, 0
    with torch.no_grad():
        for X, y in data_iter:
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l_sum += l.item() * y.shape[0]
            acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
    net.train(is_training)  # restore the original train/eval mode of net
    return l_sum / n, acc_sum / n
def train(net, train_iter, valid_iter, num_epochs, lr, wd, device, lr_period,
          lr_decay):
    loss = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.fc.parameters(), lr=lr, momentum=0.9, weight_decay=wd)
    net = net.to(device)
    for epoch in range(num_epochs):
        train_l_sum, n, start = 0.0, 0, time.time()
        if epoch > 0 and epoch % lr_period == 0:  # decay the learning rate every lr_period epochs
            lr = lr * lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            train_l_sum += l.item() * y.shape[0]
            n += y.shape[0]
        time_s = "time %.2f sec" % (time.time() - start)
        if valid_iter is not None:
            valid_loss, valid_acc = evaluate_loss_acc(valid_iter, net, device)
            epoch_s = ("epoch %d, train loss %f, valid loss %f, valid acc %f, "
                       % (epoch + 1, train_l_sum / n, valid_loss, valid_acc))
        else:
            epoch_s = ("epoch %d, train loss %f, "
                       % (epoch + 1, train_l_sum / n))
        print(epoch_s + time_s + ', lr ' + str(lr))
Tuning Hyperparameters
num_epochs, lr_period, lr_decay = 20, 10, 0.1
lr, wd = 0.03, 1e-4
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = get_net(device)
train(net, train_iter, valid_iter, num_epochs, lr, wd, device, lr_period, lr_decay)
Training the Model on the Full Dataset
# With the settings above, training the model on the full dataset takes roughly 40-50 minutes
net = get_net(device)
train(net, train_valid_iter, None, num_epochs, lr, wd, device, lr_period, lr_decay)
Classifying the Test Set and Submitting Results
We use the trained model to make predictions on the test data. The competition requires that, for every image in the test set, we predict the probability that it belongs to each of the classes.
preds = []
net.eval()  # switch to evaluation mode so that batch normalization uses the running statistics
with torch.no_grad():
    for X, _ in test_iter:
        X = X.to(device)
        output = net(X)
        output = torch.softmax(output, dim=1)
        preds += output.tolist()
ids = sorted(os.listdir(os.path.join(new_data_dir, 'test/unknown')))
with open('submission.csv', 'w') as f:
    f.write('id,' + ','.join(train_valid_ds.classes) + '\n')
    for i, output in zip(ids, preds):
        f.write(i.split('.')[0] + ',' + ','.join(
            [str(num) for num in output]) + '\n')
Generative Adversarial Networks (GAN)
This part introduces the principles and implementation of generative adversarial networks.
Generative Adversarial Networks
Throughout most of this book, we have talked about how to make predictions. In some form or another, we used deep neural networks to learn mappings from data points to labels. This kind of learning is called discriminative learning, as in, we'd like to be able to discriminate between photos of cats and photos of dogs. Classifiers and regressors are both examples of discriminative learning. And neural networks trained by backpropagation have upended everything we thought we knew about discriminative learning on large complicated datasets. Classification accuracies on high-res images have gone from useless to human-level (with some caveats) in just 5-6 years. We will spare you another spiel about all the other discriminative tasks where deep neural networks do astoundingly well.
But there is more to machine learning than just solving discriminative tasks. For example, given a large dataset, without any labels, we might want to learn a model that concisely captures the characteristics of this data. Given such a model, we could sample synthetic data points that resemble the distribution of the training data. For example, given a large corpus of photographs of faces, we might want to be able to generate a new photorealistic image that looks like it might plausibly have come from the same dataset. This kind of learning is called generative modeling.
Until recently, we had no method that could synthesize novel photorealistic images. But the success of deep neural networks for discriminative learning opened up new possibilities. One big trend over the last three years has been the application of discriminative deep nets to overcome challenges in problems that we do not generally think of as supervised learning problems. The recurrent neural network language models are one example of using a discriminative network (trained to predict the next character) that once trained can act as a generative model.
There are two pieces in the GAN architecture. First, we need a device (say, a deep network, but it really could be anything, such as a game rendering engine) that might potentially be able to generate data that looks just like the real thing. If we are dealing with images, this needs to generate images. If we are dealing with speech, it needs to generate audio sequences, and so on. We call this the generator network. The second component is the discriminator network. It attempts to distinguish fake and real data from each other. Both networks are in competition with each other. The generator network attempts to fool the discriminator network. At that point, the discriminator network adapts to the new fake data. This information, in turn, is used to improve the generator network, and so on.
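Formally (a standard formulation added here for reference), let $D(x)$ denote the probability the discriminator assigns to $x$ being real and let $G(z)$ be the generator's output for noise $z$. The two networks play the minimax game
$$\min_G \max_D \; \mathbb{E}_{x \sim p_{\mathrm{data}}}\left[\log D(x)\right] + \mathbb{E}_{z \sim p_z}\left[\log\left(1 - D(G(z))\right)\right].$$
The discriminator ascends this objective to tell real and fake data apart, while the generator descends it to make its samples indistinguishable from real data.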
%matplotlib inline
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torch import nn
import numpy as np
from torch.autograd import Variable
import torch
Generate some "real" data
Since this is going to be the world's lamest example, we simply generate data drawn from a Gaussian.
X=np.random.normal(size=(1000,2))
A=np.array([[1,2],[-0.1,0.5]])
b=np.array([1,2])
data=X.dot(A)+b
plt.figure(figsize=(3.5,2.5))
plt.scatter(X[:100,0],X[:100,1],color='red')
plt.show()
plt.figure(figsize=(3.5,2.5))
plt.scatter(data[:100,0],data[:100,1],color='blue')
plt.show()
print("The covariance matrix is\n%s" % np.dot(A.T, A))
The covariance matrix is
[[1.01 1.95]
[1.95 4.25]]
batch_size=8
data_iter=DataLoader(data,batch_size=batch_size)
Generator
Our generator network will be the simplest network possible - a single layer linear model. This is because we will be driving that linear network with a Gaussian data generator. Hence, it literally only needs to learn the parameters to fake things perfectly.
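To spell out why a single linear layer suffices (a short derivation, not in the original text): if $z \sim \mathcal{N}(0, I)$ and the generator computes $G(z) = zW + c$, then $G(z) \sim \mathcal{N}(c, W^\top W)$. The real data above is $XA + b$ with $X \sim \mathcal{N}(0, I)$, i.e. distributed as $\mathcal{N}(b, A^\top A)$, so the generator matches the data distribution exactly whenever
$$W^\top W = A^\top A \quad \text{and} \quad c = b.$$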
class net_G(nn.Module):
def __init__(self):
super(net_G,self).__init__()
self.model=nn.Sequential(
nn.Linear(2,2),
)
self._initialize_weights()
def forward(self,x):
x=self.model(x)
return x
def _initialize_weights(self):
for m in self.modules():
if isinstance(m,nn.Linear):
m.weight.data.normal_(0,0.02)
m.bias.data.zero_()
Discriminator
For the discriminator we will be a bit more discriminating: we will use an MLP with 3 layers to make things a bit more interesting.
class net_D(nn.Module):
def __init__(self):
super(net_D,self).__init__()
self.model=nn.Sequential(
nn.Linear(2,5),
nn.Tanh(),
nn.Linear(5,3),
nn.Tanh(),
nn.Linear(3,1),
nn.Sigmoid()
)
self._initialize_weights()
def forward(self,x):
x=self.model(x)
return x
def _initialize_weights(self):
for m in self.modules():
if isinstance(m,nn.Linear):
m.weight.data.normal_(0,0.02)
m.bias.data.zero_()
Training
First we define a function to update the discriminator.
# Saved in the d2l package for later use
def update_D(X,Z,net_D,net_G,loss,trainer_D):
batch_size=X.shape[0]
Tensor=torch.FloatTensor
ones=Variable(Tensor(np.ones(batch_size))).view(batch_size,1)
zeros = Variable(Tensor(np.zeros(batch_size))).view(batch_size,1)
real_Y=net_D(X.float())
fake_X=net_G(Z)
fake_Y=net_D(fake_X)
loss_D=(loss(real_Y,ones)+loss(fake_Y,zeros))/2
loss_D.backward()
trainer_D.step()
return float(loss_D.sum())
The generator is updated similarly. Here we reuse the cross-entropy loss but change the label of the fake data from 0 to 1.
# Saved in the d2l package for later use
def update_G(Z,net_D,net_G,loss,trainer_G):
batch_size=Z.shape[0]
Tensor=torch.FloatTensor
ones=Variable(Tensor(np.ones((batch_size,)))).view(batch_size,1)
fake_X=net_G(Z)
fake_Y=net_D(fake_X)
loss_G=loss(fake_Y,ones)
loss_G.backward()
trainer_G.step()
return float(loss_G.sum())
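Flipping the fake label from 0 to 1 means the generator minimizes the so-called non-saturating loss (a standard reformulation, stated here for clarity)
$$L_G = -\,\mathbb{E}_{z \sim p_z}\left[\log D(G(z))\right]$$
rather than minimizing $\mathbb{E}_{z}[\log(1 - D(G(z)))]$; both have the same fixed point, but the former provides much stronger gradients early in training, when the discriminator can easily reject the generated samples.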
Both the discriminator and the generator perform binary logistic regression with the cross-entropy loss. We use Adam to smooth the training process. In each iteration, we first update the discriminator and then the generator. We visualize both losses and generated examples.
def train(net_D,net_G,data_iter,num_epochs,lr_D,lr_G,latent_dim,data):
loss=nn.BCELoss()
Tensor=torch.FloatTensor
trainer_D=torch.optim.Adam(net_D.parameters(),lr=lr_D)
trainer_G=torch.optim.Adam(net_G.parameters(),lr=lr_G)
plt.figure(figsize=(7,4))
d_loss_point=[]
g_loss_point=[]
d_loss=0
g_loss=0
for epoch in range(1,num_epochs+1):
d_loss_sum=0
g_loss_sum=0
batch=0
for X in data_iter:
batch+=1
X=Variable(X)
batch_size=X.shape[0]
Z=Variable(Tensor(np.random.normal(0,1,(batch_size,latent_dim))))
trainer_D.zero_grad()
d_loss = update_D(X, Z, net_D, net_G, loss, trainer_D)
d_loss_sum+=d_loss
trainer_G.zero_grad()
g_loss = update_G(Z, net_D, net_G, loss, trainer_G)
g_loss_sum+=g_loss
d_loss_point.append(d_loss_sum/batch)
g_loss_point.append(g_loss_sum/batch)
plt.ylabel('Loss', fontdict={'size': 14})
plt.xlabel('epoch', fontdict={'size': 14})
plt.xticks(range(0,num_epochs+1,3))
plt.plot(range(1,num_epochs+1),d_loss_point,color='orange',label='discriminator')
plt.plot(range(1,num_epochs+1),g_loss_point,color='blue',label='generator')
plt.legend()
plt.show()
print(d_loss,g_loss)
Z =Variable(Tensor( np.random.normal(0, 1, size=(100, latent_dim))))
fake_X=net_G(Z).detach().numpy()
plt.figure(figsize=(3.5,2.5))
plt.scatter(data[:,0],data[:,1],color='blue',label='real')
plt.scatter(fake_X[:,0],fake_X[:,1],color='orange',label='generated')
plt.legend()
plt.show()
Now we specify the hyper-parameters to fit the Gaussian distribution.
if __name__ == '__main__':
lr_D,lr_G,latent_dim,num_epochs=0.05,0.005,2,20
generator=net_G()
discriminator=net_D()
train(discriminator,generator,data_iter,num_epochs,lr_D,lr_G,latent_dim,data)
0.6932446360588074 0.6927103996276855
DCGAN
This part introduces the principles and implementation of Deep Convolutional Generative Adversarial Networks.
Deep Convolutional Generative Adversarial Networks
In the previous section, we introduced the basic ideas behind how GANs work. We showed that they can draw samples from some simple, easy-to-sample distribution, like a uniform or normal distribution, and transform them into samples that appear to match the distribution of some dataset. And while our example of matching a 2D Gaussian distribution got the point across, it is not especially exciting.
In this section, we will demonstrate how you can use GANs to generate photorealistic images. We will be basing our models on the deep convolutional GANs (DCGAN) introduced in Radford, Metz, and Chintala (2015). We will borrow the convolutional architecture that has proven so successful for discriminative computer vision problems and show how, via GANs, it can be leveraged to generate photorealistic images.
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torch import nn
import numpy as np
from torch.autograd import Variable
import torch
from torchvision.datasets import ImageFolder
from torchvision.transforms import transforms
import zipfile
cuda = True if torch.cuda.is_available() else False
print(cuda)
The Pokemon Dataset
The dataset we will use is a collection of Pokemon sprites obtained from pokemondb. First download, extract and load this dataset.
We resize each image to 64×64. The ToTensor transformation will project the pixel values into [0, 1], while our generator will use the tanh function to obtain outputs in [-1, 1]. Therefore we normalize the data with a mean of 0.5 and a standard deviation of 0.5 to match the value range.
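Concretely, normalizing with mean $0.5$ and standard deviation $0.5$ applies
$$x' = \frac{x - 0.5}{0.5} = 2x - 1,$$
which maps pixel values from $[0, 1]$ to $[-1, 1]$, matching the range of the generator's tanh output.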
data_dir='/home/kesci/input/pokemon8600/'
batch_size=256
transform=transforms.Compose([
transforms.Resize((64,64)),
transforms.ToTensor(),
transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))
])
pokemon=ImageFolder(data_dir+'pokemon',transform)
data_iter=DataLoader(pokemon,batch_size=batch_size,shuffle=True)
Let's visualize the first 20 images.
fig=plt.figure(figsize=(4,4))
imgs=data_iter.dataset.imgs
for i in range(20):
img = plt.imread(imgs[i*150][0])
plt.subplot(4,5,i+1)
plt.imshow(img)
plt.axis('off')
plt.show()
class G_block(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size=4,strides=2, padding=1):
super(G_block,self).__init__()
self.conv2d_trans=nn.ConvTranspose2d(in_channels, out_channels, kernel_size=kernel_size,
stride=strides, padding=padding, bias=False)
self.batch_norm=nn.BatchNorm2d(out_channels,0.8)
self.activation=nn.ReLU()
def forward(self,x):
return self.activation(self.batch_norm(self.conv2d_trans(x)))
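Each G_block upsamples its input with a transposed convolution followed by batch normalization and a ReLU activation. By the transposed-convolution output formula (stated here as a reminder)
$$n_{\text{out}} = (n_{\text{in}} - 1)\,s - 2p + k,$$
the default kernel $k=4$, stride $s=2$, padding $p=1$ doubles the height and width, e.g. $(16-1)\cdot 2 - 2 + 4 = 32$, while $k=4$, $s=1$, $p=0$ turns a $1\times 1$ input into $4\times 4$. The two checks below verify exactly this.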
Tensor=torch.cuda.FloatTensor
x=Variable(Tensor(np.zeros((2,3,16,16))))
g_blk=G_block(3,20)
g_blk.cuda()
print(g_blk(x).shape)
out
torch.Size([2, 20, 32, 32])
x=Variable(Tensor(np.zeros((2,3,1,1))))
g_blk=G_block(3,20,strides=1,padding=0)
g_blk.cuda()
print(g_blk(x).shape)
out
torch.Size([2, 20, 4, 4])
class net_G(nn.Module):
def __init__(self,in_channels):
super(net_G,self).__init__()
n_G=64
self.model=nn.Sequential(
G_block(in_channels,n_G*8,strides=1,padding=0),
G_block(n_G*8,n_G*4),
G_block(n_G*4,n_G*2),
G_block(n_G*2,n_G),
nn.ConvTranspose2d(
n_G,3,kernel_size=4,stride=2,padding=1,bias=False
),
nn.Tanh()
)
def forward(self,x):
x=self.model(x)
return x
def weights_init_normal(m):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
torch.nn.init.normal_(m.weight.data, mean=0, std=0.02)
elif classname.find("BatchNorm2d") != -1:
torch.nn.init.normal_(m.weight.data, mean=1.0, std=0.02)
torch.nn.init.constant_(m.bias.data, 0.0)
Generate a 100-dimensional latent variable to verify the generator's output shape.
x=Variable(Tensor(np.zeros((1,100,1,1))))
generator=net_G(100)
generator.cuda()
generator.apply(weights_init_normal)
print(generator(x).shape)
out
torch.Size([1, 3, 64, 64])
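The discriminator will use the leaky ReLU activation; as a reminder (added here, since the plot below otherwise appears without context), for a slope $\alpha \in [0, 1]$ it is defined as
$$\textrm{leaky ReLU}(x) = \begin{cases} x & \text{if } x > 0,\\ \alpha x & \text{otherwise,} \end{cases}$$
so negative inputs still receive a nonzero gradient, which avoids the "dying ReLU" problem. The code below plots it for several values of $\alpha$.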
alphas = [0, 0.2, 0.4, .6]
x = np.arange(-2, 1, 0.1)
Y = [nn.LeakyReLU(alpha)(Tensor(x)).cpu().numpy() for alpha in alphas]
plt.figure(figsize=(4,4))
for y in Y:
plt.plot(x,y)
plt.show()
The basic block of the discriminator is a convolution layer followed by a batch normalization layer and a leaky ReLU activation. The hyper-parameters of the convolution layer are similar to the transpose convolution layer in the generator block.
class D_block(nn.Module):
def __init__(self,in_channels,out_channels,kernel_size=4,strides=2,
padding=1,alpha=0.2):
super(D_block,self).__init__()
self.conv2d=nn.Conv2d(in_channels,out_channels,kernel_size,strides,padding,bias=False)
self.batch_norm=nn.BatchNorm2d(out_channels,0.8)
self.activation=nn.LeakyReLU(alpha)
def forward(self,X):
return self.activation(self.batch_norm(self.conv2d(X)))
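By the convolution output formula
$$n_{\text{out}} = \left\lfloor \frac{n_{\text{in}} + 2p - k}{s} \right\rfloor + 1,$$
the default kernel $k=4$, stride $s=2$, padding $p=1$ halves the height and width, e.g. $\lfloor(16 + 2 - 4)/2\rfloor + 1 = 8$, as the check below confirms.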
x = Variable(Tensor(np.zeros((2, 3, 16, 16))))
d_blk = D_block(3,20)
d_blk.cuda()
print(d_blk(x).shape)
out
torch.Size([2, 20, 8, 8])
The discriminator is a mirror of the generator.
class net_D(nn.Module):
def __init__(self,in_channels):
super(net_D,self).__init__()
n_D=64
self.model=nn.Sequential(
D_block(in_channels,n_D),
D_block(n_D,n_D*2),
D_block(n_D*2,n_D*4),
D_block(n_D*4,n_D*8)
)
self.conv=nn.Conv2d(n_D*8,1,kernel_size=4,bias=False)
self.activation=nn.Sigmoid()
# self._initialize_weights()
def forward(self,x):
x=self.model(x)
x=self.conv(x)
x=self.activation(x)
return x
It uses a convolution layer with output channel 1 as the last layer to obtain a single prediction value.
x = Variable(Tensor(np.zeros((1, 3, 64, 64))))
discriminator=net_D(3)
discriminator.cuda()
discriminator.apply(weights_init_normal)
print(discriminator(x).shape)
out
torch.Size([1, 1, 1, 1])
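Since the last D_block leaves a $4\times 4$ feature map, the final convolution with kernel 4, stride 1, and no padding produces $(4 - 4)/1 + 1 = 1$, i.e. a single $1\times 1$ logit per image, which matches the shape printed above.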
def update_D(X,Z,net_D,net_G,loss,trainer_D):
batch_size=X.shape[0]
Tensor=torch.cuda.FloatTensor
ones=Variable(Tensor(np.ones(batch_size,)),requires_grad=False).view(batch_size,1)
zeros = Variable(Tensor(np.zeros(batch_size,)),requires_grad=False).view(batch_size,1)
real_Y=net_D(X).view(batch_size,-1)
fake_X=net_G(Z)
fake_Y=net_D(fake_X).view(batch_size,-1)
loss_D=(loss(real_Y,ones)+loss(fake_Y,zeros))/2
loss_D.backward()
trainer_D.step()
return float(loss_D.sum())
def update_G(Z,net_D,net_G,loss,trainer_G):
batch_size=Z.shape[0]
Tensor=torch.cuda.FloatTensor
ones=Variable(Tensor(np.ones((batch_size,))),requires_grad=False).view(batch_size,1)
fake_X=net_G(Z)
fake_Y=net_D(fake_X).view(batch_size,-1)
loss_G=loss(fake_Y,ones)
loss_G.backward()
trainer_G.step()
return float(loss_G.sum())
def train(net_D,net_G,data_iter,num_epochs,lr,latent_dim):
loss=nn.BCELoss()
Tensor=torch.cuda.FloatTensor
trainer_D=torch.optim.Adam(net_D.parameters(),lr=lr,betas=(0.5,0.999))
trainer_G=torch.optim.Adam(net_G.parameters(),lr=lr,betas=(0.5,0.999))
plt.figure(figsize=(7,4))
d_loss_point=[]
g_loss_point=[]
d_loss=0
g_loss=0
for epoch in range(1,num_epochs+1):
d_loss_sum=0
g_loss_sum=0
batch=0
for X in data_iter:
X=X[:][0]
batch+=1
X=Variable(X.type(Tensor))
batch_size=X.shape[0]
Z=Variable(Tensor(np.random.normal(0,1,(batch_size,latent_dim,1,1))))
trainer_D.zero_grad()
d_loss = update_D(X, Z, net_D, net_G, loss, trainer_D)
d_loss_sum+=d_loss
trainer_G.zero_grad()
g_loss = update_G(Z, net_D, net_G, loss, trainer_G)
g_loss_sum+=g_loss
d_loss_point.append(d_loss_sum/batch)
g_loss_point.append(g_loss_sum/batch)
        print(
            "[Epoch %d/%d] [D loss: %f] [G loss: %f]"
            % (epoch, num_epochs, d_loss_sum/batch, g_loss_sum/batch)  # average loss per batch in this epoch
        )
plt.ylabel('Loss', fontdict={ 'size': 14})
plt.xlabel('epoch', fontdict={ 'size': 14})
plt.xticks(range(0,num_epochs+1,3))
plt.plot(range(1,num_epochs+1),d_loss_point,color='orange',label='discriminator')
plt.plot(range(1,num_epochs+1),g_loss_point,color='blue',label='generator')
plt.legend()
plt.show()
print(d_loss,g_loss)
Z = Variable(Tensor(np.random.normal(0, 1, size=(21, latent_dim, 1, 1))),requires_grad=False)
    fake_x = net_G(Z)  # use the generator passed to train()
fake_x=fake_x.cpu().detach().numpy()
plt.figure(figsize=(14,6))
for i in range(21):
im=np.transpose(fake_x[i])
plt.subplot(3,7,i+1)
plt.imshow(im)
plt.show()
Now let's train the model.
if __name__ == '__main__':
lr,latent_dim,num_epochs=0.005,100,50
train(discriminator,generator,data_iter,num_epochs,lr,latent_dim)