什么是生成式對抗網絡GAN
(本教程的代碼,訓練數據全部來自—《深度學習框架PyTorch入門與實踐》, many thanks,此書采用PyTorch 0.4.0版本,API接口與1.0有所差別,1.0版本PyTorch中已經不推薦使用Variable)
開發/測試環境
- Ubuntu 18.04
- anaconda3, python3.6
- pycharm
- pytorch 1.0
訓練過程
剛開始訓練,輸入為噪聲向量, 生成的圖像也是噪聲
image.png
訓練了幾十次迭代之后
image.png
隨著迭代次數增加,逐漸產生輪廓,仔細觀察剛開始生成的圖像為黑白灰度圖像,沒有彩色信息。
image.png
image.png
image.png
繼續迭代,逐漸產生了彩色信息。
image.png
image.png
image.png
image.png
image.png
image.png
image.png
Loss曲線的變化
image.png
image.png
image.png
image.png
image.png
使用GPU進行訓練
CPU進行訓練太慢了,筆者采用Intel i7 5500u CPU進行訓練,一秒鐘大概只能迭代一次,而且batch size設置為4~8。之后切換到GPU上(Nvidia 1080ti), 單塊GPU, 計算速度為20~30iter/sec, batch size=64, 直觀上比CPU計算快20倍多。
image.png
迭代30K次
image.png
Process
深度錄屏_選擇區域_20190206000807.gif
深度錄屏_TeamViewer_20190206000840.gif
深度錄屏_TeamViewer_20190206001653.gif
代碼
網絡定義
GAN網絡不同于一般的分類網絡,由2部分組成: 生成器,判別器。
生成器
NetG
輸入: 1x100x1x1 (NxCxHxW) 100維的噪聲向量
輸出: 1x3x96x96 3(Channels)x96(Height)x96(Width)的圖像
from torch import nn
class NetG(nn.Module):
    """Generator network: turns a noise tensor into a 3-channel image.

    The constructor reads two attributes from ``opt``:
    ``opt.nz`` (number of noise channels) and ``opt.ngf`` (base number of
    generator feature maps).
    """

    def __init__(self, opt):
        super().__init__()
        width = opt.ngf  # base feature-map count

        # Stem: treat the noise as an nz-channel 1x1 feature map and expand
        # it to (ngf*8) x 4 x 4.
        stack = [
            nn.ConvTranspose2d(opt.nz, width * 8, kernel_size=4, stride=1,
                               padding=0, bias=False),
            nn.BatchNorm2d(width * 8),
            nn.ReLU(inplace=True),
        ]

        # Three stride-2 stages, each halving the channels and doubling the
        # spatial size: 4x4 -> 8x8 -> 16x16 -> 32x32.
        for factor in (8, 4, 2):
            channels_in = width * factor
            channels_out = width * factor // 2
            stack.append(nn.ConvTranspose2d(channels_in, channels_out,
                                            kernel_size=4, stride=2,
                                            padding=1, bias=False))
            stack.append(nn.BatchNorm2d(channels_out))
            stack.append(nn.ReLU(inplace=True))

        # Head: kernel 5 / stride 3 / padding 1 maps 32x32 to 96x96 with
        # three output channels; Tanh bounds the output to [-1, 1].
        stack.append(nn.ConvTranspose2d(width, 3, kernel_size=5, stride=3,
                                        padding=1, bias=False))
        stack.append(nn.Tanh())

        # Layer order is unchanged, so state_dict keys stay "main.<index>.*".
        self.main = nn.Sequential(*stack)

    def forward(self, input):
        """Run ``input`` through the layer stack and return the result."""
        return self.main(input)
判別器
NetD
輸入: 1x3x96x96 的圖像
輸出: 1x1x1x1 的一個數,表示概率值
class NetD(nn.Module):
    """Discriminator network: scores each image with a value in (0, 1).

    The constructor reads ``opt.ndf`` (base number of discriminator
    feature maps).
    """

    def __init__(self, opt):
        super().__init__()
        width = opt.ndf  # base feature-map count

        # Stem: kernel 5 / stride 3 / padding 1 reduces a 3 x 96 x 96 image
        # to (ndf) x 32 x 32. No batch norm on the first layer.
        stack = [
            nn.Conv2d(3, width, kernel_size=5, stride=3, padding=1,
                      bias=False),
            nn.LeakyReLU(0.2, inplace=True),
        ]

        # Three stride-2 stages, each doubling the channels and halving the
        # spatial size: 32x32 -> 16x16 -> 8x8 -> 4x4.
        for factor in (1, 2, 4):
            channels_in = width * factor
            channels_out = channels_in * 2
            stack.append(nn.Conv2d(channels_in, channels_out, kernel_size=4,
                                   stride=2, padding=1, bias=False))
            stack.append(nn.BatchNorm2d(channels_out))
            stack.append(nn.LeakyReLU(0.2, inplace=True))

        # Head: collapse (ndf*8) x 4 x 4 to a single value and squash it
        # with a sigmoid so it can be read as a probability.
        stack.append(nn.Conv2d(width * 8, 1, kernel_size=4, stride=1,
                               padding=0, bias=False))
        stack.append(nn.Sigmoid())

        # Layer order is unchanged, so state_dict keys stay "main.<index>.*".
        self.main = nn.Sequential(*stack)

    def forward(self, input):
        """Return the flattened (1-D) scores for the batch ``input``."""
        scores = self.main(input)
        return scores.view(-1)
參數配置
- batch_size
- learning_rate
- max_epoch 最大迭代epoch個數
import os
import ipdb
import torch as t
import torchvision as tv
import tqdm
from model import NetG, NetD
from torch.autograd import Variable
from torchnet.meter import AverageValueMeter
class Config:
    """Default settings shared by ``train`` and ``generate``.

    Both functions override any of these attributes on the module-level
    ``opt`` instance through their keyword arguments.
    """

    # ---- data loading ----
    data_path = 'data/'    # root folder of the dataset
    num_workers = 4        # worker processes used by the DataLoader
    image_size = 96        # images are resized/cropped to this size
    batch_size = 16

    # ---- optimisation ----
    max_epoch = 200
    lr1 = 2e-4             # generator learning rate
    lr2 = 2e-4             # discriminator learning rate
    beta1 = 0.5            # beta1 of the Adam optimizers
    d_every = 1            # train the discriminator every d_every batches
    g_every = 5            # train the generator every g_every batches
    decay_every = 10       # every decay_every epochs: save images/checkpoints
    gpu = False            # move networks and tensors to CUDA when True

    # ---- network sizes ----
    nz = 100               # number of noise channels fed to the generator
    ngf = 64               # base feature-map count of the generator
    ndf = 64               # base feature-map count of the discriminator

    # ---- visualisation / debugging ----
    vis = True             # plot with visdom during training
    env = 'GAN'            # visdom environment name
    plot_every = 20        # plot once every plot_every batches
    debug_file = '/tmp/debuggan'  # if this file exists, drop into ipdb

    # ---- outputs and pretrained weights ----
    save_path = 'imgs/'    # folder for sample images written during training
    netd_path = './checkpoints/netd_100.pth'  # discriminator weights to load
    netg_path = './checkpoints/netg_100.pth'  # generator weights to load

    # ---- generation only (no training) ----
    gen_img = 'result.png'  # output file of generate()
    gen_search_num = 512    # number of candidate images generated
    gen_num = 64            # number of best-scoring candidates kept
    gen_mean = 0            # mean of the sampling noise
    gen_std = 1             # standard deviation of the sampling noise


opt = Config()
訓練
訓練生成器網絡
訓練判別器網絡
def train(**kwargs):
    """Train the generator and discriminator on the images under ``opt.data_path``.

    Args:
        **kwargs: overrides applied to the module-level ``opt`` before
            training starts (e.g. ``train(gpu=True, batch_size=64)``).

    Side effects:
        Every ``opt.decay_every`` epochs a grid of generated samples is
        written to ``opt.save_path`` and both networks are saved under
        ``checkpoints/``. When ``opt.vis`` is true, images and loss curves
        are sent to visdom.

    Note:
        ``opt.netd_path`` / ``opt.netg_path`` are loaded whenever they are
        truthy; pass ``netd_path=None, netg_path=None`` to start from
        freshly initialised networks.
    """
    for name, value in kwargs.items():
        setattr(opt, name, value)

    vis = None
    if opt.vis:
        from visualize import Visualizer
        vis = Visualizer(opt.env)

    # The save calls below fail if their target folders are missing.
    os.makedirs(opt.save_path, exist_ok=True)
    os.makedirs('checkpoints', exist_ok=True)

    device = t.device('cuda') if opt.gpu else t.device('cpu')

    # Resize replaces the deprecated transforms.Scale (same semantics).
    transforms = tv.transforms.Compose([
        tv.transforms.Resize(opt.image_size),
        tv.transforms.CenterCrop(opt.image_size),
        tv.transforms.ToTensor(),
        # Map [0, 1] pixel values to [-1, 1], matching the generator's Tanh.
        tv.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])
    dataset = tv.datasets.ImageFolder(opt.data_path, transform=transforms)
    dataloader = t.utils.data.DataLoader(
        dataset,
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers,
        # Label tensors below have a fixed batch_size length, so a short
        # final batch must be dropped.
        drop_last=True,
    )

    # Networks, optionally initialised from saved weights (loaded onto CPU).
    netg, netd = NetG(opt), NetD(opt)
    map_location = lambda storage, loc: storage
    if opt.netd_path:
        netd.load_state_dict(t.load(opt.netd_path, map_location=map_location))
    if opt.netg_path:
        netg.load_state_dict(t.load(opt.netg_path, map_location=map_location))
    netd.to(device)
    netg.to(device)

    # Optimizers and loss.
    optimizer_g = t.optim.Adam(netg.parameters(), opt.lr1, betas=(opt.beta1, 0.999))
    optimizer_d = t.optim.Adam(netd.parameters(), opt.lr2, betas=(opt.beta1, 0.999))
    criterion = t.nn.BCELoss().to(device)

    # Real images are labelled 1, generated images 0.
    true_labels = t.ones(opt.batch_size, device=device)
    fake_labels = t.zeros(opt.batch_size, device=device)
    # Fixed noise, reused for every sample grid so progress is comparable.
    fix_noises = t.randn(opt.batch_size, opt.nz, 1, 1).to(device)

    errord_meter = AverageValueMeter()
    errorg_meter = AverageValueMeter()

    for epoch in range(opt.max_epoch):
        for ii, (img, _) in enumerate(tqdm.tqdm(dataloader)):
            real_img = img.to(device)

            if ii % opt.d_every == 0:
                # --- discriminator step ---
                optimizer_d.zero_grad()

                # Real images should be scored as real.
                output = netd(real_img)
                error_d_real = criterion(output, true_labels)
                error_d_real.backward()

                # Generated images should be scored as fake. detach() keeps
                # this step from back-propagating into the generator.
                noises = t.randn(opt.batch_size, opt.nz, 1, 1).to(device)
                fake_img = netg(noises).detach()
                output = netd(fake_img)
                error_d_fake = criterion(output, fake_labels)
                error_d_fake.backward()
                optimizer_d.step()

                error_d = error_d_fake + error_d_real
                errord_meter.add(error_d.item())

            if ii % opt.g_every == 0:
                # --- generator step: make the discriminator answer "real" ---
                optimizer_g.zero_grad()
                noises = t.randn(opt.batch_size, opt.nz, 1, 1).to(device)
                fake_img = netg(noises)
                output = netd(fake_img)
                error_g = criterion(output, true_labels)
                error_g.backward()
                optimizer_g.step()
                errorg_meter.add(error_g.item())

            if opt.vis and ii % opt.plot_every == opt.plot_every - 1:
                # --- visualisation ---
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()
                # No gradients are needed for preview images.
                with t.no_grad():
                    fix_fake_imgs = netg(fix_noises)
                # Undo the [-1, 1] normalisation before display.
                vis.images(fix_fake_imgs.cpu().numpy()[:64] * 0.5 + 0.5, win='fixfake')
                vis.images(real_img.cpu().numpy()[:64] * 0.5 + 0.5, win='real')
                vis.plot('errord', errord_meter.value()[0])
                vis.plot('errorg', errorg_meter.value()[0])

        if epoch % opt.decay_every == 0:
            # Generate the sample grid here instead of reusing the variable
            # from the visdom branch, which is undefined when opt.vis is
            # False or when fewer than plot_every batches have run.
            with t.no_grad():
                fix_fake_imgs = netg(fix_noises)
            tv.utils.save_image(fix_fake_imgs[:64],
                                os.path.join(opt.save_path, '%s.png' % epoch),
                                normalize=True, range=(-1, 1))
            t.save(netd.state_dict(), 'checkpoints/netd_%s.pth' % epoch)
            t.save(netg.state_dict(), 'checkpoints/netg_%s.pth' % epoch)
            errord_meter.reset()
            errorg_meter.reset()
            # Kept from the original: the optimizers are rebuilt here, which
            # discards Adam's accumulated moment estimates. The learning
            # rates themselves are not changed.
            optimizer_g = t.optim.Adam(netg.parameters(), opt.lr1, betas=(opt.beta1, 0.999))
            optimizer_d = t.optim.Adam(netd.parameters(), opt.lr2, betas=(opt.beta1, 0.999))
visualize.py
#coding:utf8
from itertools import chain
import visdom
import torch
import time
import torchvision as tv
import numpy as np
class Visualizer():
    """Thin wrapper around a ``visdom.Visdom`` client.

    Adds a few convenience helpers (scalar curves, images, a running text
    log). Any attribute not defined here is looked up on the wrapped
    client, so native visdom calls such as ``vis.images(...)`` still work.
    """

    def __init__(self, env='default', **kwargs):
        """Connect to visdom.

        Args:
            env: visdom environment name.
            **kwargs: forwarded to ``visdom.Visdom``.
        """
        import visdom
        self.vis = visdom.Visdom(env=env, **kwargs)

        # Next x coordinate per curve name, e.g. {'loss': 23} means the
        # next 'loss' point is plotted at x == 23.
        self.index = {}
        self.log_text = ''

    def reinit(self, env='default', **kwargs):
        """Replace the wrapped visdom client with a new one and return self."""
        import visdom
        self.vis = visdom.Visdom(env=env, **kwargs)
        return self

    def plot_many(self, d):
        """Plot several scalars at once.

        Args:
            d: mapping of curve name to value, e.g. ``{'loss': 0.11}``.
        """
        # dict.items() replaces the Python-2-only iteritems().
        for k, v in d.items():
            self.plot(k, v)

    def img_many(self, d):
        """Show several images at once; ``d`` maps window name to image."""
        for k, v in d.items():
            self.img(k, v)

    def plot(self, name, y):
        """Append one point to the curve ``name``, e.g. ``plot('loss', 1.00)``."""
        x = self.index.get(name, 0)
        self.vis.line(Y=np.array([y]), X=np.array([x]),
                      win=(name),
                      opts=dict(title=name),
                      # The first point creates the window; later ones append.
                      update=None if x == 0 else 'append'
                      )
        self.index[name] = x + 1

    def img(self, name, img_):
        """Show a single image tensor, e.g. ``img('input_img', t.Tensor(64, 64))``."""
        if len(img_.size()) < 3:
            # Add a leading dimension to tensors with fewer than 3 dims.
            img_ = img_.cpu().unsqueeze(0)
        self.vis.image(img_.cpu(),
                       # str() replaces the Python-2-only unicode().
                       win=str(name),
                       opts=dict(title=name)
                       )

    def img_grid_many(self, d):
        """Show several image grids at once; ``d`` maps window name to batch."""
        for k, v in d.items():
            self.img_grid(k, v)

    def img_grid(self, name, input_3d):
        """Arrange a batch of images into one grid image and show it.

        NOTE(review): the original description says an input of shape
        (36, 64, 64) becomes a 6x6 grid of 64x64 cells, but the code takes
        element ``[0]`` before ``unsqueeze(1)`` -- confirm the expected
        input shape against callers.
        """
        self.img(name, tv.utils.make_grid(
            input_3d.cpu()[0].unsqueeze(1).clamp(max=1, min=0)))

    def log(self, info, win='log_text'):
        """Append a timestamped line to the text log and display it.

        Example: ``log({'loss': 1, 'lr': 0.0001})``.

        Args:
            info: anything that can be formatted with ``str.format``.
            win: visdom window that shows the accumulated log.
        """
        self.log_text += ('[{time}] {info} <br>'.format(
            time=time.strftime('%m%d_%H%M%S'),
            info=info))
        # Honour the caller's window instead of a hard-coded 'log_text'.
        self.vis.text(self.log_text, win=win)

    def __getattr__(self, name):
        """Delegate unknown attributes to the wrapped visdom client."""
        # __getattr__ only runs when normal lookup fails. If 'vis' itself is
        # missing (e.g. the instance was created without __init__), looking
        # it up again would recurse forever.
        if name == 'vis':
            raise AttributeError(name)
        return getattr(self.vis, name)
測試
輸入: 1x100x1x1的噪聲向量
輸出: 1x3x96x96 的圖像
def generate(**kwargs):
    """Generate images and keep the ones the discriminator scores highest.

    ``opt.gen_search_num`` candidates are generated from noise drawn with
    mean ``opt.gen_mean`` and standard deviation ``opt.gen_std``. The
    ``opt.gen_num`` best-scoring ones are written as a single image grid to
    ``opt.gen_img``.

    Args:
        **kwargs: overrides applied to the module-level ``opt`` first.
    """
    for name, value in kwargs.items():
        setattr(opt, name, value)

    device = t.device('cuda') if opt.gpu else t.device('cpu')

    netg, netd = NetG(opt).eval(), NetD(opt).eval()
    print(opt.netd_path)
    print(opt.netg_path)
    # Load onto CPU first so GPU-saved checkpoints work on CPU-only machines.
    netd.load_state_dict(t.load(opt.netd_path, map_location='cpu'))
    netg.load_state_dict(t.load(opt.netg_path, map_location='cpu'))
    netd.to(device)
    netg.to(device)

    noises = t.randn(opt.gen_search_num, opt.nz, 1, 1).normal_(opt.gen_mean, opt.gen_std)
    noises = noises.to(device)

    # Generate the candidates and score them. no_grad() replaces the removed
    # Variable(volatile=True) flag, so no autograd graph is kept.
    with t.no_grad():
        fake_img = netg(noises)
        scores = netd(fake_img)

    # Keep the top-scoring images; clamp so topk cannot ask for more
    # elements than were generated.
    keep = min(opt.gen_num, opt.gen_search_num)
    best = scores.topk(keep)[1]

    # Save the selected images as one grid.
    tv.utils.save_image(fake_img[best], opt.gen_img, normalize=True, range=(-1, 1))
生成的圖像:
result.png