Machine Translation and the Dataset
Machine translation (MT): automatically translating a piece of text from one language into another. Using neural networks to solve this problem is usually called neural machine translation (NMT). Main characteristics: the output is a sequence of words rather than a single word, and the length of the output sequence may differ from the length of the source sequence.
import sys
sys.path.append('/home/kesci/input/d2l9528/')
import collections
import d2l
import zipfile
from d2l.data.base import Vocab
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch import optim
Data preprocessing
Clean the dataset and convert it into minibatches that can be fed to the neural network.
with open('/home/kesci/input/fraeng6506/fra.txt', 'r') as f:
    raw_text = f.read()
print(raw_text[0:1000])
Go. Va ! CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)
Hi. Salut ! CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)
Hi. Salut. CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)
Run! Cours ! CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)
Run! Courez ! CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)
Who? Qui ? CC-BY 2.0 (France) Attribution: tatoeba.org #2083030 (CK) & #4366796 (gillux)
Wow! Ça alors ! CC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #374631 (zmoo)
Fire! Au feu ! CC-BY 2.0 (France) Attribution: tatoeba.org #1829639 (Spamster) & #4627939 (sacredceltic)
Help! à l'aide ! CC-BY 2.0 (France) Attribution: tatoeba.org #435084 (lukaszpp) & #128430 (sysko)
Jump. Saute. CC-BY 2.0 (France) Attribution: tatoeba.org #631038 (Shishir) & #2416938 (Phoenix)
Stop! Ça suffit ! CC-BY 2.0 (France) Attribution: tato
def preprocess_raw(text):
    text = text.replace('\u202f', ' ').replace('\xa0', ' ')
    out = ''
    for i, char in enumerate(text.lower()):
        if char in (',', '!', '.') and i > 0 and text[i-1] != ' ':
            out += ' '
        out += char
    return out
text = preprocess_raw(raw_text)
print(text[0:1000])
go . va ! cc-by 2 .0 (france) attribution: tatoeba .org #2877272 (cm) & #1158250 (wittydev)
hi . salut ! cc-by 2 .0 (france) attribution: tatoeba .org #538123 (cm) & #509819 (aiji)
hi . salut . cc-by 2 .0 (france) attribution: tatoeba .org #538123 (cm) & #4320462 (gillux)
run ! cours ! cc-by 2 .0 (france) attribution: tatoeba .org #906328 (papabear) & #906331 (sacredceltic)
run ! courez ! cc-by 2 .0 (france) attribution: tatoeba .org #906328 (papabear) & #906332 (sacredceltic)
who? qui ? cc-by 2 .0 (france) attribution: tatoeba .org #2083030 (ck) & #4366796 (gillux)
wow ! ça alors ! cc-by 2 .0 (france) attribution: tatoeba .org #52027 (zifre) & #374631 (zmoo)
fire ! au feu ! cc-by 2 .0 (france) attribution: tatoeba .org #1829639 (spamster) & #4627939 (sacredceltic)
help ! à l'aide ! cc-by 2 .0 (france) attribution: tatoeba .org #435084 (lukaszpp) & #128430 (sysko)
jump . saute . cc-by 2 .0 (france) attribution: tatoeba .org #631038 (shishir) & #2416938 (phoenix)
stop ! ça suffit ! cc-b
Characters are stored in the computer in encoded form. The space we normally use is \x20, which lies in the visible range 0x20~0x7e of standard ASCII. \xa0 belongs to the extended character set of latin1 (ISO/IEC 8859-1) and represents a non-breaking space (nbsp); it falls outside the gbk encoding range and is a special character that has to be removed. During data preprocessing we therefore first need to clean the data.
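As a quick check, the short sketch below (illustrative only, not part of the original notebook) shows that the two non-ASCII spaces \u202f and \xa0 are indeed mapped to ordinary spaces by the replace calls used in preprocess_raw:
sample = 'Cours\u202f!\xa0Allez.'
cleaned = sample.replace('\u202f', ' ').replace('\xa0', ' ')
print([hex(ord(c)) for c in sample if ord(c) > 0x7e])  # ['0x202f', '0xa0']
print(cleaned)                                          # Cours ! Allez.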
Tokenization
String → list of words
num_examples = 50000
source, target = [], []
for i, line in enumerate(text.split('\n')):
    if i > num_examples:
        break
    parts = line.split('\t')
    if len(parts) >= 2:
        source.append(parts[0].split(' '))
        target.append(parts[1].split(' '))

source[0:3], target[0:3]
([['go', '.'], ['hi', '.'], ['hi', '.']],
[['va', '!'], ['salut', '!'], ['salut', '.']])
d2l.set_figsize()
d2l.plt.hist([[len(l) for l in source], [len(l) for l in target]],label=['source', 'target'])
d2l.plt.legend(loc='upper right')
Build the vocabulary
List of words → list of word indices
def build_vocab(tokens):
    tokens = [token for line in tokens for token in line]
    return d2l.data.base.Vocab(tokens, min_freq=3, use_special_tokens=True)
src_vocab = build_vocab(source)
len(src_vocab)
out:
3789
Load the dataset
def pad(line, max_len, padding_token):
    if len(line) > max_len:
        return line[:max_len]
    return line + [padding_token] * (max_len - len(line))
pad(src_vocab[source[0]], 10, src_vocab.pad)
out:
[38, 4, 0, 0, 0, 0, 0, 0, 0, 0]
def build_array(lines, vocab, max_len, is_source):
    lines = [vocab[line] for line in lines]
    if not is_source:
        lines = [[vocab.bos] + line + [vocab.eos] for line in lines]
    array = torch.tensor([pad(line, max_len, vocab.pad) for line in lines])
    valid_len = (array != vocab.pad).sum(1)  # number of non-padding tokens per sequence
    return array, valid_len

def load_data_nmt(batch_size, max_len):  # This function is saved in d2l.
    src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)
    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
    train_data = data.TensorDataset(src_array, src_valid_len, tgt_array, tgt_valid_len)
    train_iter = data.DataLoader(train_data, batch_size, shuffle=True)
    return src_vocab, tgt_vocab, train_iter

src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size=2, max_len=8)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X =', X.type(torch.int32), '\nValid lengths for X =', X_valid_len,
          '\nY =', Y.type(torch.int32), '\nValid lengths for Y =', Y_valid_len)
    break
X = tensor([[ 5, 24, 3, 4, 0, 0, 0, 0],
[ 12, 1388, 7, 3, 4, 0, 0, 0]], dtype=torch.int32)
Valid lengths for X = tensor([4, 5])
Y = tensor([[ 1, 23, 46, 3, 3, 4, 2, 0],
[ 1, 15, 137, 27, 4736, 4, 2, 0]], dtype=torch.int32)
Valid lengths for Y = tensor([7, 7])
Encoder-Decoder
encoder: maps the input to a hidden state
decoder: maps the hidden state to the output
class Encoder(nn.Module):
    def __init__(self, **kwargs):
        super(Encoder, self).__init__(**kwargs)

    def forward(self, X, *args):
        raise NotImplementedError

class Decoder(nn.Module):
    def __init__(self, **kwargs):
        super(Decoder, self).__init__(**kwargs)

    def init_state(self, enc_outputs, *args):
        raise NotImplementedError

    def forward(self, X, state):
        raise NotImplementedError

class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, **kwargs):
        super(EncoderDecoder, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X, *args):
        enc_outputs = self.encoder(enc_X, *args)
        dec_state = self.decoder.init_state(enc_outputs, *args)
        return self.decoder(dec_X, dec_state)
The encoder-decoder framework can also be applied to dialogue systems and other generative tasks.
Sequence to Sequence model
Model: (figures illustrating the training and prediction phases)
Detailed structure
Encoder
class Seq2SeqEncoder(d2l.Encoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqEncoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)

    def begin_state(self, batch_size, device):
        return [torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device),
                torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device)]

    def forward(self, X, *args):
        X = self.embedding(X)  # X shape: (batch_size, seq_len, embed_size)
        X = X.transpose(0, 1)  # RNN needs first axes to be time
        # state = self.begin_state(X.shape[1], device=X.device)
        out, state = self.rnn(X)
        # The shape of out is (seq_len, batch_size, num_hiddens).
        # state contains the hidden state and the memory cell
        # of the last time step, the shape is (num_layers, batch_size, num_hiddens)
        return out, state
encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
X = torch.zeros((4, 7), dtype=torch.long)
output, state = encoder(X)
output.shape, len(state), state[0].shape, state[1].shape
out:
(torch.Size([7, 4, 16]), 2, torch.Size([2, 4, 16]), torch.Size([2, 4, 16]))
Decoder
class Seq2SeqDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqDecoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, *args):
        return enc_outputs[1]

    def forward(self, X, state):
        X = self.embedding(X).transpose(0, 1)
        out, state = self.rnn(X, state)
        # Make the batch to be the first dimension to simplify loss computation.
        out = self.dense(out).transpose(0, 1)
        return out, state

decoder = Seq2SeqDecoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
state = decoder.init_state(encoder(X))
out, state = decoder(X, state)
out.shape, len(state), state[0].shape, state[1].shape
out:
(torch.Size([4, 7, 10]), 2, torch.Size([2, 4, 16]), torch.Size([2, 4, 16]))
Loss function
def SequenceMask(X, X_len, value=0):
    maxlen = X.size(1)
    mask = torch.arange(maxlen)[None, :].to(X_len.device) < X_len[:, None]
    X[~mask] = value
    return X
X = torch.tensor([[1,2,3], [4,5,6]])
SequenceMask(X,torch.tensor([1,2]))
out:
tensor([[1, 0, 0],
        [4, 5, 0]])
X = torch.ones((2,3, 4))
SequenceMask(X, torch.tensor([1,2]),value=-1)
out:
tensor([[[ 1., 1., 1., 1.],
[-1., -1., -1., -1.],
[-1., -1., -1., -1.]],
[[ 1., 1., 1., 1.],
[ 1., 1., 1., 1.],
[-1., -1., -1., -1.]]])
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    # pred shape: (batch_size, seq_len, vocab_size)
    # label shape: (batch_size, seq_len)
    # valid_length shape: (batch_size, )
    def forward(self, pred, label, valid_length):
        # the sample weights shape should be (batch_size, seq_len)
        weights = torch.ones_like(label)
        weights = SequenceMask(weights, valid_length).float()
        self.reduction = 'none'
        output = super(MaskedSoftmaxCELoss, self).forward(pred.transpose(1, 2), label)
        return (output * weights).mean(dim=1)
loss = MaskedSoftmaxCELoss()
loss(torch.ones((3, 4, 10)), torch.ones((3,4),dtype=torch.long), torch.tensor([4,3,0]))
out:
tensor([2.3026, 1.7269, 0.0000])
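The numbers above can be checked by hand (a quick sanity sketch, not part of the original notebook): with uniform logits over 10 classes every token's cross entropy is log 10 ≈ 2.3026, and each sequence's loss is that value scaled by the fraction of valid tokens.
import math

per_token = math.log(10)                # cross entropy of a uniform prediction over 10 classes
seq_len = 4
for valid in [4, 3, 0]:
    print(per_token * valid / seq_len)  # 2.3026, 1.7269, 0.0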
Training
def train_ch7(model, data_iter, lr, num_epochs, device):  # Saved in d2l
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    tic = time.time()
    for epoch in range(1, num_epochs + 1):
        l_sum, num_tokens_sum = 0.0, 0.0
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_vlen, Y, Y_vlen = [x.to(device) for x in batch]
            Y_input, Y_label, Y_vlen = Y[:, :-1], Y[:, 1:], Y_vlen - 1
            Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
            l = loss(Y_hat, Y_label, Y_vlen).sum()
            l.backward()
            with torch.no_grad():
                d2l.grad_clipping_nn(model, 5, device)
            num_tokens = Y_vlen.sum().item()
            optimizer.step()
            l_sum += l.sum().item()
            num_tokens_sum += num_tokens
        if epoch % 50 == 0:
            print("epoch {0:4d},loss {1:.3f}, time {2:.1f} sec".format(
                epoch, (l_sum / num_tokens_sum), time.time() - tic))
            tic = time.time()
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.0
batch_size, num_examples, max_len = 64, 1e3, 10
lr, num_epochs, ctx = 0.005, 300, d2l.try_gpu()
src_vocab, tgt_vocab, train_iter = d2l.load_data_nmt(
batch_size, max_len,num_examples)
encoder = Seq2SeqEncoder(
len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqDecoder(
len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
train_ch7(model, train_iter, lr, num_epochs, ctx)
out:
epoch 50,loss 0.093, time 38.2 sec
epoch 100,loss 0.046, time 37.9 sec
epoch 150,loss 0.032, time 36.8 sec
epoch 200,loss 0.027, time 37.5 sec
epoch 250,loss 0.026, time 37.8 sec
epoch 300,loss 0.025, time 37.3 sec
Testing
def translate_ch7(model, src_sentence, src_vocab, tgt_vocab, max_len, device):
    src_tokens = src_vocab[src_sentence.lower().split(' ')]
    src_len = len(src_tokens)
    if src_len < max_len:
        src_tokens += [src_vocab.pad] * (max_len - src_len)
    enc_X = torch.tensor(src_tokens, device=device)
    enc_valid_length = torch.tensor([src_len], device=device)
    # use expand_dim to add the batch_size dimension.
    enc_outputs = model.encoder(enc_X.unsqueeze(dim=0), enc_valid_length)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_length)
    dec_X = torch.tensor([tgt_vocab.bos], device=device).unsqueeze(dim=0)
    predict_tokens = []
    for _ in range(max_len):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # The token with highest score is used as the next time step input.
        dec_X = Y.argmax(dim=2)
        py = dec_X.squeeze(dim=0).int().item()
        if py == tgt_vocab.eos:
            break
        predict_tokens.append(py)
    return ' '.join(tgt_vocab.to_tokens(predict_tokens))

for sentence in ['Go .', 'Wow !', "I'm OK .", 'I won !']:
    print(sentence + ' => ' + translate_ch7(
        model, sentence, src_vocab, tgt_vocab, max_len, ctx))
out:
Go . => va !
Wow ! => <unk> !
I'm OK . => ça va .
I won ! => j'ai gagné !
Beam Search
Simple greedy search: at every time step, pick the single token with the highest score.
Viterbi algorithm: pick the sentence with the highest overall score (the search space is too large).
Beam search: keep only the k highest-scoring candidate sequences at each time step (a minimal sketch follows).
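The sketch below is a minimal beam-search routine (not part of the original notebook; step_fn, bos and eos are hypothetical placeholders): step_fn is assumed to map a partial token sequence to a list of (next_token, log_probability) candidates, and at every step only the beam_size highest-scoring partial translations are kept.
def beam_search(step_fn, bos, eos, beam_size=2, max_len=10):
    beams = [([bos], 0.0)]                      # (token sequence, cumulative log-probability)
    for _ in range(max_len):
        candidates = []
        for tokens, score in beams:
            if tokens[-1] == eos:               # finished hypotheses are carried over unchanged
                candidates.append((tokens, score))
                continue
            for tok, logp in step_fn(tokens):   # step_fn returns (next_token, log_prob) pairs
                candidates.append((tokens + [tok], score + logp))
        # keep the beam_size highest-scoring partial sequences
        beams = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_size]
        if all(tokens[-1] == eos for tokens, _ in beams):
            break
    return beams[0][0]                          # highest-scoring sequence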
Attention mechanism and the Seq2seq model
In the "Encoder-Decoder (seq2seq)" section, the decoder relies on the same context vector at every time step to obtain information about the input sequence. When the encoder is a recurrent neural network, the context vector comes from the hidden state of its final time step: the source sequence is encoded into the state of the recurrent unit and then passed to the decoder to generate the target sequence. This structure has problems. In particular, RNNs suffer from vanishing gradients over long ranges in practice, so for long sentences it is hard to expect a fixed-length vector to preserve all the useful information of the input sequence, and the performance of this architecture degrades noticeably as the sentences to be translated get longer.
At the same time, a decoded target word may be related only to part of the input rather than to all of it. For example, when translating "Hello world" into "Bonjour le monde", "Hello" maps to "Bonjour" and "world" maps to "monde". In the seq2seq model, the decoder can only implicitly select the relevant information from the encoder's final state, whereas the attention mechanism models this selection process explicitly.
Attention mechanism framework
Attention is a generalized weighted-pooling method whose input consists of two parts: a query and key-value pairs. Suppose the query is $\mathbf{q} \in \mathbb{R}^{d_q}$ and there are $n$ key-value pairs $(\mathbf{k}_1, \mathbf{v}_1), \ldots, (\mathbf{k}_n, \mathbf{v}_n)$ with $\mathbf{k}_i \in \mathbb{R}^{d_k}$ and $\mathbf{v}_i \in \mathbb{R}^{d_v}$. The attention layer returns an output $\mathbf{o} \in \mathbb{R}^{d_v}$ with the same dimension as the values.
For a given query, the attention layer computes an attention score with every key and normalizes the scores into weights; the output vector o is the weighted sum of the values, and the weight computed with each key is paired with the corresponding value.
To compute the output, we first assume a function $\alpha$ that measures the similarity between the query and a key, and compute all the attention scores $a_1, \ldots, a_n$ by
$$a_i = \alpha(\mathbf{q}, \mathbf{k}_i).$$
We use the softmax function to obtain the attention weights:
$$b_1, \ldots, b_n = \mathrm{softmax}(a_1, \ldots, a_n).$$
The final output is the weighted sum of the values:
$$\mathbf{o} = \sum_{i=1}^{n} b_i \mathbf{v}_i.$$
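The following toy example (illustrative only, not from the original text) instantiates the framework above with three key-value pairs, one query, and the dot product as the score function α:
import torch

q = torch.tensor([1.0, 0.0])                               # query
keys = torch.tensor([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])  # k_1, k_2, k_3
values = torch.tensor([[10.0], [20.0], [30.0]])            # v_1, v_2, v_3
scores = keys @ q                         # a_i = alpha(q, k_i) = <q, k_i>
b = torch.softmax(scores, dim=0)          # attention weights, approx. [0.42, 0.16, 0.42]
o = (b.unsqueeze(1) * values).sum(dim=0)  # o = sum_i b_i v_i, approx. [20.]
print(b, o)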
Different attention layers differ in their choice of score function. In the rest of this section we discuss two commonly used attention layers, dot-product attention and multilayer perceptron attention; afterwards we implement a seq2seq model with attention and train and test it on the English-French translation corpus.
import math
import torch
import torch.nn as nn
import os
def file_name_walk(file_dir):
    for root, dirs, files in os.walk(file_dir):
        # print("root", root)  # path of the current directory
        print("dirs", dirs)    # all sub-directories under the current path
        print("files", files)  # all non-directory files under the current path

file_name_walk("/home/kesci/input/fraeng6506")
out:
dirs []
files ['_about.txt', 'fra.txt']
Masked softmax
Before diving into the implementation, we first introduce a masking operation for the softmax operator.
def SequenceMask(X, X_len, value=-1e6):
    maxlen = X.size(1)
    # print(X.size(), torch.arange((maxlen), dtype=torch.float)[None, :], '\n', X_len[:, None])
    mask = torch.arange((maxlen), dtype=torch.float)[None, :] >= X_len[:, None]
    # print(mask)
    X[mask] = value
    return X
def masked_softmax(X, valid_length):
    # X: 3-D tensor, valid_length: 1-D or 2-D tensor
    softmax = nn.Softmax(dim=-1)
    if valid_length is None:
        return softmax(X)
    else:
        shape = X.shape
        if valid_length.dim() == 1:
            try:
                valid_length = torch.FloatTensor(valid_length.numpy().repeat(shape[1], axis=0))  # e.g. [2, 2, 3, 3]
            except:
                valid_length = torch.FloatTensor(valid_length.cpu().numpy().repeat(shape[1], axis=0))  # e.g. [2, 2, 3, 3]
        else:
            valid_length = valid_length.reshape((-1,))
        # fill masked elements with a large negative, whose exp is 0
        X = SequenceMask(X.reshape((-1, shape[-1])), valid_length)
        return softmax(X).reshape(shape)
masked_softmax(torch.rand((2,2,4),dtype=torch.float), torch.FloatTensor([2,3]))
out:
tensor([[[0.5423, 0.4577, 0.0000, 0.0000],
[0.5290, 0.4710, 0.0000, 0.0000]],
[[0.2969, 0.2966, 0.4065, 0.0000],
[0.3607, 0.2203, 0.4190, 0.0000]]])
Matrix multiplication beyond two dimensions
If $X$ and $Y$ are tensors of shape $(b, n, m)$ and $(b, m, k)$ respectively, performing $b$ two-dimensional matrix multiplications yields $Z$ of shape $(b, n, k)$:
$$Z[i, :, :] = X[i, :, :]\, Y[i, :, :], \quad i = 1, \ldots, b.$$
torch.bmm(torch.ones((2,1,3), dtype = torch.float), torch.ones((2,3,2), dtype = torch.float))
out:
tensor([[[3., 3.]],
[[3., 3.]]])
Dot-product attention
Dot-product attention assumes that the query and the keys have the same dimension, i.e. $\mathbf{q}, \mathbf{k}_i \in \mathbb{R}^{d}$ for all $i$. The attention score is the inner product of the query and a key, usually divided by $\sqrt{d}$ to reduce the dependence of the score on the dimension $d$:
$$\alpha(\mathbf{q}, \mathbf{k}) = \frac{\langle \mathbf{q}, \mathbf{k} \rangle}{\sqrt{d}}.$$
Assume $\mathbf{Q} \in \mathbb{R}^{m \times d}$ contains $m$ queries and $\mathbf{K} \in \mathbb{R}^{n \times d}$ contains $n$ keys. Then all $mn$ scores can be computed with a single matrix operation:
$$\alpha(\mathbf{Q}, \mathbf{K}) = \frac{\mathbf{Q}\mathbf{K}^\top}{\sqrt{d}}.$$
Now let us implement this layer. It supports a batch of queries and key-value pairs, and it also supports randomly dropping some attention weights as regularization.
# Save to the d2l package.
class DotProductAttention(nn.Module):
    def __init__(self, dropout, **kwargs):
        super(DotProductAttention, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    # query: (batch_size, #queries, d)
    # key: (batch_size, #kv_pairs, d)
    # value: (batch_size, #kv_pairs, dim_v)
    # valid_length: either (batch_size, ) or (batch_size, xx)
    def forward(self, query, key, value, valid_length=None):
        d = query.shape[-1]
        # set transpose_b=True to swap the last two dimensions of key
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(d)
        attention_weights = self.dropout(masked_softmax(scores, valid_length))
        print("attention_weight\n", attention_weights)
        return torch.bmm(attention_weights, value)
Testing
We now create two batches, each with one query and 10 key-value pairs. Through valid_length we specify that for the first batch we only attend to the first 2 key-value pairs, while for the second batch we look at the first 6 key-value pairs. Therefore, although the two batches have the same queries and key-value pairs, the outputs we obtain are different.
atten = DotProductAttention(dropout=0)
keys = torch.ones((2,10,2),dtype=torch.float)
values = torch.arange((40), dtype=torch.float).view(1,10,4).repeat(2,1,1)
atten(torch.ones((2,1,2),dtype=torch.float), keys, values, torch.FloatTensor([2, 6]))
attention_weight
tensor([[[0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000]],
[[0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000,
0.0000, 0.0000]]])
out:
tensor([[[ 2.0000, 3.0000, 4.0000, 5.0000]],
[[10.0000, 11.0000, 12.0000, 13.0000]]])
Multilayer perceptron attention
In MLP attention we first project the query and the keys into $\mathbb{R}^{h}$. More concretely, we learn the parameters $\mathbf{W}_k \in \mathbb{R}^{h \times d_k}$, $\mathbf{W}_q \in \mathbb{R}^{h \times d_q}$ and $\mathbf{v} \in \mathbb{R}^{h}$, and define the score function as
$$\alpha(\mathbf{k}, \mathbf{q}) = \mathbf{v}^\top \tanh(\mathbf{W}_k \mathbf{k} + \mathbf{W}_q \mathbf{q}).$$
Equivalently, the key and the query are concatenated in the feature dimension and fed into a single-hidden-layer perceptron whose hidden layer has size $h$ and whose output has size 1; the hidden activation is tanh and no bias is used.
# Save to the d2l package.
class MLPAttention(nn.Module):
    def __init__(self, units, ipt_dim, dropout, **kwargs):
        super(MLPAttention, self).__init__(**kwargs)
        # Use flatten=True to keep query's and key's 3-D shapes.
        self.W_k = nn.Linear(ipt_dim, units, bias=False)
        self.W_q = nn.Linear(ipt_dim, units, bias=False)
        self.v = nn.Linear(units, 1, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, valid_length):
        query, key = self.W_q(query), self.W_k(key)
        # expand query to (batch_size, #queries, 1, units), and key to
        # (batch_size, 1, #kv_pairs, units). Then add them with broadcasting.
        features = query.unsqueeze(2) + key.unsqueeze(1)
        # tanh activation, matching the score function defined above
        scores = self.v(torch.tanh(features)).squeeze(-1)
        attention_weights = self.dropout(masked_softmax(scores, valid_length))
        return torch.bmm(attention_weights, value)
Testing
Although MLPAttention contains an additional MLP model, given the same inputs and the same keys we obtain the same output as with DotProductAttention.
atten = MLPAttention(ipt_dim=2,units = 8, dropout=0)
atten(torch.ones((2,1,2), dtype = torch.float), keys, values, torch.FloatTensor([2, 6]))
out:
tensor([[[ 2.0000, 3.0000, 4.0000, 5.0000]],
[[10.0000, 11.0000, 12.0000, 13.0000]]], grad_fn=<BmmBackward>)
Summary
The attention layer explicitly selects related information.
The attention layer's memory consists of key-value pairs, so its output is close to the values whose keys are similar to the query.
Seq2seq model with attention mechanism
In this section we add the attention mechanism to the sequence to sequence model so as to aggregate the encoder states with explicit weights. The figure shows the encoding and decoding structure of the model at time step t. Here the attention layer holds all the information the encoder has seen, i.e. the encoder output of every step. During decoding, the hidden state of the decoder at time step t is used as the query, and the encoder hidden states of all time steps are used as keys and values for the attention aggregation. The output of the attention model is treated as the context vector and is concatenated with the decoder input D_t before being fed into the decoder:
The figure below shows the relations among all the layers of the seq2seq-with-attention mechanism, together with the layer structure of the encoder and the decoder.
import sys
sys.path.append('/home/kesci/input/d2len9900')
import d2l
Decoder
Since the encoder of seq2seq with attention is the same as the Seq2SeqEncoder of the previous section, we only focus on the decoder here. We add an MLP attention layer (MLPAttention) whose hidden size is the same as that of the LSTM layer in the decoder. Then we initialize the state of the decoder by passing three items from the encoder:
- the encoder outputs of all timesteps: the encoder output of every step, used as the attention layer's memory with identical keys and values
- the hidden state of the encoder's final timestep: used to initialize the decoder's hidden state
- the encoder valid length: with it, the attention layer will not consider the padding tokens in the encoder outputs
At each time step of decoding, we use the output of the decoder's last RNN layer as the query for the attention layer. The output of the attention model is then concatenated with the input embedding vector and fed into the RNN layer. Although the hidden state of the RNN layer also contains historical information from the decoder, the attention output explicitly selects the encoder outputs within enc_valid_len, so that the attention mechanism excludes as much irrelevant information as possible.
class Seq2SeqAttentionDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqAttentionDecoder, self).__init__(**kwargs)
        self.attention_cell = MLPAttention(num_hiddens, num_hiddens, dropout)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size + num_hiddens, num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, enc_valid_len, *args):
        outputs, hidden_state = enc_outputs
        # Transpose outputs to (batch_size, seq_len, hidden_size)
        return (outputs.permute(1, 0, -1), hidden_state, enc_valid_len)

    def forward(self, X, state):
        enc_outputs, hidden_state, enc_valid_len = state
        X = self.embedding(X).transpose(0, 1)
        outputs = []
        for l, x in enumerate(X):
            # query shape: (batch_size, 1, hidden_size)
            # select hidden state of the last rnn layer as query
            query = hidden_state[0][-1].unsqueeze(1)
            # context has same shape as query
            context = self.attention_cell(query, enc_outputs, enc_outputs, enc_valid_len)
            # Concatenate on the feature dimension
            x = torch.cat((context, x.unsqueeze(1)), dim=-1)
            # Reshape x to (1, batch_size, embed_size + hidden_size)
            out, hidden_state = self.rnn(x.transpose(0, 1), hidden_state)
            outputs.append(out)
        outputs = self.dense(torch.cat(outputs, dim=0))
        return outputs.transpose(0, 1), [enc_outputs, hidden_state,
                                         enc_valid_len]
Now we can test the seq2seq model with attention. To be consistent with the model in Section 9.7, we use the same hyper-parameters for vocab_size, embed_size, num_hiddens and num_layers. As a result we get the same decoder output shape, but the state structure has changed.
encoder = d2l.Seq2SeqEncoder(vocab_size=10, embed_size=8,
num_hiddens=16, num_layers=2)
# encoder.initialize()
decoder = Seq2SeqAttentionDecoder(vocab_size=10, embed_size=8,
num_hiddens=16, num_layers=2)
X = torch.zeros((4, 7),dtype=torch.long)
print("batch size=4\nseq_length=7\nhidden dim=16\nnum_layers=2\n")
print('encoder output size:', encoder(X)[0].size())
print('encoder hidden size:', encoder(X)[1][0].size())
print('encoder memory size:', encoder(X)[1][1].size())
state = decoder.init_state(encoder(X), None)
out, state = decoder(X, state)
out.shape, len(state), state[0].shape, len(state[1]), state[1][0].shape
batch size=4
seq_length=7
hidden dim=16
num_layers=2
encoder output size: torch.Size([7, 4, 16])
encoder hidden size: torch.Size([2, 4, 16])
encoder memory size: torch.Size([2, 4, 16])
out:
(torch.Size([4, 7, 10]), 3, torch.Size([4, 7, 16]), 2, torch.Size([2, 4, 16]))
Training
Similarly to Section 9.7.4, we try a simple toy model by applying the same training hyper-parameters and the same training loss. From the results we can see that, since the sequences in the training dataset are relatively short, the additional attention layer does not bring a significant improvement. Because of the computational overhead of the attention layers, this model is much slower than the seq2seq model without attention.
import zipfile
import torch
import requests
from io import BytesIO
from torch.utils import data
import sys
import collections
class Vocab(object):  # This class is saved in d2l.
    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        # sort by frequency and token
        counter = collections.Counter(tokens)
        token_freqs = sorted(counter.items(), key=lambda x: x[0])
        token_freqs.sort(key=lambda x: x[1], reverse=True)
        if use_special_tokens:
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            tokens = ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            tokens = ['<unk>']
        tokens += [token for token, freq in token_freqs if freq >= min_freq]
        self.idx_to_token = []
        self.token_to_idx = dict()
        for token in tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        else:
            return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        else:
            return [self.idx_to_token[index] for index in indices]
def load_data_nmt(batch_size, max_len, num_examples=1000):
    """Download an NMT dataset, return its vocabulary and data iterator."""
    # Download and preprocess
    def preprocess_raw(text):
        text = text.replace('\u202f', ' ').replace('\xa0', ' ')
        out = ''
        for i, char in enumerate(text.lower()):
            if char in (',', '!', '.') and text[i-1] != ' ':
                out += ' '
            out += char
        return out

    with open('/home/kesci/input/fraeng6506/fra.txt', 'r') as f:
        raw_text = f.read()
    text = preprocess_raw(raw_text)

    # Tokenize
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if i >= num_examples:
            break
        parts = line.split('\t')
        if len(parts) >= 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))

    # Build vocab
    def build_vocab(tokens):
        tokens = [token for line in tokens for token in line]
        return Vocab(tokens, min_freq=3, use_special_tokens=True)
    src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)

    # Convert to index arrays
    def pad(line, max_len, padding_token):
        if len(line) > max_len:
            return line[:max_len]
        return line + [padding_token] * (max_len - len(line))

    def build_array(lines, vocab, max_len, is_source):
        lines = [vocab[line] for line in lines]
        if not is_source:
            lines = [[vocab.bos] + line + [vocab.eos] for line in lines]
        array = torch.tensor([pad(line, max_len, vocab.pad) for line in lines])
        valid_len = (array != vocab.pad).sum(1)
        return array, valid_len

    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
    train_data = data.TensorDataset(src_array, src_valid_len, tgt_array, tgt_valid_len)
    train_iter = data.DataLoader(train_data, batch_size, shuffle=True)
    return src_vocab, tgt_vocab, train_iter
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.0
batch_size, num_steps = 64, 10
lr, num_epochs, ctx = 0.005, 500, d2l.try_gpu()
src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size, num_steps)
encoder = d2l.Seq2SeqEncoder(
len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqAttentionDecoder(
len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
Training and prediction
d2l.train_s2s_ch9(model, train_iter, lr, num_epochs, ctx)
epoch 50,loss 0.104, time 54.7 sec
epoch 100,loss 0.046, time 54.8 sec
epoch 150,loss 0.031, time 54.7 sec
epoch 200,loss 0.027, time 54.3 sec
epoch 250,loss 0.025, time 54.3 sec
epoch 300,loss 0.024, time 54.4 sec
epoch 350,loss 0.024, time 54.4 sec
epoch 400,loss 0.024, time 54.5 sec
epoch 450,loss 0.023, time 54.4 sec
epoch 500,loss 0.023, time 54.7 sec
for sentence in ['Go .', 'Good Night !', "I'm OK .", 'I won !']:
    print(sentence + ' => ' + d2l.predict_s2s_ch9(
        model, sentence, src_vocab, tgt_vocab, num_steps, ctx))
Go . => va !
Good Night ! => !
I'm OK . => ça va .
I won ! => j'ai gagné !
Transformer
In previous chapters we introduced the mainstream neural network architectures, convolutional neural networks (CNNs) and recurrent neural networks (RNNs). Let us briefly recap:
- CNNs are easy to parallelize but are not well suited to capturing dependencies within variable-length sequences.
- RNNs are good at capturing long-range dependencies in variable-length sequences, but they are hard to parallelize over the sequence.
To combine the advantages of CNNs and RNNs, [Vaswani et al., 2017] designed the Transformer model around the attention mechanism. The model captures sequence dependencies with attention in a parallelizable way and processes the tokens at all positions of the sequence simultaneously; these advantages let the Transformer achieve excellent performance while greatly reducing training time.
Figure 10.3.1 shows the architecture of the Transformer. Like the seq2seq model of Section 9.7, the Transformer is also based on the encoder-decoder architecture; it differs mainly in the following three points:
- Transformer blocks: the recurrent network in the seq2seq model is replaced by Transformer blocks, each containing a multi-head attention layer and two position-wise feed-forward networks (FFN). For the decoder, another multi-head attention layer is used to receive the encoder's hidden states.
- Add and norm: the outputs of the multi-head attention layer and the feed-forward network are processed by "add and norm" layers, which contain a residual connection followed by layer normalization.
- Position encoding: since the self-attention layer does not distinguish the order of the elements, a positional encoding layer is used to add position information to the sequence elements.
Fig. 10.3.1 The Transformer architecture
In the following sections we implement the new sub-modules of the Transformer and build a neural machine translation model for training and testing.
import os
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import sys
sys.path.append('/home/kesci/input/d2len9900')
import d2l
Below we copy the masked softmax implementation from the previous section; it is not explained again here.
def SequenceMask(X, X_len, value=-1e6):
    maxlen = X.size(1)
    X_len = X_len.to(X.device)
    mask = torch.arange((maxlen), dtype=torch.float, device=X.device)
    mask = mask[None, :] < X_len[:, None]
    X[~mask] = value
    return X

def masked_softmax(X, valid_length):
    # X: 3-D tensor, valid_length: 1-D or 2-D tensor
    softmax = nn.Softmax(dim=-1)
    if valid_length is None:
        return softmax(X)
    else:
        shape = X.shape
        if valid_length.dim() == 1:
            try:
                valid_length = torch.FloatTensor(valid_length.numpy().repeat(shape[1], axis=0))
            except:
                valid_length = torch.FloatTensor(valid_length.cpu().numpy().repeat(shape[1], axis=0))
        else:
            valid_length = valid_length.reshape((-1,))
        # fill masked elements with a large negative, whose exp is 0
        X = SequenceMask(X.reshape((-1, shape[-1])), valid_length)
        return softmax(X).reshape(shape)
# Save to the d2l package.
class DotProductAttention(nn.Module):
    def __init__(self, dropout, **kwargs):
        super(DotProductAttention, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    # query: (batch_size, #queries, d)
    # key: (batch_size, #kv_pairs, d)
    # value: (batch_size, #kv_pairs, dim_v)
    # valid_length: either (batch_size, ) or (batch_size, xx)
    def forward(self, query, key, value, valid_length=None):
        d = query.shape[-1]
        # set transpose_b=True to swap the last two dimensions of key
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(d)
        attention_weights = self.dropout(masked_softmax(scores, valid_length))
        return torch.bmm(attention_weights, value)
Multi-head attention layer
Before we discuss the multi-head attention layer, let us quickly go over the structure of self-attention. Self-attention is an ordinary attention model in which the key, value and query for every element of the sequence are exactly the same. As illustrated in Fig. 10.3.2, self-attention outputs a sequence of representations with the same length as the input. Compared with a recurrent network, the outputs for all elements can be computed in parallel, so this module can be implemented very efficiently.
The multi-head attention layer contains h parallel self-attention layers, each of which is called a head. For each head, before computing attention we project the query, key and value with three linear layers; the outputs of the h heads are concatenated and fed into a final linear layer.
Assume the dimensions of the query, key and value are $d_q$, $d_k$ and $d_v$ respectively. Then for each head $i = 1, \ldots, h$ we learn the weights $\mathbf{W}_q^{(i)} \in \mathbb{R}^{p_q \times d_q}$, $\mathbf{W}_k^{(i)} \in \mathbb{R}^{p_k \times d_k}$ and $\mathbf{W}_v^{(i)} \in \mathbb{R}^{p_v \times d_v}$, and the output of head $i$ is
$$\mathbf{o}^{(i)} = \mathrm{attention}\big(\mathbf{W}_q^{(i)} \mathbf{q},\ \mathbf{W}_k^{(i)} \mathbf{k},\ \mathbf{W}_v^{(i)} \mathbf{v}\big),$$
where the attention can be any attention function, such as the dot-product attention or the MLP attention introduced above. The outputs of all heads are then concatenated and fed into the final linear layer with weight $\mathbf{W}_o \in \mathbb{R}^{d_o \times h p_v}$:
$$\mathbf{o} = \mathbf{W}_o \begin{bmatrix} \mathbf{o}^{(1)} \\ \vdots \\ \mathbf{o}^{(h)} \end{bmatrix}.$$
Now we can implement multi-head attention. Assume we have h heads and that the hidden sizes $p_q = p_k = p_v$ equal the dimensions of the query, key and value (hidden_size). Moreover, since the multi-head attention layer keeps the dimensions of its input and output tensors unchanged, the output feature dimension $d_o$ is also set to hidden_size.
class MultiHeadAttention(nn.Module):
    def __init__(self, input_size, hidden_size, num_heads, dropout, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.num_heads = num_heads
        self.attention = DotProductAttention(dropout)
        self.W_q = nn.Linear(input_size, hidden_size, bias=False)
        self.W_k = nn.Linear(input_size, hidden_size, bias=False)
        self.W_v = nn.Linear(input_size, hidden_size, bias=False)
        self.W_o = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, query, key, value, valid_length):
        # query, key, and value shape: (batch_size, seq_len, dim),
        # where seq_len is the length of input sequence
        # valid_length shape is either (batch_size, )
        # or (batch_size, seq_len).
        # Project and transpose query, key, and value from
        # (batch_size, seq_len, hidden_size * num_heads) to
        # (batch_size * num_heads, seq_len, hidden_size).
        query = transpose_qkv(self.W_q(query), self.num_heads)
        key = transpose_qkv(self.W_k(key), self.num_heads)
        value = transpose_qkv(self.W_v(value), self.num_heads)

        if valid_length is not None:
            # Copy valid_length by num_heads times
            device = valid_length.device
            valid_length = valid_length.cpu().numpy() if valid_length.is_cuda else valid_length.numpy()
            if valid_length.ndim == 1:
                valid_length = torch.FloatTensor(np.tile(valid_length, self.num_heads))
            else:
                valid_length = torch.FloatTensor(np.tile(valid_length, (self.num_heads, 1)))
            valid_length = valid_length.to(device)

        output = self.attention(query, key, value, valid_length)
        output_concat = transpose_output(output, self.num_heads)
        return self.W_o(output_concat)

def transpose_qkv(X, num_heads):
    # Original X shape: (batch_size, seq_len, hidden_size * num_heads),
    # -1 means inferring its value, after first reshape, X shape:
    # (batch_size, seq_len, num_heads, hidden_size)
    X = X.view(X.shape[0], X.shape[1], num_heads, -1)
    # After transpose, X shape: (batch_size, num_heads, seq_len, hidden_size)
    X = X.transpose(2, 1).contiguous()
    # Merge the first two dimensions.
    # output shape: (batch_size * num_heads, seq_len, hidden_size)
    output = X.view(-1, X.shape[2], X.shape[3])
    return output

# Saved in the d2l package for later use
def transpose_output(X, num_heads):
    # A reversed version of transpose_qkv
    X = X.view(-1, num_heads, X.shape[1], X.shape[2])
    X = X.transpose(2, 1).contiguous()
    return X.view(X.shape[0], X.shape[1], -1)
cell = MultiHeadAttention(5, 9, 3, 0.5)
X = torch.ones((2, 4, 5))
valid_length = torch.FloatTensor([2, 3])
cell(X, X, X, valid_length).shape
out:
torch.Size([2, 4, 9])
Position-wise feed-forward network
Another very important part of the Transformer block is the position-wise feed-forward network (FFN). It takes a 3-D tensor of shape (batch_size, seq_length, feature_size). The position-wise FFN consists of two fully-connected layers that act on the last dimension. Because the state at every position of the sequence is updated independently, we call it position-wise; this is equivalent to a 1x1 convolution (see the check after the example output below).
Let us implement PositionWiseFFN:
# Save to the d2l package.
class PositionWiseFFN(nn.Module):
    def __init__(self, input_size, ffn_hidden_size, hidden_size_out, **kwargs):
        super(PositionWiseFFN, self).__init__(**kwargs)
        self.ffn_1 = nn.Linear(input_size, ffn_hidden_size)
        self.ffn_2 = nn.Linear(ffn_hidden_size, hidden_size_out)

    def forward(self, X):
        return self.ffn_2(F.relu(self.ffn_1(X)))
Similar to the multi-head attention layer, the FFN only changes the size of the last dimension; moreover, if two inputs are exactly the same, their FFN outputs are also identical.
ffn = PositionWiseFFN(4, 4, 8)
out = ffn(torch.ones((2,3,4)))
print(out, out.shape)
tensor([[[ 0.2040, -0.1118, -0.1163, 0.1494, 0.3978, -0.5561, 0.4662,
-0.6598],
[ 0.2040, -0.1118, -0.1163, 0.1494, 0.3978, -0.5561, 0.4662,
-0.6598],
[ 0.2040, -0.1118, -0.1163, 0.1494, 0.3978, -0.5561, 0.4662,
-0.6598]],
[[ 0.2040, -0.1118, -0.1163, 0.1494, 0.3978, -0.5561, 0.4662,
-0.6598],
[ 0.2040, -0.1118, -0.1163, 0.1494, 0.3978, -0.5561, 0.4662,
-0.6598],
[ 0.2040, -0.1118, -0.1163, 0.1494, 0.3978, -0.5561, 0.4662,
-0.6598]]], grad_fn=<AddBackward0>) torch.Size([2, 3, 8])
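The claim above that a position-wise FFN is equivalent to a 1x1 convolution can be verified with the short sketch below (an assumption-checking example, not part of the original notes): the FFN weights are copied into two Conv1d layers with kernel size 1 and the outputs are compared.
import torch
import torch.nn as nn

x = torch.randn(2, 3, 4)                # (batch, seq_len, features)
ffn = PositionWiseFFN(4, 4, 8)
conv1 = nn.Conv1d(4, 4, kernel_size=1)  # acts on the feature (channel) dimension
conv2 = nn.Conv1d(4, 8, kernel_size=1)
# copy the FFN weights and biases into the 1x1 convolutions
conv1.weight.data = ffn.ffn_1.weight.data.unsqueeze(-1)
conv1.bias.data = ffn.ffn_1.bias.data
conv2.weight.data = ffn.ffn_2.weight.data.unsqueeze(-1)
conv2.bias.data = ffn.ffn_2.bias.data
y_ffn = ffn(x)
y_conv = conv2(torch.relu(conv1(x.transpose(1, 2)))).transpose(1, 2)
print(torch.allclose(y_ffn, y_conv, atol=1e-6))  # True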
Add and Norm
Besides the two modules above, the Transformer has another important component, the add-and-norm layer, which smoothly integrates the input and the outputs of other layers. We therefore add a layer norm with a residual connection after every multi-head attention layer and every FFN layer. Layer norm is very similar to the batch norm of Section 7.5; the only difference is that batch norm computes the mean and variance over the batch dimension, while layer norm computes them over the last dimension. Layer normalization keeps the values within a layer from varying too much, which helps speed up training and improves generalization.
layernorm = nn.LayerNorm(normalized_shape=2, elementwise_affine=True)
batchnorm = nn.BatchNorm1d(num_features=2, affine=True)
X = torch.FloatTensor([[1,2], [3,4]])
print('layer norm:', layernorm(X))
print('batch norm:', batchnorm(X))
# Save to the d2l package.
class AddNorm(nn.Module):
    def __init__(self, hidden_size, dropout, **kwargs):
        super(AddNorm, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(hidden_size)

    def forward(self, X, Y):
        return self.norm(self.dropout(Y) + X)
Because of the residual connection, X and Y must have the same shape.
add_norm = AddNorm(4, 0.5)
add_norm(torch.ones((2,3,4)), torch.ones((2,3,4))).shape
Positional encoding
Unlike recurrent neural networks, both the multi-head attention layer and the feed-forward network update each position of the sequence independently. This property enables efficient parallelization but loses the important information about the order of the sequence. To better capture the sequence order, the Transformer model adds a positional encoding to the input sequence elements. Assume the input embedding is $\mathbf{X} \in \mathbb{R}^{l \times d}$, where $l$ is the sequence length and $d$ is the embedding dimension; the positional encoding is $\mathbf{P} \in \mathbb{R}^{l \times d}$ and the output is the sum $\mathbf{X} + \mathbf{P}$.
The positional encoding is a 2-D matrix, where $i$ indexes the position in the sequence and $j$ indexes the dimension within the embedding vector. It can be computed as
$$P_{i, 2j} = \sin\!\left(\frac{i}{10000^{2j/d}}\right), \qquad P_{i, 2j+1} = \cos\!\left(\frac{i}{10000^{2j/d}}\right).$$
class PositionalEncoding(nn.Module):
    def __init__(self, embedding_size, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.P = np.zeros((1, max_len, embedding_size))
        X = np.arange(0, max_len).reshape(-1, 1) / np.power(
            10000, np.arange(0, embedding_size, 2) / embedding_size)
        self.P[:, :, 0::2] = np.sin(X)
        self.P[:, :, 1::2] = np.cos(X)
        self.P = torch.FloatTensor(self.P)

    def forward(self, X):
        if X.is_cuda and not self.P.is_cuda:
            self.P = self.P.cuda()
        X = X + self.P[:, :X.shape[1], :]
        return self.dropout(X)
Testing
Let us run a small test with the PositionalEncoding class and visualize four of its dimensions. We can see that the 4th and 5th dimensions share the same frequency but have different phases, while the 6th and 7th dimensions have a lower frequency; the positional encoding therefore distinguishes different dimensions.
import numpy as np
pe = PositionalEncoding(20, 0)
Y = pe(torch.zeros((1, 100, 20))).numpy()
d2l.plot(np.arange(100), Y[0, :, 4:8].T, figsize=(6, 2.5),
legend=["dim %d" % p for p in [4, 5, 6, 7]])
Encoder
We now have all the modules that make up the Transformer, so let us assemble them. An encoder block contains a multi-head attention layer, a position-wise FFN, and two add-and-norm layers. For both the attention model and the FFN the output dimension equals the embedding dimension. This is a consequence of the residual connections: the output of each sub-layer must be added to its original input before normalization.
class EncoderBlock(nn.Module):
    def __init__(self, embedding_size, ffn_hidden_size, num_heads,
                 dropout, **kwargs):
        super(EncoderBlock, self).__init__(**kwargs)
        self.attention = MultiHeadAttention(embedding_size, embedding_size, num_heads, dropout)
        self.addnorm_1 = AddNorm(embedding_size, dropout)
        self.ffn = PositionWiseFFN(embedding_size, ffn_hidden_size, embedding_size)
        self.addnorm_2 = AddNorm(embedding_size, dropout)

    def forward(self, X, valid_length):
        Y = self.addnorm_1(X, self.attention(X, X, X, valid_length))
        return self.addnorm_2(Y, self.ffn(Y))
# batch_size = 2, seq_len = 100, embedding_size = 24
# ffn_hidden_size = 48, num_heads = 8, dropout = 0.5
X = torch.ones((2, 100, 24))
encoder_blk = EncoderBlock(24, 48, 8, 0.5)
encoder_blk(X, valid_length).shape
out:
torch.Size([2, 100, 24])
Now let us implement the whole Transformer encoder. It stacks n of the encoder blocks we just defined. Because of the residual connections, the dimension of the intermediate states always equals the embedding dimension d; also note that we multiply the embedding by $\sqrt{d}$ to prevent its values from being too small.
class TransformerEncoder(d2l.Encoder):
    def __init__(self, vocab_size, embedding_size, ffn_hidden_size,
                 num_heads, num_layers, dropout, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embedding_size = embedding_size
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.pos_encoding = PositionalEncoding(embedding_size, dropout)
        self.blks = nn.ModuleList()
        for i in range(num_layers):
            self.blks.append(
                EncoderBlock(embedding_size, ffn_hidden_size,
                             num_heads, dropout))

    def forward(self, X, valid_length, *args):
        X = self.pos_encoding(self.embed(X) * math.sqrt(self.embedding_size))
        for blk in self.blks:
            X = blk(X, valid_length)
        return X
# test encoder
encoder = TransformerEncoder(200, 24, 48, 8, 2, 0.5)
encoder(torch.ones((2, 100)).long(), valid_length).shape
out:
torch.Size([2, 100, 24])
Decoder
The Transformer decoder has a structure similar to the encoder; however, besides the sub-modules introduced before, each decoder block contains one more sub-module: another multi-head attention layer that takes the encoder outputs as keys and values and the decoder state as the query. Like the encoder, the decoder also uses add-and-norm: residual connections and layer normalization connect the outputs of the sub-layers.
More precisely, at time step t the current input $\mathbf{x}_t$ is the query, and the self-attention attends over the inputs of step t and all previous steps, $\mathbf{x}_1, \ldots, \mathbf{x}_t$. During training, the input at position t could in principle observe the whole sequence, which contradicts the situation at prediction time, so we set the valid length of the attention at time step t to t, masking out the future information that must not be seen (a short illustration of this mask follows).
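The sketch below (illustrative only, not in the original text) prints the training-time valid-length matrix built inside DecoderBlock: the value in the j-th column is j+1, so position j may only attend to positions up to itself.
import numpy as np

batch_size, seq_len = 2, 4
print(np.tile(np.arange(1, seq_len + 1), (batch_size, 1)))
# [[1 2 3 4]
#  [1 2 3 4]]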
class DecoderBlock(nn.Module):
    def __init__(self, embedding_size, ffn_hidden_size, num_heads, dropout, i, **kwargs):
        super(DecoderBlock, self).__init__(**kwargs)
        self.i = i
        self.attention_1 = MultiHeadAttention(embedding_size, embedding_size, num_heads, dropout)
        self.addnorm_1 = AddNorm(embedding_size, dropout)
        self.attention_2 = MultiHeadAttention(embedding_size, embedding_size, num_heads, dropout)
        self.addnorm_2 = AddNorm(embedding_size, dropout)
        self.ffn = PositionWiseFFN(embedding_size, ffn_hidden_size, embedding_size)
        self.addnorm_3 = AddNorm(embedding_size, dropout)

    def forward(self, X, state):
        enc_outputs, enc_valid_length = state[0], state[1]
        # state[2][self.i] stores all the previous t-1 query states of layer-i
        # len(state[2]) = num_layers
        # If training:
        #     state[2] is useless.
        # If predicting:
        #     In the t-th timestep:
        #         state[2][self.i].shape = (batch_size, t-1, hidden_size)
        # Demo:
        #   love dogs ! [EOS]
        #    |    |   |   |
        #      Transformer
        #        Decoder
        #    |    |   |   |
        #    I  love dogs !
        if state[2][self.i] is None:
            key_values = X
        else:
            # shape of key_values = (batch_size, t, hidden_size)
            key_values = torch.cat((state[2][self.i], X), dim=1)
        state[2][self.i] = key_values
        if self.training:
            batch_size, seq_len, _ = X.shape
            # Shape: (batch_size, seq_len), the values in the j-th column are j+1
            valid_length = torch.FloatTensor(np.tile(np.arange(1, seq_len + 1), (batch_size, 1)))
            valid_length = valid_length.to(X.device)
        else:
            valid_length = None

        X2 = self.attention_1(X, key_values, key_values, valid_length)
        Y = self.addnorm_1(X, X2)
        Y2 = self.attention_2(Y, enc_outputs, enc_outputs, enc_valid_length)
        Z = self.addnorm_2(Y, Y2)
        return self.addnorm_3(Z, self.ffn(Z)), state
decoder_blk = DecoderBlock(24, 48, 8, 0.5, 0)
X = torch.ones((2, 100, 24))
state = [encoder_blk(X, valid_length), valid_length, [None]]
decoder_blk(X, state)[0].shape
out:
torch.Size([2, 100, 24])
The Transformer decoder is constructed in the same way as the encoder, except that a dense layer is added at the end to obtain confidence scores for the output. Let us now implement the Transformer decoder. Besides the usual hyper-parameters such as vocab_size and embedding_size, the decoder also needs the encoder outputs enc_outputs and the valid sentence lengths enc_valid_length.
class TransformerDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embedding_size, ffn_hidden_size,
                 num_heads, num_layers, dropout, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.pos_encoding = PositionalEncoding(embedding_size, dropout)
        self.blks = nn.ModuleList()
        for i in range(num_layers):
            self.blks.append(
                DecoderBlock(embedding_size, ffn_hidden_size, num_heads,
                             dropout, i))
        self.dense = nn.Linear(embedding_size, vocab_size)

    def init_state(self, enc_outputs, enc_valid_length, *args):
        return [enc_outputs, enc_valid_length, [None] * self.num_layers]

    def forward(self, X, state):
        X = self.pos_encoding(self.embed(X) * math.sqrt(self.embedding_size))
        for blk in self.blks:
            X, state = blk(X, state)
        return self.dense(X), state
Training
The Vocab class and the load_data_nmt function used here are identical to the ones defined in the previous section, so their code is not repeated.
import os
import d2l
# The platform does not support GPU for now, so training automatically runs on the CPU; once the GPU is available it will be used for training.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
embed_size, embedding_size, num_layers, dropout = 32, 32, 2, 0.05
batch_size, num_steps = 64, 10
lr, num_epochs, ctx = 0.005, 250, d2l.try_gpu()
print(ctx)
num_hiddens, num_heads = 64, 4
src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size, num_steps)
encoder = TransformerEncoder(
len(src_vocab), embedding_size, num_hiddens, num_heads, num_layers,
dropout)
decoder = TransformerDecoder(
    len(tgt_vocab), embedding_size, num_hiddens, num_heads, num_layers,
    dropout)
model = d2l.EncoderDecoder(encoder, decoder)
d2l.train_s2s_ch9(model, train_iter, lr, num_epochs, ctx)
cpu
epoch 50,loss 0.048, time 53.3 sec
epoch 100,loss 0.040, time 53.4 sec
epoch 150,loss 0.037, time 53.5 sec
epoch 200,loss 0.036, time 53.6 sec
epoch 250,loss 0.035, time 53.5 sec
model.eval()
for sentence in ['Go .', 'Wow !', "I'm OK .", 'I won !']:
    print(sentence + ' => ' + d2l.predict_s2s_ch9(
        model, sentence, src_vocab, tgt_vocab, num_steps, ctx))
Go . => !
Wow ! => !
I'm OK . => ça va .
I won ! => j'ai gagné !