这个算是在课程学习之外的探索,不过希望能尽快用到项目实践中。在文章里会引用较多的博客,文末会进行reference。
搜索Transformer机制,会发现高分结果基本上都源于Jay Alammar的一篇博客《The Illustrated Transformer》(图解Transformer),而被提到最多的Attention论文是Google的《Attention Is All You Need》。
- 对于Transformer的运行机制了解即可,所以会基于这篇论文来学习Transformer,结合《Sklearn+Tensorflow》中Attention注意力机制一章完成基本的概念学习;
- 找一个基于Transformer的项目练手
5.代码实现
整合各个module
在之前的modules中,已经完成了一个Transformer网络各个组件的编写,在这里要将不同组件进行整合。
我不知道这个代码的质量,还是先搞定,再看Google的原版,以及使用Pytorch的实现。
主要包括四个方法,对应Transformer的Encoder和Decoder结构以及训练和评价。
- def encode(self, xs, training=True)
- def decode(self, ys, memory, training=True)
- def train(self, xs, ys)
- def eval(self, xs, ys)
引入必要库
import tensorflow as tf
from data_load import load_vocab
from modules import get_token_embeddings, ff, positional_encoding, multihead_attention, label_smoothing, noam_scheme
from utils import convert_idx_to_token_tensor
from tqdm import tqdm
import logging
logging.basicConfig(level=logging.INFO)
构建类
class Transformer:
    '''
    Skeleton of the Transformer model: constructor plus the four method stubs
    (encode / decode / train / eval) whose bodies are given separately.

    xs: tuple of
        x: int32 tensor. (N, T1)
        x_seqlens: int32 tensor. (N,)
        sents1: str tensor. (N,)
    ys: tuple of
        decoder_input: int32 tensor. (N, T2)
        y: int32 tensor. (N, T2)
        y_seqlen: int32 tensor. (N, )
        sents2: str tensor. (N,)
    training: boolean.
    '''
    def __init__(self, hp):
        # hp: hyperparameter object; hp.vocab, hp.vocab_size and hp.d_model are read here.
        self.hp = hp
        # presumably token->index and index->token mappings, judging by the names -- confirm in data_load
        self.token2idx, self.idx2token = load_vocab(hp.vocab)
        # Single embedding table; the encoder and decoder bodies both look tokens up in it.
        self.embeddings = get_token_embeddings(self.hp.vocab_size, self.hp.d_model, zero_pad=True)

    def encode(self, xs, training=True):
        '''
        Encoder part; pay attention to how it is implemented.
        :param xs: source-side input tuple (see the class docstring)
        :param training: boolean
        :return: encoder outputs. (N, T1, d_model)
        '''

    def decode(self, ys, memory, training=True):
        '''
        Decoder part; pay attention to how it is implemented.
        :param ys: target-side input tuple (see the class docstring)
        :param memory: the output of the encoder.
        :param training: boolean
        :return:
        '''

    def train(self, xs, ys):
        '''
        Training part.
        :param xs: source-side input tuple
        :param ys: target-side input tuple
        :return:
        '''

    def eval(self, xs, ys):
        '''
        Autoregressive prediction; the decoder input is ignored at inference time.
        :param xs: source-side input tuple
        :param ys: target-side input tuple
        :return:
        '''
构建encoder
注意实现过程,对比和后面decoder的区别
def encode(self, xs, training=True):
    '''
    Build the encoder: embed the source tokens, add positional encoding and
    dropout, then apply hp.num_blocks blocks of self-attention + feed-forward.

    :param xs: tuple (x, seqlens, sents1); x holds the source token ids that are
        looked up in the embedding table, sents1 is passed through unchanged.
    :param training: boolean forwarded to dropout and to multihead_attention.
    :return: tuple (memory, sents1); memory is the encoder output, documented
        elsewhere in this file as (N, T1, d_model).
    '''
    # Fixed: the API is tf.variable_scope (lower-case "v"); tf.Variable_scope
    # does not exist and raised AttributeError as soon as the graph was built.
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # embedding ("enc" is short for encoder)
        enc = tf.nn.embedding_lookup(self.embeddings, x)
        enc *= self.hp.d_model ** 0.5  # scale embeddings by sqrt(d_model)
        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        # blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention: queries, keys and values are all the encoder state
                enc = multihead_attention(
                    queries=enc,
                    keys=enc,
                    values=enc,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=False
                )
                # feed-forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, sents1
构建decoder
def decode(self, ys, memory, training=True):
    '''
    Build the decoder: embed the decoder inputs, run hp.num_blocks blocks of
    masked self-attention, attention over the encoder output and feed-forward,
    then project onto the vocabulary with the transposed embedding table.

    :param ys: tuple (decoder_inputs, y, seqlens, sents2); only decoder_inputs
        is consumed here, y and sents2 are returned unchanged.
    :param memory: the encoder output, used as keys and values of the second attention.
    :param training: boolean forwarded to dropout and to multihead_attention.
    :return: tuple (logits, y_hat, y, sents2); y_hat is the int32 argmax of
        logits over the last axis.
    '''
    hp = self.hp
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, _seqlens, sents2 = ys

        # embedding, scaled by sqrt(d_model), plus positional encoding and dropout
        hidden = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)
        hidden = hidden * (hp.d_model ** 0.5)
        hidden = hidden + positional_encoding(hidden, hp.maxlen2)
        hidden = tf.layers.dropout(hidden, hp.dropout_rate, training=training)

        # arguments shared by both attention calls in every block
        attn_common = dict(
            num_heads=hp.num_heads,
            dropout_rate=hp.dropout_rate,
            training=training,
        )

        for block_idx in range(hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(block_idx), reuse=tf.AUTO_REUSE):
                # masked self-attention (causality is True here)
                hidden = multihead_attention(
                    queries=hidden,
                    keys=hidden,
                    values=hidden,
                    causality=True,
                    scope="self_attention",
                    **attn_common
                )
                # vanilla attention over the encoder output
                hidden = multihead_attention(
                    queries=hidden,
                    keys=memory,
                    values=memory,
                    causality=False,
                    scope="vanilla_attention",
                    **attn_common
                )
                # feed-forward
                hidden = ff(hidden, num_units=[hp.d_ff, hp.d_model])

    # final linear projection, reusing the embedding weights (transposed)
    logits = tf.einsum('ntd,dk->ntk', hidden, tf.transpose(self.embeddings))
    y_hat = tf.to_int32(tf.argmax(logits, axis=-1))
    return logits, y_hat, y, sents2
构建训练和评价方法
def train(self, xs, ys):
    '''
    Build the training graph: forward pass, label-smoothed cross-entropy
    averaged over non-padding target positions, Adam update and summaries.

    :param xs: source-side tuple passed to encode.
    :param ys: target-side tuple passed to decode.
    :return: tuple (loss, train_op, global_step, summaries)
    '''
    # forward pass
    memory, _ = self.encode(xs)
    logits, _, y, _ = self.decode(ys, memory)

    # label-smoothed one-hot targets and per-position cross-entropy
    smoothed_targets = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=smoothed_targets)

    # 1.0 where the target is not <pad>, 0.0 where it is
    pad_mask = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))
    # the small epsilon guards against division by zero when everything is padding
    loss = tf.reduce_sum(cross_entropy * pad_mask) / (tf.reduce_sum(pad_mask) + 1e-7)

    # optimizer with the learning rate produced by noam_scheme
    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    train_op = tf.train.AdamOptimizer(lr).minimize(loss, global_step=global_step)

    # scalar summaries, registered in the same order as before
    for tag, value in (('lr', lr), ("loss", loss), ("global_step", global_step)):
        tf.summary.scalar(tag, value)
    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def eval(self, xs, ys):
    '''
    Autoregressive prediction. The decoder input supplied in ys is ignored:
    decoding starts from a column of "<s>" ids and the prediction so far is
    fed back in on every step.

    :param xs: source-side tuple passed to encode.
    :param ys: tuple (decoder_inputs, y, y_seqlen, sents2); decoder_inputs is
        replaced, the remaining items are forwarded to decode.
    :return: tuple (y_hat, summaries)
    '''
    decoder_inputs, y, y_seqlen, sents2 = ys

    # Replace the given decoder input with one "<s>" id per batch row.
    decoder_inputs = tf.ones((tf.shape(xs[0])[0], 1), tf.int32) * self.token2idx["<s>"]
    ys = (decoder_inputs, y, y_seqlen, sents2)

    memory, sents1 = self.encode(xs, False)

    logging.info("Inference graph is being built. Please be patient.")
    for _ in tqdm(range(self.hp.maxlen2)):
        logits, y_hat, y, sents2 = self.decode(ys, memory, False)
        # NOTE(review): this compares a graph tensor with a Python int while the
        # graph is being built; in TF1 graph mode it likely never evaluates to
        # True, so the loop runs all hp.maxlen2 steps -- confirm before relying on it.
        if tf.reduce_sum(y_hat, 1) == self.token2idx["<pad>"]: break

        # y_hat holds the whole prediction so far, so prepend the "<s>" column again.
        _decoder_inputs = tf.concat((decoder_inputs, y_hat), 1)
        ys = (_decoder_inputs, y, y_seqlen, sents2)

    # monitor a random sample
    # Fixed: maxval is exclusive, so the old `batch - 1` bound could never pick
    # the last row and produced an empty range (minval == maxval) for a batch of one.
    n = tf.random_uniform((), 0, tf.shape(y_hat)[0], tf.int32)
    sent1 = sents1[n]
    pred = convert_idx_to_token_tensor(y_hat[n], self.idx2token)
    sent2 = sents2[n]

    tf.summary.text("sent1", sent1)
    tf.summary.text("pred", pred)
    tf.summary.text("sent2", sent2)
    summaries = tf.summary.merge_all()

    return y_hat, summaries
到此为止,整个Transformer就搭建完毕,接下来要做的就是训练和调试。我在想要不要花一天时间训练,还是找一个更成熟的demo来做。
接下来要安装Pytorch,因为AllenNLP已经做好了这些。