Preface
After the word embedding layer, the input passes through N stacked BertLayer modules (e.g., 12 or 24), and the output of the last one is the output of BERT. This article walks through a single BertLayer forward pass.
All of the code lives in pytorch_pretrained_bert/modeling.py.
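The stacking itself is done by BertEncoder in the same file; roughly speaking (a simplified sketch, not the verbatim source, with the handling of intermediate layer outputs omitted):

import copy
import torch.nn as nn

class BertEncoder(nn.Module):
    # Simplified: the real forward() can also return every intermediate layer's output.
    def __init__(self, config):
        super(BertEncoder, self).__init__()
        layer = BertLayer(config)
        self.layer = nn.ModuleList(
            [copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask):
        for layer_module in self.layer:
            hidden_states = layer_module(hidden_states, attention_mask)
        return hidden_states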
BertLayer
First, let's look at the forward pass of BertLayer. The input goes through a BertAttention layer, a BertIntermediate layer, and a BertOutput layer in turn; we analyze each of these three layers below.
Figure 1: The forward pass of a single BertLayer
class BertLayer(nn.Module):
    def __init__(self, config):
        super(BertLayer, self).__init__()
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask):
        # BertAttention layer
        attention_output = self.attention(hidden_states, attention_mask)
        # BertIntermediate layer
        intermediate_output = self.intermediate(attention_output)
        # BertOutput layer
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
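As a quick usage sketch (the config values below are made up for illustration, and the BertConfig constructor arguments follow my reading of pytorch_pretrained_bert, so treat them as assumptions):

import torch
from pytorch_pretrained_bert.modeling import BertConfig, BertLayer

# Hypothetical small config: hidden_size 312 with 12 heads gives a head size of 26.
config = BertConfig(30522, hidden_size=312, num_attention_heads=12,
                    intermediate_size=1200)
layer = BertLayer(config)

batch_size, seq_len = 32, 300
hidden_states = torch.randn(batch_size, seq_len, 312)
# Extended attention mask as BertModel.forward() would prepare it:
# 0.0 for real tokens, -10000.0 for padding; here no padding at all.
attention_mask = torch.zeros(batch_size, 1, 1, seq_len)

layer_output = layer(hidden_states, attention_mask)
print(layer_output.shape)  # torch.Size([32, 300, 312])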
BertAttention
The self-attention in BERT follows the self-attention mechanism proposed in "Attention Is All You Need".
In the figure below, the left panel shows Scaled Dot-Product Attention: one (Q, K, V) triple applies one transformation to the input. Multi-Head Attention (right panel) uses h (Q, K, V) triples at the same time, so the input is transformed in several different ways and several kinds of features are extracted; the outputs of the h heads are then concatenated. Each head can run its forward pass independently of the others, so the heads can be packed into matrices and computed in parallel on the GPU, which speeds things up considerably. The detailed processing is analyzed in the code below.
Figure 2: Scaled Dot-Product Attention and Multi-Head Attention
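To make the left panel concrete, here is a minimal sketch of scaled dot-product attention, softmax(Q K^T / sqrt(d_k)) V, for a single head on a single sequence (the shapes follow the running example used below):

import math
import torch

d_k = 26                     # attention_head_size
Q = torch.randn(300, d_k)    # [seq_len, d_k]
K = torch.randn(300, d_k)
V = torch.randn(300, d_k)

scores = Q @ K.transpose(-1, -2) / math.sqrt(d_k)  # [300, 300]
probs = torch.softmax(scores, dim=-1)              # each row sums to 1
output = probs @ V                                 # [300, 26]
print(output.shape)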
BertSelfAttention
class BertSelfAttention(nn.Module):
    def __init__(self, config):
        super(BertSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        # Worth highlighting: query, key and value are all obtained by applying a fully
        # connected layer to the input. The input has config.hidden_size features and the
        # output has self.all_head_size features (which equals config.hidden_size), so each
        # weight matrix is [config.hidden_size, config.hidden_size].
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        # Reshape for multi-head attention. Example: batch_size 32, seq_len 300,
        # num_attention_heads 12, attention_head_size 26 (12 * 26 = 312, i.e. hidden_size is 312).
        # new_x_shape: [batch_size, seq_len, hidden_size] ->
        # [batch_size, seq_len, num_attention_heads, attention_head_size],
        # i.e. [32, 300, 312] becomes [32, 300, 12, 26].
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        # Reshape the tensor
        x = x.view(*new_x_shape)
        # Permute the dimensions: [32, 300, 12, 26] -> [32, 12, 300, 26],
        # i.e. [batch_size, num_attention_heads, seq_len, attention_head_size]
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask):
        # Shape: [batch_size, seq_len, embedding_size] -> [batch_size, seq_len, embedding_size]
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)
        # [batch_size, seq_len, embedding_size] -> [batch_size, num_attention_heads, seq_len, attention_head_size]
        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)
        # Take the dot product between "query" and "key" to get the raw attention scores.
        # query_layer [32, 12, 300, 26], key_layer.transpose(-1, -2): [32, 12, 26, 300],
        # attention_scores: [32, 12, 300, 300]
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        # Divide by the square root of the inner dimension of the matmul,
        # i.e. sqrt(attention_head_size) = sqrt(26) in this example.
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Apply the attention mask (precomputed for all layers in BertModel's forward() function).
        # In attention_mask, positions holding real tokens have the value 0 and positions padded
        # up to max_seq have the value -10000. After the addition, the padded positions become
        # very negative, so the softmax below maps them to values close to 0.
        attention_scores = attention_scores + attention_mask
        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)
        # Attention matrix attention_probs [32, 12, 300, 300] times value_layer [32, 12, 300, 26]
        # gives context_layer with shape [32, 12, 300, 26].
        context_layer = torch.matmul(attention_probs, value_layer)
        # Permute the dimensions: [32, 12, 300, 26] -> [32, 300, 12, 26]
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        # Target output shape: [32, 300, 312]
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        # Merge the head dimensions back into hidden_size
        context_layer = context_layer.view(*new_context_layer_shape)
        return context_layer
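To see the head-splitting step in isolation, here is a small standalone sketch that reproduces what transpose_for_scores does to the shapes (32/300/12/26 are just the running example numbers):

import torch

batch_size, seq_len, num_heads, head_size = 32, 300, 12, 26
hidden_size = num_heads * head_size   # 312

x = torch.randn(batch_size, seq_len, hidden_size)
# Split hidden_size into (num_heads, head_size) ...
x = x.view(batch_size, seq_len, num_heads, head_size)  # [32, 300, 12, 26]
# ... then move the head dimension next to the batch dimension.
x = x.permute(0, 2, 1, 3)                               # [32, 12, 300, 26]
print(x.shape)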
This differs slightly from the right panel of Figure 2 above: the Linear that follows the concat is placed in the BertSelfOutput layer, i.e. the concatenated output becomes the input of BertSelfOutput. BertSelfOutput then applies Linear -> Dropout -> Add -> BertLayerNorm.
class BertSelfOutput(nn.Module):
    def __init__(self, config):
        super(BertSelfOutput, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        # Fully connected layer
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        # Residual (ResNet-style) connection: hidden_states (the self-attention output after one
        # more linear transform) + input_tensor (the input of the self-attention layer).
        # The sum is then passed through LayerNorm; see the earlier post for an introduction to LayerNorm.
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
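A toy sketch of the Add & Norm step, using torch.nn.LayerNorm as a stand-in for BertLayerNorm (treating the two as equivalent here is an assumption, not the original class):

import torch
import torch.nn as nn

hidden_size = 312
layer_norm = nn.LayerNorm(hidden_size, eps=1e-12)   # stand-in for BertLayerNorm

input_tensor = torch.randn(32, 300, hidden_size)    # input of the sub-layer
hidden_states = torch.randn(32, 300, hidden_size)   # output of the sub-layer's dense + dropout

out = layer_norm(hidden_states + input_tensor)      # Add & Norm
print(out.shape, out[0, 0].mean().item())           # mean over the last dim is ~0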
BertAttention
class BertAttention(nn.Module):
    def __init__(self, config):
        super(BertAttention, self).__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        # Run BertSelfAttention.forward() described above; self_output has shape
        # [batch_size, seq_len, embedding_size]
        self_output = self.self(input_tensor, attention_mask)
        # Run BertSelfOutput.forward() described above; attention_output has shape
        # [batch_size, seq_len, embedding_size]
        attention_output = self.output(self_output, input_tensor)
        return attention_output
BertIntermediate
This corresponds to the Feed Forward block in Figure 1.
class BertIntermediate(nn.Module):
    def __init__(self, config):
        super(BertIntermediate, self).__init__()
        # A projection matrix of shape [config.hidden_size, config.intermediate_size];
        # config.hidden_size is the embedding_size.
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # Pick the activation function
        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states):
        # Fully connected layer: [batch_size, seq_len, embedding_size] -> [batch_size, seq_len, intermediate_size]
        hidden_states = self.dense(hidden_states)
        # Activation (gelu)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states
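For reference, the "gelu" entry of ACT2FN in modeling.py uses the erf formulation; a standalone copy of that function:

import math
import torch

def gelu(x):
    # GELU as defined in pytorch_pretrained_bert/modeling.py:
    # x * 0.5 * (1 + erf(x / sqrt(2)))
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

x = torch.linspace(-3, 3, 7)
print(gelu(x))  # ~0 for large negative inputs, ~x for large positive inputs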
BertOutput
The Add & Norm step of Figure 1 lives in this layer. BertOutput is very similar to BertSelfOutput; only the shape of the projection matrix differs: [embedding_size, embedding_size] in BertSelfOutput versus [intermediate_size, hidden_size] in BertOutput.
class BertOutput(nn.Module):
    def __init__(self, config):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        # Add & Norm
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
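Putting BertIntermediate and BertOutput together, the feed-forward half of the layer first expands and then shrinks the hidden dimension. A shape-only sketch with the running example (hidden_size 312; the intermediate_size of 1200 and the use of nn.GELU from newer PyTorch are assumptions for illustration):

import torch
import torch.nn as nn

hidden_size, intermediate_size = 312, 1200
attention_output = torch.randn(32, 300, hidden_size)

intermediate = nn.Sequential(nn.Linear(hidden_size, intermediate_size), nn.GELU())
output_dense = nn.Linear(intermediate_size, hidden_size)
layer_norm = nn.LayerNorm(hidden_size, eps=1e-12)

intermediate_output = intermediate(attention_output)   # [32, 300, 1200]
layer_output = layer_norm(output_dense(intermediate_output) + attention_output)  # [32, 300, 312]
print(intermediate_output.shape, layer_output.shape)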
Other notes
I'm still new to PyTorch, and while reading the code I wasn't sure how a few functions behave, so I wrote some small test snippets; they are recorded here.
permute test:
x = torch.randn(2, 3)
print(x)
'''
Input:
tensor([[ 1.0246, -0.5259, -1.9292],
[-0.4087, -0.0218, 0.5222]])
'''
x = x.permute(1, 0)
print(x)
'''
Output:
tensor([[ 1.0246, -0.4087],
[-0.5259, -0.0218],
[-1.9292, 0.5222]])
'''
Tensor addition follows PyTorch's broadcasting rules, so the shapes only need to be compatible rather than identical; here a [1, 2] tensor is broadcast along dim 0 and added to a [2, 2] tensor:
x = torch.randn(2, 2)
print(x)
'''
tensor([[ 0.0062, 0.5476],
[-1.5412, -0.4478]])
'''
y = torch.randn(1, 2)
print(y)
'''
tensor([[-0.3993, -1.3920]])
'''
z = x + y
print(z)
'''
tensor([[-0.3930, -0.8444],
[-1.9405, -1.8398]])
'''
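For contrast, shapes that are not broadcast-compatible raise an error:

import torch

x = torch.randn(2, 2)
y = torch.randn(1, 3)   # trailing dimension 3 does not match 2
try:
    x + y
except RuntimeError as e:
    print("not broadcastable:", e)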
view
view behaves much like reshape, but it only works on contiguous memory, which is why BertSelfAttention calls .contiguous() before .view().
contiguous: transpose (and permute) do not modify the underlying one-dimensional storage; they only create new tensor metadata (strides), so t and t2 below share the same data_ptr. Calling .contiguous() copies the data into a fresh contiguous block, so t3 gets a new data_ptr.
t = torch.arange(12).reshape(3, 4)
print(t)
'''
tensor([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
'''
t2 = t.transpose(0, 1)
print(t2)
'''
tensor([[ 0, 4, 8],
[ 1, 5, 9],
[ 2, 6, 10],
[ 3, 7, 11]])
'''
print(t.data_ptr() == t2.data_ptr())
'''
True
'''
t3 = t2.contiguous()
print(t3)
'''
tensor([[ 0, 4, 8],
[ 1, 5, 9],
[ 2, 6, 10],
[ 3, 7, 11]])
'''
print(t3.data_ptr() == t2.data_ptr())
'''
False
'''
print(t.data_ptr() == t3.data_ptr())
'''
False
'''
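This is exactly why BertSelfAttention calls .contiguous() before .view(): view() refuses to work on a non-contiguous tensor, whereas reshape() (or contiguous().view()) handles the copy for you:

import torch

t2 = torch.arange(12).reshape(3, 4).transpose(0, 1)  # non-contiguous view
try:
    t2.view(12)
except RuntimeError as e:
    print("view failed:", e)

print(t2.contiguous().view(12))  # works after copying to contiguous memory
print(t2.reshape(12))            # reshape performs the copy automatically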
dense
As you can see, every dense here is an nn.Linear, i.e. a plain (learned) affine transformation.
class Linear(Module):
    __constants__ = ['in_features', 'out_features']
    in_features: int
    out_features: int
    weight: Tensor

    def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self) -> None:
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input: Tensor) -> Tensor:
        return F.linear(input, self.weight, self.bias)

    def extra_repr(self) -> str:
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )
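Note that the weight is stored as [out_features, in_features], so F.linear computes input @ weight.T + bias. A quick equivalence check:

import torch
import torch.nn as nn

linear = nn.Linear(312, 312)
x = torch.randn(32, 300, 312)

y1 = linear(x)
y2 = x @ linear.weight.T + linear.bias    # the same affine transformation, written out
print(torch.allclose(y1, y2, atol=1e-5))  # True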
The TensorFlow version of dense is slightly different: it takes an activation parameter that specifies which activation function (the network's non-linearity) to apply; it defaults to None, i.e. no activation.
tf.layers.dense
tf.layers.dense(
    inputs,
    units,
    activation=None,
    use_bias=True,
    kernel_initializer=None,
    bias_initializer=tf.zeros_initializer(),
    kernel_regularizer=None,
    bias_regularizer=None,
    activity_regularizer=None,
    kernel_constraint=None,
    bias_constraint=None,
    trainable=True,
    name=None,
    reuse=None
)