Understanding Parts of the Transformer from Its Source Code

The source code comes from

https://github.com/graykode/nlp-tutorial

Positional Encoding

  • Principle

    For position pos and dimension index i of the d_model-dimensional vector:
    PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

  • Source code

    import math
    import torch
    import torch.nn as nn

    class PositionalEncoding(nn.Module):
        def __init__(self, d_model, dropout=0.1, max_len=5000):
            super(PositionalEncoding, self).__init__()
            self.dropout = nn.Dropout(p=dropout)

            pe = torch.zeros(max_len, d_model)
            position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
            # equals 1 / 10000^(2i / d_model): take the log first, then exp -- log and exp are
            # inverse functions and cancel; the minus sign in the exponent gives the reciprocal,
            # e.g. 2^(-4) = 1 / 2^4
            div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
            pe[:, 0::2] = torch.sin(position * div_term)  # position * div_term has shape [max_len, d_model/2]
            pe[:, 1::2] = torch.cos(position * div_term)
            # [seq_len, 1, d_model]; the transpose puts batch on dim 1 (pe's batch size is 1), so the
            # same position in every sentence of a batch gets the same encoding added -- e.g. for
            # [[1, 2, 3], [4, 5, 6]] the pairs 1/4, 2/5, 3/6 share a position -- while the d_model
            # dimensions of a single token each receive a different value
            pe = pe.unsqueeze(0).transpose(0, 1)
            self.register_buffer('pe', pe)

        def forward(self, x):
            '''
            x: [seq_len, batch_size, d_model]
            '''
            x = x + self.pe[:x.size(0), :]
            return self.dropout(x)
    
  • Understanding

    • div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

      The expression can be rewritten step by step:

      exp(2i * (-ln 10000 / d_model)) = exp(ln(10000^(-2i / d_model))) = 10000^(-2i / d_model) = 1 / 10000^(2i / d_model)
  • pe = pe.unsqueeze(0).transpose(0, 1)

    together with the input

    x: [seq_len, batch_size, d_model]

    can be understood as: the same position in every sentence of a batch receives the same positional value

  • Different dimensions of the same token's d_model vector receive different values

    pe[:, 0::2] = torch.sin(position * div_term) # position * div_term has shape [max_len, d_model/2]

    pe[:, 1::2] = torch.cos(position * div_term)

    • Plot the positional values of the first few d_model dimensions (a sketch of how pe.npy could be produced follows the figure below)
      import numpy as np
      import matplotlib.pyplot as plt

      plt.rcParams['font.sans-serif'] = ['SimHei']  # use SimHei so non-ASCII labels render correctly
      plt.rcParams['axes.unicode_minus'] = False

      pe = np.load('./pe.npy')
      x = np.array(range(0, 30))  # positions 0 through 29
      y0 = pe[0:30, 0]  # dimension 0 of the encoding at positions 0-29
      y1 = pe[0:30, 1]  # dimension 1
      y2 = pe[0:30, 2]  # dimension 2
      y3 = pe[0:30, 3]  # dimension 3
      plt.plot(x, y0, label='d_model0')
      plt.plot(x, y1, label='d_model1')
      plt.plot(x, y2, label='d_model2')
      plt.plot(x, y3, label='d_model3')
      plt.legend()
      plt.show()
      
(figure d_model.png: sin/cos curves of positional-encoding dimensions 0-3 over positions 0-29)
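
The plotting snippet loads a pe.npy file that is not produced anywhere in the code above. Here is a minimal sketch of how it could be generated from the PositionalEncoding class defined earlier; d_model=512 and the file name './pe.npy' are assumptions, not taken from the repository:

    import numpy as np
    import torch

    # Hypothetical helper: dump the positional-encoding buffer so the plotting code can load it.
    pos_enc = PositionalEncoding(d_model=512)   # class from the source-code block above
    pe = pos_enc.pe.squeeze(1).numpy()          # [max_len, 1, d_model] -> [max_len, d_model]
    np.save('./pe.npy', pe)

    # Typical usage: add the encoding right after the embedding lookup.
    x = torch.zeros(5, 2, 512)                  # [seq_len, batch_size, d_model]
    print(pos_enc(x).shape)                     # torch.Size([5, 2, 512])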

get_attn_pad_mask(seq_q, seq_k)

  • My take: the input sequences are padded to a fixed length, so this function masks out the useless PAD (0) tokens (a sketch of the full function appears after the walkthrough below)
  • Code walkthrough
    • Encoder stage (enc_input = seq_q = seq_k; positions of seq_k that are PAD (0) are set to True, all other positions to False, and the mask is then expanded to len(seq_q))

      (1)
      # enc_input           dec_input         dec_output
      ['ich mochte ein bier P', 'S i want a beer .', 'i want a beer . E'],
      ['ich mochte ein cola P', 'S i want a coke .', 'i want a coke . E']
      
      src_vocab = {'P' : 0, 'ich' : 1, 'mochte' : 2, 'ein' : 3, 'bier' : 4, 'cola' : 5}
      tgt_vocab = {'P' : 0, 'i' : 1, 'want' : 2, 'a' : 3, 'beer' : 4, 'coke' : 5, 'S' : 6, 'E' : 7, '.' : 8}
      
      (2)
      seq_k = seq_q = array([[1, 2, 3, 5, 0],
                             [1, 2, 3, 4, 0]])
      
      (3)
      # eq(zero) is PAD token
      pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # [batch_size, 1, len_k], True marks positions to be masked
      
      Value of pad_attn_mask:
      array([[[False, False, False, False,  True]],
            [[False, False, False, False,  True]]])  # [batch_size, 1, len_k], True marks positions to be masked
      
      (4) Expand to length len(seq_q)
      Value of pad_attn_mask.expand(batch_size, len_q, len_k):
      array([[[False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True]],
      
         [[False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True]]])  # [batch_size, len_q, len_k]
      
    • Decoder stage 1, self-attention (dec_input = seq_q = seq_k; positions of seq_k that are PAD (0) are set to True, all other positions to False, and the mask is then expanded to len(seq_q))

      (1)
      # enc_input           dec_input         dec_output
      ['ich mochte ein bier P', 'S i want a beer .', 'i want a beer . E'],
      ['ich mochte ein cola P', 'S i want a coke .', 'i want a coke . E']
      
      src_vocab = {'P' : 0, 'ich' : 1, 'mochte' : 2, 'ein' : 3, 'bier' : 4, 'cola' : 5}
      tgt_vocab = {'P' : 0, 'i' : 1, 'want' : 2, 'a' : 3, 'beer' : 4, 'coke' : 5, 'S' : 6, 'E' : 7, '.' : 8}
      
      (2)
      seq_k = seq_q = array([[6, 1, 2, 3, 4, 8],
                            [6, 1, 2, 3, 5, 8]])
      
      (3)
      # eq(zero) is PAD token
      pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # [batch_size, 1, len_k], True marks positions to be masked
      
      Value of pad_attn_mask:
      array([[[False, False, False, False, False, False]],
            [[False, False, False, False, False, False]]])  # [batch_size, 1, len_k], no PAD here, so nothing is masked
      
      (4) Expand to length len(seq_q)
      Value of pad_attn_mask.expand(batch_size, len_q, len_k):
      array([[[False, False, False, False, False, False],
          [False, False, False, False, False, False],
          [False, False, False, False, False, False],
          [False, False, False, False, False, False],
          [False, False, False, False, False, False],
          [False, False, False, False, False, False]],
      
        [[False, False, False, False, False, False],
          [False, False, False, False, False, False],
          [False, False, False, False, False, False],
          [False, False, False, False, False, False],
          [False, False, False, False, False, False],
          [False, False, False, False, False, False]]])  # [batch_size, len_q, len_k]
      
    • Decoder stage 2, encoder-decoder attention (dec_input = seq_q, enc_inputs = seq_k; positions of seq_k that are PAD (0) are set to True, all other positions to False, and the mask is then expanded to len(seq_q))

      (1)
      # enc_input           dec_input         dec_output
      ['ich mochte ein bier P', 'S i want a beer .', 'i want a beer . E'],
      ['ich mochte ein cola P', 'S i want a coke .', 'i want a coke . E']
      
      src_vocab = {'P' : 0, 'ich' : 1, 'mochte' : 2, 'ein' : 3, 'bier' : 4, 'cola' : 5}
      tgt_vocab = {'P' : 0, 'i' : 1, 'want' : 2, 'a' : 3, 'beer' : 4, 'coke' : 5, 'S' : 6, 'E' : 7, '.' : 8}
      
      (2)
      seq_q = array([[6, 1, 2, 3, 4, 8],
                     [6, 1, 2, 3, 5, 8]])
      seq_k = array([[1, 2, 3, 4, 0],
                     [1, 2, 3, 5, 0]])
      
      (3)
      # eq(zero) is PAD token
      pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # [batch_size, 1, len_k], True marks positions to be masked
      
      Value of pad_attn_mask:
      array([[[False, False, False, False,  True]],
             [[False, False, False, False,  True]]])  # [batch_size, 1, len_k], True marks positions to be masked
      
      (4) Expand to length len(seq_q)
      Value of pad_attn_mask.expand(batch_size, len_q, len_k):
      array([[[False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True]],
      
         [[False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True]]])  # [batch_size, len_q, len_k]
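
  • Putting the three walkthroughs together, here is a sketch of the full function, consistent with the steps above (treat the exact body as a reconstruction rather than a verbatim copy of the repository):

    import torch

    def get_attn_pad_mask(seq_q, seq_k):
        batch_size, len_q = seq_q.size()
        batch_size, len_k = seq_k.size()
        # eq(0) marks PAD tokens: [batch_size, 1, len_k], True = position to be masked
        pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)
        # broadcast the same row to every query position: [batch_size, len_q, len_k]
        return pad_attn_mask.expand(batch_size, len_q, len_k)

    enc_inputs = torch.tensor([[1, 2, 3, 5, 0], [1, 2, 3, 4, 0]])
    print(get_attn_pad_mask(enc_inputs, enc_inputs).shape)  # torch.Size([2, 5, 5])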
      

get_attn_subsequence_mask(seq)

  • Masks the future (downstream) tokens of the sequence

  • Returns an upper-triangular matrix of shape [batch_size, tgt_len, tgt_len]

  • Returned value (used only in the decoder, with seq = dec_inputs); a sketch of the function follows the tensor below

    tensor([[[0, 1, 1, 1, 1, 1],
           [0, 0, 1, 1, 1, 1],
           [0, 0, 0, 1, 1, 1],
           [0, 0, 0, 0, 1, 1],
           [0, 0, 0, 0, 0, 1],
           [0, 0, 0, 0, 0, 0]],
    
          [[0, 1, 1, 1, 1, 1],
           [0, 0, 1, 1, 1, 1],
           [0, 0, 0, 1, 1, 1],
           [0, 0, 0, 0, 1, 1],
           [0, 0, 0, 0, 0, 1],
           [0, 0, 0, 0, 0, 0]]], dtype=torch.uint8)
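
  • A sketch consistent with the tensor above, built with np.triu (a reconstruction, not a verbatim copy of the repository). The decoder self-attention mask then combines this with the pad mask; the combination shown here is a hedged example that reuses get_attn_pad_mask from the previous section:

    import numpy as np
    import torch

    def get_attn_subsequence_mask(seq):
        # seq: [batch_size, tgt_len]; 1s strictly above the diagonal mark future positions
        attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
        subsequence_mask = np.triu(np.ones(attn_shape), k=1)
        return torch.from_numpy(subsequence_mask).byte()  # [batch_size, tgt_len, tgt_len]

    # Decoder self-attention mask: a position is masked if it is PAD or lies in the future.
    dec_inputs = torch.tensor([[6, 1, 2, 3, 4, 8], [6, 1, 2, 3, 5, 8]])
    dec_self_attn_mask = (get_attn_pad_mask(dec_inputs, dec_inputs).byte()
                          + get_attn_subsequence_mask(dec_inputs)) > 0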
    

    class ScaledDotProductAttention(nn.Module)

    • Fixed hyperparameters
      • d_k == d_v, configured in advance
      • n_heads, configured in advance
      • d_model, configured in advance; the embedding size of each token
      • Usually d_k * n_heads == d_model
    • Where the Q, K, V arguments come from (these projections live in MultiHeadAttention, which then calls ScaledDotProductAttention)
      self.W_Q = nn.Linear(d_model, d_k * n_heads, bias=False)
      self.W_K = nn.Linear(d_model, d_k * n_heads, bias=False)
      self.W_V = nn.Linear(d_model, d_v * n_heads, bias=False)
      
      Q = self.W_Q(input_Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # Q: [batch_size, n_heads, len_q, d_k]
      K = self.W_K(input_K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # K: [batch_size, n_heads, len_k, d_k]
      V = self.W_V(input_V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # V: [batch_size, n_heads, len_v(=len_k), d_v]
      
    • The attn_mask argument
      • In the encoder it comes from get_attn_pad_mask(seq_q, seq_k)
      • In the decoder it comes from get_attn_pad_mask(seq_q, seq_k) combined with get_attn_subsequence_mask(seq)
    • Returned attn, with PAD (0) positions given weight 0 (a sketch of the attention computation follows the tensors below)
    tensor([[[[0.2763, 0.1676, 0.3137, 0.2424, 0.0000],
            [0.1488, 0.2446, 0.2804, 0.3262, 0.0000],
            [0.1457, 0.2651, 0.2659, 0.3232, 0.0000],
            [0.1611, 0.3513, 0.2042, 0.2834, 0.0000],
            [0.2473, 0.2574, 0.2483, 0.2470, 0.0000]],
    
           [[0.2334, 0.3264, 0.2237, 0.2165, 0.0000],
            [0.1732, 0.4632, 0.1530, 0.2107, 0.0000],
            [0.1774, 0.4091, 0.1821, 0.2315, 0.0000],
            [0.2029, 0.2564, 0.2512, 0.2895, 0.0000],
            [0.1930, 0.3559, 0.2416, 0.2095, 0.0000]]],
    
    
          [[[0.2988, 0.1964, 0.2787, 0.2261, 0.0000],
            [0.1587, 0.3660, 0.3240, 0.1514, 0.0000],
            [0.1618, 0.3244, 0.2693, 0.2445, 0.0000],
            [0.2168, 0.3195, 0.2736, 0.1901, 0.0000],
            [0.1520, 0.3054, 0.2551, 0.2874, 0.0000]],
    
           [[0.2571, 0.3385, 0.2586, 0.1458, 0.0000],
            [0.2371, 0.4731, 0.1560, 0.1339, 0.0000],
            [0.2473, 0.3912, 0.1984, 0.1632, 0.0000],
            [0.2586, 0.2994, 0.2078, 0.2342, 0.0000],
            [0.2261, 0.3706, 0.1942, 0.2091, 0.0000]]]], device='cuda:0',
         grad_fn=<SoftmaxBackward0>)
    
    • Returned attn for the decoder self-attention (look-ahead mask applied): each position can only attend to itself and earlier positions, so the upper-triangular entries are 0
    tensor([[[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
            [0.5283, 0.4717, 0.0000, 0.0000, 0.0000, 0.0000],
            [0.3347, 0.1790, 0.4864, 0.0000, 0.0000, 0.0000],
            [0.3931, 0.3598, 0.1147, 0.1324, 0.0000, 0.0000],
            [0.2269, 0.2316, 0.0421, 0.0928, 0.4066, 0.0000],
            [0.1697, 0.1361, 0.1696, 0.2551, 0.1201, 0.1493]],
    
           [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
            [0.4688, 0.5312, 0.0000, 0.0000, 0.0000, 0.0000],
            [0.2173, 0.4401, 0.3426, 0.0000, 0.0000, 0.0000],
            [0.0493, 0.1505, 0.0469, 0.7534, 0.0000, 0.0000],
            [0.0564, 0.1002, 0.0642, 0.4745, 0.3047, 0.0000],
            [0.1956, 0.1259, 0.1935, 0.0706, 0.0597, 0.3547]]],
    
    
          [[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
            [0.4690, 0.5310, 0.0000, 0.0000, 0.0000, 0.0000],
            [0.3483, 0.1474, 0.5043, 0.0000, 0.0000, 0.0000],
            [0.3429, 0.4056, 0.0692, 0.1823, 0.0000, 0.0000],
            [0.3588, 0.2282, 0.0887, 0.1288, 0.1956, 0.0000],
            [0.1614, 0.1630, 0.1179, 0.2421, 0.1809, 0.1348]],
    
           [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
            [0.4932, 0.5068, 0.0000, 0.0000, 0.0000, 0.0000],
            [0.2127, 0.4520, 0.3353, 0.0000, 0.0000, 0.0000],
            [0.0308, 0.0730, 0.0270, 0.8691, 0.0000, 0.0000],
            [0.0171, 0.0461, 0.0254, 0.4332, 0.4783, 0.0000],
            [0.2427, 0.1489, 0.2457, 0.0840, 0.0753, 0.2035]]]], device='cuda:0',
         grad_fn=<SoftmaxBackward0>)
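
    • The tensors above are produced by the softmax inside the attention module, applied to the already-projected Q, K, V shown earlier. A minimal sketch of the computation (a reconstruction of standard scaled dot-product attention; the repository reads d_k from a global config, here it is taken from Q's last dimension):

      import torch
      import torch.nn as nn

      class ScaledDotProductAttention(nn.Module):
          def forward(self, Q, K, V, attn_mask):
              # Q: [batch_size, n_heads, len_q, d_k], K: [..., len_k, d_k], V: [..., len_k, d_v]
              d_k = Q.size(-1)
              scores = torch.matmul(Q, K.transpose(-1, -2)) / d_k ** 0.5  # [batch_size, n_heads, len_q, len_k]
              # True entries of attn_mask (PAD and/or future positions) get a large negative score,
              # so their softmax weight becomes ~0 -- the zero columns in the attn tensors above.
              scores = scores.masked_fill(attn_mask, -1e9)
              attn = nn.Softmax(dim=-1)(scores)
              context = torch.matmul(attn, V)  # [batch_size, n_heads, len_q, d_v]
              return context, attn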
    