一、BertModel主入口
class BertModel(object): """BERT model ("Bidirectional Encoder Representations from Transformers"). Example usage: ```python # Already been converted into WordPiece token ids input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) config = modeling.BertConfig(vocab_size=32000, hidden_size=512, num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) model = modeling.BertModel(config=config, is_training=True, input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) label_embeddings = tf.get_variable(...) pooled_output = model.get_pooled_output() logits = tf.matmul(pooled_output, label_embeddings) ... ``` """ def __init__(self, config, # BertConfig對象 is_training, input_ids, # 【batch_size, seq_length】 input_mask=None, # 【batch_size, seq_length】 token_type_ids=None, # 【batch_size, seq_length】 use_one_hot_embeddings=False, # 是否使用one-hot;否則tf.gather() scope=None): config = copy.deepcopy(config) if not is_training: config.hidden_dropout_prob = 0.0 config.attention_probs_dropout_prob = 0.0 input_shape = get_shape_list(input_ids, expected_rank=2) batch_size = input_shape[0] seq_length = input_shape[1] # 不做mask,即所有元素為1 if input_mask is None: input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) if token_type_ids is None: token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) with tf.variable_scope(scope, default_name="bert"): with tf.variable_scope("embeddings"): # word embedding,首先可以隨機初始化每個詞的embedding,通過訓練最后得出具有上下文關系的詞向量,Transformer的最后輸出就是每個詞的詞向量。 (self.embedding_output, self.embedding_table) = embedding_lookup( input_ids=input_ids, vocab_size=config.vocab_size, embedding_size=config.hidden_size, #這里詞向量的維度就設置為hidden_size,也即論文中的H initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) # 添加position embedding和segment embedding,layer norm + 
dropout self.embedding_output = embedding_postprocessor( input_tensor=self.embedding_output, use_token_type=True, token_type_ids=token_type_ids, token_type_vocab_size=config.type_vocab_size, token_type_embedding_name="token_type_embeddings", use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=config.initializer_range, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob) with tf.variable_scope("encoder"): # input_ids是經過padding的word_ids:[25, 120, 34, 0, 0],input_mask是有效詞標記:[1, 1, 1, 0, 0],這里形成一個attention_mask矩陣 attention_mask = create_attention_mask_from_input_mask( input_ids, input_mask) # transformer模塊疊加, 輸入是[batch_size, seq_length, hidden_size],輸出也是[batch_size, seq_length, hidden_size]. self.all_encoder_layers = transformer_model( input_tensor=self.embedding_output, attention_mask=attention_mask, hidden_size=config.hidden_size, num_hidden_layers=config.num_hidden_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, intermediate_act_fn=get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config.attention_probs_dropout_prob, initializer_range=config.initializer_range, do_return_all_layers=True) # `self.sequence_output`是最后一層的輸出,shape為【batch_size, seq_length, hidden_size】 self.sequence_output = self.all_encoder_layers[-1] # ‘pooler’部分將encoder輸出【batch_size, seq_length, hidden_size】,轉成【batch_size, hidden_size】 #這一部分主要是為分類任務做准備,取每個sequence的第一個位置CLS的輸出向量結果為整個句子的訓練結果向量,然后后面再加一層全連接網絡並softmax就可以做句子分類任務了。 #其實上面的sequence_output生成的是每個token的詞向量,可以認為是bert論文中Mask LM任務的結果;pooled_output是Next sentence prdict任務的結果。 with tf.variable_scope("pooler"): # 取最終輸出結果層的第一個位置[CLS]對應的tensor, 對於分類任務很重要,sequence_output[:, 0:1, :]得到的是[batch_size, 1, hidden_size],我們需要用squeeze把第二維去掉 first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) # 然后再加一個全連接層,輸出仍然是[batch_size, hidden_size] 
self.pooled_output = tf.layers.dense( first_token_tensor, config.hidden_size, activation=tf.tanh, kernel_initializer=create_initializer(config.initializer_range))
總結:Bert的輸出最終有兩個結果可用
sequence_output:維度【batch_size, seq_length, hidden_size】,這是訓練后每個token的詞向量。
pooled_output:維度是【batch_size, hidden_size】,每個sequence第一個位置CLS的向量輸出,用於分類任務。
class BertConfig(object):
  """Configuration for `BertModel`."""

  def __init__(self,
               vocab_size,
               hidden_size=768,
               num_hidden_layers=12,
               num_attention_heads=12,
               intermediate_size=3072,
               hidden_act="gelu",
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               max_position_embeddings=512,
               type_vocab_size=16,
               initializer_range=0.02):
    """Constructs BertConfig.

    Args:
      vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
      hidden_size: Size of the encoder layers and the pooler layer.
      num_hidden_layers: Number of hidden layers in the Transformer encoder.
      num_attention_heads: Number of attention heads for each attention layer
        in the Transformer encoder.
      intermediate_size: The size of the "intermediate" (i.e., feed-forward)
        layer in the Transformer encoder.
      hidden_act: The non-linear activation function (function or string) in
        the encoder and pooler.
      hidden_dropout_prob: The dropout probability for all fully connected
        layers in the embeddings, encoder, and pooler.
      attention_probs_dropout_prob: The dropout ratio for the attention
        probabilities.
      max_position_embeddings: The maximum sequence length that this model
        might ever be used with. Typically set this to something large just
        in case (e.g., 512 or 1024 or 2048).
      type_vocab_size: The vocabulary size of the `token_type_ids` passed into
        `BertModel`.
      initializer_range: The stdev of the truncated_normal_initializer for
        initializing all weight matrices.
    """
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range

  @classmethod
  def from_dict(cls, json_object):
    """Constructs a `BertConfig` from a Python dictionary of parameters."""
    # Use `cls` rather than hard-coding `BertConfig` so that subclasses
    # deserialize to their own type.
    config = cls(vocab_size=None)
    # Plain dict.items() works on both Python 2 and 3; no need for
    # six.iteritems here.
    for (key, value) in json_object.items():
      config.__dict__[key] = value
    return config

  @classmethod
  def from_json_file(cls, json_file):
    """Constructs a `BertConfig` from a json file of parameters."""
    with tf.gfile.GFile(json_file, "r") as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text))

  def to_dict(self):
    """Serializes this instance to a Python dictionary."""
    output = copy.deepcopy(self.__dict__)
    return output

  def to_json_string(self):
    """Serializes this instance to a JSON string."""
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
- vocab_size:詞表大小
- hidden_size:隱藏層神經元數,可以理解為dmodel,即單個Transformer block第一層(輸入層后面鏈接的層)和最后一層(輸出層)的節點數,對應於論文中的H
- num_hidden_layers:Transformer 的層數,對應於論文中的L
- num_attention_heads:multi-head attention 的 head 數,對應於論文中的A
- intermediate_size:encoder 的“中間”隱層神經元數(例如 feed-forward layer),對應於論文中的4H。
- hidden_act:隱藏層激活函數
- hidden_dropout_prob:隱層 dropout 率
- attention_probs_dropout_prob:注意力部分的 dropout
- max_position_embeddings:最大位置編碼
- type_vocab_size:token_type_ids 的詞典大小
- initializer_range:truncated_normal_initializer 初始化方法的 stdev
- 這里要注意一點,可能剛看的時候對type_vocab_size這個參數會有點不理解,其實就是在next sentence prediction任務里的Segment A和 Segment B。在下載的bert_config.json文件里也有說明,默認值應該為 2。
二、獲取詞向量(Embedding_lookup)
對於輸入 word_ids,返回對應的詞向量以及整張 embedding table。查表方式可以選用 one-hot 乘法或者 tf.gather()
def embedding_lookup(input_ids, # word_id:【batch_size, seq_length】 vocab_size, embedding_size=128, initializer_range=0.02, word_embedding_name="word_embeddings", use_one_hot_embeddings=False): # 該函數默認輸入的形狀為【batch_size, seq_length, input_num】 # 如果輸入為2D的【batch_size, seq_length】,則擴展到【batch_size, seq_length, 1】 if input_ids.shape.ndims == 2: input_ids = tf.expand_dims(input_ids, axis=[-1]) embedding_table = tf.get_variable( name=word_embedding_name, shape=[vocab_size, embedding_size], initializer=create_initializer(initializer_range)) flat_input_ids = tf.reshape(input_ids, [-1]) #【batch_size*seq_length*input_num】 if use_one_hot_embeddings: one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) output = tf.matmul(one_hot_input_ids, embedding_table) else: # 按索引取值 output = tf.gather(embedding_table, flat_input_ids) input_shape = get_shape_list(input_ids) # output:[batch_size, seq_length, num_inputs] # 轉成:[batch_size, seq_length, num_inputs*embedding_size] output = tf.reshape(output, input_shape[0:-1] + [input_shape[-1] * embedding_size]) return (output, embedding_table)
這里是首先隨機初始化embedding_table,shape為[vocab_size, embedding_size],詞向量的維度是128維,也就是bert輸入層是128維,通過bert的訓練,形成最終的詞向量。所以,這里bert預訓練的過程就是詞向量形成的過程,load bert的參數就可以直接生成詞向量。
- Return:【batch_size, seq_length, embedding_size】
1) tf.gather 用法
import tensorflow as tf a = tf.Variable([[1,2,3,4,5], [6,7,8,9,10], [11,12,13,14,15]]) index_a = tf.Variable([0,2]) b = tf.Variable([1,2,3,4,5,6,7,8,9,10]) index_b = tf.Variable([2,4,6,8]) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) #從a中取出第0個和第2個索引位置的值,因為a里面的元素值都是list,所以是取出了兩個list print(sess.run(tf.gather(a, index_a))) #從b中取出索引位置為2,3,6,8的元素值。 print(sess.run(tf.gather(b, index_b))) out: # [[ 1 2 3 4 5] # [11 12 13 14 15]] # [3 5 7 9]
2)get_shape_list(tensor, expected_rank=None, name=None) list形式返回tensor的shap,並做維度校驗
tensor:一個需要返回shape的tf.Tensor
expected_rank:int或者是一個int的list。輸入tensor期望的rank(也就是維度),如果輸入tensor的維度不等於這個數或者不再這個list中,就會拋出異常。
one = tf.constant([[0.0, 0.1, 0.2],[0.0, 0.1, 0.2]])#這是一個矩陣顯然是二維數據,一個向量是一維數據,如[1,2,3] one_shape = get_shape_list(one, expected_rank=[2,3,4])#希望one的維度在2,3,4中,不超出這個范圍。 print('one_shape:',one_shape) print(one.shape.ndims) out: one_shape: [2, 3] 2
三、詞向量的后續處理(embedding_postprocessor)
我們知道 BERT 模型的輸入有三部分:token embedding
,segment embedding
以及position embedding
。上一節中我們只獲得了 token embedding,這部分代碼對其完善信息,正則化,dropout 之后輸出最終 embedding。注意,在 Transformer 論文中的position embedding
是由 sin/cos 函數生成的固定的值,而在這里代碼實現中是跟普通 word embedding 一樣隨機生成的,可以訓練的。作者這里這樣選擇的原因可能是 BERT 訓練的數據比 Transformer 那篇大很多,完全可以讓模型自己去學習。
def embedding_postprocessor(input_tensor,  # [batch_size, seq_length, embedding_size]
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,  # typically 2 in practice
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,  # must be >= max_seq_len
                            dropout_prob=0.1):
  """Adds segment (token-type) and position embeddings to `input_tensor`,
  then applies layer norm and dropout.

  Returns a tensor with the same shape as `input_tensor`:
  [batch_size, seq_length, embedding_size].

  Raises:
    ValueError: if `use_token_type` is True but `token_type_ids` is None.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  # Segment (token-type) embedding.
  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if"
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # The token-type vocabulary is small, so a one-hot matmul lookup is
    # faster here than tf.gather.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  # Position embedding.
  if use_position_embeddings:
    # Guard: seq_length must be <= max_position_embeddings.
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # Position embeddings are learned parameters of shape
      # [max_position_embeddings, width], but the actual sequence is
      # usually shorter, so slice out just [seq_length, width] to save work.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])
      num_dims = len(output.shape.as_list())

      # The word-embedding tensor is [batch_size, seq_length, width], while
      # position embeddings are input-independent with shape
      # [seq_length, width]; reshape them to [1, seq_length, width] so
      # broadcasting can add them across the batch dimension.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output
四、構造 attention_mask
因為每個樣本都經過padding了,所以一個sequence中每個詞對於占位符的位置要mask(因為pad的占位符原本是不存在的,所以置為0,表示看不到;其它位置為1),這裡就是構造每個詞的可視域矩陣attention_mask,看得到的詞就置為1,看不到的就置為0,進而帶入transformer模型中備用。
Return:將shape為[batch_size, to_seq_length]的2D mask轉換為一個shape 為[batch_size, from_seq_length, to_seq_length] 的3D mask用於attention當中。
def create_attention_mask_from_input_mask(from_tensor, to_mask):
  """Create 3D attention mask from a 2D tensor mask.

  `to_mask` is the padding mask (input_mask) and `from_tensor` is the id
  tensor (input_ids); both cover the same max_seq_length.

  Args:
    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
    to_mask: int32 Tensor of shape [batch_size, to_seq_length].

  Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
  """
  from_dims = get_shape_list(from_tensor, expected_rank=[2, 3])
  batch_size, from_seq_length = from_dims[0], from_dims[1]
  to_seq_length = get_shape_list(to_mask, expected_rank=2)[1]

  # Reshape the padding mask to [batch_size, 1, to_seq_length] as floats.
  to_mask_3d = tf.cast(
      tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

  # We don't assume `from_tensor` is a mask (though it could be). We don't
  # care whether we attend *from* padding tokens — only attending *to*
  # padding must be blocked — so the from-dimension is all ones:
  # [batch_size, from_seq_length, 1].
  from_ones = tf.ones(
      shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

  # Broadcasting the product over both singleton dimensions produces the
  # full [batch_size, from_seq_length, to_seq_length] mask.
  return from_ones * to_mask_3d
舉例:
import tensorflow as tf import six batch_size=2 to_seq_length=3 from_seq_length=3 to_mask=[[1,0,0],[1,1,0]] to_mask = tf.cast( tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) print(to_mask) broadcast_ones = tf.ones( shape=[batch_size, from_seq_length, 1], dtype=tf.float32) print(broadcast_ones) mask = broadcast_ones * to_mask mask 輸出: tf.Tensor( [[[1. 0. 0.]] [[1. 1. 0.]]], shape=(2, 1, 3), dtype=float32) tf.Tensor( [[[1.] [1.] [1.]] [[1.] [1.] [1.]]], shape=(2, 3, 1), dtype=float32) <tf.Tensor: id=63, shape=(2, 3, 3), dtype=float32, numpy= array([[[1., 0., 0.], [1., 0., 0.], [1., 0., 0.]], [[1., 1., 0.], [1., 1., 0.], [1., 1., 0.]]], dtype=float32)>