一、BertModel主入口
class BertModel(object): """BERT model ("Bidirectional Encoder Representations from Transformers"). Example usage: ```python # Already been converted into WordPiece token ids input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) config = modeling.BertConfig(vocab_size=32000, hidden_size=512, num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) model = modeling.BertModel(config=config, is_training=True, input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) label_embeddings = tf.get_variable(...) pooled_output = model.get_pooled_output() logits = tf.matmul(pooled_output, label_embeddings) ... ``` """ def __init__(self, config, # BertConfig對象 is_training, input_ids, # 【batch_size, seq_length】 input_mask=None, # 【batch_size, seq_length】 token_type_ids=None, # 【batch_size, seq_length】 use_one_hot_embeddings=False, # 是否使用one-hot;否則tf.gather() scope=None): config = copy.deepcopy(config) if not is_training: config.hidden_dropout_prob = 0.0 config.attention_probs_dropout_prob = 0.0 input_shape = get_shape_list(input_ids, expected_rank=2) batch_size = input_shape[0] seq_length = input_shape[1] # 不做mask,即所有元素為1 if input_mask is None: input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) if token_type_ids is None: token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) with tf.variable_scope(scope, default_name="bert"): with tf.variable_scope("embeddings"): # word embedding,首先可以隨機初始化每個詞的embedding,通過訓練最后得出具有上下文關系的詞向量,Transformer的最后輸出就是每個詞的詞向量。 (self.embedding_output, self.embedding_table) = embedding_lookup( input_ids=input_ids, vocab_size=config.vocab_size, embedding_size=config.hidden_size, #這里詞向量的維度就設置為hidden_size,也即論文中的H initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) # 添加position embedding和segment embedding,layer norm + 
dropout self.embedding_output = embedding_postprocessor( input_tensor=self.embedding_output, use_token_type=True, token_type_ids=token_type_ids, token_type_vocab_size=config.type_vocab_size, token_type_embedding_name="token_type_embeddings", use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=config.initializer_range, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob) with tf.variable_scope("encoder"): # input_ids是經過padding的word_ids:[25, 120, 34, 0, 0],input_mask是有效詞標記:[1, 1, 1, 0, 0],這里形成一個attention_mask矩陣 attention_mask = create_attention_mask_from_input_mask( input_ids, input_mask) # transformer模塊疊加, 輸入是[batch_size, seq_length, hidden_size],輸出也是[batch_size, seq_length, hidden_size]. self.all_encoder_layers = transformer_model( input_tensor=self.embedding_output, attention_mask=attention_mask, hidden_size=config.hidden_size, num_hidden_layers=config.num_hidden_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, intermediate_act_fn=get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config.attention_probs_dropout_prob, initializer_range=config.initializer_range, do_return_all_layers=True) # `self.sequence_output`是最后一層的輸出,shape為【batch_size, seq_length, hidden_size】 self.sequence_output = self.all_encoder_layers[-1] # ‘pooler’部分將encoder輸出【batch_size, seq_length, hidden_size】,轉成【batch_size, hidden_size】 #這一部分主要是為分類任務做准備,取每個sequence的第一個位置CLS的輸出向量結果為整個句子的訓練結果向量,然后后面再加一層全連接網絡並softmax就可以做句子分類任務了。 #其實上面的sequence_output生成的是每個token的詞向量,可以認為是bert論文中Mask LM任務的結果;pooled_output是Next sentence prdict任務的結果。 with tf.variable_scope("pooler"): # 取最終輸出結果層的第一個位置[CLS]對應的tensor, 對於分類任務很重要,sequence_output[:, 0:1, :]得到的是[batch_size, 1, hidden_size],我們需要用squeeze把第二維去掉 first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) # 然后再加一個全連接層,輸出仍然是[batch_size, hidden_size] 
self.pooled_output = tf.layers.dense( first_token_tensor, config.hidden_size, activation=tf.tanh, kernel_initializer=create_initializer(config.initializer_range))
總結:Bert的輸出最終有兩個結果可用
sequence_output:維度【batch_size, seq_length, hidden_size】,這是訓練后每個token的詞向量。
pooled_output:維度是【batch_size, hidden_size】,每個sequence第一個位置CLS的向量輸出,用於分類任務。
class BertConfig(object):
  """Configuration for `BertModel`."""

  def __init__(self,
               vocab_size,
               hidden_size=768,
               num_hidden_layers=12,
               num_attention_heads=12,
               intermediate_size=3072,
               hidden_act="gelu",
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               max_position_embeddings=512,
               type_vocab_size=16,
               initializer_range=0.02):
    """Constructs BertConfig.

    Args:
      vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
      hidden_size: Size of the encoder layers and the pooler layer.
      num_hidden_layers: Number of hidden layers in the Transformer encoder.
      num_attention_heads: Number of attention heads for each attention layer
        in the Transformer encoder.
      intermediate_size: The size of the "intermediate" (i.e., feed-forward)
        layer in the Transformer encoder.
      hidden_act: The non-linear activation function (function or string) in
        the encoder and pooler.
      hidden_dropout_prob: The dropout probability for all fully connected
        layers in the embeddings, encoder, and pooler.
      attention_probs_dropout_prob: The dropout ratio for the attention
        probabilities.
      max_position_embeddings: The maximum sequence length that this model
        might ever be used with. Typically set this to something large just
        in case (e.g., 512 or 1024 or 2048).
      type_vocab_size: The vocabulary size of the `token_type_ids` passed into
        `BertModel`.
      initializer_range: The stdev of the truncated_normal_initializer for
        initializing all weight matrices.
    """
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range

  @classmethod
  def from_dict(cls, json_object):
    """Constructs a `BertConfig` from a Python dictionary of parameters."""
    # Use `cls` rather than hard-coding `BertConfig` so that subclasses
    # deserialize to their own type.
    config = cls(vocab_size=None)
    # Plain dict.items() works on both Python 2 and 3; no need for
    # six.iteritems here.
    for (key, value) in json_object.items():
      config.__dict__[key] = value
    return config

  @classmethod
  def from_json_file(cls, json_file):
    """Constructs a `BertConfig` from a json file of parameters."""
    with tf.gfile.GFile(json_file, "r") as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text))

  def to_dict(self):
    """Serializes this instance to a Python dictionary."""
    output = copy.deepcopy(self.__dict__)
    return output

  def to_json_string(self):
    """Serializes this instance to a JSON string."""
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
- vocab_size:詞表大小
- hidden_size:隱藏層神經元數,可以理解為dmodel,即單個Transformer block第一層(輸入層后面鏈接的層)和最后一層(輸出層)的節點數,對應於論文中的H
- num_hidden_layers:Transformer 的層數,對應於論文中的L
- num_attention_heads:multi-head attention 的 head 數,對應於論文中的A
- intermediate_size:encoder 的“中間”隱層神經元數(例如 feed-forward layer),對應於論文中的4H。
- hidden_act:隱藏層激活函數
- hidden_dropout_prob:隱層 dropout 率
- attention_probs_dropout_prob:注意力部分的 dropout
- max_position_embeddings:最大位置編碼
- type_vocab_size:token_type_ids 的詞典大小
- initializer_range:truncated_normal_initializer 初始化方法的 stdev
- 這里要注意一點,可能剛看的時候對type_vocab_size這個參數會有點不理解,其實就是在next sentence prediction任務里的Segment A和 Segment B。在下載的bert_config.json文件里也有說明,默認值應該為 2。
二、獲取詞向量(Embedding_lookup)
對於輸入 word_ids,返回對應的詞向量以及整張 embedding table。查表方式可以選用 one-hot 乘法或者 tf.gather()
def embedding_lookup(input_ids, # word_id:【batch_size, seq_length】 vocab_size, embedding_size=128, initializer_range=0.02, word_embedding_name="word_embeddings", use_one_hot_embeddings=False): # 該函數默認輸入的形狀為【batch_size, seq_length, input_num】 # 如果輸入為2D的【batch_size, seq_length】,則擴展到【batch_size, seq_length, 1】 if input_ids.shape.ndims == 2: input_ids = tf.expand_dims(input_ids, axis=[-1]) embedding_table = tf.get_variable( name=word_embedding_name, shape=[vocab_size, embedding_size], initializer=create_initializer(initializer_range)) flat_input_ids = tf.reshape(input_ids, [-1]) #【batch_size*seq_length*input_num】 if use_one_hot_embeddings: one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) output = tf.matmul(one_hot_input_ids, embedding_table) else: # 按索引取值 output = tf.gather(embedding_table, flat_input_ids) input_shape = get_shape_list(input_ids) # output:[batch_size, seq_length, num_inputs] # 轉成:[batch_size, seq_length, num_inputs*embedding_size] output = tf.reshape(output, input_shape[0:-1] + [input_shape[-1] * embedding_size]) return (output, embedding_table)
這里是首先隨機初始化embedding_table,shape為[vocab_size, embedding_size],詞向量的維度是128維,也就是bert輸入層是128維,通過bert的訓練,形成最終的詞向量。所以,這里bert預訓練的過程就是詞向量形成的過程,load bert的參數就可以直接生成詞向量。
- Return:【batch_size, seq_length, embedding_size】
1) tf.gather 用法
import tensorflow as tf a = tf.Variable([[1,2,3,4,5], [6,7,8,9,10], [11,12,13,14,15]]) index_a = tf.Variable([0,2]) b = tf.Variable([1,2,3,4,5,6,7,8,9,10]) index_b = tf.Variable([2,4,6,8]) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) #從a中取出第0個和第2個索引位置的值,因為a里面的元素值都是list,所以是取出了兩個list print(sess.run(tf.gather(a, index_a))) #從b中取出索引位置為2,3,6,8的元素值。 print(sess.run(tf.gather(b, index_b))) out: # [[ 1 2 3 4 5] # [11 12 13 14 15]] # [3 5 7 9]
2)get_shape_list(tensor, expected_rank=None, name=None) list形式返回tensor的shap,並做維度校驗
tensor:一個需要返回shape的tf.Tensor
expected_rank:int或者是一個int的list。輸入tensor期望的rank(也就是維度),如果輸入tensor的維度不等於這個數或者不再這個list中,就會拋出異常。
one = tf.constant([[0.0, 0.1, 0.2],[0.0, 0.1, 0.2]])#這是一個矩陣顯然是二維數據,一個向量是一維數據,如[1,2,3] one_shape = get_shape_list(one, expected_rank=[2,3,4])#希望one的維度在2,3,4中,不超出這個范圍。 print('one_shape:',one_shape) print(one.shape.ndims) out: one_shape: [2, 3] 2
三、詞向量的后續處理(embedding_postprocessor)
我們知道 BERT 模型的輸入有三部分:token embedding
,segment embedding
以及position embedding
。上一節中我們只獲得了 token embedding,這部分代碼對其完善信息,正則化,dropout 之后輸出最終 embedding。注意,在 Transformer 論文中的position embedding
是由 sin/cos 函數生成的固定的值,而在這里代碼實現中是跟普通 word embedding 一樣隨機生成的,可以訓練的。作者這里這樣選擇的原因可能是 BERT 訓練的數據比 Transformer 那篇大很多,完全可以讓模型自己去學習。
def embedding_postprocessor(input_tensor,  # [batch_size, seq_length, embedding_size]
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,  # typically 2 in practice
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,  # must be >= max_seq_len
                            dropout_prob=0.1):
  """Adds segment (token-type) and position embeddings to `input_tensor`,
  then applies layer norm and dropout.

  Returns a tensor with the same shape as `input_tensor`:
  [batch_size, seq_length, embedding_size].

  Raises:
    ValueError: if `use_token_type` is True but `token_type_ids` is None.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  # Segment (token-type) embedding.
  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if"
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # The token-type vocabulary is small, so a one-hot matmul lookup is
    # faster here than tf.gather.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  # Position embedding.
  if use_position_embeddings:
    # Guard: seq_length must be <= max_position_embeddings.
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # Position embeddings are learned parameters of shape
      # [max_position_embeddings, width], but the actual sequence is
      # usually shorter, so slice out just [seq_length, width] to save work.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])
      num_dims = len(output.shape.as_list())

      # The word-embedding tensor is [batch_size, seq_length, width], while
      # position embeddings are input-independent with shape
      # [seq_length, width]; reshape them to [1, seq_length, width] so
      # broadcasting can add them across the batch dimension.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output
四、構造 attention_mask
因為每個樣本都經過padding了,所以一個sequence中每個詞對於占位符的位置要mask(因為pad的占位符原本是不存在的,所以置為0,表示看不到;其它位置為1),這裡就是構造每個詞的可視域矩陣attention_mask,看得到的詞就置為1,看不到的就置為0,進而帶入transformer模型中備用。
Return:將shape為[batch_size, to_seq_length]的2D mask轉換為一個shape 為[batch_size, from_seq_length, to_seq_length] 的3D mask用於attention當中。
def create_attention_mask_from_input_mask(from_tensor, to_mask):
  """Create 3D attention mask from a 2D tensor mask.

  `to_mask` is the padding mask (input_mask) and `from_tensor` is the id
  tensor (input_ids); both cover the same max_seq_length.

  Args:
    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
    to_mask: int32 Tensor of shape [batch_size, to_seq_length].

  Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
  """
  from_dims = get_shape_list(from_tensor, expected_rank=[2, 3])
  batch_size, from_seq_length = from_dims[0], from_dims[1]
  to_seq_length = get_shape_list(to_mask, expected_rank=2)[1]

  # Reshape the padding mask to [batch_size, 1, to_seq_length] as floats.
  to_mask_3d = tf.cast(
      tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

  # We don't assume `from_tensor` is a mask (though it could be). We don't
  # care whether we attend *from* padding tokens — only attending *to*
  # padding must be blocked — so the from-dimension is all ones:
  # [batch_size, from_seq_length, 1].
  from_ones = tf.ones(
      shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

  # Broadcasting the product over both singleton dimensions produces the
  # full [batch_size, from_seq_length, to_seq_length] mask.
  return from_ones * to_mask_3d
舉例:
import tensorflow as tf import six batch_size=2 to_seq_length=3 from_seq_length=3 to_mask=[[1,0,0],[1,1,0]] to_mask = tf.cast( tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) print(to_mask) broadcast_ones = tf.ones( shape=[batch_size, from_seq_length, 1], dtype=tf.float32) print(broadcast_ones) mask = broadcast_ones * to_mask mask 輸出: tf.Tensor( [[[1. 0. 0.]] [[1. 1. 0.]]], shape=(2, 1, 3), dtype=float32) tf.Tensor( [[[1.] [1.] [1.]] [[1.] [1.] [1.]]], shape=(2, 3, 1), dtype=float32) <tf.Tensor: id=63, shape=(2, 3, 3), dtype=float32, numpy= array([[[1., 0., 0.], [1., 0., 0.], [1., 0., 0.]], [[1., 1., 0.], [1., 1., 0.], [1., 1., 0.]]], dtype=float32)>