【NLP】使用bert

本文轉載自查看原文 2019-02-13 16:04 6731 python/ NLP/ 機器學習/ 詞向量模型/ BERT/ TensorFlow

# 參考 https://blog.csdn.net/luoyexuge/article/details/84939755 小做改動

需要：

　　github上下載bert的代碼：https://github.com/google-research/bert

　　下載google訓練好的中文語料模型：https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip

使用：

　　使用bert，其實是使用幾個checkpoint（ckpt）文件。上面下載的zip是google訓練好的bert，我們可以在那個zip內的ckpt文件基礎上繼續訓練，獲得更貼近具體任務的ckpt文件。

如果是直接使用訓練好的ckpt文件（就是bert模型），只需如下代碼，定義model，獲得model的值

from bert import modeling    
# 使用數據加載BertModel,獲取對應的字embedding
model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings
)
# 獲取對應的embedding 輸入數據[batch_size, seq_length, embedding_size]
embedding = model.get_sequence_output()

這里的bert_config 是之前定義的bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)；輸入是input_ids, input_mask, segment_ids三個向量；還有兩個設置is_training（False）, use_one_hot_embedding（False），這樣的設置還有很多，這里只列舉這兩個。。

關於FLAGS，需要提到TensorFlow的flags，相當於配置運行變量，設置如下：

import tensorflow as tf

flags = tf.flags
FLAGS = flags.FLAGS

# 預訓練的中文model路徑和項目路徑
bert_path = '/home/xiangbo_wang/xiangbo/NER/chinese_L-12_H-768_A-12/'
root_path = '/home/xiangbo_wang/xiangbo/NER/BERT-BiLSTM-CRF-NER'

# 設置bert_config_file
flags.DEFINE_string(
    "bert_config_file", os.path.join(bert_path, 'bert_config.json'),
    "The config json file corresponding to the pre-trained BERT model."
)

關於輸入的三個向量，具體內容可以參照之前的博客https://www.cnblogs.com/rucwxb/p/10277217.html

input_ids, segment_ids 分別是 token embedding, segment embedding

position embedding會自動生成

input_mask 是input中需要mask的位置，本來是隨機取一部分，這里的做法是把全部輸入位置都mask住。

獲得輸入的這三個向量的方式如下：

# 獲得三個向量的函數
def inputs(vectors,maxlen=10):
    length=len(vectors)
    if length>=maxlen:
        return  vectors[0:maxlen],[1]*maxlen,[0]*maxlen
    else:
        input=vectors+[0]*(maxlen-length)
        mask=[1]*length+[0]*(maxlen-length)
        segment=[0]*maxlen
        return input,mask,segment

# 測試的句子
text = request.args.get('text')
vectors = [di.get("[CLS]")] + [di.get(i) if i in di else di.get("[UNK]") for i in list(text)] + [di.get("[SEP]")]

# 轉成1*maxlen的向量
input, mask, segment = inputs(vectors)
input_ids = np.reshape(np.array(input), [1, -1])
input_mask = np.reshape(np.array(mask), [1, -1])
segment_ids = np.reshape(np.array(segment), [1, -1])

最后是將變量輸入模型獲得最終的bert向量：

# 定義輸入向量形狀
input_ids_p=tf.placeholder(shape=[None,None],dtype=tf.int32,name="input_ids_p")
input_mask_p=tf.placeholder(shape=[None,None],dtype=tf.int32,name="input_mask_p")
segment_ids_p=tf.placeholder(shape=[None,None],dtype=tf.int32,name="segment_ids_p")
 
model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids_p,
        input_mask=input_mask_p,
        token_type_ids=segment_ids_p,
        use_one_hot_embeddings=use_one_hot_embeddings
    )

# 載入預訓練模型
restore_saver = tf.train.Saver()
restore_saver.restore(sess, init_checkpoint)

# 一個[batch_size, seq_length, embedding_size]大小的向量
embedding = tf.squeeze(model.get_sequence_output())
# 運行結果
ret=sess.run(embedding,feed_dict={"input_ids_p:0":input_ids,"input_mask_p:0":input_mask,"segment_ids_p:0":segment_ids})

完整可運行代碼如下：

import tensorflow as tf 
from bert import modeling
import collections
import os
import numpy as np 
import json

flags = tf.flags
FLAGS = flags.FLAGS
bert_path = '/home/xiangbo_wang/xiangbo/NER/chinese_L-12_H-768_A-12/'

flags.DEFINE_string(
    'bert_config_file', os.path.join(bert_path, 'bert_config.json'),
    'config json file corresponding to the pre-trained BERT model.'
)
flags.DEFINE_string(
    'bert_vocab_file', os.path.join(bert_path,'vocab.txt'),
    'the config vocab file',
)
flags.DEFINE_string(
    'init_checkpoint', os.path.join(bert_path,'bert_model.ckpt'),
    'from a pre-trained BERT get an initial checkpoint',
)
flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")

def convert2Uni(text):
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode('utf-8','ignore')
    else:
        print(type(text))
        print('####################wrong################')


def load_vocab(vocab_file):
    vocab = collections.OrderedDict()
    vocab.setdefault('blank', 2)
    index = 0
    with open(vocab_file) as reader:
    # with tf.gfile.GFile(vocab_file, 'r') as reader:
        while True:
            tmp = reader.readline()
            if not tmp:
                break
            token = convert2Uni(tmp)
            token = token.strip()
            vocab[token] = index 
            index+=1
    return vocab


def inputs(vectors, maxlen = 50):
    length = len(vectors)
    if length > maxlen:
        return vectors[0:maxlen], [1]*maxlen, [0]*maxlen
    else:
        input = vectors+[0]*(maxlen-length)
        mask = [1]*length + [0]*(maxlen-length)
        segment = [0]*maxlen
        return input, mask, segment


def response_request(text):
    vectors = [dictionary.get('[CLS]')] + [dictionary.get(i) if i in dictionary else dictionary.get('[UNK]') for i in list(text)] + [dictionary.get('[SEP]')]
    input, mask, segment = inputs(vectors)

    input_ids = np.reshape(np.array(input), [1, -1])
    input_mask = np.reshape(np.array(mask), [1, -1])
    segment_ids = np.reshape(np.array(segment), [1, -1])

    embedding = tf.squeeze(model.get_sequence_output())
    rst = sess.run(embedding, feed_dict={'input_ids_p:0':input_ids, 'input_mask_p:0':input_mask, 'segment_ids_p:0':segment_ids})

    return json.dumps(rst.tolist(), ensure_ascii=False)


dictionary = load_vocab(FLAGS.bert_vocab_file)
init_checkpoint = FLAGS.init_checkpoint

sess = tf.Session()
bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

input_ids_p = tf.placeholder(shape=[None, None], dtype = tf.int32, name='input_ids_p')
input_mask_p = tf.placeholder(shape=[None, None], dtype = tf.int32, name='input_mask_p')
segment_ids_p = tf.placeholder(shape=[None, None], dtype = tf.int32, name='segment_ids_p')

model = modeling.BertModel(
    config = bert_config,
    is_training = FLAGS.use_tpu,
    input_ids = input_ids_p,
    input_mask = input_mask_p,
    token_type_ids = segment_ids_p,
    use_one_hot_embeddings = FLAGS.use_tpu,
)
print('####################################')
restore_saver = tf.train.Saver()
restore_saver.restore(sess, init_checkpoint)

print(response_request('我叫水奈樾。'))

View Code

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 【NLP】徹底搞懂BERT NLP（九）：pytorch用transformer庫實現BERT NLP與深度學習（五）BERT預訓練模型圖解BERT（NLP中的遷移學習） NLP（三十三）：中文BERT字字量 BERT總結：最先進的NLP預訓練技術 NLP 基於kashgari和BERT實現中文命名實體識別（NER） NLP（三十）：BertForSequenceClassification：Kaggle的bert文本分類，基於transformers的BERT分類 NLP突破性成果 BERT 模型詳細解讀 bert參數微調 [NLP]四大模型與bert的對比：ernie1.0,xlnet,roberta,albert,bert