Inputs and outputs of common Chinese pre-trained models in NLP


BERT

from transformers import (  
  BertTokenizer,
  BertConfig,
  BertModel,
)
bertTokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
bertModel = BertModel.from_pretrained('bert-base-chinese')
sen = 'Transformers提供了NLP領域大量state-of-art的 預訓練語言模型結構的模型和調用框架。'
inputs = bertTokenizer(sen, return_tensors='pt')
tokens = bertTokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = bertModel(**inputs)
# print(len(outputs))
print(outputs[0].shape, outputs[1].shape)
{'input_ids': tensor([[  101,   100,  2990,   897,   749,   100,  7566,  1818,  1920,  7030,
         10223,   118,  8205,   118,  9143,  4638,  7564,  6378,  5298,  6427,
          6241,  3563,  1798,  5310,  3354,  4638,  3563,  1798,  1469,  6444,
          4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', '[UNK]', '提', '供', '了', '[UNK]', '領', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '預', '訓', '練', '語', '言', '模', '型', '結', '構', '的', '模', '型', '和', '調', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 35, 768]) torch.Size([1, 768])
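
The two tensors printed above are the per-token last hidden states ([1, 35, 768], one 768-dimensional vector per token) and the pooled [CLS] representation ([1, 768]). A minimal sketch, assuming a transformers version recent enough that the model returns a ModelOutput object, accessing the same tensors by name:

import torch

with torch.no_grad():                             # no gradients needed for feature extraction
    outputs = bertModel(**inputs)
token_embeddings = outputs.last_hidden_state      # [1, 35, 768]: one vector per input token
sentence_vector = outputs.pooler_output           # [1, 768]: tanh-projected [CLS] vector
print(token_embeddings.shape, sentence_vector.shape)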

RoBERTa

from transformers import (  
  BertTokenizer,
  BertConfig,
  BertModel,
)
# hfl/chinese-roberta-wwm-ext is BERT-structured, so the BERT tokenizer and model classes are used
robertTokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
robertModel = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')
sen = 'Transformers提供了NLP領域大量state-of-art的 預訓練語言模型結構的模型和調用框架。'
inputs = robertTokenizer(sen, return_tensors='pt')
tokens = robertTokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = robertModel(**inputs)
print(outputs)
print(outputs[0].shape, outputs[1].shape)
{'input_ids': tensor([[  101,   162, 10477,  8118, 12725,  8755,  2990,   897,   749,   156,
         10986,  7566,  1818,  1920,  7030, 10223,   118,  8205,   118,  9143,
          4638,  7564,  6378,  5298,  6427,  6241,  3563,  1798,  5310,  3354,
          4638,  3563,  1798,  1469,  6444,  4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '領', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '預', '訓', '練', '語', '言', '模', '型', '結', '構', '的', '模', '型', '和', '調', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 40, 768]) torch.Size([1, 768])
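
Note how the English words are tokenized differently from bert-base-chinese above: "Transformers" and "NLP" became [UNK] there but are split into WordPiece pieces here, most likely because the two tokenizer configurations handle letter case differently. A small sketch to check this, assuming bertTokenizer from the previous snippet is still in scope:

# Compare how the two tokenizers split the same English words (results as in the outputs above).
print(bertTokenizer.tokenize('Transformers NLP'))    # ['[UNK]', '[UNK]']
print(robertTokenizer.tokenize('Transformers NLP'))  # ['t', '##ran', '##s', '##form', '##ers', 'n', '##lp']
# Check whether lowercasing (do_lower_case) differs between the two configurations.
print(bertTokenizer.basic_tokenizer.do_lower_case, robertTokenizer.basic_tokenizer.do_lower_case)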

ALBERT

from transformers import (  
  BertTokenizer,
  AlbertModel,
)
# clue/albert_chinese_tiny ships a BERT-style vocab, so BertTokenizer is used with AlbertModel
albertTokenizer = BertTokenizer.from_pretrained('clue/albert_chinese_tiny')
albertModel = AlbertModel.from_pretrained('clue/albert_chinese_tiny')
sen = 'Transformers提供了NLP領域大量state-of-art的 預訓練語言模型結構的模型和調用框架。'
inputs = albertTokenizer(sen, return_tensors='pt')
tokens = albertTokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = albertModel(**inputs)
# print(len(outputs))
print(outputs[0].shape, outputs[1].shape)
{'input_ids': tensor([[  101,   162, 10477,  8118, 12725,  8755,  2990,   897,   749,   156,
         10986,  7566,  1818,  1920,  7030, 10223,   118,  8205,   118,  9143,
          4638,  7564,  6378,  5298,  6427,  6241,  3563,  1798,  5310,  3354,
          4638,  3563,  1798,  1469,  6444,  4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '領', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '預', '訓', '練', '語', '言', '模', '型', '結', '構', '的', '模', '型', '和', '調', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 40, 312]) torch.Size([1, 312])
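
The 312-dimensional outputs simply reflect the tiny configuration's hidden size (ALBERT additionally shares parameters across layers and factorizes the embedding matrix, which is why such small checkpoints exist). A quick sketch for inspecting the loaded configuration, using standard AlbertConfig fields:

cfg = albertModel.config
print(cfg.hidden_size, cfg.embedding_size, cfg.num_hidden_layers)  # model width, embedding width, layer count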

XLNet

from transformers import AutoTokenizer, AutoModel
  
xlnettokenizer = AutoTokenizer.from_pretrained("hfl/chinese-xlnet-base")
xlnetModel = AutoModel.from_pretrained('hfl/chinese-xlnet-base')
sen = 'Transformers提供了NLP領域大量state-of-art的 預訓練語言模型結構的模型和調用框架。'
inputs = xlnettokenizer(sen, return_tensors='pt')
tokens = xlnettokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = xlnetModel(**inputs)
# print(outputs)
print(outputs[0].shape, len(outputs[1]))
{'input_ids': tensor([[   19, 13932,  9560,  4127,  3810,   603,   602,   412,  3336,  1144,
          3025,  4402,    13, 16636,    13,  7717,    20,    19,  3712,  3620,
          1723,  2280,  1301,    20,  2280,    24, 16338,  7921,    18,     4,
             3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])}
['▁', 'Trans', 'form', 'ers', '提供了', 'N', 'L', 'P', '領域', '大量', 'st', 'ate', '-', 'of', '-', 'art', '的', '▁', '預', '訓練', '語言', '模型', '結構', '的', '模型', '和', '調用', '框架', '。', '<sep>', '<cls>']
torch.Size([1, 31, 768]) 12
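
Unlike the BERT-style models, XLNet appends <sep> and <cls> at the end of the sequence instead of prepending [CLS]. The second element of the output has length 12, matching the 12 transformer layers, so it is presumably XLNet's mems cache (one memory tensor per layer) rather than a pooled vector. A minimal sketch under that assumption; in recent transformers versions the cache may only be returned when use_mems=True:

outputs = xlnetModel(**inputs, use_mems=True)   # explicitly request the memory cache (assumes a recent API)
print(outputs.last_hidden_state.shape)          # [1, 31, 768]: per-token hidden states
print(len(outputs.mems))                        # 12: one cached hidden-state tensor per layer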

ELECTRA

from transformers import AutoTokenizer, AutoModel
  

electratokenizer = AutoTokenizer.from_pretrained("hfl/chinese-electra-180g-base-discriminator")
electraModel = AutoModel.from_pretrained("hfl/chinese-electra-180g-base-discriminator")
sen = 'Transformers提供了NLP領域大量state-of-art的 預訓練語言模型結構的模型和調用框架。'
inputs = electratokenizer(sen, return_tensors='pt')
tokens = electratokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = electraModel(**inputs)
# print(outputs)
print(outputs[0].shape)
{'input_ids': tensor([[  101,   162, 10477,  8118, 12725,  8755,  2990,   897,   749,   156,
         10986,  7566,  1818,  1920,  7030, 10223,   118,  8205,   118,  9143,
          4638,  7564,  6378,  5298,  6427,  6241,  3563,  1798,  5310,  3354,
          4638,  3563,  1798,  1469,  6444,  4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '領', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '預', '訓', '練', '語', '言', '模', '型', '結', '構', '的', '模', '型', '和', '調', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 40, 768])
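
Only one shape is printed because ElectraModel (the discriminator backbone) returns just the token-level hidden states; unlike BertModel there is no pooled output. A minimal sketch of deriving a sentence vector yourself, e.g. from the [CLS] position or by masked mean pooling (the pooling choice here is only an illustration, not something prescribed by the checkpoint):

last_hidden = outputs.last_hidden_state                # [1, 40, 768]
cls_vector = last_hidden[:, 0]                         # hidden state at the [CLS] position: [1, 768]
mask = inputs['attention_mask'].unsqueeze(-1).float()  # [1, 40, 1]
mean_vector = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)  # masked mean pooling: [1, 768]
print(cls_vector.shape, mean_vector.shape)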

MacBERT

from transformers import AutoTokenizer, AutoModel
  

mactokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
macModel = AutoModel.from_pretrained("hfl/chinese-macbert-base")
sen = 'Transformers提供了NLP領域大量state-of-art的 預訓練語言模型結構的模型和調用框架。'
inputs = mactokenizer(sen, return_tensors='pt')
tokens = mactokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = macModel(**inputs)
# print(outputs)
print(outputs[0].shape)
{'input_ids': tensor([[  101,   162, 10477,  8118, 12725,  8755,  2990,   897,   749,   156,
         10986,  7566,  1818,  1920,  7030, 10223,   118,  8205,   118,  9143,
          4638,  7564,  6378,  5298,  6427,  6241,  3563,  1798,  5310,  3354,
          4638,  3563,  1798,  1469,  6444,  4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '領', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '預', '訓', '練', '語', '言', '模', '型', '結', '構', '的', '模', '型', '和', '調', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 40, 768])
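
MacBERT keeps the BERT architecture (only the pretraining objective changes), so AutoModel loads a BERT-style model here; even though only one shape is printed above, the output should also contain a pooled [CLS] vector. A minimal sketch under that assumption, reusing the outputs computed above:

print(outputs.last_hidden_state.shape)  # [1, 40, 768]: per-token hidden states
print(outputs.pooler_output.shape)      # [1, 768]: pooled [CLS] vector (assumes a BERT-style output)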

