1. Dataset
import numpy as np
import torch
from torch import nn, optim
from torchtext import data, datasets
# use torchtext to load data, no need to download dataset
# set up fields
# two Field objects define how each field is processed (the text field and the label field)
TEXT = data.Field(tokenize='spacy') # tokenize with spaCy
LABEL = data.LabelField(dtype=torch.float)
# make splits for data
# IMDB has 50,000 movie reviews in two classes, positive and negative. The data is processed by the Fields above
# and split into a training set and a test set according to (TEXT, LABEL)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
print('len of train data:', len(train_data)) # 25000
print('len of test data:', len(test_data)) # 25000
# torchtext.data.Example: represents one sample (data + label)
print(train_data.examples[15].text) # text: the sentence as a list of tokens
print(train_data.examples[15].label) # label: positive
# ['The', 'movie', 'is', 'a', 'bit', '"', 'thin', '"', 'after', 'reading', 'the', 'book', ',', 'but', 'it', "'s", 'still', 'one', 'of', 'the', 'greatest', 'movies', 'ever', 'made', '.', 'Sheryl', 'Lee', 'is', 'beautiful', 'and', 'Nick', 'Nolte', 'is', 'really', '"', 'vonneguty', '"', '.', 'He', 'makes', 'great', 'job', 'expressing', 'the', 'feelings', 'from', 'the', 'book', 'to', 'the', 'film', '.', 'Not', 'many', 'films', 'engage', 'the', 'feeling', 'of', 'the', 'book', 'as', 'well', 'as', 'Mother', 'Night', 'does', '.']
# pos
# build the vocabulary
# by default this downloads glove.6B.zip and unzips glove.6B.50d.txt, glove.6B.100d.txt, glove.6B.200d.txt and glove.6B.300d.txt from it
# so we can place glove.6B.zip or glove.6B.100d.txt in the cache directory (.vector_cache by default) ahead of time to skip the download
TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d') # equivalent: TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=100))
LABEL.build_vocab(train_data)
print(len(TEXT.vocab)) # 10002 (the 10000 most frequent words plus <unk> and <pad>)
print(TEXT.vocab.itos[:12]) # ['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is', 'in', 'I']
print(TEXT.vocab.stoi['and']) # 5
print(LABEL.vocab.stoi) # defaultdict(None, {'neg': 0, 'pos': 1})
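If the GloVe file is already on disk, it can also be loaded through torchtext.vocab.Vectors instead of the named alias, which skips the download entirely. A minimal sketch, assuming a local ./glove directory (the path is illustrative):

from torchtext.vocab import Vectors
# point at a local copy of glove.6B.100d.txt; './glove' is an assumed path
local_vectors = Vectors(name='glove.6B.100d.txt', cache='./glove')
TEXT.build_vocab(train_data, max_size=10000, vectors=local_vectors)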
# one batch holds 30 sentences; the text is converted entirely to indices
batchsz = 30
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=batchsz,
    device=device
)
Tips:
- At training time, the data fed into the rnn are the numericalized sentences: the indices that each token string maps to (the sentences_to_indices step); see the sketch below.
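A quick way to verify this is to pull one batch from the iterator. A minimal sketch (shapes assume the Field default batch_first=False):

batch = next(iter(train_iterator))
print(batch.text.shape)   # torch.Size([seq_len, 30]), word indices, not strings
print(batch.text.dtype)   # torch.int64
print(batch.label.shape)  # torch.Size([30]), 0./1. floats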
2. Using pretrained word vectors
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # [0-10001] => [100]
        # arg 1: number of embeddings (vocab size), arg 2: embedding dimension
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # the layers below were elided in the original; a 2-layer bidirectional
        # LSTM with dropout is one common choice for this exercise
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,
                           bidirectional=True, dropout=0.5)
        # bidirectional => the final forward and backward states are concatenated
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """
        x: [seq_len, b] (compare an image batch: [b, 3, 28, 28])
        """
        # [seq_len, b] => [seq_len, b, 100]
        embedding = self.dropout(self.embedding(x))
        # output: [seq_len, b, hidden_dim*2], hidden: [num_layers*2, b, hidden_dim]
        output, (hidden, cell) = self.rnn(embedding)
        # concatenate the last forward and backward hidden states: [b, hidden_dim*2]
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
        # [b, hidden_dim*2] => [b, 1]
        out = self.fc(self.dropout(hidden))
        return out
rnn = RNN(len(TEXT.vocab), 100, 256) # vocab size, embedding dim, hidden dim
pretrained_embedding = TEXT.vocab.vectors
print('pretrained_embedding:', pretrained_embedding.shape) # torch.Size([10002, 100])
# replace the random initialization with the pretrained embedding
rnn.embedding.weight.data.copy_(pretrained_embedding)
print('embedding layer inited.')
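From here the model trains like any binary classifier. A minimal training-step sketch, assuming BCEWithLogitsLoss and Adam (the learning rate is illustrative):

rnn = rnn.to(device)
optimizer = optim.Adam(rnn.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss().to(device)
for batch in train_iterator:
    # model output: [30, 1] => squeeze to [30] to match batch.label
    pred = rnn(batch.text).squeeze(1)
    loss = criterion(pred, batch.label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()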
Tips:
- The model's prediction is also a number; use
LABEL.vocab.itos[idx]
to convert it back to a label string, as shown below.
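A minimal decoding sketch, assuming the single-logit output above (thresholding the sigmoid at 0.5 is the usual convention):

logit = rnn(batch.text).squeeze(1)       # [30] raw scores
idx = torch.round(torch.sigmoid(logit))  # 0. or 1. per sentence
print(LABEL.vocab.itos[int(idx[0])])     # 'neg' or 'pos'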