1. Dataset
import numpy as np
import torch
from torch import nn, optim
from torchtext import data, datasets
# use torchtext to load data, no need to download dataset
# set up fields
# two Field objects define how each field is processed (the text field and the label field)
TEXT = data.Field(tokenize='spacy') # tokenize with spaCy
LABEL = data.LabelField(dtype=torch.float)
# make splits for data
# IMDB has 50,000 movie reviews in two classes, positive and negative. The data is processed by the Fields above
# and split into a training set and a test set according to (TEXT, LABEL)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
print('len of train data:', len(train_data)) # 25000
print('len of test data:', len(test_data)) # 25000
# torchtext.data.Example: represents one sample (data + label)
print(train_data.examples[15].text) # text: the sentence as a list of tokens
print(train_data.examples[15].label) # label: positive
# ['The', 'movie', 'is', 'a', 'bit', '"', 'thin', '"', 'after', 'reading', 'the', 'book', ',', 'but', 'it', "'s", 'still', 'one', 'of', 'the', 'greatest', 'movies', 'ever', 'made', '.', 'Sheryl', 'Lee', 'is', 'beautiful', 'and', 'Nick', 'Nolte', 'is', 'really', '"', 'vonneguty', '"', '.', 'He', 'makes', 'great', 'job', 'expressing', 'the', 'feelings', 'from', 'the', 'book', 'to', 'the', 'film', '.', 'Not', 'many', 'films', 'engage', 'the', 'feeling', 'of', 'the', 'book', 'as', 'well', 'as', 'Mother', 'Night', 'does', '.']
# pos
# build the vocabulary
# by default this downloads glove.6B.zip and unzips glove.6B.50d.txt, glove.6B.100d.txt, glove.6B.200d.txt and glove.6B.300d.txt from it
# so we can place glove.6B.zip or glove.6B.100d.txt in the cache directory (.vector_cache by default) ahead of time to skip the download
TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d') # equivalent: TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=100))
LABEL.build_vocab(train_data)
print(len(TEXT.vocab)) # 10002 (the 10000 most frequent words plus <unk> and <pad>)
print(TEXT.vocab.itos[:12]) # ['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is', 'in', 'I']
print(TEXT.vocab.stoi['and']) # 5
print(LABEL.vocab.stoi) # defaultdict(None, {'neg': 0, 'pos': 1})
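If the GloVe file is already on disk, it can also be loaded through torchtext.vocab.Vectors instead of the named alias, which skips the download entirely. A minimal sketch, assuming a local ./glove directory (the path is illustrative):

from torchtext.vocab import Vectors
# point at a local copy of glove.6B.100d.txt; './glove' is an assumed path
local_vectors = Vectors(name='glove.6B.100d.txt', cache='./glove')
TEXT.build_vocab(train_data, max_size=10000, vectors=local_vectors)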
# one batch holds 30 sentences; the text is converted entirely to indices
batchsz = 30
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=batchsz,
    device=device
)
Tips:
- At training time, the data fed into the rnn are the numericalized sentences: the indices that each token string maps to (the sentences_to_indices step); see the sketch below.
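A quick way to verify this is to pull one batch from the iterator. A minimal sketch (shapes assume the Field default batch_first=False):

batch = next(iter(train_iterator))
print(batch.text.shape)   # torch.Size([seq_len, 30]), word indices, not strings
print(batch.text.dtype)   # torch.int64
print(batch.label.shape)  # torch.Size([30]), 0./1. floats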
2. Using pretrained word vectors
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # [0-10001] => [100]
        # arg 1: number of embeddings (vocab size), arg 2: embedding dimension
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # the layers below were elided in the original; a 2-layer bidirectional
        # LSTM with dropout is one common choice for this exercise
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,
                           bidirectional=True, dropout=0.5)
        # bidirectional => the final forward and backward states are concatenated
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """
        x: [seq_len, b] (compare an image batch: [b, 3, 28, 28])
        """
        # [seq_len, b] => [seq_len, b, 100]
        embedding = self.dropout(self.embedding(x))
        # output: [seq_len, b, hidden_dim*2], hidden: [num_layers*2, b, hidden_dim]
        output, (hidden, cell) = self.rnn(embedding)
        # concatenate the last forward and backward hidden states: [b, hidden_dim*2]
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
        # [b, hidden_dim*2] => [b, 1]
        out = self.fc(self.dropout(hidden))
        return out
rnn = RNN(len(TEXT.vocab), 100, 256) # vocab size, embedding dim, hidden dim
pretrained_embedding = TEXT.vocab.vectors
print('pretrained_embedding:', pretrained_embedding.shape) # torch.Size([10002, 100])
# replace the random initialization with the pretrained embedding
rnn.embedding.weight.data.copy_(pretrained_embedding)
print('embedding layer inited.')
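From here the model trains like any binary classifier. A minimal training-step sketch, assuming BCEWithLogitsLoss and Adam (the learning rate is illustrative):

rnn = rnn.to(device)
optimizer = optim.Adam(rnn.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss().to(device)
for batch in train_iterator:
    # model output: [30, 1] => squeeze to [30] to match batch.label
    pred = rnn(batch.text).squeeze(1)
    loss = criterion(pred, batch.label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()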
Tips:
- The model's prediction is also a number; use
LABEL.vocab.itos[idx]
to convert it back to a label string, as shown below.
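A minimal decoding sketch, assuming the single-logit output above (thresholding the sigmoid at 0.5 is the usual convention):

logit = rnn(batch.text).squeeze(1)       # [30] raw scores
idx = torch.round(torch.sigmoid(logit))  # 0. or 1. per sentence
print(LABEL.vocab.itos[int(idx[0])])     # 'neg' or 'pos'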