1. Create the vocabulary
- Learn the concept of word vectors
- Train word vectors with the Skip-gram model
- Learn to use PyTorch Dataset and DataLoader
- Learn to define a PyTorch model
- Learn common Modules in torch.nn
  - Embedding
- Learn common PyTorch operations
  - bmm
  - logsigmoid
- Save and load a PyTorch model
Training data:
Link: https://pan.baidu.com/s/1tFeK3mXuVXEy3EMarfeWvg  Password: v2z5
In this notebook we will try to reproduce (as far as possible) the method for training word vectors from the paper Distributed Representations of Words and Phrases and their Compositionality.
We will implement the Skip-gram model and use the negative sampling objective from the paper.
Some details we do not implement:
- subsampling: see section 2.3 of the paper
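For reference, the negative-sampling objective from the paper for one (center word, context word) pair, which the model below maximizes (the code returns its negative as the loss), is:

$$\log \sigma\left({v'_{w_O}}^{\top} v_{w_I}\right) + \sum_{i=1}^{K} \mathbb{E}_{w_i \sim P_n(w)}\left[\log \sigma\left(-{v'_{w_i}}^{\top} v_{w_I}\right)\right]$$

where $v_{w_I}$ is the input (center word) embedding, $v'_{w}$ are the output embeddings, and the noise distribution $P_n(w)$ is the unigram distribution raised to the 3/4 power.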
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud
from torch.nn.parameter import Parameter
from collections import Counter
import numpy as np
import random
import math
import pandas as pd
import scipy.stats      # for spearmanr
import scipy.spatial    # for spatial.distance.cosine
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
USE_CUDA = torch.cuda.is_available()  # True if a GPU is available
device = torch.device("cuda" if USE_CUDA else "cpu")  # used later for .to(device)

# To make the results reproducible, we usually fix the various random seeds to a specific value
random.seed(53113)
np.random.seed(53113)
torch.manual_seed(53113)
if USE_CUDA:
    torch.cuda.manual_seed(53113)
# Hyperparameters
K = 100                 # number of negative samples per positive word
C = 3                   # nearby words threshold: use 3 words on each side of the center word
NUM_EPOCHS = 2          # number of training epochs
MAX_VOCAB_SIZE = 30000  # vocabulary size
BATCH_SIZE = 128        # batch size
LEARNING_RATE = 0.2     # initial learning rate
EMBEDDING_SIZE = 100    # embedding dimension
LOG_FILE = "word-embedding.log"
# tokenize function: split a text into individual words
def word_tokenize(text):
    return text.split()
- Read all the text from the text file and build a vocabulary from it.
- Since the number of distinct words may be too large, we keep only the MAX_VOCAB_SIZE most common words.
- We add an <unk> token to represent all the uncommon words.
- We need to record the word-to-index mapping, the index-to-word mapping, the word counts, the (normalized) word frequencies, and the total number of words.
with open("./text8.train.txt", "r") as fin:
    text = fin.read()

text = [w for w in word_tokenize(text.lower())]  # tokenize; here this is just text.split()
vocab = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))  # keep the (MAX_VOCAB_SIZE-1) most frequent words; the last slot is reserved for uncommon words
vocab["<unk>"] = len(text) - np.sum(list(vocab.values()))  # count of uncommon words = total words - count of common words
idx_to_word = [word for word in vocab.keys()]  # list of all vocabulary words
word_to_idx = {word: i for i, word in enumerate(idx_to_word)}  # word -> index mapping; the most frequent word gets index 0
word_counts = np.array([count for count in vocab.values()], dtype=np.float32)  # raw counts of all words
word_freqs = word_counts / np.sum(word_counts)  # word frequencies
word_freqs = word_freqs ** (3. / 4.)  # raise the frequencies to the 3/4 power, as in the paper
word_freqs = word_freqs / np.sum(word_freqs)  # probability of each word being drawn as a negative sample
VOCAB_SIZE = len(idx_to_word)  # vocabulary size, 30000 == MAX_VOCAB_SIZE
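A quick optional sanity check of the vocabulary (not in the original notebook; the exact words depend on your copy of text8.train.txt):

# Optional sanity check (exact words depend on the corpus)
print(VOCAB_SIZE)            # 30000
print(idx_to_word[:5])       # the five most frequent words
print(word_to_idx["<unk>"])  # 29999: the last index is reserved for unknown words
print(word_freqs.sum())      # ~1.0, so it is a valid sampling distribution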
2. Implement the DataLoader
A dataloader needs to do the following:
- Encode all the text as numbers, then preprocess it with subsampling.
- Store the vocabulary, the word counts, and the normalized word frequencies.
- Sample a center word at each iteration.
- Return the context words for the current center word.
- Sample some negative words based on the center word.

There is a good tutorial that introduces how to use the PyTorch DataLoader.
To use a DataLoader, we need to define the following two functions:
- __len__: returns how many items there are in the whole dataset
- __getitem__: returns one item for a given index

With a DataLoader we can easily shuffle the whole dataset, fetch a batch of data, and so on.
class WordEmbeddingDataset(tud.Dataset):  # subclass of tud.Dataset
    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts):
        ''' text: a list of words, all text from the training dataset
            word_to_idx: the dictionary from word to idx
            idx_to_word: idx to word mapping
            word_freqs: the frequency of each word
            word_counts: the word counts
        '''
        super(WordEmbeddingDataset, self).__init__()
        # encode each word as its index; words not in the dictionary map to the index of "<unk>" (VOCAB_SIZE-1)
        self.text_encoded = [word_to_idx.get(t, VOCAB_SIZE - 1) for t in text]
        self.text_encoded = torch.LongTensor(self.text_encoded)  # convert to LongTensor
        self.word_to_idx = word_to_idx  # store the mappings and statistics
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)

    def __len__(self):
        ''' Return the length of the whole dataset (the number of words) '''
        return len(self.text_encoded)

    def __getitem__(self, idx):
        ''' Return the following data for training:
            - the center word
            - the (positive) words near the center word
            - K randomly sampled words per positive word as negative samples
        '''
        center_word = self.text_encoded[idx]  # index of the center word
        # indices of the surrounding words, e.g. for idx=0: [-3, -2, -1, 1, 2, 3]
        pos_indices = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1))
        # the indices may fall outside the text, so wrap around with a modulo
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]
        pos_words = self.text_encoded[pos_indices]  # the surrounding (positive) words
        # negative samples:
        # torch.multinomial draws K * pos_words.shape[0] indices from self.word_freqs,
        # with replacement; larger values in self.word_freqs are drawn with higher probability
        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], True)
        return center_word, pos_words, neg_words
Create the dataset and the dataloader:
dataset = WordEmbeddingDataset(text, word_to_idx, idx_to_word,
                               word_freqs, word_counts)
# dataset[5]
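# Optional check (not in the original notebook): one item is a
# (center_word, pos_words, neg_words) triple; with C = 3 and K = 100,
# pos_words has 6 entries and neg_words has 600.
center_word, pos_words, neg_words = dataset[5]
print(center_word, pos_words.shape, neg_words.shape)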
dataloader = tud.DataLoader(dataset, batch_size=BATCH_SIZE,
                            shuffle=True, num_workers=4)
Note: if you do not have a GPU, set num_workers to 0 here, i.e. do not use multiple worker processes.
Check the contents of the dataloader:
for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
    print(input_labels.shape, pos_labels.shape, neg_labels.shape)
    break
torch.Size([128]) torch.Size([128, 6]) torch.Size([128, 600])
3. Define the PyTorch model
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        ''' Initialize the input and output embeddings '''
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        initrange = 0.5 / self.embed_size
        # input embedding: nn.Embedding(30000, 100)
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        # weight initialization
        self.in_embed.weight.data.uniform_(-initrange, initrange)
        # output embedding: nn.Embedding(30000, 100)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        self.out_embed.weight.data.uniform_(-initrange, initrange)

    def forward(self, input_labels, pos_labels, neg_labels):
        '''
        input_labels: center words, [batch_size]
        pos_labels: words that appear in the context window around the center word, [batch_size, (window_size * 2)]
        neg_labels: words that do not appear around the center word, drawn by negative sampling, [batch_size, (window_size * 2 * K)]
        return: loss, [batch_size]
        '''
        batch_size = input_labels.size(0)
        # [batch_size, embed_size]; an embedding lookup, equivalent to multiplying
        # a one-hot matrix (128, 30000) by the weight matrix (30000, 100)
        input_embedding = self.in_embed(input_labels)
        # [batch_size, 2*C, embed_size]: each example in the batch has 2*C context words,
        # each represented by an embed_size-dimensional vector
        pos_embedding = self.out_embed(pos_labels)
        # [batch_size, 2*C*K, embed_size]
        neg_embedding = self.out_embed(neg_labels)
        # input_embedding.unsqueeze(2) has shape [batch_size, embed_size, 1]
        # torch.bmm is a batched matrix product: (b, n, m) x (b, m, p) -> (b, n, p)
        # [b, 2*C, embed_size] x [b, embed_size, 1] -> [b, 2*C, 1] -> squeeze -> [b, 2*C]
        pos_dot = torch.bmm(pos_embedding, input_embedding.unsqueeze(2)).squeeze(2)   # [batch_size, 2*C]
        neg_dot = torch.bmm(neg_embedding, -input_embedding.unsqueeze(2)).squeeze(2)  # [batch_size, 2*C*K]
        # the loss below is the negative sampling objective from the paper
        log_pos = F.logsigmoid(pos_dot).sum(1)  # [batch_size]
        log_neg = F.logsigmoid(neg_dot).sum(1)
        loss = log_pos + log_neg
        return -loss

    def input_embedding(self):
        # return the input embedding weights as a numpy array
        return self.in_embed.weight.data.cpu().numpy()
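Before training, a quick hypothetical shape check (not part of the original notebook): push a few random word indices through a fresh model and confirm that the returned loss has shape [batch_size].

# Hypothetical shape check with random indices and batch size 8
_m = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
_in = torch.randint(0, VOCAB_SIZE, (8,))
_pos = torch.randint(0, VOCAB_SIZE, (8, 2 * C))
_neg = torch.randint(0, VOCAB_SIZE, (8, 2 * C * K))
print(_m(_in, _pos, _neg).shape)  # expected: torch.Size([8])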
Define a model and move it to the GPU:
model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
if USE_CUDA:
    model = model.to(device)
4. Evaluate the model
The evaluation files have roughly the following structure (word1 word2 similarity score):
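For illustration, a couple of made-up rows in that format (the real files are comma- or tab-separated, which is why evaluate chooses the separator by file extension):

word1	word2	similarity
apple	fruit	8.2
apple	car	1.1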
def evaluate(filename, embedding_weights):
    if filename.endswith('.csv'):
        data = pd.read_csv(filename, sep=',')
    else:
        data = pd.read_csv(filename, sep='\t')
    human_similarity = []
    model_similarity = []
    for i in data.index:  # iterate over the rows; columns 0 and 1 hold the two words
        word1, word2 = data.iloc[i, 0], data.iloc[i, 1]
        if word1 not in word_to_idx or word2 not in word_to_idx:
            continue
        else:
            word1_idx, word2_idx = word_to_idx[word1], word_to_idx[word2]
            word1_embed, word2_embed = embedding_weights[[word1_idx]], embedding_weights[[word2_idx]]
            # similarity according to the model
            model_similarity.append(float(sklearn.metrics.pairwise.cosine_similarity(word1_embed, word2_embed)))
            # human-annotated similarity
            human_similarity.append(float(data.iloc[i, 2]))
    # rank correlation between the two similarity lists
    return scipy.stats.spearmanr(human_similarity, model_similarity)

# the ten words with the smallest cosine distance to the given word
def find_nearest(word):
    index = word_to_idx[word]
    embedding = embedding_weights[index]
    cos_dis = np.array([scipy.spatial.distance.cosine(e, embedding) for e in embedding_weights])
    return [idx_to_word[i] for i in cos_dis.argsort()[:10]]
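evaluate returns the Spearman rank correlation between the human scores and the model's cosine similarities; a toy example with made-up numbers shows the kind of result object you get back:

# Toy illustration of what evaluate() returns (numbers are made up)
print(scipy.stats.spearmanr([1.0, 2.0, 3.0], [0.1, 0.4, 0.2]))
# correlation = 0.5 here, because the second list ranks item 2 above item 3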
5. Train the model
- A model usually needs to be trained for several epochs.
- In each epoch we split all the data into batches.
- Wrap each batch's inputs and outputs as cuda tensors.
- Forward pass: compute the loss from the center words, their context words, and the negative samples.
- Zero the model's current gradients.
- Backward pass.
- Update the model parameters.
- Every so many iterations, log the model's current loss and evaluate the model on the evaluation datasets.
# use the Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

for e in range(NUM_EPOCHS):  # loop over epochs
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long().to(device)  # LongTensor on the right device
        pos_labels = pos_labels.long().to(device)
        neg_labels = neg_labels.long().to(device)

        optimizer.zero_grad()  # zero the gradients
        loss = model(input_labels, pos_labels, neg_labels).mean()  # compute the loss
        loss.backward()  # backward pass
        optimizer.step()  # update the parameters

        # log the progress
        if i % 100 == 0:
            with open(LOG_FILE, 'a') as f:
                f.write("epoch: {}, iter: {}, loss: {}\n".format(e, i, loss.item()))
                print("epoch: {}, iter: {}, loss: {}".format(e, i, loss.item()))

        if i % 2000 == 0:
            embedding_weights = model.input_embedding()
            sim_simlex = evaluate('./embedding/simlex-999.txt', embedding_weights)
            sim_men = evaluate('./embedding/men.txt', embedding_weights)
            sim_353 = evaluate('./embedding/wordsim353.csv', embedding_weights)
            with open(LOG_FILE, 'a') as f:
                print("epoch: {}, iteration: {}, simlex-999: {}, men: {}, sim353: {}, nearest to monster: {}\n".format(
                    e, i, sim_simlex, sim_men, sim_353, find_nearest("monster")))
                f.write("epoch: {}, iteration: {}, simlex-999: {}, men: {}, sim353: {}, nearest to monster: {}\n".format(
                    e, i, sim_simlex, sim_men, sim_353, find_nearest("monster")))

    embedding_weights = model.input_embedding()
    np.save('embedding-{}'.format(EMBEDDING_SIZE), embedding_weights)
    torch.save(model.state_dict(), 'embedding-{}.th'.format(EMBEDDING_SIZE))
Load the saved model state:
model.load_state_dict(torch.load("embedding-{}.th".format(EMBEDDING_SIZE)))
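The word vectors saved with np.save can also be reloaded on their own (a small optional sketch; note that np.save appends the .npy extension):

# Optional: reload the saved word vectors without rebuilding the model
embedding_weights = np.load('embedding-{}.npy'.format(EMBEDDING_SIZE))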
6. Evaluate on the MEN and SimLex-999 datasets
embedding_weights = model.input_embedding()
print("simlex-999", evaluate("simlex-999.txt", embedding_weights))
print("men", evaluate("men.txt", embedding_weights))
print("wordsim353", evaluate("wordsim353.csv", embedding_weights))
simlex-999 SpearmanrResult(correlation=0.17251697429101504, pvalue=7.863946056740345e-08)
men SpearmanrResult(correlation=0.1778096817088841, pvalue=7.565661657312768e-20)
wordsim353 SpearmanrResult(correlation=0.27153702278146635, pvalue=8.842165885381714e-07)
7. Find nearest neighbors
for word in ["good", "fresh", "monster", "green", "like", "america", "chicago", "work", "computer", "language"]:
    print(word, find_nearest(word))
good ['good', 'strong', 'software', 'free', 'better', 'low', 'relatively', 'simple', 'special', 'individual']
fresh ['fresh', 'oral', 'uniform', 'mechanical', 'noise', 'evolutionary', 'marketing', 'freight', 'ammunition', 'reasoning']
monster ['monster', 'noun', 'protocol', 'giant', 'scheme', 'curve', 'operator', 'pen', 'camera', 'rifle']
green ['green', 'plant', 'dark', 'ice', 'bass', 'audio', 'mountain', 'deep', 'pro', 'oil']
like ['like', 'non', 'using', 'without', 'body', 'cell', 'animal', 'include', 'good', 'human']
america ['america', 'africa', 'australia', 'europe', 'asia', 'canada', 'india', 'germany', 'middle', 'union']
chicago ['chicago', 'sweden', 'poland', 'los', 'francisco', 'virginia', 'georgia', 'victoria', 'hungary', 'texas']
work ['work', 'life', 'death', 'position', 'upon', 'works', 'body', 'family', 'father', 'name']
computer ['computer', 'standard', 'big', 'video', 'space', 'special', 'basic', 'science', 'historical', 'text']
language ['language', 'art', 'modern', 'arabic', 'historical', 'word', 'culture', 'ancient', 'science', 'greek']
8. Relationships between words
man_idx = word_to_idx["man"]
king_idx = word_to_idx["king"]
woman_idx = word_to_idx["woman"]
embedding = embedding_weights[woman_idx] - embedding_weights[man_idx] + embedding_weights[king_idx]
cos_dis = np.array([scipy.spatial.distance.cosine(e, embedding) for e in embedding_weights])
for i in cos_dis.argsort()[:20]:
    print(idx_to_word[i])
charles
king
james
henry
david
pope
william
louis
iii
albert
george
iv
paul
emperor
peter
thomas
joseph
john
president
sir