word2vector(含code)

本文轉載自查看原文 2019-08-07 10:40 480

Word2Vec其實就是通過學習文本來用詞向量的方式表征詞的語義信息，即通過一個嵌入空間使得語義上相似的單詞在該空間內距離很近。

Embedding其實就是一個映射，將單詞從原先所屬的空間映射到新的多維空間中，也就是把原先詞所在空間嵌入到一個新的空間中去。

Word2Vec模型實際上分為了兩個部分，第一部分為建立模型，第二部分是通過模型獲取嵌入詞向量。Word2Vec的整個建模過程實際上與自編碼器（auto-encoder）的思想很相似，即先基於訓練數據構建一個神經網絡，當這個模型訓練好以後，我們並不會用這個訓練好的模型處理新的任務，我們真正需要的是這個模型通過訓練數據所學得的參數，例如隱層的權重矩陣——後面我們將會看到這些權重在Word2Vec中實際上就是我們試圖去學習的“word vectors”。基於訓練數據建模的過程，我們給它一個名字叫“Fake Task”，意味著建模並不是我們最終的目的。

上面提到的這種方法實際上會在無監督特徵學習（unsupervised feature learning）中見到，最常見的就是自編碼器（auto-encoder）：通過在隱層將輸入進行編碼壓縮，繼而在輸出層將數據解碼恢復初始狀態，訓練完成後，我們會將輸出層“砍掉”，僅保留隱層。

https://www.leiphone.com/news/201706/QprrvzsrZCl4S2lw.html

基於Python版本的實現：
import math
import sys
import numpy as np

class Ngram:
    """A tuple of tokens together with its occurrence count and phrase score."""

    def __init__(self, tokens):
        # tokens: the words that make up this n-gram, in order.
        self.tokens = tokens
        # Number of times the n-gram was seen; incremented by the caller.
        self.count = 0
        # Phrase score; assigned through set_score().
        self.score = 0.0

    def set_score(self, score):
        """Store the phrase score computed for this n-gram."""
        self.score = score

    def get_string(self):
        """Return the tokens joined by underscores, e.g. ``new_york``."""
        return '_'.join(self.tokens)

class Corpus:
    """Token list read from a text file, with optional phrase (bigram) merging.

    Construction reads ``filename``, lower-cases every whitespace-separated
    token, keeps the alphanumeric ones longer than one character, runs
    ``word_phrase_passes`` merging passes and finally writes the resulting
    tokens to ``'preprocessed-' + filename``.
    """

    def __init__(self, filename, word_phrase_passes, word_phrase_delta,
                 word_phrase_threshold, word_phrase_filename):
        """Read and tokenise ``filename``, merge phrases, save the result.

        Args:
            filename: path of the raw corpus file.
            word_phrase_passes: number of merging passes to run.
            word_phrase_delta: discount subtracted from each n-gram count.
            word_phrase_threshold: minimum score for an n-gram to be merged.
            word_phrase_filename: prefix of the per-pass phrase list files.
        """
        i = 0  # counts every raw token, including the ones filtered out
        all_tokens = []

        with open(filename, 'r') as file_pointer:
            for line in file_pointer:
                for token in line.split():
                    token = token.lower()

                    # Keep alphanumeric tokens longer than one character.
                    if len(token) > 1 and token.isalnum():
                        all_tokens.append(token)

                    i += 1
                    if i % 10000 == 0:
                        sys.stdout.write("\rReading corpus: %d" % i)
                        sys.stdout.flush()

        print("\rCorpus read: %d" % i)

        self.tokens = all_tokens

        for x in range(1, word_phrase_passes + 1):
            self.build_ngrams(x, word_phrase_delta, word_phrase_threshold,
                              word_phrase_filename)

        self.save_to_file(filename)

    def build_ngrams(self, x, word_phrase_delta, word_phrase_threshold,
                     word_phrase_filename):
        """Run one phrase-detection pass and merge the accepted bigrams.

        Each n-gram is scored as
        ``(count(ngram) - delta) / product(count(token) for its tokens)``;
        n-grams scoring above ``word_phrase_threshold`` are written to
        ``word_phrase_filename + '-<x>'`` and adjacent token pairs matching
        them are replaced in ``self.tokens`` by a single ``a_b`` token.

        Args:
            x: 1-based pass number (used for progress output and file name).
            word_phrase_delta: discount subtracted from the n-gram count.
            word_phrase_threshold: minimum score for acceptance.
            word_phrase_filename: prefix of the phrase list file.
        """
        ngrams = []
        ngram_map = {}  # n-gram tuple -> index into ``ngrams``

        token_count_map = {}
        for token in self.tokens:
            token_count_map[token] = token_count_map.get(token, 0) + 1

        i = 0
        ngram_l = []  # sliding window holding at most two tokens
        for token in self.tokens:
            if len(ngram_l) == 2:
                ngram_l.pop(0)

            ngram_l.append(token)
            # NOTE: the very first token yields a one-element tuple.
            ngram_t = tuple(ngram_l)

            if ngram_t not in ngram_map:
                ngram_map[ngram_t] = len(ngrams)
                ngrams.append(Ngram(ngram_t))

            ngrams[ngram_map[ngram_t]].count += 1

            i += 1
            if i % 10000 == 0:
                sys.stdout.write("\rBuilding n-grams (%d pass): %d" % (x, i))
                sys.stdout.flush()

        print("\rn-grams (%d pass) built: %d" % (x, i))

        filtered_ngrams_map = {}  # 'a_b' -> accepted Ngram

        # http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf
        i = 0
        with open(word_phrase_filename + ('-%d' % x), 'w') as file_pointer:
            for ngram in ngrams:
                product = 1
                for word_string in ngram.tokens:
                    product *= token_count_map[word_string]
                ngram.set_score(
                    (float(ngram.count) - word_phrase_delta) / float(product))

                if ngram.score > word_phrase_threshold:
                    filtered_ngrams_map[ngram.get_string()] = ngram
                    file_pointer.write(
                        '%s %d\n' % (ngram.get_string(), ngram.count))

                i += 1
                if i % 10000 == 0:
                    sys.stdout.write("\rScoring n-grams: %d" % i)
                    sys.stdout.flush()

        print("\rScored n-grams: %d, filtered n-grams: %d"
              % (i, len(filtered_ngrams_map)))

        # Combine the tokens: greedily merge accepted pairs left to right.
        all_tokens = []
        last_index = len(self.tokens) - 1
        i = 0
        while i < len(self.tokens):
            if i < last_index:
                ngram_string = '_'.join((self.tokens[i], self.tokens[i + 1]))
                if ngram_string in filtered_ngrams_map:
                    all_tokens.append(ngram_string)
                    i += 2  # both tokens were consumed by the phrase
                    continue
            all_tokens.append(self.tokens[i])
            i += 1

        print("Tokens combined")

        self.tokens = all_tokens

    def save_to_file(self, filename):
        """Write the tokens to ``'preprocessed-' + filename``, 20 per line.

        The last line may hold fewer than 20 tokens; it is written too, so
        no token is dropped.
        """
        tokens_per_line = 20

        with open('preprocessed-' + filename, 'w') as filepointer:
            for line_no, start in enumerate(
                    range(0, len(self.tokens), tokens_per_line), 1):
                chunk = self.tokens[start:start + tokens_per_line]
                filepointer.write('%s\n' % ' '.join(chunk))

                # 500 lines of 20 tokens: report about every 10000 tokens.
                if line_no % 500 == 0:
                    sys.stdout.write("\rWriting to preprocessed input file")
                    sys.stdout.flush()

        print("\rPreprocessed input file written")

    def __getitem__(self, i):
        return self.tokens[i]

    def __len__(self):
        return len(self.tokens)

    def __iter__(self):
        return iter(self.tokens)

class Word:
    """A vocabulary entry: the word string and its occurrence count."""

    def __init__(self, word):
        self.word = word
        # Incremented by the vocabulary builder for every occurrence.
        self.count = 0

class Vocabulary:
    """Words of a corpus sorted by descending count, with rare words merged.

    Words seen fewer than ``min_count`` times are folded into a single
    ``'{rare}'`` entry whose count is the sum of theirs.
    """

    def __init__(self, corpus, min_count):
        """Build the vocabulary from an iterable of tokens.

        Args:
            corpus: iterable yielding token strings.
            min_count: words with a smaller count are merged into '{rare}'.
        """
        self.words = []
        self.word_map = {}
        # Kept on the instance so filter_for_rare_and_common() does not have
        # to rely on a module-level ``min_count`` being defined.
        self.min_count = min_count
        self.build_words(corpus, min_count)

        self.filter_for_rare_and_common()

    def build_words(self, corpus, min_count):
        """Count every token of ``corpus`` and fill ``words`` / ``word_map``.

        ``min_count`` is accepted for interface compatibility; it is not
        used by this method.
        """
        words = []
        word_map = {}

        i = 0
        for token in corpus:
            if token not in word_map:
                word_map[token] = len(words)
                words.append(Word(token))
            words[word_map[token]].count += 1

            i += 1
            if i % 10000 == 0:
                sys.stdout.write("\rBuilding vocabulary: %d" % len(words))
                sys.stdout.flush()

        print("\rVocabulary built: %d" % len(words))

        self.words = words
        self.word_map = word_map  # Mapping from each token to its index in vocab

    def __getitem__(self, i):
        return self.words[i]

    def __len__(self):
        return len(self.words)

    def __iter__(self):
        return iter(self.words)

    def __contains__(self, key):
        return key in self.word_map

    def indices(self, tokens):
        """Map tokens to vocabulary indices; unknown tokens map to '{rare}'."""
        rare_index = self.word_map['{rare}']
        return [self.word_map.get(token, rare_index) for token in tokens]

    def filter_for_rare_and_common(self):
        """Merge rare words into '{rare}' and sort by descending count."""
        rare = Word('{rare}')
        kept = [rare]

        for token in self.words:
            if token.count < self.min_count:
                rare.count += token.count
            else:
                kept.append(token)

        kept.sort(key=lambda token: token.count, reverse=True)

        # Rebuild word_map so that it matches the new ordering.
        self.words = kept
        self.word_map = {token.word: i for i, token in enumerate(kept)}

class TableForNegativeSamples:
    """Lookup table for drawing negative samples proportional to count**power.

    The table holds ``table_size`` vocabulary indices; index ``j`` fills a
    share of the slots equal to ``count_j**power / sum(count**power)``, so a
    uniform draw from the table follows that distribution.
    """

    def __init__(self, vocab, power=0.75, table_size=int(1e6)):
        """Build the table.

        Args:
            vocab: iterable of objects exposing a numeric ``count`` attribute.
            power: exponent applied to each count (default 0.75).
            table_size: number of slots in the table (default 1e6).
        """
        # Normalizing constant
        norm = sum([math.pow(t.count, power) for t in vocab])

        table = np.zeros(table_size, dtype=np.uint32)

        p = 0  # Cumulative probability
        i = 0
        for j, word in enumerate(vocab):
            p += float(math.pow(word.count, power)) / norm
            # Fill slots until the covered fraction reaches p.
            while i < table_size and float(i) / table_size < p:
                table[i] = j
                i += 1
        self.table = table

    def sample(self, count):
        """Return ``count`` vocabulary indices drawn uniformly from the table."""
        indices = np.random.randint(low=0, high=len(self.table), size=count)
        return [self.table[i] for i in indices]

def sigmoid(z):
    """Return the logistic function of ``z``, saturated outside [-6, 6].

    Values above 6 return exactly 1.0 and values below -6 return exactly 0.0;
    inside the interval the result is ``1 / (1 + exp(-z))``.
    """
    if z > 6:
        return 1.0
    if z < -6:
        return 0.0
    return 1 / (1 + math.exp(-z))

def save(vocab, nn0, filename):
    """Write one ``word v1 v2 ...`` line per vocabulary entry to ``filename``.

    Args:
        vocab: iterable of objects exposing a ``word`` string attribute.
        nn0: iterable of vectors, paired with ``vocab`` entries in order.
        filename: path of the output file (overwritten).
    """
    with open(filename, 'w') as file_pointer:
        for token, vector in zip(vocab, nn0):
            # Spaces inside a word would break the space-separated format.
            word = token.word.replace(' ', '_')
            vector_str = ' '.join([str(s) for s in vector])
            file_pointer.write('%s %s\n' % (word, vector_str))

# Script entry point: preprocess each input file, train skip-gram embeddings
# with negative sampling and save the input-side vectors.
if __name__ == '__main__':

    for input_filename in ['in.txt']:
    #for input_filename in ['news-2012-phrases-10000.txt']:

        # Number of negative examples
        k_negative_sampling = 5

        # Min count for words to be used in the model, else {rare}
        min_count = 3

        # Number of word phrase passes
        word_phrase_passes = 3 # 3

        # min count for word phrase formula
        word_phrase_delta = 3 # 5

        # Threshold for word phrase creation
        word_phrase_threshold = 1e-4

        # Read the corpus
        corpus = Corpus(input_filename, word_phrase_passes, word_phrase_delta,
                        word_phrase_threshold, 'phrases-%s' % input_filename)

        # Build the vocabulary and the negative-sampling table from the corpus
        vocab = Vocabulary(corpus, min_count)
        table = TableForNegativeSamples(vocab)

        # Max window length
        for window in [5]: # 5 for large set

            # Dimensionality of word embeddings
            for dim in [100]: # 100

                print("Training: %s-%d-%d-%d" % (input_filename, window, dim, word_phrase_passes))

                # Initialize network: nn0 holds the vectors that get saved,
                # nn1 holds the output-side weights.
                nn0 = np.random.uniform(low=-0.5/dim, high=0.5/dim, size=(len(vocab), dim))
                nn1 = np.zeros(shape=(len(vocab), dim))

                # Initial learning rate
                initial_alpha = 0.01 # 0.01

                # Modified in loop
                global_word_count = 0
                alpha = initial_alpha
                word_count = 0
                last_word_count = 0

                tokens = vocab.indices(corpus)

                for token_idx, token in enumerate(tokens):
                    if word_count % 10000 == 0:
                        global_word_count += (word_count - last_word_count)
                        last_word_count = word_count

                        # Learning-rate decay is disabled; alpha stays at
                        # initial_alpha for the whole run.
                        # Recalculate alpha
                        # alpha = initial_alpha * (1 - float(global_word_count) / len(corpus))
                        # if alpha < initial_alpha * 0.0001:
                        #     alpha = initial_alpha * 0.0001

                        sys.stdout.write("\rTraining: %d of %d" % (global_word_count, len(corpus)))
                        sys.stdout.flush()

                    # Randomize window size, where window is the max window size
                    current_window = np.random.randint(low=1, high=window+1)
                    context_start = max(token_idx - current_window, 0)
                    context_end = min(token_idx + current_window + 1, len(tokens))
                    context = tokens[context_start:token_idx] + tokens[token_idx+1:context_end]

                    for context_word in context:
                        # Init neu1e with zeros
                        neu1e = np.zeros(dim)
                        # One positive example (the current token) plus k negatives.
                        classifiers = [(token, 1)] + [(target, 0) for target in table.sample(k_negative_sampling)]
                        for target, label in classifiers:
                            z = np.dot(nn0[context_word], nn1[target])
                            p = sigmoid(z)
                            g = alpha * (label - p)
                            neu1e += g * nn1[target]              # Error to backpropagate to nn0
                            nn1[target] += g * nn0[context_word]  # Update nn1

                        # Update nn0
                        nn0[context_word] += neu1e

                    word_count += 1

                global_word_count += (word_count - last_word_count)
                print("\rTraining finished: %d" % global_word_count)

                # Save model to file
                save(vocab, nn0, 'output-%s-%d-%d-%d' % (input_filename, window, dim, word_phrase_passes))

基於tensorflow版本的實現

import time
import numpy as np
import tensorflow as tf
import random
from collections import Counter

主要包括以下四個部分的代碼：

數據預處理：替換文本中特殊符號並去除低頻詞；對文本分詞；構建語料；單詞映射表

訓練樣本構建

模型構建

模型驗證

首先加載數據

# Load the whole training text into memory.
with open('text8') as f:
    text = f.read()

定義函數來完成數據的預處理

def preprocess(text, freq=5):
    """Lower-case ``text``, tokenise punctuation and drop low-frequency words.

    Punctuation marks are replaced by placeholder tokens such as
    ``<PERIOD>`` before the text is split on whitespace.

    Args:
        text: the raw text.
        freq: frequency threshold; only words occurring more than ``freq``
            times are kept.

    Returns:
        The list of remaining tokens, in their original order.
    """
    # Replace punctuation with placeholder tokens surrounded by spaces.
    replacements = (
        ('.', ' <PERIOD> '),
        (',', ' <COMMA> '),
        ('"', ' <QUOTATION_MARK> '),
        (';', ' <SEMICOLON> '),
        ('!', ' <EXCLAMATION_MARK> '),
        ('?', ' <QUESTION_MARK> '),
        ('(', ' <LEFT_PAREN> '),
        (')', ' <RIGHT_PAREN> '),
        ('--', ' <HYPHENS> '),
        # ('\n', ' <NEW_LINE> '),
        (':', ' <COLON> '),
    )
    text = text.lower()
    for symbol, placeholder in replacements:
        text = text.replace(symbol, placeholder)
    words = text.split()

    # Drop low-frequency words to reduce noise.
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] > freq]

    return trimmed_words

清洗文本並分詞

# Clean the text and split it into tokens.
words = preprocess(text)
print(words[:20])

# Build the lookup tables between words and integer ids.
# enumerate() pairs each element of an iterable with a running index; both
# tables below are built from the same enumeration so they are exact inverses.
vocab = set(words)
vocab_to_int = {w: c for c, w in enumerate(vocab)}
int_to_vocab = {c: w for w, c in vocab_to_int.items()}

print("total words: {}".format(len(words)))
print("unique words: {}".format(len(vocab)))

整個文本中單詞大約為1660萬規模，詞典大小為6萬左右

訓練樣本構建

skip-gram中，訓練樣本的形式是(input word, output word)，其中output word是input word的上下文。

為了減少模型噪音並加速訓練速度，我們在構造batch之前要對樣本進行采樣，剔除停用詞等噪音因素。

采樣：對樣本進行抽樣，剔除高頻的停用詞來減少模型的噪音，並加速訓練。

對原文本進行vocab到int的轉換

# Convert the text from words to integer ids.
int_words = [vocab_to_int[w] for w in words]

t = 1e-5  # t value of the drop-probability formula
threshold = 0.8  # words whose drop probability reaches this are removed

# Count how often each word occurs.
int_word_counts = Counter(int_words)
total_count = len(int_words)

# Relative frequency of each word.
word_freqs = {w: c/total_count for w, c in int_word_counts.items()}

# Probability of dropping each word: 1 - sqrt(t / frequency).
prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts}

# Subsample: keep only the words whose drop probability is below the threshold.
train_words = [w for w in int_words if prob_drop[w] < threshold]

print(len(train_words))

構建batch

Skip-Gram模型是通過輸入詞來預測上下文。

對於一個給定詞，離它越近的詞可能與它越相關，離它越遠的詞越不相關，這裡我們設置窗口大小為5，對於每個訓練單詞，我們還會在[1, 5]之間（含兩端）隨機生成一個整數R，

用R作為我們最終選擇output word的窗口大小。這裡之所以多加了一步隨機數的窗口重新選擇步驟，是為了能夠讓模型更聚焦於當前input word的鄰近詞。

def get_targets(words, idx, window_size=5):
    """Return the distinct context words around ``words[idx]``.

    A window radius is drawn uniformly from ``1..window_size`` (inclusive)
    and the words within that radius on both sides are collected.

    Args:
        words: list of words.
        idx: index of the input word.
        window_size: maximum window radius.

    Returns:
        A list of the distinct context words; duplicates are removed and the
        order is that of a ``set``.
    """
    target_window = np.random.randint(1, window_size + 1)
    # Clamp at 0 when there are not enough words before the input word.
    start_point = max(idx - target_window, 0)
    end_point = idx + target_window
    # Output words: the context on both sides, excluding the input word.
    targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
    return list(targets)

def get_batches(words, batch_size, window_size=5):
    """Yield ``(x, y)`` training batches built from ``words``.

    Only full batches are produced; trailing words that do not fill a batch
    are ignored. Within each batch every input word is repeated once per
    context word, so ``x`` and ``y`` always have equal length.

    Args:
        words: list of words.
        batch_size: number of input words per batch.
        window_size: maximum window radius passed to ``get_targets``.
    """
    n_batches = len(words) // batch_size

    # Keep full batches only.
    words = words[:n_batches * batch_size]

    for start in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[start:start + batch_size]
        for i, batch_x in enumerate(batch):
            batch_y = get_targets(batch, i, window_size)
            # One input word maps to several output words, so repeat it to
            # keep both lists the same length.
            x.extend([batch_x] * len(batch_y))
            y.extend(batch_y)
        yield x, y

構建網絡

該部分包括：輸入層，嵌入，負采樣

# Input layer: word ids in, context-word ids out.
train_graph = tf.Graph()
with train_graph.as_default():
    inputs = tf.placeholder(tf.int32, shape=[None], name='inputs')
    labels = tf.placeholder(tf.int32, shape=[None, None], name='labels')

# Embedding
# The embedding matrix has shape vocab_size x embedding_size.
vocab_size = len(int_to_vocab)
embedding_size = 200  # embedding dimensionality

with train_graph.as_default():
    # Embedding-layer weight matrix, initialised uniformly in [-1, 1).
    embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1, 1))
    # tf.nn.embedding_lookup(tensor, ids) selects the rows of ``tensor``
    # at the given indices.
    embed = tf.nn.embedding_lookup(embedding, inputs)

# Negative sampling: mainly used to avoid the cost of a full softmax update.
# tf.nn.sampled_softmax_loss computes the loss on a sampled subset of the
# softmax classes; the result is generally lower than the full softmax loss.
n_sampled = 100

with train_graph.as_default():
    softmax_w = tf.Variable(tf.truncated_normal([vocab_size, embedding_size], stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(vocab_size))

    # Loss under negative sampling
    loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, labels, embed, n_sampled, vocab_size)

    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)

模型驗證

with train_graph.as_default():
    # Pick some word ids at random to inspect during training.
    valid_size = 16
    valid_window = 100
    # Half of the ids come from [0, valid_window), the other half from
    # [1000, 1000 + valid_window).
    valid_examples = np.array(random.sample(range(valid_window), valid_size // 2))
    valid_examples = np.append(
        valid_examples,
        random.sample(range(1000, 1000 + valid_window), valid_size // 2))

    valid_size = len(valid_examples)
    # Validation word ids as a graph constant
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # L2 norm of every embedding row, used to scale rows to unit length
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embedding = embedding / norm
    # Unit-length vectors of the validation words
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
    # Cosine similarity between each validation word and every word
    similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))

import os

epochs = 10  # number of training epochs
batch_size = 1000  # batch size
window_size = 10  # maximum window radius

with train_graph.as_default():
    saver = tf.train.Saver()  # checkpoint writer

with tf.Session(graph=train_graph) as sess:
    iteration = 1
    # Running sum of batch losses. NOTE: this rebinds the module-level name
    # ``loss``; ``cost`` was already built from the loss tensor.
    loss = 0
    sess.run(tf.global_variables_initializer())

    for e in range(1, epochs + 1):
        batches = get_batches(train_words, batch_size, window_size)
        start = time.time()

        for x, y in batches:
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)

            loss += train_loss

            if iteration % 100 == 0:
                end = time.time()
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(loss / 100),
                      "{:.4f} sec/batch".format((end - start) / 100))
                loss = 0
                start = time.time()

            # Print the nearest neighbours of the validation words.
            if iteration % 1000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = int_to_vocab[valid_examples[i]]
                    top_k = 8  # number of nearest words to show
                    # Skip position 0 of the ranking, then take the next top_k.
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log = 'Nearest to [%s]:' % valid_word
                    for k in range(top_k):
                        close_word = int_to_vocab[nearest[k]]
                        log = '%s %s,' % (log, close_word)
                    print(log)

            iteration += 1

    # Both calls need the open session, so they stay inside the ``with``.
    # saver.save() raises when the parent directory is missing; create it.
    os.makedirs("checkpoints", exist_ok=True)
    save_path = saver.save(sess, "checkpoints/text8.ckpt")
    embed_mat = sess.run(normalized_embedding)

%matplotlib inline

%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Project the first ``viz_words`` embeddings with t-SNE and plot them,
# labelling each point with its word.
viz_words = 500
tsne = TSNE()
embed_tsne = tsne.fit_transform(embed_mat[:viz_words, :])

fig, ax = plt.subplots(figsize=(14, 14))
# Iterate over the rows actually produced: the vocabulary may hold fewer
# than ``viz_words`` words.
for idx in range(len(embed_tsne)):
    plt.scatter(*embed_tsne[idx, :], color='steelblue')
    plt.annotate(int_to_vocab[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 PyTorch基礎——詞向量（Word Vector）技術 Efficient Estimation of Word Representations in Vector Space（word2vec）利用jQuery-Word-Export導出word (含ECharts) VS Code word wrap 不好使 c# 將頁面導出到word（含圖片及控件）一天一經典Efficient Estimation of Word Representations in Vector Space vector容器(一) vector的用法 vector數組 Vector與KeyPoint