Common APIs
- gensim.models.Word2Vec(sentences, min_count, workers)
- gensim.models.word2vec.Word2Vec(sentences, min_count, workers)
Both dotted paths refer to the same class.
Word2Vec parameters
- sentences: the training corpus; must be an iterable of tokenized sentences
- min_count: minimum word frequency; words occurring fewer times than this are ignored
- max_vocab_size: cap on the vocabulary size while building it, to keep memory use bounded
- size: dimensionality of the word vectors
- alpha: initial learning rate; it decays linearly towards min_alpha as training progresses
- min_alpha: the learning rate's lower bound
- window: size of the sliding context window
- sg: training algorithm (0: CBOW; 1: skip-gram)
- hs: selects between Word2Vec's two training objectives: if 1, hierarchical softmax is used; if 0 and the negative-sampling count negative is greater than 0, negative sampling is used. The default is 0, i.e. negative sampling.
- iter: number of iterations (epochs) over the corpus
- workers: number of worker threads used for training (a constructor sketch follows this list)
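A minimal constructor sketch with these parameters, assuming gensim 3.x names (gensim 4.0 renamed size to vector_size and iter to epochs):

from gensim.models import Word2Vec

# a toy in-memory corpus: an iterable of tokenized sentences
sentences = [['hello', 'world'], ['hello', 'gensim']]

# skip-gram (sg=1) with hierarchical softmax (hs=1), 64-dimensional vectors
model = Word2Vec(sentences, size=64, window=3, min_count=1,
                 sg=1, hs=1, iter=5, workers=2)
print(model.wv['hello'].shape)  # -> (64,)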
Loading a corpus
Building it yourself
For example: sentences = [['ab', 'ba'], ['sheu', 'dhudhi', 'hdush'], ..., []]
Loading a single-file corpus
Use the LineSentence() function; the file must already be tokenized, with one sentence per line and tokens separated by whitespace.
Loading all corpus files under a directory
Use the PathLineSentence() function; the files must likewise already be tokenized. A short sketch of both loaders follows.
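In the sketch below, corpus.txt and corpus_dir are placeholder paths to pre-tokenized text:

from gensim.models import word2vec

# a single file: one sentence per line, tokens separated by whitespace
single = word2vec.LineSentence('corpus.txt')

# all files under a directory (available in newer gensim releases),
# read in alphabetical order by file name
combined = word2vec.PathLineSentence('corpus_dir')

model = word2vec.Word2Vec(single, min_count=1, size=64)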
Defining your own
class MySentence:
    def __init__(self, data_path, max_line=None):
        self.data_path = data_path
        self.max_line = max_line  # optional cap on the number of lines read

    def __iter__(self):
        # use a local counter so the corpus can be iterated more than once
        # (Word2Vec scans it once for the vocabulary and once per training epoch)
        cur_line = 0
        with open(self.data_path, 'r', encoding='utf-8') as f:
            for line in f:
                if self.max_line is not None and cur_line >= self.max_line:
                    return
                cur_line += 1
                yield line.strip('\n').split()
The code above defines a MySentence class whose instances are iterable objects, so they can be passed directly to Word2Vec() as the sentences argument.
Training a model
from gensim.models import word2vec

ms = MySentence(data_path)  # data_path: path to the pre-tokenized corpus
model = word2vec.Word2Vec(ms, hs=1, min_count=1, window=3, size=64)
To continue training on top of a previously trained model:
First load the model:
model = word2vec.Word2Vec.load(model_path)
Then continue training; since gensim 1.0, train() requires the example count and epoch count explicitly:
model.train(other_sentences, total_examples=model.corpus_count, epochs=model.epochs)
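If other_sentences contains previously unseen words, the vocabulary has to be updated before training. A fuller sketch under that assumption (model.epochs holds the epoch count in gensim 3.x; older versions stored it as model.iter):

from gensim.models import word2vec

model = word2vec.Word2Vec.load(model_path)

# merge any new words into the existing vocabulary; this also
# refreshes model.corpus_count to reflect other_sentences
model.build_vocab(other_sentences, update=True)

model.train(other_sentences,
            total_examples=model.corpus_count,
            epochs=model.epochs)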
Saving a model
- model.save(model_name): the saved model can be reloaded and trained further
- model.wv.save_word2vec_format(model_name): saves only the word vectors, so further training is not possible (in gensim 1.0+ this method lives on model.wv; see the sketch below)
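A round-trip sketch of the two options; the file names are placeholders:

# full model: reloadable with Word2Vec.load() and trainable further
model.save('people_wv.model')

# vectors only, in the C word2vec text format (binary=True for binary);
# reloadable with KeyedVectors.load_word2vec_format(), not trainable
model.wv.save_word2vec_format('people_wv.txt', binary=False)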
Loading a model
Method 1: reload a full model saved with save():
model = word2vec.Word2Vec.load(model_path)
Method 2: load vectors saved in the C word2vec format:
model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=False) # C text format
model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True) # C binary format
Getting a word vector
word_vec = model.wv[word]
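Beyond single lookups, the wv attribute also answers similarity queries; a brief sketch with placeholder words (they must be in the trained vocabulary):

vec = model.wv['hello']                        # the raw vector (a numpy array)
print(model.wv.most_similar('hello', topn=5))  # nearest neighbours by cosine
print(model.wv.similarity('hello', 'world'))   # cosine similarity of two words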
Example
The complete example below segments the novel In the Name of the People with jieba, trains word vectors on it, and reloads them.
import jieba
from gensim.models import word2vec

def cut_words():
    # register the novel's character names so jieba keeps each one as a single token
    names = ['沙瑞金', '田國富', '高育良', '侯亮平', '鍾小艾', '陳岩石', '歐陽菁',
             '易學習', '王大路', '蔡成功', '孫連城', '季昌明', '丁義珍', '鄭西坡',
             '趙東來', '高小琴', '趙瑞龍', '林華華', '陸亦可', '劉新建', '劉慶祝']
    for name in names:
        jieba.suggest_freq(name, True)
    with open('./in_the_name_of_people.txt', 'r', encoding='utf-8') as f:
        document = f.read()
    # segment the novel and write it back out with tokens separated by spaces
    result = ' '.join(jieba.cut(document))
    with open('./in_the_name_of_people_segment.txt', 'w', encoding='utf-8') as f2:
        f2.write(result)
    print('ok')
class MySentence:
    def __init__(self, data_path, max_line=None):
        self.data_path = data_path
        self.max_line = max_line

    def __iter__(self):
        # use a local counter so the corpus can be iterated multiple times
        cur_line = 0
        with open(self.data_path, 'r', encoding='utf-8') as f:
            for line in f:
                if self.max_line is not None and cur_line >= self.max_line:
                    return
                cur_line += 1
                yield line.strip('\n').split()
def word_embedding():
    ms = MySentence('./in_the_name_of_people_segment.txt')
    # hierarchical softmax (hs=1), 64-dimensional vectors, context window of 3
    model = word2vec.Word2Vec(ms, hs=1, min_count=1, window=3, size=64)
    model.save('./name_of_people_wv.model')
    print('ok')
def load_model():
    model = word2vec.Word2Vec.load('./name_of_people_wv.model')
    words = ['侯亮平', '驀地', '睜開眼睛', '。', '大廳', '突起', '一陣', '騷動',
             '許多', '人', '擁向', '不同', '的', '登機口']
    for word in words:
        print(model.wv[word])
if __name__ == "__main__":
word_embedding()
ms = MySentence('./in_the_name_of_people_segment.txt')
load_model()
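Once trained, the model also answers similarity queries; a brief sketch (the neighbours returned will vary from run to run):

model = word2vec.Word2Vec.load('./name_of_people_wv.model')
# the five tokens closest to 侯亮平 in the learned vector space
print(model.wv.most_similar('侯亮平', topn=5))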