train_word2vec_model.py:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Train a Word2Vec model on a text corpus and save it in two formats.

Usage:
    python %(program)s <corpus> <model_out> <vectors_out>

Arguments:
    corpus       Path of the input corpus; it is read through gensim's
                 LineSentence.
    model_out    Path handed to ``model.save()`` (full gensim model).
    vectors_out  Path handed to ``model.wv.save_word2vec_format()``
                 (text format, ``binary=False``).
"""
# NOTE: the docstring above is printed as the usage message below via
# ``__doc__ % locals()``, so it must keep the ``%(program)s`` placeholder
# and must not contain any other bare percent sign.

import logging
import multiprocessing
import os
import sys

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # Check and process input arguments: three paths are required.
    if len(sys.argv) < 4:
        # Without a module docstring ``__doc__`` is None and this line would
        # raise TypeError instead of showing the usage text.
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    # window: usually around 10 for skip-gram and around 5 for CBOW.
    # hs: if 1, hierarchical softmax is used; if 0 (the default),
    #     negative sampling is used instead.
    # NOTE(review): ``size`` is the gensim 3.x keyword; gensim 4.x renamed it
    # to ``vector_size`` -- adjust if the installed version requires it.
    model = Word2Vec(
        LineSentence(inp),
        size=800,
        window=10,
        min_count=5,
        sg=1,
        hs=1,
        workers=multiprocessing.cpu_count(),
    )

    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)

    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
执行 "python train_word2vec_model.py v6_EN.txt v6_EN.model v6_EN.vector"即可训练词向量
其中:train_word2vec_model.py 是训练词向量的程序代码;v6_EN.txt 是训练所用语料库的文件名;v6_EN.model 是训练得到的词向量模型的文件名;v6_EN.vector 是以文本格式保存词向量的文件(一般用不到,但脚本要求同时给出这三个参数,因此执行命令中必须包含这一项)。
训练出的文件有五个:
中间3个.npy文件在load词向量模型时都必须和v6_EN.model放在同一文件夹下
In [1]: import gensim
In [2]: model = gensim.models.Word2Vec.load("v6_EN.model")
In [3]: result = model.most_similar("足球")
In [4]: for e in result:
    print e[0], e[1]
   ....:
联赛 0.65538161993
甲级 0.653042972088
篮球 0.596754670143
俱乐部 0.587228953838
乙级 0.58406317234
足球队 0.556015253067
亚足联 0.530800580978
allsvenskan 0.52497625351
代表队 0.521494746208
甲组 0.51778960228
test.py:
# Evaluate an EN->FR word translation matrix: each English test word's vector
# is multiplied by the matrix ``Thta``, the nearest French words to the result
# are looked up, and a hit is counted when the dictionary translation is among
# the top candidates. Per-word results and the accuracy go to an .xls sheet.
import gensim
import numpy as np
import xlwt

# Input and output locations (relative to the working directory).
EN_MODEL_PATH = "../v6_EN_SG/v6_EN_SG_800.model"
FR_MODEL_PATH = "../v6_FR_SG/v6_FR_SG.model"
THTA_PATH = "GT/ThtaEN-FR/Thta0.07/ThtaEN-FR0.07_7000.npy"
TEST_PAIRS_PATH = "GT/test1000EN-FR.npy"
RESULT_PATH = 'GT/test/testEN-FR/Thta0.07/EN-FR0.07@5_7000.xls'

MAX_TEST_PAIRS = 1000  # at most this many test pairs are evaluated
TOP_N = 5              # a prediction counts as correct within the top N

# (title, column width) of the four result columns, left to right.
HEADER_COLUMNS = (
    ('英文测试单词', 3333),
    ('预测的法语译文', 4000),
    ('词典给出的法语译文', 4400),
    ('对错', 4400),
)

model_EN = gensim.models.Word2Vec.load(EN_MODEL_PATH)
model_FR = gensim.models.Word2Vec.load(FR_MODEL_PATH)

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('Result')

Thta = np.load(THTA_PATH)
test = np.load(TEST_PAIRS_PATH)

# Font/style used for the header row and the final accuracy label.
font1 = xlwt.Font()
font1.height = 0x00E8
font1.name = '宋体'
style1 = xlwt.XFStyle()
style1.font = font1

for col, (title, width) in enumerate(HEADER_COLUMNS):
    worksheet.write(0, col, label=title, style=style1)
    worksheet.col(col).width = width

# Slicing keeps the original 1000-pair limit but no longer raises IndexError
# when the test file holds fewer pairs.
pairs = test[:MAX_TEST_PAIRS]
true_Word = 0

for row, pair in enumerate(pairs, start=1):
    word_EN, word_FR = pair[0], pair[1]

    # NOTE(review): a word missing from the English vocabulary still raises
    # KeyError here, exactly as before -- decide whether to skip such words.
    # reshape/ravel derive the dimensions from the data instead of hard-coding
    # (1, 800) and (200,), and leave the looked-up vector's shape untouched.
    vec_Test = model_EN.wv[word_EN].reshape(1, -1)
    b = np.dot(vec_Test, Thta).ravel()

    e = model_FR.wv.similar_by_vector(b, topn=TOP_N, restrict_vocab=None)
    candidates = [item[0] for item in e[:TOP_N]]
    print(e[0][0])

    worksheet.write(row, 0, label=word_EN)
    # One plain string (each candidate followed by a space) instead of a list.
    worksheet.write(row, 1, label=''.join(word + ' ' for word in candidates))
    worksheet.write(row, 2, label=word_FR)

    if any(word == word_FR for word in candidates):
        worksheet.write(row, 3, label='✔️')
        true_Word += 1
    else:
        worksheet.write(row, 3, label='×')

    print('测试完成%d个单词' % row)

# Summary row goes directly below the last result row.
total = len(pairs)
accuracy = float(true_Word) / total * 100 if total else 0.0
accuracy_text = str(accuracy) + '%'

worksheet.write(total + 1, 0, label='正确率', style=style1)
worksheet.write(total + 1, 1, label=accuracy_text)
print(accuracy_text)

workbook.save(RESULT_PATH)