Python Word2Vec训练和测试词向量


train_word2vec_model.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Train a Word2Vec model from a text corpus and save it in two formats.

Usage:
    python train_word2vec_model.py <corpus.txt> <output.model> <output.vector>

The corpus is read with gensim's LineSentence. The trained model is saved
with ``model.save`` (native gensim format) and its vectors are additionally
exported with ``save_word2vec_format`` as plain text.
"""

import logging
import multiprocessing
import os
import sys

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Shown when too few command-line arguments are given.
USAGE = "usage: python %s <corpus.txt> <output.model> <output.vector>"

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # Check and process input arguments.
    # The previous code printed globals()['__doc__'] % locals(); with no
    # module docstring that is None % dict and raised TypeError instead of
    # showing the usage text.
    if len(sys.argv) < 4:
        print(USAGE % program)
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    # window: usually around 10 for skip-gram, around 5 for CBOW.
    # sg=1 selects skip-gram.
    # hs: 1 enables hierarchical softmax; 0 (the default) uses negative
    #     sampling instead.
    # NOTE: in gensim >= 4.0 the `size` argument is named `vector_size`.
    model = Word2Vec(LineSentence(inp), size=800, window=10, min_count=5,
                     sg=1, hs=1, workers=multiprocessing.cpu_count())

    # Optional: trim unneeded model memory (uses much less RAM).
    # model.init_sims(replace=True)

    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)

执行 "python train_word2vec_model.py v6_EN.txt v6_EN.model v6_EN.vector"即可训练词向量

train_word2vec_model.py为训练词向量的程序代码,v6_EN.txt是我训练的语料库的名称,v6_EN.model为我训练出来的词向量模型名称,v6_EN.vector为格式化保存词向量模型的文件(一般用不到,但执行语句必须包含这一项)

训练后共生成五个文件:v6_EN.model、v6_EN.vector,以及 gensim 保存模型时自动拆分出的3个 .npy 文件(原文此处的文件列表截图已缺失)。

这3个 .npy 文件在 load 词向量模型时都必须和 v6_EN.model 放在同一文件夹下

In [1]: import gensim
 
In [2]: model = gensim.models.Word2Vec.load("v6_EN.model")
 
In [3]: result = model.most_similar("足球")
 
In [4]: for e in result:
    print e[0], e[1]
   ....:     
联赛 0.65538161993
甲级 0.653042972088
篮球 0.596754670143
俱乐部 0.587228953838
乙级 0.58406317234
足球队 0.556015253067
亚足联 0.530800580978
allsvenskan 0.52497625351
代表队 0.521494746208
甲组 0.51778960228

 test.py:

"""Evaluate an English->French word translation matrix (precision@5).

Each English test word's vector is projected with the matrix ``Thta`` and the
TOP_N nearest French words are looked up. A prediction counts as correct when
the dictionary translation is among them. Per-word results and the final
accuracy are written to an .xls workbook.
"""

import os

import gensim
import numpy as np
import xlwt

# Maximum number of test pairs to evaluate, and how many nearest neighbours
# are accepted as candidate translations.
NUM_TEST_WORDS = 1000
TOP_N = 5
OUTPUT_PATH = 'GT/test/testEN-FR/Thta0.07/EN-FR0.07@5_7000.xls'

model_EN = gensim.models.Word2Vec.load("../v6_EN_SG/v6_EN_SG_800.model")
model_FR = gensim.models.Word2Vec.load("../v6_FR_SG/v6_FR_SG.model")

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('Result')

# Thta projects an English word vector into the French vector space;
# each row of `test` holds (English word, dictionary French translation).
Thta = np.load("GT/ThtaEN-FR/Thta0.07/ThtaEN-FR0.07_7000.npy")
test = np.load("GT/test1000EN-FR.npy")

font1 = xlwt.Font()
font1.height = 0x00E8
font1.name = '宋体'
style1 = xlwt.XFStyle()
style1.font = font1

# Header row: (column index, caption, column width).
for col, caption, width in (
    (0, '英文测试单词', 3333),
    (1, '预测的法语译文', 4000),
    (2, '词典给出的法语译文', 4400),
    (3, '对错', 4400),
):
    worksheet.write(0, col, label=caption, style=style1)
    worksheet.col(col).width = width

# Slicing (instead of a fixed `while num < 1000`) avoids an IndexError when
# the test set holds fewer than NUM_TEST_WORDS pairs.
pairs = test[:NUM_TEST_WORDS]
tested = len(pairs)
true_Word = 0

for row, pair in enumerate(pairs, start=1):
    word_EN = pair[0]
    word_FR = pair[1]

    # A 1-D vector times the matrix already yields a 1-D result, so no
    # hard-coded (1, 800) / (200,) reshapes are needed and the script works
    # for any embedding sizes that match Thta.
    projected = np.dot(model_EN.wv[word_EN], Thta)
    neighbours = model_FR.wv.similar_by_vector(
        projected, topn=TOP_N, restrict_vocab=None)
    candidates = [word for word, _similarity in neighbours]
    print(candidates[0])

    worksheet.write(row, 0, label=word_EN)
    # Write the candidates as one plain string (same visible text as before).
    worksheet.write(row, 1, label=''.join(c + '  ' for c in candidates))
    worksheet.write(row, 2, label=word_FR)

    if word_FR in candidates:
        worksheet.write(row, 3, label='✔️')
        true_Word += 1
    else:
        worksheet.write(row, 3, label='×')

    print('测试完成%d个单词' % row)

# Summary row placed one row below the last result row (a blank row is
# left in between, as in the original layout).
accuracy = true_Word / tested * 100 if tested else 0.0
worksheet.write(tested + 1, 0, label='正确率', style=style1)
worksheet.write(tested + 1, 1, label=str(accuracy) + '%')
print(str(accuracy) + '%')

# Make sure the output folder exists so the results are not lost at save time.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
workbook.save(OUTPUT_PATH)

 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM