Python Word2Vec訓練和測試詞向量


train_word2vec_model.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Train a skip-gram Word2Vec model and save it in two formats.

USAGE: %(program)s CORPUS MODEL_OUT VECTORS_OUT

    CORPUS       input text file, read with gensim's LineSentence
    MODEL_OUT    path the full gensim model is saved to
    VECTORS_OUT  path the vectors are saved to in word2vec text format
"""

import logging
import multiprocessing
import os
import sys

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # Check and process input arguments. The usage text comes from the module
    # docstring above; without that docstring `__doc__` is None and formatting
    # it would raise TypeError instead of printing the usage message.
    if len(sys.argv) < 4:
        print(__doc__ % {'program': program})
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    # window: usually around 10 for skip-gram, around 5 for CBOW.
    # sg=1 selects skip-gram.
    # hs: if 1, hierarchical softmax is used; if 0 (the default), negative
    #     sampling is used.
    # NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; this call
    # targets the older API -- confirm the installed gensim version.
    model = Word2Vec(
        LineSentence(inp),
        size=800,
        window=10,
        min_count=5,
        sg=1,
        hs=1,
        workers=multiprocessing.cpu_count(),
    )

    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)

    # Save the full model, then the vectors alone in plain-text format.
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)

執行 "python train_word2vec_model.py v6_EN.txt v6_EN.model v6_EN.vector"即可訓練詞向量

train_word2vec_model.py為訓練詞向量的程序代碼,v6_EN.txt是我訓練的語料庫的名稱,v6_EN.model為我訓練出來的詞向量模型名稱,v6_EN.vector為格式化保存詞向量模型的文件(一般用不到,但執行語句必須包含這一項)

訓練出的文件有五個:

訓練生成的文件中,有3個.npy文件在load詞向量模型時都必須和v6_EN.model放在同一文件夾下

In [1]: import gensim
 
In [2]: model = gensim.models.Word2Vec.load("v6_EN.model")
 
In [3]: result = model.most_similar("足球")
 
In [4]: for e in result:
    print e[0], e[1]
   ....:     
聯賽 0.65538161993
甲級 0.653042972088
籃球 0.596754670143
俱樂部 0.587228953838
乙級 0.58406317234
足球隊 0.556015253067
亞足聯 0.530800580978
allsvenskan 0.52497625351
代表隊 0.521494746208
甲組 0.51778960228

 test.py:

import gensim
import numpy as np
import xlwt

# Evaluate an English -> French mapping: each English test word's vector is
# multiplied by the matrix Thta and the five nearest French words are compared
# with the dictionary translation. Results go to an .xls sheet.
model_EN = gensim.models.Word2Vec.load("../v6_EN_SG/v6_EN_SG_800.model")
model_FR = gensim.models.Word2Vec.load("../v6_FR_SG/v6_FR_SG.model")

workbook = xlwt.Workbook(encoding = 'utf-8')
worksheet = workbook.add_sheet('Result')

Thta = np.load("GT/ThtaEN-FR/Thta0.07/ThtaEN-FR0.07_7000.npy")
test = np.load("GT/test1000EN-FR.npy")

# Font/style shared by the header row and the final accuracy label.
font1 = xlwt.Font()
font1.height = 0x00E8
font1.name = '宋體'
style1 = xlwt.XFStyle()
style1.font = font1

# Header row: (caption, column width) per column, written left to right.
header_columns = (
    ('英文測試單詞', 3333),
    ('預測的法語譯文', 4000),
    ('詞典給出的法語譯文', 4400),
    ('對錯', 4400),
)
for col_idx, (caption, col_width) in enumerate(header_columns):
    worksheet.write(0, col_idx, label = caption, style = style1)
    worksheet.col(col_idx).width = col_width

true_Word = 0.0
for num in range(1000):
    word_EN = test[num][0]
    word_FR = test[num][1]

    # The English vector is viewed as a (1, 800) row, multiplied by Thta, and
    # the product flattened to 200 values before the French lookup.
    vec_Test = model_EN.wv[word_EN]
    vec_Test.shape = (1, 800)
    b = np.dot(vec_Test, Thta)
    b.shape = (200,)

    e = model_FR.wv.similar_by_vector(b, topn=5, restrict_vocab=None)
    print(e[0][0])

    row = num + 1  # row 0 holds the header
    worksheet.write(row, 0, label = word_EN)
    worksheet.write(row, 1, label = [e[k][0]+'  ' for k in range(5)])
    worksheet.write(row, 2, label = word_FR)

    # Mark the word correct if any of the five candidates matches.
    for i in range(5):
        if e[i][0] == word_FR:
            worksheet.write(row, 3, label = '✔️')
            true_Word += 1
            break
    else:
        worksheet.write(row, 3, label = '×')

    print('測試完成%d個單詞'%(num+1))

# After the loop `num` is the last index; bump it to the number of words tested.
num += 1

accuracy = str(true_Word/num*100)+'%'
worksheet.write(num+1, 0, label = '正確率', style = style1)
worksheet.write(num+1, 1, label = accuracy)
print(accuracy)

workbook.save('GT/test/testEN-FR/Thta0.07/EN-FR0.07@5_7000.xls')

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM