- Environment
Anaconda3 (Python 3.6), Windows 64-bit
- Purpose
Use jieba for word segmentation and keyword extraction (see the short keyword-extraction sketch below)
Use gensim's corpora, models, and similarities modules to build the dictionary and corpus, weight terms with the TF-IDF model, and run sparse-matrix similarity analysis
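The main script in the next section only exercises segmentation; for the keyword-extraction side, jieba ships an `analyse` module whose `extract_tags` ranks tokens by TF-IDF weight. A minimal sketch, assuming the same data directory as the main script (`topK=10` is an arbitrary choice):

    import jieba.analyse

    text = open("D:/workspace/PythonSdy/data/t1.txt", encoding='utf-8').read()
    # extract_tags returns the topK tokens ranked by TF-IDF weight;
    # withWeight=True also returns each token's weight
    for word, weight in jieba.analyse.extract_tags(text, topK=10, withWeight=True):
        print(word, weight)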
- Code

    # -*- coding: utf-8 -*-
    import jieba
    from gensim import corpora, models, similarities
    from collections import defaultdict

    # Define the data directory
    work_dir = "D:/workspace/PythonSdy/data"
    f1 = work_dir + "/t1.txt"
    f2 = work_dir + "/t2.txt"

    # Read the file contents
    c1 = open(f1, encoding='utf-8').read()
    c2 = open(f2, encoding='utf-8').read()

    # Segment each document with jieba
    data1 = jieba.cut(c1)
    data2 = jieba.cut(c2)

    # Collect the segmented tokens into space-separated strings
    data11 = ""
    for i in data1:
        data11 += i + " "
    data21 = ""
    for i in data2:
        data21 += i + " "

    doc1 = [data11, data21]
    # print(doc1)

    # Split each document back into a token list
    t1 = [[word for word in doc.split()] for doc in doc1]
    # print(t1)

    # Count token frequencies across both documents
    freq = defaultdict(int)
    for i in t1:
        for j in i:
            freq[j] += 1
    # print(freq)

    # Filter by frequency: keep only tokens that appear at least 3 times
    t2 = [[token for token in k if freq[token] >= 3] for k in t1]
    print(t2)

    # Build a gensim dictionary from the filtered corpus
    dic1 = corpora.Dictionary(t2)
    dic1.save(work_dir + "/yuliaoku.txt")

    # The document to compare against the corpus
    f3 = work_dir + "/t3.txt"
    c3 = open(f3, encoding='utf-8').read()

    # Segment it with jieba as well
    data3 = jieba.cut(c3)
    data31 = ""
    for i in data3:
        data31 += i + " "
    new_doc = data31
    print(new_doc)

    # doc2bow turns the document into a sparse bag-of-words vector
    new_vec = dic1.doc2bow(new_doc.split())

    # Convert each filtered token list to a bag-of-words vector to get the
    # new corpus, then train the TF-IDF model on it
    new_corpor = [dic1.doc2bow(t3) for t3 in t2]
    tfidf = models.TfidfModel(new_corpor)

    # Number of features = vocabulary size of the dictionary
    featurenum = len(dic1.token2id.keys())

    # SparseMatrixSimilarity builds a sparse similarity index over the TF-IDF corpus
    idx = similarities.SparseMatrixSimilarity(tfidf[new_corpor], num_features=featurenum)
    sims = idx[tfidf[new_vec]]
    print(sims)
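For reference, `doc2bow` converts a token list into a sparse vector of `(token_id, count)` pairs, with ids taken from the dictionary; this is why the script can compare t3.txt against the corpus in the same vector space. A tiny standalone illustration (the toy tokens are invented for this example):

    from gensim import corpora

    # Two already-segmented toy documents
    docs = [["cat", "sat", "cat"], ["dog", "sat"]]
    d = corpora.Dictionary(docs)
    print(d.token2id)                        # e.g. {'cat': 0, 'sat': 1, 'dog': 2}
    # Tokens not in the dictionary are silently dropped; counts are per token id
    print(d.doc2bow(["cat", "cat", "dog"]))  # e.g. [(0, 2), (2, 1)]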
- Results
From the output we can conclude that the query file t3.txt is closer in content to t2.txt than to t1.txt.
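`sims` holds one cosine similarity per corpus document, in corpus order (index 0 for t1.txt, index 1 for t2.txt). A small follow-on snippet to print the scores next to their file names; the `names` list is an assumption matching the script above:

    # Continues from the script above: pair each score with its source file
    names = ["t1.txt", "t2.txt"]  # assumed to match the corpus order built from t1/t2
    for name, score in sorted(zip(names, sims), key=lambda p: p[1], reverse=True):
        print(name, float(score))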