文章來自於我的個人博客:python 分詞計算文檔TF-IDF值並排序
該程序實現的功能是:首先讀取一些文檔,然后通過jieba來分詞,將分詞存入文件,然后通過sklearn計算每一個分詞文檔中的tf-idf值,再將結果排序輸出到一個大文件里
依賴包:
sklearn
jieba
注:此程序參考了一位同行的程序后進行了改動
# -*- coding: utf-8 -*-
"""Tokenize a directory of documents with jieba, compute TF-IDF weights
with scikit-learn, and write one weight file per document.

Pipeline:
  1. getFileList - list the input documents
  2. fenci       - segment each document, save "<name>-seg.txt"
  3. Tfidf       - compute TF-IDF over all segmented files, one output per doc
  4. __main__    - run the pipeline, then shell-sort everything into sorted.txt

@author: jiangfuqiang
"""
import os
import re
import time

import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


def getFileList(path):
    """Return (filenames, path) for every non-hidden entry in *path*."""
    filelist = [f for f in os.listdir(path) if not f.startswith('.')]
    return filelist, path


def fenci(filename, path, segPath):
    """Segment one document with jieba and save the result.

    Reads ``path/filename``, cuts it in full mode, strips whitespace and
    punctuation-like tokens, then writes the space-joined tokens to
    ``segPath/filename-seg.txt``.
    """
    with open(os.path.join(path, filename), encoding='utf-8') as f:
        content = f.read()

    # Folder that holds the segmentation results.
    os.makedirs(segPath, exist_ok=True)

    # Full-mode segmentation.
    seg_list = jieba.cut(content, cut_all=True)

    # BUG FIX: the original pattern was the literal 'w+' (matches the
    # letter "w", so every token containing "w" was dropped); almost
    # certainly a garbled r'\W+'. We now drop tokens that contain any
    # non-word character.
    nonword = re.compile(r'\W+')
    drop = {'', ' ', '=', '[', ']', '(', ')'}
    result = []
    for seg in seg_list:
        # Collapse internal whitespace/newlines inside a token.
        seg = ''.join(seg.split())
        if seg not in drop and not nonword.search(seg):
            result.append(seg)

    # Space-separated tokens, one "-seg.txt" file per source document.
    with open(os.path.join(segPath, filename + "-seg.txt"), 'w',
              encoding='utf-8') as f:
        f.write(' '.join(result))


def Tfidf(filelist, sFilePath, path):
    """Compute TF-IDF over the segmented documents and save the weights.

    For document i, writes one "word weight" pair per line into
    ``sFilePath/0000i.txt`` (zero-padded index), covering the whole
    vocabulary.
    """
    corpus = []
    for ff in filelist:
        # BUG FIX: the original concatenated path + ff with no separator,
        # so every read targeted a nonexistent file.
        fname = os.path.join(path, ff + "-seg.txt")
        with open(fname, encoding='utf-8') as f:
            corpus.append(f.read())

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    try:
        word = vectorizer.get_feature_names_out()  # sklearn >= 1.0
    except AttributeError:
        word = vectorizer.get_feature_names()      # older sklearn
    weight = tfidf.toarray()  # rows: documents, columns: vocabulary

    os.makedirs(sFilePath, exist_ok=True)
    for i in range(len(weight)):
        outname = os.path.join(sFilePath, str(i).zfill(5) + ".txt")
        print('----------writing all the tf-idf in the', i,
              'file into', outname)
        with open(outname, 'w', encoding='utf-8') as f:
            for j in range(len(word)):
                # BUG FIX: one pair per line (was a single space-joined
                # line) so the final "sort -nrk 2" can actually sort by
                # the weight column.
                f.write(word[j] + " " + str(weight[i][j]) + "\n")


if __name__ == "__main__":
    # Folder for the TF-IDF results (timestamped so runs don't collide).
    sFilePath = "/home/lifeix/soft/allfile/tfidffile" + str(time.time())
    # Folder for the segmentation results.
    segPath = '/home/lifeix/soft/allfile/segfile'
    (allfile, path) = getFileList('/home/lifeix/soft/allkeyword')
    for ff in allfile:
        print("Using jieba on " + ff)
        fenci(ff, path, segPath)

    Tfidf(allfile, sFilePath, segPath)
    # Sort every per-document weight file by weight (column 2, numeric,
    # descending) into one big file.
    os.system("sort -nrk 2 " + sFilePath + "/*.txt >" + sFilePath
              + "/sorted.txt")