1. Read in the text content
import os
import codecs
import numpy
import pandas

corpos = pandas.DataFrame(columns=['filePath', 'content'])
for root, dirs, files in os.walk(r'H:\19113117 - 副本'):
    for name in files:
        filePath = root + '\\' + name
        # read each file as UTF-8 and store one row per document
        f = codecs.open(filePath, 'r', 'utf-8')
        content = f.read()
        f.close()
        corpos.loc[len(corpos)] = [filePath, content.strip()]
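A quick sanity check (optional, not part of the original script) confirms how many files were picked up before moving on:

print(len(corpos))                 # number of documents read
print(corpos['filePath'].head())   # first few file paths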
2. Count word frequencies in the manually segmented texts
filePaths = []
segments = []
for filePath, content in corpos.itertuples(index=False):
    # the texts are already segmented by hand, with '/' between words
    for item in content.split('/'):
        segments.append(item)
        filePaths.append(filePath)
segmentDF = pandas.DataFrame({'filePath': filePaths, 'segments': segments})
# count how often each word occurs in each file; groupby().size() replaces
# the dict-style agg({'計數': numpy.size}), which newer pandas no longer accepts
segStat = segmentDF.groupby(['filePath', 'segments']).size().reset_index(name='計數')
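To make the split-and-count logic concrete, here is a minimal sketch on a made-up two-document corpus (the file names and words below are hypothetical, not from the actual data):

demo = pandas.DataFrame({
    'filePath': ['a.txt', 'b.txt'],
    'content': ['機器/學習/學習', '學習/演算法'],
})
rows = [(fp, seg) for fp, c in demo.itertuples(index=False) for seg in c.split('/')]
demoDF = pandas.DataFrame(rows, columns=['filePath', 'segments'])
print(demoDF.groupby(['filePath', 'segments']).size().reset_index(name='計數'))
# a.txt: 機器=1, 學習=2; b.txt: 學習=1, 演算法=1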
3. Compute the TF values
textVector = segStat.pivot_table(
    index='segments',
    values='計數',
    columns='filePath',
    fill_value=0)
# to_numpy() replaces the removed as_matrix(); compute the log-scaled tf only
# where a word actually occurs, since 1 + log(0) would give -inf
counts = textVector.to_numpy()
tF = numpy.zeros(counts.shape)
mask = counts > 0
tF[mask] = 1 + numpy.log(counts[mask])
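In formula form, the code computes the log-scaled term frequency tf(t, d) = 1 + ln f(t, d) for every word t whose raw count f(t, d) in document d is positive, and tf(t, d) = 0 otherwise (numpy.log is the natural logarithm). For example, a word that occurs 3 times in a document gets tf = 1 + ln 3 ≈ 2.10.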
4. Compute the IDF values
def handle(x):
    # x is one word's count column: how many documents contain this word
    idf = 1 + numpy.log(len(corpos) / (numpy.sum(x > 0) + 1))
    return idf

zhuan = textVector.T                   # rows become files, columns become words
iDF = zhuan.apply(handle).to_numpy()   # one idf value per word
iDF = iDF.reshape(-1, 1)               # column vector; -1 avoids the hard-coded 8889
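In formula form, handle implements a smoothed inverse document frequency: idf(t) = 1 + ln(N / (df(t) + 1)), where N = len(corpos) is the number of documents and df(t) is the number of documents in which word t occurs. The +1 in the denominator is a common smoothing term that guards against a zero document frequency and slightly lowers every idf value.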
5. Compute TF-IDF
# broadcasting multiplies the (words x files) tf matrix by the (words x 1) idf column
TFIDF = tF * iDF
# keep the word index and file columns so the results stay labelled
tFIDF_DF = pandas.DataFrame(TFIDF, index=textVector.index, columns=textVector.columns)
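Since tF has shape (number of words, number of files) and iDF has shape (number of words, 1), numpy broadcasting stretches the single idf column across every file. An optional shape check makes this visible (the 8889 below is the vocabulary size mentioned in step 4; the file count depends on the corpus):

print(tF.shape, iDF.shape, TFIDF.shape)
# e.g. (8889, 20) (8889, 1) (8889, 20)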
6. Output the 100 words with the highest TF-IDF values in each text, together with those values
# take the display names from the pivot's column order so that titles and
# columns stay aligned (os.walk order and pivot_table's sorted order can differ)
file = [os.path.basename(p)[0:-4] for p in textVector.columns]
for i, column in enumerate(textVector.columns):
    # sort_values replaces the removed Series.order(); because tFIDF_DF keeps
    # the word index, the top-100 slice carries the words with it directly
    sort = tFIDF_DF[column].sort_values(ascending=False)[:100]
    print(file[i])
    for tag, value in sort.items():
        print(tag, value)
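As an optional cross-check (an addition, not part of the original workflow), scikit-learn's TfidfVectorizer can build a comparable matrix from the same pre-segmented texts. Its default formula, idf = ln((1 + N) / (1 + df)) + 1 followed by L2 normalisation of each document vector, differs slightly from the one above, so the values will not match exactly:

from sklearn.feature_extraction.text import TfidfVectorizer

# reuse the manual '/' segmentation instead of sklearn's own tokeniser
vectorizer = TfidfVectorizer(analyzer=lambda text: text.split('/'))
matrix = vectorizer.fit_transform(corpos['content'])
print(matrix.shape)   # (number of files, vocabulary size)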