My code:
# -*- coding: utf-8 -*-
from pandas import read_csv
import numpy as np
from sklearn.datasets.base import Bunch
import pickle                            # pickle is used to persist (serialise) objects
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
import operator                          # for sorting
from sklearn import metrics
from sklearn.externals import joblib
import xlwt
# wordcloud and matplotlib for the word-cloud plot
import wordcloud
import matplotlib.pyplot as plt
from scipy.misc import imread            # removed in newer SciPy; imageio.imread is a drop-in replacement


'''Read the stop-word file'''
def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content


'''Read a Bunch object from disk'''
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch


'''Write a Bunch object to disk'''
def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)


def buildtestbunch(bunch_path, art_test):
    '''Segment every test text with jieba and persist the result as a Bunch.'''
    bunch = Bunch(contents=[])           # no labels are stored for the test set
    for item2 in art_test:
        item2 = str(item2)
        item2 = item2.replace("\r\n", "")
        item2 = item2.replace(" ", "")
        content_seg = jieba.cut(item2)
        save2 = ''
        for item3 in content_seg:
            if len(item3) > 1 and item3 != '\r\n':
                save2 = save2 + "," + item3
        bunch.contents.append(save2)
    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the test-data text object!")


def vector_space(stopword_path, bunch_path, space_path):
    stpwrdlst = _readfile(stopword_path).splitlines()    # read the stop words
    bunch = _readbunchobj(bunch_path)                    # load the segmented-text Bunch (must carry a label attribute)
    # Build the TF-IDF vector-space object.
    # tdm is the weight matrix: tdm[i][j] is the TF-IDF weight of word j
    # (its index in the vocabulary) in document i.
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})
    # Initialise the vector-space model with TfidfVectorizer
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,
                                 min_df=0.0001, use_idf=True, max_features=15000)
    # Convert the texts to a term-weight matrix and keep the vocabulary separately
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    # Persist the bag-of-words model
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully!")


def testvector_space(stopword_path, bunch_path, space_path, train_tfidf_path):
    stpwrdlst = _readfile(stopword_path).splitlines()    # turn the stop words into a list
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(tdm=[], vocabulary={})
    # Load the TF-IDF vector space of the training set and reuse its vocabulary
    trainbunch = _readbunchobj(train_tfidf_path)
    tfidfspace.vocabulary = trainbunch.vocabulary
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.7,
                                 vocabulary=trainbunch.vocabulary, min_df=0.001)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully!")


if __name__ == "__main__":
    Sdata = []
    art = []

    '''=========================== load the data ==========================='''
    file_test = 'F:/goverment/text analyse/type_in.csv'
    dataset = read_csv(file_test)
    Sdata = dataset.values[:, :]
    Sdata = Sdata.tolist()
    for line in Sdata:
        art.append(line[1])              # line[1] is the text field
    print(len(Sdata))

    '''================ TF-IDF text feature extraction for Bar ================'''
    # Paths of the segmented-text Bunch, the TF-IDF space and the stop-word file
    test_bunch_path = "F:/goverment/text analyse/trainbunch.bat"
    test_space_path = "F:/goverment/text analyse/traintfdifspace.dat"
    stopword_path = "F:/goverment/text analyse/hlt_stop_words.txt"
    '''================ TF-IDF text feature extraction for Sart ================'''
    buildtestbunch(test_bunch_path, art)
    # NOTE: the fourth argument is the space file itself, so it must already exist
    # (e.g. from an earlier run) before its vocabulary can be reused here.
    testvector_space(stopword_path, test_bunch_path, test_space_path, test_space_path)
    test_set = _readbunchobj(test_space_path)

    '''Test data: map a word id back to the word itself'''
    txtcut = []                          # every word in the vocabulary
    dic = {}
    for i in test_set.vocabulary.keys():
        txtcut.append(i)
        dic[test_set.vocabulary[i]] = i

    '''final holds at most 15 words per text'''
    final = []
    for k in range(len(Sdata)):          # iterate over every text
        nonzero = test_set.tdm[k].nonzero()
        ls = []
        ls.append(Sdata[k][0])
        num = 0
        for i in range(len(nonzero[1])):
            num = num + 1
            # test_set.tdm[k, nonzero[1][i]] is the weight of the i-th non-zero word in text k
            b = test_set.tdm[k, nonzero[1][i]] * 100
            a = dic[nonzero[1][i]] + " " + str(round(b, 2)) + "%"
            ls.append(a)
            if num == 15:
                break
        final.append(ls)

    '''Draw the word cloud'''
    fig = plt.figure(figsize=(15, 15))
    cloud = wordcloud.WordCloud(font_path='STXINGKA.TTF', mask=imread('water3.png'),
                                mode='RGBA', background_color=None).generate(' '.join(txtcut))
    img = imread('water3.png')
    cloud_colors = wordcloud.ImageColorGenerator(np.array(img))
    cloud.recolor(color_func=cloud_colors)
    plt.imshow(cloud)
    plt.axis('off')
    plt.savefig('watercloud3.png', dpi=400)
    plt.show()

    # Write the per-record word lists to Excel
    myexcel = xlwt.Workbook()
    sheet = myexcel.add_sheet("sheet1")
    si = -1
    for line in final:
        si = si + 1
        sj = -1
        for i in line:
            sj = sj + 1
            sheet.write(si, sj, str(i))
    myexcel.save("各條分詞.xls")

    # Write the word ids to Excel
    myexcel = xlwt.Workbook()
    sheet = myexcel.add_sheet("sheet2")
    p = 0
    for i in test_set.vocabulary.keys():
        sheet.write(p, 0, i)
        sheet.write(p, 1, str(test_set.vocabulary[i]))
        p = p + 1
    myexcel.save("詞匯id.xls")
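
One detail worth pointing out: the loop that fills final keeps the first 15 non-zero entries of each row in column order, not the 15 highest-weighted words. If the intent is the top 15 terms by TF-IDF weight, the row can be sorted first. A minimal sketch, assuming test_set.tdm and dic are the sparse matrix and id-to-word mapping built above (top_terms is a hypothetical helper name, not part of the original script):

import numpy as np

def top_terms(tdm, dic, k, n=15):
    '''Return up to n (word, weight%) pairs for row k, sorted by TF-IDF weight.'''
    row = tdm[k].toarray().ravel()               # dense copy of one document row
    idx = np.argsort(row)[::-1][:n]              # column indices of the n largest weights
    return [(dic[j], round(row[j] * 100, 2)) for j in idx if row[j] > 0]

# usage, e.g. for the first record:
# print(top_terms(test_set.tdm, dic, 0))
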
Per-record word segmentation output (各條分詞.xls):
Vocabulary id output (詞匯id.xls):
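
To spot-check the two workbooks the script writes, they can be read back with pandas; this is just a sketch and assumes the xlrd package is installed, which pandas needs for legacy .xls files:

import pandas as pd

# each row: the record id followed by up to 15 "word weight%" strings
per_record = pd.read_excel("各條分詞.xls", header=None)

# column 0: the word, column 1: its index in the TF-IDF vocabulary
vocab_ids = pd.read_excel("詞匯id.xls", header=None)

print(per_record.head())
print(vocab_ids.head())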