import xlrd
import jieba
import sys
import importlib
import os  # standard library module for file and directory operations
import pickle  # pickle is used to persist (serialize) the intermediate objects
import random
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pylab import mpl
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes classifier
from sklearn import svm
from sklearn import metrics
from sklearn.utils import Bunch  # Bunch moved here from sklearn.datasets.base in newer scikit-learn releases
from sklearn.feature_extraction.text import TfidfVectorizer

importlib.reload(sys)

# Turn the text contents and the class labels into vector form.
trainContentdatasave = []  # stores every token produced while segmenting the training texts
testContentdatasave = []   # stores every token produced while segmenting the test texts

trainContentdata = []
testContentdata = []
trainlabeldata = []
testlabeldata = []


# Load the training and test data containing the text descriptions.
def importTrainContentdata():
    file = '20180716_train.xls'
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        col = []
        for c in range(1):
            col.append(ws.cell(r, c).value)
        trainContentdata.append(col)


def importTestContentdata():
    file = '20180716_test.xls'
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        col = []
        for c in range(1):
            col.append(ws.cell(r, c).value)
        testContentdata.append(col)


# Load the training and test class labels.
def importTrainlabeldata():
    file = '20180716_train_label.xls'
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        col = []
        for c in range(1):
            col.append(ws.cell(r, c).value)
        trainlabeldata.append(col)


def importTestlabeldata():
    file = '20180716_test_label.xls'
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        col = []
        for c in range(1):
            col.append(ws.cell(r, c).value)
        testlabeldata.append(col)


"""
def importClassSet():
    file = 'ClassSet.xls'
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        col = []
        for c in range(ws.ncols):
            col.append(ws.cell(r, c).value)
        ClassSet.append(col)
"""


def buildtrainbunch(bunch_path):
    bunch = Bunch(label=[], contents=[])
    for item1 in trainlabeldata:
        bunch.label.append(item1)
    # Segment every training text with jieba and keep tokens longer than one character.
    for item2 in trainContentdata:
        item2 = str(item2)
        item2 = item2.replace("\r\n", "")
        item2 = item2.replace(" ", "")
        content_seg = jieba.cut(item2)
        save2 = ''
        for item3 in content_seg:
            if len(item3) > 1 and item3 != '\r\n':
                trainContentdatasave.append(item3)
                save2 = save2 + "," + item3
        bunch.contents.append(save2)
    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the training-text bunch object.")


def buildtestbunch(bunch_path):
    bunch = Bunch(label=[], contents=[])
    for item1 in testlabeldata:
        bunch.label.append(item1)
    # Segment every test text with jieba and keep tokens longer than one character.
    for item2 in testContentdata:
        item2 = str(item2)
        item2 = item2.replace("\r\n", "")
        item2 = item2.replace(" ", "")
        content_seg = jieba.cut(item2)
        save2 = ''
        for item3 in content_seg:
            if len(item3) > 1 and item3 != '\r\n':
                testContentdatasave.append(item3)
                save2 = save2 + "," + item3
        bunch.contents.append(save2)
    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the test-text bunch object.")


# Read the stop-word file.
def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content


# Read a pickled bunch object.
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch


# Write a bunch object to disk.
def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)
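
# A minimal helper sketch, not part of the original pipeline: given a saved TF-IDF
# space bunch (with the sparse .tdm matrix and the .vocabulary dict built below),
# print the terms with the largest total weight. The function name and its use are
# illustrative assumptions added for inspection/debugging only; it is never called.
def print_top_terms(space_path, topn=10):
    space = _readbunchobj(space_path)
    # Column sums of the sparse weight matrix give the total weight of each term.
    weights = np.asarray(space.tdm.sum(axis=0)).ravel()
    # vocabulary maps term -> column index; invert it to look terms up by index.
    index_to_term = {idx: term for term, idx in space.vocabulary.items()}
    for idx in np.argsort(weights)[::-1][:topn]:
        print(index_to_term[idx], weights[idx])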
def vector_space(stopword_path, bunch_path, space_path):
    stpwrdlst = _readfile(stopword_path).splitlines()  # read the stop words into a list
    bunch = _readbunchobj(bunch_path)  # load the segmented-text bunch object
    # Build the TF-IDF term-vector-space object.
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})
    '''
    tdm is the weight matrix: a two-dimensional matrix in which tdm[i][j] is the
    TF-IDF weight of term j (its index in the vocabulary) in document i.
    '''
    # Initialise the vector space model with TfidfVectorizer.
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,
                                 min_df=0.0001, use_idf=False, max_features=10000)
    # print(vectorizer)
    # Convert the texts into the weight matrix and save the fitted vocabulary separately.
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    # Persist the bag-of-words space.
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully.")


def testvector_space(stopword_path, bunch_path, space_path, train_tfidf_path):
    stpwrdlst = _readfile(stopword_path).splitlines()  # turn the stop words into a list
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})
    '''
    tdm holds the computed TF-IDF weight matrix.
    vocabulary is the index of the term space. For example, if the term space is
    (我, 喜歡, 相國大人), then vocabulary is the index dictionary
    vocabulary={"我": 0, "喜歡": 1, "相國大人": 2}. You can think of vocabulary as the
    coordinate axes of the term space: the index says which dimension a term occupies.
    '''
    # Load the TF-IDF vector space of the training set so the test set reuses its vocabulary.
    trainbunch = _readbunchobj(train_tfidf_path)
    tfidfspace.vocabulary = trainbunch.vocabulary
    '''
    The TfidfVectorizer parameters that matter here:
    stop_words:   the stop-word list; these words are removed before vocabulary_ is built.
    vocabulary:   explained above.
    sublinear_tf: compute tf with a sublinear strategy, i.e. use 1 + log(tf) instead of the
                  raw term frequency.
    smooth_idf:   when computing idf the denominator of log(numerator/denominator) can be 0;
                  smooth_idf uses log(numerator/(1 + denominator)) instead. It is enabled by
                  default, so nothing needs to be done.
    norm:         normalisation. TF-IDF is TF*IDF, where TF may or may not be normalised;
                  normalisation is the usual choice and is enabled by default.
    max_df:       some words have a very high document frequency (a word that appears in every
                  document is useless for separating classes), so we set a threshold. A float
                  such as 0.5 (range [0.0, 1.0]) means a word appearing in more than 50% of the
                  documents is treated as a temporary stop word; an int such as max_df=10 means
                  a word appearing in more than 10 documents is treated the same way.
    min_df:       the opposite of max_df. A low document frequency seems more discriminative,
                  but if only 1 of 10000 documents contains a word, adding a whole dimension to
                  the term space just for that document is not worth it.
    Note that max_df and min_df are ignored when the vocabulary parameter is given.
    '''
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.7,
                                 vocabulary=trainbunch.vocabulary, min_df=0.001)
    # print(vectorizer)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully.")


def metrics_result(actual, predict):
    # metrics.f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
    print('precision: {0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted', labels=np.unique(predict))))
    print('recall: {0:0.3f}'.format(metrics.recall_score(actual, predict, average='weighted', labels=np.unique(predict))))
    print('f1-score: {0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted', labels=np.unique(predict))))
    # Precision and recall trade off against each other: ideally both are high, but in
    # practice high precision usually comes with low recall and vice versa.


if __name__ == "__main__":
    importTrainContentdata()
    importTestContentdata()
    importTrainlabeldata()
    importTestlabeldata()

    # Paths for the pickled bunch objects, the stop-word list and the TF-IDF spaces.
    train_bunch_path = "F:/goverment/ArticleMining/trainbunch.bat"
    test_bunch_path = "F:/goverment/ArticleMining/testbunch.bat"
    stopword_path = "F:/goverment/ArticleMining/hlt_stop_words.txt"
    train_space_path = "F:/goverment/ArticleMining/traintfdifspace.dat"
    test_space_path = "F:/goverment/ArticleMining/testtfdifspace.dat"

    # Build the bunch objects for the training and test sets.
    buildtrainbunch(train_bunch_path)
    buildtestbunch(test_bunch_path)

    vector_space(stopword_path, train_bunch_path, train_space_path)
    testvector_space(stopword_path, test_bunch_path, test_space_path, train_space_path)

    # Load the training and test TF-IDF spaces.
    train_set = _readbunchobj(train_space_path)
    test_set = _readbunchobj(test_space_path)
    print(train_set.tdm)

    '''
    mm = 0
    ii = 0
    jj = 0
    for i in range(3142):
        for j in range(3142):
            if train_set.tdm[i][j] > mm:
                mm = train_set.tdm[i][j]
                ii = i
                jj = j
    print(ii)
    print(jj)
    '''
    # test_set.tdm
    # train_set.label
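
    # Quick sanity check (added; not part of the original flow): the two weight matrices
    # should share the same number of columns, because testvector_space reuses the
    # training vocabulary. Rows are documents, columns are vocabulary terms.
    print(train_set.tdm.shape, test_set.tdm.shape)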

    # Train the classifier on the bag-of-words vectors and the class labels.
    # Earlier experiment notes:
    #   MultinomialNB alpha=0.001: the smaller alpha is, the more iterations and the higher the precision
    #   low recall; F1: 0.75 rbf: 0.59   0.8 rbf 0.578
    #   C=0.75 poly 66.5  precision: 0.665  gamma=10  recall: 0.330  f1-score: 0.416
    #   C=0.7, kernel='poly', gamma=10  recall: 0.331  f1-score: 0.417
    '''
    clf = MultinomialNB(alpha=0.052).fit(train_set.tdm, train_set.label)
    # clf = svm.SVC(C=0.7, kernel='poly', gamma=10, decision_function_shape='ovr')
    clf.fit(train_set.tdm, train_set.label)
    predicted = clf.predict(test_set.tdm)

    # Logistic-regression experiment (would also need
    # "from sklearn.linear_model import LogisticRegression" and X_train/X_test defined).
    tv = TfidfVectorizer()
    train_data = tv.fit_transform(X_train)
    test_data = tv.transform(X_test)
    lr = LogisticRegression(C=3)
    lr.fit(train_set.tdm, train_set.label)
    predicted = lr.predict(test_set.tdm)
    print(lr.score(test_set.tdm, test_set.label))
    # print(test_set.tdm)
    '''

    clf = svm.SVC(C=1500)
    clf.fit(train_set.tdm, train_set.label)
    predicted = clf.predict(test_set.tdm)
    print(clf.score(test_set.tdm, test_set.label))

    '''
    from sklearn.neighbors import KNeighborsClassifier
    knnclf = KNeighborsClassifier(n_neighbors=9)  # the default is k=5
    knnclf.fit(train_set.tdm, train_set.label)
    predicted = knnclf.predict(test_set.tdm)
    '''

    # Convert the predicted and true labels to ints for evaluation.
    a = []
    b = []
    for i in range(len(predicted)):
        b.append(int(float(predicted[i])))
        a.append(int(test_set.label[i][0]))

    # Write the predictions to a text file, one label per line.
    f = open('F:/goverment/ArticleMining/predict.txt', 'w')
    for i in range(len(predicted)):
        f.write(str(b[i]))
        f.write('\n')
    f.write("done writing")
    f.close()
    # for i in range(len(predicted)):
    #     print(b[i])
    metrics_result(a, b)
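
    # A minimal sketch (added, not in the original script) of persisting the trained
    # classifier with pickle so it can be reloaded later without retraining; the file
    # name below is an illustrative assumption.
    model_path = 'F:/goverment/ArticleMining/svc_model.dat'
    with open(model_path, "wb") as file_obj:
        pickle.dump(clf, file_obj)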