Extracting text features with an LDA model and then classifying with a linear SVM turns out to work poorly, F1 = 0.654:

Precision: 0.680, Recall: 0.649, F1: 0.654
RandomForestClassifier does not fare much better:

Precision: 0.680, Recall: 0.668, F1: 0.670
Meanwhile, just about any off-the-shelf deep learning model (TextCNN, LSTM + Attention) reaches an F1 of 0.95+ on the same data, with no feature engineering and no word segmentation. In hindsight this is not too surprising: a 64-dimensional topic distribution is a very lossy summary of an article, which likely caps what any downstream classifier can do with it.
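For reference, a TextCNN of that kind is only a handful of lines in Keras. What follows is an illustrative sketch, not the exact model behind the 0.95+ figure; the character-level input, vocabulary size, sequence length, and remaining hyperparameters are all assumptions:

```python
# Minimal character-level TextCNN sketch (Keras). Assumes texts are already
# mapped to fixed-length integer sequences; all hyperparameters illustrative.
from tensorflow.keras import layers, models

def build_textcnn(vocab_size=6000, seq_len=400, embed_dim=128, n_classes=5):
    inputs = layers.Input(shape=(seq_len,))
    x = layers.Embedding(vocab_size, embed_dim)(inputs)  # learned char embeddings
    # Parallel convolutions with different kernel widths capture n-gram features
    convs = []
    for k in (3, 4, 5):
        c = layers.Conv1D(128, k, activation='relu')(x)
        convs.append(layers.GlobalMaxPooling1D()(c))
    x = layers.concatenate(convs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(n_classes, activation='softmax')(x)
    model = models.Model(inputs, outputs)
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model
```

Because the convolutions operate on character embeddings directly, no jieba segmentation and no hand-built features are needed, which is exactly what makes the comparison lopsided.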
Back to the classical pipeline, here is the concrete flow. Extracting LDA features requires vectorizing the texts with CountVectorizer first, which in turn requires word segmentation. Since the dataset is fairly large (the Sohu news dataset: 5 categories × 3000 articles each), jieba segmentation is parallelized across multiple processes, implemented here with a ProcessPoolExecutor process pool.
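One detail worth calling out before the full listing: the segmented words are joined with ',' because CountVectorizer's default token_pattern, r'(?u)\b\w\w+\b', treats any non-word character as a separator (and drops single-character tokens), so pre-segmented text round-trips cleanly through the vectorizer. A quick sanity check (the sample sentence is arbitrary):

```python
# Demonstrates that comma-joined jieba output is tokenized back into
# the original words by CountVectorizer's default token_pattern.
import jieba
from sklearn.feature_extraction.text import CountVectorizer

doc = ','.join(w for w in jieba.cut('搜狐新聞文本分類的一個簡單例子') if w.strip())
ct = CountVectorizer()
bow = ct.fit_transform([doc])
print(ct.get_feature_names_out())  # segmented words (length >= 2 only)
print(bow.toarray())               # term counts for the single document
```

With that out of the way, the full script: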
```python
import multiprocessing
import time
import pickle as pk
from concurrent.futures import ProcessPoolExecutor, as_completed

import jieba
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

from utils import log


def transform_text(text, stopwords):
    # Segment one article with jieba, dropping stopwords and whitespace tokens
    words = [w for w in jieba.cut(text) if w.strip() and (w not in stopwords)]
    return ','.join(words)


def cut_texts(lock, texts, stopwords, processName, doc_list):
    # Alternative worker: multiprocess segmentation with an explicit lock
    # around a shared result list (unused below; kept for comparison)
    log('Process {} is cutting texts...'.format(processName))
    docs = []
    for text in tqdm(texts):
        docs.append(transform_text(text, stopwords))
    lock.acquire()
    doc_list.extend(docs)
    lock.release()


def cut_texts_pool(texts, stopwords, processName):
    # Segmentation worker, executed in parallel via the process pool
    log('Process {} is cutting texts...'.format(processName))
    docs = []
    for text in tqdm(texts):
        docs.append(transform_text(text, stopwords))
    log('Process {} finished cutting.'.format(processName))
    return docs


def hard_work(processName):
    # Test helper that simulates a slow task
    log('Process {} is running...'.format(processName))
    time.sleep(2)
    log('Process {} finished.'.format(processName))
    return processName


def mp_pool_test(texts=None, res=None):
    # Smoke test for the process pool
    n_process = multiprocessing.cpu_count()
    pool = ProcessPoolExecutor()
    fs = [pool.submit(hard_work, i) for i in range(n_process)]
    names = []
    for f in as_completed(fs):
        names.append(f.result())
    log(names)


def partition(iterable_, n_partition):
    # Split the texts into n_partition roughly equal chunks
    assert isinstance(n_partition, int) and n_partition > 0, 'Invalid value for "n_partition"'
    temp = list(iterable_)
    total = len(temp)
    assert total > n_partition, 'Size of iterable is less than "n_partition"'
    partition_size = total // n_partition
    res = []
    for i in range(n_partition - 1):
        res.append(temp[partition_size * i:partition_size * (i + 1)])
    res.append(temp[partition_size * (n_partition - 1):])  # last chunk takes the remainder
    return res


def mp_cut_pool(texts):
    # Use as many workers as there are CPU cores
    n_process = multiprocessing.cpu_count()
    chunks = partition(texts, n_process)
    pool = ProcessPoolExecutor(max_workers=n_process)
    # submit() schedules a task: target function first, then its arguments;
    # it returns a Future
    fs = [pool.submit(cut_texts_pool, chunks[i], [], i) for i in range(n_process)]
    docs = []
    # as_completed yields each Future as its worker finishes;
    # Future.result() is the worker's return value
    for f in as_completed(fs):
        docs.extend(f.result())
    return docs


class LDA_Transformer:
    def __init__(self, n_features):
        self.n_features = n_features

    def fit(self, texts):
        log('Building CountVectorizer with texts...')
        ct = CountVectorizer()
        self.count_vectorizer = ct
        log(type(texts))
        if isinstance(texts, list):
            log('Len of texts:{}'.format(len(texts)))
        else:
            log('Shape of texts:{}'.format(texts.shape))
        print('texts[0]', texts[0])
        ctv = ct.fit_transform(texts)
        log('Building LDA model with CountVectorizer..')
        # n_components is the number of LDA topics, loosely analogous to
        # the dimensionality of a word embedding
        lda = LatentDirichletAllocation(n_components=self.n_features)
        lda.fit(ctv)
        log('Done building LDA model.')
        self.lda_model = lda

    def transform(self, texts):
        count_vec = self.count_vectorizer.transform(texts)
        return self.lda_model.transform(count_vec)


def build_data():
    df = pd.read_excel('data/souhu_news_400_500.xlsx')
    texts = list(df['content'])  # the text column
    log(df.columns)
    docs = mp_cut_pool(texts)
    # Note: the LDA model is fitted on the full corpus before the
    # train/test split, so the unsupervised features do see the test texts
    lda_transformer = LDA_Transformer(64)
    lda_transformer.fit(docs)
    # Save the LDA transformer to disk
    with open('output/lda_transformer.pkl', 'wb') as f:
        pk.dump(lda_transformer, f)
    # Shuffle rows, keeping the segmented docs aligned with the labels
    indices = list(range(df.shape[0]))
    np.random.shuffle(indices)
    df = df.iloc[indices]
    docs = [docs[i] for i in indices]
    dic = {topic: i for i, topic in enumerate(list(df['topic'].unique()))}
    y = [dic[topic] for topic in list(df['topic'])]
    with open('data/y_lda.pkl', 'wb') as f:
        pk.dump(y, f)
    # Transform the segmented docs; the vectorizer was fitted on segmented,
    # comma-joined text, so raw articles must not be passed in here
    X = lda_transformer.transform(docs)
    with open('data/X_lda.pkl', 'wb') as f:
        pk.dump(X, f)
    log('Training data is saved.')


def load_train_data():
    with open('data/X_lda.pkl', 'rb') as f:
        X = pk.load(f)
    with open('data/y_lda.pkl', 'rb') as f:
        y = pk.load(f)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return X_train, X_test, y_train, y_test


def main():
    log('Building training data...')
    build_data()
    log('Loading training data with LDA features...')
    X_train, X_test, y_train, y_test = load_train_data()
    log('Training classifier...')
    # model = LinearSVC()
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    log('Evaluating model...')
    acc = model.score(X_test, y_test)
    log('Accuracy:{}'.format(acc))
    y_pred = model.predict(X_test)
    p = precision_score(y_test, y_pred, average='macro')
    r = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    log('Precision:{:.3f},Recall:{:.3f},F1:{:.3f}'.format(p, r, f1))


if __name__ == '__main__':
    main()
```
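For completeness, the saved transformer can be reloaded for inference later. A minimal sketch, assuming transform_text and LDA_Transformer from the script above are importable (pickle needs the class definition in scope to deserialize); the sample article string is arbitrary, and since the script does not persist the trained classifier, featurization is as far as this goes:

```python
# Reload the fitted LDA transformer and featurize one new article.
import pickle as pk

with open('output/lda_transformer.pkl', 'rb') as f:
    lda_transformer = pk.load(f)  # requires LDA_Transformer to be importable

doc = transform_text('一段待分類的新聞正文', stopwords=[])  # segment first!
features = lda_transformer.transform([doc])  # (1, 64) topic distribution
print(features)
```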