Text Classification with an LDA Topic Model and SVM


Extracting text features with an LDA topic model and classifying them with a linear SVM turns out to work poorly: F1 = 0.654.

Precision: 0.680, Recall: 0.649, F1: 0.654

RandomForestClassifier does not do much better:

Precision: 0.680, Recall: 0.668, F1: 0.670

By contrast, more or less any off-the-shelf deep learning model (textCNN, LSTM+Attention) reaches an F1 of 0.95+, with no manual feature engineering and no word segmentation.

The pipeline in detail: extracting LDA features requires vectorizing the text with CountVectorizer, which in turn requires word segmentation. Since the corpus is fairly large (Sohu news dataset, 5 categories × 3,000 articles), jieba segmentation is parallelized across processes, implemented here with a process pool (ProcessPoolExecutor).
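Before the full script, here is a minimal sketch of the core feature-extraction idea (a hypothetical two-document toy corpus; the real pipeline below uses jieba output and 64 topics):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#Pre-segmented documents: tokens joined by a delimiter, which is what CountVectorizer expects
docs=['北京 天安門 廣場 新聞','機器 學習 模型 訓練']
ct=CountVectorizer()
counts=ct.fit_transform(docs)  #term-count matrix of shape (2, vocab_size)
lda=LatentDirichletAllocation(n_components=2)
X=lda.fit_transform(counts)    #each row of X is a 2-dim topic distribution summing to 1

Each document is thus compressed into a dense topic-probability vector, which is what the classifier consumes downstream.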

import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import multiprocessing
from concurrent.futures import ProcessPoolExecutor,as_completed
from utils import log
from tqdm import tqdm
import time
import pickle as pk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC,SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score,recall_score,f1_score

def transform_text(text,stopwords):
    #Segment one article with jieba, dropping whitespace tokens and stopwords
    words=[w for w in jieba.cut(text) if w.strip() and (w not in stopwords)]
    return ','.join(words)
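#Example (hypothetical; the exact split depends on jieba's dictionary/version):
#  transform_text('今天天氣不錯',[]) -> '今天,天氣,不錯'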

def cut_texts(lock,texts,stopwords,processName,doc_list=[]):
    #Multiprocess segmentation via explicit processes plus a lock.
    #Note: for results to cross process boundaries, doc_list must be a
    #multiprocessing.Manager().list(); a plain list (or this mutable default) is not shared.
    log('Process {} is cutting texts...'.format(processName))
    docs=[]
    for text in tqdm(texts):
        doc=transform_text(text,stopwords)
        docs.append(doc)
    lock.acquire()
    doc_list.extend(docs)
    lock.release()
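#A minimal driver sketch for the lock-based variant (hypothetical; main() below
#only uses the pool version):
#  m=multiprocessing.Manager()
#  shared,lk=m.list(),m.Lock()
#  p=multiprocessing.Process(target=cut_texts,args=(lk,texts,[],0,shared))
#  p.start(); p.join()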

def cut_texts_pool(texts,stopwords,processName):
    #Segmentation worker for the process-pool version; returns the segmented
    #docs so the parent process can collect them via Future.result()
    log('Process {} is cutting texts...'.format(processName))
    docs=[]
    for text in tqdm(texts):
        doc=transform_text(text,stopwords)
        docs.append(doc)
    log('Process {} finished cutting.'.format(processName))
    return docs

def hard_work(processName):
    #Test helper simulating a slow task
    log('Process {} is running...'.format(processName))
    time.sleep(2)
    log('Process {} finished.'.format(processName))
    return processName

def mp_pool_test(texts=None,res=None):
    #Smoke test: run hard_work once per CPU core in a process pool
    n_process=multiprocessing.cpu_count()
    pool=ProcessPoolExecutor(max_workers=n_process)
    fs=[]
    for i in range(n_process):
        f=pool.submit(hard_work,i)
        fs.append(f)
    names=[]
    for f in as_completed(fs):
        name = f.result()
        names.append(name)
    log(names)

def partition(iterable_,n_partition):
    #Split a sequence into n_partition roughly equal chunks; the last chunk
    #absorbs any remainder
    assert isinstance(n_partition,int) and n_partition>0,'Invalid value for "n_partition"'
    temp=list(iterable_)
    total=len(temp)
    assert total>n_partition,'Size of iterable is less than "n_partition"'

    partition_size=total//n_partition
    res=[]
    for i in range(n_partition-1):
        res.append(temp[partition_size*i:partition_size*(i+1)])
    res.append(temp[partition_size*(n_partition-1):])
    return res
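#Example: partition(list(range(10)),3) -> [[0,1,2],[3,4,5],[6,7,8,9]]
#(the last chunk absorbs the 10%3=1 leftover element)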

def mp_cut_pool(texts):
    #Create as many worker processes as there are CPU cores
    n_process=multiprocessing.cpu_count()
    parts=partition(texts,n_process)
    #Segment in parallel with a process pool
    pool=ProcessPoolExecutor(max_workers=n_process)
    fs=[]
    docs=[]
    for i in range(n_process):
        #submit() schedules the worker; the first argument is the target
        #function, the rest are its arguments
        f=pool.submit(cut_texts_pool,parts[i],[],i)
        #f is a Future object
        fs.append(f)
    #Collect results in submission order (result() blocks until the worker
    #finishes) so the returned docs stay aligned with the input texts
    for f in fs:
        docs.extend(f.result())
    return docs
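#Usage: docs=mp_cut_pool(texts) returns one comma-joined segmented string per
#input text, preserving input order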

class LDA_Transformer:
    def __init__(self,n_features):
        self.n_features=n_features

    def fit(self,texts):
        log('Building CountVectorizer with texts...')
        ct=CountVectorizer()
        self.count_vectorizer=ct
        log(type(texts))
        if isinstance(texts,list):
            log('Len of texts:{}'.format(len(texts)))
        else:
            log('Shape of texts:{}'.format(texts.shape))
        log('texts[0]:{}'.format(texts[0]))
        ctv=ct.fit_transform(texts)
        log('Building LDA model with CountVectorizer..')
        #n_components is the number of LDA topics, playing a role similar to
        #the dimensionality of a word embedding
        lda=LatentDirichletAllocation(n_components=self.n_features)
        lda.fit(ctv)
        log('Done building LDA model.')
        self.lda_model=lda

    def transform(self,texts):
        count_vec=self.count_vectorizer.transform(texts)
        return self.lda_model.transform(count_vec)
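#Usage sketch: fit once on the segmented corpus, then transform any batch:
#  t=LDA_Transformer(64); t.fit(docs); X=t.transform(docs)  #X.shape==(len(docs),64)
#Each row of X is a non-negative topic distribution that sums to 1.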

def build_data():
    df=pd.read_excel('data/souhu_news_400_500.xlsx')
    log(df.columns)
    #Shuffle the rows first so that features and labels are built in the same order
    indices=list(range(df.shape[0]))
    np.random.shuffle(indices)
    df=df.iloc[indices]

    texts=list(df['content'])#text column
    docs=mp_cut_pool(texts)
    lda_transformer=LDA_Transformer(64)
    lda_transformer.fit(docs)
    #Persist the fitted LDA transformer
    with open('output/lda_transformer.pkl','wb') as f:
        pk.dump(lda_transformer,f)

    #Map topic labels to integer ids
    dic={topic:i for i,topic in enumerate(list(df['topic'].unique()))}
    y=[dic[topic] for topic in list(df['topic'])]
    with open('data/y_lda.pkl','wb') as f:
        pk.dump(y,f)

    #Transform the segmented docs (not the raw texts), matching the vocabulary
    #the CountVectorizer was fitted on
    X=lda_transformer.transform(docs)
    with open('data/X_lda.pkl','wb') as f:
        pk.dump(X,f)
    log('Training data is saved.')

def load_train_data():
    with open('data/X_lda.pkl','rb') as f:
        X=pk.load(f)
    with open('data/y_lda.pkl','rb') as f:
        y=pk.load(f)
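    #Note: no fixed random_state, so the split (and hence the metrics) varies between runs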
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
    return X_train,X_test,y_train,y_test

def main():
    log('Building training data...')
    build_data()
    log('Loading training data with LDA features...')
    X_train,X_test,y_train,y_test=load_train_data()
    log('Training classifier...')
    #Swap in LinearSVC() here to reproduce the SVM numbers quoted above
    #model=LinearSVC()
    model=RandomForestClassifier()
    model.fit(X_train,y_train)
    log('Evaluating model...')
    acc=model.score(X_test,y_test)
    log('Accuracy:{}'.format(acc))
    y_pred=model.predict(X_test)
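    #Macro averaging: per-class metrics averaged with equal class weights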
    p=precision_score(y_test,y_pred,average='macro')
    r=recall_score(y_test,y_pred,average='macro')
    f1=f1_score(y_test,y_pred,average='macro')
    log('Precision:{:.3f},Recall:{:.3f},F1:{:.3f}'.format(p,r,f1))


if __name__=='__main__':
    main()

 

