keras實戰教程二(文本分類BiLSTM)


 

什麼是文本分類


 

給模型輸入一句話,讓模型判斷這句話的類別(預定義)。

以文本情感分類為例

輸入:的確是專業,用心做,出品方面都給好評。
輸出:2
輸出可以是[0,1,2]其中一個,0表示情感消極,1表示情感中性,2表示情感積極。

數據樣式


 

 

 

 網上應該能找到相關數據。

模型圖


 

 

 

 

訓練過程


 

 

 此處僅作為測試,只訓練一輪。

代碼


 

讀取數據


 

import numpy as np
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
from gensim import models
import pandas as pd
import jieba
import logging
from keras import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Bidirectional,LSTM,Dense,Embedding,Dropout,Activation,Softmax
from sklearn.model_selection import train_test_split
from keras.utils import np_utils

def read_data(data_path):
    """Load sentences and sentiment labels from a tab-separated text file.

    Every line is stripped and split on tabs; the third field (index 2) is
    taken as the sentence and the fourth (index 3) as the label. A row is
    kept only when the sentence is non-empty and the label is exactly
    "0", "1" or "2". Rows with fewer than four fields (blank lines,
    truncated records) are skipped instead of raising IndexError.

    The file is decoded as GB2312; undecodable bytes are silently dropped.

    Args:
        data_path: Path of the data file.

    Returns:
        A ``(senlist, labellist)`` tuple of two equally long lists of
        strings. Labels are returned as strings, not integers.
    """
    valid_labels = {"0", "1", "2"}
    senlist = []
    labellist = []
    with open(data_path, "r", encoding='gb2312', errors='ignore') as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            fields = line.strip().split("\t")
            if len(fields) < 4:
                # Malformed or blank row: there is no sentence/label column.
                continue
            sen, label = fields[2], fields[3]
            if sen != "" and label in valid_labels:
                senlist.append(sen)
                labellist.append(label)
    return senlist, labellist

# Load the raw training sentences and their string labels from data_train.csv.
sentences,labels = read_data("data_train.csv")

 

詞向量


 

def train_word2vec(sentences,save_path):
    """Segment the sentences with jieba, train a Word2Vec model and save it.

    Args:
        sentences: List of raw (unsegmented) sentence strings.
        save_path: File path the trained model is saved to.

    Returns:
        The trained gensim ``Word2Vec`` model.
    """
    # Segment the whole corpus in a single jieba call: the sentences are
    # glued together with newlines, cut, and then split back into one
    # token list per line.
    corpus_text = "\n".join(sentences)
    tokens = jieba.lcut(corpus_text)
    segmented_lines = " ".join(tokens).split("\n")
    corpus = [line.split() for line in segmented_lines]

    print("開始訓練詞向量")
    w2v = Word2Vec(
        corpus,
        size=100,     # dimensionality of the word vectors
        min_count=5,  # minimum word frequency threshold
        window=5,     # context window size
    )
    w2v.save(save_path)
    return w2v

# Train the word vectors on the loaded sentences and save them to word2vec.model.
model =  train_word2vec(sentences,'word2vec.model')    

數據處理


 

def generate_id2wec(word2vec_model):
    """Build the word-to-index map and the embedding matrix of a Word2Vec model.

    Args:
        word2vec_model: Trained gensim Word2Vec model. The vocabulary is
            read from ``word2vec_model.wv.vocab`` (gensim 3.x API).

    Returns:
        A ``(w2id, embedding_weights)`` tuple. ``w2id`` maps every
        vocabulary word to an integer index starting at 1.
        ``embedding_weights`` is a float array of shape
        ``(len(w2id) + 1, vector_size)`` whose row ``i`` holds the vector of
        the word with index ``i``; row 0 is left as zeros.
    """
    # Read from the model that was passed in, not from a module-level global.
    wv = word2vec_model.wv
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(wv.vocab.keys(), allow_update=True)
    w2id = {v: k + 1 for k, v in gensim_dict.items()}  # word indices start at 1
    n_vocabs = len(w2id) + 1
    # Take the vector width from the model instead of hard-coding 100.
    embedding_weights = np.zeros((n_vocabs, wv.vector_size))
    for w, index in w2id.items():  # fill the matrix from row 1 onwards
        embedding_weights[index, :] = wv[w]
    return w2id, embedding_weights

def text_to_array(w2index, senlist):
    """Convert sentences into sequences of vocabulary indices.

    Args:
        w2index: Mapping from word to integer index.
        senlist: Iterable of sentences. A raw ``str`` sentence is segmented
            with ``jieba.lcut`` first, so that words (not single characters)
            are looked up; an already tokenised sentence (any non-``str``
            iterable of words) is used as is.

    Returns:
        A NumPy array with one index sequence per sentence; words missing
        from ``w2index`` map to 0. When all sequences have the same length
        the result is a regular 2-D array, otherwise a 1-D object array
        whose elements are lists of indices.
    """
    sentences_array = []
    for sen in senlist:
        # Iterating a raw string yields characters, which would not match a
        # word-level vocabulary; segment it into words first.
        words = jieba.lcut(sen) if isinstance(sen, str) else sen
        sentences_array.append([w2index.get(word, 0) for word in words])

    if len({len(ids) for ids in sentences_array}) <= 1:
        return np.array(sentences_array)

    # Ragged input: build the object array explicitly, because recent NumPy
    # versions refuse to create it implicitly from np.array(...).
    ragged = np.empty(len(sentences_array), dtype=object)
    for i, ids in enumerate(sentences_array):
        ragged[i] = ids
    return ragged

def prepare_data(w2id,sentences,labels,max_len=200):
    """Split the data and turn it into padded index arrays and one-hot labels.

    The data is split randomly, with 20% held out for validation. Sentences
    are converted to index sequences with ``text_to_array`` and passed
    through ``pad_sequences`` with ``maxlen=max_len``.

    Args:
        w2id: Mapping from word to integer index.
        sentences: List of sentences.
        labels: List of class labels convertible with ``int()``.
        max_len: Sequence length passed to ``pad_sequences``.

    Returns:
        A ``(X_train, y_train, X_val, y_val)`` tuple; the ``X`` arrays are
        the padded index sequences and the ``y`` arrays the one-hot labels.
    """
    X_train, X_val, y_train, y_val = train_test_split(sentences, labels, test_size=0.2)

    # Derive the one-hot width from the full label set. Letting
    # to_categorical infer it per split gives mismatched widths when a split
    # happens not to contain the highest label.
    num_classes = max(int(label) for label in labels) + 1

    X_train = pad_sequences(text_to_array(w2id, X_train), maxlen=max_len)
    X_val = pad_sequences(text_to_array(w2id, X_val), maxlen=max_len)
    y_train = np_utils.to_categorical(y_train, num_classes=num_classes)
    y_val = np_utils.to_categorical(y_val, num_classes=num_classes)
    return X_train, y_train, X_val, y_val
# NOTE: `y_trian` (sic) is the name the training call further down uses; keep both in sync.
w2id,embedding_weights = generate_id2wec(model)# get the vocabulary index and the embedding matrix
x_train,y_trian, x_val , y_val = prepare_data(w2id,sentences,labels,200)# convert the data into the format the model needs

 

構建模型


 

class Sentiment:
    """Bidirectional-LSTM text classifier built on a pre-trained embedding matrix.

    The Keras model is created and compiled when the object is constructed.
    """

    def __init__(self,w2id,embedding_weights,Embedding_dim,maxlen,labels_category):
        """Store the configuration and build the model.

        Args:
            w2id: Mapping from word to integer index (kept as ``self.vocab``).
            embedding_weights: Initial weight matrix for the embedding layer.
            Embedding_dim: Output dimension of the embedding layer.
            maxlen: Input sequence length.
            labels_category: Number of units in the final dense layer.
        """
        # build_model() reads every attribute below, so it must come last.
        self.vocab = w2id
        self.embedding_weights = embedding_weights
        self.Embedding_dim = Embedding_dim
        self.maxlen = maxlen
        self.labels_category = labels_category
        self.model = self.build_model()

    def build_model(self):
        """Assemble, compile and summarise the network; return the model.

        Layer stack: Embedding -> Bidirectional(LSTM(50)) with concatenated
        outputs -> Dropout(0.5) -> Dense(labels_category) -> softmax.
        """
        embedding = Embedding(
            output_dim=self.Embedding_dim,
            input_dim=len(self.vocab) + 1,
            weights=[self.embedding_weights],
            input_length=self.maxlen,
        )
        stack = [
            embedding,
            Bidirectional(LSTM(50), merge_mode='concat'),
            Dropout(0.5),
            Dense(self.labels_category),
            Activation('softmax'),
        ]

        net = Sequential()
        for layer in stack:
            net.add(layer)

        net.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'],
        )
        net.summary()
        return net

    def train(self,X_train, y_train,X_test, y_test,n_epoch=5 ):
        """Fit the model and save it to ``sentiment.h5``.

        Args:
            X_train: Training inputs.
            y_train: Training targets.
            X_test: Validation inputs.
            y_test: Validation targets.
            n_epoch: Number of epochs to train for.
        """
        validation = (X_test, y_test)
        self.model.fit(X_train, y_train, batch_size=32, epochs=n_epoch,
                       validation_data=validation)
        self.model.save('sentiment.h5')

    def predict(self,model_path,new_sen):
        """Predict the class index of a single raw sentence.

        The weights are (re)loaded from ``model_path`` on every call. The
        sentence is segmented with jieba, words are mapped to indices
        (unknown words become 0) and the sequence is passed through
        ``pad_sequences`` with ``maxlen=self.maxlen``.

        Args:
            model_path: Path of the file to load the weights from.
            new_sen: Raw sentence string.

        Returns:
            The index of the highest-scoring class (``np.argmax`` result).
        """
        net = self.model
        net.load_weights(model_path)
        token_ids = [self.vocab.get(token, 0) for token in jieba.lcut(new_sen)]
        batch = pad_sequences([token_ids], maxlen=self.maxlen)
        scores = net.predict(batch)[0]
        return np.argmax(scores)
# Build the classifier: embedding dimension 100, sequence length 200, 3 label categories.
senti = Sentiment(w2id,embedding_weights,100,200,3)

 

訓練預測


senti.train(x_train,y_trian, x_val ,y_val,1)# train for a single epoch
# Human-readable names for the three class indices.
label_dic = {0:"消極的",1:"中性的",2:"積極的"}
sen_new = "現如今的公司能夠做成這樣已經很不錯了,微訂點單網站的信息更新很及時,內容來源很真實"
# Reload the weights saved by train() and classify the new sentence.
pre = senti.predict("./sentiment.h5",sen_new)
print("'{}'的情感是:\n{}".format(sen_new,label_dic.get(pre)))

參考:https://www.jianshu.com/p/fba7df3a76fa

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM