Chinese Sentiment Analysis with GloVe + LSTM


I recently experimented with sentiment analysis for Chinese text.

The approach mainly uses GloVe word vectors and an LSTM network. The dataset is a Chinese hotel review corpus (ChnSentiCorp).

1. First, train GloVe to obtain word vectors (300-dimensional here). This step uses jieba for word segmentation and Chinese Wikipedia as the corpus; see the sketch after this list.

2. Clean the hotel review corpus and segment it with jieba, then convert each segmented review into its word-vector representation.

3. Train an LSTM network on the resulting sequences.

The final accuracy is around 91%.
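Training GloVe itself (step 1) is not part of the script below; the vectors were trained on jieba-segmented Chinese Wikipedia text. A minimal sketch of that segmentation step, assuming the Wikipedia dump has already been extracted to a plain-text file zhwiki.txt (a hypothetical name; GloVe trainers expect whitespace-separated tokens):

import jieba

# Segment each line of the extracted Wikipedia text and write space-separated
# tokens; the output file then serves as the GloVe training corpus.
with open('zhwiki.txt', encoding='utf-8') as fin, \
     open('zhwiki_seg.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        fout.write(' '.join(jieba.cut(line.strip())) + '\n')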

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 30 13:52:23 2018

@author: xyli
處理酒店評價語料數據,
分詞,並轉化為Glove向量
"""
import os
import re
import jieba
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional
# from Attention_layer import Attention_layer  # optional custom attention layer,
# only needed for the commented-out experiments in the model below


def loadGLoveModel(filename):
    """Load a GloVe text file into a dict mapping word -> vector."""
    embeddings_index = {}
    with open(filename, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index
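
# For reference, each line of the GloVe text file is a word followed by its
# vector components, e.g. (values illustrative, not from the real file):
#   酒店 0.1834 -0.0512 0.2277 ... (300 floats in total)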

def word2Glovec(List, model):
    """Map a list of tokens to GloVe vectors; unknown words become zero vectors."""
    vec = []
    insert = np.zeros(300, dtype='float32')  # 300 is the vector dimensionality
    for w in List:
        v = model.get(w)
        if v is None:
            vec.append(insert)
        else:
            vec.append(v)
    return vec

def clean_str(string):
    """
    String cleaning for the dataset: remove backslashes, quotes,
    line breaks, and common half- and full-width punctuation.
    """
    string = re.sub(r"[\\'\"]", "", string)
    string = re.sub(r"\r\n|\r", "", string)
    string = re.sub(r"[,.,。()()“”]", "", string)
    return string.strip()
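
# Illustrative effect of clean_str:
#   clean_str('服務很好,(真的)。\r\n')  ->  '服務很好真的'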

def fitList(List, n):
    """Pad or truncate a token list to exactly n tokens."""
    L = len(List)
    insert = '!'  # padding token; word2Glovec maps it to the zero vector if it is not in GloVe
    if L < n:
        List += [insert for i in range(n - L)]
    elif L > n:
        List = List[0:n]
    return List
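
# Illustrative behaviour of fitList (hypothetical inputs):
#   fitList(['房間', '很', '大'], 5)  ->  ['房間', '很', '大', '!', '!']
#   a 120-token list with n=100      ->  truncated to its first 100 tokens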

def readData(filename):
    """Read one review file (GB18030-encoded), clean it, and segment it with jieba."""
    with open(filename, 'rb') as f:
        data = f.read()
    data = data.decode('gb18030', 'ignore')
    data = clean_str(data)
    seg_list = jieba.cut(data)  # accurate mode (the default)
    segList = []
    for s in seg_list:
        s = clean_str(s)
        segList.append(s)
    return segList
        
def loadData():
    """Load the negative and positive reviews and build one-hot labels."""
    Corpus_DIR = "data/ChnSentiCorp_htl_unba_10000"
    DIR = ['/neg', '/pos']
    commentList = []

    rootdir = Corpus_DIR + DIR[0]
    filelist = os.listdir(rootdir)  # list every entry in the directory
    labelList = [[0.0, 1.0] for i in range(0, len(filelist))]  # negative label
    for i in range(0, len(filelist)):
        path = os.path.join(rootdir, filelist[i])
        if os.path.isfile(path):
            commentList.append(readData(path))

    rootdir = Corpus_DIR + DIR[1]
    filelist = os.listdir(rootdir)
    labelList2 = [[1.0, 0.0] for i in range(0, len(filelist))]  # positive label
    for i in range(0, len(filelist)):
        path = os.path.join(rootdir, filelist[i])
        if os.path.isfile(path):
            commentList.append(readData(path))
    labelList += labelList2
    return commentList, labelList
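
# loadData() assumes the corpus is unpacked as below, one plain-text,
# GB18030-encoded review per file (the individual file names vary):
#   data/ChnSentiCorp_htl_unba_10000/neg/...
#   data/ChnSentiCorp_htl_unba_10000/pos/...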

if __name__ == '__main__':
    List, labelList = loadData()  # load the corpus
    gloveModel = loadGLoveModel('model/zhs_wiki_glove.vectors.300d.txt')  # load the GloVe vectors
    countList = []
    commentVecList = []
    n = 100  # every review is padded or truncated to 100 tokens
    for c in List:
        countList.append(len(c))
        glovec = word2Glovec(fitList(c, n), gloveModel)
        commentVecList.append(glovec)

    VALIDATION_SPLIT = 0.2

    commentVecList = np.array(commentVecList)
    labelList = np.array(labelList)
    indices = np.arange(commentVecList.shape[0])
    np.random.shuffle(indices)  # shuffle reviews and labels with the same permutation
    data = commentVecList[indices]
    labels = labelList[indices]

    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]

    model = Sequential()
    model.add(LSTM(120, input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=True))
#    model.add(Activation('relu'))  # activation layer (experiment)
#    model.add(Attention_layer())
    model.add(Bidirectional(LSTM(60, return_sequences=True)))
#    model.add(Attention_layer())
#    model.add(Activation('relu'))  # activation layer (experiment)
    model.add(Dropout(0.3))  # randomly drop units to reduce overfitting
    model.add(Bidirectional(LSTM(30, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              epochs=25, batch_size=200)
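
After training, the same helpers can score a new review. A minimal sketch, appended to the end of the script above (the sample review text is made up):

def predict_sentiment(text, n=100):
    """Clean, segment, vectorize, and classify one review with the trained model."""
    tokens = [clean_str(t) for t in jieba.cut(clean_str(text))]
    vecs = np.array([word2Glovec(fitList(tokens, n), gloveModel)])
    probs = model.predict(vecs)[0]  # [P(pos), P(neg)] given the label layout above
    return 'positive' if probs[0] > probs[1] else 'negative'

print(predict_sentiment('房間很乾淨,服務態度也很好'))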
   
    

This article is still a work in progress...

