Chinese Text Classification with a CNN (dataset: Fudan Chinese corpus)


Related posts in this series:

Chinese text classification with TfidfVectorizer (Fudan Chinese corpus)

Chinese text classification with an RNN (Fudan Chinese corpus)

In the previous post we used an RNN (GRU) to classify Chinese text; in this post we continue with a CNN. The model follows the TextCNN from the repository referenced at the end: an embedding layer initialized with pre-trained word2vec vectors, a 1-D convolution, global max pooling over time, a fully connected layer with dropout and ReLU, and a softmax classifier.

The data processing is unchanged from the previous post; only the model is swapped. The full code is as follows:

# coding: utf-8

from __future__ import print_function

import os
import sys
import time
from datetime import timedelta
import keras

import numpy as np
import tensorflow as tf
from sklearn import metrics
# Map each word in the vocabulary to an id
def word2id():
  vocabulary_path = '/content/drive/My Drive/NLP/dataset/Fudan/vocabulary.txt'
  fp1 = open(vocabulary_path,'r',encoding='utf-8')
  word2id_dict = {}
  for i,line in enumerate(fp1.readlines()):
    word2id_dict[line.strip()] = i
  print(len(word2id_dict))
  fp1.close()
  return word2id_dict

# Read the texts and their corresponding labels
def get_content_label(path):
  #data = '/content/drive/My Drive/NLP/dataset/Fudan/data/train_clean_jieba.txt'
  fp = open(path,'r',encoding='utf-8')
  content_list = []
  label_list = []
  for line in fp.readlines():
    line = line.strip().split('\t')
    if len(line) == 2:
      content_list.append(line[0])
      label_list.append(line[1])
  print(content_list[:5])
  print(label_list[:5])
  fp.close()
  return content_list,label_list
# Get the id corresponding to each label
def get_label_id():
  label = '/content/drive/My Drive/NLP/dataset/Fudan/label.txt'
  label2id_dict = {}
  fp = open(label,'r',encoding='utf-8')
  for line in fp.readlines():
    line = line.strip().split('\t')
    label2id_dict[line[0]] = line[1]
  #print(label2id_dict)
  return label2id_dict
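
# Note on the input files (an assumption based on the parsing code above and the earlier
# posts in this series, since the files themselves are not shown here):
#   vocabulary.txt    - one token per line; the line index is used as the token id
#   label.txt         - one "label_name<TAB>label_id" pair per line
#   *_clean_jieba.txt - one "segmented_text<TAB>label_name" pair per line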
# Replace each word in a text with its id and pad the texts to a fixed maximum length;
# one-hot encode the labels
def process(path,max_length):
  contents,labels = get_content_label(path)
  word_to_id = word2id()
  cat_to_id = get_label_id()
  data_id = []
  label_id = []
  for i in range(len(contents)):
    data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
    label_id.append(cat_to_id[labels[i]])

  # Use Keras' pad_sequences to pad the texts to a fixed length
  x_pad = keras.preprocessing.sequence.pad_sequences(data_id, max_length)
  y_pad = keras.utils.to_categorical(label_id, num_classes=len(cat_to_id))  # convert labels to one-hot
  return x_pad,y_pad
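
# Note: with the settings used in this post (seq_length=600, 20 categories), process()
# returns x_pad of shape [num_examples, 600] and y_pad of shape [num_examples, 20].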

def batch_iter(x, y, batch_size=64):
    """Generate batches of data"""
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1

    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]

def evaluate(sess, x_, y_):
    """Evaluate the loss and accuracy on a given dataset"""
    data_len = len(x_)
    batch_eval = batch_iter(x_, y_, 128)
    total_loss = 0.0
    total_acc = 0.0
    for x_batch, y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict)
        total_loss += loss * batch_len
        total_acc += acc * batch_len

    return total_loss / data_len, total_acc / data_len

def get_time_dif(start_time):
    """Get elapsed time"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


def feed_data(x_batch, y_batch, keep_prob):
    feed_dict = {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob
    }
    return feed_dict


# Load the pre-trained word2vec matrix saved in the earlier posts; the .npz file is
# assumed to contain an "embeddings" array of shape [vocab_size, embedding_dim].
def get_training_word2vec_vectors(filename):
  with np.load(filename) as data:
    return data["embeddings"]

class TCNNConfig(object):
    """CNN configuration parameters"""

    embedding_dim = 100  # word embedding dimension
    seq_length = 600  # sequence length
    num_classes = 20  # number of classes
    num_filters = 256  # number of convolution filters
    kernel_size = 5  # convolution kernel size
    vocab_size = 183664  # vocabulary size

    hidden_dim = 128  # number of units in the fully connected layer

    dropout_keep_prob = 0.5  # dropout keep probability
    learning_rate = 1e-3  # learning rate

    batch_size = 64  # training batch size
    num_epochs = 10  # total number of epochs

    print_per_batch = 20  # report results every this many batches
    save_per_batch = 10  # write summaries to TensorBoard every this many batches
    pre_trianing = None  # pre-trained embedding matrix, filled in from vector_word_npz in __main__
    vector_word_npz = '/content/drive/My Drive/NLP/dataset/Fudan/vector_word.npz'


class TextCNN(object):
    """Text classification CNN model"""

    def __init__(self, config):
        self.config = config

        # Three input placeholders
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.cnn()

    def cnn(self):
        """Build the CNN graph"""
        # Word embedding lookup
        with tf.device('/cpu:0'):
            #embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding = tf.get_variable("embeddings", shape=[self.config.vocab_size, self.config.embedding_dim],
                                             initializer=tf.constant_initializer(self.config.pre_trianing))
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)
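            # Tensor shapes from here on, assuming the defaults in TCNNConfig above
            # (seq_length=600, embedding_dim=100, num_filters=256, kernel_size=5,
            # hidden_dim=128, num_classes=20):
            #   embedding_inputs: [batch, 600, 100]
            #   conv ('valid' padding): [batch, 596, 256]
            #   gmp (global max pooling over time): [batch, 256]
            #   fc: [batch, 128]  ->  logits: [batch, 20]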
            

        with tf.name_scope("cnn"):
            # CNN layer
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
            # global max pooling layer
            gmp = tf.reduce_max(conv, axis=1, name='gmp')

        with tf.name_scope("score"):
            # Fully connected layer, followed by dropout and ReLU activation
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)

            # Classifier
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class

        with tf.name_scope("optimize"):
            # Loss: cross-entropy
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            # Optimizer
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # Accuracy
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

def train():
    print("Configuring TensorBoard and Saver...")
    # Configure TensorBoard; delete the tensorboard folder before retraining, otherwise the new graphs will be written over the old ones
    tensorboard_dir = 'tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)
    save_dir = 'checkpoint/textcnn/'
    save_path = os.path.join(save_dir, 'best_validation')  # path for saving the best validation model
    # Configure the Saver
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    # Load the training and validation sets
    start_time = time.time()
    train_dir = '/content/drive/My Drive/NLP/dataset/Fudan/data/train_clean_jieba.txt'
    val_dir = '/content/drive/My Drive/NLP/dataset/Fudan/data/test_clean_jieba.txt'
    x_train, y_train = process(train_dir, config.seq_length)
    x_val, y_val = process(val_dir, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # Create the session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0  # total number of batches processed
    best_acc_val = 0.0  # best validation accuracy so far
    last_improved = 0  # batch at which the last improvement occurred
    require_improvement = 1000  # stop early if there is no improvement for more than 1000 batches

    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # Write training summaries to TensorBoard every save_per_batch batches
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # Report performance on the training and validation sets every print_per_batch batches
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)  # todo

                if acc_val > best_acc_val:
                    # Save the best result so far
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            feed_dict[model.keep_prob] = config.dropout_keep_prob
            session.run(model.optim, feed_dict=feed_dict)  # run the optimization step
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # Validation accuracy has not improved for a long time; stop training early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # break out of the inner loop
        if flag:  # same as above: also break out of the epoch loop
            break


def test():
    print("Loading test data...")
    start_time = time.time()
    test_dir = '/content/drive/My Drive/NLP/dataset/Fudan/data/test_clean_jieba.txt'
    x_test, y_test = process(test_dir, config.seq_length)
    save_path = 'checkpoint/textcnn/best_validation'

    session = tf.Session()
    session.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1

    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # store the predictions
    for i in range(num_batch):  # process batch by batch
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)
    categories = list(get_label_id().values())  # the label ids (as strings) are used as category names in the report
    # Evaluation report
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
  print('Configuring CNN model...')
  config = TCNNConfig()
  config.pre_trianing = get_training_word2vec_vectors(config.vector_word_npz)
  model = TextCNN(config)
  test()
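
Note that the __main__ block above only calls test(), so it assumes the best checkpoint under checkpoint/textcnn/ already exists. A minimal sketch (not part of the original script) of how the two phases would be run in order to reproduce the logs below:

if __name__ == '__main__':
  print('Configuring CNN model...')
  config = TCNNConfig()
  config.pre_trianing = get_training_word2vec_vectors(config.vector_word_npz)
  model = TextCNN(config)
  train()  # saves the best model to checkpoint/textcnn/best_validation
  test()   # restores that checkpoint and evaluates on the test set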

 

The training output is as follows:

Epoch: 8
Iter:   1080, Train Loss:   0.13, Train Acc:  95.31%, Val Loss:   0.44, Val Acc:  87.19%, Time: 0:04:33 
Iter:   1100, Train Loss:   0.24, Train Acc:  95.31%, Val Loss:   0.44, Val Acc:  87.03%, Time: 0:04:38 
Iter:   1120, Train Loss:   0.19, Train Acc:  93.75%, Val Loss:   0.43, Val Acc:  87.38%, Time: 0:04:42 
Iter:   1140, Train Loss:   0.17, Train Acc:  92.19%, Val Loss:   0.42, Val Acc:  87.80%, Time: 0:04:47 *
Iter:   1160, Train Loss:   0.21, Train Acc:  90.62%, Val Loss:   0.41, Val Acc:  87.89%, Time: 0:04:53 *
Iter:   1180, Train Loss:   0.34, Train Acc:  89.06%, Val Loss:   0.43, Val Acc:  87.57%, Time: 0:04:57 
Iter:   1200, Train Loss:   0.22, Train Acc:  92.19%, Val Loss:   0.41, Val Acc:  87.62%, Time: 0:05:01 
Iter:   1220, Train Loss:   0.24, Train Acc:  90.62%, Val Loss:   0.41, Val Acc:  87.87%, Time: 0:05:06 
Epoch: 9
Iter:   1240, Train Loss:  0.096, Train Acc:  95.31%, Val Loss:    0.4, Val Acc:  88.34%, Time: 0:05:11 *
Iter:   1260, Train Loss:   0.21, Train Acc:  92.19%, Val Loss:   0.41, Val Acc:  87.98%, Time: 0:05:16 
Iter:   1280, Train Loss:   0.13, Train Acc:  95.31%, Val Loss:   0.42, Val Acc:  88.14%, Time: 0:05:20 
Iter:   1300, Train Loss:    0.1, Train Acc:  98.44%, Val Loss:   0.43, Val Acc:  87.76%, Time: 0:05:25 
Iter:   1320, Train Loss:   0.27, Train Acc:  92.19%, Val Loss:   0.39, Val Acc:  87.93%, Time: 0:05:29 
Iter:   1340, Train Loss:   0.19, Train Acc:  92.19%, Val Loss:   0.45, Val Acc:  87.67%, Time: 0:05:33 
Iter:   1360, Train Loss:   0.27, Train Acc:  92.19%, Val Loss:   0.42, Val Acc:  87.57%, Time: 0:05:38 
Iter:   1380, Train Loss:   0.17, Train Acc:  92.19%, Val Loss:   0.41, Val Acc:  88.07%, Time: 0:05:42 
Epoch: 10
Iter:   1400, Train Loss:    0.1, Train Acc:  98.44%, Val Loss:   0.39, Val Acc:  88.64%, Time: 0:05:47 *
Iter:   1420, Train Loss:  0.069, Train Acc:  96.88%, Val Loss:    0.4, Val Acc:  88.46%, Time: 0:05:51 
Iter:   1440, Train Loss:   0.15, Train Acc:  98.44%, Val Loss:   0.41, Val Acc:  88.16%, Time: 0:05:56 
Iter:   1460, Train Loss:  0.073, Train Acc:  98.44%, Val Loss:    0.4, Val Acc:  88.38%, Time: 0:06:00 
Iter:   1480, Train Loss:   0.16, Train Acc:  95.31%, Val Loss:   0.42, Val Acc:  88.12%, Time: 0:06:05 
Iter:   1500, Train Loss:   0.21, Train Acc:  92.19%, Val Loss:   0.41, Val Acc:  87.79%, Time: 0:06:09 
Iter:   1520, Train Loss:   0.16, Train Acc:  93.75%, Val Loss:   0.41, Val Acc:  88.03%, Time: 0:06:13 

Then run the test; the test results are as follows:

Testing...
2020-10-19 12:51:46.979827: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-10-19 12:51:47.221023: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
Test Loss:   0.39, Test Acc:  88.64%
Precision, Recall and F1-Score...
/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.33      0.05      0.09        61
           1       0.89      0.96      0.93      1022
           2       0.39      0.15      0.22        59
           3       0.89      0.95      0.92      1254
           4       0.33      0.08      0.12        52
           5       0.83      0.90      0.86      1026
           6       0.95      0.98      0.96      1358
           7       0.67      0.04      0.08        45
           8       0.39      0.28      0.32        76
           9       0.85      0.94      0.89       742
          10       0.00      0.00      0.00        34
          11       0.00      0.00      0.00        28
          12       0.96      0.96      0.96      1218
          13       0.87      0.92      0.89       642
          14       0.50      0.15      0.23        33
          15       0.67      0.07      0.13        27
          16       0.91      0.91      0.91      1601
          17       0.86      0.11      0.20        53
          18       0.00      0.00      0.00        34
          19       0.74      0.69      0.72       468

    accuracy                           0.89      9833
   macro avg       0.60      0.46      0.47      9833
weighted avg       0.87      0.89      0.87      9833

Confusion Matrix...
[[   3    1    0   42    0    5    0    0    4    3    0    0    0    2
     0    0    1    0    0    0]
 [   0  983    0    5    0    1    0    0    0    0    0    0    8    3
     0    0   14    1    0    7]
 [   1    2    9    3    0    4    2    0    3    1    0    0    2   15
     3    0   13    0    0    1]
 [   0    3    0 1195    0   12    2    0    0   16    0    0    3    2
     0    0    8    0    0   13]
 [   0    6    1    1    4   14    5    0    5    0    0    0    1    1
     0    0   14    0    0    0]
 [   0    7    0   16    0  924    1    0    3    5    0    0    1    0
     0    0   39    0    0   30]
 [   0    1    0    3    0    0 1328    1    1    0    0    0    1   17
     0    0    5    0    0    1]
 [   0    0    0   13    0   12    0    2    0    8    0    0    1    2
     0    0    0    0    0    7]
 [   2    1    1    7    0   39    0    0   21    0    0    0    0    4
     0    0    0    0    0    1]
 [   0    1    0   10    0   10    1    0    1  696    0    0    0    0
     0    0    3    0    0   20]
 [   0    0    0    4    0    0    0    0    0   15    0    0    0    1
     0    0    1    0    0   13]
 [   0    0    0    2    1    0    5    0    2    0    0    0    0   10
     1    0    7    0    0    0]
 [   0   11    0    1    1    1    8    0    3    0    0    0 1175    6
     0    0    7    0    0    5]
 [   0    0    0    6    0    0   31    0    0    1    0    0   12  589
     0    0    3    0    0    0]
 [   0    2    4    1    1    1    0    0    1    0    0    0    4    6
     5    1    7    0    0    0]
 [   0    0    2    1    0    1    6    0    0    0    0    0    0   11
     0    2    4    0    0    0]
 [   0   70    2   10    2   39    5    0    2    2    0    0    7    0
     0    0 1451    0    0   11]
 [   3    4    0   10    3   12    0    0    6    3    0    0    0    0
     0    0    5    6    0    1]
 [   0    7    4    0    0    1    0    0    1    1    0    0    6    5
     1    0    7    0    0    1]
 [   0    4    0    7    0   43    5    0    1   72    0    0    1    1
     0    0   11    0    0  323]]
Time usage: 0:00:13

With that, the whole pipeline, from data processing to model training and testing, is complete for the traditional TF-IDF + Naive Bayes approach, RNNs (LSTM, GRU), and the CNN. Next up are Transformer and BERT; stay tuned.

 

Reference:

https://github.com/gaussic/text-classification-cnn-rnn

