Chinese Text Classification with CNN (Dataset: the Fudan Chinese Corpus)
In the previous section we classified Chinese text with an RNN (GRU); in this section we continue by classifying the same text with a CNN.
The data processing is unchanged; only the model is different. The code is as follows:
# coding: utf-8
from __future__ import print_function

import os
import sys
import time
from datetime import timedelta

import keras
import numpy as np
import tensorflow as tf
from sklearn import metrics


# Map each word in the vocabulary to an id
def word2id():
    vocabulary_path = '/content/drive/My Drive/NLP/dataset/Fudan/vocabulary.txt'
    fp1 = open(vocabulary_path, 'r', encoding='utf-8')
    word2id_dict = {}
    for i, line in enumerate(fp1.readlines()):
        word2id_dict[line.strip()] = i
    print(len(word2id_dict))
    fp1.close()
    return word2id_dict


# Get the text contents and their corresponding labels
def get_content_label(path):
    # data = '/content/drive/My Drive/NLP/dataset/Fudan/data/train_clean_jieba.txt'
    fp = open(path, 'r', encoding='utf-8')
    content_list = []
    label_list = []
    for line in fp.readlines():
        line = line.strip().split('\t')
        if len(line) == 2:
            content_list.append(line[0])
            label_list.append(line[1])
    print(content_list[:5])
    print(label_list[:5])
    fp.close()
    return content_list, label_list


# Get the id corresponding to each label
def get_label_id():
    label = '/content/drive/My Drive/NLP/dataset/Fudan/label.txt'
    label2id_dict = {}
    fp = open(label, 'r', encoding='utf-8')
    for line in fp.readlines():
        line = line.strip().split('\t')
        label2id_dict[line[0]] = line[1]
    # print(label2id_dict)
    return label2id_dict


# Replace each word in the texts with its id and fix the maximum text length
# One-hot encode the labels
def process(path, max_length):
    contents, labels = get_content_label(path)
    word_to_id = word2id()
    cat_to_id = get_label_id()
    data_id = []
    label_id = []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])
    # Use keras pad_sequences to pad every text to a fixed length
    x_pad = keras.preprocessing.sequence.pad_sequences(data_id, max_length)
    y_pad = keras.utils.to_categorical(label_id, num_classes=len(cat_to_id))  # convert labels to one-hot
    return x_pad, y_pad


def batch_iter(x, y, batch_size=64):
    """Generate batches of data."""
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]


def evaluate(sess, x_, y_):
    """Evaluate accuracy and loss on a given dataset."""
    data_len = len(x_)
    batch_eval = batch_iter(x_, y_, 128)
    total_loss = 0.0
    total_acc = 0.0
    for x_batch, y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict)
        total_loss += loss * batch_len
        total_acc += acc * batch_len
    return total_loss / data_len, total_acc / data_len


def get_time_dif(start_time):
    """Get the elapsed time."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


def feed_data(x_batch, y_batch, keep_prob):
    feed_dict = {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob
    }
    return feed_dict


def get_training_word2vec_vectors(filename):
    with np.load(filename) as data:
        return data["embeddings"]


class TCNNConfig(object):
    """CNN configuration parameters."""
    embedding_dim = 100        # word vector dimension
    seq_length = 600           # sequence length
    num_classes = 20           # number of classes
    num_filters = 256          # number of convolution filters
    kernel_size = 5            # convolution kernel size
    vocab_size = 183664        # vocabulary size
    hidden_dim = 128           # number of units in the fully connected layer
    dropout_keep_prob = 0.5    # dropout keep probability
    learning_rate = 1e-3       # learning rate
    batch_size = 64            # training batch size
    num_epochs = 10            # total number of epochs
    print_per_batch = 20       # report results every this many batches
    save_per_batch = 10        # write to TensorBoard every this many batches
    pre_training = None        # pre-trained word2vec embedding matrix, loaded in __main__
    vector_word_npz = '/content/drive/My Drive/NLP/dataset/Fudan/vector_word.npz'


class TextCNN(object):
    """CNN model for text classification."""

    def __init__(self, config):
        self.config = config
        # The three inputs to be fed
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.cnn()

    def cnn(self):
        """Build the CNN model."""
        # Word embedding lookup
        with tf.device('/cpu:0'):
            # embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding = tf.get_variable("embeddings",
                                        shape=[self.config.vocab_size, self.config.embedding_dim],
                                        initializer=tf.constant_initializer(self.config.pre_training))
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope("cnn"):
            # CNN layer
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
            # global max pooling layer
            gmp = tf.reduce_max(conv, reduction_indices=[1], name='gmp')

        with tf.name_scope("score"):
            # Fully connected layer, followed by dropout and ReLU activation
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)
            # Classifier
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class

        with tf.name_scope("optimize"):
            # Loss function: cross entropy
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            # Optimizer
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # Accuracy
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


def train():
    print("Configuring TensorBoard and Saver...")
    # Configure TensorBoard; delete the tensorboard folder before retraining, otherwise the graphs get overwritten
    tensorboard_dir = 'tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    save_dir = 'checkpoint/textcnn/'
    save_path = os.path.join(save_dir, 'best_validation')  # path for saving the best validation result
    # Configure Saver
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    # Load the training and validation sets
    start_time = time.time()
    train_dir = '/content/drive/My Drive/NLP/dataset/Fudan/data/train_clean_jieba.txt'
    val_dir = '/content/drive/My Drive/NLP/dataset/Fudan/data/test_clean_jieba.txt'
    x_train, y_train = process(train_dir, config.seq_length)
    x_val, y_val = process(val_dir, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # Create session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0              # total number of batches processed
    best_acc_val = 0.0           # best validation accuracy
    last_improved = 0            # batch at which the last improvement occurred
    require_improvement = 1000   # stop early if no improvement after this many batches

    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # write the training results to TensorBoard scalars every save_per_batch batches
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # report performance on the training and validation sets every print_per_batch batches
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)  # todo

                if acc_val > best_acc_val:
                    # save the best result
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            feed_dict[model.keep_prob] = config.dropout_keep_prob
            session.run(model.optim, feed_dict=feed_dict)  # run the optimizer
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # validation accuracy has not improved for a long time; stop early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # break out of the batch loop
        if flag:
            break  # break out of the epoch loop as well


def test():
    print("Loading test data...")
    start_time = time.time()
    test_dir = '/content/drive/My Drive/NLP/dataset/Fudan/data/test_clean_jieba.txt'
    x_test, y_test = process(test_dir, config.seq_length)

    save_path = 'checkpoint/textcnn/best_validation'
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1

    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # store the predictions
    for i in range(num_batch):  # process batch by batch
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    categories = get_label_id().values()
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    print('Configuring CNN model...')
    config = TCNNConfig()
    config.pre_training = get_training_word2vec_vectors(config.vector_word_npz)
    model = TextCNN(config)
    # run train() first to produce checkpoint/textcnn/best_validation, then test()
    test()
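The script assumes that vector_word.npz already exists and stores a pre-trained embedding matrix under the key "embeddings", with one row per line of vocabulary.txt (in the same order that word2id() assigns ids). As a rough illustration of how such a file could be produced, here is a minimal sketch assuming a word2vec model trained with gensim and saved as KeyedVectors; the w2v_path below is a hypothetical path, not from the original post:

# Sketch: build vector_word.npz from vocabulary.txt and a trained word2vec model (gensim assumed).
import numpy as np
from gensim.models import KeyedVectors

vocab_path = '/content/drive/My Drive/NLP/dataset/Fudan/vocabulary.txt'
w2v_path = 'word2vec.kv'   # hypothetical path to the trained vectors
out_path = '/content/drive/My Drive/NLP/dataset/Fudan/vector_word.npz'
embedding_dim = 100        # must match TCNNConfig.embedding_dim

kv = KeyedVectors.load(w2v_path)

with open(vocab_path, 'r', encoding='utf-8') as f:
    vocab = [line.strip() for line in f]

# One row per vocabulary entry, in the same order that word2id() assigns ids
embeddings = np.zeros((len(vocab), embedding_dim), dtype=np.float32)
for i, word in enumerate(vocab):
    if word in kv:
        embeddings[i] = kv[word]   # words missing from word2vec stay all-zero

np.savez_compressed(out_path, embeddings=embeddings)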
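For readers on TensorFlow 2.x, the same architecture (pre-trained embedding, 1-D convolution, global max pooling, fully connected layer with dropout and ReLU, softmax classifier) can be expressed much more compactly with tf.keras. This is only an illustrative sketch of the equivalent network, not the code that produced the results below:

# Sketch: the same TextCNN architecture expressed with tf.keras (illustrative only).
import tensorflow as tf

def build_textcnn(config):
    inputs = tf.keras.Input(shape=(config.seq_length,), dtype='int32')
    # Embedding initialized from the pre-trained word2vec matrix
    x = tf.keras.layers.Embedding(
        config.vocab_size, config.embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(config.pre_training))(inputs)
    x = tf.keras.layers.Conv1D(config.num_filters, config.kernel_size, name='conv')(x)
    x = tf.keras.layers.GlobalMaxPooling1D(name='gmp')(x)           # global max pooling over time
    x = tf.keras.layers.Dense(config.hidden_dim, name='fc1')(x)     # fully connected layer
    x = tf.keras.layers.Dropout(1.0 - config.dropout_keep_prob)(x)  # dropout rate = 1 - keep_prob
    x = tf.keras.layers.Activation('relu')(x)
    outputs = tf.keras.layers.Dense(config.num_classes, activation='softmax', name='fc2')(x)
    model = tf.keras.Model(inputs, outputs)
    model.compile(optimizer=tf.keras.optimizers.Adam(config.learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model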
The training results are as follows:
Epoch: 8
Iter: 1080, Train Loss: 0.13, Train Acc: 95.31%, Val Loss: 0.44, Val Acc: 87.19%, Time: 0:04:33
Iter: 1100, Train Loss: 0.24, Train Acc: 95.31%, Val Loss: 0.44, Val Acc: 87.03%, Time: 0:04:38
Iter: 1120, Train Loss: 0.19, Train Acc: 93.75%, Val Loss: 0.43, Val Acc: 87.38%, Time: 0:04:42
Iter: 1140, Train Loss: 0.17, Train Acc: 92.19%, Val Loss: 0.42, Val Acc: 87.80%, Time: 0:04:47 *
Iter: 1160, Train Loss: 0.21, Train Acc: 90.62%, Val Loss: 0.41, Val Acc: 87.89%, Time: 0:04:53 *
Iter: 1180, Train Loss: 0.34, Train Acc: 89.06%, Val Loss: 0.43, Val Acc: 87.57%, Time: 0:04:57
Iter: 1200, Train Loss: 0.22, Train Acc: 92.19%, Val Loss: 0.41, Val Acc: 87.62%, Time: 0:05:01
Iter: 1220, Train Loss: 0.24, Train Acc: 90.62%, Val Loss: 0.41, Val Acc: 87.87%, Time: 0:05:06
Epoch: 9
Iter: 1240, Train Loss: 0.096, Train Acc: 95.31%, Val Loss: 0.4, Val Acc: 88.34%, Time: 0:05:11 *
Iter: 1260, Train Loss: 0.21, Train Acc: 92.19%, Val Loss: 0.41, Val Acc: 87.98%, Time: 0:05:16
Iter: 1280, Train Loss: 0.13, Train Acc: 95.31%, Val Loss: 0.42, Val Acc: 88.14%, Time: 0:05:20
Iter: 1300, Train Loss: 0.1, Train Acc: 98.44%, Val Loss: 0.43, Val Acc: 87.76%, Time: 0:05:25
Iter: 1320, Train Loss: 0.27, Train Acc: 92.19%, Val Loss: 0.39, Val Acc: 87.93%, Time: 0:05:29
Iter: 1340, Train Loss: 0.19, Train Acc: 92.19%, Val Loss: 0.45, Val Acc: 87.67%, Time: 0:05:33
Iter: 1360, Train Loss: 0.27, Train Acc: 92.19%, Val Loss: 0.42, Val Acc: 87.57%, Time: 0:05:38
Iter: 1380, Train Loss: 0.17, Train Acc: 92.19%, Val Loss: 0.41, Val Acc: 88.07%, Time: 0:05:42
Epoch: 10
Iter: 1400, Train Loss: 0.1, Train Acc: 98.44%, Val Loss: 0.39, Val Acc: 88.64%, Time: 0:05:47 *
Iter: 1420, Train Loss: 0.069, Train Acc: 96.88%, Val Loss: 0.4, Val Acc: 88.46%, Time: 0:05:51
Iter: 1440, Train Loss: 0.15, Train Acc: 98.44%, Val Loss: 0.41, Val Acc: 88.16%, Time: 0:05:56
Iter: 1460, Train Loss: 0.073, Train Acc: 98.44%, Val Loss: 0.4, Val Acc: 88.38%, Time: 0:06:00
Iter: 1480, Train Loss: 0.16, Train Acc: 95.31%, Val Loss: 0.42, Val Acc: 88.12%, Time: 0:06:05
Iter: 1500, Train Loss: 0.21, Train Acc: 92.19%, Val Loss: 0.41, Val Acc: 87.79%, Time: 0:06:09
Iter: 1520, Train Loss: 0.16, Train Acc: 93.75%, Val Loss: 0.41, Val Acc: 88.03%, Time: 0:06:13
Running the test gives the following results:
Testing...
2020-10-19 12:51:46.979827: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-10-19 12:51:47.221023: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
Test Loss:   0.39, Test Acc:  88.64%
Precision, Recall and F1-Score...
/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.33      0.05      0.09        61
           1       0.89      0.96      0.93      1022
           2       0.39      0.15      0.22        59
           3       0.89      0.95      0.92      1254
           4       0.33      0.08      0.12        52
           5       0.83      0.90      0.86      1026
           6       0.95      0.98      0.96      1358
           7       0.67      0.04      0.08        45
           8       0.39      0.28      0.32        76
           9       0.85      0.94      0.89       742
          10       0.00      0.00      0.00        34
          11       0.00      0.00      0.00        28
          12       0.96      0.96      0.96      1218
          13       0.87      0.92      0.89       642
          14       0.50      0.15      0.23        33
          15       0.67      0.07      0.13        27
          16       0.91      0.91      0.91      1601
          17       0.86      0.11      0.20        53
          18       0.00      0.00      0.00        34
          19       0.74      0.69      0.72       468

    accuracy                           0.89      9833
   macro avg       0.60      0.46      0.47      9833
weighted avg       0.87      0.89      0.87      9833

Confusion Matrix...
[[   3    1    0   42    0    5    0    0    4    3    0    0    0    2    0    0    1    0    0    0]
 [   0  983    0    5    0    1    0    0    0    0    0    0    8    3    0    0   14    1    0    7]
 [   1    2    9    3    0    4    2    0    3    1    0    0    2   15    3    0   13    0    0    1]
 [   0    3    0 1195    0   12    2    0    0   16    0    0    3    2    0    0    8    0    0   13]
 [   0    6    1    1    4   14    5    0    5    0    0    0    1    1    0    0   14    0    0    0]
 [   0    7    0   16    0  924    1    0    3    5    0    0    1    0    0    0   39    0    0   30]
 [   0    1    0    3    0    0 1328    1    1    0    0    0    1   17    0    0    5    0    0    1]
 [   0    0    0   13    0   12    0    2    0    8    0    0    1    2    0    0    0    0    0    7]
 [   2    1    1    7    0   39    0    0   21    0    0    0    0    4    0    0    0    0    0    1]
 [   0    1    0   10    0   10    1    0    1  696    0    0    0    0    0    0    3    0    0   20]
 [   0    0    0    4    0    0    0    0    0   15    0    0    0    1    0    0    1    0    0   13]
 [   0    0    0    2    1    0    5    0    2    0    0    0    0   10    1    0    7    0    0    0]
 [   0   11    0    1    1    1    8    0    3    0    0    0 1175    6    0    0    7    0    0    5]
 [   0    0    0    6    0    0   31    0    0    1    0    0   12  589    0    0    3    0    0    0]
 [   0    2    4    1    1    1    0    0    1    0    0    0    4    6    5    1    7    0    0    0]
 [   0    0    2    1    0    1    6    0    0    0    0    0    0   11    0    2    4    0    0    0]
 [   0   70    2   10    2   39    5    0    2    2    0    0    7    0    0    0 1451    0    0   11]
 [   3    4    0   10    3   12    0    0    6    3    0    0    0    0    0    0    5    6    0    1]
 [   0    7    4    0    0    1    0    0    1    1    0    0    6    5    1    0    7    0    0    1]
 [   0    4    0    7    0   43    5    0    1   72    0    0    1    1    0    0   11    0    0  323]]
Time usage: 0:00:13
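The UndefinedMetricWarning in the report comes from classes that the model never predicts (labels 10, 11 and 18 have no predicted samples, as the all-zero columns of the confusion matrix show). On scikit-learn 0.22 and later the behaviour can be made explicit by passing zero_division to the report call, for example:

# Make the handling of never-predicted classes explicit instead of relying on the warning
print(metrics.classification_report(y_test_cls, y_pred_cls,
                                    target_names=categories, zero_division=0))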
With that, the whole pipeline, from data processing to model training and testing, is complete for the traditional TF-IDF + Naive Bayes approach, the RNNs (LSTM, GRU), and the CNN. Next I plan to work on Transformer and BERT, so stay tuned.
Reference:
https://github.com/gaussic/text-classification-cnn-rnn