Chinese Text Classification


Code Breakdown

The code consists of four parts:

  • Utility functions: utils
  • Training and evaluation code: train_eval
  • Model: models.TextCNN
  • Main function: main

Run the first three parts in the notebook in order, then execute main to start training.

Colab link: https://colab.research.google.com/drive/1vUnHAjmA3OTt5o47HQkQLCXA8-rtsZEs

The detailed code and commentary follow:

utils

"""
utils
"""
import os
import torch
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time
from datetime import timedelta


MAX_VOCAB_SIZE = 10000  # vocabulary size limit
UNK, PAD = '<UNK>', '<PAD>'  # unknown token and padding token


def build_vocab(file_path, tokenizer, max_size, min_freq):
  """
  構建一個詞表:
  首先對數據集中的每一行句子按字/空格進行分割,然后統計所有元素的出現頻率
  接下來按照頻率從高到低的順序對所有頻率大於min_freq的元素進行排序,取前max_size個元素
  最后按照頻率降序構建字典vocab_dic:{元素:序號},vocab_dic的最后兩個元素是'<UNK>'和'<PAD>'
  """
    vocab_dic = {}
    with open(file_path, 'r', encoding='UTF-8') as f:
        for line in tqdm(f):  # 處理每一行
            lin = line.strip()  # 移除頭尾空格或換行符
            if not lin:  # 跳過空行
                continue
            content = lin.split('\t')[0]  # 句子和標簽通過tab分割,前面的是句子內容,后面的是標簽
            for word in tokenizer(content):  # 按空格分割或者按字分割
                vocab_dic[word] = vocab_dic.get(word, 0) + 1  # 統計詞頻或字頻
        # 遍歷詞典,篩選出詞頻大於min_freq的詞,然后按照詞頻從高到低排序,取前max_size個詞,組成新的列表vocab_list,vocab_list中的元素為元組(word, freq)
        vocab_list = sorted([_ for _ in vocab_dic.items() if _[1] >= min_freq], key=lambda x: x[1], reverse=True)[:max_size]
        # 構建字典vocab_dic,key為詞,value為索引(按詞頻升序)
        vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
        # 在vocab_dic的最后增加兩個元素:{'<UNK>':len(vocab_dic)}和{'<PAD>':len(vocab_dic)+1}
        vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
    return vocab_dic


def build_dataset(config, use_word):  # build the datasets
    """
    Load the datasets:
    For every line of a dataset file, first separate the sentence content from the label.
    Then split the sentence in the chosen way (on whitespace or into characters) and pad or truncate it to pad_size.
    Next, map the resulting tokens to a sequence of indices words_line via the vocabulary.
    Finally, collect the processed sentences into one list whose elements are [(words_line, int(label), seq_len), ...].
    """
    if use_word:
        tokenizer = lambda x: x.split(' ')  # split on spaces: word-level
    else:
        tokenizer = lambda x: [y for y in x]  # char-level
    if os.path.exists(config.vocab_path):  # if a vocabulary file already exists, load it
        vocab = pkl.load(open(config.vocab_path, 'rb'))
    else:  # otherwise build one with build_vocab()
        vocab = build_vocab(config.train_path, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
        pkl.dump(vocab, open(config.vocab_path, 'wb'))  # save the newly built vocabulary as a pickle
    print(f"Vocab size: {len(vocab)}")

    def load_dataset(path, pad_size=32):
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):  # read the data file line by line
                lin = line.strip()  # strip leading/trailing whitespace and newlines
                if not lin:
                    continue  # skip empty lines
                content, label = lin.split('\t')  # sentence and label are separated by a tab: the sentence comes first, the label second
                token = tokenizer(content)  # split the sentence on whitespace or into characters
                seq_len = len(token)  # number of tokens after splitting
                if pad_size:  # if a padding length is specified
                    if len(token) < pad_size:  # if there are fewer tokens than pad_size
                        token.extend([PAD] * (pad_size - len(token)))  # pad with the padding token
                    else:  # if there are more tokens than pad_size
                        token = token[:pad_size]  # truncate to pad_size
                        seq_len = pad_size  # update the token count
                # word to id
                words_line = []  # words_line is the numeric representation of the sentence obtained via the vocabulary
                for word in token:  # for every token in the sentence
                    # look up the token's index in the vocabulary and append it to words_line; if the token is not in the vocabulary, use the index of '<UNK>' (unknown token)
                    words_line.append(vocab.get(word, vocab.get(UNK)))
                contents.append((words_line, int(label), seq_len))  # append a tuple (words_line, numeric label, token count) to contents
        return contents
    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return vocab, train, dev, test


class DatasetIterater(object):
  """
  根據數據集產生batch
  這里需要注意的是,在_to_tensor()中,代碼把batch中的數據處理成了`(x, seq_len), y`的形式
  其中x是words_line,seq_len是pad前的長度(超過pad_size的設為pad_size),y是數據標簽
  """
    def __init__(self, batches, batch_size, device):  # 這里的batches就是經過build_dataset()中的load_dataset()處理后得到的contents
        self.batch_size = batch_size  # batch的容量(一次進多少個句子)
        self.batches = batches  # 數據集
        self.n_batches = len(batches) // batch_size  # 數據集大小整除batch容量
        self.residue = False  # 記錄batch能否覆蓋整個數據集,false代表可以,true代表不可以。residuere是‘剩余物,殘渣'的意思
        if len(batches) % batch_size != 0:  # a remainder exists, so the last batch is partial
            self.residue = True
        self.index = 0  # index used for iteration
        self.device = device  # training device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)  # sentences (words_line)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)  # labels

        # pre-padding lengths (capped at pad_size for longer sentences)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        return (x, seq_len), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:  # if a partial batch is left over and we have reached the last full batch
            batches = self.batches[self.index * self.batch_size: len(self.batches)]  # take all remaining samples
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:  # entry point of the iterator: self.index starts at 0, which is certainly smaller than self.n_batches
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]  # take one normal batch of data
            self.index += 1
            batches = self._to_tensor(batches)  # convert to tensors
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches


def build_iterator(dataset, config):  # dataset here is the data produced by build_dataset()
    iter = DatasetIterater(dataset, config.batch_size, config.device)
    return iter


def get_time_dif(start_time):
    """獲取已使用時間"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))
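
To make the data formats above concrete, here is a minimal sketch (not part of the original code) that builds a vocabulary from a hypothetical two-line file and feeds a toy dataset through DatasetIterater. The file path, file contents, and the toy_data tuples are made up for illustration; the char-level tokenizer is the same one used in build_dataset.

"""
Illustration only: a toy run of build_vocab and DatasetIterater (assumes the utils cell above has been executed)
"""
import os
import tempfile
import torch

# a hypothetical two-line training file in the "sentence\tlabel" format
toy_path = os.path.join(tempfile.gettempdir(), 'toy_train.txt')
with open(toy_path, 'w', encoding='UTF-8') as f:
    f.write('今天股市大涨\t0\n今天有足球比赛\t1\n')

char_tokenizer = lambda x: [y for y in x]  # same char-level tokenizer as in build_dataset
toy_vocab = build_vocab(toy_path, tokenizer=char_tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
print(toy_vocab)  # {token: index}, most frequent tokens first, '<UNK>' and '<PAD>' appended last

# DatasetIterater consumes a list of (words_line, label, seq_len) tuples, as produced by load_dataset()
toy_data = [([1, 2, 3, 0, 12, 12], 0, 4), ([4, 5, 6, 7, 8, 9], 1, 6)]
toy_iter = DatasetIterater(toy_data, batch_size=2, device=torch.device('cpu'))
(x, seq_len), y = next(iter(toy_iter))
print(x.shape, seq_len, y)  # torch.Size([2, 6]) tensor([4, 6]) tensor([0, 1])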

train_eval

"""
train_eval
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from tensorboardX import SummaryWriter


# Weight initialization, xavier by default (without initialization the default random weights can be large and hurt training)
def init_network(model, method='xavier', exclude='embedding', seed=123):
    for name, w in model.named_parameters():  # iterate over all trainable parameters of the network
        if exclude not in name:  # skip parameters whose name contains the excluded keyword (default 'embedding')
            if 'weight' in name:  # initialize weights
                if method == 'xavier':
                    nn.init.xavier_normal_(w)  # choose the initialization method
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:  # initialize biases to zero
                nn.init.constant_(w, 0)
            else:  # skip parameters that are neither weights nor biases
                pass


def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()  # model.train() enables BatchNormalization and Dropout; model.eval() disables them
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)  # choose the optimizer

    # exponential learning-rate decay: after every epoch, learning rate = gamma * learning rate
    # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch count at the last time the validation loss decreased
    flag = False  # records whether there has been no improvement for a long time (used to break out of the nested loops)
    writer = SummaryWriter(log_dir=config.log_path + '/' + time.strftime('%m-%d_%H.%M', time.localtime()))
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        # scheduler.step()  # learning-rate decay
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # every 100 batches, report performance on the training and validation sets
                true = labels.data.cpu()  # move the label tensor from the GPU to the CPU
                predic = torch.max(outputs.data, 1)[1].cpu()  # column index of the maximum value in each row, i.e. the predicted class
                train_acc = metrics.accuracy_score(true, predic)  # classification accuracy on this training batch
                dev_acc, dev_loss = evaluate(config, model, dev_iter)  # accuracy and loss on the validation set
                if dev_loss < dev_best_loss:  # use the validation set to judge whether the model has improved
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                writer.add_scalar("loss/train", loss.item(), total_batch)
                writer.add_scalar("loss/dev", dev_loss, total_batch)
                writer.add_scalar("acc/train", train_acc, total_batch)
                writer.add_scalar("acc/dev", dev_acc, total_batch)
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # stop training early if the validation loss has not decreased for more than require_improvement batches
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    writer.close()
    test(config, model, test_iter)


def test(config, model, test_iter):
    # test
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2},  Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


def evaluate(config, model, data_iter, test=False):
    model.eval()  # disable BatchNormalization and Dropout
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():  # do not track gradients
        for texts, labels in data_iter:  # for every batch in the dataset
            outputs = model(texts)  # run the model
            loss = F.cross_entropy(outputs, labels)  # compute the loss
            loss_total += loss  # accumulate the loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)  # record the true labels
            predict_all = np.append(predict_all, predic)  # record the predictions

    acc = metrics.accuracy_score(labels_all, predict_all)  # compute the classification accuracy
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)  # return the accuracy and the average loss
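
One detail worth spelling out: both train() and evaluate() turn the model's output logits into class predictions with torch.max(outputs.data, 1)[1], which is simply the per-row argmax. A tiny illustration with made-up values:

import torch

logits = torch.tensor([[0.1, 2.0, -1.0],
                       [1.5, 0.2,  0.3]])  # hypothetical outputs for a batch of 2 samples and 3 classes
pred = torch.max(logits, 1)[1]  # index of the maximum value in each row = predicted class
print(pred)  # tensor([1, 0])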

models.TextCNN

"""
models.TextCNN
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class Config(object):

    """配置參數"""
    def __init__(self, dataset, embedding):
        self.model_name = 'TextCNN'
        self.train_path = dataset + '/data/train.txt'                                # 訓練集
        self.dev_path = dataset + '/data/dev.txt'                                    # 驗證集
        self.test_path = dataset + '/data/test.txt'                                  # 測試集
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='utf-8').readlines()]              # 類別名單
        self.vocab_path = dataset + '/data/vocab.pkl'                                # 詞表
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'        # 模型訓練結果
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = torch.tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                                       # 預訓練詞向量
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')   # 設備

        self.dropout = 0.5                                              # 隨機失活
        self.require_improvement = 1000                                 # 若超過1000batch效果還沒提升,則提前結束訓練
        self.num_classes = len(self.class_list)                         # 類別數
        self.n_vocab = 0                                                # 詞表大小,在運行時賦值
        self.num_epochs = 20                                            # epoch數
        self.batch_size = 128                                           # mini-batch大小
        self.pad_size = 32                                              # 每句話處理成的長度(短填長切)
        self.learning_rate = 1e-3                                       # 學習率
        self.embed = self.embedding_pretrained.size(1)\
            if self.embedding_pretrained is not None else 300           # 字向量維度
        self.filter_sizes = (2, 3, 4)                                   # 卷積核尺寸
        self.num_filters = 256                                          # 卷積核數量(channels數)


'''Convolutional Neural Networks for Sentence Classification'''


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, config.num_filters, (k, config.embed)) for k in config.filter_sizes])
        self.dropout = nn.Dropout(config.dropout)
        self.fc = nn.Linear(config.num_filters * len(config.filter_sizes), config.num_classes)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # convolve over the sequence: [batch, num_filters, seq_len - k + 1, 1], then squeeze the last dim
        x = F.max_pool1d(x, x.size(2)).squeeze(2)  # max over time: [batch, num_filters]
        return x

    def forward(self, x):
        out = self.embedding(x[0])  # x is (token_ids, seq_len); embed the token ids: [batch, pad_size, embed]
        out = out.unsqueeze(1)  # add a channel dimension for Conv2d: [batch, 1, pad_size, embed]
        out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)  # concatenate the pooled features of all kernel sizes
        out = self.dropout(out)
        out = self.fc(out)  # final linear classifier
        return out
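
To see where Linear(in_features=768, ...) in the printed model comes from, here is a rough shape walkthrough of one convolution branch (illustration only), assuming the default Config values pad_size=32, embed=300, num_filters=256, filter_sizes=(2, 3, 4); the vocabulary size 4762 is the one reported in the run below.

import torch
import torch.nn as nn
import torch.nn.functional as F

x_ids = torch.randint(0, 4762, (128, 32))     # a batch of 128 token-id sequences of length pad_size=32
emb = nn.Embedding(4762, 300)(x_ids)          # [128, 32, 300]
out = emb.unsqueeze(1)                        # [128, 1, 32, 300]  add a channel dim for Conv2d
conv = nn.Conv2d(1, 256, (3, 300))            # the k=3 branch
h = F.relu(conv(out)).squeeze(3)              # [128, 256, 30]  since 32 - 3 + 1 = 30
h = F.max_pool1d(h, h.size(2)).squeeze(2)     # [128, 256]  max over time
print(h.shape)
# Concatenating the k=2, 3, 4 branches gives [128, 256 * 3] = [128, 768],
# which is exactly the in_features of the final fully connected layer.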

main

"""
Main function
"""
import time
import torch
import numpy as np
# from train_eval import train, init_network
# from importlib import import_module
import argparse  # argument-parsing package (not actually used in this notebook version)

dataset = '/content/drive/My Drive/ChineseTextClassification/Chinese-Text-Classification-Pytorch-master/THUCNews'  # dataset directory

embedding = 'embedding_SougouNews.npz'  # Sogou News: embedding_SougouNews.npz, Tencent: embedding_Tencent.npz, random initialization: random
model_name = 'TextCNN'  # choose the model: TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer
# from utils import build_dataset, build_iterator, get_time_dif

config = Config(dataset, embedding)  # load the model configuration
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # fix the random seeds so every run gives the same result

start_time = time.time()
print("Loading data...")
vocab, train_data, dev_data, test_data = build_dataset(config, False)  # load the datasets with build_dataset(); False selects char-level tokenization
train_iter = build_iterator(train_data, config)  # build the corresponding iterators with build_iterator()
dev_iter = build_iterator(dev_data, config)
test_iter = build_iterator(test_data, config)
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)

# train
config.n_vocab = len(vocab)
model = Model(config).to(config.device)  # train the model on the specified device
if model_name != 'Transformer':
    init_network(model)
print(model.parameters)
train(config, model, train_iter, dev_iter, test_iter)

Run results on GPU

Loading data...
Vocab size: 4762
180000it [00:04, 40629.57it/s]
10000it [00:00, 10535.39it/s]
10000it [00:01, 7211.66it/s]
Time usage: 0:00:07
<bound method Module.parameters of Model(
  (embedding): Embedding(4762, 300)
  (convs): ModuleList(
    (0): Conv2d(1, 256, kernel_size=(2, 300), stride=(1, 1))
    (1): Conv2d(1, 256, kernel_size=(3, 300), stride=(1, 1))
    (2): Conv2d(1, 256, kernel_size=(4, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=768, out_features=10, bias=True)
)>
Epoch [1/20]
Iter:      0,  Train Loss:   2.3,  Train Acc:  7.03%,  Val Loss:   2.7,  Val Acc: 12.33%,  Time: 0:00:03 *
Iter:    100,  Train Loss:  0.74,  Train Acc: 73.44%,  Val Loss:   0.7,  Val Acc: 78.33%,  Time: 0:00:06 *
Iter:    200,  Train Loss:  0.71,  Train Acc: 77.34%,  Val Loss:  0.55,  Val Acc: 83.33%,  Time: 0:00:09 *
Iter:    300,  Train Loss:  0.46,  Train Acc: 85.94%,  Val Loss:  0.49,  Val Acc: 84.88%,  Time: 0:00:13 *
Iter:    400,  Train Loss:  0.68,  Train Acc: 81.25%,  Val Loss:  0.47,  Val Acc: 85.53%,  Time: 0:00:16 *
Iter:    500,  Train Loss:  0.36,  Train Acc: 89.06%,  Val Loss:  0.43,  Val Acc: 86.47%,  Time: 0:00:19 *
Iter:    600,  Train Loss:  0.52,  Train Acc: 84.38%,  Val Loss:  0.43,  Val Acc: 86.51%,  Time: 0:00:22 *
Iter:    700,  Train Loss:  0.46,  Train Acc: 82.81%,  Val Loss:  0.41,  Val Acc: 87.19%,  Time: 0:00:26 *
Iter:    800,  Train Loss:  0.47,  Train Acc: 85.94%,  Val Loss:  0.39,  Val Acc: 87.70%,  Time: 0:00:29 *
Iter:    900,  Train Loss:  0.47,  Train Acc: 85.16%,  Val Loss:  0.39,  Val Acc: 87.99%,  Time: 0:00:32 *
Iter:   1000,  Train Loss:  0.35,  Train Acc: 86.72%,  Val Loss:  0.39,  Val Acc: 88.03%,  Time: 0:00:35 *
Iter:   1100,  Train Loss:  0.42,  Train Acc: 86.72%,  Val Loss:  0.38,  Val Acc: 88.35%,  Time: 0:00:39 *
Iter:   1200,  Train Loss:  0.39,  Train Acc: 85.16%,  Val Loss:  0.37,  Val Acc: 88.65%,  Time: 0:00:42 *
Iter:   1300,  Train Loss:  0.44,  Train Acc: 85.16%,  Val Loss:  0.36,  Val Acc: 88.44%,  Time: 0:00:45 *
Iter:   1400,  Train Loss:  0.53,  Train Acc: 82.81%,  Val Loss:  0.36,  Val Acc: 88.82%,  Time: 0:00:49 *
Epoch [2/20]
Iter:   1500,  Train Loss:  0.47,  Train Acc: 85.94%,  Val Loss:  0.35,  Val Acc: 88.77%,  Time: 0:00:52 *
Iter:   1600,  Train Loss:  0.36,  Train Acc: 85.94%,  Val Loss:  0.35,  Val Acc: 89.14%,  Time: 0:00:56 *
Iter:   1700,  Train Loss:  0.37,  Train Acc: 87.50%,  Val Loss:  0.34,  Val Acc: 89.36%,  Time: 0:00:59 *
Iter:   1800,  Train Loss:  0.32,  Train Acc: 87.50%,  Val Loss:  0.36,  Val Acc: 88.69%,  Time: 0:01:02 
Iter:   1900,  Train Loss:  0.34,  Train Acc: 89.84%,  Val Loss:  0.35,  Val Acc: 89.13%,  Time: 0:01:05 
Iter:   2000,  Train Loss:  0.37,  Train Acc: 88.28%,  Val Loss:  0.34,  Val Acc: 89.19%,  Time: 0:01:09 *
Iter:   2100,  Train Loss:  0.42,  Train Acc: 85.94%,  Val Loss:  0.34,  Val Acc: 89.44%,  Time: 0:01:12 *
Iter:   2200,  Train Loss:  0.28,  Train Acc: 90.62%,  Val Loss:  0.34,  Val Acc: 89.33%,  Time: 0:01:15 *
Iter:   2300,  Train Loss:  0.36,  Train Acc: 92.97%,  Val Loss:  0.34,  Val Acc: 89.45%,  Time: 0:01:19 
Iter:   2400,  Train Loss:  0.33,  Train Acc: 89.84%,  Val Loss:  0.34,  Val Acc: 89.57%,  Time: 0:01:22 
Iter:   2500,  Train Loss:  0.17,  Train Acc: 94.53%,  Val Loss:  0.33,  Val Acc: 89.85%,  Time: 0:01:25 *
Iter:   2600,  Train Loss:   0.3,  Train Acc: 89.84%,  Val Loss:  0.33,  Val Acc: 89.76%,  Time: 0:01:28 
Iter:   2700,  Train Loss:  0.26,  Train Acc: 91.41%,  Val Loss:  0.33,  Val Acc: 89.84%,  Time: 0:01:32 
Iter:   2800,  Train Loss:   0.4,  Train Acc: 85.16%,  Val Loss:  0.33,  Val Acc: 89.62%,  Time: 0:01:35 
Epoch [3/20]
Iter:   2900,  Train Loss:  0.32,  Train Acc: 89.84%,  Val Loss:  0.33,  Val Acc: 89.77%,  Time: 0:01:38 
Iter:   3000,  Train Loss:  0.22,  Train Acc: 91.41%,  Val Loss:  0.33,  Val Acc: 89.80%,  Time: 0:01:41 
Iter:   3100,  Train Loss:  0.27,  Train Acc: 92.97%,  Val Loss:  0.34,  Val Acc: 89.58%,  Time: 0:01:44 
Iter:   3200,  Train Loss:  0.31,  Train Acc: 90.62%,  Val Loss:  0.33,  Val Acc: 89.78%,  Time: 0:01:48 
Iter:   3300,  Train Loss:  0.38,  Train Acc: 88.28%,  Val Loss:  0.33,  Val Acc: 89.71%,  Time: 0:01:51 
Iter:   3400,  Train Loss:  0.33,  Train Acc: 85.94%,  Val Loss:  0.34,  Val Acc: 89.88%,  Time: 0:01:54 
Iter:   3500,  Train Loss:  0.19,  Train Acc: 92.19%,  Val Loss:  0.33,  Val Acc: 89.76%,  Time: 0:01:58 
No optimization for a long time, auto-stopping...
Test Loss:  0.31,  Test Acc: 90.77%
Precision, Recall and F1-Score...
               precision    recall  f1-score   support

      finance     0.9095    0.9050    0.9073      1000
       realty     0.9147    0.9330    0.9238      1000
       stocks     0.8770    0.8340    0.8549      1000
    education     0.9393    0.9590    0.9490      1000
      science     0.8529    0.8640    0.8584      1000
      society     0.9021    0.9120    0.9070      1000
     politics     0.9050    0.8760    0.8902      1000
       sports     0.9466    0.9570    0.9518      1000
         game     0.9336    0.9000    0.9165      1000
entertainment     0.8958    0.9370    0.9159      1000

     accuracy                         0.9077     10000
    macro avg     0.9076    0.9077    0.9075     10000
 weighted avg     0.9076    0.9077    0.9075     10000

Confusion Matrix...
[[905  17  37   5   8   6  10   5   2   5]
 [  9 933  13   2   3  15   3   7   2  13]
 [ 55  28 834   3  36   2  31   3   5   3]
 [  1   2   1 959   4   9   5   5   1  13]
 [  5   9  28   5 864  17  14   5  34  19]
 [  4  16   2  19  13 912  21   2   4   7]
 [ 10   5  22  16  24  29 876   4   1  13]
 [  2   2   5   2   3   6   1 957   5  17]
 [  1   3   7   6  46   4   4  10 900  19]
 [  3   5   2   4  12  11   3  13  10 937]]
Time usage: 0:00:00
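
As a final sanity check, the saved model can be used to classify a single new sentence. The following is a minimal inference sketch, not part of the original code: it assumes vocab, config, model, PAD, and UNK from the cells above are still in memory, and the predict helper and example sentence are made up for illustration.

def predict(text, pad_size=32):
    """Classify one raw sentence with the trained TextCNN (illustration only)."""
    model.eval()
    tokens = [c for c in text][:pad_size]            # char-level split, truncate to pad_size
    seq_len = len(tokens)
    tokens += [PAD] * (pad_size - len(tokens))       # pad to the fixed length
    ids = [vocab.get(t, vocab.get(UNK)) for t in tokens]  # map characters to vocabulary indices
    x = torch.LongTensor([ids]).to(config.device)
    lens = torch.LongTensor([seq_len]).to(config.device)
    with torch.no_grad():
        logits = model((x, lens))                    # the model expects (token_ids, seq_len)
    return config.class_list[int(torch.max(logits, 1)[1].item())]

print(predict('央行宣布下调存款准备金率'))  # prints the predicted class name from class.txt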

