This time we use a dataset of 380,000 news titles extracted from the Toutiao (今日頭條) news feed. The titles are between 10 and 30 characters long and span 15 categories.
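The loader below expects each line of train.txt, dev.txt and test.txt to be a tab-separated pair of a title and a numeric label (an index into class.txt). For example (hypothetical titles and label indices, shown only to illustrate the format):

新一代折疊屏手機正式發布	8
梅西上演帽子戲法助球隊取勝	3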
Data preprocessing:
import torch
from tqdm import tqdm
import time
from datetime import timedelta

PAD, CLS = '[PAD]', '[CLS]'  # padding symbol; [CLS] is BERT's sentence-level summary token


def build_dataset(config):
    def load_dataset(path, pad_size=32):
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                # read a line, strip surrounding whitespace, and split it into content and label
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
                # tokenize the content with the configured tokenizer and prepend '[CLS]'
                token = config.tokenizer.tokenize(content)
                token = [CLS] + token
                seq_len = len(token)
                mask = []
                token_ids = config.tokenizer.convert_tokens_to_ids(token)
                if pad_size:
                    if len(token) < pad_size:
                        # short sentences: mask is 1 for real tokens, 0 for padding
                        mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
                        token_ids += ([0] * (pad_size - len(token)))
                    else:
                        # long sentences are truncated to pad_size
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                contents.append((token_ids, int(label), seq_len, mask))
        return contents

    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return train, dev, test


class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # whether a leftover batch smaller than batch_size remains
        if len(batches) % batch_size != 0:
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        # length before padding (set to pad_size for truncated sentences)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x, seq_len, mask), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches


def build_iterator(dataset, config):
    iter = DatasetIterater(dataset, config.batch_size, config.device)
    return iter


def get_time_dif(start_time):
    """Return the elapsed time since start_time."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))
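To make the tokenize/pad/mask steps above concrete, here is a minimal standalone sketch of how a single title is converted; the title and the local model path are placeholder assumptions for illustration, and the title is assumed to be shorter than pad_size (longer titles are truncated instead, as in the loader above):

from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('/path/to/bert_pretain')  # directory containing vocab.txt
pad_size = 32
title = '5G手機何時才能大規模普及'                       # hypothetical news title
token = ['[CLS]'] + tokenizer.tokenize(title)
token_ids = tokenizer.convert_tokens_to_ids(token)
seq_len = len(token_ids)                                 # number of real tokens, including [CLS]
mask = [1] * seq_len + [0] * (pad_size - seq_len)        # 1 marks real tokens, 0 marks padding
token_ids = token_ids + [0] * (pad_size - seq_len)       # zero-pad the ids up to pad_size
print((token_ids, seq_len, mask))                        # one load_dataset entry, minus the label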
Next, we define the model. Here we use the pytorch_pretrained_bert package:
!pip install pytorch_pretrained_bert
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertModel, BertTokenizer


class Config(object):
    """Configuration parameters."""
    def __init__(self, dataset):
        self.model_name = 'bert'
        self.train_path = dataset + '/data/train.txt'    # training set
        self.dev_path = dataset + '/data/dev.txt'        # validation set
        self.test_path = dataset + '/data/test.txt'      # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt').readlines()]    # list of class names
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # where the best weights are saved
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # training device
        self.require_improvement = 1000                  # stop early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)          # number of classes
        self.num_epochs = 3                              # number of epochs
        self.batch_size = 128                            # mini-batch size
        self.pad_size = 32                               # every sentence is padded/truncated to this length
        self.learning_rate = 5e-5                        # learning rate
        self.bert_path = '/content/drive/Shared drives/A/data/pre_training/bert_pretain'  # pre-trained BERT directory
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        for param in self.bert.parameters():
            param.requires_grad = True                   # fine-tune all BERT parameters
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        context = x[0]  # input token ids
        mask = x[2]     # attention mask, same size as the sentence; padding positions are 0, e.g. [1, 1, 1, 1, 0, 0]
        _, pooled = self.bert(context, attention_mask=mask, output_all_encoded_layers=False)
        out = self.fc(pooled)
        return out
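Before training, a quick shape check can confirm how a batch produced by DatasetIterater flows through the model. A minimal sketch with dummy tensors, assuming config = Config(dataset_path) has already been created and a pretrained BERT is available at config.bert_path:

model = Model(config).to(config.device)
batch = 4
token_ids = torch.zeros(batch, config.pad_size, dtype=torch.long).to(config.device)  # [batch, pad_size], all [PAD]
seq_len = torch.full((batch,), config.pad_size, dtype=torch.long).to(config.device)
mask = torch.ones(batch, config.pad_size, dtype=torch.long).to(config.device)        # [batch, pad_size]
logits = model((token_ids, seq_len, mask))  # pooled [CLS] vector [batch, 768] -> fc -> [batch, num_classes]
print(logits.shape)                         # expected: torch.Size([4, 15]) for the 15 classes here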
Next, define the training and evaluation routines:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from pytorch_pretrained_bert.optimization import BertAdam


# Weight initialization, xavier by default (defined for completeness; not called in this script,
# since the BERT weights are already pretrained)
def init_network(model, method='xavier', exclude='embedding', seed=123):
    for name, w in model.named_parameters():
        if exclude not in name:
            if len(w.size()) < 2:
                continue
            if 'weight' in name:
                if method == 'xavier':
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:
                nn.init.constant_(w, 0)
            else:
                pass


def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()  # model.train() enables BatchNormalization and Dropout; model.eval() disables them
    param_optimizer = list(model.named_parameters())
    # no weight decay for biases and LayerNorm parameters
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0               # how many batches have been processed
    dev_best_loss = float('inf')
    last_improve = 0              # batch index of the last improvement in validation loss
    flag = False                  # whether training has gone too long without improvement
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # every 100 batches, report performance on the training and validation sets
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # stop training if the validation loss has not improved for more than 1000 batches
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)


def test(config, model, test_iter):
    # evaluate the best checkpoint on the test set
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)
    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)
Start training:
import time
import torch
import numpy as np

dataset_path = '/content/drive/Shared drives/A/data/今日頭條文本分類數據集'
config = Config(dataset_path)                # initialize the configuration

np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True    # fix random seeds for reproducibility

start_time = time.time()
print("Loading data...")
train_data, dev_data, test_data = build_dataset(config)  # preprocess the dataset
train_iter = build_iterator(train_data, config)
dev_iter = build_iterator(dev_data, config)
test_iter = build_iterator(test_data, config)
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)

# train
model = Model(config).to(config.device)      # move the model to the training device
train(config, model, train_iter, dev_iter, test_iter)     # start training
Because Colab did not have enough GPU memory, the batch_size was reduced. Training took about three hours, and the final accuracy came out 0.46 percentage points above the original author's 83.81%, which shows that BERT performs quite well:
Test Loss: 0.56, Test Acc: 84.27%
Precision, Recall and F1-Score...
                    precision    recall  f1-score   support

        news_story     0.7703    0.8085    0.7889       282
      news_culture     0.7942    0.8745    0.8324      1474
news_entertainment     0.9271    0.8249    0.8730      1959
       news_sports     0.9472    0.9007    0.9234      1833
      news_finance     0.8012    0.7189    0.7578      1430
        news_house     0.9148    0.8784    0.8962       880
          news_car     0.9373    0.8898    0.9129      1815
          news_edu     0.8684    0.8494    0.8588      1321
         news_tech     0.7520    0.8952    0.8174      2070
     news_military     0.8391    0.7913    0.8145      1265
       news_travel     0.7840    0.7810    0.7825      1064
        news_world     0.6910    0.8112    0.7463      1340
             stock     0.0000    0.0000    0.0000        15
  news_agriculture     0.8754    0.8011    0.8366       930
         news_game     0.8893    0.8826    0.8859      1456

          accuracy                         0.8427     19134
         macro avg     0.7861    0.7805    0.7818     19134
      weighted avg     0.8481    0.8427    0.8435     19134
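With the results in hand, the saved checkpoint can also be used to classify a single new title. A minimal inference sketch, assuming the config, Model and checkpoint from the run above; the title is a hypothetical example shorter than pad_size:

model = Model(config).to(config.device)
model.load_state_dict(torch.load(config.save_path))
model.eval()

title = '國足公布新一期集訓名單'                      # hypothetical news title
token = [CLS] + config.tokenizer.tokenize(title)
token_ids = config.tokenizer.convert_tokens_to_ids(token)
seq_len = len(token_ids)
mask = [1] * seq_len + [0] * (config.pad_size - seq_len)
token_ids += [0] * (config.pad_size - seq_len)

x = torch.LongTensor([token_ids]).to(config.device)
s = torch.LongTensor([seq_len]).to(config.device)
m = torch.LongTensor([mask]).to(config.device)
with torch.no_grad():
    pred = torch.max(model((x, s, m)), 1)[1].item()
print(config.class_list[pred])                       # predicted class name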