The dataset used here is 12萬對話語料青雲庫.csv, the "Qingyun" corpus of roughly 120k Chinese dialogue pairs. Its quality is reasonably good and the content is fairly conversational. More corpora can be found in the 中文公開聊天語料庫 (Chinese open chat corpus) collection.
First, the project dependencies:
import re
import os
import csv
import math
import random
import codecs
import itertools
import unicodedata
from io import open
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import jieba
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
Preparing the Data
First, read the data and print a few lines to get a feel for the corpus quality:
file_path = '/content/drive/Shared drives/A/data/raw_chat_corpus/qingyun-11w'
corpus_name = "qingyun-11w"
corpus = os.path.join(file_path, corpus_name)
def printLines(file, n=10):
    with open(file, 'r') as file_pipeline:
        for index, line in enumerate(file_pipeline):
            qa = line.split('|')
            print(qa)
            if index >= n - 1:  # stop after printing n lines
                return
printLines(os.path.join(file_path, "12萬對話語料青雲庫.csv"))
The output:
['南京在哪里 ', ' 在這里了\n']
['咋死???紅燒還是爆炒 ', ' 哦了哦了哦了,咱聊點別的吧\n']
['孩紙,新年快樂 ', ' {r+}同樂同樂,大家一起樂~\n']
['那重點是什么 ', ' 好話不分輕重!\n']
['在上一條我回復你,你怎么不回復我 ', ' 我也要思考啊,不能隨便回你話吧,那會讓你覺得菲菲好笨的\n']
As you can see, the basic quality of the corpus is decent, but it still contains some unnecessary symbols, which we will filter out during preprocessing.
Tokenization:
I tried both word-level and character-level representations; for Chinese, the character-level version worked better here.
word_wise = False  # True: word-level tokens, False: character-level tokens
word_segmentation_path = os.path.join(file_path, 'word_segmentation.txt')  # where the tokenized result is saved
def clean_zh_text(text):
    # Keep only digits, Chinese characters, and common Chinese punctuation
    # (comma / full stop / exclamation mark / question mark).
    # Note: inside a character class, '^' only negates in the first position;
    # the extra '^'s in the original pattern were matched literally, so they
    # are removed here.
    comp = re.compile('[^0-9\u4e00-\u9fa5,。!?]')
    return comp.sub('', text)
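As a quick sanity check, here is what the filter does to a made-up string (the input is illustrative, not from the corpus):
# Everything except digits, CJK characters and the four kept punctuation
# marks is stripped, including emoticon markup like {r+}.
print(clean_zh_text('孩紙,新年快樂~{r+} happy 2020!'))
# -> 孩紙,新年快樂2020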
def word_filter(words):
    # Drop empty strings and join the tokens (jieba returns a generator)
    # into one space-separated string.
    result = []
    for word in words:
        word = clean_zh_text(word)
        if word == '':
            continue
        result.append(word)
    return ' '.join(result)
def cut_sentences(input_file, output_file):
    with open(input_file, 'r') as input_pipeline, open(output_file, 'w') as output_pipeline:
        for index, line in enumerate(input_pipeline):
            qa = line.split('|')
            question = word_filter(jieba.cut(qa[0]) if word_wise else qa[0])
            answer = word_filter(jieba.cut(qa[1]) if word_wise else qa[1])
            result = '\t'.join([question, answer])
            output_pipeline.write(result + '\n')
cut_sentences(os.path.join(file_path, "12萬對話語料青雲庫.csv"), word_segmentation_path)
printLines(word_segmentation_path)
Building the Vocabulary
# Predefined tokens
PAD_token = 0  # padding
SOS_token = 1  # start of sentence
EOS_token = 2  # end of sentence
class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # we currently have the 3 tokens SOS, EOS and PAD
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1
    # Remove tokens that occur fewer than min_count times
    def trim(self, min_count):
        if self.trimmed:  # avoid trimming twice
            return
        self.trimmed = True
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))
        # Rebuild the dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # count default tokens
        # The counts were only needed for trimming, so the old frequencies
        # are not restored here
        for word in keep_words:
            self.addWord(word)
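To make the bookkeeping concrete, here is a tiny illustration with two toy sentences (tokens are space-separated, matching the word_segmentation.txt format):
toy_voc = Voc('toy')
toy_voc.addSentence('你 好')
toy_voc.addSentence('你 呢')
print(toy_voc.word2index)  # {'你': 3, '好': 4, '呢': 5} -- IDs 0-2 are reserved
print(toy_voc.word2count)  # {'你': 2, '好': 1, '呢': 1}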
MAX_LENGTH = 15  # maximum sentence length: 15 tokens (including EOS and other special tokens)
# Read the question-answer pairs and return a Voc vocabulary object
def readVocs(datafile, corpus_name):
    print("Reading lines...")
    # Read the file into a list of lines
    lines = open(datafile, encoding='utf-8').read().strip().split('\n')
    # Split each line on tab into a question and an answer
    pairs = [[s for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs
def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
# Filter out pairs that are too long
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]
# Run the functions above and return the Voc object and the list of pairs
def loadPrepareData(corpus_name, datafile):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs
save_dir = "/content/drive/Shared drives/A/temp_file/zh_seq2seq_chatbot"
voc, pairs = loadPrepareData(corpus_name, word_segmentation_path)
print("\nQA pairs:")
for pair in pairs[:10]:
    print(pair)
To speed up convergence, we can drop the low-frequency words:
MIN_COUNT = 2  # rare-word threshold
def trimRareWords(voc, pairs, MIN_COUNT):
    # Remove words with frequency below MIN_COUNT from voc
    voc.trim(MIN_COUNT)
    # Pairs to keep
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        # Check the question
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break
        # Check the answer
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break
        # Keep the pair only if both question and answer consist entirely
        # of frequent words
        if keep_input and keep_output:
            keep_pairs.append(pair)
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs),
        len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs
# Apply the trimming
pairs = trimRareWords(voc, pairs, MIN_COUNT)
Building the Dataset
# Convert the words of a sentence to IDs
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]
# sentences is a list of ID lists of different lengths; zip_longest pads them
# with PAD_token to the length of the longest one and transposes the batch,
# so the result is laid out as (max_length, batch).
def zeroPadding(sentences, fillvalue=PAD_token):
    return list(itertools.zip_longest(*sentences, fillvalue=fillvalue))
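A small illustration with made-up ID lists, showing that zip_longest both pads and transposes:
example_batch = [[5, 6, 7], [8, 9], [10]]
print(zeroPadding(example_batch))
# -> [(5, 8, 10), (6, 9, 0), (7, 0, 0)]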
# sentences is the 2-D padded list.
# Returns m, the same size as sentences, with 0 wherever there is padding
# and 1 everywhere else. This m is the mask matrix.
def binaryMatrix(sentences, value=PAD_token):
    m = []
    for i, seq in enumerate(sentences):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m
# Convert the input sentences to IDs and pad them; also return lengths,
# recording each sentence's actual length.
# The returned padVar is a LongTensor of shape (max_length, batch) -- note
# that zeroPadding has transposed the batch -- and lengths is a tensor of
# shape (batch,) with the actual length of each sentence.
def inputVar(sentences, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in sentences]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths
# Pad the output sentences, use binaryMatrix to mark each position as padding
# (0) or non-padding (1), and also return the length of the longest sentence
# (i.e. the padded length).
# The returned padVar is a LongTensor of shape (max_target_len, batch);
# mask is a BoolTensor of the same shape.
def outputVar(sentences, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in sentences]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.ByteTensor(mask)
    # Convert to bool to avoid the deprecation warning: "masked_select received
    # a mask with dtype torch.uint8 ... please use a mask with dtype torch.bool".
    mask = mask.bool()
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len
# Process one batch of pairs
def batch2TrainData(voc, pair_batch):
    # Sort by input sentence length (token count), longest first, as
    # pack_padded_sequence requires
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len
# Example
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches
print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)
Defining the Model
Encoder
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        # Initialize the GRU. Input size and hidden size are both hidden_size,
        # because we assume the embedding layer's output size is hidden_size.
        # With a single layer no dropout is applied; otherwise the dropout
        # argument sets the GRU's inter-layer dropout.
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)
    def forward(self, input_seq, input_lengths, hidden=None):
        # The input is (max_length, batch); after the embedding it becomes
        # (max_length, batch, hidden_size)
        embedded = self.embedding(input_seq)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum the outputs of the two directions
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return the outputs for all time steps and the final hidden state
        return outputs, hidden
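A minimal shape check with made-up sizes (vocabulary of 100, hidden_size of 8, a batch of 3 sentences of lengths 4, 3 and 2, sorted longest first as pack_padded_sequence requires):
test_embedding = nn.Embedding(100, 8)
test_encoder = EncoderRNN(8, test_embedding, n_layers=2)
test_seq = torch.randint(3, 100, (4, 3))  # (max_length=4, batch=3)
test_lengths = torch.tensor([4, 3, 2])
test_out, test_hid = test_encoder(test_seq, test_lengths)
print(test_out.shape)  # torch.Size([4, 3, 8]) -- the two directions are summed
print(test_hid.shape)  # torch.Size([4, 3, 8]) -- n_layers * 2 directions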
Attention
# Luong attention layer
class Attn(torch.nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = torch.nn.Parameter(torch.FloatTensor(hidden_size))
    def dot_score(self, hidden, encoder_output):
        # hidden has shape (1, batch=64, hidden_size=500);
        # encoder_output has shape (input_lengths=10, batch=64, hidden_size=500).
        # hidden * encoder_output broadcasts to (10, 64, 500); summing over the
        # third dimension gives the scores.
        return torch.sum(hidden * encoder_output, dim=2)
    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)
    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1),
                                      encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)
    # Input: the decoder's state hidden at the current step and the encoder
    # outputs for all time steps, encoder_outputs.
    # Output: the attention probabilities, a vector of length input_lengths
    # that sums to 1.
    def forward(self, hidden, encoder_outputs):
        # Compute the attention scores. hidden has shape
        # (1, batch=64, hidden_size=500), the batch's state at step t;
        # encoder_outputs has shape (input_lengths=10, batch=64, hidden_size=500)
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            # Dot product; see dot_score above
            attn_energies = self.dot_score(hidden, encoder_outputs)
        # Transpose the max_length and batch dimensions:
        # attn_energies goes from (max_length=10, batch=64) to (64, 10)
        attn_energies = attn_energies.t()
        # Softmax turns the scores into probabilities; the shape stays (64, 10),
        # then unsqueeze(1) makes it (64, 1, 10)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)
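A quick shape check with random tensors and made-up sizes (batch of 2, input length of 5, hidden_size of 8):
test_attn = Attn('dot', 8)
test_hidden = torch.randn(1, 2, 8)   # decoder state at one time step
test_enc_out = torch.randn(5, 2, 8)  # encoder outputs for 5 time steps
test_weights = test_attn(test_hidden, test_enc_out)
print(test_weights.shape)       # torch.Size([2, 1, 5])
print(test_weights.sum(dim=2))  # every row sums to 1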
Decoder
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()
        # Save the parameters on self; attn_model names one of the scoring
        # methods of the Attn class defined above.
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        # Define the decoder's layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attn(attn_model, hidden_size)
    def forward(self, input_step, last_hidden, encoder_outputs):
        # Note: the decoder processes one time step at a time, because step t
        # must finish before step t+1 can start.
        # input_step has shape (1, 64): 64 is the batch, and each entry is the
        # current input word ID (the previous step's output).
        # The embedding turns it into (1, 64, 500); dropout keeps the shape.
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Run the GRU forward: rnn_output has shape (1, 64, 500);
        # hidden is (2, 64, 500) because the GRU has two layers.
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Compute the attention weights; as analyzed above, attn_weights has
        # shape (64, 1, 10)
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Weighted sum of the encoder outputs: (64, 1, 10) bmm (64, 10, 500)
        # gives the context vector (64, 1, 500)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate the context vector with the GRU output:
        # rnn_output goes from (1, 64, 500) to (64, 500)
        rnn_output = rnn_output.squeeze(0)
        # context goes from (64, 1, 500) to (64, 500)
        context = context.squeeze(1)
        # Concatenation gives (64, 1000)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # self.out maps hidden_size=500 to the vocabulary size, so output
        # has shape (64, vocab_size)
        output = self.out(concat_output)
        # Softmax turns the scores into the probability of each word at this step
        output = F.softmax(output, dim=1)
        # Return the output and the new hidden state
        return output, hidden
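And a single decoder step with made-up sizes (vocabulary of 100, hidden_size of 8, batch of 2, encoder length of 5), just to confirm the shapes:
test_emb = nn.Embedding(100, 8)
test_decoder = LuongAttnDecoderRNN('dot', test_emb, 8, 100, n_layers=2)
step_input = torch.LongTensor([[SOS_token, SOS_token]])  # (1, batch=2)
last_hidden = torch.randn(2, 2, 8)                       # (n_layers, batch, hidden)
enc_out = torch.randn(5, 2, 8)                           # (length, batch, hidden)
step_output, new_hidden = test_decoder(step_input, last_hidden, enc_out)
print(step_output.shape)  # torch.Size([2, 100]) -- a distribution over the vocabulary
print(new_hidden.shape)   # torch.Size([2, 2, 8])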
Masked Loss
def maskNLLLoss(inp, target, mask):
    # Count the real (non-padding) tokens: padding is 0 and non-padding is 1,
    # so the sum is the token count.
    nTotal = mask.sum()
    # The cross entropy is computed with gather, which is a fairly low-level
    # approach; CrossEntropyLoss or NLLLoss would be more convenient.
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()
# gather lets us pick out the predicted probability of the correct class for
# every example in the batch without a for loop:
# inp = torch.tensor([[0.3, 0.2, 0.4, 0.1], [0.2, 0.1, 0.4, 0.3]])
# target = torch.tensor([2, 3])  # the 3rd entry is correct for the first row,
#                                # the 4th for the second row
# selected = torch.gather(inp, 1, target.view(-1, 1))
# print(selected)
# Output:
# tensor([[ 0.4000],
#         [ 0.3000]])
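Putting the pieces together, here is maskNLLLoss on the same made-up numbers, with the second position treated as padding:
probs = torch.tensor([[0.3, 0.2, 0.4, 0.1], [0.2, 0.1, 0.4, 0.3]])
targets = torch.tensor([2, 3])
step_mask = torch.tensor([True, False])  # only the first token is real
step_loss, n_real = maskNLLLoss(probs, targets, step_mask)
print(step_loss.item(), n_real)  # -log(0.4) ≈ 0.9163, averaged over 1 real token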
Defining the Training Step
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    # Zero the gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    # Move everything to the right device, so this works with or without a GPU
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0
    # Encoder forward pass
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
    # The decoder's first input is SOS; build a (1, batch) tensor holding the
    # batch's inputs at the first time step.
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)
    # Note: the encoder is bidirectional but the decoder is unidirectional,
    # so we take the bottom n_layers of the encoder's hidden state.
    decoder_hidden = encoder_hidden[:decoder.n_layers]
    # Decide whether to use teacher forcing for this batch
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    # Process one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: the next input is the current ground-truth token
            decoder_input = target_variable[t].view(1, -1)
            # Accumulate the loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: the next input is the model's most likely prediction
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Accumulate the loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    # Backpropagate
    loss.backward()
    # Clip the gradients of both encoder and decoder
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
    # Update the parameters
    encoder_optimizer.step()
    decoder_optimizer.step()
    return sum(print_losses) / n_totals
Defining the Training Loop
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
               embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
               print_every, save_every, clip, corpus_name, loadFilename):
    # Randomly sample n_iteration batches of pairs
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                        for _ in range(n_iteration)]
    # Initialization
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1
    # Training
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        input_variable, lengths, target_variable, mask, max_target_len = training_batch
        # Train on one batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss
        # Report progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}"
                  .format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0
        # Save a checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'
                                     .format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))
Defining Evaluation
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    def forward(self, input_seq, input_length, max_length):
        # Encoder forward pass
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Use the encoder's final hidden state as the decoder's initial state
        # (self.decoder here, rather than the global decoder)
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Our functions all expect (time, batch), so even a single example
        # has to be two-dimensional.
        # The decoder's first input is SOS
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Tensors for accumulating the decoded result
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Decoding loop: only the length limit stops it; EOS is stripped later.
        for _ in range(max_length):
            # One decoder step
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden,
                                                          encoder_outputs)
            # decoder_output is (batch=1, vocab_size);
            # max returns the most likely word and its score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Append the result to all_tokens and all_scores
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # decoder_input is the ID of the word just produced; it is
            # one-dimensional because max removes a dimension, but the decoder
            # expects a batch dimension, so unsqueeze adds it back.
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return all the words and scores
        return all_tokens, all_scores
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    # Convert the input sentence to a batch of IDs
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create the lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose to (time, batch)
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Move to the right device (e.g. the GPU)
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode with the searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # Turn the IDs back into words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words
def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Read input from the terminal
            input_sentence = input('> ')
            # Check whether the user wants to quit
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize the sentence
            input_sentence = word_filter(jieba.cut(input_sentence) if word_wise else input_sentence)
            # Generate the response
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Drop everything from EOS onwards
            words = []
            for word in output_words:
                if word == 'EOS':
                    break
                elif word != 'PAD':
                    words.append(word)
            print('Bot:', ''.join(words))
        except KeyError:
            print("Error: Encountered unknown word.")
Initializing the Model
# Model configuration
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64
# The checkpoint to resume from; if None, train from scratch.
loadFilename = None
checkpoint_iter = 4000
# If loadFilename is not None, load the model from it
if loadFilename:
    # If training and loading happen on the same machine, load directly
    checkpoint = torch.load(loadFilename)
    # If instead the checkpoint was saved on a GPU but we are now training or
    # testing on a CPU, use the map_location line below instead:
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']
print('Building encoder and decoder ...')
# Initialize the word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize the encoder and decoder
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words,
                              decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Move to the right device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')
Training
# Training hyperparameters and optimizers
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500
# Switch to training mode, which enables dropout
encoder.train()
decoder.train()
# Initialize the optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)
# Start training
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)
Output:
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1; Percent complete: 0.0%; Average loss: 8.3130
Iteration: 2; Percent complete: 0.1%; Average loss: 8.2717
Iteration: 3; Percent complete: 0.1%; Average loss: 8.2047
Iteration: 4; Percent complete: 0.1%; Average loss: 8.1339
Iteration: 5; Percent complete: 0.1%; Average loss: 7.9634
Iteration: 6; Percent complete: 0.1%; Average loss: 7.6764
Iteration: 7; Percent complete: 0.2%; Average loss: 7.3193
Iteration: 8; Percent complete: 0.2%; Average loss: 7.0421
...
Iteration: 3992; Percent complete: 99.8%; Average loss: 1.8682
Iteration: 3993; Percent complete: 99.8%; Average loss: 1.3369
Iteration: 3994; Percent complete: 99.9%; Average loss: 1.7045
Iteration: 3995; Percent complete: 99.9%; Average loss: 1.3252
Iteration: 3996; Percent complete: 99.9%; Average loss: 1.5306
Iteration: 3997; Percent complete: 99.9%; Average loss: 1.3957
Iteration: 3998; Percent complete: 100.0%; Average loss: 1.6272
Iteration: 3999; Percent complete: 100.0%; Average loss: 1.1216
Iteration: 4000; Percent complete: 100.0%; Average loss: 1.1116
Testing
# Switch to eval mode, which disables dropout
encoder.eval()
decoder.eval()
# Build the searcher
searcher = GreedySearchDecoder(encoder, decoder)
# Chat with the model
evaluateInput(encoder, decoder, searcher, voc)
Let's see how it does; the results feel decent (English glosses added in parentheses):
> 你叫什么名字? (What's your name?)
Bot: 我叫菲菲 (I'm Feifei)
> 我也叫菲菲 (I'm Feifei too)
Bot: 你不覺得這個話題沒什么意思么 (Don't you think this topic is a bit boring?)
> 好吧,是沒啥意思 (Fine, it is a bit boring)
Bot: 哦 (Oh)
> 嘿嘿 (Hehe)
Bot: 哇靠,你咋笑得這么猥瑣 (Whoa, why is your laugh so creepy?)
> 。。。 (...)
Bot: 不要這樣說嘛 (Hey, don't say that)