Following on from the previous post (https://www.cnblogs.com/Harukaze/p/14266380.html), this one walks through the author's model code in detail.
trainer = GCNTrainer(opt, emb_matrix=emb_matrix)   # train a model from scratch

trainer = GCNTrainer(model_opt)                     # load a pretrained model
trainer.load(model_file)

loss = trainer.update(batch)                        # one training step, returns the loss

preds, _, loss = trainer.predict(batch)             # forward a batch only, e.g. to check dev performance

trainer.save(model_file, epoch)                     # save the model at the current epoch together with its config

trainer.update_lr(current_lr)                       # adjust the learning rate when needed
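A rough sketch of how these calls typically fit together in a training script (train_batches, dev_batches, model_file and the opt keys used here are placeholders, not necessarily the repo's exact names):

trainer = GCNTrainer(opt, emb_matrix=emb_matrix)
current_lr = opt['lr']
for epoch in range(1, opt['num_epoch'] + 1):
    train_loss = 0.0
    for batch in train_batches:                  # batches in the format unpack_batch() expects
        train_loss += trainer.update(batch)      # forward + backward + optimizer step
    dev_loss, dev_preds = 0.0, []
    for batch in dev_batches:
        preds, _, loss = trainer.predict(batch)  # forward only, no parameter update
        dev_preds += preds
        dev_loss += loss
    trainer.save(model_file, epoch)              # checkpoint the model and config
    # optionally decay the learning rate, e.g. when dev loss stops improving:
    # current_lr *= lr_decay; trainer.update_lr(current_lr)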
The details of GCNTrainer():
def unpack_batch(batch, cuda):
    if cuda:
        inputs = [Variable(b.cuda()) for b in batch[:10]]  # the first 10 tensors of data[i:i+batch_size]
        labels = Variable(batch[10].cuda())
    else:
        inputs = [Variable(b) for b in batch[:10]]
        labels = Variable(batch[10])
    tokens = batch[0]
    head = batch[5]
    subj_pos = batch[6]
    obj_pos = batch[7]
    lens = batch[1].eq(0).long().sum(1).squeeze()  # sum(1) sums over each row
    # batch[1] is the word mask (1/True at PAD positions), so eq(0).sum(1)
    # gives the real length of each sentence, e.g. [3,5,7,8,...,12]
    return inputs, labels, tokens, head, subj_pos, obj_pos, lens

class GCNTrainer(Trainer):
    def __init__(self, opt, emb_matrix=None):
        self.opt = opt
        self.emb_matrix = emb_matrix
        self.model = GCNClassifier(opt, emb_matrix=emb_matrix)
        self.criterion = nn.CrossEntropyLoss()
        self.parameters = [p for p in self.model.parameters() if p.requires_grad]
        if opt['cuda']:
            self.model.cuda()
            self.criterion.cuda()
        self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])

    def update(self, batch):
        inputs, labels, tokens, head, subj_pos, obj_pos, lens = unpack_batch(batch, self.opt['cuda'])

        # step forward
        self.model.train()
        self.optimizer.zero_grad()
        logits, pooling_output = self.model(inputs)
        loss = self.criterion(logits, labels)
        # labels = [0,1,3,4,...,41,0]: the relation id for each of the batch_size examples
        # logits = [b,42]
        # l2 decay on all conv layers
        if self.opt.get('conv_l2', 0) > 0:
            loss += self.model.conv_l2() * self.opt['conv_l2']
        # l2 penalty on output representations
        if self.opt.get('pooling_l2', 0) > 0:
            loss += self.opt['pooling_l2'] * (pooling_output ** 2).sum(1).mean()
        loss_val = loss.item()
        # backward
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.opt['max_grad_norm'])
        self.optimizer.step()
        return loss_val

    def predict(self, batch, unsort=True):
        inputs, labels, tokens, head, subj_pos, obj_pos, lens = unpack_batch(batch, self.opt['cuda'])
        orig_idx = batch[11]
        # forward
        self.model.eval()
        logits, _ = self.model(inputs)
        loss = self.criterion(logits, labels)
        probs = F.softmax(logits, 1).data.cpu().numpy().tolist()  # [b,42]
        predictions = np.argmax(logits.data.cpu().numpy(), axis=1).tolist()  # argmax over each row -> [b]
        if unsort:
            _, predictions, probs = [list(t) for t in zip(*sorted(zip(orig_idx, \
                    predictions, probs)))]
        return predictions, probs, loss.item()
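The last few lines of predict() undo the length-based sorting done by the data loader. A tiny standalone example of that zip/sorted/zip idiom (with made-up values):

orig_idx = [2, 0, 1]                 # where each sorted example sat in the original batch
predictions = ['rel_a', 'rel_b', 'rel_c']
probs = [[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]]

# sort the triples by orig_idx, then unzip back into separate lists
_, predictions, probs = [list(t) for t in zip(*sorted(zip(orig_idx, predictions, probs)))]
print(predictions)  # ['rel_b', 'rel_c', 'rel_a'] -- back in the original order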
As the bodies of update() and predict() show, whether we are returning the training loss from update() or the dev-set predictions from predict(), both calls go through self.model(inputs), i.e. the GCNClassifier constructed in __init__:
class GCNClassifier(nn.Module):
    """ A wrapper classifier for GCNRelationModel. """
    def __init__(self, opt, emb_matrix=None):
        super().__init__()
        self.gcn_model = GCNRelationModel(opt, emb_matrix=emb_matrix)
        in_dim = opt['hidden_dim']
        self.classifier = nn.Linear(in_dim, opt['num_class'])
        self.opt = opt

    def conv_l2(self):
        return self.gcn_model.gcn.conv_l2()

    def forward(self, inputs):
        outputs, pooling_output = self.gcn_model(inputs)  # outputs.shape = pooling_output.shape = [b,200]
        logits = self.classifier(outputs)                 # logits.shape = [b,42]
        return logits, pooling_output
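A quick shape check of the wrapper with dummy tensors (hidden_dim = 200 and num_class = 42 as in the default config; the tensors here just stand in for the real GCN output):

import torch
import torch.nn as nn
import torch.nn.functional as F

b = 4
outputs = torch.randn(b, 200)      # stands in for what gcn_model returns
classifier = nn.Linear(200, 42)
logits = classifier(outputs)       # [4, 42], one score per relation class
probs = F.softmax(logits, dim=1)   # each row sums to 1
preds = logits.argmax(dim=1)       # [4], predicted relation ids
print(logits.shape, preds.shape)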
GCNClassifier() is made of two parts: the GCNRelationModel, plus a single linear layer that maps its output to the relation classes. The GCNRelationModel() itself:
class GCNRelationModel(nn.Module):
    def __init__(self, opt, emb_matrix=None):
        super().__init__()
        self.opt = opt
        self.emb_matrix = emb_matrix

        # create embedding layers
        self.emb = nn.Embedding(opt['vocab_size'], opt['emb_dim'], padding_idx=constant.PAD_ID)
        self.pos_emb = nn.Embedding(len(constant.POS_TO_ID), opt['pos_dim']) if opt['pos_dim'] > 0 else None
        self.ner_emb = nn.Embedding(len(constant.NER_TO_ID), opt['ner_dim']) if opt['ner_dim'] > 0 else None
        embeddings = (self.emb, self.pos_emb, self.ner_emb)
        self.init_embeddings()

        # gcn layer
        self.gcn = GCN(opt, embeddings, opt['hidden_dim'], opt['num_layers'])  # hidden_dim = 200, num_layers = 2

        # output mlp layers
        in_dim = opt['hidden_dim'] * 3
        layers = [nn.Linear(in_dim, opt['hidden_dim']), nn.ReLU()]
        for _ in range(self.opt['mlp_layers'] - 1):  # mlp_layers = 2
            layers += [nn.Linear(opt['hidden_dim'], opt['hidden_dim']), nn.ReLU()]
        self.out_mlp = nn.Sequential(*layers)

    def init_embeddings(self):
        if self.emb_matrix is None:
            self.emb.weight.data[1:,:].uniform_(-1.0, 1.0)
        else:
            self.emb_matrix = torch.from_numpy(self.emb_matrix)
            self.emb.weight.data.copy_(self.emb_matrix)
        # decide finetuning
        if self.opt['topn'] <= 0:
            print("Do not finetune word embedding layer.")
            self.emb.weight.requires_grad = False
        elif self.opt['topn'] < self.opt['vocab_size']:
            print("Finetune top {} word embeddings.".format(self.opt['topn']))
            self.emb.weight.register_hook(lambda x: \
                    torch_utils.keep_partial_grad(x, self.opt['topn']))
        else:
            print("Finetune all embeddings.")

    def forward(self, inputs):
        words, masks, pos, ner, deprel, head, subj_pos, obj_pos, subj_type, obj_type = inputs  # unpack
        l = (masks.data.cpu().numpy() == 0).astype(np.int64).sum(1)  # turn True/False in the mask into 1/0 and count the real tokens in each sentence
        maxlen = max(l)

        def inputs_to_tree_reps(head, words, l, prune, subj_pos, obj_pos):
            head, words, subj_pos, obj_pos = head.cpu().numpy(), words.cpu().numpy(), subj_pos.cpu().numpy(), obj_pos.cpu().numpy()
            trees = [head_to_tree(head[i], words[i], l[i], prune, subj_pos[i], obj_pos[i]) for i in range(len(l))]
            adj = [tree_to_adj(maxlen, tree, directed=False, self_loop=False).reshape(1, maxlen, maxlen) for tree in trees]
            adj = np.concatenate(adj, axis=0)  # stack along axis 0, shape = [b,maxlen,maxlen]
            adj = torch.from_numpy(adj)
            return Variable(adj.cuda()) if self.opt['cuda'] else Variable(adj)

        # using .data lets us read/modify tensor values without autograd tracking them (no effect on backprop)
        # subj_pos / obj_pos are the relative distances of each token to the subject / object entity, e.g. [-3,-2,-1,0,0,0,1,2,3]
        adj = inputs_to_tree_reps(head.data, words.data, l, self.opt['prune_k'], subj_pos.data, obj_pos.data)
        h, pool_mask = self.gcn(adj, inputs)  # feed this batch's adjacency matrices together with the inputs into the GCN
        # h = gcn_inputs: [b,maxlen,200], pool_mask = mask: [b,maxlen,1]

        # pooling
        subj_mask, obj_mask = subj_pos.eq(0).eq(0).unsqueeze(2), obj_pos.eq(0).eq(0).unsqueeze(2)  # invert mask
        # the first eq(0) marks entity positions True and everything else False; the second eq(0) inverts that,
        # so entity positions are False and everything else True; shape: [b,maxlen,1]
        pool_type = self.opt['pooling']
        h_out = pool(h, pool_mask, type=pool_type)     # shape: [b,200]
        subj_out = pool(h, subj_mask, type=pool_type)  # shape: [b,200]
        obj_out = pool(h, obj_mask, type=pool_type)    # shape: [b,200]
        outputs = torch.cat([h_out, subj_out, obj_out], dim=1)  # [b,600]
        outputs = self.out_mlp(outputs)                # shape: [b,200]
        return outputs, h_out
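The double eq(0) that builds subj_mask/obj_mask is easiest to see on a toy position tensor (values made up): the subject occupies the positions with relative distance 0, and the resulting mask is True everywhere except the entity, which is exactly what pool()'s masked_fill needs.

import torch

# relative distances of each token to the subject entity for one sentence
subj_pos = torch.tensor([[-2, -1, 0, 0, 1, 2]])

step1 = subj_pos.eq(0)   # True at the entity tokens: [F, F, T, T, F, F]
subj_mask = step1.eq(0)  # inverted: True everywhere except the entity
print(subj_mask)
# tensor([[ True,  True, False, False,  True,  True]])
print(subj_mask.unsqueeze(2).shape)  # [1, 6, 1]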
It consists of three parts: the embedding layers, the GCN layer, and the output MLP (mlp_layers = 2). Following the self.gcn = GCN(...) line, we step into the GCN module for the details:
class GCN(nn.Module):
    """ A GCN/Contextualized GCN module operated on dependency graphs. """
    def __init__(self, opt, embeddings, mem_dim, num_layers):
        super(GCN, self).__init__()
        self.opt = opt
        self.layers = num_layers   # num_layers = 2
        self.use_cuda = opt['cuda']
        self.mem_dim = mem_dim     # hidden_dim = 200
        self.in_dim = opt['emb_dim'] + opt['pos_dim'] + opt['ner_dim']  # 300+30+30

        self.emb, self.pos_emb, self.ner_emb = embeddings

        # rnn layer
        if self.opt.get('rnn', False):  # C-GCN: contextualize with a BiLSTM first
            input_size = self.in_dim
            self.rnn = nn.LSTM(input_size, opt['rnn_hidden'], opt['rnn_layers'], batch_first=True, \
                    dropout=opt['rnn_dropout'], bidirectional=True)  # rnn_layers = 1
            self.in_dim = opt['rnn_hidden'] * 2  # 400
            self.rnn_drop = nn.Dropout(opt['rnn_dropout'])  # use on last layer output

        self.in_drop = nn.Dropout(opt['input_dropout'])
        self.gcn_drop = nn.Dropout(opt['gcn_dropout'])

        # gcn layer
        self.W = nn.ModuleList()

        for layer in range(self.layers):
            input_dim = self.in_dim if layer == 0 else self.mem_dim
            # with the RNN, the first layer takes the 400-dim BiLSTM hidden states;
            # without it, the first layer takes the 300+30+30-dim embeddings
            self.W.append(nn.Linear(input_dim, self.mem_dim))

    def conv_l2(self):  # l2 weight decay on the GCN layers to reduce overfitting
        conv_weights = []
        for w in self.W:
            conv_weights += [w.weight, w.bias]
        return sum([x.pow(2).sum() for x in conv_weights])

    def encode_with_rnn(self, rnn_inputs, masks, batch_size):  # rnn_inputs = embs: [40,max(len(x)),360]
        seq_lens = list(masks.data.eq(constant.PAD_ID).long().sum(1).squeeze())  # length of each sentence in the batch
        h0, c0 = rnn_zero_state(batch_size, self.opt['rnn_hidden'], self.opt['rnn_layers'])  # shape: [2,40,200], rnn_layers = 1, rnn_hidden = 200
        rnn_inputs = nn.utils.rnn.pack_padded_sequence(rnn_inputs, seq_lens, batch_first=True)
        rnn_outputs, (ht, ct) = self.rnn(rnn_inputs, (h0, c0))
        rnn_outputs, _ = nn.utils.rnn.pad_packed_sequence(rnn_outputs, batch_first=True)  # [40,max(len(x)),400]
        return rnn_outputs

    def forward(self, adj, inputs):
        words, masks, pos, ner, deprel, head, subj_pos, obj_pos, subj_type, obj_type = inputs  # unpack
        word_embs = self.emb(words)  # shape: [40,max(len(x)),300]
        embs = [word_embs]
        if self.opt['pos_dim'] > 0:
            embs += [self.pos_emb(pos)]
        if self.opt['ner_dim'] > 0:
            embs += [self.ner_emb(ner)]
        embs = torch.cat(embs, dim=2)  # shape: [40,max(len(x)),300+30+30]
        embs = self.in_drop(embs)

        # rnn layer
        if self.opt.get('rnn', False):
            gcn_inputs = self.rnn_drop(self.encode_with_rnn(embs, masks, words.size()[0]))  # BiLSTM output, shape: [40,max(len(x)),400]
        else:
            gcn_inputs = embs  # shape: [40,max(len(x)),300+30+30]

        # gcn layer
        denom = adj.sum(2).unsqueeze(2) + 1
        # [b,maxlen,maxlen] -> [b,maxlen] -> [b,maxlen,1], then add 1 everywhere so we never divide by zero;
        # as I understand it, adj.sum(2) counts each node's outgoing edges
        mask = (adj.sum(2) + adj.sum(1)).eq(0).unsqueeze(2)  # adj.sum(1) counts each node's incoming edges
        # [b,maxlen] + [b,maxlen] = [b,maxlen]; positions with no edges at all become True, the rest False -> [b,maxlen,1]
        # zero out adj for ablation
        if self.opt.get('no_adj', False):
            adj = torch.zeros_like(adj)

        # the loop below follows the GCN update formula closely
        for l in range(self.layers):  # self.layers = 2
            Ax = adj.bmm(gcn_inputs)  # [b,maxlen,maxlen] x [b,maxlen,400] (or [b,maxlen,360] without the RNN) = [b,maxlen,400/360]
            AxW = self.W[l](Ax)       # first layer: W[0] = nn.Linear(400/360, 200), so AxW.shape = [b,maxlen,200]
            AxW = AxW + self.W[l](gcn_inputs)  # self loop
            # nicely written: the first term is WAh and the second is Wh; adding them lets W be factored out
            # as W(A+I)h, which matches the formula in the paper
            AxW = AxW / denom  # normalize by the degree of node i in the graph (+1)

            gAxW = F.relu(AxW)
            gcn_inputs = self.gcn_drop(gAxW) if l < self.layers - 1 else gAxW
            # dropout on every layer except the last

        return gcn_inputs, mask  # gcn_inputs: [b,maxlen,200], mask: [b,maxlen,1]

def pool(h, mask, type='max'):  # h: [b,maxlen,200], mask: [b,maxlen,1]
    if type == 'max':
        h = h.masked_fill(mask, -constant.INFINITY_NUMBER)  # fill the positions where the mask is True with -inf
        return torch.max(h, 1)[0]  # [0] keeps only the max values (not the indices), taken over dim 1, i.e. over the maxlen token positions
    elif type == 'avg':
        h = h.masked_fill(mask, 0)
        return h.sum(1) / (mask.size(1) - mask.float().sum(1))
    else:
        h = h.masked_fill(mask, 0)
        return h.sum(1)

def rnn_zero_state(batch_size, hidden_dim, num_layers, bidirectional=True, use_cuda=True):
    total_layers = num_layers * 2 if bidirectional else num_layers
    state_shape = (total_layers, batch_size, hidden_dim)  # shape: [2,40,200]
    h0 = c0 = Variable(torch.zeros(*state_shape), requires_grad=False)
    if use_cuda:
        return h0.cuda(), c0.cuda()
    else:
        return h0, c0
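To make the W(A+I)h / d update concrete, here is a tiny hand-rolled version of one GCN layer on a 3-node graph (random weights, no pruning or masking; the bias is dropped so the factoring is exact):

import torch
import torch.nn as nn
import torch.nn.functional as F

# one sentence (b=1) with 3 tokens; undirected edges 0-1 and 1-2, no self loops
adj = torch.tensor([[[0., 1., 0.],
                     [1., 0., 1.],
                     [0., 1., 0.]]])   # [1, 3, 3]
h = torch.randn(1, 3, 360)             # stands in for gcn_inputs (no-RNN case)
W = nn.Linear(360, 200, bias=False)

denom = adj.sum(2).unsqueeze(2) + 1    # degree + 1 (self loop): 2, 3, 2
Ax = adj.bmm(h)                        # neighbor sum, [1, 3, 360]
out = F.relu((W(Ax) + W(h)) / denom)   # (W A h + W h) / d, shape [1, 3, 200]

# identical to computing W (A + I) h / d directly
eye = torch.eye(3).unsqueeze(0)
alt = F.relu(W((adj + eye).bmm(h)) / denom)
print(torch.allclose(out, alt, atol=1e-5))  # True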
For the np.concatenate(adj, axis=0) call in GCNRelationModel(), here is a small example:
import numpy as np
a = [np.zeros((5,5)).reshape(1,5,5) for _ in range(3)]
adj = np.concatenate(a, axis=0)
print(adj.shape)  # (3, 5, 5)
Finally, I attached a detailed walk-through of the pooling part; if I find the time I will convert it to web text, otherwise never mind. max(0) and max(1) on a 2-D matrix correspond to max(1) and max(2) on a 3-D tensor, and the same holds for sum() and softmax().
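A quick check of that dim correspondence, using the same pattern as pool() (toy tensors):

import torch

m2d = torch.tensor([[1., 5., 3.],
                    [4., 2., 6.]])
print(torch.max(m2d, 0)[0])  # column-wise max: tensor([4., 5., 6.])
print(torch.max(m2d, 1)[0])  # row-wise max:    tensor([5., 6.])

# for a 3-D [b, maxlen, hidden] tensor the same reductions shift by one dim:
h = m2d.unsqueeze(0)         # [1, 2, 3], pretend b=1, maxlen=2, hidden=3
print(torch.max(h, 1)[0])    # max over the maxlen positions, as in pool(): shape [1, 3]
print(h.sum(1))              # likewise sum over maxlen: tensor([[5., 7., 9.]])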
References:
The store_true / store_false options of argparse's action argument: https://blog.csdn.net/weixin_39943271/article/details/79076376
PyTorch basics: torch.sum(dim=int) and tensor dimensions (torch.Size): https://blog.csdn.net/hahameier/article/details/103742831
The ambiguity of the axis parameter in Pandas vs NumPy: https://www.cnblogs.com/rrttp/p/8028421.html
PyTorch's max() function: https://blog.csdn.net/liuweiyuxiang/article/details/84668269
https://www.jianshu.com/p/3ed11362b54f