The main function: load the data and run training.

```python
import torch
from sklearn.metrics import f1_score

from utils import load_data, EarlyStopping


def score(logits, labels):
    """Compute accuracy, micro-F1 and macro-F1."""
    _, indices = torch.max(logits, dim=1)
    prediction = indices.long().cpu().numpy()
    labels = labels.cpu().numpy()
    accuracy = (prediction == labels).sum() / len(prediction)
    micro_f1 = f1_score(labels, prediction, average='micro')
    macro_f1 = f1_score(labels, prediction, average='macro')
    return accuracy, micro_f1, macro_f1


def evaluate(model, g, features, labels, mask, loss_func):
    """Evaluate the model on the nodes selected by mask."""
    model.eval()
    with torch.no_grad():
        logits = model(g, features)
    loss = loss_func(logits[mask], labels[mask])
    accuracy, micro_f1, macro_f1 = score(logits[mask], labels[mask])
    return loss, accuracy, micro_f1, macro_f1


def main(args):
    # If args['hetero'] is True, g is a heterogeneous graph.
    # Otherwise, it is a list of homogeneous graphs.
    g, features, labels, num_classes, train_idx, val_idx, test_idx, train_mask, \
        val_mask, test_mask = load_data(args['dataset'])

    if hasattr(torch, 'BoolTensor'):
        train_mask = train_mask.bool()
        val_mask = val_mask.bool()
        test_mask = test_mask.bool()

    features = features.to(args['device'])
    labels = labels.to(args['device'])
    train_mask = train_mask.to(args['device'])
    val_mask = val_mask.to(args['device'])
    test_mask = test_mask.to(args['device'])

    if args['hetero']:
        from model_hetero import HAN
        model = HAN(meta_paths=[['pa', 'ap'], ['pf', 'fp']],
                    in_size=features.shape[1],
                    hidden_size=args['hidden_units'],
                    out_size=num_classes,
                    num_heads=args['num_heads'],
                    dropout=args['dropout']).to(args['device'])
        g = g.to(args['device'])
    else:
        from model import HAN
        model = HAN(num_meta_paths=len(g),
                    in_size=features.shape[1],
                    hidden_size=args['hidden_units'],
                    out_size=num_classes,
                    num_heads=args['num_heads'],
                    dropout=args['dropout']).to(args['device'])
        # A list of homogeneous graphs, one per metapath.
        g = [graph.to(args['device']) for graph in g]

    stopper = EarlyStopping(patience=args['patience'])
    loss_fcn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'],
                                 weight_decay=args['weight_decay'])

    for epoch in range(args['num_epochs']):
        model.train()
        logits = model(g, features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_acc, train_micro_f1, train_macro_f1 = score(
            logits[train_mask], labels[train_mask])
        val_loss, val_acc, val_micro_f1, val_macro_f1 = evaluate(
            model, g, features, labels, val_mask, loss_fcn)
        early_stop = stopper.step(val_loss.data.item(), val_acc, model)

        print('Epoch {:d} | Train Loss {:.4f} | Train Micro f1 {:.4f} | Train Macro f1 {:.4f} | '
              'Val Loss {:.4f} | Val Micro f1 {:.4f} | Val Macro f1 {:.4f}'.format(
                  epoch + 1, loss.item(), train_micro_f1, train_macro_f1,
                  val_loss.item(), val_micro_f1, val_macro_f1))

        if early_stop:
            break

    stopper.load_checkpoint(model)
    test_loss, test_acc, test_micro_f1, test_macro_f1 = evaluate(
        model, g, features, labels, test_mask, loss_fcn)
    print('Test loss {:.4f} | Test Micro f1 {:.4f} | Test Macro f1 {:.4f}'.format(
        test_loss.item(), test_micro_f1, test_macro_f1))


if __name__ == '__main__':
    import argparse
    from utils import setup

    parser = argparse.ArgumentParser('HAN')
    parser.add_argument('-s', '--seed', type=int, default=1,
                        help='Random seed')
    parser.add_argument('-ld', '--log-dir', type=str, default='results',
                        help='Dir for saving training results')
    parser.add_argument('--hetero', type=bool, default=True,
                        help="Use metapath coalescing with DGL's own dataset")
    args = parser.parse_args().__dict__
    args = setup(args)

    main(args)
```
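As a quick sanity check, `score` can be exercised on toy tensors (a minimal sketch; the logits and labels below are made up). In single-label classification, micro-F1 equals accuracy, which the example confirms:

```python
import torch

# Toy batch: 4 nodes, 3 classes; each row holds unnormalized logits.
logits = torch.tensor([[2.0, 0.1, 0.3],   # argmax -> class 0 (correct)
                       [0.2, 1.5, 0.1],   # argmax -> class 1 (correct)
                       [0.1, 0.2, 3.0],   # argmax -> class 2 (correct)
                       [1.0, 0.9, 0.8]])  # argmax -> class 0 (wrong, label is 1)
labels = torch.tensor([0, 1, 2, 1])

acc, micro_f1, macro_f1 = score(logits, labels)
print(acc, micro_f1, macro_f1)  # 0.75, 0.75, ~0.78
```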
utils handles data loading and the early-stopping strategy.

```python
import datetime
import errno
import os
import pickle
import random

import dgl
import numpy as np
import torch
from dgl.data.utils import download, get_download_dir, _get_dgl_url
from pprint import pprint
from scipy import io as sio
from scipy import sparse


def set_random_seed(seed=0):
    """Set random seed.

    Parameters
    ----------
    seed : int
        Random seed to use
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)


def mkdir_p(path, log=True):
    """Create a directory for the specified path.

    Parameters
    ----------
    path : str
        Path name
    log : bool
        Whether to print result for directory creation
    """
    try:
        os.makedirs(path)
        if log:
            print('Created directory {}'.format(path))
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path) and log:
            print('Directory {} already exists.'.format(path))
        else:
            raise


def get_date_postfix():
    """Get a date based postfix for directory name.

    Returns
    -------
    post_fix : str
    """
    dt = datetime.datetime.now()
    post_fix = '{}_{:02d}-{:02d}-{:02d}'.format(
        dt.date(), dt.hour, dt.minute, dt.second)
    return post_fix


def setup_log_dir(args, sampling=False):
    """Name and create directory for logging.

    Parameters
    ----------
    args : dict
        Configuration
    sampling : bool
        Whether we are using sampling based training

    Returns
    -------
    log_dir : str
        Path for logging directory
    """
    date_postfix = get_date_postfix()
    log_dir = os.path.join(
        args['log_dir'],
        '{}_{}'.format(args['dataset'], date_postfix))
    if sampling:
        log_dir = log_dir + '_sampling'
    mkdir_p(log_dir)
    return log_dir


# The configuration below is from the paper.
default_configure = {
    'lr': 0.005,             # Learning rate
    'num_heads': [8],        # Number of attention heads for node-level attention
    'hidden_units': 8,
    'dropout': 0.6,
    'weight_decay': 0.001,
    'num_epochs': 200,
    'patience': 100
}

sampling_configure = {
    'batch_size': 20
}


def setup(args):
    args.update(default_configure)
    set_random_seed(args['seed'])
    args['dataset'] = 'ACMRaw' if args['hetero'] else 'ACM'
    args['device'] = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    args['log_dir'] = setup_log_dir(args)
    return args


def setup_for_sampling(args):
    args.update(default_configure)
    args.update(sampling_configure)
    set_random_seed()
    args['device'] = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    args['log_dir'] = setup_log_dir(args, sampling=True)
    return args


def get_binary_mask(total_size, indices):
    mask = torch.zeros(total_size)
    mask[indices] = 1
    return mask.byte()


def load_acm(remove_self_loop):
    url = 'dataset/ACM3025.pkl'
    data_path = get_download_dir() + '/ACM3025.pkl'
    download(_get_dgl_url(url), path=data_path)

    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    labels, features = torch.from_numpy(data['label'].todense()).long(), \
                       torch.from_numpy(data['feature'].todense()).float()
    num_classes = labels.shape[1]
    labels = labels.nonzero()[:, 1]

    if remove_self_loop:
        num_nodes = data['label'].shape[0]
        data['PAP'] = sparse.csr_matrix(data['PAP'] - np.eye(num_nodes))
        data['PLP'] = sparse.csr_matrix(data['PLP'] - np.eye(num_nodes))

    # Adjacency matrices for meta path based neighbors
    # (Mufei): I verified both of them are binary adjacency matrices with self loops
    author_g = dgl.graph(data['PAP'], ntype='paper', etype='author')
    subject_g = dgl.graph(data['PLP'], ntype='paper', etype='subject')
    gs = [author_g, subject_g]

    train_idx = torch.from_numpy(data['train_idx']).long().squeeze(0)
    val_idx = torch.from_numpy(data['val_idx']).long().squeeze(0)
    test_idx = torch.from_numpy(data['test_idx']).long().squeeze(0)

    num_nodes = author_g.number_of_nodes()
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    print('dataset loaded')
    pprint({
        'dataset': 'ACM',
        'train': train_mask.sum().item() / num_nodes,
        'val': val_mask.sum().item() / num_nodes,
        'test': test_mask.sum().item() / num_nodes
    })

    return gs, features, labels, num_classes, train_idx, val_idx, test_idx, \
           train_mask, val_mask, test_mask


def load_acm_raw(remove_self_loop):
    assert not remove_self_loop
    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    download(_get_dgl_url(url), path=data_path)

    data = sio.loadmat(data_path)
    p_vs_l = data['PvsL']    # paper-field?
    p_vs_a = data['PvsA']    # paper-author
    p_vs_t = data['PvsT']    # paper-term, bag of words
    p_vs_c = data['PvsC']    # paper-conference, labels come from that

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    # Keep only papers published at the five conferences above.
    # Summing across columns (axis=1) gives one number per paper, which is
    # nonzero iff the paper appeared at one of the five conferences. .A1
    # flattens the (num_papers, 1) matrix into a 1-D array, .nonzero() turns
    # the boolean array into integer indices, and [0] unpacks the
    # single-element tuple that nonzero() returns.
    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    # Build one bipartite graph per relation...
    pa = dgl.bipartite(p_vs_a, 'paper', 'pa', 'author')
    ap = dgl.bipartite(p_vs_a.transpose(), 'author', 'ap', 'paper')
    pl = dgl.bipartite(p_vs_l, 'paper', 'pf', 'field')
    lp = dgl.bipartite(p_vs_l.transpose(), 'field', 'fp', 'paper')
    # ...and combine them into one heterogeneous graph.
    hg = dgl.hetero_from_relations([pa, ap, pl, lp])

    features = torch.FloatTensor(p_vs_t.toarray())

    # nonzero() returns a (row, col) tuple of the indices of nonzero entries:
    # rows are papers, columns are conferences.
    pc_p, pc_c = p_vs_c.nonzero()
    labels = np.zeros(len(p_selected), dtype=np.int64)  # one label per paper
    for conf_id, label_id in zip(conf_ids, label_ids):
        labels[pc_p[pc_c == conf_id]] = label_id        # label papers by their conference
    labels = torch.LongTensor(labels)

    num_classes = 3

    float_mask = np.zeros(len(pc_p))
    for conf_id in conf_ids:
        pc_c_mask = (pc_c == conf_id)
        float_mask[pc_c_mask] = np.random.permutation(np.linspace(0, 1, pc_c_mask.sum()))
    train_idx = np.where(float_mask <= 0.2)[0]
    val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
    test_idx = np.where(float_mask > 0.3)[0]

    num_nodes = hg.number_of_nodes('paper')  # number of paper nodes in the graph
    print(num_nodes)
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    return hg, features, labels, num_classes, train_idx, val_idx, test_idx, \
           train_mask, val_mask, test_mask


def load_data(dataset, remove_self_loop=False):
    if dataset == 'ACM':
        return load_acm(remove_self_loop)
    elif dataset == 'ACMRaw':
        return load_acm_raw(remove_self_loop)
    else:
        raise NotImplementedError('Unsupported dataset {}'.format(dataset))


class EarlyStopping(object):
    def __init__(self, patience=10):
        dt = datetime.datetime.now()
        self.filename = 'early_stop_{}_{:02d}-{:02d}-{:02d}.pth'.format(
            dt.date(), dt.hour, dt.minute, dt.second)
        self.patience = patience
        self.counter = 0
        self.best_acc = None
        self.best_loss = None
        self.early_stop = False

    def step(self, loss, acc, model):
        if self.best_loss is None:
            self.best_acc = acc
            self.best_loss = loss
            self.save_checkpoint(model)
        elif (loss > self.best_loss) and (acc < self.best_acc):
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            if (loss <= self.best_loss) and (acc >= self.best_acc):
                self.save_checkpoint(model)
            self.best_loss = np.min((loss, self.best_loss))
            self.best_acc = np.max((acc, self.best_acc))
            self.counter = 0
        return self.early_stop

    def save_checkpoint(self, model):
        """Saves model when validation loss decreases."""
        torch.save(model.state_dict(), self.filename)

    def load_checkpoint(self, model):
        """Load the latest checkpoint."""
        model.load_state_dict(torch.load(self.filename))
```
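One subtlety in `load_acm_raw` is the train/val/test split: within each conference, its papers receive a random permutation of evenly spaced values in [0, 1], so thresholding at 0.2 and 0.3 yields a roughly 20%/10%/70% split that is stratified per conference. A minimal standalone sketch with made-up sizes:

```python
import numpy as np

np.random.seed(0)
# Pretend 10 papers belong to conference 0 and 6 papers to conference 1.
pc_c = np.array([0] * 10 + [1] * 6)

float_mask = np.zeros(len(pc_c))
for conf_id in [0, 1]:
    pc_c_mask = (pc_c == conf_id)
    # Evenly spaced values in [0, 1], randomly shuffled within the conference.
    float_mask[pc_c_mask] = np.random.permutation(np.linspace(0, 1, pc_c_mask.sum()))

train_idx = np.where(float_mask <= 0.2)[0]
val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
test_idx = np.where(float_mask > 0.3)[0]
print(len(train_idx), len(val_idx), len(test_idx))  # roughly 20% / 10% / 70%
```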
The model (for the preprocessed ACM dataset).

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

from dgl.nn.pytorch import GATConv


class SemanticAttention(nn.Module):
    def __init__(self, in_size, hidden_size=128):
        super(SemanticAttention, self).__init__()

        self.project = nn.Sequential(
            nn.Linear(in_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1, bias=False)
        )

    def forward(self, z):
        w = self.project(z).mean(0)                     # (M, 1)
        beta = torch.softmax(w, dim=0)                  # (M, 1)
        beta = beta.expand((z.shape[0],) + beta.shape)  # (N, M, 1)
        return (beta * z).sum(1)                        # (N, D * K)


class HANLayer(nn.Module):
    """HAN layer.

    Arguments
    ---------
    num_meta_paths : number of homogeneous graphs generated from the metapaths.
    in_size : input feature dimension
    out_size : output feature dimension
    layer_num_heads : number of attention heads
    dropout : Dropout probability

    Inputs
    ------
    gs : list[DGLGraph]
        List of graphs
    h : tensor
        Input features

    Outputs
    -------
    tensor
        The output feature
    """
    def __init__(self, num_meta_paths, in_size, out_size, layer_num_heads, dropout):
        super(HANLayer, self).__init__()

        # One GAT layer for each meta path based adjacency matrix
        self.gat_layers = nn.ModuleList()
        for i in range(num_meta_paths):
            self.gat_layers.append(GATConv(in_size, out_size, layer_num_heads,
                                           dropout, dropout, activation=F.elu))
        self.semantic_attention = SemanticAttention(in_size=out_size * layer_num_heads)
        self.num_meta_paths = num_meta_paths

    def forward(self, gs, h):
        semantic_embeddings = []

        for i, g in enumerate(gs):
            semantic_embeddings.append(self.gat_layers[i](g, h).flatten(1))
        semantic_embeddings = torch.stack(semantic_embeddings, dim=1)  # (N, M, D * K)

        return self.semantic_attention(semantic_embeddings)           # (N, D * K)


class HAN(nn.Module):
    def __init__(self, num_meta_paths, in_size, hidden_size, out_size, num_heads, dropout):
        super(HAN, self).__init__()

        self.layers = nn.ModuleList()
        self.layers.append(HANLayer(num_meta_paths, in_size, hidden_size,
                                    num_heads[0], dropout))
        for l in range(1, len(num_heads)):
            self.layers.append(HANLayer(num_meta_paths, hidden_size * num_heads[l-1],
                                        hidden_size, num_heads[l], dropout))
        self.predict = nn.Linear(hidden_size * num_heads[-1], out_size)

    def forward(self, g, h):
        for gnn in self.layers:
            h = gnn(g, h)

        return self.predict(h)
```
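To make the shapes concrete, `SemanticAttention` can be run on random input (a sketch with made-up sizes: N = 5 nodes, M = 2 metapaths, D * K = 16):

```python
import torch

att = SemanticAttention(in_size=16)
z = torch.randn(5, 2, 16)  # (N, M, D * K): per-metapath node embeddings
out = att(z)               # (N, D * K): metapath embeddings fused by attention
print(out.shape)           # torch.Size([5, 16])
```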
The model (for the raw ACM dataset).

"""This model shows an example of using dgl.metapath_reachable_graph on the original heterogeneous graph. Because the original HAN implementation only gives the preprocessed homogeneous graph, this model could not reproduce the result in HAN as they did not provide the preprocessing code, and we constructed another dataset from ACM with a different set of papers, connections, features and labels. """ import torch import torch.nn as nn import torch.nn.functional as F import dgl from dgl.nn.pytorch import GATConv class SemanticAttention(nn.Module): def __init__(self, in_size, hidden_size=128): super(SemanticAttention, self).__init__() self.project = nn.Sequential( nn.Linear(in_size, hidden_size), nn.Tanh(), nn.Linear(hidden_size, 1, bias=False) ) def forward(self, z): w = self.project(z).mean(0) # (M, 1) 輸入維度為(N, M, D * K) 經過project層變為 (N, M, 1) , 對節點求mean之后變為 (M,1) beta = torch.softmax(w, dim=0) # (M, 1) 使用softmax歸一化 beta = beta.expand((z.shape[0],) + beta.shape) # (N, M, 1) 擴展 N 份 return (beta * z).sum(1) # (N, D * K) (N, M, 1)* (N, M, D*K) = (N , M , D*K) sum(1) -> (N, D*K) class HANLayer(nn.Module): """ HAN layer. Arguments --------- meta_paths : list of metapaths, each as a list of edge types in_size : input feature dimension out_size : output feature dimension layer_num_heads : number of attention heads dropout : Dropout probability Inputs ------ g : DGLHeteroGraph The heterogeneous graph h : tensor Input features Outputs ------- tensor The output feature """ def __init__(self, meta_paths, in_size, out_size, layer_num_heads, dropout): super(HANLayer, self).__init__() # One GAT layer for each meta path based adjacency matrix self.gat_layers = nn.ModuleList() for i in range(len(meta_paths)): self.gat_layers.append(GATConv(in_size, out_size, layer_num_heads, dropout, dropout, activation=F.elu)) self.semantic_attention = SemanticAttention(in_size=out_size * layer_num_heads) self.meta_paths = list(tuple(meta_path) for meta_path in meta_paths) self._cached_graph = None self._cached_coalesced_graph = {} def forward(self, g, h): semantic_embeddings = [] if self._cached_graph is None or self._cached_graph is not g: self._cached_graph = g self._cached_coalesced_graph.clear() for meta_path in self.meta_paths: self._cached_coalesced_graph[meta_path] = dgl.metapath_reachable_graph(g, meta_path) for i, meta_path in enumerate(self.meta_paths): new_g = self._cached_coalesced_graph[meta_path] # 通過metapath得到的同質圖 semantic_embeddings.append(self.gat_layers[i](new_g, h).flatten(1)) # (N, D*K) N是節點數, D是輸出的維度out_size, K是注意力頭數 semantic_embeddings = torch.stack(semantic_embeddings, dim=1) # (N, M, D * K) 堆疊M個元路徑的結果 return self.semantic_attention(semantic_embeddings) # (N, D * K) class HAN(nn.Module): def __init__(self, meta_paths, in_size, hidden_size, out_size, num_heads, dropout): super(HAN, self).__init__() self.layers = nn.ModuleList() self.layers.append(HANLayer(meta_paths, in_size, hidden_size, num_heads[0], dropout)) for l in range(1, len(num_heads)): self.layers.append(HANLayer(meta_paths, hidden_size * num_heads[l-1], hidden_size, num_heads[l], dropout)) self.predict = nn.Linear(hidden_size * num_heads[-1], out_size) def forward(self, g, h): for gnn in self.layers: h = gnn(g, h) # return self.predict(h)