使用dgl.heterograph()構建異質圖,其參數是一個字典,key是一個三元組(srctype , edgetype, dsttype), 這個三元組被稱為規范邊類型( canonical edge types)。value 是一堆源數組和目標數組。節點是從零開始的整數ID, 不同類型的節點ID具有單獨的計數。
import numpy as np import dgl import scipy.sparse as sp import networkx as nx ratings = dgl.heterograph( {('user', '+1', 'movie') : (np.array([0, 0, 1]), np.array([0, 1, 0])), ('user', '-1', 'movie') : (np.array([2]), np.array([1]))})
也可以從稀疏矩陣/networkX構造異質圖。
## 從稀疏矩陣構造圖 plus1 = sp.coo_matrix(([1, 1, 1], ([0, 0, 1], [0, 1, 0])), shape=(3, 2)) minus1 = sp.coo_matrix(([1], ([2], [1])), shape=(3, 2)) ratings = dgl.heterograph( {('user', '+1', 'movie') : plus1, ('user', '-1', 'movie') : minus1}) ## 從networkX構造圖 plus1 = nx.DiGraph() plus1.add_nodes_from(['u0', 'u1', 'u2'], bipartite=0) plus1.add_nodes_from(['m0', 'm1'], bipartite=1) plus1.add_edges_from([('u0', 'm0'), ('u0', 'm1'), ('u1', 'm0')]) ratings = dgl.heterograph( {('user', '+1', 'movie') : plus1, ('user', '-1', 'movie') : minus1})
使用ACM異質圖數據集:
import scipy.io import urllib.request data_url = 'https://data.dgl.ai/dataset/ACM.mat' data_file_path = '/tmp/ACM.mat' urllib.request.urlretrieve(data_url, data_file_path) data = scipy.io.loadmat(data_file_path) print(list(data.keys()))
['__header__', '__version__', '__globals__', 'TvsP', 'PvsA', 'PvsV', 'AvsF', 'VvsC', 'PvsL', 'PvsC', 'A', 'C', 'F', 'L', 'P', 'T', 'V', 'PvsT', 'CNormPvsA', 'RNormPvsA', 'CNormPvsC', 'RNormPvsC', 'CNormPvsT', 'RNormPvsT', 'CNormPvsV', 'RNormPvsV', 'CNormVvsC', 'RNormVvsC', 'CNormAvsF', 'RNormAvsF', 'CNormPvsL', 'RNormPvsL', 'stopwords', 'nPvsT', 'nT', 'CNormnPvsT', 'RNormnPvsT', 'nnPvsT', 'nnT', 'CNormnnPvsT', 'RNormnnPvsT', 'PvsP', 'CNormPvsP', 'RNormPvsP']
A代表作者, P代表論文, C代表會議,L是主題代碼; 邊存儲為鍵XvsY下的SciPy稀疏矩陣,其中X和Y可以是任何節點類型代碼。
輸出PvsA (paper - author) 的一些統計量。
print(type(data['PvsA'])) print('#Papers:', data['PvsA'].shape[0]) print('#Authors:', data['PvsA'].shape[1]) print('#Links:', data['PvsA'].nnz)
轉化Scipy Matrix 為 dgl.heterograph()。
pa_g = dgl.heterograph({('paper', 'written-by', 'author') : data['PvsA']}) # equivalent (shorter) API for creating heterograph with two node types: pa_g = dgl.bipartite(data['PvsA'], 'paper', 'written-by', 'author')
打印出類型名稱和其他結構信息。
print('Node types:', pa_g.ntypes) print('Edge types:', pa_g.etypes) print('Canonical edge types:', pa_g.canonical_etypes) # 節點和邊都是從零開始的整數ID,每種類型都有其自己的計數。要區分不同類型的節點和邊緣,需要指定類型名稱作為參數。 print(pa_g.number_of_nodes('paper')) # 如果規范邊類型名稱是唯一可區分的,則可以將其簡化為邊類型名稱。 print(pa_g.number_of_edges(('paper', 'written-by', 'author'))) print(pa_g.number_of_edges('written-by')) ## 獲得論文#1 的作者 print(pa_g.successors(1, etype='written-by'))
Node types: ['paper', 'author'] Edge types: ['written-by'] Canonical edge types: [('paper', 'written-by', 'author')] 12499 37055 37055 tensor([3532, 6421, 8516, 8560])
Metagraph
## Metagraph(或網絡模式)是異質圖結構的一個概覽。 被用作異質圖的模板,它描述了網絡中存在多少種對象以及可能存在的鏈接。 print(G.metagraph)
Graph(num_nodes={'author': 17431, 'paper': 12499, 'subject': 73}, num_edges={('paper', 'written-by', 'author'): 37055, ('author', 'writing', 'paper'): 37055, ('paper', 'citing', 'paper'): 30789, ('paper', 'cited', 'paper'): 30789, ('paper', 'is-about', 'subject'): 12499, ('subject', 'has', 'paper'): 12499}, metagraph=[('author', 'paper'), ('paper', 'author'), ('paper', 'paper'), ('paper', 'paper'), ('paper', 'subject'), ('subject', 'paper')])
半監督節點分類(Relational-GCN)
搭建異質圖神經網絡:
import dgl.function as fn class HeteroRGCNLayer(nn.Module): def __init__(self, in_size, out_size, etypes): super(HeteroRGCNLayer, self).__init__() # W_r for each relation self.weight = nn.ModuleDict({ name : nn.Linear(in_size, out_size) for name in etypes }) def forward(self, G, feat_dict): # The input is a dictionary of node features for each type funcs = {} for srctype, etype, dsttype in G.canonical_etypes: # 計算每一類etype的 W_r * h Wh = self.weight[etype](feat_dict[srctype]) # Save it in graph for message passing G.nodes[srctype].data['Wh_%s' % etype] = Wh # 消息函數 copy_u: 將源節點的特征聚合到'm'中; reduce函數: 將'm'求均值賦值給 'h' funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h')) # Trigger message passing of multiple types. # The first argument is the message passing functions for each relation. # The second one is the type wise reducer, could be "sum", "max", # "min", "mean", "stack" G.multi_update_all(funcs, 'sum') # return the updated node feature dictionary return {ntype : G.nodes[ntype].data['h'] for ntype in G.ntypes}
class HeteroRGCN(nn.Module): def __init__(self, G, in_size, hidden_size, out_size): super(HeteroRGCN, self).__init__() # Use trainable node embeddings as featureless inputs. embed_dict = {ntype : nn.Parameter(torch.Tensor(G.number_of_nodes(ntype), in_size)) for ntype in G.ntypes} for key, embed in embed_dict.items(): nn.init.xavier_uniform_(embed) self.embed = nn.ParameterDict(embed_dict) # create layers self.layer1 = HeteroRGCNLayer(in_size, hidden_size, G.etypes) self.layer2 = HeteroRGCNLayer(hidden_size, out_size, G.etypes) def forward(self, G): h_dict = self.layer1(G, self.embed) h_dict = {k : F.leaky_relu(h) for k, h in h_dict.items()} h_dict = self.layer2(G, h_dict) # get paper logits return h_dict['paper']
Train and evaluate
model = HeteroRGCN(G, 10, 10, 3) opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4) best_val_acc = 0 best_test_acc = 0 for epoch in range(100): logits = model(G) # The loss is computed only for labeled nodes. loss = F.cross_entropy(logits[train_idx], labels[train_idx]) pred = logits.argmax(1) train_acc = (pred[train_idx] == labels[train_idx]).float().mean() val_acc = (pred[val_idx] == labels[val_idx]).float().mean() test_acc = (pred[test_idx] == labels[test_idx]).float().mean() if best_val_acc < val_acc: best_val_acc = val_acc best_test_acc = test_acc opt.zero_grad() loss.backward() opt.step() if epoch % 5 == 0: print('Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % ( loss.item(), train_acc.item(), val_acc.item(), best_val_acc.item(), test_acc.item(), best_test_acc.item(), ))