圖上的機器學習任務通常有三種類型:整圖分類、節點分類和鏈接預測。本篇博客要實現的例子是節點分類,具體來說是用GCN對Cora數據集里的樣本進行分類。
Cora數據集介紹:
Cora數據集由許多機器學習領域的paper構成,這些paper被分為7個類別:
- Case_Based
- Genetic_Algorithms
- Neural_Networks
- Probabilistic_Methods
- Reinforcement_Learning
- Rule_Learning
- Theory
在該數據集中,每一篇論文至少引用了該數據集里面另外一篇論文或者被另外一篇論文所引用,數據集總共有2708篇papers。
在消除停用詞並除去文檔頻率小於10的詞匯之後,最終詞匯表中有1433個詞匯,所以每個節點的特征是1433維。特征中的0和1描述的是對應單詞在paper中是否存在。
把每一篇論文作為一個節點,根據論文之間的引用關系可以構建一個graph,包含2708個節點。0~139為訓練節點數據(共140個),140~639為驗證節點數據(共500個),1708~2707為測試節點數據(共1000個)。
代碼:(摘自https://github.com/rexrex9/gnn/blob/main/gcn.py。講解視頻https://www.bilibili.com/video/BV1W5411N78Y?from=search&seid=6220646189261474464&spm_id_from=333.337.0.0)
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.data import CoraGraphDataset
from dgl.nn.pytorch import GraphConv
class GCN(nn.Module):
    """Stack of DGL ``GraphConv`` layers applied to one fixed graph.

    The stack consists of an input layer (``in_feats -> n_hidden``),
    ``n_layers - 1`` hidden layers (``n_hidden -> n_hidden``) and an output
    layer (``n_hidden -> n_classes``).  No activation is passed to the output
    layer, so ``forward`` returns whatever that layer produces directly.

    Args:
        g: DGL graph object; stored and passed to every layer in ``forward``.
        in_feats: Input feature dimension.
        n_hidden: Hidden feature dimension.
        n_classes: Number of classes (output dimension).
        n_layers: Number of layers; ``n_layers - 1`` hidden layers are added
            between the input and output layers.
        activation: Activation function given to the input and hidden layers.
        dropout: Dropout probability.
    """

    def __init__(self,
                 g,           # DGL graph object
                 in_feats,    # input feature dimension
                 n_hidden,    # hidden feature dimension
                 n_classes,   # number of classes
                 n_layers,    # number of layers
                 activation,  # activation function
                 dropout      # dropout probability
                 ):
        super().__init__()
        self.g = g
        self.layers = nn.ModuleList()
        # Input layer.
        self.layers.append(GraphConv(in_feats, n_hidden, activation=activation))
        # Hidden layers.
        for _ in range(n_layers - 1):
            self.layers.append(GraphConv(n_hidden, n_hidden, activation=activation))
        # Output layer (no activation argument).
        self.layers.append(GraphConv(n_hidden, n_classes))
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, features):
        """Run ``features`` through every layer and return the last layer's output.

        Dropout is applied to the input of every layer except the first one,
        so the raw input features are never dropped.
        """
        h = features
        for idx, layer in enumerate(self.layers):
            if idx != 0:
                h = self.dropout(h)
            h = layer(self.g, h)
        return h
def evaluate(model, features, labels, mask):
    """Return the accuracy of ``model`` on the entries selected by ``mask``.

    The model is switched to evaluation mode (and left in it) and the forward
    pass runs with gradient tracking disabled.

    Args:
        model: Callable module; invoked as ``model(features)``.
        features: Input passed to the model unchanged.
        labels: Ground-truth class indices, indexed by ``mask``.
        mask: Index applied to both the model output and ``labels``.

    Returns:
        Fraction of selected entries whose highest-scoring class equals the
        label, as a Python float.  Returns ``0.0`` when ``mask`` selects
        nothing, instead of dividing by zero.
    """
    model.eval()
    with torch.no_grad():
        logits = model(features)[mask]
        labels = labels[mask]
        total = len(labels)
        if total == 0:
            # Nothing selected: accuracy is undefined, report 0.0 rather than
            # raising ZeroDivisionError.
            return 0.0
        predictions = logits.argmax(dim=1)
        correct = torch.sum(predictions == labels)
        return correct.item() / total
def train(n_epochs=100, lr=1e-2, weight_decay=5e-4, n_hidden=16, n_layers=1,
          activation=F.relu, dropout=0.5,
          checkpoint_path='models/best_model.pth'):
    """Train a GCN on the Cora graph and print its test accuracy.

    After every epoch the model is scored on the validation mask; whenever the
    validation accuracy improves, the weights are written to
    ``checkpoint_path``.  When training ends, the best saved weights are
    reloaded and scored on the test mask.

    Args:
        n_epochs: Number of full-graph training epochs.
        lr: Learning rate for Adam.
        weight_decay: Weight decay for Adam.
        n_hidden: Hidden feature dimension of the GCN.
        n_layers: Layer-count argument forwarded to ``GCN``.
        activation: Activation function forwarded to ``GCN``.
        dropout: Dropout probability forwarded to ``GCN``.
        checkpoint_path: File the best model weights are saved to.
    """
    data = CoraGraphDataset()
    # The dataset holds a single graph.  Per the accompanying article it has
    # 2708 nodes with 1433-dimensional features, split into 7 classes.
    g = data[0]
    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']  # selects the training nodes
    val_mask = g.ndata['val_mask']      # selects the validation nodes
    test_mask = g.ndata['test_mask']    # selects the test nodes
    in_feats = features.shape[1]
    n_classes = data.num_classes

    model = GCN(g,
                in_feats,
                n_hidden,
                n_classes,
                n_layers,
                activation,
                dropout)
    loss_fcn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 weight_decay=weight_decay)

    # torch.save does not create missing directories, so make sure the
    # checkpoint's parent directory exists before the first save.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint; otherwise a run that never beats 0 would later load a stale
    # file from a previous run (or fail because no file exists).
    best_val_acc = float('-inf')
    checkpoint_saved = False
    for epoch in range(n_epochs):
        model.train()
        logits = model(features)
        # The loss is computed on the training nodes only.
        loss = loss_fcn(logits[train_mask], labels[train_mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        acc = evaluate(model, features, labels, val_mask)
        print("Epoch {} | Loss {:.4f} | Accuracy {:.4f} ".format(epoch, loss.item(), acc))
        if acc > best_val_acc:
            best_val_acc = acc
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

    # With n_epochs == 0 nothing was saved; evaluate the untrained model
    # rather than loading a file this run did not produce.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path))
    acc = evaluate(model, features, labels, test_mask)
    print("Test accuracy {:.2%}".format(acc))
# Run training with the default hyperparameters when executed as a script.
if __name__ == '__main__':
    train()
運行結果:

......
