Graph Attention Networks (GAT): Code Walkthrough
1.1 Code Structure
.
|--- data # the Cora dataset
|--- models # GAT model definition (gat.py)
|--- pre_trained # pre-trained models
|--- utils # utility functions
1.2 Parameter Settings
GAT/execute_cora.py
# training params
batch_size = 1
nb_epochs = 100000
patience = 100
lr = 0.005 # learning rate
l2_coef = 0.0005 # weight decay
hid_units = [8] # number of hidden units per attention head in each layer
n_heads = [8, 1] # additional entry for the output layer
residual = False
nonlinearity = tf.nn.elu
model = GAT
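To make the two list parameters concrete, here is the arithmetic they imply (illustrative only, not repo code): hid_units and n_heads together describe one hidden layer plus the output layer.
# hid_units has one entry per hidden layer; n_heads has one extra entry for the output layer
nb_classes = 7                          # Cora has 7 paper categories
layer1_dim = n_heads[0] * hid_units[0]  # 8 heads * 8 units = 64 features per node
output_dim = nb_classes                 # n_heads[1] = 1 head; output heads are averaged, not concatenated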
1.3 Loading the Data
The GAT source code uses the Cora dataset by default. An introduction to the Cora-related code can be found here.
The data preprocessing is the same as in the GCN source code; see here.
After loading, adj is the adjacency matrix encoding the citation links among the 2708 papers, and features indicates whether each of the 1433 vocabulary words occurs in each of the 2708 papers.
GAT/utils/process.py
def load_data(dataset_str):
# ...
print(adj.shape) # (2708, 2708)
    print(features.shape) # (2708, 1433)
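Since Cora is a single graph, batch_size is 1 and a leading batch dimension is added before training. A minimal sketch of that preparation (the exact lines in execute_cora.py may differ slightly; adj_to_bias is the repo's mask helper from utils/process.py, discussed in section 1.5):
features, spars = process.preprocess_features(features)
features = features[np.newaxis]   # (1, 2708, 1433)
adj = adj.todense()[np.newaxis]   # (1, 2708, 2708)
# additive attention mask derived from the adjacency matrix (see 1.5)
biases = process.adj_to_bias(adj, [adj.shape[1]], nhood=1)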
1.4 Feature Preprocessing
GAT/utils/process.py
import numpy as np
import scipy.sparse as sp

def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation"""
    rowsum = np.array(features.sum(1))      # per-row sums, shape (N, 1)
    r_inv = np.power(rowsum, -1).flatten()  # 1 / rowsum
    r_inv[np.isinf(r_inv)] = 0.             # guard against all-zero rows
    r_mat_inv = sp.diags(r_inv)             # diagonal matrix holding 1 / rowsum
    features = r_mat_inv.dot(features)      # D^-1 X: each row now sums to 1
    # sparse_to_tuple is defined elsewhere in process.py
    return features.todense(), sparse_to_tuple(features)
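A quick toy check of what the row normalization does (illustrative, not from the repo):
import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix([[1., 1., 0.],
                   [0., 2., 2.]])
rowsum = np.array(X.sum(1))             # [[2.], [4.]]
r_inv = np.power(rowsum, -1).flatten()  # [0.5, 0.25]
r_inv[np.isinf(r_inv)] = 0.
print(sp.diags(r_inv).dot(X).toarray())
# [[0.5  0.5  0. ]
#  [0.   0.5  0.5]]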
1.5 Model Definition: Forward Propagation
Adding a single-head attention layer
'''
Input: (B, N, D), where B is the batch size, N the number of nodes, and D
the original feature dimension of each node.
Output: (B, N, F), where F is the new feature dimension of each node.
Each node goes from dimension D to dimension F by aggregating its
neighbors' features, weighted by attention.
'''
def attn_head(seq, out_sz, bias_mat, activation, in_drop = 0.0, coef_drop = 0.0, residual = False):
    '''
    seq: input (B, N, D); B is the batch size, N the number of nodes, D the original feature dimension
    out_sz: output feature dimension per node, denoted F
    bias_mat: (N, N) mask matrix
    activation: activation function
    in_drop: dropout rate on the input
    coef_drop: dropout rate on the attention matrix
    residual: whether to use a residual connection
    '''
    with tf.name_scope('my_attn'):
        # dropout against overfitting; the layer is skipped when the rate is 0
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)  # TF1 keep_prob convention
        '''
        To obtain sufficient expressive power to transform the input features
        into higher-level features, at least one learnable linear transformation
        is required. As a first step, we therefore learn a weight matrix W and
        project the features with it,
        implementing seq_fts = Wh, i.e. the dimension change for every node
        '''
        # D => F: a kernel-size-1 conv1d applies the same W to every node
        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)
        '''
        Implements f_1 = a1(Whi) and f_2 = a2(Whj).
        f_1 plus the transpose of f_2 gives logits, i.e. eij = a1(Whi) + a2(Whj).
        eij goes through LeakyReLU and softmax to produce the paper's aij,
        the attention of node i to node j.
        bias_mat keeps the attention of non-neighbor pairs out of the softmax;
        only attention between neighbors enters it, which keeps the attention local
        '''
        # (B, N, F) => (B, N, 1)
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        # (B, N, F) => (B, N, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)
        # (B, N, 1) + (B, 1, N) => (B, N, N) via broadcasting
        # logits is the matrix of eij
        logits = f_1 + tf.transpose(f_2, [0, 2, 1])
        # (B, N, N) + (1, N, N) => (B, N, N) => softmax => (B, N, N)
        # TensorFlow broadcasting is used here
        # logits is not a symmetric matrix, because f_1 and f_2 do not share
        # parameters: logits_{i,j} = a1(Whi) + a2(Whj)
        # coefs = (aij)_{N*N} is the attention coefficient matrix
        # bias_mat implements the mask idea, preserving the graph structure
        coefs = tf.nn.softmax(tf.nn.leaky_relu(logits) + bias_mat)
        # dropout on the attention coefficients and on the projected inputs
        if coef_drop != 0.0:
            coefs = tf.nn.dropout(coefs, 1.0 - coef_drop)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)
        '''
        Implements hi = sum_j(aij * Whj),
        i.e. node i aggregates its neighbors' features weighted by attention
        '''
        # (B, N, N) * (B, N, F) => (B, N, F)
        vals = tf.matmul(coefs, seq_fts)
        # add a bias term
        ret = tf.contrib.layers.bias_add(vals)
        '''
        Add the residual connection, then apply the activation.
        If the input (B, N, D) and the aggregated output (B, N, F) have the
        same last dimension, they are added directly;
        otherwise (B, N, D) is first linearly mapped to (B, N, F)
        '''
        # residual connection
        if residual:
            # D != F: project the input before adding
            if seq.shape[-1] != ret.shape[-1]:
                ret = ret + tf.layers.conv1d(seq, ret.shape[-1], 1)
            else:
                ret = ret + seq
        return activation(ret)
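The bias_mat passed into attn_head is derived from the adjacency matrix; utils/process.py provides an adj_to_bias helper for this. Below is only a minimal single-graph sketch of the idea, not the repo's exact implementation:
import numpy as np

def adj_to_bias_sketch(adj):
    # keep self-loops plus 1-hop neighbors; all other entries get a large
    # negative value so that softmax assigns them (near-)zero attention
    mask = ((adj + np.eye(adj.shape[0])) > 0).astype(np.float32)
    return -1e9 * (1.0 - mask)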
Model definition
class BaseGAttN:
    def loss(logits, labels, nb_classes, class_weights):
        # per-sample weight, looked up from class_weights via the one-hot label
        sample_wts = tf.reduce_sum(tf.multiply(tf.one_hot(labels, nb_classes), class_weights), axis=-1)
        # class-weighted softmax cross entropy
        xentropy = tf.multiply(tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=logits), sample_wts)
        return tf.reduce_mean(xentropy, name='xentropy_mean')
    def training(loss, lr, l2_coef):
        # weight decay: L2 penalty over the trainable variables
        # (note: this name filter only excludes a variable whose full name is
        # exactly 'bias', 'gamma', etc., so in practice it rarely excludes anything)
        vars = tf.trainable_variables()
        lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in vars
                           if v.name not in ['bias', 'gamma', 'b', 'g', 'beta']]) * l2_coef
        # optimizer
        opt = tf.train.AdamOptimizer(learning_rate = lr)
        # training op: minimize cross entropy plus weight decay
        train_op = opt.minimize(loss + lossL2)
        return train_op
    def masked_softmax_cross_entropy(logits, labels, mask):
        '''
        Softmax cross-entropy loss with masking.
        logits: model outputs, shape (B, C); B is the number of samples, C the output dimension
        labels: ground-truth labels, shape (B, C)
        mask: mask, shape (B,)
        '''
        # softmax turns logits into a distribution, then cross entropy with labels
        # loss has shape (B,)
        loss = tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = labels)
        # cast to tf.float32
        mask = tf.cast(mask, dtype = tf.float32)
        # normalize the mask so it averages to 1 (see the numeric check after this class)
        mask /= tf.reduce_mean(mask)
        # zero out the loss of masked-out samples
        loss *= mask
        # return the mean loss
        return tf.reduce_mean(loss)
    def masked_sigmoid_cross_entropy(logits, labels, mask):
        '''
        Sigmoid cross-entropy loss with masking (for multi-label targets).
        logits: (B, C), model outputs; B is the number of samples, C the output dimension
        labels: (B, C), ground-truth labels
        mask: mask, shape (B,)
        '''
        labels = tf.cast(labels, dtype = tf.float32)
        # loss has shape (B, C): one sigmoid cross entropy per label
        loss = tf.nn.sigmoid_cross_entropy_with_logits(logits = logits, labels = labels)
        # (B, C) => (B,)
        loss = tf.reduce_mean(loss, axis = 1)
        mask = tf.cast(mask, dtype = tf.float32)
        mask /= tf.reduce_mean(mask)
        loss *= mask
        return tf.reduce_mean(loss)
    def masked_accuracy(logits, labels, mask):
        '''
        Accuracy with masking.
        logits: (B, C), model outputs; B is the number of samples, C the output dimension
        labels: (B, C), ground-truth labels
        mask: mask, shape (B,)
        '''
        # a prediction is correct when the argmax of logits matches the argmax of labels
        correct_prediction = tf.equal( tf.argmax(logits, 1), tf.argmax(labels, 1) )
        accuracy_all = tf.cast( correct_prediction, tf.float32 )
        mask = tf.cast( mask, dtype = tf.float32 )
        mask /= tf.reduce_mean(mask)
        accuracy_all *= mask
        return tf.reduce_mean(accuracy_all)
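The mask renormalization shared by the three masked functions above deserves a quick numeric check (illustrative, not from the repo): dividing the mask by its mean makes the final reduce_mean over all B samples equal the plain mean over just the kept samples.
import numpy as np

loss = np.array([0.3, 0.7, 9.9, 9.9])  # the last two samples are masked out
mask = np.array([1., 1., 0., 0.])
mask = mask / mask.mean()               # mean is 0.5, so mask becomes [2., 2., 0., 0.]
print((loss * mask).mean())             # 0.5 == (0.3 + 0.7) / 2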
class GAT(BaseGAttN):
    def inference(inputs, nb_classes, nb_nodes, training, attn_drop, ffd_drop, bias_mat,
                  hid_units, n_heads, activation = tf.nn.elu, residual = False):
        '''
        inputs: (B, N, D); B is the batch size, N the number of nodes, D the original feature dimension
        nb_classes: number of classes for the classification task, denoted C
        nb_nodes: number of nodes, denoted N
        training: flag distinguishing the training and test phases
        attn_drop: dropout rate on the attention matrix, against overfitting
        ffd_drop: dropout rate on the input, against overfitting
        bias_mat: an (N, N) matrix derived from the adjacency matrix A, used as the attention mask
        hid_units: list; the i-th element is the number of hidden units per attention head in layer i
        n_heads: list; the i-th element is the number of attention heads in layer i
        activation: activation function
        residual: whether to use residual connections
        '''
        '''
        First layer: H1 attention heads; each head takes (B, N, D) as input
        and outputs (B, N, F1). All head outputs are concatenated into (B, N, F1*H1)
        '''
        attns = []
        # n_heads[0] = number of heads in the first layer, denoted H1
        for i in range(n_heads[0]):
            attns.append(
                attn_head(inputs, bias_mat = bias_mat,
                          out_sz = hid_units[0], activation = activation,
                          in_drop = ffd_drop, coef_drop = attn_drop, residual = False)
            )
        # [(B, N, F1), (B, N, F1), ..] => (B, N, F1 * H1)
        h_1 = tf.concat(attns, axis = -1)  # concatenate the heads
        '''
        Middle layers: there are len(hid_units)-1 of them.
        Layer i has Hi attention heads; its input is the previous layer's
        concatenation, each head outputs (B, N, Fi), and all heads are again
        concatenated into (B, N, Fi*Hi)
        '''
        # loop over the middle layers
        for i in range(1, len(hid_units)):
            h_old = h_1  # unused
            attns = []
            # n_heads[i] = number of heads in middle layer i, denoted Hi
            for _ in range(n_heads[i]):
                attns.append(
                    attn_head(h_1, bias_mat = bias_mat,
                              out_sz = hid_units[i], activation = activation,
                              in_drop = ffd_drop, coef_drop = attn_drop, residual = residual)
                )
            # [(B, N, Fi), (B, N, Fi), ..] => (B, N, Fi*Hi)
            h_1 = tf.concat(attns, axis = -1)  # concatenate the heads
        '''
        Output layer: n_heads[-1] attention heads, usually 1.
        Input: the previous layer's output (B, N, Fi*Hi)
        Output: (B, N, C), where C is the number of classes
        '''
        out = []
        for i in range(n_heads[-1]):
            out.append(
                attn_head(h_1, bias_mat = bias_mat,
                          out_sz = nb_classes, activation = lambda x: x,
                          in_drop = ffd_drop, coef_drop = attn_drop, residual = False)
            )
        # average the output heads instead of concatenating them
        logits = tf.add_n(out) / n_heads[-1]
        return logits
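For completeness, here is a minimal sketch of how these pieces are wired into a training graph, in the style of execute_cora.py (the placeholder names are illustrative, and the real script feeds the dropout rates through placeholders rather than the constants used here):
ftr_in = tf.placeholder(tf.float32, shape=(batch_size, nb_nodes, ft_size))
bias_in = tf.placeholder(tf.float32, shape=(batch_size, nb_nodes, nb_nodes))
lbl_in = tf.placeholder(tf.int32, shape=(batch_size, nb_nodes, nb_classes))
msk_in = tf.placeholder(tf.int32, shape=(batch_size, nb_nodes))
is_train = tf.placeholder(tf.bool, shape=())

logits = model.inference(ftr_in, nb_classes, nb_nodes, is_train,
                         attn_drop=0.6, ffd_drop=0.6, bias_mat=bias_in,
                         hid_units=hid_units, n_heads=n_heads,
                         activation=nonlinearity, residual=residual)
# flatten the batch and node dimensions, then apply the masked loss/metrics
log_resh = tf.reshape(logits, [-1, nb_classes])
lab_resh = tf.reshape(lbl_in, [-1, nb_classes])
msk_resh = tf.reshape(msk_in, [-1])
loss = model.masked_softmax_cross_entropy(log_resh, lab_resh, msk_resh)
accuracy = model.masked_accuracy(log_resh, lab_resh, msk_resh)
train_op = model.training(loss, lr, l2_coef)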