Graph Attention Networks (GAT): Code Walkthrough
1.1 Code Structure
.
|--- data # the Cora dataset
|--- models # GAT model definition (gat.py)
|--- pre_trained # pre-trained models
|--- utils # utility functions
1.2 Parameter Settings
GAT/execute_cora.py
# training params
batch_size = 1
nb_epochs = 100000
patience = 100
lr = 0.005 # learning rate
l2_coef = 0.0005 # weight decay
hid_units = [8] # number of hidden units per attention head in each layer
n_heads = [8, 1] # additional entry for the output layer
residual = False
nonlinearity = tf.nn.elu
model = GAT
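To make the two list parameters concrete, here is the arithmetic they imply (illustrative only, not repo code): hid_units and n_heads together describe one hidden layer plus the output layer.
# hid_units has one entry per hidden layer; n_heads has one extra entry for the output layer
nb_classes = 7                          # Cora has 7 paper categories
layer1_dim = n_heads[0] * hid_units[0]  # 8 heads * 8 units = 64 features per node
output_dim = nb_classes                 # n_heads[1] = 1 head; output heads are averaged, not concatenated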
1.3 Loading the Data
The GAT source code uses the Cora dataset by default. An introduction to the Cora-related code can be found here.
The data preprocessing is the same as in the GCN source code; see here.
After loading, adj is the adjacency matrix encoding the citation links among the 2708 papers, and features indicates whether each of the 1433 vocabulary words occurs in each of the 2708 papers.
GAT/utils/process.py
def load_data(dataset_str):
# ...
print(adj.shape) # (2708, 2708)
    print(features.shape) # (2708, 1433)
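Since Cora is a single graph, batch_size is 1 and a leading batch dimension is added before training. A minimal sketch of that preparation (the exact lines in execute_cora.py may differ slightly; adj_to_bias is the repo's mask helper from utils/process.py, discussed in section 1.5):
features, spars = process.preprocess_features(features)
features = features[np.newaxis]   # (1, 2708, 1433)
adj = adj.todense()[np.newaxis]   # (1, 2708, 2708)
# additive attention mask derived from the adjacency matrix (see 1.5)
biases = process.adj_to_bias(adj, [adj.shape[1]], nhood=1)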
1.4 Feature Preprocessing
GAT/utils/process.py
import numpy as np
import scipy.sparse as sp

def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation"""
    rowsum = np.array(features.sum(1))      # per-row sums, shape (N, 1)
    r_inv = np.power(rowsum, -1).flatten()  # 1 / rowsum
    r_inv[np.isinf(r_inv)] = 0.             # guard against all-zero rows
    r_mat_inv = sp.diags(r_inv)             # diagonal matrix holding 1 / rowsum
    features = r_mat_inv.dot(features)      # D^-1 X: each row now sums to 1
    # sparse_to_tuple is defined elsewhere in process.py
    return features.todense(), sparse_to_tuple(features)
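A quick toy check of what the row normalization does (illustrative, not from the repo):
import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix([[1., 1., 0.],
                   [0., 2., 2.]])
rowsum = np.array(X.sum(1))             # [[2.], [4.]]
r_inv = np.power(rowsum, -1).flatten()  # [0.5, 0.25]
r_inv[np.isinf(r_inv)] = 0.
print(sp.diags(r_inv).dot(X).toarray())
# [[0.5  0.5  0. ]
#  [0.   0.5  0.5]]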
1.5 Model Definition: Forward Propagation
Adding a single-head attention layer
'''
Input: (B, N, D), where B is the batch size, N the number of nodes, and D
the original feature dimension of each node.
Output: (B, N, F), where F is the new feature dimension of each node.
Each node goes from dimension D to dimension F by aggregating its
neighbors' features, weighted by attention.
'''
def attn_head(seq, out_sz, bias_mat, activation, in_drop = 0.0, coef_drop = 0.0, residual = False):
    '''
    seq: input (B, N, D); B is the batch size, N the number of nodes, D the original feature dimension
    out_sz: output feature dimension per node, denoted F
    bias_mat: (N, N) mask matrix
    activation: activation function
    in_drop: dropout rate on the input
    coef_drop: dropout rate on the attention matrix
    residual: whether to use a residual connection
    '''
    with tf.name_scope('my_attn'):
        # dropout against overfitting; the layer is skipped when the rate is 0
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)  # TF1 keep_prob convention
        '''
        To obtain sufficient expressive power to transform the input features
        into higher-level features, at least one learnable linear transformation
        is required. As a first step, we therefore learn a weight matrix W and
        project the features with it,
        implementing seq_fts = Wh, i.e. the dimension change for every node
        '''
        # D => F: a kernel-size-1 conv1d applies the same W to every node
        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)
        '''
        Implements f_1 = a1(Whi) and f_2 = a2(Whj).
        f_1 plus the transpose of f_2 gives logits, i.e. eij = a1(Whi) + a2(Whj).
        eij goes through LeakyReLU and softmax to produce the paper's aij,
        the attention of node i to node j.
        bias_mat keeps the attention of non-neighbor pairs out of the softmax;
        only attention between neighbors enters it, which keeps the attention local
        '''
        # (B, N, F) => (B, N, 1)
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        # (B, N, F) => (B, N, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)
        # (B, N, 1) + (B, 1, N) => (B, N, N) via broadcasting
        # logits is the matrix of eij
        logits = f_1 + tf.transpose(f_2, [0, 2, 1])
        # (B, N, N) + (1, N, N) => (B, N, N) => softmax => (B, N, N)
        # TensorFlow broadcasting is used here
        # logits is not a symmetric matrix, because f_1 and f_2 do not share
        # parameters: logits_{i,j} = a1(Whi) + a2(Whj)
        # coefs = (aij)_{N*N} is the attention coefficient matrix
        # bias_mat implements the mask idea, preserving the graph structure
        coefs = tf.nn.softmax(tf.nn.leaky_relu(logits) + bias_mat)
        # dropout on the attention coefficients and on the projected inputs
        if coef_drop != 0.0:
            coefs = tf.nn.dropout(coefs, 1.0 - coef_drop)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)
        '''
        Implements hi = sum_j(aij * Whj),
        i.e. node i aggregates its neighbors' features weighted by attention
        '''
        # (B, N, N) * (B, N, F) => (B, N, F)
        vals = tf.matmul(coefs, seq_fts)
        # add a bias term
        ret = tf.contrib.layers.bias_add(vals)
        '''
        Add the residual connection, then apply the activation.
        If the input (B, N, D) and the aggregated output (B, N, F) have the
        same last dimension, they are added directly;
        otherwise (B, N, D) is first linearly mapped to (B, N, F)
        '''
        # residual connection
        if residual:
            # D != F: project the input before adding
            if seq.shape[-1] != ret.shape[-1]:
                ret = ret + tf.layers.conv1d(seq, ret.shape[-1], 1)
            else:
                ret = ret + seq
        return activation(ret)
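The bias_mat passed into attn_head is derived from the adjacency matrix; utils/process.py provides an adj_to_bias helper for this. Below is only a minimal single-graph sketch of the idea, not the repo's exact implementation:
import numpy as np

def adj_to_bias_sketch(adj):
    # keep self-loops plus 1-hop neighbors; all other entries get a large
    # negative value so that softmax assigns them (near-)zero attention
    mask = ((adj + np.eye(adj.shape[0])) > 0).astype(np.float32)
    return -1e9 * (1.0 - mask)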
Model definition
class BaseGAttN:
    def loss(logits, labels, nb_classes, class_weights):
        # per-sample weight, looked up from class_weights via the one-hot label
        sample_wts = tf.reduce_sum(tf.multiply(tf.one_hot(labels, nb_classes), class_weights), axis=-1)
        # class-weighted softmax cross entropy
        xentropy = tf.multiply(tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=logits), sample_wts)
        return tf.reduce_mean(xentropy, name='xentropy_mean')
    def training(loss, lr, l2_coef):
        # weight decay: L2 penalty over the trainable variables
        # (note: this name filter only excludes a variable whose full name is
        # exactly 'bias', 'gamma', etc., so in practice it rarely excludes anything)
        vars = tf.trainable_variables()
        lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in vars
                           if v.name not in ['bias', 'gamma', 'b', 'g', 'beta']]) * l2_coef
        # optimizer
        opt = tf.train.AdamOptimizer(learning_rate = lr)
        # training op: minimize cross entropy plus weight decay
        train_op = opt.minimize(loss + lossL2)
        return train_op
    def masked_softmax_cross_entropy(logits, labels, mask):
        '''
        Softmax cross-entropy loss with masking.
        logits: model outputs, shape (B, C); B is the number of samples, C the output dimension
        labels: ground-truth labels, shape (B, C)
        mask: mask, shape (B,)
        '''
        # softmax turns logits into a distribution, then cross entropy with labels
        # loss has shape (B,)
        loss = tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = labels)
        # cast to tf.float32
        mask = tf.cast(mask, dtype = tf.float32)
        # normalize the mask so it averages to 1 (see the numeric check after this class)
        mask /= tf.reduce_mean(mask)
        # zero out the loss of masked-out samples
        loss *= mask
        # return the mean loss
        return tf.reduce_mean(loss)
    def masked_sigmoid_cross_entropy(logits, labels, mask):
        '''
        Sigmoid cross-entropy loss with masking (for multi-label targets).
        logits: (B, C), model outputs; B is the number of samples, C the output dimension
        labels: (B, C), ground-truth labels
        mask: mask, shape (B,)
        '''
        labels = tf.cast(labels, dtype = tf.float32)
        # loss has shape (B, C): one sigmoid cross entropy per label
        loss = tf.nn.sigmoid_cross_entropy_with_logits(logits = logits, labels = labels)
        # (B, C) => (B,)
        loss = tf.reduce_mean(loss, axis = 1)
        mask = tf.cast(mask, dtype = tf.float32)
        mask /= tf.reduce_mean(mask)
        loss *= mask
        return tf.reduce_mean(loss)
    def masked_accuracy(logits, labels, mask):
        '''
        Accuracy with masking.
        logits: (B, C), model outputs; B is the number of samples, C the output dimension
        labels: (B, C), ground-truth labels
        mask: mask, shape (B,)
        '''
        # a prediction is correct when the argmax of logits matches the argmax of labels
        correct_prediction = tf.equal( tf.argmax(logits, 1), tf.argmax(labels, 1) )
        accuracy_all = tf.cast( correct_prediction, tf.float32 )
        mask = tf.cast( mask, dtype = tf.float32 )
        mask /= tf.reduce_mean(mask)
        accuracy_all *= mask
        return tf.reduce_mean(accuracy_all)
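The mask renormalization shared by the three masked functions above deserves a quick numeric check (illustrative, not from the repo): dividing the mask by its mean makes the final reduce_mean over all B samples equal the plain mean over just the kept samples.
import numpy as np

loss = np.array([0.3, 0.7, 9.9, 9.9])  # the last two samples are masked out
mask = np.array([1., 1., 0., 0.])
mask = mask / mask.mean()               # mean is 0.5, so mask becomes [2., 2., 0., 0.]
print((loss * mask).mean())             # 0.5 == (0.3 + 0.7) / 2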
class GAT(BaseGAttN):
    def inference(inputs, nb_classes, nb_nodes, training, attn_drop, ffd_drop, bias_mat,
                  hid_units, n_heads, activation = tf.nn.elu, residual = False):
        '''
        inputs: (B, N, D); B is the batch size, N the number of nodes, D the original feature dimension
        nb_classes: number of classes for the classification task, denoted C
        nb_nodes: number of nodes, denoted N
        training: flag distinguishing the training and test phases
        attn_drop: dropout rate on the attention matrix, against overfitting
        ffd_drop: dropout rate on the input, against overfitting
        bias_mat: an (N, N) matrix derived from the adjacency matrix A, used as the attention mask
        hid_units: list; the i-th element is the number of hidden units per attention head in layer i
        n_heads: list; the i-th element is the number of attention heads in layer i
        activation: activation function
        residual: whether to use residual connections
        '''
        '''
        First layer: H1 attention heads; each head takes (B, N, D) as input
        and outputs (B, N, F1). All head outputs are concatenated into (B, N, F1*H1)
        '''
        attns = []
        # n_heads[0] = number of heads in the first layer, denoted H1
        for i in range(n_heads[0]):
            attns.append(
                attn_head(inputs, bias_mat = bias_mat,
                          out_sz = hid_units[0], activation = activation,
                          in_drop = ffd_drop, coef_drop = attn_drop, residual = False)
            )
        # [(B, N, F1), (B, N, F1), ..] => (B, N, F1 * H1)
        h_1 = tf.concat(attns, axis = -1)  # concatenate the heads
        '''
        Middle layers: there are len(hid_units)-1 of them.
        Layer i has Hi attention heads; its input is the previous layer's
        concatenation, each head outputs (B, N, Fi), and all heads are again
        concatenated into (B, N, Fi*Hi)
        '''
        # loop over the middle layers
        for i in range(1, len(hid_units)):
            h_old = h_1  # unused
            attns = []
            # n_heads[i] = number of heads in middle layer i, denoted Hi
            for _ in range(n_heads[i]):
                attns.append(
                    attn_head(h_1, bias_mat = bias_mat,
                              out_sz = hid_units[i], activation = activation,
                              in_drop = ffd_drop, coef_drop = attn_drop, residual = residual)
                )
            # [(B, N, Fi), (B, N, Fi), ..] => (B, N, Fi*Hi)
            h_1 = tf.concat(attns, axis = -1)  # concatenate the heads
        '''
        Output layer: n_heads[-1] attention heads, usually 1.
        Input: the previous layer's output (B, N, Fi*Hi)
        Output: (B, N, C), where C is the number of classes
        '''
        out = []
        for i in range(n_heads[-1]):
            out.append(
                attn_head(h_1, bias_mat = bias_mat,
                          out_sz = nb_classes, activation = lambda x: x,
                          in_drop = ffd_drop, coef_drop = attn_drop, residual = False)
            )
        # average the output heads instead of concatenating them
        logits = tf.add_n(out) / n_heads[-1]
        return logits
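For completeness, here is a minimal sketch of how these pieces are wired into a training graph, in the style of execute_cora.py (the placeholder names are illustrative, and the real script feeds the dropout rates through placeholders rather than the constants used here):
ftr_in = tf.placeholder(tf.float32, shape=(batch_size, nb_nodes, ft_size))
bias_in = tf.placeholder(tf.float32, shape=(batch_size, nb_nodes, nb_nodes))
lbl_in = tf.placeholder(tf.int32, shape=(batch_size, nb_nodes, nb_classes))
msk_in = tf.placeholder(tf.int32, shape=(batch_size, nb_nodes))
is_train = tf.placeholder(tf.bool, shape=())

logits = model.inference(ftr_in, nb_classes, nb_nodes, is_train,
                         attn_drop=0.6, ffd_drop=0.6, bias_mat=bias_in,
                         hid_units=hid_units, n_heads=n_heads,
                         activation=nonlinearity, residual=residual)
# flatten the batch and node dimensions, then apply the masked loss/metrics
log_resh = tf.reshape(logits, [-1, nb_classes])
lab_resh = tf.reshape(lbl_in, [-1, nb_classes])
msk_resh = tf.reshape(msk_in, [-1])
loss = model.masked_softmax_cross_entropy(log_resh, lab_resh, msk_resh)
accuracy = model.masked_accuracy(log_resh, lab_resh, msk_resh)
train_op = model.training(loss, lr, l2_coef)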