文本分類的一種對抗訓練方法

本文轉載自查看原文 2019-04-01 15:52 1004 文本分類

最近閱讀了有關文本分類的文章，其中有一篇名為《Adversarial Training Methods for Semi-Supervised Text Classification》，其主要思路是在文本訓練時增加一個擾動因子，即在embedding層加入一個小的擾動，發現訓練的結果比不加擾動要好很多。

模型的網絡結構如下圖：

下面就介紹一下這個對抗因子r的生成過程：

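在進入LSTM網絡前，先進行從 w 到 v 的計算，即按詞頻對 word embedding 做歸一化：

$$\bar{v}_k = \frac{v_k - \mathrm{E}(v)}{\sqrt{\mathrm{Var}(v)}},\qquad \mathrm{E}(v) = \sum_{j=1}^{K} f_j v_j,\qquad \mathrm{Var}(v) = \sum_{j=1}^{K} f_j \left(v_j - \mathrm{E}(v)\right)^2$$

其中 $v_k$ 為第 k 個詞的詞向量，$f_j$ 為第 j 個詞的歸一化詞頻，K 為詞表大小。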

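然后定義模型的損失函數，令輸入為 x，參數為 θ，r_adv 為對抗擾動，損失函數為：

$$L_{adv}(\theta) = -\log p\left(y \mid x + r_{adv};\ \theta\right),\qquad r_{adv} = \arg\min_{r,\ \|r\| \le \epsilon} \log p\left(y \mid x + r;\ \hat{\theta}\right)$$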

其中一個細節：θ̂ 雖然是當前參數 θ 的一份復制，但它只用於計算擾動，在這一步中被視為常數，梯度不會經由它參與反向傳播。

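然后就是求擾動：

$$r_{adv} = -\epsilon \frac{g}{\|g\|_2},\qquad g = \nabla_x \log p\left(y \mid x;\ \hat{\theta}\right)$$

先對表達式求導得到梯度 g，然后對梯度 g 按 L2 範數做歸一化，再乘以系數 ε 進行縮放。由於損失是負對數似然，這等價於沿損失函數對 x 的梯度方向加上長度為 ε 的擾動。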

至此擾動計算完成，然后將它加到之前的 word embedding 上，參與模型訓練。

下面則是模型的代碼部分：

#構建adversarailLSTM模型

class AdversarailLSTM(object):
    """Bi-LSTM + attention binary classifier with adversarial training.

    The network is built twice over one shared set of variables: once on the
    frequency-normalized word embeddings (clean pass) and once on the same
    embeddings plus an adversarial perturbation (adversarial pass).

    Attributes created by the constructor:
        inputX: int32 placeholder, [batch, config.sequenceLength] word ids.
        inputY: float32 placeholder, [batch, 1] labels.
        dropoutKeepProb: float32 placeholder, dropout keep probability.
        embeddedWords: normalized embeddings looked up for ``inputX``.
        predictions: logits of the clean pass, [batch, 1].
        binaryPreds: 0./1. decisions derived from ``predictions``.
        alpha: attention weights of the clean pass.
        current_state: final RNN state of the clean pass.
        loss: clean cross-entropy loss + adversarial cross-entropy loss.
    """

    def __init__(self, config, wordEmbedding, indexFreqs):
        """Build the full training graph.

        Args:
            config: configuration object; this class reads
                ``config.sequenceLength``, ``config.model.hiddenSizes`` and
                ``config.model.epsilon``.
            wordEmbedding: pre-trained embedding matrix, one row per word id.
            indexFreqs: per-word-id frequency counts, same length as the
                number of rows in ``wordEmbedding``. Not modified.
        """
        # Keep the configuration on the instance so the other methods do not
        # depend on a module-level ``config`` global.
        self.config = config

        # Input placeholders.
        self.inputX = tf.placeholder(tf.int32, [None, config.sequenceLength], name="inputX")
        self.inputY = tf.placeholder(tf.float32, [None, 1], name="inputY")

        self.dropoutKeepProb = tf.placeholder(tf.float32, name="dropoutKeepProb")

        # Turn word frequencies into normalized weights. Work on a copy so the
        # caller's sequence is not mutated.
        # NOTE(review): ids 0 and 1 are presumably the padding / unknown
        # tokens, given fixed pseudo-counts here -- confirm against the
        # vocabulary builder.
        indexFreqs = list(indexFreqs)
        indexFreqs[0], indexFreqs[1] = 20000, 10000
        weights = tf.cast(tf.reshape(indexFreqs / tf.reduce_sum(indexFreqs), [1, len(indexFreqs)]), dtype=tf.float32)

        # Embedding layer.
        with tf.name_scope("wordEmbedding"):
            # The pre-trained vectors are normalized and used as a constant
            # tensor (not a tf.Variable), so they are not fine-tuned.
            normWordEmbedding = self._normalize(tf.cast(wordEmbedding, dtype=tf.float32, name="word2vec"), weights)
            self.embeddedWords = tf.nn.embedding_lookup(normWordEmbedding, self.inputX)

        # Clean pass: binary cross-entropy on the unperturbed embeddings.
        with tf.name_scope("loss"):
            with tf.variable_scope("Bi-LSTM", reuse=None):
                self.predictions = self._Bi_LSTMAttention(self.embeddedWords)
                # ``predictions`` are logits, so the p >= 0.5 decision
                # boundary is logit >= 0 (sigmoid(0) == 0.5).
                self.binaryPreds = tf.cast(tf.greater_equal(self.predictions, 0.0), tf.float32, name="binaryPreds")
                losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.predictions, labels=self.inputY)
                loss = tf.reduce_mean(losses)

        # The adversarial pass below overwrites ``self.alpha`` and
        # ``self.current_state`` as a side effect; remember the clean ones.
        cleanAlpha = self.alpha
        cleanState = self.current_state

        # Adversarial pass: same network (reuse=True) on perturbed embeddings.
        with tf.name_scope("perturloss"):
            with tf.variable_scope("Bi-LSTM", reuse=True):
                perturWordEmbedding = self._addPerturbation(self.embeddedWords, loss)
                print("perturbSize:{}".format(perturWordEmbedding))
                perturPredictions = self._Bi_LSTMAttention(perturWordEmbedding)
                perturLosses = tf.nn.sigmoid_cross_entropy_with_logits(logits=perturPredictions, labels=self.inputY)
                perturLoss = tf.reduce_mean(perturLosses)

        self.alpha = cleanAlpha
        self.current_state = cleanState

        self.loss = loss + perturLoss

    def _Bi_LSTMAttention(self, embeddedWords):
        """Run the stacked Bi-LSTM + attention + output layer.

        Must be called inside a ``tf.variable_scope``; every trainable
        variable is created through that scope so that a second call with
        ``reuse=True`` shares the weights of the first call.

        Args:
            embeddedWords: embedded input sequence fed to the RNN.

        Returns:
            Logits produced by the final fully connected layer, [batch, 1].

        Side effects:
            Sets ``self.current_state`` and (through ``attention``)
            ``self.alpha`` to the tensors of this call.
        """
        hiddenSizes = self.config.model.hiddenSizes

        # Stacked bidirectional LSTM.
        with tf.name_scope("Bi-LSTM"):
            fwHiddenLayers = []
            bwHiddenLayers = []
            for idx, hiddenSize in enumerate(hiddenSizes):
                with tf.name_scope("Bi-LSTM" + str(idx)):
                    # Forward cell.
                    lstmFwCell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                                                               output_keep_prob=self.dropoutKeepProb)

                    # Backward cell.
                    lstmBwCell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                                                               output_keep_prob=self.dropoutKeepProb)

                fwHiddenLayers.append(lstmFwCell)
                bwHiddenLayers.append(lstmBwCell)

            # Multi-layer LSTM; with state_is_tuple=True the state is kept as
            # an (h, c)-style tuple per layer instead of being concatenated.
            fwMultiLstm = tf.nn.rnn_cell.MultiRNNCell(cells=fwHiddenLayers, state_is_tuple=True)
            bwMultiLstm = tf.nn.rnn_cell.MultiRNNCell(cells=bwHiddenLayers, state_is_tuple=True)

            # outputs is a tuple (output_fw, output_bw); current_state is the
            # tuple (state_fw, state_bw) of final states.
            # The RNN consumes the ``embeddedWords`` argument (not
            # ``self.embeddedWords``) so the adversarial pass really sees the
            # perturbed embeddings. The scope name matches the one the
            # original code derived from the last loop index.
            outputs, self.current_state = tf.nn.bidirectional_dynamic_rnn(fwMultiLstm, bwMultiLstm,
                                                                          embeddedWords, dtype=tf.float32,
                                                                          scope="bi-lstm" + str(len(hiddenSizes) - 1))

        # As in the Bi-LSTM + attention paper, sum forward and backward outputs.
        with tf.name_scope("Attention"):
            H = outputs[0] + outputs[1]

            # Attention-pooled sentence representation.
            output = self.attention(H)
            outputSize = hiddenSizes[-1]
            print("outputSize:{}".format(outputSize))

        # Fully connected output layer.
        with tf.name_scope("output"):
            outputW = tf.get_variable(
                "outputW",
                shape=[outputSize, 1],
                initializer=tf.contrib.layers.xavier_initializer())

            # tf.get_variable (not tf.Variable) so the bias is shared between
            # the clean and the adversarial pass.
            outputB = tf.get_variable(
                "outputB",
                shape=[1],
                initializer=tf.constant_initializer(0.1))

            predictions = tf.nn.xw_plus_b(output, outputW, outputB, name="predictions")

            return predictions

    def attention(self, H):
        """Pool the Bi-LSTM outputs into one sentence vector via attention.

        Args:
            H: RNN outputs; reshaped below as
                [batch, config.sequenceLength, hiddenSizes[-1]].

        Returns:
            Sentence representation after tanh and dropout,
            [batch, hiddenSizes[-1]].

        Side effects:
            Sets ``self.alpha`` to the attention weights of this call.
        """
        # Number of units in the last LSTM layer.
        hiddenSize = self.config.model.hiddenSizes[-1]
        sequenceLength = self.config.sequenceLength

        # Trainable attention vector. Created with tf.get_variable so it is
        # shared when the enclosing variable scope is re-entered with reuse.
        W = tf.get_variable(
            "attentionW",
            shape=[hiddenSize],
            initializer=tf.random_normal_initializer(stddev=0.1))

        # Non-linear transform of the Bi-LSTM outputs.
        M = tf.tanh(H)

        # Flatten M to [batch * time_step, hidden_size] and project every time
        # step onto W, giving one score per time step.
        newM = tf.matmul(tf.reshape(M, [-1, hiddenSize]), tf.reshape(W, [-1, 1]))

        # Back to [batch, time_step].
        restoreM = tf.reshape(newM, [-1, sequenceLength])

        # Normalize the scores over the time axis.
        self.alpha = tf.nn.softmax(restoreM)

        # Weighted sum of H over time, done as a batched matmul:
        # [batch, hidden, time] x [batch, time, 1] -> [batch, hidden, 1].
        r = tf.matmul(tf.transpose(H, [0, 2, 1]), tf.reshape(self.alpha, [-1, sequenceLength, 1]))

        # Drop only the trailing singleton axis; an unqualified squeeze would
        # also remove the batch axis when the batch size is 1.
        sequeezeR = tf.squeeze(r, axis=[2])

        sentenceRepren = tf.tanh(sequeezeR)

        # Dropout on the attention output.
        output = tf.nn.dropout(sentenceRepren, self.dropoutKeepProb)

        return output

    def _normalize(self, wordEmbedding, weights):
        """Standardize the embedding matrix using frequency weights.

        Args:
            wordEmbedding: embedding matrix, one row per word.
            weights: [1, vocab] row of normalized word frequencies.

        Returns:
            ``(wordEmbedding - mean) / stddev`` where mean and variance are
            the ``weights``-weighted statistics over the vocabulary.
        """
        mean = tf.matmul(weights, wordEmbedding)
        powWordEmbedding = tf.pow(wordEmbedding - mean, 2.)

        var = tf.matmul(weights, powWordEmbedding)
        # Small constant keeps the square root away from zero.
        stddev = tf.sqrt(1e-6 + var)

        return (wordEmbedding - mean) / stddev

    def _addPerturbation(self, embedded, loss):
        """Add an adversarial perturbation to the word embeddings.

        Args:
            embedded: embedded input that ``loss`` was computed from.
            loss: scalar loss of the clean pass.

        Returns:
            ``embedded`` plus the gradient of ``loss`` w.r.t. ``embedded``
            rescaled to L2 norm ``config.model.epsilon``.
        """
        grad, = tf.gradients(
            loss,
            embedded,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
        # The perturbation is treated as a constant: no gradient flows back
        # through the computation that produced it.
        grad = tf.stop_gradient(grad)
        perturb = self._scaleL2(grad, self.config.model.epsilon)
        return embedded + perturb

    def _scaleL2(self, x, norm_length):
        """Rescale ``x`` so each example has L2 norm ``norm_length``.

        Args:
            x: tensor reduced over axes 1 and 2 (the original comment gives
                its shape as [batch, num_step, d]).
            norm_length: target L2 norm per example.

        Returns:
            ``norm_length * x / ||x||_2`` with the norm taken over axes (1, 2).
        """
        # Divide x by max(abs(x)) for a numerically stable L2 norm:
        # 2norm(x) = a * 2norm(x / a). Scale over the full sequence, dims (1, 2).
        # ``keep_dims`` is kept as in the original code for compatibility with
        # older TensorFlow 1.x releases.
        alpha = tf.reduce_max(tf.abs(x), (1, 2), keep_dims=True) + 1e-12
        l2_norm = alpha * tf.sqrt(tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2), keep_dims=True) + 1e-6)
        x_unit = x / l2_norm
        return norm_length * x_unit

代碼是在雙向lstm+attention的基礎上增加adversarial training，訓練數據為imdb電影評論數據，最后的結果發現確實很快就能達到最優值，但是訓練所占的空間比較大（電腦跑了幾十步就停止了），每一步的時間也稍微長一點。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 一種程序設計競賽的訓練方法（譯）六種用於文本分類的開源預訓練模型基於協同訓練的半監督文本分類算法用半監督算法做文本分類(自訓練) Pytorch——BERT 預訓練模型及文本分類文本分類實戰（十）—— BERT 預訓練模型文本分類解決方法綜述文本分類實戰（九）—— ELMO 預訓練模型文本分類和詞向量訓練工具fastText的參數和用法 NLP文本分類方法匯總