最近閱讀了有關文本分類的文章,其中有一篇名為《Adversarial Training Methods for Semi-Supervised Text Classification》,其主要思路是在文本訓練時增加一個擾動因子,即在embedding層加入一個小的擾動,發現訓練的結果比不加要好很多。
模型的網絡結構如下圖:
下面就介紹一下這個對抗因子r的生成過程:
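在進入lstm網絡前先進行從w到v的計算,即將word embedding 歸一化。設 $f_j$ 為第 $j$ 個詞的歸一化詞頻,$v_j$ 為其詞向量,$K$ 為詞表大小,則:
$$\bar{v}_k = \frac{v_k - \mathrm{E}(v)}{\sqrt{\mathrm{Var}(v)}},\quad \mathrm{E}(v) = \sum_{j=1}^{K} f_j v_j,\quad \mathrm{Var}(v) = \sum_{j=1}^{K} f_j \left(v_j - \mathrm{E}(v)\right)^2$$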
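然后定義模型的損失函數,令輸入為 $x$,參數為 $\theta$,$r_{adv}$(即Radv)為對抗訓練因子,損失函數為:
$$-\log p(y \mid x + r_{adv}; \theta),\quad r_{adv} = \arg\min_{r,\ \|r\| \le \epsilon} \log p(y \mid x + r; \hat{\theta})$$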
其中一個細節:雖然 $\hat{\theta}$ 是 $\theta$ 的復制,但它只用於計算擾動,在此過程中被當作常數,不會參與到計算梯度的反向傳播算法中。
然后就是求擾動:
$$r_{adv} = -\epsilon \frac{g}{\|g\|_2},\quad g = \nabla_x \log p(y \mid x; \hat{\theta})$$
先對表達式求導得到導數(梯度)g,然后對導數g進行L2歸一化的線性變換,即除以其L2范數后再按擾動幅度ε進行縮放。
至此擾動計算完成,然后將其加到之前的word embedding上參與模型訓練。
下面則是模型的代碼部分:
# Build the adversarial-training Bi-LSTM + attention model.
class AdversarailLSTM(object):
    """Bi-LSTM + attention binary classifier trained with adversarial perturbations.

    The graph runs the same network twice with shared parameters: once on the
    normalized word embeddings and once on those embeddings plus an
    L2-scaled gradient perturbation.  ``self.loss`` is the sum of both
    sigmoid cross-entropy losses.

    Attributes set during construction:
        inputX, inputY, dropoutKeepProb: input placeholders.
        embeddedWords: looked-up, frequency-normalized word embeddings.
        predictions: logits of the clean pass, one value per example.
        binaryPreds: 0/1 predictions derived from ``predictions``.
        loss: clean loss + adversarial loss.
        current_state, alpha: RNN final state and attention weights; both
            passes assign them, so after construction they refer to the
            perturbed pass (the last one built).
    """

    def __init__(self, config, wordEmbedding, indexFreqs):
        """Build the full training graph.

        Args:
            config: configuration object; this class reads
                ``config.sequenceLength``, ``config.model.hiddenSizes`` and
                ``config.model.epsilon``.
            wordEmbedding: pre-trained embedding matrix, cast to float32.  It
                is matrix-multiplied with a ``[1, len(indexFreqs)]`` weight
                row, so its first dimension must equal ``len(indexFreqs)``.
            indexFreqs: per-vocabulary-index frequencies.  The caller's
                sequence is not modified.
        """
        # Kept on the instance so the helper methods do not depend on a
        # module-level ``config`` variable.
        self.config = config

        # Model inputs.
        self.inputX = tf.placeholder(tf.int32, [None, config.sequenceLength], name="inputX")
        self.inputY = tf.placeholder(tf.float32, [None, 1], name="inputY")
        self.dropoutKeepProb = tf.placeholder(tf.float32, name="dropoutKeepProb")

        # Compute per-word weights from the frequencies.  Work on a copy so the
        # caller's data is left untouched.
        freqs = list(indexFreqs)
        # The first two entries are overridden with fixed counts
        # (presumably the padding / unknown tokens -- TODO confirm with the
        # vocabulary builder).
        freqs[0], freqs[1] = 20000, 10000
        weights = tf.cast(
            tf.reshape(freqs / tf.reduce_sum(freqs), [1, len(freqs)]),
            dtype=tf.float32)

        # Word-embedding layer.
        with tf.name_scope("wordEmbedding"):
            # Normalize the pre-trained vectors with the frequency weights;
            # the embedding matrix is a constant here, not a trainable variable.
            normWordEmbedding = self._normalize(
                tf.cast(wordEmbedding, dtype=tf.float32, name="word2vec"), weights)
            self.embeddedWords = tf.nn.embedding_lookup(normWordEmbedding, self.inputX)

        # Binary cross-entropy loss on the clean embeddings.
        with tf.name_scope("loss"):
            with tf.variable_scope("Bi-LSTM", reuse=None):
                self.predictions = self._Bi_LSTMAttention(self.embeddedWords)
                # ``predictions`` are logits (they are fed to
                # sigmoid_cross_entropy_with_logits below), so the decision
                # boundary sigmoid(z) >= 0.5 is z >= 0.
                self.binaryPreds = tf.cast(
                    tf.greater_equal(self.predictions, 0.0), tf.float32, name="binaryPreds")
                losses = tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=self.predictions, labels=self.inputY)
                loss = tf.reduce_mean(losses)

        # Same network (reuse=True shares every parameter) on the perturbed
        # embeddings.
        with tf.name_scope("perturloss"):
            with tf.variable_scope("Bi-LSTM", reuse=True):
                perturWordEmbedding = self._addPerturbation(self.embeddedWords, loss)
                print("perturbSize:{}".format(perturWordEmbedding))
                perturPredictions = self._Bi_LSTMAttention(perturWordEmbedding)
                perturLosses = tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=perturPredictions, labels=self.inputY)
                perturLoss = tf.reduce_mean(perturLosses)

        self.loss = loss + perturLoss

    def _Bi_LSTMAttention(self, embeddedWords):
        """Run the multi-layer Bi-LSTM + attention network and return logits.

        Must be called inside a ``tf.variable_scope``; every parameter is
        created through ``tf.get_variable`` (directly or by the RNN cells), so
        a second call under ``reuse=True`` shares all weights with the first.

        Args:
            embeddedWords: embedded input sequence fed to the RNN.

        Returns:
            Logits produced by the final fully connected layer (one output
            unit per example).
        """
        config = self.config

        # Define the stacked bidirectional LSTM.
        with tf.name_scope("Bi-LSTM"):
            fwHiddenLayers = []
            bwHiddenLayers = []
            for idx, hiddenSize in enumerate(config.model.hiddenSizes):
                with tf.name_scope("Bi-LSTM" + str(idx)):
                    # Forward cell.
                    lstmFwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutKeepProb)
                    # Backward cell.
                    lstmBwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutKeepProb)

                fwHiddenLayers.append(lstmFwCell)
                bwHiddenLayers.append(lstmBwCell)

            # Stack the layers; with state_is_tuple=True the state is kept as
            # (h, c) tuples instead of being concatenated column-wise.
            fwMultiLstm = tf.nn.rnn_cell.MultiRNNCell(cells=fwHiddenLayers, state_is_tuple=True)
            bwMultiLstm = tf.nn.rnn_cell.MultiRNNCell(cells=bwHiddenLayers, state_is_tuple=True)

            # Dynamic RNN: no sequence lengths are passed, so the full
            # sequence is processed.  ``outputs`` is (output_fw, output_bw);
            # ``self.current_state`` is the final (state_fw, state_bw) pair.
            # The input is the method argument (NOT self.embeddedWords), so the
            # perturbed embeddings really flow through the network.
            # ``idx`` is the index of the last layer; the scope name is kept
            # unchanged so RNN variable names stay the same.
            outputs, self.current_state = tf.nn.bidirectional_dynamic_rnn(
                fwMultiLstm, bwMultiLstm, embeddedWords,
                dtype=tf.float32, scope="bi-lstm" + str(idx))

        # As in the Bi-LSTM + attention paper, sum forward and backward outputs.
        with tf.name_scope("Attention"):
            H = outputs[0] + outputs[1]

            # Attention-pooled sentence representation.
            output = self.attention(H)
            outputSize = config.model.hiddenSizes[-1]
            print("outputSize:{}".format(outputSize))

        # Fully connected output layer.
        with tf.name_scope("output"):
            outputW = tf.get_variable(
                "outputW",
                shape=[outputSize, 1],
                initializer=tf.contrib.layers.xavier_initializer())
            # get_variable (not tf.Variable) so the bias is shared between the
            # clean and the perturbed pass.
            outputB = tf.get_variable(
                "outputB", shape=[1], initializer=tf.constant_initializer(0.1))
            predictions = tf.nn.xw_plus_b(output, outputW, outputB, name="predictions")

        return predictions

    def attention(self, H):
        """Pool the Bi-LSTM outputs into one sentence vector with attention.

        The attention vector is created with ``tf.get_variable`` under the
        name ``attentionW``; calling this method a second time requires an
        enclosing variable scope with ``reuse=True``.

        Args:
            H: RNN outputs; reshaped here to ``[-1, hiddenSize]``, where
                ``hiddenSize`` is the last entry of ``config.model.hiddenSizes``.

        Returns:
            Tensor of shape ``[batch_size, hiddenSize]`` after tanh and dropout.
        """
        # Number of units in the last LSTM layer.
        hiddenSize = self.config.model.hiddenSizes[-1]
        sequenceLength = self.config.sequenceLength

        # Trainable attention vector, shared across passes via get_variable.
        W = tf.get_variable(
            "attentionW",
            shape=[hiddenSize],
            initializer=tf.random_normal_initializer(stddev=0.1))

        # Non-linear transform of the Bi-LSTM outputs.
        M = tf.tanh(H)

        # Flatten M to [batch_size * time_step, hidden_size] and project each
        # time step onto W, giving one score per time step.
        newM = tf.matmul(tf.reshape(M, [-1, hiddenSize]), tf.reshape(W, [-1, 1]))

        # Restore to [batch_size, time_step].
        restoreM = tf.reshape(newM, [-1, sequenceLength])

        # Softmax-normalize the scores over the time steps.
        self.alpha = tf.nn.softmax(restoreM)

        # Weighted sum of H with alpha, done as a batched matrix product.
        r = tf.matmul(
            tf.transpose(H, [0, 2, 1]),
            tf.reshape(self.alpha, [-1, sequenceLength, 1]))

        # Drop only the trailing singleton axis -> [batch_size, hidden_size].
        # An explicit reshape keeps the batch axis even when batch_size is 1,
        # which a bare tf.squeeze would also remove.
        sequeezeR = tf.reshape(r, [-1, hiddenSize])

        sentenceRepren = tf.tanh(sequeezeR)

        # Dropout on the attention output.
        output = tf.nn.dropout(sentenceRepren, self.dropoutKeepProb)

        return output

    def _normalize(self, wordEmbedding, weights):
        """Standardize the embedding matrix using frequency weights.

        Args:
            wordEmbedding: embedding matrix; left-multiplied by ``weights``.
            weights: row vector of per-word weights.

        Returns:
            ``(wordEmbedding - mean) / stddev`` where mean and variance are the
            ``weights``-weighted sums over the rows of ``wordEmbedding``.
        """
        mean = tf.matmul(weights, wordEmbedding)
        powWordEmbedding = tf.pow(wordEmbedding - mean, 2.)

        var = tf.matmul(weights, powWordEmbedding)
        # Small constant keeps the square root away from zero.
        stddev = tf.sqrt(1e-6 + var)

        return (wordEmbedding - mean) / stddev

    def _addPerturbation(self, embedded, loss):
        """Add the adversarial perturbation to the word embeddings.

        Args:
            embedded: embedded input sequence.
            loss: scalar loss whose gradient w.r.t. ``embedded`` defines the
                perturbation direction.

        Returns:
            ``embedded`` plus the gradient rescaled to L2 length
            ``config.model.epsilon``.
        """
        grad, = tf.gradients(
            loss,
            embedded,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
        # The perturbation is treated as a constant: no gradient flows back
        # through its computation.
        grad = tf.stop_gradient(grad)
        perturb = self._scaleL2(grad, self.config.model.epsilon)
        return embedded + perturb

    def _scaleL2(self, x, norm_length):
        """Rescale ``x`` so each example has L2 norm ``norm_length``.

        Args:
            x: tensor of shape ``[batch, num_step, d]``.
            norm_length: target L2 norm, taken over axes 1 and 2.

        Returns:
            ``norm_length * x / ||x||_2`` with the norm computed per example.
        """
        # Divide x by max(abs(x)) first for a numerically stable L2 norm:
        # ||x||_2 = a * ||x / a||_2.  The scale is taken over the full
        # sequence, i.e. axes (1, 2).
        alpha = tf.reduce_max(tf.abs(x), (1, 2), keep_dims=True) + 1e-12
        l2_norm = alpha * tf.sqrt(
            tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2), keep_dims=True) + 1e-6)
        x_unit = x / l2_norm
        return norm_length * x_unit
代碼是在雙向lstm+attention的基礎上增加adversarial training,訓練數據為imdb電影評論數據,最后的結果發現確實很快就能達到最優值,但是訓練所占的空間比較大(電腦跑了幾十步就停止了),每一步的時間也稍微長一點。