梯度修剪
梯度修剪主要用來避免訓練時梯度爆炸的問題。一般來說,使用了 Batch Normalization 就不必再使用梯度修剪了,但還是有必要理解一下它的實現。
In TensorFlow, the optimizer’s minimize() function takes care of both computing the gradients and applying them, so you must instead call the optimizer’s compute_gradients() method first, then create an operation to clip the gradients using the clip_by_value() function, and finally create an operation to apply the clipped gradients using the optimizer’s apply_gradients() method:
# Clip every gradient element to [-threshold, threshold] between the
# compute_gradients() and apply_gradients() halves of minimize().
threshold = 1.0

optimizer = tf.train.GradientDescentOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(loss)
# compute_gradients() may pair a variable with a None gradient when the loss
# does not depend on it; tf.clip_by_value() cannot take None, so skip those.
capped_gvs = [
    (tf.clip_by_value(grad, -threshold, threshold), var)
    for grad, var in grads_and_vars
    if grad is not None
]
training_op = optimizer.apply_gradients(capped_gvs)
例子:

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data


def Swish(features):
    """Return the Swish activation, ``features * sigmoid(features)``."""
    return features * tf.nn.sigmoid(features)


# 1. create data
mnist = input_data.read_data_sets('../MNIST_data', one_hot=True)

X = tf.placeholder(tf.float32, shape=(None, 784), name='X')
# `(None)` is plain None (not a 1-tuple), so the label shape is unconstrained.
y = tf.placeholder(tf.int32, shape=(None), name='y')
# Only consumed by the optional batch-normalization layers below, but it is
# fed on every run so those layers can be switched on without other changes.
is_training = tf.placeholder(tf.bool, None, name='is_training')

# 2. define network
he_init = tf.contrib.layers.variance_scaling_initializer()
with tf.name_scope('dnn'):
    hidden1 = tf.layers.dense(X, 300, kernel_initializer=he_init, name='hidden1')
    # Optional batch normalization:
    # hidden1 = tf.layers.batch_normalization(hidden1, training=is_training, momentum=0.9)
    hidden1 = tf.nn.relu(hidden1)
    hidden2 = tf.layers.dense(hidden1, 100, kernel_initializer=he_init, name='hidden2')
    # Optional batch normalization:
    # hidden2 = tf.layers.batch_normalization(hidden2, training=is_training, momentum=0.9)
    hidden2 = tf.nn.relu(hidden2)
    logits = tf.layers.dense(hidden2, 10, kernel_initializer=he_init, name='output')

# 3. define loss
with tf.name_scope('loss'):
    # The labels are one-hot, so the dense softmax loss is used. The sparse
    # variants (tf.losses.sparse_softmax_cross_entropy, or
    # tf.nn.sparse_softmax_cross_entropy_with_logits followed by
    # tf.reduce_mean) take integer class ids instead, e.g.
    # labels=tf.argmax(y, axis=1).
    loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits)

# 4. define optimizer
learning_rate = 0.01
with tf.name_scope('train'):
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # for batch normalization
    with tf.control_dependencies(update_ops):
        # Replaces a single `minimize(loss)` call: the two halves are invoked
        # separately so the gradients can be clipped in between.
        threshold = 1.0
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        grads_and_vars = optimizer.compute_gradients(loss)
        # A variable the loss does not depend on is paired with a None
        # gradient, which tf.clip_by_value() cannot accept; skip such pairs.
        capped_gvs = [
            (tf.clip_by_value(grad, -threshold, threshold), var)
            for grad, var in grads_and_vars
            if grad is not None
        ]
        optimizer_op = optimizer.apply_gradients(capped_gvs)

with tf.name_scope('eval'):
    # Is the target among the top-k (k=1) predictions? in_top_k needs integer
    # class ids, hence the argmax over the one-hot labels.
    correct = tf.nn.in_top_k(logits, tf.argmax(y, axis=1), 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

# 5. initialize
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
saver = tf.train.Saver()

# Show which variables were created (trainable vs. all global variables).
print([v.name for v in tf.trainable_variables()])
print([v.name for v in tf.global_variables()])

# 6. train & test
n_epochs = 20
batch_size = 50
test_feed = {X: mnist.test.images, y: mnist.test.labels, is_training: False}

with tf.Session() as sess:
    sess.run(init_op)
    for epoch in range(n_epochs):
        for _ in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(optimizer_op, feed_dict={X: X_batch, y: y_batch, is_training: True})
            # To inspect the raw gradients, evaluate each `grad` in
            # grads_and_vars here with the same feed_dict.
        # Training accuracy is measured on the last batch of the epoch only.
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch, is_training: False})
        # One pass over the test set yields both metrics.
        acc_test, loss_test = sess.run([accuracy, loss], feed_dict=test_feed)
        print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test,
              "Test loss:", loss_test)
    save_path = saver.save(sess, "./my_model_final.ckpt")

# Reload the checkpoint in a fresh session and re-evaluate on the test set.
with tf.Session() as sess:
    sess.run(init_op)
    saver.restore(sess, "./my_model_final.ckpt")
    acc_test, loss_test = sess.run([accuracy, loss], feed_dict=test_feed)
    print("Test accuracy:", acc_test, ", Test loss:", loss_test)
下面我們來看看上面這個例子裡所涉及的一些東西:
compute_gradients
compute_gradients 是任何一個優化器都有的方法:
compute_gradients( loss, var_list=None, gate_gradients=GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, grad_loss=None )
計算 loss 對 var_list 中各個可訓練變量的梯度。
相當於 minimize() 的第一步,返回由 (gradient, variable) 對組成的列表。
獲得了梯度後,我們就可以手動進行梯度裁剪了。下面這句話就是將梯度的每個元素限制到 [-threshold, threshold] 的範圍內:
capped_gvs = [
    (tf.clip_by_value(grad, -threshold, threshold), var)
    for grad, var in grads_and_vars
    if grad is not None  # variables without a gradient cannot be clipped
]
apply_gradients
apply_gradients 同樣是任何一個優化器都有的方法:
apply_gradients( grads_and_vars, global_step=None, name=None )
相當於 minimize() 的第二步,返回一個執行梯度更新的 op。
Max-Norm Regularization
對於每個節點,max-norm regularization 會將其輸入權重 $\mathbf{w}$ 限制為 $\lVert \mathbf{w} \rVert_2 \le r$,其中 $r$ 是 max-norm 超參數。每次訓練迭代之後,若 $\lVert \mathbf{w} \rVert_2 > r$,就按下式重新縮放 $\mathbf{w}$:
\begin{equation}
\label{a}
\mathbf{w} \gets \mathbf{w} \frac{r}{\lVert \mathbf{w} \rVert_2}
\end{equation}
實例代碼:

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data


def max_norm_regularizer(threshold=1.0, axes=1, name="max_norm", collection="max_norm"):
    """Build a pseudo-regularizer that registers a max-norm clipping op.

    The returned callable is meant to be passed as ``kernel_regularizer``.
    It adds no loss term; instead it creates an op that overwrites the
    weights with their norm-clipped value and stores that op in
    ``collection``, so the training loop can run it after each update.

    Args:
        threshold: Value passed to ``tf.clip_by_norm`` as ``clip_norm``.
        axes: Value passed to ``tf.clip_by_norm`` as ``axes``.
            NOTE(review): a dense kernel is presumably shaped
            (inputs, units), in which case axes=1 bounds each input's row
            rather than each unit's incoming weights; confirm this is intended.
        name: Name given to the assign op.
        collection: Graph collection the assign op is added to.

    Returns:
        A function ``max_norm(weights)`` that registers the clipping op and
        returns None.
    """
    def max_norm(weights):
        clipped = tf.clip_by_norm(weights, clip_norm=threshold, axes=axes)
        clip_weights = tf.assign(weights, clipped, name=name)
        tf.add_to_collection(collection, clip_weights)
        return None  # there is no regularization loss term
    return max_norm


max_norm_reg = max_norm_regularizer(threshold=1.0)

# 1. create data
mnist = input_data.read_data_sets('../MNIST_data', one_hot=True)

X = tf.placeholder(tf.float32, shape=(None, 784), name='X')
# `(None)` is plain None (not a 1-tuple), so the label shape is unconstrained.
y = tf.placeholder(tf.int32, shape=(None), name='y')
# Only consumed by the optional batch-normalization layers below, but it is
# fed on every run so those layers can be switched on without other changes.
is_training = tf.placeholder(tf.bool, None, name='is_training')

# 2. define network
he_init = tf.contrib.layers.variance_scaling_initializer()
with tf.name_scope('dnn'):
    hidden1 = tf.layers.dense(X, 300, kernel_initializer=he_init,
                              kernel_regularizer=max_norm_reg, name='hidden1')
    # Optional batch normalization:
    # hidden1 = tf.layers.batch_normalization(hidden1, training=is_training, momentum=0.9)
    hidden1 = tf.nn.relu(hidden1)
    hidden2 = tf.layers.dense(hidden1, 100, kernel_initializer=he_init,
                              kernel_regularizer=max_norm_reg, name='hidden2')
    # Optional batch normalization:
    # hidden2 = tf.layers.batch_normalization(hidden2, training=is_training, momentum=0.9)
    hidden2 = tf.nn.relu(hidden2)
    logits = tf.layers.dense(hidden2, 10, kernel_initializer=he_init, name='output')

# 3. define loss
with tf.name_scope('loss'):
    loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits)  # labels are one-hot

# 4. define optimizer
learning_rate_init = 0.01
global_step = tf.Variable(0, trainable=False)
with tf.name_scope('train'):
    learning_rate = tf.train.polynomial_decay(  # polynomial decay schedule
        learning_rate=learning_rate_init,  # initial learning rate
        global_step=global_step,  # current training step
        decay_steps=22000,  # steps over which the rate decays to end_learning_rate
        end_learning_rate=learning_rate_init / 10,  # smallest learning rate
        power=0.9,
        cycle=False
    )
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # for batch normalization
    with tf.control_dependencies(update_ops):
        optimizer_op = tf.train.MomentumOptimizer(
            learning_rate=learning_rate, momentum=0.9).minimize(
            loss=loss,
            var_list=tf.trainable_variables(),
            global_step=global_step  # without this the learning rate never decays
        )
        # Gradient clipping can be combined with max-norm by replacing
        # minimize() with compute_gradients() -> tf.clip_by_value() on each
        # non-None gradient -> apply_gradients().

with tf.name_scope('eval'):
    # Is the target among the top-k (k=1) predictions? in_top_k needs integer
    # class ids, hence the argmax over the one-hot labels.
    correct = tf.nn.in_top_k(logits, tf.argmax(y, axis=1), 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

# 5. initialize
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
saver = tf.train.Saver()

# Assign ops registered by max_norm_reg while the dense layers were built.
clip_all_weights = tf.get_collection("max_norm")

# 6. train & test
n_epochs = 20
batch_size = 50
test_feed = {X: mnist.test.images, y: mnist.test.labels, is_training: False}

with tf.Session() as sess:
    sess.run(init_op)
    # To resume from a checkpoint instead:
    # saver.restore(sess, './my_model_final.ckpt')
    for epoch in range(n_epochs):
        for _ in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(optimizer_op, feed_dict={X: X_batch, y: y_batch, is_training: True})
            # Re-project the weights onto the max-norm ball after every update.
            sess.run(clip_all_weights)
        learning_rate_cur = learning_rate.eval()
        # Training accuracy is measured on the last batch of the epoch only.
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch, is_training: False})
        # One pass over the test set yields both metrics.
        acc_test, loss_test = sess.run([accuracy, loss], feed_dict=test_feed)
        print(epoch, "Current learning rate:", learning_rate_cur, "Train accuracy:", acc_train,
              "Test accuracy:", acc_test, "Test loss:", loss_test)
    save_path = saver.save(sess, "./my_model_final.ckpt")