# -*- coding: utf8 -*-

__author__ = "buyizhiyou"
__date__ = "2017-11-21"

'''
Step through this script in a debugger to learn the TensorFlow implementation of
LSTM + CTC loss, using Chinese character recognition as the example (TensorFlow 1.4).
'''

import pdb
import random

import numpy as np
import tensorflow as tf


def create_sparse(batch_size, dtype=np.int32):
    '''
    Build the components of a sparse tensor: ctc_loss requires its labels to be a
    SparseTensor. Randomly generate label sequences with lengths between 150 and 180.
    '''
    indices = []
    values = []
    for i in range(batch_size):
        length = random.randint(150, 180)
        for j in range(length):
            indices.append((i, j))
            value = random.randint(0, 779)
            values.append(value)

    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([batch_size, indices.max(0)[1] + 1], dtype=np.int64)  # roughly [64, 180]

    return [indices, values, shape]


W = tf.Variable(tf.truncated_normal([200, 781], stddev=0.1), name="W")  # num_hidden=200, num_classes=781 (think of it as 780 Chinese characters + 1 blank), shape (200, 781)
b = tf.Variable(tf.constant(0., shape=[781]), name="b")  # 781
global_step = tf.Variable(0, trainable=False)  # global step counter

# Build the input.
inputs = tf.random_normal(shape=[64, 60, 3000], dtype=tf.float32)  # for testing: a random batch of 64 images with h=60, w=3000; w acts as the LSTM time axis (time_step=3000) and h is the input size at each time step
shape = tf.shape(inputs)  # [64, 60, 3000] at this point (the transpose comes later)
batch_s, max_timesteps = shape[0], shape[2]  # 64, 3000
output = create_sparse(64)  # labels for the 64 images as sparse-tensor components, with variable sequence lengths
seq_len = np.ones(64, dtype=np.int32) * 180  # 180 is the maximum of the variable sequence lengths
labels = tf.SparseTensor(values=output[1], indices=output[0], dense_shape=output[2])

pdb.set_trace()
cell = tf.nn.rnn_cell.LSTMCell(200, state_is_tuple=True)
inputs = tf.transpose(inputs, [0, 2, 1])  # transpose to [64, 3000, 60]: tf.nn.dynamic_rnn defaults to time_major=False, so inputs must have shape `[batch_size, max_time, ...]`
'''
tf.nn.dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None, dtype=None,
                  parallel_iterations=None, swap_memory=False, time_major=False, scope=None)
'''
outputs1, _ = tf.nn.dynamic_rnn(cell, inputs, seq_len, dtype=tf.float32)  # (64, 3000, 200); dynamic_rnn handles variable-length inputs, see http://blog.csdn.net/u010223750/article/details/71079036
outputs = tf.reshape(outputs1, [-1, 200])  # (64*3000, 200)
logits0 = tf.matmul(outputs, W) + b
logits1 = tf.reshape(logits0, [batch_s, -1, 781])
logits = tf.transpose(logits1, (1, 0, 2))  # (3000, 64, 781), time-major as ctc_loss expects by default
'''
tf.nn.ctc_loss(labels, inputs, sequence_length, preprocess_collapse_repeated=False,
               ctc_merge_repeated=True, ignore_longer_outputs_than_inputs=False, time_major=True)
'''
loss = tf.nn.ctc_loss(labels, logits, seq_len)  # CTC loss handles the misalignment between the RNN output frames and the label sequence
# Note: if a random label sequence needs more frames than seq_len allows (repeated labels
# require a blank in between), ctc_loss raises an error unless ignore_longer_outputs_than_inputs=True.
# http://blog.csdn.net/left_think/article/details/76370453
# https://zhuanlan.zhihu.com/p/23293860
cost = tf.reduce_mean(loss)
optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9).minimize(cost, global_step=global_step)
#decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len, merge_repeated=False)  # or "tf.nn.ctc_greedy_decoder", one decoding strategy
#acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), labels))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(outputs.get_shape())
    print(sess.run(loss))
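
# --- Decoding and label error rate: a minimal sketch, not part of the original script ---
# The two commented-out lines above hint at evaluation. This sketch assumes the graph
# built above (logits, seq_len, labels) and the TF 1.4 ops tf.nn.ctc_beam_search_decoder,
# tf.edit_distance and tf.sparse_tensor_to_dense; the names `decoded`, `ler` and
# `dense_decoded` are illustrative.
decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len, beam_width=10, merge_repeated=False)
ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), labels))  # normalized edit distance, i.e. label error rate
dense_decoded = tf.sparse_tensor_to_dense(decoded[0], default_value=-1)  # -1 marks the padded positions

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run([ler, dense_decoded]))  # the error rate will be large here because the weights are random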
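
# --- Building sparse labels from a dense, padded matrix: a minimal sketch, not part of the original script ---
# create_sparse() above fabricates random labels directly in sparse form. With real data the
# labels usually arrive as a dense int matrix padded with a sentinel value; the helper below
# (the names dense_to_sparse and pad_value are assumptions for illustration) produces the same
# [indices, values, shape] triple from such a matrix.
def dense_to_sparse(dense_labels, pad_value=-1):
    '''dense_labels: np.ndarray of shape [batch_size, max_len], padded with pad_value.'''
    indices = []
    values = []
    for i, row in enumerate(dense_labels):
        for j, v in enumerate(row):
            if v == pad_value:  # padding starts here; the rest of the row is empty
                break
            indices.append((i, j))
            values.append(v)
    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=np.int32)
    shape = np.asarray(dense_labels.shape, dtype=np.int64)
    return [indices, values, shape]

# Usage with made-up data: two label sequences of lengths 3 and 2, padded with -1.
# dense = np.array([[5, 7, 9], [3, 4, -1]], dtype=np.int32)
# idx, vals, shp = dense_to_sparse(dense)
# sparse_labels = tf.SparseTensor(indices=idx, values=vals, dense_shape=shp)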