本教程轉載至:TensorFlow練習7: 基於RNN生成古詩詞
使用的數據集是全唐詩,首先提供一下數據集的下載鏈接:https://pan.baidu.com/s/13pNWfffr5HSN79WNb3Y0_w 提取碼:koss
RNN不像傳統的神經網絡——傳統網絡的輸入和輸出都是固定大小的,而RNN允許我們輸入、輸出向量序列。RNN是為了對序列數據進行建模而產生的。本帖代碼移植自char-rnn,它是基於Torch的洋文模型,稍加修改即可應用於中文。char-rnn使用文本文件作為輸入、訓練RNN模型,然後使用它生成和訓練數據類似的文本。
下邊代碼有修改,以適應TensorFlow1.4和GPU平台
1 #coding=utf-8 2 import collections 3 import numpy as np 4 import tensorflow as tf 5 import io 6 import sys 7 import os 8 reload(sys) 9 sys.setdefaultencoding('utf-8') 10 #-------------------------------數據預處理---------------------------# 11 12 poetry_file ='poetry.txt' 13 14 # 詩集 15 poetrys = [] 16 with io.open(poetry_file, "r", encoding='utf-8',) as f: 17 for line in f: 18 # print line 19 try: 20 title, content = line.strip().split(':') 21 content = content.replace(' ','') 22 if '_' in content or '(' in content or '(' in content or '《' in content or '[' in content: 23 continue 24 if len(content) < 5 or len(content) > 79: 25 continue 26 content = '[' + content + ']' 27 poetrys.append(content) 28 except Exception as e: 29 pass 30 31 #按詩的字數排序 32 poetrys = sorted(poetrys,key=lambda line: len(line)) 33 print(u"唐詩總數: ") 34 print(len(poetrys)) 35 print(u"測試") 36 37 # 統計每個字出現次數 38 all_words = [] 39 for poetry in poetrys: 40 all_words += [word for word in poetry] 41 counter = collections.Counter(all_words) 42 count_pairs = sorted(counter.items(), key=lambda x: -x[1]) 43 words, _ = zip(*count_pairs) 44 45 # 取前多少個常用字 46 words = words[:len(words)] + (' ',) 47 # 每個字映射為一個數字ID 48 word_num_map = dict(zip(words, range(len(words)))) 49 # 把詩轉換為向量形式,參考TensorFlow練習1 50 to_num = lambda word: word_num_map.get(word, len(words)) 51 poetrys_vector = [ list(map(to_num, poetry)) for poetry in poetrys] 52 #[[314, 3199, 367, 1556, 26, 179, 680, 0, 3199, 41, 506, 40, 151, 4, 98, 1], 53 #[339, 3, 133, 31, 302, 653, 512, 0, 37, 148, 294, 25, 54, 833, 3, 1, 965, 1315, 377, 1700, 562, 21, 37, 0, 2, 1253, 21, 36, 264, 877, 809, 1] 54 #....] 
55 56 # 每次取64首詩進行訓練 57 batch_size = 64 58 n_chunk = len(poetrys_vector) // batch_size 59 x_batches = [] 60 y_batches = [] 61 for i in range(n_chunk): 62 start_index = i * batch_size 63 end_index = start_index + batch_size 64 65 batches = poetrys_vector[start_index:end_index] 66 length = max(map(len,batches)) 67 xdata = np.full((batch_size,length), word_num_map[' '], np.int32) 68 for row in range(batch_size): 69 xdata[row,:len(batches[row])] = batches[row] 70 ydata = np.copy(xdata) 71 ydata[:,:-1] = xdata[:,1:] 72 """ 73 xdata ydata 74 [6,2,4,6,9] [2,4,6,9,9] 75 [1,4,2,8,5] [4,2,8,5,5] 76 """ 77 x_batches.append(xdata) 78 y_batches.append(ydata) 79 80 #---------------------------------------RNN--------------------------------------# 81 82 input_data = tf.placeholder(tf.int32, [batch_size, None]) 83 output_targets = tf.placeholder(tf.int32, [batch_size, None]) 84 # 定義RNN 85 def neural_network(model='lstm', rnn_size=128, num_layers=2): 86 if model == 'rnn': 87 cell_fun = tf.nn.rnn_cell.BasicRNNCell 88 elif model == 'gru': 89 cell_fun = tf.nn.rnn_cell.GRUCell 90 elif model == 'lstm': 91 cell_fun = tf.nn.rnn_cell.BasicLSTMCell 92 93 cell = cell_fun(rnn_size, state_is_tuple=True) 94 cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True) 95 96 initial_state = cell.zero_state(batch_size, tf.float32) 97 98 with tf.variable_scope('rnnlm'): 99 softmax_w = tf.get_variable("softmax_w", [rnn_size, len(words)+1]) 100 softmax_b = tf.get_variable("softmax_b", [len(words)+1]) 101 with tf.device("/gpu:0"): 102 embedding = tf.get_variable("embedding", [len(words)+1, rnn_size]) 103 inputs = tf.nn.embedding_lookup(embedding, input_data) 104 105 outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, scope='rnnlm') 106 output = tf.reshape(outputs,[-1, rnn_size]) 107 108 logits = tf.matmul(output, softmax_w) + softmax_b 109 probs = tf.nn.softmax(logits) 110 return logits, last_state, probs, cell, initial_state 111 112 ckpt_dir="./ckpt_dir" 
113 if not os.path.exists(ckpt_dir): 114 os.makedirs(ckpt_dir) 115 116 #訓練 117 def train_neural_network(): 118 logits, last_state, _, _, _ = neural_network() 119 targets = tf.reshape(output_targets, [-1]) 120 loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [targets], [tf.ones_like(targets, dtype=tf.float32)], len(words)) 121 cost = tf.reduce_mean(loss) 122 learning_rate = tf.Variable(0.0, trainable=False) 123 tvars = tf.trainable_variables() 124 grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5) 125 optimizer = tf.train.AdamOptimizer(learning_rate) 126 train_op = optimizer.apply_gradients(zip(grads, tvars)) 127 128 with tf.Session() as sess: 129 sess.run(tf.initialize_all_variables()) 130 131 saver = tf.train.Saver(tf.all_variables()) 132 133 for epoch in range(295): 134 sess.run(tf.assign(learning_rate, 0.002 * (0.97 ** epoch))) 135 n = 0 136 for batche in range(n_chunk): 137 train_loss, _ , _ = sess.run([cost, last_state, train_op], feed_dict={input_data: x_batches[n], output_targets: y_batches[n]}) 138 n += 1 139 print(epoch, batche, train_loss) 140 if epoch % 7 == 0: 141 saver.save(sess, ckpt_dir+'/poetry.module', global_step=epoch) 142 143 train_neural_network()
這裡我只說自己對bug調試和調優的一些想法,具體代碼的理解,請聯繫作者本人。
首先是#coding=utf-8的問題:這一行告訴Python解釋器,當前腳本文件的文字編碼是utf-8。如果不加這一行,Python 2默認按ASCII解析源文件,遇到中文字符就極有可能報編碼錯誤。
之後是數據集的utf-8編碼問題:讀取文件時雖然指定了encoding='utf-8',但沒有告訴Python環境默認字符集也是utf-8,於是代碼裡含中文的普通字符串(例如'《' in content這樣的比較)在處理每一行時都會拋出編碼異常;而這個異常又被except捕獲後直接pass,所以每一行都被跳過,最終處理完的數據集大小為0。用sys.setdefaultencoding('utf-8')設置默認編碼可以解決。
同時,Python 2內置的open函數沒有encoding參數,這個參數是io.open提供的,所以讀取文件的地方需要改用io.open。
還有一點是一些接口的使用問題,比如saver.save保存檢查點時,目標路徑的上級目錄(parent directory)必須已經存在,所以要先創建ckpt_dir目錄。
之后是預測的代碼
1 #coding=utf-8 2 import collections 3 import numpy as np 4 import tensorflow as tf 5 import io 6 import sys 7 import os 8 import pdb 9 import time 10 reload(sys) 11 sys.setdefaultencoding('utf-8') 12 #-------------------------------數據預處理---------------------------# 13 14 poetry_file ='poetry.txt' 15 16 # 詩集 17 poetrys = [] 18 with io.open(poetry_file, "r", encoding='utf-8',) as f: 19 for line in f: 20 try: 21 title, content = line.strip().split(':') 22 content = content.replace(' ','') 23 if '_' in content or '(' in content or '(' in content or '《' in content or '[' in content: 24 continue 25 if len(content) < 5 or len(content) > 79: 26 continue 27 content = '[' + content + ']' 28 poetrys.append(content) 29 except Exception as e: 30 pass 31 32 # 按詩的字數排序 33 poetrys = sorted(poetrys,key=lambda line: len(line)) 34 print(u'唐詩總數: ', len(poetrys)) 35 36 # 統計每個字出現次數 37 all_words = [] 38 for poetry in poetrys: 39 all_words += [word for word in poetry] 40 counter = collections.Counter(all_words) 41 count_pairs = sorted(counter.items(), key=lambda x: -x[1]) 42 words, _ = zip(*count_pairs) 43 44 # 取前多少個常用字 45 words = words[:len(words)] + (' ',) 46 # 每個字映射為一個數字ID 47 word_num_map = dict(zip(words, range(len(words)))) 48 # 把詩轉換為向量形式 49 to_num = lambda word: word_num_map.get(word, len(words)) 50 poetrys_vector = [ list(map(to_num, poetry)) for poetry in poetrys] 51 #[[314, 3199, 367, 1556, 26, 179, 680, 0, 3199, 41, 506, 40, 151, 4, 98, 1], 52 #[339, 3, 133, 31, 302, 653, 512, 0, 37, 148, 294, 25, 54, 833, 3, 1, 965, 1315, 377, 1700, 562, 21, 37, 0, 2, 1253, 21, 36, 264, 877, 809, 1] 53 #....] 
54 55 batch_size = 1 56 n_chunk = len(poetrys_vector) // batch_size 57 x_batches = [] 58 y_batches = [] 59 for i in range(n_chunk): 60 start_index = i * batch_size 61 end_index = start_index + batch_size 62 63 batches = poetrys_vector[start_index:end_index] 64 length = max(map(len,batches)) 65 xdata = np.full((batch_size,length), word_num_map[' '], np.int32) 66 for row in range(batch_size): 67 xdata[row,:len(batches[row])] = batches[row] 68 ydata = np.copy(xdata) 69 ydata[:,:-1] = xdata[:,1:] 70 """ 71 xdata ydata 72 [6,2,4,6,9] [2,4,6,9,9] 73 [1,4,2,8,5] [4,2,8,5,5] 74 """ 75 x_batches.append(xdata) 76 y_batches.append(ydata) 77 78 79 #---------------------------------------RNN--------------------------------------# 80 81 input_data = tf.placeholder(tf.int32, [batch_size, None]) 82 output_targets = tf.placeholder(tf.int32, [batch_size, None]) 83 # 定義RNN 84 def neural_network(model='lstm', rnn_size=128, num_layers=2): 85 if model == 'rnn': 86 cell_fun = tf.nn.rnn_cell.BasicRNNCell 87 elif model == 'gru': 88 cell_fun = tf.nn.rnn_cell.GRUCell 89 elif model == 'lstm': 90 cell_fun = tf.nn.rnn_cell.BasicLSTMCell 91 92 cell = cell_fun(rnn_size, state_is_tuple=True) 93 cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True) 94 95 initial_state = cell.zero_state(batch_size, tf.float32) 96 97 with tf.variable_scope('rnnlm'): 98 softmax_w = tf.get_variable("softmax_w", [rnn_size, len(words)+1]) 99 softmax_b = tf.get_variable("softmax_b", [len(words)+1]) 100 with tf.device("/gpu:0"): 101 embedding = tf.get_variable("embedding", [len(words)+1, rnn_size]) 102 inputs = tf.nn.embedding_lookup(embedding, input_data) 103 104 outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, scope='rnnlm') 105 output = tf.reshape(outputs,[-1, rnn_size]) 106 107 logits = tf.matmul(output, softmax_w) + softmax_b 108 probs = tf.nn.softmax(logits) 109 return logits, last_state, probs, cell, initial_state 110 111 
#-------------------------------生成古詩---------------------------------# 112 # 使用訓練完成的模型 113 114 def gen_poetry(): 115 def to_word(weights): 116 t = np.cumsum(weights) 117 s = np.sum(weights) 118 sample = int(np.searchsorted(t, np.random.rand(1)*s)) 119 return words[sample] 120 121 _, last_state, probs, cell, initial_state = neural_network() 122 123 with tf.Session() as sess: 124 sess.run(tf.initialize_all_variables()) 125 126 saver = tf.train.Saver(tf.all_variables()) 127 saver.restore(sess, './ckpt_dir/poetry.module-294') 128 129 state_ = sess.run(cell.zero_state(1, tf.float32)) 130 131 x = np.array([list(map(word_num_map.get, '['))]) 132 [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_}) 133 word = to_word(probs_) 134 #word = words[np.argmax(probs_)] 135 poem = '' 136 while word != ']': 137 poem += word 138 x = np.zeros((1,1)) 139 x[0,0] = word_num_map[word] 140 [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_}) 141 word = to_word(probs_) 142 #word = words[np.argmax(probs_)] 143 return poem 144 145 146 147 def gen_poetry_with_head(head): 148 def to_word(weights): 149 t = np.cumsum(weights) 150 s = np.sum(weights) 151 sample = int(np.searchsorted(t, np.random.rand(1)*s)) 152 return words[sample] 153 154 _, last_state, probs, cell, initial_state = neural_network() 155 156 with tf.Session() as sess: 157 sess.run(tf.initialize_all_variables()) 158 159 saver = tf.train.Saver(tf.all_variables()) 160 saver.restore(sess, './ckpt_dir/poetry.module-294') 161 162 state_ = sess.run(cell.zero_state(1, tf.float32)) 163 poem = '' 164 i = 0 165 # print head 166 # pdb.set_trace() 167 for word in head: 168 while word != ',' and word != '。': 169 poem += word 170 # print poem 171 # print head 172 # print word 173 x = np.array([list(map(word_num_map.get, word))]) 174 [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_}) 175 word = to_word(probs_) 
176 time.sleep(1) 177 if i % 2 == 0: 178 poem += ',' 179 else: 180 poem += '。' 181 i += 1 182 return poem 183 184 print(gen_poetry()) 185 # print(gen_poetry_with_head(u'一二三四'))
這個藏頭詩的代碼用法有問題,不建議使用,我調了很久才調好,這次還是先列原作者的代碼,下次單獨說這塊的調整和調優問題。
結果:
有那麼點意思,但仔細看問題還是很大,胡言亂語,模型的調優還遠遠不夠。