TensorFlow Tutorial: Generating Tang Poems with an RNN


This tutorial is reposted from: TensorFlow練習7: 基於RNN生成古詩詞 (TensorFlow Exercise 7: Generating Classical Chinese Poems with an RNN).

The dataset is the Complete Tang Poems (全唐詩). Download link: https://pan.baidu.com/s/13pNWfffr5HSN79WNb3Y0_w (extraction code: koss).

Unlike traditional neural networks, whose inputs and outputs are fixed-size, RNNs let us work with sequences of input and output vectors; they were designed specifically to model sequence data. The code in this post is ported from char-rnn, a Torch-based model for English text that needs only minor changes to work with Chinese. char-rnn takes a text file as input, trains an RNN model on it, and then generates text similar to the training data.

The code below has been modified to run on TensorFlow 1.4 and a GPU platform.

#coding=utf-8
import collections
import numpy as np
import tensorflow as tf
import io
import sys
import os

# Python 2 only: make UTF-8 the default string encoding so the
# Chinese text decodes without errors (see discussion below)
reload(sys)
sys.setdefaultencoding('utf-8')

#------------------------------- Data preprocessing ---------------------------#

poetry_file = 'poetry.txt'

# Collect the poems
poetrys = []
with io.open(poetry_file, "r", encoding='utf-8') as f:
    for line in f:
        try:
            title, content = line.strip().split(':')
            content = content.replace(' ', '')
            # Skip poems containing annotation markers or brackets
            if '_' in content or '(' in content or '（' in content or '《' in content or '[' in content:
                continue
            # Skip poems that are too short or too long
            if len(content) < 5 or len(content) > 79:
                continue
            # '[' and ']' mark the start and end of a poem
            content = '[' + content + ']'
            poetrys.append(content)
        except Exception as e:
            pass

# Sort the poems by length
poetrys = sorted(poetrys, key=lambda line: len(line))
print(u"Total number of Tang poems:")
print(len(poetrys))

# Count the occurrences of each character
all_words = []
for poetry in poetrys:
    all_words += [word for word in poetry]
counter = collections.Counter(all_words)
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
words, _ = zip(*count_pairs)

# Keep the most frequent characters (here: all of them) and append a
# space character used for padding
words = words[:len(words)] + (' ',)
# Map each character to a numeric ID
word_num_map = dict(zip(words, range(len(words))))
# Convert each poem to a vector of IDs; unknown characters map to len(words)
to_num = lambda word: word_num_map.get(word, len(words))
poetrys_vector = [list(map(to_num, poetry)) for poetry in poetrys]
#[[314, 3199, 367, 1556, 26, 179, 680, 0, 3199, 41, 506, 40, 151, 4, 98, 1],
#[339, 3, 133, 31, 302, 653, 512, 0, 37, 148, 294, 25, 54, 833, 3, 1, 965, 1315, 377, 1700, 562, 21, 37, 0, 2, 1253, 21, 36, 264, 877, 809, 1]
#....]

# Train on 64 poems at a time
batch_size = 64
n_chunk = len(poetrys_vector) // batch_size
x_batches = []
y_batches = []
for i in range(n_chunk):
    start_index = i * batch_size
    end_index = start_index + batch_size

    batches = poetrys_vector[start_index:end_index]
    length = max(map(len, batches))
    # Pad every poem in the batch to the same length with spaces
    xdata = np.full((batch_size, length), word_num_map[' '], np.int32)
    for row in range(batch_size):
        xdata[row, :len(batches[row])] = batches[row]
    # The target is the input shifted left by one character
    ydata = np.copy(xdata)
    ydata[:, :-1] = xdata[:, 1:]
    """
    xdata             ydata
    [6,2,4,6,9]       [2,4,6,9,9]
    [1,4,2,8,5]       [4,2,8,5,5]
    """
    x_batches.append(xdata)
    y_batches.append(ydata)

#--------------------------------------- RNN --------------------------------------#

input_data = tf.placeholder(tf.int32, [batch_size, None])
output_targets = tf.placeholder(tf.int32, [batch_size, None])

# Define the RNN
def neural_network(model='lstm', rnn_size=128, num_layers=2):
    if model == 'rnn':
        cell_fun = tf.nn.rnn_cell.BasicRNNCell
    elif model == 'gru':
        cell_fun = tf.nn.rnn_cell.GRUCell
    elif model == 'lstm':
        cell_fun = tf.nn.rnn_cell.BasicLSTMCell

    cell = cell_fun(rnn_size, state_is_tuple=True)
    cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)

    initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [rnn_size, len(words)+1])
        softmax_b = tf.get_variable("softmax_b", [len(words)+1])
        with tf.device("/gpu:0"):
            embedding = tf.get_variable("embedding", [len(words)+1, rnn_size])
            inputs = tf.nn.embedding_lookup(embedding, input_data)

    outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, scope='rnnlm')
    output = tf.reshape(outputs, [-1, rnn_size])

    logits = tf.matmul(output, softmax_w) + softmax_b
    probs = tf.nn.softmax(logits)
    return logits, last_state, probs, cell, initial_state

# saver.save requires the parent directory to exist, so create it up front
ckpt_dir = "./ckpt_dir"
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)

# Training
def train_neural_network():
    logits, last_state, _, _, _ = neural_network()
    targets = tf.reshape(output_targets, [-1])
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [targets], [tf.ones_like(targets, dtype=tf.float32)], len(words))
    cost = tf.reduce_mean(loss)
    learning_rate = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, tvars))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver(tf.global_variables())

        for epoch in range(295):
            # Decay the learning rate by 3% each epoch
            sess.run(tf.assign(learning_rate, 0.002 * (0.97 ** epoch)))
            for batch in range(n_chunk):
                train_loss, _, _ = sess.run([cost, last_state, train_op], feed_dict={input_data: x_batches[batch], output_targets: y_batches[batch]})
                print(epoch, batch, train_loss)
            if epoch % 7 == 0:
                saver.save(sess, ckpt_dir + '/poetry.module', global_step=epoch)

train_neural_network()

Here I'll only discuss my own thoughts on debugging and tuning; for a detailed understanding of the code itself, please refer to the original author.

The first issue is the #coding=utf-8 declaration: it tells the Python interpreter that the script file itself is encoded as UTF-8. Without it, the default ASCII setup will almost certainly raise encoding errors.

Next comes the UTF-8 decoding of the dataset. The file is opened with the encoding='utf-8' option, but the Python environment's default string encoding is still ASCII, so every parsed title and content raises an error and the processed dataset ends up with size 0. Setting sys's default encoding fixes this.

Also, Python 2's built-in open function has no encoding parameter; that option belongs to io.open, so this call has to be changed as well.
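For reference, here is a minimal sketch of both encoding fixes, assuming Python 2; on Python 3 neither is needed, since open accepts encoding directly and strings are Unicode by default.

import io
import sys

# sys.setdefaultencoding is deleted by site.py at interpreter startup,
# so the sys module must be reloaded before it can be called
reload(sys)
sys.setdefaultencoding('utf-8')

# On Python 2, open(path, 'r', encoding='utf-8') raises a TypeError;
# io.open accepts the encoding argument and yields unicode lines
with io.open('poetry.txt', 'r', encoding='utf-8') as f:
    first_line = f.readline()
print(type(first_line))  # <type 'unicode'>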

There are also some API-usage issues: for example, saver.save now requires the parent directory of the checkpoint path to exist.
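Concretely, this is the pattern the training listing above follows:

import os

ckpt_dir = './ckpt_dir'
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)  # saver.save fails if this parent directory is missing

# The first argument to saver.save is a path prefix, not a file name;
# TensorFlow appends '-<global_step>', producing e.g. poetry.module-294
# saver.save(sess, ckpt_dir + '/poetry.module', global_step=epoch)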

Next is the prediction (generation) code:

#coding=utf-8
import collections
import numpy as np
import tensorflow as tf
import io
import sys
import os
import pdb
import time

# Python 2 only: make UTF-8 the default string encoding
reload(sys)
sys.setdefaultencoding('utf-8')

#------------------------------- Data preprocessing ---------------------------#

poetry_file = 'poetry.txt'

# Collect the poems
poetrys = []
with io.open(poetry_file, "r", encoding='utf-8') as f:
    for line in f:
        try:
            title, content = line.strip().split(':')
            content = content.replace(' ', '')
            if '_' in content or '(' in content or '（' in content or '《' in content or '[' in content:
                continue
            if len(content) < 5 or len(content) > 79:
                continue
            content = '[' + content + ']'
            poetrys.append(content)
        except Exception as e:
            pass

# Sort the poems by length
poetrys = sorted(poetrys, key=lambda line: len(line))
print(u'Total number of Tang poems: ', len(poetrys))

# Count the occurrences of each character
all_words = []
for poetry in poetrys:
    all_words += [word for word in poetry]
counter = collections.Counter(all_words)
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
words, _ = zip(*count_pairs)

# Keep the most frequent characters (here: all of them) plus a padding space
words = words[:len(words)] + (' ',)
# Map each character to a numeric ID
word_num_map = dict(zip(words, range(len(words))))
# Convert each poem to a vector of IDs
to_num = lambda word: word_num_map.get(word, len(words))
poetrys_vector = [list(map(to_num, poetry)) for poetry in poetrys]
#[[314, 3199, 367, 1556, 26, 179, 680, 0, 3199, 41, 506, 40, 151, 4, 98, 1],
#[339, 3, 133, 31, 302, 653, 512, 0, 37, 148, 294, 25, 54, 833, 3, 1, 965, 1315, 377, 1700, 562, 21, 37, 0, 2, 1253, 21, 36, 264, 877, 809, 1]
#....]

# Batch size 1 for generation
batch_size = 1
n_chunk = len(poetrys_vector) // batch_size
x_batches = []
y_batches = []
for i in range(n_chunk):
    start_index = i * batch_size
    end_index = start_index + batch_size

    batches = poetrys_vector[start_index:end_index]
    length = max(map(len, batches))
    xdata = np.full((batch_size, length), word_num_map[' '], np.int32)
    for row in range(batch_size):
        xdata[row, :len(batches[row])] = batches[row]
    ydata = np.copy(xdata)
    ydata[:, :-1] = xdata[:, 1:]
    """
    xdata             ydata
    [6,2,4,6,9]       [2,4,6,9,9]
    [1,4,2,8,5]       [4,2,8,5,5]
    """
    x_batches.append(xdata)
    y_batches.append(ydata)

#--------------------------------------- RNN --------------------------------------#

input_data = tf.placeholder(tf.int32, [batch_size, None])
output_targets = tf.placeholder(tf.int32, [batch_size, None])

# Define the RNN (must match the architecture used during training)
def neural_network(model='lstm', rnn_size=128, num_layers=2):
    if model == 'rnn':
        cell_fun = tf.nn.rnn_cell.BasicRNNCell
    elif model == 'gru':
        cell_fun = tf.nn.rnn_cell.GRUCell
    elif model == 'lstm':
        cell_fun = tf.nn.rnn_cell.BasicLSTMCell

    cell = cell_fun(rnn_size, state_is_tuple=True)
    cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)

    initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [rnn_size, len(words)+1])
        softmax_b = tf.get_variable("softmax_b", [len(words)+1])
        with tf.device("/gpu:0"):
            embedding = tf.get_variable("embedding", [len(words)+1, rnn_size])
            inputs = tf.nn.embedding_lookup(embedding, input_data)

    outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, scope='rnnlm')
    output = tf.reshape(outputs, [-1, rnn_size])

    logits = tf.matmul(output, softmax_w) + softmax_b
    probs = tf.nn.softmax(logits)
    return logits, last_state, probs, cell, initial_state

#------------------------------- Generating poems ---------------------------------#
# Use the trained model

def gen_poetry():
    # Sample a character from the predicted distribution: searchsorted on the
    # cumulative sum picks index i with probability proportional to weights[i]
    def to_word(weights):
        t = np.cumsum(weights)
        s = np.sum(weights)
        sample = int(np.searchsorted(t, np.random.rand(1)*s))
        return words[sample]

    _, last_state, probs, cell, initial_state = neural_network()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver(tf.global_variables())
        saver.restore(sess, './ckpt_dir/poetry.module-294')

        state_ = sess.run(cell.zero_state(1, tf.float32))

        # Feed the start-of-poem marker '[' and keep sampling
        # until the end-of-poem marker ']' is produced
        x = np.array([list(map(word_num_map.get, '['))])
        [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
        word = to_word(probs_)
        #word = words[np.argmax(probs_)]
        poem = ''
        while word != ']':
            poem += word
            x = np.zeros((1, 1))
            x[0, 0] = word_num_map[word]
            [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
            word = to_word(probs_)
            #word = words[np.argmax(probs_)]
        return poem


def gen_poetry_with_head(head):
    def to_word(weights):
        t = np.cumsum(weights)
        s = np.sum(weights)
        sample = int(np.searchsorted(t, np.random.rand(1)*s))
        return words[sample]

    _, last_state, probs, cell, initial_state = neural_network()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver(tf.global_variables())
        saver.restore(sess, './ckpt_dir/poetry.module-294')

        state_ = sess.run(cell.zero_state(1, tf.float32))
        poem = ''
        i = 0
        # print head
        # pdb.set_trace()
        for word in head:
            # Generate until the model emits a comma or period
            while word != u'，' and word != u'。':
                poem += word
                # print poem
                # print head
                # print word
                x = np.array([list(map(word_num_map.get, word))])
                [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
                word = to_word(probs_)
                time.sleep(1)
            # Alternate the line-ending punctuation
            if i % 2 == 0:
                poem += u'，'
            else:
                poem += u'。'
            i += 1
        return poem

print(gen_poetry())
# print(gen_poetry_with_head(u'一二三四'))

The way this acrostic-poem (藏頭詩) code works is problematic, and I don't recommend using it as-is; it took me a long time to get it right. For now I'm keeping the original author's code as listed; I'll cover the adjustments and tuning for this part separately next time.
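That said, here is a rough sketch, entirely my own rewrite rather than the original author's fix, of the direction a repair can take: seed the network with the start marker '[', force each head character as the first character of its line, and sample until the model emits a line-ending punctuation mark. It assumes the neural_network, words, word_num_map, input_data, and initial_state definitions from the listing above.

def gen_poetry_with_head_fixed(head):
    # Hypothetical rewrite; reuses the graph and vocabulary defined above
    def to_word(weights):
        t = np.cumsum(weights)
        s = np.sum(weights)
        return words[int(np.searchsorted(t, np.random.rand(1) * s))]

    _, last_state, probs, cell, initial_state = neural_network()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        saver.restore(sess, './ckpt_dir/poetry.module-294')

        state_ = sess.run(cell.zero_state(1, tf.float32))

        def step(ch, state):
            # Feed one character; return the next-char distribution and new state
            x = np.array([[word_num_map.get(ch, len(words))]])
            return sess.run([probs, last_state],
                            feed_dict={input_data: x, initial_state: state})

        poem = ''
        _, state_ = step('[', state_)  # seed with the start-of-poem marker
        for i, ch in enumerate(head):
            poem += ch  # force the head character as the line's first character
            probs_, state_ = step(ch, state_)
            word = to_word(probs_)
            # Generate the rest of the line; cap the length as a safety net
            while word not in (u'，', u'。', ']') and len(poem) < 100:
                poem += word
                probs_, state_ = step(word, state_)
                word = to_word(probs_)
            poem += u'，' if i % 2 == 0 else u'。'
        return poem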

Result:

It has a certain flavor to it, but on closer inspection the problems are still serious: much of the output is incoherent, and the model's tuning is nowhere near adequate.
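One simple knob worth trying (my own suggestion, not something from the original post) is a sampling temperature in to_word: sharpening the output distribution before sampling usually cuts down on gibberish at the cost of some variety. A minimal sketch, assuming the same words vocabulary as above:

def to_word_with_temperature(weights, temperature=0.8):
    # temperature < 1 sharpens the distribution, > 1 flattens it
    weights = np.asarray(weights, dtype=np.float64).ravel()
    scaled = np.power(weights, 1.0 / temperature)
    scaled /= np.sum(scaled)  # renormalize to a probability distribution
    sample = np.random.choice(len(scaled), p=scaled)
    return words[sample]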

