Unlike traditional neural networks, whose inputs and outputs are fixed-size, RNNs let us work with sequences of vectors as both input and output. RNNs were created to model sequential data.
Sequentiality of samples: the samples are ordered, and each sample is related to the samples that come before it. For example, in text, a word is related to the words that precede it; in weather data, a day's temperature is related to the temperatures of the preceding days.
For example, this post uses an RNN to generate classical Chinese poetry: feed it a corpus of classical poems and it learns to generate characters that relate to what came before. Feed it a list of names and it learns to generate names; feed it classical music or lyrics and it learns to generate classical music or lyrics; you can even feed it source code.
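To make the sequence-modeling idea concrete, here is a toy sketch (not from the original post) of how a character-level model frames the problem: each input character is paired with the following character as its prediction target, which is exactly the shift used to build xdata/ydata in the training code further down.

# Toy illustration of next-character modeling: every position in the
# input predicts the character one step ahead.
text = '床前明月光'
xdata = text[:-1]   # model inputs:  床 前 明 月
ydata = text[1:]    # model targets: 前 明 月 光
for x, y in zip(xdata, ydata):
    print(x, '->', y)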
More on RNNs:
- TensorFlow Exercise 3: RNN, Recurrent Neural Networks
- http://karpathy.github.io/2015/05/21/rnn-effectiveness/
The code in this post is ported from char-rnn, a Torch-based model for English text; with minor modifications it works for Chinese. char-rnn takes a text file as input, trains an RNN model on it, and then uses the model to generate text similar to the training data.
Dataset used: Complete Tang Poems (全唐詩, 43,030 poems): https://pan.baidu.com/s/1o7QlUhO
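The preprocessing below assumes each line of poetry.txt has the form title:content. A hypothetical sample line (just to illustrate the parsing; the real file comes from the Baidu link above):

# Hypothetical sample line in the format the preprocessing expects:
line = '靜夜思:床前明月光,疑是地上霜。舉頭望明月,低頭思故鄉。'
title, content = line.strip().split(':')
print(title)    # 靜夜思
print(content)  # 床前明月光,疑是地上霜。...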
Training:
import os
import collections
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import seq2seq
import time

#------------------------------- data preprocessing ---------------------------#

poetry_file = 'poetry.txt'

# the poem collection
poetrys = []
with open(poetry_file, "r", encoding='utf-8') as f:
    for line in f:
        try:
            title, content = line.strip().split(':')
            content = content.replace(' ', '')
            # skip poems containing markup or brackets, and very short/long ones
            if '_' in content or '(' in content or '(' in content or '《' in content or '[' in content:
                continue
            if len(content) < 5 or len(content) > 79:
                continue
            content = '[' + content + ']'
            poetrys.append(content)
        except Exception as e:
            pass

# sort poems by length
poetrys = sorted(poetrys, key=lambda line: len(line))
print('Total Tang poems: ', len(poetrys))

# count how often each character occurs
all_words = []
for poetry in poetrys:
    all_words += [word for word in poetry]
counter = collections.Counter(all_words)
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
words, _ = zip(*count_pairs)

# keep the most common characters (here: all of them) plus a space for padding
words = words[:len(words)] + (' ',)
# map each character to a numeric ID
word_num_map = dict(zip(words, range(len(words))))
# convert poems to vectors of IDs (cf. TensorFlow Exercise 1)
to_num = lambda word: word_num_map.get(word, len(words))
poetrys_vector = [list(map(to_num, poetry)) for poetry in poetrys]
#[[314, 3199, 367, 1556, 26, 179, 680, 0, 3199, 41, 506, 40, 151, 4, 98, 1],
# [339, 3, 133, 31, 302, 653, 512, 0, 37, 148, 294, 25, 54, 833, 3, 1, 965, 1315, 377, 1700, 562, 21, 37, 0, 2, 1253, 21, 36, 264, 877, 809, 1]
# ...]

# train on 64 poems at a time
batch_size = 64
n_chunk = len(poetrys_vector) // batch_size
x_batches = []
y_batches = []
for i in range(n_chunk):
    start_index = i * batch_size
    end_index = start_index + batch_size

    batches = poetrys_vector[start_index:end_index]
    length = max(map(len, batches))
    # pad every poem in the batch to the same length with the space ID
    xdata = np.full((batch_size, length), word_num_map[' '], np.int32)
    for row in range(batch_size):
        xdata[row, :len(batches[row])] = batches[row]
    # targets are the inputs shifted one step to the left
    ydata = np.copy(xdata)
    ydata[:, :-1] = xdata[:, 1:]
    """
    xdata             ydata
    [6,2,4,6,9]       [2,4,6,9,9]
    [1,4,2,8,5]       [4,2,8,5,5]
    """
    x_batches.append(xdata)
    y_batches.append(ydata)

#--------------------------------------- RNN --------------------------------------#

input_data = tf.placeholder(tf.int32, [batch_size, None])
output_targets = tf.placeholder(tf.int32, [batch_size, None])

# define the RNN
def neural_network(model='lstm', rnn_size=128, num_layers=2):
    if model == 'rnn':
        cell_fun = rnn_cell.BasicRNNCell
    elif model == 'gru':
        cell_fun = rnn_cell.GRUCell
    elif model == 'lstm':
        cell_fun = rnn_cell.BasicLSTMCell

    cell = cell_fun(rnn_size, state_is_tuple=True)
    cell = rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)

    initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [rnn_size, len(words) + 1])
        softmax_b = tf.get_variable("softmax_b", [len(words) + 1])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [len(words) + 1, rnn_size])
            inputs = tf.nn.embedding_lookup(embedding, input_data)

    outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, scope='rnnlm')
    output = tf.reshape(outputs, [-1, rnn_size])

    logits = tf.matmul(output, softmax_w) + softmax_b
    probs = tf.nn.softmax(logits)
    return logits, last_state, probs, cell, initial_state

# training
def train_neural_network():
    logits, last_state, _, _, _ = neural_network()
    targets = tf.reshape(output_targets, [-1])
    loss = seq2seq.sequence_loss_by_example([logits], [targets],
                                            [tf.ones_like(targets, dtype=tf.float32)], len(words))
    cost = tf.reduce_mean(loss)
    learning_rate = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, tvars))

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())

        saver = tf.train.Saver(tf.all_variables())

        for epoch in range(50):
            # decay the learning rate exponentially each epoch
            sess.run(tf.assign(learning_rate, 0.002 * (0.97 ** epoch)))
            n = 0
            for batche in range(n_chunk):
                train_loss, _, _ = sess.run([cost, last_state, train_op],
                                            feed_dict={input_data: x_batches[n], output_targets: y_batches[n]})
                n += 1
                print(epoch, batche, train_loss)
            if epoch % 7 == 0:
                saver.save(sess, 'poetry.module', global_step=epoch)

train_neural_network()
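One detail worth noting in train_neural_network: the learning rate decays exponentially, 0.002 · 0.97^epoch. A quick sketch (values approximate) of what that schedule looks like over the 50 epochs:

# Exponential learning-rate decay used during training: 0.002 * 0.97**epoch
for epoch in (0, 10, 25, 49):
    print(epoch, 0.002 * (0.97 ** epoch))
# 0  -> 0.002
# 10 -> ~0.00147
# 25 -> ~0.00093
# 49 -> ~0.00045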
Generating poems with the trained model:
import os
import collections
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import seq2seq
import time

#------------------------------- data preprocessing ---------------------------#

poetry_file = 'poetry.txt'

# the poem collection
poetrys = []
with open(poetry_file, "r", encoding='utf-8') as f:
    for line in f:
        try:
            title, content = line.strip().split(':')
            content = content.replace(' ', '')
            # skip poems containing markup or brackets, and very short/long ones
            if '_' in content or '(' in content or '(' in content or '《' in content or '[' in content:
                continue
            if len(content) < 5 or len(content) > 79:
                continue
            content = '[' + content + ']'
            poetrys.append(content)
        except Exception as e:
            pass

# sort poems by length
poetrys = sorted(poetrys, key=lambda line: len(line))
print('Total Tang poems: ', len(poetrys))

# count how often each character occurs
all_words = []
for poetry in poetrys:
    all_words += [word for word in poetry]
counter = collections.Counter(all_words)
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
words, _ = zip(*count_pairs)

# keep the most common characters (here: all of them) plus a space for padding
words = words[:len(words)] + (' ',)
# map each character to a numeric ID
word_num_map = dict(zip(words, range(len(words))))
# convert poems to vectors of IDs (cf. TensorFlow Exercise 1)
to_num = lambda word: word_num_map.get(word, len(words))
poetrys_vector = [list(map(to_num, poetry)) for poetry in poetrys]
#[[314, 3199, 367, 1556, 26, 179, 680, 0, 3199, 41, 506, 40, 151, 4, 98, 1],
# [339, 3, 133, 31, 302, 653, 512, 0, 37, 148, 294, 25, 54, 833, 3, 1, 965, 1315, 377, 1700, 562, 21, 37, 0, 2, 1253, 21, 36, 264, 877, 809, 1]
# ...]

# batch size of 1 for generation
batch_size = 1
n_chunk = len(poetrys_vector) // batch_size
x_batches = []
y_batches = []
for i in range(n_chunk):
    start_index = i * batch_size
    end_index = start_index + batch_size

    batches = poetrys_vector[start_index:end_index]
    length = max(map(len, batches))
    xdata = np.full((batch_size, length), word_num_map[' '], np.int32)
    for row in range(batch_size):
        xdata[row, :len(batches[row])] = batches[row]
    ydata = np.copy(xdata)
    ydata[:, :-1] = xdata[:, 1:]
    """
    xdata             ydata
    [6,2,4,6,9]       [2,4,6,9,9]
    [1,4,2,8,5]       [4,2,8,5,5]
    """
    x_batches.append(xdata)
    y_batches.append(ydata)

#--------------------------------------- RNN --------------------------------------#

input_data = tf.placeholder(tf.int32, [batch_size, None])
output_targets = tf.placeholder(tf.int32, [batch_size, None])

# define the RNN
def neural_network(model='lstm', rnn_size=128, num_layers=2):
    if model == 'rnn':
        cell_fun = rnn_cell.BasicRNNCell
    elif model == 'gru':
        cell_fun = rnn_cell.GRUCell
    elif model == 'lstm':
        cell_fun = rnn_cell.BasicLSTMCell

    cell = cell_fun(rnn_size, state_is_tuple=True)
    cell = rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)

    initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [rnn_size, len(words) + 1])
        softmax_b = tf.get_variable("softmax_b", [len(words) + 1])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [len(words) + 1, rnn_size])
            inputs = tf.nn.embedding_lookup(embedding, input_data)

    outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, scope='rnnlm')
    output = tf.reshape(outputs, [-1, rnn_size])

    logits = tf.matmul(output, softmax_w) + softmax_b
    probs = tf.nn.softmax(logits)
    return logits, last_state, probs, cell, initial_state

#------------------------------- generate poems ---------------------------------#
# use the trained model
def gen_poetry():
    def to_word(weights):
        # sample a character index in proportion to its probability
        t = np.cumsum(weights)
        s = np.sum(weights)
        sample = int(np.searchsorted(t, np.random.rand(1) * s))
        return words[sample]

    _, last_state, probs, cell, initial_state = neural_network()

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())

        saver = tf.train.Saver(tf.all_variables())
        saver.restore(sess, 'poetry.module-49')

        state_ = sess.run(cell.zero_state(1, tf.float32))

        # start generation from the '[' marker that opens every training poem
        x = np.array([list(map(word_num_map.get, '['))])
        [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
        word = to_word(probs_)
        #word = words[np.argmax(probs_)]
        poem = ''
        while word != ']':
            poem += word
            x = np.zeros((1, 1))
            x[0, 0] = word_num_map[word]
            [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
            word = to_word(probs_)
            #word = words[np.argmax(probs_)]
        return poem

print(gen_poetry())
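The inner to_word helper samples the next character from the softmax distribution rather than always taking the argmax (the commented-out alternative), which keeps the output varied across runs. Here is the same cumulative-sum sampling trick in isolation:

import numpy as np

# Turn a distribution into cumulative intervals and pick the interval
# that a uniform random draw falls into -- exactly what to_word does.
weights = np.array([0.1, 0.6, 0.3])
t = np.cumsum(weights)                                  # [0.1, 0.7, 1.0]
s = np.sum(weights)
sample = int(np.searchsorted(t, np.random.rand(1) * s))
print(sample)  # 1 about 60% of the time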
A generated "poem" (at least the format is right):
新犬隨風起,一璃跡陣悲。
淺昏罷庄哉,清插去園空。
雙葉坐成鑒,王妓水正苑。
鳥聲不成影,胙灘朱瓮聲。
無斑紅蕪踏,那期日正閑。
吾燕登無士,無處得趙名。
Generating acrostic poems (藏頭詩):
def gen_poetry_with_head(head):
    def to_word(weights):
        # sample a character index in proportion to its probability
        t = np.cumsum(weights)
        s = np.sum(weights)
        sample = int(np.searchsorted(t, np.random.rand(1) * s))
        return words[sample]

    _, last_state, probs, cell, initial_state = neural_network()

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())

        saver = tf.train.Saver(tf.all_variables())
        saver.restore(sess, 'poetry.module-7')

        state_ = sess.run(cell.zero_state(1, tf.float32))
        poem = ''
        i = 0
        for word in head:
            # start the line with the head character, then keep sampling
            # until the model emits punctuation
            while word != ',' and word != '。':
                poem += word
                x = np.array([list(map(word_num_map.get, word))])
                [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
                word = to_word(probs_)
                time.sleep(1)
            if i % 2 == 0:
                poem += ','
            else:
                poem += '。'
            i += 1
        return poem

print(gen_poetry_with_head('一二三四'))
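The control flow of gen_poetry_with_head: each head character starts a line verbatim, the model then samples until it produces punctuation, and lines alternate between ',' and '。' endings. A toy illustration of just that punctuation scheme (placeholder text, no model involved):

# Toy illustration of the acrostic punctuation: even-numbered lines end
# with ',' and odd-numbered lines with '。' ('...' stands in for sampled text).
head = '一二三四'
poem = ''
for i, word in enumerate(head):
    poem += word + '...'
    poem += ',' if i % 2 == 0 else '。'
print(poem)  # 一...,二...。三...,四...。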