VAE is a fascinating algorithm. Its core idea is somewhat like word2vec, except with an extra layer of Bayesian thinking on top, which in that respect makes it resemble LDA.
Personally, I think VAE has a lot of potential and many applications if you dig into it, because it is unsupervised in the true sense: once a sentence is encoded as a vector, you can do whatever you want with it.
To briefly introduce VAE: it uses variational inference to approximate the posterior over the latent variables, and the input and the output are both the sentence itself.
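To make that concrete, the objective being optimized is the standard variational lower bound (this is textbook VAE, nothing specific to this post):

\log p(x) \ge \mathbb{E}_{q(z \mid x)}\big[\log p(x \mid z)\big] - \mathrm{KL}\big(q(z \mid x)\,\|\,p(z)\big)

In the code below, g_loss corresponds to the negative reconstruction term and e_loss is the KL term against a standard Gaussian prior, computed in closed form.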
Below is the simplest possible VAE implementation, using 1-grams: each sentence is represented as a 1 × vocabulary-size vector of word counts.
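Just to be clear about the input format, here is a minimal sketch of what such a 1-gram (bag-of-words) vector looks like; the vocab dict and the sentence_to_bow helper are made up for illustration, while the real code below uses a TextReader class instead:

import numpy as np

# hypothetical vocabulary: word -> column index
vocab = {'i': 0, 'like': 1, 'deep': 2, 'learning': 3, 'very': 4, 'much': 5}

def sentence_to_bow(tokens, vocab):
    """Turn a tokenized sentence into a 1 x vocab_size count vector."""
    vec = np.zeros((1, len(vocab)), dtype=np.int64)
    for tok in tokens:
        if tok in vocab:          # out-of-vocabulary words are simply dropped
            vec[0, vocab[tok]] += 1
    return vec

print(sentence_to_bow(['i', 'like', 'deep', 'learning'], vocab))
# [[1 1 1 1 0 0]]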
It is implemented in TensorFlow. I am currently developing a seq2seq-based VAE; there are a few on GitHub, but none of the TensorFlow ones is written to my satisfaction.
#encoding=utf-8
import numpy as np
import tensorflow as tf
from reader import TextReader

embed_dim = 500            # size of the encoder hidden layers
h_dim = 100                # size of the latent code
data_path = './n_gram/'
model_dir = './n_gram/model_dir/'
reader = TextReader(data_path)

def create_train_op(loss):
    # joint training op over the full loss (encoder + decoder variables)
    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=tf.contrib.framework.get_global_step(),
        learning_rate=0.01,
        clip_gradients=10.0,
        optimizer="Adam")
    return train_op

global_step = tf.Variable(0, name='global_step', trainable=False)

# bag-of-words input: one row of word counts per sentence
tx = tf.placeholder(tf.int64, [None, reader.vocab_size])
x = tf.to_float(tx)
batch_size = tf.placeholder(tf.int32, [])   # number of sentences in the batch
w = tf.placeholder(tf.float32)              # reserved for KL annealing (currently unused)

with tf.variable_scope('encoder'):
    # two tanh layers mapping the bag-of-words vector to a dense representation
    w_1 = tf.get_variable('w_1', [reader.vocab_size, embed_dim], initializer=tf.truncated_normal_initializer())
    b_1 = tf.get_variable('b_1', [embed_dim], initializer=tf.truncated_normal_initializer())
    L1 = tf.nn.tanh(tf.nn.bias_add(tf.matmul(x, w_1), b_1))
    w_2 = tf.get_variable('w_2', [embed_dim, embed_dim], initializer=tf.truncated_normal_initializer())
    b_2 = tf.get_variable('b_2', [embed_dim], initializer=tf.truncated_normal_initializer())
    L2 = tf.nn.tanh(tf.nn.bias_add(tf.matmul(L1, w_2), b_2))

    # mean and log-variance of the approximate posterior q(z|x)
    w_encoder_mu = tf.get_variable('w_encoder_mu', [embed_dim, h_dim], initializer=tf.truncated_normal_initializer(0, 0.01))
    b_encoder_mu = tf.get_variable('b_encoder_mu', [h_dim], initializer=tf.truncated_normal_initializer(0, 0.001))
    w_encoder_var = tf.get_variable('w_encoder_var', [embed_dim, h_dim], initializer=tf.truncated_normal_initializer(0, 0.01))
    b_encoder_var = tf.get_variable('b_encoder_var', [h_dim], initializer=tf.truncated_normal_initializer(0, 0.01))
    mu = tf.nn.bias_add(tf.matmul(L2, w_encoder_mu), b_encoder_mu)
    log_sigma_sq = tf.nn.bias_add(tf.matmul(L2, w_encoder_var), b_encoder_var)

    # reparameterization trick: h = mu + sigma * eps, eps ~ N(0, I)
    eps = tf.random_normal([batch_size, h_dim], 0, 1, dtype=tf.float32)
    sigma = tf.sqrt(tf.exp(log_sigma_sq))
    h = mu + sigma * eps

with tf.variable_scope('decoder'):
    # linear decoder: a softmax over the vocabulary, scored against the observed word counts
    R = tf.get_variable('R', [h_dim, reader.vocab_size], initializer=tf.truncated_normal_initializer(0, 0.0001))
    b = tf.get_variable('b', [reader.vocab_size], initializer=tf.truncated_normal_initializer(0, 0.0001))
    e = -tf.matmul(h, R) + b
    p_x_i = tf.nn.softmax(e)

# KL divergence between q(z|x) and the standard normal prior
e_loss = -0.5 * tf.reduce_sum(1.0 + log_sigma_sq - tf.square(mu) - tf.exp(log_sigma_sq), 1)
# negative log-likelihood of the observed word counts (reconstruction loss)
g_loss = -tf.reduce_sum(tf.log(p_x_i + 1e-10) * x, 1)
g_loss_stand = -tf.log(1.0 / tf.reduce_sum(x, 1)) * tf.reduce_sum(x, 1)
#g_loss = g_loss/tf.maximum(g_loss_stand, 1.0)
e_loss_mean = tf.reduce_mean(e_loss)
g_loss_mean = tf.reduce_mean(g_loss)
loss = tf.reduce_mean(0.1 * e_loss + g_loss)   # down-weight the KL term relative to reconstruction

# split the variables so the two losses can also be optimized alternately
encoder_var_list = []
decoder_var_list = []
for var in tf.trainable_variables():
    if 'encoder' in var.name:
        encoder_var_list.append(var)
    elif 'decoder' in var.name:
        decoder_var_list.append(var)

optim_e = tf.train.AdamOptimizer(learning_rate=0.05).minimize(e_loss, global_step=global_step, var_list=encoder_var_list)
optim_g = tf.train.AdamOptimizer(learning_rate=0.05).minimize(g_loss, global_step=global_step, var_list=decoder_var_list)
train_op = create_train_op(loss)
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    ckpt = tf.train.get_checkpoint_state(model_dir)
    if ckpt and ckpt.model_checkpoint_path:
        print('the model being restored is')
        print(ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
        print('successfully restored the session')
    count = global_step.eval()
    for k in range(100000):          # number of training iterations
        data, length = reader.iterator()
        em, gm, lm, _ = sess.run([e_loss_mean, g_loss_mean, loss, train_op],
                                 feed_dict={tx: data, batch_size: length, w: k / 1000.0})
        print('After\t' + str(global_step.eval()) + ' th step, the loss\t' + str(lm) +
              '\tKL loss\t' + str(em) + '\tdecoder loss\t' + str(gm))
        # keep global_step in sync with the manual counter (train_op increments it internally)
        global_step.assign(count).eval()
        if k % 10 == 0:
            saver.save(sess, model_dir + 'model.ckpt', global_step=global_step)
        count += 1
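Once training has run, the sentence vector is simply the encoder mean mu. A rough usage sketch, assuming the graph above has already been built and a checkpoint exists under model_dir (the bow array here is a dummy stand-in for real bag-of-words rows built with the same vocabulary as the TextReader):

# reuse the tensors defined in the script above
bow = np.zeros((2, reader.vocab_size), dtype=np.int64)   # placeholder batch of 2 sentences
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint(model_dir))
    sentence_vectors = sess.run(mu, feed_dict={tx: bow, batch_size: bow.shape[0]})
    print(sentence_vectors.shape)   # (2, h_dim): one h_dim-dimensional vector per sentence

These vectors can then be fed into clustering, retrieval, or any downstream task, which is the whole point of the unsupervised setup described at the top.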