First create a readers.py file; the program that follows imports the functions defined in it.
from __future__ import absolute_import, division, print_function

import numpy as np
import pandas as pd


def read_file(filename, sep="\t"):
    col_names = ["user", "item", "rate", "st"]
    df = pd.read_csv(filename, sep=sep, header=None, names=col_names, engine='python')
    # Shift IDs to be 0-based
    df["user"] -= 1
    df["item"] -= 1
    for col in ("user", "item"):
        df[col] = df[col].astype(np.int32)
    df["rate"] = df["rate"].astype(np.float32)
    return df


class ShuffleIterator(object):
    """
    Randomly generate batches
    """
    def __init__(self, inputs, batch_size=10):
        self.inputs = inputs
        self.batch_size = batch_size
        self.num_cols = len(self.inputs)
        self.len = len(self.inputs[0])
        self.inputs = np.transpose(np.vstack([np.array(self.inputs[i])
                                              for i in range(self.num_cols)]))

    def __len__(self):
        return self.len

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        ids = np.random.randint(0, self.len, (self.batch_size,))
        out = self.inputs[ids, :]
        return [out[:, i] for i in range(self.num_cols)]


class OneEpochIterator(ShuffleIterator):
    """
    Sequentially generate one-epoch batches, typically for test data
    """
    def __init__(self, inputs, batch_size=10):
        super(OneEpochIterator, self).__init__(inputs, batch_size=batch_size)
        if batch_size > 0:
            self.idx_group = np.array_split(np.arange(self.len),
                                            int(np.ceil(self.len / batch_size)))
        else:
            self.idx_group = [np.arange(self.len)]
        self.group_id = 0

    def next(self):
        if self.group_id >= len(self.idx_group):
            self.group_id = 0
            raise StopIteration
        out = self.inputs[self.idx_group[self.group_id], :]
        self.group_id += 1
        return [out[:, i] for i in range(self.num_cols)]
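A quick way to sanity-check readers.py before wiring it into the training script is to run it on a few made-up rows (the toy lists below are illustrative only, not from the real dataset):

import readers

# Three made-up columns: user IDs, item IDs, ratings
users = [0, 1, 2, 3, 4, 5]
items = [10, 11, 12, 13, 14, 15]
rates = [5.0, 3.0, 4.0, 2.0, 5.0, 1.0]

# Random batches of size 2, drawn with replacement
it = readers.ShuffleIterator([users, items, rates], batch_size=2)
u, i, r = next(it)
print(u, i, r)

# One sequential pass over all rows, in groups
for u, i, r in readers.OneEpochIterator([users, items, rates], batch_size=4):
    print(u, i, r)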
The data describes users and the movies they rated (the MovieLens 1M ratings file).
# Data I/O helpers
from collections import deque
from six import next

# readers.py created above
import readers

# Main imports for training
import tensorflow as tf
import numpy as np

# For timing each training epoch
import time
# Fixed seed so the results can be reproduced
np.random.seed(42)

# MovieLens 1M: 6,040 users, movie IDs up to 3,952
u_num = 6040
i_num = 3952

batch_size = 1000
# Number of latent dimensions (factors)
dims = 5
# Maximum number of training epochs
max_epochs = 50
# Device to place the ops on
place_device = "/cpu:0"
1. Load the data and split it into training and test sets
def get_data():
    # Columns are user ID, item ID, rating, and timestamp
    # Sample row: 3::1196::4::978297539
    df = readers.read_file("C:/Users/Administrator/.surprise_data/ml-1m/ratings.dat", sep="::")
    # Number of rows, used below to split into training and test sets
    rows = len(df)
    # Purely integer-location based indexing: this simply shuffles the rows
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    # 90% for training, 10% for testing
    split_index = int(rows * 0.9)
    # Use indices to separate the data
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    return df_train, df_test


def clip(x):
    return np.clip(x, 1.0, 5.0)
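clip is used later to snap raw predictions back into the valid 1-to-5 star range; a tiny example with made-up values:

# Hypothetical raw predictions outside the rating scale
preds = np.array([0.3, 2.7, 6.1], dtype=np.float32)
print(clip(preds))  # values below 1 become 1.0, values above 5 become 5.0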
2. Define the model, returning the prediction and the regularization term
def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    # Keep the embedding tables on the CPU
    with tf.device("/cpu:0"):
        # Variable scope
        with tf.variable_scope('lsi', reuse=tf.AUTO_REUSE):
            # Global bias
            # get_variable: prefixes the name with the current variable scope
            # and performs reuse checks
            bias_global = tf.get_variable("bias_global", shape=[])
            # Per-user and per-movie bias tables
            w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
            w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
            # Biases for the users and movies in one batch
            bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
            bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
            # Latent-factor weight tables for users and movies
            w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
            w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
            # Weight embeddings for the users and movies in the given batch
            embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
            embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
    with tf.device(device):
        # Dot product of user and item embeddings: sum over the latent dimension
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        # L2 regularization term
        # l2_loss: computes half the squared L2 norm of a tensor
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item),
                             name="svd_regularizer")
    # Return the prediction and the regularization term
    return infer, regularizer
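Written out (notation introduced here for exposition, not from the code), the prediction and regularizer built above are the usual biased matrix-factorization quantities. For a batch B of (user, item) pairs:

\hat{r}_{ui} = b + b_u + b_i + \mathbf{p}_u^{\top}\mathbf{q}_i,
\qquad
\Omega = \sum_{(u,i)\in B} \tfrac{1}{2}\bigl(\lVert\mathbf{p}_u\rVert^2 + \lVert\mathbf{q}_i\rVert^2\bigr)

where b is bias_global, b_u and b_i are the per-user and per-movie biases, and p_u, q_i are the dim-dimensional embedding rows. Note that tf.nn.l2_loss already includes the factor of 1/2.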
3. Define the loss function
def loss(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device="/cpu:0"):
    with tf.device(device):
        # L2 distance between predictions (infer) and true ratings (rate_batch)
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        # L2 penalty coefficient
        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
        # Total loss = data loss + regularization term * penalty
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        # Train with gradient descent
        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    return cost, train_op
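Putting the pieces together, the objective minimized by the gradient-descent step above is (with \hat{r}_{ui} and \Omega as defined in the previous note):

L = \tfrac{1}{2}\sum_{(u,i)\in B}\bigl(\hat{r}_{ui} - r_{ui}\bigr)^2 + \lambda\,\Omega

where \lambda is the reg argument (0.05 in the run below) and the learning rate is 0.001.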
4. Read the data and build the TensorFlow model
# Read data from the ratings file to build the TensorFlow model
df_train, df_test = get_data()
samples_per_batch = len(df_train) // batch_size
print("Number of train samples %d, test samples %d, samples per batch %d"
      % (len(df_train), len(df_test), samples_per_batch))
Number of train samples 900188, test samples 100021, samples per batch 900
# Look at the first 5 user values
print(df_train["user"].head())
print(df_test["user"].head())
0    1834
1    5836
2    1266
3    2468
4     117
Name: user, dtype: int32
0    5062
1     251
2    5831
3    2243
4    4903
Name: user, dtype: int32
# Look at the first 5 item values
print(df_train["item"].head())
print(df_test["item"].head())
0    1213
1     995
2     355
3    2040
4    2670
Name: item, dtype: int32
0    2917
1     291
2    2027
3    2310
4    1930
Name: item, dtype: int32
# Look at the first 5 rating values
print(df_train["rate"].head())
print(df_test["rate"].head())
0    5.0
1    4.0
2    2.0
3    5.0
4    4.0
Name: rate, dtype: float32
0    5.0
1    4.0
2    4.0
3    3.0
4    5.0
Name: rate, dtype: float32
5. Training
# Use a shuffle iterator to generate random batches for training
iter_train = readers.ShuffleIterator([df_train["user"],
                                      df_train["item"],
                                      df_train["rate"]],
                                     batch_size=batch_size)

# Sequentially generate one-epoch batches for testing
iter_test = readers.OneEpochIterator([df_test["user"],
                                      df_test["item"],
                                      df_test["rate"]],
                                     batch_size=-1)

# Create placeholders
user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])

infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num,
                           dim=dims, device=place_device)
_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.0010, reg=0.05,
                   device=place_device)
6. Create the session
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    print("%s\t%s\t%s\t%s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(max_epochs * samples_per_batch):
        users, items, rates = next(iter_train)
        _, pred_batch = sess.run([train_op, infer],
                                 feed_dict={user_batch: users,
                                            item_batch: items,
                                            rate_batch: rates})
        pred_batch = clip(pred_batch)
        errors.append(np.power(pred_batch - rates, 2))
        # Once per epoch, report the train RMSE and evaluate on the test set
        if i % samples_per_batch == 0:
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                        item_batch: items})
                pred_batch = clip(pred_batch)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            print("%02d\t%.3f\t\t%.3f\t\t%.3f secs"
                  % (i // samples_per_batch, train_err,
                     np.sqrt(np.mean(test_err2)), end - start))
            start = end
    saver.save(sess, './save/')
Epoch   Train Error     Val Error       Elapsed Time
00      2.782           1.119           0.053 secs
01      1.046           1.007           0.619 secs
02      0.981           0.973           0.656 secs
03      0.955           0.954           0.602 secs
04      0.941           0.943           0.592 secs
05      0.931           0.937           0.585 secs
06      0.926           0.932           0.589 secs
07      0.921           0.928           0.604 secs
08      0.917           0.927           0.612 secs
09      0.916           0.924           0.610 secs
10      0.914           0.922           0.657 secs
11      0.910           0.920           0.715 secs
12      0.909           0.919           0.802 secs
13      0.909           0.918           0.651 secs
14      0.907           0.917           0.600 secs
15      0.907           0.917           0.688 secs
16      0.906           0.918           0.668 secs
17      0.905           0.917           0.595 secs
18      0.903           0.915           0.607 secs
19      0.905           0.919           0.594 secs
20      0.903           0.915           0.621 secs
21      0.903           0.914           0.634 secs
22      0.902           0.915           0.651 secs
23      0.903           0.913           0.680 secs
24      0.902           0.914           0.586 secs
25      0.902           0.914           0.604 secs
26      0.901           0.913           0.663 secs
27      0.902           0.915           0.734 secs
28      0.901           0.915           0.752 secs
29      0.901           0.913           0.700 secs
30      0.900           0.913           0.616 secs
31      0.900           0.913           0.598 secs
32      0.900           0.912           0.673 secs
33      0.901           0.912           0.591 secs
34      0.900           0.912           0.673 secs
35      0.899           0.912           0.694 secs
36      0.899           0.912           0.653 secs
37      0.898           0.913           0.673 secs
38      0.899           0.913           0.590 secs
39      0.900           0.913           0.691 secs
40      0.899           0.912           0.801 secs
41      0.899           0.912           1.011 secs
42      0.899           0.912           0.593 secs
43      0.899           0.912           0.620 secs
44      0.900           0.912           0.620 secs
45      0.899           0.912           0.613 secs
46      0.899           0.912           0.811 secs
47      0.899           0.912           0.652 secs
48      0.899           0.912           0.592 secs
49      0.899           0.911           0.630 secs
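After training, the checkpoint written by saver.save can be reloaded to score individual user/movie pairs. A minimal sketch, assuming the graph, saver, and clip defined above are still in scope (the user and movie IDs are arbitrary examples):

with tf.Session() as sess:
    # Restore the variables saved at the end of training
    saver.restore(sess, './save/')
    # Predict how user 12 would rate movie 356 (0-based IDs, as produced by readers.read_file)
    pred = sess.run(infer, feed_dict={user_batch: np.array([12], dtype=np.int32),
                                      item_batch: np.array([356], dtype=np.int32)})
    # Clip back to the valid 1-5 rating range before using the score
    print(clip(pred))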