使用PyTorch框架實現MF模型在MovieLens數據集上的電影評分預測


一、MF介紹

(1)實驗的主要任務:使用MF模型在數據集合上的評分預測(movielens,隨機80%訓練數據,20%測試數據,隨機構造 Koren的經典模型)

(2)參考論文:MATRIX  FACTORIZATION TECHNIQUES FOR RECOMMENDER SYSTEMS

簡單模型:難點在於構造 $q_i$ 和 $p_u$,通過 $\hat{r}_{ui} = q_i^{T} p_u$ 來預測評分 $r_{ui}$。在構造 $q_i$ 和 $p_u$ 時,對於每個 user 和每個 item 分別構造一個包含 k 個特征因子的 vector。

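目標函數為(只對已知評分的集合 $\kappa$ 求和):

$$\min_{q^{*},\,p^{*}} \sum_{(u,i)\in\kappa} \left(r_{ui} - q_i^{T} p_u\right)^{2} + \lambda\left(\lVert q_i\rVert^{2} + \lVert p_u\rVert^{2}\right)$$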

(3)部署環境:Python 3.7 + PyTorch 1.3

(4)數據集:MovieLens small 數據集,數據集按照8:2的比例進行划分,隨機挑選80%的數據當做訓練集,剩余的20%當做測試集。(數據下載網址:https://grouplens.org/datasets/movielens/)

5)代碼結構:

進行數據預處理以及數據划分的代碼在load_data.py文件中,划分之后得到ratings_train.csv和ratings_test.csv兩個文件(數據集的划分是在抽樣之后的數據集ratings_sample.csv上進行的)。(data文件夾下的ratings.csv為原始數據集,處理過程中會得到一些中間文件:ratingsNoHead.csv文件為去掉數據集的表頭得到的文件;ratings_sample.csv文件為從原始數據中隨機選取1%的數據作為實驗數據。)

mf.py文件是讀取訓練集以及測試集,並使用pytorch框架編寫MF訓練模型,最后使用rmse作為評價指標,使用測試集對模型進行測試。模型訓練過程中采用batch對數據集進行分批訓練,最終以曲線的形式展現出來。最終測試集的曲線圖如下圖所示:

6)評價標准:采用rmse作為評價指標,使用測試集對模型進行測試。(實驗只使用了數據集中的一部分數據,同樣也使用了完整的數據集進行了測試,測試誤差為0.46。由於數據集較大,這里只上傳使用的部分數據集。)訓練集與測試集的rmse結果為:

二、代碼 

 1.代碼結構:

 2.load_data.py代碼:

# coding: utf-8
"""
該文件主要是對數據進行預處理,將評分數據按照8:2分為訓練數據與測試數據
"""
import pandas as pd
import csv
import random
import os

# Strip the header row from the raw ratings file so that every remaining line
# is a data row.
# `with` guarantees both handles are closed even if a row fails to parse, and
# newline="" is what the csv module expects on both the read and write side.
with open('data/ratings.csv', 'rt', encoding='utf-8', errors="ignore", newline="") as origin_f, \
        open('data/ratingsNoHead.csv', 'wt+', encoding='utf-8', errors="ignore", newline="") as new_f:
    reader = csv.reader(origin_f)
    writer = csv.writer(new_f)
    next(reader, None)  # skip the header; the default keeps an empty file from raising
    writer.writerows(reader)

# Draw a random 1% sample of the ratings to use as the experiment data.
# header=None is required: ratingsNoHead.csv has no header row, and without it
# pandas would consume the first rating as column names and silently drop it.
df = pd.read_csv('data/ratingsNoHead.csv', encoding='utf-8', header=None)
df = df.sample(frac=1.0)  # shuffle every row
cut_idx = int(round(0.01 * df.shape[0]))
df_sample = df.iloc[:cut_idx]
print("sample shape:", df_sample.shape)
# Write the sample with neither index nor header so the file stays headerless,
# as before. Suppressing the header at write time removes the need to write a
# temporary file and then strip its first line.
df_sample.to_csv('data/ratings_sample.csv', index=False, header=False)

# Split the sampled ratings 8:2 into a training set and a test set.
# header=None is required: ratings_sample.csv has no header row, so every line
# must be read as data. Reading it with the default header handling is what
# made pandas appear to carry one "extra" row that then had to be stripped.
df = pd.read_csv('data/ratings_sample.csv', encoding='utf-8', header=None)
df = df.sample(frac=1.0)  # shuffle every row
cut_idx = int(round(0.2 * df.shape[0]))
# The first 20% of the shuffled rows form the test set, the rest the training set.
df_test, df_train = df.iloc[:cut_idx], df.iloc[cut_idx:]
# Report the number of records in each split.
print("df shape:",df.shape,"test shape:",df_test.shape,"train shape:",df_train.shape)
# Persist both splits without index or header, so the output files contain
# rating rows only.
df_train.to_csv('data/ratings_train.csv', index=False, header=False)
df_test.to_csv('data/ratings_test.csv', index=False, header=False)

3.模型文件mf.py代碼:

import pandas as pd
import torch as pt
import numpy as np
import torch.utils.data as Data
import matplotlib.pyplot as plt

BATCH_SIZE = 100

# Load the train/test splits. The files have no header row, so the column
# names are supplied explicitly.
cols = ['user', 'item', 'rating', 'timestamp']
train = pd.read_csv('data/ratings_train.csv', encoding='utf-8', names=cols)
test = pd.read_csv('data/ratings_test.csv', encoding='utf-8', names=cols)

# The timestamp column is not used by the model.
train = train.drop(['timestamp'], axis=1)
test = test.drop(['timestamp'], axis=1)
print("train shape:", train.shape)
print("test shape:", test.shape)

# Raw ids are used directly as matrix indices, so each dimension must be
# (largest id + 1).
userNo = int(max(train['user'].max(), test['user'].max())) + 1
print("userNo:", userNo)
itemNo = int(max(train['item'].max(), test['item'].max())) + 1
print("itemNo:", itemNo)

# Dense item-by-user rating matrices; a cell left at 0 means "no rating".
rating_train = pt.zeros((itemNo, userNo))
rating_test = pt.zeros((itemNo, userNo))
# Iterate the columns directly instead of DataFrame.iterrows(), which builds
# a Series per row and upcasts the integer ids to float.
for item, user, rating in zip(train['item'], train['user'], train['rating']):
    rating_train[int(item), int(user)] = float(rating)
# Preview rows 0-2, columns 1-9. The former chained slice [0:3][1:10] sliced
# the row axis twice and therefore never restricted the columns.
print(rating_train[0:3, 1:10])
for item, user, rating in zip(test['item'], test['user'], test['rating']):
    rating_test[int(item), int(user)] = float(rating)

def normalizeRating(rating_train):
    """Compute per-row mean ratings and their average over rated rows.

    A zero entry in ``rating_train`` is treated as "no rating" and is
    excluded from the row means.

    Args:
        rating_train: 2-D tensor of ratings with shape ``(m, n)``; zero marks
            a missing rating.

    Returns:
        A tuple ``(rating_mean, all_mean)``:

        * ``rating_mean`` -- float32 tensor of shape ``(m, 1)`` holding each
          row's mean over its non-zero entries (0 for rows with none).
        * ``all_mean`` -- 0-dim float32 tensor: the sum of ``rating_mean``
          divided by the number of rows whose mean is non-zero, or 0 when
          there is no such row.
    """
    ratings = rating_train.to(pt.float32)
    observed = ratings != 0
    counts = observed.sum(dim=1, keepdim=True)
    # Missing entries are zero, so a plain row sum equals the sum of the
    # observed ratings.
    totals = ratings.sum(dim=1, keepdim=True)
    # Clamping the count to 1 avoids 0/0 for rows without ratings; their total
    # is 0, so their mean stays 0 without any NaN clean-up pass.
    rating_mean = totals / counts.clamp(min=1).to(pt.float32)

    no_zero_num = int((rating_mean != 0).sum())  # rows with a non-zero mean
    print("no_zero_num:", no_zero_num)
    if no_zero_num == 0:
        # Nothing to average: return 0 rather than the NaN a 0/0 would give.
        all_mean = pt.tensor(0.0)
    else:
        all_mean = pt.sum(rating_mean) / no_zero_num
    return rating_mean, all_mean

# Per-row mean ratings of the training matrix and their overall average.
rating_mean, all_mean = normalizeRating(rating_train)
print("all mean:", all_mean)

# Wrap the train and test rating matrices in DataLoaders that yield
# BATCH_SIZE-sized batches in their original order (no shuffling).
# The raw tensors are passed as the dataset directly.
# NOTE(review): neither loader is consumed anywhere in the visible script.
loader, loader2 = (
    Data.DataLoader(dataset=matrix, batch_size=BATCH_SIZE, shuffle=False)
    for matrix in (rating_train, rating_test)
)

class MF(pt.nn.Module):
    """Matrix-factorization model with item and user bias terms.

    Learnable parameters, registered in this order and each initialised with
    ``pt.rand``:

    * ``bi`` -- item biases, shape ``(itemNo, 1)``
    * ``bu`` -- user biases, shape ``(userNo, 1)``
    * ``U``  -- user factors, shape ``(num_feature, userNo)``
    * ``V``  -- item factors, shape ``(itemNo, num_feature)``
    """

    def __init__(self, userNo, itemNo, num_feature=20):
        """Create the parameters for ``userNo`` users and ``itemNo`` items.

        Args:
            userNo: number of user columns in the score matrix.
            itemNo: number of item rows in the score matrix.
            num_feature: number of latent features per user/item.
        """
        super().__init__()
        self.num_feature = num_feature
        self.userNo = userNo
        self.itemNo = itemNo
        # The creation order below fixes both the named_parameters() order
        # and the order in which random numbers are drawn; keep it stable.
        self.bi = pt.nn.Parameter(pt.rand(itemNo, 1))
        self.bu = pt.nn.Parameter(pt.rand(userNo, 1))
        self.U = pt.nn.Parameter(pt.rand(num_feature, userNo))
        self.V = pt.nn.Parameter(pt.rand(itemNo, num_feature))

    def mf_layer(self, train_set=None):
        """Return the full ``(itemNo, userNo)`` matrix of predicted scores.

        ``train_set`` is accepted but not read; the prediction depends only
        on the model parameters. The biases broadcast across the matrix:
        ``bi`` down the rows and the transposed ``bu`` across the columns.
        """
        interaction = self.V.mm(self.U)
        return self.bi + self.bu.t() + interaction

    def forward(self, train_set):
        """Forward pass; delegates to :meth:`mf_layer`."""
        return self.mf_layer(train_set)


num_feature = 2    # k: number of latent features
mf = MF(userNo, itemNo, num_feature)
print("parameters len:", len(list(mf.parameters())))
param_name = []
params = []
for name, param in mf.named_parameters():
    param_name.append(name)
    print(name)
    params.append(param)
# param_name is, in order: bi, bu, U, V

lr = 0.3
_lambda = 0.001  # regularisation weight; no penalty term is currently added to the loss
loss_list = []
optimizer = pt.optim.SGD(mf.parameters(), lr)
loss_func = pt.nn.MSELoss()  # stateless, so build it once outside the loop

# Fit only the observed ratings. A zero cell means "no rating", and averaging
# the error over the whole matrix would train the model to predict 0 for
# every unrated (item, user) pair.
observed = rating_train != 0
if not bool(observed.any()):
    raise ValueError("rating_train contains no observed ratings to train on")
target = rating_train[observed]

# Full-batch gradient descent over the observed training ratings.
for epoch in range(1000):
    optimizer.zero_grad()
    output = mf(train)
    loss = loss_func(output[observed], target)
    loss.backward()
    optimizer.step()
    # Store a plain float: keeping the tensor would retain autograd state for
    # every epoch and cannot be converted to numpy for plotting.
    loss_list.append(loss.item())

print("train loss:", loss.item())

# Evaluation metric: RMSE
def rmse(pred_rate, real_rate):
    """Return the root-mean-square error between predictions and targets.

    Args:
        pred_rate: tensor of predicted ratings.
        real_rate: tensor of reference ratings to compare against.

    Returns:
        The square root of ``MSELoss()(pred_rate, real_rate)``.
    """
    return pt.nn.MSELoss()(pred_rate, real_rate).sqrt()

# Evaluate the model.
# Score only the held-out ratings, i.e. the non-zero cells of rating_test.
# Selecting every cell that is zero in rating_train would also include all
# pairs that have no rating in either split, so the error would mostly be
# measured against zero placeholders rather than real ratings.
test_mask = rating_test != 0
with pt.no_grad():  # evaluation needs no gradients
    prediction = mf(test)[test_mask]
    rmse_loss = rmse(prediction, rating_test[test_mask])
print("test loss:", rmse_loss)

plt.clf()
# loss_list holds the per-epoch training MSE. Take the square root so the
# curve matches the 'RMSE' axis label; float() accepts both plain numbers and
# 0-dim tensors.
train_rmse = [float(epoch_loss) ** 0.5 for epoch_loss in loss_list]
plt.plot(range(len(train_rmse)), train_rmse, label='Training data')
plt.title("The MovieLens Dataset Learning Curve")
plt.xlabel('Number of Epochs')
plt.ylabel('RMSE')
plt.legend()
plt.grid()
plt.show()

如果有疑問,歡迎留言。


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM