參考:https://github.com/Lockvictor/MovieLens-RecSys/blob/master/usercf.py#L169
數據集
本文使用了MovieLens中的ml-100k小數據集,數據集的地址為:傳送門
該數據集中包含了943個獨立用戶對1682部電影做的100,000次評分。
完整代碼
總體和UserCF差不多,將用戶相似度的計算改為物品相似度的計算即可。
import numpy as np
import pandas as pd
import math
from collections import defaultdict
from operator import itemgetter

np.random.seed(1)


class ItemCF(object):
    """Item-based collaborative filtering recommender (ItemCF).

    Same pipeline as UserCF, except that item-item similarities are
    computed instead of user-user similarities:

        sim(i, j) = |U(i) ∩ U(j)| / sqrt(|U(i)| * |U(j)|)

    where U(i) is the set of train-set users who rated movie i.
    """

    def __init__(self):
        self.train_set = {}         # user -> {movie: rating}
        self.test_set = {}          # user -> {movie: rating}
        self.movie_popularity = {}  # movie -> number of train-set raters (novelty metric)
        self.tot_movie = 0          # distinct movies in the train set (coverage metric)
        self.W = {}                 # 相似度矩陣: movie -> {movie: similarity}
        self.K = 160                # 最接近的K部電影 (neighbourhood size)
        self.M = 10                 # 推薦電影數 (recommendations per user)

    def split_data(self, data, ratio):
        """Randomly split rating rows into train and test sets.

        Each row lands in the train set with probability ``ratio``,
        otherwise in the test set.  Rows accumulate into the existing
        dicts, so call this once per instance.

        :param data: DataFrame whose first three columns are
                     (user_id, item_id, rating).
        :param ratio: probability that a row goes to the train set.
        """
        for row in data.itertuples():
            user, movie, rating = row[1], row[2], row[3]
            # One RNG draw per row decides the destination split.
            target = self.train_set if np.random.random() < ratio else self.test_set
            target.setdefault(user, {})[movie] = int(rating)
        print('數據預處理完成')

    def item_similarity(self):
        """Build the item-item cosine similarity matrix ``self.W``."""
        # Popularity = number of train-set users who rated each movie;
        # kept for the novelty metric computed in evaluate().
        popularity = defaultdict(int)
        for items in self.train_set.values():
            for movie in items:
                popularity[movie] += 1
        self.movie_popularity = dict(popularity)
        self.tot_movie = len(self.movie_popularity)  # 用於計算覆蓋率

        # C[i][j]: users who rated both i and j; N[i]: users who rated i.
        C = defaultdict(lambda: defaultdict(int))
        N = defaultdict(int)
        for items in self.train_set.values():
            for m1 in items:
                N[m1] += 1
                C[m1]  # ensure a row exists even for movies with no co-raters
                for m2 in items:
                    if m1 != m2:
                        C[m1][m2] += 1

        for count, (u, related_movies) in enumerate(C.items(), start=1):
            print('\r相似度計算進度:{:.2f}%'.format(count * 100 / self.tot_movie), end='')
            self.W.setdefault(u, {})
            for v, cuv in related_movies.items():
                # Cosine similarity over the implicit user-item matrix.
                self.W[u][v] = cuv / math.sqrt(N[u] * N[v])
        print('\n相似度計算完成')

    def recommend(self, u):
        """Return up to ``self.M`` (movie, score) pairs for user ``u``.

        Each movie the user has not seen is scored by summing
        rating-weighted similarities contributed by the ``self.K`` most
        similar neighbours of every movie the user has rated.
        """
        rank = defaultdict(float)
        user_movies = self.train_set[u]
        for movie, rating in user_movies.items():
            neighbours = sorted(self.W[movie].items(),
                                key=itemgetter(1), reverse=True)[:self.K]
            for related_movie, similarity in neighbours:
                if related_movie not in user_movies:  # skip already-rated movies
                    rank[related_movie] += similarity * rating
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:self.M]

    def evaluate(self):
        """Print precision, recall, coverage and average popularity.

        Metrics follow the standard top-N evaluation: precision divides
        hits by the number of recommendation slots (M per user), recall
        divides hits by the number of held-out test ratings, coverage is
        the fraction of train-set movies ever recommended, and
        popularity is the mean log-popularity of recommended movies.
        """
        hit = 0
        popularity_sum = 0.0
        rec_tot = 0             # total test ratings (recall denominator)
        pre_tot = 0             # total recommendation slots (precision denominator)
        tot_rec_movies = set()  # 推薦電影 — every distinct movie recommended
        for user in self.train_set:
            test_movies = self.test_set.get(user, {})
            for movie, _ in self.recommend(user):
                if movie in test_movies:
                    hit += 1
                tot_rec_movies.add(movie)
                popularity_sum += math.log(1 + self.movie_popularity[movie])
            pre_tot += self.M
            rec_tot += len(test_movies)
        precision = hit / (1.0 * pre_tot)
        recall = hit / (1.0 * rec_tot)
        coverage = len(tot_rec_movies) / (1.0 * self.tot_movie)
        popularity = popularity_sum / (1.0 * pre_tot)
        print('precision=%.4f' % precision)
        print('recall=%.4f' % recall)
        print('coverage=%.4f' % coverage)
        print('popularity=%.4f' % popularity)


if __name__ == '__main__':
    data = pd.read_csv('u.data', sep='\t',
                       names=['user_id', 'item_id', 'rating', 'timestamp'])
    itemcf = ItemCF()
    itemcf.split_data(data, 0.7)
    itemcf.item_similarity()
    itemcf.evaluate()
結果
物品相似度的歸一化
如果將ItemCF的相似度矩陣按最大值歸一化,可以提高性能。
將上述相似度計算的部分代碼改為
# Same similarity pass as above, with each row normalised by its
# maximum (w'_ij = w_ij / max_k w_ik) to improve recommendation quality.
for count, (u, related_movies) in enumerate(C.items(), start=1):
    print('\r相似度計算進度:{:.2f}%'.format(count * 100 / self.tot_movie), end='')
    row = self.W.setdefault(u, {})
    for v, cuv in related_movies.items():
        row[v] = cuv / math.sqrt(N[u] * N[v])
    if related_movies:
        # Divide the whole row by its largest similarity.
        mx = max(row[v] for v in related_movies)
        for v in related_movies:
            row[v] /= mx
print('\n相似度計算完成')
可以看到性能均有所提升。