推薦系統實踐—ItemCF實現


參考:https://github.com/Lockvictor/MovieLens-RecSys/blob/master/usercf.py#L169

數據集

本文使用了MovieLens中的ml-100k小數據集,數據集的地址為:傳送門(https://grouplens.org/datasets/movielens/100k/)
該數據集中包含了943個獨立用戶對1682部電影做的100000次評分。

 

完整代碼

總體和UserCF差不多,將用戶相似度的計算改為物品相似度的計算即可。

import numpy as np
import pandas as pd
import math
from collections import defaultdict
from operator import itemgetter

np.random.seed(1)


class ItemCF(object):
    """Item-based collaborative filtering (ItemCF) recommender.

    Typical usage is ``split_data`` -> ``item_similarity`` -> ``evaluate``
    (or ``recommend`` for a single user).
    """

    def __init__(self, K=160, M=10):
        """Create an empty recommender.

        Args:
            K: Number of most similar movies consulted for each movie the
                user has already rated.
            M: Number of movies returned by ``recommend``.
        """
        self.train_set = {}   # user -> {movie: rating}, used for training
        self.test_set = {}    # user -> {movie: rating}, held out for evaluation
        self.movie_popularity = {}   # movie -> number of training users who rated it

        self.tot_movie = 0   # number of distinct movies in the training set
        self.W = {}   # similarity matrix: movie -> {other movie: similarity}

        self.K = K
        self.M = M

    def split_data(self, data, ratio):
        """Randomly split rating rows into the training and test sets.

        Rows are added to whatever ``train_set`` / ``test_set`` already hold.

        Args:
            data: Object whose ``itertuples()`` yields rows carrying the
                user, movie and rating at positions 1, 2 and 3.
            ratio: Probability that a row is assigned to the training set;
                the remaining rows go to the test set.
        """
        for line in data.itertuples():
            user, movie, rating = line[1], line[2], line[3]
            # Exactly one random draw per row keeps the split reproducible
            # for a given NumPy seed.
            target = self.train_set if np.random.random() < ratio else self.test_set
            target.setdefault(user, {})[movie] = int(rating)
        print('數據預處理完成')

    def item_similarity(self):
        """Build the item-item similarity matrix ``self.W`` from ``train_set``.

        The similarity of movies u and v is
        ``cuv / sqrt(N[u] * N[v])`` where ``cuv`` is the number of training
        users who rated both and ``N[x]`` the number who rated ``x``.
        Also refreshes ``movie_popularity`` and ``tot_movie``, which
        ``evaluate`` uses for the popularity and coverage metrics.

        All three attributes are rebuilt from scratch, so calling this
        method again (e.g. after adding data) does not double-count.
        """
        popularity = defaultdict(int)   # movie -> number of raters
        co_counts = {}                  # movie -> {other movie: common raters}
        for items in self.train_set.values():
            for m1 in items:
                popularity[m1] += 1
                related = co_counts.setdefault(m1, defaultdict(int))
                for m2 in items:
                    if m1 != m2:
                        related[m2] += 1

        self.movie_popularity = dict(popularity)
        self.tot_movie = len(self.movie_popularity)

        self.W = {}
        for count, (u, related_movies) in enumerate(co_counts.items(), start=1):
            # co_counts has one entry per training movie, so tot_movie is
            # non-zero whenever this loop body runs.
            print('\r相似度計算進度:{:.2f}%'.format(count * 100 / self.tot_movie), end='')
            self.W[u] = {
                v: float(cuv) / math.sqrt(popularity[u] * popularity[v])
                for v, cuv in related_movies.items()
            }
        print('\n相似度計算完成')

    def recommend(self, u):
        """Recommend up to ``self.M`` movies for user ``u``.

        For every movie the user rated in the training set, its ``self.K``
        most similar movies each receive ``similarity * rating``; scores
        are summed per candidate and movies the user already rated are
        excluded.

        Args:
            u: User identifier (a key of ``train_set``).

        Returns:
            A list of ``(movie, score)`` tuples sorted by descending score,
            at most ``self.M`` long. Empty when the user has no training
            data.
        """
        rank = {}
        user_movies = self.train_set.get(u, {})

        for movie, rating in user_movies.items():
            neighbours = sorted(self.W.get(movie, {}).items(),
                                key=itemgetter(1), reverse=True)[0:self.K]
            for related_movie, similarity in neighbours:
                if related_movie in user_movies:
                    continue
                rank[related_movie] = rank.get(related_movie, 0) + similarity * rating
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:self.M]

    def evaluate(self):
        """Evaluate the recommender on the held-out test set.

        Prints precision, recall, coverage and average popularity, and
        returns them as well. Any metric whose denominator is zero (for
        example recall with an empty test set) is reported as 0.0 instead
        of raising ``ZeroDivisionError``.

        Returns:
            A dict with the keys ``'precision'``, ``'recall'``,
            ``'coverage'`` and ``'popularity'``.
        """
        hit = 0
        ret = 0
        rec_tot = 0
        pre_tot = 0
        tot_rec_movies = set()  # every movie recommended to at least one user
        for user in self.train_set:
            test_movies = self.test_set.get(user, {})
            for movie, _score in self.recommend(user):
                if movie in test_movies:
                    hit += 1
                tot_rec_movies.add(movie)
                ret += math.log(1 + self.movie_popularity[movie])
            # M is counted for every user, even when recommend() returned
            # fewer than M movies.
            pre_tot += self.M
            rec_tot += len(test_movies)

        precision = hit / (1.0 * pre_tot) if pre_tot else 0.0
        recall = hit / (1.0 * rec_tot) if rec_tot else 0.0
        coverage = len(tot_rec_movies) / (1.0 * self.tot_movie) if self.tot_movie else 0.0
        popularity = ret / (1.0 * pre_tot) if pre_tot else 0.0
        print('precision=%.4f' % precision)
        print('recall=%.4f' % recall)
        print('coverage=%.4f' % coverage)
        print('popularity=%.4f' % popularity)
        return {
            'precision': precision,
            'recall': recall,
            'coverage': coverage,
            'popularity': popularity,
        }


if __name__ == '__main__':
    # Load the tab-separated rating file, train on a random 70% of the rows
    # and report the evaluation metrics on the rest.
    column_names = ['user_id', 'item_id', 'rating', 'timestamp']
    ratings = pd.read_csv('u.data', sep='\t', names=column_names)

    recommender = ItemCF()
    recommender.split_data(ratings, 0.7)
    recommender.item_similarity()
    recommender.evaluate()

 

結果

 

物品相似度的歸一化

如果將ItemCF的相似度矩陣按最大值歸一化,可以提高性能。

將上述相似度計算的部分代碼改為

# Replacement for the final similarity loop of item_similarity: each movie's
# similarities are divided by that movie's largest similarity.
count = 1
        for u, related_movies in C.items():
            print('\r相似度計算進度:{:.2f}%'.format(count * 100 / self.tot_movie), end='')
            count += 1
            self.W.setdefault(u, {})
            mx = 0.0   # largest similarity seen so far for movie u
            # First pass: compute the raw similarities and track their maximum.
            for v, cuv in related_movies.items():
                self.W[u][v] = float(cuv) / math.sqrt(N[u] * N[v])
                if self.W[u][v] > mx:
                    mx = self.W[u][v]
            # Second pass: scale by the maximum. The loop body only runs when
            # related_movies is non-empty, i.e. after the first pass stored
            # at least one similarity.
            for v, cuv in related_movies.items():
                self.W[u][v] /= mx
        print('\n相似度計算完成')

可以看到性能均有所提升。

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM