Python推薦系統庫--Surprise實戰


一、使用movieLens數據集

# Evaluate an SVD recommender on the public MovieLens-100k dataset.
# NOTE(review): `evaluate`, `print_perf` and `Dataset.split` were deprecated
# in Surprise 1.0.5 and removed in 1.1; `cross_validate` from
# `surprise.model_selection` is the supported replacement and prints the
# same per-fold RMSE/MAE summary when `verbose=True`.
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Load the built-in MovieLens-100k dataset.
data = Dataset.load_builtin('ml-100k')
# SVD matrix factorization as the recommendation algorithm.
algo = SVD()
# 3-fold cross-validation, reporting root-mean-square error and
# mean absolute error for each fold plus their means.
perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9506
MAE:  0.7511
------------
Fold 2
RMSE: 0.9452
MAE:  0.7456
------------
Fold 3
RMSE: 0.9442
MAE:  0.7444
------------
------------
Mean RMSE: 0.9467
Mean MAE : 0.7470
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.9506  0.9452  0.9442  0.9467  
MAE     0.7511  0.7456  0.7444  0.7470  

二、算法調參

我們使用sklearn常用到的網格搜索交叉驗證(GridSearchCV)來選擇最優的參數

# Hyper-parameter tuning, analogous to sklearn's grid-search cross-validation.
from surprise import GridSearch

# Three hyper-parameters, two candidate values each: 2^3 = 8 combinations.
param_grid = {
    'n_epochs': [5, 10],       # number of SGD iterations
    'lr_all': [0.002, 0.005],  # learning rate for all parameters
    'reg_all': [0.4, 0.6],     # regularization strength for all parameters
}

# Grid-search SVD over the three parameters, scored by RMSE and FCP
# (Fraction of Concordant Pairs).
grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'FCP'])
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

grid_search.evaluate(data)
Running grid search for the following parameter combinations:
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}

Results:
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}
{'RMSE': 0.9973640543212537, 'FCP': 0.6834505918617332}
----------
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}
{'RMSE': 1.0033367804212159, 'FCP': 0.6863671726311678}
----------
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}
{'RMSE': 0.9740022047005671, 'FCP': 0.693822773157699}
----------
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}
{'RMSE': 0.9828360526820644, 'FCP': 0.6939377853330241}
----------
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}
{'RMSE': 0.9783154591562983, 'FCP': 0.6919014896389958}
----------
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}
{'RMSE': 0.9863470326305794, 'FCP': 0.6925580320424597}
----------
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
{'RMSE': 0.9641597864074152, 'FCP': 0.6973875277009212}
----------
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}
{'RMSE': 0.9740231673256359, 'FCP': 0.6976928768968366}
# Report the best parameter combination found by the grid search.
# Best (lowest) RMSE across all 8 parameter combinations.
print(grid_search.best_score['RMSE'])

# Parameter combination that achieved the best RMSE.
print(grid_search.best_params['RMSE'])
0.9641597864074152
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
# Best (highest) FCP score across all parameter combinations.
print(grid_search.best_score['FCP'])

# Parameter combination that achieved the best FCP.
print(grid_search.best_params['FCP'])
0.6983253171588012
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}

在自己的數據集上訓練模型

該如何做?

1. 載入自己的數據集  

import os
from surprise import Reader, Dataset
# Path to the custom ratings file (note: 'suprise' is the author's filename).
file_path = os.path.expanduser('./popular_music_suprise_format.txt')
# Describe the file layout: comma-separated "user item rating timestamp" lines.
reader = Reader(line_format='user item rating timestamp', sep=',')
# Build a Surprise dataset from the file.
music_data = Dataset.load_from_file(file_path, reader=reader)
# Split into 5 folds for cross-validation.
# NOTE(review): Dataset.split was removed in Surprise 1.1 — this requires
# an older Surprise version; confirm before running.
music_data.split(n_folds=5)

2. 使用不同的推薦算法進行建模比較

# Benchmark several Surprise algorithms on the custom music dataset,
# scoring each with RMSE and MAE via cross-validation on the folds
# prepared above.
from surprise import (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                      KNNBaseline, SVD, SVDpp, NMF, evaluate)

# Random-rating predictor, bias-only baseline, three flavours of
# neighborhood-based collaborative filtering (basic, mean-centered,
# baseline-adjusted), and three matrix-factorization models.
candidate_classes = (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                     KNNBaseline, SVD, SVDpp, NMF)

for algorithm_cls in candidate_classes:
    algo = algorithm_cls()
    perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

# Show the per-fold result table for the last model evaluated (NMF).
print_perf(perf)

推薦系統--不同電影之間的相似度

一、載入數據,使用算法算出相互間的相似度

# 在協同過濾算法建模以后,根據item取回相似度最高的item
# 使用的是 algo.get_neighbors()

from __future__ import (absolute_import, division, print_function, unicode_literals)
import os
import io

from surprise import KNNBaseline
from surprise import Dataset
# 獲取電影名到電影id 和 電影id到電影名的映射
def read_item_names():
    """Build the two mappings between raw movie ids and movie titles.

    Parses the MovieLens-100k ``u.item`` file (pipe-separated, Latin-1
    encoded) under the user's Surprise data directory and returns the
    pair ``(rid_to_name, name_to_rid)``.
    """
    item_file = (os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/u.item')
    rid_to_name = {}
    name_to_rid = {}
    with io.open(item_file, 'r', encoding='ISO-8859-1') as handle:
        for raw_line in handle:
            fields = raw_line.split('|')
            raw_id, title = fields[0], fields[1]
            rid_to_name[raw_id] = title
            name_to_rid[title] = raw_id
    return rid_to_name, name_to_rid

# Train an item-based KNN baseline model on the full dataset so that
# item-to-item similarities can be queried afterwards.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
# Pearson-baseline similarity, computed between items (user_based=False).
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
# NOTE(review): AlgoBase.train() was deprecated (and later removed) in
# favor of fit(); also fixes the 'trainest' variable-name typo.
algo.fit(trainset)
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
# Build the name<->raw-id mappings for the MovieLens items.
# NOTE(review): the bare expressions and the literal values below them
# ('1', 24, [433, ...]) are notebook/REPL echoes kept from the original
# transcript, not executable logic.
rid_to_name, name_to_rid = read_item_names()

# Raw id of "Toy Story (1995)".
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_raw_id
'1'
# Convert the raw id to the model's inner id used by the similarity matrix.
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
toy_story_inner_id
24
# Inner ids of the 10 most similar items.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
toy_story_neighbors
[433, 101, 302, 309, 971, 95, 26, 561, 816, 347]

二、獲取相似度最近的10部電影

# Map each neighbor's inner id back to its raw id, then to its title.
toy_story_neighbors = [
    rid_to_name[algo.trainset.to_raw_iid(inner_id)]
    for inner_id in toy_story_neighbors
]

print()
print('The 10 nearest neighbors of Toy Story are:')
for title in toy_story_neighbors:
    print(title)
The 10 nearest neighbors of Toy Story are:
Beauty and the Beast (1991)
Raiders of the Lost Ark (1981)
That Thing You Do! (1996)
Lion King, The (1994)
Craft, The (1996)
Liar Liar (1997)
Aladdin (1992)
Cool Hand Luke (1967)
Winnie the Pooh and the Blustery Day (1968)
Indiana Jones and the Last Crusade (1989)

參考文章:https://blog.csdn.net/mycafe_/article/details/79146764

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM