1 import random
2 import math
class UserBasedCF:
    """User-based collaborative filtering over (user, item, record) triples.

    The data file is read line by line; each non-blank line is split on
    whitespace and its first three fields are taken as user id, item id and
    an integer record.  The triples are then split into a train and a test
    mapping of the form ``{user: {item: record}}``.

    Typical use::

        cf = UserBasedCF('u.data')
        cf.userSimilarityBest()
        cf.recommend('345')
    """

    def __init__(self, datafile=None):
        """Load ``datafile`` and split it with ``k=3``, ``seed=47``.

        When ``datafile`` is None nothing is read and the data containers
        are left empty, so ``readData(path)`` / ``splitData(...)`` can be
        called later.
        """
        self.datafile = datafile
        self.data = []
        self.testdata = {}
        self.traindata = {}
        if datafile is not None:
            self.readData()
            self.splitData(3, 47)

    def readData(self, datafile=None):
        """Read the data set into ``self.data`` as (user, item, record) tuples.

        ``datafile`` overrides (and replaces) the path stored on the
        instance.  Blank lines are skipped; ``record`` is converted with
        ``int``.
        """
        self.datafile = datafile or self.datafile
        records = []
        # The context manager guarantees the handle is closed even when a
        # malformed line raises part-way through the file.
        with open(self.datafile) as handle:
            for line in handle:
                fields = line.split()
                if not fields:
                    continue
                userid, itemid, record = fields[:3]
                records.append((userid, itemid, int(record)))
        self.data = records

    def splitData(self, k, seed, data=None, M=8):
        """Split the data set into ``self.testdata`` and ``self.traindata``.

        Every triple draws a bucket in ``[0, M)``; triples whose bucket
        equals ``k`` go to the test set, the rest to the train set, so the
        expected test : train ratio is 1 : M-1.

        k    -- bucket index used as the test set, expected in ``[0, M)``;
                any other value leaves the test set empty.
        seed -- seed passed to ``random.seed`` so the split is repeatable.
        data -- iterable of (user, item, record); falls back to
                ``self.data`` when omitted or empty.
        M    -- number of buckets, must be >= 1.
        """
        self.testdata = {}
        self.traindata = {}
        data = data or self.data
        random.seed(seed)
        for user, item, record in data:
            # randrange(M) yields exactly M buckets; randint(0, M) would be
            # inclusive on both ends and yield M + 1.
            if random.randrange(M) == k:
                target = self.testdata
            else:
                target = self.traindata
            target.setdefault(user, {})[item] = record

    def userSimilarity(self, train=None):
        """Fill ``self.userSim`` by comparing every pair of users directly.

        ``userSim[u][v]`` is ``|N(u) & N(v)| / sqrt(|N(u)| * |N(v)|)`` where
        ``N(x)`` is the set of items of user ``x`` in ``train``.  Every
        ordered pair ``u != v`` gets an entry, including pairs with no
        common item (similarity 0.0).
        """
        train = train or self.traindata
        # Build each user's item set once instead of once per pair.
        item_sets = dict((u, set(items)) for u, items in train.items())
        self.userSim = dict()
        for u, u_items in item_sets.items():
            for v, v_items in item_sets.items():
                if u == v:
                    continue
                norm = math.sqrt(len(u_items) * len(v_items) * 1.0)
                common = len(u_items & v_items)
                # A user with no items makes the norm zero; report 0.0
                # rather than dividing by zero.
                sim = common / norm if norm else 0.0
                self.userSim.setdefault(u, {})[v] = sim

    def userSimilarityBest(self, train=None):
        """Fill ``self.userSimBest`` using an item -> users inverted index.

        Same formula as ``userSimilarity`` but only pairs of users sharing
        at least one item are visited, so users without any such neighbour
        get no entry at all.  The original author refers to page 46 of
        their reference text for the method; this is the similarity that
        ``recommend`` uses.
        """
        train = train or self.traindata
        item_users = dict()
        for u, items in train.items():
            for i in items:
                item_users.setdefault(i, set()).add(u)
        # count[u][v] = number of items that both u and v interacted with.
        count = dict()
        for users in item_users.values():
            for u in users:
                for v in users:
                    if u == v:
                        continue
                    row = count.setdefault(u, {})
                    row[v] = row.get(v, 0) + 1
        self.userSimBest = dict()
        for u, related_users in count.items():
            n_u = len(train[u])
            self.userSimBest[u] = dict(
                (v, cuv / math.sqrt(n_u * len(train[v]) * 1.0))
                for v, cuv in related_users.items()
            )

    def recommend(self, user, train=None, k=8, nitem=40):
        """Return up to ``nitem`` recommended items for ``user``.

        The ``k`` most similar users (by ``self.userSimBest``) vote for
        every item they have that ``user`` has not; an item's score is the
        sum of the similarities of the users voting for it.

        Returns a dict ``{item: score}``; it is empty when ``user`` has no
        similar users.  If ``userSimilarityBest`` has not been run yet it
        is run on ``train`` first.
        """
        train = train or self.traindata
        if getattr(self, "userSimBest", None) is None:
            self.userSimilarityBest(train)
        interacted_items = train.get(user, {})
        # .get(): users without any co-rating neighbour have no entry.
        neighbours = sorted(
            self.userSimBest.get(user, {}).items(),
            key=lambda x: x[1],
            reverse=True,
        )[0:k]
        rank = dict()
        for v, wuv in neighbours:
            for i in train[v]:
                if i in interacted_items:
                    continue
                rank[i] = rank.get(i, 0) + wuv
        top = sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:nitem]
        return dict(top)

    def recallAndPrecision(self, train=None, test=None, k=8, nitem=10):
        """Return ``(recall, precision)`` of the recommendations.

        A hit is a recommended item that appears in the user's test items.
        Recall divides the hits by the total number of test items of the
        train users; precision divides them by ``nitem`` per train user.
        The original author refers to page 43 of their reference text.
        Either value is 0.0 when its denominator is zero.
        """
        train = train or self.traindata
        test = test or self.testdata
        hit = 0
        relevant = 0
        recommended = 0
        for user in train:
            tu = test.get(user, {})
            rank = self.recommend(user, train=train, k=k, nitem=nitem)
            hit += sum(1 for item in rank if item in tu)
            relevant += len(tu)
            recommended += nitem
        recall = hit / (relevant * 1.0) if relevant else 0.0
        precision = hit / (recommended * 1.0) if recommended else 0.0
        return (recall, precision)

    def coverage(self, train=None, test=None, k=8, nitem=10):
        """Return the fraction of train items that get recommended at all.

        ``test`` is accepted for signature symmetry and is not used in the
        computation.  Returns 0.0 when there are no train items.
        """
        train = train or self.traindata
        test = test or self.testdata
        recommend_items = set()
        all_items = set()
        for user in train:
            all_items.update(train[user])
            rank = self.recommend(user, train, k=k, nitem=nitem)
            recommend_items.update(rank)
        if not all_items:
            return 0.0
        return len(recommend_items) / (len(all_items) * 1.0)

    def popularity(self, train=None, test=None, k=8, nitem=10):
        """Return the mean ``log(1 + popularity)`` of recommended items.

        An item's popularity is the number of train users that have it.
        The original author refers to page 44 of their reference text.
        ``test`` is accepted for signature symmetry and is not used in the
        computation.  Returns 0.0 when nothing is recommended.
        """
        train = train or self.traindata
        test = test or self.testdata
        item_popularity = dict()
        for items in train.values():
            for item in items:
                item_popularity[item] = item_popularity.get(item, 0) + 1
        ret = 0
        n = 0
        for user in train:
            rank = self.recommend(user, train, k=k, nitem=nitem)
            for item in rank:
                ret += math.log(1 + item_popularity[item])
                n += 1
        if not n:
            return 0.0
        return ret / (n * 1.0)
139
def testRecommend(datafile='u.data'):
    """Print the recommendations for user "345" next to the held-out records.

    Loads ``datafile``, re-splits it with ``k=4``, ``seed=100``, builds the
    user similarities and prints one line per recommended item: the item
    id, its recommendation score and the user's record for that item in
    the test set (0 when the item is not in the user's test set).
    """
    # The constructor already reads the file, so no extra readData() call.
    ubcf = UserBasedCF(datafile)
    ubcf.splitData(4, 100)
    # recommend() reads self.userSimBest, which only userSimilarityBest()
    # populates; userSimilarity() fills self.userSim instead.
    ubcf.userSimilarityBest()
    user = "345"
    rank = ubcf.recommend(user, k=3)
    held_out = ubcf.testdata.get(user, {})
    for i, rvi in rank.items():
        record = held_out.get(i, 0)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%5s: %.4f--%.4f" % (i, rvi, record))
def testUserBasedCF(datafile='u.data'):
    """Print recall, precision, coverage and popularity for several K.

    Loads ``datafile``, builds the user similarities once, then evaluates
    the recommender for K (number of neighbours) in 5, 10, 20, 40, 80 and
    160, printing one table row per K.  Recall, precision and coverage are
    shown as percentages.
    """
    cf = UserBasedCF(datafile)
    cf.userSimilarityBest()
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("%3s%20s%20s%20s%20s" % ('K', "recall", 'precision', 'coverage', 'popularity'))
    for k in [5, 10, 20, 40, 80, 160]:
        recall, precision = cf.recallAndPrecision(k=k)
        coverage = cf.coverage(k=k)
        popularity = cf.popularity(k=k)
        print("%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (k, recall * 100, precision * 100, coverage * 100, popularity))
161
# Script entry point: run the evaluation sweep only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    testUserBasedCF()