'''
基于物品的协同推荐
矩阵数据
说明:
1.修正的余弦相似度是一种基于模型的协同过滤算法。我们前面提过,这种算法的优势之
一是扩展性好,对于大数据量而言,运算速度快、占用内存少。
2.用户的评价标准是不同的,比如喜欢一个歌手时有些人会打4分,有些打5分;不喜欢时
有人会打3分,有些则会只给1分。修正的余弦相似度计算时会将用户对物品的评分减去
用户所有评分的均值,从而解决这个问题。
'''
import
pandas
as
pd
from
io
import
StringIO
# Data format 1: CSV matrix (rows = users, columns = items); suitable for
# small data sets. Empty cells mean the user has not rated that item.
csv_txt = '''"user","Blues Traveler","Broken Bells","Deadmau5","Norah Jones","Phoenix","Slightly Stoopid","The Strokes","Vampire Weekend"
"Angelica",3.5,2.0,,4.5,5.0,1.5,2.5,2.0
"Bill",2.0,3.5,4.0,,2.0,3.5,,3.0
"Chan",5.0,1.0,1.0,3.0,5,1.0,,
"Dan",3.0,4.0,4.5,,3.0,4.5,4.0,2.0
"Hailey",,4.0,1.0,4.0,,,4.0,1.0
"Jordyn",,4.5,4.0,5.0,5.0,4.5,4.0,4.0
"Sam",5.0,2.0,,3.0,5.0,4.0,5.0,
"Veronica",3.0,,,5.0,4.0,2.5,3.0,'''

# Data format 1: CSV matrix (rows = users, columns = items); suitable for
# small data sets.
csv_txt2 = '''"user","Kacey Musgraves","Imagine Dragons","Daft Punk","Lorde","Fall Out Boy"
"David",,3,5,4,1
"Matt",,3,4,4,1
"Ben",4,3,,3,1
"Chris",4,4,4,3,1
"Tori",5,4,5,,3'''

# Data format 1: CSV matrix (rows = users, columns = items); suitable for
# small data sets.
# This is the "users2" data from page 85 of the "data mining guide" book.
csv_txt3 = '''"user","Taylor Swift","PSY","Whitney Houston"
"Amy",4,3,4
"Ben",5,2,
"Clara",,3.5,4
"Daisy",5,,3'''

# Module-level rating matrix; starts empty and is assigned by load_csv_txt().
df = None
# Option 1: load the ratings from CSV text.
def load_csv_txt(text=None):
    """Parse a user-by-item CSV rating matrix into the module-level ``df``.

    Args:
        text: CSV text whose first row is the header and whose ``"user"``
            column holds the row labels. When omitted, the module-level
            ``csv_txt3`` sample is loaded, which is what this function
            always did before the parameter existed.

    Returns:
        The DataFrame that was stored in the global ``df`` (index = users,
        columns = items, missing ratings as NaN).
    """
    # Only ``df`` is rebound here; the sample texts are merely read, so they
    # do not need a ``global`` declaration.
    global df
    if text is None:
        text = csv_txt3
    df = pd.read_csv(StringIO(text), header=0, index_col="user")
    return df
# Test: read the data. This top-level call runs when the module is executed
# and fills the module-level ``df``.
load_csv_txt()
#=======================================
# 注意:不需要build_xy
#=======================================
# Compute the similarity between two items.
def computeSimilarity(goods1, goods2):
    """Return the adjusted cosine similarity s(i, j) of two items.

    Follows the s(i, j) formula on page 71 of the "data mining guide" book
    cited throughout this file: every rating is first reduced by the rating
    user's own mean (taken over all items in ``df``), then the cosine of
    the two centred rating vectors is computed over the users who rated
    both items.

    Args:
        goods1: Column label of the first item in the module-level ``df``.
        goods2: Column label of the second item.

    Returns:
        The similarity as a float. ``0.0`` is returned when the value is
        undefined, i.e. when no user rated both items or when one of the
        centred vectors is all zeros (the original code raised
        ZeroDivisionError or produced NaN in those cases).
    """
    # Per-user mean over every item the user rated (NaN cells are skipped).
    user_means = df.mean(axis=1)

    # Centre each item's ratings on the rating user's mean.
    first = df[goods1] - user_means
    second = df[goods2] - user_means

    # Keep only users who rated both items.
    rated_both = first.notna() & second.notna()
    first, second = first[rated_both], second[rated_both]

    denominator = ((first ** 2).sum() * (second ** 2).sum()) ** 0.5
    if denominator == 0:
        return 0.0
    return float((first * second).sum() / denominator)
# csv_txt
#print('\n测试:计算Blues Traveler与Broken Bells的相似度')
#print(computeSimilarity("Blues Traveler","Broken Bells"))
# csv_txt2
#print('\n测试:计算Kacey Musgraves与Imagine Dragons的相似度')
#print(computeSimilarity("Kacey Musgraves","Imagine Dragons"))
# Predict the rating a user would give to an item they have not rated.
def p(user, goods):
    """Predict ``user``'s rating for ``goods`` from item similarities.

    Follows the p(u, i) formula on page 75 of the "data mining guide" book
    cited throughout this file: the user's existing ratings are averaged,
    each weighted by the similarity between the rated item and ``goods``,
    and the result is divided by the sum of the absolute similarities.

    Args:
        user: Row label of the user in the module-level ``df``.
        goods: Column label of the item to predict.

    Returns:
        The predicted rating as a float, or NaN when every usable
        similarity is zero (there is nothing to base a prediction on).

    Raises:
        AssertionError: If ``user`` has already rated ``goods``.
    """
    # Explicit raise keeps the original exception type but is not stripped
    # when Python runs with -O.
    if not pd.isnull(df.loc[user, goods]):
        raise AssertionError(
            "user %r has already rated %r" % (user, goods)
        )

    # ``.loc`` replaces ``.ix``, which no longer exists in pandas >= 1.0.
    rated = df.loc[user].dropna()  # the user's existing ratings

    # Similarity of every rated item to the target item.
    similarities = pd.Series(
        [computeSimilarity(item, goods) for item in rated.index],
        index=rated.index,
        dtype=float,
    ).dropna()  # an undefined similarity must not poison the sums
    rated = rated[similarities.index]

    total_weight = similarities.abs().sum()
    if total_weight == 0:
        return float("nan")
    return float((rated * similarities).sum() / total_weight)
# csv_txt2
#print('\n测试:计算David对Kacey Musgraves的可能打分')
#print(p("David","Kacey Musgraves"))
# The prediction formula behaves better when ratings lie between -1 and 1.
def rate2newrate(rate):
    """Map a raw rating onto the [-1, 1] scale.

    Follows the NR(u, N) formula on page 76 of the "data mining guide" book
    cited throughout this file. The smallest and largest ratings present in
    the module-level ``df`` define the raw scale: the smallest maps to -1
    and the largest to 1.

    Args:
        rate: A rating on the raw scale.

    Returns:
        The rating rescaled linearly to the [-1, 1] range.
    """
    # Column-wise extremes first, then the extreme of those.
    highest = df.max().max()
    lowest = df.min().min()
    span = highest - lowest
    return (2 * (rate - lowest) - span) / span
# Inverse of rate2newrate: recover a raw rating from a normalised one.
def newrate2rate(new_rate):
    """Map a rating on the [-1, 1] scale back to the raw rating scale.

    Follows the R(u, N) formula on page 76 of the "data mining guide" book
    cited throughout this file. The raw scale is delimited by the smallest
    and largest ratings present in the module-level ``df``.

    Args:
        new_rate: A rating on the normalised [-1, 1] scale.

    Returns:
        The corresponding rating on the raw scale.
    """
    # Column-wise extremes first, then the extreme of those.
    highest = df.max().max()
    lowest = df.min().min()
    span = highest - lowest
    return (0.5 * (new_rate + 1) * span) + lowest
# Demo: print a label, then the normalised value of the raw rating 3.
print('\n测试:计算3的new_rate值')
print(rate2newrate(3))
# Demo: print a label, then the raw rating for the normalised value 0.5.
print('\n测试:计算0.5的rate值')
print(newrate2rate(0.5))
# Predict the rating a user would give to an item they have not rated
# (ratings are normalised before the formula and restored afterwards).
def p2(user, goods):
    """Predict ``user``'s rating for ``goods`` using normalised ratings.

    Same p(u, i) formula as ``p`` (page 75 of the "data mining guide" book
    cited throughout this file), except that the user's ratings are first
    converted with ``rate2newrate`` and the weighted average is converted
    back with ``newrate2rate``.

    Args:
        user: Row label of the user in the module-level ``df``.
        goods: Column label of the item to predict.

    Returns:
        The predicted rating on the raw scale, or NaN when every usable
        similarity is zero (there is nothing to base a prediction on).

    Raises:
        AssertionError: If ``user`` has already rated ``goods``.
    """
    # Explicit raise keeps the original exception type but is not stripped
    # when Python runs with -O.
    if not pd.isnull(df.loc[user, goods]):
        raise AssertionError(
            "user %r has already rated %r" % (user, goods)
        )

    # ``.loc`` replaces ``.ix``, which no longer exists in pandas >= 1.0.
    rated = df.loc[user].dropna()  # the user's existing ratings
    normalised = rated.apply(rate2newrate)  # normalise

    # Similarity of every rated item to the target item.
    similarities = pd.Series(
        [computeSimilarity(item, goods) for item in rated.index],
        index=rated.index,
        dtype=float,
    ).dropna()  # an undefined similarity must not poison the sums
    normalised = normalised[similarities.index]

    total_weight = similarities.abs().sum()
    if total_weight == 0:
        return float("nan")
    # Restore the weighted average to the raw rating scale.
    return newrate2rate((normalised * similarities).sum() / total_weight)
# csv_txt2
#print('\n测试:计算David对Kacey Musgraves的可能打分(修正)')
#print(p2("David","Kacey Musgraves"))
#==================================
# 下面是Slope One算法
#
# 两个步骤:
# 1. 计算差值
# 2. 预测用户对尚未评分物品的评分
#==================================
# Step 1: compute the deviation between two items.
def dev(goods1, goods2):
    """Return the average rating deviation of ``goods1`` over ``goods2``.

    Follows the dev(i, j) formula on page 80 of the "data mining guide"
    book cited throughout this file: the mean of (rating of ``goods1`` -
    rating of ``goods2``) over the users who rated both items.

    Args:
        goods1: Column label of the first item in the module-level ``df``.
        goods2: Column label of the second item.

    Returns:
        A ``(deviation, weight)`` tuple, where ``weight`` is the number of
        users who rated both items. When nobody rated both, ``(0.0, 0)`` is
        returned instead of dividing by zero; the zero weight keeps such a
        pair from contributing to a weighted Slope One average.
    """
    # Users missing either rating produce NaN and are dropped.
    differences = (df[goods1] - df[goods2]).dropna()
    weight = int(differences.size)
    if weight == 0:
        return 0.0, 0
    return float(differences.mean()), weight
# csv_txt2
#print('\n测试:计算Kacey Musgraves与Imagine Dragons的分数差异')
#print(dev("Kacey Musgraves","Imagine Dragons"))
# Compute the rating deviation for every pair of items, as a square
# DataFrame (row item versus column item).
def get_dev_table():
    """Build the item-by-item deviation table.

    Corresponds to the table on page 87 of the "data mining guide" book
    cited throughout this file. Cell ``[a, b]`` holds the deviation part of
    ``dev(a, b)``; the mirrored cell ``[b, a]`` holds its negation, and the
    diagonal stays at zero.

    Returns:
        A float DataFrame indexed and labelled by the item names of the
        module-level ``df``.
    """
    from itertools import combinations

    goods_names = df.columns.tolist()
    # Start from an all-zero square matrix.
    table = pd.DataFrame(0.0, index=goods_names, columns=goods_names)

    # Each unordered pair is computed once and mirrored.
    for goods1, goods2 in combinations(goods_names, 2):
        deviation, _ = dev(goods1, goods2)  # only the deviation is kept
        # ``.loc`` replaces ``.ix``, which no longer exists in pandas >= 1.0.
        table.loc[goods1, goods2] = deviation
        table.loc[goods2, goods1] = -deviation  # mirrored cell is negated
    return table
# Demo: print a label, then the deviation table for every pair of items.
print('\n测试:计算所有两两物品之间的评分差异表')
print(get_dev_table())
# Predict a user's rating for a given item.
# Weighted Slope One algorithm.
def slopeone(user, goods):
    """Predict ``user``'s rating for ``goods`` with weighted Slope One.

    Follows the p(u, j) formula on page 82 of the "data mining guide" book
    cited throughout this file: for every item the user has rated, the
    rating is shifted by the deviation of ``goods`` over that item, and the
    shifted ratings are averaged using the number of co-raters as weight.

    Args:
        user: Row label of the user in the module-level ``df``.
        goods: Column label of the item to predict.

    Returns:
        The predicted rating as a float, or NaN when the total weight is
        zero (the user has no ratings, or no rated item shares a rater with
        ``goods``).
    """
    # ``.loc`` replaces ``.ix``, which no longer exists in pandas >= 1.0.
    rated = df.loc[user].dropna()  # the user's existing ratings

    # (deviation, weight) of the target item over each rated item.
    pairs = [dev(goods, item) for item in rated.index]
    deviations = pd.Series(
        [deviation for deviation, _ in pairs], index=rated.index, dtype=float
    )
    weights = pd.Series(
        [weight for _, weight in pairs], index=rated.index, dtype=float
    )

    total_weight = weights.sum()
    if total_weight == 0:
        return float("nan")
    return float(((rated + deviations) * weights).sum() / total_weight)
# Demo: print a label, then the weighted Slope One prediction of Ben's
# rating for Whitney Houston.
print('\n测试:预测用户Ben对物品Whitney Houston的评分')
print(slopeone('Ben', 'Whitney Houston'))
# Prints 3.375 for the csv_txt3 data set.
# Reposted from Luo Bing's cnblogs blog. Original article:
# http://www.cnblogs.com/hhh5460/p/6121918.html
# Please contact the original author before republishing.