'''
User-based collaborative recommendation.
Item (per-entry) rating data.
'''
import pandas as pd
from io import StringIO
import json
# Sample ratings as headerless CSV rows: user, goods, rate.
# Every row must carry exactly three fields so that it can be parsed with the
# three column names used by load_csv_txt(); a stray trailing comma on the
# Chan/Phoenix row previously gave that row a fourth, empty field.
csv_txt = '''"Angelica","Blues Traveler",3.5
"Angelica","Broken Bells",2.0
"Angelica","Norah Jones",4.5
"Angelica","Phoenix",5.0
"Angelica","Slightly Stoopid",1.5
"Angelica","The Strokes",2.5
"Angelica","Vampire Weekend",2.0
"Bill","Blues Traveler",2.0
"Bill","Broken Bells",3.5
"Bill","Deadmau5",4.0
"Bill","Phoenix",2.0
"Bill","Slightly Stoopid",3.5
"Bill","Vampire Weekend",3.0
"Chan","Blues Traveler",5.0
"Chan","Broken Bells",1.0
"Chan","Deadmau5",1.0
"Chan","Norah Jones",3.0
"Chan","Phoenix",5.0
"Chan","Slightly Stoopid",1.0
"Dan","Blues Traveler",3.0
"Dan","Broken Bells",4.0
"Dan","Deadmau5",4.5
"Dan","Phoenix",3.0
"Dan","Slightly Stoopid",4.5
"Dan","The Strokes",4.0
"Dan","Vampire Weekend",2.0
"Hailey","Broken Bells",4.0
"Hailey","Deadmau5",1.0
"Hailey","Norah Jones",4.0
"Hailey","The Strokes",4.0
"Hailey","Vampire Weekend",1.0
"Jordyn","Broken Bells",4.5
"Jordyn","Deadmau5",4.0
"Jordyn","Norah Jones",5.0
"Jordyn","Phoenix",5.0
"Jordyn","Slightly Stoopid",4.5
"Jordyn","The Strokes",4.0
"Jordyn","Vampire Weekend",4.0
"Sam","Blues Traveler",5.0
"Sam","Broken Bells",2.0
"Sam","Norah Jones",3.0
"Sam","Phoenix",5.0
"Sam","Slightly Stoopid",4.0
"Sam","The Strokes",5.0
"Veronica","Blues Traveler",3.0
"Veronica","Norah Jones",5.0
"Veronica","Phoenix",4.0
"Veronica","Slightly Stoopid",2.5
"Veronica","The Strokes",3.0'''
# Sample ratings as nested JSON: {user: {goods: rate}}.
# load_json_txt() flattens this mapping into (user, goods, rate) rows.
json_txt = '''{"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
"Norah Jones": 4.5, "Phoenix": 5.0,
"Slightly Stoopid": 1.5,
"The Strokes": 2.5, "Vampire Weekend": 2.0},
"Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5,
"Deadmau5": 4.0, "Phoenix": 2.0,
"Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
"Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,
"Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5,
"Slightly Stoopid": 1.0},
"Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,
"Deadmau5": 4.5, "Phoenix": 3.0,
"Slightly Stoopid": 4.5, "The Strokes": 4.0,
"Vampire Weekend": 2.0},
"Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,
"Norah Jones": 4.0, "The Strokes": 4.0,
"Vampire Weekend": 1.0},
"Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0,
"Norah Jones": 5.0, "Phoenix": 5.0,
"Slightly Stoopid": 4.5, "The Strokes": 4.0,
"Vampire Weekend": 4.0},
"Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,
"Norah Jones": 3.0, "Phoenix": 5.0,
"Slightly Stoopid": 4.0, "The Strokes": 5.0},
"Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,
"Phoenix": 4.0, "Slightly Stoopid": 2.5,
"The Strokes": 3.0}
}'''
# Module-wide ratings table with columns user / goods / rate; it stays None
# until load_csv_txt() or load_json_txt() assigns it.
df = None
def load_csv_txt():
    """Parse the module-level ``csv_txt`` and store the table in the global ``df``.

    The text has no header row, so the columns are named explicitly as
    ``user``, ``goods`` and ``rate``.
    """
    global df
    column_names = ['user', 'goods', 'rate']
    buffer = StringIO(csv_txt)
    df = pd.read_csv(buffer, header=None, names=column_names)
def load_json_txt():
    """Parse the module-level ``json_txt`` and store the table in the global ``df``.

    ``json_txt`` maps each user to a ``{goods: rate}`` dict. The nested mapping
    is flattened into one ``(user, goods, rate)`` row per rating, giving a
    DataFrame with the columns ``user``, ``goods`` and ``rate``.
    """
    global df
    users = json.loads(json_txt)
    # Build the rows directly rather than formatting an intermediate CSV
    # string: that text was written without quoting, so a user or goods name
    # containing a comma or a quote would have corrupted the round trip.
    records = [
        (user, goods, rate)
        for user, ratings in users.items()
        for goods, rate in ratings.items()
    ]
    df = pd.DataFrame(records, columns=['user', 'goods', 'rate'])
# Smoke test: fill the global df from the JSON sample data. Everything below
# reads df, so this call has to run before any metric or recommendation.
print('测试:读取数据')
load_json_txt()
def build_xy(user_name1, user_name2):
    """Return both users' ratings for the goods that both of them rated.

    Args:
        user_name1: Name of the first user (a value of the ``user`` column).
        user_name2: Name of the second user.

    Returns:
        A pair of pd.Series ``(rate_x, rate_y)`` aligned row by row: the
        ratings given by ``user_name1`` and by ``user_name2`` to each co-rated
        goods. Both are empty when the users share no goods.
    """
    # DataFrame.ix was removed in pandas 1.0; .loc performs the same
    # boolean-mask row selection combined with column-label selection.
    df1 = df.loc[df['user'] == user_name1, ['goods', 'rate']]
    df2 = df.loc[df['user'] == user_name2, ['goods', 'rate']]
    # The inner join keeps only goods rated by both users; merge renames the
    # clashing 'rate' columns to rate_x (first user) and rate_y (second user).
    df3 = pd.merge(df1, df2, on='goods', how='inner')
    return df3['rate_x'], df3['rate_y']
def manhattan(user_name1, user_name2):
    """Manhattan distance between two users over the goods both have rated."""
    ratings_a, ratings_b = build_xy(user_name1, user_name2)
    gaps = ratings_a - ratings_b
    return sum(gaps.abs())
def euclidean(user_name1, user_name2):
    """Euclidean distance between two users over the goods both have rated."""
    ratings_a, ratings_b = build_xy(user_name1, user_name2)
    gaps = ratings_a - ratings_b
    squared_total = sum(gaps ** 2)
    return squared_total ** 0.5
def minkowski(user_name1, user_name2, r):
    """Minkowski distance of order ``r`` between two users over co-rated goods.

    ``r=1`` gives the Manhattan distance and ``r=2`` the Euclidean distance.
    """
    ratings_a, ratings_b = build_xy(user_name1, user_name2)
    gaps = (ratings_a - ratings_b).abs()
    powered_total = sum(gaps ** r)
    exponent = 1 / r
    return powered_total ** exponent
def pearson(user_name1, user_name2):
    """Pearson correlation of two users' ratings over the goods both rated.

    Args:
        user_name1: Name of the first user.
        user_name2: Name of the second user.

    Returns:
        The Pearson correlation coefficient, or 0.0 when it is undefined
        because the denominator is zero (no co-rated goods, or one user's
        co-rated ratings are all identical).
    """
    x, y = build_xy(user_name1, user_name2)
    x_dev = x - x.mean()
    y_dev = y - y.mean()
    denominator = ((x_dev ** 2).sum() * (y_dev ** 2).sum()) ** 0.5
    # Test before dividing. The previous `[a / d, 0][d == 0]` form always
    # evaluated the division (so a zero denominator was still divided by) and
    # used a NumPy boolean as a list index, which NumPy has deprecated.
    if denominator == 0:
        return 0.0
    return (x_dev * y_dev).sum() / denominator
def cosine(user_name1, user_name2):
    """Cosine similarity of two users' ratings over the goods both rated.

    Args:
        user_name1: Name of the first user.
        user_name2: Name of the second user.

    Returns:
        The cosine of the angle between the two co-rated rating vectors, or
        0.0 when either vector has zero length (including no co-rated goods).
    """
    x, y = build_xy(user_name1, user_name2)
    denominator = ((x * x).sum() * (y * y).sum()) ** 0.5
    # Test before dividing. The previous `[a / d, 0][d == 0]` form always
    # evaluated the division (so a zero denominator was still divided by) and
    # used a NumPy boolean as a list index, which NumPy has deprecated.
    if denominator == 0:
        return 0.0
    return (x * y).sum() / denominator
# Dispatch table from metric name to the function implementing it. Each
# function takes two user names; minkowski additionally takes the order r.
metric_funcs = dict(
    manhattan=manhattan,
    euclidean=euclidean,
    minkowski=minkowski,
    pearson=pearson,
    cosine=cosine,
)
# Smoke test: Manhattan distance between Angelica and Bill.
print('\n测试:计算Angelica与Bill的曼哈顿距离')
print(manhattan('Angelica', 'Bill'))
def computeNearestNeighbor(user_name, metric='pearson', k=3, r=2):
    """Return the ``k`` users closest to ``user_name`` under ``metric``.

    Args:
        user_name: User whose neighbours are wanted.
        metric: Name of the measure, a key of ``metric_funcs``: 'manhattan',
            'euclidean', 'minkowski', 'pearson' or 'cosine'.
        k: Number of neighbours to return.
        r: Order of the Minkowski distance; only used when ``metric`` is
            'minkowski'.

    Returns:
        pd.Series indexed by neighbour name. For the distance metrics the
        values are distances, smallest first; for 'pearson' and 'cosine' they
        are similarities, largest first.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    distance_metrics = ('manhattan', 'euclidean', 'minkowski')
    similarity_metrics = ('pearson', 'cosine')
    if metric not in distance_metrics + similarity_metrics:
        # Previously an unknown metric fell through every branch and returned
        # None, which only surfaced later as an unrelated AttributeError.
        raise ValueError('unknown metric: {!r}'.format(metric))

    others = df[df['user'] != user_name]['user'].unique()
    if len(others) == 0:
        # No other users: return an empty numeric Series instead of letting
        # nsmallest/nlargest run on an empty object-dtype Series.
        return pd.Series(dtype=float)

    # Series.apply calls func(neighbour, *args), i.e. func(neighbour,
    # user_name) or, for minkowski, func(neighbour, user_name, r).
    extra_args = (user_name, r) if metric == 'minkowski' else (user_name,)
    scores = pd.Series(others, index=others.tolist()).apply(
        metric_funcs[metric], args=extra_args)

    # Similarities: higher means closer. Distances: lower means closer.
    if metric in similarity_metrics:
        return scores.nlargest(k)
    return scores.nsmallest(k)
# Smoke test: Hailey's nearest neighbours with the default arguments
# (metric='pearson', k=3).
print('\n测试:计算Hailey的最近邻居')
print(computeNearestNeighbor('Hailey'))
def recommend(user_name):
    """Recommend goods that the single nearest neighbour rated but the user has not.

    The nearest neighbour is the first entry returned by
    ``computeNearestNeighbor`` with its default arguments.

    Args:
        user_name: User to produce recommendations for.

    Returns:
        pd.DataFrame with the columns ``goods`` and ``rate_y`` (the
        neighbour's rating), highest rating first. Empty when there is no
        neighbour or nothing new to recommend.
    """
    neighbors = computeNearestNeighbor(user_name)
    if neighbors.empty:
        return pd.DataFrame(columns=['goods', 'rate_y'])
    nearest_username = neighbors.index[0]
    # DataFrame.ix was removed in pandas 1.0; .loc is the label/boolean-mask
    # equivalent used here.
    df1 = df.loc[df['user'] == user_name, ['goods', 'rate']]
    df2 = df.loc[df['user'] == nearest_username, ['goods', 'rate']]
    # Outer join: goods only the neighbour rated have a null rate_x (the
    # user's rating) and a non-null rate_y (the neighbour's rating).
    df3 = pd.merge(df1, df2, on='goods', how='outer')
    unseen = df3['rate_x'].isnull() & df3['rate_y'].notnull()
    # Best-rated goods first, matching recommend2, which returns nlargest().
    return df3.loc[unseen, ['goods', 'rate_y']].sort_values(by='rate_y', ascending=False)
# Smoke test: recommendations for Hailey from her single nearest neighbour.
print('\n测试:为Hailey做推荐')
print(recommend('Hailey'))
def recommend2(user_name, metric='pearson', k=3, n=5, r=2):
    """Recommend goods from the weighted ratings of the ``k`` nearest neighbours.

    Args:
        user_name: User to produce recommendations for.
        metric: Measure passed to ``computeNearestNeighbor``: 'manhattan',
            'euclidean', 'minkowski', 'pearson' or 'cosine'.
        k: Number of nearest neighbours that contribute.
        n: Maximum number of goods to return.
        r: Order of the Minkowski distance; only used for 'minkowski'.

    Returns:
        pd.Series indexed by goods name whose values are the weighted sum of
        the neighbours' ratings, limited to the ``n`` largest. Only goods the
        user has not rated are included. Empty when there are no neighbours.
    """
    # Pass the caller's metric through. It used to be hard-coded to 'pearson',
    # so the distance metrics were never computed and the branch below
    # inverted Pearson scores instead of distances.
    nearest_neighbors = computeNearestNeighbor(user_name, metric=metric, k=k, r=r)
    if metric in ('manhattan', 'euclidean', 'minkowski'):
        # Convert distances to similarities. The +1 keeps the weight finite
        # for a neighbour at distance 0 (plain 1/distance would be infinite
        # and turn every normalised weight into NaN).
        nearest_neighbors = 1.0 / (1.0 + nearest_neighbors)
    # Normalise so the weights sum to 1.
    nearest_neighbors = nearest_neighbors / nearest_neighbors.sum()

    # The user's own ratings do not depend on the neighbour: select them once.
    # (.loc replaces DataFrame.ix, which was removed in pandas 1.0.)
    own_ratings = df.loc[df['user'] == user_name, ['goods', 'rate']]
    neighbors_rate_with_weight = []
    for neighbor_name, weight in nearest_neighbors.items():
        neighbor_ratings = df.loc[df['user'] == neighbor_name, ['goods', 'rate']]
        merged = pd.merge(own_ratings, neighbor_ratings, on='goods', how='outer')
        # Goods only the neighbour rated: null rate_x, non-null rate_y.
        unseen = merged.loc[
            merged['rate_x'].isnull() & merged['rate_y'].notnull(),
            ['goods', 'rate_y'],
        ]
        neighbors_rate_with_weight.append(
            pd.Series(unseen['rate_y'].tolist(), index=unseen['goods']) * weight)

    if not neighbors_rate_with_weight:
        # pd.concat raises on an empty list; no neighbours means no result.
        return pd.Series(dtype=float)
    # One column per neighbour; summing across columns (skipping goods a
    # neighbour did not rate) gives the weighted score of each goods.
    return pd.concat(neighbors_rate_with_weight, axis=1).sum(axis=1, skipna=True).nlargest(n)
# Smoke tests: weighted recommendations for Hailey under three setups
# (metric, number of neighbours); n is 5 and r keeps its default of 2.
for _metric, _k in (('manhattan', 3), ('euclidean', 3), ('pearson', 1)):
    print('\n测试:为Hailey做推荐')
    print(recommend2('Hailey', metric=_metric, k=_k, n=5))