[Note 4] A recommendation algorithm on record-format data with pandas (user-based collaborative filtering)

Introduction:
'''
User-based collaborative filtering recommendation

Record-format data (one row per user-item rating)
'''

import pandas as pd
from io import StringIO
import json

# Data format 1: records of (user, item, rating), which avoid a giant sparse matrix
csv_txt = '''"Angelica","Blues Traveler",3.5
"Angelica","Broken Bells",2.0
"Angelica","Norah Jones",4.5
"Angelica","Phoenix",5.0
"Angelica","Slightly Stoopid",1.5
"Angelica","The Strokes",2.5
"Angelica","Vampire Weekend",2.0
"Bill","Blues Traveler",2.0
"Bill","Broken Bells",3.5
"Bill","Deadmau5",4.0
"Bill","Phoenix",2.0
"Bill","Slightly Stoopid",3.5
"Bill","Vampire Weekend",3.0
"Chan","Blues Traveler",5.0
"Chan","Broken Bells",1.0
"Chan","Deadmau5",1.0
"Chan","Norah Jones",3.0
"Chan","Phoenix",5.0
"Chan","Slightly Stoopid",1.0
"Dan","Blues Traveler",3.0
"Dan","Broken Bells",4.0
"Dan","Deadmau5",4.5
"Dan","Phoenix",3.0
"Dan","Slightly Stoopid",4.5
"Dan","The Strokes",4.0
"Dan","Vampire Weekend",2.0
"Hailey","Broken Bells",4.0
"Hailey","Deadmau5",1.0
"Hailey","Norah Jones",4.0
"Hailey","The Strokes",4.0
"Hailey","Vampire Weekend",1.0
"Jordyn","Broken Bells",4.5
"Jordyn","Deadmau5",4.0
"Jordyn","Norah Jones",5.0
"Jordyn","Phoenix",5.0
"Jordyn","Slightly Stoopid",4.5
"Jordyn","The Strokes",4.0
"Jordyn","Vampire Weekend",4.0
"Sam","Blues Traveler",5.0
"Sam","Broken Bells",2.0
"Sam","Norah Jones",3.0
"Sam","Phoenix",5.0
"Sam","Slightly Stoopid",4.0
"Sam","The Strokes",5.0
"Veronica","Blues Traveler",3.0
"Veronica","Norah Jones",5.0
"Veronica","Phoenix",4.0
"Veronica","Slightly Stoopid",2.5
"Veronica","The Strokes",3.0'''

# Data format 2: JSON (user -> {item: rating})
json_txt = '''{"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
                      "Norah Jones": 4.5, "Phoenix": 5.0,
                      "Slightly Stoopid": 1.5,
                      "The Strokes": 2.5, "Vampire Weekend": 2.0},
         
         "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5,
                 "Deadmau5": 4.0, "Phoenix": 2.0,
                 "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
         
         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,
                  "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5,
                  "Slightly Stoopid": 1.0},
         
         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,
                 "Deadmau5": 4.5, "Phoenix": 3.0,
                 "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                 "Vampire Weekend": 2.0},
         
         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,
                    "Norah Jones": 4.0, "The Strokes": 4.0,
                    "Vampire Weekend": 1.0},
         
         "Jordyn":  {"Broken Bells": 4.5, "Deadmau5": 4.0,
                     "Norah Jones": 5.0, "Phoenix": 5.0,
                     "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                     "Vampire Weekend": 4.0},
         
         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,
                 "Norah Jones": 3.0, "Phoenix": 5.0,
                 "Slightly Stoopid": 4.0, "The Strokes": 5.0},
         
         "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,
                      "Phoenix": 4.0, "Slightly Stoopid": 2.5,
                      "The Strokes": 3.0}
}'''


df = None

# Method 1: load the CSV data
def load_csv_txt():
    global df
    df = pd.read_csv(StringIO(csv_txt), header=None, names=['user','goods','rate'])

# Method 2: load the JSON data (turn the JSON into records)
def load_json_txt():
    global df
    # parse the JSON into a dict
    users = json.loads(json_txt)
    
    # walk the dict, producing one (user, item, rating) record per rating
    csv_txt_ = ''
    for user in users:
        for goods in users[user]:
            csv_txt_ += '{},{},{}\n'.format(user, goods, users[user][goods])
    
    df = pd.read_csv(StringIO(csv_txt_), header=None, names=['user','goods','rate'])
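
# Alternative sketch: the same DataFrame can be built straight from the parsed dict with
# a list comprehension, without writing intermediate CSV text (shown for comparison only,
# it is not called below).
def load_json_txt2():
    global df
    users = json.loads(json_txt)
    records = [(user, goods, rate)
               for user, ratings in users.items()
               for goods, rate in ratings.items()]
    df = pd.DataFrame(records, columns=['user', 'goods', 'rate'])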


print('Test: load the data')
#load_csv_txt()
load_json_txt()
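
# Sketch of the point made above about sparse matrices: pivoting the records into the
# wide user x item matrix shows every unrated cell as NaN.
print('\nTest: wide user x item rating matrix (NaN = not rated)')
print(df.pivot(index='user', columns='goods', values='rate'))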



def build_xy(user_name1, user_name2):
    df1 = df.loc[df['user'] == user_name1, ['goods','rate']]
    df2 = df.loc[df['user'] == user_name2, ['goods','rate']]
    
    df3 = pd.merge(df1, df2, on='goods', how='inner') # keep only the ratings on items both users rated
    
    return df3['rate_x'], df3['rate_y'] # default column names after the merge: rate_x, rate_y
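
# Quick check (sketch): build_xy pairs up the two users' ratings on the items they
# both rated; items rated by only one of them are dropped by the inner merge.
print('\nTest: paired ratings for Angelica and Bill')
x, y = build_xy('Angelica', 'Bill')
print(pd.DataFrame({'Angelica': x, 'Bill': y}))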
    



# Manhattan distance
def manhattan(user_name1, user_name2):
    x, y = build_xy(user_name1, user_name2)
    return sum(abs(x - y))
    
# Euclidean distance
def euclidean(user_name1, user_name2):
    x, y = build_xy(user_name1, user_name2)
    return sum((x - y)**2)**0.5
    
# Minkowski distance (r=1 reduces to Manhattan, r=2 to Euclidean)
def minkowski(user_name1, user_name2, r):
    x, y = build_xy(user_name1, user_name2)
    return sum(abs(x - y)**r)**(1/r)
    
# Pearson correlation coefficient
def pearson(user_name1, user_name2):
    x, y = build_xy(user_name1, user_name2)
    mean1, mean2 = x.mean(), y.mean()
    # denominator: product of the two deviation terms
    denominator = (sum((x-mean1)**2)*sum((y-mean2)**2))**0.5
    return sum((x-mean1)*(y-mean2))/denominator if denominator != 0 else 0

# Cosine similarity (copes well with sparse data; widely used in text mining)
def cosine(user_name1, user_name2):
    x, y = build_xy(user_name1, user_name2)
    # denominator: product of the two vector norms
    denominator = (sum(x*x)*sum(y*y))**0.5
    return sum(x*y)/denominator if denominator != 0 else 0
    
metric_funcs = {
    'manhattan': manhattan,
    'euclidean': euclidean,
    'minkowski': minkowski,
    'pearson': pearson,
    'cosine': cosine
}


print('\nTest: Manhattan distance between Angelica and Bill')
print(manhattan('Angelica','Bill'))
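
# Sketch: the same pair of users under every measure in metric_funcs. For the three
# distance measures smaller means more alike; for pearson/cosine larger means more alike.
print('\nTest: all measures for Angelica vs Bill')
for name, func in metric_funcs.items():
    value = func('Angelica', 'Bill', 2) if name == 'minkowski' else func('Angelica', 'Bill')
    print('{:10s} {:.4f}'.format(name, value))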


# Compute the nearest neighbours (returns a pd.Series)
def computeNearestNeighbor(user_name, metric='pearson', k=3, r=2):
    '''
    metric: measure to use
    k:      number of neighbours to return
    r:      only used by the Minkowski distance
    
    Returns a pd.Series whose index is the neighbour names and whose values are the
    distances (or similarities, for pearson/cosine)
    '''
    array = df[df['user'] != user_name]['user'].unique()
    if metric in ['manhattan', 'euclidean']:
        return pd.Series(array, index=array.tolist()).apply(metric_funcs[metric], args=(user_name,)).nsmallest(k)
    elif metric in ['minkowski']:
        return pd.Series(array, index=array.tolist()).apply(metric_funcs[metric], args=(user_name, r,)).nsmallest(k)
    elif metric in ['pearson', 'cosine']:
        return pd.Series(array, index=array.tolist()).apply(metric_funcs[metric], args=(user_name,)).nlargest(k)
    
    
print('\nTest: nearest neighbours of Hailey')
print(computeNearestNeighbor('Hailey'))
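
# Quick sketch: the neighbour ranking depends on the measure; with a distance the
# smallest values are kept, with pearson/cosine the largest.
print('\nTest: nearest neighbours of Hailey (Manhattan distance)')
print(computeNearestNeighbor('Hailey', metric='manhattan'))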


# Recommend to a given user from the single nearest neighbour (returns a pd.DataFrame)
def recommend(user_name):
    """Return the recommendation list"""
    # find the name of the closest user
    nearest_username = computeNearestNeighbor(user_name).index[0]
    
    # find the bands this neighbour has rated but the user has not
    df1 = df.loc[df['user'] == user_name, ['goods', 'rate']]
    df2 = df.loc[df['user'] == nearest_username, ['goods', 'rate']]
    
    df3 = pd.merge(df1, df2, on='goods', how='outer')
    
    return df3.loc[(df3['rate_x'].isnull()) & (df3['rate_y'].notnull()), ['goods', 'rate_y']].sort_values(by='rate_y', ascending=False) # highest rated first

    
print('\nTest: recommendations for Hailey')
print(recommend('Hailey'))


# Recommend to a given user using k weighted neighbours (returns a pd.Series)
def recommend2(user_name, metric='pearson', k=3, n=5, r=2):
    '''
    metric: measure to use
    k:      recommend collaboratively from the k nearest neighbours
    r:      only used by the Minkowski distance
    n:      number of items to recommend
    
    Returns a pd.Series whose index is the item names and whose values are the weighted ratings
    '''
    # find the k nearest neighbours (pass the chosen metric through, not a hard-coded one)
    nearest_neighbors = computeNearestNeighbor(user_name, metric=metric, k=k, r=r)
    
    # turn the values into weights
    if metric in ['manhattan', 'euclidean', 'minkowski']: # smaller distance means more similar,
        nearest_neighbors = 1 / nearest_neighbors         # so take the reciprocal (any decreasing function such as y=2**-x would also do)
    elif metric in ['pearson', 'cosine']:                 # larger value already means more similar
        pass
        
    nearest_neighbors = nearest_neighbors / nearest_neighbors.sum() # normalise so the weights sum to 1
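    # Worked example with hypothetical numbers: Manhattan distances 2.0, 4.0, 8.0 become
    # 0.5, 0.25, 0.125 after taking reciprocals; dividing by their sum (0.875) gives
    # weights of roughly 0.57, 0.29 and 0.14, which sum to 1.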
    
    # for each neighbour, find the bands (items) they rated but the user has not, and weight those ratings
    neighbors_rate_with_weight = []
    for neighbor_name in nearest_neighbors.index:
        # each element: a pd.Series indexed by item name, whose values are the ratings already multiplied by the weight
        df1 = df.loc[df['user'] == user_name, ['goods', 'rate']]
        df2 = df.loc[df['user'] == neighbor_name, ['goods', 'rate']]
        
        df3 = pd.merge(df1, df2, on='goods', how='outer')
        
        df4 = df3.loc[(df3['rate_x'].isnull()) & (df3['rate_y'].notnull()), ['goods', 'rate_y']]
        
        # note the conversion to a pd.Series (indexed by item name) before applying the weight
        neighbors_rate_with_weight.append(pd.Series(df4['rate_y'].tolist(), index=df4['goods']) * nearest_neighbors[neighbor_name])

    # concatenate the neighbours' weighted ratings into a pd.DataFrame (one column per
    # neighbour), sum each item's row, and keep the n items with the largest scores
    return pd.concat(neighbors_rate_with_weight, axis=1).sum(axis=1, skipna=True).nlargest(n)
    
print('\nTest: recommendations for Hailey (manhattan, k=3)')
print(recommend2('Hailey', metric='manhattan', k=3, n=5))

print('\nTest: recommendations for Hailey (euclidean, k=3)')
print(recommend2('Hailey', metric='euclidean', k=3, n=5, r=2))

print('\nTest: recommendations for Hailey (pearson, k=1)')
print(recommend2('Hailey', metric='pearson', k=1, n=5))
Reposted from 罗兵's cnblogs blog. Original link: http://www.cnblogs.com/hhh5460/p/6121899.html . Please contact the original author before reprinting.