【笔记4】用pandas实现条目数据格式的推荐算法 (基于用户的协同)

简介: 基于用户的协同过滤推荐示例:用 pandas 处理条目格式(用户、商品、打分)的评分数据,避免构造巨型稀疏矩阵;数据可以从 CSV 文本或 JSON 文本加载。
'''
User-based collaborative filtering recommendation

Record-format data (one rating per row)
'''

import pandas as pd
from io import StringIO
import json

# Data format 1: flat records (user, goods, rate), one rating per line.
# Keeping only the ratings that exist avoids building a huge sparse matrix.
# Every record must have exactly three fields: the text is parsed with three
# column names and no header row.
csv_txt = '''"Angelica","Blues Traveler",3.5
"Angelica","Broken Bells",2.0
"Angelica","Norah Jones",4.5
"Angelica","Phoenix",5.0
"Angelica","Slightly Stoopid",1.5
"Angelica","The Strokes",2.5
"Angelica","Vampire Weekend",2.0
"Bill","Blues Traveler",2.0
"Bill","Broken Bells",3.5
"Bill","Deadmau5",4.0
"Bill","Phoenix",2.0
"Bill","Slightly Stoopid",3.5
"Bill","Vampire Weekend",3.0
"Chan","Blues Traveler",5.0
"Chan","Broken Bells",1.0
"Chan","Deadmau5",1.0
"Chan","Norah Jones",3.0
"Chan","Phoenix",5.0
"Chan","Slightly Stoopid",1.0
"Dan","Blues Traveler",3.0
"Dan","Broken Bells",4.0
"Dan","Deadmau5",4.5
"Dan","Phoenix",3.0
"Dan","Slightly Stoopid",4.5
"Dan","The Strokes",4.0
"Dan","Vampire Weekend",2.0
"Hailey","Broken Bells",4.0
"Hailey","Deadmau5",1.0
"Hailey","Norah Jones",4.0
"Hailey","The Strokes",4.0
"Hailey","Vampire Weekend",1.0
"Jordyn","Broken Bells",4.5
"Jordyn","Deadmau5",4.0
"Jordyn","Norah Jones",5.0
"Jordyn","Phoenix",5.0
"Jordyn","Slightly Stoopid",4.5
"Jordyn","The Strokes",4.0
"Jordyn","Vampire Weekend",4.0
"Sam","Blues Traveler",5.0
"Sam","Broken Bells",2.0
"Sam","Norah Jones",3.0
"Sam","Phoenix",5.0
"Sam","Slightly Stoopid",4.0
"Sam","The Strokes",5.0
"Veronica","Blues Traveler",3.0
"Veronica","Norah Jones",5.0
"Veronica","Phoenix",4.0
"Veronica","Slightly Stoopid",2.5
"Veronica","The Strokes",3.0'''

# Data format 2: JSON text mapping each user to an object of {goods: rate} pairs.
json_txt = '''{"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
                      "Norah Jones": 4.5, "Phoenix": 5.0,
                      "Slightly Stoopid": 1.5,
                      "The Strokes": 2.5, "Vampire Weekend": 2.0},
         
         "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5,
                 "Deadmau5": 4.0, "Phoenix": 2.0,
                 "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
         
         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,
                  "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5,
                  "Slightly Stoopid": 1.0},
         
         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,
                 "Deadmau5": 4.5, "Phoenix": 3.0,
                 "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                 "Vampire Weekend": 2.0},
         
         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,
                    "Norah Jones": 4.0, "The Strokes": 4.0,
                    "Vampire Weekend": 1.0},
         
         "Jordyn":  {"Broken Bells": 4.5, "Deadmau5": 4.0,
                     "Norah Jones": 5.0, "Phoenix": 5.0,
                     "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                     "Vampire Weekend": 4.0},
         
         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,
                 "Norah Jones": 3.0, "Phoenix": 5.0,
                 "Slightly Stoopid": 4.0, "The Strokes": 5.0},
         
         "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,
                      "Phoenix": 4.0, "Slightly Stoopid": 2.5,
                      "The Strokes": 3.0}
}'''


# Module-level ratings table shared by the functions in this file.
# It is None here and is rebound to a DataFrame with the columns
# 'user', 'goods' and 'rate' by load_csv_txt() / load_json_txt().
df = None

# Option 1: load the CSV-formatted ratings
def load_csv_txt():
    """Parse the module-level ``csv_txt`` string into the global ``df``.

    The text carries no header row, so the three columns are named
    explicitly: ``user``, ``goods`` and ``rate``.  Nothing is returned; the
    resulting DataFrame is published through the module-level ``df``.
    """
    global df
    column_names = ['user', 'goods', 'rate']
    buffer = StringIO(csv_txt)
    df = pd.read_csv(buffer, names=column_names, header=None)

# Option 2: load the JSON ratings (flattened into one record per rating)
def load_json_txt():
    """Parse the module-level ``json_txt`` string into the global ``df``.

    ``json_txt`` is decoded as a JSON object that maps each user name to an
    object of ``{goods: rate}`` pairs.  Every pair becomes one row of ``df``
    with the columns ``user``, ``goods`` and ``rate``.  Nothing is returned;
    the resulting DataFrame is published through the module-level ``df``.

    The rows are handed to pandas directly instead of being written out as
    unquoted CSV text and parsed again: a name containing a comma or a quote
    would corrupt such an intermediate CSV, and an empty JSON object would
    produce empty CSV text that ``read_csv`` refuses to parse.
    """
    global df
    users = json.loads(json_txt)

    # One (user, goods, rate) tuple per rating, in JSON document order.
    records = [
        (user, goods, rate)
        for user, ratings in users.items()
        for goods, rate in ratings.items()
    ]

    df = pd.DataFrame(records, columns=['user', 'goods', 'rate'])


print('测试:读取数据')
# Populate the global df from the JSON sample; the commented-out call below
# is the alternative that reads the CSV sample instead.
#load_csv_txt()
load_json_txt()



def build_xy(user_name1, user_name2):
    """Return the two users' ratings for the goods both of them rated.

    Parameters:
        user_name1: value matched against the ``user`` column of ``df``.
        user_name2: value matched against the ``user`` column of ``df``.

    Returns:
        A pair ``(x, y)`` of pandas Series with the same length: ``x`` holds
        ``user_name1``'s ratings and ``y`` holds ``user_name2``'s ratings,
        aligned row by row on the same goods.  Both are empty when the users
        have no goods in common.
    """
    # .loc replaces the .ix indexer, which no longer exists in pandas >= 1.0.
    ratings1 = df.loc[df['user'] == user_name1, ['goods', 'rate']]
    ratings2 = df.loc[df['user'] == user_name2, ['goods', 'rate']]

    # The inner join keeps only the goods rated by both users.
    common = pd.merge(ratings1, ratings2, on='goods', how='inner')

    # merge() suffixes the clashing 'rate' columns as rate_x / rate_y.
    return common['rate_x'], common['rate_y']
    



# Manhattan distance
def manhattan(user_name1, user_name2):
    """Return the Manhattan (L1) distance between two users.

    Only the goods rated by both users (as returned by ``build_xy``) take
    part: the result is the sum of the absolute rating differences.
    """
    ratings_a, ratings_b = build_xy(user_name1, user_name2)
    gaps = abs(ratings_a - ratings_b)
    return sum(gaps)
    
# Euclidean distance
def euclidean(user_name1, user_name2):
    """Return the Euclidean (L2) distance between two users.

    Only the goods rated by both users (as returned by ``build_xy``) take
    part: the result is the square root of the summed squared differences.
    """
    ratings_a, ratings_b = build_xy(user_name1, user_name2)
    gaps = ratings_a - ratings_b
    squared_total = sum(gaps ** 2)
    return squared_total ** 0.5
    
# Minkowski distance
def minkowski(user_name1, user_name2, r):
    """Return the Minkowski distance of order ``r`` between two users.

    Only the goods rated by both users (as returned by ``build_xy``) take
    part: the absolute differences are raised to the power ``r``, summed,
    and the ``r``-th root of that sum is returned.
    """
    ratings_a, ratings_b = build_xy(user_name1, user_name2)
    gaps = abs(ratings_a - ratings_b)
    powered_total = sum(gaps ** r)
    return powered_total ** (1 / r)
    
# Pearson correlation coefficient
def pearson(user_name1, user_name2):
    """Return the Pearson correlation between two users' ratings.

    Only the goods rated by both users (as returned by ``build_xy``) take
    part.  Returns 0 when the denominator is zero, which happens when the
    users share no goods or when either user's shared ratings are all equal.
    """
    x, y = build_xy(user_name1, user_name2)
    dev_x = x - x.mean()
    dev_y = y - y.mean()

    denominator = ((dev_x ** 2).sum() * (dev_y ** 2).sum()) ** 0.5
    # Check before dividing: evaluating the quotient first would divide by
    # zero whenever the correlation is undefined.
    if denominator == 0:
        return 0
    return (dev_x * dev_y).sum() / denominator

# Cosine similarity (copes with sparse data; widely used in text mining)
def cosine(user_name1, user_name2):
    """Return the cosine similarity between two users' ratings.

    Only the goods rated by both users (as returned by ``build_xy``) take
    part.  Returns 0 when the denominator is zero, which happens when the
    users share no goods or when either user's shared ratings are all zero.
    """
    x, y = build_xy(user_name1, user_name2)

    denominator = ((x * x).sum() * (y * y).sum()) ** 0.5
    # Check before dividing: evaluating the quotient first would divide by
    # zero whenever the similarity is undefined.
    if denominator == 0:
        return 0
    return (x * y).sum() / denominator
    
# Lookup table from a metric's name to the function that computes it.
metric_funcs = dict(
    manhattan=manhattan,
    euclidean=euclidean,
    minkowski=minkowski,
    pearson=pearson,
    cosine=cosine,
)


# Smoke test: print the Manhattan distance between Angelica and Bill.
print('\n测试:计算Angelica与Bill的曼哈顿距离')
print(manhattan('Angelica','Bill'))


# Find the nearest neighbours (returns: pd.Series)
def computeNearestNeighbor(user_name, metric='pearson', k=3, r=2):
    '''
    Rank every other user in ``df`` against ``user_name`` and keep the best k.

    metric: name of the metric, one of the keys of ``metric_funcs``
    k:      number of neighbours to return
    r:      order of the Minkowski distance (only used by 'minkowski')

    Returns: pd.Series whose index holds the neighbour names and whose
             values hold the metric value between each neighbour and
             ``user_name``.  For 'pearson' and 'cosine' the k largest values
             are kept; for the distance metrics the k smallest.

    Raises:  ValueError if ``metric`` is not a key of ``metric_funcs``.
    '''
    if metric not in metric_funcs:
        raise ValueError(
            'unknown metric {!r}; expected one of {}'.format(metric, sorted(metric_funcs))
        )

    others = df.loc[df['user'] != user_name, 'user'].unique()
    if len(others) == 0:
        # Nobody to compare with: return an empty numeric Series instead of
        # letting nlargest/nsmallest fail on an empty object-dtype Series.
        return pd.Series(dtype=float)

    # Each candidate name is both the label and the value, so apply() passes
    # the name to the metric function as its first argument.
    candidates = pd.Series(others, index=others.tolist())
    extra_args = (user_name, r) if metric == 'minkowski' else (user_name,)
    scores = candidates.apply(metric_funcs[metric], args=extra_args)

    # Similarities: larger means closer.  Distances: smaller means closer.
    if metric in ('pearson', 'cosine'):
        return scores.nlargest(k)
    return scores.nsmallest(k)
    
    
# Smoke test: print Hailey's nearest neighbours using the default arguments.
print('\n测试:计算Hailey的最近邻居')
print(computeNearestNeighbor('Hailey'))


# Recommend to the given user (returns: pd.DataFrame)
def recommend(user_name):
    """Recommend the goods rated by the single nearest neighbour.

    The nearest neighbour is the first entry returned by
    ``computeNearestNeighbor`` with its default arguments.

    Returns:
        pd.DataFrame with the columns ``goods`` and ``rate_y`` (the
        neighbour's rating), limited to goods that ``user_name`` has not
        rated and ordered from the highest rating to the lowest.
    """
    # Name of the closest user.
    nearest_username = computeNearestNeighbor(user_name).index[0]

    # .loc replaces the .ix indexer, which no longer exists in pandas >= 1.0.
    own = df.loc[df['user'] == user_name, ['goods', 'rate']]
    theirs = df.loc[df['user'] == nearest_username, ['goods', 'rate']]

    # Outer join: rate_x is the user's own rating, rate_y the neighbour's.
    merged = pd.merge(own, theirs, on='goods', how='outer')

    # Keep the goods the neighbour rated but the user did not.
    unseen = merged['rate_x'].isnull() & merged['rate_y'].notnull()

    # Best-rated goods first, so the top row is the strongest recommendation.
    return merged.loc[unseen, ['goods', 'rate_y']].sort_values(by='rate_y', ascending=False)

    
# Smoke test: print the single-neighbour recommendations for Hailey.
print('\n测试:为Hailey做推荐')
print(recommend('Hailey'))


# Recommend to the given user (returns: pd.Series)
def recommend2(user_name, metric='pearson', k=3, n=5, r=2):
    '''
    Recommend goods from the weighted ratings of the k nearest neighbours.

    metric: name of the metric used both to pick and to weight the neighbours
    k:      number of nearest neighbours that contribute
    r:      order of the Minkowski distance (only used by 'minkowski')
    n:      number of goods to recommend

    Returns: pd.Series whose index holds the goods names and whose values
             hold the weighted ratings, limited to the n largest.
    '''
    # Pick the k nearest neighbours with the metric the caller asked for.
    nearest_neighbors = computeNearestNeighbor(user_name, metric=metric, k=k, r=r)

    # Turn the metric values into weights.
    if metric in ('manhattan', 'euclidean', 'minkowski'):
        # Distances: smaller means more similar, so weight by the reciprocal.
        identical = nearest_neighbors == 0
        if identical.any():
            # 1/0 would yield inf and then NaN weights; neighbours at
            # distance zero share the whole weight instead.
            weights = identical.astype(float)
        else:
            weights = 1 / nearest_neighbors
    else:
        # Similarities ('pearson', 'cosine'): larger means more similar.
        # NOTE: the values are used as they are; negative similarities or a
        # zero total are not treated specially here.
        weights = nearest_neighbors

    weights = weights / weights.sum()  # normalise so the weights sum to 1

    # The user's own ratings do not depend on the neighbour: look them up once.
    # .loc replaces the .ix indexer, which no longer exists in pandas >= 1.0.
    own = df.loc[df['user'] == user_name, ['goods', 'rate']]

    # For each neighbour, collect the ratings of goods the user has not
    # rated, multiplied by that neighbour's weight.
    neighbors_rate_with_weight = []
    for neighbor_name, weight in weights.items():
        theirs = df.loc[df['user'] == neighbor_name, ['goods', 'rate']]

        # Outer join: rate_x is the user's own rating, rate_y the neighbour's.
        merged = pd.merge(own, theirs, on='goods', how='outer')
        unseen = merged['rate_x'].isnull() & merged['rate_y'].notnull()
        candidates = merged.loc[unseen, ['goods', 'rate_y']]

        # Re-index by goods name so the neighbours line up in concat().
        weighted = pd.Series(candidates['rate_y'].tolist(), index=candidates['goods']) * weight
        neighbors_rate_with_weight.append(weighted)

    # One column per neighbour, one row per goods; sum each row (goods a
    # neighbour did not rate are NaN and skipped) and keep the n largest.
    return pd.concat(neighbors_rate_with_weight, axis=1).sum(axis=1, skipna=True).nlargest(n)
    
# Smoke tests: print weighted recommendations for Hailey with three
# different metric / neighbour-count combinations.
print('\n测试:为Hailey做推荐')
print(recommend2('Hailey', metric='manhattan', k=3, n=5))

print('\n测试:为Hailey做推荐')
print(recommend2('Hailey', metric='euclidean', k=3, n=5, r=2))

print('\n测试:为Hailey做推荐')
print(recommend2('Hailey', metric='pearson', k=1, n=5))
目录
相关文章
|
数据采集 机器学习/深度学习 数据挖掘
清洗数据的魔法:让你的数据干净又整洁
清洗数据的魔法:让你的数据干净又整洁
984 2
|
数据挖掘 索引 Python
Pandas中的排序技巧:让你的数据井然有序
Pandas中的排序技巧:让你的数据井然有序
918 1
|
人工智能 弹性计算 API
通义万相AI绘画创作体验评测
从使用者的角度解读通义万相AI绘画创作方案的优与劣
11783 12
|
机器学习/深度学习 数据采集 人工智能
AI技术实践:利用机器学习算法预测房价
人工智能(Artificial Intelligence, AI)已经深刻地影响了我们的生活,从智能助手到自动驾驶,AI的应用无处不在。然而,AI不仅仅是一个理论概念,它的实际应用和技术实现同样重要。本文将通过详细的技术实践,带领读者从理论走向实践,详细介绍AI项目的实现过程,包括数据准备、模型选择、训练和优化等环节。
1432 3
|
Java 开发工具 数据库
IntelliJ IDEA 面试题及答案整理,最新面试题
IntelliJ IDEA 面试题及答案整理,最新面试题
294 0
|
NoSQL Cloud Native 关系型数据库
一张图读懂阿里云数据库架构和配置选择
一张图读懂阿里云数据库架构和配置选择,阿里云数据库大全
1469 1
|
域名解析 网络协议 安全
游戏DDoS防护新思路--SDK版
探索一种新的防护模式,彻底摆脱针对tcp业务端口的cc和ddos攻击,文章阐述原理、技术实践。软件已经开源,希望跟大家一起交流沟通。
游戏DDoS防护新思路--SDK版
|
JSON 数据库 数据安全/隐私保护
Qt使用MD5加密
Qt中包含了大部分常用的功能,比如json、数据库、网络通信、串口通信以及今天说的这个MD5加密;
628 0
|
数据采集 人工智能 运维
New Relic 可观测平台调研
New Relic 可观测平台调研
1150 0
|
Linux 数据安全/隐私保护 Windows
操作系统实战-almalinux的安装过程
操作系统实战-almalinux的安装过程
3737 0
操作系统实战-almalinux的安装过程

热门文章

最新文章