实现结果:
1、找到和用户A相似的N个用户
2、找到和项目A相似的N个项目
数据下载(1 MB):https://grouplens.org/datasets/movielens/
数据格式:data/ratings.csv
userId,movieId,rating,timestamp
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931
1,70,3.0,964982400
1,101,5.0,964980868
...
简单查看数据数量
# -*- coding: utf-8 -*-
import pandas as pd

# Load the ratings table from the CSV file.
file_path = 'data/ratings.csv'
data_frame = pd.read_csv(file_path)

# Print the number of groups per key column: first users, then movies.
for key_column in ("userId", "movieId"):
    grouped = data_frame.groupby(key_column)
    print(len(grouped))

# 610 9724
查找相似用户和相似项目
# -*- coding: utf-8 -*-
from surprise import KNNBasic, Dataset, Reader

# Read the ratings: comma-separated "user item rating timestamp" rows,
# skipping the first (header) line.
file_path = 'data/ratings.csv'
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file(file_path, reader=reader)
train_set = data.build_full_trainset()

# Train the model: user-based similarity.
user_knn_basic = KNNBasic(k=10, min_k=3, sim_options={'user_based': True})
user_knn_basic.fit(train_set)


def get_similar_users(top_k, user_id):
    """Find the users most similar to the given user.

    The raw id is converted to the trainset's inner id, the fitted
    user-based model is asked for its nearest neighbours, and each
    neighbour's inner id is lazily converted back to a raw id.

    Args:
        top_k(int): number of similar users to return
        user_id(str): raw user id

    Returns:
        generator yielding the raw ids of the neighbouring users
    """
    neighbor_inner_ids = user_knn_basic.get_neighbors(
        user_knn_basic.trainset.to_inner_uid(user_id),
        k=top_k,
    )
    return (
        user_knn_basic.trainset.to_raw_uid(neighbor)
        for neighbor in neighbor_inner_ids
    )


print(list(get_similar_users(5, '610')))
# ['429', '508', '545', '53', '463']

# Train the model: item-based similarity.
# Other similarity measures can be chosen via sim_options, e.g.
# sim_options={'name': 'cosine','user_based': True}
# with name being one of cosine/msd/pearson/pearson_baseline.
item_algo = KNNBasic(k=10, min_k=3, sim_options={'user_based': False})
item_algo.fit(train_set)


def get_similar_items(top_k, item_id):
    """Find the items most similar to the given item.

    Mirrors ``get_similar_users`` but uses the item-based model and the
    item id conversions of its trainset.

    Args:
        top_k(int): number of similar items to return
        item_id(str): raw item id

    Returns:
        generator yielding the raw ids of the neighbouring items
    """
    neighbor_inner_ids = item_algo.get_neighbors(
        item_algo.trainset.to_inner_iid(item_id),
        k=top_k,
    )
    return (
        item_algo.trainset.to_raw_iid(neighbor)
        for neighbor in neighbor_inner_ids
    )


print(list(get_similar_items(5, '761')))
# ['423', '1009', '1804', '2099', '2899']
验证结果:用观影集合的交并比(Jaccard 相似度)计算找到的相似用户与目标用户的相似度
# -*- coding: utf-8 -*-
import pandas as pd

file_path = 'data/ratings.csv'
data_frame = pd.read_csv(file_path)


def get_user_movie(user_id):
    """Return the movie ids rated by one user.

    Args:
        user_id(int): value compared against the ``userId`` column

    Returns:
        list: the ``movieId`` values of every row whose ``userId`` matches
    """
    return data_frame[data_frame.userId == user_id].movieId.tolist()


# Movie set of the reference user. It does not change between iterations,
# so it is built once, before the loop.
user_movies = get_user_movie(610)
set_user_movies = set(user_movies)

similar_user_ids = [429, 508, 545, 53, 463]
for similar_user_id in similar_user_ids:
    # Movie set of the candidate neighbour; this is the set that must be
    # rebuilt on every iteration.
    movies = get_user_movie(similar_user_id)
    set_movies = set(movies)

    # Jaccard similarity: |A & B| / |A | B|.
    union_movies = set_user_movies | set_movies
    if union_movies:
        similar_score = len(set_user_movies & set_movies) / len(union_movies)
    else:
        # Neither user has any rating; avoid dividing by zero.
        similar_score = 0.0
    print("{:.2f}".format(similar_score), end="\t")

# Output recorded in the original write-up:
# 0.00 0.00 0.00 0.00 0.01
相似度很低…