2021人工智能领域新星创作者,带你从入门到精通,该博客每天更新,逐渐完善推荐系统各个知识体系的文章,帮助大家更高效学习。
一、FM回归任务
1. 导库
import numpy as np from sklearn.feature_extraction import DictVectorizer from pyfm import pylibfm
DictVectorizer:它可以将字典形式(特征名 → 特征值)的数据转换成数组(矩阵)格式。这里之所以用字典存储数据,是因为推荐系统中的特征矩阵通常非常稀疏,字典只需记录实际出现的特征,不必存储大量无用的 0,因此更加高效。
它转成数组的原理和 One-Hot 编码类似:对于数值型特征,保留原值;对于非数值型(字符串)特征,则进行 One-Hot 编码,最终形成一个稀疏矩阵,其中每一列对应某个特征的一个取值(样本取到该值时为 1,否则为 0)。
2. 加载数据
"""加载数据""" def loadData(filename, path="data/ml-100k/"): data = [] y= [] users = set() items = set() with open(path + filename) as f: for line in f: (user, movie, rating, ts) = line.split('\t') data.append({"user_id":str(user), "movie_id":str(movie)}) y.append(float(rating)) users.add(user) items.add(movie) return (data, np.array(y), users, items)
3. 获取数据
"""获取数据""" (train_data, y_train, train_users, train_items) = loadData("ua.base") (test_data, y_test, test_users, test_items) = loadData("ua.test")
4. 定义编码器
# Encoder that turns the feature dicts into a numeric matrix.
vec = DictVectorizer()

# Learn the feature -> column mapping from the training data only.
X_train = vec.fit_transform(train_data)

# Reuse the already-fitted mapping for the test data. Calling fit_transform()
# here would rebuild the mapping from the test set, so the test columns would
# no longer line up with the columns the model is trained on.
X_test = vec.transform(test_data)
5. 构建模型
# Configure the factorization machine for the regression task, then train it
# on the encoded training data.
fm = pylibfm.FM(
    num_factors=10,  # dimensionality of the interaction features
    num_iter=10,  # number of iterations
    verbose=True,  # whether to print logs
    task="regression",  # task mode
    initial_learning_rate=0.001,  # learning rate
    learning_rate_schedule="optimal",
)
fm.fit(X_train, y_train)
6. 衡量误差
from sklearn.metrics import mean_squared_error

# Predict on the test set and report the mean squared error.
pred = fm.predict(X_test)
print("FM MSE: %.4f" % mean_squared_error(y_test, pred))
二、FM分类任务
1. 导库
import numpy as np from sklearn.feature_extraction import DictVectorizer from sklearn.model_selection import train_test_split from sklearn.datasets import make_classification from pyfm import pylibfm
2. 构造数据
"""加载数据""" X, y = make_classification(n_samples=1000,n_features=100, n_clusters_per_class=1) """将数据转成字典类型""" data = [ {v: k for k, v in dict(zip(i, range(len(i)))).items()} for i in X]
3. 划分数据集
"""划分数据集""" X_train, X_test, y_train, y_test = train_test_split(data, y, test_size = 0.1, random_state = 2021)
4. 定义编码器
# Encoder that turns the feature dicts into a numeric matrix.
vec = DictVectorizer()

# Learn the feature -> column mapping from the training data only.
X_train = vec.fit_transform(X_train)

# Encode the test data with the mapping learned above. Refitting on the test
# set (fit_transform) would derive a separate mapping from the test data, so
# the columns are not guaranteed to match the ones the model is trained on.
X_test = vec.transform(X_test)
5. 构建模型
"""构建模型""" fm = pylibfm.FM(num_factors = 2, num_iter = 10, verbose = True, task = "classification", initial_learning_rate = 0.0001, learning_rate_schedule = "optimal") fm.fit(X_train, y_train)
6. 衡量误差
"""衡量误差""" from sklearn.metrics import log_loss print("Validation log loss: %.4f" % log_loss(y_test,fm.predict(X_test)))