In CTR (click-through rate) prediction, the industry has generally used logistic regression: it models the user's feature profile and computes a click probability to estimate whether the user will click.
Logistic regression, however, has an inherent flaw: it cannot separate nonlinear data. The reason is that logistic regression is just ordinary linear regression with a Sigmoid function on top, so it can only handle linearly separable data. That means we need linearly separable inputs, and if we build combination features by hand to get them, the cost is very high, and it takes experienced specialists to find the feature combinations that actually improve the model.
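For reference, the standard formulation of logistic regression (a general fact, not specific to this post) is a linear score passed through the Sigmoid:

$$
P(y=1 \mid x) = \sigma(w^\top x + b), \qquad \sigma(z) = \frac{1}{1 + e^{-z}}
$$

Because the decision boundary $w^\top x + b = 0$ is linear in $x$, any nonlinearity must come from the features themselves, for example from cross terms such as $x_i x_j$.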
In 2014 Facebook published the paper *Practical Lessons from Predicting Clicks on Ads at Facebook*, which proposed using GBDT to generate effective feature combinations automatically.
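The core idea: each sample ends up in exactly one leaf of every tree, and the indices of those leaves act as automatically learned cross features for a downstream LR. A minimal sketch of the encoding, with all numbers invented purely for illustration:

```python
# Toy illustration: a GBDT with 2 trees, 3 leaves each.
# Suppose a sample falls into leaf 1 (0-based) of tree 0
# and leaf 0 of tree 1.
leaf_indices = [1, 0]   # one leaf index per tree
n_leaves = 3            # leaves per tree

# One-hot encode each tree's leaf index and concatenate:
onehot = []
for idx in leaf_indices:
    vec = [0] * n_leaves
    vec[idx] = 1
    onehot += vec

print(onehot)           # [0, 1, 0, 1, 0, 0] -> input features for LR
```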
1. Imports
```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss
import lightgbm as lgb
import gc
import warnings

warnings.filterwarnings('ignore')
```
2. Data Preparation
```python
path = 'data/'
df_train = pd.read_csv(path + 'kaggle_train.csv')
df_test = pd.read_csv(path + 'kaggle_test.csv')

# Drop the Id column, then merge train and test so that
# preprocessing is applied to both consistently
df_train.drop(['Id'], axis=1, inplace=True)
df_test.drop(['Id'], axis=1, inplace=True)

# Mark test rows with Label = -1 so they can be split out later
df_test['Label'] = -1

data = pd.concat([df_train, df_test], axis=0)
data.fillna(-1, inplace=True)
```
```python
# Separate the continuous features (I1-I13) from the categorical features (C1-C26)
continuous_feature = ['I' + str(i + 1) for i in range(13)]
category_feature = ['C' + str(i + 1) for i in range(26)]
```
3. Build the LR Model
```python
def LR_model(data, continuous_feature, category_feature):
    # Scale the continuous features to [0, 1]
    scaler = MinMaxScaler()
    for col in continuous_feature:
        data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))

    # One-hot encode the categorical features
    for col in category_feature:
        onehot_features = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_features], axis=1)

    # Split the merged data back into train and test sets
    train_data = data[data['Label'] != -1]
    target = train_data.pop('Label')
    test_data = data[data['Label'] == -1]
    test_data.drop(['Label'], axis=1, inplace=True)

    # Hold out a validation set
    x_train, x_val, y_train, y_val = train_test_split(train_data, target,
                                                      test_size=0.3,
                                                      random_state=2021)

    # Fit the model
    LR = LogisticRegression()
    LR.fit(x_train, y_train)
    train_logloss = log_loss(y_train, LR.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, LR.predict_proba(x_val)[:, 1])
    print('train_logloss: ', train_logloss)
    print('val_logloss: ', val_logloss)

    # Predict on the test set
    y_pred = LR.predict_proba(test_data)[:, 1]
    return y_pred
```
4. Build the GBDT Model
```python
def GBDT_model(data, continuous_feature, category_feature):
    # One-hot encode the categorical features
    for col in category_feature:
        onehot_feature = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feature], axis=1)

    # Split the merged data back into train and test sets
    train_data = data[data['Label'] != -1]
    target = train_data.pop('Label')
    test_data = data[data['Label'] == -1]
    test_data.drop(['Label'], axis=1, inplace=True)

    # Hold out a validation set
    x_train, x_val, y_train, y_val = train_test_split(train_data, target,
                                                      test_size=0.3,
                                                      random_state=2021)

    # Fit the GBDT with early stopping on the validation set
    GBM = lgb.LGBMClassifier(boosting_type='gbdt',
                             objective='binary',
                             subsample=0.8,
                             min_child_weight=0.5,
                             colsample_bytree=0.7,
                             num_leaves=100,
                             max_depth=12,
                             learning_rate=0.01,
                             n_estimators=100,
                             )
    GBM.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            eval_names=['train', 'val'],
            eval_metric='binary_logloss',
            # early stopping via callback (the early_stopping_rounds
            # fit argument is deprecated in recent LightGBM versions)
            callbacks=[lgb.early_stopping(stopping_rounds=100)],
            )
    train_logloss = log_loss(y_train, GBM.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, GBM.predict_proba(x_val)[:, 1])
    print('train_logloss: ', train_logloss)
    print('val_logloss: ', val_logloss)

    # Predict on the test set
    y_pred = GBM.predict_proba(test_data)[:, 1]
    return y_pred
```
5. Build the Fused GBDT+LR Model
```python
def GBDT_LR_model(data, continuous_feature, category_feature):
    # One-hot encode the categorical features
    for col in category_feature:
        onehot_feature = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feature], axis=1)

    # Split the merged data back into train and test sets
    train_data = data[data['Label'] != -1]
    target = train_data.pop('Label')
    test_data = data[data['Label'] == -1]
    test_data.drop(['Label'], axis=1, inplace=True)

    # Hold out a validation set for the GBDT stage
    x_train, x_val, y_train, y_val = train_test_split(train_data, target,
                                                      test_size=0.2,
                                                      random_state=2021)

    # Fit the GBDT with early stopping on the validation set
    GBM = lgb.LGBMClassifier(boosting_type='gbdt',
                             objective='binary',
                             subsample=0.8,
                             min_child_weight=0.5,
                             colsample_bytree=0.7,
                             num_leaves=100,
                             max_depth=12,
                             learning_rate=0.01,
                             n_estimators=100,
                             )
    GBM.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            eval_names=['train', 'val'],
            eval_metric='binary_logloss',
            callbacks=[lgb.early_stopping(stopping_rounds=100)],
            )

    # For every sample, record the index of the leaf it falls into
    # in each tree; these leaf indices become new features
    model = GBM.booster_
    gbdt_feats_train = model.predict(train_data, pred_leaf=True)
    gbdt_feats_test = model.predict(test_data, pred_leaf=True)
    gbdt_feats_name = ['gbdt_leaf_' + str(i) for i in range(gbdt_feats_train.shape[1])]
    df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train, columns=gbdt_feats_name)
    df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns=gbdt_feats_name)

    # Append the leaf features to the original features, then merge
    # train and test again so they are encoded consistently
    train = pd.concat([train_data, df_train_gbdt_feats], axis=1)
    test = pd.concat([test_data, df_test_gbdt_feats], axis=1)
    train_len = train.shape[0]
    data = pd.concat([train, test])
    del train
    del test
    gc.collect()

    # Scale the continuous features to [0, 1]
    scaler = MinMaxScaler()
    for col in continuous_feature:
        data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))

    # One-hot encode the leaf-index features
    for col in gbdt_feats_name:
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)

    train = data[:train_len]
    test = data[train_len:]
    del data
    gc.collect()

    # Re-split for the LR stage
    x_train, x_val, y_train, y_val = train_test_split(train, target,
                                                      test_size=0.3,
                                                      random_state=2021)

    # Fit the LR on the combined original + leaf features
    LR = LogisticRegression()
    LR.fit(x_train, y_train)
    train_logloss = log_loss(y_train, LR.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, LR.predict_proba(x_val)[:, 1])
    print('train-logloss: ', train_logloss)
    print('val-logloss: ', val_logloss)

    # Predict on the test set
    y_pred = LR.predict_proba(test)[:, 1]
    return y_pred
```
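To make the leaf-feature step above concrete, here is a minimal self-contained sketch that runs the same `pred_leaf=True` + one-hot transformation on synthetic data; the dataset, model size, and column names are invented purely for illustration:

```python
import numpy as np
import pandas as pd
import lightgbm as lgb

# Synthetic binary-classification data (purely illustrative)
rng = np.random.RandomState(0)
X = pd.DataFrame(rng.rand(200, 4), columns=['f0', 'f1', 'f2', 'f3'])
y = (X['f0'] + X['f1'] > 1).astype(int)

# A tiny GBDT: 3 trees, at most 4 leaves each
gbm = lgb.LGBMClassifier(n_estimators=3, num_leaves=4, min_child_samples=5)
gbm.fit(X, y)

# pred_leaf=True returns, for every sample, the index of the leaf
# it reaches in each tree: shape (n_samples, n_trees)
leaves = gbm.booster_.predict(X, pred_leaf=True)
print(leaves.shape)       # (200, 3)

# One-hot encode the leaf indices; the result is the binary
# feature matrix that the downstream LR is trained on
leaf_df = pd.DataFrame(leaves, columns=[f'tree_{i}' for i in range(leaves.shape[1])])
leaf_onehot = pd.get_dummies(leaf_df, columns=leaf_df.columns)
print(leaf_onehot.shape)  # (200, total number of distinct leaves)
```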
6. Evaluation
```python
# Train and evaluate the LR model
LR_model(data.copy(), continuous_feature, category_feature)

# Train and evaluate the GBDT model
GBDT_model(data.copy(), continuous_feature, category_feature)

# Train and evaluate the fused GBDT+LR model
GBDT_LR_model(data.copy(), continuous_feature, category_feature)
```