For CTR prediction, the industry standard has long been logistic regression: model the user's feature profile, compute a click probability, and use it to judge whether the user is likely to click.
But logistic regression has an inherent weakness: it cannot separate non-linear data, because it is just ordinary linear regression passed through a sigmoid, so it can only handle linearly separable inputs. The data therefore has to be made linearly separable first, and if the feature crosses are engineered by hand, the cost is very high and it takes experienced specialists to find combinations that actually improve the model.
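A tiny illustration of that limitation (toy data, not from this post): XOR-labelled points are not linearly separable, so plain LR does no better than chance, while adding one hand-crafted cross feature x1*x2 makes them separable.

import numpy as np
from sklearn.linear_model import LogisticRegression

# XOR: no single line separates the two classes
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 0])
print(LogisticRegression().fit(X, y).score(X, y))  # 0.5, chance level

# a hand-crafted cross feature x1*x2 makes the data separable
X_cross = np.hstack([X, (X[:, 0] * X[:, 1]).reshape(-1, 1)])
# weak regularization (C=1e4) so the now-separable data is fit cleanly
print(LogisticRegression(C=1e4).fit(X_cross, y).score(X_cross, y))  # 1.0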
In 2014, Facebook published "Practical Lessons from Predicting Clicks on Ads at Facebook", which proposed letting GBDT generate effective feature combinations automatically.
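The core idea: drop each sample through every tree of a trained GBDT, treat the index of the leaf it lands in per tree as a categorical feature, one-hot encode those indices, and feed the result to LR. Each root-to-leaf path is effectively an automatically learned feature cross. A toy sketch of just the encoding step (the leaf indices here are made up):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

# pretend a 3-tree GBDT assigned these leaf indices to 3 samples:
# shape (n_samples, n_trees), each entry = leaf the sample fell into
leaf_idx = np.array([[0, 2, 1],
                     [3, 2, 0],
                     [0, 1, 1]])
# one-hot per tree: each tree contributes one block of binary columns
lr_input = OneHotEncoder().fit_transform(leaf_idx)
print(lr_input.toarray())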
1. Imports
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import log_loss
import lightgbm as lgb
import gc
from scipy import sparse
import warnings
warnings.filterwarnings('ignore')
2. Preparing the data
path = 'data/'
df_train = pd.read_csv(path + 'kaggle_train.csv')
df_test = pd.read_csv(path + 'kaggle_test.csv')
# merge train and test so they get encoded consistently;
# test rows are flagged with Label = -1
df_train.drop(['Id'], axis=1, inplace=True)
df_test.drop(['Id'], axis=1, inplace=True)
df_test['Label'] = -1
data = pd.concat([df_train, df_test], axis=0)
data.fillna(-1, inplace=True)
# separate the continuous and the categorical features
continuous_feature = ['I' + str(i + 1) for i in range(13)]
category_feature = ['C' + str(i + 1) for i in range(26)]
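The C1-C26 columns of the Criteo data can have very high cardinality, so full one-hot encoding blows up the feature count. A hedged alternative, not applied in the functions below (and presumably why LabelEncoder is imported above): map each column to integer codes, which a tree model such as LightGBM can consume directly.

# optional: integer codes instead of one-hot for high-cardinality columns
for col in category_feature:
    data[col] = LabelEncoder().fit_transform(data[col].astype(str))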
3. Building the LR model
def LR_model(data, continuous_feature, category_feature):
    # scale the continuous features to [0, 1]
    scaler = MinMaxScaler()
    for col in continuous_feature:
        data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))
    # one-hot encode the categorical features
    for col in category_feature:
        onehot_features = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_features], axis=1)
    # split the merged frame back into train and test
    # (.copy() avoids pandas' SettingWithCopyWarning on the later pop/drop)
    train_data = data[data['Label'] != -1].copy()
    target = train_data.pop('Label')
    test_data = data[data['Label'] == -1].copy()
    test_data.drop(['Label'], axis=1, inplace=True)
    # hold out a validation set
    x_train, x_val, y_train, y_val = train_test_split(train_data, target, test_size=0.3, random_state=2021)
    # fit the model and report log loss on both splits
    LR = LogisticRegression()
    LR.fit(x_train, y_train)
    train_logloss = log_loss(y_train, LR.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, LR.predict_proba(x_val)[:, 1])
    print('train_logloss:', train_logloss)
    print('val_logloss:', val_logloss)
    # predict on the test set
    y_pred = LR.predict_proba(test_data)[:, 1]
    return y_pred
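One-hot encoding all 26 categorical columns can produce a frame too wide to hold densely in memory. A sparse variant is possible; a sketch, assuming it replaces the get_dummies loop above (this would also put the otherwise-unused scipy import to work):

# build the LR input as a scipy sparse matrix instead of a dense frame
ohe = OneHotEncoder(handle_unknown='ignore')
X_cat = ohe.fit_transform(train_data[category_feature].astype(str))
X_num = sparse.csr_matrix(train_data[continuous_feature].values)
X = sparse.hstack([X_num, X_cat]).tocsr()  # LogisticRegression accepts sparse input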
4. Building the GBDT model
def GBDT_model(data, continuous_feature, category_feature):
    # one-hot encode the categorical features
    for col in category_feature:
        onehot_feature = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feature], axis=1)
    # split the merged frame back into train and test
    train_data = data[data['Label'] != -1].copy()
    target = train_data.pop('Label')
    test_data = data[data['Label'] == -1].copy()
    test_data.drop(['Label'], axis=1, inplace=True)
    # hold out a validation set for early stopping
    x_train, x_val, y_train, y_val = train_test_split(train_data, target, test_size=0.3, random_state=2021)
    # fit the model
    GBM = lgb.LGBMClassifier(boosting_type='gbdt',
                             objective='binary',
                             subsample=0.8,
                             min_child_weight=0.5,
                             colsample_bytree=0.7,
                             num_leaves=100,
                             max_depth=12,
                             learning_rate=0.01,
                             n_estimators=100
                             )
    # the silent and early_stopping_rounds kwargs were removed in
    # LightGBM >= 4; early stopping is now passed as a callback
    GBM.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            eval_names=['train', 'val'],
            eval_metric='binary_logloss',
            callbacks=[lgb.early_stopping(100)]
            )
    train_logloss = log_loss(y_train, GBM.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, GBM.predict_proba(x_val)[:, 1])
    print('train_logloss:', train_logloss)
    print('val_logloss:', val_logloss)
    # predict on the test set
    y_pred = GBM.predict_proba(test_data)[:, 1]
    return y_pred
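With early stopping enabled it can be worth checking where training actually stopped and which inputs the trees relied on. A small addition, assuming it is placed right after GBM.fit inside GBDT_model:

# best scoring iteration found during fitting
print('best iteration:', GBM.best_iteration_)
# top features by split count
importances = pd.Series(GBM.feature_importances_, index=x_train.columns)
print(importances.sort_values(ascending=False).head(10))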
5. Building the GBDT+LR fusion model
def GBDT_LR_model(data, continuous_feature, category_feature):
    # one-hot encode the categorical features
    for col in category_feature:
        onehot_feature = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feature], axis=1)
    # split the merged frame back into train and test
    train_data = data[data['Label'] != -1].copy()
    target = train_data.pop('Label')
    test_data = data[data['Label'] == -1].copy()
    test_data.drop(['Label'], axis=1, inplace=True)
    # hold out a validation set for early stopping
    x_train, x_val, y_train, y_val = train_test_split(train_data, target, test_size=0.2, random_state=2021)
    # fit the GBDT stage
    GBM = lgb.LGBMClassifier(boosting_type='gbdt',
                             objective='binary',
                             subsample=0.8,
                             min_child_weight=0.5,
                             colsample_bytree=0.7,
                             num_leaves=100,
                             max_depth=12,
                             learning_rate=0.01,
                             n_estimators=100
                             )
    GBM.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            eval_names=['train', 'val'],
            eval_metric='binary_logloss',
            callbacks=[lgb.early_stopping(100)]
            )
    model = GBM.booster_
    # leaf index of every sample in every tree: shape (n_samples, n_trees)
    gbdt_feats_train = model.predict(train_data, pred_leaf=True)
    gbdt_feats_test = model.predict(test_data, pred_leaf=True)
    gbdt_feats_name = ['gbdt_leaf_' + str(i) for i in range(gbdt_feats_train.shape[1])]
    df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train, columns=gbdt_feats_name)
    df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns=gbdt_feats_name)
    # reset the index so the leaf features align with the original rows
    train = pd.concat([train_data.reset_index(drop=True), df_train_gbdt_feats], axis=1)
    test = pd.concat([test_data.reset_index(drop=True), df_test_gbdt_feats], axis=1)
    train_len = train.shape[0]
    data = pd.concat([train, test])
    del train
    del test
    gc.collect()
    # scale the continuous features to [0, 1]
    scaler = MinMaxScaler()
    for col in continuous_feature:
        data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))
    # one-hot encode the leaf-index features
    for col in gbdt_feats_name:
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)
    train = data.iloc[:train_len]
    test = data.iloc[train_len:]
    del data
    gc.collect()
    # re-split for the LR stage
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.3, random_state=2021)
    # fit the LR stage on raw + leaf features
    LR = LogisticRegression()
    LR.fit(x_train, y_train)
    train_logloss = log_loss(y_train, LR.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, LR.predict_proba(x_val)[:, 1])
    print('train-logloss:', train_logloss)
    print('val-logloss:', val_logloss)
    # predict on the test set
    y_pred = LR.predict_proba(test)[:, 1]
    return y_pred
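Note what pred_leaf=True returns: a (n_samples, n_trees) matrix of integer leaf indices, one column per tree. The function above re-merges train and test only so that get_dummies produces the same columns for both; sklearn's OneHotEncoder can avoid that round trip. A sketch, reusing the variable names from GBDT_LR_model:

# fit the encoder on the train leaves, reuse it on the test leaves;
# handle_unknown='ignore' zeroes out leaves never seen during fit
leaf_enc = OneHotEncoder(handle_unknown='ignore')
train_leaves = leaf_enc.fit_transform(gbdt_feats_train)
test_leaves = leaf_enc.transform(gbdt_feats_test)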
6. Evaluating the results
# train and evaluate the plain LR model
LR_model(data.copy(), continuous_feature, category_feature)
# train and evaluate the plain GBDT model
GBDT_model(data.copy(), continuous_feature, category_feature)
# train and evaluate the fused GBDT+LR model
GBDT_LR_model(data.copy(), continuous_feature, category_feature)
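Each function returns the test-set click probabilities, so a run can end by persisting the fused model's predictions. A sketch; the 'Id' and 'Predicted' column names here are hypothetical, since the real Id column was dropped during preprocessing:

y_pred = GBDT_LR_model(data.copy(), continuous_feature, category_feature)
submission = pd.DataFrame({'Id': range(len(y_pred)), 'Predicted': y_pred})
submission.to_csv('submission.csv', index=False)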