下图为Wide&Deep的模型结构图。该模型结合了线性模型的记忆能力(Memorization)和深度神经网络捕捉深层特征的泛化能力(Generalization):将特征分为两个输入源,分别输入Wide部分和Deep部分,最终将两部分的logits相加融合,再经sigmoid激活得到最终输出。
一、导库
import tensorflow as tf from tensorflow.keras.layers import * from tensorflow.keras.models import * from tensorflow.keras.utils import plot_model from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler, LabelEncoder import pandas as pd import numpy as np import warnings warnings.filterwarnings('ignore') from utils import SparseFeat, DenseFeat
二、数据处理
def data_process(data, dense_features, sparse_features):
    """Clean and encode the raw feature columns of ``data``.

    Dense (numeric) columns have missing values replaced by 0.0 and are then
    log-smoothed; sparse (categorical) columns have missing values replaced
    by the string "-1" and are label-encoded to integer codes.

    NOTE: ``data`` is modified in place; the returned frame is the selection
    of the processed columns.

    Args:
        data: DataFrame holding every column named in the two lists.
        dense_features: list of numeric column names.
        sparse_features: list of categorical column names.

    Returns:
        ``data[dense_features + sparse_features]`` after processing.
    """

    def _smooth(value):
        # log(x + 1) is only defined for x > -1; everything else maps to -1.
        if value > -1:
            return np.log(value + 1)
        return -1

    # Numeric columns: fill gaps first, then smooth value by value.
    data[dense_features] = data[dense_features].fillna(0.0)
    for column in dense_features:
        data[column] = data[column].apply(_smooth)

    # Categorical columns: fill gaps with a sentinel string, then map each
    # distinct value to an integer code.
    data[sparse_features] = data[sparse_features].fillna("-1")
    for column in sparse_features:
        data[column] = LabelEncoder().fit_transform(data[column])

    selected_columns = dense_features + sparse_features
    return data[selected_columns]
三、搭建Wide&Deep模型
3.1 构建输入层
def build_input_layers(feature_columns):
    """Create one Keras ``Input`` per distinct feature, grouped by kind.

    Args:
        feature_columns: iterable of ``DenseFeat`` / ``SparseFeat``
            descriptors. Items of any other type are ignored. The same
            feature may appear more than once (e.g. when the wide and deep
            feature lists are concatenated); only one ``Input`` is created
            per feature name.

    Returns:
        A ``(dense_input_dict, sparse_input_dict)`` pair, each mapping a
        feature name to its ``Input`` tensor.
    """
    dense_input_dict, sparse_input_dict = {}, {}
    for f in feature_columns:
        if isinstance(f, DenseFeat):
            # Skip repeats so no orphaned, same-named Input layer is created.
            if f.name not in dense_input_dict:
                # ``shape`` must be a tuple: ``(n)`` is just the int ``n``.
                dense_input_dict[f.name] = Input(shape=(f.dimension,), name=f.name)
        elif isinstance(f, SparseFeat):
            if f.name not in sparse_input_dict:
                # A sparse feature is fed as a single category index.
                sparse_input_dict[f.name] = Input(shape=(1,), name=f.name)
    return dense_input_dict, sparse_input_dict
3.2 Embedding层
def build_embedding_layers(sparse_input_dict, wide_features, is_linear=False):
    """Build one ``Embedding`` layer per sparse feature.

    Args:
        sparse_input_dict: accepted for signature symmetry with the other
            builders; not read by this function.
        wide_features: feature descriptors; only ``SparseFeat`` items are
            used. ``None`` or an empty list yields an empty dict.
        is_linear: when True every embedding has output dimension 1 (a
            per-category scalar weight) and is named ``'1d_emb_<name>'``;
            otherwise the feature's own ``embedding_dim`` is used and the
            layer is named ``'kd_emb_<name>'``.

    Returns:
        Dict mapping feature name to its ``Embedding`` layer.
    """
    if wide_features:
        sparse_columns = [feat for feat in wide_features if isinstance(feat, SparseFeat)]
    else:
        sparse_columns = []

    name_prefix = '1d_emb_' if is_linear else 'kd_emb_'

    embedding_layers_dict = {}
    for feat in sparse_columns:
        output_dim = 1 if is_linear else feat.embedding_dim
        embedding_layers_dict[feat.name] = Embedding(
            feat.vocabulary_size + 1,
            output_dim,
            name=name_prefix + feat.name,
        )
    return embedding_layers_dict
3.3 Wide部分logits
def get_wide_logits(dense_input_dict, sparse_input_dict, wide_features):
    """Build the linear (wide) branch and return its logit tensor.

    The wide logit is the sum of:
      * a single ``Dense(1)`` applied to the concatenated dense inputs, and
      * one scalar weight per sparse category, implemented as a 1-d
        embedding lookup per sparse feature and summed across features.

    Only the features listed in ``wide_features`` take part: dense inputs
    are selected by name from ``dense_input_dict`` (which may also contain
    inputs that belong to other branches), exactly as the sparse inputs are.

    Args:
        dense_input_dict: mapping of dense feature name -> input tensor.
        sparse_input_dict: mapping of sparse feature name -> input tensor.
        wide_features: ``DenseFeat`` / ``SparseFeat`` descriptors of the
            wide branch.

    Returns:
        The wide branch logit tensor.

    Raises:
        ValueError: if ``wide_features`` holds no dense or sparse feature.
        KeyError: if a listed feature has no entry in the input dicts.
    """
    wide_features = wide_features or []
    # dict.fromkeys de-duplicates while keeping the declared order.
    dense_names = list(dict.fromkeys(
        f.name for f in wide_features if isinstance(f, DenseFeat)))
    sparse_feature_columns = [f for f in wide_features if isinstance(f, SparseFeat)]
    if not dense_names and not sparse_feature_columns:
        raise ValueError(
            "wide_features must contain at least one DenseFeat or SparseFeat")

    logit_terms = []

    if dense_names:
        dense_inputs = [dense_input_dict[name] for name in dense_names]
        # Merge layers are only needed when there is something to merge.
        if len(dense_inputs) == 1:
            concat_dense_inputs = dense_inputs[0]
        else:
            concat_dense_inputs = Concatenate(axis=1)(dense_inputs)
        logit_terms.append(Dense(1)(concat_dense_inputs))

    if sparse_feature_columns:
        # A 1-d embedding of a category index equals a linear weight on its
        # one-hot encoding, so no explicit one-hot/Dense is required.
        sparse_embedding_layers = build_embedding_layers(
            sparse_input_dict, wide_features, is_linear=True)
        sparse_terms = []
        for f in sparse_feature_columns:
            embedded = sparse_embedding_layers[f.name](sparse_input_dict[f.name])
            # Flatten drops the length-1 sequence axis of the lookup result.
            sparse_terms.append(Flatten()(embedded))
        if len(sparse_terms) == 1:
            logit_terms.append(sparse_terms[0])
        else:
            logit_terms.append(Add()(sparse_terms))

    if len(logit_terms) == 1:
        return logit_terms[0]
    return Add()(logit_terms)
3.4 Deep部分logits
def get_deep_logits(dense_input_dict, sparse_input_dict, deep_features,
                    hidden_units=(1024, 512, 256),
                    dropout_rates=(0.5, 0.3, 0.1)):
    """Build the MLP (deep) branch and return its logit tensor.

    Dense inputs and the flattened ``embedding_dim``-wide embeddings of the
    sparse inputs are concatenated (dense first, then sparse, in declared
    order) and passed through a stack of ReLU ``Dense`` layers, each
    followed by ``Dropout``, ending in a ``Dense(1)`` logit.

    Only the features listed in ``deep_features`` take part: dense inputs
    are selected by name from ``dense_input_dict`` (which may also contain
    inputs that belong to other branches), exactly as the sparse inputs are.

    Args:
        dense_input_dict: mapping of dense feature name -> input tensor.
        sparse_input_dict: mapping of sparse feature name -> input tensor.
        deep_features: ``DenseFeat`` / ``SparseFeat`` descriptors of the
            deep branch.
        hidden_units: width of each hidden layer, in order.
        dropout_rates: dropout rate applied after each hidden layer; must
            have the same length as ``hidden_units``.

    Returns:
        The deep branch logit tensor.

    Raises:
        ValueError: if ``deep_features`` holds no dense or sparse feature,
            or if ``hidden_units`` and ``dropout_rates`` differ in length.
        KeyError: if a listed feature has no entry in the input dicts.
    """
    deep_features = deep_features or []
    # dict.fromkeys de-duplicates while keeping the declared order.
    dense_names = list(dict.fromkeys(
        f.name for f in deep_features if isinstance(f, DenseFeat)))
    sparse_feature_columns = [f for f in deep_features if isinstance(f, SparseFeat)]
    if not dense_names and not sparse_feature_columns:
        raise ValueError(
            "deep_features must contain at least one DenseFeat or SparseFeat")
    if len(hidden_units) != len(dropout_rates):
        raise ValueError(
            "hidden_units and dropout_rates must have the same length")

    # 1. Dense inputs enter the MLP as they are.
    deep_parts = [dense_input_dict[name] for name in dense_names]

    # 2. Sparse inputs are embedded with each feature's own embedding_dim.
    if sparse_feature_columns:
        sparse_embedding_layers = build_embedding_layers(
            sparse_input_dict, deep_features, is_linear=False)
        for f in sparse_feature_columns:
            embedded = sparse_embedding_layers[f.name](sparse_input_dict[f.name])
            # Flatten drops the length-1 sequence axis of the lookup result.
            deep_parts.append(Flatten()(embedded))

    # 3. One flat concatenation; skipped when there is a single tensor.
    if len(deep_parts) == 1:
        deep_out = deep_parts[0]
    else:
        deep_out = Concatenate(axis=1)(deep_parts)

    # 4. Hidden stack: Dense(relu) followed by Dropout for every layer.
    for units, rate in zip(hidden_units, dropout_rates):
        deep_out = Dense(units, activation='relu')(deep_out)
        deep_out = Dropout(rate)(deep_out)

    # 5. Output logit (no activation; the caller applies the sigmoid).
    return Dense(1)(deep_out)
3.5 Wide&Deep
def WideDeep(wide_features, deep_features):
    """Assemble the Wide&Deep model.

    One set of input layers is built for the union of both feature lists
    and shared by the two branches. The branch logits are added and passed
    through a sigmoid.

    Args:
        wide_features: feature descriptors for the wide (linear) branch.
        deep_features: feature descriptors for the deep (MLP) branch.

    Returns:
        An uncompiled Keras ``Model`` whose inputs are all dense inputs
        followed by all sparse inputs, and whose output is the sigmoid of
        the summed branch logits.
    """
    # Shared, name-keyed inputs for both branches.
    dense_inputs, sparse_inputs = build_input_layers(wide_features + deep_features)

    # Branch logits.
    wide_part = get_wide_logits(dense_inputs, sparse_inputs, wide_features)
    deep_part = get_deep_logits(dense_inputs, sparse_inputs, deep_features)

    # Plain (unweighted) sum of the two logits, then sigmoid.
    summed_logits = Add()([wide_part, deep_part])
    prediction = Activation('sigmoid')(summed_logits)

    model_inputs = [*dense_inputs.values(), *sparse_inputs.values()]
    return Model(model_inputs, prediction)
四、运行模型
4.1 准备操作
# 1. Load the sample data set.
data = pd.read_csv('./data/criteo_sample.txt')

# 2. Split the columns into dense and sparse features by name:
#    a column counts as dense when its name contains 'I' and as sparse when
#    it contains 'C'.
columns = data.columns.values
dense_features = list(filter(lambda col: 'I' in col, columns))
sparse_features = list(filter(lambda col: 'C' in col, columns))

# 3. Describe the features for each branch. Both branches use the same
#    features here. vocabulary_size is taken from the raw column, i.e.
#    before data_process fills missing values.
wide_features = [
    *(DenseFeat(name=col, dimension=1) for col in dense_features),
    *(SparseFeat(name=col, vocabulary_size=data[col].nunique(), embedding_dim=4)
      for col in sparse_features),
]
deep_features = [
    *(DenseFeat(name=col, dimension=1) for col in dense_features),
    *(SparseFeat(name=col, vocabulary_size=data[col].nunique(), embedding_dim=4)
      for col in sparse_features),
]

# 4. Pre-process the feature columns (this also updates `data` in place)
#    and attach the label column.
train_data = data_process(data, dense_features, sparse_features)
train_data['label'] = data['label']
4.2 构建模型
# 5. Build the model. Despite its name, `history` holds the Keras model
#    returned by WideDeep, not a training history.
history = WideDeep(wide_features=wide_features, deep_features=deep_features)
4.3 编译模型
# history.summary()  # uncomment to print the layer summary

# 6. Compile the model: Adam optimizer, binary cross-entropy loss, and
#    binary cross-entropy plus AUC as reported metrics.
history.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_crossentropy', tf.keras.metrics.AUC(name='auc')],
)

# 7. Feed the inputs as a dict keyed by feature name so that each column is
#    routed to the Input layer of the same name. The columns are read from
#    the processed `train_data` rather than from `data`, so this step does
#    not depend on data_process having modified `data` in place.
train_model_input = {name: train_data[name] for name in dense_features + sparse_features}
4.4 模型训练
# 8. Train the model; the last 20% of the samples are held out for
#    validation by Keras.
history.fit(
    x=train_model_input,
    y=train_data['label'],
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)