The figure below shows the PNN model architecture. Sparse features are first embedded; the embeddings then flow into the Product layer, which captures the linear signal lz and the feature-cross signal lp. The two are concatenated and fed through the MLP (fully connected) layers, which output the final CTR probability.
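For reference, these are the two signals the Product layer computes, in the notation of the original PNN paper (Qu et al., 2016), where $f_i$ is the embedding of feature $i$, $z$ is the concatenation of all $f_i$, and $W_z^n$, $W_p^n$ are the weights of the $n$-th unit:

$$ l_z^n = W_z^n \odot z = \sum_{i,j} (W_z^n)_{i,j}\, z_{i,j}, \qquad l_p^n = W_p^n \odot p = \sum_{i,j} (W_p^n)_{i,j}\, p_{i,j} $$

with $p_{i,j} = \langle f_i, f_j \rangle$ in the inner-product form (IPNN), and $p = f_\Sigma f_\Sigma^\top$ with $f_\Sigma = \sum_i f_i$ in the outer-product form (OPNN).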
1. Imports
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tqdm import tqdm
from collections import namedtuple
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
2. Data Processing
"""数据处理""" def data_process(data, dense_features, sparse_features): data[dense_features] = data[dense_features].fillna(0.0) for f in dense_features: data[f] = data[f].apply(lambda x:np.log(x+1) if x > -1 else -1) data[sparse_features] = data[sparse_features].fillna("-1") for f in sparse_features: lb = LabelEncoder() data[f] = lb.fit_transform(data[f]) return data[dense_features + sparse_features]
3. Building the Model
3.1 Input Layer
def build_input_layers(dnn_feature_columns):
    dense_input_dict, sparse_input_dict = {}, {}
    for f in dnn_feature_columns:
        if isinstance(f, SparseFeat):
            sparse_input_dict[f.name] = Input(shape=(1,), name=f.name)
        elif isinstance(f, DenseFeat):
            dense_input_dict[f.name] = Input(shape=(f.dimension,), name=f.name)
    return dense_input_dict, sparse_input_dict
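Note that build_input_layers dispatches on the SparseFeat and DenseFeat namedtuples that are only defined in section 4.2, so those definitions must run before this function is called. A minimal usage sketch (the feature names and vocabulary sizes here are made up):

SparseFeat = namedtuple('SparseFeat', ['name', 'vocabulary_size', 'embedding_dim'])
DenseFeat = namedtuple('DenseFeat', ['name', 'dimension'])
feature_columns = [SparseFeat(name='C1', vocabulary_size=27, embedding_dim=4),
                   SparseFeat(name='C2', vocabulary_size=92, embedding_dim=4)]
_, sparse_input_dict = build_input_layers(feature_columns)
print(sparse_input_dict)  # {'C1': <KerasTensor shape=(None, 1) ...>, 'C2': ...}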
3.2 Embedding Layer
def build_embedding_layers(dnn_feature_columns, sparse_input_dict, is_linear=False):
    embedding_layers_dict = {}
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeat), dnn_feature_columns)) if dnn_feature_columns else []
    if is_linear:
        # 1-dimensional embeddings, used as per-feature linear weights
        for f in sparse_feature_columns:
            embedding_layers_dict[f.name] = Embedding(f.vocabulary_size + 1, 1, name='1d_embedding_' + f.name)
    else:
        # k-dimensional embeddings (k = embedding_dim)
        for f in sparse_feature_columns:
            embedding_layers_dict[f.name] = Embedding(f.vocabulary_size + 1, f.embedding_dim, name='kd_embedding_' + f.name)
    return embedding_layers_dict
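A small sketch of what such a layer produces (the vocabulary size is made up; the input_dim of vocabulary_size + 1 leaves headroom above the 0..n-1 ids that LabelEncoder produces):

emb = Embedding(27, 4, name='kd_embedding_C1_demo')
ids = tf.constant([[3], [26]])  # two label-encoded ids
print(emb(ids).shape)           # (2, 1, 4)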
3.3 EmbeddingInput
def concat_embedding_list(dnn_feature_columns, sparse_input_dict, embedding_layer_dict, flatten=False):
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeat), dnn_feature_columns)) if dnn_feature_columns else []
    embedding_list = []
    for f in sparse_feature_columns:
        _input = sparse_input_dict[f.name]     # the Input layer for this feature
        _embed = embedding_layer_dict[f.name]  # the Embedding layer for this feature
        embed = _embed(_input)                 # (None, 1) -> (None, 1, embedding_dim)
        if flatten:
            embed = Flatten()(embed)
        embedding_list.append(embed)
    return embedding_list
3.4 Product Layer
class ProductLayer(Layer):
    def __init__(self, units, use_inner=True, use_outer=False):
        super(ProductLayer, self).__init__()
        self.use_inner = use_inner  # use the inner-product form for lp
        self.use_outer = use_outer  # use the outer-product form for lp
        self.units = units          # number of units for lz and for lp

    def build(self, input_shape):
        # input_shape is the list of shapes of the incoming Inputs
        self.feat_nums = len(input_shape)         # number of features
        # input_shape[0] -> (None, 1, 4)
        self.embedding_dims = input_shape[0][-1]  # embedding dimension, 4
        # In the derivation, lz is the inner product of the N x M embedding matrix
        # with a weight matrix of the same shape; here both are flattened first,
        # so the inner product becomes a plain matmul
        flatten_dims = self.feat_nums * self.embedding_dims  # 26 * 4 = 104
        # linear weights, (104, 32); 32 is the number of units
        self.linear_w = self.add_weight(name='linear_w', shape=(flatten_dims, self.units),
                                        initializer='glorot_normal')
        # inner-product weights for lp
        if self.use_inner:
            # unoptimized, each unit's weight matrix would be (N, N);
            # after the factorization, w is expressed as the product of
            # an N-dimensional column vector with itself
            self.inner_w = self.add_weight(name='inner_w', shape=(self.units, self.feat_nums),
                                           initializer='glorot_normal')
        # outer-product weights for lp
        if self.use_outer:
            # after the optimization, each unit's weight matrix is (M, M)
            self.outer_w = self.add_weight(name='outer_w',
                                           shape=(self.units, self.embedding_dims, self.embedding_dims),
                                           initializer='glorot_normal')

    def call(self, inputs):
        # inputs is the list of all embedded Inputs; concatenate them first
        concat_embedding = Concatenate(axis=1)(inputs)  # (?, 26, 4)
        # flatten the matrix for the inner product
        concat_embed_ = Flatten()(concat_embedding)
        # compute lz as an inner product
        lz = tf.matmul(concat_embed_, self.linear_w)  # (?, 104) x (104, 32) => (?, 32)
        lp_list = []
        # inner-product form of lp
        if self.use_inner:
            for i in range(self.units):
                delta = tf.multiply(concat_embedding, tf.expand_dims(self.inner_w[i], axis=1))  # (?, 26, 4) * (26, 1) => (?, 26, 4)
                delta = tf.reduce_sum(delta, axis=1)  # (?, 4)
                # keepdims=True preserves a feature axis of size 1, since reduce_sum drops a dimension
                delta = tf.reduce_sum(tf.square(delta), axis=1, keepdims=True)  # (?, 1)
                lp_list.append(delta)
        # outer-product form of lp
        if self.use_outer:
            feature_sum = tf.reduce_sum(concat_embedding, axis=1)  # (?, 4), sum of all feature embeddings
            f1 = tf.expand_dims(feature_sum, axis=2)  # (?, 4, 1)
            f2 = tf.expand_dims(feature_sum, axis=1)  # (?, 1, 4)
            product = tf.matmul(f1, f2)               # (?, 4, 1) x (?, 1, 4) => (?, 4, 4)
            for i in range(self.units):
                lpi = tf.multiply(product, self.outer_w[i])  # (?, 4, 4) * (4, 4) => (?, 4, 4)
                lpi = tf.reduce_sum(lpi, axis=[1, 2])        # (?,), sums the whole matrix
                # add a dimension so each unit's output is a (?, 1) column,
                # each column being the summed product of p and w
                lpi = tf.expand_dims(lpi, axis=1)            # (?, 1)
                lp_list.append(lpi)
        # lp_list holds one (?, 1) tensor per unit, so concatenate them into a single vector
        lp = Concatenate(axis=1)(lp_list)  # (?, 64)
        # concatenate lz and lp; this would normally be (?, 64) with 32 units each,
        # but it is (?, 96) here because lp uses both the inner and the outer product
        product_out = Concatenate(axis=1)([lz, lp])  # (?, 96)
        return product_out
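A quick shape check of the layer (a minimal sketch; the batch of random dummy embeddings below is made up to mirror the 26 features x 4 dimensions in the comments above):

dummy_inputs = [tf.random.normal((2, 1, 4)) for _ in range(26)]  # 26 embedded features, batch size 2
product_layer = ProductLayer(units=32, use_inner=True, use_outer=True)
print(product_layer(dummy_inputs).shape)  # (2, 96): 32 (lz) + 32 (inner lp) + 32 (outer lp)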
3.5 MLP Layer
def MLP(dnn_inputs, units=(64, 32)):
    # hidden layers
    for out_dim in units:
        dnn_inputs = Dense(out_dim, activation='relu')(dnn_inputs)
    # output layer
    output_layers = Dense(1, activation='sigmoid')(dnn_inputs)
    return output_layers
3.6 PNN Model
def PNN(dnn_feature_columns):
    # 1. build the input layer dicts
    _, sparse_input_dict = build_input_layers(dnn_feature_columns)
    # 2. collect the input layers as a list
    input_layers = list(sparse_input_dict.values())
    # 3. build the Embedding layers
    embedding_layers_dict = build_embedding_layers(dnn_feature_columns, sparse_input_dict, is_linear=False)
    # 4. run the inputs through the embeddings
    sparse_embedding_list = concat_embedding_list(dnn_feature_columns, sparse_input_dict, embedding_layers_dict, flatten=False)
    # 5. feed the embedding vectors through the Product layer
    dnn_inputs = ProductLayer(units=32, use_inner=True, use_outer=True)(sparse_embedding_list)
    # 6. pass the Product output through the MLP
    output_layers = MLP(dnn_inputs, units=(64, 32))
    # 7. assemble the model
    model = Model(input_layers, output_layers)
    return model
4. Running the Model
4.1 Loading the Data
"""读取数据""" data = pd.read_csv('./data/criteo_sample.txt') columns = data.columns.values dense_features = [f for f in columns if 'I' in f][:4] # 使用3个特征进行画图 sparse_features =[f for f in columns if 'C' in f][:4] train_data = data_process(data, dense_features, sparse_features) train_data['label'] = data['label']
4.2 Marking the Features with Named Tuples
SparseFeat = namedtuple('SparseFeat', ['name', 'vocabulary_size', 'embedding_dim'])
DenseFeat = namedtuple('DenseFeat', ['name', 'dimension'])

dnn_feature_columns = [SparseFeat(name=f, vocabulary_size=data[f].nunique(), embedding_dim=4)
                       for f in sparse_features]
4.3 Compiling the Model
# build the PNN model
model = PNN(dnn_feature_columns)
# model.summary()
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])
4.4 Training the Model
# feed the inputs as a dict keyed by input name; the model defined above
# only has Input layers for the sparse features, so only those are passed
train_model_input = {name: data[name] for name in sparse_features}
# train the model
model.fit(train_model_input,
          train_data['label'].values,
          batch_size=64, epochs=1, validation_split=0.2)
4.5 Plotting the Network Structure
from tensorflow.keras.utils import plot_model
plot_model(model, show_shapes=True)