# xDeepFM — TensorFlow 2.0 implementation
# (originally a blog post published 2022-08-31)

import tensorflow as tf
from keras import layers

# Declarative feature-preprocessing spec consumed by build_input().
# Each section maps to a Keras preprocessing layer.
input_config = {
    # Integer categorical features, one-hot encoded via CategoryEncoding.
    'category': [
        # {'feature': 'hour', 'dtype': 'int32', 'num_tokens': 24,'vocab': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]},
        {'feature': 'banner_pos', 'dtype': 'int32', 'num_tokens': 8, 'vocab': [0, 1, 2, 3, 4, 5, 6, 7]},
        {'feature': 'device_type', 'dtype': 'int32', 'num_tokens': 6, 'vocab': [0, 1, 2, 3, 4, 5]},
        {'feature': 'device_conn_type', 'dtype': 'int32', 'num_tokens': 6, 'vocab': [0, 1, 2, 3, 4, 5]},
        {'feature': 'C18', 'dtype': 'int32', 'num_tokens': 4, 'vocab': [0, 1, 2, 3]},
    ],
    # High-cardinality features hashed into num_bins buckets.
    'hash': [
        {'feature': 'site_category', 'num_bins': 1000, 'dtype': 'string'},
        {'feature': 'app_category', 'num_bins': 1000, 'dtype': 'string'},
        {'feature': 'C14', 'num_bins': 1000, 'dtype': 'int32'},
        {'feature': 'C15', 'num_bins': 1000, 'dtype': 'int32'},
        {'feature': 'C16', 'num_bins': 1000, 'dtype': 'int32'},
        {'feature': 'C17', 'num_bins': 1000, 'dtype': 'int32'},
        {'feature': 'C21', 'num_bins': 1000, 'dtype': 'int32'},
    ],
    # Numeric features discretized by explicit bin boundaries.
    'int_bucket': [
        # {'feature': 'Age', 'bin_boundaries': [10, 20, 30, 40, 50, 60, 70, 80, 90], 'embedding_dims': 10}
    ],
    # Continuous numeric features (normalized), used as-is.
    'num': [

    ],
    # Manual feature crosses (not implemented in build_input yet).
    'cross': [

    ],
    # Raw dense features.
    # 'dense': [
    #     {'feature': 'site_category', 'dtype': 'float32'}
    # ]
}

# Vocabulary / bucket size per feature; kept in sync with input_config above.
voc_size = {
    # 'hour':24,
    'banner_pos': 8,
    'device_type': 6,
    'device_conn_type': 6,
    'C18': 4,
    'site_category': 1000,
    'app_category': 1000,
    'C14': 1000,
    'C15': 1000,
    'C16': 1000,
    'C17': 1000,
    'C21': 1000,

}
# Sparse features: fed to the LR tower, the CIN tower (via embeddings) and the DNN.
spare_features_config = [
    # 'hour',
    'banner_pos', 'device_type', 'device_conn_type', 'C18', 'site_category', 'app_category', 'C14', 'C15', 'C16', 'C17',
    'C21']
# Dense features: fed to the DNN tower only (empty for this dataset).
dense_features_config = []

class CIN(layers.Layer):
    """Compressed Interaction Network layer from the xDeepFM paper.

    Consumes stacked sparse-feature embeddings of shape
    [None, field_num, embed_dim] and produces explicit vector-wise feature
    interactions of shape [None, H_1 + H_2 + ... + H_T].
    """

    def __init__(self, cin_size=(128, 128), l2_reg=1e-4):
        """
        :param cin_size: A sequence [H_1, H_2, ..., H_T] with the width of
            each of the T CIN layers.
        :param l2_reg: L2 regularization strength applied to the CIN filters.
        """
        super(CIN, self).__init__()
        # Copy to a fresh list: avoids the mutable-default-argument pitfall of
        # the original (cin_size=[128, 128]) and decouples us from the caller.
        self.cin_size = list(cin_size)
        self.l2_reg = l2_reg

    def build(self, input_shape):
        # input_shape: [None, field_nums, embedding_dim]
        self.field_nums = input_shape[1]

        # Widths of every CIN layer including the input layer H_0
        # (the raw number of fields).
        self.field_nums = [self.field_nums] + self.cin_size

        # 1x1-conv filters: layer i maps the field_nums[0]*field_nums[i]
        # pairwise-interaction channels to field_nums[i+1] feature maps.
        self.cin_W = {
            'CIN_W_' + str(i): self.add_weight(
                name='CIN_W_' + str(i),
                shape=(1, self.field_nums[0] * self.field_nums[i], self.field_nums[i + 1]),
                initializer='random_uniform',
                regularizer=tf.keras.regularizers.l2(self.l2_reg),
                trainable=True
            )
            for i in range(len(self.field_nums) - 1)
        }

        super(CIN, self).build(input_shape)

    def call(self, inputs):
        # inputs: [None, field_num, embed_dim]
        embed_dim = inputs.shape[-1]
        hidden_layers_results = [inputs]

        # Split the input tensor along the embedding dimension so the outer
        # product below can be computed channel by channel: a list of
        # embed_dim tensors, each of shape [None, field_nums[0], 1].
        split_X_0 = tf.split(hidden_layers_results[0], embed_dim, 2)

        for idx, size in enumerate(self.cin_size):
            # Same per-channel split for the previous layer's output X^{k-1}:
            # embed_dim tensors of shape [None, field_nums[idx], 1].
            split_X_K = tf.split(hidden_layers_results[-1], embed_dim, 2)

            # Channel-wise outer product between X^0 and X^{k-1}:
            # [embed_dim, None, field_nums[0], field_nums[idx]].
            out_product_res_m = tf.matmul(split_X_0, split_X_K, transpose_b=True)
            # Merge the last two axes into one "interaction channel" axis.
            out_product_res_o = tf.reshape(out_product_res_m,
                                           shape=[embed_dim, -1, self.field_nums[0] * self.field_nums[idx]])
            # Move batch first: [None, embed_dim, field_nums[0]*field_nums[idx]].
            out_product_res = tf.transpose(out_product_res_o, perm=[1, 0, 2])

            # 1-D convolution implements the CIN compression: each of the
            # field_nums[idx+1] filters spans all interaction channels and
            # slides over the embed_dim axis, yielding
            # [None, embed_dim, field_nums[idx+1]].
            cur_layer_out = tf.nn.conv1d(input=out_product_res, filters=self.cin_W['CIN_W_' + str(idx)], stride=1,
                                         padding='VALID')

            # Back to [None, field_nums[idx+1], embed_dim].
            cur_layer_out = tf.transpose(cur_layer_out, perm=[0, 2, 1])

            hidden_layers_results.append(cur_layer_out)

        # Collect every intermediate layer's output, dropping the input layer:
        # T tensors of shape [None, H_i, embed_dim].
        final_result = hidden_layers_results[1:]

        # Concatenate along the field axis: [None, H_1+H_2+...+H_T, embed_dim].
        result = tf.concat(final_result, axis=1)
        # Sum-pool over the embedding axis: [None, H_1+H_2+...+H_T].
        result = tf.reduce_sum(result, axis=-1, keepdims=False)

        return result



def build_input(input_config):
    """Build Keras inputs and preprocessed feature tensors from a config dict.

    :param input_config: dict with optional keys 'num', 'category', 'hash',
        'int_bucket' (see the module-level ``input_config`` for the schema).
    :return: tuple (feature_input, feature_map, input_map) where
        feature_input is the list of preprocessed tensors,
        feature_map maps feature name -> preprocessed tensor, and
        input_map maps feature name -> raw tf.keras.Input (for tf.keras.Model).
    """
    feature_input = []
    feature_map = {}
    input_map = {}
    # Continuous numeric features: passed through unchanged.
    for num_feature in input_config.get('num', []):
        layer = tf.keras.Input(shape=[1], dtype=num_feature['dtype'], name=num_feature['feature'])
        input_map[num_feature['feature']] = layer
        feature_input.append(layer)
        feature_map[num_feature['feature']] = layer
    # Categorical features: one-hot encoded.
    for cate_feature in input_config.get('category', []):
        layer = layers.Input(shape=[1], dtype=cate_feature['dtype'], name=cate_feature['feature'])
        input_map[cate_feature['feature']] = layer
        # Integer categories come with 'num_tokens'; everything else goes
        # through a vocabulary lookup.
        if cate_feature.get('num_tokens') is None:
            # BUG FIX: the config stores the vocabulary under 'vocab'; the
            # original read the non-existent key 'vocabulary' (KeyError).
            vocab = cate_feature.get('vocab', cate_feature.get('vocabulary'))
            layer = layers.StringLookup(vocabulary=vocab, output_mode="one_hot",
                                        num_oov_indices=0)(layer)
        else:
            layer = layers.CategoryEncoding(num_tokens=cate_feature['num_tokens'], output_mode="one_hot")(layer)
        # Optional dense projection acting as an embedding of the one-hot vector.
        if cate_feature.get('embedding_dims') is not None:
            layer = layers.Dense(cate_feature['embedding_dims'], use_bias=False)(layer)
        feature_input.append(layer)
        feature_map[cate_feature['feature']] = layer
    # High-cardinality features hashed into num_bins one-hot buckets.
    for hash_feature in input_config.get('hash', []):
        layer = tf.keras.Input(shape=[1], dtype=hash_feature['dtype'], name=hash_feature['feature'])
        input_map[hash_feature['feature']] = layer
        layer = layers.Hashing(num_bins=hash_feature['num_bins'], output_mode='one_hot')(layer)
        if hash_feature.get('embedding_dims') is not None:
            layer = layers.Dense(hash_feature['embedding_dims'], use_bias=False)(layer)
        feature_input.append(layer)
        feature_map[hash_feature['feature']] = layer
    # Numeric features discretized by explicit bin boundaries.
    # BUG FIX: the original never created a tf.keras.Input for these features
    # and registered them in input_map under the wrong key ('hash_feature'
    # leaked from the previous loop) with the transformed tensor instead of
    # the raw input.
    for bucket_feature in input_config.get('int_bucket', []):
        layer = tf.keras.Input(shape=[1], dtype=bucket_feature.get('dtype', 'float32'),
                               name=bucket_feature['feature'])
        input_map[bucket_feature['feature']] = layer
        layer = layers.Discretization(bin_boundaries=bucket_feature['bin_boundaries'])(layer)
        if bucket_feature.get('embedding_dims') is not None:
            layer = layers.Dense(bucket_feature['embedding_dims'], use_bias=False)(layer)
        feature_input.append(layer)
        feature_map[bucket_feature['feature']] = layer
    # NOTE(review): manual feature crosses ('cross' section) are not
    # implemented yet; the config entry is currently ignored.
    return feature_input, feature_map, input_map


def build_embed_features(embedding_dims, spare_features_config, feature_input_map):
    """Project each sparse feature's one-hot tensor to a dense embedding.

    A bias-free Dense layer applied to a one-hot vector is equivalent to an
    embedding lookup; one independent projection is created per feature.
    """
    return [
        layers.Dense(embedding_dims, use_bias=False)(feature_input_map[name])
        for name in spare_features_config
    ]


def build_spare_features(spare_features_config, feature_input_map):
    """Collect the preprocessed (one-hot) tensors for the given sparse features."""
    return [feature_input_map[name] for name in spare_features_config]


def build_dense_features(dense_features_config, feature_input_map):
    """Collect the tensors for the given dense features.

    BUG FIX: the original iterated the module-level global
    ``spare_features_config`` instead of the ``dense_features_config``
    parameter, so the argument was silently ignored.
    """
    return [feature_input_map[name] for name in dense_features_config]


def buildLRLayer(spare_features):
    """Linear (logistic-regression) tower: one bias-free unit over the
    concatenation of all sparse one-hot features."""
    concatenated = layers.concatenate(spare_features)
    return layers.Dense(1, use_bias=False)(concatenated)



def xdeepfm(input_config, spare_features_config, dense_features_config, hidden_units,
            embedding_dims=8, dropout_rate=0.1):
    """Build and compile an xDeepFM model (LR + CIN + DNN towers).

    :param input_config: feature preprocessing spec (see module-level input_config).
    :param spare_features_config: names of sparse features (all three towers).
    :param dense_features_config: names of dense features (DNN tower only).
    :param hidden_units: DNN layer widths, e.g. [32, 64, 64, 128, 128].
    :param embedding_dims: embedding size shared by all sparse features.
    :param dropout_rate: dropout applied after every DNN layer.
    :return: a compiled tf.keras.Model mapping the raw inputs to a
        click probability in [0, 1].
    """
    feature_input, feature_map, input_map = build_input(input_config)
    embed_features = build_embed_features(embedding_dims, spare_features_config, feature_map)
    spare_features = build_spare_features(spare_features_config, feature_map)
    dense_features = build_dense_features(dense_features_config, feature_map)
    # Linear (LR) tower over the raw one-hot features.
    LRLayer = buildLRLayer(spare_features)
    # CIN tower over the stacked embeddings [None, field_num, embed_dim].
    CINLayer = CIN()(tf.stack(embed_features, axis=1))
    # DNN tower. BUG FIX: the original overwrote the hidden_units argument
    # with a hard-coded list, silently ignoring the parameter.
    x = layers.concatenate(dense_features + embed_features)
    for units in hidden_units:
        x = layers.Dense(units)(x)
        x = layers.BatchNormalization()(x)
        x = layers.ReLU()(x)
        x = layers.Dropout(dropout_rate)(x)
    dnn_output = layers.Dense(1, activation='sigmoid', use_bias=False)(x)
    # Combine the three towers with a final unit. BUG FIX: added a sigmoid
    # activation — binary_crossentropy below expects probabilities
    # (from_logits defaults to False), not a linear output.
    output = layers.Dense(1, activation='sigmoid')(layers.concatenate([LRLayer, CINLayer, dnn_output]))
    model = tf.keras.Model(input_map, output)
    model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=[tf.keras.metrics.BinaryAccuracy()]
                  )
    return model


hidden_units = [32, 64, 64, 128, 128]

model = xdeepfm(input_config, spare_features_config, dense_features_config, hidden_units)
# NOTE(review): hard-coded local path — parameterize before reuse.
dataset = tf.data.experimental.make_csv_dataset(
    '/Volumes/Data/oysterqaq/Desktop/Avazu_train_1.csv', batch_size=2000, label_name='click'
)
# Skip malformed CSV rows instead of aborting the input pipeline.
dataset = dataset.apply(tf.data.experimental.ignore_errors())
model.summary()
# BUG FIX: batch_size must not be passed to fit() together with a
# tf.data.Dataset — the dataset is already batched (batch_size=2000 above)
# and Keras raises a ValueError for dataset inputs with batch_size set.
model.fit(dataset, epochs=100)



 


# Blog footer: 面向ACG编程 ("ACG-oriented programming")