Wide&Deep Tensorflow2.0实现

发布于 2022-08-07  86 次阅读



# pandas分析数据分布

# df = pd.read_csv('/Volumes/Data/oysterqaq/Desktop/Avazu_train.csv', sep=',', low_memory=True, header=0)
# print(df.head(12))
# print(df['banner_pos'].unique())
# print(df['device_type'].unique())
# print(df['device_conn_type'].unique())
# print(df['C18'].unique())

# ID-type features are not fed into the model
# The site category and the app category are crossed with each other

# Declarative feature-preprocessing spec consumed by build_input().
# Each key selects a preprocessing strategy; each entry describes one feature.
input_config = {
    # Integer categorical features one-hot encoded with a known token count
    'category': [
        # {'feature': 'hour', 'dtype': 'int32', 'num_tokens': 24,'vocab': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]},
        {'feature': 'banner_pos', 'dtype': 'int32', 'num_tokens': 8, 'vocab': [0, 1, 2, 3, 4, 5, 6, 7]},
        {'feature': 'device_type', 'dtype': 'int32', 'num_tokens': 6, 'vocab': [0, 1, 2, 3, 4, 5]},
        {'feature': 'device_conn_type', 'dtype': 'int32', 'num_tokens': 6, 'vocab': [0, 1, 2, 3, 4, 5]},
        {'feature': 'C18', 'dtype': 'int32', 'num_tokens': 4, 'vocab': [0, 1, 2, 3]},
    ],
    # High-cardinality features reduced via hash bucketing
    'hash': [
        {'feature': 'site_category', 'num_bins': 1000, 'dtype': 'string'},
        {'feature': 'app_category', 'num_bins': 1000, 'dtype': 'string'},
        {'feature': 'C14', 'num_bins': 1000, 'dtype': 'int32'},
        {'feature': 'C15', 'num_bins': 1000, 'dtype': 'int32'},
        {'feature': 'C16', 'num_bins': 1000, 'dtype': 'int32'},
        {'feature': 'C17', 'num_bins': 1000, 'dtype': 'int32'},
        {'feature': 'C21', 'num_bins': 1000, 'dtype': 'int32'},
    ],
    # Numeric features discretized into buckets (none used for this dataset)
    'int_bucket': [
        # {'feature': 'Age', 'bin_boundaries': [10, 20, 30, 40, 50, 60, 70, 80, 90], 'embedding_dims': 10}
    ],
    # Continuous numeric features (normalized; none used for this dataset)
    'num': [

    ],
    # Manual feature crosses (hashed into a shared bucket space)
    'cross': [
        {'feature': 'site_category#app_category',
         'features': ['site_category', 'app_category'],
         'num_bins': 1000000
         },

    ],
    # Raw dense features passed through unchanged (disabled)
    # 'dense': [
    #     {'feature': 'site_category', 'dtype': 'float32'}
    # ]
}

# Per-feature vocabulary / bucket sizes.
# NOTE(review): not referenced anywhere else in this file — kept for reference.
voc_size = {
    # 'hour':24,
    'banner_pos': 8,
    'device_type': 6,
    'device_conn_type': 6,
    'C18': 4,
    'site_category': 1000,
    'app_category': 1000,
    'C14': 1000,
    'C15': 1000,
    'C16': 1000,
    'C17': 1000,
    'C21': 1000,

}
# Names of the sparse (categorical / hashed) features.
spare_features_config = [
    # 'hour',
    'banner_pos', 'device_type', 'device_conn_type', 'C18', 'site_category', 'app_category', 'C14', 'C15', 'C16', 'C17',
    'C21']
# Names of the crossed features fed to the wide (linear) side.
cross_features_config = [
    'site_category#app_category']
# No raw dense features are used for this dataset.
dense_features_config = []

# Raw inputs consumed by the wide (linear) sub-model.
wide_input_config = ['site_category', 'app_category']

# Raw inputs consumed by the deep sub-model.
deep_input_config = ['banner_pos', 'device_type', 'device_conn_type', 'C18', 'site_category', 'app_category', 'C14',
                     'C15', 'C16', 'C17',
                     'C21']


# Continuous numeric features        -> tf.keras.Input
# Integer categorical features       -> layers.CategoryEncoding
#   (layers.IntegerLookup when the number of categories is unknown)
# String categorical features        -> layers.StringLookup
# Hash bucketing                     -> layers.Hashing
# Continuous-value bucketing         -> layers.Discretization


# Build the feature preprocessing layers
def build_input(input_config):
    """Build raw Input layers and preprocessing layers from ``input_config``.

    Args:
        input_config: dict with optional keys 'num', 'category', 'hash',
            'int_bucket' and 'cross', each mapping to a list of per-feature
            spec dicts (see the module-level ``input_config``).

    Returns:
        Tuple ``(feature_input, feature_map, input_map)`` where
        ``feature_input`` is the list of preprocessed feature tensors,
        ``feature_map`` maps feature name -> preprocessed tensor, and
        ``input_map`` maps feature name -> raw ``tf.keras.Input``.
    """
    feature_input = []
    feature_map = {}
    input_map = {}
    # Continuous numeric features: raw Input, no preprocessing applied here.
    for num_feature in input_config.get('num', []):
        layer = tf.keras.Input(shape=[1], dtype=num_feature['dtype'],
                               name=num_feature['feature'])
        input_map[num_feature['feature']] = layer
        feature_input.append(layer)
        feature_map[num_feature['feature']] = layer
    # Categorical features: one-hot encode.
    for cate_feature in input_config.get('category', []):
        layer = layers.Input(shape=[1], dtype=cate_feature['dtype'],
                             name=cate_feature['feature'])
        input_map[cate_feature['feature']] = layer
        if cate_feature.get('num_tokens') is None:
            # String-valued category: look up against the configured vocabulary.
            # BUGFIX: the spec key is 'vocab', not 'vocabulary' (was a KeyError).
            layer = layers.StringLookup(vocabulary=cate_feature['vocab'],
                                        output_mode="one_hot",
                                        num_oov_indices=0)(layer)
        else:
            # Integer-valued category with a known token count.
            layer = layers.CategoryEncoding(num_tokens=cate_feature['num_tokens'],
                                            output_mode="one_hot")(layer)
        # Optional linear projection acting as an embedding of the one-hot vector.
        if cate_feature.get('embedding_dims') is not None:
            layer = layers.Dense(cate_feature['embedding_dims'], use_bias=False)(layer)
        feature_input.append(layer)
        feature_map[cate_feature['feature']] = layer
    # High-cardinality features: hash into a fixed number of one-hot buckets.
    for hash_feature in input_config.get('hash', []):
        layer = tf.keras.Input(shape=[1], dtype=hash_feature['dtype'],
                               name=hash_feature['feature'])
        input_map[hash_feature['feature']] = layer
        layer = layers.Hashing(num_bins=hash_feature['num_bins'],
                               output_mode='one_hot')(layer)
        if hash_feature.get('embedding_dims') is not None:
            layer = layers.Dense(hash_feature['embedding_dims'], use_bias=False)(layer)
        feature_input.append(layer)
        feature_map[hash_feature['feature']] = layer
    # Continuous features discretized into buckets.
    # BUGFIX: the original never created an Input layer here and registered the
    # entry in input_map under the previous loop's ``hash_feature`` variable.
    for bucket_feature in input_config.get('int_bucket', []):
        layer = tf.keras.Input(shape=[1],
                               dtype=bucket_feature.get('dtype', 'float32'),
                               name=bucket_feature['feature'])
        input_map[bucket_feature['feature']] = layer
        layer = layers.Discretization(
            bin_boundaries=bucket_feature['bin_boundaries'])(layer)
        if bucket_feature.get('embedding_dims') is not None:
            layer = layers.Dense(bucket_feature['embedding_dims'], use_bias=False)(layer)
        feature_input.append(layer)
        feature_map[bucket_feature['feature']] = layer
    # Manual crosses: hash the tuple of raw inputs into one one-hot bucket id.
    # Crossed tensors are exposed only through feature_map (as in the original).
    for cross_feature in input_config.get('cross', []):
        crossing = tf.keras.layers.experimental.preprocessing.HashedCrossing(
            num_bins=cross_feature['num_bins'], output_mode='one_hot')
        layer = crossing(tuple(input_map[f] for f in cross_feature['features']))
        feature_map[cross_feature['feature']] = layer

    return feature_input, feature_map, input_map


import tensorflow as tf
from keras import layers


def build_spare_features(spare_features_config, feature_input_map):
    """Return the preprocessed tensors for the named sparse features, in order."""
    return [feature_input_map[name] for name in spare_features_config]


def build_cross_features(cross_features_config, feature_input_map):
    """Return the crossed-feature tensors for the named crosses, in order."""
    return [feature_input_map[name] for name in cross_features_config]


def build_dense_features(dense_features_config, feature_input_map):
    """Return the tensors for the named dense features, in order.

    BUGFIX: the original iterated the module-level ``spare_features_config``
    instead of its own ``dense_features_config`` parameter, so the "dense"
    list silently duplicated the sparse features.
    """
    dense_features = []
    for feature_name in dense_features_config:
        dense_features.append(feature_input_map[feature_name])
    return dense_features


def build_wide_input(wide_input_config, input_map):
    """Select the raw Input layers the wide (linear) sub-model consumes."""
    return {name: input_map[name] for name in wide_input_config}


def build_deep_input(deep_input_config, input_map):
    """Select the raw Input layers the deep sub-model consumes."""
    return {name: input_map[name] for name in deep_input_config}


# Crossed one-hot sparse vectors feed the wide part; everything else goes to
# the deep part (batch normalization stands in for input normalization).
def wide_and_deep(deep_input_config, wide_input_config, input_config, spare_features_config, dense_features_config,
                  cross_features_config):
    """Assemble and compile a Wide & Deep CTR model.

    Args:
        deep_input_config: feature names consumed by the deep sub-model.
        wide_input_config: feature names consumed by the wide sub-model.
        input_config: preprocessing spec passed to ``build_input``.
        spare_features_config: sparse feature names for the deep MLP.
        dense_features_config: dense feature names for the deep MLP.
        cross_features_config: crossed feature names for the wide linear part.

    Returns:
        A compiled ``WideDeepModel`` with per-side optimizers
        (FTRL+L1 for wide, Adagrad for deep — as in the Wide & Deep paper).
    """
    feature_input, feature_map, input_map = build_input(input_config)
    spare_features = build_spare_features(spare_features_config, feature_map)
    # BUGFIX: the original called build_spare_features here; the result was
    # identical but the intent is the cross-feature helper.
    cross_features = build_cross_features(cross_features_config, feature_map)
    dense_features = build_dense_features(dense_features_config, feature_map)
    # Wide (linear) part: crossed one-hot features -> single logit.
    # BUGFIX: Keras concatenate rejects a single-element list, and
    # cross_features has exactly one entry with the module-level config.
    w = cross_features[0] if len(cross_features) == 1 else layers.concatenate(cross_features)
    wide_output = layers.Dense(1)(w)
    linear_model = tf.keras.Model(inputs=build_wide_input(wide_input_config, input_map),
                                  outputs=wide_output)
    # Deep part: MLP with BatchNorm + ReLU + Dropout per hidden layer.
    hidden_units = [32, 64, 64, 128, 128]
    dropout_rate = 0.1
    deep_inputs = dense_features + spare_features
    x = deep_inputs[0] if len(deep_inputs) == 1 else layers.concatenate(deep_inputs)
    for units in hidden_units:
        x = layers.Dense(units)(x)
        x = layers.BatchNormalization()(x)
        x = layers.ReLU()(x)
        x = layers.Dropout(dropout_rate)(x)
    deep_output = layers.Dense(1)(x)
    deep_model = tf.keras.Model(inputs=build_deep_input(deep_input_config, input_map),
                                outputs=deep_output)
    # NOTE(review): WideDeepModel lives under experimental/compat namespaces and
    # moved between TF releases — confirm this path for the target TF version.
    combined_model = tf.compat.v1.keras.experimental.WideDeepModel(
        linear_model, deep_model, activation='sigmoid')
    wide_optimizer = tf.keras.optimizers.Ftrl(
        l1_regularization_strength=0.001,
        learning_rate=tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=0.1, decay_steps=10000, decay_rate=0.9))
    deep_optimizer = tf.keras.optimizers.Adagrad()
    combined_model.compile(
        # One optimizer per sub-model: [wide, deep].
        optimizer=[wide_optimizer, deep_optimizer],
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy()])
    return combined_model


# Build and compile the Wide & Deep model from the module-level configs.
model = wide_and_deep(deep_input_config, wide_input_config, input_config, spare_features_config, dense_features_config,
                      cross_features_config)
# model.summary()

# Stream the Avazu CSV as a tf.data pipeline; 'click' is the binary label.
dataset = tf.data.experimental.make_csv_dataset(
    '/Volumes/Data/oysterqaq/Desktop/Avazu_train_1.csv', batch_size=2, label_name='click'
)
# model.summary()
# NOTE(review): batch_size in fit() is ignored for tf.data inputs — the
# effective batch size is the 2 set in make_csv_dataset above.
model.fit(dataset,
          batch_size=20, epochs=11)
# df = pd.read_csv('/Volumes/Data/oysterqaq/Desktop/Avazu_train_1.csv', sep=',', low_memory=True)
# layer = tf.keras.layers.experimental.preprocessing.HashedCrossing(
#     num_bins=4, output_mode='one_hot')
# print(layer((df['app_category'], df['site_category'])))

 


面向ACG编程