import tensorflow as tf
from keras import layers
# Per-feature preprocessing plan consumed by build_input().
# Each entry's 'feature' value must match a column name of the input CSV
# (the Keras Input layers are named after it so tf.data can route columns).
input_config = {
# Integer categoricals with a small known vocabulary -> one-hot via CategoryEncoding.
'category': [
# {'feature': 'hour', 'dtype': 'int32', 'num_tokens': 24,'vocab': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]},
{'feature': 'banner_pos', 'dtype': 'int32', 'num_tokens': 8, 'vocab': [0, 1, 2, 3, 4, 5, 6, 7]},
{'feature': 'device_type', 'dtype': 'int32', 'num_tokens': 6, 'vocab': [0, 1, 2, 3, 4, 5]},
{'feature': 'device_conn_type', 'dtype': 'int32', 'num_tokens': 6, 'vocab': [0, 1, 2, 3, 4, 5]},
{'feature': 'C18', 'dtype': 'int32', 'num_tokens': 4, 'vocab': [0, 1, 2, 3]},
],
# Hash bucketing: high-cardinality features hashed into num_bins one-hot buckets.
'hash': [
{'feature': 'site_category', 'num_bins': 1000, 'dtype': 'string'},
{'feature': 'app_category', 'num_bins': 1000, 'dtype': 'string'},
{'feature': 'C14', 'num_bins': 1000, 'dtype': 'int32'},
{'feature': 'C15', 'num_bins': 1000, 'dtype': 'int32'},
{'feature': 'C16', 'num_bins': 1000, 'dtype': 'int32'},
{'feature': 'C17', 'num_bins': 1000, 'dtype': 'int32'},
{'feature': 'C21', 'num_bins': 1000, 'dtype': 'int32'},
],
# Numeric bucketization by explicit bin boundaries (currently empty).
'int_bucket': [
# {'feature': 'Age', 'bin_boundaries': [10, 20, 30, 40, 50, 60, 70, 80, 90], 'embedding_dims': 10}
],
# Continuous numeric features. NOTE(review): the original label says
# "normalized", but build_input() applies no normalization — verify upstream.
'num': [
],
# Manual feature crosses (currently unused; cross handling in build_input is commented out).
'cross': [
],
# Raw dense features (disabled).
# 'dense': [
# {'feature': 'site_category', 'dtype': 'float32'}
# ]
}
# Feature -> vocabulary/bucket size map (mirrors input_config sizes).
# NOTE(review): voc_size is not referenced anywhere in this file — presumably
# kept for external consumers or a future embedding layer; confirm before removing.
voc_size = {
# 'hour':24,
'banner_pos': 8,
'device_type': 6,
'device_conn_type': 6,
'C18': 4,
'site_category': 1000,
'app_category': 1000,
'C14': 1000,
'C15': 1000,
'C16': 1000,
'C17': 1000,
'C21': 1000,
}
# Names of sparse (categorical) features; order determines field order in the FM part.
spare_features_config = [
# 'hour',
'banner_pos', 'device_type', 'device_conn_type', 'C18', 'site_category', 'app_category', 'C14', 'C15', 'C16', 'C17',
'C21']
# Names of dense (continuous) features fed straight to the DNN (none at present).
dense_features_config = []
# Sparse features take two paths: (1) straight into the LR (wide) part, (2) into embeddings.
# Embedded features take two paths: (1) into the FM part, (2) concatenated with dense features into the DNN.
# Final head: concatenate(lr + fm + dnn) -> Dense(1) -> sigmoid.
def build_input(input_config):
    """Build Keras Input layers and their preprocessing heads from input_config.

    Returns a 3-tuple:
        feature_input: list of preprocessed feature tensors, in config order.
        feature_map:   {feature_name: preprocessed tensor} for lookup by name.
        input_map:     {feature_name: raw tf.keras.Input} to pass to tf.keras.Model.
    """
    feature_input = []
    feature_map = {}
    input_map = {}

    # Continuous numeric features: passed through untouched.
    for num_feature in input_config.get('num', []):
        layer = tf.keras.Input(shape=[1], dtype=num_feature['dtype'],
                               name=num_feature['feature'])
        input_map[num_feature['feature']] = layer
        feature_input.append(layer)
        feature_map[num_feature['feature']] = layer

    # Categorical features: one-hot encode, optionally project to an embedding.
    for cate_feature in input_config.get('category', []):
        layer = layers.Input(shape=[1], dtype=cate_feature['dtype'],
                             name=cate_feature['feature'])
        input_map[cate_feature['feature']] = layer
        if cate_feature.get('num_tokens') is None:
            # String-valued category: look up against the explicit vocabulary.
            # Fix: the config key is 'vocab' (the original read 'vocabulary',
            # which raised KeyError on this branch).
            layer = layers.StringLookup(vocabulary=cate_feature['vocab'],
                                        output_mode="one_hot",
                                        num_oov_indices=0)(layer)
        else:
            # Integer-coded category with a known token count.
            layer = layers.CategoryEncoding(num_tokens=cate_feature['num_tokens'],
                                            output_mode="one_hot")(layer)
        # Optional dense projection acting as an embedding of the one-hot vector.
        if cate_feature.get('embedding_dims') is not None:
            layer = layers.Dense(cate_feature['embedding_dims'], use_bias=False)(layer)
        feature_input.append(layer)
        feature_map[cate_feature['feature']] = layer

    # High-cardinality features: hash into num_bins one-hot buckets.
    for hash_feature in input_config.get('hash', []):
        layer = tf.keras.Input(shape=[1], dtype=hash_feature['dtype'],
                               name=hash_feature['feature'])
        input_map[hash_feature['feature']] = layer
        layer = layers.Hashing(num_bins=hash_feature['num_bins'],
                               output_mode='one_hot')(layer)
        if hash_feature.get('embedding_dims') is not None:
            layer = layers.Dense(hash_feature['embedding_dims'], use_bias=False)(layer)
        feature_input.append(layer)
        feature_map[hash_feature['feature']] = layer

    # Numeric bucketization. Fixes over the original:
    #   - an actual tf.keras.Input is created (the original never made one and
    #     applied Dense to an uncalled Discretization layer object);
    #   - input_map is keyed by bucket_feature (the original reused the leaked
    #     hash_feature loop variable).
    for bucket_feature in input_config.get('int_bucket', []):
        # dtype is not present in the example config — default to float32; confirm.
        raw = tf.keras.Input(shape=[1],
                             dtype=bucket_feature.get('dtype', 'float32'),
                             name=bucket_feature['feature'])
        input_map[bucket_feature['feature']] = raw
        layer = layers.Discretization(
            bin_boundaries=bucket_feature['bin_boundaries'],
            output_mode='one_hot')(raw)
        if bucket_feature.get('embedding_dims') is not None:
            layer = layers.Dense(bucket_feature['embedding_dims'], use_bias=False)(layer)
        feature_input.append(layer)
        feature_map[bucket_feature['feature']] = layer

    # TODO(review): 'cross' features are declared in input_config but not built;
    # the original implementation was commented out (tf.feature_column-based).
    return feature_input, feature_map, input_map
def build_embed_features(embedding_dims, spare_features_config, feature_input_map):
    """Project each sparse feature's one-hot tensor to a dense embedding.

    A bias-free Dense layer applied to a one-hot vector is equivalent to an
    embedding lookup; each feature gets its own projection of size
    embedding_dims. Returns the embedded tensors in config order.
    """
    return [
        layers.Dense(embedding_dims, use_bias=False)(feature_input_map[name])
        for name in spare_features_config
    ]
def build_spare_features(spare_features_config, feature_input_map):
    """Collect the preprocessed sparse feature tensors, in config order."""
    return [feature_input_map[name] for name in spare_features_config]
def build_dense_features(dense_features_config, feature_input_map):
    """Collect the dense feature tensors, in config order.

    Fix: the original iterated the module-level ``spare_features_config``
    instead of this function's ``dense_features_config`` parameter, so it
    returned the sparse features (and would KeyError if a sparse feature was
    absent from feature_input_map).
    """
    return [feature_input_map[name] for name in dense_features_config]
def buildLRLayer(spare_features):
    """Linear (wide / LR) part: a single bias-free linear unit over the
    concatenated sparse one-hot features.

    Fix: Keras ``concatenate`` requires at least two inputs, so a
    single-feature configuration is passed through directly.
    """
    if len(spare_features) == 1:
        merged = spare_features[0]
    else:
        merged = layers.concatenate(spare_features)
    return layers.Dense(1, use_bias=False)(merged)
def buildDNN(spare_features):
    """Single bias-free linear unit over the concatenated features.

    Fixes over the original:
      - ``use_bias=False`` was passed to the layer *call* instead of the
        ``Dense`` constructor, which raises TypeError at runtime; it now
        configures the layer.
      - a single-element list is passed through (``concatenate`` needs >= 2).

    NOTE(review): this helper is not called by deepfm(), which builds its DNN
    inline — possibly dead code; confirm before relying on it.
    """
    if len(spare_features) == 1:
        merged = spare_features[0]
    else:
        merged = layers.concatenate(spare_features)
    return layers.Dense(1, use_bias=False)(merged)
class FM(layers.Layer):
    """Factorization Machine pairwise-interaction layer.

    Computes the FM second-order term with the O(field_num * emb_dim) identity
        sum_{i<j} <v_i, v_j> = 0.5 * sum_k [ (sum_i v_ik)^2 - sum_i (v_ik)^2 ]
    over the per-field embedding vectors.
    """
    def __init__(self, **kwargs):
        super(FM, self).__init__(**kwargs)
    def build(self, input_shape):
        super(FM, self).build(input_shape)  # Be sure to call this somewhere!
    def call(self, inputs, **kwargs):
        """
        inputs: a list of per-field embedding tensors, list length field_num.
        In this file each element is (None, emb_dim) — stacking on axis=1
        below is only consistent with that shape. (The original comment
        claimed (None, 1, emb_dim); that looks wrong here — verify.)
        Returns a (None, 1) tensor of summed pairwise interactions.
        """
        concated_embeds_value = tf.stack(inputs, axis=1)  # (None, field_num, emb_dim)
        # (sum of embeddings)^2, fields collapsed.
        square_of_sum = tf.square(tf.reduce_sum(
            concated_embeds_value, axis=1, keepdims=True))  # (None, 1, emb_dim)
        # sum of (embedding^2), fields collapsed.
        sum_of_square = tf.reduce_sum(
            concated_embeds_value * concated_embeds_value,
            axis=1, keepdims=True)  # (None, 1, emb_dim)
        cross_term = square_of_sum - sum_of_square
        # Halve (each pair counted twice) and reduce over the embedding axis.
        cross_term = 0.5 * tf.reduce_sum(cross_term, axis=2, keepdims=False)  # (None, 1)
        return cross_term
    def compute_output_shape(self, input_shape):
        return (None, 1)
    def get_config(self):
        return super().get_config()
def deepfm(input_config, spare_features_config, dense_features_config, hidden_units,
           embedding_dims=8, dropout_rate=0.1):
    """Build and compile a DeepFM CTR model (LR + FM + DNN heads).

    Args:
        input_config: preprocessing plan consumed by build_input().
        spare_features_config: sparse feature names (LR + embedding paths).
        dense_features_config: dense feature names (DNN path only).
        hidden_units: iterable of DNN layer widths. Fix: the original
            overwrote this parameter with a hardcoded [32, 64]; it is now
            honored, backward-compatibly.
        embedding_dims: embedding size shared by FM and DNN (was hardcoded 8).
        dropout_rate: DNN dropout rate (was hardcoded 0.1).

    Returns:
        A compiled tf.keras.Model mapping the named inputs to a click
        probability in [0, 1].
    """
    feature_input, feature_map, input_map = build_input(input_config)
    embed_features = build_embed_features(embedding_dims, spare_features_config, feature_map)
    spare_features = build_spare_features(spare_features_config, feature_map)
    dense_features = build_dense_features(dense_features_config, feature_map)

    # Wide (LR) part over the raw one-hot sparse features.
    lr_output = buildLRLayer(spare_features)
    # FM part: pairwise interactions over the shared embeddings.
    fm_output = FM()(embed_features)
    # Deep part: embeddings (+ any dense features) through an MLP.
    x = layers.concatenate(dense_features + embed_features)
    for units in hidden_units:
        x = layers.Dense(units)(x)
        x = layers.BatchNormalization()(x)
        x = layers.ReLU()(x)
        x = layers.Dropout(dropout_rate)(x)
    dnn_output = layers.Dense(1, activation='sigmoid', use_bias=False)(x)

    # Combine the three heads. Fix: final sigmoid added — the original emitted
    # an unbounded linear output while compiling with "binary_crossentropy",
    # which expects probabilities (from_logits defaults to False).
    output = layers.Dense(1, activation='sigmoid')(
        layers.concatenate([lr_output, fm_output, dnn_output]))

    model = tf.keras.Model(input_map, output)
    model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=[tf.keras.metrics.AUC(name='auc')])
    return model
# Script entry: build the model and train on the Avazu CTR CSV.
hidden_units = [32, 64, 64, 128, 128]
model = deepfm(input_config, spare_features_config, dense_features_config, hidden_units)
# make_csv_dataset yields already-batched (features, label) pairs; 'click' is the label column.
dataset = tf.data.experimental.make_csv_dataset(
    '/Volumes/Data/oysterqaq/Desktop/Avazu_train_1.csv', batch_size=2, label_name='click'
)
model.summary()
# Fix: batch_size must NOT be passed to fit() together with a tf.data.Dataset
# (TF2 raises ValueError) — batching is controlled by make_csv_dataset above.
model.fit(dataset, epochs=11)