diff --git a/docs/source/models/dnn_fg.md b/docs/source/models/dnn_fg.md
new file mode 100644
index 000000000..c0ec03e20
--- /dev/null
+++ b/docs/source/models/dnn_fg.md
@@ -0,0 +1,69 @@
+# DNNFG
+
+### Introduction
+
+A DNN model designed for RTP FG integration, supporting training with negative sampling and sequence features.
+
+### Configuration
+
+```protobuf
+fg_json_path: "!samples/model_config/fg_fusion_train_seq.json"
+model_config {
+  model_class: "DNNFG"
+  feature_groups {
+    group_name: "all"
+    feature_names: 'adgroup_id'
+    feature_names: 'cate_id'
+    feature_names: 'campaign_id'
+    feature_names: 'customer'
+    feature_names: 'brand'
+    feature_names: 'user_id'
+    feature_names: 'cms_segid'
+    feature_names: 'cms_group_id'
+    feature_names: 'final_gender_code'
+    feature_names: 'age_level'
+    feature_names: 'pvalue_level'
+    feature_names: 'shopping_level'
+    feature_names: 'occupation'
+    feature_names: 'new_user_class_level'
+    wide_deep: DEEP
+    sequence_features: {
+      group_name: "seq_fea"
+      tf_summary: false
+      seq_att_map: {
+        key: "cate_id"
+        key: "brand"
+        hist_seq: "click_seq__cate_id"
+        hist_seq: "click_seq__brand"
+      }
+    }
+  }
+  dnnfg {
+    dnn {
+      hidden_units: 256
+      hidden_units: 128
+      hidden_units: 64
+    }
+    l2_regularization: 1e-6
+  }
+  embedding_regularization: 5e-6
+}
+```
+
+- fg_json_path: path of the fg json file
+- model_class: 'DNNFG', do not modify
+- feature_groups: one feature_group named all is required, **the group name must not be changed**
+- dnnfg: parameters of the dnnfg model
+  - dnn: configuration of the deep part
+    - hidden_units: number of channels (i.e. neurons) in each dnn layer
+- embedding_regularization: regularization applied to the embedding part, to prevent overfitting
+
+Supported metric_set options:
+
+- auc
+- gauc
+- recall_at_topK
+
+### Example Config
+
+See: samples/model_config/fg_fusion_train_neg_seq_on_dnn.config
diff --git a/easy_rec/python/core/sampler.py b/easy_rec/python/core/sampler.py
index 9ff25aa5d..fe622243a 100644
--- a/easy_rec/python/core/sampler.py
+++ b/easy_rec/python/core/sampler.py
@@ -282,7 +282,7 @@ def get(self, ids):
     sampled_values = tf.py_func(self._get_impl, [ids], self._attr_tf_types)
     result_dict = {}
     for k, t, v in zip(self._attr_names, self._attr_tf_types, sampled_values):
-      v.set_shape([self._num_sample])
+      v.set_shape([None])
       result_dict[k] = v
     return result_dict
 
diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index 966ec6cf5..5c229d98c 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -1,16 +1,21 @@
 # -*- encoding:utf-8 -*-
+# -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
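The `sampler.py` hunk above relaxes the static output shape of `Sampler.get` from `[self._num_sample]` to `[None]`, because the number of ids produced by the `tf.py_func` may now vary at run time (for example `num_sample` during training vs `num_eval_sample` during evaluation). A minimal TF 1.x graph-mode sketch of that pattern, with a hypothetical sampler body:

```python
import numpy as np
import tensorflow as tf  # TF 1.x graph mode assumed


def _sample_negatives(ids):
  # Hypothetical sampler body: the number of returned ids is only known at
  # run time (e.g. num_sample in training, num_eval_sample in evaluation).
  return np.arange(4 * len(ids), dtype=np.int64)


ids = tf.placeholder(tf.int64, [None])
sampled = tf.py_func(_sample_negatives, [ids], tf.int64)
# Only the rank is fixed; a static [num_sample] shape would break as soon as
# the sample count differs between train and eval.
sampled.set_shape([None])

with tf.Session() as sess:
  print(sess.run(sampled, feed_dict={ids: [1, 2, 3]}).shape)  # (12,)
```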
+import json import logging +import os from abc import abstractmethod from collections import OrderedDict import six import tensorflow as tf +import easy_rec from easy_rec.python.core import sampler as sampler_lib from easy_rec.python.protos.dataset_pb2 import DatasetConfig from easy_rec.python.utils import config_util from easy_rec.python.utils import constant +from easy_rec.python.utils import fg_util from easy_rec.python.utils.check_utils import check_split from easy_rec.python.utils.check_utils import check_string_to_number from easy_rec.python.utils.expr_util import get_expression @@ -78,6 +83,54 @@ def __init__(self, self._input_path = input_path + self._fg_json_path = None + self._fg_config = None + self._fg_module = None + self._fg_input_map = dict() + self._effective_fg_features = set() + + # if self._fg_json_path is not None and self._fg_json_path != '': + # if self._fg_json_path.startswith('!'): + # self._fg_json_path = self._fg_json_path[1:] + # with tf.gfile.GFile(self._fg_json_path, 'r') as f: + # self._fg_config = json.load(f) + # for feature_config in self._fg_config['features']: + # if 'sequence_name' in feature_config: + # sequence_name = feature_config['sequence_name'] + # for sub_feature_config in feature_config['features']: + # sub_feature_name = sub_feature_config['feature_name'] + # feature_name = sequence_name + '__' + sub_feature_name + # self._fg_input_map[feature_name] = [ + # sequence_name + '__' + + # sub_feature_config['expression'].split(':')[-1] + # ] + # else: + # feature_type = feature_config['feature_type'] + # feature_name = feature_config['feature_name'] + # if feature_type in ['id_feature', 'raw_feature']: + # self._fg_input_map[feature_name] = [ + # feature_config['expression'].split(':')[-1] + # ] + # elif feature_type == 'combo_feature': + # self._fg_input_map[feature_name] = [ + # k.split(':')[-1] for k in feature_config['expression'] + # ] + # elif feature_type == 'lookup_feature': + # self._fg_input_map[feature_name] = [ + # feature_config['map'].split(':')[-1], + # feature_config['key'].split(':')[-1] + # ] + # elif feature_type == 'match_feature': + # self._fg_input_map[feature_name] = [ + # feature_config['user'].split(':')[-1], + # feature_config['category'].split(':')[-1], + # feature_config['item'].split(':')[-1], + # ] + # else: + # raise ValueError('Unknown feature type: %s' % feature_type) + # fg_op_path = os.path.join(easy_rec.ops_dir, 'libfg_op.so') + # self._fg_module = tf.load_op_library(fg_op_path) + # findout effective fields self._effective_fields = [] @@ -85,37 +138,42 @@ def __init__(self, # from the types defined in input_fields # it is used in create_multi_placeholders self._multi_value_types = {} - - for fc in self._feature_configs: - for input_name in fc.input_names: - assert input_name in self._input_fields, 'invalid input_name in %s' % str( - fc) - if input_name not in self._effective_fields: - self._effective_fields.append(input_name) - - if fc.feature_type in [fc.TagFeature, fc.SequenceFeature]: - if fc.hash_bucket_size > 0: - self._multi_value_types[fc.input_names[0]] = tf.string - else: - self._multi_value_types[fc.input_names[0]] = tf.int64 - if len(fc.input_names) > 1: - self._multi_value_types[fc.input_names[1]] = tf.float32 - - if fc.feature_type == fc.RawFeature: - self._multi_value_types[fc.input_names[0]] = tf.float32 - - # add sample weight to effective fields - if self._data_config.HasField('sample_weight'): - self._effective_fields.append(self._data_config.sample_weight) - - self._effective_fids = [ - 
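The commented-out block above (its live version appears later in this file as `set_fg_path`) derives, for every fg feature, which raw input columns it reads, following the `side:column` convention used in the `expression` fields. A small runnable sketch of that mapping over a toy fg config dict (the feature names here are illustrative, not from the real fg json):

```python
fg_config = {
    'features': [
        {'feature_name': 'cate_id', 'feature_type': 'id_feature',
         'expression': 'item:cate_id'},
        {'feature_name': 'combo_u_i', 'feature_type': 'combo_feature',
         'expression': ['user:occupation', 'item:cate_id']},
        {'sequence_name': 'click_seq', 'sequence_delim': ';',
         'features': [{'feature_name': 'cate_id', 'feature_type': 'id_feature',
                       'expression': 'item:cate_id'}]},
    ]
}

fg_input_map = {}
for fea in fg_config['features']:
  if 'sequence_name' in fea:
    seq = fea['sequence_name']
    for sub in fea['features']:
      # click_seq__cate_id -> ['click_seq__cate_id'] (the raw sequence column)
      fg_input_map[seq + '__' + sub['feature_name']] = [
          seq + '__' + sub['expression'].split(':')[-1]
      ]
  elif fea['feature_type'] in ['id_feature', 'raw_feature']:
    fg_input_map[fea['feature_name']] = [fea['expression'].split(':')[-1]]
  elif fea['feature_type'] == 'combo_feature':
    fg_input_map[fea['feature_name']] = [
        k.split(':')[-1] for k in fea['expression']
    ]

print(fg_input_map)
# {'cate_id': ['cate_id'], 'combo_u_i': ['occupation', 'cate_id'],
#  'click_seq__cate_id': ['click_seq__cate_id']}
```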
self._input_fields.index(x) for x in self._effective_fields - ] - # sort fids from small to large - self._effective_fids = list(set(self._effective_fids)) - self._effective_fields = [ - self._input_fields[x] for x in self._effective_fids - ] + # for fc in self._feature_configs: + # for input_name in fc.input_names: + # if self._fg_config is not None and input_name in self._fg_input_map: + # self._effective_fg_features.add(input_name) + # true_input_names = self._fg_input_map[input_name] + # else: + # true_input_names = [input_name] + # for true_input_name in true_input_names: + # assert true_input_name in self._input_fields, 'invalid input_name in %s' % str( + # fc) + # if true_input_name not in self._effective_fields: + # self._effective_fields.append(true_input_name) + + # if fc.feature_type in [fc.TagFeature, fc.SequenceFeature]: + # if fc.hash_bucket_size > 0: + # self._multi_value_types[fc.input_names[0]] = tf.string + # else: + # self._multi_value_types[fc.input_names[0]] = tf.int64 + # if len(fc.input_names) > 1: + # self._multi_value_types[fc.input_names[1]] = tf.float32 + + # if fc.feature_type == fc.RawFeature: + # self._multi_value_types[fc.input_names[0]] = tf.float32 + + # # add sample weight to effective fields + # if self._data_config.HasField('sample_weight'): + # self._effective_fields.append(self._data_config.sample_weight) + + # self._effective_fids = [ + # self._input_fields.index(x) for x in self._effective_fields + # ] + # # sort fids from small to large + # self._effective_fids = list(set(self._effective_fids)) + # self._effective_fields = [ + # self._input_fields[x] for x in self._effective_fids + # ] self._label_fids = [self._input_fields.index(x) for x in self._label_fields] @@ -284,6 +342,9 @@ def _preprocess(self, field_dict): sampler_type = self._data_config.WhichOneof('sampler') sampler_config = getattr(self._data_config, sampler_type) item_ids = field_dict[sampler_config.item_id_field] + + parsed_dict['__batch_size__'] = tf.shape(item_ids)[0] + parsed_dict['__sampler_type__'] = sampler_type if sampler_type in ['negative_sampler', 'negative_sampler_in_memory']: sampled = self._sampler.get(item_ids) elif sampler_type == 'negative_sampler_v2': @@ -294,6 +355,11 @@ def _preprocess(self, field_dict): sampled = self._sampler.get(user_ids, item_ids) else: raise ValueError('Unknown sampler %s' % sampler_type) + + parsed_dict['__num_neg_sample__'] = tf.shape(list(sampled.values())[0])[0] + self._appended_fields.append('__num_neg_sample__') + self._appended_fields.append('__sampler_type__') + for k, v in sampled.items(): if k in field_dict: field_dict[k] = tf.concat([field_dict[k], v], axis=0) @@ -302,6 +368,13 @@ def _preprocess(self, field_dict): parsed_dict[k] = v self._appended_fields.append(k) + if self._fg_config is not None: + if self._mode != tf.estimator.ModeKeys.PREDICT and self._fg_module is not None: + parsed_dict['_fg_cfg'] = True + self._appended_fields.append('_fg_cfg') + field_dict = fg_util._fg(self._fg_config, self._effective_fg_features, + self._fg_module, field_dict, parsed_dict) + for fc in self._feature_configs: feature_name = fc.feature_name feature_type = fc.feature_type @@ -793,7 +866,46 @@ def _safe_shard(self, dataset): else: return dataset.shard(self._task_num, self._task_index) + def _set_effective_fields(self): + for fc in self._feature_configs: + for input_name in fc.input_names: + if self._fg_config is not None and input_name in self._fg_input_map: + self._effective_fg_features.add(input_name) + true_input_names = 
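`_preprocess` above concatenates the sampled negative item attributes after the in-batch rows and records `__batch_size__`, `__sampler_type__` and `__num_neg_sample__` so that later code (the sequence layer, DNNFG, fg_util) can split the two parts apart again. A toy numpy sketch of the resulting row layout, with made-up values:

```python
import numpy as np

batch_size, num_neg = 2, 3
field_dict = {'cate_id': np.array(['c1', 'c2'])}       # in-batch (positive) items
sampled = {'cate_id': np.array(['n1', 'n2', 'n3'])}    # shared sampled negatives

for k, v in sampled.items():
  if k in field_dict:
    # rows 0..batch_size-1 are the positives, the remaining rows are negatives
    field_dict[k] = np.concatenate([field_dict[k], v], axis=0)

print(field_dict['cate_id'])   # ['c1' 'c2' 'n1' 'n2' 'n3']
```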
self._fg_input_map[input_name] + else: + true_input_names = [input_name] + for true_input_name in true_input_names: + assert true_input_name in self._input_fields, 'invalid input_name in %s' % str( + fc) + if true_input_name not in self._effective_fields: + self._effective_fields.append(true_input_name) + + if fc.feature_type in [fc.TagFeature, fc.SequenceFeature]: + if fc.hash_bucket_size > 0: + self._multi_value_types[fc.input_names[0]] = tf.string + else: + self._multi_value_types[fc.input_names[0]] = tf.int64 + if len(fc.input_names) > 1: + self._multi_value_types[fc.input_names[1]] = tf.float32 + + if fc.feature_type == fc.RawFeature: + self._multi_value_types[fc.input_names[0]] = tf.float32 + + # add sample weight to effective fields + if self._data_config.HasField('sample_weight'): + self._effective_fields.append(self._data_config.sample_weight) + + self._effective_fids = [ + self._input_fields.index(x) for x in self._effective_fields + ] + # sort fids from small to large + self._effective_fids = list(set(self._effective_fids)) + self._effective_fields = [ + self._input_fields[x] for x in self._effective_fids + ] + def create_input(self, export_config=None): + self._set_effective_fields() def _input_fn(mode=None, params=None, config=None): """Build input_fn for estimator. @@ -829,3 +941,47 @@ def _input_fn(mode=None, params=None, config=None): _input_fn.input_creator = self return _input_fn + + def set_fg_path(self, fg_json_path=None): + self._fg_json_path = fg_json_path + if self._fg_json_path is not None and self._fg_json_path != '': + if self._fg_json_path.startswith('!'): + self._fg_json_path = self._fg_json_path[1:] + with tf.gfile.GFile(self._fg_json_path, 'r') as f: + self._fg_config = json.load(f) + for feature_config in self._fg_config['features']: + if 'sequence_name' in feature_config: + sequence_name = feature_config['sequence_name'] + for sub_feature_config in feature_config['features']: + sub_feature_name = sub_feature_config['feature_name'] + feature_name = sequence_name + '__' + sub_feature_name + self._fg_input_map[feature_name] = [ + sequence_name + '__' + + sub_feature_config['expression'].split(':')[-1] + ] + else: + feature_type = feature_config['feature_type'] + feature_name = feature_config['feature_name'] + if feature_type in ['id_feature', 'raw_feature']: + self._fg_input_map[feature_name] = [ + feature_config['expression'].split(':')[-1] + ] + elif feature_type == 'combo_feature': + self._fg_input_map[feature_name] = [ + k.split(':')[-1] for k in feature_config['expression'] + ] + elif feature_type == 'lookup_feature': + self._fg_input_map[feature_name] = [ + feature_config['map'].split(':')[-1], + feature_config['key'].split(':')[-1] + ] + elif feature_type == 'match_feature': + self._fg_input_map[feature_name] = [ + feature_config['user'].split(':')[-1], + feature_config['category'].split(':')[-1], + feature_config['item'].split(':')[-1], + ] + else: + raise ValueError('Unknown feature type: %s' % feature_type) + fg_op_path = os.path.join(easy_rec.ops_dir, 'libfg_op.so') + self._fg_module = tf.load_op_library(fg_op_path) diff --git a/easy_rec/python/input/odps_input_v2.py b/easy_rec/python/input/odps_input_v2.py index e806e1c30..a0193d14a 100644 --- a/easy_rec/python/input/odps_input_v2.py +++ b/easy_rec/python/input/odps_input_v2.py @@ -8,7 +8,10 @@ from easy_rec.python.utils import odps_util try: - import pai + if tf.__version__ == '1.15': + from tensorflow.python.ops.work_queue import WorkQueue + else: + from pai.data import WorkQueue except Exception: 
pass @@ -50,7 +53,7 @@ def _build(self, mode, params): mode == tf.estimator.ModeKeys.TRAIN: logging.info('pai_worker_slice_num = %d' % self._data_config.pai_worker_slice_num) - work_queue = pai.data.WorkQueue( + work_queue = WorkQueue( self._input_path, num_epochs=self.num_epochs, shuffle=self._data_config.shuffle, diff --git a/easy_rec/python/layers/input_layer.py b/easy_rec/python/layers/input_layer.py index 4414cc3d9..edcca1759 100644 --- a/easy_rec/python/layers/input_layer.py +++ b/easy_rec/python/layers/input_layer.py @@ -34,13 +34,14 @@ def __init__(self, ev_params=None, embedding_regularizer=None, kernel_regularizer=None, - is_training=False): + is_training=False, + mode=tf.estimator.ModeKeys.EVAL): self._feature_groups = { x.group_name: FeatureGroup(x) for x in feature_groups_config } self.sequence_feature_layer = sequence_feature_layer.SequenceFeatureLayer( feature_configs, feature_groups_config, ev_params, - embedding_regularizer, kernel_regularizer, is_training) + embedding_regularizer, kernel_regularizer, is_training, mode) self._seq_feature_groups_config = [] for x in feature_groups_config: for y in x.sequence_features: @@ -84,7 +85,7 @@ def __call__(self, features, group_name, is_combine=True): assert group_name in self._feature_groups, 'invalid group_name[%s], list: %s' % ( group_name, ','.join([x for x in self._feature_groups])) feature_name_to_output_tensors = {} - negative_sampler = self._feature_groups[group_name]._config.negative_sampler + # negative_sampler = self._feature_groups[group_name]._config.negative_sampler if group_name in self._group_name_to_seq_features: for seq_feature in self._group_name_to_seq_features[group_name]: for seq_att in seq_feature.seq_att_map: @@ -95,11 +96,9 @@ def __call__(self, features, group_name, is_combine=True): features, group_name, is_combine, feature_name_to_output_tensors) if group_name in self._group_name_to_seq_features: concat_features, all_seq_fea = self.sequence_feature_layer( - features, - concat_features, + features, concat_features, self._group_name_to_seq_features[group_name], - feature_name_to_output_tensors, - negative_sampler=negative_sampler) + feature_name_to_output_tensors) group_features.extend(all_seq_fea) all_seq_fea = tf.concat(all_seq_fea, axis=-1) concat_features = tf.concat([concat_features, all_seq_fea], axis=-1) diff --git a/easy_rec/python/layers/sequence_feature_layer.py b/easy_rec/python/layers/sequence_feature_layer.py index c539e5cdc..cd3c566b7 100644 --- a/easy_rec/python/layers/sequence_feature_layer.py +++ b/easy_rec/python/layers/sequence_feature_layer.py @@ -18,7 +18,8 @@ def __init__(self, ev_params=None, embedding_regularizer=None, kernel_regularizer=None, - is_training=False): + is_training=False, + mode=tf.estimator.ModeKeys.EVAL): self._seq_feature_groups_config = [] for x in feature_groups_config: for y in x.sequence_features: @@ -33,6 +34,7 @@ def __init__(self, self._embedding_regularizer = embedding_regularizer self._kernel_regularizer = kernel_regularizer self._is_training = is_training + self.mode = mode def negative_sampler_target_attention(self, dnn_config, @@ -44,7 +46,6 @@ def negative_sampler_target_attention(self, cur_id, hist_id_col, seq_len, aux_hist_emb_list = deep_fea['key'], deep_fea[ 'hist_seq_emb'], deep_fea['hist_seq_len'], deep_fea[ 'aux_hist_seq_emb_list'] - seq_max_len = tf.shape(hist_id_col)[1] seq_emb_dim = hist_id_col.shape[2] cur_id_dim = tf.shape(cur_id)[-1] @@ -52,6 +53,7 @@ def negative_sampler_target_attention(self, pos_feature = cur_id[:batch_size] neg_feature 
= cur_id[batch_size:] + cur_id = tf.concat([ pos_feature[:, tf.newaxis, :], tf.tile(neg_feature[tf.newaxis, :, :], multiples=[batch_size, 1, 1]) @@ -65,7 +67,10 @@ def negative_sampler_target_attention(self, concat_features = tf.tile( concat_features[:, tf.newaxis, :], multiples=[1, neg_num_add_1, 1]) - seq_len = tf.tile(seq_len, multiples=[neg_num_add_1]) + + # seq_len = tf.tile(seq_len, multiples=[neg_num_add_1]) + seq_len = tf.tile(seq_len[:, tf.newaxis], multiples=[1, neg_num_add_1]) + seq_len = tf.reshape(seq_len, [neg_num_add_1 * batch_size]) if allow_key_transform and (cur_id_dim != seq_emb_dim): cur_id = tf.layers.dense( @@ -80,6 +85,7 @@ def negative_sampler_target_attention(self, [cur_ids, hist_id_col, cur_ids - hist_id_col, cur_ids * hist_id_col], axis=-1) # (B * neg_num_add_1, seq_max_len, seq_emb_dim*4) + # dnn_config.activation = "tf.nn.leaky_relu" din_layer = dnn.DNN( dnn_config, self._kernel_regularizer, @@ -88,9 +94,10 @@ def negative_sampler_target_attention(self, last_layer_no_activation=True, last_layer_no_batch_norm=True) din_net = din_layer(din_net) - scores = tf.reshape(din_net, [-1, 1, seq_max_len]) # (B, 1, ?) + # scores = tf.reshape(din_net, [-1, 1, seq_max_len]) # (B, 1, ?) + scores = tf.reshape(din_net, [-1, seq_max_len]) - seq_len = tf.expand_dims(seq_len, 1) + # seq_len = tf.expand_dims(seq_len, 1) mask = tf.sequence_mask(seq_len) padding = tf.ones_like(scores) * (-2**32 + 1) scores = tf.where(mask, scores, @@ -98,8 +105,9 @@ def negative_sampler_target_attention(self, # Scale scores = tf.nn.softmax(scores) # (B * neg_num_add_1, 1, seq_max_len) - hist_din_emb = tf.matmul(scores, - hist_id_col) # [B * neg_num_add_1, 1, seq_emb_dim] + # hist_din_emb = tf.matmul(scores, + # hist_id_col) # [B * neg_num_add_1, 1, seq_emb_dim] + hist_din_emb = tf.reduce_sum(scores[:, :, tf.newaxis] * hist_id_col, axis=1) hist_din_emb = tf.reshape(hist_din_emb, [batch_size, neg_num_add_1, seq_emb_dim ]) # [B * neg_num_add_1, seq_emb_dim] @@ -129,7 +137,7 @@ def target_attention(self, seq_emb_dim = hist_id_col.shape[2] cur_id_dim = tf.shape(cur_id)[-1] - cur_id = cur_id[:tf.shape(hist_id_col)[0], ...] # for negative sampler + # cur_id = cur_id[:tf.shape(hist_id_col)[0], ...] 
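The rewritten attention above flattens the DIN scores to `[batch * (1 + num_neg), seq_max_len]`, masks padded history positions, applies a softmax over the history axis, and pools the history embeddings with a broadcasted weighted sum instead of a matmul. A numpy sketch of that masked-softmax pooling with toy shapes:

```python
import numpy as np

rows, seq_max_len, emb_dim = 4, 3, 2        # rows = batch * (1 + num_neg)
scores = np.random.randn(rows, seq_max_len)
hist = np.random.randn(rows, seq_max_len, emb_dim)
seq_len = np.array([3, 1, 2, 3])            # valid history length per row

mask = np.arange(seq_max_len)[None, :] < seq_len[:, None]   # tf.sequence_mask
scores = np.where(mask, scores, -2.0**32 + 1)               # padded -> ~ -inf
scores = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)  # softmax

# scores[:, :, None] * hist, summed over the history axis
pooled = (scores[:, :, None] * hist).sum(axis=1)            # [rows, emb_dim]
print(pooled.shape)                                         # (4, 2)
```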
# for negative sampler if allow_key_transform and (cur_id_dim != seq_emb_dim): cur_id = tf.layers.dense( @@ -179,9 +187,13 @@ def __call__(self, features, concat_features, all_seq_att_map_config, - feature_name_to_output_tensors=None, - negative_sampler=False): + feature_name_to_output_tensors=None): logging.info('use sequence feature layer.') + negative_sampler = False + if (features.get('__sampler_type__', None) + is not None) and (features.get('_fg_cfg', None) is None): + negative_sampler = True + all_seq_fea = [] # process all sequence features for seq_att_map_config in all_seq_att_map_config: @@ -209,7 +221,8 @@ def __call__(self, seq_dnn_config = DNN() seq_dnn_config.hidden_units.extend([128, 64, 32, 1]) cur_target_attention_name = 'seq_dnn' + group_name - if negative_sampler: + if negative_sampler and self.mode != tf.estimator.ModeKeys.PREDICT: + print('cd negative target attention', self.mode) seq_fea, concat_features = self.negative_sampler_target_attention( seq_dnn_config, seq_features, @@ -218,6 +231,7 @@ def __call__(self, need_key_feature=need_key_feature, allow_key_transform=allow_key_transform) else: + print('cd target attention', self.mode) seq_fea = self.target_attention( seq_dnn_config, seq_features, @@ -225,4 +239,5 @@ def __call__(self, need_key_feature=need_key_feature, allow_key_transform=allow_key_transform) all_seq_fea.append(seq_fea) + # concat_features = tf.concat([concat_features, all_seq_fea], axis=-1) return concat_features, all_seq_fea diff --git a/easy_rec/python/main.py b/easy_rec/python/main.py index bb61aae67..8ddd80d6d 100644 --- a/easy_rec/python/main.py +++ b/easy_rec/python/main.py @@ -63,6 +63,7 @@ def _get_input_fn(data_config, data_path=None, export_config=None, check_mode=False, + fg_json_path=None, **kwargs): """Build estimator input function. @@ -89,6 +90,8 @@ def _get_input_fn(data_config, task_num=task_num, check_mode=check_mode, **kwargs) + + input_obj.set_fg_path(fg_json_path) input_fn = input_obj.create_input(export_config) return input_fn @@ -146,8 +149,12 @@ def _create_eval_export_spec(pipeline_config, eval_data, check_mode=False): else: eval_steps = None input_fn_kwargs = {} - if data_config.input_type == data_config.InputType.OdpsRTPInputV2: - input_fn_kwargs['fg_json_path'] = pipeline_config.fg_json_path + fg_json_path = None + if data_config.input_type == data_config.InputType.OdpsRTPInputV2 or \ + data_config.input_type == data_config.InputType.CSVInput or \ + data_config.input_type == data_config.InputType.OdpsInputV2: + # input_fn_kwargs['fg_json_path'] = pipeline_config.fg_json_path + fg_json_path = pipeline_config.fg_json_path # create eval input export_input_fn = _get_input_fn( data_config, @@ -155,6 +162,7 @@ def _create_eval_export_spec(pipeline_config, eval_data, check_mode=False): None, export_config, check_mode=check_mode, + fg_json_path=fg_json_path, **input_fn_kwargs) if export_config.exporter_type == 'final': exporters = [ @@ -196,8 +204,12 @@ def _metric_cmp_fn(best_eval_result, current_eval_result): # set throttle_secs to a small number, so that we can control evaluation # interval steps by checkpoint saving steps - eval_input_fn = _get_input_fn(data_config, feature_configs, eval_data, - **input_fn_kwargs) + eval_input_fn = _get_input_fn( + data_config, + feature_configs, + eval_data, + fg_json_path=fg_json_path, + **input_fn_kwargs) eval_spec = tf.estimator.EvalSpec( name='val', input_fn=eval_input_fn, @@ -313,8 +325,12 @@ def _train_and_evaluate_impl(pipeline_config, logging.info('Will train min(%d, %s) steps...' 
% (train_steps, epoch_str)) input_fn_kwargs = {} - if data_config.input_type == data_config.InputType.OdpsRTPInputV2: - input_fn_kwargs['fg_json_path'] = pipeline_config.fg_json_path + fg_json_path = None + if data_config.input_type == data_config.InputType.OdpsRTPInputV2 or \ + data_config.input_type == data_config.InputType.CSVInput or \ + data_config.input_type == data_config.InputType.OdpsInputV2: + fg_json_path = pipeline_config.fg_json_path + # input_fn_kwargs['fg_json_path'] = pipeline_config.fg_json_path # create train input train_input_fn = _get_input_fn( @@ -322,6 +338,7 @@ def _train_and_evaluate_impl(pipeline_config, feature_configs, train_data, check_mode=check_mode, + fg_json_path=fg_json_path, **input_fn_kwargs) # Currently only a single Eval Spec is allowed. train_spec = tf.estimator.TrainSpec( @@ -728,10 +745,17 @@ def export(export_dir, export_config = pipeline_config.export_config data_config = pipeline_config.data_config input_fn_kwargs = {} + fg_json_path = None if data_config.input_type == data_config.InputType.OdpsRTPInputV2: - input_fn_kwargs['fg_json_path'] = pipeline_config.fg_json_path - serving_input_fn = _get_input_fn(data_config, feature_configs, None, - export_config, **input_fn_kwargs) + # input_fn_kwargs['fg_json_path'] = pipeline_config.fg_json_path + fg_json_path = pipeline_config.fg_json_path + serving_input_fn = _get_input_fn( + data_config, + feature_configs, + None, + export_config, + fg_json_path=fg_json_path, + **input_fn_kwargs) ckpt_path = _get_ckpt_path(pipeline_config, checkpoint_path) if 'oss_path' in extra_params: if pipeline_config.train_config.HasField('incr_save_config'): @@ -789,8 +813,12 @@ def export_checkpoint(pipeline_config=None, data_config = pipeline_config.data_config input_fn_kwargs = {} - if data_config.input_type == data_config.InputType.OdpsRTPInputV2: - input_fn_kwargs['fg_json_path'] = pipeline_config.fg_json_path + fg_json_path = None + if data_config.input_type == data_config.InputType.OdpsRTPInputV2 or \ + data_config.input_type == data_config.InputType.CSVInput or \ + data_config.input_type == data_config.InputType.OdpsInputV2: + # input_fn_kwargs['fg_json_path'] = pipeline_config.fg_json_path + fg_json_path = pipeline_config.fg_json_path # create estimator params = {'log_device_placement': verbose} @@ -801,8 +829,13 @@ def export_checkpoint(pipeline_config=None, # construct serving input fn export_config = pipeline_config.export_config - serving_input_fn = _get_input_fn(data_config, feature_configs, None, - export_config, **input_fn_kwargs) + serving_input_fn = _get_input_fn( + data_config, + feature_configs, + None, + export_config, + fg_json_path=fg_json_path, + **input_fn_kwargs) ckpt_path = _get_ckpt_path(pipeline_config, checkpoint_path) estimator.export_checkpoint( export_path=export_path, diff --git a/easy_rec/python/model/dnn_fg.py b/easy_rec/python/model/dnn_fg.py new file mode 100644 index 000000000..ed7c05ec7 --- /dev/null +++ b/easy_rec/python/model/dnn_fg.py @@ -0,0 +1,102 @@ +# -*- encoding:utf-8 -*- +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from __future__ import print_function + +import tensorflow as tf + +from easy_rec.python.compat import regularizers +from easy_rec.python.core import metrics as metrics_lib +from easy_rec.python.layers import dnn +from easy_rec.python.model.rank_model import RankModel + +# from easy_rec.python.model.match_model import MatchModel + +from easy_rec.python.protos.dnnfg_pb2 import DNNFG as DNNFGConfig # NOQA + +if tf.__version__ >= '2.0': + losses = tf.compat.v1.losses + metrics = tf.compat.v1.metrics + tf = tf.compat.v1 +else: + losses = tf.losses + metrics = tf.metrics + + +class DNNFG(RankModel): + + def __init__(self, + model_config, + feature_configs, + features, + labels=None, + is_training=False): + super(DNNFG, self).__init__(model_config, feature_configs, features, labels, + is_training) + assert self._model_config.WhichOneof('model') == 'dnnfg', \ + 'invalid model config: %s' % self._model_config.WhichOneof('model') + self._model_config = self._model_config.dnnfg + assert isinstance(self._model_config, DNNFGConfig) + + self.feature, _ = self._input_layer(self._feature_dict, 'all') + + if self._labels is not None: + self._labels = list(self._labels.values()) + self._labels[0] = tf.cast(self._labels[0], tf.int64) + + self._l2_reg = regularizers.l2_regularizer( + self._model_config.l2_regularization) + + def build_predict_graph(self): + if self._mode != tf.estimator.ModeKeys.PREDICT: + assert 'hard_neg_indices' not in self._feature_dict + num_neg = self._feature_dict['__num_neg_sample__'] + all_fea = tf.reshape(self.feature, + [-1, 1 + num_neg, self.feature.shape[-1]]) + else: + all_fea = self.feature + all_fea = tf.Print(all_fea, ['all_fea', tf.shape(all_fea)]) + dnn_layer = dnn.DNN(self._model_config.dnn, self._l2_reg, 'dnn', + self._is_training) + all_fea = dnn_layer(all_fea) + output = tf.layers.dense(all_fea, 1, name='output') + output = tf.squeeze(output, axis=-1) + + self._prediction_dict['logits'] = output + self._prediction_dict['probs'] = tf.nn.sigmoid(output) + + return self._prediction_dict + + def build_loss_graph(self): + logits = self._prediction_dict['logits'] + label = tf.to_float(self._labels[0]) + self._loss_dict[ + 'sigmoid_cross_entropy_loss'] = self._model_config.pointwise_loss_weight * tf.losses.sigmoid_cross_entropy( + label, logits=logits[:, 0]) + return self._loss_dict + + def build_metric_graph(self, eval_config): + metric_dict = {} + for metric in eval_config.metrics_set: + if metric.WhichOneof('metric') == 'auc': + probs = self._prediction_dict['probs'] + metric_dict['auc'] = metrics.auc(self._labels[0], probs[:, 0]) + elif metric.WhichOneof('metric') == 'gauc': + probs = self._prediction_dict['probs'] + metric_dict['gauc'] = metrics_lib.gauc( + self._labels[0], + probs[:, 0], + uids=self._feature_dict[metric.gauc.uid_field], + reduction=metric.gauc.reduction) + elif metric.WhichOneof('metric') == 'recall_at_topk': + mask = tf.equal(self._labels[0], 1) + logits = tf.boolean_mask(self._prediction_dict['logits'], mask) + label = tf.zeros_like(logits[:, :1], dtype=tf.int64) + with tf.device('/cpu:0'): + metric_dict['recall_at_top%d' % + metric.recall_at_topk.topk] = metrics.recall_at_k( + label, logits, metric.recall_at_topk.topk) + return metric_dict + + def get_outputs(self): + outputs = super(DNNFG, self).get_outputs() + return outputs diff --git a/easy_rec/python/model/easy_rec_estimator.py b/easy_rec/python/model/easy_rec_estimator.py index 2772c9ed6..82b52a71e 100644 --- a/easy_rec/python/model/easy_rec_estimator.py +++ 
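During training, `DNNFG.build_predict_graph` above reshapes the flattened `[batch * (1 + num_neg), dim]` features into `[batch, 1 + num_neg, dim]`, so column 0 of the logits corresponds to the real (positive) item and the remaining columns to the sampled negatives; the sigmoid loss uses only `logits[:, 0]`, while `recall_at_topk` checks whether the positive outranks the negatives. A numpy sketch of the shape bookkeeping, with toy values:

```python
import numpy as np

batch, num_neg, dim = 2, 3, 4
# row order per sample: [positive, neg_1, ..., neg_num_neg]
flat = np.random.randn(batch * (1 + num_neg), dim)

all_fea = flat.reshape(batch, 1 + num_neg, dim)   # [batch, 1 + num_neg, dim]
logits = all_fea.sum(axis=-1)                     # stand-in for the DNN head

pointwise_logits = logits[:, 0]                   # positives only -> BCE loss
top1 = (-logits).argsort(axis=1)[:, 0]            # recall@k: is column 0 first?
print(pointwise_logits.shape, top1)
```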
b/easy_rec/python/model/easy_rec_estimator.py @@ -127,7 +127,7 @@ def _train_model_fn(self, features, labels, run_config): self.feature_configs, features, labels, - is_training=True) + is_training=tf.estimator.ModeKeys.TRAIN) predict_dict = model.build_predict_graph() loss_dict = model.build_loss_graph() @@ -432,7 +432,7 @@ def _eval_model_fn(self, features, labels, run_config): self.feature_configs, features, labels, - is_training=False) + is_training=tf.estimator.ModeKeys.EVAL) predict_dict = model.build_predict_graph() loss_dict = model.build_loss_graph() loss = tf.add_n(list(loss_dict.values())) @@ -459,7 +459,7 @@ def _distribute_eval_model_fn(self, features, labels, run_config): self.feature_configs, features, labels, - is_training=False) + is_training=tf.estimator.ModeKeys.EVAL) predict_dict = model.build_predict_graph() loss_dict = model.build_loss_graph() loss = tf.add_n(list(loss_dict.values())) @@ -510,7 +510,7 @@ def _export_model_fn(self, features, labels, run_config, params): self.feature_configs, features, labels=None, - is_training=False) + is_training=tf.estimator.ModeKeys.PREDICT) model.build_predict_graph() export_config = self._pipeline_config.export_config diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py index 7f1be76c9..3d60d3769 100644 --- a/easy_rec/python/model/easy_rec_model.py +++ b/easy_rec/python/model/easy_rec_model.py @@ -32,10 +32,12 @@ def __init__(self, feature_configs, features, labels=None, - is_training=False): + is_training=tf.estimator.ModeKeys.EVAL): self._base_model_config = model_config self._model_config = model_config - self._is_training = is_training + self._mode = is_training + self._is_training = True if (is_training + == tf.estimator.ModeKeys.TRAIN) else False self._feature_dict = features # embedding variable parameters @@ -93,7 +95,8 @@ def build_input_layer(self, model_config, feature_configs): kernel_regularizer=self._l2_reg, variational_dropout_config=model_config.variational_dropout if model_config.HasField('variational_dropout') else None, - is_training=self._is_training) + is_training=self._is_training, + mode=self._mode) @abstractmethod def build_predict_graph(self): diff --git a/easy_rec/python/model/multi_tower_recall.py b/easy_rec/python/model/multi_tower_recall.py index 8f576944e..5a76de36f 100644 --- a/easy_rec/python/model/multi_tower_recall.py +++ b/easy_rec/python/model/multi_tower_recall.py @@ -27,21 +27,26 @@ def __init__(self, self._model_config = self._model_config.multi_tower_recall assert isinstance(self._model_config, MultiTowerRecallConfig) - self.user_tower_feature, _ = self._input_layer(self._feature_dict, 'user') - self.item_tower_feature, _ = self._input_layer(self._feature_dict, 'item') + self.user_tower_feature, _ = self._input_layer( + self._feature_dict, 'user') # [batch, neg+1, emb] [batch * neg+1, emb] + self.item_tower_feature, _ = self._input_layer( + self._feature_dict, 'item') # [batch+neg, emb] [batch * neg+1, emb] def build_predict_graph(self): - - user_tower_feature = self.user_tower_feature - batch_size = tf.shape(user_tower_feature)[0] - pos_item_feature = self.item_tower_feature[:batch_size] - neg_item_feature = self.item_tower_feature[batch_size:] - item_tower_feature = tf.concat([ - pos_item_feature[:, tf.newaxis, :], - tf.tile( - neg_item_feature[tf.newaxis, :, :], multiples=[batch_size, 1, 1]) - ], - axis=1) # noqa: E126 + if self._mode != tf.estimator.ModeKeys.PREDICT: + user_tower_feature = self.user_tower_feature + batch_size = 
tf.shape(user_tower_feature)[0] + pos_item_feature = self.item_tower_feature[:batch_size] + neg_item_feature = self.item_tower_feature[batch_size:] + item_tower_feature = tf.concat([ + pos_item_feature[:, tf.newaxis, :], + tf.tile( + neg_item_feature[tf.newaxis, :, :], multiples=[batch_size, 1, 1]) + ], + axis=1) # noqa: E126 + else: + user_tower_feature = self.user_tower_feature + item_tower_feature = self.item_tower_feature user_dnn = dnn.DNN(self._model_config.user_tower.dnn, self._l2_reg, 'user_dnn', self._is_training) @@ -50,7 +55,9 @@ def build_predict_graph(self): item_dnn = dnn.DNN(self._model_config.item_tower.dnn, self._l2_reg, 'item_dnn', self._is_training) item_tower_emb = item_dnn(item_tower_feature) - item_tower_emb = tf.reshape(item_tower_emb, tf.shape(user_tower_emb)) + + # if self._mode == tf.estimator.ModeKeys.PREDICT: + # item_tower_emb = tf.reshape(item_tower_emb, tf.shape(user_tower_emb)) tower_fea_arr = [] tower_fea_arr.append(user_tower_emb) diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py index ed75fb069..3aba80e28 100644 --- a/easy_rec/python/model/rank_model.py +++ b/easy_rec/python/model/rank_model.py @@ -37,13 +37,13 @@ def _output_to_prediction_impl(self, prediction_dict = {} if loss_type == LossType.F1_REWEIGHTED_LOSS or loss_type == LossType.PAIR_WISE_LOSS: assert num_class == 1, 'num_class must be 1 when loss type is F1_REWEIGHTED_LOSS/PAIR_WISE_LOSS' - output = tf.squeeze(output, axis=1) + output = tf.squeeze(output, axis=-1) probs = tf.sigmoid(output) prediction_dict['logits' + suffix] = output prediction_dict['probs' + suffix] = probs elif loss_type == LossType.CLASSIFICATION: if num_class == 1: - output = tf.squeeze(output, axis=1) + output = tf.squeeze(output, axis=-1) probs = tf.sigmoid(output) tf.summary.scalar('prediction/probs', tf.reduce_mean(probs)) prediction_dict['logits' + suffix] = output @@ -58,10 +58,10 @@ def _output_to_prediction_impl(self, probs, axis=1) prediction_dict['y' + suffix] = tf.argmax(output, axis=1) elif loss_type == LossType.L2_LOSS: - output = tf.squeeze(output, axis=1) + output = tf.squeeze(output, axis=-1) prediction_dict['y' + suffix] = output elif loss_type == LossType.SIGMOID_L2_LOSS: - output = tf.squeeze(output, axis=1) + output = tf.squeeze(output, axis=-1) prediction_dict['y' + suffix] = tf.sigmoid(output) return prediction_dict diff --git a/easy_rec/python/ops/1.12/libfg_op.so b/easy_rec/python/ops/1.12/libfg_op.so new file mode 100755 index 000000000..be8531dbb Binary files /dev/null and b/easy_rec/python/ops/1.12/libfg_op.so differ diff --git a/easy_rec/python/ops/1.12_pai/libfg_op.so b/easy_rec/python/ops/1.12_pai/libfg_op.so new file mode 100755 index 000000000..be8531dbb Binary files /dev/null and b/easy_rec/python/ops/1.12_pai/libfg_op.so differ diff --git a/easy_rec/python/ops/1.15/libfg_op.so b/easy_rec/python/ops/1.15/libfg_op.so new file mode 100755 index 000000000..1dcd578be Binary files /dev/null and b/easy_rec/python/ops/1.15/libfg_op.so differ diff --git a/easy_rec/python/protos/dnnfg.proto b/easy_rec/python/protos/dnnfg.proto new file mode 100644 index 000000000..4661cad61 --- /dev/null +++ b/easy_rec/python/protos/dnnfg.proto @@ -0,0 +1,12 @@ +syntax = "proto2"; +package protos; + +import "easy_rec/python/protos/dnn.proto"; + +message DNNFG { + required DNN dnn = 1; + required float l2_regularization = 2 [default = 1e-4]; + optional float hard_neg_softmax_weight = 5 [default=0.1]; + optional float pairwise_loss_weight = 6 [default=1]; + optional float 
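The `rank_model.py` change above switches `tf.squeeze` from `axis=1` to `axis=-1`, so the same prediction head works both for the usual `[batch, 1]` output and for the `[batch, 1 + num_neg, 1]` output produced under negative sampling. A numpy sketch:

```python
import numpy as np

pointwise = np.zeros((8, 1))         # usual rank-model head: [batch, 1]
with_negs = np.zeros((8, 4, 1))      # DNNFG head: [batch, 1 + num_neg, 1]

# axis=1 only fits the rank-2 case; axis=-1 handles both layouts.
print(np.squeeze(pointwise, axis=-1).shape)   # (8,)
print(np.squeeze(with_negs, axis=-1).shape)   # (8, 4)
```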
pointwise_loss_weight = 7 [default=1]; +} diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto index 27dcefadc..e1cafaa76 100644 --- a/easy_rec/python/protos/easy_rec_model.proto +++ b/easy_rec/python/protos/easy_rec_model.proto @@ -16,6 +16,7 @@ import "easy_rec/python/protos/dbmtl.proto"; import "easy_rec/python/protos/ple.proto"; import "easy_rec/python/protos/simple_multi_task.proto"; import "easy_rec/python/protos/dcn.proto"; +import "easy_rec/python/protos/dnnfg.proto"; import "easy_rec/python/protos/cmbf.proto"; import "easy_rec/python/protos/uniter.proto"; import "easy_rec/python/protos/autoint.proto"; @@ -65,6 +66,7 @@ message EasyRecModel { DLRM dlrm = 108; CMBF cmbf = 109; Uniter uniter = 110; + DNNFG dnnfg = 111; MultiTowerRecall multi_tower_recall = 200; DSSM dssm = 201; diff --git a/easy_rec/python/test/embed_test.py b/easy_rec/python/test/embed_test.py index e499b29d0..7554da2d6 100644 --- a/easy_rec/python/test/embed_test.py +++ b/easy_rec/python/test/embed_test.py @@ -131,8 +131,8 @@ def test_seq_multi_embed(self): features = {'field1': tf.constant(['0112', '132430'])} dummy_input = DummyInput( data_config, feature_configs, '', input_vals=features) + dummy_input._set_effective_fields() field_dict, _ = dummy_input._build(tf.estimator.ModeKeys.TRAIN, {}) - wide_and_deep_dict = {'field1': WideOrDeep.DEEP} fc_parser = FeatureColumnParser(feature_configs, wide_and_deep_dict) builder = feature_column._LazyBuilder(field_dict) diff --git a/easy_rec/python/test/train_eval_test.py b/easy_rec/python/test/train_eval_test.py index acfe81b5a..cb564aace 100644 --- a/easy_rec/python/test/train_eval_test.py +++ b/easy_rec/python/test/train_eval_test.py @@ -904,6 +904,8 @@ def test_distribute_eval_dssm_pointwise_classification(self): cur_eval_path, self._test_dir) self.assertTrue(self._success) + @unittest.skipIf('-PAI' not in tf.__version__, + 'Only test when pai-tf is used.') def test_distribute_eval_dssm_reg(self): cur_eval_path = 'data/test/distribute_eval_test/dssm_distribute_eval_reg_taobao_ckpt' self._success = test_utils.test_distributed_eval( @@ -930,20 +932,6 @@ def test_share_no_used(self): 'samples/model_config/share_embedding_not_used.config', self._test_dir) self.assertTrue(self._success) - @unittest.skipIf(gl is None, 'graphlearn is not installed') - def test_dssm_neg_sampler_sequence_feature(self): - self._success = test_utils.test_single_train_eval( - 'samples/model_config/dssm_neg_sampler_sequence_feature.config', - self._test_dir) - self.assertTrue(self._success) - - @unittest.skipIf(gl is None, 'graphlearn is not installed') - def test_dssm_neg_sampler_need_key_feature(self): - self._success = test_utils.test_single_train_eval( - 'samples/model_config/dssm_neg_sampler_need_key_feature.config', - self._test_dir) - self.assertTrue(self._success) - def test_dbmtl_on_multi_numeric_boundary_need_key_feature(self): self._success = test_utils.test_single_train_eval( 'samples/model_config/dbmtl_on_multi_numeric_boundary_need_key_feature_taobao.config', @@ -969,16 +957,24 @@ def test_deepfm_on_sequence_feature_aux_hist_seq(self): self.assertTrue(self._success) @unittest.skipIf(gl is None, 'graphlearn is not installed') - def test_multi_tower_recall_neg_sampler_sequence_feature(self): + def test_dnn_fg_recall_neg_sampler(self): self._success = test_utils.test_single_train_eval( - 'samples/model_config/multi_tower_recall_neg_sampler_sequence_feature.config', + 'samples/model_config/fg_fusion_train_neg_on_dnn.config', self._test_dir) 
self.assertTrue(self._success) @unittest.skipIf(gl is None, 'graphlearn is not installed') - def test_multi_tower_recall_neg_sampler_only_sequence_feature(self): + def test_dnn_fg_recall_neg_sampler_with_sequence_feature(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/fg_fusion_train_neg_seq_on_dnn.config', + self._test_dir, + total_steps=10) + self.assertTrue(self._success) + + @unittest.skipIf(gl is None, 'graphlearn is not installed') + def test_dcn_fg_with_sequence_feature(self): self._success = test_utils.test_single_train_eval( - 'samples/model_config/multi_tower_recall_neg_sampler_only_sequence_feature.config', + 'samples/model_config/fg_fusion_train_seq_on_dcn.config', self._test_dir) self.assertTrue(self._success) diff --git a/easy_rec/python/utils/convert_rtp_fg.py b/easy_rec/python/utils/convert_rtp_fg.py index d665fcd74..badd1295e 100644 --- a/easy_rec/python/utils/convert_rtp_fg.py +++ b/easy_rec/python/utils/convert_rtp_fg.py @@ -246,7 +246,7 @@ def load_input_field_and_feature_config(rtp_fg, for sub_feature in feature['features']: sub_feature_type = sub_feature['feature_type'] sub_feature_name = sub_feature['feature_name'] - all_sub_feature_name = sequence_name + '_' + sub_feature_name + all_sub_feature_name = sequence_name + '__' + sub_feature_name pipeline_config = process_features( sub_feature_type, all_sub_feature_name, diff --git a/easy_rec/python/utils/fg_util.py b/easy_rec/python/utils/fg_util.py index c394444bf..4be0402e7 100644 --- a/easy_rec/python/utils/fg_util.py +++ b/easy_rec/python/utils/fg_util.py @@ -51,3 +51,185 @@ def load_fg_json_to_config(pipeline_config): pipeline_config.fg_json_path = '!' + pipeline_config.fg_json_path return pipeline_config + + +def _fg(_fg_config, + _effective_fg_features, + _fg_module, + field_dict, + parsed_dict={}): + multi_val_sep = _fg_config.get('multi_val_sep', '\035') + input_dict = {} + output_dict = {} + + def _tf_type(in_type): + in_type = in_type.lower() + type_map = { + 'integer': tf.int32, + 'int32': tf.int32, + 'int64': tf.int32, + 'bigint': tf.int64, + 'string': tf.string, + 'float': tf.float32, + 'double': tf.double + } + assert in_type in type_map, 'invalid type: %s' % in_type + return type_map[in_type] + + def _get_input(input_name): + if input_name in input_dict: + return input_dict[input_name] + + sample_type = parsed_dict.get('__sampler_type__', None) + + side, key = input_name.split(':') + x = field_dict[key] + if sample_type is not None: + num_neg = parsed_dict.get('__num_neg_sample__', None) + batch_size = parsed_dict.get('__batch_size__', None) + + if sample_type.startswith('hard_negative_sampler'): + raise NotImplementedError + else: + if side == 'user': + x = tf.reshape( + tf.tile(x[:, tf.newaxis], multiples=[1, 1 + num_neg]), [-1]) + elif side == 'item': + x = tf.reshape( + tf.concat([ + x[:batch_size, tf.newaxis], + tf.tile( + x[tf.newaxis, batch_size:], multiples=[batch_size, 1]) + ], + axis=-1), [-1]) # noqa + else: + raise ValueError('Unknown side: %s' % side) + input_dict[input_name] = x if x.dtype == tf.string else tf.as_string(x) + return input_dict[input_name] + + for feature_config in _fg_config['features']: + if 'sequence_name' in feature_config: + sequence_name = feature_config['sequence_name'] # tag_category_list + sequence_delim = feature_config.get('sequence_delim', ';') # ";" + for sub_feature_config in feature_config['features']: + sub_feature_type = sub_feature_config['feature_type'] # id_feature + sub_feature_name = sub_feature_config['feature_name'] # 
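`_get_input` above expands the raw columns before feature generation when a sampler is active: user-side values are repeated `1 + num_neg` times and item-side values are laid out as `[positive_i, neg_1, ..., neg_k]` per sample, so both sides flatten to `batch * (1 + num_neg)` aligned rows. A numpy sketch of that layout with toy ids:

```python
import numpy as np

batch_size, num_neg = 2, 3
user = np.array(['u0', 'u1'])                       # [batch]
item = np.array(['p0', 'p1', 'n0', 'n1', 'n2'])     # [batch + num_neg]

user_x = np.tile(user[:, None], [1, 1 + num_neg]).reshape(-1)
item_x = np.concatenate(
    [item[:batch_size, None],
     np.tile(item[None, batch_size:], [batch_size, 1])], axis=-1).reshape(-1)

print(user_x)  # ['u0' 'u0' 'u0' 'u0' 'u1' 'u1' 'u1' 'u1']
print(item_x)  # ['p0' 'n0' 'n1' 'n2' 'p1' 'n0' 'n1' 'n2']
```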
cate_id + feature_name = sequence_name + '__' + sub_feature_name # tag_category_list__cate_id + if feature_name not in _effective_fg_features: + continue + if sub_feature_type == 'id_feature': + # input = sequence_name + '__' + field_dict[sub_feature_config['expression'].split(':')[-1]] + input = field_dict[feature_name] + sparse_input = tf.string_split( + input, delimiter=feature_config['sequence_delim']) + seq_indices = tf.segment_max( + tf.add(sparse_input.indices[:, 1], 1), + sparse_input.indices[:, 0], + name=None) + batch_size = tf.shape(input)[0] + pad_size = batch_size - tf.shape(seq_indices)[0] + seq_indices_pad = tf.pad(seq_indices, [[0, pad_size]]) + sparse_input_values = sparse_input.values + x = _fg_module.batch_sequence_id_feature_op( + sparse_input_values, + seq_indices_pad, + feature_name=feature_name, + msep=multi_val_sep, + default_value=feature_config.get('default_value', ''), + need_prefix=feature_config.get('need_prefix', False), + sequence_delim=sequence_delim, + dtype=tf.string) + output_dict[feature_name] = x + if parsed_dict.get('__sampler_type__', None) is not None: + num_neg = parsed_dict.get('__num_neg_sample__', None) + output_dict[feature_name] = tf.reshape( + tf.tile(x[:, tf.newaxis], multiples=[1, 1 + num_neg]), [-1]) + elif sub_feature_type == 'raw_feature': + # input = sequence_name + '__' + field_dict[sub_feature_config['expression'].split(':')[-1]] + input = field_dict[feature_name] + sparse_input = tf.string_split( + input, delimiter=feature_config['sequence_delim']) + seq_indices = tf.segment_max( + tf.add(sparse_input.indices[:, 1], 1), + sparse_input.indices[:, 0], + name=None) + batch_size = tf.shape(input)[0] + pad_size = batch_size - tf.shape(seq_indices)[0] + seq_indices_pad = tf.pad(seq_indices, [[0, pad_size]]) + sparse_input_values = sparse_input.values + output_dict[feature_name] = _fg_module.batch_sequence_raw_feature_op( + sparse_input_values, + seq_indices_pad, + feature_name=feature_name, + msep=multi_val_sep, + default_value=feature_config.get('default_value', '0.0'), + sequence_delim=sequence_delim, + normalizer=feature_config.get('normalizer', ''), + value_dimension=feature_config.get('value_dimension', 1), + dtype=tf.string) + else: + raise ValueError('Unknown seq sub feature type: %s' % + sub_feature_type) + else: + feature_type = feature_config['feature_type'] + feature_name = feature_config['feature_name'] + if feature_name not in _effective_fg_features: + continue + if feature_type == 'id_feature': + output_dict[feature_name] = _fg_module.id_feature_op( + _get_input(feature_config['expression']), + feature_name=feature_name, + msep=multi_val_sep, + default_value=feature_config.get('default_value', '0.0'), + need_prefix=feature_config.get('need_prefix', True), + dtype=tf.string) + elif feature_type == 'raw_feature': + output_dict[feature_name] = _fg_module.raw_feature_op( + _get_input(feature_config['expression']), + feature_name=feature_name, + msep=multi_val_sep, + default_value=feature_config.get('default_value', '0.0'), + normalizer=feature_config.get('normalizer', ''), + value_dimension=feature_config.get('value_dimension', 1), + dtype=_tf_type(feature_config.get('value_type', 'float'))) + elif feature_type == 'combo_feature': + inputs = [_get_input(k) for k in feature_config['expression']] + output_dict[feature_name] = _fg_module.combo_feature_op( + inputs, + feature_name=feature_name, + msep=multi_val_sep, + default_value=feature_config.get('default_value', ''), + need_prefix=feature_config.get('need_prefix', True), + 
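The sequence branches above recover per-row history lengths from a delimited string column before calling the `libfg_op` kernels: `tf.string_split` yields token coordinates, `tf.segment_max` over `position + 1` gives each row's length, and a pad restores rows that dropped off the end. A small TF 1.x sketch of just that length computation (the fg ops themselves are omitted):

```python
import tensorflow as tf  # TF 1.x graph mode assumed

seqs = tf.constant(['a;b;c', 'x'])            # one delimited history per row
sp = tf.string_split(seqs, delimiter=';')     # SparseTensor of tokens

# max (position + 1) per row == number of tokens in that row
seq_len = tf.segment_max(sp.indices[:, 1] + 1, sp.indices[:, 0])
# pad back to batch size in case trailing rows produced no tokens
seq_len = tf.pad(seq_len, [[0, tf.shape(seqs)[0] - tf.shape(seq_len)[0]]])

with tf.Session() as sess:
  print(sess.run(seq_len))   # [3 1]
```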
dtype='string') + elif feature_type == 'lookup_feature': + output_dict[feature_name] = _fg_module.lookup_feature_op( + _get_input(feature_config['map']), + _get_input(feature_config['key']), + feature_name=feature_name, + msep=multi_val_sep, + default_value=feature_config.get('default_value', '0.0'), + dtype=_tf_type(feature_config.get('value_type', 'float')), + need_discrete=feature_config.get('needDiscrete', False), + need_key=feature_config.get('needKey', False), + need_weighting=feature_config.get('needWeighting', False), + value_dimension=feature_config.get('value_dimension', 1), + combiner=feature_config.get('combiner', 'sum'), + boundaries=feature_config.get('bucketize_boundaries', []), + normalizer=feature_config.get('normalizer', '')) + elif feature_type == 'match_feature': + output_dict[feature_name] = _fg_module.match_feature_op( + _get_input(feature_config['user']), + _get_input(feature_config['category']), + _get_input(feature_config['item']), + feature_name=feature_name, + msep=multi_val_sep, + default_value=feature_config.get('default_value', '0.0'), + dtype=_tf_type(feature_config.get('value_type', 'float')), + need_discrete=feature_config.get('needDiscrete', False), + normalizer=feature_config.get('normalizer', ''), + match_type=feature_config.get('matchType', 'hit')) + else: + raise ValueError('Unknown feature type: %s' % feature_type) + + output_dict = dict(field_dict, **output_dict) + return output_dict diff --git a/samples/model_config/dssm_neg_sampler_need_key_feature.config b/samples/model_config/dssm_neg_sampler_need_key_feature.config deleted file mode 100644 index 12a8621cc..000000000 --- a/samples/model_config/dssm_neg_sampler_need_key_feature.config +++ /dev/null @@ -1,302 +0,0 @@ -train_input_path: "data/test/tb_data/taobao_train_data" -eval_input_path: "data/test/tb_data/taobao_test_data" -model_dir: "experiments/dssm_neg_sampler_sequence_feature" - -train_config { - optimizer_config: { - adam_optimizer: { - learning_rate: { - exponential_decay_learning_rate { - initial_learning_rate: 0.001 - decay_steps: 1000 - decay_factor: 0.5 - min_learning_rate: 1e-07 - } - } - } - use_moving_average: false - } - num_steps: 1000 - sync_replicas: false - save_checkpoints_steps: 100 - log_step_count_steps: 10 -} - -eval_config { - metrics_set: { - auc { - } - } - metrics_set: { - gauc { - uid_field: "user_id" - } - } -} - -data_config { - batch_size: 1024 - input_fields { - input_name:'clk' - input_type: INT32 - } - input_fields { - input_name:'buy' - input_type: INT32 - } - input_fields { - input_name: 'pid' - input_type: STRING - } - input_fields { - input_name: 'adgroup_id' - input_type: STRING - } - input_fields { - input_name: 'cate_id' - input_type: STRING - } - input_fields { - input_name: 'campaign_id' - input_type: STRING - } - input_fields { - input_name: 'customer' - input_type: STRING - } - input_fields { - input_name: 'brand' - input_type: STRING - } - input_fields { - input_name: 'user_id' - input_type: STRING - } - input_fields { - input_name: 'cms_segid' - input_type: STRING - } - input_fields { - input_name: 'cms_group_id' - input_type: STRING - } - input_fields { - input_name: 'final_gender_code' - input_type: STRING - } - input_fields { - input_name: 'age_level' - input_type: STRING - } - input_fields { - input_name: 'pvalue_level' - input_type: STRING - } - input_fields { - input_name: 'shopping_level' - input_type: STRING - } - input_fields { - input_name: 'occupation' - input_type: STRING - } - input_fields { - input_name: 'new_user_class_level' - 
input_type: STRING - } - input_fields { - input_name: 'tag_category_list' - input_type: STRING - } - input_fields { - input_name: 'tag_brand_list' - input_type: STRING - } - input_fields { - input_name: 'price' - input_type: INT32 - } - - label_fields: 'clk' - num_epochs: 5 - prefetch_size: 4 - input_type: CSVInput - - negative_sampler { - input_path: 'data/test/tb_data/taobao_ad_feature_gl' - num_sample: 256 - num_eval_sample: 4096 - attr_fields: 'adgroup_id' - attr_fields: 'cate_id' - attr_fields: 'campaign_id' - attr_fields: 'customer' - attr_fields: 'brand' - item_id_field: 'adgroup_id' - } -} - -feature_configs : { - input_names: 'pid' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'adgroup_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'cate_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10000 -} -feature_configs : { - input_names: 'campaign_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'customer' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'brand' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'user_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'cms_segid' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100 -} -feature_configs : { - input_names: 'cms_group_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100 -} -feature_configs : { - input_names: 'final_gender_code' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'age_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'pvalue_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'shopping_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'occupation' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'new_user_class_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs { - input_names: "tag_category_list" - feature_type: SequenceFeature - embedding_dim: 16 - hash_bucket_size: 100000 - sub_feature_type: IdFeature - separator: "|" -} -feature_configs { - input_names: "tag_brand_list" - feature_type: SequenceFeature - embedding_dim: 16 - hash_bucket_size: 100000 - sub_feature_type: IdFeature - separator: "|" -} -feature_configs : { - input_names: 'price' - feature_type: IdFeature - embedding_dim: 16 - num_buckets: 50 -} -model_config:{ - model_class: "DSSM" - feature_groups: { - group_name: 'user' - feature_names: 'user_id' - feature_names: 'cms_segid' - feature_names: 'cms_group_id' - feature_names: 'age_level' - feature_names: 'pvalue_level' - feature_names: 'shopping_level' - feature_names: 'occupation' - feature_names: 'new_user_class_level' - wide_deep:DEEP - sequence_features: { - group_name: "seq_fea" - allow_key_search: true - need_key_feature:false - seq_att_map: { - key: "brand" - key: "cate_id" - hist_seq: "tag_brand_list" - hist_seq: "tag_category_list" - } - } - } - feature_groups: { - group_name: "item" - feature_names: 
'adgroup_id' - feature_names: 'cate_id' - feature_names: 'campaign_id' - feature_names: 'customer' - feature_names: 'brand' - wide_deep:DEEP - } - dssm { - user_tower { - id: "user_id" - dnn { - hidden_units: [256, 128, 64, 32] - # dropout_ratio : [0.1, 0.1, 0.1, 0.1] - } - } - item_tower { - id: "adgroup_id" - dnn { - hidden_units: [256, 128, 64, 32] - } - } - l2_regularization: 1e-6 - } - loss_type: SOFTMAX_CROSS_ENTROPY - embedding_regularization: 5e-6 -} diff --git a/samples/model_config/dssm_neg_sampler_sequence_feature.config b/samples/model_config/dssm_neg_sampler_sequence_feature.config deleted file mode 100644 index df6896408..000000000 --- a/samples/model_config/dssm_neg_sampler_sequence_feature.config +++ /dev/null @@ -1,302 +0,0 @@ -train_input_path: "data/test/tb_data/taobao_train_data" -eval_input_path: "data/test/tb_data/taobao_test_data" -model_dir: "experiments/dssm_neg_sampler_sequence_feature" - -train_config { - optimizer_config: { - adam_optimizer: { - learning_rate: { - exponential_decay_learning_rate { - initial_learning_rate: 0.001 - decay_steps: 1000 - decay_factor: 0.5 - min_learning_rate: 1e-07 - } - } - } - use_moving_average: false - } - num_steps: 1000 - sync_replicas: false - save_checkpoints_steps: 100 - log_step_count_steps: 10 -} - -eval_config { - metrics_set: { - auc { - } - } - metrics_set: { - gauc { - uid_field: "user_id" - } - } -} - -data_config { - batch_size: 1024 - input_fields { - input_name:'clk' - input_type: INT32 - } - input_fields { - input_name:'buy' - input_type: INT32 - } - input_fields { - input_name: 'pid' - input_type: STRING - } - input_fields { - input_name: 'adgroup_id' - input_type: STRING - } - input_fields { - input_name: 'cate_id' - input_type: STRING - } - input_fields { - input_name: 'campaign_id' - input_type: STRING - } - input_fields { - input_name: 'customer' - input_type: STRING - } - input_fields { - input_name: 'brand' - input_type: STRING - } - input_fields { - input_name: 'user_id' - input_type: STRING - } - input_fields { - input_name: 'cms_segid' - input_type: STRING - } - input_fields { - input_name: 'cms_group_id' - input_type: STRING - } - input_fields { - input_name: 'final_gender_code' - input_type: STRING - } - input_fields { - input_name: 'age_level' - input_type: STRING - } - input_fields { - input_name: 'pvalue_level' - input_type: STRING - } - input_fields { - input_name: 'shopping_level' - input_type: STRING - } - input_fields { - input_name: 'occupation' - input_type: STRING - } - input_fields { - input_name: 'new_user_class_level' - input_type: STRING - } - input_fields { - input_name: 'tag_category_list' - input_type: STRING - } - input_fields { - input_name: 'tag_brand_list' - input_type: STRING - } - input_fields { - input_name: 'price' - input_type: INT32 - } - - label_fields: 'clk' - num_epochs: 5 - prefetch_size: 4 - input_type: CSVInput - - negative_sampler { - input_path: 'data/test/tb_data/taobao_ad_feature_gl' - num_sample: 256 - num_eval_sample: 4096 - attr_fields: 'adgroup_id' - attr_fields: 'cate_id' - attr_fields: 'campaign_id' - attr_fields: 'customer' - attr_fields: 'brand' - item_id_field: 'adgroup_id' - } -} - -feature_configs : { - input_names: 'pid' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'adgroup_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'cate_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10000 -} -feature_configs : { 
- input_names: 'campaign_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'customer' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'brand' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'user_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'cms_segid' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100 -} -feature_configs : { - input_names: 'cms_group_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100 -} -feature_configs : { - input_names: 'final_gender_code' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'age_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'pvalue_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'shopping_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'occupation' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'new_user_class_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs { - input_names: "tag_category_list" - feature_type: SequenceFeature - embedding_dim: 16 - hash_bucket_size: 100000 - sub_feature_type: IdFeature - separator: "|" -} -feature_configs { - input_names: "tag_brand_list" - feature_type: SequenceFeature - embedding_dim: 16 - hash_bucket_size: 100000 - sub_feature_type: IdFeature - separator: "|" -} -feature_configs : { - input_names: 'price' - feature_type: IdFeature - embedding_dim: 16 - num_buckets: 50 -} -model_config:{ - model_class: "DSSM" - feature_groups: { - group_name: 'user' - feature_names: 'user_id' - feature_names: 'cms_segid' - feature_names: 'cms_group_id' - feature_names: 'age_level' - feature_names: 'pvalue_level' - feature_names: 'shopping_level' - feature_names: 'occupation' - feature_names: 'new_user_class_level' - wide_deep:DEEP - sequence_features: { - group_name: "seq_fea" - allow_key_search: true - need_key_feature:true - seq_att_map: { - key: "brand" - key: "cate_id" - hist_seq: "tag_brand_list" - hist_seq: "tag_category_list" - } - } - } - feature_groups: { - group_name: "item" - feature_names: 'adgroup_id' - feature_names: 'cate_id' - feature_names: 'campaign_id' - feature_names: 'customer' - feature_names: 'brand' - wide_deep:DEEP - } - dssm { - user_tower { - id: "user_id" - dnn { - hidden_units: [256, 128, 64, 32] - # dropout_ratio : [0.1, 0.1, 0.1, 0.1] - } - } - item_tower { - id: "adgroup_id" - dnn { - hidden_units: [256, 128, 64, 32] - } - } - l2_regularization: 1e-6 - } - loss_type: SOFTMAX_CROSS_ENTROPY - embedding_regularization: 5e-6 -} diff --git a/samples/model_config/fg_fusion_train.config b/samples/model_config/fg_fusion_train.config new file mode 100644 index 000000000..b724c9b09 --- /dev/null +++ b/samples/model_config/fg_fusion_train.config @@ -0,0 +1,303 @@ +train_input_path: "data/test/tb_data/taobao_train_data" +eval_input_path: "data/test/tb_data/taobao_test_data" +model_dir: "experiments/rtp_fg_demo_v1" + +train_config { + optimizer_config { + use_moving_average: false + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { + 
initial_learning_rate: 0.0001 + decay_steps: 100000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + } + num_steps: 800 + sync_replicas: true + log_step_count_steps: 200 +} + +eval_config { + metrics_set { + auc { + } + } +} + +fg_json_path: "!samples/model_config/fg_fusion_train.json" + +data_config { + input_fields { + input_name:'clk' + input_type: INT32 + } + input_fields { + input_name:'buy' + input_type: INT32 + } + input_fields { + input_name: 'pid' + input_type: STRING + } + input_fields { + input_name: 'adgroup_id' + input_type: STRING + } + input_fields { + input_name: 'cate_id' + input_type: STRING + } + input_fields { + input_name: 'campaign_id' + input_type: STRING + } + input_fields { + input_name: 'customer' + input_type: STRING + } + input_fields { + input_name: 'brand' + input_type: STRING + } + input_fields { + input_name: 'user_id' + input_type: STRING + } + input_fields { + input_name: 'cms_segid' + input_type: STRING + } + input_fields { + input_name: 'cms_group_id' + input_type: STRING + } + input_fields { + input_name: 'final_gender_code' + input_type: STRING + } + input_fields { + input_name: 'age_level' + input_type: STRING + } + input_fields { + input_name: 'pvalue_level' + input_type: STRING + } + input_fields { + input_name: 'shopping_level' + input_type: STRING + } + input_fields { + input_name: 'occupation' + input_type: STRING + } + input_fields { + input_name: 'new_user_class_level' + input_type: STRING + } + input_fields { + input_name: 'tag_category_list' + input_type: STRING + } + input_fields { + input_name: 'tag_brand_list' + input_type: STRING + } + input_fields { + input_name: 'price' + input_type: STRING + } + label_fields: 'buy' + label_fields: 'clk' + batch_size: 4096 + num_epochs: 10000 + prefetch_size: 32 + input_type: CSVInput +} + +feature_config: { + features: { + input_names: 'pid' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'adgroup_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'cate_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 + } + features: { + input_names: 'campaign_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'customer' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'brand' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'user_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'cms_segid' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: 'cms_group_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: 'final_gender_code' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'age_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'pvalue_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'shopping_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'occupation' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'new_user_class_level' + feature_type: IdFeature + 
embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'tag_category_list' + feature_type: TagFeature + separator: '' + hash_bucket_size: 100000 + embedding_dim: 16 + } + features: { + input_names: 'tag_brand_list' + feature_type: TagFeature + separator: '' + hash_bucket_size: 100000 + embedding_dim: 16 + } + #features: { + # input_names: 'price' + # feature_type: RawFeature + # embedding_dim: 16 + # boundaries: [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50] + #} +} + +model_config { + model_class: "MultiTower" + feature_groups { + group_name: "item" + feature_names: 'adgroup_id' + feature_names: 'cate_id' + feature_names: 'campaign_id' + feature_names: 'customer' + feature_names: 'brand' + #feature_names: 'price' + wide_deep: DEEP + } + feature_groups { + group_name: "user" + feature_names: 'user_id' + feature_names: 'cms_segid' + feature_names: 'cms_group_id' + feature_names: 'age_level' + feature_names: 'pvalue_level' + feature_names: 'shopping_level' + feature_names: 'occupation' + feature_names: 'new_user_class_level' + wide_deep: DEEP + } + feature_groups { + group_name: "combo" + feature_names: 'pid' + feature_names: 'tag_category_list' + feature_names: 'tag_brand_list' + wide_deep: DEEP + } + embedding_regularization: 1e-05 + multi_tower { + towers { + input: "item" + dnn { + hidden_units: 256 + hidden_units: 192 + hidden_units: 128 + } + } + towers { + input: "user" + dnn { + hidden_units: 256 + hidden_units: 192 + hidden_units: 128 + } + } + towers { + input: "combo" + dnn { + hidden_units: 256 + hidden_units: 192 + hidden_units: 128 + } + } + final_dnn { + hidden_units: 192 + hidden_units: 128 + hidden_units: 64 + } + l2_regularization: 0.0001 + } +} + +export_config { + multi_placeholder: false +} diff --git a/samples/model_config/fg_fusion_train.json b/samples/model_config/fg_fusion_train.json new file mode 100644 index 000000000..17bf69de7 --- /dev/null +++ b/samples/model_config/fg_fusion_train.json @@ -0,0 +1,203 @@ +{ + "features": [ + { + "expression": "item:pid", + "feature_name": "pid", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "combo" + }, + { + "expression": "item:adgroup_id", + "feature_name": "adgroup_id", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "item" + }, + { + "expression": "item:cate_id", + "feature_name": "cate_id", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "item" + }, + { + "expression": "item:campaign_id", + "feature_name": "campaign_id", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "item" + }, + { + "expression": "item:customer", + "feature_name": "customer", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "item" + }, + { + "expression": "item:brand", + "feature_name": "brand", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "item" + }, + { + "expression": "user:user_id", + "feature_name": "user_id", + 
"feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:cms_segid", + "feature_name": "cms_segid", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:cms_group_id", + "feature_name": "cms_group_id", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:final_gender_code", + "feature_name": "final_gender_code", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 10, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:age_level", + "feature_name": "age_level", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 10, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:pvalue_level", + "feature_name": "pvalue_level", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 10, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:shopping_level", + "feature_name": "shopping_level", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 10, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:occupation", + "feature_name": "occupation", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 10, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:new_user_class_level", + "feature_name": "new_user_class_level", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 10, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:tag_category_list", + "feature_name": "tag_category_list", + "feature_type": "id_feature", + "hash_bucket_size": 100000, + "need_prefix": false, + "embedding_dim": 16, + "group": "combo" + }, + { + "expression": "user:tag_brand_list", + "feature_name": "tag_brand_list", + "feature_type": "id_feature", + "hash_bucket_size": 100000, + "need_prefix": false, + "embedding_dim": 16, + "group": "combo" + }, + { + "expression": "item:price", + "feature_name": "price", + "feature_type": "raw_feature", + "value_type": "Integer", + "combiner": "mean", + "need_prefix": false, + "embedding_dim": 16, + "group": "item" + } + ], + "reserves": [ + "user_id", + "campaign_id", + "clk" + ], + "multi_val_sep": "|" +} diff --git a/samples/model_config/fg_fusion_train_neg_on_dnn.config b/samples/model_config/fg_fusion_train_neg_on_dnn.config new file mode 100644 index 000000000..4eddc99f9 --- /dev/null +++ b/samples/model_config/fg_fusion_train_neg_on_dnn.config @@ -0,0 +1,282 @@ +train_input_path: "data/test/tb_data/taobao_train_data" +eval_input_path: "data/test/tb_data/taobao_test_data" +model_dir: "experiments/dnn_rtp_fg_demo_v1_neg" + +train_config { + optimizer_config { + use_moving_average: false + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.0001 + decay_steps: 100000 
+ decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + } + num_steps: 6 + sync_replicas: false + log_step_count_steps: 2 +} + +eval_config { + metrics_set { + auc { + } + } +} + +fg_json_path: "!samples/model_config/fg_fusion_train.json" + +data_config { + input_fields { + input_name:'clk' + input_type: INT32 + } + input_fields { + input_name:'buy' + input_type: INT32 + } + input_fields { + input_name: 'pid' + input_type: STRING + } + input_fields { + input_name: 'adgroup_id' + input_type: STRING + } + input_fields { + input_name: 'cate_id' + input_type: STRING + } + input_fields { + input_name: 'campaign_id' + input_type: STRING + } + input_fields { + input_name: 'customer' + input_type: STRING + } + input_fields { + input_name: 'brand' + input_type: STRING + } + input_fields { + input_name: 'user_id' + input_type: STRING + } + input_fields { + input_name: 'cms_segid' + input_type: STRING + } + input_fields { + input_name: 'cms_group_id' + input_type: STRING + } + input_fields { + input_name: 'final_gender_code' + input_type: STRING + } + input_fields { + input_name: 'age_level' + input_type: STRING + } + input_fields { + input_name: 'pvalue_level' + input_type: STRING + } + input_fields { + input_name: 'shopping_level' + input_type: STRING + } + input_fields { + input_name: 'occupation' + input_type: STRING + } + input_fields { + input_name: 'new_user_class_level' + input_type: STRING + } + input_fields { + input_name: 'tag_category_list' + input_type: STRING + } + input_fields { + input_name: 'tag_brand_list' + input_type: STRING + } + input_fields { + input_name: 'price' + input_type: STRING + } + label_fields: 'buy' + label_fields: 'clk' + batch_size: 256 + num_epochs: 10000 + prefetch_size: 4 + input_type: CSVInput + + negative_sampler { + input_path: 'data/test/tb_data/taobao_ad_feature_gl' + num_sample: 512 + num_eval_sample: 512 + attr_fields: 'adgroup_id' + attr_fields: 'cate_id' + attr_fields: 'campaign_id' + attr_fields: 'customer' + attr_fields: 'brand' + item_id_field: 'adgroup_id' + } +} + +feature_config: { + #features: { + # input_names: 'pid' + # feature_type: IdFeature + # embedding_dim: 16 + # hash_bucket_size: 10 + #} + features: { + input_names: 'adgroup_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'cate_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 + } + features: { + input_names: 'campaign_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'customer' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'brand' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'user_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'cms_segid' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: 'cms_group_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: 'final_gender_code' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'age_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'pvalue_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'shopping_level' + feature_type: IdFeature + 
embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'occupation' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'new_user_class_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'tag_category_list' + feature_type: TagFeature + separator: '' + hash_bucket_size: 100000 + embedding_dim: 16 + } + features: { + input_names: 'tag_brand_list' + feature_type: TagFeature + separator: '' + hash_bucket_size: 100000 + embedding_dim: 16 + } + #features: { + # input_names: 'price' + # feature_type: RawFeature + # embedding_dim: 16 + # boundaries: [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50] + #} +} + +model_config { + model_class: "DNNFG" + feature_groups { + group_name: "all" + feature_names: 'adgroup_id' + feature_names: 'cate_id' + feature_names: 'campaign_id' + feature_names: 'customer' + feature_names: 'brand' + feature_names: 'user_id' + feature_names: 'cms_segid' + feature_names: 'cms_group_id' + feature_names: 'final_gender_code' + feature_names: 'age_level' + feature_names: 'pvalue_level' + feature_names: 'shopping_level' + feature_names: 'occupation' + feature_names: 'new_user_class_level' + feature_names: 'tag_category_list' + feature_names: 'tag_brand_list' + wide_deep: DEEP + } + dnnfg { + dnn { + hidden_units: 256 + hidden_units: 128 + hidden_units: 64 + } + l2_regularization: 1e-6 + } + embedding_regularization: 5e-6 +} + +export_config { + multi_placeholder: false +} diff --git a/samples/model_config/fg_fusion_train_neg_seq_on_dnn.config b/samples/model_config/fg_fusion_train_neg_seq_on_dnn.config new file mode 100644 index 000000000..e9c27e46c --- /dev/null +++ b/samples/model_config/fg_fusion_train_neg_seq_on_dnn.config @@ -0,0 +1,280 @@ +train_input_path: "data/test/tb_data/taobao_train_data" +eval_input_path: "data/test/tb_data/taobao_test_data" +model_dir: "experiments/dnn_rtp_fg_demo_v1_neg_seq" + +train_config { + optimizer_config { + use_moving_average: false + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.0001 + decay_steps: 100000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + } + num_steps: 6 + sync_replicas: false + log_step_count_steps: 2 +} + +eval_config { + metrics_set { + auc { + } + } +} + +fg_json_path: "!samples/model_config/fg_fusion_train_seq.json" + +data_config { + input_fields { + input_name:'clk' + input_type: INT32 + } + input_fields { + input_name:'buy' + input_type: INT32 + } + input_fields { + input_name: 'pid' + input_type: STRING + } + input_fields { + input_name: 'adgroup_id' + input_type: STRING + } + input_fields { + input_name: 'cate_id' + input_type: STRING + } + input_fields { + input_name: 'campaign_id' + input_type: STRING + } + input_fields { + input_name: 'customer' + input_type: STRING + } + input_fields { + input_name: 'brand' + input_type: STRING + } + input_fields { + input_name: 'user_id' + input_type: STRING + } + input_fields { + input_name: 'cms_segid' + input_type: STRING + } + input_fields { + input_name: 'cms_group_id' + input_type: STRING + } + input_fields { + input_name: 'final_gender_code' + input_type: STRING + } + input_fields { + input_name: 'age_level' + input_type: STRING + } + input_fields { + input_name: 'pvalue_level' + input_type: STRING + } + input_fields { + input_name: 'shopping_level' + input_type: STRING + } + input_fields { + input_name: 'occupation' + input_type: STRING + } + input_fields { + input_name: 
'new_user_class_level' + input_type: STRING + } + input_fields { + input_name: 'click_seq__cate_id' + input_type: STRING + } + input_fields { + input_name: 'click_seq__brand' + input_type: STRING + } + input_fields { + input_name: 'price' + input_type: STRING + } + label_fields: 'buy' + label_fields: 'clk' + batch_size: 256 + prefetch_size: 4 + input_type: CSVInput + + negative_sampler { + input_path: 'data/test/tb_data/taobao_ad_feature_gl' + num_sample: 256 + num_eval_sample: 256 + attr_fields: 'adgroup_id' + attr_fields: 'cate_id' + attr_fields: 'campaign_id' + attr_fields: 'customer' + attr_fields: 'brand' + item_id_field: 'adgroup_id' + } +} + +feature_config: { + features: { + input_names: 'adgroup_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'cate_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 + } + features: { + input_names: 'campaign_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'customer' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'brand' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'user_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'cms_segid' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: 'cms_group_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: 'final_gender_code' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'age_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'pvalue_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'shopping_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'occupation' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'new_user_class_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'click_seq__cate_id' + feature_type: SequenceFeature + separator: "|" + hash_bucket_size: 100000 + embedding_dim: 16 + sub_feature_type: IdFeature + } + features: { + input_names: 'click_seq__brand' + feature_type: SequenceFeature + separator: "|" + hash_bucket_size: 100000 + embedding_dim: 16 + sub_feature_type: IdFeature + } +} + +model_config { + model_class: "DNNFG" + feature_groups { + group_name: "all" + feature_names: 'adgroup_id' + feature_names: 'cate_id' + feature_names: 'campaign_id' + feature_names: 'customer' + feature_names: 'brand' + feature_names: 'user_id' + feature_names: 'cms_segid' + feature_names: 'cms_group_id' + feature_names: 'final_gender_code' + feature_names: 'age_level' + feature_names: 'pvalue_level' + feature_names: 'shopping_level' + feature_names: 'occupation' + feature_names: 'new_user_class_level' + wide_deep: DEEP + sequence_features: { + group_name: "seq_fea" + tf_summary: false + seq_att_map: { + key: "cate_id" + key: "brand" + hist_seq: "click_seq__cate_id" + hist_seq: "click_seq__brand" + } + } + negative_sampler: true + } + dnnfg { + dnn { + hidden_units: 256 + hidden_units: 128 + hidden_units: 64 + } + l2_regularization: 1e-6 + } + 
embedding_regularization: 5e-6 +} + +export_config { + multi_placeholder: false +} diff --git a/samples/model_config/fg_fusion_train_seq.json b/samples/model_config/fg_fusion_train_seq.json new file mode 100644 index 000000000..c7e3c76b1 --- /dev/null +++ b/samples/model_config/fg_fusion_train_seq.json @@ -0,0 +1,226 @@ +{ + "features": [ + { + "expression": "item:pid", + "feature_name": "pid", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "combo" + }, + { + "expression": "item:adgroup_id", + "feature_name": "adgroup_id", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "item" + }, + { + "expression": "item:cate_id", + "feature_name": "cate_id", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "item" + }, + { + "expression": "item:campaign_id", + "feature_name": "campaign_id", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "item" + }, + { + "expression": "item:customer", + "feature_name": "customer", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "item" + }, + { + "expression": "item:brand", + "feature_name": "brand", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "item" + }, + { + "expression": "user:user_id", + "feature_name": "user_id", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100000, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:cms_segid", + "feature_name": "cms_segid", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:cms_group_id", + "feature_name": "cms_group_id", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 100, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:final_gender_code", + "feature_name": "final_gender_code", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 10, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:age_level", + "feature_name": "age_level", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 10, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:pvalue_level", + "feature_name": "pvalue_level", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 10, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:shopping_level", + "feature_name": "shopping_level", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 10, + "embedding_dim": 16, + "need_prefix": false, + 
"group": "user" + }, + { + "expression": "user:occupation", + "feature_name": "occupation", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 10, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:new_user_class_level", + "feature_name": "new_user_class_level", + "feature_type": "id_feature", + "value_type": "String", + "combiner": "mean", + "hash_bucket_size": 10, + "embedding_dim": 16, + "need_prefix": false, + "group": "user" + }, + { + "expression": "user:tag_brand_list", + "feature_name": "tag_brand_list", + "feature_type": "id_feature", + "hash_bucket_size": 100000, + "need_prefix": false, + "embedding_dim": 16, + "group": "combo" + }, + { + "expression": "item:price", + "feature_name": "price", + "feature_type": "raw_feature", + "value_type": "Integer", + "combiner": "mean", + "need_prefix": false, + "embedding_dim": 16, + "group": "item" + }, + { + "sequence_name":"click_seq", + "sequence_column":"click_seq", + "sequence_length":50, + "sequence_delim":"|", + "attribute_delim":"|", + "sequence_table":"item", + "sequence_pk":"click_seq", + "default_value": "-1", + "features":[ + { + "feature_name":"cate_id", + "feature_type":"id_feature", + "expression":"item:cate_id", + "value_type":"String", + "combiner":"mean", + "need_prefix":false, + "hash_bucket_size":100000, + "embedding_dim":16 + }, + { + "feature_name":"brand", + "feature_type":"id_feature", + "expression":"item:brand", + "value_type":"String", + "combiner":"mean", + "need_prefix":false, + "hash_bucket_size":100000, + "embedding_dim":16 + } + ] + } + ], + "reserves": [ + "user_id", + "campaign_id", + "clk" + ], + "multi_val_sep": "|" +} diff --git a/samples/model_config/fg_fusion_train_seq_on_dcn.config b/samples/model_config/fg_fusion_train_seq_on_dcn.config new file mode 100644 index 000000000..45a26b7a1 --- /dev/null +++ b/samples/model_config/fg_fusion_train_seq_on_dcn.config @@ -0,0 +1,288 @@ +train_input_path: "data/test/tb_data/taobao_train_data" +eval_input_path: "data/test/tb_data/taobao_test_data" +model_dir: "experiments/dcn_rtp_fg_demo_v1_seq" + +train_config { + optimizer_config { + use_moving_average: false + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.0001 + decay_steps: 100000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + } + num_steps: 6 + sync_replicas: false + log_step_count_steps: 2 +} + +eval_config { + metrics_set { + auc { + } + } +} + +fg_json_path: "!samples/model_config/fg_fusion_train_seq.json" + +data_config { + input_fields { + input_name:'clk' + input_type: INT32 + } + input_fields { + input_name:'buy' + input_type: INT32 + } + input_fields { + input_name: 'pid' + input_type: STRING + } + input_fields { + input_name: 'adgroup_id' + input_type: STRING + } + input_fields { + input_name: 'cate_id' + input_type: STRING + } + input_fields { + input_name: 'campaign_id' + input_type: STRING + } + input_fields { + input_name: 'customer' + input_type: STRING + } + input_fields { + input_name: 'brand' + input_type: STRING + } + input_fields { + input_name: 'user_id' + input_type: STRING + } + input_fields { + input_name: 'cms_segid' + input_type: STRING + } + input_fields { + input_name: 'cms_group_id' + input_type: STRING + } + input_fields { + input_name: 'final_gender_code' + input_type: STRING + } + input_fields { + input_name: 'age_level' + input_type: STRING + } + input_fields { + input_name: 'pvalue_level' + input_type: STRING + } + 
input_fields { + input_name: 'shopping_level' + input_type: STRING + } + input_fields { + input_name: 'occupation' + input_type: STRING + } + input_fields { + input_name: 'new_user_class_level' + input_type: STRING + } + input_fields { + input_name: 'click_seq__cate_id' + input_type: STRING + } + input_fields { + input_name: 'click_seq__brand' + input_type: STRING + } + input_fields { + input_name: 'price' + input_type: STRING + } + label_fields: 'buy' + label_fields: 'clk' + batch_size: 256 + num_epochs: 10000 + prefetch_size: 4 + input_type: CSVInput +} + +feature_config: { + #features: { + # input_names: 'pid' + # feature_type: IdFeature + # embedding_dim: 16 + # hash_bucket_size: 10 + #} + features: { + input_names: 'adgroup_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'cate_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 + } + features: { + input_names: 'campaign_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'customer' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'brand' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'user_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'cms_segid' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: 'cms_group_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: 'final_gender_code' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'age_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'pvalue_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'shopping_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'occupation' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'new_user_class_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'click_seq__cate_id' + feature_type: SequenceFeature + separator: '' + hash_bucket_size: 100000 + embedding_dim: 16 + sub_feature_type: IdFeature + } + features: { + input_names: 'click_seq__brand' + feature_type: SequenceFeature + separator: '' + hash_bucket_size: 100000 + embedding_dim: 16 + sub_feature_type: IdFeature + } + #features: { + # input_names: 'price' + # feature_type: RawFeature + # embedding_dim: 16 + # boundaries: [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50] + #} +} + +model_config { + model_class: "DCN" + feature_groups { + group_name: "all" + feature_names: 'adgroup_id' + feature_names: 'cate_id' + feature_names: 'campaign_id' + feature_names: 'customer' + feature_names: 'brand' + feature_names: 'user_id' + feature_names: 'cms_segid' + feature_names: 'cms_group_id' + feature_names: 'final_gender_code' + feature_names: 'age_level' + feature_names: 'pvalue_level' + feature_names: 'shopping_level' + feature_names: 'occupation' + feature_names: 'new_user_class_level' + wide_deep: DEEP + sequence_features: { + group_name: "seq_fea" + tf_summary: false + seq_att_map: { + key: "cate_id" + key: "brand" + hist_seq: "click_seq__cate_id" + hist_seq: 
"click_seq__brand" + } + } + } + dcn { + deep_tower { + input: "all" + dnn { + hidden_units: [256, 128, 96, 64] + } + } + cross_tower { + input: "all" + cross_num: 5 + } + final_dnn { + hidden_units: [128, 96, 64, 32, 16] + } + l2_regularization: 1e-6 + } + embedding_regularization: 5e-6 +} + +export_config { + multi_placeholder: false +} diff --git a/samples/model_config/multi_tower_recall_neg_sampler_only_sequence_feature.config b/samples/model_config/multi_tower_recall_neg_sampler_only_sequence_feature.config deleted file mode 100644 index 2028cb600..000000000 --- a/samples/model_config/multi_tower_recall_neg_sampler_only_sequence_feature.config +++ /dev/null @@ -1,304 +0,0 @@ -train_input_path: "data/test/tb_data/taobao_train_data" -eval_input_path: "data/test/tb_data/taobao_test_data" -model_dir: "experiments/multi_tower_recall_neg_sampler_only_sequence_feature" - -train_config { - optimizer_config: { - adam_optimizer: { - learning_rate: { - exponential_decay_learning_rate { - initial_learning_rate: 0.001 - decay_steps: 1000 - decay_factor: 0.5 - min_learning_rate: 1e-07 - } - } - } - use_moving_average: false - } - num_steps: 6 - sync_replicas: false - save_checkpoints_steps: 100 - log_step_count_steps: 2 -} - -eval_config { - metrics_set: { - auc { - } - } - metrics_set: { - gauc { - uid_field: "user_id" - } - } -} - -data_config { - batch_size: 16 - input_fields { - input_name:'clk' - input_type: INT32 - } - input_fields { - input_name:'buy' - input_type: INT32 - } - input_fields { - input_name: 'pid' - input_type: STRING - } - input_fields { - input_name: 'adgroup_id' - input_type: STRING - } - input_fields { - input_name: 'cate_id' - input_type: STRING - } - input_fields { - input_name: 'campaign_id' - input_type: STRING - } - input_fields { - input_name: 'customer' - input_type: STRING - } - input_fields { - input_name: 'brand' - input_type: STRING - } - input_fields { - input_name: 'user_id' - input_type: STRING - } - input_fields { - input_name: 'cms_segid' - input_type: STRING - } - input_fields { - input_name: 'cms_group_id' - input_type: STRING - } - input_fields { - input_name: 'final_gender_code' - input_type: STRING - } - input_fields { - input_name: 'age_level' - input_type: STRING - } - input_fields { - input_name: 'pvalue_level' - input_type: STRING - } - input_fields { - input_name: 'shopping_level' - input_type: STRING - } - input_fields { - input_name: 'occupation' - input_type: STRING - } - input_fields { - input_name: 'new_user_class_level' - input_type: STRING - } - input_fields { - input_name: 'tag_category_list' - input_type: STRING - } - input_fields { - input_name: 'tag_brand_list' - input_type: STRING - } - input_fields { - input_name: 'price' - input_type: INT32 - } - - label_fields: 'clk' - num_epochs: 5 - prefetch_size: 4 - input_type: CSVInput - - negative_sampler { - input_path: 'data/test/tb_data/taobao_ad_feature_gl' - num_sample: 4 - num_eval_sample: 4 - attr_fields: 'adgroup_id' - attr_fields: 'cate_id' - attr_fields: 'campaign_id' - attr_fields: 'customer' - attr_fields: 'brand' - item_id_field: 'adgroup_id' - } -} - -feature_configs : { - input_names: 'pid' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'adgroup_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'cate_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10000 -} -feature_configs : { - input_names: 'campaign_id' - feature_type: IdFeature - 
embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'customer' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'brand' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'user_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'cms_segid' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100 -} -feature_configs : { - input_names: 'cms_group_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100 -} -feature_configs : { - input_names: 'final_gender_code' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'age_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'pvalue_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'shopping_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'occupation' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'new_user_class_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs { - input_names: "tag_category_list" - feature_type: SequenceFeature - embedding_dim: 16 - hash_bucket_size: 100000 - sub_feature_type: IdFeature - separator: "|" -} -feature_configs { - input_names: "tag_brand_list" - feature_type: SequenceFeature - embedding_dim: 16 - hash_bucket_size: 100000 - sub_feature_type: IdFeature - separator: "|" -} -feature_configs : { - input_names: 'price' - feature_type: IdFeature - embedding_dim: 16 - num_buckets: 50 -} -model_config:{ - model_class: "MultiTowerRecall" - feature_groups: { - group_name: 'user' - feature_names: 'user_id' - feature_names: 'cms_segid' - feature_names: 'cms_group_id' - feature_names: 'age_level' - feature_names: 'pvalue_level' - feature_names: 'shopping_level' - feature_names: 'occupation' - feature_names: 'new_user_class_level' - wide_deep:DEEP - negative_sampler:true - sequence_features: { - group_name: "seq_fea" - allow_key_search: true - need_key_feature:false - seq_att_map: { - key: "brand" - key: "cate_id" - hist_seq: "tag_brand_list" - hist_seq: "tag_category_list" - } - } - } - feature_groups: { - group_name: "item" - feature_names: 'adgroup_id' - feature_names: 'cate_id' - feature_names: 'campaign_id' - feature_names: 'customer' - feature_names: 'brand' - wide_deep:DEEP - } - multi_tower_recall { - user_tower { - dnn { - hidden_units: [256, 128, 64, 32] - # dropout_ratio : [0.1, 0.1, 0.1, 0.1] - } - } - item_tower { - dnn { - hidden_units: [256, 128, 64, 32] - } - } - final_dnn { - hidden_units: [128, 96, 64, 32, 16] - } - l2_regularization: 1e-6 - } - loss_type: CLASSIFICATION - embedding_regularization: 5e-6 -} diff --git a/samples/model_config/multi_tower_recall_neg_sampler_sequence_feature.config b/samples/model_config/multi_tower_recall_neg_sampler_sequence_feature.config deleted file mode 100644 index a51260b59..000000000 --- a/samples/model_config/multi_tower_recall_neg_sampler_sequence_feature.config +++ /dev/null @@ -1,304 +0,0 @@ -train_input_path: "data/test/tb_data/taobao_train_data" -eval_input_path: "data/test/tb_data/taobao_test_data" -model_dir: "experiments/multi_tower_recall_neg_sampler_sequence_feature" - 
-train_config { - optimizer_config: { - adam_optimizer: { - learning_rate: { - exponential_decay_learning_rate { - initial_learning_rate: 0.001 - decay_steps: 1000 - decay_factor: 0.5 - min_learning_rate: 1e-07 - } - } - } - use_moving_average: false - } - num_steps: 6 - sync_replicas: false - save_checkpoints_steps: 100 - log_step_count_steps: 2 -} - -eval_config { - metrics_set: { - auc { - } - } - metrics_set: { - gauc { - uid_field: "user_id" - } - } -} - -data_config { - batch_size: 16 - input_fields { - input_name:'clk' - input_type: INT32 - } - input_fields { - input_name:'buy' - input_type: INT32 - } - input_fields { - input_name: 'pid' - input_type: STRING - } - input_fields { - input_name: 'adgroup_id' - input_type: STRING - } - input_fields { - input_name: 'cate_id' - input_type: STRING - } - input_fields { - input_name: 'campaign_id' - input_type: STRING - } - input_fields { - input_name: 'customer' - input_type: STRING - } - input_fields { - input_name: 'brand' - input_type: STRING - } - input_fields { - input_name: 'user_id' - input_type: STRING - } - input_fields { - input_name: 'cms_segid' - input_type: STRING - } - input_fields { - input_name: 'cms_group_id' - input_type: STRING - } - input_fields { - input_name: 'final_gender_code' - input_type: STRING - } - input_fields { - input_name: 'age_level' - input_type: STRING - } - input_fields { - input_name: 'pvalue_level' - input_type: STRING - } - input_fields { - input_name: 'shopping_level' - input_type: STRING - } - input_fields { - input_name: 'occupation' - input_type: STRING - } - input_fields { - input_name: 'new_user_class_level' - input_type: STRING - } - input_fields { - input_name: 'tag_category_list' - input_type: STRING - } - input_fields { - input_name: 'tag_brand_list' - input_type: STRING - } - input_fields { - input_name: 'price' - input_type: INT32 - } - - label_fields: 'clk' - num_epochs: 5 - prefetch_size: 4 - input_type: CSVInput - - negative_sampler { - input_path: 'data/test/tb_data/taobao_ad_feature_gl' - num_sample: 4 - num_eval_sample: 4 - attr_fields: 'adgroup_id' - attr_fields: 'cate_id' - attr_fields: 'campaign_id' - attr_fields: 'customer' - attr_fields: 'brand' - item_id_field: 'adgroup_id' - } -} - -feature_configs : { - input_names: 'pid' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'adgroup_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'cate_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10000 -} -feature_configs : { - input_names: 'campaign_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'customer' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'brand' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'user_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100000 -} -feature_configs : { - input_names: 'cms_segid' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100 -} -feature_configs : { - input_names: 'cms_group_id' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 100 -} -feature_configs : { - input_names: 'final_gender_code' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'age_level' - feature_type: IdFeature - embedding_dim: 16 - 
hash_bucket_size: 10 -} -feature_configs : { - input_names: 'pvalue_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'shopping_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'occupation' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs : { - input_names: 'new_user_class_level' - feature_type: IdFeature - embedding_dim: 16 - hash_bucket_size: 10 -} -feature_configs { - input_names: "tag_category_list" - feature_type: SequenceFeature - embedding_dim: 16 - hash_bucket_size: 100000 - sub_feature_type: IdFeature - separator: "|" -} -feature_configs { - input_names: "tag_brand_list" - feature_type: SequenceFeature - embedding_dim: 16 - hash_bucket_size: 100000 - sub_feature_type: IdFeature - separator: "|" -} -feature_configs : { - input_names: 'price' - feature_type: IdFeature - embedding_dim: 16 - num_buckets: 50 -} -model_config:{ - model_class: "MultiTowerRecall" - feature_groups: { - group_name: 'user' - feature_names: 'user_id' - feature_names: 'cms_segid' - feature_names: 'cms_group_id' - feature_names: 'age_level' - feature_names: 'pvalue_level' - feature_names: 'shopping_level' - feature_names: 'occupation' - feature_names: 'new_user_class_level' - wide_deep:DEEP - negative_sampler:true - sequence_features: { - group_name: "seq_fea" - allow_key_search: true - need_key_feature:true - seq_att_map: { - key: "brand" - key: "cate_id" - hist_seq: "tag_brand_list" - hist_seq: "tag_category_list" - } - } - } - feature_groups: { - group_name: "item" - feature_names: 'adgroup_id' - feature_names: 'cate_id' - feature_names: 'campaign_id' - feature_names: 'customer' - feature_names: 'brand' - wide_deep:DEEP - } - multi_tower_recall { - user_tower { - dnn { - hidden_units: [256, 128, 64, 32] - # dropout_ratio : [0.1, 0.1, 0.1, 0.1] - } - } - item_tower { - dnn { - hidden_units: [256, 128, 64, 32] - } - } - final_dnn { - hidden_units: [128, 96, 64, 32, 16] - } - l2_regularization: 1e-6 - } - loss_type: CLASSIFICATION - embedding_regularization: 5e-6 -} diff --git a/samples/rtp_fg/fg_test_extensions_final.config b/samples/rtp_fg/fg_test_extensions_final.config index a3a4e3040..a9d48f05c 100644 --- a/samples/rtp_fg/fg_test_extensions_final.config +++ b/samples/rtp_fg/fg_test_extensions_final.config @@ -29,7 +29,7 @@ data_config { batch_size: 1024 label_fields: "clk" input_type: RTPInput - separator: "" + separator: "\x02" selected_cols: "0,3" input_fields { input_name: "clk" @@ -93,10 +93,10 @@ data_config { input_name: "combo_cate_id" } input_fields { - input_name: "opt_content_long_seq_svid" + input_name: "opt_content_long_seq__svid" } input_fields { - input_name: "opt_content_long_seq_source_type" + input_name: "opt_content_long_seq__source_type" } rtp_separator: ";" } @@ -181,7 +181,7 @@ feature_config { feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 100000 - separator: "" + separator: "\x03" combiner: "mean" } features { @@ -189,7 +189,7 @@ feature_config { feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 100 - separator: "" + separator: "\x03" combiner: "mean" } features { @@ -197,7 +197,7 @@ feature_config { feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 100 - separator: "" + separator: "\x03" combiner: "mean" } features { @@ -205,7 +205,7 @@ feature_config { feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 10 - separator: "" + separator: "\x03" combiner: "mean" } features { @@ 
-213,7 +213,7 @@ feature_config { feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 10 - separator: "" + separator: "\x03" combiner: "mean" } features { @@ -221,7 +221,7 @@ feature_config { feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 10 - separator: "" + separator: "\x03" combiner: "mean" } features { @@ -229,7 +229,7 @@ feature_config { feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 10 - separator: "" + separator: "\x03" combiner: "mean" } features { @@ -246,7 +246,7 @@ feature_config { feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 10 - separator: "" + separator: "\x03" combiner: "mean" } features { @@ -254,7 +254,7 @@ feature_config { feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 100000 - separator: "" + separator: "\x03" combiner: "mean" } features { @@ -262,7 +262,7 @@ feature_config { feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 100000 - separator: "" + separator: "\x03" combiner: "mean" } features { @@ -270,7 +270,7 @@ feature_config { feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 100000 - separator: "" + separator: "\x03" combiner: "mean" } features { @@ -278,7 +278,7 @@ feature_config { feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 100000 - separator: "" + separator: "\x03" combiner: "mean" } features { @@ -286,20 +286,20 @@ feature_config { feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 100000 - separator: "" + separator: "\x03" combiner: "mean" } features { input_names: "price" feature_type: RawFeature - separator: "" + separator: "\x03" } features { input_names: "pid" feature_type: IdFeature embedding_dim: 16 hash_bucket_size: 100000 - separator: "" + separator: "\x03" combiner: "mean" } features { @@ -307,7 +307,7 @@ feature_config { feature_type: TagFeature embedding_dim: 16 hash_bucket_size: 100000 - separator: "" + separator: "\x03" combiner: "sum" } features { @@ -315,7 +315,7 @@ feature_config { feature_type: TagFeature embedding_dim: 16 hash_bucket_size: 100000 - separator: "" + separator: "\x03" combiner: "sum" } features { @@ -323,24 +323,24 @@ feature_config { feature_type: TagFeature embedding_dim: 16 hash_bucket_size: 10000 - separator: "" + separator: "\x03" combiner: "sum" } features { - input_names: "opt_content_long_seq_svid" + input_names: "opt_content_long_seq__svid" feature_type: SequenceFeature embedding_dim: 16 hash_bucket_size: 100000 - separator: "" + separator: "\x03" combiner: "mean" sub_feature_type: IdFeature } features { - input_names: "opt_content_long_seq_source_type" + input_names: "opt_content_long_seq__source_type" feature_type: SequenceFeature embedding_dim: 16 hash_bucket_size: 100000 - separator: "" + separator: "\x03" combiner: "mean" sub_feature_type: IdFeature }