 Weichen Shen, wcshen1994@163.com
 """

-from collections import OrderedDict, namedtuple
+from collections import OrderedDict, namedtuple, defaultdict
+from itertools import chain

 import torch
 import torch.nn as nn
+import numpy as np

 from .layers.sequence import SequencePoolingLayer
 from .layers.utils import concat_fun
@@ -27,7 +29,8 @@ def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, dtype="
         if embedding_dim == "auto":
             embedding_dim = 6 * int(pow(vocabulary_size, 0.25))
         if use_hash:
-            print("Notice! Feature Hashing on the fly currently is not supported in torch version,you can use tensorflow version!")
+            print(
+                "Notice! Feature Hashing on the fly currently is not supported in torch version, you can use tensorflow version!")
         return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, dtype,
                                               embedding_name, group_name)
@@ -108,23 +111,14 @@ def build_input_features(feature_columns):
         elif isinstance(feat, VarLenSparseFeat):
             features[feat_name] = (start, start + feat.maxlen)
             start += feat.maxlen
-            if feat.length_name is not None:
+            if feat.length_name is not None and feat.length_name not in features:
                 features[feat.length_name] = (start, start + 1)
                 start += 1
         else:
             raise TypeError("Invalid feature column type, got", type(feat))
     return features

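The added `not in features` guard matters when several VarLenSparseFeat columns share one length column: the shared length_name must get exactly one (start, end) slot. A minimal sketch, assuming the VarLenSparseFeat(sparsefeat, maxlen, combiner, length_name) constructor used in this version; the feature names are toy values:

feature_columns = [
    SparseFeat('user_id', vocabulary_size=100, embedding_dim=8),
    VarLenSparseFeat(SparseFeat('hist_item', 1000, embedding_dim=8),
                     maxlen=5, combiner='mean', length_name='seq_length'),
    VarLenSparseFeat(SparseFeat('hist_cate', 100, embedding_dim=8),
                     maxlen=5, combiner='mean', length_name='seq_length'),
]
print(build_input_features(feature_columns))
# OrderedDict([('user_id', (0, 1)), ('hist_item', (1, 6)),
#              ('seq_length', (6, 7)), ('hist_cate', (7, 12))])
# Without the guard, the second column would re-register 'seq_length'
# at (12, 13) and shift every later offset out of line with the input.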
-# def get_dense_input(features, feature_columns):
-#     dense_feature_columns = list(filter(lambda x: isinstance(
-#         x, DenseFeat), feature_columns)) if feature_columns else []
-#     dense_input_list = []
-#     for fc in dense_feature_columns:
-#         dense_input_list.append(features[fc.name])
-#     return dense_input_list
-
-
 def combined_dnn_input(sparse_embedding_list, dense_value_list):
     if len(sparse_embedding_list) > 0 and len(dense_value_list) > 0:
         sparse_dnn_input = torch.flatten(
@@ -139,72 +133,6 @@ def combined_dnn_input(sparse_embedding_list, dense_value_list):
     else:
         raise NotImplementedError

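The shape contract here: each sparse embedding arrives as (batch, 1, dim) and each dense value as (batch, dimension); both groups are concatenated on the last axis, flattened from dim 1, then joined by concat_fun. A quick sketch with assumed toy shapes:

sparse_embs = [torch.randn(2, 1, 4), torch.randn(2, 1, 4)]  # two (B, 1, D) field embeddings
dense_vals = [torch.randn(2, 1)]                            # one scalar dense feature
out = combined_dnn_input(sparse_embs, dense_vals)
print(out.shape)  # torch.Size([2, 9]): 4 + 4 flattened sparse dims plus 1 dense dim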
-#
-# def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(),
-#                      mask_feat_list=(), to_list=False):
-#     """
-#         Args:
-#             sparse_embedding_dict: nn.ModuleDict, {embedding_name: nn.Embedding}
-#             sparse_input_dict: OrderedDict, {feature_name:(start, start+dimension)}
-#             sparse_feature_columns: list, sparse features
-#             return_feat_list: list, names of feature to be returned, defualt () -> return all features
-#             mask_feat_list, list, names of feature to be masked in hash transform
-#         Return:
-#             group_embedding_dict: defaultdict(list)
-#     """
-#     group_embedding_dict = defaultdict(list)
-#     for fc in sparse_feature_columns:
-#         feature_name = fc.name
-#         embedding_name = fc.embedding_name
-#         if (len(return_feat_list) == 0 or feature_name in return_feat_list):
-#             if fc.use_hash:
-#                 # lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list))(
-#                 #     sparse_input_dict[feature_name])
-#                 # TODO: add hash function
-#                 lookup_idx = sparse_input_dict[feature_name]
-#             else:
-#                 lookup_idx = sparse_input_dict[feature_name]
-#
-#             group_embedding_dict[fc.group_name].append(sparse_embedding_dict[embedding_name](lookup_idx))
-#     if to_list:
-#         return list(chain.from_iterable(group_embedding_dict.values()))
-#     return group_embedding_dict
-#
-#
-# def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_feature_columns):
-#     varlen_embedding_vec_dict = {}
-#     for fc in varlen_sparse_feature_columns:
-#         feature_name = fc.name
-#         embedding_name = fc.embedding_name
-#         if fc.use_hash:
-#             # lookup_idx = Hash(fc.vocabulary_size, mask_zero=True)(sequence_input_dict[feature_name])
-#             # TODO: add hash function
-#             lookup_idx = sequence_input_dict[feature_name]
-#         else:
-#             lookup_idx = sequence_input_dict[feature_name]
-#         varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](lookup_idx)
-#     return varlen_embedding_vec_dict
-#
-#
-# def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_columns, to_list=False):
-#     pooling_vec_list = defaultdict(list)
-#     for fc in varlen_sparse_feature_columns:
-#         feature_name = fc.name
-#         combiner = fc.combiner
-#         feature_length_name = fc.length_name
-#         if feature_length_name is not None:
-#             seq_input = embedding_dict[feature_name]
-#             vec = SequencePoolingLayer(combiner)([seq_input, features[feature_length_name]])
-#         else:
-#             seq_input = embedding_dict[feature_name]
-#             vec = SequencePoolingLayer(combiner)(seq_input)
-#         pooling_vec_list[fc.group_name].append(vec)
-#
-#     if to_list:
-#         return chain.from_iterable(pooling_vec_list.values())
-#
-#     return pooling_vec_list
-

 def get_varlen_pooling_list(embedding_dict, features, feature_index, varlen_sparse_feature_columns, device):
     varlen_sparse_embedding_list = []
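The body of this function is elided by the diff; per VarLenSparseFeat it runs a SequencePoolingLayer (mode taken from fc.combiner) over the looked-up sequence embedding and appends one (batch, 1, embedding_dim) tensor. A sketch of the pooling step it performs, with assumed shapes:

seq_emb = torch.randn(2, 5, 4)      # (batch, maxlen, embedding_dim) from the embedding lookup
seq_len = torch.tensor([[3], [5]])  # true sequence lengths, shape (batch, 1)
pooled = SequencePoolingLayer(mode='mean', device='cpu')([seq_emb, seq_len])
print(pooled.shape)  # torch.Size([2, 1, 4]); positions beyond seq_len are masked out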
@@ -249,3 +177,95 @@ def create_embedding_matrix(feature_columns, init_std=0.0001, linear=False, spar
         nn.init.normal_(tensor.weight, mean=0, std=init_std)

     return embedding_dict.to(device)
+
+
+def input_from_feature_columns(self, X, feature_columns, embedding_dict, support_dense=True):
+    sparse_feature_columns = list(
+        filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if len(feature_columns) else []
+    dense_feature_columns = list(
+        filter(lambda x: isinstance(x, DenseFeat), feature_columns)) if len(feature_columns) else []
+
+    varlen_sparse_feature_columns = list(
+        filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else []
+
+    if not support_dense and len(dense_feature_columns) > 0:
+        raise ValueError(
+            "DenseFeat is not supported in dnn_feature_columns")
+
+    sparse_embedding_list = [embedding_dict[feat.embedding_name](
+        X[:, self.feature_index[feat.name][0]:self.feature_index[feat.name][1]].long()) for
+        feat in sparse_feature_columns]
+
+    varlen_sparse_embedding_list = get_varlen_pooling_list(self.embedding_dict, X, self.feature_index,
+                                                           varlen_sparse_feature_columns, self.device)
+
+    dense_value_list = [X[:, self.feature_index[feat.name][0]:self.feature_index[feat.name][1]] for feat in
+                        dense_feature_columns]
+
+    return sparse_embedding_list + varlen_sparse_embedding_list, dense_value_list
+
+
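This helper is written as a method: it takes self and reads self.feature_index, self.embedding_dict, and self.device (note the varlen branch uses self.embedding_dict rather than the embedding_dict argument). A hedged sketch of how a BaseModel-style forward pass would call it, with assumed attribute names:

# Inside a model's forward(self, X), where X is the flat (batch, total_dim) input:
sparse_embedding_list, dense_value_list = input_from_feature_columns(
    self, X, self.dnn_feature_columns, self.embedding_dict)
dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list)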
+def embedding_lookup(X, sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(),
+                     mask_feat_list=(), to_list=False):
+    """
+        Args:
+            X: input Tensor [batch_size x hidden_dim]
+            sparse_embedding_dict: nn.ModuleDict, {embedding_name: nn.Embedding}
+            sparse_input_dict: OrderedDict, {feature_name: (start, start+dimension)}
+            sparse_feature_columns: list, sparse features
+            return_feat_list: list, names of features to be returned; default () returns all features
+            mask_feat_list: list, names of features to be masked in the hash transform
+        Return:
+            group_embedding_dict: defaultdict(list)
+    """
+    group_embedding_dict = defaultdict(list)
+    for fc in sparse_feature_columns:
+        feature_name = fc.name
+        embedding_name = fc.embedding_name
+        if len(return_feat_list) == 0 or feature_name in return_feat_list:
+            # TODO: add hash function
+            # if fc.use_hash:
+            #     raise NotImplementedError("hash function is not implemented in this version!")
+            lookup_idx = np.array(sparse_input_dict[feature_name])
+            input_tensor = X[:, lookup_idx[0]:lookup_idx[1]].long()
+            emb = sparse_embedding_dict[embedding_name](input_tensor)
+            group_embedding_dict[fc.group_name].append(emb)
+    if to_list:
+        return list(chain.from_iterable(group_embedding_dict.values()))
+    return group_embedding_dict
+
+
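Unlike the commented-out version deleted above, this one slices the raw input X itself: each feature's (start, end) pair from sparse_input_dict selects the id columns before the nn.Embedding call. A usage sketch (toy setup, names assumed; X is the flat (batch, total_dim) tensor laid out by build_input_features):

feature_index = build_input_features(sparse_feature_columns)   # {'user_id': (0, 1), ...}
emb_dict = create_embedding_matrix(sparse_feature_columns, device='cpu')
groups = embedding_lookup(X, emb_dict, feature_index, sparse_feature_columns)
# groups: {group_name: [(batch, 1, embedding_dim) tensor per feature in that group]}
flat = embedding_lookup(X, emb_dict, feature_index, sparse_feature_columns, to_list=True)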
+def varlen_embedding_lookup(X, embedding_dict, sequence_input_dict, varlen_sparse_feature_columns):
+    varlen_embedding_vec_dict = {}
+    for fc in varlen_sparse_feature_columns:
+        feature_name = fc.name
+        embedding_name = fc.embedding_name
+        if fc.use_hash:
+            # lookup_idx = Hash(fc.vocabulary_size, mask_zero=True)(sequence_input_dict[feature_name])
+            # TODO: add hash function
+            lookup_idx = sequence_input_dict[feature_name]
+        else:
+            lookup_idx = sequence_input_dict[feature_name]
+        varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](
+            X[:, lookup_idx[0]:lookup_idx[1]].long())  # (lookup_idx)
+
+    return varlen_embedding_vec_dict
+
+
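The returned dict holds the unpooled sequence embeddings keyed by feature name; pooling happens later (e.g. in get_varlen_pooling_list). A sketch reusing the toy setup from the previous sketch, assuming a 'hist_item' column with maxlen=5 and embedding_dim=8:

seq_emb_dict = varlen_embedding_lookup(X, emb_dict, feature_index, varlen_sparse_feature_columns)
hist_emb = seq_emb_dict['hist_item']  # (batch, 5, 8): one embedding per padded sequence position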
+def get_dense_input(X, features, feature_columns):
+    dense_feature_columns = list(filter(lambda x: isinstance(
+        x, DenseFeat), feature_columns)) if feature_columns else []
+    dense_input_list = []
+    for fc in dense_feature_columns:
+        lookup_idx = np.array(features[fc.name])
+        input_tensor = X[:, lookup_idx[0]:lookup_idx[1]].float()
+        dense_input_list.append(input_tensor)
+    return dense_input_list
+
+
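Compared with the commented-out predecessor removed above, this version slices the float columns out of X rather than reading a precomputed dict. A sketch with the same assumed setup:

dense_value_list = get_dense_input(X, feature_index, dnn_feature_columns)
# one (batch, dimension) float tensor per DenseFeat, in feature-column order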
+def maxlen_lookup(X, sparse_input_dict, maxlen_column):
+    if maxlen_column is None or len(maxlen_column) == 0:
+        raise ValueError('please add max length column for VarLenSparseFeat of DIEN input')
+    lookup_idx = np.array(sparse_input_dict[maxlen_column[0]])
+    return X[:, lookup_idx[0]:lookup_idx[1]].long()
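maxlen_lookup fetches the behavior-length column that DIEN's interest extractor needs; maxlen_column is a one-element list holding a length_name. A sketch ('seq_length' is an assumed name):

behavior_length = maxlen_lookup(X, feature_index, ['seq_length'])  # (batch, 1) long tensor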