ydataai
diff --git a/‎src/ydata_synthetic/preprocessing/base_processor.py‎
Lines changed: 31 additions & 11 deletions b/‎src/ydata_synthetic/preprocessing/base_processor.py‎
Lines changed: 31 additions & 11 deletions
diff --git a/‎src/ydata_synthetic/synthesizers/regular/cgan/model.py‎
Lines changed: 10 additions & 6 deletions b/‎src/ydata_synthetic/synthesizers/regular/cgan/model.py‎
Lines changed: 10 additions & 6 deletions
diff --git a/‎src/ydata_synthetic/synthesizers/regular/cramergan/model.py‎
Lines changed: 9 additions & 5 deletions b/‎src/ydata_synthetic/synthesizers/regular/cramergan/model.py‎
Lines changed: 9 additions & 5 deletions
diff --git a/‎src/ydata_synthetic/synthesizers/regular/dragan/model.py‎
Lines changed: 9 additions & 4 deletions b/‎src/ydata_synthetic/synthesizers/regular/dragan/model.py‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎src/ydata_synthetic/synthesizers/regular/vanillagan/model.py‎
Lines changed: 10 additions & 7 deletions b/‎src/ydata_synthetic/synthesizers/regular/vanillagan/model.py‎
Lines changed: 10 additions & 7 deletions
diff --git a/‎src/ydata_synthetic/synthesizers/regular/wgan/model.py‎
Lines changed: 9 additions & 5 deletions b/‎src/ydata_synthetic/synthesizers/regular/wgan/model.py‎
Lines changed: 9 additions & 5 deletions
@@ -1,36 +1,38 @@
-"Implements a BaseProcessor Class, not meant to be directly instantiated."
+"Base class of Data Preprocessors, do not instantiate this class directly."
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
+from collections import namedtuple
 from typing import List, Optional
 
-from numpy import ndarray
-from pandas import DataFrame, Series
+from numpy import concatenate, ndarray, split, zeros
+from pandas import DataFrame, Series, concat
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.exceptions import NotFittedError
 from typeguard import typechecked
 
+ProcessorInfo = namedtuple("ProcessorInfo", ["numerical", "categorical"])
+PipelineInfo = namedtuple("PipelineInfo", ["feat_names_in", "feat_names_out"])
 
+# pylint: disable=R0902
 @typechecked
 class BaseProcessor(ABC, BaseEstimator, TransformerMixin):
     """
-    Base class for Data Preprocessing.
-    It works like any other transformer in scikit learn with the methods fit, transform and inverse transform.
+    This data processor works like a scikit-learn transformer, with the methods fit, transform and inverse transform.
     Args:
         num_cols (list of strings):
             List of names of numerical columns.
         cat_cols (list of strings):
             List of names of categorical columns.
     """
     def __init__(self, num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None):
-
+        """Stores the column name lists and initializes the pipeline and metadata placeholders."""
         self.num_cols = [] if num_cols is None else num_cols
         self.cat_cols = [] if cat_cols is None else cat_cols
 
-        self._num_pipeline = None
-        self._cat_pipeline = None
+        self._num_pipeline = None  # To be overridden by child processors
+        self._cat_pipeline = None  # To be overridden by child processors
 
+        # `_types` stays initialized here because the `types` property returns it
         self._types = None
+        self._col_transform_info = None  # Metadata object mapping inputs/outputs of each pipeline
 
     @property
     def num_pipeline(self) -> BaseEstimator:
@@ -47,6 +49,25 @@ def types(self) -> Series:
         """Returns a Series with the dtypes of each column in the fitted DataFrame."""
         return self._types
 
+    @property
+    def col_transform_info(self) -> ProcessorInfo:
+        """ProcessorInfo with the input/output feature mappings of the pipelines, built on first access and reused."""
+        self._check_is_fitted()
+        cached = self._col_transform_info
+        self._col_transform_info = self.__create_metadata_synth() if cached is None else cached
+        return self._col_transform_info
+
+    def __create_metadata_synth(self):
+        """Builds the ProcessorInfo holding the input/output feature names of each pipeline in use."""
+        def _describe(pipeline):
+            # Pairs the input feature names of a pipeline with the feature names it outputs
+            return PipelineInfo(pipeline.feature_names_in_, pipeline.get_feature_names_out())
+
+        # A pipeline is only queried when columns were assigned to it
+        num_info = _describe(self.num_pipeline) if self.num_cols else PipelineInfo([], [])
+        cat_info = _describe(self.cat_pipeline) if self.cat_cols else PipelineInfo([], [])
+        return ProcessorInfo(num_info, cat_info)
+
     def _check_is_fitted(self):
         """Checks if the processor is fitted by testing the numerical pipeline.
         Raises NotFittedError if not."""
@@ -86,8 +107,7 @@ def transform(self, X: DataFrame) -> ndarray:
                 DataFrame used to fit the processor parameters.
                 Should be aligned with the columns types defined in initialization.
         Returns:
-            transformed (ndarray):
-                Processed version of the passed DataFrame.
+            transformed (ndarray): Processed version of the passed DataFrame.
         """
         raise NotImplementedError
 
 
@@ -1,7 +1,7 @@
 """CGAN implementation"""
 import os
 from os import path
-from typing import List, Tuple, Union
+from typing import List, Tuple, Union, Optional, NamedTuple
 
 import numpy as np
 from numpy import array, empty, hstack, ndarray, vstack, save
@@ -19,6 +19,7 @@
 
 from ydata_synthetic.synthesizers import TrainParameters
 from ydata_synthetic.synthesizers.gan import BaseModel
+from ydata_synthetic.utils.gumbel_softmax import GumbelSoftmaxActivation
 
 
 class CGAN(BaseModel):
@@ -44,15 +45,16 @@ def label_col(self, data_label: Tuple[Union[DataFrame, array], str]):
             cannot be used as condition."
         assert data[label_col].isna().sum() == 0, "The label column contains NaN values, please impute or drop the \
             respective records before proceeding."
-        assert is_float_dtype(data[label_col]) or is_integer_dtype(float), "The label column is expected to be an \
+        assert is_float_dtype(data[label_col]) or is_integer_dtype(data[label_col]), "The label column is expected to be an \
             integer or a float dtype to ensure the function of the embedding layer."
         unique_frac = data[label_col].nunique()/len(data.index)
         assert unique_frac < 1, "The provided column {label_col} is constituted by unique values and is not suitable \
             to be used as condition."
 
-    def define_gan(self):
+    def define_gan(self, activation_info: Optional[NamedTuple] = None):
         self.generator = Generator(self.batch_size, self.num_classes). \
-            build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim)
+            build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim,
+                        activation_info = activation_info)
 
         self.discriminator = Discriminator(self.batch_size, self.num_classes). \
             build_model(input_shape=(self.data_dim,), dim=self.layers_dim)
@@ -121,7 +123,7 @@ def train(self, data: Union[DataFrame, array], label_col: str, train_arguments:
 
         processed_data = self.processor.transform(data)
         self.data_dim = processed_data.shape[1]
-        self.define_gan()
+        self.define_gan(self.processor.col_transform_info)
 
         # Merging labels with processed data
         processed_data = hstack([processed_data, label])
@@ -198,7 +200,7 @@ def __init__(self, batch_size, num_classes):
         self.batch_size = batch_size
         self.num_classes = num_classes
 
-    def build_model(self, input_shape, dim, data_dim):
+    def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None):
         noise = Input(shape=input_shape, batch_size=self.batch_size)
         label = Input(shape=(1,), batch_size=self.batch_size, dtype='int32')
         label_embedding = Flatten()(Embedding(self.num_classes, 1)(label))
@@ -208,6 +210,8 @@ def build_model(self, input_shape, dim, data_dim):
         x = Dense(dim * 2, activation='relu')(x)
         x = Dense(dim * 4, activation='relu')(x)
         x = Dense(data_dim)(x)
+        if activation_info:
+            # Invoke the layer through __call__ rather than calling .call() directly
+            x = GumbelSoftmaxActivation(activation_info)(x)
         return Model(inputs=[noise, label], outputs=x)
 
 
 
@@ -1,6 +1,6 @@
 import os
 from os import path
-from typing import List
+from typing import List, Optional, NamedTuple
 
 import numpy as np
 import tensorflow as tf
@@ -12,6 +12,7 @@
 from ydata_synthetic.synthesizers import TrainParameters
 from ydata_synthetic.synthesizers.gan import BaseModel
 from ydata_synthetic.synthesizers.loss import Mode, gradient_penalty
+from ydata_synthetic.utils.gumbel_softmax import GumbelSoftmaxActivation
 
 
 class CRAMERGAN(BaseModel):
@@ -26,9 +27,10 @@ def __init__(self, model_parameters, gradient_penalty_weight=10):
         self.gradient_penalty_weight = gradient_penalty_weight
         super().__init__(model_parameters)
 
-    def define_gan(self):
+    def define_gan(self, activation_info: Optional[NamedTuple] = None):
         self.generator = Generator(self.batch_size). \
-            build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim)
+            build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim,
+                        activation_info=activation_info)
 
         self.critic = Critic(self.batch_size). \
             build_model(input_shape=(self.data_dim,), dim=self.layers_dim)
@@ -145,7 +147,7 @@ def train(self, data, train_arguments: TrainParameters, num_cols: List[str], cat
 
         data = self.processor.transform(data)
         self.data_dim = data.shape[1]
-        self.define_gan()
+        self.define_gan(self.processor.col_transform_info)
 
         iterations = int(abs(data.shape[0] / self.batch_size) + 1)
 
@@ -190,12 +192,14 @@ def __init__(self, batch_size):
         """Simple generator with dense feedforward layers."""
         self.batch_size = batch_size
 
-    def build_model(self, input_shape, dim, data_dim):
+    def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None):
+        """Assembles the generator network and returns it as a Model.
+
+        Args:
+            input_shape: Shape handed to Input together with self.batch_size.
+            dim: Units of the first Dense layer; the next two layers use dim * 2 and dim * 4 units.
+            data_dim: Units of the last Dense layer, i.e. the width of the generated output.
+            activation_info: When truthy, the last Dense output is passed through a
+                GumbelSoftmaxActivation built from it.
+
+        Returns:
+            Model mapping the input to the (optionally activated) output of the last Dense layer.
+        """
         input_ = Input(shape=input_shape, batch_size=self.batch_size)
         x = Dense(dim, activation='relu')(input_)
         x = Dense(dim * 2, activation='relu')(x)
         x = Dense(dim * 4, activation='relu')(x)
         x = Dense(data_dim)(x)
+        # NOTE(review): a namedtuple with fields is always truthy, so this effectively tests for None
+        if activation_info:
+            x = GumbelSoftmaxActivation(activation_info)(x)
         return Model(inputs=input_, outputs=x)
 
 class Critic(tf.keras.Model):
 
@@ -1,6 +1,7 @@
 import os
 from os import path
 
+from typing import Optional, NamedTuple
 import tensorflow as tf
 import tqdm
 from tensorflow.keras import Model, initializers
@@ -9,6 +10,7 @@
 
 from ydata_synthetic.synthesizers.gan import BaseModel
 from ydata_synthetic.synthesizers.loss import Mode, gradient_penalty
+from ydata_synthetic.utils.gumbel_softmax import GumbelSoftmaxActivation
 
 
 class DRAGAN(BaseModel):
@@ -21,10 +23,11 @@ def __init__(self, model_parameters, n_discriminator, gradient_penalty_weight=10
         self.gradient_penalty_weight = gradient_penalty_weight
         super().__init__(model_parameters)
 
-    def define_gan(self):
+    def define_gan(self, col_transform_info: Optional[NamedTuple] = None):
         # define generator/discriminator
         self.generator = Generator(self.batch_size). \
-            build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim)
+            build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim,
+                        activation_info=col_transform_info)
 
         self.discriminator = Discriminator(self.batch_size). \
             build_model(input_shape=(self.data_dim,), dim=self.layers_dim)
@@ -125,7 +128,7 @@ def train(self, data, train_arguments, num_cols, cat_cols):
 
         processed_data = self.processor.transform(data)
         self.data_dim = processed_data.shape[1]
-        self.define_gan()
+        self.define_gan(self.processor.col_transform_info)
 
         train_loader = self.get_data_batch(processed_data, self.batch_size)
 
@@ -174,10 +177,12 @@ class Generator(Model):
     def __init__(self, batch_size):
         self.batch_size = batch_size
 
-    def build_model(self, input_shape, dim, data_dim):
+    def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None):
+        """Assembles the generator network and returns it as a Model.
+
+        Args:
+            input_shape: Shape handed to Input together with self.batch_size.
+            dim: Units of the first Dense layer; the next two layers use dim * 2 and dim * 4 units.
+            data_dim: Units of the last Dense layer, i.e. the width of the generated output.
+            activation_info: When truthy, the last Dense output is passed through a
+                GumbelSoftmaxActivation built from it.
+
+        Returns:
+            Model mapping the input to the (optionally activated) output of the last Dense layer.
+        """
         input = Input(shape=input_shape, batch_size = self.batch_size)
         x = Dense(dim, kernel_initializer=initializers.TruncatedNormal(mean=0., stddev=0.5), activation='relu')(input)
         x = Dense(dim * 2, activation='relu')(x)
         x = Dense(dim * 4, activation='relu')(x)
         x = Dense(data_dim)(x)
+        # NOTE(review): a namedtuple with fields is always truthy, so this effectively tests for None
+        if activation_info:
+            x = GumbelSoftmaxActivation(activation_info)(x)
         return Model(inputs=input, outputs=x)
@@ -1,11 +1,12 @@
 import os
 from os import path
 import numpy as np
-from typing import List
+from typing import List, Optional, NamedTuple
 from tqdm import trange
 
 from ydata_synthetic.synthesizers.gan import BaseModel
 from ydata_synthetic.synthesizers import TrainParameters
+from ydata_synthetic.utils.gumbel_softmax import GumbelSoftmaxActivation
 
 import tensorflow as tf
 from tensorflow.keras.layers import Input, Dense, Dropout
@@ -19,9 +20,10 @@ class VanilllaGAN(BaseModel):
     def __init__(self, model_parameters):
         super().__init__(model_parameters)
 
-    def define_gan(self):
+    def define_gan(self, activation_info: Optional[NamedTuple] = None):
+        # activation_info defaults to None so define_gan() can still be called without arguments;
+        # it is only forwarded to the generator.
         self.generator = Generator(self.batch_size).\
-            build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim)
+            build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim,
+                        activation_info=activation_info)
 
         self.discriminator = Discriminator(self.batch_size).\
             build_model(input_shape=(self.data_dim,), dim=self.layers_dim)
@@ -63,8 +65,7 @@ def get_data_batch(self, train, batch_size, seed=0):
         train_ix = list(train_ix) + list(train_ix)  # duplicate to cover ranges past the end of the set
         return train[train_ix[start_i: stop_i]]
 
-    def train(self, data, train_arguments: TrainParameters, num_cols: List[str],
-              cat_cols: List[str]):
+    def train(self, data, train_arguments: TrainParameters, num_cols: List[str], cat_cols: List[str]):
         """
         Args:
             data: A pandas DataFrame or a Numpy array with the data to be synthesized
@@ -76,7 +77,7 @@ def train(self, data, train_arguments: TrainParameters, num_cols: List[str],
 
         processed_data = self.processor.transform(data)
         self.data_dim = processed_data.shape[1]
-        self.define_gan()
+        self.define_gan(self.processor.col_transform_info)
 
         iterations = int(abs(data.shape[0]/self.batch_size)+1)
 
@@ -130,12 +131,14 @@ class Generator(tf.keras.Model):
     def __init__(self, batch_size):
         self.batch_size=batch_size
 
-    def build_model(self, input_shape, dim, data_dim):
+    def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None):
+        """Assembles the generator network and returns it as a Model.
+
+        Args:
+            input_shape: Shape handed to Input together with self.batch_size.
+            dim: Units of the first Dense layer; the next two layers use dim * 2 and dim * 4 units.
+            data_dim: Units of the last Dense layer, i.e. the width of the generated output.
+            activation_info: When truthy, the last Dense output is passed through a
+                GumbelSoftmaxActivation built from it.
+
+        Returns:
+            Model mapping the input to the (optionally activated) output of the last Dense layer.
+        """
         input= Input(shape=input_shape, batch_size=self.batch_size)
         x = Dense(dim, activation='relu')(input)
         x = Dense(dim * 2, activation='relu')(x)
         x = Dense(dim * 4, activation='relu')(x)
         x = Dense(data_dim)(x)
+        # NOTE(review): a namedtuple with fields is always truthy, so this effectively tests for None
+        if activation_info:
+            x = GumbelSoftmaxActivation(activation_info)(x)
         return Model(inputs=input, outputs=x)
 
 class Discriminator(tf.keras.Model):
 
@@ -1,5 +1,5 @@
 from os import mkdir, path
-from typing import List
+from typing import List, Optional, NamedTuple
 
 import numpy as np
 import tensorflow as tf
@@ -11,6 +11,7 @@
 
 from ydata_synthetic.synthesizers import TrainParameters
 from ydata_synthetic.synthesizers.gan import BaseModel
+from ydata_synthetic.utils.gumbel_softmax import GumbelSoftmaxActivation
 
 
 #Auxiliary Keras backend class to calculate the Random Weighted average
@@ -41,9 +42,10 @@ def __init__(self, model_parameters, n_critic, clip_value=0.01):
     def wasserstein_loss(self, y_true, y_pred):
         return K.mean(y_true * y_pred)
 
-    def define_gan(self):
+    def define_gan(self, activation_info: Optional[NamedTuple] = None):
         self.generator = Generator(self.batch_size). \
-            build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim)
+            build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim,
+                        activation_info=activation_info)
 
         self.critic = Critic(self.batch_size). \
             build_model(input_shape=(self.data_dim,), dim=self.layers_dim)
@@ -96,7 +98,7 @@ def train(self, data, train_arguments: TrainParameters, num_cols: List[str],
 
         processed_data = self.processor.transform(data)
         self.data_dim = processed_data.shape[1]
-        self.define_gan()
+        self.define_gan(self.processor.col_transform_info)
 
         #Create a summary file
         iterations = int(abs(data.shape[0]/self.batch_size)+1)
@@ -153,12 +155,14 @@ class Generator(tf.keras.Model):
     def __init__(self, batch_size):
         self.batch_size = batch_size
 
-    def build_model(self, input_shape, dim, data_dim):
+    def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None):
+        """Assembles the generator network and returns it as a Model.
+
+        Args:
+            input_shape: Shape handed to Input together with self.batch_size.
+            dim: Units of the first Dense layer; the next two layers use dim * 2 and dim * 4 units.
+            data_dim: Units of the last Dense layer, i.e. the width of the generated output.
+            activation_info: When truthy, the last Dense output is passed through a
+                GumbelSoftmaxActivation built from it.
+
+        Returns:
+            Model mapping the input to the (optionally activated) output of the last Dense layer.
+        """
         input = Input(shape=input_shape, batch_size=self.batch_size)
         x = Dense(dim, activation='relu')(input)
         x = Dense(dim * 2, activation='relu')(x)
         x = Dense(dim * 4, activation='relu')(x)
         x = Dense(data_dim)(x)
+        # NOTE(review): a namedtuple with fields is always truthy, so this effectively tests for None
+        if activation_info:
+            x = GumbelSoftmaxActivation(activation_info)(x)
         return Model(inputs=input, outputs=x)
 
 class Critic(tf.keras.Model):