Merge pull request #40 from GiannisMitr/tsfresh_extractor_speedups

ruiqiany · web-flow · commit 271608a5f938 · 2022-02-10T10:03:31.000-08:00
TSFeatureExtractor Performance Improvements
diff --git a/src/sagemaker_sklearn_extension/feature_extraction/sequences.py b/src/sagemaker_sklearn_extension/feature_extraction/sequences.py
@@ -10,6 +10,7 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
+import os
 from math import ceil
 
 import numpy as np
@@ -21,12 +22,13 @@
 from tsfresh.feature_extraction import EfficientFCParameters
 from tsfresh.feature_extraction import MinimalFCParameters
 from tsfresh.utilities.dataframe_functions import impute
+from tsfresh.defaults import N_PROCESSES  # the default number of processes used by TSFresh, equal to n_vcores/2
 
-from sagemaker_sklearn_extension.preprocessing.data import RobustStandardScaler
-
-
+TOTAL_EXPANSION_THRESHOLD = 2500  # expansion budget that TSFeatureExtractor.fit splits across all sequence columns
 DEFAULT_INPUT_SEQUENCE_LENGTH = 1000
 SEQUENCE_EXPANSION_FACTOR = 2.5
+# Disable tsfresh parallelism while serving (transform); it does not work with the server's workers.
+N_TSFRESH_JOBS = 0 if os.environ.get("SAGEMAKER_PROGRAM") == "sagemaker_serve" else N_PROCESSES
 
 
 class TSFeatureExtractor(BaseEstimator, TransformerMixin):
@@ -129,7 +131,10 @@ def fit(self, X, y=None):
             raise ValueError(
                 f"length of sequences_lengths_q25 should be equal to number of columns in X (={X.shape[1]})."
             )
-
+        # cap total expansion for all columns
+        expansion_thresholds = np.ceil(
+            (self.sequences_lengths_q25 / np.sum(self.sequences_lengths_q25)) * TOTAL_EXPANSION_THRESHOLD
+        )
         ts_flattener = TSFlattener(max_allowed_length=self.max_allowed_length, trim_beginning=self.trim_beginning)
         tsfresh_feature_extractors = []
         for sequence_column_i, sequence_column in enumerate(X.T):
@@ -140,6 +145,7 @@ def fit(self, X, y=None):
                 extraction_type=self.extraction_type,
                 extraction_seed=self.extraction_seed,
                 sequence_length_q25=self.sequences_lengths_q25[sequence_column_i],
+                expansion_threshold=int(expansion_thresholds[sequence_column_i]),
             )
             tsfresh_feature_extractor.fit(numeric_sequences)
             tsfresh_feature_extractors.append(tsfresh_feature_extractor)
@@ -315,12 +321,6 @@ class TSFreshFeatureExtractor(BaseEstimator, TransformerMixin):
         List contianing 25th percentile of sequence lengths for each column at the train step.
         If not provided, default value will be assigned (DEFAULT_INPUT_SEQUENCE_LENGTH).
 
-    Attributes
-    ----------
-    self.robust_standard_scaler_ : ``sagemaker_sklearn_extension.preprocessing.data.RobustStandardScaler``
-        - `robust_standard_scaler_` is instantiated inside the fit method used for computing the mean and
-        the standard deviation.
-
 
     Examples
     --------
@@ -348,23 +348,21 @@ def __init__(
         extraction_type="efficient",
         extraction_seed=0,
         sequence_length_q25=None,
+        expansion_threshold=None,
     ):
         super().__init__()
         self.augment = augment
         self.interpolation_method = interpolation_method
         self.extraction_type = extraction_type
         self.feature_sampling_seed = extraction_seed
         self.sequence_length_q25 = sequence_length_q25 or DEFAULT_INPUT_SEQUENCE_LENGTH
-        self.expansion_threshold = self._compute_expansion_threshold(self.sequence_length_q25)
-        self.robust_standard_scaler_ = RobustStandardScaler()
+        expansion_threshold = expansion_threshold or self._compute_expansion_threshold(self.sequence_length_q25)
+        self.expansion_threshold = min(expansion_threshold, self._compute_expansion_threshold(self.sequence_length_q25))
+        # The effective expansion_threshold is the stricter (smaller) of the threshold computed for this column
+        # and the one passed in, which respects the total expansion budget for all columns.
 
     def fit(self, X, y=None):
-        tsfresh_features, _ = self._extract_tsfresh_features(X)
-
-        # not all features included due to data expansion control
-        tsfresh_features = self._filter_features(tsfresh_features, mode="train")
-
-        self.robust_standard_scaler_.fit(tsfresh_features)
+        # Nothing to learn during fit.
         return self
 
     def transform(self, X, y=None):
@@ -379,13 +377,7 @@ def transform(self, X, y=None):
         tsfresh_features : np.array
 
         """
-        check_is_fitted(self, "robust_standard_scaler_")
-        transform_thresholds = [self._compute_expansion_threshold(len(seq)) for seq in X]
         tsfresh_features, X_df = self._extract_tsfresh_features(X)
-        tsfresh_features = self._filter_features(
-            tsfresh_features, mode="transform", transform_thresholds=transform_thresholds
-        )
-        tsfresh_features = self.robust_standard_scaler_.transform(tsfresh_features)
         if self.augment:
             # Stack the extracted features to the original sequences in X, after padding with np.nans any shorter
             # input sequences in X to match the length of the longest sequence, and imputing missing values as
@@ -462,7 +454,7 @@ def _extract_tsfresh_features(self, X):
             column_id="id",
             column_sort="time",
             impute_function=impute,
-            n_jobs=0,
+            n_jobs=N_TSFRESH_JOBS,
         )
         self.min_settings_card = tsfresh_features.shape[1]
         # Minimal features computed indepdently to ensure they go first in the output,
@@ -473,51 +465,74 @@ def _extract_tsfresh_features(self, X):
             else:
                 settings = ComprehensiveFCParameters()
             settings = {k: v for k, v in settings.items() if k not in min_settings}
-            tsfresh_features_extra = extract_features(
-                X_df_no_nans,
-                default_fc_parameters=settings,
-                column_id="id",
-                column_sort="time",
-                impute_function=impute,
-                n_jobs=0,
-            )
-            self.extra_settings_card = tsfresh_features_extra.shape[1]
-            tsfresh_features = pd.concat([tsfresh_features, tsfresh_features_extra], axis=1)
+
+            self._apply_feature_threshold(settings)
+            if settings:
+                # skip the extra extraction when the expansion threshold has filtered out every extra setting
+                tsfresh_features_extra = extract_features(
+                    X_df_no_nans,
+                    default_fc_parameters=settings,
+                    column_id="id",
+                    column_sort="time",
+                    impute_function=impute,
+                    n_jobs=N_TSFRESH_JOBS,
+                )
+                tsfresh_features = pd.concat([tsfresh_features, tsfresh_features_extra], axis=1)
 
         # If X_df.dropna() dropped some observations entirely (i.e., due to all NaNs),
         # impute each tsfresh feature for those observations with the median of that tsfresh feature
         tsfresh_features_imputed = impute(tsfresh_features.reindex(pd.RangeIndex(X_df["id"].max() + 1)))
         return tsfresh_features_imputed, X_df
 
-    def _filter_features(self, tsfresh_features, mode="transform", transform_thresholds=None):
-        if self.expansion_threshold < self.min_settings_card:
-            raise ValueError(
-                f"Provided filter threshold(s) (= {self.expansion_threshold}) can not be smaller than "
-                f"number of features generated by minimal settings (= {self.min_settings_card})"
-            )
-        filter_order = np.arange(self.min_settings_card, tsfresh_features.shape[1])
+    def _apply_feature_threshold(self, settings):
+        """Trim ``settings`` in place so that the extra features fit the expansion budget.
+
+        The budget for extra features is ``self.expansion_threshold - self.min_settings_card``
+        (never below zero). Every non-list value in ``settings`` counts as one feature and every
+        element of a list value counts as one feature. When the budget is smaller than that
+        count, a pseudo-random subset of features is kept; the draw is reproducible because it
+        depends only on ``self.feature_sampling_seed``.
+
+        When the budget covers every available feature, nothing is filtered.
+
+        Parameters
+        ----------
+        settings : dict
+            Maps a feature-calculator name to ``None`` or to a list of parameter entries
+            (the caller later passes this dict to tsfresh as ``default_fc_parameters``).
+            It is modified in place: "linear_trend_timewise" is always removed, dropped list
+            entries are removed, and keys left without any surviving feature are deleted.
+
+        Returns
+        -------
+        None
+        """
+        settings.pop("linear_trend_timewise", None)  # remove these 5 features that need dateTime indexes for sequences
+        max_available_features = self._get_features_count(settings)
+        if self.expansion_threshold >= max_available_features + self.min_settings_card:
+            return  # no need to limit
+
+        # A private RandomState yields the same permutation as seeding the global generator
+        # with this seed, without touching (and having to restore) numpy's global RNG state.
+        filter_order = np.arange(max_available_features)
+        np.random.RandomState(self.feature_sampling_seed).shuffle(filter_order)
+        # everything after the first n_kept shuffled positions is filtered out
+        n_kept = max(0, self.expansion_threshold - self.min_settings_card)
+        removed_indices = set(filter_order[n_kept:].tolist())
+
+        # Walk the settings in dictionary order, giving each feature a running index that
+        # matches the positions shuffled above.
+        feature_idx = 0
+        for k in list(settings):
+            value = settings[k]
+            if isinstance(value, list):
+                # each list element is counted as a separate feature
+                survived_list = []
+                for entry in value:
+                    if feature_idx not in removed_indices:
+                        survived_list.append(entry)
+                    feature_idx += 1
+                # keep only the survivors; if none survived, drop the key altogether
+                if survived_list:
+                    settings[k] = survived_list
+                else:
+                    del settings[k]
+            else:
+                # any non-list value (None in the settings) counts as one feature
+                if feature_idx in removed_indices:
+                    del settings[k]
+                feature_idx += 1
 
     def _compute_expansion_threshold(self, input_len):
         return int(max(ceil(SEQUENCE_EXPANSION_FACTOR * input_len + 1) + 1, 10))
 
     def _more_tags(self):
         return {"_skip_test": True, "allow_nan": True}
+
+    def _get_features_count(self, settings):  # a list value counts once per element, any other value counts once
+        return sum(1 if not isinstance(entry, list) else len(entry) for entry in settings.values())
diff --git a/test/test_sequence_transformer.py b/test/test_sequence_transformer.py
@@ -13,12 +13,15 @@
 
 import numpy as np
 import pytest
-
+import tsfresh
+import sagemaker_sklearn_extension.feature_extraction.sequences
+import importlib
 from sklearn.utils.testing import assert_array_almost_equal
-
 from sagemaker_sklearn_extension.feature_extraction.sequences import TSFeatureExtractor
 from sagemaker_sklearn_extension.feature_extraction.sequences import TSFlattener
 from sagemaker_sklearn_extension.feature_extraction.sequences import TSFreshFeatureExtractor
+from tsfresh.defaults import N_PROCESSES
+
 
 # To test TSFlattener with and without missing values encoded in different ways
 # with fixed-length inputs
@@ -39,7 +42,11 @@
     [",,,"],
     [",,,"],
 ]
-
+X_sequence_5_columns = [  # three records, each with five comma-separated sequence columns
+    ["1, 2", "11, 12", "1, 4", "11, 99", "7, 4"],
+    ["1, 5", "11, 99", "7, 4", "71, 88", "7, 2"],
+    ["1, 33", "11, 88", "1, 2", "1, 7", "11, 99"],
+]
 
 # with variable-length inputs
 X_sequence_varying_length = [["1, 2"], ["11, 111"], ["2, 3, 1, 4"]]
@@ -68,12 +75,15 @@
 X_all_nan = [[np.nan, np.nan, np.nan, np.nan], [np.nan], [10, 10, 10, 10], [10, 20, 30, 40]]
 X_all_nan_imputed = [[0, 0, 0, 0], [0, 0, 0, 0], [10, 10, 10, 10], [10, 20, 30, 40]]
 # to test that the first tsfresh feature is computed correctly
-X_with_first_feature = np.array([[1, 2, 3, 44, -0.295919], [11, 12, 14, 111, 1.345592], [1, 1, 1, 2, -1.049673]])
+X_with_first_feature = np.array(
+    [[1.0, 2.0, 3.0, 44.0, 50.0], [11.0, 12.0, 14.0, 111.0, 148.0], [1.0, 1.0, 1.0, 2.0, 5.0]]
+)
+
 X_filled_with_first_feature = np.array(
-    [[1.0, 1.0, 3.0, 44.0, -0.236065], [11.0, 12.0, 12.0, 111.0, 1.325594], [0.0, 0.0, 1.0, 0.0, -1.089529]]
+    [[1.0, 1.0, 3.0, 44.0, 48.0], [11.0, 12.0, 12.0, 111.0, 134.0], [0.0, 0.0, 1.0, 0.0, 1.0]]
 )
 X_padded_with_first_feature = np.array(
-    [[1.0, 2.0, 0.0, 0.0, -0.770329], [11.0, 111.0, 0.0, 0.0, 1.41227], [2.0, 3.0, 1.0, 4.0, -0.641941]]
+    [[1.0, 2.0, 0.0, 0.0, 3.0], [11.0, 111.0, 0.0, 0.0, 122.0], [2.0, 3.0, 1.0, 4.0, 10.0]]
 )
 
 
@@ -250,7 +260,85 @@ def test_time_series_expansion_control(sequences_lengths_q25, feats_num):
     assert X_out.shape[1] == feats_num
 
 
+def test_time_series_expansion_control_seed():
+    """With q25 length 5 and seed 27 the first column must yield exactly these 15 feature names."""
+    time_series_feature_extractor = TSFeatureExtractor(
+        extraction_type="efficient", augment=False, sequences_lengths_q25=[5], extraction_seed=27
+    )
+    time_series_feature_extractor.fit(X_sequence)
+    X_out = time_series_feature_extractor.tsfresh_feature_extractors_[0].transform(
+        TSFlattener().transform(np.array(X_sequence).T[0].reshape(-1, 1))
+    )
+    assert sorted(X_out.columns.values) == sorted(  # not list.sort(): it returns None, so the check never failed
+        [
+            "0__sum_values",
+            "0__median",
+            "0__mean",
+            "0__length",
+            "0__standard_deviation",
+            "0__variance",
+            "0__root_mean_square",
+            "0__maximum",
+            "0__minimum",
+            "0__cwt_coefficients__coeff_14__w_5__widths_(2, 5, 10, 20)",
+            '0__fft_coefficient__attr_"imag"__coeff_85',
+            '0__fft_coefficient__attr_"abs"__coeff_19',
+            '0__fft_coefficient__attr_"abs"__coeff_72',
+            '0__fft_coefficient__attr_"angle"__coeff_63',
+            "0__energy_ratio_by_chunks__num_segments_10__segment_focus_4",
+        ]
+    )
+
+
+@pytest.mark.parametrize(
+    "sequences_lengths_q25, expansion_thresholds",
+    [([500, 200, 1000, 2000, 100], [329, 132, 658, 1316, 66]), ([5, 10, 20, 15, 25], [15, 27, 52, 40, 65])],
+)
+def test_time_series_expansion_control_across_columns(sequences_lengths_q25, expansion_thresholds):
+    """Each per-column extractor must end up with the expected expansion threshold after fit."""
+    kwargs = {"extraction_type": "efficient", "augment": False, "sequences_lengths_q25": sequences_lengths_q25}
+    extractor = TSFeatureExtractor(**kwargs)
+    extractor.fit(X_sequence_5_columns)
+    for column_i, column_extractor in enumerate(extractor.tsfresh_feature_extractors_):
+        assert column_extractor.expansion_threshold == expansion_thresholds[column_i]
+
+
+@pytest.mark.parametrize(
+    "settings, expansion_threshold, expected_settings",
+    [
+        ({"k1": None, "k2": ["v1", "v2"], "k3": None}, 1, {"k2": ["v2"]}),
+        ({"k2": ["v1", "v2", "v3"], "k3": None}, 2, {"k2": ["v3"], "k3": None}),
+        ({"k1": None, "k2": ["v1", "v2"], "k3": None}, 15, {"k1": None, "k2": ["v1", "v2"], "k3": None}),
+    ],
+)
+def test_apply_feature_threshold(settings, expansion_threshold, expected_settings):
+    extractor = TSFreshFeatureExtractor(expansion_threshold=expansion_threshold)
+    extractor.min_settings_card = 0  # pretend there are no minimal features, so the whole threshold is usable
+    extractor._apply_feature_threshold(settings)  # filters ``settings`` in place
+    assert expected_settings == settings
+
+
 def test_time_series_all_nan_column():
     time_series_feature_extractor = TSFeatureExtractor(extraction_type="efficient", augment=False)
     X_out = time_series_feature_extractor.fit_transform(X_all_nan_column)
     assert X_out.shape[0] == 4
+
+
+@pytest.mark.parametrize(
+    "env, n_jobs",
+    [
+        (["SAGEMAKER_PROGRAM", "sagemaker_serve"], 0),
+        (["SAGEMAKER_PROGRAM", "train"], N_PROCESSES),
+        (["key", "value"], N_PROCESSES),
+    ],
+)
+def test_tsfresh_extractor_njobs_is_ncpus_when_non_sagemaker_serve_env(monkeypatch, env, n_jobs):
+    monkeypatch.setenv(*env)  # must be set before the reload below, which re-reads the environment
+
+    def checking_extract(*args, **kwargs):
+        assert n_jobs == kwargs["n_jobs"]
+        return tsfresh.extract_features(*args, **kwargs)
+
+    importlib.reload(sagemaker_sklearn_extension.feature_extraction.sequences)
+    monkeypatch.setattr(sagemaker_sklearn_extension.feature_extraction.sequences, "extract_features", checking_extract)
+    TSFeatureExtractor(extraction_type="efficient", augment=False).fit_transform(X_sequence)