Skip to content

Commit 271608a

Browse files
authored
Merge pull request #40 from GiannisMitr/tsfresh_extractor_speedups
TSFeatureExtractor Performance Improvements
2 parents e6cc77f + 47a4f77 commit 271608a

File tree

2 files changed

+166
-63
lines changed

2 files changed

+166
-63
lines changed

src/sagemaker_sklearn_extension/feature_extraction/sequences.py

Lines changed: 72 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13+
import os
1314
from math import ceil
1415

1516
import numpy as np
@@ -21,12 +22,13 @@
2122
from tsfresh.feature_extraction import EfficientFCParameters
2223
from tsfresh.feature_extraction import MinimalFCParameters
2324
from tsfresh.utilities.dataframe_functions import impute
25+
from tsfresh.defaults import N_PROCESSES # the default number of processes used by TSFresh, equals to n_vcores/2
2426

25-
from sagemaker_sklearn_extension.preprocessing.data import RobustStandardScaler
26-
27-
27+
TOTAL_EXPANSION_THRESHOLD = 2500
2828
DEFAULT_INPUT_SEQUENCE_LENGTH = 1000
2929
SEQUENCE_EXPANSION_FACTOR = 2.5
30+
# Do not use TSFresh parallelism when running in the serving container (transform); it does not work with the server's workers.
31+
N_TSFRESH_JOBS = 0 if os.environ.get("SAGEMAKER_PROGRAM") == "sagemaker_serve" else N_PROCESSES
3032

3133

3234
class TSFeatureExtractor(BaseEstimator, TransformerMixin):
@@ -129,7 +131,10 @@ def fit(self, X, y=None):
129131
raise ValueError(
130132
f"length of sequences_lengths_q25 should be equal to number of columns in X (={X.shape[1]})."
131133
)
132-
134+
# cap total expansion for all columns
135+
expansion_thresholds = np.ceil(
136+
(self.sequences_lengths_q25 / np.sum(self.sequences_lengths_q25)) * TOTAL_EXPANSION_THRESHOLD
137+
)
133138
ts_flattener = TSFlattener(max_allowed_length=self.max_allowed_length, trim_beginning=self.trim_beginning)
134139
tsfresh_feature_extractors = []
135140
for sequence_column_i, sequence_column in enumerate(X.T):
@@ -140,6 +145,7 @@ def fit(self, X, y=None):
140145
extraction_type=self.extraction_type,
141146
extraction_seed=self.extraction_seed,
142147
sequence_length_q25=self.sequences_lengths_q25[sequence_column_i],
148+
expansion_threshold=int(expansion_thresholds[sequence_column_i]),
143149
)
144150
tsfresh_feature_extractor.fit(numeric_sequences)
145151
tsfresh_feature_extractors.append(tsfresh_feature_extractor)
@@ -315,12 +321,6 @@ class TSFreshFeatureExtractor(BaseEstimator, TransformerMixin):
315321
List containing the 25th percentile of sequence lengths for each column at the train step.
316322
If not provided, default value will be assigned (DEFAULT_INPUT_SEQUENCE_LENGTH).
317323
318-
Attributes
319-
----------
320-
self.robust_standard_scaler_ : ``sagemaker_sklearn_extension.preprocessing.data.RobustStandardScaler``
321-
- `robust_standard_scaler_` is instantiated inside the fit method used for computing the mean and
322-
the standard deviation.
323-
324324
325325
Examples
326326
--------
@@ -348,23 +348,21 @@ def __init__(
348348
extraction_type="efficient",
349349
extraction_seed=0,
350350
sequence_length_q25=None,
351+
expansion_threshold=None,
351352
):
352353
super().__init__()
353354
self.augment = augment
354355
self.interpolation_method = interpolation_method
355356
self.extraction_type = extraction_type
356357
self.feature_sampling_seed = extraction_seed
357358
self.sequence_length_q25 = sequence_length_q25 or DEFAULT_INPUT_SEQUENCE_LENGTH
358-
self.expansion_threshold = self._compute_expansion_threshold(self.sequence_length_q25)
359-
self.robust_standard_scaler_ = RobustStandardScaler()
359+
expansion_threshold = expansion_threshold or self._compute_expansion_threshold(self.sequence_length_q25)
360+
self.expansion_threshold = min(expansion_threshold, self._compute_expansion_threshold(self.sequence_length_q25))
361+
# expansion_threshold will be the stricter between the one computed for this column and the one respecting
362+
# the total expansion for all columns
360363

361364
def fit(self, X, y=None):
362-
tsfresh_features, _ = self._extract_tsfresh_features(X)
363-
364-
# not all features included due to data expansion control
365-
tsfresh_features = self._filter_features(tsfresh_features, mode="train")
366-
367-
self.robust_standard_scaler_.fit(tsfresh_features)
365+
# Nothing to learn during fit.
368366
return self
369367

370368
def transform(self, X, y=None):
@@ -379,13 +377,7 @@ def transform(self, X, y=None):
379377
tsfresh_features : np.array
380378
381379
"""
382-
check_is_fitted(self, "robust_standard_scaler_")
383-
transform_thresholds = [self._compute_expansion_threshold(len(seq)) for seq in X]
384380
tsfresh_features, X_df = self._extract_tsfresh_features(X)
385-
tsfresh_features = self._filter_features(
386-
tsfresh_features, mode="transform", transform_thresholds=transform_thresholds
387-
)
388-
tsfresh_features = self.robust_standard_scaler_.transform(tsfresh_features)
389381
if self.augment:
390382
# Stack the extracted features to the original sequences in X, after padding with np.nans any shorter
391383
# input sequences in X to match the length of the longest sequence, and imputing missing values as
@@ -462,7 +454,7 @@ def _extract_tsfresh_features(self, X):
462454
column_id="id",
463455
column_sort="time",
464456
impute_function=impute,
465-
n_jobs=0,
457+
n_jobs=N_TSFRESH_JOBS,
466458
)
467459
self.min_settings_card = tsfresh_features.shape[1]
468460
# Minimal features computed independently to ensure they go first in the output,
@@ -473,51 +465,74 @@ def _extract_tsfresh_features(self, X):
473465
else:
474466
settings = ComprehensiveFCParameters()
475467
settings = {k: v for k, v in settings.items() if k not in min_settings}
476-
tsfresh_features_extra = extract_features(
477-
X_df_no_nans,
478-
default_fc_parameters=settings,
479-
column_id="id",
480-
column_sort="time",
481-
impute_function=impute,
482-
n_jobs=0,
483-
)
484-
self.extra_settings_card = tsfresh_features_extra.shape[1]
485-
tsfresh_features = pd.concat([tsfresh_features, tsfresh_features_extra], axis=1)
468+
469+
self._apply_feature_threshold(settings)
470+
if settings:
471+
# check that efficient strategies are not emptied when applying expansion threshold
472+
tsfresh_features_extra = extract_features(
473+
X_df_no_nans,
474+
default_fc_parameters=settings,
475+
column_id="id",
476+
column_sort="time",
477+
impute_function=impute,
478+
n_jobs=N_TSFRESH_JOBS,
479+
)
480+
tsfresh_features = pd.concat([tsfresh_features, tsfresh_features_extra], axis=1)
486481

487482
# If X_df.dropna() dropped some observations entirely (i.e., due to all NaNs),
488483
# impute each tsfresh feature for those observations with the median of that tsfresh feature
489484
tsfresh_features_imputed = impute(tsfresh_features.reindex(pd.RangeIndex(X_df["id"].max() + 1)))
490485
return tsfresh_features_imputed, X_df
491486

492-
def _filter_features(self, tsfresh_features, mode="transform", transform_thresholds=None):
493-
if self.expansion_threshold < self.min_settings_card:
494-
raise ValueError(
495-
f"Provided filter threshold(s) (= {self.expansion_threshold}) can not be smaller than "
496-
f"number of features generated by minimal settings (= {self.min_settings_card})"
497-
)
498-
filter_order = np.arange(self.min_settings_card, tsfresh_features.shape[1])
487+
def _apply_feature_threshold(self, settings):
488+
"""Accepts a settings dictionary, with all the possible generated features,
489+
and filters features if needed until their count matches the given "self.expansion_threshold"
490+
(minus minimal features).
491+
Does that in a reproducible "random" way, controlled by "self.feature_sampling_seed".
492+
Draws Random indexes to be filtered, then iterates over the settings dictionary assigning an index to each value
493+
and performs the filtering based on that index.
494+
"""
495+
settings.pop("linear_trend_timewise", None) # remove these 5 features that need dateTime indexes for sequences
496+
max_available_features = self._get_features_count(settings)
497+
if self.expansion_threshold >= max_available_features + self.min_settings_card:
498+
return # no need to limit
499+
500+
filter_order = np.arange(max_available_features)
499501
random_state = np.random.get_state()
500502
np.random.seed(self.feature_sampling_seed)
501503
np.random.shuffle(filter_order)
502504
np.random.set_state(random_state)
503-
survivors = list(range(self.min_settings_card)) + list(
504-
filter_order[: self.expansion_threshold - self.min_settings_card]
505-
)
506-
tsfresh_features = tsfresh_features.iloc[:, survivors]
507-
508-
if mode == "transform":
509-
if len(transform_thresholds) != tsfresh_features.shape[0]:
510-
raise ValueError(
511-
f"In 'transform' mode transform_thresholds should have number of entries "
512-
f"(= {len(transform_thresholds)}) that corresponds to the number of records "
513-
f"in tsfresh_features (= {tsfresh_features.shape[0]})."
514-
)
515-
for thrsh_i, thrsh in enumerate(transform_thresholds):
516-
tsfresh_features.iloc[thrsh_i, thrsh:] = 0
517-
return tsfresh_features
505+
removed_indices = list(filter_order[max(0, self.expansion_threshold - self.min_settings_card) :])
506+
removed_indices.sort()
507+
508+
feature_idx = 0
509+
for k in list(settings.keys()):
510+
if isinstance(settings[k], list):
511+
survived_list = []
512+
# case the value is a list, each list element is counted separately
513+
for index, _ in enumerate(settings[k]):
514+
if removed_indices and removed_indices[0] == feature_idx:
515+
del removed_indices[0]
516+
else:
517+
survived_list.append(settings[k][index])
518+
feature_idx += 1
519+
# Copy the surviving features to the final list; if none survived, delete the settings key.
520+
if survived_list:
521+
settings[k] = survived_list
522+
else:
523+
del settings[k]
524+
else:
525+
# case the value is None, count it as one feature
526+
if removed_indices and removed_indices[0] == feature_idx:
527+
del removed_indices[0]
528+
del settings[k]
529+
feature_idx += 1
518530

519531
def _compute_expansion_threshold(self, input_len):
520532
return int(max(ceil(SEQUENCE_EXPANSION_FACTOR * input_len + 1) + 1, 10))
521533

522534
def _more_tags(self):
523535
return {"_skip_test": True, "allow_nan": True}
536+
537+
def _get_features_count(self, settings):
538+
return sum([len(v) if isinstance(v, list) else 1 for v in settings.values()])

test/test_sequence_transformer.py

Lines changed: 94 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,15 @@
1313

1414
import numpy as np
1515
import pytest
16-
16+
import tsfresh
17+
import sagemaker_sklearn_extension.feature_extraction.sequences
18+
import importlib
1719
from sklearn.utils.testing import assert_array_almost_equal
18-
1920
from sagemaker_sklearn_extension.feature_extraction.sequences import TSFeatureExtractor
2021
from sagemaker_sklearn_extension.feature_extraction.sequences import TSFlattener
2122
from sagemaker_sklearn_extension.feature_extraction.sequences import TSFreshFeatureExtractor
23+
from tsfresh.defaults import N_PROCESSES
24+
2225

2326
# To test TSFlattener with and without missing values encoded in different ways
2427
# with fixed-length inputs
@@ -39,7 +42,11 @@
3942
[",,,"],
4043
[",,,"],
4144
]
42-
45+
X_sequence_5_columns = [
46+
["1, 2", "11, 12", "1, 4", "11, 99", "7, 4"],
47+
["1, 5", "11, 99", "7, 4", "71, 88", "7, 2"],
48+
["1, 33", "11, 88", "1, 2", "1, 7", "11, 99"],
49+
]
4350

4451
# with variable-length inputs
4552
X_sequence_varying_length = [["1, 2"], ["11, 111"], ["2, 3, 1, 4"]]
@@ -68,12 +75,15 @@
6875
X_all_nan = [[np.nan, np.nan, np.nan, np.nan], [np.nan], [10, 10, 10, 10], [10, 20, 30, 40]]
6976
X_all_nan_imputed = [[0, 0, 0, 0], [0, 0, 0, 0], [10, 10, 10, 10], [10, 20, 30, 40]]
7077
# to test that the first tsfresh feature is computed correctly
71-
X_with_first_feature = np.array([[1, 2, 3, 44, -0.295919], [11, 12, 14, 111, 1.345592], [1, 1, 1, 2, -1.049673]])
78+
X_with_first_feature = np.array(
79+
[[1.0, 2.0, 3.0, 44.0, 50.0], [11.0, 12.0, 14.0, 111.0, 148.0], [1.0, 1.0, 1.0, 2.0, 5.0]]
80+
)
81+
7282
X_filled_with_first_feature = np.array(
73-
[[1.0, 1.0, 3.0, 44.0, -0.236065], [11.0, 12.0, 12.0, 111.0, 1.325594], [0.0, 0.0, 1.0, 0.0, -1.089529]]
83+
[[1.0, 1.0, 3.0, 44.0, 48.0], [11.0, 12.0, 12.0, 111.0, 134.0], [0.0, 0.0, 1.0, 0.0, 1.0]]
7484
)
7585
X_padded_with_first_feature = np.array(
76-
[[1.0, 2.0, 0.0, 0.0, -0.770329], [11.0, 111.0, 0.0, 0.0, 1.41227], [2.0, 3.0, 1.0, 4.0, -0.641941]]
86+
[[1.0, 2.0, 0.0, 0.0, 3.0], [11.0, 111.0, 0.0, 0.0, 122.0], [2.0, 3.0, 1.0, 4.0, 10.0]]
7787
)
7888

7989

@@ -250,7 +260,85 @@ def test_time_series_expansion_control(sequences_lengths_q25, feats_num):
250260
assert X_out.shape[1] == feats_num
251261

252262

263+
def test_time_series_expansion_control_seed():
    """Check that a fixed ``extraction_seed`` yields a specific set of tsfresh feature names.

    Fits a ``TSFeatureExtractor`` with ``extraction_seed=27`` and
    ``sequences_lengths_q25=[5]``, transforms the first sequence column with the
    fitted per-column extractor, and compares the produced column names
    (order-insensitively) against the expected names.
    """
    time_series_feature_extractor = TSFeatureExtractor(
        extraction_type="efficient", augment=False, sequences_lengths_q25=[5], extraction_seed=27
    )
    time_series_feature_extractor.fit(X_sequence)

    # First (and only) sequence column, reshaped to a single-column 2D array.
    first_column = np.array(X_sequence).T[0].reshape(-1, 1)
    X_out = time_series_feature_extractor.tsfresh_feature_extractors_[0].transform(
        TSFlattener().transform(first_column)
    )

    # NOTE(review): the previous assertion compared two ``list.sort()`` return values
    # (both ``None``), so these names were never actually enforced -- confirm them
    # against a real run.
    expected_columns = [
        "0__sum_values",
        "0__median",
        "0__mean",
        "0__length",
        "0__standard_deviation",
        "0__variance",
        "0__root_mean_square",
        "0__maximum",
        "0__minimum",
        "0__cwt_coefficients__coeff_14__w_5__widths_(2, 5, 10, 20)",
        '0__fft_coefficient__attr_"imag"__coeff_85',
        '0__fft_coefficient__attr_"abs"__coeff_19',
        '0__fft_coefficient__attr_"abs"__coeff_72',
        '0__fft_coefficient__attr_"angle"__coeff_63',
        "0__energy_ratio_by_chunks__num_segments_10__segment_focus_4",
    ]

    # ``sorted()`` returns the sorted list, unlike ``list.sort()`` which sorts in
    # place and returns None (making ``a.sort() == b.sort()`` always true).
    assert sorted(X_out.columns.values) == sorted(expected_columns)
291+
292+
293+
@pytest.mark.parametrize(
294+
"sequences_lengths_q25, expansion_thresholds",
295+
[([500, 200, 1000, 2000, 100], [329, 132, 658, 1316, 66]), ([5, 10, 20, 15, 25], [15, 27, 52, 40, 65])],
296+
)
297+
def test_time_series_expansion_control_across_columns(sequences_lengths_q25, expansion_thresholds):
298+
time_series_feature_extractor = TSFeatureExtractor(
299+
extraction_type="efficient", augment=False, sequences_lengths_q25=sequences_lengths_q25
300+
)
301+
time_series_feature_extractor.fit(X_sequence_5_columns)
302+
for i, extractor in enumerate(time_series_feature_extractor.tsfresh_feature_extractors_):
303+
assert expansion_thresholds[i] == extractor.expansion_threshold
304+
305+
306+
@pytest.mark.parametrize(
307+
"settings, expansion_threshold, expected_settings",
308+
[
309+
({"k1": None, "k2": ["v1", "v2"], "k3": None}, 1, {"k2": ["v2"]}),
310+
({"k2": ["v1", "v2", "v3"], "k3": None}, 2, {"k2": ["v3"], "k3": None}),
311+
({"k1": None, "k2": ["v1", "v2"], "k3": None}, 15, {"k1": None, "k2": ["v1", "v2"], "k3": None}),
312+
],
313+
)
314+
def test_apply_feature_threshold(settings, expansion_threshold, expected_settings):
315+
time_series_feature_extractor = TSFreshFeatureExtractor(expansion_threshold=expansion_threshold)
316+
time_series_feature_extractor.min_settings_card = 0
317+
time_series_feature_extractor._apply_feature_threshold(settings)
318+
assert settings == expected_settings
319+
320+
253321
def test_time_series_all_nan_column():
254322
time_series_feature_extractor = TSFeatureExtractor(extraction_type="efficient", augment=False)
255323
X_out = time_series_feature_extractor.fit_transform(X_all_nan_column)
256324
assert X_out.shape[0] == 4
325+
326+
327+
@pytest.mark.parametrize(
328+
"env, n_jobs",
329+
[
330+
(["SAGEMAKER_PROGRAM", "sagemaker_serve"], 0),
331+
(["SAGEMAKER_PROGRAM", "train"], N_PROCESSES),
332+
(["key", "value"], N_PROCESSES),
333+
],
334+
)
335+
def test_tsfresh_extractor_njobs_is_ncpus_when_non_sagemaker_serve_env(monkeypatch, env, n_jobs):
336+
monkeypatch.setenv(env[0], env[1])
337+
338+
def mocked_extract(*args, **kwargs):
339+
assert (kwargs["n_jobs"]) == n_jobs
340+
return tsfresh.extract_features(*args, **kwargs)
341+
342+
importlib.reload(sagemaker_sklearn_extension.feature_extraction.sequences)
343+
monkeypatch.setattr("sagemaker_sklearn_extension.feature_extraction.sequences.extract_features", mocked_extract)
344+
TSFeatureExtractor(extraction_type="efficient", augment=False).fit_transform(X_sequence)

0 commit comments

Comments
 (0)