From 430fb940822e9a9be352a29ead70b319e15d9574 Mon Sep 17 00:00:00 2001 From: Matthew Middlehurst Date: Wed, 25 Oct 2023 14:14:02 +0100 Subject: [PATCH 1/8] sweep rules update --- sweep.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sweep.yaml b/sweep.yaml index 7ed6408d..d3a35d76 100644 --- a/sweep.yaml +++ b/sweep.yaml @@ -1,6 +1,6 @@ gha_enabled: True branch: "main" -blocked_dirs: [".github/", "_tsml_group_experiments", "results/", "tsml_eval/publications/"] +blocked_dirs: [".github/", "_tsml_group_experiments/", "results/", "tsml_eval/publications/"] draft: False description: "time-series-machine-learning/tsml-eval is a Python project for running experiments on time series machine learning algorithms and evaluating the results. Write code that adheres to PEP8 and is formatted to the Black code style. The projects documentation is built using Sphinx and MyST, and unit testing is done using pytest." @@ -21,5 +21,6 @@ sandbox: - pre-commit run --files {file_path} rules: - - "There should not be large chunks of code that are just commented out. Docstrings and explanations in code are okay though." - - "Update the relevant API page in 'docs/api.md' when new public functions and classes are added and not included in the API documentation. Only add functions and classes which are not already in the relevant API documentation and avoid duplicate entries. Files in 'tsml_eval/publications/' do not need to be added to the API documentation." + - "Any clearly inefficient or redundant code can be optimized or refactored. Any improvements should not change the functionality of the code." + - "All public classes and functions except test functions should have a `numpydoc` style docstring. This should include a description of the class or function, the parameters, the class attributes or function return values, and a usage example for the class or function." + - "Update the relevant API page in `docs/api_reference/` when new public functions and classes are added and not included in the API documentation. For example, if a new function is added to `aeon/distances/`, a `sphinx.ext.autosummary` link should also be added to `docs/api_reference/distances.rst`. New sections in the page should not be created for individual functions and classes, add it to the most relevant existing one. Only add functions and classes which are not already in the relevant API page and avoid duplicate entries." 
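The docstring rule added above asks for numpydoc formatting with a description, parameters, returns/attributes, and a usage example. A minimal sketch of that layout, using a hypothetical helper that is not part of this patch:

def scale_series(X, factor=1.0):
    """Scale a time series by a constant factor.

    Parameters
    ----------
    X : np.ndarray
        The series to scale.
    factor : float, default=1.0
        The multiplier applied to every value in the series.

    Returns
    -------
    np.ndarray
        The scaled series.

    Examples
    --------
    >>> import numpy as np
    >>> scale_series(np.array([1.0, 2.0, 3.0]), factor=2.0)
    array([2., 4., 6.])
    """
    return X * factor
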
From b1544883b054edff39822ec3712fa76aae31ceef Mon Sep 17 00:00:00 2001 From: Matthew Middlehurst Date: Wed, 25 Oct 2023 14:16:19 +0100 Subject: [PATCH 2/8] sphinx upper bound --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3c1a4a08..46e39b1d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,7 @@ binder = [ "jupyterlab", ] docs = [ - "sphinx", + "sphinx<8.0.0", "sphinx-design", "nbsphinx", "numpydoc", From f92ed7c7ff132b2332cef7cbb3d5b0b2e6290924 Mon Sep 17 00:00:00 2001 From: Matthew Middlehurst Date: Wed, 25 Oct 2023 14:41:38 +0100 Subject: [PATCH 3/8] aeon version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 46e39b1d..346dbfcc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] dependencies = [ - "aeon>=0.4.0,<0.5.0", + "aeon>=0.5.0,<0.6.0", "scikit-learn>=1.0.2,<=1.2.2", "tsml>=0.2.0,<0.3.0", "gpustat", From 5311e49ec1bcc935b9e7c516246d339f52607af5 Mon Sep 17 00:00:00 2001 From: Matthew Middlehurst Date: Wed, 25 Oct 2023 17:34:07 +0100 Subject: [PATCH 4/8] revert --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 346dbfcc..46e39b1d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] dependencies = [ - "aeon>=0.5.0,<0.6.0", + "aeon>=0.4.0,<0.5.0", "scikit-learn>=1.0.2,<=1.2.2", "tsml>=0.2.0,<0.3.0", "gpustat", From 1d8790cc3f43fce50957fd33567cf31dde877977 Mon Sep 17 00:00:00 2001 From: Matthew Middlehurst Date: Wed, 13 Dec 2023 00:10:22 +0000 Subject: [PATCH 5/8] param --- tsml_eval/experiments/experiments.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tsml_eval/experiments/experiments.py b/tsml_eval/experiments/experiments.py index 7b1ed501..3b09c747 100644 --- a/tsml_eval/experiments/experiments.py +++ b/tsml_eval/experiments/experiments.py @@ -59,6 +59,7 @@ def run_classification_experiment( resample_id=None, build_test_file=True, build_train_file=False, + attribute_output_path=None, ): """Run a classification experiment and save the results to file. @@ -154,6 +155,9 @@ def run_classification_experiment( classifier.fit(X_train, y_train) fit_time = int(round(time.time() * 1000)) - start + if attribute_output_path is not None: + estimator_attributes_to_file(classifier, attribute_output_path) + if build_test_file: start = int(round(time.time() * 1000)) test_probs = classifier.predict_proba(X_test) @@ -323,6 +327,7 @@ def run_regression_experiment( resample_id=None, build_test_file=True, build_train_file=False, + attribute_output_path=None, ): """Run a regression experiment and save the results to file. @@ -412,6 +417,9 @@ def run_regression_experiment( round(getattr(regressor, "_fit_time", 0) * 1000) ) + if attribute_output_path is not None: + estimator_attributes_to_file(regressor, attribute_output_path) + if build_test_file: start = int(round(time.time() * 1000)) test_preds = regressor.predict(X_test) @@ -575,6 +583,7 @@ def run_clustering_experiment( resample_id=None, build_test_file=False, build_train_file=True, + attribute_output_path=None, ): """Run a clustering experiment and save the results to file. 
@@ -680,10 +689,6 @@ def run_clustering_experiment( elif n_clusters is not None: raise ValueError("n_clusters must be an int or None.") - start = int(round(time.time() * 1000)) - clusterer.fit(X_train) - fit_time = int(round(time.time() * 1000)) - start - first_comment = ( "Generated by run_clustering_experiment on " f"{datetime.now().strftime('%m/%d/%Y, %H:%M:%S')}. " @@ -692,6 +697,13 @@ def run_clustering_experiment( second = str(clusterer.get_params()).replace("\n", " ").replace("\r", " ") + start = int(round(time.time() * 1000)) + clusterer.fit(X_train) + fit_time = int(round(time.time() * 1000)) - start + + if attribute_output_path is not None: + estimator_attributes_to_file(clusterer, attribute_output_path) + start = int(round(time.time() * 1000)) if callable(getattr(clusterer, "predict_proba", None)): train_probs = clusterer.predict_proba(X_train) @@ -865,6 +877,7 @@ def run_forecasting_experiment( forecaster_name=None, dataset_name="N/A", random_seed=None, + attribute_output_path=None, ): """Run a forecasting experiment and save the results to file. @@ -908,6 +921,9 @@ def run_forecasting_experiment( forecaster.fit(train) fit_time = int(round(time.time() * 1000)) - start + if attribute_output_path is not None: + estimator_attributes_to_file(classifier, attribute_output_path) + start = int(round(time.time() * 1000)) test_preds = forecaster.predict(np.arange(1, len(test) + 1)) test_time = int(round(time.time() * 1000)) - start From 29de807b025e13b95770327d6127b90afb32ff93 Mon Sep 17 00:00:00 2001 From: Matthew Middlehurst Date: Wed, 13 Dec 2023 16:59:20 +0000 Subject: [PATCH 6/8] add function to write attributes to file --- conftest.py | 12 +++ docs/api.md | 3 + tsml_eval/experiments/experiments.py | 25 +++--- tsml_eval/utils/experiments.py | 86 +++++++++++++++++-- .../utils/tests/test_attribute_writing.py | 22 +++++ 5 files changed, 128 insertions(+), 20 deletions(-) create mode 100644 tsml_eval/utils/tests/test_attribute_writing.py diff --git a/conftest.py b/conftest.py index 7ee30853..dc8a0996 100644 --- a/conftest.py +++ b/conftest.py @@ -2,7 +2,19 @@ __author__ = ["MatthewMiddlehurst"] +import shutil + from tsml_eval.experiments import experiments +from tsml_eval.testing.test_utils import _TEST_OUTPUT_PATH + + +def pytest_sessionfinish(session, exitstatus): + """ + Called after test run is finished, right before returning the exit status to + the system. + """ + if not hasattr(session.config, "workerinput"): + shutil.rmtree(_TEST_OUTPUT_PATH) def pytest_addoption(parser): diff --git a/docs/api.md b/docs/api.md index 9010dc72..9a844966 100644 --- a/docs/api.md +++ b/docs/api.md @@ -54,6 +54,8 @@ Functions for running experiments. experiments.load_and_run_regression_experiment experiments.run_clustering_experiment experiments.load_and_run_clustering_experiment + experiments.run_forecasting_experiment + experiments.load_and_run_forecasting_experiment ``` ## Utilities: [tsml_eval.utils](https://github.com/time-series-machine-learning/tsml-eval/tree/main/tsml_eval/utils) @@ -77,6 +79,7 @@ Public utility functions used elsewhere in the package. 
utils.experiments.compare_result_file_resample utils.experiments.assign_gpu utils.experiments.timing_benchmark + utils.experiments.estimator_attributes_to_file utils.functions.str_in_nested_list utils.functions.pair_list_to_dict utils.functions.time_to_milliseconds diff --git a/tsml_eval/experiments/experiments.py b/tsml_eval/experiments/experiments.py index 98f5aa9b..72b39c55 100644 --- a/tsml_eval/experiments/experiments.py +++ b/tsml_eval/experiments/experiments.py @@ -36,6 +36,7 @@ ) from tsml_eval.evaluation.metrics import clustering_accuracy_score from tsml_eval.utils.experiments import ( + estimator_attributes_to_file, load_experiment_data, resample_data, stratified_resample_data, @@ -66,7 +67,7 @@ def run_classification_experiment( resample_id=None, build_test_file=True, build_train_file=False, - attribute_output_path=None, + attribute_file_path=None, benchmark_time=True, ): """Run a classification experiment and save the results to file. @@ -176,8 +177,8 @@ def run_classification_experiment( ) fit_time += int(round(getattr(classifier, "_fit_time_milli", 0))) - if attribute_output_path is not None: - estimator_attributes_to_file(classifier, attribute_output_path) + if attribute_file_path is not None: + estimator_attributes_to_file(classifier, attribute_file_path) if build_test_file: start = int(round(time.time() * 1000)) @@ -360,7 +361,7 @@ def run_regression_experiment( resample_id=None, build_test_file=True, build_train_file=False, - attribute_output_path=None, + attribute_file_path=None, benchmark_time=True, ): """Run a regression experiment and save the results to file. @@ -462,8 +463,8 @@ def run_regression_experiment( ) fit_time += int(round(getattr(regressor, "_fit_time_milli", 0))) - if attribute_output_path is not None: - estimator_attributes_to_file(regressor, attribute_output_path) + if attribute_file_path is not None: + estimator_attributes_to_file(regressor, attribute_file_path) if build_test_file: start = int(round(time.time() * 1000)) @@ -636,7 +637,7 @@ def run_clustering_experiment( resample_id=None, build_test_file=False, build_train_file=True, - attribute_output_path=None, + attribute_file_path=None, benchmark_time=True, ): """Run a clustering experiment and save the results to file. @@ -768,8 +769,8 @@ def run_clustering_experiment( ) fit_time += int(round(getattr(clusterer, "_fit_time_milli", 0))) - if attribute_output_path is not None: - estimator_attributes_to_file(clusterer, attribute_output_path) + if attribute_file_path is not None: + estimator_attributes_to_file(clusterer, attribute_file_path) start = int(round(time.time() * 1000)) if callable(getattr(clusterer, "predict_proba", None)): @@ -987,7 +988,7 @@ def run_forecasting_experiment( forecaster_name=None, dataset_name="N/A", random_seed=None, - attribute_output_path=None, + attribute_file_path=None, benchmark_time=True, ): """Run a forecasting experiment and save the results to file. 
@@ -1043,8 +1044,8 @@ def run_forecasting_experiment( ) fit_time += int(round(getattr(forecaster, "_fit_time_milli", 0))) - if attribute_output_path is not None: - estimator_attributes_to_file(classifier, attribute_output_path) + if attribute_file_path is not None: + estimator_attributes_to_file(forecaster, attribute_file_path) start = int(round(time.time() * 1000)) test_preds = forecaster.predict(np.arange(1, len(test) + 1)) diff --git a/tsml_eval/utils/experiments.py b/tsml_eval/utils/experiments.py index 1984bea8..c880e9d6 100644 --- a/tsml_eval/utils/experiments.py +++ b/tsml_eval/utils/experiments.py @@ -14,13 +14,16 @@ "compare_result_file_resample", "assign_gpu", "timing_benchmark", + "estimator_attributes_to_file", ] import os import time +from collections.abc import Sequence import gpustat import numpy as np +from sklearn.base import BaseEstimator from sklearn.utils import check_random_state from tsml.datasets import load_from_ts_file @@ -763,10 +766,7 @@ def write_results_to_tsml_format( if not full_path: file_path = f"{file_path}/{estimator_name}/Predictions/{dataset_name}/" - try: - os.makedirs(file_path) - except os.error: - pass # raises os.error if path already exists, so just ignore this + os.makedirs(file_path, exist_ok=True) if split is None: split = "" @@ -892,10 +892,7 @@ def fix_broken_second_line(file_path, save_path=None): if save_path is None: save_path = file_path - try: - os.makedirs(os.path.dirname(save_path)) - except os.error: - pass # raises os.error if path already exists, so just ignore this + os.makedirs(os.path.dirname(save_path), exist_ok=True) with open(save_path, "w") as f: f.writelines(lines) @@ -1008,3 +1005,76 @@ def timing_benchmark(num_arrays=1000, array_size=20000, random_state=None): total_time += end_time - start_time return int(round(total_time * 1000)) + + +def estimator_attributes_to_file( + estimator, dir_path, estimator_name=None, max_depth=np.inf, max_list_shape=np.inf +): + estimator_name = ( + estimator.__class__.__name__ if estimator_name is None else estimator_name + ) + _write_estimator_attributes_recursive( + estimator, dir_path, estimator_name, 0, max_depth, max_list_shape + ) + + +def _write_estimator_attributes_recursive( + estimator, dir_path, file_name, depth, max_depth, max_list_shape +): + if depth > max_depth: + return + + path = f"{dir_path}/{file_name}.txt" + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w") as file: + for attr in estimator.__dict__: + value = getattr(estimator, attr) + file.write(f"{attr}: {value}\n") + + if isinstance(value, BaseEstimator): + new_dir_path = f"{dir_path}/{attr}/" + file.write(f" See {new_dir_path}{attr}.txt for more details\n") + _write_estimator_attributes_recursive( + value, new_dir_path, attr, depth + 1, max_depth, max_list_shape + ) + elif _is_non_string_sequence(value): + _write_list_attributes_recursive( + value, file, dir_path, attr, depth + 1, max_depth, 0, max_list_shape + ) + + +def _write_list_attributes_recursive( + it, file, dir_path, file_name, depth, max_depth, shape, max_list_shape +): + if shape > max_list_shape: + return + + for idx, item in enumerate(it): + if isinstance(item, BaseEstimator): + new_dir_path = f"{dir_path}/{file_name}_{idx}/" + file.write( + f" See {new_dir_path}{file_name}_{idx}.txt for more details\n" + ) + _write_estimator_attributes_recursive( + item, + new_dir_path, + f"{file_name}_{idx}", + depth, + max_depth, + max_list_shape, + ) + elif _is_non_string_sequence(item): + _write_list_attributes_recursive( + item, + file, + 
dir_path, + f"{file_name}_{idx}", + depth, + max_depth, + shape + 1, + max_list_shape, + ) + + +def _is_non_string_sequence(obj): + return isinstance(obj, Sequence) and not isinstance(obj, (str, bytes, bytearray)) diff --git a/tsml_eval/utils/tests/test_attribute_writing.py b/tsml_eval/utils/tests/test_attribute_writing.py new file mode 100644 index 00000000..20dbd6c6 --- /dev/null +++ b/tsml_eval/utils/tests/test_attribute_writing.py @@ -0,0 +1,22 @@ +import os + +from aeon.classification.shapelet_based import ShapeletTransformClassifier +from aeon.classification.sklearn import RotationForestClassifier +from tsml.datasets import load_minimal_chinatown + +from tsml_eval.testing.test_utils import _TEST_OUTPUT_PATH +from tsml_eval.utils.experiments import estimator_attributes_to_file + + +def test_file_creation(): + estimator = ShapeletTransformClassifier( + n_shapelet_samples=50, + estimator=RotationForestClassifier(n_estimators=2), + ) + X, y = load_minimal_chinatown() + estimator.fit(X, y) + + test_dir = _TEST_OUTPUT_PATH + "/attribute_writing/" + estimator_attributes_to_file(estimator, test_dir) + + assert os.path.exists(test_dir + "ShapeletTransformClassifier.txt") From a49fc17d67f8ed1f028b09d25354d7614ef97bfa Mon Sep 17 00:00:00 2001 From: Matthew Middlehurst Date: Thu, 14 Dec 2023 14:56:04 +0000 Subject: [PATCH 7/8] fixes and arguments --- conftest.py | 17 ++++-- .../experiments/classification_experiments.py | 8 +++ .../experiments/clustering_experiments.py | 8 +++ tsml_eval/experiments/experiments.py | 56 ++++++++++++++++++- .../experiments/forecasting_experiments.py | 8 +++ .../experiments/regression_experiments.py | 8 +++ .../threaded_classification_experiments.py | 8 +++ .../threaded_clustering_experiments.py | 8 +++ .../threaded_forecasting_experiments.py | 8 +++ .../threaded_regression_experiments.py | 8 +++ tsml_eval/utils/arguments.py | 28 +++++++++- tsml_eval/utils/experiments.py | 52 +++++++++++++---- .../utils/tests/test_attribute_writing.py | 44 ++++++++++++++- .../utils/tests/test_misc_experiments.py | 8 ++- 14 files changed, 250 insertions(+), 19 deletions(-) diff --git a/conftest.py b/conftest.py index dc8a0996..a1208bdc 100644 --- a/conftest.py +++ b/conftest.py @@ -7,13 +7,12 @@ from tsml_eval.experiments import experiments from tsml_eval.testing.test_utils import _TEST_OUTPUT_PATH +KEEP_PYTEST_OUTPUT = True + def pytest_sessionfinish(session, exitstatus): - """ - Called after test run is finished, right before returning the exit status to - the system. 
- """ - if not hasattr(session.config, "workerinput"): + """Call after test run is finished, before returning the exit status to system.""" + if not hasattr(session.config, "workerinput") and not KEEP_PYTEST_OUTPUT: shutil.rmtree(_TEST_OUTPUT_PATH) @@ -26,8 +25,16 @@ def pytest_addoption(parser): help="Set the time interval in seconds for recording memory usage " "(default: %(default)s).", ) + parser.addoption( + "--keepoutput", + action="store_true", + help="Keep the unit test output folder after running pytest" + " (default: %(default)s).", + ) def pytest_configure(config): """Pytest configuration preamble.""" experiments.MEMRECORD_INTERVAL = config.getoption("--meminterval") + global KEEP_PYTEST_OUTPUT + KEEP_PYTEST_OUTPUT = config.getoption("--keepoutput") diff --git a/tsml_eval/experiments/classification_experiments.py b/tsml_eval/experiments/classification_experiments.py index ae65f40f..549379d3 100644 --- a/tsml_eval/experiments/classification_experiments.py +++ b/tsml_eval/experiments/classification_experiments.py @@ -84,6 +84,8 @@ def run_experiment(args): classifier_name=args.estimator_name, resample_id=args.resample_id, build_train_file=args.train_fold, + write_attributes=args.write_attributes, + att_max_shape=args.att_max_shape, benchmark_time=args.benchmark_time, overwrite=args.overwrite, predefined_resample=args.predefined_resample, @@ -100,6 +102,9 @@ def run_experiment(args): row_normalise = False resample_id = 0 train_fold = False + write_attributes = True + att_max_shape = 0 + benchmark_time = True overwrite = False predefined_resample = False fit_contract = 0 @@ -126,6 +131,9 @@ def run_experiment(args): classifier_name=estimator_name, resample_id=resample_id, build_train_file=train_fold, + write_attributes=write_attributes, + att_max_shape=att_max_shape, + benchmark_time=benchmark_time, overwrite=overwrite, predefined_resample=predefined_resample, ) diff --git a/tsml_eval/experiments/clustering_experiments.py b/tsml_eval/experiments/clustering_experiments.py index fc914318..09708acd 100644 --- a/tsml_eval/experiments/clustering_experiments.py +++ b/tsml_eval/experiments/clustering_experiments.py @@ -91,6 +91,8 @@ def run_experiment(args): clusterer_name=args.estimator_name, resample_id=args.resample_id, build_test_file=args.test_fold, + write_attributes=args.write_attributes, + att_max_shape=args.att_max_shape, benchmark_time=args.benchmark_time, overwrite=args.overwrite, predefined_resample=args.predefined_resample, @@ -109,6 +111,9 @@ def run_experiment(args): n_clusters = -1 resample_id = 0 test_fold = False + write_attributes = True + att_max_shape = 0 + benchmark_time = True overwrite = False predefined_resample = False fit_contract = 0 @@ -138,6 +143,9 @@ def run_experiment(args): clusterer_name=estimator_name, resample_id=resample_id, build_test_file=test_fold, + write_attributes=write_attributes, + att_max_shape=att_max_shape, + benchmark_time=benchmark_time, overwrite=overwrite, predefined_resample=predefined_resample, combine_train_test_split=combine_test_train_split, diff --git a/tsml_eval/experiments/experiments.py b/tsml_eval/experiments/experiments.py index 72b39c55..c4c6d2cb 100644 --- a/tsml_eval/experiments/experiments.py +++ b/tsml_eval/experiments/experiments.py @@ -68,6 +68,7 @@ def run_classification_experiment( build_test_file=True, build_train_file=False, attribute_file_path=None, + att_max_shape=0, benchmark_time=True, ): """Run a classification experiment and save the results to file. 
@@ -178,7 +179,9 @@ def run_classification_experiment( fit_time += int(round(getattr(classifier, "_fit_time_milli", 0))) if attribute_file_path is not None: - estimator_attributes_to_file(classifier, attribute_file_path) + estimator_attributes_to_file( + classifier, attribute_file_path, max_list_shape=att_max_shape + ) if build_test_file: start = int(round(time.time() * 1000)) @@ -263,6 +266,8 @@ def load_and_run_classification_experiment( classifier_name=None, resample_id=0, build_train_file=False, + write_attributes=False, + att_max_shape=0, benchmark_time=True, overwrite=False, predefined_resample=False, @@ -308,6 +313,9 @@ def load_and_run_classification_experiment( the file format must include the resample_id at the end of the dataset name i.e. //++"_TRAIN.ts". """ + if classifier_name is None: + classifier_name = type(classifier).__name__ + build_test_file, build_train_file = _check_existing_results( results_path, classifier_name, @@ -331,6 +339,11 @@ def load_and_run_classification_experiment( X_train, y_train, X_test, y_test, random_state=resample_id ) + if write_attributes: + attribute_file_path = f"{results_path}/{classifier_name}/Workspace/{dataset}/" + else: + attribute_file_path = None + run_classification_experiment( X_train, y_train, @@ -344,6 +357,8 @@ def load_and_run_classification_experiment( resample_id=resample_id, build_test_file=build_test_file, build_train_file=build_train_file, + attribute_file_path=attribute_file_path, + att_max_shape=att_max_shape, benchmark_time=benchmark_time, ) @@ -362,6 +377,7 @@ def run_regression_experiment( build_test_file=True, build_train_file=False, attribute_file_path=None, + att_max_shape=0, benchmark_time=True, ): """Run a regression experiment and save the results to file. @@ -534,6 +550,8 @@ def load_and_run_regression_experiment( regressor_name=None, resample_id=0, build_train_file=False, + write_attributes=False, + att_max_shape=0, benchmark_time=True, overwrite=False, predefined_resample=False, @@ -579,6 +597,9 @@ def load_and_run_regression_experiment( the file format must include the resample_id at the end of the dataset name i.e. //++"_TRAIN.ts". """ + if regressor_name is None: + regressor_name = type(regressor).__name__ + build_test_file, build_train_file = _check_existing_results( results_path, regressor_name, @@ -602,6 +623,11 @@ def load_and_run_regression_experiment( X_train, y_train, X_test, y_test, random_state=resample_id ) + if write_attributes: + attribute_file_path = f"{results_path}/{regressor_name}/Workspace/{dataset}/" + else: + attribute_file_path = None + # Ensure labels are floats y_train = y_train.astype(float) y_test = y_test.astype(float) @@ -619,6 +645,8 @@ def load_and_run_regression_experiment( resample_id=resample_id, build_test_file=build_test_file, build_train_file=build_train_file, + attribute_file_path=attribute_file_path, + att_max_shape=att_max_shape, benchmark_time=benchmark_time, ) @@ -638,6 +666,7 @@ def run_clustering_experiment( build_test_file=False, build_train_file=True, attribute_file_path=None, + att_max_shape=0, benchmark_time=True, ): """Run a clustering experiment and save the results to file. @@ -873,6 +902,8 @@ def load_and_run_clustering_experiment( clusterer_name=None, resample_id=0, build_test_file=False, + write_attributes=False, + att_max_shape=0, benchmark_time=True, overwrite=False, predefined_resample=False, @@ -926,6 +957,9 @@ def load_and_run_clustering_experiment( the train/test split is combined into a single train set. 
If False then the train/test split is used as normal. """ + if clusterer_name is None: + clusterer_name = type(clusterer).__name__ + if combine_train_test_split: build_test_file = False @@ -952,6 +986,11 @@ def load_and_run_clustering_experiment( X_train, y_train, X_test, y_test, random_state=resample_id ) + if write_attributes: + attribute_file_path = f"{results_path}/{clusterer_name}/Workspace/{dataset}/" + else: + attribute_file_path = None + if combine_train_test_split: y_train = np.concatenate((y_train, y_test), axis=None) X_train = ( @@ -976,6 +1015,8 @@ def load_and_run_clustering_experiment( resample_id=resample_id, build_train_file=build_train_file, build_test_file=build_test_file, + attribute_file_path=attribute_file_path, + att_max_shape=att_max_shape, benchmark_time=benchmark_time, ) @@ -989,6 +1030,7 @@ def run_forecasting_experiment( dataset_name="N/A", random_seed=None, attribute_file_path=None, + att_max_shape=0, benchmark_time=True, ): """Run a forecasting experiment and save the results to file. @@ -1085,6 +1127,8 @@ def load_and_run_forecasting_experiment( forecaster, forecaster_name=None, random_seed=None, + write_attributes=False, + att_max_shape=0, benchmark_time=True, overwrite=False, ): @@ -1118,6 +1162,9 @@ def load_and_run_forecasting_experiment( If set to False, this will only build results if there is not a result file already present. If True, it will overwrite anything already there. """ + if forecaster_name is None: + forecaster_name = type(forecaster).__name__ + build_test_file, _ = _check_existing_results( results_path, forecaster_name, @@ -1132,6 +1179,11 @@ def load_and_run_forecasting_experiment( warnings.warn("All files exist and not overwriting, skipping.", stacklevel=1) return + if write_attributes: + attribute_file_path = f"{results_path}/{forecaster_name}/Workspace/{dataset}/" + else: + attribute_file_path = None + train = pd.read_csv( f"{problem_path}/{dataset}/{dataset}_TRAIN.csv", index_col=0 ).squeeze("columns") @@ -1149,6 +1201,8 @@ def load_and_run_forecasting_experiment( forecaster_name=forecaster_name, dataset_name=dataset, random_seed=random_seed, + attribute_file_path=attribute_file_path, + att_max_shape=att_max_shape, benchmark_time=benchmark_time, ) diff --git a/tsml_eval/experiments/forecasting_experiments.py b/tsml_eval/experiments/forecasting_experiments.py index 21c5cf4d..405c9357 100644 --- a/tsml_eval/experiments/forecasting_experiments.py +++ b/tsml_eval/experiments/forecasting_experiments.py @@ -77,6 +77,8 @@ def run_experiment(args, overwrite=False): random_seed=args.resample_id if args.random_seed is None else args.random_seed, + write_attributes=args.write_attributes, + att_max_shape=args.att_max_shape, benchmark_time=args.benchmark_time, overwrite=args.overwrite, ) @@ -90,6 +92,9 @@ def run_experiment(args, overwrite=False): estimator_name = "NaiveForecaster" dataset_name = "ShampooSales" random_seed = 0 + write_attributes = True + att_max_shape = 0 + benchmark_time = True overwrite = False kwargs = {} @@ -108,6 +113,9 @@ def run_experiment(args, overwrite=False): forecaster, forecaster_name=estimator_name, random_seed=random_seed, + write_attributes=write_attributes, + att_max_shape=att_max_shape, + benchmark_time=benchmark_time, overwrite=overwrite, ) diff --git a/tsml_eval/experiments/regression_experiments.py b/tsml_eval/experiments/regression_experiments.py index 03da48fd..137ac6c6 100644 --- a/tsml_eval/experiments/regression_experiments.py +++ b/tsml_eval/experiments/regression_experiments.py @@ -84,6 +84,8 @@ def 
run_experiment(args): regressor_name=args.estimator_name, resample_id=args.resample_id, build_train_file=args.train_fold, + write_attributes=args.write_attributes, + att_max_shape=args.att_max_shape, benchmark_time=args.benchmark_time, overwrite=args.overwrite, predefined_resample=args.predefined_resample, @@ -100,6 +102,9 @@ def run_experiment(args): row_normalise = False resample_id = 0 train_fold = False + write_attributes = True + att_max_shape = 0 + benchmark_time = True overwrite = False predefined_resample = False fit_contract = 0 @@ -126,6 +131,9 @@ def run_experiment(args): regressor_name=estimator_name, resample_id=resample_id, build_train_file=train_fold, + write_attributes=write_attributes, + att_max_shape=att_max_shape, + benchmark_time=benchmark_time, overwrite=overwrite, predefined_resample=predefined_resample, ) diff --git a/tsml_eval/experiments/threaded_classification_experiments.py b/tsml_eval/experiments/threaded_classification_experiments.py index 8d92d072..284dd8d9 100644 --- a/tsml_eval/experiments/threaded_classification_experiments.py +++ b/tsml_eval/experiments/threaded_classification_experiments.py @@ -60,6 +60,8 @@ def run_experiment(args): classifier_name=args.estimator_name, resample_id=args.resample_id, build_train_file=args.train_fold, + write_attributes=args.write_attributes, + att_max_shape=args.att_max_shape, benchmark_time=args.benchmark_time, overwrite=args.overwrite, predefined_resample=args.predefined_resample, @@ -76,6 +78,9 @@ def run_experiment(args): resample_id = 0 n_jobs = 1 train_fold = False + write_attributes = True + att_max_shape = 0 + benchmark_time = True overwrite = False predefined_resample = False fit_contract = 0 @@ -102,6 +107,9 @@ def run_experiment(args): classifier_name=estimator_name, resample_id=resample_id, build_train_file=train_fold, + write_attributes=write_attributes, + att_max_shape=att_max_shape, + benchmark_time=benchmark_time, overwrite=overwrite, predefined_resample=predefined_resample, ) diff --git a/tsml_eval/experiments/threaded_clustering_experiments.py b/tsml_eval/experiments/threaded_clustering_experiments.py index f26f15cf..3c97ce99 100644 --- a/tsml_eval/experiments/threaded_clustering_experiments.py +++ b/tsml_eval/experiments/threaded_clustering_experiments.py @@ -68,6 +68,8 @@ def run_experiment(args): clusterer_name=args.estimator_name, resample_id=args.resample_id, build_test_file=args.test_fold, + write_attributes=args.write_attributes, + att_max_shape=args.att_max_shape, benchmark_time=args.benchmark_time, overwrite=args.overwrite, predefined_resample=args.predefined_resample, @@ -85,6 +87,9 @@ def run_experiment(args): resample_id = 0 n_jobs = 1 test_fold = False + write_attributes = True + att_max_shape = 0 + benchmark_time = True overwrite = False predefined_resample = False fit_contract = 0 @@ -113,6 +118,9 @@ def run_experiment(args): clusterer_name=estimator_name, resample_id=resample_id, build_test_file=test_fold, + write_attributes=write_attributes, + att_max_shape=att_max_shape, + benchmark_time=benchmark_time, overwrite=overwrite, predefined_resample=predefined_resample, ) diff --git a/tsml_eval/experiments/threaded_forecasting_experiments.py b/tsml_eval/experiments/threaded_forecasting_experiments.py index a904b3c1..ee4bd732 100644 --- a/tsml_eval/experiments/threaded_forecasting_experiments.py +++ b/tsml_eval/experiments/threaded_forecasting_experiments.py @@ -52,6 +52,8 @@ def run_experiment(args, overwrite=False): random_seed=args.resample_id if args.random_seed is None else 
args.random_seed, + write_attributes=args.write_attributes, + att_max_shape=args.att_max_shape, benchmark_time=args.benchmark_time, overwrite=args.overwrite, ) @@ -66,6 +68,9 @@ def run_experiment(args, overwrite=False): dataset_name = "ShampooSales" random_seed = 0 n_jobs = 1 + write_attributes = True + att_max_shape = 0 + benchmark_time = True overwrite = False kwargs = {} @@ -84,6 +89,9 @@ def run_experiment(args, overwrite=False): forecaster, forecaster_name=estimator_name, random_seed=random_seed, + write_attributes=write_attributes, + att_max_shape=att_max_shape, + benchmark_time=benchmark_time, overwrite=overwrite, ) diff --git a/tsml_eval/experiments/threaded_regression_experiments.py b/tsml_eval/experiments/threaded_regression_experiments.py index 73f3e8a3..b2a47979 100644 --- a/tsml_eval/experiments/threaded_regression_experiments.py +++ b/tsml_eval/experiments/threaded_regression_experiments.py @@ -61,6 +61,8 @@ def run_experiment(args): regressor_name=args.estimator_name, resample_id=args.resample_id, build_train_file=args.train_fold, + write_attributes=args.write_attributes, + att_max_shape=args.att_max_shape, benchmark_time=args.benchmark_time, overwrite=args.overwrite, predefined_resample=args.predefined_resample, @@ -77,6 +79,9 @@ def run_experiment(args): resample_id = 0 n_jobs = 1 train_fold = False + write_attributes = True + att_max_shape = 0 + benchmark_time = True overwrite = False predefined_resample = False fit_contract = 0 @@ -103,6 +108,9 @@ def run_experiment(args): regressor_name=estimator_name, resample_id=resample_id, build_train_file=train_fold, + write_attributes=write_attributes, + att_max_shape=att_max_shape, + benchmark_time=benchmark_time, overwrite=overwrite, predefined_resample=predefined_resample, ) diff --git a/tsml_eval/utils/arguments.py b/tsml_eval/utils/arguments.py index 617edbe3..58290d8f 100644 --- a/tsml_eval/utils/arguments.py +++ b/tsml_eval/utils/arguments.py @@ -77,7 +77,16 @@ def parse_args(args): (default: False). -bt, --benchmark_time run a benchmark function and save the time spent in the - results file (default: %(default)s). + results file (default: False). + -wa, --write_attributes + write the estimator attributes to file when running + experiments. Will recursively write the attributes of + sub-estimators if present. (default: False). + -ams ATT_MAX_SHAPE, --att_max_shape ATT_MAX_SHAPE + The max estimator collections shape allowed when + writing attributes, at 0 no estimators in collections + will be written, at 1 estimators in one-dimensional + lists will be written etc. (default: 0). -kw KEY VALUE TYPE, --kwargs KEY VALUE TYPE, --kwarg KEY VALUE TYPE additional keyword arguments to pass to the estimator. Should contain the parameter to set, the parameter @@ -216,6 +225,23 @@ def parse_args(args): help="run a benchmark function and save the time spent in the results file " "(default: %(default)s).", ) + parser.add_argument( + "-wa", + "--write_attributes", + action="store_true", + help="write the estimator attributes to file when running experiments. Will " + "recursively write the attributes of sub-estimators if present. " + "(default: %(default)s).", + ) + parser.add_argument( + "-ams", + "--att_max_shape", + type=int, + default=0, + help="The max estimator collections shape allowed when writing attributes, at " + "0 no estimators in collections will be written, at 1 estimators in " + "one-dimensional lists will be written etc. 
(default: %(default)s).", + ) parser.add_argument( "-kw", "--kwargs", diff --git a/tsml_eval/utils/experiments.py b/tsml_eval/utils/experiments.py index c880e9d6..de29d4bd 100644 --- a/tsml_eval/utils/experiments.py +++ b/tsml_eval/utils/experiments.py @@ -1010,6 +1010,30 @@ def timing_benchmark(num_arrays=1000, array_size=20000, random_state=None): def estimator_attributes_to_file( estimator, dir_path, estimator_name=None, max_depth=np.inf, max_list_shape=np.inf ): + """Write the attributes of an estimator to file(s). + + Write the attributes of an estimator to file at a given directory. The function + will recursively write the attributes of any estimators or non-string sequences + containing estimators found in the attributes of the input estimator to spearate + files. + + Parameters + ---------- + estimator : estimator instance + The estimator to write the attributes of. + dir_path : str + The directory to write the attribute files to. + estimator_name : str or None, default=None + The name of the estimator. If None, the name of the estimator class will be + used. + max_depth : int, default=np.inf + The maximum depth to go when recursively writing attributes of estimators. + max_list_shape : int, default=np.inf + The maximum shape of a list to write when recursively writing attributes of + contained estimators. i.e. for 0, no estimators contained in lists will be + written, for 1, only estimators contained in 1-dimensional lists or the top + level of a list will be written. + """ estimator_name = ( estimator.__class__.__name__ if estimator_name is None else estimator_name ) @@ -1031,16 +1055,24 @@ def _write_estimator_attributes_recursive( value = getattr(estimator, attr) file.write(f"{attr}: {value}\n") - if isinstance(value, BaseEstimator): - new_dir_path = f"{dir_path}/{attr}/" - file.write(f" See {new_dir_path}{attr}.txt for more details\n") - _write_estimator_attributes_recursive( - value, new_dir_path, attr, depth + 1, max_depth, max_list_shape - ) - elif _is_non_string_sequence(value): - _write_list_attributes_recursive( - value, file, dir_path, attr, depth + 1, max_depth, 0, max_list_shape - ) + if depth + 1 <= max_depth: + if isinstance(value, BaseEstimator): + new_dir_path = f"{dir_path}/{attr}/" + file.write(f" See {new_dir_path}{attr}.txt for more details\n") + _write_estimator_attributes_recursive( + value, new_dir_path, attr, depth + 1, max_depth, max_list_shape + ) + elif _is_non_string_sequence(value): + _write_list_attributes_recursive( + value, + file, + dir_path, + attr, + depth + 1, + max_depth, + 1, + max_list_shape, + ) def _write_list_attributes_recursive( diff --git a/tsml_eval/utils/tests/test_attribute_writing.py b/tsml_eval/utils/tests/test_attribute_writing.py index 20dbd6c6..fb8179d1 100644 --- a/tsml_eval/utils/tests/test_attribute_writing.py +++ b/tsml_eval/utils/tests/test_attribute_writing.py @@ -8,7 +8,7 @@ from tsml_eval.utils.experiments import estimator_attributes_to_file -def test_file_creation(): +def test_estimator_attributes_to_file(): estimator = ShapeletTransformClassifier( n_shapelet_samples=50, estimator=RotationForestClassifier(n_estimators=2), @@ -20,3 +20,45 @@ def test_file_creation(): estimator_attributes_to_file(estimator, test_dir) assert os.path.exists(test_dir + "ShapeletTransformClassifier.txt") + assert os.path.exists(test_dir + "estimator/estimator.txt") + assert os.path.exists(test_dir + "_estimator/_estimator.txt") + assert os.path.exists(test_dir + "_estimator/_base_estimator/_base_estimator.txt") + assert 
os.path.exists(test_dir + "_estimator/_pcas_0_0/_pcas_0_0.txt") + + +def test_max_depth(): + estimator = ShapeletTransformClassifier( + n_shapelet_samples=50, + estimator=RotationForestClassifier(n_estimators=2), + ) + X, y = load_minimal_chinatown() + estimator.fit(X, y) + + test_dir = _TEST_OUTPUT_PATH + "/attribute_writing_max_depth/" + estimator_attributes_to_file(estimator, test_dir, max_depth=1) + + assert os.path.exists(test_dir + "ShapeletTransformClassifier.txt") + assert os.path.exists(test_dir + "estimator/estimator.txt") + assert os.path.exists(test_dir + "_estimator/_estimator.txt") + assert not os.path.exists( + test_dir + "_estimator/_base_estimator/_base_estimator.txt" + ) + assert not os.path.exists(test_dir + "_estimator/_pcas_0_0/_pcas_0_0.txt") + + +def test_max_list_shape(): + estimator = ShapeletTransformClassifier( + n_shapelet_samples=50, + estimator=RotationForestClassifier(n_estimators=2), + ) + X, y = load_minimal_chinatown() + estimator.fit(X, y) + + test_dir = _TEST_OUTPUT_PATH + "/attribute_writing_max_list_shape/" + estimator_attributes_to_file(estimator, test_dir, max_list_shape=1) + + assert os.path.exists(test_dir + "ShapeletTransformClassifier.txt") + assert os.path.exists(test_dir + "estimator/estimator.txt") + assert os.path.exists(test_dir + "_estimator/_estimator.txt") + assert os.path.exists(test_dir + "_estimator/_base_estimator/_base_estimator.txt") + assert not os.path.exists(test_dir + "_estimator/_pcas_0_0/_pcas_0_0.txt") diff --git a/tsml_eval/utils/tests/test_misc_experiments.py b/tsml_eval/utils/tests/test_misc_experiments.py index d3d82cc1..ef47c684 100644 --- a/tsml_eval/utils/tests/test_misc_experiments.py +++ b/tsml_eval/utils/tests/test_misc_experiments.py @@ -2,7 +2,7 @@ import pytest -from tsml_eval.utils.experiments import _results_present +from tsml_eval.utils.experiments import _results_present, timing_benchmark @pytest.mark.parametrize("split", ["BOTH", "TRAIN", "TEST", None, "invalid"]) @@ -23,3 +23,9 @@ def test_results_present_split_inputs(split): "test", split=split, ) + + +def test_timing_benchmark_invalid_input(): + """Test timing_benchmark function with invalid input.""" + with pytest.raises(ValueError): + timing_benchmark(random_state="invalid") From 2bf6223aa21f178c48587c4049697eae4b232f21 Mon Sep 17 00:00:00 2001 From: Matthew Middlehurst Date: Thu, 14 Dec 2023 15:14:37 +0000 Subject: [PATCH 8/8] pr labeler fix --- build_tools/pr_labeler.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/build_tools/pr_labeler.py b/build_tools/pr_labeler.py index fe8bc6c4..620c7c04 100644 --- a/build_tools/pr_labeler.py +++ b/build_tools/pr_labeler.py @@ -41,14 +41,14 @@ paths = [file.filename for file in pr.get_files()] content_paths_to_labels = [ - ("tsml-eval/datasets/", "datasets"), - ("tsml-eval/estimators/", "estimators"), - ("tsml-eval/evaluation/", "evaluation"), + ("tsml_eval/datasets/", "datasets"), + ("tsml_eval/estimators/", "estimators"), + ("tsml_eval/evaluation/", "evaluation"), ("examples/", "examples"), - ("tsml-eval/experiments/", "experiments"), - ("tsml-eval/publications/", "publications"), + ("tsml_eval/experiments/", "experiments"), + ("tsml_eval/publications/", "publications"), ("results/", "results"), - ("tsml-eval/testing/", "testing"), + ("tsml_eval/testing/", "testing"), ] present_content_labels = [
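
The attribute-writing utility introduced in patches 6 and 7 can also be called directly, outside the experiment runners. A minimal sketch based on the new test in tsml_eval/utils/tests/test_attribute_writing.py; the output directory here is illustrative:

from aeon.classification.shapelet_based import ShapeletTransformClassifier
from aeon.classification.sklearn import RotationForestClassifier
from tsml.datasets import load_minimal_chinatown

from tsml_eval.utils.experiments import estimator_attributes_to_file

X, y = load_minimal_chinatown()
clf = ShapeletTransformClassifier(
    n_shapelet_samples=50,
    estimator=RotationForestClassifier(n_estimators=2),
)
clf.fit(X, y)

# Writes ShapeletTransformClassifier.txt plus one file per nested estimator.
# max_depth limits how far sub-estimators are followed; max_list_shape limits
# how deeply estimators held in lists/tuples are expanded (0 skips them).
estimator_attributes_to_file(
    clf,
    "./attribute_output/",
    max_depth=1,
    max_list_shape=1,
)

From the command-line experiment scripts, the same behaviour is switched on with the new -wa/--write_attributes flag, with -ams/--att_max_shape controlling the collection limit; the files are then written under <results_path>/<estimator_name>/Workspace/<dataset>/.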