diff --git a/Snakefile b/Snakefile index 5b9340ccc..cf075b0fa 100644 --- a/Snakefile +++ b/Snakefile @@ -22,7 +22,6 @@ _config.init_global(config) out_dir = _config.config.out_dir algorithm_params = _config.config.algorithm_params -algorithm_directed = _config.config.algorithm_directed pca_params = _config.config.pca_params hac_params = _config.config.hac_params container_settings = _config.config.container_settings @@ -273,16 +272,12 @@ rule reconstruct: run: # Create a copy so that the updates are not written to the parameters logfile params = reconstruction_params(wildcards.algorithm, wildcards.params).copy() - # Add the input files - params.update(dict(zip(runner.get_required_inputs(wildcards.algorithm), *{input}, strict=True))) - # Add the output file - # All run functions can accept a relative path to the output file that should be written that is called 'output_file' - params['output_file'] = output.pathway_file - # Remove the default placeholder parameter added for algorithms that have no parameters - if 'spras_placeholder' in params: - params.pop('spras_placeholder') - params['container_settings'] = container_settings - runner.run(wildcards.algorithm, params) + # Declare the input files as a dictionary. + inputs = dict(zip(runner.get_required_inputs(wildcards.algorithm), *{input}, strict=True)) + # Remove the _spras_run_name parameter, which is only used to track the run name for parameters.yml + if '_spras_run_name' in params: + params.pop('_spras_run_name') + runner.run(wildcards.algorithm, inputs, output.pathway_file, params, container_settings) # Original pathway reconstruction output to universal output # Use PRRunner as a wrapper to call the algorithm-specific parse_output diff --git a/config/config.yaml b/config/config.yaml index 11bac082a..f2899fb9a 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -3,6 +3,7 @@ # The length of the hash used to identify a parameter combination hash_length: 7 +# Collection of container options containers: # Specify the container framework used by each PRM wrapper.
Valid options include: # - docker (default if not specified) @@ -60,14 +61,14 @@ containers: algorithms: - name: "pathlinker" - params: - include: true + include: true + runs: run1: k: range(100,201,100) - name: "omicsintegrator1" - params: - include: true + include: true + runs: run1: b: [5, 6] w: np.linspace(0,5,2) @@ -75,8 +76,8 @@ algorithms: dummy_mode: "file" # Or "terminals", "all", "others" - name: "omicsintegrator2" - params: - include: true + include: true + runs: run1: b: 4 g: 0 @@ -85,52 +86,50 @@ algorithms: g: 3 - name: "meo" - params: - include: true + include: true + runs: run1: max_path_length: 3 - local_search: "Yes" + local_search: true rand_restarts: 10 - name: "mincostflow" - params: - include: true + include: true + runs: run1: - flow: 1 # The flow must be an int + flow: 1 capacity: 1 - name: "allpairs" - params: - include: true + include: true - name: "domino" - params: - include: true + include: true + runs: run1: slice_threshold: 0.3 module_threshold: 0.05 - name: "strwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] - name: "rwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] - name: "bowtiebuilder" - params: - include: true + include: true - name: "responsenet" - params: - include: true + include: true + runs: run1: gamma: [10] diff --git a/config/egfr.yaml b/config/egfr.yaml index 25e56ab25..b93c593c4 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -1,4 +1,3 @@ -# The length of the hash used to identify a parameter combination hash_length: 7 containers: @@ -29,108 +28,83 @@ containers: algorithms: - name: pathlinker - params: - include: true + include: true + runs: run1: k: - 10 - 20 - 70 - name: omicsintegrator1 - params: - include: true + include: true + runs: run1: b: - 0.55 - 2 - 10 - d: - - 10 - g: - - 1e-3 - r: - - 0.01 - w: - - 0.1 - mu: - - 0.008 + d: 10 + g: 1e-3 + r: 0.01 + w: 0.1 + mu: 0.008 dummy_mode: ["file"] - name: omicsintegrator2 - params: - include: true + include: true + runs: run1: - b: - - 4 - g: - - 0 + b: 4 + g: 0 run2: - b: - - 2 - g: - - 3 + b: 2 + g: 3 - name: meo - params: - include: true + include: true + runs: run1: - local_search: - - "Yes" - max_path_length: - - 3 - rand_restarts: - - 10 + local_search: true + max_path_length: 3 + rand_restarts: 10 run2: - local_search: - - "No" - max_path_length: - - 2 - rand_restarts: - - 10 + local_search: false + max_path_length: 2 + rand_restarts: 10 - name: allpairs - params: - include: true + include: true - name: domino - params: - include: true + include: true + runs: run1: - slice_threshold: - - 0.3 - module_threshold: - - 0.05 + slice_threshold: 0.3 + module_threshold: 0.05 - name: mincostflow - params: - include: true + include: true + runs: run1: - capacity: - - 15 - flow: - - 80 + capacity: 15 + flow: 80 run2: - capacity: - - 1 - flow: - - 6 + capacity: 1 + flow: 6 run3: - capacity: - - 5 - flow: - - 60 + capacity: 5 + flow: 60 - name: "strwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] - name: "rwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] - name: "bowtiebuilder" - params: - include: false + include: false datasets: - data_dir: input edge_files: diff --git a/docker-wrappers/SPRAS/example_config.yaml b/docker-wrappers/SPRAS/example_config.yaml index db1c2dbbf..1e7fd69c2 100644 --- a/docker-wrappers/SPRAS/example_config.yaml +++ b/docker-wrappers/SPRAS/example_config.yaml @@ -49,14 +49,14 
@@ containers: algorithms: - name: "pathlinker" - params: - include: false + include: false + runs: run1: k: range(100,201,100) - name: "omicsintegrator1" - params: - include: true + include: true + runs: run1: r: [5] b: [5, 6] @@ -65,8 +65,8 @@ algorithms: d: [10] - name: "omicsintegrator2" - params: - include: true + include: true + runs: run1: b: [4] g: [0] @@ -75,27 +75,26 @@ algorithms: g: [3] - name: "meo" - params: - include: true + include: true + runs: run1: max_path_length: [3] - local_search: ["Yes"] + local_search: [true] rand_restarts: [10] - name: "mincostflow" - params: - include: true + include: true + runs: run1: flow: [1] # The flow must be an int capacity: [1] - name: "allpairs" - params: - include: true + include: true - name: "domino" - params: - include: true + include: true + runs: run1: slice_threshold: [0.3] module_threshold: [0.05] diff --git a/docs/_static/config/intermediate.yaml b/docs/_static/config/intermediate.yaml index 1f0ba2eb5..58d1400d8 100644 --- a/docs/_static/config/intermediate.yaml +++ b/docs/_static/config/intermediate.yaml @@ -23,15 +23,15 @@ containers: algorithms: - name: "pathlinker" - params: - include: true + include: true + runs: run1: k: 1 run2: k: [10, 100] - name: omicsintegrator1 - params: - include: true + include: true + runs: run1: b: [0.55, 2, 10] d: 10 @@ -40,8 +40,8 @@ algorithms: w: 0.1 mu: 0.008 - name: omicsintegrator2 - params: - include: true + include: true + runs: run1: b: 4 g: 0 @@ -49,24 +49,23 @@ algorithms: b: 2 g: 3 - name: meo - params: - include: true + include: true + runs: run1: - local_search: ["Yes", "No"] + local_search: [true, false] max_path_length: [2, 3] rand_restarts: 10 - name: allpairs - params: - include: true + include: true - name: domino - params: - include: true + include: true + runs: run1: slice_threshold: 0.3 module_threshold: 0.05 - name: mincostflow - params: - include: true + include: true + runs: run1: capacity: 15 flow: 80 @@ -77,14 +76,14 @@ algorithms: capacity: 5 flow: 60 - name: "strwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] - name: "rwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst index 0d8a542ff..b0780619f 100644 --- a/docs/contributing/index.rst +++ b/docs/contributing/index.rst @@ -121,13 +121,9 @@ directory to create a Docker image. The PathLinker ``Dockerfile`` demonstrates how to begin with a Python image and copy files into the image with ``COPY``. Browse the official `Python images `__ to select a recent version -of Python based on Alpine Linux, a small Linux distribution. Note that -the PathLinker example uses an old version of Python, but this Local +of Python based on Alpine Linux, a small Linux distribution. This Local Neighborhood Docker image should be based on a more modern version of -Python. In addition, not all pathway reconstruction algorithms are -compatible with Alpine Linux, so the default Debian-based Python image -is required. The ``Dockerfile`` does not need an ``ENTRYPOINT`` or -``CMD`` line. It will be used to run a Python command. +Python. Build the Docker image by running @@ -176,11 +172,13 @@ Step 3: Write the Local Neighborhood wrapper functions Add a new Python file ``spras/local_neighborhood.py`` to implement the wrapper functions for the Local Neighborhood algorithm. Use -``pathlinker.py`` as an example. +``allpairs.py`` as an example. 
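As a rough, illustrative sketch (not part of this diff, and assuming the ``PRM`` and ``Empty`` imports shown elsewhere in this change set), the skeleton that the next few paragraphs describe might start out as::

    from spras.config.util import Empty
    from spras.prm import PRM

    __all__ = ['LocalNeighborhood']

    class LocalNeighborhood(PRM[Empty]):
        # Snakemake uses these entries to determine which input files to prepare
        required_inputs = ['network', 'nodes']
        dois = []
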
Call the new class within ``local_neighborhood.py`` ``LocalNeighborhood`` and set ``__all__`` so the class can be `imported `__. +Make sure to specify the type of parameters passed to ``LocalNeighborhood`` as ``Empty`` +(see ``AllPairs`` for an example). Specify the list of ``required_input`` files to be ``network`` and ``nodes``, and set the ``dois`` property to be an empty list. These entries are used to tell Snakemake what input files should be present @@ -242,11 +240,11 @@ the format ``|``, which also differs from the ``omicsintegrator1.py`` example. ``spras/dataset.py`` provides functions that give access to node information and the interactome (edge list). -Implement the ``run`` function, following the PathLinker example. The +Implement the ``run`` function, following the AllPairs example. The ``prepare_volume`` utility function is needed to prepare the network and nodes input files to be mounted and used inside the container. It is also used to prepare the path for the output file, which is different -from how the output is prepared in the PathLinker example. The +from how the output is prepared in the AllPairs example. The functionality of ``prepare_volume`` is similar to how you had to manually specify paths relative to the container's file system when you interactively tested the container in Step 2. It is not necessary to @@ -324,7 +322,7 @@ Add test functions to the test file ``test/test_ln.py``. This file already has tests that check the correctness of the Local Neighborhood implementation that was added to the Docker image. The new tests will test that the ``run`` function of the ``LocalNeighborhood`` -class works correctly. Use ``test_pathlinker.py`` as an example. There +class works correctly. Use ``test_ap.py`` as an example. There are input files for testing in the ``test/LocalNeighborhood/input`` directory. The new test functions will be automatically run as part of diff --git a/docs/prms/meo.rst b/docs/prms/meo.rst index 55157174a..71206adf9 100644 --- a/docs/prms/meo.rst +++ b/docs/prms/meo.rst @@ -9,6 +9,6 @@ MEO takes in three optional parameters: * max_path_length: The maximal length of paths (from any source to any target) to return when orienting the graph (note: paths may contain duplicate vertices, but never duplicate edges). -* local_search: a "Yes"/"No" parameter that enables MEO's local search functionality. See "Improving approximations with local search" in - the associated paper for more information. This should almost always be yes. +* local_search: a boolean parameter that enables MEO's local search functionality. See "Improving approximations with local search" in + the associated paper for more information. This should almost always be true. * rand_restarts: the number (int) of random restarts to use.
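To make the new ``runs`` layout concrete, here is an illustrative sketch (not part of this diff) of how a single run entry expands into parameter combinations, mirroring what ``process_algorithms`` in ``spras/config/config.py`` does further below::

    import itertools as it

    # strwr's run1 from the configs above, after singleton values are coerced to lists
    run = {'alpha': [0.85], 'threshold': [100, 200]}
    names = tuple(run.keys())
    combos = [dict(zip(names, values)) for values in it.product(*run.values())]
    # combos == [{'alpha': 0.85, 'threshold': 100}, {'alpha': 0.85, 'threshold': 200}]

Each combination is then hashed to produce its identifier, and the run name is kept alongside it under the ``_spras_run_name`` key.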
diff --git a/spras/allpairs.py b/spras/allpairs.py index 77e26198f..21fca6ee4 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -2,6 +2,7 @@ from pathlib import Path from spras.config.container_schema import ProcessedContainerSettings +from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( @@ -15,7 +16,7 @@ __all__ = ['AllPairs'] -class AllPairs(PRM): +class AllPairs(PRM[Empty]): required_inputs = ['nodetypes', 'network', 'directed_flag'] dois = [] @@ -24,7 +25,10 @@ def generate_inputs(data: Dataset, filename_map): """ Access fields from the dataset and write the required input files @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with: + - nodetypes: node types with sources and targets + - network: network file containing edges and their weights + - directed_flag: contains `true` if `network` is fully directed. """ AllPairs.validate_required_inputs(filename_map) @@ -70,27 +74,19 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(nodetypes=None, network=None, directed_flag=None, output_file=None, container_settings=None): - """ - Run All Pairs Shortest Paths with Docker - @param nodetypes: input node types with sources and targets (required) - @param network: input network file (required) - @param container_settings: configure the container runtime - @param output_file: path to the output pathway file (required) - """ + def run(inputs, output_file, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() - if not nodetypes or not network or not output_file or not directed_flag: - raise ValueError('Required All Pairs Shortest Paths arguments are missing') + AllPairs.validate_required_run_args(inputs) work_dir = '/apsp' # Each volume is a tuple (src, dest) volumes = list() - bind_path, node_file = prepare_volume(nodetypes, work_dir, container_settings) + bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir, container_settings) volumes.append(bind_path) - bind_path, network_file = prepare_volume(network, work_dir, container_settings) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) # Create the parent directories for the output file if needed @@ -104,7 +100,7 @@ def run(nodetypes=None, network=None, directed_flag=None, output_file=None, cont '--network', network_file, '--nodes', node_file, '--output', mapped_out_file] - if Path(directed_flag).read_text().strip() == "true": + if Path(inputs["directed_flag"]).read_text().strip() == "true": command.append("--directed") container_suffix = "allpairs:v4" diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py index c8abc1cad..2092200f5 100644 --- a/spras/analysis/summary.py +++ b/spras/analysis/summary.py @@ -98,6 +98,8 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg # Algorithm parameters have format { algo : { hashcode : { parameter combos } } } param_combo = algo_params[algo][hashcode] + # Filter rather than delete so the dict stored in algo_params is not mutated + param_combo = {k: v for k, v in param_combo.items() if k != '_spras_run_name'} + # TODO: sort parameters to provide stable summary table output cur_nw_info.append(param_combo) # Save the current network information to
the network summary list diff --git a/spras/btb.py b/spras/btb.py index 16669676e..d2f18debe 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -1,6 +1,7 @@ from pathlib import Path from spras.config.container_schema import ProcessedContainerSettings +from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( convert_undirected_to_directed, @@ -24,7 +25,7 @@ Interactor1 Interactor2 Weight """ -class BowTieBuilder(PRM): +class BowTieBuilder(PRM[Empty]): required_inputs = ['sources', 'targets', 'edges'] dois = ["10.1186/1752-0509-3-67"] @@ -34,8 +35,10 @@ def generate_inputs(data, filename_map): """ Access fields from the dataset and write the required input files @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type - @return: + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with: + - sources: NODEID-headered list of sources + - targets: NODEID-headered list of targets + - edges: node pairs with associated edge weights """ BowTieBuilder.validate_required_inputs(filename_map) @@ -69,31 +72,17 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(sources=None, targets=None, edges=None, output_file=None, container_settings=None): - """ - Run BTB with Docker - @param sources: input source file (required) - @param targets: input target file (required) - @param edges: input edge file (required) - @param output_file: path to the output pathway file (required) - @param container_settings: configure the container runtime - """ + def run(inputs, output_file, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() + BowTieBuilder.validate_required_run_args(inputs) # Tests for pytest (the docker container also runs this) # Testing out here avoids having to decipher errors raised inside the container - - if not sources or not targets or not edges or not output_file: - raise ValueError('Required BowTieBuilder arguments are missing') - - if not Path(sources).exists() or not Path(targets).exists() or not Path(edges).exists(): - raise ValueError('Missing input file') - # Testing for btb index errors # TODO: This error will never actually occur if the inputs are passed through # `generate_inputs`. See the discussion about removing this or making this a habit at # https://github.com/Reed-CompBio/spras/issues/306.
- with open(edges, 'r') as edge_file: + with open(inputs["edges"], 'r') as edge_file: try: for line in edge_file: line = line.strip().split('\t')[2] @@ -107,13 +96,13 @@ def run(sources=None, targets=None, edges=None, output_file=None, container_sett # Each volume is a tuple (src, dest) volumes = list() - bind_path, source_file = prepare_volume(sources, work_dir, container_settings) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, target_file = prepare_volume(targets, work_dir, container_settings) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) - bind_path, edges_file = prepare_volume(edges, work_dir, container_settings) + bind_path, edges_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) # Use its --output argument to set the output file prefix, specifying an absolute path and prefix diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py new file mode 100644 index 000000000..552fbc4e0 --- /dev/null +++ b/spras/config/algorithms.py @@ -0,0 +1,183 @@ +""" +Dynamic construction of algorithm parameters with runtime type information for +parameter combinations. This has been isolated from schema.py because it is not declarative; +it mainly contains validators and lower-level pydantic code. +""" +import ast +import copy +from typing import Annotated, Any, Callable, Literal, Union, cast, get_args +import numpy as np +from pydantic import ( + BaseModel, + BeforeValidator, + ConfigDict, + Field, + ValidationError, + create_model, ) +from spras.runner import algorithms +# This contains the dynamically generated algorithm schema for use in `schema.py` +__all__ = ['AlgorithmUnion'] +def is_numpy_friendly(typ: type[Any] | None) -> bool: + """ + Whether the passed-in type supports any numpy helpers. + This is used to provide hints in the JSON schema, + and to determine whether to allow easy ranges using + `python_evalish_coerce`. + """ + allowed_types = (int, float) + + # check basic types, then check optional types + return typ in allowed_types or \ + any(arg in allowed_types for arg in get_args(typ)) +def python_evalish_coerce(value: Any) -> Any: + """ + Allows for using numpy and python calls: specifically, + `range`, `np.linspace`, `np.arange`, and `np.logspace` are supported. + + **Safety Note**: This does not prevent availability attacks: a crafted config can still exhaust + resources. It only prevents secret leakage. + """ + + if not isinstance(value, str): + return value + + # These strings are in the form of function calls `function.name(param1, param2, ...)`. + # Because we want to avoid `eval` (this might be running in the secret-sensitive HTCondor), + # we need to parse these calls ourselves. + functions_dict: dict[str, Callable[[list[Any]], list[Union[int, float]]]] = { 'range': lambda params: list(range(*params)), "np.linspace": lambda params: list(np.linspace(*params)), "np.arange": lambda params: list(np.arange(*params)), "np.logspace": lambda params: list(np.logspace(*params)), } + + # To do this, we get the AST of our string as an expression + # (filename='<string>' is to make the error message more closely resemble that of eval.) + value_ast = ast.parse(value, mode='eval', filename='<string>') + + # Then we do some light parsing - we're only looking to do some literal evaluation + # (allowing light python notation) and some basic function parsing.
Full python programs + # should just generate a config.yaml. + + # This should always be an Expression whose body is Call (a function). + if not isinstance(value_ast.body, ast.Call): + raise ValueError(f'This argument "{value}" was interpreted as a non-function-calling string: it should be a function call (e.g. range(100, 201, 50)), or an int or a float.') + + # We get the function name back as a string + function_name = ast.unparse(value_ast.body.func) + + # and we use `ast.literal_eval`, which is safe (aside from availability), to support literals passed into functions. + arguments = [ast.literal_eval(arg) for arg in value_ast.body.args] + + if function_name not in functions_dict: + raise ValueError(f"{function_name} is not an allowed function to be run! Allowed functions: {list(functions_dict.keys())}") + + return functions_dict[function_name](arguments) + +def list_coerce(value: Any) -> Any: + """ + Coerces a value to a list if it isn't one already. + Used as a BeforeValidator. + """ + if not isinstance(value, list): + return [value] + return value + +# This is the most 'hacky' part of this code, but, thanks to pydantic, we avoid reflection +and preserve rich type information at runtime. +def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseModel]: + """ + Dynamically constructs a parameter-combination model based on the original args model. + + Parameter arguments such as `int` get turned into `list[int]`, and have extra conveniences attached: + - Singleton values are coerced to lists (1 -> [1]) + - Ranges and other convenient calls are expanded (see `python_evalish_coerce`) + """ + + # Get the default model instance by trying to validate the empty dictionary + try: + model_default = model.model_validate({}) + except ValidationError: + model_default = None + + # First, we need to take our 'model' and coerce it to permit parameter combinations. + # This assumes that all of the keys are flattened, so we only get a structure like so: + # class AlgorithmParams(BaseModel): + # key1: int + # key2: list[str] + # ... + # and we want to transform this to: + # class AlgorithmParamsCombination(BaseModel): + # key1: list[int] + # key2: list[list[str]] + # However, we want to preserve certain conveniences (singleton values, fake python evaluation), + # so we also make use of BeforeValidators to do so, and we carry their accepted input types over into the JSON schema. + # (Note: This function does not worry about getting the cartesian product of this.) + + # Map our fields to a list (assuming we have no nested keys), + # and specify our user convenience validators + mapped_list_field: dict[str, Any] = dict() + for field_name, field in model.model_fields.items(): + # We need to create a copy of the field, + # as we need to make sure that it gets mapped to the list-coerced version of the field. + new_field = copy.deepcopy(field) + new_field.validate_default = True + + mapped_list_field[field_name] = (Annotated[ + list[field.annotation], + # This order isn't arbitrary. + # https://docs.pydantic.dev/latest/concepts/validators/#ordering-of-validators + # This runs second. This coerces any singletons to lists. + BeforeValidator(list_coerce, json_schema_input_type=Union[field.annotation, list[field.annotation]]), + # This runs first. This evaluates numpy utils for integer/float lists + BeforeValidator( + python_evalish_coerce, + # json_schema_input_type (sensibly) overwrites, so we have to specify the entire union again here.
+ json_schema_input_type=Union[field.annotation, list[field.annotation], str] + ) if is_numpy_friendly(field.annotation) else None + ], new_field) + + # Runtime assertion check: mapped_list_field does not contain any `__-prefixed` fields + for key in mapped_list_field.keys(): + assert not key.startswith("__"), f"A private key has been passed from {name}'s argument schema. " + \ + "This should have been caught by the Snakemake CI step." + + # Pass this as kwargs to create_model, which usually takes in parameters field_name=type. + # We do need to cast create_model, since otherwise the type-checker complains that we may + # have had a key that starts with __ in mapped_list_field. The above assertion prevents this. + run_model = (cast(Any, create_model))( + f'{name}RunModel', + __config__=ConfigDict(extra='forbid'), + **mapped_list_field + ) + + # Here is an example of how this would look inside config.yaml + # name: pathlinker + # include: true + # runs: + # run1: + # (from run_model) + # ... + return create_model( + f'{name}Model', + name=Literal[name], + include=bool, + # For algorithms that have a default parameter config, we allow running an algorithm + # even if no runs are specified. For example, the following config + # name: pathlinker + # include: true + # will run, despite there being no entries in `runs`. + # (create_model entries take in either a type or (type, default)). + runs=dict[str, run_model] if model_default is None else (dict[str, run_model], {"default": model_default}), + __config__=ConfigDict(extra='forbid') + ) + +algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model.get_params_generic()) for name, model in algorithms.items()] +# name differentiates algorithms +AlgorithmUnion = Annotated[Union[tuple(algorithm_models)], Field(discriminator='name')] diff --git a/spras/config/config.py b/spras/config/config.py index cb19b2b1d..e180183cc 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -16,7 +16,6 @@ import itertools as it import os import warnings -from collections.abc import Iterable from typing import Any import numpy as np @@ -28,8 +27,6 @@ config = None -DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio" - # This will get called in the Snakefile, instantiating the singleton with the raw config def init_global(config_dict): global config @@ -78,9 +75,7 @@ def __init__(self, raw_config: dict[str, Any]): self.algorithms = None # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. # Only includes algorithms that are set to be run with 'include: true'. - self.algorithm_params = None - # Deprecated. Previously a dict mapping algorithm names to a Boolean tracking whether they used directed graphs.
- self.algorithm_directed = None + self.algorithm_params: dict[str, dict[str, Any]] = dict() # A dict with the analysis settings self.analysis_params = parsed_raw_config.analysis # A dict with the evaluation settings @@ -152,58 +147,30 @@ def process_algorithms(self, raw_config: RawConfig): """ prior_params_hashes = set() self.algorithm_params = dict() - self.algorithm_directed = dict() self.algorithms = raw_config.algorithms for alg in self.algorithms: - cur_params = alg.params - if cur_params.include: + if alg.include: # This dict maps from parameter combinations hashes to parameter combination dictionaries self.algorithm_params[alg.name] = dict() else: # Do not parse the rest of the parameters for this algorithm if it is not included continue - if cur_params.directed is not None: - warnings.warn("UPDATE: we no longer use the directed key in the config file", stacklevel=2) - - cur_params = cur_params.__pydantic_extra__ - if cur_params is None: - raise RuntimeError("An internal error occurred: ConfigDict extra should be set on AlgorithmParams.") - - # The algorithm has no named arguments so create a default placeholder - if len(cur_params.keys()) == 0: - cur_params["run1"] = {"spras_placeholder": ["no parameters"]} + runs: dict[str, Any] = alg.runs # Each set of runs should be 1 level down in the config file - for run_params in cur_params: + for run_name in runs.keys(): all_runs = [] # We create the product of all param combinations for each run param_name_list = [] - if cur_params[run_params]: - for p in cur_params[run_params]: - param_name_list.append(p) - obj = str(cur_params[run_params][p]) - try: - obj = [int(obj)] - except ValueError: - try: - obj = [float(obj)] - except ValueError: - # Handles arrays and special evaluation types - # TODO: do we want to explicitly bar `eval` if we may use untrusted user inputs later? - if obj.startswith(("range", "np.linspace", "np.arange", "np.logspace", "[")): - obj = eval(obj) - elif obj.lower() == "true": - obj = [True] - elif obj.lower() == "false": - obj = [False] - else: - # Catch-all for strings - obj = [obj] - if not isinstance(obj, Iterable): - raise ValueError(f"The object `{obj}` in algorithm {alg.name} at key '{p}' in run '{run_params}' is not iterable!") from None - all_runs.append(obj) + # We convert our run parameters to a dictionary, allowing us to iterate over it + run_subscriptable = vars(runs[run_name]) + for param in run_subscriptable: + param_name_list.append(param) + # this is guaranteed to be list[Any] by algorithms.py + param_values: list[Any] = run_subscriptable[param] + all_runs.append(param_values) run_list_tuples = list(it.product(*all_runs)) param_name_tuple = tuple(param_name_list) for r in run_list_tuples: @@ -223,6 +190,11 @@ def process_algorithms(self, raw_config: RawConfig): if params_hash in prior_params_hashes: raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file ' f'(current length {self.hash_length}).') + + # We preserve the run name as it carries useful information for the parameter log, + # and is useful for configuration testing. 
+ run_dict["_spras_run_name"] = run_name + self.algorithm_params[alg.name][params_hash] = run_dict def process_analysis(self, raw_config: RawConfig): @@ -287,7 +259,6 @@ def process_analysis(self, raw_config: RawConfig): def process_config(self, raw_config: RawConfig): - # Set up a few top-level config variables self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir if raw_config.containers.enable_profiling and raw_config.containers.framework not in ["singularity", "apptainer"]: diff --git a/spras/config/container_schema.py b/spras/config/container_schema.py index 2167ae9e4..c4f293107 100644 --- a/spras/config/container_schema.py +++ b/spras/config/container_schema.py @@ -33,6 +33,8 @@ class ContainerRegistry(BaseModel): class ContainerSettings(BaseModel): framework: ContainerFramework = ContainerFramework.docker unpack_singularity: bool = False + + model_config = ConfigDict(extra='forbid') enable_profiling: bool = False "A Boolean indicating whether to enable container runtime profiling (apptainer/singularity only)" registry: ContainerRegistry diff --git a/spras/config/schema.py b/spras/config/schema.py index 8aa067a53..e530c5d65 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -11,10 +11,11 @@ """ import re -from typing import Annotated, Optional +from typing import Annotated from pydantic import AfterValidator, BaseModel, ConfigDict +from spras.config.algorithms import AlgorithmUnion from spras.config.container_schema import ContainerSettings from spras.config.util import CaseInsensitiveEnum @@ -90,21 +91,6 @@ def validate(label: str): return label return validate -class AlgorithmParams(BaseModel): - include: bool - directed: Optional[bool] = None - - # TODO: use array of runs instead. We currently rely on the - # extra parameters here to extract the algorithm parameter information, - # which is why this deviates from the usual ConfigDict(extra='forbid'). - model_config = ConfigDict(extra='allow') - -class Algorithm(BaseModel): - name: str - params: AlgorithmParams - - model_config = ConfigDict(extra='forbid') - class Dataset(BaseModel): # We prefer AfterValidator here to allow pydantic to run its own # validation & coercion logic before we check it against our own @@ -143,7 +129,8 @@ class RawConfig(BaseModel): hash_length: int = DEFAULT_HASH_LENGTH "The length of the hash used to identify a parameter combination" - algorithms: list[Algorithm] + # See algorithms.py for more information about AlgorithmUnion + algorithms: list[AlgorithmUnion] # type: ignore - pydantic allows this. datasets: list[Dataset] gold_standards: list[GoldStandard] = [] analysis: Analysis = Analysis() diff --git a/spras/config/util.py b/spras/config/util.py index b7680222b..f7abf96b2 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -1,6 +1,15 @@ +""" +General config utilities. This is the only config file +that should be imported by algorithms, and algorithms should +only import this config file. 
+""" + from enum import Enum from typing import Any +import yaml +from pydantic import BaseModel, ConfigDict + # https://stackoverflow.com/a/76883868/7589775 class CaseInsensitiveEnum(str, Enum): @@ -17,3 +26,18 @@ def _missing_(cls, value: Any): if member.lower() == value: return member return None + +# We also need to allow `CaseInsensitiveEnum` to be represented in yaml.safe_dump, +# allowing us to safely log parameters in Snakemake: +# https://github.com/yaml/pyyaml/issues/722#issue-1781352490 +yaml.SafeDumper.add_multi_representer( + CaseInsensitiveEnum, + yaml.representer.SafeRepresenter.represent_str, +) + +class Empty(BaseModel): + """ + The empty base model. Used for specifying that an algorithm takes no parameters, + yet is deterministic. + """ + model_config = ConfigDict(extra="forbid") diff --git a/spras/domino.py b/spras/domino.py index e060aa617..316044432 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -1,9 +1,12 @@ import json from pathlib import Path +from typing import Optional import pandas as pd +from pydantic import BaseModel, ConfigDict from spras.config.container_schema import ProcessedContainerSettings +from spras.config.util import BaseModel from spras.containers import ContainerError, prepare_volume, run_container_and_log from spras.interactome import ( add_constant, @@ -12,11 +15,19 @@ from spras.prm import PRM from spras.util import duplicate_edges -__all__ = ['DOMINO', 'pre_domino_id_transform', 'post_domino_id_transform'] +__all__ = ['DOMINO', 'DominoParams', 'pre_domino_id_transform', 'post_domino_id_transform'] ID_PREFIX = 'ENSG0' ID_PREFIX_LEN = len(ID_PREFIX) +class DominoParams(BaseModel): + module_threshold: Optional[float] = None + "the p-value threshold for considering a slice as relevant (optional)" + + slice_threshold: Optional[float] = None + "the p-value threshold for considering a putative module as final module (optional)" + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ DOMINO will construct a fully undirected graph from the provided input file @@ -27,7 +38,7 @@ - the expected raw input file should have node pairs in the 1st and 3rd columns, with a 'ppi' in the 2nd column - it can include repeated and bidirectional edges """ -class DOMINO(PRM): +class DOMINO(PRM[DominoParams]): required_inputs = ['network', 'active_genes'] dois = ["10.15252/msb.20209593"] @@ -36,8 +47,9 @@ def generate_inputs(data, filename_map): """ Access fields from the dataset and write the required input files @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type - @return: + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with: + - network: list of edges + - active_genes: list of active genes """ DOMINO.validate_required_inputs(filename_map) @@ -71,32 +83,20 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(network=None, active_genes=None, output_file=None, slice_threshold=None, module_threshold=None, container_settings=None): - """ - Run DOMINO with Docker. - Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. - DOMINO produces multiple output module files in an HTML format. SPRAS concatenates these files into one file. 
- @param network: input network file (required) - @param active_genes: input active genes (required) - @param output_file: path to the output pathway file (required) - @param slice_threshold: the p-value threshold for considering a slice as relevant (optional) - @param module_threshold: the p-value threshold for considering a putative module as final module (optional) - @param container_settings: configure the container runtime - """ + def run(inputs, output_file, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() - - if not network or not active_genes or not output_file: - raise ValueError('Required DOMINO arguments are missing') + if not args: args = DominoParams() + DOMINO.validate_required_run_args(inputs) work_dir = '/spras' # Each volume is a tuple (source, destination) volumes = list() - bind_path, network_file = prepare_volume(network, work_dir, container_settings) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) - bind_path, node_file = prepare_volume(active_genes, work_dir, container_settings) + bind_path, node_file = prepare_volume(inputs["active_genes"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent @@ -131,6 +131,7 @@ def run(network=None, active_genes=None, output_file=None, slice_threshold=None, raise err # Make the Python command to run within the container + # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. domino_command = ['domino', '--active_genes_files', node_file, '--network_file', network_file, @@ -141,11 +142,11 @@ def run(network=None, active_genes=None, output_file=None, slice_threshold=None, '--visualization', 'true'] # Add optional arguments - if slice_threshold is not None: + if args.slice_threshold is not None: # DOMINO readme has the wrong argument https://github.com/Shamir-Lab/DOMINO/issues/12 - domino_command.extend(['--slice_threshold', str(slice_threshold)]) - if module_threshold is not None: - domino_command.extend(['--module_threshold', str(module_threshold)]) + domino_command.extend(['--slice_threshold', str(args.slice_threshold)]) + if args.module_threshold is not None: + domino_command.extend(['--module_threshold', str(args.module_threshold)]) try: run_container_and_log('DOMINO', @@ -182,7 +183,7 @@ def run(network=None, active_genes=None, output_file=None, slice_threshold=None, # Clean up DOMINO intermediate and pickle files slices_file.unlink(missing_ok=True) Path(out_dir, 'network.slices.pkl').unlink(missing_ok=True) - Path(str(network) + '.pkl').unlink(missing_ok=True) + Path(str(inputs['network']) + '.pkl').unlink(missing_ok=True) @staticmethod def parse_output(raw_pathway_file, standardized_pathway_file, params): diff --git a/spras/meo.py b/spras/meo.py index 6fe06e058..5d4630f43 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -1,5 +1,8 @@ import os from pathlib import Path +from typing import Optional + +from pydantic import BaseModel, ConfigDict from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log @@ -10,7 +13,7 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['MEO', 'write_properties'] +__all__ = ['MEO', 'MEOParams', 'write_properties'] # replaces all underscores in the node names with unicode separator # MEO keeps only the substring up to the first underscore when 
parsing node names @@ -57,7 +60,8 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, if max_path_length is not None: f.write(f'max.path.length = {max_path_length}\n') if local_search is not None: - f.write(f'local.search = {local_search}\n') + # Yes/No for this parameter. + f.write(f'local.search = {"Yes" if local_search else "No"}\n') if rand_restarts is not None: f.write(f'rand.restarts = {rand_restarts}\n') @@ -66,6 +70,21 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, # Do not need csp.phase, csp.gen.file, or csp.sol.file because MAXCSP is not supported +class MEOParams(BaseModel): + max_path_length: Optional[int] = None + "the maximal length of a path from sources and targets to orient." + + local_search: Optional[bool] = None + """ + a boolean parameter that enables MEO's local search functionality. + See "Improving approximations with local search" in the associated paper + for more information. + """ + + rand_restarts: Optional[int] = None + "The number of random restarts to use." + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ MEO can support partially directed graphs @@ -83,7 +102,7 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, """ -class MEO(PRM): +class MEO(PRM[MEOParams]): required_inputs = ['sources', 'targets', 'edges'] dois = ["10.1093/nar/gkq1207"] @@ -92,8 +111,10 @@ def generate_inputs(data, filename_map): """ Access fields from the dataset and write the required input files @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type - @return: + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with: + - sources: list of sources + - targets: list of targets + - edges: list of edges """ MEO.validate_required_inputs(filename_map) @@ -125,8 +146,7 @@ def generate_inputs(data, filename_map): # TODO add parameter validation # TODO document required arguments @staticmethod - def run(edges=None, sources=None, targets=None, output_file=None, max_path_length=None, local_search=None, - rand_restarts=None, container_settings=None): + def run(inputs, output_file=None, args=None, container_settings=None): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. @@ -134,28 +154,23 @@ def run(edges=None, sources=None, targets=None, output_file=None, max_path_lengt Does not support MINSAT or MAXCSP. Only the edge output file is retained. All other output files are deleted. - @param output_file: the name of the output edge file, which will overwrite any existing file with this name - @param max_path_length: the maximal length of a path from sources and targets to orient. - @param local_search: a "Yes"/"No" parameter that enables MEO's local search functionality. See "Improving approximations with local search" in the associated paper for more information. - @param rand_restarts: The (int) of random restarts to use. 
- @param container_settings: configure the container runtime """ if not container_settings: container_settings = ProcessedContainerSettings() - if edges is None or sources is None or targets is None or output_file is None: - raise ValueError('Required Maximum Edge Orientation arguments are missing') + if not args: args = MEOParams() + MEO.validate_required_run_args(inputs) work_dir = '/spras' # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(edges, work_dir, container_settings) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) - bind_path, source_file = prepare_volume(sources, work_dir, container_settings) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, target_file = prepare_volume(targets, work_dir, container_settings) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent @@ -174,7 +189,8 @@ def run(edges=None, sources=None, targets=None, output_file=None, max_path_lengt properties_file_local = Path(out_dir, properties_file) write_properties(filename=properties_file_local, edges=edge_file, sources=source_file, targets=target_file, edge_output=mapped_output_file, path_output=mapped_path_output, - max_path_length=max_path_length, local_search=local_search, rand_restarts=rand_restarts, framework=container_settings.framework) + max_path_length=args.max_path_length, local_search=args.local_search, rand_restarts=args.rand_restarts, + framework=container_settings.framework) bind_path, properties_file = prepare_volume(str(properties_file_local), work_dir, container_settings) volumes.append(bind_path) diff --git a/spras/mincostflow.py b/spras/mincostflow.py index ea93fe5a4..dad1d706c 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -1,4 +1,7 @@ from pathlib import Path +from typing import Optional + +from pydantic import BaseModel, ConfigDict from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log @@ -9,7 +12,16 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['MinCostFlow'] +__all__ = ['MinCostFlow', 'MinCostFlowParams'] + +class MinCostFlowParams(BaseModel): + flow: Optional[int] = None + "amount of flow going through the graph" + + capacity: Optional[int] = None + "amount of capacity allowed on each edge" + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ MinCostFlow deals with fully directed graphs @@ -23,7 +35,7 @@ - the expected raw input file should have node pairs in the 1st and 2nd columns, with the weight in the 3rd column - it can include repeated and bidirectional edges """ -class MinCostFlow(PRM): +class MinCostFlow(PRM[MinCostFlowParams]): required_inputs = ['sources', 'targets', 'edges'] # NOTE: This is the DOI for the ResponseNet paper. # This version of MinCostFlow is inspired by the ResponseNet paper, but does not have @@ -35,7 +47,10 @@ def generate_inputs(data, filename_map): """ Access fields from the dataset and write the required input files @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. 
Associated files will be written with: + - sources: list of sources + - targets: list of targets + - edges: list of edges """ MinCostFlow.validate_required_inputs(filename_map) @@ -61,21 +76,10 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(sources=None, targets=None, edges=None, output_file=None, flow=None, capacity=None, container_settings=None): - """ - Run min cost flow with Docker (or singularity) - @param sources: input sources (required) - @param targets: input targets (required) - @param edges: input network file (required) - @param output_file: output file name (required) - @param flow: (int) amount of flow going through the graph (optional) - @param capacity: (float) amount of capacity allowed on each edge (optional) - @param container_settings: configure the container runtime - """ + def run(inputs, output_file, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() - # ensures that these parameters are required - if not sources or not targets or not edges or not output_file: - raise ValueError('Required MinCostFlow arguments are missing') + if not args: args = MinCostFlowParams() + MinCostFlow.validate_required_run_args(inputs) # the data files will be mapped within this directory within the container work_dir = '/mincostflow' @@ -83,13 +87,13 @@ def run(sources=None, targets=None, edges=None, output_file=None, flow=None, cap # the tuple is for mapping the sources, targets, edges, and output volumes = list() - bind_path, sources_file = prepare_volume(sources, work_dir, container_settings) + bind_path, sources_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, targets_file = prepare_volume(targets, work_dir, container_settings) + bind_path, targets_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) - bind_path, edges_file = prepare_volume(edges, work_dir, container_settings) + bind_path, edges_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) # Create a prefix for the output filename and ensure the directory exists @@ -108,10 +112,10 @@ def run(sources=None, targets=None, edges=None, output_file=None, flow=None, cap '--output', mapped_out_prefix] # Optional arguments (extend the command if available) - if flow is not None: - command.extend(['--flow', str(flow)]) - if capacity is not None: - command.extend(['--capacity', str(capacity)]) + if args.flow is not None: + command.extend(['--flow', str(args.flow)]) + if args.capacity is not None: + command.extend(['--capacity', str(args.capacity)]) # choosing to run in docker or singularity container container_suffix = "mincostflow" diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 1bfa277e9..3e5cbf1b7 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -1,12 +1,16 @@ from pathlib import Path +from typing import Optional + +from pydantic import BaseModel, ConfigDict from spras.config.container_schema import ProcessedContainerSettings +from spras.config.util import CaseInsensitiveEnum from spras.containers import prepare_volume, run_container_and_log from spras.interactome import reinsert_direction_col_mixed from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['OmicsIntegrator1', 'write_conf'] +__all__ = ['DummyMode', 'OmicsIntegrator1', 'OmicsIntegrator1Params', 'write_conf'] # TODO decide on 
default number of processes and threads @@ -36,8 +40,62 @@ def write_conf(filename=Path('config.txt'), w=None, b=None, d=None, mu=None, noi f.write('processes = 1\n') f.write('threads = 1\n') +class DummyMode(CaseInsensitiveEnum): + terminals = 'terminals' + "connect the dummy node to all nodes that have been assigned prizes" + all = 'all' + "connect the dummy node to all nodes in the interactome i.e. full set of nodes in graph" + others = 'others' + "connect the dummy node to all nodes that are not terminal nodes i.e. nodes w/o prizes" + file = 'file' + "connect the dummy node to a specific list of nodes provided in a file" + + # To make sure that DummyMode prints as `terminals`, etc. in JSON dictionaries + # (since they use object representation internally.) + def __repr__(self) -> str: + return f"'{self.name}'" + +class OmicsIntegrator1Params(BaseModel): + dummy_mode: Optional[DummyMode] = None + mu_squared: bool = False + exclude_terms: bool = False + + noisy_edges: int = 0 + "How many times you would like to add noise to the given edge values and re-run the algorithm." + + shuffled_prizes: int = 0 + "How many times the algorithm should shuffle the prizes and re-run" + + random_terminals: int = 0 + "How many times to apply the given prizes to random nodes in the interactome" + + seed: Optional[int] = None + "The randomness seed to use." + + w: float + "Float that affects the number of connected components, with higher values leading to more components" + + b: float + "The trade-off between including more prizes and using less reliable edges" -class OmicsIntegrator1(PRM): + d: int + "Controls the maximum path-length from root to terminal nodes" + + mu: float = 0.0 + "Controls the degree-based negative prizes (default 0.0)" + + noise: Optional[float] = None + "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations" + + g: float = 0.001 + "(gamma) msgsteiner reinforcement parameter that affects the convergence of the solution and runtime, with larger values leading to faster convergence but suboptimal results." + + r: float = 0 + "msgsteiner parameter that adds random noise to edges, which is rarely needed." + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) + +class OmicsIntegrator1(PRM[OmicsIntegrator1Params]): """ Omics Integrator 1 works with partially directed graphs - it takes in the universal input directly @@ -58,8 +116,10 @@ def generate_inputs(data, filename_map): """ Access fields from the dataset and write the required input files @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type - @return: + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type.
Associated files will be written with: + - prizes: list of nodes associated with their prize + - edges: list of edges associated with their weight and directionality + - dummy_nodes: list of dummy nodes """ OmicsIntegrator1.validate_required_inputs(filename_map) @@ -95,62 +155,30 @@ def generate_inputs(data, filename_map): with open(filename_map['dummy_nodes'], mode='w'): pass - # TODO add parameter validation # TODO add support for knockout argument # TODO add reasonable default values @staticmethod - def run(edges=None, prizes=None, dummy_nodes=None, dummy_mode=None, mu_squared=None, exclude_terms=None, - output_file=None, noisy_edges=None, shuffled_prizes=None, random_terminals=None, - seed=None, w=None, b=None, d=None, mu=None, noise=None, g=None, r=None, container_settings=None): - """ - Run Omics Integrator 1 in the Docker image with the provided parameters. - Does not support the garnet, cyto30, knockout, cv, or cv-reps arguments. - The configuration file is generated from the provided arguments. - Does not support the garnetBeta, processes, or threads configuration file parameters. - The msgpath is not required because msgsteiner is available in the Docker image. - Only the optimal forest sif file is retained. - All other output files are deleted. - @param output_file: the name of the output sif file for the optimal forest, which will overwrite any - existing file with this name - @param noisy_edges: How many times you would like to add noise to the given edge values and re-run the algorithm. - @param shuffled_prizes: How many times the algorithm should shuffle the prizes and re-run - @param random_terminals: How many times to apply the given prizes to random nodes in the interactome - @param seed: the randomness seed to use - @param w: float that affects the number of connected components, with higher values leading to more components - @param b: the trade-off between including more prizes and using less reliable edges - @param d: controls the maximum path-length from root to terminal nodes - @param mu: controls the degree-based negative prizes (default 0.0) - @param noise: Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations - @param g: (gamma) msgsteiner reinforcement parameter that affects the convergence of the solution and runtime, with larger values leading to faster convergence but suboptimal results (default 0.001) - @param r: msgsteiner parameter that adds random noise to edges, which is rarely needed (default 0) - @param container_settings: configure the container runtime - """ + def run(inputs, output_file, args, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() - if edges is None or prizes is None or output_file is None or w is None or b is None or d is None: - raise ValueError('Required Omics Integrator 1 arguments are missing') + OmicsIntegrator1.validate_required_run_args(inputs, ["dummy_nodes"]) work_dir = '/spras' # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(edges, work_dir, container_settings) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) - bind_path, prize_file = prepare_volume(prizes, work_dir, container_settings) + bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir, container_settings) volumes.append(bind_path) - # 4 dummy mode possibilities: - # 1. terminals -> connect the dummy node to all nodes that have been assigned prizes - # 2. 
all -> connect the dummy node to all nodes in the interactome i.e. full set of nodes in graph - # 3. others -> connect the dummy node to all nodes that are not terminal nodes i.e. nodes w/o prizes - # 4. file -> connect the dummy node to a specific list of nodes provided in a file - # add dummy node file to the volume if dummy_mode is not None and it is 'file' - if dummy_mode == 'file': - if dummy_nodes is None: + dummy_file = None + if args.dummy_mode == DummyMode.file: + if "dummy_nodes" not in inputs: raise ValueError("dummy_nodes file is required when dummy_mode is set to 'file'") - bind_path, dummy_file = prepare_volume(dummy_nodes, work_dir, container_settings) + bind_path, dummy_file = prepare_volume(inputs["dummy_nodes"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent @@ -162,7 +190,8 @@ def run(edges=None, prizes=None, dummy_nodes=None, dummy_mode=None, mu_squared=N conf_file = 'oi1-configuration.txt' conf_file_local = Path(out_dir, conf_file) # Temporary file that will be deleted after running Omics Integrator 1 - write_conf(conf_file_local, w=w, b=b, d=d, mu=mu, noise=noise, g=g, r=r) + write_conf(conf_file_local, w=args.w, b=args.b, d=args.d, mu=args.mu, + noise=args.noise, g=args.g, r=args.r) bind_path, conf_file = prepare_volume(str(conf_file_local), work_dir, container_settings) volumes.append(bind_path) @@ -175,27 +204,24 @@ def run(edges=None, prizes=None, dummy_nodes=None, dummy_mode=None, mu_squared=N '--outlabel', 'oi1'] # add the dummy mode argument - if dummy_mode is not None and dummy_mode: + if args.dummy_mode is not None: # for custom dummy modes, add the file - if dummy_mode == 'file': + if dummy_file: command.extend(['--dummyMode', dummy_file]) # else pass in the dummy_mode and let oi1 handle it else: - command.extend(['--dummyMode', dummy_mode]) + command.extend(['--dummyMode', args.dummy_mode.value]) # Add optional arguments - if mu_squared is not None and mu_squared: + if args.mu_squared: command.extend(['--musquared']) - if exclude_terms is not None and exclude_terms: + if args.exclude_terms: command.extend(['--excludeTerms']) - if noisy_edges is not None: - command.extend(['--noisyEdges', str(noisy_edges)]) - if shuffled_prizes is not None: - command.extend(['--shuffledPrizes', str(shuffled_prizes)]) - if random_terminals is not None: - command.extend(['--randomTerminals', str(random_terminals)]) - if seed is not None: - command.extend(['--seed', str(seed)]) + command.extend(['--noisyEdges', str(args.noisy_edges)]) + command.extend(['--shuffledPrizes', str(args.shuffled_prizes)]) + command.extend(['--randomTerminals', str(args.random_terminals)]) + if args.seed is not None: + command.extend(['--seed', str(args.seed)]) container_suffix = "omics-integrator-1:v2" run_container_and_log('Omics Integrator 1', diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index c8811d2c0..38624d3ab 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -1,15 +1,55 @@ from pathlib import Path +from typing import Optional import pandas as pd +from pydantic import BaseModel, ConfigDict from spras.config.container_schema import ProcessedContainerSettings +from spras.config.util import CaseInsensitiveEnum from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import reinsert_direction_col_undirected from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges -__all__ = ['OmicsIntegrator2'] +__all__ = ['DummyMode', 
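The params-model pattern above replaces the old keyword-argument checks: pydantic coerces types, fills defaults, and rejects unknown keys. A minimal sketch of that behavior with a stand-in model (not SPRAS code; only w, b, and d are required, mirroring OmicsIntegrator1Params):

from typing import Optional

from pydantic import BaseModel, ConfigDict, ValidationError

class Params(BaseModel):  # stand-in for OmicsIntegrator1Params
    w: float
    b: float
    d: int
    mu: float = 0.0
    seed: Optional[int] = None
    model_config = ConfigDict(extra='forbid')

# Values parsed from YAML arrive as plain dicts; validation coerces and applies defaults
params = Params.model_validate({'w': 5, 'b': 1, 'd': 10})
assert params.mu == 0.0

# extra='forbid' turns config typos into errors instead of silently ignoring them
try:
    Params.model_validate({'w': 5, 'b': 1, 'd': 10, 'dd': 10})
except ValidationError as err:
    print(err)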
diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py
index c8811d2c0..38624d3ab 100644
--- a/spras/omicsintegrator2.py
+++ b/spras/omicsintegrator2.py
@@ -1,15 +1,55 @@
 from pathlib import Path
+from typing import Optional
 
 import pandas as pd
+from pydantic import BaseModel, ConfigDict
 
 from spras.config.container_schema import ProcessedContainerSettings
+from spras.config.util import CaseInsensitiveEnum
 from spras.containers import prepare_volume, run_container_and_log
 from spras.dataset import Dataset
 from spras.interactome import reinsert_direction_col_undirected
 from spras.prm import PRM
 from spras.util import add_rank_column, duplicate_edges
 
-__all__ = ['OmicsIntegrator2']
+__all__ = ['DummyMode', 'OmicsIntegrator2', 'OmicsIntegrator2Params']
+
+class DummyMode(CaseInsensitiveEnum):
+    terminals = 'terminals'
+    others = 'others'
+    all = 'all'
+
+class OmicsIntegrator2Params(BaseModel):
+    w: float = 5
+    "Omega: the weight of the edges connecting the dummy node to the nodes selected by dummyMode"
+
+    b: float = 1
+    "Beta: scaling factor of prizes"
+
+    g: float = 3
+    "Gamma: multiplicative edge penalty from degree of endpoints"
+
+    noise: Optional[float] = None
+    "Standard deviation of the Gaussian noise added to edges in Noisy Edges Randomizations."
+
+    noisy_edges: Optional[int] = None
+    "An integer specifying how many times to add noise to the given edge values and re-run."
+
+    random_terminals: Optional[int] = None
+    "An integer specifying how many times to apply your given prizes to random nodes in the interactome and re-run"
+
+    dummy_mode: Optional[DummyMode] = None
+    """
+    Tells the program which nodes in the interactome to connect the dummy node to. (default: terminals)
+    "terminals" = connect to all terminals
+    "others" = connect to all nodes except for terminals
+    "all" = connect to all nodes in the interactome.
+    """
+
+    seed: Optional[int] = None
+    "The random seed to use for this run."
+
+    model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)
 
 """
 Omics Integrator 2 will construct a fully undirected graph from the provided input file
@@ -21,17 +61,19 @@
 - the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column
 - it can include repeated and bidirectional edges
 """
-class OmicsIntegrator2(PRM):
+class OmicsIntegrator2(PRM[OmicsIntegrator2Params]):
     required_inputs = ['prizes', 'edges']
     # OI2 does not have a specific paper. Instead, we link to the OI1 paper.
     dois = ["10.1371/journal.pcbi.1004879"]
 
+    @staticmethod
     def generate_inputs(data: Dataset, filename_map):
         """
-        Access fields from the dataset and write the required input files.
-        Automatically converts edge weights to edge costs.
+        Access fields from the dataset and write the required input files
         @param data: dataset
-        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type
+        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with:
+        - prizes: list of nodes associated with their prize
+        - edges: list of edges associated with their cost (transformed from the original Dataset weights)
         """
         OmicsIntegrator2.validate_required_inputs(filename_map)
@@ -64,42 +106,22 @@ def generate_inputs(data: Dataset, filename_map):
         edges_df.to_csv(filename_map['edges'], sep='\t', index=False,
                         columns=['Interactor1', 'Interactor2', 'cost'],
                         header=['protein1', 'protein2', 'cost'])
 
-    # TODO add parameter validation
     # TODO add reasonable default values
     @staticmethod
-    def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise=None, noisy_edges=None,
-            random_terminals=None, dummy_mode=None, seed=None, container_settings=None):
-        """
-        Run Omics Integrator 2 in the Docker image with the provided parameters.
-        Only the .tsv output file is retained and then renamed.
-        All other output files are deleted.
-        @param output_file: the name of the output file, which will overwrite any existing file with this name
-        @param w: Omega: the weight of the edges connecting the dummy node to the nodes selected by dummyMode (default: 5)
-        @param b: Beta: scaling factor of prizes (default: 1)
-        @param g: Gamma: multiplicative edge penalty from degree of endpoints (default: 3)
-        @param noise: standard deviation of the Gaussian noise added to edges in Noisy Edges Randomizations.
-        @param noisy_edges: An integer specifying how many times to add noise to the given edge values and re-run.
-        @param random_terminals: An integer specifying how many times to apply your given prizes to random nodes in the interactome and re-run
-        @param dummy_mode: Tells the program which nodes in the interactome to connect the dummy node to. (default: terminals)
-        "terminals" = connect to all terminals
-        "others" = connect to all nodes except for terminals
-        "all" = connect to all nodes in the interactome.
-        @param seed: The random seed to use for this run.
-        @param container_settings: configure the container runtime
-        """
+    def run(inputs, output_file, args=None, container_settings=None):
         if not container_settings: container_settings = ProcessedContainerSettings()
-        if edges is None or prizes is None or output_file is None:
-            raise ValueError('Required Omics Integrator 2 arguments are missing')
+        if not args: args = OmicsIntegrator2Params()
+        OmicsIntegrator2.validate_required_run_args(inputs)
 
         work_dir = '/spras'
 
         # Each volume is a tuple (src, dest)
         volumes = list()
 
-        bind_path, edge_file = prepare_volume(edges, work_dir, container_settings)
+        bind_path, edge_file = prepare_volume(inputs["edges"], work_dir, container_settings)
         volumes.append(bind_path)
 
-        bind_path, prize_file = prepare_volume(prizes, work_dir, container_settings)
+        bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir, container_settings)
         volumes.append(bind_path)
 
         out_dir = Path(output_file).parent
@@ -112,23 +134,23 @@ def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise
                    '-o', mapped_out_dir, '--filename', 'oi2']
 
         # Add optional arguments
-        if w is not None:
-            command.extend(['-w', str(w)])
-        if b is not None:
-            command.extend(['-b', str(b)])
-        if g is not None:
-            command.extend(['-g', str(g)])
-        if noise is not None:
-            command.extend(['-noise', str(noise)])
-        if noisy_edges is not None:
-            command.extend(['--noisy_edges', str(noisy_edges)])
-        if random_terminals is not None:
-            command.extend(['--random_terminals', str(random_terminals)])
-        if dummy_mode is not None:
+        if args.w is not None:
+            command.extend(['-w', str(args.w)])
+        if args.b is not None:
+            command.extend(['-b', str(args.b)])
+        if args.g is not None:
+            command.extend(['-g', str(args.g)])
+        if args.noise is not None:
+            command.extend(['-noise', str(args.noise)])
+        if args.noisy_edges is not None:
+            command.extend(['--noisy_edges', str(args.noisy_edges)])
+        if args.random_terminals is not None:
+            command.extend(['--random_terminals', str(args.random_terminals)])
+        if args.dummy_mode is not None:
             # This argument does not follow the other naming conventions
-            command.extend(['--dummyMode', str(dummy_mode)])
-        if seed is not None:
-            command.extend(['--seed', str(seed)])
+            command.extend(['--dummyMode', args.dummy_mode.value])
+        if args.seed is not None:
+            command.extend(['--seed', str(args.seed)])
 
         container_suffix = "omics-integrator-2:v3"
         run_container_and_log('Omics Integrator 2',
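CaseInsensitiveEnum is what lets a user write dummy_mode: "TERMINALS" or "File" in the YAML config and still get the right member. A rough sketch of how such an enum can be built on the standard library (the real class lives in spras.config.util and may differ in detail):

from enum import Enum

class DummyMode(str, Enum):  # simplified stand-in for CaseInsensitiveEnum
    terminals = 'terminals'
    others = 'others'
    all = 'all'

    @classmethod
    def _missing_(cls, value):
        # Fall back to a lowercase lookup when the exact value is not found
        if isinstance(value, str):
            return cls.__members__.get(value.lower())
        return None

assert DummyMode('TERMINALS') is DummyMode.terminals
assert DummyMode('Others').value == 'others'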
diff --git a/spras/pathlinker.py b/spras/pathlinker.py
index 8e40743a8..c534f2944 100644
--- a/spras/pathlinker.py
+++ b/spras/pathlinker.py
@@ -1,6 +1,8 @@
 import warnings
 from pathlib import Path
 
+from pydantic import BaseModel, ConfigDict
+
 from spras.config.container_schema import ProcessedContainerSettings
 from spras.containers import prepare_volume, run_container_and_log
 from spras.dataset import Dataset
@@ -11,7 +13,13 @@
 from spras.prm import PRM
 from spras.util import duplicate_edges, raw_pathway_df
 
-__all__ = ['PathLinker']
+__all__ = ['PathLinker', 'PathLinkerParams']
+
+class PathLinkerParams(BaseModel):
+    k: int = 100
+    "Number of paths"
+
+    model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)
 
 """
 Pathlinker will construct a fully directed graph from the provided input file
@@ -23,7 +31,7 @@
 - the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column
 - it can include repeated and bidirectional edges
 """
-class PathLinker(PRM):
+class PathLinker(PRM[PathLinkerParams]):
     required_inputs = ['nodetypes', 'network']
     dois = ["10.1038/npjsba.2016.2", "10.1089/cmb.2012.0274"]
 
@@ -32,8 +40,9 @@ def generate_inputs(data, filename_map):
         """
         Access fields from the dataset and write the required input files
         @param data: dataset
-        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type
-        @return:
+        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with:
+        - nodetypes: list of nodes tagged with whether they are a source or a target
+        - network: list of edges
         """
         PathLinker.validate_required_inputs(filename_map)
@@ -65,34 +74,21 @@ def generate_inputs(data, filename_map):
         edges.to_csv(filename_map["network"],sep="\t",index=False,columns=["Interactor1","Interactor2","Weight"],
                      header=["#Interactor1","Interactor2","Weight"])
 
-    # Skips parameter validation step
     @staticmethod
-    def run(nodetypes=None, network=None, output_file=None, k=None, container_settings=None):
-        """
-        Run PathLinker with Docker
-        @param nodetypes: input node types with sources and targets (required)
-        @param network: input network file (required)
-        @param output_file: path to the output pathway file (required)
-        @param k: number of paths (optional)
-        @param container_settings: configure the container runtime
-        """
+    def run(inputs, output_file, args=None, container_settings=None):
         if not container_settings: container_settings = ProcessedContainerSettings()
-        # Add additional parameter validation
-        # Do not require k
-        # Use the PathLinker default
-        # Could consider setting the default here instead
-        if not nodetypes or not network or not output_file:
-            raise ValueError('Required PathLinker arguments are missing')
+        if not args: args = PathLinkerParams()
+        PathLinker.validate_required_run_args(inputs)
 
         work_dir = '/spras'
 
         # Each volume is a tuple (src, dest)
         volumes = list()
 
-        bind_path, node_file = prepare_volume(nodetypes, work_dir, container_settings)
+        bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir, container_settings)
         volumes.append(bind_path)
 
-        bind_path, network_file = prepare_volume(network, work_dir, container_settings)
+        bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings)
         volumes.append(bind_path)
 
         # PathLinker does not provide an argument to set the output directory
@@ -110,9 +106,7 @@ def run(nodetypes=None, network=None, output_file=None, k=None, container_settin
                    node_file,
                    '--output', mapped_out_prefix]
 
-        # Add optional argument
-        if k is not None:
-            command.extend(['-k', str(k)])
+        command.extend(['-k', str(args.k)])
 
         container_suffix = "pathlinker:v2"
         run_container_and_log('PathLinker',
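With this refactor, every PRM takes an inputs dict keyed by its required_inputs, an output path, and a typed args model. A hypothetical PathLinker call under the new signature (the paths are placeholders; compare the updated tests further below):

from spras.pathlinker import PathLinker, PathLinkerParams

PathLinker.run(
    {'nodetypes': 'input/nodetypes.txt', 'network': 'input/network.txt'},  # placeholder paths
    output_file='output/pathlinker-pathway.txt',
    args=PathLinkerParams(k=10),  # omit args entirely to get the k=100 default
)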
diff --git a/spras/prm.py b/spras/prm.py
index eaa6fbe68..636859cf2 100644
--- a/spras/prm.py
+++ b/spras/prm.py
@@ -1,11 +1,16 @@
-import typing
+import os
 from abc import ABC, abstractmethod
-from typing import Any
+from pathlib import Path
+from typing import Any, Generic, Optional, TypeVar, cast, get_args
 
+from pydantic import BaseModel
+
+from spras.config.container_schema import ProcessedContainerSettings
 from spras.dataset import Dataset
 
+T = TypeVar('T', bound=BaseModel)
 
-class PRM(ABC):
+class PRM(ABC, Generic[T]):
     """
     The PRM (Pathway Reconstruction Module) class,
     which defines the interface that `runner.py` uses to handle
@@ -15,7 +20,7 @@ class PRM(ABC):
     required_inputs: list[str] = []
     # DOIs aren't strictly required (e.g. local neighborhood),
     # but it should be explicitly declared that there are no DOIs by defining an empty list.
-    dois: list[str] = typing.cast(list[str], None)
+    dois: list[str] = cast(list[str], None)
 
     def __init_subclass__(cls):
         # modified from https://stackoverflow.com/a/58206480/7589775
@@ -30,11 +35,58 @@ def __init_subclass__(cls):
     @staticmethod
     @abstractmethod
     def generate_inputs(data: Dataset, filename_map: dict[str, str]):
+        """
+        Access fields from the dataset and write the required input files
+        @param data: dataset
+        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type
+        """
         raise NotImplementedError
 
+    @classmethod
+    def get_params_generic(cls) -> type[T]:
+        """
+        Gets the class instance of the parameter type passed into T, allowing us to use the
+        underlying pydantic model associated with it.
+
+        For example, on `class PathLinker(PRM[PathLinkerParams])`,
+        calling `PathLinker.get_params_generic()` returns `PathLinkerParams`.
+        """
+        # TODO: use the type-safe get_original_bases when we bump to >= Python 3.12
+        # This is hacky reflection from https://stackoverflow.com/a/71720366/7589775
+        # which grabs the class of type T by the definition of `__orig_bases__`.
+        return get_args(cast(Any, cls).__orig_bases__[0])[0]
+
+    # This is used in `runner.py` to avoid a dependency diamond when trying
+    # to import the actual algorithm schema.
+    @classmethod
+    def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: dict[str, Any], container_settings: ProcessedContainerSettings):
+        """
+        This is similar to PRM.run, but it performs the pydantic validation internally to re-validate argument parameters.
+        """
+        T_class = cls.get_params_generic()
+
+        # Since we just used reflection, we provide a descriptive error message here
+        # to protect against any developer confusion.
+        if not issubclass(T_class, BaseModel):
+            raise RuntimeError("The generic passed into PRM is not a pydantic.BaseModel.")
+
+        # Validates our untyped `args` parameter against our parameter class of type T
+        # using BaseModel.model_validate (https://docs.pydantic.dev/latest/api/base_model/#pydantic.BaseModel.model_validate)
+        # (Pydantic already provides nice error messages, so we don't need to worry about catching this.)
+        T_parsed = T_class.model_validate(args)
+
+        return cls.run(inputs, output_file, T_parsed, container_settings)
+
     @staticmethod
     @abstractmethod
-    def run(**kwargs):
+    def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: T, container_settings: ProcessedContainerSettings):
+        """
+        Runs an algorithm with the specified inputs, algorithm params (T),
+        the designated output_file, and the desired container_settings.
+
+        See the algorithm-specific `generate_inputs` and `parse_output`
+        for information about the input and output format.
+        """
         raise NotImplementedError
 
     @staticmethod
@@ -47,3 +99,35 @@ def validate_required_inputs(cls, filename_map: dict[str, str]):
         for input_type in cls.required_inputs:
             if input_type not in filename_map:
                 raise ValueError("{input_type} filename is missing")
+
+    @classmethod
+    def validate_required_run_args(cls, inputs: dict[str, str | os.PathLike], relax: Optional[list[str]] = None):
+        """
+        Validates the `inputs` parameter for `PRM#run`.
+
+        @param inputs: See `PRM#run`.
+        @param relax: List of inputs that aren't required: if they are specified, they must still be valid paths
+        """
+        if not relax: relax = []
+
+        # Check that `relax` is a valid list
+        for entry in relax:
+            if entry not in cls.required_inputs:
+                raise RuntimeError(f"{entry} is not contained in this PRM's required inputs ({cls.required_inputs}). This should have been caught in testing.")
+
+        # Check that all non-relaxed required inputs are present
+        for input_type in cls.required_inputs:
+            if input_type not in inputs or not inputs[input_type]:
+                # Ignore relaxed inputs
+                if input_type in relax:
+                    continue
+                raise ValueError(f'Required input "{input_type}" is not set')
+
+            path = Path(inputs[input_type])
+            if not path.exists():
+                raise OSError(f'Required input "{input_type}" is pointing to a missing file "{path}".')
+
+        # Then, check that all inputs are required inputs (to prevent typos / catch errors when inputs are updated)
+        for input_type in inputs.keys():
+            if input_type not in cls.required_inputs:
+                raise ValueError(f'Extra input "{input_type}" was provided but is not present in required inputs ({cls.required_inputs})')
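The `__orig_bases__` reflection in get_params_generic is the standard trick for recovering a generic argument at runtime before Python 3.12's types.get_original_bases. A self-contained sketch of the same mechanism (toy names, not SPRAS code):

from typing import Any, Generic, TypeVar, cast, get_args

from pydantic import BaseModel

T = TypeVar('T', bound=BaseModel)

class Base(Generic[T]):  # stand-in for PRM
    @classmethod
    def params_type(cls) -> type[T]:
        # cls.__orig_bases__[0] is Base[MyParams]; get_args() extracts (MyParams,)
        return get_args(cast(Any, cls).__orig_bases__[0])[0]

class MyParams(BaseModel):
    k: int = 100

class MyAlgo(Base[MyParams]):
    pass

assert MyAlgo.params_type() is MyParams
# run_typeless then only needs model_validate to turn a raw dict into typed args
assert MyAlgo.params_type().model_validate({'k': '10'}).k == 10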
+ """ + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ ResponseNet will construct a fully directed graph from the provided input file @@ -21,7 +32,7 @@ - the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column - it can include bidirectional edges, but will only keep one copy of repeated edges """ -class ResponseNet(PRM): +class ResponseNet(PRM[ResponseNetParams]): required_inputs = ['sources', 'targets', 'edges'] dois = ["10.1038/ng.337"] @@ -30,7 +41,10 @@ def generate_inputs(data, filename_map): """ Access fields from the dataset and write the required input files @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with: + - sources: list of sources + - targets: list of targets + - edges: list of edges """ ResponseNet.validate_required_inputs(filename_map) @@ -54,21 +68,10 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(sources=None, targets=None, edges=None, output_file=None, gamma=10, container_settings=None): - """ - Run ResponseNet with Docker (or singularity) - @param sources: input sources (required) - @param targets: input targets (required) - @param edges: input network file (required) - @param output_file: output file name (required) - @param gamma: integer representing gamma (optional, default is 10) - @param container_settings: configure the container runtime - """ + def run(inputs, output_file, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() - - # ensures that these parameters are required - if not sources or not targets or not edges or not output_file: - raise ValueError('Required ResponseNet arguments are missing') + ResponseNet.validate_required_run_args(inputs) + if not args: args = ResponseNetParams() # the data files will be mapped within this directory within the container work_dir = '/ResponseNet' @@ -76,13 +79,13 @@ def run(sources=None, targets=None, edges=None, output_file=None, gamma=10, cont # the tuple is for mapping the sources, targets, edges, and output volumes = list() - bind_path, sources_file = prepare_volume(sources, work_dir, container_settings) + bind_path, sources_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, targets_file = prepare_volume(targets, work_dir, container_settings) + bind_path, targets_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) - bind_path, edges_file = prepare_volume(edges, work_dir, container_settings) + bind_path, edges_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) # Create a prefix for the output filename and ensure the directory exists @@ -92,7 +95,7 @@ def run(sources=None, targets=None, edges=None, output_file=None, gamma=10, cont volumes.append(bind_path) mapped_out_prefix = Path(mapped_out_dir) - out_file_suffixed = out_dir / f'output_gamma{str(gamma)}.txt' + out_file_suffixed = out_dir / f'output_gamma{str(args.gamma)}.txt' # Makes the Python command to run within in the container command = ['python', @@ -101,7 +104,7 @@ def run(sources=None, targets=None, edges=None, output_file=None, gamma=10, cont '--sources_file', sources_file, '--targets_file', targets_file, '--output', 
str(Path(mapped_out_prefix, 'output').as_posix()), - '--gamma', str(gamma)] + '--gamma', str(args.gamma)] # choosing to run in docker or singularity container container_suffix = "responsenet:v2" diff --git a/spras/runner.py b/spras/runner.py index 9a5a04f32..d138d8e33 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -35,12 +35,14 @@ def get_algorithm(algorithm: str) -> type[PRM]: except KeyError as exc: raise NotImplementedError(f'{algorithm} is not currently supported.') from exc -def run(algorithm: str, params): +def run(algorithm: str, inputs, output_file, args, container_settings): """ A generic interface to the algorithm-specific run functions """ algorithm_runner = get_algorithm(algorithm) - algorithm_runner.run(**params) + # We can't use config.config here else we would get a cyclic dependency. + # Since args is a dict here, we use the 'run_typeless' utility PRM function. + algorithm_runner.run_typeless(inputs, output_file, args, container_settings) def get_required_inputs(algorithm: str): diff --git a/spras/rwr.py b/spras/rwr.py index e7691f160..e6f54d674 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -1,6 +1,8 @@ from pathlib import Path +from typing import Optional import pandas as pd +from pydantic import BaseModel, ConfigDict from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log @@ -12,14 +14,30 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['RWR'] +__all__ = ['RWR', 'RWRParams'] -class RWR(PRM): +class RWRParams(BaseModel): + threshold: int + "The number of nodes to return" + + alpha: Optional[float] = None + "The chance of a restart during the random walk" + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) + +class RWR(PRM[RWRParams]): required_inputs = ['network','nodes'] dois = [] @staticmethod def generate_inputs(data, filename_map): + """ + Access fields from the dataset and write the required input files + @param data: dataset + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. 
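End to end, the reconstruct rule now hands runner.run the inputs dict, the output path, the raw parameter dict, and the container settings, and run_typeless validates the dict against the algorithm's params model. A hypothetical invocation (all values are placeholders):

import spras.runner as runner
from spras.config.container_schema import ProcessedContainerSettings

runner.run(
    'pathlinker',
    inputs={'nodetypes': 'prepared/nodetypes.txt', 'network': 'prepared/network.txt'},  # placeholder paths
    output_file='output/pathlinker-pathway.txt',
    args={'k': 100},  # still a plain dict here; run_typeless validates it
    container_settings=ProcessedContainerSettings(),
)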
diff --git a/spras/rwr.py b/spras/rwr.py
index e7691f160..e6f54d674 100644
--- a/spras/rwr.py
+++ b/spras/rwr.py
@@ -1,6 +1,8 @@
 from pathlib import Path
+from typing import Optional
 
 import pandas as pd
+from pydantic import BaseModel, ConfigDict
 
 from spras.config.container_schema import ProcessedContainerSettings
 from spras.containers import prepare_volume, run_container_and_log
@@ -12,14 +14,30 @@
 from spras.prm import PRM
 from spras.util import add_rank_column, duplicate_edges, raw_pathway_df
 
-__all__ = ['RWR']
 
-class RWR(PRM):
+__all__ = ['RWR', 'RWRParams']
+
+class RWRParams(BaseModel):
+    threshold: int
+    "The number of nodes to return"
+
+    alpha: Optional[float] = None
+    "The chance of a restart during the random walk"
+
+    model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)
+
+class RWR(PRM[RWRParams]):
     required_inputs = ['network','nodes']
     dois = []
 
     @staticmethod
     def generate_inputs(data, filename_map):
+        """
+        Access fields from the dataset and write the required input files
+        @param data: dataset
+        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with:
+        - nodes: list of active nodes
+        - network: list of edges
+        """
         RWR.validate_required_inputs(filename_map)
 
         # Get sources and targets for node input file
@@ -38,12 +56,11 @@ def generate_inputs(data, filename_map):
         edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False)
 
     @staticmethod
-    def run(network=None, nodes=None, alpha=None, output_file=None, container_settings=None, threshold=None):
+    def run(inputs, output_file, args, container_settings=None):
         if not container_settings: container_settings = ProcessedContainerSettings()
-        if not nodes:
-            raise ValueError('Required RWR arguments are missing')
+        RWR.validate_required_run_args(inputs)
 
-        with Path(network).open() as network_f:
+        with Path(inputs["network"]).open() as network_f:
             for line in network_f:
                 line = line.strip()
                 endpoints = line.split("|")
@@ -54,10 +71,10 @@ def run(network=None, nodes=None, alpha=None, output_file=None, container_settin
         # Each volume is a tuple (src, dest)
         volumes = list()
 
-        bind_path, nodes_file = prepare_volume(nodes, work_dir, container_settings)
+        bind_path, nodes_file = prepare_volume(inputs["nodes"], work_dir, container_settings)
         volumes.append(bind_path)
 
-        bind_path, network_file = prepare_volume(network, work_dir, container_settings)
+        bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings)
         volumes.append(bind_path)
 
         # RWR does not provide an argument to set the output directory
@@ -75,8 +92,8 @@ def run(network=None, nodes=None, alpha=None, output_file=None, container_settin
                    '--output', mapped_out_prefix]
 
         # Add alpha as an optional argument
-        if alpha is not None:
-            command.extend(['--alpha', str(alpha)])
+        if args.alpha is not None:
+            command.extend(['--alpha', str(args.alpha)])
 
         container_suffix = 'rwr:v1'
         run_container_and_log(
diff --git a/spras/strwr.py b/spras/strwr.py
index c99b39830..42928e4cd 100644
--- a/spras/strwr.py
+++ b/spras/strwr.py
@@ -1,4 +1,7 @@
 from pathlib import Path
+from typing import Optional
+
+from pydantic import BaseModel, ConfigDict
 
 from spras.config.container_schema import ProcessedContainerSettings
 from spras.containers import prepare_volume, run_container_and_log
@@ -10,15 +13,32 @@
 from spras.prm import PRM
 from spras.util import add_rank_column, duplicate_edges, raw_pathway_df
 
-__all__ = ['ST_RWR']
+__all__ = ['ST_RWR', 'ST_RWRParams']
+
+class ST_RWRParams(BaseModel):
+    threshold: int
+    "The number of nodes to return"
+
+    alpha: Optional[float] = None
+    "The chance of a restart during the random walk"
+
+    model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)
 
 # Note: This class is almost identical to the rwr.py file.
-class ST_RWR(PRM):
+class ST_RWR(PRM[ST_RWRParams]):
     required_inputs = ['network','sources','targets']
     dois = []
 
     @staticmethod
     def generate_inputs(data, filename_map):
+        """
+        Access fields from the dataset and write the required input files
+        @param data: dataset
+        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with:
+        - sources: list of sources
+        - targets: list of targets
+        - network: list of edges
+        """
         ST_RWR.validate_required_inputs(filename_map)
 
         # Get separate source and target nodes for source and target files
@@ -38,12 +58,11 @@ def generate_inputs(data, filename_map):
         edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False)
 
     @staticmethod
-    def run(network=None, sources=None, targets=None, alpha=None, output_file=None, container_settings=None, threshold=None):
+    def run(inputs, output_file, args, container_settings=None):
         if not container_settings: container_settings = ProcessedContainerSettings()
-        if not sources or not targets or not network or not output_file:
-            raise ValueError('Required local_neighborhood arguments are missing')
+        ST_RWR.validate_required_run_args(inputs)
 
-        with Path(network).open() as network_f:
+        with Path(inputs["network"]).open() as network_f:
             for line in network_f:
                 line = line.strip()
                 endpoints = line.split("|")
@@ -55,13 +74,13 @@ def run(network=None, sources=None, targets=None, alpha=None, output_file=None,
         # Each volume is a tuple (src, dest)
         volumes = list()
 
-        bind_path, source_file = prepare_volume(sources, work_dir, container_settings)
+        bind_path, source_file = prepare_volume(inputs["sources"], work_dir, container_settings)
         volumes.append(bind_path)
 
-        bind_path, target_file = prepare_volume(targets, work_dir, container_settings)
+        bind_path, target_file = prepare_volume(inputs["targets"], work_dir, container_settings)
         volumes.append(bind_path)
 
-        bind_path, network_file = prepare_volume(network, work_dir, container_settings)
+        bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings)
         volumes.append(bind_path)
 
         # ST_RWR does not provide an argument to set the output directory
@@ -80,8 +99,8 @@ def run(network=None, sources=None, targets=None, alpha=None, output_file=None,
                    '--output', mapped_out_prefix]
 
         # Add alpha as an optional argument
-        if alpha is not None:
-            command.extend(['--alpha', str(alpha)])
+        if args.alpha is not None:
+            command.extend(['--alpha', str(args.alpha)])
 
         container_suffix = 'st-rwr:v1'
         run_container_and_log(
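validate_required_run_args gives every PRM the same three checks: each non-relaxed required input is present, each provided path exists, and no unexpected keys slip through. Omics Integrator 1 relaxes dummy_nodes, so a call like the following (assuming the placeholder files exist) passes without a dummy file:

from spras.omicsintegrator1 import OmicsIntegrator1

# dummy_nodes is relaxed: it may be omitted, but a provided path must still exist
OmicsIntegrator1.validate_required_run_args(
    {'edges': 'input/oi1-edges.txt', 'prizes': 'input/oi1-prizes.txt'},  # placeholder paths
    relax=['dummy_nodes'],
)

# A mistyped key is rejected outright:
# OmicsIntegrator1.validate_required_run_args({'edge': 'input/oi1-edges.txt'})  # ValueError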
diff --git a/test/AllPairs/test_ap.py b/test/AllPairs/test_ap.py
index 460761e44..e9e8a8e37 100644
--- a/test/AllPairs/test_ap.py
+++ b/test/AllPairs/test_ap.py
@@ -46,11 +46,10 @@ def test_allpairs(self):
         out_path = OUT_DIR.joinpath('sample-out.txt')
         out_path.unlink(missing_ok=True)
         # Only include required arguments
-        AllPairs.run(
-            nodetypes=str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'),
-            network=str(TEST_DIR / 'input' / 'sample-in-net.txt'),
-            directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'),
-            output_file=str(out_path)
+        AllPairs.run({"nodetypes": str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'),
+                      "network": str(TEST_DIR / 'input' / 'sample-in-net.txt'),
+                      "directed_flag": str(TEST_DIR / 'input' / 'directed-flag-false.txt')},
+                     output_file=str(out_path)
         )
         assert out_path.exists()
 
@@ -58,9 +57,8 @@ def test_allpairs_missing(self):
         # Test the expected error is raised when required arguments are missing
         with pytest.raises(ValueError):
             # No nodetypes
-            AllPairs.run(
-                network=str(TEST_DIR / 'input' / 'sample-in-net.txt'),
-                output_file=str(OUT_DIR / 'sample-out.txt'))
+            AllPairs.run({"network": str(TEST_DIR / 'input' / 'sample-in-net.txt')},
+                         output_file=str(OUT_DIR / 'sample-out.txt'))
 
     # Only run Singularity test if the binary is available on the system
     # spython is only available on Unix, but do not explicitly skip non-Unix platforms
@@ -69,24 +67,22 @@ def test_allpairs_singularity(self):
         out_path = OUT_DIR / 'sample-out.txt'
         out_path.unlink(missing_ok=True)
         # Only include required arguments and run with Singularity
-        AllPairs.run(
-            nodetypes=str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'),
-            network=str(TEST_DIR / 'input' / 'sample-in-net.txt'),
-            directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'),
-            output_file=str(out_path),
-            container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity))
+        AllPairs.run({"nodetypes": str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'),
+                      "network": str(TEST_DIR / 'input' / 'sample-in-net.txt'),
+                      "directed_flag": str(TEST_DIR / 'input' / 'directed-flag-false.txt')},
+                     output_file=str(out_path),
+                     container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity))
         assert out_path.exists()
 
     @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system')
     def test_allpairs_singularity_unpacked(self):
         out_path = OUT_DIR / 'sample-out-unpack.txt'
         out_path.unlink(missing_ok=True)
-        AllPairs.run(
-            nodetypes=str(TEST_DIR / 'input/sample-in-nodetypes.txt'),
-            network=str(TEST_DIR / 'input/sample-in-net.txt'),
-            directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'),
-            output_file=str(out_path),
-            container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity, unpack_singularity=True))
+        AllPairs.run({"nodetypes": str(TEST_DIR / 'input/sample-in-nodetypes.txt'),
+                      "network": str(TEST_DIR / 'input/sample-in-net.txt'),
+                      "directed_flag": str(TEST_DIR / 'input' / 'directed-flag-false.txt')},
+                     output_file=str(out_path),
+                     container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity, unpack_singularity=True))
         assert out_path.exists()
 
     def test_allpairs_correctness(self):
@@ -102,12 +98,10 @@ def test_allpairs_correctness(self):
         out_path = OUT_DIR / 'correctness-out.txt'
         out_path.unlink(missing_ok=True)
 
-        AllPairs.run(
-            nodetypes=str(TEST_DIR / 'input' / 'correctness-nodetypes.txt'),
-            network=str(TEST_DIR / 'input' / 'correctness-network.txt'),
-            directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'),
-            output_file=str(OUT_DIR / 'correctness-out.txt')
-        )
+        AllPairs.run({"nodetypes": TEST_DIR / 'input' / 'correctness-nodetypes.txt',
+                      "network": TEST_DIR / 'input' / 'correctness-network.txt',
+                      "directed_flag": TEST_DIR / 'input' / 'directed-flag-false.txt'},
+                     output_file=OUT_DIR / 'correctness-out.txt')
 
         edge_equality_test_util(out_path, EXPECTED_DIR / 'correctness-expected.txt')
 
     def test_allpairs_directed(self):
@@ -115,12 +109,10 @@ def test_allpairs_directed(self):
         out_path = OUT_DIR / 'directed-out.txt'
         out_path.unlink(missing_ok=True)
 
-        AllPairs.run(
-            nodetypes=str(TEST_DIR / 'input' / 'directed-nodetypes.txt'),
-            network=str(TEST_DIR / 'input' / 'directed-network.txt'),
-            directed_flag=str(TEST_DIR / 'input' / 'directed-flag-true.txt'),
-            output_file=str(OUT_DIR / 'directed-out.txt'),
-        )
+        AllPairs.run({"nodetypes": TEST_DIR / 'input' / 'directed-nodetypes.txt',
+                      "network": TEST_DIR / 'input' / 'directed-network.txt',
+                      "directed_flag": TEST_DIR / 'input' / 'directed-flag-true.txt'},
+                     output_file=OUT_DIR / 'directed-out.txt')
 
         edge_equality_test_util(out_path, EXPECTED_DIR.joinpath('directed-expected.txt'))
 
@@ -134,11 +126,10 @@ def test_allpairs_zero_length(self):
         out_path = OUT_DIR / 'zero-length-out.txt'
         out_path.unlink(missing_ok=True)
 
-        AllPairs.run(
-            nodetypes=TEST_DIR / 'input' / 'zero-length-nodetypes.txt',
-            network=TEST_DIR / 'input' / 'zero-length-network.txt',
-            directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'),
-            output_file=OUT_DIR / 'zero-length-out.txt'
+        AllPairs.run({"nodetypes": TEST_DIR / 'input' / 'zero-length-nodetypes.txt',
+                      "network": TEST_DIR / 'input' / 'zero-length-network.txt',
+                      "directed_flag": TEST_DIR / 'input' / 'directed-flag-false.txt'},
+                     output_file=OUT_DIR / 'zero-length-out.txt'
         )
 
         assert filecmp.cmp(OUT_DIR / 'zero-length-out.txt', EXPECTED_DIR / 'zero-length-expected.txt', shallow=False)
diff --git a/test/BowTieBuilder/test_btb.py b/test/BowTieBuilder/test_btb.py
index c6c1046ab..18279ddae 100644
--- a/test/BowTieBuilder/test_btb.py
+++ b/test/BowTieBuilder/test_btb.py
@@ -20,53 +20,50 @@ class TestBowTieBuilder:
     def test_btb_missing(self):
         with pytest.raises(ValueError):
             # No edges
-            BTB.run(
-                targets=Path(TEST_DIR, 'input', 'target.txt'),
-                sources=Path(TEST_DIR, 'input', 'source.txt'),
-                output_file=OUT_FILE_DEFAULT)
+            BTB.run({"targets": Path(TEST_DIR, 'input', 'target1.txt'),
+                     "sources": Path(TEST_DIR, 'input', 'source1.txt')},
+                    output_file=OUT_FILE_DEFAULT)
 
         with pytest.raises(ValueError):
             # No source
-            BTB.run(
-                targets=Path(TEST_DIR, 'input', 'target.txt'),
-                edges=Path(TEST_DIR, 'input', 'edges.txt'),
-                output_file=OUT_FILE_DEFAULT)
+            BTB.run({"targets": Path(TEST_DIR, 'input', 'target1.txt'),
+                     "edges": Path(TEST_DIR, 'input', 'edges1.txt')},
+                    output_file=OUT_FILE_DEFAULT)
 
         with pytest.raises(ValueError):
             # No target
-            BTB.run(
-                sources=Path(TEST_DIR, 'input', 'source.txt'),
-                edges=Path(TEST_DIR, 'input', 'edges.txt'),
-                output_file=OUT_FILE_DEFAULT)
+            BTB.run({"sources": Path(TEST_DIR, 'input', 'source1.txt'),
+                     "edges": Path(TEST_DIR, 'input', 'edges1.txt')},
+                    output_file=OUT_FILE_DEFAULT)
 
     """
     Run the BowTieBuilder algorithm with missing files
     """
     def test_btb_file(self):
-        with pytest.raises(ValueError):
-            BTB.run(sources=Path(TEST_DIR, 'input', 'unknown.txt'),
-                    targets=Path(TEST_DIR, 'input', 'target.txt'),
-                    edges=Path(TEST_DIR, 'input', 'edges.txt'),
-                    output_file=OUT_FILE_DEFAULT)
+        with pytest.raises(OSError):
+            BTB.run({"sources": Path(TEST_DIR, 'input', 'unknown.txt'),
+                     "targets": Path(TEST_DIR, 'input', 'target.txt'),
+                     "edges": Path(TEST_DIR, 'input', 'edges.txt')},
+                    output_file=OUT_FILE_DEFAULT)
 
     """
     Run the BowTieBuilder algorithm with bad input data
     """
     def test_format_error(self):
         with pytest.raises(IndexError):
-            BTB.run(sources=Path(TEST_DIR, 'input', 'btb-sources.txt'),
-                    targets=Path(TEST_DIR, 'input', 'btb-targets.txt'),
-                    edges=Path(TEST_DIR, 'input', 'bad-edges.txt'),
-                    output_file=OUT_FILE_DEFAULT)
+            BTB.run({"sources": Path(TEST_DIR, 'input', 'btb-sources.txt'),
+                     "targets": Path(TEST_DIR, 'input', 'btb-targets.txt'),
+                     "edges": Path(TEST_DIR, 'input', 'bad-edges.txt')},
+                    output_file=OUT_FILE_DEFAULT)
 
     """
     Run the BowTieBuilder algorithm on the example input files and check the output matches the expected output
     """
     def test_btb(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        BTB.run(edges=Path(TEST_DIR, 'input', 'btb-edges.txt'),
-                sources=Path(TEST_DIR, 'input', 'btb-sources.txt'),
-                targets=Path(TEST_DIR, 'input', 'btb-targets.txt'),
-                output_file=OUT_FILE_DEFAULT)
+        BTB.run({"edges": Path(TEST_DIR, 'input', 'btb-edges.txt'),
+                 "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'),
+                 "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')},
+                output_file=OUT_FILE_DEFAULT)
 
         assert OUT_FILE_DEFAULT.exists(), 'Output file was not written'
         expected_file = Path(TEST_DIR, 'expected', 'btb-output.txt')
@@ -84,10 +81,10 @@ def test_btb(self):
     """
     def test_disjoint(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        BTB.run(edges=Path(TEST_DIR, 'input', 'disjoint-edges.txt'),
-                sources=Path(TEST_DIR, 'input', 'disjoint-sources.txt'),
-                targets=Path(TEST_DIR, 'input', 'disjoint-targets.txt'),
-                output_file=OUT_FILE_DEFAULT)
+        BTB.run({"edges": Path(TEST_DIR, 'input', 'disjoint-edges.txt'),
+                 "sources": Path(TEST_DIR, 'input', 'disjoint-sources.txt'),
+                 "targets": Path(TEST_DIR, 'input', 'disjoint-targets.txt')},
+                output_file=OUT_FILE_DEFAULT)
 
         assert OUT_FILE_DEFAULT.exists(), 'Output file was not written'
         expected_file = Path(TEST_DIR, 'expected', 'disjoint-output.txt')
@@ -105,10 +102,10 @@ def test_disjoint(self):
     """
     def test_disjoint2(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        BTB.run(edges=Path(TEST_DIR, 'input', 'disjoint2-edges.txt'),
-                sources=Path(TEST_DIR, 'input', 'disjoint-sources.txt'),
-                targets=Path(TEST_DIR, 'input', 'disjoint-targets.txt'),
-                output_file=OUT_FILE_DEFAULT)
+        BTB.run({"edges": Path(TEST_DIR, 'input', 'disjoint2-edges.txt'),
+                 "sources": Path(TEST_DIR, 'input', 'disjoint-sources.txt'),
+                 "targets": Path(TEST_DIR, 'input', 'disjoint-targets.txt')},
+                output_file=OUT_FILE_DEFAULT)
 
         assert OUT_FILE_DEFAULT.exists(), 'Output file was not written'
         expected_file = Path(TEST_DIR, 'expected', 'disjoint-output.txt')
@@ -125,12 +122,11 @@ def test_disjoint2(self):
     Run the BowTieBuilder algorithm with a missing input file
     """
     def test_missing_file(self):
-        with pytest.raises(ValueError):
-            with pytest.raises(OSError):
-                BTB.run(edges=Path(TEST_DIR, 'input', 'missing.txt'),
-                        sources=Path(TEST_DIR, 'input', 'btb-sources.txt'),
-                        targets=Path(TEST_DIR, 'input', 'btb-targets.txt'),
-                        output_file=OUT_FILE_DEFAULT)
+        with pytest.raises(OSError):
+            BTB.run({"edges": Path(TEST_DIR, 'input', 'missing.txt'),
+                     "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'),
+                     "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')},
+                    output_file=OUT_FILE_DEFAULT)
 
     """
@@ -138,10 +134,10 @@ def test_missing_file(self):
     """
     def test_source_to_source(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source-edges.txt'),
-                sources=Path(TEST_DIR, 'input', 'btb-sources.txt'),
-                targets=Path(TEST_DIR, 'input', 'btb-targets.txt'),
-                output_file=OUT_FILE_DEFAULT)
+        BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source-edges.txt'),
+                 "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'),
+                 "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')},
+                output_file=OUT_FILE_DEFAULT)
 
         assert OUT_FILE_DEFAULT.exists(), 'Output file was not written'
         expected_file = Path(TEST_DIR, 'expected', 'source-to-source-output.txt')
@@ -159,10 +155,10 @@ def test_source_to_source(self):
     """
     def test_source_to_source2(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source2-edges.txt'),
-                sources=Path(TEST_DIR, 'input', 'btb-sources.txt'),
-                targets=Path(TEST_DIR, 'input', 'btb-targets.txt'),
-                output_file=OUT_FILE_DEFAULT)
+        BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source2-edges.txt'),
+                 "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'),
+                 "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')},
+                output_file=OUT_FILE_DEFAULT)
 
         assert OUT_FILE_DEFAULT.exists(), 'Output file was not written'
         expected_file = Path(TEST_DIR, 'expected', 'source-to-source2-output.txt')
@@ -181,10 +177,10 @@ def test_source_to_source2(self):
     def test_source_to_source_disjoint(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source-disjoint-edges.txt'),
-                sources=Path(TEST_DIR, 'input', 'btb-sources.txt'),
-                targets=Path(TEST_DIR, 'input', 'btb-targets.txt'),
-                output_file=OUT_FILE_DEFAULT)
+        BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source-disjoint-edges.txt'),
+                 "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'),
+                 "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')},
+                output_file=OUT_FILE_DEFAULT)
 
         assert OUT_FILE_DEFAULT.exists(), 'Output file was not written'
         expected_file = Path(TEST_DIR, 'expected', 'source-to-source-disjoint-output.txt')
@@ -203,10 +199,10 @@ def test_source_to_source_disjoint(self):
     def test_bidirectional(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        BTB.run(edges=Path(TEST_DIR, 'input', 'bidirectional-edges.txt'),
-                sources=Path(TEST_DIR, 'input', 'btb-sources.txt'),
-                targets=Path(TEST_DIR, 'input', 'btb-targets.txt'),
-                output_file=OUT_FILE_DEFAULT)
+        BTB.run({"edges": Path(TEST_DIR, 'input', 'bidirectional-edges.txt'),
+                 "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'),
+                 "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')},
+                output_file=OUT_FILE_DEFAULT)
 
         assert OUT_FILE_DEFAULT.exists(), 'Output file was not written'
         expected_file = Path(TEST_DIR, 'expected', 'bidirectional-output.txt')
@@ -225,10 +221,10 @@ def test_bidirectional(self):
     def test_target_to_source(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        BTB.run(edges=Path(TEST_DIR, 'input', 'target-to-source-edges.txt'),
-                sources=Path(TEST_DIR, 'input', 'btb-sources.txt'),
-                targets=Path(TEST_DIR, 'input', 'btb-targets.txt'),
-                output_file=OUT_FILE_DEFAULT)
+        BTB.run({"edges": Path(TEST_DIR, 'input', 'target-to-source-edges.txt'),
+                 "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'),
+                 "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')},
+                output_file=OUT_FILE_DEFAULT)
 
         assert OUT_FILE_DEFAULT.exists(), 'Output file was not written'
         expected_file = Path(TEST_DIR, 'expected', 'empty-output.txt')
@@ -247,10 +243,10 @@ def test_target_to_source(self):
     def test_loop(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        BTB.run(edges=Path(TEST_DIR, 'input', 'loop-edges.txt'),
-                sources=Path(TEST_DIR, 'input', 'btb-sources.txt'),
-                targets=Path(TEST_DIR, 'input', 'btb-targets.txt'),
-                output_file=OUT_FILE_DEFAULT)
+        BTB.run({"edges": Path(TEST_DIR, 'input', 'loop-edges.txt'),
+                 "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'),
+                 "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')},
+                output_file=OUT_FILE_DEFAULT)
 
         assert OUT_FILE_DEFAULT.exists(), 'Output file was not written'
         expected_file = Path(TEST_DIR, 'expected', 'loop-output.txt')
@@ -269,10 +265,10 @@ def test_loop(self):
     def test_weighted(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        BTB.run(edges=Path(TEST_DIR, 'input', 'weighted-edges.txt'),
-                sources=Path(TEST_DIR, 'input', 'btb-sources.txt'),
-                targets=Path(TEST_DIR, 'input', 'btb-targets.txt'),
-                output_file=OUT_FILE_DEFAULT)
+        BTB.run({"edges": Path(TEST_DIR, 'input', 'weighted-edges.txt'),
+                 "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'),
+                 "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')},
+                output_file=OUT_FILE_DEFAULT)
 
         assert OUT_FILE_DEFAULT.exists(), 'Output file was not written'
         expected_file = Path(TEST_DIR, 'expected', 'weighted-output.txt')
@@ -287,10 +283,10 @@ def test_weighted(self):
     def test_weight_one(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        BTB.run(edges=Path(TEST_DIR, 'input', 'weight-one-edges.txt'),
-                sources=Path(TEST_DIR, 'input', 'btb-sources.txt'),
-                targets=Path(TEST_DIR, 'input', 'btb-targets.txt'),
-                output_file=OUT_FILE_DEFAULT)
+        BTB.run({"edges": Path(TEST_DIR, 'input', 'weight-one-edges.txt'),
+                 "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'),
+                 "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')},
+                output_file=OUT_FILE_DEFAULT)
 
         assert OUT_FILE_DEFAULT.exists(), 'Output file was not written'
         expected_file = Path(TEST_DIR, 'expected', 'weighted-output.txt')
@@ -308,8 +304,8 @@ def test_weight_one(self):
     @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system')
     def test_btb_singularity(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source-edges.txt'),
-                sources=Path(TEST_DIR, 'input', 'btb-sources.txt'),
-                targets=Path(TEST_DIR, 'input', 'btb-targets.txt'),
-                output_file=OUT_FILE_DEFAULT,
-                container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity))
+        BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source-edges.txt'),
+                 "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'),
+                 "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')},
+                output_file=OUT_FILE_DEFAULT,
+                container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity))
diff --git a/test/DOMINO/test_domino.py b/test/DOMINO/test_domino.py
index d53fc87cd..2f88e4c03 100644
--- a/test/DOMINO/test_domino.py
+++ b/test/DOMINO/test_domino.py
@@ -5,7 +5,12 @@
 import spras.config.config as config
 from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings
-from spras.domino import DOMINO, post_domino_id_transform, pre_domino_id_transform
+from spras.domino import (
+    DOMINO,
+    DominoParams,
+    post_domino_id_transform,
+    pre_domino_id_transform,
+)
 
 config.init_from_file("config/config.yaml")
 
@@ -27,22 +32,19 @@ class TestDOMINO:
     def test_domino_required(self):
         # Only include required arguments
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        DOMINO.run(
-            network=TEST_DIR / 'input' / 'simple' / 'domino-network.txt',
-            active_genes=TEST_DIR / 'input' / 'simple' / 'domino-active-genes.txt',
-            output_file=OUT_FILE_DEFAULT)
+        DOMINO.run({"network": TEST_DIR / 'input' / 'simple' / 'domino-network.txt',
+                    "active_genes": TEST_DIR / 'input' / 'simple' / 'domino-active-genes.txt'},
+                   output_file=OUT_FILE_DEFAULT)
         # output_file should be empty
         assert OUT_FILE_DEFAULT.exists()
 
     def test_domino_optional(self):
         # Include optional arguments
-        OUT_FILE_OPTIONAL.unlink(missing_ok=True)
-        DOMINO.run(
-            network=TEST_DIR / 'input' / 'simple' / 'domino-network.txt',
-            active_genes=TEST_DIR / 'input' / 'simple' / 'domino-active-genes.txt',
-            output_file=OUT_FILE_OPTIONAL,
-            slice_threshold=0.4,
-            module_threshold=0.06)
+        OUT_FILE_OPTIONAL.unlink(missing_ok=True)
+        DOMINO.run({"network": TEST_DIR / 'input' / 'simple' / 'domino-network.txt',
+                    "active_genes": TEST_DIR / 'input' / 'simple' / 'domino-active-genes.txt'},
+                   output_file=OUT_FILE_OPTIONAL,
+                   args=DominoParams(slice_threshold=0.4, module_threshold=0.06))
         # output_file should be empty
         assert OUT_FILE_OPTIONAL.exists()
 
@@ -50,25 +52,23 @@ def test_domino_missing_active_genes(self):
         # Test the expected error is raised when active_genes argument is missing
         with pytest.raises(ValueError):
             # No active_genes
-            DOMINO.run(
-                network=TEST_DIR / 'input' / 'simple' / 'domino-network.txt',
-                output_file=OUT_FILE_DEFAULT)
+            DOMINO.run({"network": TEST_DIR / 'input' / 'simple' / 'domino-network.txt'},
+                       output_file=OUT_FILE_DEFAULT)
     def test_domino_missing_network(self):
         # Test the expected error is raised when network argument is missing
         with pytest.raises(ValueError):
             # No network
-            DOMINO.run(
-                active_genes=TEST_DIR / 'input' / 'simple' / 'domino-active-genes.txt',
-                output_file=OUT_FILE_DEFAULT)
+            DOMINO.run({"active_genes": TEST_DIR / 'input' / 'simple' / 'domino-active-genes.txt'},
+                       output_file=OUT_FILE_DEFAULT)
 
     def test_domino_empty(self):
         # Test over empty files
         # https://github.com/Reed-CompBio/spras/pull/103#issuecomment-1681526958
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
         DOMINO.run(
-            network=TEST_DIR / 'input' / 'empty' / 'domino-network.txt',
-            active_genes=TEST_DIR / 'input' / 'empty' / 'domino-active-genes.txt',
+            {"network": TEST_DIR / 'input' / 'empty' / 'domino-network.txt',
+             "active_genes": TEST_DIR / 'input' / 'empty' / 'domino-active-genes.txt'},
            output_file=OUT_FILE_DEFAULT)
         assert OUT_FILE_DEFAULT.exists()
 
@@ -79,11 +79,10 @@ def test_domino_empty(self):
     def test_domino_singularity(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
         # Only include required arguments and run with Singularity
-        DOMINO.run(
-            network=TEST_DIR / 'input' / 'simple' / 'domino-network.txt',
-            active_genes=TEST_DIR / 'input' / 'simple' / 'domino-active-genes.txt',
-            output_file=OUT_FILE_DEFAULT,
-            container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity))
+        DOMINO.run({"network": TEST_DIR / 'input' / 'simple' / 'domino-network.txt',
+                    "active_genes": TEST_DIR / 'input' / 'simple' / 'domino-active-genes.txt'},
+                   output_file=OUT_FILE_DEFAULT,
+                   container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity))
         assert OUT_FILE_DEFAULT.exists()
 
     def test_pre_id_transform(self):
diff --git a/test/MEO/test_meo.py b/test/MEO/test_meo.py
index cd5260f2e..7322bf003 100644
--- a/test/MEO/test_meo.py
+++ b/test/MEO/test_meo.py
@@ -5,7 +5,7 @@
 import spras.config.config as config
 from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings
-from spras.meo import MEO, write_properties
+from spras.meo import MEO, MEOParams, write_properties
 
 config.init_from_file("config/config.yaml")
 
@@ -21,9 +21,9 @@ def test_meo_required(self):
         out_path = Path(OUT_FILE)
         out_path.unlink(missing_ok=True)
         # Only include required arguments
-        MEO.run(edges=TEST_DIR + 'input/meo-edges.txt',
-                sources=TEST_DIR + 'input/meo-sources.txt',
-                targets=TEST_DIR + 'input/meo-targets.txt',
+        MEO.run({"edges": TEST_DIR + 'input/meo-edges.txt',
+                 "sources": TEST_DIR + 'input/meo-sources.txt',
+                 "targets": TEST_DIR + 'input/meo-targets.txt'},
                 output_file=OUT_FILE)
         assert out_path.exists()
 
@@ -31,21 +31,19 @@ def test_meo_all_optional(self):
         out_path = Path(OUT_FILE)
         out_path.unlink(missing_ok=True)
         # Include all optional arguments
-        MEO.run(edges=TEST_DIR + 'input/meo-edges.txt',
-                sources=TEST_DIR + 'input/meo-sources.txt',
-                targets=TEST_DIR + 'input/meo-targets.txt',
-                output_file=OUT_FILE,
-                max_path_length=3,
-                local_search='No',
-                rand_restarts=10)
+        MEO.run({"edges": TEST_DIR + 'input/meo-edges.txt',
+                 "sources": TEST_DIR + 'input/meo-sources.txt',
+                 "targets": TEST_DIR + 'input/meo-targets.txt'},
+                args=MEOParams(max_path_length=3, local_search=False, rand_restarts=10),
+                output_file=OUT_FILE)
         assert out_path.exists()
 
     def test_meo_missing(self):
         # Test the expected error is raised when required arguments are missing
         with pytest.raises(ValueError):
             # No edges
-            MEO.run(sources=TEST_DIR + 'input/meo-sources.txt',
-                    targets=TEST_DIR + 'input/meo-targets.txt',
+            MEO.run({"sources": TEST_DIR + 'input/meo-sources.txt',
+                     "targets": TEST_DIR + 'input/meo-targets.txt'},
                     output_file=OUT_FILE)
 
         with pytest.raises(ValueError):
@@ -63,9 +61,9 @@ def test_meo_singularity(self):
         out_path = Path(OUT_FILE)
         out_path.unlink(missing_ok=True)
         # Only include required arguments and run with Singularity
-        MEO.run(edges=TEST_DIR + 'input/meo-edges.txt',
-                sources=TEST_DIR + 'input/meo-sources.txt',
-                targets=TEST_DIR + 'input/meo-targets.txt',
+        MEO.run({"edges": TEST_DIR + 'input/meo-edges.txt',
+                 "sources": TEST_DIR + 'input/meo-sources.txt',
+                 "targets": TEST_DIR + 'input/meo-targets.txt'},
                 output_file=OUT_FILE,
                 container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity))
         assert out_path.exists()
diff --git a/test/MinCostFlow/test_mcf.py b/test/MinCostFlow/test_mcf.py
index 875743933..23e597175 100644
--- a/test/MinCostFlow/test_mcf.py
+++ b/test/MinCostFlow/test_mcf.py
@@ -5,7 +5,7 @@
 import spras.config.config as config
 from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings
-from spras.mincostflow import MinCostFlow
+from spras.mincostflow import MinCostFlow, MinCostFlowParams
 
 config.init_from_file("config/config.yaml")
 
@@ -22,9 +22,9 @@ def test_mincostflow_required(self, graph):
         out_path = Path(OUT_FILE)
         out_path.unlink(missing_ok=True)
 
-        MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt',
-                        targets=TEST_DIR + 'input/' + graph + '/targets.txt',
-                        edges=TEST_DIR + 'input/' + graph + '/edges.txt',
+        MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt',
+                         "targets": TEST_DIR + 'input/' + graph + '/targets.txt',
+                         "edges": TEST_DIR + 'input/' + graph + '/edges.txt'},
                         output_file=OUT_FILE)
         assert out_path.exists()
         # TODO: assert for the output .equals expected_output instead of only testing
@@ -35,11 +35,11 @@ def test_mincostflow_missing_capacity(self, graph):
         out_path = Path(OUT_FILE)
         out_path.unlink(missing_ok=True)
 
-        MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt',
-                        targets=TEST_DIR + 'input/' + graph + '/targets.txt',
-                        edges=TEST_DIR + 'input/' + graph + '/edges.txt',
+        MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt',
+                         "targets": TEST_DIR + 'input/' + graph + '/targets.txt',
+                         "edges": TEST_DIR + 'input/' + graph + '/edges.txt'},
                         output_file=OUT_FILE,
-                        flow=1)
+                        args=MinCostFlowParams(flow=1))
         assert out_path.exists()
 
     @pytest.mark.parametrize('graph', ['graph1'])
@@ -47,11 +47,11 @@ def test_mincostflow_missing_flow(self, graph):
         out_path = Path(OUT_FILE)
         out_path.unlink(missing_ok=True)
 
-        MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt',
-                        targets=TEST_DIR + 'input/' + graph + '/targets.txt',
-                        edges=TEST_DIR + 'input/' + graph + '/edges.txt',
+        MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt',
+                         "targets": TEST_DIR + 'input/' + graph + '/targets.txt',
+                         "edges": TEST_DIR + 'input/' + graph + '/edges.txt'},
                         output_file=OUT_FILE,
-                        capacity=1)
+                        args=MinCostFlowParams(capacity=1))
         assert out_path.exists()
 
     @pytest.mark.parametrize('graph', ['graph1'])
@@ -60,24 +60,22 @@ def test_mincostflow_too_much_flow(self, graph):
         out_path.unlink(missing_ok=True)
 
         with pytest.raises(RuntimeError):
-            MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt',
-                            targets=TEST_DIR + 'input/' + graph + '/targets.txt',
-                            edges=TEST_DIR + 'input/' + graph + '/edges.txt',
+            MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt',
+                             "targets": TEST_DIR + 'input/' + graph + '/targets.txt',
+                             "edges": TEST_DIR + 'input/' + graph + '/edges.txt'},
                             output_file=OUT_FILE,
-                            flow=50,
-                            capacity=1)
+                            args=MinCostFlowParams(flow=50, capacity=1))
 
     @pytest.mark.parametrize('graph', ['graph1'])
     def test_mincostflow_no_flow(self, graph):
         out_path = Path(OUT_FILE)
         out_path.unlink(missing_ok=True)
 
-        MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt',
-                        targets=TEST_DIR + 'input/' + graph + '/targets.txt',
-                        edges=TEST_DIR + 'input/' + graph + '/edges.txt',
+        MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt',
+                         "targets": TEST_DIR + 'input/' + graph + '/targets.txt',
+                         "edges": TEST_DIR + 'input/' + graph + '/edges.txt'},
                         output_file=OUT_FILE,
-                        flow=0,
-                        capacity=1)
+                        args=MinCostFlowParams(flow=0, capacity=1))
         assert out_path.exists()
 
     @pytest.mark.parametrize('graph', ['graph1'])
@@ -85,20 +83,19 @@ def test_mincostflow_all_optional(self, graph):
         out_path = Path(OUT_FILE)
         out_path.unlink(missing_ok=True)
         # Include all optional arguments
-        MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt',
-                        targets=TEST_DIR + 'input/' + graph + '/targets.txt',
-                        edges=TEST_DIR + 'input/' + graph + '/edges.txt',
+        MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt',
+                         "targets": TEST_DIR + 'input/' + graph + '/targets.txt',
+                         "edges": TEST_DIR + 'input/' + graph + '/edges.txt'},
                         output_file=OUT_FILE,
-                        flow=1,
-                        capacity=1)
+                        args=MinCostFlowParams(flow=1, capacity=1))
         assert out_path.exists()
 
     @pytest.mark.parametrize('graph', ['graph1'])
     def test_mincostflow_missing(self, graph):
         # Test the expected error is raised when required arguments are missing
         with pytest.raises(ValueError):
-            MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt',
-                            targets=TEST_DIR + 'input/' + graph + '/targets.txt',
+            MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt',
+                             "targets": TEST_DIR + 'input/' + graph + '/targets.txt'},
                             output_file=OUT_FILE)
 
     @pytest.mark.parametrize('graph', ['graph1'])
@@ -107,12 +104,10 @@ def test_mincostflow_singularity(self, graph):
         out_path = Path(OUT_FILE)
         out_path.unlink(missing_ok=True)
         # Include all optional arguments
-        MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt',
-                        targets=TEST_DIR + 'input/' + graph + '/targets.txt',
-                        edges=TEST_DIR + 'input/' + graph + '/edges.txt',
+        MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt',
+                         "targets": TEST_DIR + 'input/' + graph + '/targets.txt',
+                         "edges": TEST_DIR + 'input/' + graph + '/edges.txt'},
                         output_file=OUT_FILE,
-                        flow=1,
-                        capacity=1,
+                        args=MinCostFlowParams(flow=1, capacity=1),
                         container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity))
         assert out_path.exists()
-
diff --git a/test/OmicsIntegrator1/test_oi1.py b/test/OmicsIntegrator1/test_oi1.py
index 458341c93..c0fa19914 100644
--- a/test/OmicsIntegrator1/test_oi1.py
+++ b/test/OmicsIntegrator1/test_oi1.py
@@ -5,7 +5,12 @@
 import spras.config.config as config
 from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings
-from spras.omicsintegrator1 import OmicsIntegrator1, write_conf
+from spras.omicsintegrator1 import (
+    DummyMode,
+    OmicsIntegrator1,
+    OmicsIntegrator1Params,
+    write_conf,
+)
 
 config.init_from_file("config/config.yaml")
 
@@ -21,79 +26,74 @@ def test_oi1_required(self):
         out_path = Path(OUT_FILE)
         out_path.unlink(missing_ok=True)
         # Only include required arguments
-        OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt',
-                             prizes=TEST_DIR+'input/oi1-prizes.txt',
+        OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt',
+                              "prizes": TEST_DIR+'input/oi1-prizes.txt'},
                              output_file=OUT_FILE,
-                             w=5,
-                             b=1,
-                             d=10)
+                             args=OmicsIntegrator1Params(w=5, b=1, d=10))
         assert out_path.exists()
 
     def test_oi1_some_optional(self):
         out_path = Path(OUT_FILE)
         out_path.unlink(missing_ok=True)
         # Include optional argument
-        OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt',
-                             prizes=TEST_DIR+'input/oi1-prizes.txt',
+        OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt',
+                              "prizes": TEST_DIR+'input/oi1-prizes.txt'},
                              output_file=OUT_FILE,
-                             w=5,
-                             b=1,
-                             d=10,
-                             noise=0.333,
-                             g=0.001,
-                             r=0)
+                             args=OmicsIntegrator1Params(w=5, b=1, d=10, noise=0.333, g=0.001, r=0))
         assert out_path.exists()
 
     def test_oi1_all_optional(self):
         out_path = Path(OUT_FILE)
         out_path.unlink(missing_ok=True)
         # Include all optional arguments
-        OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt',
-                             prizes=TEST_DIR+'input/oi1-prizes.txt',
-                             dummy_nodes=None,
-                             dummy_mode='terminals',
-                             mu_squared=True,
-                             exclude_terms=True,
+        OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt',
+                              "prizes": TEST_DIR+'input/oi1-prizes.txt'},
                              output_file=OUT_FILE,
-                             noisy_edges=0,
-                             shuffled_prizes=0,
-                             random_terminals=0,
-                             seed=1,
-                             w=5,
-                             b=1,
-                             d=10,
-                             mu=0,
-                             noise=0.333,
-                             g=0.001,
-                             r=0)
+                             args=OmicsIntegrator1Params(
+                                 dummy_mode=DummyMode.terminals,
+                                 mu_squared=True,
+                                 exclude_terms=True,
+                                 noisy_edges=0,
+                                 shuffled_prizes=0,
+                                 random_terminals=0,
+                                 seed=1,
+                                 w=5,
+                                 b=1,
+                                 d=10,
+                                 mu=0,
+                                 noise=0.333,
+                                 g=0.001,
+                                 r=0))
         assert out_path.exists()
 
     def test_oi1_dummy_file(self):
         out_path = Path(OUT_FILE)
         out_path.unlink(missing_ok=True)
         # Include optional argument
-        OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt',
-                             prizes=TEST_DIR+'input/oi1-prizes.txt',
-                             dummy_nodes=TEST_DIR + 'input/oi1-dummy.txt',
-                             dummy_mode='file',
+        OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt',
+                              "prizes": TEST_DIR+'input/oi1-prizes.txt',
+                              "dummy_nodes": TEST_DIR + 'input/oi1-dummy.txt'},
                              output_file=OUT_FILE,
-                             w=5,
-                             b=1,
-                             d=10,
-                             noise=0.333,
-                             g=0.001,
-                             r=0)
+                             args=OmicsIntegrator1Params(
+                                 dummy_mode=DummyMode.file,
+                                 w=5,
+                                 b=1,
+                                 d=10,
+                                 noise=0.333,
+                                 g=0.001,
+                                 r=0))
         assert out_path.exists()
 
     def test_oi1_missing(self):
         # Test the expected error is raised when required arguments are missing
         with pytest.raises(ValueError):
             # No edges
-            OmicsIntegrator1.run(prizes=TEST_DIR + 'input/oi1-prizes.txt',
+            OmicsIntegrator1.run({"prizes": TEST_DIR + 'input/oi1-prizes.txt'},
                                  output_file=TEST_DIR+'output/test_optimalForest.sif',
-                                 w=5,
-                                 b=1,
-                                 d=10)
+                                 args=OmicsIntegrator1Params(
+                                     w=5,
+                                     b=1,
+                                     d=10))
 
         with pytest.raises(ValueError):
             # No w
             write_conf(Path('.'),
@@ -104,13 +104,14 @@ def test_oi1_missing_dummy(self):
         # Test the expected error is raised when the dummy_nodes file is missing and the dummy_mode is 'file'
         with pytest.raises(ValueError):
             # No edges
-            OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt',
-                                 prizes=TEST_DIR + 'input/oi1-prizes.txt',
+            OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt',
+                                  "prizes": TEST_DIR + 'input/oi1-prizes.txt'},
                                  output_file=TEST_DIR+'output/test_optimalForest.sif',
-                                 w=5,
-                                 b=1,
-                                 d=10,
-                                 dummy_mode='file')
+                                 args=OmicsIntegrator1Params(
+                                     w=5,
+                                     b=1,
+                                     d=10,
+                                     dummy_mode=DummyMode.file))
 
     # Only run Singularity test if the binary is available on the system
     # spython is only available on Unix, but do not explicitly skip non-Unix platforms
@@ -119,11 +120,12 @@ def
test_oi1_singularity(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - OmicsIntegrator1.run(edges=TEST_DIR + 'input/oi1-edges.txt', - prizes=TEST_DIR + 'input/oi1-prizes.txt', + OmicsIntegrator1.run({"edges": TEST_DIR + 'input/oi1-edges.txt', + "prizes": TEST_DIR + 'input/oi1-prizes.txt'}, output_file=OUT_FILE, - w=5, - b=1, - d=10, + args=OmicsIntegrator1Params( + w=5, + b=1, + d=10), container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity)) assert out_path.exists() diff --git a/test/OmicsIntegrator2/test_oi2.py b/test/OmicsIntegrator2/test_oi2.py index 471a59882..3638587e8 100644 --- a/test/OmicsIntegrator2/test_oi2.py +++ b/test/OmicsIntegrator2/test_oi2.py @@ -5,7 +5,7 @@ import spras.config.config as config from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings -from spras.omicsintegrator2 import OmicsIntegrator2 +from spras.omicsintegrator2 import DummyMode, OmicsIntegrator2, OmicsIntegrator2Params config.init_from_file("config/config.yaml") @@ -22,51 +22,44 @@ class TestOmicsIntegrator2: def test_oi2_required(self): # Only include required arguments OUT_FILE.unlink(missing_ok=True) - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE, + OmicsIntegrator2.run({"edges": EDGE_FILE, + "prizes": PRIZE_FILE}, output_file=OUT_FILE) assert OUT_FILE.exists() def test_oi2_some_optional(self): # Include optional argument OUT_FILE.unlink(missing_ok=True) - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE, + OmicsIntegrator2.run({"edges": EDGE_FILE, + "prizes": PRIZE_FILE}, output_file=OUT_FILE, - g=0) + args=OmicsIntegrator2Params(g=0)) assert OUT_FILE.exists() def test_oi2_all_optional(self): # Include all optional arguments OUT_FILE.unlink(missing_ok=True) - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE, + OmicsIntegrator2.run({"edges": EDGE_FILE, + "prizes": PRIZE_FILE}, output_file=OUT_FILE, - w=5, - b=1, - g=3, - noise=0.1, - noisy_edges=0, - random_terminals=0, - dummy_mode='terminals', - seed=2) + args=OmicsIntegrator2Params(w=5, + b=1, + g=3, + noise=0.1, + noisy_edges=0, + random_terminals=0, + dummy_mode=DummyMode.terminals, + seed=2)) assert OUT_FILE.exists() - def test_oi2_missing(self): - # Test the expected error is raised when required arguments are missing - with pytest.raises(ValueError): - # No output_file - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE) - # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system') def test_oi2_singularity(self): # Only include required arguments OUT_FILE.unlink(missing_ok=True) - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE, + OmicsIntegrator2.run({"edges": EDGE_FILE, + "prizes": PRIZE_FILE}, output_file=OUT_FILE, container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity)) assert OUT_FILE.exists() diff --git a/test/PathLinker/test_pathlinker.py b/test/PathLinker/test_pathlinker.py index 56f77ec7a..e2d3e6fae 100644 --- a/test/PathLinker/test_pathlinker.py +++ b/test/PathLinker/test_pathlinker.py @@ -5,7 +5,7 @@ import spras.config.config as config from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings -from spras.pathlinker import PathLinker +from spras.pathlinker import PathLinker, PathLinkerParams 
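(Every test file in this diff migrates to the same calling convention: required input files move into a single dictionary keyed by input name, algorithm parameters move into a typed <Algorithm>Params model passed as args, and container options are passed through container_settings. The sketch below restates that convention using the PathLinker names from the surrounding hunks; the signature is inferred from this diff rather than taken from SPRAS documentation, and the file paths are placeholders.)

from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings
from spras.pathlinker import PathLinker, PathLinkerParams

# Old interface, removed in this diff:
#   PathLinker.run(nodetypes=..., network=..., output_file=..., k=100)
# New interface: inputs dict + typed parameter model + explicit container settings.
PathLinker.run(
    {"nodetypes": "input/sample-in-nodetypes.txt",
     "network": "input/sample-in-net.txt"},
    output_file="output/pathlinker-pathway.txt",
    args=PathLinkerParams(k=100),  # omit args entirely to run with the algorithm's defaults
    container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity),
)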
config.init_from_file("config/config.yaml") @@ -22,33 +22,28 @@ def test_pathlinker_required(self): out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) # Only include required arguments - PathLinker.run( - nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt', - network=TEST_DIR+'input/sample-in-net.txt', - output_file=OUT_FILE_DEFAULT - ) + PathLinker.run({"nodetypes": TEST_DIR+'input/sample-in-nodetypes.txt', + "network": TEST_DIR+'input/sample-in-net.txt'}, + output_file=OUT_FILE_DEFAULT) assert out_path.exists() def test_pathlinker_optional(self): out_path = Path(OUT_FILE_100) out_path.unlink(missing_ok=True) # Include optional argument - PathLinker.run( - nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt', - network=TEST_DIR+'input/sample-in-net.txt', - output_file=OUT_FILE_100, - k=100 - ) + PathLinker.run({"nodetypes": TEST_DIR+'input/sample-in-nodetypes.txt', + "network": TEST_DIR+'input/sample-in-net.txt'}, + output_file=OUT_FILE_100, + args=PathLinkerParams(k=100)) assert out_path.exists() def test_pathlinker_missing(self): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): # No nodetypes - PathLinker.run( - network=TEST_DIR + 'input/sample-in-net.txt', - output_file=OUT_FILE_100, - k=100) + PathLinker.run({"network": TEST_DIR + 'input/sample-in-net.txt'}, + output_file=OUT_FILE_100, + args=PathLinkerParams(k=100)) # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @@ -57,9 +52,8 @@ def test_pathlinker_singularity(self): out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - PathLinker.run( - nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt', - network=TEST_DIR+'input/sample-in-net.txt', - output_file=OUT_FILE_DEFAULT, - container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity)) + PathLinker.run({"nodetypes": TEST_DIR+'input/sample-in-nodetypes.txt', + "network": TEST_DIR+'input/sample-in-net.txt'}, + output_file=OUT_FILE_DEFAULT, + container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity)) assert out_path.exists() diff --git a/test/RWR/test_RWR.py b/test/RWR/test_RWR.py index 08b42c369..fa3ca96f1 100644 --- a/test/RWR/test_RWR.py +++ b/test/RWR/test_RWR.py @@ -6,7 +6,7 @@ import spras.config.config as config from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings -from spras.rwr import RWR +from spras.rwr import RWR, RWRParams config.init_from_file("config/config.yaml") @@ -20,9 +20,9 @@ class TestRWR: """ def test_rwr(self): OUT_FILE.unlink(missing_ok=True) - RWR.run(network=Path(TEST_DIR, 'input', 'rwr-network.txt'), - nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'), - alpha=0.85, + RWR.run({"network": Path(TEST_DIR, 'input', 'rwr-network.txt'), + "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')}, + args=RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) assert OUT_FILE.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected_output', 'rwr-output.txt') @@ -33,9 +33,9 @@ def test_rwr(self): """ def test_missing_file(self): with pytest.raises(OSError): - RWR.run(network=Path(TEST_DIR, 'input', 'missing.txt'), - nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'), - alpha=0.85, + RWR.run({"network": Path(TEST_DIR, 'input', 'missing.txt'), + "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')}, + args=RWRParams(alpha=0.85, 
threshold=200), output_file=OUT_FILE) """ @@ -43,9 +43,9 @@ def test_missing_file(self): """ def test_format_error(self): with pytest.raises(ValueError): - RWR.run(network=Path(TEST_DIR, 'input', 'rwr-bad-network.txt'), - nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'), - alpha=0.85, + RWR.run({"network": Path(TEST_DIR, 'input', 'rwr-bad-network.txt'), + "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')}, + args=RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) # Only run Singularity test if the binary is available on the system @@ -54,9 +54,9 @@ def test_format_error(self): def test_rwr_singularity(self): OUT_FILE.unlink(missing_ok=True) # Only include required arguments and run with Singularity - RWR.run(network=Path(TEST_DIR, 'input', 'rwr-network.txt'), - nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'), - alpha=0.85, + RWR.run({"network": Path(TEST_DIR, 'input', 'rwr-network.txt'), + "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')}, + args=RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE, container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity)) assert OUT_FILE.exists() diff --git a/test/ResponseNet/test_rn.py b/test/ResponseNet/test_rn.py index 2a3abccca..9d3c1cbaa 100644 --- a/test/ResponseNet/test_rn.py +++ b/test/ResponseNet/test_rn.py @@ -6,7 +6,7 @@ import spras.config.config as config from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings -from spras.responsenet import ResponseNet +from spras.responsenet import ResponseNet, ResponseNetParams config.init_from_file("config/config.yaml") @@ -21,9 +21,9 @@ class TestResponseNet: def test_responsenet_required(self): OUT_FILE.unlink(missing_ok=True) - ResponseNet.run(sources=TEST_DIR / 'input' / 'rn-sources.txt', - targets=TEST_DIR / 'input' / 'rn-targets.txt', - edges=TEST_DIR / 'input' / 'rn-edges.txt', + ResponseNet.run({"sources": TEST_DIR / 'input' / 'rn-sources.txt', + "targets": TEST_DIR / 'input' / 'rn-targets.txt', + "edges": TEST_DIR / 'input' / 'rn-edges.txt'}, output_file=OUT_FILE) assert OUT_FILE.exists() @@ -32,11 +32,11 @@ def test_responsenet_required(self): def test_responsenet_all_optional(self): OUT_FILE.unlink(missing_ok=True) # Include all optional arguments - ResponseNet.run(sources=TEST_DIR / 'input' / 'rn-sources.txt', - targets=TEST_DIR / 'input' / 'rn-targets.txt', - edges=TEST_DIR / 'input' / 'rn-edges.txt', + ResponseNet.run({"sources": TEST_DIR / 'input' / 'rn-sources.txt', + "targets": TEST_DIR / 'input' / 'rn-targets.txt', + "edges": TEST_DIR / 'input' / 'rn-edges.txt'}, output_file=OUT_FILE, - gamma=1) + args=ResponseNetParams(gamma=1)) assert OUT_FILE.exists() assert filecmp.cmp(OUT_FILE, EXPECTED_FILE_OPTIONAL, shallow=True) @@ -44,8 +44,8 @@ def test_responsenet_all_optional(self): def test_mincostflow_missing(self): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): - ResponseNet.run(sources=TEST_DIR / 'input' / 'rn-sources.txt', - targets=TEST_DIR / 'input' / 'rn-targets.txt', + ResponseNet.run({"sources": TEST_DIR / 'input' / 'rn-sources.txt', + "targets": TEST_DIR / 'input' / 'rn-targets.txt'}, output_file=OUT_FILE) # Only run Singularity test if the binary is available on the system @@ -54,9 +54,9 @@ def test_mincostflow_missing(self): def test_responsenet_singularity(self): OUT_FILE.unlink(missing_ok=True) - ResponseNet.run(sources=TEST_DIR / 'input' / 'rn-sources.txt', - targets=TEST_DIR / 'input' / 'rn-targets.txt', - edges=TEST_DIR / 'input' / 'rn-edges.txt', 
+ ResponseNet.run({"sources": TEST_DIR / 'input' / 'rn-sources.txt', + "targets": TEST_DIR / 'input' / 'rn-targets.txt', + "edges": TEST_DIR / 'input' / 'rn-edges.txt'}, output_file=OUT_FILE, container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity)) assert OUT_FILE.exists() diff --git a/test/ST_RWR/test_STRWR.py b/test/ST_RWR/test_STRWR.py index a4e5e1a37..be5a7c20c 100644 --- a/test/ST_RWR/test_STRWR.py +++ b/test/ST_RWR/test_STRWR.py @@ -6,7 +6,7 @@ import spras.config.config as config from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings -from spras.strwr import ST_RWR +from spras.strwr import ST_RWR, ST_RWRParams config.init_from_file("config/config.yaml") @@ -21,10 +21,10 @@ class TestSTRWR: """ def test_strwr(self): OUT_FILE.unlink(missing_ok=True) - ST_RWR.run(network=Path(TEST_DIR, 'input', 'strwr-network.txt'), - sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'), - targets=Path(TEST_DIR, 'input','strwr-targets.txt'), - alpha=0.85, + ST_RWR.run({"network": Path(TEST_DIR, 'input', 'strwr-network.txt'), + "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'), + "targets": Path(TEST_DIR, 'input','strwr-targets.txt')}, + args=ST_RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) assert OUT_FILE.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected_output', 'strwr-output.txt') @@ -35,10 +35,10 @@ def test_strwr(self): """ def test_missing_file(self): with pytest.raises(OSError): - ST_RWR.run(network=Path(TEST_DIR, 'input', 'missing.txt'), - sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'), - targets=Path(TEST_DIR, 'input','strwr-targets.txt'), - alpha=0.85, + ST_RWR.run({"network": Path(TEST_DIR, 'input', 'missing.txt'), + "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'), + "targets": Path(TEST_DIR, 'input','strwr-targets.txt')}, + args=ST_RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) """ @@ -46,10 +46,10 @@ def test_missing_file(self): """ def test_format_error(self): with pytest.raises(ValueError): - ST_RWR.run(network=Path(TEST_DIR, 'input', 'strwr-bad-network.txt'), - sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'), - targets=Path(TEST_DIR, 'input','strwr-targets.txt'), - alpha=0.85, + ST_RWR.run({"network": Path(TEST_DIR, 'input', 'strwr-bad-network.txt'), + "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'), + "targets": Path(TEST_DIR, 'input','strwr-targets.txt')}, + args=ST_RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) # Only run Singularity test if the binary is available on the system @@ -58,10 +58,10 @@ def test_format_error(self): def test_strwr_singularity(self): OUT_FILE.unlink(missing_ok=True) # Only include required arguments and run with Singularity - ST_RWR.run(network=Path(TEST_DIR, 'input', 'strwr-network.txt'), - sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'), - targets=Path(TEST_DIR, 'input','strwr-targets.txt'), - alpha=0.85, + ST_RWR.run({"network": Path(TEST_DIR, 'input', 'strwr-network.txt'), + "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'), + "targets": Path(TEST_DIR, 'input','strwr-targets.txt')}, + args=ST_RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE, container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity)) assert OUT_FILE.exists() diff --git a/test/analysis/expected_output/expected_egfr_summary.txt b/test/analysis/expected_output/expected_egfr_summary.txt index 0b4fe9ebd..a2d151a4a 100644 --- 
a/test/analysis/expected_output/expected_egfr_summary.txt +++ b/test/analysis/expected_output/expected_egfr_summary.txt @@ -1,10 +1,10 @@ Name Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in sources Nodes in targets Nodes in active Nodes in dummy Parameter combination -test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt 48 45 3 0.0398936170212766 5 2.0 16 3.882808476926124 27 0 27 27 0 {'slice_threshold': 0.3, 'module_threshold': 0.05} -test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt 1877 12845 1 0.007295700506524384 469 6.0 6 2.7973618474338107 621 1 620 621 1 {'local_search': 'Yes', 'max_path_length': 3, 'rand_restarts': 10} -test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt 28 20 8 0.05291005291005291 4 1.0 5 1.306439393939394 28 1 27 28 1 {'b': 2, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'} -test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt 39 31 8 0.04183535762483131 6 1.0 5 1.5084498834498834 39 1 38 39 1 {'b': 10, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'} -test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt 14 9 5 0.0989010989010989 4 1.0 2 1.1866666666666668 14 0 14 14 0 {'b': 0.55, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'} -test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt 593 591 2 0.0033669841848593955 32 1.0 30 6.72248989073389 531 1 530 531 1 {'b': 2, 'g': 3} -test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt 704 702 2 0.002836867968446916 35 1.0 24 6.038766691954387 616 1 615 616 1 {'b': 4, 'g': 0} +test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt 48 45 3 0.0398936170212766 5 2.0 16 3.882808476926124 27 0 27 27 0 {'module_threshold': 0.05, 'slice_threshold': 0.3} +test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt 1877 12845 1 0.007295700506524384 469 6.0 6 2.7973618474338107 621 1 620 621 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10} +test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt 28 20 8 0.05291005291005291 4 1.0 5 1.306439393939394 28 1 27 28 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 10.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01} +test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt 39 31 8 0.04183535762483131 6 1.0 5 1.5084498834498834 39 1 38 39 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 2.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01} +test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt 14 9 5 0.0989010989010989 4 1.0 2 1.1866666666666668 14 0 14 14 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 0.55, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01} +test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt 593 591 2 0.0033669841848593955 32 1.0 30 6.72248989073389 531 1 530 531 1 {'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 
'random_terminals': None, 'dummy_mode': None, 'seed': None} +test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt 704 702 2 0.002836867968446916 35 1.0 24 6.038766691954387 616 1 615 616 1 {'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None} test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt 14 17 1 0.18681318681318682 6 2.0 7 2.857142857142857 6 1 5 6 1 {'k': 10} test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt 25 32 1 0.10666666666666667 8 2.0 7 3.486666666666667 11 1 10 11 1 {'k': 20} diff --git a/test/analysis/expected_output/expected_example_summary.txt b/test/analysis/expected_output/expected_example_summary.txt index 4cb5b8c8f..78fe74d78 100644 --- a/test/analysis/expected_output/expected_example_summary.txt +++ b/test/analysis/expected_output/expected_example_summary.txt @@ -1,13 +1,13 @@ Name Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in active Nodes in dummy Nodes in sources Nodes in targets Parameter combination -test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'spras_placeholder': 'no parameters'} -test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'slice_threshold': 0.3, 'module_threshold': 0.05} -test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'max_path_length': 3, 'local_search': 'Yes', 'rand_restarts': 10} +test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {} +test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'module_threshold': 0.05, 'slice_threshold': 0.3} +test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10} test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'flow': 1, 'capacity': 1} -test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'b': 6, 'w': 5.0, 'd': 10, 'dummy_mode': 'file'} -test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'b': 6, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'} -test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'b': 5, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'} -test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'b': 5, 'w': 5.0, 'd': 10, 'dummy_mode': 'file'} -test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'b': 2, 'g': 3} -test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'b': 4, 'g': 0} +test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 
'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0} +test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0} +test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0} +test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0} +test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None} +test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None} test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 200} test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 100} diff --git a/test/analysis/input/config.yaml b/test/analysis/input/config.yaml index 15a5572fa..f16c1dbc7 100644 --- a/test/analysis/input/config.yaml +++ b/test/analysis/input/config.yaml @@ -1,42 +1,22 @@ -# The length of the hash used to identify a parameter combination hash_length: 7 containers: - # Specify the container framework used by each PRM wrapper. Valid options include: - # - docker (default if not specified) - # - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed - # - dsub -- experimental with limited support, used for running on Google Cloud with the All of Us cloud environment. - # - There is no support for other environments at the moment. framework: docker - - # Only used if container_framework is set to singularity, this will unpack the singularity containers - # to the local filesystem. This is useful when PRM containers need to run inside another container, - # such as would be the case in an HTCondor/OSPool environment. - # NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way - # that persists after the workflow is complete. To clean up the unpacked containers, the user must - # manually delete them. For convenience, these unpacked files will exist in the current working directory - # under `unpacked`. 
unpack_singularity: false - - # Allow the user to configure which container registry containers should be pulled from - # Note that this assumes container names are consistent across registries, and that the - # registry being passed doesn't require authentication for pull actions registry: base_url: docker.io - # The owner or project of the registry - # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs owner: reedcompbio algorithms: - name: "pathlinker" - params: - include: true + include: true + runs: run1: k: range(100,201,100) - name: "omicsintegrator1" - params: - include: true + include: true + runs: run1: b: [5, 6] w: np.linspace(0,5,2) @@ -44,8 +24,8 @@ algorithms: dummy_mode: ["file"] - name: "omicsintegrator2" - params: - include: true + include: true + runs: run1: b: [4] g: [0] @@ -54,27 +34,26 @@ algorithms: g: [3] - name: "meo" - params: - include: true + include: true + runs: run1: max_path_length: [3] - local_search: ["Yes"] + local_search: [true] rand_restarts: [10] - name: "mincostflow" - params: - include: true + include: true + runs: run1: flow: [1] # The flow must be an int capacity: [1] - name: "allpairs" - params: - include: true + include: true - name: "domino" - params: - include: true + include: true + runs: run1: slice_threshold: [0.3] module_threshold: [0.05] diff --git a/test/analysis/input/egfr.yaml b/test/analysis/input/egfr.yaml index d26bded2d..823db03bb 100644 --- a/test/analysis/input/egfr.yaml +++ b/test/analysis/input/egfr.yaml @@ -29,15 +29,15 @@ containers: algorithms: - name: pathlinker - params: - include: true + include: true + runs: run1: k: - 10 - 20 - name: omicsintegrator1 - params: - include: true + include: true + runs: run1: b: - 0.55 @@ -55,8 +55,8 @@ algorithms: - 0.008 dummy_mode: ["file"] - name: omicsintegrator2 - params: - include: true + include: true + runs: run1: b: - 4 @@ -68,18 +68,18 @@ algorithms: g: - 3 - name: meo - params: - include: true + include: true + runs: run1: local_search: - - "Yes" + - true max_path_length: - 3 rand_restarts: - 10 - name: domino - params: - include: true + include: true + runs: run1: slice_threshold: - 0.3 diff --git a/test/generate-inputs/inputs/test_config.yaml b/test/generate-inputs/inputs/test_config.yaml index 0c83017fe..5638d5cc2 100644 --- a/test/generate-inputs/inputs/test_config.yaml +++ b/test/generate-inputs/inputs/test_config.yaml @@ -36,7 +36,7 @@ algorithms: include: true run1: max_path_length: [3] - local_search: ["Yes"] + local_search: [true] rand_restarts: [10] - name: "mincostflow" diff --git a/test/test_config.py b/test/test_config.py index c8b05f3c5..41551c381 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -1,10 +1,16 @@ +import copy import pickle +from typing import Iterable -import numpy as np import pytest +from pydantic import BaseModel import spras.config.config as config +from spras.config.container_schema import DEFAULT_CONTAINER_PREFIX from spras.config.schema import DEFAULT_HASH_LENGTH +from spras.meo import MEOParams +from spras.mincostflow import MinCostFlowParams +from spras.omicsintegrator2 import DummyMode, OmicsIntegrator2Params filler_dataset_data: dict[str, str | list[str]] = { "data_dir": "fake", @@ -51,55 +57,38 @@ def get_test_config(): "data_dir": "gs-fake" }], "algorithms": [ + # Since there is algorithm validation, + # we are (mostly) forced to use real algorithm parameters here. + # To make this more readable, we make the 'test names' the run names. 
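(A reading aid for the test configuration that follows: in the reworked schema an algorithm's include flag sits at the top level and its parameter grids live under named runs; run names are recorded internally under the _spras_run_name key and, as the assertions later in this file encode, do not feed the parameter hash. The expansion below is a sketch of the cross-product behavior these tests assert, not the actual parser code.)

import itertools

# One algorithm entry in the new schema shape used by this test file.
algorithm = {
    "name": "meo",
    "include": True,
    "runs": {
        # List-valued parameters expand to their cross product: 1 x 2 x 2 = 4 combinations.
        "numbersAndBools": {"max_path_length": [1], "rand_restarts": [2.0, 3], "local_search": [True, False]},
    },
}

run = algorithm["runs"]["numbersAndBools"]
keys = list(run)
combos = [dict(zip(keys, values)) for values in itertools.product(*(run[k] for k in keys))]
assert len(combos) == 4  # each combination is later hashed and tagged with its run name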
+ # TODO: we don't have a test for combinations of strings anymore. This seems to be fine, + # but it would be nice to have one once we introduce an algorithm that takes more than one string parameter. { - "name": "strings", - "params": { - "include": True, - "run1": {"test": "str1", "test2": ["str2", "str3"]} + "name": "omicsintegrator2", + "include": True, + "runs": { + "strings": {"dummy_mode": ["terminals", "others"], "b": 3}, + # the irregular spacing in np.linspace is deliberate + "singleton_string_np_linspace": {"dummy_mode": "terminals", "b": "np.linspace(0, 5,2,)"}, + "str_array_np_logspace": {"dummy_mode": ["others", "all"], "g": "np.logspace(1,1)"} } }, { - "name": "numbersAndBools", - "params": { - "include": True, - "run1": {"a": 1, "b": [float(2.0), 3], "c": [4], "d": float(5.6), "f": False} + "name": "meo", + "include": True, + "runs": { + "numbersAndBoolsDuplicate": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]}, + "numbersAndBool": {"max_path_length": 2, "rand_restarts": [float(2.0), 3], "local_search": [True]}, + "numbersAndBools": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]}, + "boolArrTest": {"local_search": [True, False], "max_path_length": "range(1, 3)"} } }, { - "name": "singleton_int64_with_array", - "params": { - "include": True, - "run1": {"test": np.int64(1), "test2": [2, 3]} + "name": "mincostflow", + "include": True, + "runs": { + "int64artifact": {"flow": "np.arange(5, 7)", "capacity": [2, 3]} } }, - { - "name": "singleton_string_np_linspace", - "params": { - "include": True, - "run1": {"test": "str1", "test2": "np.linspace(0,5,2)"} - } - }, - { - "name": "str_array_np_logspace", - "params": { - "include": True, - "run1": {"test": ["a", "b"], "test2": "np.logspace(1,1)"} - } - }, - { - "name": "int64artifact", - "params": { - "include": True, - "run1": {"test": "np.arange(5,6)", "test2": [2, 3]} - } - }, - { - "name": "boolArrTest", - "params": { - "include": True, - "run1": {"flags": [True, False], "range": "range(1, 3)"} - } - } ], "analysis": { "summary": { @@ -121,22 +110,49 @@ def get_test_config(): return test_raw_config -def value_test_util(name: str, configurations: list): - assert name in config.config.algorithm_params, f"{name} isn't a present algorithm configuration!" - - keys = config.config.algorithm_params[name] - values = [config.config.algorithm_params[name][key] for key in keys] +def value_test_util(alg: str, run_name: str, param_type: type[BaseModel], configurations: Iterable[BaseModel]): + """ + Utility function for testing against specific named runs under an algorithm. + This is, unfortunately, a very holistic helper that depends on the current + state of config parsing. + """ + assert alg in config.config.algorithm_params, f"{alg} isn't a present algorithm name!" + runs = config.config.algorithm_params[alg] + # Filter using the internal _spras_run_name key. + runs = {hash: params for hash, params in runs.items() if params["_spras_run_name"] == run_name} + + # We copy the values so we don't mutate the originals + values: list[dict] = copy.deepcopy(list(runs.values())) + for value in values: + # Then remove the internal key for easy comparison. + del value["_spras_run_name"] + + # Since configurations is a list of Pydantic models, we need to turn them into dictionaries + # and exclude their defaults.
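(An aside on the comparison strategy used in this helper: model_dump(exclude_defaults=True) drops every field whose value equals its default, so the parsed run dictionaries and the hand-built expected models both reduce to only the parameters that differ from the schema defaults. A minimal self-contained Pydantic illustration, before the function body continues below:)

from pydantic import BaseModel

class P(BaseModel):
    a: int = 1
    b: int = 2

# b equals its default, so it is dropped even though it was explicitly set.
assert P(a=5, b=2).model_dump(exclude_defaults=True) == {"a": 5}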
+ new_configurations = [config.model_dump(exclude_defaults=True) for config in configurations] + + # Same for values, but we reserialize them first + values = [param_type.model_validate(value).model_dump(exclude_defaults=True) for value in values] + + # Now, we need to also remove any dynamic values from values and configurations + # (_time and seeded values) + for value in values: + value.pop("_time", None) + value.pop("seed", None) + for configuration in new_configurations: + configuration.pop("_time", None) + configuration.pop("seed", None) # https://stackoverflow.com/a/50486270/7589775 # Note: We use pickle as we also compare dictionaries in these two sets - some kind of consistent total ordering # is required for the tests to consistently pass when comparing them to `configurations`. - set_values = set(tuple(sorted(d.items())) for d in sorted(values, key=lambda x: pickle.dumps(x, protocol=3))) - set_configurations = set(tuple(sorted(d.items())) for d in sorted(configurations, key=lambda x: pickle.dumps(x, protocol=3))) + final_values = sorted(tuple(sorted(d.items())) for d in sorted(values, key=lambda x: pickle.dumps(x, protocol=3))) + final_configurations = sorted(tuple(sorted(d.items())) for d in sorted(new_configurations, key=lambda x: pickle.dumps(x, protocol=3))) - if set_values != set_configurations: - print(f'Got: {set_values}') - print(f'Expected: {set_configurations}') - assert set_values == set_configurations + if final_values != final_configurations: + print(f'Got: {final_values}') + print(f'Expected: {final_configurations}') + assert final_values == final_configurations class TestConfig: """ @@ -200,7 +216,7 @@ def test_config_container_registry(self): test_config["containers"]["registry"]["base_url"] = "" test_config["containers"]["registry"]["owner"] = "" config.init_global(test_config) - assert (config.config.container_settings.prefix == config.DEFAULT_CONTAINER_PREFIX) + assert (config.config.container_settings.prefix == DEFAULT_CONTAINER_PREFIX) def test_error_dataset_label(self): test_config = get_test_config() @@ -242,17 +258,51 @@ def test_config_values(self): test_config = get_test_config() config.init_global(test_config) - value_test_util('strings', [{'test': "str1", 'test2': "str2"}, {'test': 'str1', 'test2': 'str3'}]) - value_test_util('numbersAndBools', [{'a': 1, 'b': float(2.0), 'c': 4, 'd': 5.6, 'f': False}, {'a': 1, 'b': 3, 'c': 4, 'd': 5.6, 'f': False}]) - - value_test_util('singleton_int64_with_array', [{'test': 1, 'test2': 2}, {'test': 1, 'test2': 3}]) - value_test_util('singleton_string_np_linspace', [{'test': "str1", 'test2': 5.0}, {'test': "str1", 'test2': 0.0}]) - value_test_util('str_array_np_logspace', [{'test': "a", 'test2': 10}] * 10 + [{'test': "b", 'test2': 10}] * 10) - - value_test_util('int64artifact', [{'test': 5, 'test2': 2}, {'test': 5, 'test2': 3}]) - - value_test_util('boolArrTest', [{'flags': True, 'range': 1}, {'flags': False, 'range': 2}, - {'flags': False, 'range': 1}, {'flags': True, 'range': 2}]) + value_test_util('omicsintegrator2', 'strings', OmicsIntegrator2Params, [ + OmicsIntegrator2Params(dummy_mode=DummyMode.terminals, b=3), + OmicsIntegrator2Params(dummy_mode=DummyMode.others, b=3) + ]) + + value_test_util('omicsintegrator2', 'singleton_string_np_linspace', OmicsIntegrator2Params, [ + OmicsIntegrator2Params(dummy_mode=DummyMode.terminals, b=5.0), + OmicsIntegrator2Params(dummy_mode=DummyMode.terminals, b=0.0) + ]) + + value_test_util('omicsintegrator2', 'str_array_np_logspace', OmicsIntegrator2Params, [ + # While these both 
repeat 50 times, the parameter hash ensures the work is not duplicated. + # This also serves as a test that _time isn't inserted during parameter combinations. + OmicsIntegrator2Params(dummy_mode=DummyMode.others, g=10), OmicsIntegrator2Params(dummy_mode=DummyMode.all, g=10) + ]) + + value_test_util('meo', 'numbersAndBools', MEOParams, [ + MEOParams(max_path_length=1, rand_restarts=2, local_search=False), + MEOParams(max_path_length=1, rand_restarts=2, local_search=True), + MEOParams(max_path_length=1, rand_restarts=3, local_search=False), + MEOParams(max_path_length=1, rand_restarts=3, local_search=True), + ]) + + # This encodes the behavior that run names are not part of the parameter hash, + # so a run whose parameter combinations already exist does not duplicate them. + value_test_util('meo', 'numbersAndBoolsDuplicate', MEOParams, []) + + value_test_util('meo', 'numbersAndBool', MEOParams, [ + MEOParams(max_path_length=2, rand_restarts=2, local_search=True), + MEOParams(max_path_length=2, rand_restarts=3, local_search=True), + ]) + + value_test_util('mincostflow', 'int64artifact', MinCostFlowParams, [ + MinCostFlowParams(flow=5, capacity=2), + MinCostFlowParams(flow=5, capacity=3), + MinCostFlowParams(flow=6, capacity=2), + MinCostFlowParams(flow=6, capacity=3) + ]) + + value_test_util('meo', 'boolArrTest', MEOParams, [ + MEOParams(local_search=True, max_path_length=1), + MEOParams(local_search=True, max_path_length=2), + MEOParams(local_search=False, max_path_length=1), + MEOParams(local_search=False, max_path_length=2) + ]) @pytest.mark.parametrize("ml_include, eval_include, expected_ml, expected_eval", [ (True, True, True, True),