diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ffb138c14..b01358b46 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -142,8 +142,6 @@ and your editor's interpreter is set to using the SPRAS environment over the bas Note the behaviors of the `request_node_columns` function when there are missing values in that column of the node table and when multiple columns are requested. `request_node_columns` always returns the `NODEID` column in addition to the requested columns. -Note: If you encounter a `'property' object is not iterable` error arising from inside the Snakefile, this means that `required_inputs` is not set. This is because when `required_inputs` is not set inside an algorithm wrapper, it falls back to the underlying unimplemented function inside the PRM base class, which, while it is marked as a property function, is non-static; therefore, when the runner utility class tries to dynamically fetch `required_inputs` with reflection, it ends up grabbing the `property` function instead of the underlying error, and tries to iterate over it (since `required_inputs` is usually a list.) - Now implement the `generate_inputs` function. Start by inspecting the `omicsintegrator1.py` example, but note the differences in the expected file formats generated for the two algorithms with respect to the header rows and node prize column. The selected nodes should be any node in the dataset that has a prize set, any node that is active, any node that is a source, or any node that is a target. diff --git a/Snakefile b/Snakefile index d9ade0bc1..358a83f42 100644 --- a/Snakefile +++ b/Snakefile @@ -5,7 +5,7 @@ import yaml from spras.dataset import Dataset from spras.evaluation import Evaluation from spras.analysis import ml, summary, cytoscape -import spras.config as _config +import spras.config.config as _config # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them @@ -25,7 +25,7 @@ algorithm_params = _config.config.algorithm_params algorithm_directed = _config.config.algorithm_directed pca_params = _config.config.pca_params hac_params = _config.config.hac_params -FRAMEWORK = _config.config.container_framework +FRAMEWORK = _config.config.container_settings.framework # Return the dataset or gold_standard dictionary from the config file given the label def get_dataset(_datasets, label): diff --git a/config/config.yaml b/config/config.yaml index 034b7d4dc..30b438390 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,32 +1,36 @@ +# yaml-language-server: $schema=./schema.json + # Global workflow control # The length of the hash used to identify a parameter combination hash_length: 7 -# Specify the container framework used by each PRM wrapper. Valid options include: -# - docker (default if not specified) -# - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed -# - dsub -- experimental with limited support, used for running on Google Cloud with the All of Us cloud environment. -# - There is no support for other environments at the moment. -container_framework: docker - -# Only used if container_framework is set to singularity, this will unpack the singularity containers -# to the local filesystem. This is useful when PRM containers need to run inside another container, -# such as would be the case in an HTCondor/OSPool environment. 
-# NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way -# that persists after the workflow is complete. To clean up the unpacked containers, the user must -# manually delete them. For convenience, these unpacked files will exist in the current working directory -# under `unpacked`. -unpack_singularity: false - -# Allow the user to configure which container registry containers should be pulled from -# Note that this assumes container names are consistent across registries, and that the -# registry being passed doesn't require authentication for pull actions -container_registry: - base_url: docker.io - # The owner or project of the registry - # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs - owner: reedcompbio +# Collection of container options +containers: + # Specify the container framework used by each PRM wrapper. Valid options include: + # - docker (default if not specified) + # - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed + # - dsub -- experimental with limited support, used for running on Google Cloud with the All of Us cloud environment. + # - There is no support for other environments at the moment. + framework: docker + + # Only used if container_framework is set to singularity, this will unpack the singularity containers + # to the local filesystem. This is useful when PRM containers need to run inside another container, + # such as would be the case in an HTCondor/OSPool environment. + # NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way + # that persists after the workflow is complete. To clean up the unpacked containers, the user must + # manually delete them. For convenience, these unpacked files will exist in the current working directory + # under `unpacked`. + unpack_singularity: false + + # Allow the user to configure which container registry containers should be pulled from + # Note that this assumes container names are consistent across registries, and that the + # registry being passed doesn't require authentication for pull actions + registry: + base_url: docker.io + # The owner or project of the registry + # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs + owner: reedcompbio # This list of algorithms should be generated by a script which checks the filesystem for installs. # It shouldn't be changed by mere mortals. 
(alternatively, we could add a path to executable for each algorithm @@ -48,14 +52,14 @@ container_registry: algorithms: - name: "pathlinker" - params: - include: true + include: true + runs: run1: k: range(100,201,100) - name: "omicsintegrator1" - params: - include: true + include: true + runs: run1: b: [5, 6] w: np.linspace(0,5,2) @@ -63,8 +67,8 @@ algorithms: dummy_mode: "file" # Or "terminals", "all", "others" - name: "omicsintegrator2" - params: - include: true + include: true + runs: run1: b: 4 g: 0 @@ -73,48 +77,46 @@ algorithms: g: 3 - name: "meo" - params: - include: true + include: true + runs: run1: max_path_length: 3 - local_search: "Yes" + local_search: true rand_restarts: 10 - name: "mincostflow" - params: - include: true + include: true + runs: run1: flow: 1 # The flow must be an int capacity: 1 - name: "allpairs" - params: - include: true + include: true - name: "domino" - params: - include: true + include: true + runs: run1: slice_threshold: 0.3 module_threshold: 0.05 - name: "strwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] - name: "rwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] - name: "bowtiebuilder" - params: - include: true + include: true # Here we specify which pathways to run and other file location information. # DataLoader.py can currently only load a single dataset @@ -164,8 +166,6 @@ reconstruction_settings: # TODO move to global reconstruction_dir: "output" - run: true - analysis: # Create one summary per pathway file and a single summary table for all pathways for each dataset summary: diff --git a/config/egfr.yaml b/config/egfr.yaml index 5c5e9b07f..363d213a1 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -1,116 +1,76 @@ -# The length of the hash used to identify a parameter combination -hash_length: 7 - -# Specify the container framework used by each PRM wrapper. Valid options include: -# - docker (default if not specified) -# - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed -# - dsub -- experimental with limited support, used for running on Google Cloud -container_framework: docker - -# Only used if container_framework is set to singularity, this will unpack the singularity containers -# to the local filesystem. This is useful when PRM containers need to run inside another container, -# such as would be the case in an HTCondor/OSPool environment. -# NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way -# that persists after the workflow is complete. To clean up the unpacked containers, the user must -# manually delete them. 
-unpack_singularity: false +# yaml-language-server: $schema=./schema.json -# Allow the user to configure which container registry containers should be pulled from -# Note that this assumes container names are consistent across registries, and that the -# registry being passed doesn't require authentication for pull actions -container_registry: - base_url: docker.io - # The owner or project of the registry - # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs - owner: reedcompbio +hash_length: 7 +containers: + framework: docker + unpack_singularity: false + registry: + base_url: docker.io + owner: reedcompbio algorithms: - name: pathlinker - params: - include: true + include: true + runs: run1: k: - 10 - 20 - 70 - name: omicsintegrator1 - params: - include: true + include: true + runs: run1: b: - 0.55 - 2 - 10 - d: - - 10 - g: - - 1e-3 - r: - - 0.01 - w: - - 0.1 - mu: - - 0.008 + d: 10 + g: 1e-3 + r: 0.01 + w: 0.1 + mu: 0.008 dummy_mode: ["file"] - name: omicsintegrator2 - params: - include: true + include: true + runs: run1: - b: - - 4 - g: - - 0 + b: 4 + g: 0 run2: - b: - - 2 - g: - - 3 + b: 2 + g: 3 - name: meo - params: - include: true + include: true + runs: run1: - local_search: - - "Yes" - max_path_length: - - 3 - rand_restarts: - - 10 + local_search: true + max_path_length: 3 + rand_restarts: 10 run2: - local_search: - - "No" - max_path_length: - - 2 - rand_restarts: - - 10 + local_search: false + max_path_length: 2 + rand_restarts: 10 - name: allpairs - params: - include: true + include: true - name: domino - params: - include: true + include: true + runs: run1: - slice_threshold: - - 0.3 - module_threshold: - - 0.05 + slice_threshold: 0.3 + module_threshold: 0.05 - name: mincostflow - params: - include: true + include: true + runs: run1: - capacity: - - 15 - flow: - - 80 + capacity: 15 + flow: 80 run2: - capacity: - - 1 - flow: - - 6 + capacity: 1 + flow: 6 run3: - capacity: - - 5 - flow: - - 60 + capacity: 5 + flow: 60 datasets: - data_dir: input edge_files: @@ -129,7 +89,6 @@ gold_standards: reconstruction_settings: locations: reconstruction_dir: output/egfr - run: true analysis: cytoscape: include: true diff --git a/config/schema.json b/config/schema.json new file mode 100644 index 000000000..494736275 --- /dev/null +++ b/config/schema.json @@ -0,0 +1,1641 @@ +{ + "$defs": { + "Analysis": { + "additionalProperties": false, + "properties": { + "summary": { + "$ref": "#/$defs/SummaryAnalysis", + "default": { + "include": false + } + }, + "cytoscape": { + "$ref": "#/$defs/CytoscapeAnalysis", + "default": { + "include": false + } + }, + "ml": { + "$ref": "#/$defs/MlAnalysis", + "default": { + "include": false, + "aggregate_per_algorithm": false, + "components": 2, + "labels": true, + "linkage": "ward", + "metric": "euclidean" + } + }, + "evaluation": { + "$ref": "#/$defs/EvaluationAnalysis", + "default": { + "include": false, + "aggregate_per_algorithm": false + } + } + }, + "title": "Analysis", + "type": "object" + }, + "ContainerFramework": { + "enum": [ + "docker", + "singularity", + "dsub" + ], + "title": "ContainerFramework", + "type": "string" + }, + "ContainerRegistry": { + "additionalProperties": false, + "properties": { + "base_url": { + "default": "docker.io", + "description": "The domain of the registry", + "title": "Base Url", + "type": "string" + }, + "owner": { + "default": "reedcompbio", + "description": "The owner or project of the registry", + "title": "Owner", + "type": "string" + } + }, + "title": "ContainerRegistry", + 
"type": "object" + }, + "ContainerSettings": { + "additionalProperties": false, + "properties": { + "framework": { + "$ref": "#/$defs/ContainerFramework", + "default": "docker" + }, + "unpack_singularity": { + "default": false, + "title": "Unpack Singularity", + "type": "boolean" + }, + "registry": { + "$ref": "#/$defs/ContainerRegistry" + }, + "hash_length": { + "default": 7, + "title": "Hash Length", + "type": "integer" + } + }, + "required": [ + "registry" + ], + "title": "ContainerSettings", + "type": "object" + }, + "CytoscapeAnalysis": { + "additionalProperties": false, + "properties": { + "include": { + "title": "Include", + "type": "boolean" + } + }, + "required": [ + "include" + ], + "title": "CytoscapeAnalysis", + "type": "object" + }, + "Dataset": { + "additionalProperties": false, + "properties": { + "label": { + "title": "Label", + "type": "string" + }, + "node_files": { + "items": { + "type": "string" + }, + "title": "Node Files", + "type": "array" + }, + "edge_files": { + "items": { + "type": "string" + }, + "title": "Edge Files", + "type": "array" + }, + "other_files": { + "items": { + "type": "string" + }, + "title": "Other Files", + "type": "array" + }, + "data_dir": { + "title": "Data Dir", + "type": "string" + } + }, + "required": [ + "label", + "node_files", + "edge_files", + "other_files", + "data_dir" + ], + "title": "Dataset", + "type": "object" + }, + "DummyMode": { + "enum": [ + "terminals", + "others", + "all" + ], + "title": "DummyMode", + "type": "string" + }, + "EvaluationAnalysis": { + "additionalProperties": false, + "properties": { + "include": { + "title": "Include", + "type": "boolean" + }, + "aggregate_per_algorithm": { + "default": false, + "title": "Aggregate Per Algorithm", + "type": "boolean" + } + }, + "required": [ + "include" + ], + "title": "EvaluationAnalysis", + "type": "object" + }, + "GoldStandard": { + "additionalProperties": false, + "properties": { + "label": { + "title": "Label", + "type": "string" + }, + "node_files": { + "items": { + "type": "string" + }, + "title": "Node Files", + "type": "array" + }, + "data_dir": { + "title": "Data Dir", + "type": "string" + }, + "dataset_labels": { + "items": { + "type": "string" + }, + "title": "Dataset Labels", + "type": "array" + } + }, + "required": [ + "label", + "node_files", + "data_dir", + "dataset_labels" + ], + "title": "GoldStandard", + "type": "object" + }, + "Locations": { + "additionalProperties": false, + "properties": { + "reconstruction_dir": { + "title": "Reconstruction Dir", + "type": "string" + } + }, + "required": [ + "reconstruction_dir" + ], + "title": "Locations", + "type": "object" + }, + "MlAnalysis": { + "additionalProperties": false, + "properties": { + "include": { + "title": "Include", + "type": "boolean" + }, + "aggregate_per_algorithm": { + "default": false, + "title": "Aggregate Per Algorithm", + "type": "boolean" + }, + "components": { + "default": 2, + "title": "Components", + "type": "integer" + }, + "labels": { + "default": true, + "title": "Labels", + "type": "boolean" + }, + "linkage": { + "$ref": "#/$defs/MlLinkage", + "default": "ward" + }, + "metric": { + "$ref": "#/$defs/MlMetric", + "default": "euclidean" + } + }, + "required": [ + "include" + ], + "title": "MlAnalysis", + "type": "object" + }, + "MlLinkage": { + "enum": [ + "ward", + "complete", + "average", + "single" + ], + "title": "MlLinkage", + "type": "string" + }, + "MlMetric": { + "enum": [ + "euclidean", + "manhattan", + "cosine" + ], + "title": "MlMetric", + "type": "string" + }, + 
"ReconstructionSettings": { + "additionalProperties": false, + "properties": { + "locations": { + "$ref": "#/$defs/Locations" + } + }, + "required": [ + "locations" + ], + "title": "ReconstructionSettings", + "type": "object" + }, + "SummaryAnalysis": { + "additionalProperties": false, + "properties": { + "include": { + "title": "Include", + "type": "boolean" + } + }, + "required": [ + "include" + ], + "title": "SummaryAnalysis", + "type": "object" + }, + "allpairsModel": { + "additionalProperties": false, + "properties": { + "name": { + "const": "allpairs", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/allpairsRunModel" + }, + "default": { + "default": {} + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include" + ], + "title": "allpairsModel", + "type": "object" + }, + "allpairsRunModel": { + "additionalProperties": false, + "properties": {}, + "title": "allpairsRunModel", + "type": "object" + }, + "bowtiebuilderModel": { + "additionalProperties": false, + "properties": { + "name": { + "const": "bowtiebuilder", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/bowtiebuilderRunModel" + }, + "default": { + "default": {} + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include" + ], + "title": "bowtiebuilderModel", + "type": "object" + }, + "bowtiebuilderRunModel": { + "additionalProperties": false, + "properties": {}, + "title": "bowtiebuilderRunModel", + "type": "object" + }, + "dominoModel": { + "additionalProperties": false, + "properties": { + "name": { + "const": "domino", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/dominoRunModel" + }, + "default": { + "default": { + "_time": 1752611437.804319, + "module_threshold": null, + "slice_threshold": null + } + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include" + ], + "title": "dominoModel", + "type": "object" + }, + "dominoRunModel": { + "additionalProperties": false, + "properties": { + "_time": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The internal _time parameter. This is a parameter only given to nondeterminsitic\nalgorithms that provide no randomness seed. 
While this should be unset,\nwe allow specifying `_time` for users that want to re-use outputs of runs,\nthough this explicitly breaks the 'immutability' promise of runs.", + "title": "Time" + }, + "module_threshold": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "the p-value threshold for considering a slice as relevant (optional)", + "title": "Module Threshold" + }, + "slice_threshold": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "the p-value threshold for considering a putative module as final module (optional)", + "title": "Slice Threshold" + } + }, + "title": "dominoRunModel", + "type": "object" + }, + "meoModel": { + "additionalProperties": false, + "properties": { + "name": { + "const": "meo", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/meoRunModel" + }, + "default": { + "default": { + "max_path_length": null, + "local_search": null, + "rand_restarts": null + } + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include" + ], + "title": "meoModel", + "type": "object" + }, + "meoRunModel": { + "additionalProperties": false, + "properties": { + "max_path_length": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "the maximal length of a path from sources and targets to orient.", + "title": "Max Path Length" + }, + "local_search": { + "anyOf": [ + { + "type": "boolean" + }, + { + "items": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "a boolean parameter that enables MEO's local search functionality.\nSee \"Improving approximations with local search\" in the associated paper\nfor more information.", + "title": "Local Search" + }, + "rand_restarts": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The number of random restarts to do.", + "title": "Rand Restarts" + } + }, + "title": "meoRunModel", + "type": "object" + }, + "mincostflowModel": { + "additionalProperties": false, + "properties": { + "name": { + "const": "mincostflow", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/mincostflowRunModel" + }, + "default": { + "default": { + "flow": null, + "capacity": null + } + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include" + ], + "title": "mincostflowModel", + "type": "object" + }, + "mincostflowRunModel": { + "additionalProperties": false, + "properties": { + "flow": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + 
}, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "amount of flow going through the graph", + "title": "Flow" + }, + "capacity": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "amount of capacity allowed on each edge", + "title": "Capacity" + } + }, + "title": "mincostflowRunModel", + "type": "object" + }, + "omicsintegrator1Model": { + "additionalProperties": false, + "properties": { + "name": { + "const": "omicsintegrator1", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/omicsintegrator1RunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "omicsintegrator1Model", + "type": "object" + }, + "omicsintegrator1RunModel": { + "additionalProperties": false, + "properties": { + "dummy_mode": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Dummy Mode" + }, + "mu_squared": { + "anyOf": [ + { + "type": "boolean" + }, + { + "items": { + "type": "boolean" + }, + "type": "array" + } + ], + "default": false, + "title": "Mu Squared" + }, + "exclude_terms": { + "anyOf": [ + { + "type": "boolean" + }, + { + "items": { + "type": "boolean" + }, + "type": "array" + } + ], + "default": false, + "title": "Exclude Terms" + }, + "noisy_edges": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 0, + "description": "How many times you would like to add noise to the given edge values and re-run the algorithm.", + "title": "Noisy Edges" + }, + "shuffled_prizes": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 0, + "description": "shuffled_prizes: How many times the algorithm should shuffle the prizes and re-run", + "title": "Shuffled Prizes" + }, + "random_terminals": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 0, + "description": "How many times to apply the given prizes to random nodes in the interactome", + "title": "Random Terminals" + }, + "seed": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The random seed to use for this run. 
Defaults to the current UNIX timestamp.", + "title": "Seed" + }, + "w": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "the number of trees", + "title": "W" + }, + "b": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "the trade-off between including more terminals and using less reliable edges", + "title": "B" + }, + "d": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "controls the maximum path-length from v0 to terminal nodes", + "title": "D" + }, + "mu": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "controls the degree-based negative prizes (defualt 0.0)", + "title": "Mu" + }, + "noise": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations", + "title": "Noise" + }, + "g": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "(Gamma) multiplicative edge penalty from degree of endpoints", + "title": "G" + }, + "r": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "msgsteiner parameter that adds random noise to edges, which is rarely needed because the Forest --noisyEdges option is recommended instead (default 0)", + "title": "R" + } + }, + "required": [ + "w", + "b", + "d" + ], + "title": "omicsintegrator1RunModel", + "type": "object" + }, + "omicsintegrator2Model": { + "additionalProperties": false, + "properties": { + "name": { + "const": "omicsintegrator2", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/omicsintegrator2RunModel" + }, + "default": { + "default": { + "w": 6.0, + "b": 1.0, + "g": 20.0, + "noise": null, + "noisy_edges": null, + "random_terminals": null, + "dummy_mode": null, + "seed": 1752611437804 + } + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include" + ], + "title": "omicsintegrator2Model", + "type": "object" + }, + "omicsintegrator2RunModel": { + "additionalProperties": false, + "properties": { + "w": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 6, + "description": "Omega: the weight of the edges connecting the dummy node to the nodes selected by dummyMode", + "title": "W" + }, + "b": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" 
+ } + ], + "default": 1, + "description": "Beta: scaling factor of prizes", + "title": "B" + }, + "g": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 20, + "description": "Gamma: multiplicative edge penalty from degree of endpoints", + "title": "G" + }, + "noise": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations.", + "title": "Noise" + }, + "noisy_edges": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "An integer specifying how many times to add noise to the given edge values and re-run.", + "title": "Noisy Edges" + }, + "random_terminals": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "An integer specifying how many times to apply your given prizes to random nodes in the interactome and re-run", + "title": "Random Terminals" + }, + "dummy_mode": { + "anyOf": [ + { + "$ref": "#/$defs/DummyMode" + }, + { + "items": { + "anyOf": [ + { + "$ref": "#/$defs/DummyMode" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Tells the program which nodes in the interactome to connect the dummy node to. (default: terminals)\n \"terminals\" = connect to all terminals\n \"others\" = connect to all nodes except for terminals\n \"all\" = connect to all nodes in the interactome.", + "title": "Dummy Mode" + }, + "seed": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The random seed to use for this run. 
Defaults to the current UNIX timestamp.", + "title": "Seed" + } + }, + "title": "omicsintegrator2RunModel", + "type": "object" + }, + "pathlinkerModel": { + "additionalProperties": false, + "properties": { + "name": { + "const": "pathlinker", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/pathlinkerRunModel" + }, + "default": { + "default": { + "k": 100 + } + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include" + ], + "title": "pathlinkerModel", + "type": "object" + }, + "pathlinkerRunModel": { + "additionalProperties": false, + "properties": { + "k": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 100, + "description": "path length", + "title": "K" + } + }, + "title": "pathlinkerRunModel", + "type": "object" + }, + "rwrModel": { + "additionalProperties": false, + "properties": { + "name": { + "const": "rwr", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/rwrRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "rwrModel", + "type": "object" + }, + "rwrRunModel": { + "additionalProperties": false, + "properties": { + "threshold": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The number of nodes to return", + "title": "Threshold" + }, + "alpha": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The chance of a restart during the random walk", + "title": "Alpha" + } + }, + "required": [ + "threshold" + ], + "title": "rwrRunModel", + "type": "object" + }, + "strwrModel": { + "additionalProperties": false, + "properties": { + "name": { + "const": "strwr", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/strwrRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "strwrModel", + "type": "object" + }, + "strwrRunModel": { + "additionalProperties": false, + "properties": { + "threshold": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The number of nodes to return", + "title": "Threshold" + }, + "alpha": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The chance of a restart during the random walk", + "title": "Alpha" + } + }, + "required": [ + "threshold" + ], + "title": "strwrRunModel", + "type": "object" + } + }, + "additionalProperties": false, + "properties": { + "containers": { + "$ref": "#/$defs/ContainerSettings" + }, + "hash_length": { + "default": 7, + "description": "The length of the hash used to identify a parameter combination", + "title": 
"Hash Length", + "type": "integer" + }, + "algorithms": { + "items": { + "discriminator": { + "mapping": { + "allpairs": "#/$defs/allpairsModel", + "bowtiebuilder": "#/$defs/bowtiebuilderModel", + "domino": "#/$defs/dominoModel", + "meo": "#/$defs/meoModel", + "mincostflow": "#/$defs/mincostflowModel", + "omicsintegrator1": "#/$defs/omicsintegrator1Model", + "omicsintegrator2": "#/$defs/omicsintegrator2Model", + "pathlinker": "#/$defs/pathlinkerModel", + "rwr": "#/$defs/rwrModel", + "strwr": "#/$defs/strwrModel" + }, + "propertyName": "name" + }, + "oneOf": [ + { + "$ref": "#/$defs/allpairsModel" + }, + { + "$ref": "#/$defs/bowtiebuilderModel" + }, + { + "$ref": "#/$defs/dominoModel" + }, + { + "$ref": "#/$defs/meoModel" + }, + { + "$ref": "#/$defs/mincostflowModel" + }, + { + "$ref": "#/$defs/omicsintegrator1Model" + }, + { + "$ref": "#/$defs/omicsintegrator2Model" + }, + { + "$ref": "#/$defs/pathlinkerModel" + }, + { + "$ref": "#/$defs/rwrModel" + }, + { + "$ref": "#/$defs/strwrModel" + } + ] + }, + "title": "Algorithms", + "type": "array" + }, + "datasets": { + "items": { + "$ref": "#/$defs/Dataset" + }, + "title": "Datasets", + "type": "array" + }, + "gold_standards": { + "default": [], + "items": { + "$ref": "#/$defs/GoldStandard" + }, + "title": "Gold Standards", + "type": "array" + }, + "analysis": { + "$ref": "#/$defs/Analysis", + "default": { + "summary": { + "include": false + }, + "cytoscape": { + "include": false + }, + "ml": { + "aggregate_per_algorithm": false, + "components": 2, + "include": false, + "labels": true, + "linkage": "ward", + "metric": "euclidean" + }, + "evaluation": { + "aggregate_per_algorithm": false, + "include": false + } + } + }, + "reconstruction_settings": { + "$ref": "#/$defs/ReconstructionSettings" + } + }, + "required": [ + "containers", + "algorithms", + "datasets", + "reconstruction_settings" + ], + "title": "RawConfig", + "type": "object" +} \ No newline at end of file diff --git a/docker-wrappers/SPRAS/example_config.yaml b/docker-wrappers/SPRAS/example_config.yaml index 9d827e600..d7c85cca5 100644 --- a/docker-wrappers/SPRAS/example_config.yaml +++ b/docker-wrappers/SPRAS/example_config.yaml @@ -123,8 +123,6 @@ reconstruction_settings: # TODO move to global reconstruction_dir: "output" - run: true - analysis: # Create one summary per pathway file and a single summary table for all pathways for each dataset summary: diff --git a/environment.yml b/environment.yml index 7c2a2e98c..07a2b6e7b 100644 --- a/environment.yml +++ b/environment.yml @@ -8,6 +8,7 @@ dependencies: - matplotlib=3.6 - networkx=2.8 - pandas=1.5 + - pydantic=2.11.7 - numpy=1.26.4 - pre-commit=2.20 # Only required for development - go=1.24 # Only required for development diff --git a/pyproject.toml b/pyproject.toml index 22acca360..be3bf33ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "matplotlib==3.6", "networkx==2.8", "pandas==1.5", + "pydantic==2.11.7", "numpy==1.26.4", "pip==22.1", "requests==2.28", @@ -73,4 +74,4 @@ select = [ # py-modules tells setuptools which directory is our actual module py-modules = ["spras"] # packages tells setuptools what the exported package is called (ie allows import spras) -packages = ["spras", "spras.analysis"] +packages = ["spras", "spras.analysis", "spras.config"] diff --git a/spras/allpairs.py b/spras/allpairs.py index b462ed71c..5c1476e8a 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -1,6 +1,8 @@ import warnings from pathlib import Path +from spras.config.container_schema import 
ProcessedContainerSettings +from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( @@ -14,8 +16,9 @@ __all__ = ['AllPairs'] -class AllPairs(PRM): +class AllPairs(PRM[Empty]): required_inputs = ['nodetypes', 'network', 'directed_flag'] + dois = [] @staticmethod def generate_inputs(data: Dataset, filename_map): @@ -70,15 +73,9 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(nodetypes=None, network=None, directed_flag=None, output_file=None, container_framework="docker"): - """ - Run All Pairs Shortest Paths with Docker - @param nodetypes: input node types with sources and targets (required) - @param network: input network file (required) - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - @param output_file: path to the output pathway file (required) - """ - if not nodetypes or not network or not output_file or not directed_flag: + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() + if not inputs["nodetypes"] or not inputs["network"] or not inputs["directed_flag"]: raise ValueError('Required All Pairs Shortest Paths arguments are missing') work_dir = '/apsp' @@ -86,15 +83,15 @@ def run(nodetypes=None, network=None, directed_flag=None, output_file=None, cont # Each volume is a tuple (src, dest) volumes = list() - bind_path, node_file = prepare_volume(nodetypes, work_dir) + bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir, container_settings) volumes.append(bind_path) - bind_path, network_file = prepare_volume(network, work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) # Create the parent directories for the output file if needed Path(output_file).parent.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_file = prepare_volume(output_file, work_dir) + bind_path, mapped_out_file = prepare_volume(output_file, work_dir, container_settings) volumes.append(bind_path) command = ['python', @@ -102,17 +99,17 @@ def run(nodetypes=None, network=None, directed_flag=None, output_file=None, cont '--network', network_file, '--nodes', node_file, '--output', mapped_out_file] - if Path(directed_flag).read_text().strip() == "true": + if Path(inputs["directed_flag"]).read_text().strip() == "true": command.append("--directed") container_suffix = "allpairs:v4" run_container_and_log( 'All Pairs Shortest Paths', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) @staticmethod def parse_output(raw_pathway_file, standardized_pathway_file, params): diff --git a/spras/btb.py b/spras/btb.py index 416395a55..16bce75ae 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -1,5 +1,7 @@ from pathlib import Path +from spras.config.container_schema import ProcessedContainerSettings +from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( convert_undirected_to_directed, @@ -23,18 +25,13 @@ Interactor1 Interactor2 Weight """ -class BowTieBuilder(PRM): +class BowTieBuilder(PRM[Empty]): required_inputs = ['sources', 'targets', 'edges'] + dois = ["10.1186/1752-0509-3-67"] #generate input taken from meo.py beacuse they have same input requirements 
@staticmethod def generate_inputs(data, filename_map): - """ - Access fields from the dataset and write the required input files - @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type - @return: - """ for input_type in BowTieBuilder.required_inputs: if input_type not in filename_map: raise ValueError(f"{input_type} filename is missing") @@ -69,30 +66,22 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(sources=None, targets=None, edges=None, output_file=None, container_framework="docker"): - """ - Run BTB with Docker - @param sources: input source file (required) - @param targets: input target file (required) - @param edges: input edge file (required) - @param output_file: path to the output pathway file (required) - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ - + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() # Tests for pytest (docker container also runs this) # Testing out here avoids the trouble that container errors provide - if not sources or not targets or not edges or not output_file: + if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: raise ValueError('Required BowTieBuilder arguments are missing') - if not Path(sources).exists() or not Path(targets).exists() or not Path(edges).exists(): + if not Path(inputs["sources"]).exists() or not Path(inputs["targets"]).exists() or not Path(inputs["edges"]).exists(): raise ValueError('Missing input file') # Testing for btb index errors # TODO: This error will never actually occur if the inputs are passed through # `generate_inputs`. See the discussion about removing this or making this a habit at # https://github.com/Reed-CompBio/spras/issues/306. 
- with open(edges, 'r') as edge_file: + with open(inputs["edges"], 'r') as edge_file: try: for line in edge_file: line = line.strip().split('\t')[2] @@ -106,19 +95,19 @@ def run(sources=None, targets=None, edges=None, output_file=None, container_fram # Each volume is a tuple (src, dest) volumes = list() - bind_path, source_file = prepare_volume(sources, work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, target_file = prepare_volume(targets, work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) - bind_path, edges_file = prepare_volume(edges, work_dir) + bind_path, edges_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) # Use its --output argument to set the output file prefix to specify an absolute path and prefix out_dir = Path(output_file).parent out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + '/raw-pathway.txt' # Use posix path inside the container @@ -135,11 +124,11 @@ def run(sources=None, targets=None, edges=None, output_file=None, container_fram container_suffix = "bowtiebuilder:v2" run_container_and_log('BowTieBuilder', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) # Output is already written to raw-pathway.txt file diff --git a/spras/config.py b/spras/config.py deleted file mode 100644 index 5287815fe..000000000 --- a/spras/config.py +++ /dev/null @@ -1,310 +0,0 @@ -""" -This config file is being used as a singleton. Because python creates a single instance -of modules when they're imported, we rely on the Snakefile instantiating the module. -In particular, when the Snakefile calls init_config, it will reassign config -to take the value of the actual config provided by Snakemake. After that point, any -module that imports this module can access a config option by checking the object's -value. 
For example - -import spras.config as config -container_framework = config.config.container_framework - -will grab the top level registry configuration option as it appears in the config file -""" - -import copy as copy -import itertools as it -import os -import re -from collections.abc import Iterable - -import numpy as np -import yaml - -from spras.util import NpHashEncoder, hash_params_sha1_base32 - -# The default length of the truncated hash used to identify parameter combinations -DEFAULT_HASH_LENGTH = 7 -DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio" - -config = None - -# This will get called in the Snakefile, instantiating the singleton with the raw config -def init_global(config_dict): - global config - config = Config(config_dict) - -def init_from_file(filepath): - global config - - # Handle opening the file and parsing the yaml - filepath = os.path.abspath(filepath) - try: - with open(filepath, 'r') as yaml_file: - config_dict = yaml.safe_load(yaml_file) - except FileNotFoundError: - print(f"Error: The specified config '{filepath}' could not be found.") - return False - except yaml.YAMLError as e: - print(f"Error: Failed to parse config '{filepath}': {e}") - return False - - # And finally, initialize - config = Config(config_dict) - - -class Config: - def __init__(self, raw_config): - # Since process_config winds up modifying the raw_config passed to it as a side effect, - # we'll make a deep copy here to guarantee we don't break anything. This preserves the - # config as it's given to the Snakefile by Snakemake - - # Member vars populated by process_config. Set to None before they are populated so that our - # __init__ makes clear exactly what is being configured. - # Directory used for storing output - self.out_dir = None - # Container framework used by PRMs. Valid options are "docker", "dsub", and "singularity" - self.container_framework = None - # The container prefix (host and organization) to use for images. Default is "docker.io/reedcompbio" - self.container_prefix = DEFAULT_CONTAINER_PREFIX - # A Boolean specifying whether to unpack singularity containers. Default is False - self.unpack_singularity = False - # A dictionary to store configured datasets against which SPRAS will be run - self.datasets = None - # A dictionary to store configured gold standard data against output of SPRAS runs - self.gold_standards = None - # The hash length SPRAS will use to identify parameter combinations. Default is 7 - self.hash_length = DEFAULT_HASH_LENGTH - # The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key. - self.algorithms = None - # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. - # Only includes algorithms that are set to be run with 'include: true'. - self.algorithm_params = None - # Deprecated. Previously a dict mapping algorithm names to a Boolean tracking whether they used directed graphs. 
- self.algorithm_directed = None - # A dict with the analysis settings - self.analysis_params = None - # A dict with the ML settings - self.ml_params = None - # A Boolean specifying whether to run ML analysis for individual algorithms - self.analysis_include_ml_aggregate_algo = None - # A dict with the PCA settings - self.pca_params = None - # A dict with the hierarchical clustering settings - self.hac_params = None - # A Boolean specifying whether to run the summary analysis - self.analysis_include_summary = None - # A Boolean specifying whether to run the Cytoscape analysis - self.analysis_include_cytoscape = None - # A Boolean specifying whether to run the ML analysis - self.analysis_include_ml = None - # A Boolean specifying whether to run the Evaluation analysis - self.analysis_include_evaluation = None - # A Boolean specifying whether to run the ML per algorithm analysis - self.analysis_include_ml_aggregate_algo = None - # A Boolean specifying whether to run the evaluation per algorithm analysis - self.analysis_include_evaluation_aggregate_algo = None - - _raw_config = copy.deepcopy(raw_config) - self.process_config(_raw_config) - - def process_config(self, raw_config): - if raw_config == {}: - raise ValueError("Config file cannot be empty. Use --configfile to set a config file.") - - # Set up a few top-level config variables - self.out_dir = raw_config["reconstruction_settings"]["locations"]["reconstruction_dir"] - - # We allow the container framework not to be defined in the config. In the case it isn't, default to docker. - # However, if we get a bad value, we raise an exception. - if "container_framework" in raw_config: - container_framework = raw_config["container_framework"].lower() - if container_framework not in ("docker", "singularity", "dsub"): - msg = "SPRAS was configured to run with an unknown container framework: '" + raw_config["container_framework"] + "'. Accepted values are 'docker', 'singularity' or 'dsub'." - raise ValueError(msg) - if container_framework == "dsub": - print("Warning: 'dsub' framework integration is experimental and may not be fully supported.") - self.container_framework = container_framework - else: - self.container_framework = "docker" - - # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. - if "unpack_singularity" in raw_config: - # The value in the config is a string, and we need to convert it to a bool. - unpack_singularity = raw_config["unpack_singularity"] - if unpack_singularity and self.container_framework != "singularity": - print("Warning: unpack_singularity is set to True, but the container framework is not singularity. 
This setting will have no effect.") - self.unpack_singularity = unpack_singularity - - # Grab registry from the config, and if none is provided default to docker - if "container_registry" in raw_config and raw_config["container_registry"]["base_url"] != "" and raw_config["container_registry"]["owner"] != "": - self.container_prefix = raw_config["container_registry"]["base_url"] + "/" + raw_config["container_registry"]["owner"] - - # Parse dataset information - # Datasets is initially a list, where each list entry has a dataset label and lists of input files - # Convert the dataset list into a dict where the label is the key and update the config data structure - # TODO allow labels to be optional and assign default labels - # TODO check for collisions in dataset labels, warn, and make the labels unique - # Need to work more on input file naming to make less strict assumptions - # about the filename structure - # Currently assumes all datasets have a label and the labels are unique - # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts - # Convert to dicts to simplify the yaml logging - self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]} - - for key in self.datasets: - pattern = r'^\w+$' - if not bool(re.match(pattern, key)): - raise ValueError(f"Dataset label \'{key}\' contains invalid values. Dataset labels can only contain letters, numbers, or underscores.") - - # parse gold standard information - try: - self.gold_standards = {gold_standard["label"]: dict(gold_standard) for gold_standard in raw_config["gold_standards"]} - except: - self.gold_standards = {} - - # check that gold_standard labels are formatted correctly - for key in self.gold_standards: - pattern = r'^\w+$' - if not bool(re.match(pattern, key)): - raise ValueError(f"Gold standard label \'{key}\' contains invalid values. 
Gold standard labels can only contain letters, numbers, or underscores.") - - # check that all the dataset labels in the gold standards are existing datasets labels - dataset_labels = set(self.datasets.keys()) - gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']} - for label in gold_standard_dataset_labels: - if label not in dataset_labels: - raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.") - - # Code snipped from Snakefile that may be useful for assigning default labels - # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)] - # Maps from the dataset label to the dataset list index - # dataset_dict = {dataset.get('label', f'dataset{index}'): index for index, dataset in enumerate(datasets)} - - # Override the default parameter hash length if specified in the config file - if "hash_length" in raw_config and raw_config["hash_length"] != "": - self.hash_length = int(raw_config["hash_length"]) - - prior_params_hashes = set() - - # Parse algorithm information - # Each algorithm's parameters are provided as a list of dictionaries - # Defaults are handled in the Python function or class that wraps - # running that algorithm - # Keys in the parameter dictionary are strings - self.algorithm_params = dict() - self.algorithm_directed = dict() - self.algorithms = raw_config["algorithms"] - for alg in self.algorithms: - cur_params = alg["params"] - if "include" in cur_params and cur_params.pop("include"): - # This dict maps from parameter combinations hashes to parameter combination dictionaries - self.algorithm_params[alg["name"]] = dict() - else: - # Do not parse the rest of the parameters for this algorithm if it is not included - continue - - if "directed" in cur_params: - print("UPDATE: we no longer use the directed key in the config file") - cur_params.pop("directed") - - # The algorithm has no named arguments so create a default placeholder - if len(cur_params) == 0: - cur_params["run1"] = {"spras_placeholder": ["no parameters"]} - - # Each set of runs should be 1 level down in the config file - for run_params in cur_params: - all_runs = [] - - # We create the product of all param combinations for each run - param_name_list = [] - if cur_params[run_params]: - for p in cur_params[run_params]: - param_name_list.append(p) - obj = str(cur_params[run_params][p]) - try: - obj = [int(obj)] - except ValueError: - try: - obj = [float(obj)] - except ValueError: - # Handles arrays and special evaluation types - # TODO: do we want to explicitly bar `eval` if we may use untrusted user inputs later? - if obj.startswith(("range", "np.linspace", "np.arange", "np.logspace", "[")): - obj = eval(obj) - elif obj.lower() == "true": - obj = [True] - elif obj.lower() == "false": - obj = [False] - else: - # Catch-all for strings - obj = [obj] - if not isinstance(obj, Iterable): - raise ValueError(f"The object `{obj}` in algorithm {alg['name']} at key '{p}' in run '{run_params}' is not iterable!") from None - all_runs.append(obj) - run_list_tuples = list(it.product(*all_runs)) - param_name_tuple = tuple(param_name_list) - for r in run_list_tuples: - run_dict = dict(zip(param_name_tuple, r, strict=True)) - # TODO: Workaround for yaml.safe_dump in Snakefile write_parameter_log. 
- # We would like to preserve np info for larger floats and integers on the config, - # but this isn't strictly necessary for the pretty yaml logging that's happening - if we - # want to preserve the precision, we need to output this into yaml as strings. - for param, value in run_dict.copy().items(): - if isinstance(value, np.integer): - run_dict[param] = int(value) - if isinstance(value, np.floating): - run_dict[param] = float(value) - if isinstance(value, np.ndarray): - run_dict[param] = value.tolist() - params_hash = hash_params_sha1_base32(run_dict, self.hash_length, cls=NpHashEncoder) - if params_hash in prior_params_hashes: - raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file ' - f'(current length {self.hash_length}).') - self.algorithm_params[alg["name"]][params_hash] = run_dict - - self.analysis_params = raw_config["analysis"] if "analysis" in raw_config else {} - self.ml_params = self.analysis_params["ml"] if "ml" in self.analysis_params else {} - self.evaluation_params = self.analysis_params["evaluation"] if "evaluation" in self.analysis_params else {} - - self.pca_params = {} - if "components" in self.ml_params: - self.pca_params["components"] = self.ml_params["components"] - if "labels" in self.ml_params: - self.pca_params["labels"] = self.ml_params["labels"] - - self.hac_params = {} - if "linkage" in self.ml_params: - self.hac_params["linkage"] = self.ml_params["linkage"] - if "metric" in self.ml_params: - self.hac_params["metric"] = self.ml_params ["metric"] - - self.analysis_include_summary = raw_config["analysis"]["summary"]["include"] - self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"] - self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] - self.analysis_include_evaluation = raw_config["analysis"]["evaluation"]["include"] - - # Only run ML aggregate per algorithm if analysis include ML is set to True - if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml: - self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] - else: - self.analysis_include_ml_aggregate_algo = False - - # Raises an error if Evaluation is enabled but no gold standard data is provided - if self.gold_standards == {} and self.analysis_include_evaluation: - raise ValueError("Evaluation analysis cannot run as gold standard data not provided. 
" - "Please set evaluation include to false or provide gold standard data.") - - # Only run Evaluation if ML is set to True - if not self.analysis_include_ml: - self.analysis_include_evaluation = False - - # Only run Evaluation aggregate per algorithm if analysis include ML is set to True - if 'aggregate_per_algorithm' in self.evaluation_params and self.analysis_include_evaluation: - self.analysis_include_evaluation_aggregate_algo = raw_config["analysis"]["evaluation"]["aggregate_per_algorithm"] - else: - self.analysis_include_evaluation_aggregate_algo = False - - # Only run Evaluation per algorithm if ML per algorithm is set to True - if not self.analysis_include_ml_aggregate_algo: - self.analysis_include_evaluation_aggregate_algo = False diff --git a/spras/config/__init__.py b/spras/config/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py new file mode 100644 index 000000000..889efab35 --- /dev/null +++ b/spras/config/algorithms.py @@ -0,0 +1,161 @@ +""" +Dynamic construction of algoithm parameters with runtime type information for +parameter combinations. This has been isolated from schema.py as it is not declarative, +and rather mainly contains validators and lower-level pydantic code. +""" +import ast +import copy +from typing import Annotated, Any, Callable, Literal, Optional, Union, cast, get_args + +import numpy as np +from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, create_model + +from spras.runner import algorithms + +__all__ = ['AlgorithmUnion'] + +def is_numpy_friendly(type: type[Any] | None) -> bool: + """ + Whether the passed in type can have any numpy helpers. + This is mainly used to provide hints in the JSON schema. + """ + allowed_types = (int, float) + + # check basic types, then check optional types + return type in allowed_types or \ + any([arg for arg in get_args(type) if arg in allowed_types]) + +def python_evalish_coerce(value: Any) -> Any: + """ + Allows for using numpy and python calls. + + **Safety Note**: This does not prevent availability attacks: this can still exhaust + resources if wanted. This only prevents secret leakage. + """ + + if not isinstance(value, str): + return value + + # These strings are in the form of function calls `function.name(param1, param2, ...)`. + # Since we want to avoid `eval` (since this might be running in the secret-sensitive HTCondor), + # we need to parse these functions. + functions_dict: dict[str, Callable[[list[Any]], list[Union[int, float]]]] = { + 'range': lambda params: list(range(*params)), + "np.linspace": lambda params: list(np.linspace(*params)), + "np.arange": lambda params: list(np.arange(*params)), + "np.logspace": lambda params: list(np.logspace(*params)), + } + + # To do this, we get the AST of our string as an expression + # (filename='' is to make the error message more closely resemble that of eval.) + value_ast = ast.parse(value, mode='eval', filename='') + + # Then we do some light parsing - we're only looking to do some literal evaluation + # (allowing light python notation) and some basic function parsing. Full python programs + # should just generate a config.yaml. + + # This should always be an Expression whose body is Call (a function). + if not isinstance(value_ast.body, ast.Call): + raise ValueError(f'The python code "{value}" should be calling a function directly. 
Is this meant to be python code?') + + # We get the function name back as a string + function_name = ast.unparse(value_ast.body.func) + + # and we use the (non-availability) safe `ast.literal_eval` to support literals passed into functions. + arguments = [ast.literal_eval(arg) for arg in value_ast.body.args] + + if function_name not in functions_dict: + raise ValueError(f"{function_name} is not an allowed function to be run!") + + return functions_dict[function_name](arguments) + +def list_coerce(value: Any) -> Any: + """ + Coerces to a value to a list if it isn't already. + Used as a BeforeValidator. + """ + if not isinstance(value, list): + return [value] + return value + +def construct_algorithm_model(name: str, model: type[BaseModel], model_default: Optional[BaseModel]) -> type[BaseModel]: + """ + Dynamically constructs a parameter-combination model based on the original args model. + This is the most 'hacky' part of this code, but, thanks to pydantic, we avoid reflection + and preserve rich type information at runtime. + """ + # First, we need to take our 'model' and coerce it to permit parameter combinations. + # This assumes that all of the keys are flattened, so we only get a structure like so: + # class AlgorithmParams(BaseModel): + # key1: int + # key2: list[str] + # ... + # and we want to transform this to: + # class AlgorithmParamsCombination(BaseModel): + # key1: list[int] + # key2: list[list[str]] + # However, we want to preserve certain conveniences (singleton values, fake python evaluation), + # so we also make use of BeforeValidators to do so, and we pass over their preferences into the JSON schema. + # (Note: This function does not worry about getting the cartesian product of this.) + + # Map our fields to a list (assuming we have no nested keys), + # and specify our user convenience validators + mapped_list_field: dict[str, Annotated] = dict() + for field_name, field in model.model_fields.items(): + # We need to create a copy of the field, + # as we need to make sure that it gets mapped to the list coerced version of the field. + new_field = copy.deepcopy(field) + new_field.validate_default = True + + mapped_list_field[field_name] = (Annotated[ + list[field.annotation], + # This order isn't arbitrary. + # https://docs.pydantic.dev/latest/concepts/validators/#ordering-of-validators + # This runs second. This coerces any singletons to lists. + BeforeValidator(list_coerce, json_schema_input_type=Union[field.annotation, list[field.annotation]]), + # This runs first. This evaluates numpy utils for integer/float lists + BeforeValidator( + python_evalish_coerce, + # json_schema_input_type (sensibly) overwrites, so we have to specify the entire union again here. + json_schema_input_type=Union[field.annotation, list[field.annotation], str] + ) if is_numpy_friendly(field.annotation) else None + ], new_field) + + # Runtime assertion check: mapped_list_field does not contain any `__-prefixed` fields + for key in mapped_list_field.keys(): + assert not key.startswith("__"), f"A private key has been passed from {name}'s argument schema. " + \ + "This should have been caught by the Snakemake CI step." + + # Pass this as kwargs to create_model, which usually takes in parameters field_name=type. + # We do need to cast create_model, since otherwise the type-checker complains that we may + # have had a key that starts with __ in mapped_list_fields. The above assertion prevents this. 
+ run_model = (cast(Any, create_model))( + f'{name}RunModel', + __config__=ConfigDict(extra='forbid'), + **mapped_list_field + ) + + # Here is an example of how this would look like inside config.yaml + # name: pathlinker + # include: true + # runs: + # run1: + # (from run_model) + # ... + return create_model( + f'{name}Model', + name=Literal[name], + include=bool, + # For algorithms that have a default parameter config, we allow arbitrarily running an algorithm + # if no runs are specified. For example, the following config + # name: pathlinker + # include: true + # will run, despite there being no entries in `runs`. + # (create_model entries take in either a type or (type, default)). + runs=dict[str, run_model] if model_default is None else (dict[str, run_model], {"default": model_default}), + __config__=ConfigDict(extra='forbid') + ) + +algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model, model_default) for name, (_, model, model_default) in algorithms.items()] +# name differentriates algorithms +AlgorithmUnion = Annotated[Union[tuple(algorithm_models)], Field(discriminator='name')] diff --git a/spras/config/config.py b/spras/config/config.py new file mode 100644 index 000000000..2c0499fb7 --- /dev/null +++ b/spras/config/config.py @@ -0,0 +1,256 @@ +""" +This config file is being used as a singleton. Because python creates a single instance +of modules when they're imported, we rely on the Snakefile instantiating the module. +In particular, when the Snakefile calls init_config, it will reassign config +to take the value of the actual config provided by Snakemake. After that point, any +module that imports this module can access a config option by checking the object's +value. For example + +import spras.config.config as config +container_framework = config.config.container_framework + +will grab the top level registry configuration option as it appears in the config file +""" + +import copy as copy +import itertools as it +import os +import re +import warnings +from collections.abc import Iterable +from typing import Any + +import numpy as np +import yaml + +from spras.config.container_schema import ProcessedContainerSettings +from spras.config.schema import Analysis, RawConfig +from spras.util import NpHashEncoder, hash_params_sha1_base32 + +config = None + +# This will get called in the Snakefile, instantiating the singleton with the raw config +def init_global(config_dict): + global config + config = Config(config_dict) + +def init_from_file(filepath): + global config + + # Handle opening the file and parsing the yaml + filepath = os.path.abspath(filepath) + try: + with open(filepath, 'r') as yaml_file: + config_dict = yaml.safe_load(yaml_file) + except FileNotFoundError as e: + raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") from e + except yaml.YAMLError as e: + raise RuntimeError(f"Error: Failed to parse config '{filepath}'") from e + + # And finally, initialize + config = Config(config_dict) + + +class Config: + def __init__(self, raw_config: dict[str, Any]): + # Since snakemake provides an empty config, we provide this + # wrapper error first before passing validation to pydantic. + if raw_config == {}: + raise ValueError("Config file cannot be empty. Use --configfile to set a config file.") + + parsed_raw_config = RawConfig.model_validate(raw_config) + + # Member vars populated by process_config. 
Any values that don't have quick initial values are set to None + # before they are populated for __init__ to show exactly what is being configured. + + # Directory used for storing output + self.out_dir = parsed_raw_config.reconstruction_settings.locations.reconstruction_dir + # Container framework used by PRMs. Valid options are "docker", "dsub", and "singularity" + self.container_settings = ProcessedContainerSettings.from_container_settings(parsed_raw_config.containers, parsed_raw_config.hash_length) + # A Boolean specifying whether to unpack singularity containers. Default is False + self.unpack_singularity = False + # A dictionary to store configured datasets against which SPRAS will be run + self.datasets = None + # A dictionary to store configured gold standard data against output of SPRAS runs + self.gold_standards = None + # The hash length SPRAS will use to identify parameter combinations. + self.hash_length = parsed_raw_config.hash_length + # The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key. + self.algorithms = None + # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. + # Only includes algorithms that are set to be run with 'include: true'. + self.algorithm_params: dict[str, dict[str, Any]] = dict() + # Deprecated. Previously a dict mapping algorithm names to a Boolean tracking whether they used directed graphs. + self.algorithm_directed = None + # A dict with the analysis settings + self.analysis_params = parsed_raw_config.analysis + # A dict with the evaluation settings + self.evaluation_params = self.analysis_params.evaluation + # A dict with the ML settings + self.ml_params = self.analysis_params.ml + # A Boolean specifying whether to run ML analysis for individual algorithms + self.analysis_include_ml_aggregate_algo = None + # A dict with the PCA settings + self.pca_params = None + # A dict with the hierarchical clustering settings + self.hac_params = None + # A Boolean specifying whether to run the summary analysis + self.analysis_include_summary = None + # A Boolean specifying whether to run the Cytoscape analysis + self.analysis_include_cytoscape = None + # A Boolean specifying whether to run the ML analysis + self.analysis_include_ml = None + # A Boolean specifying whether to run the Evaluation analysis + self.analysis_include_evaluation = None + # A Boolean specifying whether to run the ML per algorithm analysis + self.analysis_include_ml_aggregate_algo = None + # A Boolean specifying whether to run the evaluation per algorithm analysis + self.analysis_include_evaluation_aggregate_algo = None + + self.process_config(parsed_raw_config) + + def process_datasets(self, raw_config: RawConfig): + """ + Parse dataset information + Datasets is initially a list, where each list entry has a dataset label and lists of input files + Convert the dataset list into a dict where the label is the key and update the config data structure + """ + # TODO allow labels to be optional and assign default labels + # Need to work more on input file naming to make less strict assumptions + # about the filename structure + # Currently assumes all datasets have a label and the labels are unique + # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts + # Convert to dicts to simplify the yaml logging + self.datasets = {} + for dataset in raw_config.datasets: + label = dataset.label + if label.lower() in [key.lower() for key in self.datasets.keys()]: + raise 
ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.") + self.datasets[label] = dict(dataset) + + # parse gold standard information + self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards} + + # check that all the dataset labels in the gold standards are existing datasets labels + dataset_labels = set(self.datasets.keys()) + gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']} + for label in gold_standard_dataset_labels: + if label not in dataset_labels: + raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.") + + # Code snipped from Snakefile that may be useful for assigning default labels + # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)] + # Maps from the dataset label to the dataset list index + # dataset_dict = {dataset.get('label', f'dataset{index}'): index for index, dataset in enumerate(datasets)} + + def process_algorithms(self, raw_config: RawConfig): + """ + Parse algorithm information + Each algorithm's parameters are provided as a list of dictionaries + Defaults are handled in the Python function or class that wraps + running that algorithm + Keys in the parameter dictionary are strings + """ + prior_params_hashes = set() + self.algorithm_params = dict() + self.algorithm_directed = dict() + self.algorithms = raw_config.algorithms + for alg in self.algorithms: + if alg.include: + # This dict maps from parameter combinations hashes to parameter combination dictionaries + self.algorithm_params[alg.name] = dict() + else: + # Do not parse the rest of the parameters for this algorithm if it is not included + continue + + runs: dict[str, Any] = alg.runs + + # Each set of runs should be 1 level down in the config file + for run_name in runs.keys(): + all_runs = [] + + # We create the product of all param combinations for each run + param_name_list = [] + run_subscriptable = vars(runs[run_name]) + for param in run_subscriptable: + param_name_list.append(param) + # this is guaranteed to be list[Any] by algorithms.py + param_values: list[Any] = run_subscriptable[param] + all_runs.append(param_values) + run_list_tuples = list(it.product(*all_runs)) + param_name_tuple = tuple(param_name_list) + for r in run_list_tuples: + run_dict = dict(zip(param_name_tuple, r, strict=True)) + # TODO: Workaround for yaml.safe_dump in Snakefile write_parameter_log. + # We would like to preserve np info for larger floats and integers on the config, + # but this isn't strictly necessary for the pretty yaml logging that's happening - if we + # want to preserve the precision, we need to output this into yaml as strings. + for param, value in run_dict.copy().items(): + if isinstance(value, np.integer): + run_dict[param] = int(value) + if isinstance(value, np.floating): + run_dict[param] = float(value) + if isinstance(value, np.ndarray): + run_dict[param] = value.tolist() + params_hash = hash_params_sha1_base32(run_dict, self.hash_length, cls=NpHashEncoder) + if params_hash in prior_params_hashes: + raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file ' + f'(current length {self.hash_length}).') + + # We preserve the run name as it carries useful information for the parameter log, + # and is useful for testing. 
+ run_dict["_spras_run_name"] = run_name + + self.algorithm_params[alg.name][params_hash] = run_dict + + def process_analysis(self, raw_config: RawConfig): + if not raw_config.analysis: + return + + # self.ml_params is a class, pca_params needs to be a dict. + self.pca_params = { + "components": self.ml_params.components, + "labels": self.ml_params.labels + } + + self.hac_params = { + "linkage": self.ml_params.linkage, + "metric": self.ml_params.metric + } + + self.analysis_include_summary = raw_config.analysis.summary.include + self.analysis_include_cytoscape = raw_config.analysis.cytoscape.include + self.analysis_include_ml = raw_config.analysis.ml.include + self.analysis_include_evaluation = raw_config.analysis.evaluation.include + + # Only run ML aggregate per algorithm if analysis include ML is set to True + if self.ml_params.aggregate_per_algorithm and self.analysis_include_ml: + self.analysis_include_ml_aggregate_algo = raw_config.analysis.ml.aggregate_per_algorithm + else: + self.analysis_include_ml_aggregate_algo = False + + # Raises an error if Evaluation is enabled but no gold standard data is provided + if self.gold_standards == {} and self.analysis_include_evaluation: + raise ValueError("Evaluation analysis cannot run as gold standard data not provided. " + "Please set evaluation include to false or provide gold standard data.") + + # Only run Evaluation if ML is set to True + if not self.analysis_include_ml: + self.analysis_include_evaluation = False + + # Only run Evaluation aggregate per algorithm if analysis include ML is set to True + if self.evaluation_params.aggregate_per_algorithm and self.analysis_include_evaluation: + self.analysis_include_evaluation_aggregate_algo = raw_config.analysis.evaluation.aggregate_per_algorithm + else: + self.analysis_include_evaluation_aggregate_algo = False + + # Only run Evaluation per algorithm if ML per algorithm is set to True + if not self.analysis_include_ml_aggregate_algo: + self.analysis_include_evaluation_aggregate_algo = False + + def process_config(self, raw_config: RawConfig): + self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir + + self.process_datasets(raw_config) + self.process_algorithms(raw_config) + self.process_analysis(raw_config) diff --git a/spras/config/container_schema.py b/spras/config/container_schema.py new file mode 100644 index 000000000..c88692678 --- /dev/null +++ b/spras/config/container_schema.py @@ -0,0 +1,70 @@ +""" +The separate container schema specification file. +For information about pydantic, see schema.py. + +We move this to a separate file to allow `containers.py` to explicitly take in +this subsection of the configuration. 
+""" + +import warnings +from dataclasses import dataclass +from typing import Optional + +from pydantic import BaseModel, ConfigDict, Field + +from spras.config.util import CaseInsensitiveEnum + +DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio" + +class ContainerFramework(CaseInsensitiveEnum): + docker = 'docker' + # TODO: add apptainer variant once #260 gets merged + singularity = 'singularity' + dsub = 'dsub' + +class ContainerRegistry(BaseModel): + base_url: str = "docker.io" + "The domain of the registry" + + owner: str = "reedcompbio" + "The owner or project of the registry" + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) + +class ContainerSettings(BaseModel): + framework: ContainerFramework = ContainerFramework.docker + unpack_singularity: bool = False + registry: ContainerRegistry + hash_length: int = 7 + + model_config = ConfigDict(extra='forbid') + +@dataclass +class ProcessedContainerSettings: + framework: ContainerFramework = ContainerFramework.docker + unpack_singularity: bool = False + prefix: str = DEFAULT_CONTAINER_PREFIX + hash_length: int = 7 + + @staticmethod + def from_container_settings(settings: ContainerSettings, default_hash_length: int) -> "ProcessedContainerSettings": + if settings.framework == ContainerFramework.dsub: + warnings.warn("'dsub' framework integration is experimental and may not be fully supported.", stacklevel=2) + container_framework = settings.framework + + # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. + if settings.unpack_singularity and container_framework != "singularity": + warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.", stacklevel=2) + unpack_singularity = settings.unpack_singularity + + # Grab registry from the config, and if none is provided default to docker + container_prefix = DEFAULT_CONTAINER_PREFIX + if settings.registry and settings.registry.base_url != "" and settings.registry.owner != "": + container_prefix = settings.registry.base_url + "/" + settings.registry.owner + + return ProcessedContainerSettings( + framework=container_framework, + unpack_singularity=unpack_singularity, + prefix=container_prefix, + hash_length=settings.hash_length or default_hash_length + ) diff --git a/spras/config/schema.py b/spras/config/schema.py new file mode 100644 index 000000000..49100bef3 --- /dev/null +++ b/spras/config/schema.py @@ -0,0 +1,150 @@ +""" +Contains the raw pydantic schema for the configuration file. + +Using Pydantic as our backing config parser allows us to declaratively +type our config, giving us more robust user errors with guarantees +that parts of the config exist after parsing it through Pydantic. 
+ +We declare models using two classes here: +- `BaseModel` (docs: https://docs.pydantic.dev/latest/concepts/models/) +- `CaseInsensitiveEnum` (see ./util.py) +""" + +import re +from typing import Annotated + +from pydantic import AfterValidator, BaseModel, ConfigDict, Field + +from spras.config.algorithms import AlgorithmUnion +from spras.config.container_schema import ContainerSettings +from spras.config.util import CaseInsensitiveEnum + + +class SummaryAnalysis(BaseModel): + include: bool + + model_config = ConfigDict(extra='forbid') + +class CytoscapeAnalysis(BaseModel): + include: bool + + model_config = ConfigDict(extra='forbid') + +class MlLinkage(CaseInsensitiveEnum): + ward = 'ward' + complete = 'complete' + average = 'average' + single = 'single' + +class MlMetric(CaseInsensitiveEnum): + euclidean = 'euclidean' + manhattan = 'manhattan' + cosine = 'cosine' + +class MlAnalysis(BaseModel): + include: bool + aggregate_per_algorithm: bool = False + components: int = 2 + labels: bool = True + linkage: MlLinkage = MlLinkage.ward + metric: MlMetric = MlMetric.euclidean + + model_config = ConfigDict(extra='forbid') + +class EvaluationAnalysis(BaseModel): + include: bool + aggregate_per_algorithm: bool = False + + model_config = ConfigDict(extra='forbid') + +class Analysis(BaseModel): + summary: SummaryAnalysis = SummaryAnalysis(include=False) + cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) + ml: MlAnalysis = MlAnalysis(include=False) + evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) + + model_config = ConfigDict(extra='forbid') + + +# The default length of the truncated hash used to identify parameter combinations +DEFAULT_HASH_LENGTH = 7 + +def label_validator(name: str): + label_pattern = r'^\w+$' + def validate(label: str): + if not bool(re.match(label_pattern, label)): + raise ValueError(f"{name} label '{label}' contains invalid values. {name} labels can only contain letters, numbers, or underscores.") + return label + return validate + +class ContainerFramework(CaseInsensitiveEnum): + docker = 'docker' + # TODO: add apptainer variant once #260 gets merged + singularity = 'singularity' + dsub = 'dsub' + +class ContainerRegistry(BaseModel): + base_url: str + owner: str = Field(description="The owner or project of the registry") + + model_config = ConfigDict(extra='forbid') + +class Dataset(BaseModel): + label: Annotated[str, AfterValidator(label_validator("Dataset"))] + node_files: list[str] + edge_files: list[str] + other_files: list[str] + data_dir: str + + model_config = ConfigDict(extra='forbid') + +class GoldStandard(BaseModel): + label: Annotated[str, AfterValidator(label_validator("Gold Standard"))] + node_files: list[str] + data_dir: str + dataset_labels: list[str] + + model_config = ConfigDict(extra='forbid') + +class Locations(BaseModel): + reconstruction_dir: str + + model_config = ConfigDict(extra='forbid') + +class ReconstructionSettings(BaseModel): + locations: Locations + + model_config = ConfigDict(extra='forbid') + +class RawConfig(BaseModel): + resume: bool = Field(alias="_resume", default=False) + """ + Declares whether a config has resumability. This is meant to be used internally, as it + enforces some extra preconditions on the config (such that all defaults must be explicitly + declared within the config, and that it meets the specified hash). 
+ + Unlike their nonresumable counterparts, these resumable configurations will store all configuration + defaults (including, most importantly, _time from NondeterministicModel and any seeded values). + + Resumable configurations are generated whenever a non-resumable configuration is run, inside + `{output}/resumables/{hash}.yaml`. The timestamp is present only for file ordering, and {hash} is a hash + of the configuration _excluding_ default values. + + By default, SPRAS runs through Snakemake will generate a resumable configuration if none is present, + or reuse the configuration associated with its hash otherwise. + """ + + containers: ContainerSettings + + hash_length: int = DEFAULT_HASH_LENGTH + "The length of the hash used to identify a parameter combination" + + # See algorithms.py for more information about AlgorithmUnion + algorithms: list[AlgorithmUnion] # type: ignore - pydantic allows this. + datasets: list[Dataset] + gold_standards: list[GoldStandard] = [] + analysis: Analysis = Analysis() + + reconstruction_settings: ReconstructionSettings + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) diff --git a/spras/config/util.py b/spras/config/util.py new file mode 100644 index 000000000..63799e478 --- /dev/null +++ b/spras/config/util.py @@ -0,0 +1,54 @@ +""" +General config utilities. This is the only config file +that should be imported by algorithms, and algorithms should +only import this config file. +""" + +import time +from enum import Enum +from typing import Any + +from pydantic import BaseModel, ConfigDict, Field + + +# https://stackoverflow.com/a/76883868/7589775 +class CaseInsensitiveEnum(str, Enum): + """ + We prefer this over Enum to make sure the config parsing + is more relaxed when it comes to string enum values. + """ + @classmethod + def _missing_(cls, value: Any): + if isinstance(value, str): + value = value.lower() + + for member in cls: + if member.lower() == value: + return member + return None + + +class Empty(BaseModel): + """ + The empty base model. Used for specifying that an algorithm takes no parameters, + yet are deterministic. + """ + model_config = ConfigDict(extra="forbid") + +class NondeterministicModel(BaseModel): + """ + A nondeterministic model. Any seedless nondeterministic algorithm should extend this. + Internally, this inserts a _time parameter that can be serialized but not + deserialized, and will affect the hash. + """ + + # We don't make this a PrivateAttr for reasons explained in the doc comment. + time: float = Field(default_factory=time.time, alias="_time") + """ + The internal _time parameter. This is a parameter only given to nondeterminsitic + algorithms that provide no randomness seed. While this should be unset, + we allow specifying `_time` for users that want to re-use outputs of runs, + though this explicitly breaks the 'immutability' promise of runs. 
+ """ + + model_config = ConfigDict(use_attribute_docstrings=True) diff --git a/spras/containers.py b/spras/containers.py index ce6d62a24..d065b2ea8 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -8,7 +8,7 @@ import docker import docker.errors -import spras.config as config +from spras.config.container_schema import ProcessedContainerSettings from spras.logging import indent from spras.util import hash_filename @@ -131,47 +131,47 @@ def env_to_items(environment: dict[str, str]) -> Iterator[str]: # TODO consider a better default environment variable # Follow docker-py's naming conventions (https://docker-py.readthedocs.io/en/stable/containers.html) # Technically the argument is an image, not a container, but we use container here. -def run_container(framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, environment: Optional[dict[str, str]] = None): +def run_container(container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, container_settings: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity or Docker - @param framework: singularity or docker @param container_suffix: name of the DockerHub container without the 'docker://' prefix @param command: command to run in the container @param volumes: a list of volumes to mount where each item is a (source, destination) tuple @param working_dir: the working directory in the container + @param container_settings: the settings to use to run the container @param environment: environment variables to set in the container @return: output from Singularity execute or Docker run """ - normalized_framework = framework.casefold() + normalized_framework = container_settings.framework.casefold() - container = config.config.container_prefix + "/" + container_suffix + container = container_settings.prefix + "/" + container_suffix if normalized_framework == 'docker': return run_container_docker(container, command, volumes, working_dir, environment) elif normalized_framework == 'singularity': - return run_container_singularity(container, command, volumes, working_dir, environment) + return run_container_singularity(container, command, volumes, working_dir, container_settings, environment) elif normalized_framework == 'dsub': return run_container_dsub(container, command, volumes, working_dir, environment) else: - raise ValueError(f'{framework} is not a recognized container framework. Choose "docker", "dsub", or "singularity".') + raise ValueError(f'{container_settings.framework} is not a recognized container framework. Choose "docker", "dsub", or "singularity".') -def run_container_and_log(name: str, framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, environment: Optional[dict[str, str]] = None): +def run_container_and_log(name: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, container_settings: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity or Docker with associated pretty printed messages. 
@param name: the display name of the running container for logging purposes - @param framework: singularity or docker @param container_suffix: name of the DockerHub container without the 'docker://' prefix @param command: command to run in the container @param volumes: a list of volumes to mount where each item is a (source, destination) tuple @param working_dir: the working directory in the container + @param container_settings: the container settings to use @param environment: environment variables to set in the container @return: output from Singularity execute or Docker run """ if not environment: environment = {'SPRAS': 'True'} - print('Running {} on container framework "{}" on env {} with command: {}'.format(name, framework, list(env_to_items(environment)), ' '.join(command)), flush=True) + print('Running {} on container framework "{}" on env {} with command: {}'.format(name, container_settings.framework, list(env_to_items(environment)), ' '.join(command)), flush=True) try: - out = run_container(framework=framework, container_suffix=container_suffix, command=command, volumes=volumes, working_dir=working_dir, environment=environment) + out = run_container(container_suffix=container_suffix, command=command, volumes=volumes, working_dir=working_dir, container_settings=container_settings, environment=environment) if out is not None: if isinstance(out, list): out = ''.join(out) @@ -290,7 +290,7 @@ def run_container_docker(container: str, command: List[str], volumes: List[Tuple return out -def run_container_singularity(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, environment: Optional[dict[str, str]] = None): +def run_container_singularity(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity. Only available on Linux. @@ -329,7 +329,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ singularity_options.extend(['--env', ",".join(env_to_items(environment))]) # Handle unpacking singularity image if needed. Potentially needed for running nested unprivileged containers - if config.config.unpack_singularity: + if config.unpack_singularity: # Split the string by "/" path_elements = container.split("/") @@ -369,7 +369,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ # Because this is called independently for each file, the same local path can be mounted to multiple volumes -def prepare_volume(filename: Union[str, PurePath], volume_base: Union[str, PurePath]) -> Tuple[Tuple[PurePath, PurePath], str]: +def prepare_volume(filename: Union[str, os.PathLike], volume_base: Union[str, PurePath], config: ProcessedContainerSettings) -> Tuple[Tuple[PurePath, PurePath], str]: """ Makes a file on the local file system accessible within a container by mapping the local (source) path to a new container (destination) path and renaming the file to be relative to the destination path. 
@@ -385,10 +385,10 @@ def prepare_volume(filename: Union[str, PurePath], volume_base: Union[str, PureP if not base_path.is_absolute(): raise ValueError(f'Volume base must be an absolute path: {volume_base}') - if isinstance(filename, PurePath): + if isinstance(filename, os.PathLike): filename = str(filename) - filename_hash = hash_filename(filename, config.config.hash_length) + filename_hash = hash_filename(filename, config.hash_length) dest = PurePosixPath(base_path, filename_hash) abs_filename = Path(filename).resolve() diff --git a/spras/domino.py b/spras/domino.py index fbf402ab9..d3d761e1f 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -1,8 +1,12 @@ import json from pathlib import Path +from typing import Optional import pandas as pd +from pydantic import ConfigDict +from spras.config.container_schema import ProcessedContainerSettings +from spras.config.util import NondeterministicModel from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( add_constant, @@ -11,11 +15,19 @@ from spras.prm import PRM from spras.util import duplicate_edges -__all__ = ['DOMINO', 'pre_domino_id_transform', 'post_domino_id_transform'] +__all__ = ['DOMINO', 'DominoParams', 'pre_domino_id_transform', 'post_domino_id_transform'] ID_PREFIX = 'ENSG0' ID_PREFIX_LEN = len(ID_PREFIX) +class DominoParams(NondeterministicModel): + module_threshold: Optional[float] = None + "the p-value threshold for considering a slice as relevant (optional)" + + slice_threshold: Optional[float] = None + "the p-value threshold for considering a putative module as final module (optional)" + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ DOMINO will construct a fully undirected graph from the provided input file @@ -26,17 +38,12 @@ - the expected raw input file should have node pairs in the 1st and 3rd columns, with a 'ppi' in the 2nd column - it can include repeated and bidirectional edges """ -class DOMINO(PRM): +class DOMINO(PRM[DominoParams]): required_inputs = ['network', 'active_genes'] + dois = ["10.15252/msb.20209593"] @staticmethod def generate_inputs(data, filename_map): - """ - Access fields from the dataset and write the required input files - @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type - @return: - """ for input_type in DOMINO.required_inputs: if input_type not in filename_map: raise ValueError(f"{input_type} filename is missing") @@ -71,20 +78,12 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(network=None, active_genes=None, output_file=None, slice_threshold=None, module_threshold=None, container_framework="docker"): - """ - Run DOMINO with Docker. - Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. - DOMINO produces multiple output module files in an HTML format. SPRAS concatenates these files into one file. 
- @param network: input network file (required) - @param active_genes: input active genes (required) - @param output_file: path to the output pathway file (required) - @param slice_threshold: the p-value threshold for considering a slice as relevant (optional) - @param module_threshold: the p-value threshold for considering a putative module as final module (optional) - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() + if not args: args = DominoParams() - if not network or not active_genes or not output_file: + # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. + if not inputs["network"] or not inputs["active_genes"]: raise ValueError('Required DOMINO arguments are missing') work_dir = '/spras' @@ -92,19 +91,19 @@ def run(network=None, active_genes=None, output_file=None, slice_threshold=None, # Each volume is a tuple (source, destination) volumes = list() - bind_path, network_file = prepare_volume(network, work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) - bind_path, node_file = prepare_volume(active_genes, work_dir) + bind_path, node_file = prepare_volume(inputs["active_genes"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) slices_file = Path(out_dir, 'slices.txt') - bind_path, mapped_slices_file = prepare_volume(str(slices_file), work_dir) + bind_path, mapped_slices_file = prepare_volume(str(slices_file), work_dir, container_settings) volumes.append(bind_path) # Make the Python command to run within the container @@ -114,11 +113,11 @@ def run(network=None, active_genes=None, output_file=None, slice_threshold=None, container_suffix = "domino" run_container_and_log('slicer', - container_framework, container_suffix, slicer_command, volumes, - work_dir) + work_dir, + container_settings) # Make the Python command to run within the container domino_command = ['domino', @@ -131,18 +130,18 @@ def run(network=None, active_genes=None, output_file=None, slice_threshold=None, '--visualization', 'true'] # Add optional arguments - if slice_threshold is not None: + if args.slice_threshold is not None: # DOMINO readme has the wrong argument https://github.com/Shamir-Lab/DOMINO/issues/12 - domino_command.extend(['--slice_threshold', str(slice_threshold)]) - if module_threshold is not None: - domino_command.extend(['--module_threshold', str(module_threshold)]) + domino_command.extend(['--slice_threshold', str(args.slice_threshold)]) + if args.module_threshold is not None: + domino_command.extend(['--module_threshold', str(args.module_threshold)]) run_container_and_log('DOMINO', - container_framework, container_suffix, domino_command, volumes, - work_dir) + work_dir, + container_settings) # DOMINO creates a new folder in out_dir to output its modules HTML files into called active_genes # The filename is determined by the input active_genes and cannot be configured @@ -158,7 +157,7 @@ def run(network=None, active_genes=None, output_file=None, slice_threshold=None, # Clean up DOMINO 
intermediate and pickle files slices_file.unlink(missing_ok=True) Path(out_dir, 'network.slices.pkl').unlink(missing_ok=True) - Path(network + '.pkl').unlink(missing_ok=True) + Path(f"{inputs['network']}.pkl").unlink(missing_ok=True) @staticmethod def parse_output(raw_pathway_file, standardized_pathway_file, params): diff --git a/spras/meo.py b/spras/meo.py index 3dc2dd863..b3b8a5973 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -1,6 +1,10 @@ import os from pathlib import Path +from typing import Optional +from pydantic import BaseModel, ConfigDict + +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( add_directionality_constant, @@ -9,7 +13,7 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['MEO', 'write_properties'] +__all__ = ['MEO', 'MEOParams', 'write_properties'] # replaces all underscores in the node names with unicode seperator # MEO keeps only the substring up to the first underscore when parsing node names @@ -56,7 +60,8 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, if max_path_length is not None: f.write(f'max.path.length = {max_path_length}\n') if local_search is not None: - f.write(f'local.search = {local_search}\n') + # Yes/No for this parameter. + f.write(f'local.search = {"Yes" if local_search else "No"}\n') if rand_restarts is not None: f.write(f'rand.restarts = {rand_restarts}\n') @@ -65,6 +70,21 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, # Do not need csp.phase, csp.gen.file, or csp.sol.file because MAXCSP is not supported +class MEOParams(BaseModel): + max_path_length: Optional[int] = None + "the maximal length of a path from sources and targets to orient." + + local_search: Optional[bool] = None + """ + a boolean parameter that enables MEO's local search functionality. + See "Improving approximations with local search" in the associated paper + for more information. + """ + + rand_restarts: Optional[int] = None + "The number of random restarts to do." + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ MEO can support partially directed graphs @@ -82,8 +102,9 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, """ -class MEO(PRM): +class MEO(PRM[MEOParams]): required_inputs = ['sources', 'targets', 'edges'] + dois = ["10.1093/nar/gkq1207"] @staticmethod def generate_inputs(data, filename_map): @@ -125,8 +146,7 @@ def generate_inputs(data, filename_map): # TODO add parameter validation # TODO document required arguments @staticmethod - def run(edges=None, sources=None, targets=None, output_file=None, max_path_length=None, local_search=None, - rand_restarts=None, container_framework="docker"): + def run(inputs, output_file=None, args=None, container_settings=None): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. @@ -134,10 +154,11 @@ def run(edges=None, sources=None, targets=None, output_file=None, max_path_lengt Does not support MINSAT or MAXCSP. Only the edge output file is retained. All other output files are deleted. 
- @param output_file: the name of the output edge file, which will overwrite any existing file with this name - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) """ - if edges is None or sources is None or targets is None or output_file is None: + if not container_settings: container_settings = ProcessedContainerSettings() + if not args: args = MEOParams() + + if inputs["edges"] is None or inputs["sources"] is None or inputs["targets"] is None: raise ValueError('Required Maximum Edge Orientation arguments are missing') work_dir = '/spras' @@ -145,44 +166,45 @@ def run(edges=None, sources=None, targets=None, output_file=None, max_path_lengt # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(edges, work_dir) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) - bind_path, source_file = prepare_volume(sources, work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, target_file = prepare_volume(targets, work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent # Maximum Edge Orientation requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_output_file = prepare_volume(str(output_file), work_dir) + bind_path, mapped_output_file = prepare_volume(str(output_file), work_dir, container_settings) volumes.append(bind_path) # Hard code the path output filename, which will be deleted path_output_file = Path(out_dir, 'path-output.txt') - bind_path, mapped_path_output = prepare_volume(str(path_output_file), work_dir) + bind_path, mapped_path_output = prepare_volume(str(path_output_file), work_dir, container_settings) volumes.append(bind_path) properties_file = 'meo-properties.txt' properties_file_local = Path(out_dir, properties_file) write_properties(filename=properties_file_local, edges=edge_file, sources=source_file, targets=target_file, edge_output=mapped_output_file, path_output=mapped_path_output, - max_path_length=max_path_length, local_search=local_search, rand_restarts=rand_restarts, framework=container_framework) - bind_path, properties_file = prepare_volume(str(properties_file_local), work_dir) + max_path_length=args.max_path_length, local_search=args.local_search, rand_restarts=args.rand_restarts, + framework=container_settings.framework) + bind_path, properties_file = prepare_volume(str(properties_file_local), work_dir, container_settings) volumes.append(bind_path) command = ['java', '-jar', '/meo/EOMain.jar', properties_file] container_suffix = "meo" run_container_and_log('Maximum Edge Orientation', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) properties_file_local.unlink(missing_ok=True) diff --git a/spras/mincostflow.py b/spras/mincostflow.py index bc60dea9e..05dd22bf5 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -1,5 +1,9 @@ from pathlib import Path +from typing import Optional +from pydantic import BaseModel, ConfigDict + +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( convert_undirected_to_directed, @@ -8,7 +12,16 @@ from spras.prm import PRM from spras.util import 
add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['MinCostFlow'] +__all__ = ['MinCostFlow', 'MinCostFlowParams'] + +class MinCostFlowParams(BaseModel): + flow: Optional[float] = None + "amount of flow going through the graph" + + capacity: Optional[float] = None + "amount of capacity allowed on each edge" + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ MinCostFlow deals with fully directed graphs @@ -22,8 +35,9 @@ - the expected raw input file should have node pairs in the 1st and 2nd columns, with the weight in the 3rd column - it can include repeated and bidirectional edges """ -class MinCostFlow (PRM): +class MinCostFlow(PRM[MinCostFlowParams]): required_inputs = ['sources', 'targets', 'edges'] + dois = ["10.1038/s41540-020-00167-1"] @staticmethod def generate_inputs(data, filename_map): @@ -59,20 +73,12 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(sources=None, targets=None, edges=None, output_file=None, flow=None, capacity=None, container_framework="docker"): - """ - Run min cost flow with Docker (or singularity) - @param sources: input sources (required) - @param targets: input targets (required) - @param edges: input network file (required) - @param output_file: output file name (required) - @param flow: amount of flow going through the graph (optional) - @param capacity: amount of capacity allowed on each edge (optional) - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() + if not args: args = MinCostFlowParams() # ensures that these parameters are required - if not sources or not targets or not edges or not output_file: + if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: raise ValueError('Required MinCostFlow arguments are missing') # the data files will be mapped within this directory within the container @@ -81,19 +87,19 @@ def run(sources=None, targets=None, edges=None, output_file=None, flow=None, cap # the tuple is for mapping the sources, targets, edges, and output volumes = list() - bind_path, sources_file = prepare_volume(sources, work_dir) + bind_path, sources_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, targets_file = prepare_volume(targets, work_dir) + bind_path, targets_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) - bind_path, edges_file = prepare_volume(edges, work_dir) + bind_path, edges_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) # Create a prefix for the output filename and ensure the directory exists out_dir = Path(output_file).parent out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + '/out' @@ -106,21 +112,21 @@ def run(sources=None, targets=None, edges=None, output_file=None, flow=None, cap '--output', mapped_out_prefix] # Optional arguments (extend the command if available) - if flow is not None: - command.extend(['--flow', str(flow)]) - if capacity is not None: - command.extend(['--capacity', str(capacity)]) + if args.flow is not None: + 
command.extend(['--flow', str(args.flow)]) + if args.capacity is not None: + command.extend(['--capacity', str(args.capacity)]) # choosing to run in docker or singularity container container_suffix = "mincostflow" # constructs a docker run call run_container_and_log('MinCostFlow', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) # Check the output of the container out_dir_content = sorted(out_dir.glob('*.sif')) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 5f899bb85..9d1396902 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -1,11 +1,16 @@ +import time from pathlib import Path +from typing import Optional +from pydantic import BaseModel, ConfigDict, Field + +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import reinsert_direction_col_mixed from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['OmicsIntegrator1', 'write_conf'] +__all__ = ['OmicsIntegrator1', 'OmicsIntegrator1Params', 'write_conf'] # TODO decide on default number of processes and threads @@ -35,8 +40,47 @@ def write_conf(filename=Path('config.txt'), w=None, b=None, d=None, mu=None, noi f.write('processes = 1\n') f.write('threads = 1\n') +class OmicsIntegrator1Params(BaseModel): + dummy_mode: Optional[str] = None + mu_squared: bool = False + exclude_terms: bool = False + + noisy_edges: int = 0 + "How many times you would like to add noise to the given edge values and re-run the algorithm." + + shuffled_prizes: int = 0 + "shuffled_prizes: How many times the algorithm should shuffle the prizes and re-run" + + random_terminals: int = 0 + "How many times to apply the given prizes to random nodes in the interactome" + + seed: int = Field(default_factory=lambda _: int(time.time() * 1000)) + "The random seed to use for this run. Defaults to the current UNIX timestamp." 
+ + w: int + "the number of trees" + + b: float + "the trade-off between including more terminals and using less reliable edges" + + d: int + "controls the maximum path-length from v0 to terminal nodes" -class OmicsIntegrator1(PRM): + mu: Optional[float] = None + "controls the degree-based negative prizes (defualt 0.0)" + + noise: Optional[float] = None + "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations" + + g: Optional[float] = None + "(Gamma) multiplicative edge penalty from degree of endpoints" + + r: Optional[float] = None + "msgsteiner parameter that adds random noise to edges, which is rarely needed because the Forest --noisyEdges option is recommended instead (default 0)" + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) + +class OmicsIntegrator1(PRM[OmicsIntegrator1Params]): """ Omics Integrator 1 works with partially directed graphs - it takes in the universal input directly @@ -50,6 +94,7 @@ class OmicsIntegrator1(PRM): """ required_inputs = ['prizes', 'edges', 'dummy_nodes'] + dois = ["10.1371/journal.pcbi.1004879"] @staticmethod def generate_inputs(data, filename_map): @@ -95,27 +140,12 @@ def generate_inputs(data, filename_map): with open(filename_map['dummy_nodes'], mode='w'): pass - # TODO add parameter validation # TODO add support for knockout argument # TODO add reasonable default values - # TODO document required arguments @staticmethod - def run(edges=None, prizes=None, dummy_nodes=None, dummy_mode=None, mu_squared=None, exclude_terms=None, - output_file=None, noisy_edges=None, shuffled_prizes=None, random_terminals=None, - seed=None, w=None, b=None, d=None, mu=None, noise=None, g=None, r=None, container_framework="docker"): - """ - Run Omics Integrator 1 in the Docker image with the provided parameters. - Does not support the garnet, cyto30, knockout, cv, or cv-reps arguments. - The configuration file is generated from the provided arguments. - Does not support the garnetBeta, processes, or threads configuration file parameters. - The msgpath is not required because msgsteiner is available in the Docker image. - Only the optimal forest sif file is retained. - All other output files are deleted. - @param output_file: the name of the output sif file for the optimal forest, which will overwrite any - existing file with this name - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ - if edges is None or prizes is None or output_file is None or w is None or b is None or d is None: + def run(inputs, output_file, args, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() + if inputs["edges"] is None or inputs["prizes"] is None or output_file is None: raise ValueError('Required Omics Integrator 1 arguments are missing') work_dir = '/spras' @@ -123,10 +153,10 @@ def run(edges=None, prizes=None, dummy_nodes=None, dummy_mode=None, mu_squared=N # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(edges, work_dir) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) - bind_path, prize_file = prepare_volume(prizes, work_dir) + bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir, container_settings) volumes.append(bind_path) # 4 dummy mode possibilities: @@ -136,23 +166,24 @@ def run(edges=None, prizes=None, dummy_nodes=None, dummy_mode=None, mu_squared=N # 4. 
file -> connect the dummy node to a specific list of nodes provided in a file # add dummy node file to the volume if dummy_mode is not None and it is 'file' - if dummy_mode == 'file': - if dummy_nodes is None: + if args.dummy_mode == 'file': + if inputs["dummy_nodes"] is None: raise ValueError("dummy_nodes file is required when dummy_mode is set to 'file'") - bind_path, dummy_file = prepare_volume(dummy_nodes, work_dir) + bind_path, dummy_file = prepare_volume(inputs["dummy_nodes"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent # Omics Integrator 1 requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) conf_file = 'oi1-configuration.txt' conf_file_local = Path(out_dir, conf_file) # Temporary file that will be deleted after running Omics Integrator 1 - write_conf(conf_file_local, w=w, b=b, d=d, mu=mu, noise=noise, g=g, r=r) - bind_path, conf_file = prepare_volume(str(conf_file_local), work_dir) + write_conf(conf_file_local, w=args.w, b=args.b, d=args.d, mu=args.mu, + noise=args.noise, g=args.g, r=args.r) + bind_path, conf_file = prepare_volume(str(conf_file_local), work_dir, container_settings) volumes.append(bind_path) command = ['python', '/OmicsIntegrator/scripts/forest.py', @@ -164,35 +195,31 @@ def run(edges=None, prizes=None, dummy_nodes=None, dummy_mode=None, mu_squared=N '--outlabel', 'oi1'] # add the dummy mode argument - if dummy_mode is not None and dummy_mode: + if args.dummy_mode is not None and args.dummy_mode: # for custom dummy modes, add the file - if dummy_mode == 'file': - command.extend(['--dummyMode', dummy_file]) + if args.dummy_mode == 'file': + command.extend(['--dummyMode', str(inputs["dummy_file"])]) # else pass in the dummy_mode and let oi1 handle it else: - command.extend(['--dummyMode', dummy_mode]) + command.extend(['--dummyMode', args.dummy_mode]) # Add optional arguments - if mu_squared is not None and mu_squared: + if args.mu_squared: command.extend(['--musquared']) - if exclude_terms is not None and exclude_terms: + if args.exclude_terms: command.extend(['--excludeTerms']) - if noisy_edges is not None: - command.extend(['--noisyEdges', str(noisy_edges)]) - if shuffled_prizes is not None: - command.extend(['--shuffledPrizes', str(shuffled_prizes)]) - if random_terminals is not None: - command.extend(['--randomTerminals', str(random_terminals)]) - if seed is not None: - command.extend(['--seed', str(seed)]) + command.extend(['--noisyEdges', str(args.noisy_edges)]) + command.extend(['--shuffledPrizes', str(args.shuffled_prizes)]) + command.extend(['--randomTerminals', str(args.random_terminals)]) + command.extend(['--seed', str(args.seed)]) container_suffix = "omics-integrator-1:no-conda" # no-conda version is the default run_container_and_log('Omics Integrator 1', - container_framework, container_suffix, # no-conda version is the default command, volumes, work_dir, + container_settings, {'TMPDIR': mapped_out_dir}) conf_file_local.unlink(missing_ok=True) diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 0d0fd6b39..8b5c29799 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -1,14 +1,56 @@ +import time from pathlib import Path +from typing import Optional import pandas as pd +from pydantic import BaseModel, ConfigDict, Field +from spras.config.container_schema 
import ProcessedContainerSettings +from spras.config.util import CaseInsensitiveEnum from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import reinsert_direction_col_undirected from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges -__all__ = ['OmicsIntegrator2'] +__all__ = ['OmicsIntegrator2', 'OmicsIntegrator2Params'] + +class DummyMode(CaseInsensitiveEnum): + terminals = 'terminals' + others = 'others' + all = 'all' + +class OmicsIntegrator2Params(BaseModel): + w: float = 6 + "Omega: the weight of the edges connecting the dummy node to the nodes selected by dummyMode" + + b: float = 1 + "Beta: scaling factor of prizes" + + g: float = 20 + "Gamma: multiplicative edge penalty from degree of endpoints" + + noise: Optional[float] = None + "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations." + + noisy_edges: Optional[int] = None + "An integer specifying how many times to add noise to the given edge values and re-run." + + random_terminals: Optional[int] = None + "An integer specifying how many times to apply your given prizes to random nodes in the interactome and re-run" + + dummy_mode: Optional[DummyMode] = None + """ + Tells the program which nodes in the interactome to connect the dummy node to. (default: terminals) + "terminals" = connect to all terminals + "others" = connect to all nodes except for terminals + "all" = connect to all nodes in the interactome. + """ + + seed: int = Field(default_factory=lambda _: int(time.time() * 1000)) + "The random seed to use for this run. Defaults to the current UNIX timestamp." + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ Omics Integrator 2 will construct a fully undirected graph from the provided input file @@ -20,9 +62,12 @@ - the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column - it can include repeated and bidirectional edges """ -class OmicsIntegrator2(PRM): +class OmicsIntegrator2(PRM[OmicsIntegrator2Params]): required_inputs = ['prizes', 'edges'] + # OI2 does not have a specific paper. Instead, we link to the OI1 paper. + dois = ["10.1371/journal.pcbi.1004879"] + @staticmethod def generate_inputs(data: Dataset, filename_map): """ Access fields from the dataset and write the required input files. @@ -63,12 +108,10 @@ def generate_inputs(data: Dataset, filename_map): edges_df.to_csv(filename_map['edges'], sep='\t', index=False, columns=['Interactor1', 'Interactor2', 'cost'], header=['protein1', 'protein2', 'cost']) - # TODO add parameter validation # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise=None, noisy_edges=None, - random_terminals=None, dummy_mode=None, seed=None, container_framework="docker"): + def run(inputs, output_file, args=None, container_settings=None): """ Run Omics Integrator 2 in the Docker image with the provided parameters. Only the .tsv output file is retained and then renamed. 
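# A minimal usage sketch for the params model above (hedged: assumes spras and pydantic are
# importable in the current environment; the values are illustrative, not recommended settings).
from pydantic import ValidationError

from spras.omicsintegrator2 import DummyMode, OmicsIntegrator2Params

params = OmicsIntegrator2Params(b=2, g=3, dummy_mode="terminals")
assert params.dummy_mode is DummyMode.terminals   # plain strings are coerced to the enum
assert params.w == 6                              # unset fields keep their documented defaults

try:
    OmicsIntegrator2Params(unknown_option=1)      # extra='forbid' rejects unrecognized keys
except ValidationError:
    pass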
@@ -76,7 +119,10 @@ def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise
         @param output_file: the name of the output file, which will overwrite any existing file with this name
         @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional)
         """
-        if edges is None or prizes is None or output_file is None:
+        if not container_settings: container_settings = ProcessedContainerSettings()
+        if not args: args = OmicsIntegrator2Params()
+
+        if inputs["edges"] is None or inputs["prizes"] is None:
             raise ValueError('Required Omics Integrator 2 arguments are missing')
 
         work_dir = '/spras'
@@ -84,47 +130,46 @@
         # Each volume is a tuple (src, dest)
         volumes = list()
 
-        bind_path, edge_file = prepare_volume(edges, work_dir)
+        bind_path, edge_file = prepare_volume(inputs["edges"], work_dir, container_settings)
         volumes.append(bind_path)
 
-        bind_path, prize_file = prepare_volume(prizes, work_dir)
+        bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir, container_settings)
         volumes.append(bind_path)
 
         out_dir = Path(output_file).parent
         # Omics Integrator 2 requires that the output directory exist
         out_dir.mkdir(parents=True, exist_ok=True)
-        bind_path, mapped_out_dir = prepare_volume(out_dir, work_dir)
+        bind_path, mapped_out_dir = prepare_volume(out_dir, work_dir, container_settings)
         volumes.append(bind_path)
 
         command = ['OmicsIntegrator', '-e', edge_file, '-p', prize_file,
                    '-o', mapped_out_dir, '--filename', 'oi2']
 
         # Add optional arguments
-        if w is not None:
-            command.extend(['-w', str(w)])
-        if b is not None:
-            command.extend(['-b', str(b)])
-        if g is not None:
-            command.extend(['-g', str(g)])
-        if noise is not None:
-            command.extend(['-noise', str(noise)])
-        if noisy_edges is not None:
-            command.extend(['--noisy_edges', str(noisy_edges)])
-        if random_terminals is not None:
-            command.extend(['--random_terminals', str(random_terminals)])
-        if dummy_mode is not None:
+        if args.w is not None:
+            command.extend(['-w', str(args.w)])
+        if args.b is not None:
+            command.extend(['-b', str(args.b)])
+        if args.g is not None:
+            command.extend(['-g', str(args.g)])
+        if args.noise is not None:
+            command.extend(['-noise', str(args.noise)])
+        if args.noisy_edges is not None:
+            command.extend(['--noisy_edges', str(args.noisy_edges)])
+        if args.random_terminals is not None:
+            command.extend(['--random_terminals', str(args.random_terminals)])
+        if args.dummy_mode is not None:
             # This argument does not follow the other naming conventions
-            command.extend(['--dummyMode', str(dummy_mode)])
-        if seed is not None:
-            command.extend(['--seed', str(seed)])
+            command.extend(['--dummyMode', str(args.dummy_mode)])
+        command.extend(['--seed', str(args.seed)])
 
         container_suffix = "omics-integrator-2:v2"
         run_container_and_log('Omics Integrator 2',
-                              container_framework,
                               container_suffix,
                               command,
                               volumes,
-                              work_dir)
+                              work_dir,
+                              container_settings)
 
         # TODO do we want to retain other output files?
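# A hedged invocation sketch for the rewritten run() above: inputs are keyed by the names in
# required_inputs, tunable parameters travel in the params model, and omitting
# container_settings falls back to ProcessedContainerSettings() defaults. The file paths are
# placeholders and a working container framework (Docker by default) is assumed.
from spras.omicsintegrator2 import OmicsIntegrator2, OmicsIntegrator2Params

OmicsIntegrator2.run({"edges": "input/oi2-edges.txt", "prizes": "input/oi2-prizes.txt"},
                     output_file="output/oi2-pathway.tsv",
                     args=OmicsIntegrator2Params(b=2))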
# TODO if deleting other output files, write them all to a tmp directory and copy diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 0a50ffb41..f71015f0e 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -1,6 +1,10 @@ import warnings from pathlib import Path +from typing import Optional +from pydantic import BaseModel, ConfigDict + +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( @@ -10,7 +14,13 @@ from spras.prm import PRM from spras.util import duplicate_edges, raw_pathway_df -__all__ = ['PathLinker'] +__all__ = ['PathLinker', 'PathLinkerParams'] + +class PathLinkerParams(BaseModel): + k: int = 100 + "path length" + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ Pathlinker will construct a fully directed graph from the provided input file @@ -22,8 +32,9 @@ - the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column - it can include repeated and bidirectional edges """ -class PathLinker(PRM): +class PathLinker(PRM[PathLinkerParams]): required_inputs = ['nodetypes', 'network'] + dois = ["10.1038/npjsba.2016.2", "10.1089/cmb.2012.0274"] @staticmethod def generate_inputs(data, filename_map): @@ -65,22 +76,12 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map["network"],sep="\t",index=False,columns=["Interactor1","Interactor2","Weight"], header=["#Interactor1","Interactor2","Weight"]) - # Skips parameter validation step @staticmethod - def run(nodetypes=None, network=None, output_file=None, k=None, container_framework="docker"): - """ - Run PathLinker with Docker - @param nodetypes: input node types with sources and targets (required) - @param network: input network file (required) - @param output_file: path to the output pathway file (required) - @param k: path length (optional) - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ - # Add additional parameter validation - # Do not require k - # Use the PathLinker default - # Could consider setting the default here instead - if not nodetypes or not network or not output_file: + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() + if not args: args = PathLinkerParams() + + if not inputs["nodetypes"] or not inputs["network"]: raise ValueError('Required PathLinker arguments are missing') work_dir = '/spras' @@ -88,10 +89,10 @@ def run(nodetypes=None, network=None, output_file=None, k=None, container_framew # Each volume is a tuple (src, dest) volumes = list() - bind_path, node_file = prepare_volume(nodetypes, work_dir) + bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir, container_settings) volumes.append(bind_path) - bind_path, network_file = prepare_volume(network, work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) # PathLinker does not provide an argument to set the output directory @@ -99,7 +100,7 @@ def run(nodetypes=None, network=None, output_file=None, k=None, container_framew out_dir = Path(output_file).parent # PathLinker requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir 
= prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + '/out' # Use posix path inside the container @@ -109,17 +110,15 @@ def run(nodetypes=None, network=None, output_file=None, k=None, container_framew node_file, '--output', mapped_out_prefix] - # Add optional argument - if k is not None: - command.extend(['-k', str(k)]) + command.extend(['-k', str(args.k)]) container_suffix = "pathlinker:v2" run_container_and_log('PathLinker', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) # Rename the primary output file to match the desired output filename # Currently PathLinker only writes one output file so we do not need to delete others diff --git a/spras/prm.py b/spras/prm.py index 944873176..d52214083 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -1,32 +1,53 @@ +import os from abc import ABC, abstractmethod -from typing import Any +from typing import Any, Generic, TypeVar, cast +from pydantic import BaseModel + +from spras.config.container_schema import ProcessedContainerSettings from spras.dataset import Dataset +T = TypeVar('T', bound=BaseModel) -class PRM(ABC): +class PRM(ABC, Generic[T]): """ The PRM (Pathway Reconstruction Module) class, which defines the interface that `runner.py` uses to handle algorithms. """ - @property - @staticmethod - @abstractmethod - def required_inputs(self): - # Note: This NotImplementedError will never trigger. - # See CONTRIBUTING.md for more information. - raise NotImplementedError + required_inputs: list[str] = [] + # DOIs aren't strictly required (e.g. local neighborhood), + # but it should be explicitly declared that there are no DOIs. + dois: list[str] = cast(list[str], None) + + def __init_subclass__(cls): + # modified from https://stackoverflow.com/a/58206480/7589775 + props = ["required_inputs", "dois"] + for prop in props: + if getattr(PRM, prop) is getattr(cls, prop): + raise NotImplementedError( + "Attribute '{}' has not been overriden in class '{}'" \ + .format(prop, cls.__name__) + ) @staticmethod @abstractmethod def generate_inputs(data: Dataset, filename_map: dict[str, str]): + """ + Access fields from the dataset and write the required input files + @param data: dataset + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type + """ raise NotImplementedError @staticmethod @abstractmethod - def run(**kwargs): + def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: T, container_settings: ProcessedContainerSettings): + """ + Runs an algorithm with the specified inputs, algorithm params (T), + the designated output_file, and the desired container_settings. 
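# A hedged sketch of the subclass contract enforced by __init_subclass__ above: a new wrapper
# must override both required_inputs and dois when the class is defined, otherwise class
# creation itself raises NotImplementedError. Toy and ToyParams are hypothetical names used
# only for illustration; the method bodies are stubs, not a real algorithm.
from pydantic import BaseModel

from spras.prm import PRM

class ToyParams(BaseModel):
    k: int = 10

class Toy(PRM[ToyParams]):
    required_inputs = ["network"]
    dois = []   # declare "no DOIs" explicitly rather than inheriting the unset default

    @staticmethod
    def generate_inputs(data, filename_map): ...

    @staticmethod
    def run(inputs, output_file, args, container_settings): ...

    @staticmethod
    def parse_output(raw_pathway_file, standardized_pathway_file, params): ...

# Dropping the dois line (or required_inputs) reproduces the NotImplementedError that
# __init_subclass__ raises for attributes that were never overridden.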
+ """ raise NotImplementedError @staticmethod diff --git a/spras/runner.py b/spras/runner.py index 735925007..209a32f42 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -1,27 +1,48 @@ -from typing import Any +from typing import Any, Optional + +from pydantic import BaseModel # supported algorithm imports -from spras.allpairs import AllPairs as allpairs -from spras.btb import BowTieBuilder as bowtiebuilder +from spras.allpairs import AllPairs +from spras.btb import BowTieBuilder +from spras.config.util import Empty from spras.dataset import Dataset -from spras.domino import DOMINO as domino -from spras.meo import MEO as meo -from spras.mincostflow import MinCostFlow as mincostflow -from spras.omicsintegrator1 import OmicsIntegrator1 as omicsintegrator1 -from spras.omicsintegrator2 import OmicsIntegrator2 as omicsintegrator2 -from spras.pathlinker import PathLinker as pathlinker -from spras.rwr import RWR as rwr -from spras.strwr import ST_RWR as strwr +from spras.domino import DOMINO, DominoParams +from spras.meo import MEO, MEOParams +from spras.mincostflow import MinCostFlow, MinCostFlowParams +from spras.omicsintegrator1 import OmicsIntegrator1, OmicsIntegrator1Params +from spras.omicsintegrator2 import OmicsIntegrator2, OmicsIntegrator2Params +from spras.pathlinker import PathLinker, PathLinkerParams +from spras.prm import PRM +from spras.rwr import RWR, RWRParams +from spras.strwr import ST_RWR, ST_RWRParams + +# Algorithm names to a three-tuple of (PRM, BaseModel, default BaseModel or None if there are no good defaults). +# This is used for the configuration and to fetch algorithms during reconstruction +algorithms: dict[str, tuple[type[PRM], type[BaseModel], Optional[BaseModel]]] = { + "allpairs": (AllPairs, Empty, Empty()), + "bowtiebuilder": (BowTieBuilder, Empty, Empty()), + "domino": (DOMINO, DominoParams, DominoParams()), + "meo": (MEO, MEOParams, MEOParams()), + "mincostflow": (MinCostFlow, MinCostFlowParams, MinCostFlowParams()), + "omicsintegrator1": (OmicsIntegrator1, OmicsIntegrator1Params, None), + "omicsintegrator2": (OmicsIntegrator2, OmicsIntegrator2Params, OmicsIntegrator2Params()), + "pathlinker": (PathLinker, PathLinkerParams, PathLinkerParams()), + "rwr": (RWR, RWRParams, None), + "strwr": (ST_RWR, ST_RWRParams, None), +} +def get_algorithm(algorithm: str) -> type[PRM]: + try: + return algorithms[algorithm.lower()][0] + except KeyError as exc: + raise NotImplementedError(f'{algorithm} is not currently supported.') from exc def run(algorithm: str, params): """ A generic interface to the algorithm-specific run functions """ - try: - algorithm_runner = globals()[algorithm.lower()] - except KeyError as exc: - raise NotImplementedError(f'{algorithm} is not currently supported') from exc + algorithm_runner = get_algorithm(algorithm) algorithm_runner.run(**params) @@ -31,10 +52,7 @@ def get_required_inputs(algorithm: str): @param algorithm: algorithm name @return: A list of strings of input files types """ - try: - algorithm_runner = globals()[algorithm.lower()] - except KeyError as exc: - raise NotImplementedError(f'{algorithm} is not currently supported') from exc + algorithm_runner = get_algorithm(algorithm) return algorithm_runner.required_inputs @@ -57,10 +75,7 @@ def prepare_inputs(algorithm: str, data_file: str, filename_map: dict[str, str]) @return: """ dataset = Dataset.from_file(data_file) - try: - algorithm_runner = globals()[algorithm.lower()] - except KeyError as exc: - raise NotImplementedError(f'{algorithm} is not currently supported') from exc + 
algorithm_runner = get_algorithm(algorithm) return algorithm_runner.generate_inputs(dataset, filename_map) @@ -71,8 +86,5 @@ def parse_output(algorithm: str, raw_pathway_file: str, standardized_pathway_fil @param raw_pathway_file: pathway file produced by an algorithm's run function @param standardized_pathway_file: the same pathway written in the universal format """ - try: - algorithm_runner = globals()[algorithm.lower()] - except KeyError as exc: - raise NotImplementedError(f'{algorithm} is not currently supported') from exc + algorithm_runner = get_algorithm(algorithm) return algorithm_runner.parse_output(raw_pathway_file, standardized_pathway_file, params) diff --git a/spras/rwr.py b/spras/rwr.py index adeccaaed..a46e734e6 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -1,17 +1,30 @@ from pathlib import Path +from typing import Optional import pandas as pd +from pydantic import BaseModel, ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container from spras.dataset import Dataset from spras.interactome import reinsert_direction_col_directed from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['RWR'] +__all__ = ['RWR', 'RWRParams'] -class RWR(PRM): +class RWRParams(BaseModel): + threshold: int + "The number of nodes to return" + + alpha: Optional[float] = None + "The chance of a restart during the random walk" + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) + +class RWR(PRM[RWRParams]): required_inputs = ['network','nodes'] + dois = [] @staticmethod def generate_inputs(data, filename_map): @@ -33,11 +46,12 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(network=None, nodes=None, alpha=None, output_file=None, container_framework="docker", threshold=None): - if not nodes: + def run(inputs, output_file, args, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() + if not inputs["nodes"] or not inputs["network"]: raise ValueError('Required RWR arguments are missing') - with Path(network).open() as network_f: + with Path(inputs["network"]).open() as network_f: for line in network_f: line = line.strip() endpoints = line.split("|") @@ -48,10 +62,10 @@ def run(network=None, nodes=None, alpha=None, output_file=None, container_framew # Each volume is a tuple (src, dest) volumes = list() - bind_path, nodes_file = prepare_volume(nodes, work_dir) + bind_path, nodes_file = prepare_volume(inputs["nodes"], work_dir, container_settings) volumes.append(bind_path) - bind_path, network_file = prepare_volume(network, work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) # RWR does not provide an argument to set the output directory @@ -59,7 +73,7 @@ def run(network=None, nodes=None, alpha=None, output_file=None, container_framew out_dir = Path(output_file).parent # RWR requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + "/output.txt" command = ['python', @@ -69,15 +83,15 @@ def run(network=None, nodes=None, alpha=None, output_file=None, 
container_framew '--output', mapped_out_prefix] # Add alpha as an optional argument - if alpha is not None: - command.extend(['--alpha', str(alpha)]) + if args.alpha is not None: + command.extend(['--alpha', str(args.alpha)]) container_suffix = 'rwr:v1' - out = run_container(container_framework, - container_suffix, + out = run_container(container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) print(out) # Rename the primary output file to match the desired output filename diff --git a/spras/strwr.py b/spras/strwr.py index dfa1adc2a..28a76099e 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -1,18 +1,31 @@ from pathlib import Path +from typing import Optional import pandas as pd +from pydantic import BaseModel, ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container from spras.dataset import Dataset from spras.interactome import reinsert_direction_col_directed from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['ST_RWR'] +__all__ = ['ST_RWR', 'ST_RWRParams'] + +class ST_RWRParams(BaseModel): + threshold: int + "The number of nodes to return" + + alpha: Optional[float] = None + "The chance of a restart during the random walk" + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) # Note: This class is almost identical to the rwr.py file. -class ST_RWR(PRM): +class ST_RWR(PRM[ST_RWRParams]): required_inputs = ['network','sources','targets'] + dois = [] @staticmethod def generate_inputs(data, filename_map): @@ -35,11 +48,12 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(network=None, sources=None, targets=None, alpha=None, output_file=None, container_framework="docker", threshold=None): - if not sources or not targets or not network or not output_file: + def run(inputs, output_file, args, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() + if not inputs["sources"] or not inputs["targets"] or not inputs["network"] or not output_file: raise ValueError('Required local_neighborhood arguments are missing') - with Path(network).open() as network_f: + with Path(inputs["network"]).open() as network_f: for line in network_f: line = line.strip() endpoints = line.split("|") @@ -51,13 +65,13 @@ def run(network=None, sources=None, targets=None, alpha=None, output_file=None, # Each volume is a tuple (src, dest) volumes = list() - bind_path, source_file = prepare_volume(sources, work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, target_file = prepare_volume(targets, work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) - bind_path, network_file = prepare_volume(network, work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) # ST_RWR does not provide an argument to set the output directory @@ -65,7 +79,7 @@ def run(network=None, sources=None, targets=None, alpha=None, output_file=None, out_dir = Path(output_file).parent # ST_RWR requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = 
prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + "/output.txt" command = ['python', @@ -76,15 +90,15 @@ def run(network=None, sources=None, targets=None, alpha=None, output_file=None, '--output', mapped_out_prefix] # Add alpha as an optional argument - if alpha is not None: - command.extend(['--alpha', str(alpha)]) + if args.alpha is not None: + command.extend(['--alpha', str(args.alpha)]) container_suffix = 'st-rwr:v1' - out = run_container(container_framework, - container_suffix, + out = run_container(container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) print(out) # Rename the primary output file to match the desired output filename diff --git a/test/AllPairs/test_ap.py b/test/AllPairs/test_ap.py index 4f73e5791..ee76d0ce7 100644 --- a/test/AllPairs/test_ap.py +++ b/test/AllPairs/test_ap.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.allpairs import AllPairs # Note that we don't directly use the config in the test, but we need the config @@ -45,11 +45,10 @@ def test_allpairs(self): out_path = OUT_DIR.joinpath('sample-out.txt') out_path.unlink(missing_ok=True) # Only include required arguments - AllPairs.run( - nodetypes=str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'), - network=str(TEST_DIR / 'input' / 'sample-in-net.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=str(out_path) + AllPairs.run({"nodetypes": str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'), + "network": str(TEST_DIR / 'input' / 'sample-in-net.txt'), + "directed_flag": str(TEST_DIR / 'input' / 'directed-flag-false.txt')}, + output_file=str(out_path) ) assert out_path.exists() @@ -57,9 +56,8 @@ def test_allpairs_missing(self): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): # No nodetypes - AllPairs.run( - network=str(TEST_DIR / 'input' / 'sample-in-net.txt'), - output_file=str(OUT_DIR / 'sample-out.txt')) + AllPairs.run({"network": str(TEST_DIR / 'input' / 'sample-in-net.txt')}, + output_file=str(OUT_DIR / 'sample-out.txt')) # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @@ -68,12 +66,11 @@ def test_allpairs_singularity(self): out_path = OUT_DIR / 'sample-out.txt' out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - AllPairs.run( - nodetypes=str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'), - network=str(TEST_DIR / 'input' / 'sample-in-net.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=str(out_path), - container_framework="singularity") + AllPairs.run({"nodetypes": str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'), + "network": str(TEST_DIR / 'input' / 'sample-in-net.txt'), + "directed_flag": str(TEST_DIR / 'input' / 'directed-flag-false.txt')}, + output_file=str(out_path), + container_framework="singularity") assert out_path.exists() @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system') @@ -82,12 +79,11 @@ def test_allpairs_singularity_unpacked(self): out_path.unlink(missing_ok=True) # Indicate via config mechanism that we want to unpack the Singularity container config.config.unpack_singularity = True - AllPairs.run( - nodetypes=str(TEST_DIR / 'input/sample-in-nodetypes.txt'), - network=str(TEST_DIR / 
'input/sample-in-net.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=str(out_path), - container_framework="singularity") + AllPairs.run({"nodetypes": str(TEST_DIR / 'input/sample-in-nodetypes.txt'), + "network": str(TEST_DIR / 'input/sample-in-net.txt'), + "directed_flag": str(TEST_DIR / 'input' / 'directed-flag-false.txt')}, + output_file=str(out_path), + container_framework="singularity") config.config.unpack_singularity = False assert out_path.exists() @@ -104,12 +100,10 @@ def test_allpairs_correctness(self): out_path = OUT_DIR / 'correctness-out.txt' out_path.unlink(missing_ok=True) - AllPairs.run( - nodetypes=str(TEST_DIR / 'input' / 'correctness-nodetypes.txt'), - network=str(TEST_DIR / 'input' / 'correctness-network.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=str(OUT_DIR / 'correctness-out.txt') - ) + AllPairs.run({"nodetypes": TEST_DIR / 'input' / 'correctness-nodetypes.txt', + "network": TEST_DIR / 'input' / 'correctness-network.txt', + "directed_flag": TEST_DIR / 'input' / 'directed-flag-false.txt'}, + output_file=OUT_DIR / 'correctness-out.txt') edge_equality_test_util(out_path, EXPECTED_DIR / 'correctness-expected.txt') @@ -117,12 +111,10 @@ def test_allpairs_directed(self): out_path = OUT_DIR / 'directed-out.txt' out_path.unlink(missing_ok=True) - AllPairs.run( - nodetypes=str(TEST_DIR / 'input' / 'directed-nodetypes.txt'), - network=str(TEST_DIR / 'input' / 'directed-network.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-true.txt'), - output_file=str(OUT_DIR / 'directed-out.txt'), - ) + AllPairs.run({"nodetypes": TEST_DIR / 'input' / 'directed-nodetypes.txt', + "network": TEST_DIR / 'input' / 'directed-network.txt', + "directed_flag": TEST_DIR / 'input' / 'directed-flag-true.txt'}, + output_file=OUT_DIR / 'directed-out.txt') edge_equality_test_util(out_path, EXPECTED_DIR.joinpath('directed-expected.txt')) @@ -136,11 +128,10 @@ def test_allpairs_zero_length(self): out_path = OUT_DIR / 'zero-length-out.txt' out_path.unlink(missing_ok=True) - AllPairs.run( - nodetypes=TEST_DIR / 'input' / 'zero-length-nodetypes.txt', - network=TEST_DIR / 'input' / 'zero-length-network.txt', - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=OUT_DIR / 'zero-length-out.txt' + AllPairs.run({"nodetypes": TEST_DIR / 'input' / 'zero-length-nodetypes.txt', + "network": TEST_DIR / 'input' / 'zero-length-network.txt', + "directed_flag": TEST_DIR / 'input' / 'directed-flag-false.txt'}, + output_file=OUT_DIR / 'zero-length-out.txt' ) assert filecmp.cmp(OUT_DIR / 'zero-length-out.txt', EXPECTED_DIR / 'zero-length-expected.txt', shallow=False) diff --git a/test/BowTieBuilder/test_btb.py b/test/BowTieBuilder/test_btb.py index 88b12d0dd..c65ce4a32 100644 --- a/test/BowTieBuilder/test_btb.py +++ b/test/BowTieBuilder/test_btb.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config config.init_from_file("config/config.yaml") @@ -25,22 +25,19 @@ class TestBowTieBuilder: def test_btb_missing(self): with pytest.raises(ValueError): # No edges - BTB.run( - targets=Path(TEST_DIR, 'input', 'target.txt'), - sources=Path(TEST_DIR, 'input', 'source.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"targets": Path(TEST_DIR, 'input', 'target.txt'), + "sources": Path(TEST_DIR, 'input', 'source.txt')}, + output_file=OUT_FILE_DEFAULT) with pytest.raises(ValueError): # No source - BTB.run( - targets=Path(TEST_DIR, 'input', 'target.txt'), - 
edges=Path(TEST_DIR, 'input', 'edges.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"targets": Path(TEST_DIR, 'input', 'target.txt'), + "edges": Path(TEST_DIR, 'input', 'edges.txt')}, + output_file=OUT_FILE_DEFAULT) with pytest.raises(ValueError): # No target - BTB.run( - sources=Path(TEST_DIR, 'input', 'source.txt'), - edges=Path(TEST_DIR, 'input', 'edges.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"sources": Path(TEST_DIR, 'input', 'source.txt'), + "edges": Path(TEST_DIR, 'input', 'edges.txt')}, + output_file=OUT_FILE_DEFAULT) """ @@ -48,30 +45,30 @@ def test_btb_missing(self): """ def test_btb_file(self): with pytest.raises(ValueError): - BTB.run(sources=Path(TEST_DIR, 'input', 'unknown.txt'), - targets=Path(TEST_DIR, 'input', 'target.txt'), - edges=Path(TEST_DIR, 'input', 'edges.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"sources": Path(TEST_DIR, 'input', 'unknown.txt'), + "targets": Path(TEST_DIR, 'input', 'target.txt'), + "edges": Path(TEST_DIR, 'input', 'edges.txt')}, + output_file=OUT_FILE_DEFAULT) """ Run the BowTieBuilder algorithm with bad input data """ def test_format_error(self): with pytest.raises(IndexError): - BTB.run(sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - edges=Path(TEST_DIR, 'input', 'bad-edges.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt'), + "edges": Path(TEST_DIR, 'input', 'bad-edges.txt')}, + output_file=OUT_FILE_DEFAULT) """ Run the BowTieBuilder algorithm on the example input files and check the output matches the expected output """ def test_btb(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'btb-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'btb-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'btb-output.txt') @@ -89,10 +86,10 @@ def test_btb(self): """ def test_disjoint(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'disjoint-edges.txt'), - sources=Path(TEST_DIR, 'input', 'disjoint-sources.txt'), - targets=Path(TEST_DIR, 'input', 'disjoint-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'disjoint-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'disjoint-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'disjoint-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'disjoint-output.txt') @@ -110,10 +107,10 @@ def test_disjoint(self): """ def test_disjoint2(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'disjoint2-edges.txt'), - sources=Path(TEST_DIR, 'input', 'disjoint-sources.txt'), - targets=Path(TEST_DIR, 'input', 'disjoint-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'disjoint2-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'disjoint-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'disjoint-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output 
file was not written' expected_file = Path(TEST_DIR, 'expected', 'disjoint-output.txt') @@ -132,10 +129,10 @@ def test_disjoint2(self): def test_missing_file(self): with pytest.raises(ValueError): with pytest.raises(OSError): - BTB.run(edges=Path(TEST_DIR, 'input', 'missing.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'missing.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) """ @@ -143,10 +140,10 @@ def test_missing_file(self): """ def test_source_to_source(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'source-to-source-output.txt') @@ -164,10 +161,10 @@ def test_source_to_source(self): """ def test_source_to_source2(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source2-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source2-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'source-to-source2-output.txt') @@ -186,10 +183,10 @@ def test_source_to_source2(self): def test_source_to_source_disjoint(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source-disjoint-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source-disjoint-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'source-to-source-disjoint-output.txt') @@ -208,10 +205,10 @@ def test_source_to_source_disjoint(self): def test_bidirectional(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'bidirectional-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'bidirectional-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'bidirectional-output.txt') @@ -230,10 +227,10 @@ def test_bidirectional(self): def 
test_target_to_source(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'target-to-source-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'target-to-source-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'empty-output.txt') @@ -252,10 +249,10 @@ def test_target_to_source(self): def test_loop(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'loop-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'loop-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'loop-output.txt') @@ -274,10 +271,10 @@ def test_loop(self): def test_weighted(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'weighted-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'weighted-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'weighted-output.txt') @@ -292,10 +289,10 @@ def test_weighted(self): def test_weight_one(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'weight-one-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'weight-one-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'weighted-output.txt') diff --git a/test/DOMINO/test_domino.py b/test/DOMINO/test_domino.py index 7f09fa975..e84c0df8b 100644 --- a/test/DOMINO/test_domino.py +++ b/test/DOMINO/test_domino.py @@ -4,8 +4,13 @@ import pytest -import spras.config as config -from spras.domino import DOMINO, post_domino_id_transform, pre_domino_id_transform +import spras.config.config as config +from spras.domino import ( + DOMINO, + DominoParams, + post_domino_id_transform, + pre_domino_id_transform, +) config.init_from_file("config/config.yaml") @@ -28,10 +33,9 @@ def test_domino_required(self): # Only include required arguments out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) - DOMINO.run( - network=TEST_DIR+'input/domino-network.txt', - active_genes=TEST_DIR+'input/domino-active-genes.txt', - output_file=OUT_FILE_DEFAULT) + DOMINO.run({"network": TEST_DIR+'input/domino-network.txt', + "active_genes": TEST_DIR+'input/domino-active-genes.txt'}, + 
output_file=OUT_FILE_DEFAULT) # output_file should be empty assert out_path.exists() @@ -39,12 +43,10 @@ def test_domino_optional(self): # Include optional arguments out_path = Path(OUT_FILE_OPTIONAL) out_path.unlink(missing_ok=True) - DOMINO.run( - network=TEST_DIR+'input/domino-network.txt', - active_genes=TEST_DIR+'input/domino-active-genes.txt', - output_file=OUT_FILE_OPTIONAL, - slice_threshold=0.4, - module_threshold=0.06) + DOMINO.run({"network": TEST_DIR+'input/domino-network.txt', + "active_genes": TEST_DIR+'input/domino-active-genes.txt'}, + output_file=OUT_FILE_OPTIONAL, + args=DominoParams(slice_threshold=0.4, module_threshold=0.06)) # output_file should be empty assert out_path.exists() @@ -52,17 +54,15 @@ def test_domino_missing_active_genes(self): # Test the expected error is raised when active_genes argument is missing with pytest.raises(ValueError): # No active_genes - DOMINO.run( - network=TEST_DIR+'input/domino-network.txt', - output_file=OUT_FILE_DEFAULT) + DOMINO.run({"network": TEST_DIR+'input/domino-network.txt'}, + output_file=OUT_FILE_DEFAULT) def test_domino_missing_network(self): # Test the expected error is raised when network argument is missing with pytest.raises(ValueError): # No network - DOMINO.run( - active_genes=TEST_DIR+'input/domino-active-genes.txt', - output_file=OUT_FILE_DEFAULT) + DOMINO.run({"active_genes": TEST_DIR+'input/domino-active-genes.txt'}, + output_file=OUT_FILE_DEFAULT) # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @@ -71,11 +71,10 @@ def test_domino_singularity(self): out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - DOMINO.run( - network=TEST_DIR+'input/domino-network.txt', - active_genes=TEST_DIR+'input/domino-active-genes.txt', - output_file=OUT_FILE_DEFAULT, - container_framework="singularity") + DOMINO.run({"network": TEST_DIR+'input/domino-network.txt', + "active_genes": TEST_DIR+'input/domino-active-genes.txt'}, + output_file=OUT_FILE_DEFAULT, + container_framework="singularity") assert out_path.exists() def test_pre_id_transform(self): diff --git a/test/LocalNeighborhood/test_ln.py b/test/LocalNeighborhood/test_ln.py index fbee54902..9093efc68 100644 --- a/test/LocalNeighborhood/test_ln.py +++ b/test/LocalNeighborhood/test_ln.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config config.init_from_file("config/config.yaml") diff --git a/test/MEO/test_meo.py b/test/MEO/test_meo.py index e2abdb72d..051744ed7 100644 --- a/test/MEO/test_meo.py +++ b/test/MEO/test_meo.py @@ -3,8 +3,8 @@ import pytest -import spras.config as config -from spras.meo import MEO, write_properties +import spras.config.config as config +from spras.meo import MEO, MEOParams, write_properties config.init_from_file("config/config.yaml") @@ -20,9 +20,9 @@ def test_meo_required(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments - MEO.run(edges=TEST_DIR + 'input/meo-edges.txt', - sources=TEST_DIR + 'input/meo-sources.txt', - targets=TEST_DIR + 'input/meo-targets.txt', + MEO.run({"edges": TEST_DIR + 'input/meo-edges.txt', + "sources": TEST_DIR + 'input/meo-sources.txt', + "targets": TEST_DIR + 'input/meo-targets.txt'}, output_file=OUT_FILE) assert out_path.exists() @@ -30,21 +30,19 @@ def test_meo_all_optional(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include all 
optional arguments - MEO.run(edges=TEST_DIR + 'input/meo-edges.txt', - sources=TEST_DIR + 'input/meo-sources.txt', - targets=TEST_DIR + 'input/meo-targets.txt', - output_file=OUT_FILE, - max_path_length=3, - local_search='No', - rand_restarts=10) + MEO.run({"edges": TEST_DIR + 'input/meo-edges.txt', + "sources": TEST_DIR + 'input/meo-sources.txt', + "targets": TEST_DIR + 'input/meo-targets.txt'}, + args=MEOParams(max_path_length=3, local_search=False, rand_restarts=10), + output_file=OUT_FILE) assert out_path.exists() def test_meo_missing(self): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): # No edges - MEO.run(sources=TEST_DIR + 'input/meo-sources.txt', - targets=TEST_DIR + 'input/meo-targets.txt', + MEO.run({"sources": TEST_DIR + 'input/meo-sources.txt', + "targets": TEST_DIR + 'input/meo-targets.txt'}, output_file=OUT_FILE) with pytest.raises(ValueError): @@ -62,9 +60,9 @@ def test_meo_singularity(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - MEO.run(edges=TEST_DIR + 'input/meo-edges.txt', - sources=TEST_DIR + 'input/meo-sources.txt', - targets=TEST_DIR + 'input/meo-targets.txt', + MEO.run({"edges": TEST_DIR + 'input/meo-edges.txt', + "sources": TEST_DIR + 'input/meo-sources.txt', + "targets": TEST_DIR + 'input/meo-targets.txt'}, output_file=OUT_FILE, container_framework="singularity") assert out_path.exists() diff --git a/test/MinCostFlow/test_mcf.py b/test/MinCostFlow/test_mcf.py index 89bd61d0b..1c9c61a60 100644 --- a/test/MinCostFlow/test_mcf.py +++ b/test/MinCostFlow/test_mcf.py @@ -3,8 +3,8 @@ import pytest -import spras.config as config -from spras.mincostflow import MinCostFlow +import spras.config.config as config +from spras.mincostflow import MinCostFlow, MinCostFlowParams config.init_from_file("config/config.yaml") @@ -21,9 +21,9 @@ def test_mincostflow_required(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE) assert out_path.exists() # TODO: assert for the output .equals expected_output instead of only testing @@ -34,11 +34,11 @@ def test_mincostflow_missing_capacity(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=1) + args=MinCostFlowParams(flow=1)) assert out_path.exists() @pytest.mark.parametrize('graph', ['graph1']) @@ -46,11 +46,11 @@ def test_mincostflow_missing_flow(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": 
TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - capacity=1) + args=MinCostFlowParams(capacity=1)) assert out_path.exists() @pytest.mark.parametrize('graph', ['graph1']) @@ -59,24 +59,22 @@ def test_mincostflow_too_much_flow(self, graph): out_path.unlink(missing_ok=True) with pytest.raises(RuntimeError): - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=50, - capacity=1) + args=MinCostFlowParams(flow=50, capacity=1)) @pytest.mark.parametrize('graph', ['graph1']) def test_mincostflow_no_flow(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=0, - capacity=1) + args=MinCostFlowParams(flow=0, capacity=1)) assert out_path.exists() @pytest.mark.parametrize('graph', ['graph1']) @@ -84,20 +82,19 @@ def test_mincostflow_all_optional(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include all optional arguments - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=1, - capacity=1) + args=MinCostFlowParams(flow=1, capacity=1)) assert out_path.exists() @pytest.mark.parametrize('graph', ['graph1']) def test_mincostflow_missing(self, graph): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt'}, output_file=OUT_FILE) @pytest.mark.parametrize('graph', ['graph1']) @@ -106,12 +103,10 @@ def test_mincostflow_singularity(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include all optional arguments - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=1, - capacity=1, + args=MinCostFlowParams(flow=1, capacity=1), container_framework="singularity") assert out_path.exists() - diff --git a/test/OmicsIntegrator1/test_oi1.py b/test/OmicsIntegrator1/test_oi1.py index 35b41d428..fad4627e0 100644 --- a/test/OmicsIntegrator1/test_oi1.py +++ 
b/test/OmicsIntegrator1/test_oi1.py @@ -3,8 +3,8 @@ import pytest -import spras.config as config -from spras.omicsintegrator1 import OmicsIntegrator1, write_conf +import spras.config.config as config +from spras.omicsintegrator1 import OmicsIntegrator1, OmicsIntegrator1Params, write_conf config.init_from_file("config/config.yaml") @@ -20,79 +20,74 @@ def test_oi1_required(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR+'input/oi1-prizes.txt', + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR+'input/oi1-prizes.txt'}, output_file=OUT_FILE, - w=5, - b=1, - d=10) + args=OmicsIntegrator1Params(w=5, b=1, d=10)) assert out_path.exists() def test_oi1_some_optional(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include optional argument - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR+'input/oi1-prizes.txt', + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR+'input/oi1-prizes.txt'}, output_file=OUT_FILE, - w=5, - b=1, - d=10, - noise=0.333, - g=0.001, - r=0) + args=OmicsIntegrator1Params(w=5, b=1, d=10, noise=0.333, g=0.001, r=0)) assert out_path.exists() def test_oi1_all_optional(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include all optional arguments - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR+'input/oi1-prizes.txt', - dummy_nodes=None, - dummy_mode='terminals', - mu_squared=True, - exclude_terms=True, + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR+'input/oi1-prizes.txt'}, output_file=OUT_FILE, - noisy_edges=0, - shuffled_prizes=0, - random_terminals=0, - seed=1, - w=5, - b=1, - d=10, - mu=0, - noise=0.333, - g=0.001, - r=0) + args=OmicsIntegrator1Params( + dummy_mode='terminals', + mu_squared=True, + exclude_terms=True, + noisy_edges=0, + shuffled_prizes=0, + random_terminals=0, + seed=1, + w=5, + b=1, + d=10, + mu=0, + noise=0.333, + g=0.001, + r=0)) assert out_path.exists() def test_oi1_dummy_file(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include optional argument - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR+'input/oi1-prizes.txt', - dummy_nodes=TEST_DIR + 'input/oi1-dummy.txt', - dummy_mode='file', + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR+'input/oi1-prizes.txt', + "dummy_nodes": TEST_DIR + 'input/oi1-dummy.txt'}, output_file=OUT_FILE, - w=5, - b=1, - d=10, - noise=0.333, - g=0.001, - r=0) + args=OmicsIntegrator1Params( + dummy_mode='file', + w=5, + b=1, + d=10, + noise=0.333, + g=0.001, + r=0)) assert out_path.exists() def test_oi1_missing(self): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): # No edges - OmicsIntegrator1.run(prizes=TEST_DIR + 'input/oi1-prizes.txt', + OmicsIntegrator1.run({"prizes": TEST_DIR + 'input/oi1-prizes.txt'}, output_file=TEST_DIR+'output/test_optimalForest.sif', - w=5, - b=1, - d=10) + args=OmicsIntegrator1Params( + w=5, + b=1, + d=10)) with pytest.raises(ValueError): # No w write_conf(Path('.'), @@ -103,13 +98,14 @@ def test_oi1_missing_dummy(self): # Test the expected error is raised when the dummy_nodes file is missing and the dummy_mode is 'file' with pytest.raises(ValueError): # No edges - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - 
diff --git a/test/OmicsIntegrator2/test_oi2.py b/test/OmicsIntegrator2/test_oi2.py
index 311a9c7e7..172197efd 100644
--- a/test/OmicsIntegrator2/test_oi2.py
+++ b/test/OmicsIntegrator2/test_oi2.py
@@ -3,8 +3,8 @@

 import pytest

-import spras.config as config
-from spras.omicsintegrator2 import OmicsIntegrator2
+import spras.config.config as config
+from spras.omicsintegrator2 import OmicsIntegrator2, OmicsIntegrator2Params

 config.init_from_file("config/config.yaml")

@@ -13,7 +13,6 @@
 PRIZE_FILE = TEST_DIR / 'input' / 'oi2-prizes.txt'
 OUT_FILE = TEST_DIR / 'output' / 'test.tsv'

-
 class TestOmicsIntegrator2:
     """
     Run Omics Integrator 2 in the Docker image
@@ -21,51 +20,44 @@ class TestOmicsIntegrator2:
     def test_oi2_required(self):
         # Only include required arguments
         OUT_FILE.unlink(missing_ok=True)
-        OmicsIntegrator2.run(edges=EDGE_FILE,
-                             prizes=PRIZE_FILE,
+        OmicsIntegrator2.run({"edges": EDGE_FILE,
+                              "prizes": PRIZE_FILE},
                              output_file=OUT_FILE)
         assert OUT_FILE.exists()

     def test_oi2_some_optional(self):
         # Include optional argument
         OUT_FILE.unlink(missing_ok=True)
-        OmicsIntegrator2.run(edges=EDGE_FILE,
-                             prizes=PRIZE_FILE,
+        OmicsIntegrator2.run({"edges": EDGE_FILE,
+                              "prizes": PRIZE_FILE},
                              output_file=OUT_FILE,
-                             g=0)
+                             args=OmicsIntegrator2Params(g=0))
         assert OUT_FILE.exists()

     def test_oi2_all_optional(self):
         # Include all optional arguments
         OUT_FILE.unlink(missing_ok=True)
-        OmicsIntegrator2.run(edges=EDGE_FILE,
-                             prizes=PRIZE_FILE,
+        OmicsIntegrator2.run({"edges": EDGE_FILE,
+                              "prizes": PRIZE_FILE},
                              output_file=OUT_FILE,
-                             w=5,
-                             b=1,
-                             g=3,
-                             noise=0.1,
-                             noisy_edges=0,
-                             random_terminals=0,
-                             dummy_mode='terminals',
-                             seed=2)
+                             args=OmicsIntegrator2Params(w=5,
+                                                         b=1,
+                                                         g=3,
+                                                         noise=0.1,
+                                                         noisy_edges=0,
+                                                         random_terminals=0,
+                                                         dummy_mode='terminals',
+                                                         seed=2))
         assert OUT_FILE.exists()

-    def test_oi2_missing(self):
-        # Test the expected error is raised when required arguments are missing
-        with pytest.raises(ValueError):
-            # No output_file
-            OmicsIntegrator2.run(edges=EDGE_FILE,
-                                 prizes=PRIZE_FILE)
     # Only run Singularity test if the binary is available on the system
     # spython is only available on Unix, but do not explicitly skip non-Unix platforms
     @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system')
     def test_oi2_singularity(self):
         # Only include required arguments
         OUT_FILE.unlink(missing_ok=True)
-        OmicsIntegrator2.run(edges=EDGE_FILE,
-                             prizes=PRIZE_FILE,
+        OmicsIntegrator2.run({"edges": EDGE_FILE,
+                              "prizes": PRIZE_FILE},
                              output_file=OUT_FILE,
                              container_framework="singularity")
         assert OUT_FILE.exists()
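For Omics Integrator 2 the tunables are all optional under the new interface, so a run can pass only the input dict and an output file, or supply an `OmicsIntegrator2Params` overriding a subset of values. A sketch of both forms, assuming the names imported above and hypothetical paths:

```python
from spras.omicsintegrator2 import OmicsIntegrator2, OmicsIntegrator2Params

# Defaults only: no args object is needed (this is what test_oi2_required exercises).
OmicsIntegrator2.run({"edges": "input/oi2-edges.txt", "prizes": "input/oi2-prizes.txt"},
                     output_file="output/test.tsv")

# Overriding a subset of parameters.
OmicsIntegrator2.run({"edges": "input/oi2-edges.txt", "prizes": "input/oi2-prizes.txt"},
                     output_file="output/test.tsv",
                     args=OmicsIntegrator2Params(w=5, b=1, g=3, seed=2))
```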
diff --git a/test/PathLinker/test_pathlinker.py b/test/PathLinker/test_pathlinker.py
index 3fd6a96bd..67e4b598f 100644
--- a/test/PathLinker/test_pathlinker.py
+++ b/test/PathLinker/test_pathlinker.py
@@ -3,8 +3,8 @@

 import pytest

-import spras.config as config
-from spras.pathlinker import PathLinker
+import spras.config.config as config
+from spras.pathlinker import PathLinker, PathLinkerParams

 config.init_from_file("config/config.yaml")

@@ -21,33 +21,28 @@ def test_pathlinker_required(self):
         out_path = Path(OUT_FILE_DEFAULT)
         out_path.unlink(missing_ok=True)
         # Only include required arguments
-        PathLinker.run(
-            nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt',
-            network=TEST_DIR+'input/sample-in-net.txt',
-            output_file=OUT_FILE_DEFAULT
-        )
+        PathLinker.run({"nodetypes": TEST_DIR+'input/sample-in-nodetypes.txt',
+                        "network": TEST_DIR+'input/sample-in-net.txt'},
+                       output_file=OUT_FILE_DEFAULT)
         assert out_path.exists()

     def test_pathlinker_optional(self):
         out_path = Path(OUT_FILE_100)
         out_path.unlink(missing_ok=True)
         # Include optional argument
-        PathLinker.run(
-            nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt',
-            network=TEST_DIR+'input/sample-in-net.txt',
-            output_file=OUT_FILE_100,
-            k=100
-        )
+        PathLinker.run({"nodetypes": TEST_DIR+'input/sample-in-nodetypes.txt',
+                        "network": TEST_DIR+'input/sample-in-net.txt'},
+                       output_file=OUT_FILE_100,
+                       args=PathLinkerParams(k=100))
         assert out_path.exists()

     def test_pathlinker_missing(self):
         # Test the expected error is raised when required arguments are missing
         with pytest.raises(ValueError):
             # No nodetypes
-            PathLinker.run(
-                network=TEST_DIR + 'input/sample-in-net.txt',
-                output_file=OUT_FILE_100,
-                k=100)
+            PathLinker.run({"network": TEST_DIR + 'input/sample-in-net.txt'},
+                           output_file=OUT_FILE_100,
+                           args=PathLinkerParams(k=100))

     # Only run Singularity test if the binary is available on the system
     # spython is only available on Unix, but do not explicitly skip non-Unix platforms
@@ -56,9 +51,8 @@ def test_pathlinker_singularity(self):
         out_path = Path(OUT_FILE_DEFAULT)
         out_path.unlink(missing_ok=True)
         # Only include required arguments and run with Singularity
-        PathLinker.run(
-            nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt',
-            network=TEST_DIR+'input/sample-in-net.txt',
-            output_file=OUT_FILE_DEFAULT,
-            container_framework="singularity")
+        PathLinker.run({"nodetypes": TEST_DIR+'input/sample-in-nodetypes.txt',
+                        "network": TEST_DIR+'input/sample-in-net.txt'},
+                       output_file=OUT_FILE_DEFAULT,
+                       container_framework="singularity")
         assert out_path.exists()
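PathLinker follows the same shape; the only tunable exercised in these tests is `k`. A sketch with hypothetical paths:

```python
from spras.pathlinker import PathLinker, PathLinkerParams

PathLinker.run(
    {"nodetypes": "input/sample-in-nodetypes.txt",   # hypothetical paths
     "network": "input/sample-in-net.txt"},
    output_file="output/pathlinker-ranked-edges.txt",
    args=PathLinkerParams(k=100),   # omit args entirely to use the default k
)
```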
diff --git a/test/RWR/test_RWR.py b/test/RWR/test_RWR.py
index 4d6ce7864..70eb06845 100644
--- a/test/RWR/test_RWR.py
+++ b/test/RWR/test_RWR.py
@@ -4,8 +4,8 @@

 import pytest

-import spras.config as config
-from spras.rwr import RWR
+import spras.config.config as config
+from spras.rwr import RWR, RWRParams

 config.init_from_file("config/config.yaml")

@@ -19,9 +19,9 @@ class TestRWR:
     """
     def test_rwr(self):
         OUT_FILE.unlink(missing_ok=True)
-        RWR.run(network=Path(TEST_DIR, 'input', 'rwr-network.txt'),
-                nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'),
-                alpha=0.85,
+        RWR.run({"network": Path(TEST_DIR, 'input', 'rwr-network.txt'),
+                 "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')},
+                args=RWRParams(alpha=0.85, threshold=200),
                 output_file=OUT_FILE)
         assert OUT_FILE.exists(), 'Output file was not written'
         expected_file = Path(TEST_DIR, 'expected_output', 'rwr-output.txt')
@@ -32,9 +32,9 @@ def test_rwr(self):
     """
     def test_missing_file(self):
         with pytest.raises(OSError):
-            RWR.run(network=Path(TEST_DIR, 'input', 'missing.txt'),
-                    nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'),
-                    alpha=0.85,
+            RWR.run({"network": Path(TEST_DIR, 'input', 'missing.txt'),
+                     "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')},
+                    args=RWRParams(alpha=0.85, threshold=200),
                     output_file=OUT_FILE)

     """
@@ -42,9 +42,9 @@ def test_missing_file(self):
     """
     def test_format_error(self):
         with pytest.raises(ValueError):
-            RWR.run(network=Path(TEST_DIR, 'input', 'rwr-bad-network.txt'),
-                    nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'),
-                    alpha=0.85,
+            RWR.run({"network": Path(TEST_DIR, 'input', 'rwr-bad-network.txt'),
+                     "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')},
+                    args=RWRParams(alpha=0.85, threshold=200),
                     output_file=OUT_FILE)

     # Only run Singularity test if the binary is available on the system
@@ -53,9 +53,9 @@ def test_format_error(self):
     def test_rwr_singularity(self):
         OUT_FILE.unlink(missing_ok=True)
         # Only include required arguments and run with Singularity
-        RWR.run(network=Path(TEST_DIR, 'input', 'rwr-network.txt'),
-                nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'),
-                alpha=0.85,
+        RWR.run({"network": Path(TEST_DIR, 'input', 'rwr-network.txt'),
+                 "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')},
+                args=RWRParams(alpha=0.85, threshold=200),
                 output_file=OUT_FILE,
                 container_framework="singularity")
         assert OUT_FILE.exists()
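For the random-walk wrappers the params object carries `alpha` and `threshold`; the inputs are `Path` objects in these tests, which the dict accepts just as well. A sketch, assuming the `RWR`/`RWRParams` names from the patch and hypothetical locations:

```python
from pathlib import Path
from spras.rwr import RWR, RWRParams

RWR.run({"network": Path("input", "rwr-network.txt"),   # hypothetical locations
         "nodes": Path("input", "rwr-nodes.txt")},
        args=RWRParams(alpha=0.85, threshold=200),
        output_file=Path("output", "rwr-output.txt"))
```

`ST_RWR` (next file) is identical in shape, except that it takes separate `sources` and `targets` node files and an `ST_RWRParams` object.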
diff --git a/test/ST_RWR/test_STRWR.py b/test/ST_RWR/test_STRWR.py
index a0a5b4ea9..ea0c2bda0 100644
--- a/test/ST_RWR/test_STRWR.py
+++ b/test/ST_RWR/test_STRWR.py
@@ -4,8 +4,8 @@

 import pytest

-import spras.config as config
-from spras.strwr import ST_RWR
+import spras.config.config as config
+from spras.strwr import ST_RWR, ST_RWRParams

 config.init_from_file("config/config.yaml")

@@ -20,10 +20,10 @@ class TestSTRWR:
     """
     def test_strwr(self):
         OUT_FILE.unlink(missing_ok=True)
-        ST_RWR.run(network=Path(TEST_DIR, 'input', 'strwr-network.txt'),
-                   sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'),
-                   targets=Path(TEST_DIR, 'input','strwr-targets.txt'),
-                   alpha=0.85,
+        ST_RWR.run({"network": Path(TEST_DIR, 'input', 'strwr-network.txt'),
+                    "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'),
+                    "targets": Path(TEST_DIR, 'input','strwr-targets.txt')},
+                   args=ST_RWRParams(alpha=0.85, threshold=200),
                    output_file=OUT_FILE)
         assert OUT_FILE.exists(), 'Output file was not written'
         expected_file = Path(TEST_DIR, 'expected_output', 'strwr-output.txt')
@@ -34,10 +34,10 @@ def test_strwr(self):
     """
     def test_missing_file(self):
         with pytest.raises(OSError):
-            ST_RWR.run(network=Path(TEST_DIR, 'input', 'missing.txt'),
-                       sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'),
-                       targets=Path(TEST_DIR, 'input','strwr-targets.txt'),
-                       alpha=0.85,
+            ST_RWR.run({"network": Path(TEST_DIR, 'input', 'missing.txt'),
+                        "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'),
+                        "targets": Path(TEST_DIR, 'input','strwr-targets.txt')},
+                       args=ST_RWRParams(alpha=0.85, threshold=200),
                        output_file=OUT_FILE)

     """
@@ -45,10 +45,10 @@ def test_missing_file(self):
     """
     def test_format_error(self):
         with pytest.raises(ValueError):
-            ST_RWR.run(network=Path(TEST_DIR, 'input', 'strwr-bad-network.txt'),
-                       sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'),
-                       targets=Path(TEST_DIR, 'input','strwr-targets.txt'),
-                       alpha=0.85,
+            ST_RWR.run({"network": Path(TEST_DIR, 'input', 'strwr-bad-network.txt'),
+                        "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'),
+                        "targets": Path(TEST_DIR, 'input','strwr-targets.txt')},
+                       args=ST_RWRParams(alpha=0.85, threshold=200),
                        output_file=OUT_FILE)

     # Only run Singularity test if the binary is available on the system
@@ -57,10 +57,10 @@ def test_format_error(self):
     def test_strwr_singularity(self):
         OUT_FILE.unlink(missing_ok=True)
         # Only include required arguments and run with Singularity
-        ST_RWR.run(network=Path(TEST_DIR, 'input', 'strwr-network.txt'),
-                   sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'),
-                   targets=Path(TEST_DIR, 'input','strwr-targets.txt'),
-                   alpha=0.85,
+        ST_RWR.run({"network": Path(TEST_DIR, 'input', 'strwr-network.txt'),
+                    "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'),
+                    "targets": Path(TEST_DIR, 'input','strwr-targets.txt')},
+                   args=ST_RWRParams(alpha=0.85, threshold=200),
                    output_file=OUT_FILE,
                    container_framework="singularity")
         assert OUT_FILE.exists()
diff --git a/test/analysis/input/config.yaml b/test/analysis/input/config.yaml
index c9eaa437a..abde6f979 100644
--- a/test/analysis/input/config.yaml
+++ b/test/analysis/input/config.yaml
@@ -102,7 +102,6 @@ reconstruction_settings:
   locations:
     #place the save path here
     reconstruction_dir: "output"
-  run: true

 analysis:
   # Create one summary per pathway file and a single summary table for all pathways for each dataset
diff --git a/test/analysis/input/egfr.yaml b/test/analysis/input/egfr.yaml
index 4a31dad46..da4560df9 100644
--- a/test/analysis/input/egfr.yaml
+++ b/test/analysis/input/egfr.yaml
@@ -91,7 +91,6 @@ datasets:
 reconstruction_settings:
   locations:
     reconstruction_dir: output/egfr
-  run: true

 analysis:
   cytoscape:
     include: true
diff --git a/test/analysis/test_cytoscape.py b/test/analysis/test_cytoscape.py
index 7451b9876..68a77cd07 100644
--- a/test/analysis/test_cytoscape.py
+++ b/test/analysis/test_cytoscape.py
@@ -2,7 +2,7 @@

 import pytest

-import spras.config as config
+import spras.config.config as config
 from spras.analysis.cytoscape import run_cytoscape

 config.init_from_file("test/analysis/input/config.yaml")
diff --git a/test/analysis/test_summary.py b/test/analysis/test_summary.py
index 4ff5396da..0400d1f1b 100644
--- a/test/analysis/test_summary.py
+++ b/test/analysis/test_summary.py
@@ -3,7 +3,7 @@

 import pandas as pd

-import spras.config as config
+import spras.config.config as config
 from spras.analysis.summary import summarize_networks
 from spras.dataset import Dataset
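A related change threads through all of the test modules above: the config module now lives at `spras.config.config`, and container options are read back from a nested `container_settings` object. A minimal sketch of inspecting the parsed settings, assuming the attribute names used in the config tests below:

```python
import spras.config.config as config

config.init_from_file("config/config.yaml")
# Values come from the `containers:` block of the YAML file.
print(config.config.container_settings.framework)  # e.g. "docker"
print(config.config.container_settings.prefix)     # e.g. "docker.io/reedcompbio"
```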
diff --git a/test/test_config.py b/test/test_config.py
index 4c9c15807..3d8d67d78 100644
--- a/test/test_config.py
+++ b/test/test_config.py
@@ -1,20 +1,36 @@
+import copy
 import pickle
+from typing import Iterable

 import numpy as np
 import pytest
-
-import spras.config as config
-
+from pydantic import BaseModel
+
+import spras.config.config as config
+from spras.config.container_schema import DEFAULT_CONTAINER_PREFIX
+from spras.config.schema import DEFAULT_HASH_LENGTH
+from spras.meo import MEOParams
+from spras.mincostflow import MinCostFlowParams
+from spras.omicsintegrator2 import DummyMode, OmicsIntegrator2Params
+
+filler_dataset_data: dict[str, str | list[str]] = {
+    "data_dir": "fake",
+    "edge_files": [],
+    "other_files": [],
+    "node_files": []
+}

 # Set up a dummy config for testing. For now, only include things that MUST exist in the dict
 # in order for the config init to complete. To test particular parts of the config initialization,
 # individual values of the dict can be changed and the whole initialization can be re-run.
 def get_test_config():
     test_raw_config = {
-        "container_framework": "singularity",
-        "container_registry": {
-            "base_url": "docker.io",
-            "owner": "reedcompbio",
+        "containers": {
+            "framework": "singularity",
+            "registry": {
+                "base_url": "docker.io",
+                "owner": "reedcompbio",
+            },
         },
         "hash_length": 7,
         "reconstruction_settings": {
@@ -22,59 +38,58 @@ def get_test_config():
                 "reconstruction_dir": "my_dir"
             }
         },
-        "datasets": [{"label": "alg1"}, {"label": "alg2"}],
-        "gold_standards": [{"label": "gs1", "dataset_labels": []}],
+        "datasets": [{
+            "label": "alg1",
+            "data_dir": "fake",
+            "edge_files": [],
+            "other_files": [],
+            "node_files": []
+        }, {
+            "label": "alg2",
+            "data_dir": "faux",
+            "edge_files": [],
+            "other_files": [],
+            "node_files": []
+        }],
+        "gold_standards": [{
+            "label": "gs1",
+            "dataset_labels": [],
+            "node_files": [],
+            "data_dir": "gs-fake"
+        }],
         "algorithms": [
-            {"params": ["param2", "param2"]},
-            {
-                "name": "strings",
-                "params": {
-                    "include": True,
-                    "run1": {"test": "str1", "test2": ["str2", "str3"]}
-                }
-            },
+            # Since there is algorithm validation,
+            # we are (mostly) forced to use real algorithm parameters here.
+            # To make this more readable, we make the 'test names' the run names.
+            # TODO: we don't have a test for combinations of strings anymore. This seems to be fine,
+            # but it would be nice to have once we introduce an algorithm that takes more than 1 string parameter.
             {
-                "name": "numbersAndBools",
-                "params": {
-                    "include": True,
-                    "run1": {"a": 1, "b": [float(2.0), 3], "c": [4], "d": float(5.6), "f": False}
+                "name": "omicsintegrator2",
+                "include": True,
+                "runs": {
+                    "strings": {"dummy_mode": ["terminals", "others"], "b": 3},
+                    # spacing in np.linspace is on purpose
+                    "singleton_string_np_linspace": {"dummy_mode": "terminals", "b": "np.linspace(0, 5,2,)"},
+                    "str_array_np_logspace": {"dummy_mode": ["others", "all"], "g": "np.logspace(1,1)"}
                 }
             },
             {
-                "name": "singleton_int64_with_array",
-                "params": {
-                    "include": True,
-                    "run1": {"test": np.int64(1), "test2": [2, 3]}
+                "name": "meo",
+                "include": True,
+                "runs": {
+                    "numbersAndBoolsDuplicate": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]},
+                    "numbersAndBool": {"max_path_length": 2, "rand_restarts": [float(2.0), 3], "local_search": [True]},
+                    "numbersAndBools": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]},
+                    "boolArrTest": {"local_search": [True, False], "max_path_length": "range(1, 3)"}
                 }
            },
            {
-                "name": "singleton_string_np_linspace",
-                "params": {
-                    "include": True,
-                    "run1": {"test": "str1", "test2": "np.linspace(0,5,2)"}
+                "name": "mincostflow",
+                "include": True,
+                "runs": {
+                    "int64artifact": {"flow": "np.arange(5, 7)", "capacity": [2, 3]}
                }
            },
-            {
-                "name": "str_array_np_logspace",
-                "params": {
-                    "include": True,
-                    "run1": {"test": ["a", "b"], "test2": "np.logspace(1,1)"}
-                }
-            },
-            {
-                "name": "int64artifact",
-                "params": {
-                    "include": True,
-                    "run1": {"test": "np.arange(5,6)", "test2": [2, 3]}
-                }
-            },
-            {
-                "name": "boolArrTest",
-                "params": {
-                    "include": True,
-                    "run1": {"flags": [True, False], "range": "range(1, 3)"}
-                }
-            }
         ],
         "analysis": {
            "summary": {
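The rewritten fixture also documents the new `algorithms` layout: each entry names a real wrapper, carries a top-level `include` flag, and groups parameters under named `runs`, where list values and expression strings such as `"np.linspace(0, 5,2,)"` or `"range(1, 3)"` are expanded into one parameter combination per value. A hedged sketch of a single entry (the expansion itself is handled by the config code and is not shown here; only the run name "example" is invented):

```python
# One algorithm entry in the raw config dict, mirroring the fixture above.
algorithm_entry = {
    "name": "mincostflow",
    "include": True,
    "runs": {
        # Two flows x two capacities -> four parameter combinations after expansion.
        "example": {"flow": "np.arange(5, 7)", "capacity": [2, 3]},
    },
}
```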
@@ -96,22 +111,49 @@ def get_test_config():
     return test_raw_config

-def value_test_util(name: str, configurations: list):
-    assert name in config.config.algorithm_params, f"{name} isn't a present algorithm configuration!"
-
-    keys = config.config.algorithm_params[name]
-    values = [config.config.algorithm_params[name][key] for key in keys]
+def value_test_util(alg: str, run_name: str, param_type: type[BaseModel], configurations: Iterable[BaseModel]):
+    """
+    Utility test function to be able to test against certain named runs
+    under algorithms. This is, unfortunately, a very holistic function that depends
+    on the current state of how config parsing is.
+    """
+    assert alg in config.config.algorithm_params, f"{alg} isn't a present algorithm name!"
+    runs = config.config.algorithm_params[alg]
+    # Filter using the internal _spras_run_name key.
+    runs = {hash: params for hash, params in runs.items() if params["_spras_run_name"] == run_name}
+
+    # We copy values so we don't mutate it
+    values: list[dict] = copy.deepcopy(list(runs.values()))
+    for value in values:
+        # then, remove the internal key for easy comparison.
+        del value["_spras_run_name"]
+
+    # Since configurations is a bunch of objects, we need to turn those into dictionaries
+    # and exclude their defaults.
+    new_configurations = [config.model_dump(exclude_defaults=True) for config in configurations]
+
+    # Same for values, but we reserialize them first
+    values = [param_type.model_validate(value).model_dump(exclude_defaults=True) for value in values]
+
+    # Now, we need to also remove any dynamic values from values and configurations
+    # (_time and seeded values)
+    for value in values:
+        value.pop("_time", None)
+        value.pop("seed", None)
+    for configuration in new_configurations:
+        configuration.pop("_time", None)
+        configuration.pop("seed", None)

     # https://stackoverflow.com/a/50486270/7589775
     # Note: We use pickle as we also compare dictionaries in these two sets - some kind of consistent total ordering
     # is required for the tests to consistently pass when comparing them to `configurations`.
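The new `value_test_util` relies on the params classes being pydantic models: candidate dicts are re-validated into the model and dumped with defaults excluded, so only explicitly set fields are compared. A small sketch of that round trip, assuming `MinCostFlowParams` is a pydantic `BaseModel` as the type hints above suggest:

```python
from spras.mincostflow import MinCostFlowParams

raw = {"flow": 5, "capacity": 2}
params = MinCostFlowParams.model_validate(raw)    # parse and validate the dict
print(params.model_dump(exclude_defaults=True))   # -> {'flow': 5, 'capacity': 2}
# Defaults that were never set are dropped, so two configurations compare equal
# whenever their explicitly supplied fields match.
```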
-    set_values = set(tuple(sorted(d.items())) for d in sorted(values, key=lambda x: pickle.dumps(x, protocol=3)))
-    set_configurations = set(tuple(sorted(d.items())) for d in sorted(configurations, key=lambda x: pickle.dumps(x, protocol=3)))
+    final_values = sorted(tuple(sorted(d.items())) for d in sorted(values, key=lambda x: pickle.dumps(x, protocol=3)))
+    final_configurations = sorted(tuple(sorted(d.items())) for d in sorted(new_configurations, key=lambda x: pickle.dumps(x, protocol=3)))

-    if set_values != set_configurations:
-        print(f'Got: {set_values}')
-        print(f'Expected: {set_configurations}')
-    assert set_values == set_configurations
+    if final_values != final_configurations:
+        print(f'Got: {final_values}')
+        print(f'Expected: {final_configurations}')
+    assert final_values == final_configurations

 class TestConfig:
     """
@@ -123,9 +165,9 @@ def test_config_hash_length(self):
         config.init_global(test_config)
         assert (config.config.hash_length == 7)

-        test_config["hash_length"] = ""
+        test_config.pop("hash_length", None)
         config.init_global(test_config)
-        assert (config.config.hash_length == config.DEFAULT_HASH_LENGTH)
+        assert (config.config.hash_length == DEFAULT_HASH_LENGTH)

         # Initialize the configuration
         test_config["hash_length"] = "12"
@@ -136,46 +178,46 @@ def test_config_container_framework_normalization(self):
         # Test singularity
         test_config = get_test_config()
-        test_config["container_framework"] = "singularity"
+        test_config["containers"]["framework"] = "singularity"
         config.init_global(test_config)
-        assert (config.config.container_framework == "singularity")
+        assert (config.config.container_settings.framework == "singularity")

         # Test singularity with capitalization
-        test_config["container_framework"] = "Singularity"
+        test_config["containers"]["framework"] = "Singularity"
         config.init_global(test_config)
-        assert (config.config.container_framework == "singularity")
+        assert (config.config.container_settings.framework == "singularity")

         # Test docker
-        test_config["container_framework"] = "docker"
+        test_config["containers"]["framework"] = "docker"
         config.init_global(test_config)
-        assert (config.config.container_framework == "docker")
+        assert (config.config.container_settings.framework == "docker")

         # Test docker with capitalization
-        test_config["container_framework"] = "Docker"
+        test_config["containers"]["framework"] = "Docker"
         config.init_global(test_config)
-        assert (config.config.container_framework == "docker")
+        assert (config.config.container_settings.framework == "docker")

         # Test unknown framework
-        test_config["container_framework"] = "badFramework"
+        test_config["containers"]["framework"] = "badFramework"
         with pytest.raises(ValueError):
             config.init_global(test_config)

     def test_config_container_registry(self):
         test_config = get_test_config()
-        test_config["container_registry"]["base_url"] = "docker.io"
-        test_config["container_registry"]["owner"] = "reedcompbio"
+        test_config["containers"]["registry"]["base_url"] = "docker.io"
+        test_config["containers"]["registry"]["owner"] = "reedcompbio"
         config.init_global(test_config)
-        assert (config.config.container_prefix == "docker.io/reedcompbio")
+        assert (config.config.container_settings.prefix == "docker.io/reedcompbio")

-        test_config["container_registry"]["base_url"] = "another.repo"
-        test_config["container_registry"]["owner"] = "different-owner"
+        test_config["containers"]["registry"]["base_url"] = "another.repo"
+        test_config["containers"]["registry"]["owner"] = "different-owner"
         config.init_global(test_config)
-        assert (config.config.container_prefix == "another.repo/different-owner")
+        assert (config.config.container_settings.prefix == "another.repo/different-owner")

-        test_config["container_registry"]["base_url"] = ""
-        test_config["container_registry"]["owner"] = ""
+        test_config["containers"]["registry"]["base_url"] = ""
+        test_config["containers"]["registry"]["owner"] = ""
         config.init_global(test_config)
-        assert (config.config.container_prefix == config.DEFAULT_CONTAINER_PREFIX)
+        assert (config.config.container_settings.prefix == DEFAULT_CONTAINER_PREFIX)

     def test_error_dataset_label(self):
         test_config = get_test_config()
@@ -191,6 +233,7 @@ def test_correct_dataset_label(self):
         test_config = get_test_config()
         correct_test_dicts = [{"label": "test"}, {"label": "123"}, {"label": "test123"}, {"label": "123test"}, {"label": "_"}, {"label": "test_test"}, {"label": "_test"}, {"label": "test_"}]
+        correct_test_dicts = [dict(list(d.items()) + list(filler_dataset_data.items())) for d in correct_test_dicts]

         for test_dict in correct_test_dicts:
             test_config["datasets"] = [test_dict]
@@ -216,17 +259,51 @@ def test_config_values(self):
         test_config = get_test_config()
         config.init_global(test_config)

-        value_test_util('strings', [{'test': "str1", 'test2': "str2"}, {'test': 'str1', 'test2': 'str3'}])
-        value_test_util('numbersAndBools', [{'a': 1, 'b': float(2.0), 'c': 4, 'd': 5.6, 'f': False}, {'a': 1, 'b': 3, 'c': 4, 'd': 5.6, 'f': False}])
-
-        value_test_util('singleton_int64_with_array', [{'test': 1, 'test2': 2}, {'test': 1, 'test2': 3}])
-        value_test_util('singleton_string_np_linspace', [{'test': "str1", 'test2': 5.0}, {'test': "str1", 'test2': 0.0}])
-        value_test_util('str_array_np_logspace', [{'test': "a", 'test2': 10}] * 10 + [{'test': "b", 'test2': 10}] * 10)
-
-        value_test_util('int64artifact', [{'test': 5, 'test2': 2}, {'test': 5, 'test2': 3}])
-
-        value_test_util('boolArrTest', [{'flags': True, 'range': 1}, {'flags': False, 'range': 2},
-                                        {'flags': False, 'range': 1}, {'flags': True, 'range': 2}])
+        value_test_util('omicsintegrator2', 'strings', OmicsIntegrator2Params, [
+            OmicsIntegrator2Params(dummy_mode=DummyMode.terminals, b=3),
+            OmicsIntegrator2Params(dummy_mode=DummyMode.others, b=3)
+        ])
+
+        value_test_util('omicsintegrator2', 'singleton_string_np_linspace', OmicsIntegrator2Params, [
+            OmicsIntegrator2Params(dummy_mode=DummyMode.terminals, b=5.0),
+            OmicsIntegrator2Params(dummy_mode=DummyMode.terminals, b=0.0)
+        ])
+
+        value_test_util('omicsintegrator2', 'str_array_np_logspace', OmicsIntegrator2Params, [
+            # While these both repeat 50 times, parameter hash makes sure to not duplicate the work.
+            # This serves as a test to make sure _time isn't inserted during parameter combinations.
+            OmicsIntegrator2Params(dummy_mode=DummyMode.others, g=10), OmicsIntegrator2Params(dummy_mode=DummyMode.all, g=10)
+        ])
+
+        value_test_util('meo', 'numbersAndBools', MEOParams, [
+            MEOParams(max_path_length=1, rand_restarts=2, local_search=False),
+            MEOParams(max_path_length=1, rand_restarts=2, local_search=True),
+            MEOParams(max_path_length=1, rand_restarts=3, local_search=False),
+            MEOParams(max_path_length=1, rand_restarts=3, local_search=True),
+        ])
+
+        # Encoding this behavior: run names are not passed into the parameter hash,
+        # and thus won't duplicate runs.
+        value_test_util('meo', 'numbersAndBoolsDuplicate', MEOParams, [])
+
+        value_test_util('meo', 'numbersAndBool', MEOParams, [
+            MEOParams(max_path_length=2, rand_restarts=2, local_search=True),
+            MEOParams(max_path_length=2, rand_restarts=3, local_search=True),
+        ])
+
+        value_test_util('mincostflow', 'int64artifact', MinCostFlowParams, [
+            MinCostFlowParams(flow=5, capacity=2),
+            MinCostFlowParams(flow=5, capacity=3),
+            MinCostFlowParams(flow=6, capacity=2),
+            MinCostFlowParams(flow=6, capacity=3)
+        ])
+
+        value_test_util('meo', 'boolArrTest', MEOParams, [
+            MEOParams(local_search=True, max_path_length=1),
+            MEOParams(local_search=True, max_path_length=2),
+            MEOParams(local_search=False, max_path_length=1),
+            MEOParams(local_search=False, max_path_length=2)
+        ])

     @pytest.mark.parametrize("ml_include, eval_include, expected_ml, expected_eval", [
         (True, True, True, True),
diff --git a/test/test_util.py b/test/test_util.py
index baf9db0ed..2a25fc0d1 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -2,7 +2,7 @@

 import pytest

-import spras.config as config
+import spras.config.config as config
 from spras.containers import convert_docker_path, prepare_path_docker, prepare_volume
 from spras.util import hash_params_sha1_base32
diff --git a/util/update_schema.py b/util/update_schema.py
new file mode 100644
index 000000000..c6a7bedca
--- /dev/null
+++ b/util/update_schema.py
@@ -0,0 +1,13 @@
+"""
+Updates config/schema.json.
+This should be done whenever a new algorithm is introduced,
+or the config is otherwise directly changed.
+"""
+
+import json
+from pathlib import Path
+
+from spras.config.schema import RawConfig
+
+config_schema = RawConfig.model_json_schema()
+Path('config/schema.json').write_text(json.dumps(config_schema, indent=2))
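Because the JSON schema is generated directly from the pydantic `RawConfig` model, the same model can also be used to check a workflow config programmatically. A sketch, assuming `RawConfig` exposes the standard pydantic interface that `update_schema.py` already relies on:

```python
import yaml
from spras.config.schema import RawConfig

with open("config/config.yaml") as f:
    raw = yaml.safe_load(f)

# Raises pydantic.ValidationError if the file does not match the schema.
RawConfig.model_validate(raw)
```

Remember to rerun `util/update_schema.py` so that `config/schema.json` stays in sync whenever the schema or an algorithm's parameters change.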