Reed-CompBio · tristan-f-r · Jul 9, 2025 · Jul 9, 2025 · Jul 10, 2025 · Jul 10, 2025
diff --git a/Snakefile b/Snakefile
@@ -35,9 +35,14 @@ def get_dataset(_datasets, label):
 algorithms = list(algorithm_params)
 algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
 dataset_labels = list(_config.config.datasets.keys())
-
-dataset_gold_standard_node_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['node_files'] for dataset in gs['dataset_labels']]
-dataset_gold_standard_edge_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['edge_files'] for dataset in gs['dataset_labels']]
+dataset_gold_standard_node_pairs = [  # "<dataset>-<gold standard>" labels for gold standards whose 'node_files' entry is truthy
+    f"{_config.attach_spras_revision(dataset)}-{_config.attach_spras_revision(gs['label'])}"  # both halves carry the revision suffix
+    for gs in _config.config.gold_standards.values() if gs['node_files'] for dataset in gs['dataset_labels']
+]
+dataset_gold_standard_edge_pairs = [  # same pairing, for gold standards whose 'edge_files' entry is truthy
+    f"{_config.attach_spras_revision(dataset)}-{_config.attach_spras_revision(gs['label'])}"  # both halves carry the revision suffix
+    for gs in _config.config.gold_standards.values() if gs['edge_files'] for dataset in gs['dataset_labels']
+]
 
 # Get algorithms that are running multiple parameter combinations
 def algo_has_mult_param_combos(algo):
@@ -287,7 +292,7 @@ rule reconstruct:
 # Original pathway reconstruction output to universal output
 # Use PRRunner as a wrapper to call the algorithm-specific parse_output
 rule parse_output:
-    input: 
+    input:
         raw_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']),
         dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle'])
     output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt'])

diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
@@ -7,7 +7,7 @@
 
 
 def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
-                       algo_with_params: list) -> pd.DataFrame:
+                       algo_with_params: list[str]) -> pd.DataFrame:
     """
     Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
     in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the

diff --git a/spras/config/config.py b/spras/config/config.py
@@ -13,8 +13,11 @@
 """
 
 import copy as copy
+import functools
+import importlib.metadata
 import itertools as it
 import os
+import subprocess
 import warnings
 from collections.abc import Iterable
 from typing import Any
@@ -28,6 +31,24 @@
 
 config = None
 
+@functools.cache
+def spras_revision() -> str:
+    """
+    Gets the revision of the SPRAS checkout that contains this file (the result is cached after the first call).
+    If this file is inside a git work tree, this uses the short revision hash.
+    Otherwise (no repository, or `git` cannot be run), this uses the package version with dots replaced by underscores.
+    """
+    try:
+        return subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], cwd=os.path.dirname(os.path.abspath(__file__)),
+                                       stderr=subprocess.DEVNULL, encoding='utf-8').strip()
+    except (subprocess.CalledProcessError, OSError):
+        # Non-zero exit: not a repository. OSError: git is missing or not executable. Use the package version instead.
+        # https://stackoverflow.com/a/75100875/7589775
+        return f"v{importlib.metadata.version('spras').replace('.', '_')}"
+
+def attach_spras_revision(label: str) -> str:  # returns `label` suffixed with "_" and the value of spras_revision()
+    return f"{label}_{spras_revision()}"
+
 DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio"
 
 # This will get called in the Snakefile, instantiating the singleton with the raw config
@@ -127,16 +148,16 @@ def process_datasets(self, raw_config: RawConfig):
             label = dataset.label
-            if label.lower() in [key.lower() for key in self.datasets.keys()]:
+            if attach_spras_revision(label).lower() in [key.lower() for key in self.datasets.keys()]:  # stored keys carry the revision suffix
                 raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.")
-            self.datasets[label] = dict(dataset)
+            self.datasets[attach_spras_revision(label)] = dict(dataset)
 
         # parse gold standard information
-        self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards}
+        self.gold_standards = {attach_spras_revision(gold_standard.label): dict(gold_standard) for gold_standard in raw_config.gold_standards}
 
         # check that all the dataset labels in the gold standards are existing datasets labels
         dataset_labels = set(self.datasets.keys())
         gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']}
         for label in gold_standard_dataset_labels:
-            if label not in dataset_labels:
+            if attach_spras_revision(label) not in dataset_labels:
                 raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.")
 
         # Code snipped from Snakefile that may be useful for assigning default labels
@@ -221,7 +242,10 @@ def process_algorithms(self, raw_config: RawConfig):
                             run_dict[param] = float(value)
                         if isinstance(value, np.ndarray):
                             run_dict[param] = value.tolist()
-                    params_hash = hash_params_sha1_base32(run_dict, self.hash_length, cls=NpHashEncoder)
+                    # Incorporates the `spras_revision` into the hash
+                    hash_run_dict = copy.deepcopy(run_dict)
+                    hash_run_dict["_spras_rev"] = spras_revision()
+                    params_hash = hash_params_sha1_base32(hash_run_dict, self.hash_length, cls=NpHashEncoder)
                     if params_hash in prior_params_hashes:
                         raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file '
                                         f'(current length {self.hash_length}).')

diff --git a/test/analysis/expected_output/expected_egfr_summary.txt b/test/analysis/expected_output/expected_egfr_summary.txt
@@ -1,10 +1,4 @@
-Name	Number of nodes	Number of edges	Number of connected components	Density	Max degree	Median degree	Max diameter	Average path length	Nodes in prize	Nodes in sources	Nodes in targets	Nodes in active	Nodes in dummy	Parameter combination
-test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt	48	45	3	0.0398936170212766	5	2.0	16	3.882808476926124	27	0	27	27	0	{'slice_threshold': 0.3, 'module_threshold': 0.05}
-test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt	1877	12845	1	0.007295700506524384	469	6.0	6	2.7973618474338107	621	1	620	621	1	{'local_search': 'Yes', 'max_path_length': 3, 'rand_restarts': 10}
-test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt	28	20	8	0.05291005291005291	4	1.0	5	1.306439393939394	28	1	27	28	1	{'b': 2, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'}
-test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt	39	31	8	0.04183535762483131	6	1.0	5	1.5084498834498834	39	1	38	39	1	{'b': 10, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'}
-test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt	14	9	5	0.0989010989010989	4	1.0	2	1.1866666666666668	14	0	14	14	0	{'b': 0.55, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'}
-test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt	593	591	2	0.0033669841848593955	32	1.0	30	6.72248989073389	531	1	530	531	1	{'b': 2, 'g': 3}
-test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt	704	702	2	0.002836867968446916	35	1.0	24	6.038766691954387	616	1	615	616	1	{'b': 4, 'g': 0}
-test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt	14	17	1	0.18681318681318682	6	2.0	7	2.857142857142857	6	1	5	6	1	{'k': 10}
-test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt	25	32	1	0.10666666666666667	8	2.0	7	3.486666666666667	11	1	10	11	1	{'k': 20}
+Number of nodes	Number of edges	Number of connected components	Density	Max degree	Median degree	Max diameter	Average path length	Nodes in prize	Nodes in sources	Nodes in targets	Nodes in active	Nodes in dummy	Parameter combination
+14	17	1	0.18681318681318682	6	2.0	7	2.857142857142857	6	1	5	6	1	{'k': 10}
+25	32	1	0.10666666666666667	8	2.0	7	3.486666666666667	11	1	10	11	1	{'k': 20}
+1874	12845	1	0.007319084148670001	469	6.0	6	2.7952001166950904	621	1	620	621	1	{'local_search': 'Yes', 'max_path_length': 3, 'rand_restarts': 10}
diff --git a/test/analysis/expected_output/expected_example_summary.txt b/test/analysis/expected_output/expected_example_summary.txt
@@ -1,13 +1,6 @@
-Name	Number of nodes	Number of edges	Number of connected components	Density	Max degree	Median degree	Max diameter	Average path length	Nodes in prize	Nodes in active	Nodes in dummy	Nodes in sources	Nodes in targets	Parameter combination
-test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'spras_placeholder': 'no parameters'}
-test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt	0	0	0	0.0	0	0.0	0	0.0	0	0	0	0	0	{'slice_threshold': 0.3, 'module_threshold': 0.05}
-test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'max_path_length': 3, 'local_search': 'Yes', 'rand_restarts': 10}
-test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'flow': 1, 'capacity': 1}
-test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'b': 6, 'w': 5.0, 'd': 10, 'dummy_mode': 'file'}
-test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt	0	0	0	0.0	0	0.0	0	0.0	0	0	0	0	0	{'b': 6, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'}
-test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'b': 5, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'}
-test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt	0	0	0	0.0	0	0.0	0	0.0	0	0	0	0	0	{'b': 5, 'w': 5.0, 'd': 10, 'dummy_mode': 'file'}
-test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt	0	0	0	0.0	0	0.0	0	0.0	0	0	0	0	0	{'b': 2, 'g': 3}
-test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'b': 4, 'g': 0}
-test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'k': 200}
-test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'k': 100}
+Number of nodes	Number of edges	Number of connected components	Density	Max degree	Median degree	Max diameter	Average path length	Nodes in prize	Nodes in active	Nodes in dummy	Nodes in sources	Nodes in targets	Parameter combination
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{'flow': 1, 'capacity': 1}
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{'k': 100}
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{'k': 200}
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{'max_path_length': 3, 'local_search': 'Yes', 'rand_restarts': 10}
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{'spras_placeholder': 'no parameters'}
diff --git a/test/analysis/input/.gitignore b/test/analysis/input/.gitignore
@@ -0,0 +1 @@
+run
diff --git a/test/analysis/input/config.yaml b/test/analysis/input/config.yaml
diff --git a/test/analysis/input/egfr.yaml b/test/analysis/input/egfr.yaml
@@ -1,92 +1,28 @@
-# The length of the hash used to identify a parameter combination
+# Basic settings
 hash_length: 7
 
 containers:
-  # Specify the container framework used by each PRM wrapper. Valid options include:
-  # - docker (default if not specified)
-  # - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed
-  # - dsub -- experimental with limited support, used for running on Google Cloud with the All of Us cloud environment.
-  #   - There is no support for other environments at the moment.
   framework: docker
-
-  # Only used if container_framework is set to singularity, this will unpack the singularity containers
-  # to the local filesystem. This is useful when PRM containers need to run inside another container,
-  # such as would be the case in an HTCondor/OSPool environment.
-  # NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way
-  # that persists after the workflow is complete. To clean up the unpacked containers, the user must
-  # manually delete them. For convenience, these unpacked files will exist in the current working directory
-  # under `unpacked`.
   unpack_singularity: false
-
-  # Allow the user to configure which container registry containers should be pulled from
-  # Note that this assumes container names are consistent across registries, and that the
-  # registry being passed doesn't require authentication for pull actions
   registry:
     base_url: docker.io
-    # The owner or project of the registry
-    # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs
     owner: reedcompbio
 
 algorithms:
   - name: pathlinker
     params:
       include: true
       run1:
-        k:
-          - 10
-          - 20
-  - name: omicsintegrator1
-    params:
-      include: true
-      run1:
-        b:
-          - 0.55
-          - 2
-          - 10
-        d:
-          - 10
-        g:
-          - 1e-3
-        r:
-          - 0.01
-        w:
-          - 0.1
-        mu:
-          - 0.008
-        dummy_mode: ["file"]
-  - name: omicsintegrator2
-    params:
-      include: true
-      run1:
-        b:
-          - 4
-        g:
-          - 0
-      run2:
-        b:
-          - 2
-        g:
-          - 3
+        k: [10, 20]
   - name: meo
     params:
       include: true
       run1:
-        local_search:
-          - "Yes"
-        max_path_length:
-          - 3
-        rand_restarts:
-          - 10
-  - name: domino
-    params:
-      include: true
-      run1:
-        slice_threshold:
-          - 0.3
-        module_threshold:
-          - 0.05
+        local_search: "Yes"
+        max_path_length: 3
+        rand_restarts: 10
 datasets:
-  - data_dir: input
+  - data_dir: "input"
     edge_files:
       - phosphosite-irefindex13.0-uniprot.txt
     label: tps_egfr
@@ -95,12 +31,12 @@ datasets:
     other_files: []
 reconstruction_settings:
   locations:
-    reconstruction_dir: output/egfr
+    reconstruction_dir: "test/analysis/input/run/egfr"
 analysis:
   cytoscape:
-    include: true
+    include: false
   summary:
-    include: true
+    include: false
   ml:
     include: false
   evaluation: