Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
c83370b
new updates
cgsze Nov 11, 2024
bdb818f
Merge branch 'update-summary' of https://github.com/cgsze/spras into …
cgsze Nov 11, 2024
320d399
pre-commit
cgsze Nov 11, 2024
753035a
updated filepath indexing
cgsze Dec 2, 2024
ed6e4ca
edit test cases
cgsze Dec 2, 2024
c4c65e9
Updated input/output test files
cgsze Jan 13, 2025
4893f86
node tables into edge/node files from yaml
cgsze Jan 22, 2025
400df3b
init from file
cgsze Jan 22, 2025
a75cb4f
dataset_dict
cgsze Feb 17, 2025
aadb0f8
started loading test case
cgsze Feb 17, 2025
83cacd8
resolved mismatched param combos
cgsze Feb 17, 2025
602145c
latest changes
cgsze Feb 18, 2025
a1a189b
resolved 3 AttributeErrors
cgsze Mar 3, 2025
4f365f6
resolved TypeError, set all but summary to false
cgsze Mar 17, 2025
a6a7f48
debugging summarize_networks returning empty df -> AssertionError
cgsze Mar 31, 2025
3a1c0c6
fixed AssertionErrors
cgsze Apr 1, 2025
d0b0b6c
load dataset test case
cgsze Apr 4, 2025
9df48cd
pytest fixes
cgsze Apr 7, 2025
6b7092f
Merge branch 'Reed-CompBio:master' into update-summary
cgsze Apr 7, 2025
a7187e0
revert file path changes
cgsze Apr 7, 2025
ac99e5e
load dataset_dict test case
cgsze Apr 21, 2025
1823788
added dummy node to test case/summarize_networks
cgsze Apr 28, 2025
fb6b1ae
output/ directory issues (to be continued)
cgsze Apr 28, 2025
0d29b43
test_summary.py test cases
cgsze May 19, 2025
a8563d1
debugging
cgsze May 19, 2025
c1fc881
set graphspace/cytoscape to true in config files
cgsze May 19, 2025
61e7b09
new changes
cgsze May 19, 2025
156a9a5
linux style paths
cgsze May 19, 2025
4696cdc
Refactor summarization code
agitter May 24, 2025
f00a42a
Fix test case paths
agitter May 24, 2025
1cf27b8
Fix test case path again
agitter May 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ rule summary_table:
run:
# Load the node table from the pickled dataset file
node_table = Dataset.from_file(input.dataset_file).node_table
summary_df = summary.summarize_networks(input.pathways, node_table)
summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params)
summary_df.to_csv(output.summary_table, sep='\t', index=False)

# Cluster the output pathways for each dataset
Expand Down
54 changes: 35 additions & 19 deletions spras/analysis/summary.py
Original file line number Diff line number Diff line change
@@ -1,71 +1,87 @@
import os
import sys
from pathlib import Path
from typing import Iterable

import networkx as nx
import pandas as pd


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
                       algo_with_params: list) -> pd.DataFrame:
    """
    Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
    in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the
    file_paths and algo_with_params inputs must match after they are each sorted.
    @param file_paths: iterable of edge list files
    @param node_table: pandas DataFrame containing node attributes
    @param algo_params: a nested dict mapping algorithm names to dicts that map parameter hashes to parameter
    combinations.
    @param algo_with_params: a list of <algorithm>-params-<params_hash> combinations
    @return: pandas DataFrame with summary information
    @raise ValueError: if file_paths and algo_with_params have different lengths
    """
    # Ensure that NODEID is the first column
    assert node_table.columns[0] == 'NODEID'
    # Initialize list to store input nodes that have property data
    nodes_by_col = []
    # Save new labels
    nodes_by_col_labs = ('Nodes in ' + node_table.columns[1:]).tolist()
    # Iterate through each node property column
    for col in node_table.columns[1:]:
        # Assumption: property columns only contain NA, boolean, numeric data
        # If the property contains numeric data, save the nodes with property values that are not NA and > 0
        # If the property contains boolean data, save the nodes with property values that are True
        nodes_by_col.append(set(node_table.loc[node_table[col] > 0, 'NODEID']))

    # Initialize list to store network summary data
    nw_info = []

    # Sort once so each file path is paired with the matching algorithm-parameter combination
    sorted_paths = sorted(file_paths)
    algo_with_params = sorted(algo_with_params)
    # Fail loudly on mismatched inputs instead of raising an opaque IndexError mid-loop
    # or silently mispairing networks with parameter combinations
    if len(sorted_paths) != len(algo_with_params):
        raise ValueError(f'file_paths has {len(sorted_paths)} entries but algo_with_params has '
                         f'{len(algo_with_params)}; they must correspond one-to-one')

    # Iterate through each network file path
    for index, file_path in enumerate(sorted_paths):
        with open(file_path, 'r') as f:
            lines = f.readlines()[1:]  # skip the header line

        # directed or mixed graphs are parsed and summarized as an undirected graph
        nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str)))

        # Save the network name, number of nodes, number edges, and number of connected components
        nw_name = str(file_path)
        number_nodes = nw.number_of_nodes()
        number_edges = nw.number_of_edges()
        ncc = nx.number_connected_components(nw)

        # Initialize list to store current network information
        cur_nw_info = [nw_name, number_nodes, number_edges, ncc]

        # Iterate through each node property and save the intersection with the current network
        for node_list in nodes_by_col:
            num_nodes = len(set(nw).intersection(node_list))
            cur_nw_info.append(num_nodes)

        # Split on the literal '-params-' separator so that algorithm names containing hyphens
        # are parsed correctly: <algorithm>-params-<params_hash>
        algo, _, hashcode = algo_with_params[index].rpartition('-params-')

        # Algorithm parameters have format { algo : { hashcode : { parameter combos } } }
        param_combo = algo_params[algo][hashcode]
        cur_nw_info.append(param_combo)

        # Save the current network information to the network summary list
        nw_info.append(cur_nw_info)

    # Prepare column names
    col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components']
    col_names.extend(nodes_by_col_labs)
    col_names.append('Parameter combination')

    # Convert the network summary data to pandas dataframe
    # Could refactor to create the dataframe line by line instead of storing data as lists and then converting
    nw_info = pd.DataFrame(
        nw_info,
        columns=col_names
    )

    return nw_info


Expand Down Expand Up @@ -129,5 +145,5 @@ def degree(g):
# save(dat, argv[2])


# if __name__ == "__main__":
# if __name__ == '__main__':
# main(sys.argv)
4 changes: 4 additions & 0 deletions test/analysis/expected_output/expected_node_table.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
NODEID prize active dummy sources targets
0 C 5.7 True NaN NaN True
1 A 2.0 True True True NaN
2 B NaN NaN NaN NaN NaN
10 changes: 10 additions & 0 deletions test/analysis/expected_output/test_egfr_summary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Name Number of nodes Number of edges Number of connected components Nodes in prize Nodes in sources Nodes in targets Nodes in active Nodes in dummy Parameter combination
test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt 48 45 3 27 0 27 27 0 {'slice_threshold': 0.3, 'module_threshold': 0.05}
test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt 1877 12845 1 621 1 620 621 1 {'local_search': 'Yes', 'max_path_length': 3, 'rand_restarts': 10}
test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt 28 20 8 28 1 27 28 1 {'b': 2, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'}
test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt 39 31 8 39 1 38 39 1 {'b': 10, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'}
test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt 14 9 5 14 0 14 14 0 {'b': 0.55, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'}
test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt 593 591 2 531 1 530 531 1 {'b': 2, 'g': 3}
test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt 704 702 2 616 1 615 616 1 {'b': 4, 'g': 0}
test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt 14 17 1 6 1 5 6 1 {'k': 10}
test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt 25 32 1 11 1 10 11 1 {'k': 20}
13 changes: 13 additions & 0 deletions test/analysis/expected_output/test_example_summary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Name Number of nodes Number of edges Number of connected components Nodes in prize Nodes in active Nodes in dummy Nodes in sources Nodes in targets Parameter combination
test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 2 2 1 1 1 {'spras_placeholder': 'no parameters'}
test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt 0 0 0 0 0 0 0 0 {'slice_threshold': 0.3, 'module_threshold': 0.05}
test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 2 2 1 1 1 {'max_path_length': 3, 'local_search': 'Yes', 'rand_restarts': 10}
test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt 3 2 1 2 2 1 1 1 {'flow': 1, 'capacity': 1}
test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt 3 2 1 2 2 1 1 1 {'b': 6, 'w': 5.0, 'd': 10, 'dummy_mode': 'file'}
test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt 0 0 0 0 0 0 0 0 {'b': 6, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'}
test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt 3 2 1 2 2 1 1 1 {'b': 5, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'}
test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt 0 0 0 0 0 0 0 0 {'b': 5, 'w': 5.0, 'd': 10, 'dummy_mode': 'file'}
test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0 0 0 0 0 {'b': 2, 'g': 3}
test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 2 2 1 1 1 {'b': 4, 'g': 0}
test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt 3 2 1 2 2 1 1 1 {'k': 200}
test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt 3 2 1 2 2 1 1 1 {'k': 100}
137 changes: 137 additions & 0 deletions test/analysis/input/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# The length of the hash used to identify a parameter combination
hash_length: 7

# Specify the container framework. Currently supported frameworks are 'docker' and
# 'singularity'. If container_framework is not specified, SPRAS will default to docker.
container_framework: docker

# Only used if container_framework is set to singularity, this will unpack the singularity containers
# to the local filesystem. This is useful when PRM containers need to run inside another container,
# such as would be the case in an HTCondor/OSPool environment.
# NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way
# that persists after the workflow is complete. To clean up the unpacked containers, the user must
# manually delete them.
unpack_singularity: false

# Allow the user to configure which container registry containers should be pulled from
# Note that this assumes container names are consistent across registries, and that the
# registry being passed doesn't require authentication for pull actions
container_registry:
base_url: docker.io
# The owner or project of the registry
# For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs
owner: reedcompbio

algorithms:
- name: "pathlinker"
params:
include: true
run1:
k: range(100,201,100)

- name: "omicsintegrator1"
params:
include: true
run1:
b: [5, 6]
w: np.linspace(0,5,2)
d: [10]
dummy_mode: ["file"]

- name: "omicsintegrator2"
params:
include: true
run1:
b: [4]
g: [0]
run2:
b: [2]
g: [3]

- name: "meo"
params:
include: true
run1:
max_path_length: [3]
local_search: ["Yes"]
rand_restarts: [10]

- name: "mincostflow"
params:
include: true
run1:
flow: [1] # The flow must be an int
capacity: [1]

- name: "allpairs"
params:
include: true

- name: "domino"
params:
include: true
run1:
slice_threshold: [0.3]
module_threshold: [0.05]


# Here we specify which pathways to run and other file location information.
# DataLoader.py can currently only load a single dataset
# Assume that if a dataset label does not change, the lists of associated input files do not change
datasets:
-
# Labels can only contain letters, numbers, or underscores
label: data0
node_files: ["node-prizes.txt", "sources.txt", "targets.txt"]
# DataLoader.py can currently only load a single edge file, which is the primary network
edge_files: ["network.txt"]
# Placeholder
other_files: []
# Relative path from the spras directory
data_dir: "input"

gold_standards:
-
# Labels can only contain letters, numbers, or underscores
label: gs0
node_files: ["gs_nodes0.txt"]
data_dir: "input"
# List of dataset labels to compare with the specific gold standard dataset
dataset_labels: ["data0"]

# If we want to reconstruct then we should set run to true.
reconstruction_settings:
#set where everything is saved
locations:
#place the save path here
reconstruction_dir: "output"
run: true

analysis:
# Create one summary per pathway file and a single summary table for all pathways for each dataset
summary:
include: true
# Create output files for each pathway that can be visualized with GraphSpace
graphspace:
include: true
# Create Cytoscape session file with all pathway graphs for each dataset
cytoscape:
include: true
# Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
ml:
# ml analysis per dataset
include: false
# adds ml analysis per algorithm output
# only runs for algorithms with multiple parameter combinations chosen
aggregate_per_algorithm: true
# specify how many principal components to calculate
components: 2
# boolean to show the labels on the pca graph
labels: true
# 'ward', 'complete', 'average', 'single'
# if linkage: ward, must use metric: euclidean
linkage: 'ward'
# 'euclidean', 'manhattan', 'cosine'
metric: 'euclidean'
evaluation:
include: false
Loading
Loading