3 changes: 3 additions & 0 deletions Snakefile
@@ -295,6 +295,9 @@ rule parse_output:
params = reconstruction_params(wildcards.algorithm, wildcards.params).copy()
params['dataset'] = input.dataset_file
runner.parse_output(wildcards.algorithm, input.raw_file, output.standardized_file, params)
# TODO: cache heuristics result, store partial heuristics configuration file
# to allow this rule to update when heuristics change
_config.config.heuristics.validate_graph_from_file(output.standardized_file)

# TODO: reuse in the future once we make summary work for mixed graphs. See https://github.com/Reed-CompBio/spras/issues/128
# Collect summary statistics for a single pathway
44 changes: 6 additions & 38 deletions spras/analysis/summary.py
@@ -1,10 +1,11 @@
from pathlib import Path
from statistics import median
from typing import Iterable

import networkx as nx
import pandas as pd

from spras.statistics import compute_statistics, statistics_options


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
algo_with_params: list) -> pd.DataFrame:
@@ -47,44 +48,11 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg

# Save the network name, number of nodes, number edges, and number of connected components
nw_name = str(file_path)
number_nodes = nw.number_of_nodes()
Collaborator comment: Are these the same changes from #411, and will any new design from that pull request be merged in here?

number_edges = nw.number_of_edges()
ncc = nx.number_connected_components(nw)

# Save the max/median degree, average clustering coefficient, and density
if number_nodes == 0:
max_degree = 0
median_degree = 0.0
density = 0.0
else:
degrees = [deg for _, deg in nw.degree()]
max_degree = max(degrees)
median_degree = median(degrees)
density = nx.density(nw)

cc = list(nx.connected_components(nw))
# Save the max diameter
# Use diameter only for components with ≥2 nodes (singleton components have diameter 0)
diameters = [
nx.diameter(nw.subgraph(c).copy()) if len(c) > 1 else 0
for c in cc
]
max_diameter = max(diameters, default=0)

# Save the average path lengths
# Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0)
avg_path_lengths = [
nx.average_shortest_path_length(nw.subgraph(c).copy()) if len(c) > 1 else 0.0
for c in cc
]

if len(avg_path_lengths) != 0:
avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths)
else:
avg_path_len = 0.0

graph_statistics = compute_statistics(nw, statistics_options)

# Initialize list to store current network information
cur_nw_info = [nw_name, number_nodes, number_edges, ncc, density, max_degree, median_degree, max_diameter, avg_path_len]
cur_nw_info = [nw_name, *graph_statistics.values()]

# Iterate through each node property and save the intersection with the current network
for node_list in nodes_by_col:
@@ -104,7 +72,7 @@
nw_info.append(cur_nw_info)

# Prepare column names
col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components', 'Density', 'Max degree', 'Median degree', 'Max diameter', 'Average path length']
col_names = ['Name', *statistics_options]
col_names.extend(nodes_by_col_labs)
col_names.append('Parameter combination')
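
The `spras.statistics` module itself is not part of this diff. From the call sites — `compute_statistics(nw, statistics_options)` here and a filtered subset in `heuristics.py` below — it appears to return a dict keyed by statistic name whose ordering matches `statistics_options`, since the summary row is built with `*graph_statistics.values()`. A sketch of what the module might look like, reconstructed from the deleted `summary.py` code above (an assumption, not the PR's actual implementation):

```python
# Hypothetical sketch of spras/statistics.py, reconstructed from the deleted
# summary.py code; not the PR's actual module.
from statistics import median

import networkx as nx

# Order matters: summary.py builds its header as ['Name', *statistics_options]
# and each row as [nw_name, *graph_statistics.values()].
statistics_options = [
    'Number of nodes',
    'Number of edges',
    'Number of connected components',
    'Density',
    'Max degree',
    'Median degree',
    'Max diameter',
    'Average path length',
]


def compute_statistics(graph: nx.Graph, options: list[str]) -> dict[str, int | float]:
    """Compute the requested statistics, keyed and ordered per statistics_options."""
    degrees = [deg for _, deg in graph.degree()]
    components = list(nx.connected_components(graph))
    calculators = {
        'Number of nodes': lambda: graph.number_of_nodes(),
        'Number of edges': lambda: graph.number_of_edges(),
        'Number of connected components': lambda: len(components),
        'Density': lambda: nx.density(graph) if degrees else 0.0,
        'Max degree': lambda: max(degrees, default=0),
        'Median degree': lambda: median(degrees) if degrees else 0.0,
        # Diameter and path length are undefined for singleton components; use 0.
        'Max diameter': lambda: max(
            (nx.diameter(graph.subgraph(c).copy()) for c in components if len(c) > 1),
            default=0,
        ),
        'Average path length': lambda: (
            sum(
                nx.average_shortest_path_length(graph.subgraph(c).copy()) if len(c) > 1 else 0.0
                for c in components
            ) / len(components)
            if components
            else 0.0
        ),
    }
    return {name: calculators[name]() for name in statistics_options if name in options}
```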

2 changes: 2 additions & 0 deletions spras/config/config.py
@@ -78,6 +78,8 @@ def __init__(self, raw_config: dict[str, Any]):
self.container_settings = ProcessedContainerSettings.from_container_settings(parsed_raw_config.containers, self.hash_length)
# The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key.
self.algorithms = None
# The heuristic handler
self.heuristics = parsed_raw_config.heuristics
# A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations.
# Only includes algorithms that are set to be run with 'include: true'.
self.algorithm_params = None
105 changes: 105 additions & 0 deletions spras/config/heuristics.py
@@ -0,0 +1,105 @@
import os

import networkx as nx
from pydantic import BaseModel, ConfigDict

from spras.interval import Interval
from spras.statistics import compute_statistics, statistics_options

__all__ = ['GraphHeuristicsError', 'GraphHeuristics']

class GraphHeuristicsError(RuntimeError):
"""
Represents an error arising from a graph algorithm output
not meeting the necessary graph heuristics.
"""
failed_heuristics: list[tuple[str, float | int, list[Interval]]]

@staticmethod
def format_failed_heuristic(heuristic: tuple[str, float | int, list[Interval]]) -> str:
name, desired, intervals = heuristic
if len(intervals) == 1:
interval_string = str(intervals[0])
else:
formatted_intervals = ", ".join([str(interval) for interval in intervals])
interval_string = f"one of the intervals ({formatted_intervals})"
return f"{name} expected {desired} in interval {interval_string}"
Collaborator comment on lines +25 to +26: This text doesn't quite match up. You could get "in interval one of the intervals..."
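
One way to make the wording compose, sketched here as a suggestion rather than taken from the PR, is to fold the word "interval" into each branch:

```python
@staticmethod
def format_failed_heuristic(heuristic: tuple[str, float | int, list[Interval]]) -> str:
    name, desired, intervals = heuristic
    if len(intervals) == 1:
        interval_string = f"interval {intervals[0]}"
    else:
        formatted_intervals = ", ".join(str(interval) for interval in intervals)
        interval_string = f"one of the intervals ({formatted_intervals})"
    # Reads "expected X in interval [a, b]" or "expected X in one of the intervals (...)".
    return f"{name} expected {desired} in {interval_string}"
```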

@staticmethod
def to_string(failed_heuristics: list[tuple[str, float | int, list[Interval]]]):
formatted_heuristics = [
GraphHeuristicsError.format_failed_heuristic(heuristic) for heuristic in failed_heuristics
]

formatted_heuristics = "\n".join([f"- {heuristic}" for heuristic in formatted_heuristics])
Collaborator comment: Should we use a different character besides - (like *) for the list? I'm trying to imagine whether we could ever have a leading negative here in formatted_heuristics that would be confusing.

return f"The following heuristics failed:\n{formatted_heuristics}"

def __init__(self, failed_heuristics: list[tuple[str, float | int, list[Interval]]]):
super().__init__(GraphHeuristicsError.to_string(failed_heuristics))

self.failed_heuristics = failed_heuristics

def __str__(self) -> str:
return GraphHeuristicsError.to_string(self.failed_heuristics)

class GraphHeuristics(BaseModel):
number_of_nodes: Interval | list[Interval] = []
number_of_edges: Interval | list[Interval] = []
number_of_connected_components: Interval | list[Interval] = []
density: Interval | list[Interval] = []

max_degree: Interval | list[Interval] = []
median_degree: Interval | list[Interval] = []
max_diameter: Interval | list[Interval] = []
average_path_length: Interval | list[Interval] = []

def validate_graph(self, graph: nx.DiGraph):
statistics_dictionary = {
'Number of nodes': self.number_of_nodes,
'Number of edges': self.number_of_edges,
'Number of connected components': self.number_of_connected_components,
'Density': self.density,
'Max degree': self.max_degree,
'Median degree': self.median_degree,
'Max diameter': self.max_diameter,
'Average path length': self.average_path_length
}

# quick assert: is statistics_dictionary exhaustive?
assert set(statistics_dictionary.keys()) == set(statistics_options)

stats = compute_statistics(
graph,
list(k for k, v in statistics_dictionary.items() if not isinstance(v, list) or len(v) != 0)
)

failed_heuristics: list[tuple[str, float | int, list[Interval]]] = []
for key, value in stats.items():
intervals = statistics_dictionary[key]
if not isinstance(intervals, list):
    intervals = [intervals]

for interval in intervals:
if not interval.mem(value):
failed_heuristics.append((key, value, intervals))
break

if len(failed_heuristics) != 0:
raise GraphHeuristicsError(failed_heuristics)

model_config = ConfigDict(extra='forbid')

def validate_graph_from_file(self, path: str | os.PathLike):
"""
Takes in a graph produced by PRM#parse_output,
and throws a GraphHeuristicsError if it fails the heuristics in `self`.
"""
# TODO: re-use from summary.py once we have a mixed/hypergraph library
G: nx.DiGraph = nx.read_edgelist(path, data=(('Rank', str), ('Direction', str)), create_using=nx.DiGraph)
Collaborator comment: This is reading in directed edges but summary.py reads undirected edges. Those should be consistent. That is a good reason to use shared code if possible so it doesn't accidentally diverge later.
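
A shared reader, as the comment suggests, would keep the two call sites from diverging. A sketch only — the module placement and function name are hypothetical, not part of this PR:

```python
# Hypothetical shared helper (e.g., in a spras/analysis module); name is illustrative.
import os

import networkx as nx


def read_standardized_graph(path: str | os.PathLike, directed: bool = True) -> nx.Graph:
    """Read a standardized PRM edge list with Rank/Direction columns."""
    create_using = nx.DiGraph if directed else nx.Graph
    graph = nx.read_edgelist(path, data=(('Rank', str), ('Direction', str)),
                             create_using=create_using)
    if directed:
        # Mirror undirected edges so both orientations appear in the DiGraph.
        for source, target, data in list(graph.edges(data=True)):
            if data['Direction'] == 'U':
                graph.add_edge(target, source, **data)
    return graph
```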


# Snapshot the edges with `list` so that add_edge does not mutate
# the iterator while we loop over it.
for source, target, data in list(G.edges(data=True)):
    if data["Direction"] == 'U':
        # Mirror undirected edges so both orientations are present.
        G.add_edge(target, source, **data)

return self.validate_graph(G)
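
For orientation, here is how the class might be exercised directly. The `Interval(lower, upper)` constructor below is an assumption — `spras/interval.py` is not shown in this diff; only its `mem` method is visible above:

```python
import networkx as nx

from spras.config.heuristics import GraphHeuristics, GraphHeuristicsError
from spras.interval import Interval

# Interval(lower, upper) is a hypothetical constructor; check spras/interval.py.
heuristics = GraphHeuristics(
    number_of_nodes=Interval(1, 500),
    # With a list, validate_graph requires the value to fall inside every
    # listed interval (it records a failure on the first interval missed).
    density=[Interval(0.0, 0.5)],
)

graph = nx.DiGraph([('A', 'B'), ('B', 'C')])
try:
    heuristics.validate_graph(graph)
except GraphHeuristicsError as error:
    for name, value, intervals in error.failed_heuristics:
        print(f'{name} = {value}, expected {intervals}')
```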
3 changes: 3 additions & 0 deletions spras/config/schema.py
@@ -16,6 +16,7 @@
from pydantic import AfterValidator, BaseModel, ConfigDict

from spras.config.container_schema import ContainerSettings
from spras.config.heuristics import GraphHeuristics
from spras.config.util import CaseInsensitiveEnum

# Most options here have an `include` property,
@@ -151,6 +152,8 @@ class RawConfig(BaseModel):

reconstruction_settings: ReconstructionSettings

heuristics: GraphHeuristics = GraphHeuristics()

# We include use_attribute_docstrings here to preserve the docstrings
# after attributes at runtime (for future JSON schema generation)
model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)
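
Because `heuristics` defaults to `GraphHeuristics()`, the block is optional in a user's config. A sketch of how a raw config fragment would validate — the `{'lower': ..., 'upper': ...}` wire format for `Interval` is an assumption, since `spras/interval.py` is not in this diff:

```python
from spras.config.heuristics import GraphHeuristics

# The interval wire format below is assumed; see spras/interval.py for the real one.
heuristics = GraphHeuristics.model_validate({
    'number_of_nodes': {'lower': 1, 'upper': 500},
    'density': [{'lower': 0.0, 'upper': 0.5}],
})

# Fields left unset default to [] and are filtered out of the compute_statistics
# call inside validate_graph, so only configured heuristics are checked.
print(heuristics.max_degree)  # -> []
```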