44 changes: 6 additions & 38 deletions spras/analysis/summary.py
@@ -1,10 +1,11 @@
from pathlib import Path
from statistics import median
from typing import Iterable

import networkx as nx
import pandas as pd

from spras.statistics import compute_statistics, statistics_options


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
algo_with_params: list) -> pd.DataFrame:
@@ -47,44 +48,11 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg

# Save the network name, number of nodes, number edges, and number of connected components
nw_name = str(file_path)
number_nodes = nw.number_of_nodes()
number_edges = nw.number_of_edges()
ncc = nx.number_connected_components(nw)

# Save the max/median degree, average clustering coefficient, and density
if number_nodes == 0:
max_degree = 0
median_degree = 0.0
density = 0.0
else:
degrees = [deg for _, deg in nw.degree()]
max_degree = max(degrees)
median_degree = median(degrees)
density = nx.density(nw)

cc = list(nx.connected_components(nw))
# Save the max diameter
# Use diameter only for components with ≥2 nodes (singleton components have diameter 0)
diameters = [
nx.diameter(nw.subgraph(c).copy()) if len(c) > 1 else 0
for c in cc
]
max_diameter = max(diameters, default=0)

# Save the average path lengths
# Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0)
avg_path_lengths = [
nx.average_shortest_path_length(nw.subgraph(c).copy()) if len(c) > 1 else 0.0
for c in cc
]

if len(avg_path_lengths) != 0:
avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths)
else:
avg_path_len = 0.0

graph_statistics = compute_statistics(nw, statistics_options)

# Initialize list to store current network information
cur_nw_info = [nw_name, number_nodes, number_edges, ncc, density, max_degree, median_degree, max_diameter, avg_path_len]
cur_nw_info = [nw_name, *graph_statistics.values()]

# Iterate through each node property and save the intersection with the current network
for node_list in nodes_by_col:
@@ -104,7 +72,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
nw_info.append(cur_nw_info)

# Prepare column names
col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components', 'Density', 'Max degree', 'Median degree', 'Max diameter', 'Average path length']
col_names = ['Name', *statistics_options]
col_names.extend(nodes_by_col_labs)
col_names.append('Parameter combination')

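To make the refactor concrete, here is a minimal sketch of how the new row assembly lines up with the column names; the toy graph and the 'toy_network' label are illustrative, while compute_statistics and statistics_options come from the new spras/statistics.py shown below.

import networkx as nx

from spras.statistics import compute_statistics, statistics_options

# Illustrative three-node network standing in for a reconstructed pathway output.
nw = nx.DiGraph()
nw.add_edges_from([("A", "B"), ("B", "C")])

# compute_statistics returns a dict keyed by the requested statistic names, in the
# requested order, so the row values and the header line up one-to-one.
stats = compute_statistics(nw, statistics_options)
row = ["toy_network", *stats.values()]
header = ["Name", *statistics_options]
assert len(row) == len(header)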
91 changes: 91 additions & 0 deletions spras/statistics.py
@@ -0,0 +1,91 @@
"""
Graph statistics, used to power summary.py.

Any subset of the supported statistics can be computed for a graph; statistics that
share work are grouped, so a group may compute more values than were requested. See
the top-level `statistics_computation` dictionary for usage.
"""

import itertools
from statistics import median
from typing import Callable

import networkx as nx


def compute_degree(graph: nx.DiGraph) -> tuple[int, float]:
"""
Computes the (max, median) degree of a `graph`.
"""
# number_of_nodes is a cheap call
if graph.number_of_nodes() == 0:
return (0, 0.0)
else:
degrees = [deg for _, deg in graph.degree()]
return max(degrees), median(degrees)

def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
    """
    Computes the (max diameter, average shortest path length) over the connected
    components of `directed_graph`, treating the graph as undirected.
    """
    graph: nx.Graph = directed_graph.to_undirected()
cc = list(nx.connected_components(graph))
# Save the max diameter
# Use diameter only for components with ≥2 nodes (singleton components have diameter 0)
diameters = [
nx.diameter(graph.subgraph(c).copy()) if len(c) > 1 else 0
for c in cc
]
max_diameter = max(diameters, default=0)

# Save the average path lengths
# Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0)
avg_path_lengths = [
nx.average_shortest_path_length(graph.subgraph(c).copy()) if len(c) > 1 else 0.0
for c in cc
]

if len(avg_path_lengths) != 0:
avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths)
else:
avg_path_len = 0.0

return max_diameter, avg_path_len

# Ideally the type would express that a key of n statistic names maps to a callable
# returning an n-tuple; Python's typing cannot state that, so tuple lengths are
# checked at runtime in compute_statistics.
statistics_computation: dict[tuple[str, ...], Callable[[nx.DiGraph], tuple[float | int, ...]]] = {
    ('Number of nodes',): lambda graph: (graph.number_of_nodes(),),
    ('Number of edges',): lambda graph: (graph.number_of_edges(),),
    ('Number of connected components',): lambda graph: (nx.number_connected_components(graph.to_undirected()),),
    ('Density',): lambda graph: (nx.density(graph),),

('Max degree', 'Median degree'): compute_degree,
('Max diameter', 'Average path length'): compute_on_cc,
}

# All of the keys inside statistics_computation, flattened.
statistics_options: list[str] = list(itertools.chain.from_iterable(statistics_computation))

def compute_statistics(graph: nx.DiGraph, statistics: list[str]) -> dict[str, float | int]:
"""
Computes `statistics` for a graph corresponding to the top-level `statistics` dictionary
in this file.
"""

    # Validate the requested statistics up front so that unknown names fail as early
    # as possible.
    for stat in statistics:
        if stat not in statistics_options:
            raise RuntimeError(f"Statistic {stat} is not a computable statistic! Available statistics: {statistics_options}")

    # Compute only the groups that overlap with the requested statistics; a group may
    # produce more values than were asked for, and the extras are dropped when the
    # return value is assembled below.
    computed_statistics: dict[str, float | int] = dict()
    for statistic_tuple, compute in statistics_computation.items():
        if not set(statistic_tuple).isdisjoint(set(statistics)):
computed_tuple = compute(graph)
assert len(statistic_tuple) == len(computed_tuple), f"bad tuple length for {statistic_tuple}"

current_computed_statistics = zip(statistic_tuple, computed_tuple, strict=True)
for stat, value in current_computed_statistics:
computed_statistics[stat] = value

    # Return only the statistics that were requested, in the requested order.
return {key: computed_statistics[key] for key in statistics}
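Because the computation is dictionary-driven, adding a statistic amounts to registering one more key/callable pair. A hypothetical sketch follows; the 'Number of self-loops' statistic is illustrative and not part of this change, and in practice the entry would be added directly to the dictionary literal above rather than mutated at runtime.

import networkx as nx

from spras.statistics import statistics_computation, statistics_options

# Hypothetical one-value group: the key tuple names the output column(s) and the
# callable must return exactly one value per name.
statistics_computation[('Number of self-loops',)] = (
    lambda graph: (nx.number_of_selfloops(graph),)
)

# statistics_options is flattened from the dictionary keys once at import time, so a
# group registered afterwards also needs to be appended here before it can be
# requested through compute_statistics.
statistics_options.append('Number of self-loops')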