3 changes: 3 additions & 0 deletions Snakefile
@@ -295,6 +295,9 @@ rule parse_output:
params = reconstruction_params(wildcards.algorithm, wildcards.params).copy()
params['dataset'] = input.dataset_file
runner.parse_output(wildcards.algorithm, input.raw_file, output.standardized_file, params)
# TODO: cache heuristics result, store partial heuristics configuration file
# to allow this rule to update when heuristics change
_config.config.heuristics.validate_graph_from_file(output.standardized_file)

# TODO: reuse in the future once we make summary work for mixed graphs. See https://github.com/Reed-CompBio/spras/issues/128
# Collect summary statistics for a single pathway
44 changes: 6 additions & 38 deletions spras/analysis/summary.py
@@ -1,10 +1,11 @@
from pathlib import Path
from statistics import median
from typing import Iterable

import networkx as nx
import pandas as pd

from spras.statistics import compute_statistics, statistics_options


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
algo_with_params: list) -> pd.DataFrame:
@@ -47,44 +48,11 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg

# Save the network name, number of nodes, number edges, and number of connected components
nw_name = str(file_path)
number_nodes = nw.number_of_nodes()
Collaborator comment: Are these the same changes from #411, and will any new design from that pull request be merged in here?

number_edges = nw.number_of_edges()
ncc = nx.number_connected_components(nw)

# Save the max/median degree, average clustering coefficient, and density
if number_nodes == 0:
max_degree = 0
median_degree = 0.0
density = 0.0
else:
degrees = [deg for _, deg in nw.degree()]
max_degree = max(degrees)
median_degree = median(degrees)
density = nx.density(nw)

cc = list(nx.connected_components(nw))
# Save the max diameter
# Use diameter only for components with ≥2 nodes (singleton components have diameter 0)
diameters = [
nx.diameter(nw.subgraph(c).copy()) if len(c) > 1 else 0
for c in cc
]
max_diameter = max(diameters, default=0)

# Save the average path lengths
# Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0)
avg_path_lengths = [
nx.average_shortest_path_length(nw.subgraph(c).copy()) if len(c) > 1 else 0.0
for c in cc
]

if len(avg_path_lengths) != 0:
avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths)
else:
avg_path_len = 0.0

graph_statistics = compute_statistics(nw, statistics_options)

# Initialize list to store current network information
cur_nw_info = [nw_name, number_nodes, number_edges, ncc, density, max_degree, median_degree, max_diameter, avg_path_len]
cur_nw_info = [nw_name, *graph_statistics.values()]

# Iterate through each node property and save the intersection with the current network
for node_list in nodes_by_col:
@@ -104,7 +72,7 @@
nw_info.append(cur_nw_info)

# Prepare column names
col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components', 'Density', 'Max degree', 'Median degree', 'Max diameter', 'Average path length']
col_names = ['Name', *statistics_options]
col_names.extend(nodes_by_col_labs)
col_names.append('Parameter combination')
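
The `spras.statistics` module itself is not part of this diff. From the call sites — `compute_statistics(nw, statistics_options)` here and a filtered subset in `heuristics.py` below — it appears to return a dict keyed by statistic name whose ordering matches `statistics_options`, since the summary row is built with `*graph_statistics.values()`. A sketch of what the module might look like, reconstructed from the deleted `summary.py` code above (an assumption, not the PR's actual implementation):

```python
# Hypothetical sketch of spras/statistics.py, reconstructed from the deleted
# summary.py code; not the PR's actual module.
from statistics import median

import networkx as nx

# Order matters: summary.py builds its header as ['Name', *statistics_options]
# and each row as [nw_name, *graph_statistics.values()].
statistics_options = [
    'Number of nodes',
    'Number of edges',
    'Number of connected components',
    'Density',
    'Max degree',
    'Median degree',
    'Max diameter',
    'Average path length',
]


def compute_statistics(graph: nx.Graph, options: list[str]) -> dict[str, int | float]:
    """Compute the requested statistics, keyed and ordered per statistics_options."""
    degrees = [deg for _, deg in graph.degree()]
    components = list(nx.connected_components(graph))
    calculators = {
        'Number of nodes': lambda: graph.number_of_nodes(),
        'Number of edges': lambda: graph.number_of_edges(),
        'Number of connected components': lambda: len(components),
        'Density': lambda: nx.density(graph) if degrees else 0.0,
        'Max degree': lambda: max(degrees, default=0),
        'Median degree': lambda: median(degrees) if degrees else 0.0,
        # Diameter and path length are undefined for singleton components; use 0.
        'Max diameter': lambda: max(
            (nx.diameter(graph.subgraph(c).copy()) for c in components if len(c) > 1),
            default=0,
        ),
        'Average path length': lambda: (
            sum(
                nx.average_shortest_path_length(graph.subgraph(c).copy()) if len(c) > 1 else 0.0
                for c in components
            ) / len(components)
            if components
            else 0.0
        ),
    }
    return {name: calculators[name]() for name in statistics_options if name in options}
```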

2 changes: 2 additions & 0 deletions spras/config/config.py
@@ -78,6 +78,8 @@ def __init__(self, raw_config: dict[str, Any]):
self.container_settings = ProcessedContainerSettings.from_container_settings(parsed_raw_config.containers, self.hash_length)
# The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key.
self.algorithms = None
# The heuristic handler
self.heuristics = parsed_raw_config.heuristics
# A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations.
# Only includes algorithms that are set to be run with 'include: true'.
self.algorithm_params = None
105 changes: 105 additions & 0 deletions spras/config/heuristics.py
@@ -0,0 +1,105 @@
import os

import networkx as nx
from pydantic import BaseModel, ConfigDict

from spras.interval import Interval
from spras.statistics import compute_statistics, statistics_options

__all__ = ['GraphHeuristicsError', 'GraphHeuristics']

class GraphHeuristicsError(RuntimeError):
"""
Represents an error arising from a graph algorithm output
not meeting the necessary graph heuristics.
"""
failed_heuristics: list[tuple[str, float | int, list[Interval]]]

@staticmethod
def format_failed_heuristic(heuristic: tuple[str, float | int, list[Interval]]) -> str:
name, desired, intervals = heuristic
if len(intervals) == 1:
interval_string = str(intervals[0])
else:
formatted_intervals = ", ".join([str(interval) for interval in intervals])
interval_string = f"one of the intervals ({formatted_intervals})"
return f"{name} expected {desired} in interval {interval_string}"
Collaborator comment on lines +25 to +26: This text doesn't quite match up. You could get "in interval one of the intervals..."
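
One way to make the wording compose, sketched here as a suggestion rather than taken from the PR, is to fold the word "interval" into each branch:

```python
@staticmethod
def format_failed_heuristic(heuristic: tuple[str, float | int, list[Interval]]) -> str:
    name, desired, intervals = heuristic
    if len(intervals) == 1:
        interval_string = f"interval {intervals[0]}"
    else:
        formatted_intervals = ", ".join(str(interval) for interval in intervals)
        interval_string = f"one of the intervals ({formatted_intervals})"
    # Reads "expected X in interval [a, b]" or "expected X in one of the intervals (...)".
    return f"{name} expected {desired} in {interval_string}"
```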

@staticmethod
def to_string(failed_heuristics: list[tuple[str, float | int, list[Interval]]]):
formatted_heuristics = [
GraphHeuristicsError.format_failed_heuristic(heuristic) for heuristic in failed_heuristics
]

formatted_heuristics = "\n".join([f"- {heuristic}" for heuristic in formatted_heuristics])
Collaborator comment: Should we use a different character besides - (like *) for the list? I'm trying to imagine whether we could ever have a leading negative here in formatted_heuristics that would be confusing.

return f"The following heuristics failed:\n{formatted_heuristics}"

def __init__(self, failed_heuristics: list[tuple[str, float | int, list[Interval]]]):
super().__init__(GraphHeuristicsError.to_string(failed_heuristics))

self.failed_heuristics = failed_heuristics

def __str__(self) -> str:
return GraphHeuristicsError.to_string(self.failed_heuristics)

class GraphHeuristics(BaseModel):
number_of_nodes: Interval | list[Interval] = []
number_of_edges: Interval | list[Interval] = []
number_of_connected_components: Interval | list[Interval] = []
density: Interval | list[Interval] = []

max_degree: Interval | list[Interval] = []
median_degree: Interval | list[Interval] = []
max_diameter: Interval | list[Interval] = []
average_path_length: Interval | list[Interval] = []

def validate_graph(self, graph: nx.DiGraph):
statistics_dictionary = {
'Number of nodes': self.number_of_nodes,
'Number of edges': self.number_of_edges,
'Number of connected components': self.number_of_connected_components,
'Density': self.density,
'Max degree': self.max_degree,
'Median degree': self.median_degree,
'Max diameter': self.max_diameter,
'Average path length': self.average_path_length
}

# quick assert: is statistics_dictionary exhaustive?
assert set(statistics_dictionary.keys()) == set(statistics_options)

stats = compute_statistics(
graph,
list(k for k, v in statistics_dictionary.items() if not isinstance(v, list) or len(v) != 0)
)

failed_heuristics: list[tuple[str, float | int, list[Interval]]] = []
for key, value in stats.items():
intervals = statistics_dictionary[key]
if not isinstance(intervals, list):
    intervals = [intervals]

for interval in intervals:
if not interval.mem(value):
failed_heuristics.append((key, value, intervals))
break

if len(failed_heuristics) != 0:
raise GraphHeuristicsError(failed_heuristics)

model_config = ConfigDict(extra='forbid')

def validate_graph_from_file(self, path: str | os.PathLike):
"""
Takes in a graph produced by PRM#parse_output,
and throws a GraphHeuristicsError if it fails the heuristics in `self`.
"""
# TODO: re-use from summary.py once we have a mixed/hypergraph library
G: nx.DiGraph = nx.read_edgelist(path, data=(('Rank', str), ('Direction', str)), create_using=nx.DiGraph)
Collaborator comment: This is reading in directed edges but summary.py reads undirected edges. Those should be consistent. That is a good reason to use shared code if possible so it doesn't accidentally diverge later.
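
A shared reader, as the comment suggests, would keep the two call sites from diverging. A sketch only — the module placement and function name are hypothetical, not part of this PR:

```python
# Hypothetical shared helper (e.g., in a spras/analysis module); name is illustrative.
import os

import networkx as nx


def read_standardized_graph(path: str | os.PathLike, directed: bool = True) -> nx.Graph:
    """Read a standardized PRM edge list with Rank/Direction columns."""
    create_using = nx.DiGraph if directed else nx.Graph
    graph = nx.read_edgelist(path, data=(('Rank', str), ('Direction', str)),
                             create_using=create_using)
    if directed:
        # Mirror undirected edges so both orientations appear in the DiGraph.
        for source, target, data in list(graph.edges(data=True)):
            if data['Direction'] == 'U':
                graph.add_edge(target, source, **data)
    return graph
```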


# Snapshot the edges with `list` so that add_edge does not mutate
# the iterator while we loop over it.
for source, target, data in list(G.edges(data=True)):
    if data["Direction"] == 'U':
        # Mirror undirected edges so both orientations are present.
        G.add_edge(target, source, **data)

return self.validate_graph(G)
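
For orientation, here is how the class might be exercised directly. The `Interval(lower, upper)` constructor below is an assumption — `spras/interval.py` is not shown in this diff; only its `mem` method is visible above:

```python
import networkx as nx

from spras.config.heuristics import GraphHeuristics, GraphHeuristicsError
from spras.interval import Interval

# Interval(lower, upper) is a hypothetical constructor; check spras/interval.py.
heuristics = GraphHeuristics(
    number_of_nodes=Interval(1, 500),
    # With a list, validate_graph requires the value to fall inside every
    # listed interval (it records a failure on the first interval missed).
    density=[Interval(0.0, 0.5)],
)

graph = nx.DiGraph([('A', 'B'), ('B', 'C')])
try:
    heuristics.validate_graph(graph)
except GraphHeuristicsError as error:
    for name, value, intervals in error.failed_heuristics:
        print(f'{name} = {value}, expected {intervals}')
```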
3 changes: 3 additions & 0 deletions spras/config/schema.py
@@ -16,6 +16,7 @@
from pydantic import AfterValidator, BaseModel, ConfigDict

from spras.config.container_schema import ContainerSettings
from spras.config.heuristics import GraphHeuristics
from spras.config.util import CaseInsensitiveEnum

# Most options here have an `include` property,
@@ -151,6 +152,8 @@ class RawConfig(BaseModel):

reconstruction_settings: ReconstructionSettings

heuristics: GraphHeuristics = GraphHeuristics()

# We include use_attribute_docstrings here to preserve the docstrings
# after attributes at runtime (for future JSON schema generation)
model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)
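
Because `heuristics` defaults to `GraphHeuristics()`, the block is optional in a user's config. A sketch of how a raw config fragment would validate — the `{'lower': ..., 'upper': ...}` wire format for `Interval` is an assumption, since `spras/interval.py` is not in this diff:

```python
from spras.config.heuristics import GraphHeuristics

# The interval wire format below is assumed; see spras/interval.py for the real one.
heuristics = GraphHeuristics.model_validate({
    'number_of_nodes': {'lower': 1, 'upper': 500},
    'density': [{'lower': 0.0, 'upper': 0.5}],
})

# Fields left unset default to [] and are filtered out of the compute_statistics
# call inside validate_graph, so only configured heuristics are checked.
print(heuristics.max_degree)  # -> []
```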