Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
c83370b
new updates
cgsze Nov 11, 2024
bdb818f
Merge branch 'update-summary' of https://github.com/cgsze/spras into …
cgsze Nov 11, 2024
320d399
pre-commit
cgsze Nov 11, 2024
753035a
updated filepath indexing
cgsze Dec 2, 2024
ed6e4ca
edit test cases
cgsze Dec 2, 2024
c4c65e9
Updated input/output test files
cgsze Jan 13, 2025
4893f86
node tables into edge/node files from yaml
cgsze Jan 22, 2025
400df3b
init from file
cgsze Jan 22, 2025
a75cb4f
dataset_dict
cgsze Feb 17, 2025
aadb0f8
started loading test case
cgsze Feb 17, 2025
83cacd8
resolved mismatched param combos
cgsze Feb 17, 2025
602145c
latest changes
cgsze Feb 18, 2025
a1a189b
resolved 3 AttributeErrors
cgsze Mar 3, 2025
4f365f6
resolved TypeError, set all but summary to false
cgsze Mar 17, 2025
a6a7f48
debugging summarize_networks returning empty df -> AssertionError
cgsze Mar 31, 2025
3a1c0c6
fixed AssertionErrors
cgsze Apr 1, 2025
d0b0b6c
load dataset test case
cgsze Apr 4, 2025
9df48cd
pytest fixes
cgsze Apr 7, 2025
6b7092f
Merge branch 'Reed-CompBio:master' into update-summary
cgsze Apr 7, 2025
a7187e0
revert file path changes
cgsze Apr 7, 2025
ac99e5e
load dataset_dict test case
cgsze Apr 21, 2025
1823788
added dummy node to test case/summarize_networks
cgsze Apr 28, 2025
fb6b1ae
output/ directory issues (to be continued)
cgsze Apr 28, 2025
0d29b43
test_summary.py test cases
cgsze May 19, 2025
a8563d1
debugging
cgsze May 19, 2025
c1fc881
set graphspace/cytoscape to true in config files
cgsze May 19, 2025
61e7b09
new changes
cgsze May 19, 2025
156a9a5
linux style paths
cgsze May 19, 2025
4696cdc
Refactor summarization code
agitter May 24, 2025
f00a42a
Fix test case paths
agitter May 24, 2025
1cf27b8
Fix test case path again
agitter May 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ rule summary_table:
run:
# Load the node table from the pickled dataset file
node_table = Dataset.from_file(input.dataset_file).node_table
summary_df = summary.summarize_networks(input.pathways, node_table)
summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params)
summary_df.to_csv(output.summary_table, sep='\t', index=False)

# Cluster the output pathways for each dataset
Expand Down
54 changes: 35 additions & 19 deletions spras/analysis/summary.py
Original file line number Diff line number Diff line change
@@ -1,71 +1,87 @@
import os
import sys
from pathlib import Path
from typing import Iterable

import networkx as nx
import pandas as pd


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
                       algo_with_params: list) -> pd.DataFrame:
    """
    Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
    in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the
    file_paths and algo_with_params inputs must match after they are each sorted.
    @param file_paths: iterable of edge list files
    @param node_table: pandas DataFrame containing node attributes
    @param algo_params: a nested dict mapping algorithm names to dicts that map parameter hashes to parameter
    combinations.
    @param algo_with_params: a list of <algorithm>-params-<params_hash> combinations
    @return: pandas DataFrame with summary information
    @raise ValueError: if file_paths and algo_with_params have different lengths
    """
    # Ensure that NODEID is the first column
    assert node_table.columns[0] == 'NODEID'
    # Initialize list to store input nodes that have property data
    nodes_by_col = []
    # Save new labels
    nodes_by_col_labs = ('Nodes in ' + node_table.columns[1:]).tolist()
    # Iterate through each node property column
    for col in node_table.columns[1:]:
        # Assumption: property columns only contain NA, boolean, numeric data
        # If the property contains numeric data, save the nodes with property values that are not NA and > 0
        # If the property contains boolean data, save the nodes with property values that are True
        nodes_by_col.append(set(node_table.loc[node_table[col] > 0, 'NODEID']))

    # Initialize list to store network summary data
    nw_info = []

    # Sort once so each file path is paired with the matching algorithm-parameter combination
    sorted_paths = sorted(file_paths)
    algo_with_params = sorted(algo_with_params)
    # Fail loudly on mismatched inputs instead of raising an opaque IndexError mid-loop
    # or silently mispairing networks with parameter combinations
    if len(sorted_paths) != len(algo_with_params):
        raise ValueError(f'file_paths has {len(sorted_paths)} entries but algo_with_params has '
                         f'{len(algo_with_params)}; they must correspond one-to-one')

    # Iterate through each network file path
    for index, file_path in enumerate(sorted_paths):
        with open(file_path, 'r') as f:
            lines = f.readlines()[1:]  # skip the header line

        # directed or mixed graphs are parsed and summarized as an undirected graph
        nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str)))

        # Save the network name, number of nodes, number edges, and number of connected components
        nw_name = str(file_path)
        number_nodes = nw.number_of_nodes()
        number_edges = nw.number_of_edges()
        ncc = nx.number_connected_components(nw)

        # Initialize list to store current network information
        cur_nw_info = [nw_name, number_nodes, number_edges, ncc]

        # Iterate through each node property and save the intersection with the current network
        for node_list in nodes_by_col:
            num_nodes = len(set(nw).intersection(node_list))
            cur_nw_info.append(num_nodes)

        # Split on the literal '-params-' separator so that algorithm names containing hyphens
        # are parsed correctly: <algorithm>-params-<params_hash>
        algo, _, hashcode = algo_with_params[index].rpartition('-params-')

        # Algorithm parameters have format { algo : { hashcode : { parameter combos } } }
        param_combo = algo_params[algo][hashcode]
        cur_nw_info.append(param_combo)

        # Save the current network information to the network summary list
        nw_info.append(cur_nw_info)

    # Prepare column names
    col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components']
    col_names.extend(nodes_by_col_labs)
    col_names.append('Parameter combination')

    # Convert the network summary data to pandas dataframe
    # Could refactor to create the dataframe line by line instead of storing data as lists and then converting
    nw_info = pd.DataFrame(
        nw_info,
        columns=col_names
    )

    return nw_info


Expand Down Expand Up @@ -129,5 +145,5 @@ def degree(g):
# save(dat, argv[2])


# if __name__ == "__main__":
# if __name__ == '__main__':
# main(sys.argv)
4 changes: 4 additions & 0 deletions test/analysis/expected_output/expected_node_table.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
NODEID prize active dummy sources targets
0 C 5.7 True NaN NaN True
1 A 2.0 True True True NaN
2 B NaN NaN NaN NaN NaN
10 changes: 10 additions & 0 deletions test/analysis/expected_output/test_egfr_summary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Name Number of nodes Number of edges Number of connected components Nodes in prize Nodes in sources Nodes in targets Nodes in active Nodes in dummy Parameter combination
test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt 48 45 3 27 0 27 27 0 {'slice_threshold': 0.3, 'module_threshold': 0.05}
test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt 1877 12845 1 621 1 620 621 1 {'local_search': 'Yes', 'max_path_length': 3, 'rand_restarts': 10}
test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt 28 20 8 28 1 27 28 1 {'b': 2, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'}
test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt 39 31 8 39 1 38 39 1 {'b': 10, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'}
test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt 14 9 5 14 0 14 14 0 {'b': 0.55, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'}
test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt 593 591 2 531 1 530 531 1 {'b': 2, 'g': 3}
test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt 704 702 2 616 1 615 616 1 {'b': 4, 'g': 0}
test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt 14 17 1 6 1 5 6 1 {'k': 10}
test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt 25 32 1 11 1 10 11 1 {'k': 20}
13 changes: 13 additions & 0 deletions test/analysis/expected_output/test_example_summary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Name Number of nodes Number of edges Number of connected components Nodes in prize Nodes in active Nodes in dummy Nodes in sources Nodes in targets Parameter combination
test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 2 2 1 1 1 {'spras_placeholder': 'no parameters'}
test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt 0 0 0 0 0 0 0 0 {'slice_threshold': 0.3, 'module_threshold': 0.05}
test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 2 2 1 1 1 {'max_path_length': 3, 'local_search': 'Yes', 'rand_restarts': 10}
test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt 3 2 1 2 2 1 1 1 {'flow': 1, 'capacity': 1}
test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt 3 2 1 2 2 1 1 1 {'b': 6, 'w': 5.0, 'd': 10, 'dummy_mode': 'file'}
test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt 0 0 0 0 0 0 0 0 {'b': 6, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'}
test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt 3 2 1 2 2 1 1 1 {'b': 5, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'}
test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt 0 0 0 0 0 0 0 0 {'b': 5, 'w': 5.0, 'd': 10, 'dummy_mode': 'file'}
test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0 0 0 0 0 {'b': 2, 'g': 3}
test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 2 2 1 1 1 {'b': 4, 'g': 0}
test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt 3 2 1 2 2 1 1 1 {'k': 200}
test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt 3 2 1 2 2 1 1 1 {'k': 100}
137 changes: 137 additions & 0 deletions test/analysis/input/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# The length of the hash used to identify a parameter combination
hash_length: 7

# Specify the container framework. Currently supported frameworks are 'docker' and
# 'singularity'. If container_framework is not specified, SPRAS will default to docker.
container_framework: docker

# Only used if container_framework is set to singularity, this will unpack the singularity containers
# to the local filesystem. This is useful when PRM containers need to run inside another container,
# such as would be the case in an HTCondor/OSPool environment.
# NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way
# that persists after the workflow is complete. To clean up the unpacked containers, the user must
# manually delete them.
unpack_singularity: false

# Allow the user to configure which container registry containers should be pulled from
# Note that this assumes container names are consistent across registries, and that the
# registry being passed doesn't require authentication for pull actions
container_registry:
base_url: docker.io
# The owner or project of the registry
# For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs
owner: reedcompbio

algorithms:
- name: "pathlinker"
params:
include: true
run1:
k: range(100,201,100)

- name: "omicsintegrator1"
params:
include: true
run1:
b: [5, 6]
w: np.linspace(0,5,2)
d: [10]
dummy_mode: ["file"]

- name: "omicsintegrator2"
params:
include: true
run1:
b: [4]
g: [0]
run2:
b: [2]
g: [3]

- name: "meo"
params:
include: true
run1:
max_path_length: [3]
local_search: ["Yes"]
rand_restarts: [10]

- name: "mincostflow"
params:
include: true
run1:
flow: [1] # The flow must be an int
capacity: [1]

- name: "allpairs"
params:
include: true

- name: "domino"
params:
include: true
run1:
slice_threshold: [0.3]
module_threshold: [0.05]


# Here we specify which pathways to run and other file location information.
# DataLoader.py can currently only load a single dataset
# Assume that if a dataset label does not change, the lists of associated input files do not change
datasets:
-
# Labels can only contain letters, numbers, or underscores
label: data0
node_files: ["node-prizes.txt", "sources.txt", "targets.txt"]
# DataLoader.py can currently only load a single edge file, which is the primary network
edge_files: ["network.txt"]
# Placeholder
other_files: []
# Relative path from the spras directory
data_dir: "input"

gold_standards:
-
# Labels can only contain letters, numbers, or underscores
label: gs0
node_files: ["gs_nodes0.txt"]
data_dir: "input"
# List of dataset labels to compare with the specific gold standard dataset
dataset_labels: ["data0"]

# If we want to reconstruct then we should set run to true.
reconstruction_settings:
#set where everything is saved
locations:
#place the save path here
reconstruction_dir: "output"
run: true

analysis:
# Create one summary per pathway file and a single summary table for all pathways for each dataset
summary:
include: true
# Create output files for each pathway that can be visualized with GraphSpace
graphspace:
include: true
# Create Cytoscape session file with all pathway graphs for each dataset
cytoscape:
include: true
# Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
ml:
# ml analysis per dataset
include: false
# adds ml analysis per algorithm output
# only runs for algorithms with multiple parameter combinations chosen
aggregate_per_algorithm: true
# specify how many principal components to calculate
components: 2
# boolean to show the labels on the pca graph
labels: true
# 'ward', 'complete', 'average', 'single'
# if linkage: ward, must use metric: euclidean
linkage: 'ward'
# 'euclidean', 'manhattan', 'cosine'
metric: 'euclidean'
evaluation:
include: false
Loading
Loading