diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py index b8237ed3b..c8abc1cad 100644 --- a/spras/analysis/summary.py +++ b/spras/analysis/summary.py @@ -1,4 +1,5 @@ from pathlib import Path +from statistics import median from typing import Iterable import networkx as nx @@ -50,8 +51,40 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg number_edges = nw.number_of_edges() ncc = nx.number_connected_components(nw) + # Save the max degree, median degree, and density + if number_nodes == 0: + max_degree = 0 + median_degree = 0.0 + density = 0.0 + else: + degrees = [deg for _, deg in nw.degree()] + max_degree = max(degrees) + median_degree = median(degrees) + density = nx.density(nw) + + cc = list(nx.connected_components(nw)) + # Save the max diameter + # Use diameter only for components with ≥2 nodes (singleton components have diameter 0) + diameters = [ + nx.diameter(nw.subgraph(c).copy()) if len(c) > 1 else 0 + for c in cc + ] + max_diameter = max(diameters, default=0) + + # Save the average path lengths + # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0) + avg_path_lengths = [ + nx.average_shortest_path_length(nw.subgraph(c).copy()) if len(c) > 1 else 0.0 + for c in cc + ] + + if len(avg_path_lengths) != 0: + avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths) + else: + avg_path_len = 0.0 + # Initialize list to store current network information - cur_nw_info = [nw_name, number_nodes, number_edges, ncc] + cur_nw_info = [nw_name, number_nodes, number_edges, ncc, density, max_degree, median_degree, max_diameter, avg_path_len] # Iterate through each node property and save the intersection with the current network for node_list in nodes_by_col: @@ -71,7 +104,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg nw_info.append(cur_nw_info) # Prepare column names - col_names = ['Name', 'Number of 
nodes', 'Number of edges', 'Number of connected components'] + col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components', 'Density', 'Max degree', 'Median degree', 'Max diameter', 'Average path length'] col_names.extend(nodes_by_col_labs) col_names.append('Parameter combination') diff --git a/test/analysis/expected_output/expected_egfr_summary.txt b/test/analysis/expected_output/expected_egfr_summary.txt new file mode 100644 index 000000000..0b4fe9ebd --- /dev/null +++ b/test/analysis/expected_output/expected_egfr_summary.txt @@ -0,0 +1,10 @@ +Name Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in sources Nodes in targets Nodes in active Nodes in dummy Parameter combination +test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt 48 45 3 0.0398936170212766 5 2.0 16 3.882808476926124 27 0 27 27 0 {'slice_threshold': 0.3, 'module_threshold': 0.05} +test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt 1877 12845 1 0.007295700506524384 469 6.0 6 2.7973618474338107 621 1 620 621 1 {'local_search': 'Yes', 'max_path_length': 3, 'rand_restarts': 10} +test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt 28 20 8 0.05291005291005291 4 1.0 5 1.306439393939394 28 1 27 28 1 {'b': 2, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'} +test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt 39 31 8 0.04183535762483131 6 1.0 5 1.5084498834498834 39 1 38 39 1 {'b': 10, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'} +test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt 14 9 5 0.0989010989010989 4 1.0 2 1.1866666666666668 14 0 14 14 0 {'b': 0.55, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'} 
+test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt 593 591 2 0.0033669841848593955 32 1.0 30 6.72248989073389 531 1 530 531 1 {'b': 2, 'g': 3} +test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt 704 702 2 0.002836867968446916 35 1.0 24 6.038766691954387 616 1 615 616 1 {'b': 4, 'g': 0} +test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt 14 17 1 0.18681318681318682 6 2.0 7 2.857142857142857 6 1 5 6 1 {'k': 10} +test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt 25 32 1 0.10666666666666667 8 2.0 7 3.486666666666667 11 1 10 11 1 {'k': 20} diff --git a/test/analysis/expected_output/expected_example_summary.txt b/test/analysis/expected_output/expected_example_summary.txt new file mode 100644 index 000000000..4cb5b8c8f --- /dev/null +++ b/test/analysis/expected_output/expected_example_summary.txt @@ -0,0 +1,13 @@ +Name Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in active Nodes in dummy Nodes in sources Nodes in targets Parameter combination +test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'spras_placeholder': 'no parameters'} +test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'slice_threshold': 0.3, 'module_threshold': 0.05} +test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'max_path_length': 3, 'local_search': 'Yes', 'rand_restarts': 10} +test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'flow': 1, 'capacity': 1} +test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'b': 6, 'w': 5.0, 'd': 
10, 'dummy_mode': 'file'} +test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'b': 6, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'} +test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'b': 5, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'} +test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'b': 5, 'w': 5.0, 'd': 10, 'dummy_mode': 'file'} +test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'b': 2, 'g': 3} +test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'b': 4, 'g': 0} +test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 200} +test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 100} diff --git a/test/analysis/expected_output/test_egfr_summary.txt b/test/analysis/expected_output/test_egfr_summary.txt deleted file mode 100644 index 4e8e3ac95..000000000 --- a/test/analysis/expected_output/test_egfr_summary.txt +++ /dev/null @@ -1,10 +0,0 @@ -Name Number of nodes Number of edges Number of connected components Nodes in prize Nodes in sources Nodes in targets Nodes in active Nodes in dummy Parameter combination -test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt 48 45 3 27 0 27 27 0 {'slice_threshold': 0.3, 'module_threshold': 0.05} -test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt 1877 12845 1 621 1 620 621 1 {'local_search': 'Yes', 'max_path_length': 3, 'rand_restarts': 10} -test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt 28 20 8 28 1 27 28 1 {'b': 2, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 
0.1, 'mu': 0.008, 'dummy_mode': 'file'} -test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt 39 31 8 39 1 38 39 1 {'b': 10, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'} -test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt 14 9 5 14 0 14 14 0 {'b': 0.55, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'} -test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt 593 591 2 531 1 530 531 1 {'b': 2, 'g': 3} -test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt 704 702 2 616 1 615 616 1 {'b': 4, 'g': 0} -test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt 14 17 1 6 1 5 6 1 {'k': 10} -test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt 25 32 1 11 1 10 11 1 {'k': 20} diff --git a/test/analysis/expected_output/test_example_summary.txt b/test/analysis/expected_output/test_example_summary.txt deleted file mode 100644 index 2d35023ef..000000000 --- a/test/analysis/expected_output/test_example_summary.txt +++ /dev/null @@ -1,13 +0,0 @@ -Name Number of nodes Number of edges Number of connected components Nodes in prize Nodes in active Nodes in dummy Nodes in sources Nodes in targets Parameter combination -test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 2 2 1 1 1 {'spras_placeholder': 'no parameters'} -test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt 0 0 0 0 0 0 0 0 {'slice_threshold': 0.3, 'module_threshold': 0.05} -test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 2 2 1 1 1 {'max_path_length': 3, 'local_search': 'Yes', 'rand_restarts': 10} -test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt 3 2 1 2 2 1 1 1 {'flow': 1, 'capacity': 1} -test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt 3 2 1 2 2 1 1 1 {'b': 6, 'w': 5.0, 'd': 10, 'dummy_mode': 'file'} 
-test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt 0 0 0 0 0 0 0 0 {'b': 6, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'} -test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt 3 2 1 2 2 1 1 1 {'b': 5, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'} -test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt 0 0 0 0 0 0 0 0 {'b': 5, 'w': 5.0, 'd': 10, 'dummy_mode': 'file'} -test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0 0 0 0 0 {'b': 2, 'g': 3} -test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 2 2 1 1 1 {'b': 4, 'g': 0} -test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt 3 2 1 2 2 1 1 1 {'k': 200} -test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt 3 2 1 2 2 1 1 1 {'k': 100} diff --git a/test/analysis/test_summary.py b/test/analysis/test_summary.py index 0400d1f1b..57f1f6012 100644 --- a/test/analysis/test_summary.py +++ b/test/analysis/test_summary.py @@ -51,7 +51,7 @@ def test_example_networks(self): # Comparing the dataframes directly with equals does not match because of how the parameter # combinations column is loaded from disk. Therefore, write both to disk and compare the files. - assert filecmp.cmp(out_path, EXPECT_DIR + "test_example_summary.txt", shallow=False) + assert filecmp.cmp(out_path, EXPECT_DIR + "expected_example_summary.txt", shallow=False) def test_egfr_networks(self): """Test data from EGFR workflow""" @@ -80,7 +80,7 @@ def test_egfr_networks(self): # Comparing the dataframes directly with equals does not match because of how the parameter # combinations column is loaded from disk. Therefore, write both to disk and compare the files. 
- assert filecmp.cmp(out_path, EXPECT_DIR + "test_egfr_summary.txt", shallow=False) + assert filecmp.cmp(out_path, EXPECT_DIR + "expected_egfr_summary.txt", shallow=False) def test_load_dataset_dict(self): """Test loading files from dataset_dict"""