From 5d495da0d05c7230177b1c0781e15e783d14b11b Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 30 Oct 2025 13:42:47 -0500 Subject: [PATCH 01/10] added edge evaluation per dataset-edge-goldstandard pair --- Snakefile | 34 ++++++---- spras/evaluation.py | 162 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 173 insertions(+), 23 deletions(-) diff --git a/Snakefile b/Snakefile index 02f019e8d..a54ea6015 100644 --- a/Snakefile +++ b/Snakefile @@ -112,9 +112,12 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + # dummy file final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}dummy-edge.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs)) - + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-edges.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-edges.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs)) + if _config.config.analysis_include_evaluation_aggregate_algo: final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms)) @@ -478,7 +481,7 @@ def collect_pca_coordinates_per_dataset(wildcards): # Run PCA chosen to select the representative from all pathway outputs for a given dataset, # then evaluate with precision and recall against the corresponding gold standard -rule evaluation_pca_chosen: +rule evaluation_pca_chosen_nodes: input: node_gold_standard_file = get_gold_standard_pickle_file, pca_coordinates_file = collect_pca_coordinates_per_dataset, @@ -492,6 +495,22 @@ rule evaluation_pca_chosen: pr_df = Evaluation.node_precision_and_recall(pca_chosen_pathway, node_table) Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.node_pca_chosen_pr_file, output.node_pca_chosen_pr_png) +rule evaluation_pca_chosen_edges: + input: + edge_gold_standard_file = get_gold_standard_pickle_file, + pca_coordinates_file = collect_pca_coordinates_per_dataset, + pathway_summary_file = collect_summary_statistics_per_dataset + output: + edge_pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-edges.txt']), + edge_pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-edges.png']), + run: + mixed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).mixed_edge_table + undirected_edge_table = 
Evaluation.from_file(input.edge_gold_standard_file).undirected_edge_table + directed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).directed_edge_table + pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, input.pathway_summary_file, out_dir) + pr_df = Evaluation.edge_precision_and_recall(pca_chosen_pathway, mixed_edge_table, directed_edge_table, undirected_edge_table) + Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.edge_pca_chosen_pr_file, output.edge_pca_chosen_pr_png, edge_evaluation=True) + # Returns pca coordinates for a specific algorithm and dataset def collect_pca_coordinates_per_algo_per_dataset(wildcards): dataset_label = get_dataset_label(wildcards) @@ -556,17 +575,6 @@ rule evaluation_per_algo_ensemble_pr_curve: node_ensembles_dict = Evaluation.edge_frequency_node_ensemble(node_table, input.ensemble_files, input.dataset_file) Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, node_table, output.node_pr_curve_png, output.node_pr_curve_file, include_aggregate_algo_eval) -rule evaluation_edge_dummy: - input: - edge_gold_standard_file = get_gold_standard_pickle_file, - output: - dummy_file = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'dummy-edge.txt']), - run: - mixed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).mixed_edge_table - undirected_edge_table = Evaluation.from_file(input.edge_gold_standard_file).undirected_edge_table - directed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).directed_edge_table - Evaluation.edge_dummy_function(mixed_edge_table, undirected_edge_table, directed_edge_table, output.dummy_file) - # Remove the output directory rule clean: shell: f'rm -rf {out_dir}' diff --git a/spras/evaluation.py b/spras/evaluation.py index 8c09e3f34..c82e5d2ed 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -179,8 +179,9 @@ def node_precision_and_recall(file_paths: Iterable[Union[str, PathLike]], node_t pr_df = pd.DataFrame(results) return pr_df + @staticmethod - def visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, title: str): + def nodes_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, title: str): """ Generates a scatter plot of precision and recall values for each pathway and saves both the plot and the data. @@ -230,11 +231,127 @@ def visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str | pr_df.drop(columns=['Algorithm'], inplace=True) pr_df.to_csv(output_file, sep='\t', index=False) + def edge_precision_and_recall(file_paths: Iterable[Union[str, PathLike]], mixed_edge_table: pd.DataFrame, directed_edge_table: pd.DataFrame, undirected_edge_table: pd.DataFrame) -> pd.DataFrame: + """ + Computes edge-level precision and recall for each pathway reconstruction output file against three edge gold standard tables. + + This function takes a list of file paths corresponding to pathway reconstruction algorithm outputs, + each formatted as a tab-separated file with columns 'Node1', 'Node2', 'Rank', and 'Direction'. + It compares the set of predicted edges to the three provided gold standard edge tables and computes precision and recall per file. 
+ + @param file_paths: list of file paths of pathway reconstruction algorithm outputs + @param mixed_edge_table: the gold standard edges that includes directed and undirected edges + @param directed_edge_table: the gold standard edges that only includes directed edges + @param undirected_edge_table: the gold standard edges that only includes undirected edges + @return: A DataFrame with the following columns: + - 'Pathway': Path object corresponding to each pathway file + - 'Precision': Precision of predicted nodes vs. gold standard nodes + - 'Recall': Recall of predicted nodes vs. gold standard nodes + - 'Gold_Standard_Type': Which gold standard was used to calculate the precision and recall + """ + + y_true_mixed = set(map(tuple, mixed_edge_table[['Interactor1', 'Interactor2', 'Direction']].values)) + y_true_directed = set(map(tuple, directed_edge_table[['Interactor1', 'Interactor2', 'Direction']].values)) + y_true_undirected = set(map(tuple, undirected_edge_table[['Interactor1', 'Interactor2', 'Direction']].values)) + + results = [] + for f in file_paths: + df = pd.read_table(f, sep='\t', header=0) + y_pred = set(map(tuple, df[['Node1', 'Node2', 'Direction']].values)) + + all_edges_mixed = y_true_mixed.union(y_pred) + y_true_mixed_binary = [1 if edge in y_true_mixed else 0 for edge in all_edges_mixed] + y_pred_mixed_binary = [1 if edge in y_pred else 0 for edge in all_edges_mixed] + # default to 0.0 if there is a divide by 0 error + # not using precision_recall_curve because thresholds are binary (0 or 1); rather we are directly + # calculating precision and recall per pathway + precision_mixed = precision_score(y_true_mixed_binary, y_pred_mixed_binary, zero_division=0.0) + recall_mixed = recall_score(y_true_mixed_binary, y_pred_mixed_binary, zero_division=0.0) + results.append({'Pathway': f, 'Precision': precision_mixed, 'Recall': recall_mixed, 'Gold_Standard_Type': "mixed"}) + + all_edges_directed = y_true_directed.union(y_pred) + y_true_directed_binary = [1 if edge in y_true_directed else 0 for edge in all_edges_directed] + y_pred_directed_binary = [1 if edge in y_pred else 0 for edge in all_edges_directed] + precision_directed = precision_score(y_true_directed_binary, y_pred_directed_binary, zero_division=0.0) + recall_directed = recall_score(y_true_directed_binary, y_pred_directed_binary, zero_division=0.0) + results.append({'Pathway': f, 'Precision': precision_directed, 'Recall': recall_directed, 'Gold_Standard_Type': "directed"}) + + all_edges_undirected = y_true_undirected.union(y_pred) + y_true_undirected_binary = [1 if edge in y_true_undirected else 0 for edge in all_edges_undirected] + y_pred_undirected_binary = [1 if edge in y_pred else 0 for edge in all_edges_undirected] + precision_undirected = precision_score(y_true_undirected_binary, y_pred_undirected_binary, zero_division=0.0) + recall_undirected = recall_score(y_true_undirected_binary, y_pred_undirected_binary, zero_division=0.0) + results.append({'Pathway': f, 'Precision': precision_undirected, 'Recall': recall_undirected, 'Gold_Standard_Type': "undirected"}) + + pr_df = pd.DataFrame(results) + return pr_df + + @staticmethod + def edges_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, title: str): + """ + Generates three scatter plot subplots showing edge precision and recall values for each pathway across three edge gold standard types, + and saves both the resulting plots and the corresponding data. 
+ + This function is intended for visualizing how different pathway reconstructions perform + (not a precision-recall curve) showing the precision and recall of each parameter combination + for each algorithm per edge gold standard dataset. + + @param pr_df: Dataframe of calculated precision and recall for each pathway file per edge gold standard. + Must include a preprocessed 'Algorithm' column and 'Gold_Standard_Type" column + @param output_file: the filename to save the precision and recall of each pathway per gold standard type + @param output_png: the filename to plot the precision and recall of each pathway (not a PRC) per gold standard type + @param title: The title to use for the plot + """ + if 'Algorithm' not in pr_df.columns: + raise ValueError( + "Column 'Algorithm' not found in DataFrame. " + "The input DataFrame must include a preprocessed 'Algorithm' column to visulize a precision and recall per pathway file per gold standard type." + ) + if 'Gold_Standard_Type' not in pr_df.columns: + raise ValueError( + "Column 'Gold_Standard_Type' not found in DataFrame. " + "The input DataFrame must include a preprocessed 'Gold_Standard_Type' column indicating the edge directionality used for the gold standard, which is required to visualize precision and recall for each pathway file per gold standard type." + ) + + + gs_types = pr_df["Gold_Standard_Type"].unique().tolist() + fig, axes = plt.subplots(1, len(gs_types), figsize=(6 * len(gs_types), 5), sharex=True, sharey=True, constrained_layout=True) + color_palette = create_palette(pr_df['Algorithm'].tolist()) + + for ax, gs_type in zip(axes, gs_types, strict=True): + df_gs_type = pr_df[pr_df["Gold_Standard_Type"] == gs_type] + for algorithm, subset in df_gs_type.groupby('Algorithm'): + if not subset.empty: + ax.plot( + subset['Recall'], + subset['Precision'], + color=color_palette[algorithm], + marker='o', + linestyle='', + label=algorithm.capitalize() + ) + ax.set_title(gs_type.capitalize()) + ax.set_xlim(-0.05, 1.05) + ax.set_ylim(-0.05, 1.05) + ax.grid(True) + + fig.supxlabel("Recall") + fig.supylabel("Precision") + fig.suptitle(title) + handles, labels = axes[0].get_legend_handles_labels() + fig.legend(handles, labels, loc="upper right") # TODO: when doing aggregate per algorithm, check if this needs to be fixed to be in a different place (issue might be constrained_layout) + plt.savefig(output_png) + plt.close(fig) + + # save dataframe + pr_df.drop(columns=['Algorithm'], inplace=True) + pr_df.to_csv(output_file, sep='\t', index=False) + @staticmethod def precision_and_recall_per_pathway(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, aggregate_per_algorithm: bool = False): """ Function for visualizing per pathway precision and recall across all algorithms. Each point in the plot represents - a single pathway reconstruction. If `aggregate_per_algorithm` is set to True, the plot is restricted to a single + a single pathway reconstruction. If `aggregate_per_algorithm` is set to True, each plot is restricted to a single algorithm and titled accordingly. 
@param pr_df: Dataframe of calculated precision and recall for each pathway file @@ -252,7 +369,7 @@ def precision_and_recall_per_pathway(pr_df: pd.DataFrame, output_file: str | Pat else: title = "Precision and Recall Plot Per Pathway Per Algorithm" - Evaluation.visualize_precision_and_recall_plot(pr_df, output_file, output_png, title) + Evaluation.nodes_visualize_precision_and_recall_plot(pr_df, output_file, output_png, title) else: # this block should never be reached — having 0 pathways implies that no algorithms or parameter combinations were run, @@ -260,18 +377,20 @@ def precision_and_recall_per_pathway(pr_df: pd.DataFrame, output_file: str | Pat raise ValueError("No pathways were provided to evaluate and visulize on. This likely means no algorithms or parameter combinations were run.") @staticmethod - def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, aggregate_per_algorithm: bool = False): + def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, aggregate_per_algorithm: bool = False, edge_evaluation: bool = False): """ Function for visualizing the precision and recall of the single parameter combination selected via PCA, either for each algorithm individually or one combination shared across all algorithms. Each point represents a pathway reconstruction corresponding to the PCA-selected parameter combination. If `aggregate_per_algorithm` - is True, the plot includes a pca chosen pathway per algorithm and titled accordingly. + is True, the plot includes a pca chosen pathway per algorithm and titled accordingly. If `edge_evaluation` is True, + the plot will include the evaluation across the three gold standard edge files. 
@param pr_df: Dataframe of calculated precision and recall for each pathway file @param output_file: the filename to save the precision and recall of each pathway @param output_png: the filename to plot the precision and recall of each pathway (not a PRC) - @param aggregate_per_algorithm: Boolean indicating if function is used per algorithm (Default False) + @param aggregate_per_algorithm: Boolean indicating if this function is used per algorithm (Default False) + @param edge_evaluation: Boolean indicating if this function is used for creating edge_evaluation plots (Default False; used for node evaluation) """ # TODO update to add in the pathways for the algorithms that do not provide a pca chosen pathway https://github.com/Reed-CompBio/spras/issues/341 @@ -279,12 +398,22 @@ def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: st pr_df['Algorithm'] = pr_df['Pathway'].apply(lambda p: Path(p).parent.name.split('-')[1]) pr_df.sort_values(by=['Recall', 'Pathway'], axis=0, ascending=True, inplace=True) - if aggregate_per_algorithm: - title = "PCA-Chosen Pathway Per Algorithm Precision and Recall Plot" + if not edge_evaluation: + if aggregate_per_algorithm: + title = "Node Evaluation PCA-Chosen Pathway Per Algorithm Precision and Recall Plot" + else: + title = "Node Evaluation PCA-Chosen Pathway Across all Algorithms Precision and Recall Plot" + + Evaluation.nodes_visualize_precision_and_recall_plot(pr_df, output_file, output_png, title) + else: - title = "PCA-Chosen Pathway Across All Algorithms Precision and Recall Plot" + if aggregate_per_algorithm : + title = "Edge Evaluation PCA-Chosen Pathway Per Algorithm Precision and Recall Plot" + else: + title = "Edge Evaluation PCA-Chosen Pathway Across all Algorithms Precision and Recall Plot" + + Evaluation.edges_visualize_precision_and_recall_plot(pr_df, output_file, output_png, title) - Evaluation.visualize_precision_and_recall_plot(pr_df, output_file, output_png, title) else: # Edge case: if all algorithms chosen use only 1 parameter combination @@ -300,6 +429,16 @@ def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: st plt.savefig(output_png) plt.close() + # TODO + # need to make a edge_precision_recall function to make the pr_df + # I think then the precision_and_recall_pca_chosen_pathway function can be reused but needs to be updated to be able to differentiate between nodes or edges + # i think I can do that with a boolean + # then I need to make a edges_visualize_precision_and_recall_plot that is called + # these can then be reused for no parameter selection evaluation + # i think I will need to make a new snakemake rule for each of the evaluatuon because the gold standards only include nodes or edges, + # sharing the same one will cause errors that one type of evalaution doesn't exist + + @staticmethod def pca_chosen_pathway(coordinates_files: list[Union[str, PathLike]], pathway_summary_file: str, output_dir: str): """ @@ -349,6 +488,7 @@ def pca_chosen_pathway(coordinates_files: list[Union[str, PathLike]], pathway_su rep_pathway = os.path.join(output_dir, f"{closest_to_kde_peak['datapoint_labels']}", "pathway.txt") rep_pathways.append(rep_pathway) + print(rep_pathways) return rep_pathways @staticmethod @@ -536,3 +676,5 @@ def edge_dummy_function(mixed_edge_table: pd.DataFrame, undirected_edge_table: p undirected_edge_table.to_csv(f, index=False) f.write("\n\nDirected Edge Table\n") directed_edge_table.to_csv(f, index=False) + + From 206c45351dcfdd1109184ec6f602396bd7b3f3ce Mon Sep 17 
00:00:00 2001 From: Neha Talluri <78840540+ntalluri@users.noreply.github.com> Date: Thu, 30 Oct 2025 14:01:12 -0500 Subject: [PATCH 02/10] Apply suggestions from code review Co-authored-by: Tristan F.-R. --- Snakefile | 2 -- spras/evaluation.py | 1 - 2 files changed, 3 deletions(-) diff --git a/Snakefile b/Snakefile index a54ea6015..32dcc233b 100644 --- a/Snakefile +++ b/Snakefile @@ -113,8 +113,6 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) - # dummy file - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}dummy-edge.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-edges.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-edges.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs)) diff --git a/spras/evaluation.py b/spras/evaluation.py index c82e5d2ed..6ce035acb 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -488,7 +488,6 @@ def pca_chosen_pathway(coordinates_files: list[Union[str, PathLike]], pathway_su rep_pathway = os.path.join(output_dir, f"{closest_to_kde_peak['datapoint_labels']}", "pathway.txt") rep_pathways.append(rep_pathway) - print(rep_pathways) return rep_pathways @staticmethod From 5a6c4a80b71306e2cf683741717a384b88089c56 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 30 Oct 2025 14:16:20 -0500 Subject: [PATCH 03/10] in process of aggregate --- Snakefile | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index a54ea6015..fb5261d77 100644 --- a/Snakefile +++ b/Snakefile @@ -126,6 +126,9 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes-per-algorithm-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes-per-algorithm-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + edge_pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-per-algorithm-edges.txt']), + edge_pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-per-algorithm-edges.png']), + # Since (formatted) pathway files are interesting to the user, we preserve them. 
final_input.extend(expand('{out_dir}{sep}{dataset}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, dataset=dataset_labels, algorithm_params=algorithms_with_params)) @@ -518,7 +521,7 @@ def collect_pca_coordinates_per_algo_per_dataset(wildcards): # Run PCA chosen to select the representative pathway per algorithm pathway outputs for a given dataset, # then evaluate with precision and recall against the corresponding gold standard -rule evaluation_per_algo_pca_chosen: +rule evaluation_per_algo_pca_chosen_nodes: input: node_gold_standard_file = get_gold_standard_pickle_file, pca_coordinates_file = collect_pca_coordinates_per_algo_per_dataset, @@ -532,6 +535,30 @@ rule evaluation_per_algo_pca_chosen: pr_df = Evaluation.node_precision_and_recall(pca_chosen_pathways, node_table) Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.node_pca_chosen_pr_file, output.node_pca_chosen_pr_png, include_aggregate_algo_eval) +rule evaluation_per_algo_pca_chosen_edges: + input: + edge_gold_standard_file = get_gold_standard_pickle_file, + pca_coordinates_file = collect_pca_coordinates_per_algo_per_dataset, + pathway_summary_file = collect_summary_statistics_per_dataset + output: + edge_pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-per-algorithm-edges.txt']), + edge_pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-per-algorithm-edges.png']), + run: + mixed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).mixed_edge_table + undirected_edge_table = Evaluation.from_file(input.edge_gold_standard_file).undirected_edge_table + directed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).directed_edge_table + + pca_chosen_pathways = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, input.pathway_summary_file, out_dir) + pr_df = Evaluation.edge_precision_and_recall(pca_chosen_pathways, mixed_edge_table, directed_edge_table, undirected_edge_table) + + Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.edge_pca_chosen_pr_file, output.edge_pca_chosen_pr_png, include_aggregate_algo_eval, edge_evaluation=True) + +# Returns pca coordinates for a specific algorithm and dataset +def collect_pca_coordinates_per_algo_per_dataset(wildcards): + dataset_label = get_dataset_label(wildcards) + return expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt', out_dir=out_dir, sep=SEP, dataset=dataset_label, algorithm=algorithms_mult_param_combos) #TODO we are using algos with mult param combos, what to do when empty? 
+ + # Return the dataset pickle file for a specific dataset def get_dataset_pickle_file(wildcards): dataset_label = get_dataset_label(wildcards) From bead0e7f232d613b7e2ae8879e259a78adaad5ed Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 30 Oct 2025 14:50:19 -0500 Subject: [PATCH 04/10] added aggregate per algorithm --- Snakefile | 19 +++++++++++-------- spras/evaluation.py | 7 ++++--- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/Snakefile b/Snakefile index 50ea1a131..fcb45ed69 100644 --- a/Snakefile +++ b/Snakefile @@ -106,6 +106,7 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-jaccard-heatmap.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms)) if _config.config.analysis_include_evaluation: + # node evaluation final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) @@ -113,10 +114,12 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + # edge evaluation final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-edges.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-edges.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs)) if _config.config.analysis_include_evaluation_aggregate_algo: + # node evaluation final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-per-algorithm-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) @@ -124,9 +127,10 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes-per-algorithm-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes-per-algorithm-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) - 
edge_pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-per-algorithm-edges.txt']), - edge_pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-per-algorithm-edges.png']), - + # edge evaluation + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-per-algorithm-edges.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-per-algorithm-edges.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs)) + # Since (formatted) pathway files are interesting to the user, we preserve them. final_input.extend(expand('{out_dir}{sep}{dataset}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, dataset=dataset_labels, algorithm_params=algorithms_with_params)) @@ -431,7 +435,6 @@ def get_dataset_label(wildcards): dataset = parts[0] return dataset - # Returns all pathways for a specific dataset def collect_pathways_per_dataset(wildcards): dataset_label = get_dataset_label(wildcards) @@ -522,21 +525,21 @@ def collect_pca_coordinates_per_algo_per_dataset(wildcards): rule evaluation_per_algo_pca_chosen_nodes: input: node_gold_standard_file = get_gold_standard_pickle_file, - pca_coordinates_file = collect_pca_coordinates_per_algo_per_dataset, + pca_coordinates_files = collect_pca_coordinates_per_algo_per_dataset, pathway_summary_file = collect_summary_statistics_per_dataset output: node_pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-per-algorithm-nodes.txt']), node_pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-per-algorithm-nodes.png']), run: node_table = Evaluation.from_file(input.node_gold_standard_file).node_table - pca_chosen_pathways = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, input.pathway_summary_file, out_dir) + pca_chosen_pathways = Evaluation.pca_chosen_pathway(input.pca_coordinates_files, input.pathway_summary_file, out_dir) pr_df = Evaluation.node_precision_and_recall(pca_chosen_pathways, node_table) Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.node_pca_chosen_pr_file, output.node_pca_chosen_pr_png, include_aggregate_algo_eval) rule evaluation_per_algo_pca_chosen_edges: input: edge_gold_standard_file = get_gold_standard_pickle_file, - pca_coordinates_file = collect_pca_coordinates_per_algo_per_dataset, + pca_coordinates_files = collect_pca_coordinates_per_algo_per_dataset, pathway_summary_file = collect_summary_statistics_per_dataset output: edge_pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-per-algorithm-edges.txt']), @@ -546,7 +549,7 @@ rule evaluation_per_algo_pca_chosen_edges: undirected_edge_table = Evaluation.from_file(input.edge_gold_standard_file).undirected_edge_table directed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).directed_edge_table - pca_chosen_pathways = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, input.pathway_summary_file, out_dir) + pca_chosen_pathways = Evaluation.pca_chosen_pathway(input.pca_coordinates_files, input.pathway_summary_file, out_dir) pr_df = Evaluation.edge_precision_and_recall(pca_chosen_pathways, mixed_edge_table, directed_edge_table, undirected_edge_table) Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, 
output.edge_pca_chosen_pr_file, output.edge_pca_chosen_pr_png, include_aggregate_algo_eval, edge_evaluation=True) diff --git a/spras/evaluation.py b/spras/evaluation.py index 6ce035acb..9427dfd22 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -202,6 +202,8 @@ def nodes_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: "The input DataFrame must include a preprocessed 'Algorithm' column to calculate precision and recall per pathway file." ) + pr_df.sort_values(by=['Algorithm', 'Recall', 'Pathway'], axis=0, ascending=True, inplace=True) + # save figure plt.figure(figsize=(10, 7)) color_palette = create_palette(pr_df['Algorithm'].tolist()) @@ -313,9 +315,10 @@ def edges_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: "The input DataFrame must include a preprocessed 'Gold_Standard_Type' column indicating the edge directionality used for the gold standard, which is required to visualize precision and recall for each pathway file per gold standard type." ) + pr_df.sort_values(by=['Algorithm', 'Gold_Standard_Type', 'Recall', 'Pathway'], axis=0, ascending=True, inplace=True) gs_types = pr_df["Gold_Standard_Type"].unique().tolist() - fig, axes = plt.subplots(1, len(gs_types), figsize=(6 * len(gs_types), 5), sharex=True, sharey=True, constrained_layout=True) + fig, axes = plt.subplots(1, len(gs_types), figsize=(6 * len(gs_types), 5), sharex=True, sharey=True) color_palette = create_palette(pr_df['Algorithm'].tolist()) for ax, gs_type in zip(axes, gs_types, strict=True): @@ -361,7 +364,6 @@ def precision_and_recall_per_pathway(pr_df: pd.DataFrame, output_file: str | Pat """ if not pr_df.empty: pr_df['Algorithm'] = pr_df['Pathway'].apply(lambda p: Path(p).parent.name.split('-')[1]) - pr_df.sort_values(by=['Recall', 'Pathway'], axis=0, ascending=True, inplace=True) if aggregate_per_algorithm: # Guaranteed to only have one algorithm in Algorithm column @@ -396,7 +398,6 @@ def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: st if not pr_df.empty: pr_df['Algorithm'] = pr_df['Pathway'].apply(lambda p: Path(p).parent.name.split('-')[1]) - pr_df.sort_values(by=['Recall', 'Pathway'], axis=0, ascending=True, inplace=True) if not edge_evaluation: if aggregate_per_algorithm: From 94149b9152f4b435a7abd0ff9fb593d06d953584 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 30 Oct 2025 14:52:59 -0500 Subject: [PATCH 05/10] add a todo and remove the plan I made --- spras/evaluation.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/spras/evaluation.py b/spras/evaluation.py index 9427dfd22..1dade4b0f 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -316,6 +316,7 @@ def edges_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: ) pr_df.sort_values(by=['Algorithm', 'Gold_Standard_Type', 'Recall', 'Pathway'], axis=0, ascending=True, inplace=True) + # TODO: fix the layout of the output png gs_types = pr_df["Gold_Standard_Type"].unique().tolist() fig, axes = plt.subplots(1, len(gs_types), figsize=(6 * len(gs_types), 5), sharex=True, sharey=True) @@ -342,7 +343,7 @@ def edges_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: fig.supylabel("Precision") fig.suptitle(title) handles, labels = axes[0].get_legend_handles_labels() - fig.legend(handles, labels, loc="upper right") # TODO: when doing aggregate per algorithm, check if this needs to be fixed to be in a different place (issue might be constrained_layout) + fig.legend(handles, labels, loc="upper right") 
plt.savefig(output_png) plt.close(fig) @@ -430,16 +431,6 @@ def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: st plt.savefig(output_png) plt.close() - # TODO - # need to make a edge_precision_recall function to make the pr_df - # I think then the precision_and_recall_pca_chosen_pathway function can be reused but needs to be updated to be able to differentiate between nodes or edges - # i think I can do that with a boolean - # then I need to make a edges_visualize_precision_and_recall_plot that is called - # these can then be reused for no parameter selection evaluation - # i think I will need to make a new snakemake rule for each of the evaluatuon because the gold standards only include nodes or edges, - # sharing the same one will cause errors that one type of evalaution doesn't exist - - @staticmethod def pca_chosen_pathway(coordinates_files: list[Union[str, PathLike]], pathway_summary_file: str, output_dir: str): """ From 7277120e508008127166ab450877a80ef6b42688 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 7 Nov 2025 15:00:15 -0600 Subject: [PATCH 06/10] updated test cases and reworded comments --- spras/evaluation.py | 30 ++++---- ...pected-pr-per-pathway-pca-chosen-edges.txt | 4 + ...ected-pr-per-pathway-pca-chosen-nodes.txt} | 0 .../evaluate/input/gs_directed_edge_table.csv | 3 + test/evaluate/input/gs_mixed_edge_table.csv | 2 + .../input/gs_undirected_edge_table.csv | 2 + test/evaluate/test_evaluate.py | 76 ++++++++++++++----- 7 files changed, 87 insertions(+), 30 deletions(-) create mode 100644 test/evaluate/expected/expected-pr-per-pathway-pca-chosen-edges.txt rename test/evaluate/expected/{expected-pr-per-pathway-pca-chosen.txt => expected-pr-per-pathway-pca-chosen-nodes.txt} (100%) create mode 100644 test/evaluate/input/gs_directed_edge_table.csv create mode 100644 test/evaluate/input/gs_mixed_edge_table.csv create mode 100644 test/evaluate/input/gs_undirected_edge_table.csv diff --git a/spras/evaluation.py b/spras/evaluation.py index 1dade4b0f..fb878cc8f 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -152,7 +152,7 @@ def node_precision_and_recall(file_paths: Iterable[Union[str, PathLike]], node_t This function takes a list of file paths corresponding to pathway reconstruction algorithm outputs, each formatted as a tab-separated file with columns 'Node1', 'Node2', 'Rank', and 'Direction'. It compares the set of predicted nodes (from both columns Node1 and Node2) to a provided gold standard node table - and computes precision and recall per file. + and computes a precision and recall per file. @param file_paths: list of file paths of pathway reconstruction algorithm outputs @param node_table: the gold standard nodes @@ -239,7 +239,7 @@ def edge_precision_and_recall(file_paths: Iterable[Union[str, PathLike]], mixed_ This function takes a list of file paths corresponding to pathway reconstruction algorithm outputs, each formatted as a tab-separated file with columns 'Node1', 'Node2', 'Rank', and 'Direction'. - It compares the set of predicted edges to the three provided gold standard edge tables and computes precision and recall per file. + It compares the set of predicted edges to the three provided gold standard edge tables and computes a precision and recall per file. 
@param file_paths: list of file paths of pathway reconstruction algorithm outputs @param mixed_edge_table: the gold standard edges that includes directed and undirected edges @@ -249,7 +249,7 @@ def edge_precision_and_recall(file_paths: Iterable[Union[str, PathLike]], mixed_ - 'Pathway': Path object corresponding to each pathway file - 'Precision': Precision of predicted nodes vs. gold standard nodes - 'Recall': Recall of predicted nodes vs. gold standard nodes - - 'Gold_Standard_Type': Which gold standard was used to calculate the precision and recall + - 'Gold_Standard_Type': Which gold standard was used to calculate the precision and recall """ y_true_mixed = set(map(tuple, mixed_edge_table[['Interactor1', 'Interactor2', 'Direction']].values)) @@ -291,12 +291,12 @@ def edge_precision_and_recall(file_paths: Iterable[Union[str, PathLike]], mixed_ @staticmethod def edges_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, title: str): """ - Generates three scatter plot subplots showing edge precision and recall values for each pathway across three edge gold standard types, + Generates three scatter subplots showing edge precision and recall values for each pathway across the three edge gold standard types, and saves both the resulting plots and the corresponding data. - This function is intended for visualizing how different pathway reconstructions perform - (not a precision-recall curve) showing the precision and recall of each parameter combination - for each algorithm per edge gold standard dataset. + This function is intended for visualizing how different pathway reconstructions perform, + showing the precision and recall of each parameter combination for each algorithm across + each edge gold standard dataset (not a precision-recall curve). @param pr_df: Dataframe of calculated precision and recall for each pathway file per edge gold standard. Must include a preprocessed 'Algorithm' column and 'Gold_Standard_Type" column @@ -355,8 +355,9 @@ def edges_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: def precision_and_recall_per_pathway(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, aggregate_per_algorithm: bool = False): """ Function for visualizing per pathway precision and recall across all algorithms. Each point in the plot represents - a single pathway reconstruction. If `aggregate_per_algorithm` is set to True, each plot is restricted to a single - algorithm and titled accordingly. + a single pathway reconstruction. + + If `aggregate_per_algorithm` is set to True, each plot is restricted to a single algorithm and titled accordingly. @param pr_df: Dataframe of calculated precision and recall for each pathway file @param output_file: the filename to save the precision and recall of each pathway @@ -385,15 +386,18 @@ def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: st Function for visualizing the precision and recall of the single parameter combination selected via PCA, either for each algorithm individually or one combination shared across all algorithms. Each point represents - a pathway reconstruction corresponding to the PCA-selected parameter combination. If `aggregate_per_algorithm` - is True, the plot includes a pca chosen pathway per algorithm and titled accordingly. If `edge_evaluation` is True, - the plot will include the evaluation across the three gold standard edge files. 
+ a pathway reconstruction corresponding to the PCA-selected parameter combination. + + If `aggregate_per_algorithm` is True, the output_png includes a pca chosen pathway per algorithm and titled accordingly. + + If `edge_evaluation` is True, the output PNG shows performance across all three edge gold standards; + if False, the output PNG shows evaluation for the single node gold standard. @param pr_df: Dataframe of calculated precision and recall for each pathway file @param output_file: the filename to save the precision and recall of each pathway @param output_png: the filename to plot the precision and recall of each pathway (not a PRC) @param aggregate_per_algorithm: Boolean indicating if this function is used per algorithm (Default False) - @param edge_evaluation: Boolean indicating if this function is used for creating edge_evaluation plots (Default False; used for node evaluation) + @param edge_evaluation: Boolean indicating if this function is used for creating edge_evaluation plots (Default False) """ # TODO update to add in the pathways for the algorithms that do not provide a pca chosen pathway https://github.com/Reed-CompBio/spras/issues/341 diff --git a/test/evaluate/expected/expected-pr-per-pathway-pca-chosen-edges.txt b/test/evaluate/expected/expected-pr-per-pathway-pca-chosen-edges.txt new file mode 100644 index 000000000..9134b4503 --- /dev/null +++ b/test/evaluate/expected/expected-pr-per-pathway-pca-chosen-edges.txt @@ -0,0 +1,4 @@ +Pathway Precision Recall Gold_Standard_Type +test/evaluate/input/data-test-params-123/pathway.txt 0.0 0.0 directed +test/evaluate/input/data-test-params-123/pathway.txt 0.5 0.5 mixed +test/evaluate/input/data-test-params-123/pathway.txt 1.0 1.0 undirected diff --git a/test/evaluate/expected/expected-pr-per-pathway-pca-chosen.txt b/test/evaluate/expected/expected-pr-per-pathway-pca-chosen-nodes.txt similarity index 100% rename from test/evaluate/expected/expected-pr-per-pathway-pca-chosen.txt rename to test/evaluate/expected/expected-pr-per-pathway-pca-chosen-nodes.txt diff --git a/test/evaluate/input/gs_directed_edge_table.csv b/test/evaluate/input/gs_directed_edge_table.csv new file mode 100644 index 000000000..c3755a19c --- /dev/null +++ b/test/evaluate/input/gs_directed_edge_table.csv @@ -0,0 +1,3 @@ +A B D +B A D +B C D \ No newline at end of file diff --git a/test/evaluate/input/gs_mixed_edge_table.csv b/test/evaluate/input/gs_mixed_edge_table.csv new file mode 100644 index 000000000..d819aa76e --- /dev/null +++ b/test/evaluate/input/gs_mixed_edge_table.csv @@ -0,0 +1,2 @@ +A B U +B C D \ No newline at end of file diff --git a/test/evaluate/input/gs_undirected_edge_table.csv b/test/evaluate/input/gs_undirected_edge_table.csv new file mode 100644 index 000000000..af85f211b --- /dev/null +++ b/test/evaluate/input/gs_undirected_edge_table.csv @@ -0,0 +1,2 @@ +A B U +B C U \ No newline at end of file diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py index ce50350e5..e909cfc21 100644 --- a/test/evaluate/test_evaluate.py +++ b/test/evaluate/test_evaluate.py @@ -13,6 +13,9 @@ OUT_DIR = 'test/evaluate/output/' EXPECT_DIR = 'test/evaluate/expected/' GS_NODE_TABLE = pd.read_csv(INPUT_DIR + 'gs_node_table.csv', header=0) +GS_MIXED_EDGE_TABLE = pd.read_csv(INPUT_DIR + 'gs_mixed_edge_table.csv', names=["Interactor1", "Interactor2", "Direction"], sep="\t") +GS_DIRECTED_EDGE_TABLE = pd.read_csv(INPUT_DIR + 'gs_directed_edge_table.csv', names=["Interactor1", "Interactor2", "Direction"], sep="\t") +GS_UNDIRECTED_EDGE_TABLE = 
pd.read_csv(INPUT_DIR + 'gs_undirected_edge_table.csv', names=["Interactor1", "Interactor2", "Direction"], sep="\t") SUMMARY_FILE = INPUT_DIR + 'example_summary.txt' @@ -81,26 +84,43 @@ def test_node_precision_recall_per_pathway_not_provided(self): Evaluation.precision_and_recall_per_pathway(pr_df, output_file, output_png) def test_node_precision_recall_pca_chosen_pathway_not_provided(self): - output_file = Path( OUT_DIR + 'pr-per-pathway-pca-chosen-not-provided.txt') - output_file.unlink(missing_ok=True) - output_png = Path(OUT_DIR + 'pr-per-pathway-pca-chosen-not-provided.png') - output_png.unlink(missing_ok=True) + node_output_file = Path( OUT_DIR + 'pr-per-pathway-pca-chosen-not-provided_nodes.txt') + node_output_file.unlink(missing_ok=True) + node_output_png = Path(OUT_DIR + 'pr-per-pathway-pca-chosen-not-provided_nodes.png') + node_output_png.unlink(missing_ok=True) + file_paths = [] pr_df = Evaluation.node_precision_and_recall(file_paths, GS_NODE_TABLE) - Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output_file, output_png) + Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, node_output_file, node_output_png) - output = pd.read_csv(output_file, sep='\t', header=0).round(8) + node_output = pd.read_csv(node_output_file, sep='\t', header=0).round(8) expected = pd.read_csv(EXPECT_DIR + 'expected-pr-pca-chosen-not-provided.txt', sep='\t', header=0).round(8) - assert output.equals(expected) - assert output_png.exists() + assert node_output.equals(expected) + assert node_output_png.exists() + + def test_edge_precision_recall_pca_chosen_pathway_not_provided(self): + edge_output_file = Path( OUT_DIR + 'pr-per-pathway-pca-chosen-not-provided_edges.txt') + edge_output_file.unlink(missing_ok=True) + edge_output_png = Path(OUT_DIR + 'pr-per-pathway-pca-chosen-not-provided_edges.png') + edge_output_png.unlink(missing_ok=True) + + file_paths = [] + + pr_df = Evaluation.edge_precision_and_recall(file_paths, GS_MIXED_EDGE_TABLE, GS_DIRECTED_EDGE_TABLE, GS_UNDIRECTED_EDGE_TABLE) + Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, edge_output_file, edge_output_png) + edge_output = pd.read_csv(edge_output_file, sep='\t', header=0).round(8) + expected = pd.read_csv(EXPECT_DIR + 'expected-pr-pca-chosen-not-provided.txt', sep='\t', header=0).round(8) + + assert edge_output.equals(expected) + assert edge_output_png.exists() def test_node_precision_recall_pca_chosen_pathway(self): - output_file = Path(OUT_DIR + 'pr-per-pathway-pca-chosen.txt') - output_file.unlink(missing_ok=True) - output_png = Path(OUT_DIR + 'pr-per-pathway-pca-chosen.png') - output_png.unlink(missing_ok=True) + node_output_file = Path(OUT_DIR + 'pr-per-pathway-pca-chosen_nodes.txt') + node_output_file.unlink(missing_ok=True) + node_output_png = Path(OUT_DIR + 'pr-per-pathway-pca-chosen_nodes.png') + node_output_png.unlink(missing_ok=True) output_coordinates = Path(OUT_DIR + 'pca-coordinates.tsv') output_coordinates.unlink(missing_ok=True) @@ -111,16 +131,38 @@ def test_node_precision_recall_pca_chosen_pathway(self): ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', str(output_coordinates), kde=True, remove_empty_pathways=True) pathway = Evaluation.pca_chosen_pathway([output_coordinates], SUMMARY_FILE, INPUT_DIR) - pr_df = Evaluation.node_precision_and_recall(pathway, GS_NODE_TABLE) - Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output_file, output_png, True) + Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, node_output_file, node_output_png, True) + chosen = 
pd.read_csv(node_output_file, sep='\t', header=0).round(8) + expected = pd.read_csv(EXPECT_DIR + 'expected-pr-per-pathway-pca-chosen-nodes.txt', sep='\t', header=0).round(8) - chosen = pd.read_csv(output_file, sep='\t', header=0).round(8) - expected = pd.read_csv(EXPECT_DIR + 'expected-pr-per-pathway-pca-chosen.txt', sep='\t', header=0).round(8) + assert chosen.equals(expected) + assert node_output_png.exists() + + def test_edge_precision_recall_pca_chosen_pathway(self): + edge_output_file = Path(OUT_DIR + 'pr-per-pathway-pca-chosen_edges.txt') + edge_output_file.unlink(missing_ok=True) + edge_output_png = Path(OUT_DIR + 'pr-per-pathway-pca-chosen_edges.png') + edge_output_png.unlink(missing_ok=True) + output_coordinates = Path(OUT_DIR + 'pca-coordinates.tsv') + output_coordinates.unlink(missing_ok=True) + + file_paths = [INPUT_DIR + 'data-test-params-123/pathway.txt', INPUT_DIR + 'data-test-params-456/pathway.txt', + INPUT_DIR + 'data-test-params-789/pathway.txt', INPUT_DIR + 'data-test-params-empty/pathway.txt'] + + dataframe = ml.summarize_networks(file_paths) + ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', str(output_coordinates), kde=True, remove_empty_pathways=True) + + pathway = Evaluation.pca_chosen_pathway([output_coordinates], SUMMARY_FILE, INPUT_DIR) + pr_df = Evaluation.edge_precision_and_recall(pathway, GS_MIXED_EDGE_TABLE, GS_DIRECTED_EDGE_TABLE, GS_UNDIRECTED_EDGE_TABLE) + Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, edge_output_file, edge_output_png, True, True) + + chosen = pd.read_csv(edge_output_file, sep='\t', header=0).round(8) + expected = pd.read_csv(EXPECT_DIR + 'expected-pr-per-pathway-pca-chosen-edges.txt', sep='\t', header=0).round(8) assert chosen.equals(expected) - assert output_png.exists() + assert edge_output_png.exists() def test_node_ensemble(self): out_path_file = Path(OUT_DIR + 'node-ensemble.csv') From 26f3d54d0602ae6a107b08108f4554bb8d0f443c Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 7 Nov 2025 15:24:59 -0600 Subject: [PATCH 07/10] attempt to fix the spacing --- spras/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/evaluation.py b/spras/evaluation.py index fb878cc8f..a123661ec 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -319,7 +319,7 @@ def edges_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: # TODO: fix the layout of the output png gs_types = pr_df["Gold_Standard_Type"].unique().tolist() - fig, axes = plt.subplots(1, len(gs_types), figsize=(6 * len(gs_types), 5), sharex=True, sharey=True) + fig, axes = plt.subplots(1, len(gs_types), figsize=(6 * len(gs_types), 5)) color_palette = create_palette(pr_df['Algorithm'].tolist()) for ax, gs_type in zip(axes, gs_types, strict=True): From f31d598ed7309acd66b914884c33651cb9e4f020 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 7 Nov 2025 16:16:37 -0600 Subject: [PATCH 08/10] made changes based on review --- Snakefile | 6 ------ spras/evaluation.py | 27 +++------------------------ 2 files changed, 3 insertions(+), 30 deletions(-) diff --git a/Snakefile b/Snakefile index fcb45ed69..ea75092bc 100644 --- a/Snakefile +++ b/Snakefile @@ -554,12 +554,6 @@ rule evaluation_per_algo_pca_chosen_edges: Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.edge_pca_chosen_pr_file, output.edge_pca_chosen_pr_png, include_aggregate_algo_eval, edge_evaluation=True) -# Returns pca coordinates for a specific algorithm and dataset -def collect_pca_coordinates_per_algo_per_dataset(wildcards): - 
dataset_label = get_dataset_label(wildcards) - return expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt', out_dir=out_dir, sep=SEP, dataset=dataset_label, algorithm=algorithms_mult_param_combos) #TODO we are using algos with mult param combos, what to do when empty? - - # Return the dataset pickle file for a specific dataset def get_dataset_pickle_file(wildcards): dataset_label = get_dataset_label(wildcards) diff --git a/spras/evaluation.py b/spras/evaluation.py index a123661ec..928be684c 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -152,7 +152,7 @@ def node_precision_and_recall(file_paths: Iterable[Union[str, PathLike]], node_t This function takes a list of file paths corresponding to pathway reconstruction algorithm outputs, each formatted as a tab-separated file with columns 'Node1', 'Node2', 'Rank', and 'Direction'. It compares the set of predicted nodes (from both columns Node1 and Node2) to a provided gold standard node table - and computes a precision and recall per file. + and computes precision and recall per file. @param file_paths: list of file paths of pathway reconstruction algorithm outputs @param node_table: the gold standard nodes @@ -233,13 +233,14 @@ def nodes_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: pr_df.drop(columns=['Algorithm'], inplace=True) pr_df.to_csv(output_file, sep='\t', index=False) + @staticmethod def edge_precision_and_recall(file_paths: Iterable[Union[str, PathLike]], mixed_edge_table: pd.DataFrame, directed_edge_table: pd.DataFrame, undirected_edge_table: pd.DataFrame) -> pd.DataFrame: """ Computes edge-level precision and recall for each pathway reconstruction output file against three edge gold standard tables. This function takes a list of file paths corresponding to pathway reconstruction algorithm outputs, each formatted as a tab-separated file with columns 'Node1', 'Node2', 'Rank', and 'Direction'. - It compares the set of predicted edges to the three provided gold standard edge tables and computes a precision and recall per file. + It compares the set of predicted edges to the three provided gold standard edge tables and computes precision and recall per file. @param file_paths: list of file paths of pathway reconstruction algorithm outputs @param mixed_edge_table: the gold standard edges that includes directed and undirected edges @@ -650,26 +651,4 @@ def precision_recall_curve_node_ensemble(node_ensembles: dict, node_table: pd.Da complete_df.loc[not_last_rows, ['Average_Precision', 'Baseline']] = None complete_df.to_csv(output_file, index=False, sep='\t') - @staticmethod - def edge_dummy_function(mixed_edge_table: pd.DataFrame, undirected_edge_table: pd.DataFrame, directed_edge_table: pd.DataFrame, dummy_file: str): - """ - Temporary function to test edge file implementation. - Will be removed from SPRAS's evaluation code in the future. - - Takes in the different edge table versions (mixed, fully directed, fully undirected) - for a specific edge gold standard dataset and writes them to a file. - - @param mixed_edge_table: Edge gold standard treated as mixed directionality. - @param undirected_edge_table: Edge gold standard treated as fully undirected. - @param directed_edge_table: Edge gold standard treated as fully directed. - @param dummy_file: Filename to save the edge tables. 
- """ - with open(dummy_file, "w") as f: - f.write("Mixed Edge Table\n") - mixed_edge_table.to_csv(f, index=False) - f.write("\n\nUndirected Edge Table\n") - undirected_edge_table.to_csv(f, index=False) - f.write("\n\nDirected Edge Table\n") - directed_edge_table.to_csv(f, index=False) - From f19272377c8ac3bc15b0976fb2aa1c3d4122a7e8 Mon Sep 17 00:00:00 2001 From: Neha Talluri <78840540+ntalluri@users.noreply.github.com> Date: Fri, 7 Nov 2025 16:19:27 -0600 Subject: [PATCH 09/10] Update spras/evaluation.py --- spras/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/evaluation.py b/spras/evaluation.py index 928be684c..04a3724e9 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -317,7 +317,7 @@ def edges_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: ) pr_df.sort_values(by=['Algorithm', 'Gold_Standard_Type', 'Recall', 'Pathway'], axis=0, ascending=True, inplace=True) - # TODO: fix the layout of the output png + gs_types = pr_df["Gold_Standard_Type"].unique().tolist() fig, axes = plt.subplots(1, len(gs_types), figsize=(6 * len(gs_types), 5)) From e3c69d7675840639d8dd9ec3431ecb8761f0a539 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 4 Dec 2025 23:09:16 -0800 Subject: [PATCH 10/10] style: fmt --- spras/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/evaluation.py b/spras/evaluation.py index 04a3724e9..27eb419a0 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -317,7 +317,7 @@ def edges_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: ) pr_df.sort_values(by=['Algorithm', 'Gold_Standard_Type', 'Recall', 'Pathway'], axis=0, ascending=True, inplace=True) - + gs_types = pr_df["Gold_Standard_Type"].unique().tolist() fig, axes = plt.subplots(1, len(gs_types), figsize=(6 * len(gs_types), 5))
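
Note on the expected edge precision/recall values: the fixture test/evaluate/expected/expected-pr-per-pathway-pca-chosen-edges.txt added in PATCH 06/10 can be reproduced by hand with the same set-union/binarization logic that Evaluation.edge_precision_and_recall introduces in PATCH 01/10. The sketch below is illustrative only and is not part of the patches; the predicted edge set for data-test-params-123/pathway.txt does not appear anywhere in this diff, so the two undirected edges used here are an assumption chosen to be consistent with the expected values (directed 0.0/0.0, mixed 0.5/0.5, undirected 1.0/1.0).

    # Minimal sketch (assumed inputs, not part of the patch series): mirrors the
    # precision/recall arithmetic in Evaluation.edge_precision_and_recall.
    from sklearn.metrics import precision_score, recall_score

    # Gold-standard edge tables from the new test fixtures (Interactor1, Interactor2, Direction)
    gold_standards = {
        "mixed":      {("A", "B", "U"), ("B", "C", "D")},
        "directed":   {("A", "B", "D"), ("B", "A", "D"), ("B", "C", "D")},
        "undirected": {("A", "B", "U"), ("B", "C", "U")},
    }

    # Assumed predicted edges (Node1, Node2, Direction) for the PCA-chosen pathway;
    # the real values live in test/evaluate/input/data-test-params-123/pathway.txt.
    y_pred = {("A", "B", "U"), ("B", "C", "U")}

    for gs_type, y_true in gold_standards.items():
        # Binarize over the union of gold-standard and predicted edges, as the patch does,
        # then score directly (no precision-recall curve, since ranks are not thresholded here)
        universe = y_true | y_pred
        true_binary = [1 if edge in y_true else 0 for edge in universe]
        pred_binary = [1 if edge in y_pred else 0 for edge in universe]
        precision = precision_score(true_binary, pred_binary, zero_division=0.0)
        recall = recall_score(true_binary, pred_binary, zero_division=0.0)
        print(f"{gs_type}: precision={precision:.1f} recall={recall:.1f}")

Running this prints mixed 0.5/0.5, directed 0.0/0.0, and undirected 1.0/1.0, which matches the rows of the expected fixture once ordered by Gold_Standard_Type.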