Skip to content
34 changes: 21 additions & 13 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,12 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))

# dummy file
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}dummy-edge.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs))

final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-edges.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-edges.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs))

if _config.config.analysis_include_evaluation_aggregate_algo:
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms))
Expand Down Expand Up @@ -478,7 +481,7 @@ def collect_pca_coordinates_per_dataset(wildcards):

# Run PCA chosen to select the representative from all pathway outputs for a given dataset,
# then evaluate with precision and recall against the corresponding gold standard
rule evaluation_pca_chosen:
rule evaluation_pca_chosen_nodes:
input:
node_gold_standard_file = get_gold_standard_pickle_file,
pca_coordinates_file = collect_pca_coordinates_per_dataset,
Expand All @@ -492,6 +495,22 @@ rule evaluation_pca_chosen:
pr_df = Evaluation.node_precision_and_recall(pca_chosen_pathway, node_table)
Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.node_pca_chosen_pr_file, output.node_pca_chosen_pr_png)

rule evaluation_pca_chosen_edges:
input:
edge_gold_standard_file = get_gold_standard_pickle_file,
pca_coordinates_file = collect_pca_coordinates_per_dataset,
pathway_summary_file = collect_summary_statistics_per_dataset
output:
edge_pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-edges.txt']),
edge_pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-edges.png']),
run:
mixed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).mixed_edge_table
undirected_edge_table = Evaluation.from_file(input.edge_gold_standard_file).undirected_edge_table
directed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).directed_edge_table
pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, input.pathway_summary_file, out_dir)
pr_df = Evaluation.edge_precision_and_recall(pca_chosen_pathway, mixed_edge_table, directed_edge_table, undirected_edge_table)
Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.edge_pca_chosen_pr_file, output.edge_pca_chosen_pr_png, edge_evaluation=True)

# Returns pca coordinates for a specific algorithm and dataset
def collect_pca_coordinates_per_algo_per_dataset(wildcards):
dataset_label = get_dataset_label(wildcards)
Expand Down Expand Up @@ -556,17 +575,6 @@ rule evaluation_per_algo_ensemble_pr_curve:
node_ensembles_dict = Evaluation.edge_frequency_node_ensemble(node_table, input.ensemble_files, input.dataset_file)
Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, node_table, output.node_pr_curve_png, output.node_pr_curve_file, include_aggregate_algo_eval)

rule evaluation_edge_dummy:
input:
edge_gold_standard_file = get_gold_standard_pickle_file,
output:
dummy_file = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'dummy-edge.txt']),
run:
mixed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).mixed_edge_table
undirected_edge_table = Evaluation.from_file(input.edge_gold_standard_file).undirected_edge_table
directed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).directed_edge_table
Evaluation.edge_dummy_function(mixed_edge_table, undirected_edge_table, directed_edge_table, output.dummy_file)

# Remove the output directory
rule clean:
shell: f'rm -rf {out_dir}'
162 changes: 152 additions & 10 deletions spras/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,9 @@ def node_precision_and_recall(file_paths: Iterable[Union[str, PathLike]], node_t
pr_df = pd.DataFrame(results)
return pr_df


@staticmethod
def visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, title: str):
def nodes_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, title: str):
"""
Generates a scatter plot of precision and recall values for each pathway and saves both
the plot and the data.
Expand Down Expand Up @@ -230,11 +231,127 @@ def visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str |
pr_df.drop(columns=['Algorithm'], inplace=True)
pr_df.to_csv(output_file, sep='\t', index=False)

def edge_precision_and_recall(file_paths: Iterable[Union[str, PathLike]], mixed_edge_table: pd.DataFrame, directed_edge_table: pd.DataFrame, undirected_edge_table: pd.DataFrame) -> pd.DataFrame:
"""
Computes edge-level precision and recall for each pathway reconstruction output file against three edge gold standard tables.

This function takes a list of file paths corresponding to pathway reconstruction algorithm outputs,
each formatted as a tab-separated file with columns 'Node1', 'Node2', 'Rank', and 'Direction'.
It compares the set of predicted edges to the three provided gold standard edge tables and computes precision and recall per file.

@param file_paths: list of file paths of pathway reconstruction algorithm outputs
@param mixed_edge_table: the gold standard edges that includes directed and undirected edges
@param directed_edge_table: the gold standard edges that only includes directed edges
@param undirected_edge_table: the gold standard edges that only includes undirected edges
@return: A DataFrame with the following columns:
- 'Pathway': Path object corresponding to each pathway file
- 'Precision': Precision of predicted nodes vs. gold standard nodes
- 'Recall': Recall of predicted nodes vs. gold standard nodes
- 'Gold_Standard_Type': Which gold standard was used to calculate the precision and recall
"""

y_true_mixed = set(map(tuple, mixed_edge_table[['Interactor1', 'Interactor2', 'Direction']].values))
y_true_directed = set(map(tuple, directed_edge_table[['Interactor1', 'Interactor2', 'Direction']].values))
y_true_undirected = set(map(tuple, undirected_edge_table[['Interactor1', 'Interactor2', 'Direction']].values))

results = []
for f in file_paths:
df = pd.read_table(f, sep='\t', header=0)
y_pred = set(map(tuple, df[['Node1', 'Node2', 'Direction']].values))

all_edges_mixed = y_true_mixed.union(y_pred)
y_true_mixed_binary = [1 if edge in y_true_mixed else 0 for edge in all_edges_mixed]
y_pred_mixed_binary = [1 if edge in y_pred else 0 for edge in all_edges_mixed]
# default to 0.0 if there is a divide by 0 error
# not using precision_recall_curve because thresholds are binary (0 or 1); rather we are directly
# calculating precision and recall per pathway
precision_mixed = precision_score(y_true_mixed_binary, y_pred_mixed_binary, zero_division=0.0)
recall_mixed = recall_score(y_true_mixed_binary, y_pred_mixed_binary, zero_division=0.0)
results.append({'Pathway': f, 'Precision': precision_mixed, 'Recall': recall_mixed, 'Gold_Standard_Type': "mixed"})
Comment on lines +265 to +273
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code block is repeated three times, right? Is there a way to reuse the same code three times and keep it this readable, or does it become harder to follow the logic? I do value the readability.


all_edges_directed = y_true_directed.union(y_pred)
y_true_directed_binary = [1 if edge in y_true_directed else 0 for edge in all_edges_directed]
y_pred_directed_binary = [1 if edge in y_pred else 0 for edge in all_edges_directed]
precision_directed = precision_score(y_true_directed_binary, y_pred_directed_binary, zero_division=0.0)
recall_directed = recall_score(y_true_directed_binary, y_pred_directed_binary, zero_division=0.0)
results.append({'Pathway': f, 'Precision': precision_directed, 'Recall': recall_directed, 'Gold_Standard_Type': "directed"})

all_edges_undirected = y_true_undirected.union(y_pred)
y_true_undirected_binary = [1 if edge in y_true_undirected else 0 for edge in all_edges_undirected]
y_pred_undirected_binary = [1 if edge in y_pred else 0 for edge in all_edges_undirected]
precision_undirected = precision_score(y_true_undirected_binary, y_pred_undirected_binary, zero_division=0.0)
recall_undirected = recall_score(y_true_undirected_binary, y_pred_undirected_binary, zero_division=0.0)
results.append({'Pathway': f, 'Precision': precision_undirected, 'Recall': recall_undirected, 'Gold_Standard_Type': "undirected"})

pr_df = pd.DataFrame(results)
return pr_df

@staticmethod
def edges_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, title: str):
"""
Generates three scatter plot subplots showing edge precision and recall values for each pathway across three edge gold standard types,
and saves both the resulting plots and the corresponding data.

This function is intended for visualizing how different pathway reconstructions perform
(not a precision-recall curve) showing the precision and recall of each parameter combination
for each algorithm per edge gold standard dataset.

@param pr_df: Dataframe of calculated precision and recall for each pathway file per edge gold standard.
Must include a preprocessed 'Algorithm' column and 'Gold_Standard_Type" column
@param output_file: the filename to save the precision and recall of each pathway per gold standard type
@param output_png: the filename to plot the precision and recall of each pathway (not a PRC) per gold standard type
@param title: The title to use for the plot
"""
if 'Algorithm' not in pr_df.columns:
raise ValueError(
"Column 'Algorithm' not found in DataFrame. "
"The input DataFrame must include a preprocessed 'Algorithm' column to visulize a precision and recall per pathway file per gold standard type."
)
if 'Gold_Standard_Type' not in pr_df.columns:
raise ValueError(
"Column 'Gold_Standard_Type' not found in DataFrame. "
"The input DataFrame must include a preprocessed 'Gold_Standard_Type' column indicating the edge directionality used for the gold standard, which is required to visualize precision and recall for each pathway file per gold standard type."
)


gs_types = pr_df["Gold_Standard_Type"].unique().tolist()
fig, axes = plt.subplots(1, len(gs_types), figsize=(6 * len(gs_types), 5), sharex=True, sharey=True, constrained_layout=True)
color_palette = create_palette(pr_df['Algorithm'].tolist())

for ax, gs_type in zip(axes, gs_types, strict=True):
df_gs_type = pr_df[pr_df["Gold_Standard_Type"] == gs_type]
for algorithm, subset in df_gs_type.groupby('Algorithm'):
if not subset.empty:
ax.plot(
subset['Recall'],
subset['Precision'],
color=color_palette[algorithm],
marker='o',
linestyle='',
label=algorithm.capitalize()
)
ax.set_title(gs_type.capitalize())
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(-0.05, 1.05)
ax.grid(True)

fig.supxlabel("Recall")
fig.supylabel("Precision")
fig.suptitle(title)
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc="upper right") # TODO: when doing aggregate per algorithm, check if this needs to be fixed to be in a different place (issue might be constrained_layout)
plt.savefig(output_png)
plt.close(fig)

# save dataframe
pr_df.drop(columns=['Algorithm'], inplace=True)
pr_df.to_csv(output_file, sep='\t', index=False)

@staticmethod
def precision_and_recall_per_pathway(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, aggregate_per_algorithm: bool = False):
"""
Function for visualizing per pathway precision and recall across all algorithms. Each point in the plot represents
a single pathway reconstruction. If `aggregate_per_algorithm` is set to True, the plot is restricted to a single
a single pathway reconstruction. If `aggregate_per_algorithm` is set to True, each plot is restricted to a single
algorithm and titled accordingly.

@param pr_df: Dataframe of calculated precision and recall for each pathway file
Expand All @@ -252,39 +369,51 @@ def precision_and_recall_per_pathway(pr_df: pd.DataFrame, output_file: str | Pat
else:
title = "Precision and Recall Plot Per Pathway Per Algorithm"

Evaluation.visualize_precision_and_recall_plot(pr_df, output_file, output_png, title)
Evaluation.nodes_visualize_precision_and_recall_plot(pr_df, output_file, output_png, title)

else:
# this block should never be reached — having 0 pathways implies that no algorithms or parameter combinations were run,
# which indicates a deeper issue in the workflow setup.
raise ValueError("No pathways were provided to evaluate and visulize on. This likely means no algorithms or parameter combinations were run.")

@staticmethod
def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, aggregate_per_algorithm: bool = False):
def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, aggregate_per_algorithm: bool = False, edge_evaluation: bool = False):
"""

Function for visualizing the precision and recall of the single parameter combination selected via PCA,
either for each algorithm individually or one combination shared across all algorithms. Each point represents
a pathway reconstruction corresponding to the PCA-selected parameter combination. If `aggregate_per_algorithm`
is True, the plot includes a pca chosen pathway per algorithm and titled accordingly.
is True, the plot includes a pca chosen pathway per algorithm and titled accordingly. If `edge_evaluation` is True,
the plot will include the evaluation across the three gold standard edge files.

@param pr_df: Dataframe of calculated precision and recall for each pathway file
@param output_file: the filename to save the precision and recall of each pathway
@param output_png: the filename to plot the precision and recall of each pathway (not a PRC)
@param aggregate_per_algorithm: Boolean indicating if function is used per algorithm (Default False)
@param aggregate_per_algorithm: Boolean indicating if this function is used per algorithm (Default False)
@param edge_evaluation: Boolean indicating if this function is used for creating edge_evaluation plots (Default False; used for node evaluation)
"""
# TODO update to add in the pathways for the algorithms that do not provide a pca chosen pathway https://github.com/Reed-CompBio/spras/issues/341

if not pr_df.empty:
pr_df['Algorithm'] = pr_df['Pathway'].apply(lambda p: Path(p).parent.name.split('-')[1])
pr_df.sort_values(by=['Recall', 'Pathway'], axis=0, ascending=True, inplace=True)

if aggregate_per_algorithm:
title = "PCA-Chosen Pathway Per Algorithm Precision and Recall Plot"
if not edge_evaluation:
if aggregate_per_algorithm:
title = "Node Evaluation PCA-Chosen Pathway Per Algorithm Precision and Recall Plot"
else:
title = "Node Evaluation PCA-Chosen Pathway Across all Algorithms Precision and Recall Plot"

Evaluation.nodes_visualize_precision_and_recall_plot(pr_df, output_file, output_png, title)

else:
title = "PCA-Chosen Pathway Across All Algorithms Precision and Recall Plot"
if aggregate_per_algorithm :
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if aggregate_per_algorithm :
if aggregate_per_algorithm:

title = "Edge Evaluation PCA-Chosen Pathway Per Algorithm Precision and Recall Plot"
else:
title = "Edge Evaluation PCA-Chosen Pathway Across all Algorithms Precision and Recall Plot"

Evaluation.edges_visualize_precision_and_recall_plot(pr_df, output_file, output_png, title)

Evaluation.visualize_precision_and_recall_plot(pr_df, output_file, output_png, title)

else:
# Edge case: if all algorithms chosen use only 1 parameter combination
Expand All @@ -300,6 +429,16 @@ def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: st
plt.savefig(output_png)
plt.close()

# TODO
# need to make a edge_precision_recall function to make the pr_df
# I think then the precision_and_recall_pca_chosen_pathway function can be reused but needs to be updated to be able to differentiate between nodes or edges
# i think I can do that with a boolean
# then I need to make a edges_visualize_precision_and_recall_plot that is called
# these can then be reused for no parameter selection evaluation
# i think I will need to make a new snakemake rule for each of the evaluatuon because the gold standards only include nodes or edges,
# sharing the same one will cause errors that one type of evalaution doesn't exist


@staticmethod
def pca_chosen_pathway(coordinates_files: list[Union[str, PathLike]], pathway_summary_file: str, output_dir: str):
"""
Expand Down Expand Up @@ -349,6 +488,7 @@ def pca_chosen_pathway(coordinates_files: list[Union[str, PathLike]], pathway_su
rep_pathway = os.path.join(output_dir, f"{closest_to_kde_peak['datapoint_labels']}", "pathway.txt")
rep_pathways.append(rep_pathway)

print(rep_pathways)
return rep_pathways

@staticmethod
Expand Down Expand Up @@ -536,3 +676,5 @@ def edge_dummy_function(mixed_edge_table: pd.DataFrame, undirected_edge_table: p
undirected_edge_table.to_csv(f, index=False)
f.write("\n\nDirected Edge Table\n")
directed_edge_table.to_csv(f, index=False)


Loading