Reed-CompBio · ntalluri · Oct 30, 2025 · Oct 30, 2025 · Oct 30, 2025 · Oct 30, 2025
diff --git a/Snakefile b/Snakefile
@@ -112,9 +112,12 @@ def make_final_input(wildcards):
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))
+
         # dummy file
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}dummy-edge.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs))
-
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-edges.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-edges.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs))
+
     if _config.config.analysis_include_evaluation_aggregate_algo:
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms))
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms))
@@ -478,7 +481,7 @@ def collect_pca_coordinates_per_dataset(wildcards):
 
 # Run PCA chosen to select the representative from all pathway outputs for a given dataset, 
 # then evaluate with precision and recall against the corresponding gold standard
-rule evaluation_pca_chosen:
+rule evaluation_pca_chosen_nodes:
     input: 
         node_gold_standard_file = get_gold_standard_pickle_file,
         pca_coordinates_file = collect_pca_coordinates_per_dataset,
@@ -492,6 +495,22 @@ rule evaluation_pca_chosen:
         pr_df = Evaluation.node_precision_and_recall(pca_chosen_pathway, node_table)
         Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.node_pca_chosen_pr_file, output.node_pca_chosen_pr_png)
 
+rule evaluation_pca_chosen_edges:
+    input: 
+        edge_gold_standard_file = get_gold_standard_pickle_file,
+        pca_coordinates_file = collect_pca_coordinates_per_dataset,
+        pathway_summary_file = collect_summary_statistics_per_dataset
+    output: 
+        edge_pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-edges.txt']),
+        edge_pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-pca-chosen-pathway-edges.png']),
+    run:
+        mixed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).mixed_edge_table
+        undirected_edge_table = Evaluation.from_file(input.edge_gold_standard_file).undirected_edge_table
+        directed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).directed_edge_table
+        pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, input.pathway_summary_file, out_dir)
+        pr_df = Evaluation.edge_precision_and_recall(pca_chosen_pathway, mixed_edge_table, directed_edge_table, undirected_edge_table)
+        Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.edge_pca_chosen_pr_file, output.edge_pca_chosen_pr_png, edge_evaluation=True)
+
 # Returns pca coordinates for a specific algorithm and dataset
 def collect_pca_coordinates_per_algo_per_dataset(wildcards):
     dataset_label = get_dataset_label(wildcards)
@@ -556,17 +575,6 @@ rule evaluation_per_algo_ensemble_pr_curve:
         node_ensembles_dict = Evaluation.edge_frequency_node_ensemble(node_table, input.ensemble_files, input.dataset_file)
         Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, node_table, output.node_pr_curve_png, output.node_pr_curve_file, include_aggregate_algo_eval)
 
-rule evaluation_edge_dummy:
-    input: 
-        edge_gold_standard_file = get_gold_standard_pickle_file,
-    output: 
-        dummy_file = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'dummy-edge.txt']),
-    run:
-        mixed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).mixed_edge_table
-        undirected_edge_table = Evaluation.from_file(input.edge_gold_standard_file).undirected_edge_table
-        directed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).directed_edge_table
-        Evaluation.edge_dummy_function(mixed_edge_table, undirected_edge_table, directed_edge_table, output.dummy_file)
-
 # Remove the output directory
 rule clean:
     shell: f'rm -rf {out_dir}'
diff --git a/spras/evaluation.py b/spras/evaluation.py
@@ -179,8 +179,9 @@ def node_precision_and_recall(file_paths: Iterable[Union[str, PathLike]], node_t
         pr_df = pd.DataFrame(results)
         return pr_df
 
+
     @staticmethod
-    def visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, title: str):
+    def nodes_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, title: str):
         """
         Generates a scatter plot of precision and recall values for each pathway and saves both
         the plot and the data.
@@ -230,11 +231,127 @@ def visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str |
         pr_df.drop(columns=['Algorithm'], inplace=True)
         pr_df.to_csv(output_file, sep='\t', index=False)
 
+    def edge_precision_and_recall(file_paths: Iterable[Union[str, PathLike]], mixed_edge_table: pd.DataFrame, directed_edge_table: pd.DataFrame, undirected_edge_table: pd.DataFrame) -> pd.DataFrame:
+        """
+        Computes edge-level precision and recall for each pathway reconstruction output file against three edge gold standard tables.
+
+        This function takes a list of file paths corresponding to pathway reconstruction algorithm outputs,
+        each formatted as a tab-separated file with columns 'Node1', 'Node2', 'Rank', and 'Direction'.
+        It compares the set of predicted edges to the three provided gold standard edge tables and computes precision and recall per file.
+
+        @param file_paths: list of file paths of pathway reconstruction algorithm outputs
+        @param mixed_edge_table: the gold standard edges that includes directed and undirected edges
+        @param directed_edge_table: the gold standard edges that only includes directed edges
+        @param undirected_edge_table: the gold standard edges that only includes undirected edges
+        @return: A DataFrame with the following columns:
+                - 'Pathway': Path object corresponding to each pathway file
+                - 'Precision': Precision of predicted nodes vs. gold standard nodes
+                - 'Recall': Recall of predicted nodes vs. gold standard nodes
+                - 'Gold_Standard_Type': Which  gold standard was used to calculate the precision and recall
+        """
+
+        y_true_mixed = set(map(tuple, mixed_edge_table[['Interactor1', 'Interactor2', 'Direction']].values))
+        y_true_directed = set(map(tuple, directed_edge_table[['Interactor1', 'Interactor2', 'Direction']].values))
+        y_true_undirected =  set(map(tuple, undirected_edge_table[['Interactor1', 'Interactor2', 'Direction']].values))
+
+        results = []
+        for f in file_paths:
+            df = pd.read_table(f, sep='\t', header=0)
+            y_pred =  set(map(tuple, df[['Node1', 'Node2', 'Direction']].values))
+
+            all_edges_mixed = y_true_mixed.union(y_pred)
+            y_true_mixed_binary = [1 if edge in y_true_mixed else 0 for edge in all_edges_mixed]
+            y_pred_mixed_binary = [1 if edge in y_pred else 0 for edge in all_edges_mixed]
+            # default to 0.0 if there is a divide by 0 error
+            # not using precision_recall_curve because thresholds are binary (0 or 1); rather we are directly
+            # calculating precision and recall per pathway
+            precision_mixed = precision_score(y_true_mixed_binary, y_pred_mixed_binary, zero_division=0.0)
+            recall_mixed = recall_score(y_true_mixed_binary, y_pred_mixed_binary, zero_division=0.0)
+            results.append({'Pathway': f, 'Precision': precision_mixed, 'Recall': recall_mixed, 'Gold_Standard_Type': "mixed"})
+
+            all_edges_directed = y_true_directed.union(y_pred)
+            y_true_directed_binary = [1 if edge in y_true_directed else 0 for edge in all_edges_directed]
+            y_pred_directed_binary = [1 if edge in y_pred else 0 for edge in all_edges_directed]
+            precision_directed = precision_score(y_true_directed_binary, y_pred_directed_binary, zero_division=0.0)
+            recall_directed = recall_score(y_true_directed_binary, y_pred_directed_binary, zero_division=0.0)
+            results.append({'Pathway': f, 'Precision': precision_directed, 'Recall': recall_directed, 'Gold_Standard_Type': "directed"})
+
+            all_edges_undirected = y_true_undirected.union(y_pred)
+            y_true_undirected_binary = [1 if edge in y_true_undirected else 0 for edge in all_edges_undirected]
+            y_pred_undirected_binary = [1 if edge in y_pred else 0 for edge in all_edges_undirected]
+            precision_undirected = precision_score(y_true_undirected_binary, y_pred_undirected_binary, zero_division=0.0)
+            recall_undirected = recall_score(y_true_undirected_binary, y_pred_undirected_binary, zero_division=0.0)
+            results.append({'Pathway': f, 'Precision': precision_undirected, 'Recall': recall_undirected, 'Gold_Standard_Type': "undirected"})
+
+        pr_df = pd.DataFrame(results)
+        return pr_df
+
+    @staticmethod
+    def edges_visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, title: str):
+        """
+        Generates three scatter plot subplots showing edge precision and recall values for each pathway across three edge gold standard types,
+        and saves both the resulting plots and the corresponding data.
+
+        This function is intended for visualizing how different pathway reconstructions perform
+        (not a precision-recall curve) showing the precision and recall of each parameter combination
+        for each algorithm per edge gold standard dataset.
+
+        @param pr_df: Dataframe of calculated precision and recall for each pathway file per edge gold standard.
+                      Must include a preprocessed 'Algorithm' column and 'Gold_Standard_Type" column
+        @param output_file: the filename to save the precision and recall of each pathway per gold standard type
+        @param output_png: the filename to plot the precision and recall of each pathway (not a PRC) per gold standard type
+        @param title: The title to use for the plot
+        """
+        if 'Algorithm' not in pr_df.columns:
+            raise ValueError(
+                "Column 'Algorithm' not found in DataFrame. "
+                "The input DataFrame must include a preprocessed 'Algorithm' column to visulize a precision and recall per pathway file per gold standard type."
+            )
+        if 'Gold_Standard_Type' not in pr_df.columns:
+            raise ValueError(
+                "Column 'Gold_Standard_Type' not found in DataFrame. "
+                "The input DataFrame must include a preprocessed 'Gold_Standard_Type' column indicating the edge directionality used for the gold standard, which is required to visualize precision and recall for each pathway file per gold standard type."
+            )
+
+
+        gs_types = pr_df["Gold_Standard_Type"].unique().tolist()
+        fig, axes = plt.subplots(1, len(gs_types), figsize=(6 * len(gs_types), 5), sharex=True, sharey=True, constrained_layout=True)
+        color_palette = create_palette(pr_df['Algorithm'].tolist())
+
+        for ax, gs_type in zip(axes, gs_types, strict=True):
+            df_gs_type = pr_df[pr_df["Gold_Standard_Type"] == gs_type]
+            for algorithm, subset in df_gs_type.groupby('Algorithm'):
+                if not subset.empty:
+                    ax.plot(
+                        subset['Recall'],
+                        subset['Precision'],
+                        color=color_palette[algorithm],
+                        marker='o',
+                        linestyle='',
+                        label=algorithm.capitalize()
+                    )
+            ax.set_title(gs_type.capitalize())
+            ax.set_xlim(-0.05, 1.05)
+            ax.set_ylim(-0.05, 1.05)
+            ax.grid(True)
+
+        fig.supxlabel("Recall")
+        fig.supylabel("Precision")
+        fig.suptitle(title)
+        handles, labels = axes[0].get_legend_handles_labels()
+        fig.legend(handles, labels, loc="upper right") # TODO: when doing aggregate per algorithm, check if this needs to be fixed to be in a different place (issue might be constrained_layout)
+        plt.savefig(output_png)
+        plt.close(fig)
+
+        # save dataframe
+        pr_df.drop(columns=['Algorithm'], inplace=True)
+        pr_df.to_csv(output_file, sep='\t', index=False)
+
     @staticmethod
     def precision_and_recall_per_pathway(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, aggregate_per_algorithm: bool = False):
         """
         Function for visualizing per pathway precision and recall across all algorithms. Each point in the plot represents
-        a single pathway reconstruction. If `aggregate_per_algorithm` is set to True, the plot is restricted to a single
+        a single pathway reconstruction. If `aggregate_per_algorithm` is set to True, each plot is restricted to a single
         algorithm and titled accordingly.
 
         @param pr_df: Dataframe of calculated precision and recall for each pathway file
@@ -252,39 +369,51 @@ def precision_and_recall_per_pathway(pr_df: pd.DataFrame, output_file: str | Pat
             else:
                 title = "Precision and Recall Plot Per Pathway Per Algorithm"
 
-            Evaluation.visualize_precision_and_recall_plot(pr_df, output_file, output_png, title)
+            Evaluation.nodes_visualize_precision_and_recall_plot(pr_df, output_file, output_png, title)
 
         else:
             # this block should never be reached — having 0 pathways implies that no algorithms or parameter combinations were run,
             # which indicates a deeper issue in the workflow setup.
             raise ValueError("No pathways were provided to evaluate and visulize on. This likely means no algorithms or parameter combinations were run.")
 
     @staticmethod
-    def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, aggregate_per_algorithm: bool = False):
+    def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: str | PathLike, output_png: str | PathLike, aggregate_per_algorithm: bool = False, edge_evaluation: bool = False):
         """
 
         Function for visualizing the precision and recall of the single parameter combination selected via PCA,
         either for each algorithm individually or one combination shared across all algorithms. Each point represents
         a pathway reconstruction corresponding to the PCA-selected parameter combination. If `aggregate_per_algorithm`
-        is True, the plot includes a pca chosen pathway per algorithm and titled accordingly.
+        is True, the plot includes a pca chosen pathway per algorithm and titled accordingly. If `edge_evaluation` is True,
+        the plot will include the evaluation across the three gold standard edge files.
 
         @param pr_df: Dataframe of calculated precision and recall for each pathway file
         @param output_file: the filename to save the precision and recall of each pathway
         @param output_png: the filename to plot the precision and recall of each pathway (not a PRC)
-        @param aggregate_per_algorithm: Boolean indicating if function is used per algorithm (Default False)
+        @param aggregate_per_algorithm: Boolean indicating if this function is used per algorithm (Default False)
+        @param edge_evaluation: Boolean indicating if this function is used for creating edge_evaluation plots (Default False; used for node evaluation)
         """
         # TODO update to add in the pathways for the algorithms that do not provide a pca chosen pathway https://github.com/Reed-CompBio/spras/issues/341
 
         if not pr_df.empty:
             pr_df['Algorithm'] = pr_df['Pathway'].apply(lambda p: Path(p).parent.name.split('-')[1])
             pr_df.sort_values(by=['Recall', 'Pathway'], axis=0, ascending=True, inplace=True)
 
-            if aggregate_per_algorithm:
-                title = "PCA-Chosen Pathway Per Algorithm Precision and Recall Plot"
+            if not edge_evaluation:
+                if aggregate_per_algorithm:
+                    title = "Node Evaluation PCA-Chosen Pathway Per Algorithm Precision and Recall Plot"
+                else:
+                    title = "Node Evaluation PCA-Chosen Pathway Across all Algorithms Precision and Recall Plot"
+
+                Evaluation.nodes_visualize_precision_and_recall_plot(pr_df, output_file, output_png, title)
+
             else:
-                title = "PCA-Chosen Pathway Across All Algorithms Precision and Recall Plot"
+                if aggregate_per_algorithm :
-                if aggregate_per_algorithm :
+                if aggregate_per_algorithm:
-                if aggregate_per_algorithm :
+                if aggregate_per_algorithm:
+                    title = "Edge Evaluation PCA-Chosen Pathway Per Algorithm Precision and Recall Plot"
+                else:
+                    title = "Edge Evaluation PCA-Chosen Pathway Across all Algorithms Precision and Recall Plot"
+
+                Evaluation.edges_visualize_precision_and_recall_plot(pr_df, output_file, output_png, title)
 
-            Evaluation.visualize_precision_and_recall_plot(pr_df, output_file, output_png, title)
 
         else:
             # Edge case: if all algorithms chosen use only 1 parameter combination
@@ -300,6 +429,16 @@ def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: st
                 plt.savefig(output_png)
                 plt.close()
 
+    # TODO
+    # need to make a edge_precision_recall function to make the pr_df
+    # I think then the precision_and_recall_pca_chosen_pathway function can be reused but needs to be updated to be able to differentiate between nodes or edges
+    # i think I can do that with a boolean
+    # then I need to make a edges_visualize_precision_and_recall_plot that is called
+    # these can then be reused for no parameter selection evaluation
+    # i think I will need to make a new snakemake rule for each of the evaluatuon because the gold standards only include nodes or edges,
+    # sharing the same one will cause errors that one type of evalaution doesn't exist
+
+
     @staticmethod
     def pca_chosen_pathway(coordinates_files: list[Union[str, PathLike]], pathway_summary_file: str, output_dir: str):
         """
@@ -349,6 +488,7 @@ def pca_chosen_pathway(coordinates_files: list[Union[str, PathLike]], pathway_su
             rep_pathway = os.path.join(output_dir, f"{closest_to_kde_peak['datapoint_labels']}", "pathway.txt")
             rep_pathways.append(rep_pathway)
 
+        print(rep_pathways)
         return rep_pathways
 
     @staticmethod
@@ -536,3 +676,5 @@ def edge_dummy_function(mixed_edge_table: pd.DataFrame, undirected_edge_table: p
             undirected_edge_table.to_csv(f, index=False)
             f.write("\n\nDirected Edge Table\n")
             directed_edge_table.to_csv(f, index=False)
+
+