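Tested on various methods: renamed precision_and_recall to precision_and_recall_node, added per-algorithm edge precision/recall outputs, and enabled more algorithms in config/synthetic.yaml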

ctrlaltaf · ctrlaltaf · commit 4e4229567a14 · 2024-12-02T11:21:04.000-08:00
diff --git a/Snakefile b/Snakefile
@@ -389,22 +389,21 @@ rule evaluation:
         ensemble_file=lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}ensemble-pathway.txt",
         pca_coordinates_file =lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}pca-coordinates.txt"
     output: 
-        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-per-pathway.txt"]),
-        pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-per-pathway.png']),
+        pr_node_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-per-pathway.txt"]),
+        pr_node_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-per-pathway.png']),
         pr_edge_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-per-pathway_edge.txt"]),
         pr_edge_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-per-pathway_edge.png']),
         pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-curve-ensemble-nodes.png']),
         pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-pca-chosen-pathway.txt"]),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
         edge_table = Evaluation.from_file(input.gold_standard_file).edge_table
-        Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)
+        Evaluation.precision_and_recall_node(input.pathways, node_table, algorithms, output.pr_node_file, output.pr_node_png)
         Evaluation.precision_and_recall_edge(input.pathways, edge_table, algorithms, output.pr_edge_file, output.pr_edge_png)
         node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file)
         Evaluation.precision_recall_curve_node_ensemble(node_ensemble, node_table, output.pr_curve_png)
         pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir)
-        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file)
-        # Evaluation.precision_and_recall_edge(pca_chosen_pathway, edge_table, algorithms, output.pca_chosen_pr_file)
+        Evaluation.precision_and_recall_node(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file)
 
 
 # Returns all pathways for a specific algorithm and dataset
@@ -431,9 +430,13 @@ rule evaluation_per_algo_pathways:
     output: 
         pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-recall-per-pathway.txt"]),
         pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-per-pathway.png']),
+        pr_edge_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-recall-per-pathway_edge.txt"]),
+        pr_edge_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-per-pathway_edge.png']),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
-        Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)
+        Evaluation.precision_and_recall_node(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)
+        edge_table = Evaluation.from_file(input.gold_standard_file).edge_table
+        Evaluation.precision_and_recall_edge(input.pathways, edge_table, algorithms, output.pr_edge_file, output.pr_edge_png)
 
 rule evaluation_per_algo_ensemble_pr_curve:
     input: 
@@ -455,7 +458,7 @@ rule evaluation_per_algo_pca_chosen:
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
         pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir)
-        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file)
+        Evaluation.precision_and_recall_node(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file)
 
 # Remove the output directory
 rule clean:
diff --git a/config/synthetic.yaml b/config/synthetic.yaml
@@ -45,7 +45,7 @@ container_registry:
 algorithms:
       - name: "pathlinker"
         params:
-              include: false
+              include: true
               run1:
                   k: range(100,201,100)
 
@@ -69,26 +69,26 @@ algorithms:
 
       - name: "meo"
         params:
-              include: false
+              include: true
               run1:
                   max_path_length: [3]
                   local_search: ["Yes"]
                   rand_restarts: [10]
 
       - name: "mincostflow"
         params:
-              include: false
+              include: true
               run1:
                   flow: [1] # The flow must be an int
                   capacity: [1]
 
       - name: "allpairs"
         params:
-              include: false
+              include: true
 
       - name: "domino"
         params:
-              include: false
+              include: true
               run1:
                   slice_threshold: [0.3]
                   module_threshold: [0.05]
diff --git a/spras/evaluation.py b/spras/evaluation.py
@@ -84,11 +84,11 @@ def load_files_from_dict(self, gold_standard_dict: Dict):
     @staticmethod
     def precision_and_recall_edge(file_paths: Iterable[Path], edge_table: pd.DataFrame, algorithms: list, output_file: str, output_png:str=None):
         """
-        Takes in file paths for a specific dataset and an associated gold standard node table.
+        Takes in file paths for a specific dataset and an associated gold standard edge table.
         Calculates precision and recall for each pathway file
         Returns output back to output_file
         @param file_paths: file paths of pathway reconstruction algorithm outputs
-        @param node_table: the gold standard nodes
+        @param edge_table: the gold standard edges
         @param algorithms: list of algorithms used in current run of SPRAS
         @param output_file: the filename to save the precision and recall of each pathway
         @param output_png (optional): the filename to plot the precision and recall of each pathway (not a PRC)
@@ -99,17 +99,12 @@ def precision_and_recall_edge(file_paths: Iterable[Path], edge_table: pd.DataFra
         results = []
         for file in file_paths:
             df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"])
-            print(file)
-            print(df)
             y_pred = set()
             for row in df.itertuples():
                 y_pred.add((row[1], row[2]))
             all_edges = set(gs_edges.union(y_pred))
             y_true_binary = [1 if (edge[0], edge[1]) in gs_edges or (edge[1], edge[0]) in gs_edges else 0 for edge in all_edges]
             y_pred_binary = [1 if (edge[0], edge[1]) in y_pred or (edge[1], edge[0]) in y_pred else 0 for edge in all_edges]
-
-            # # default to 0.0 if there is a divide by 0 error
-            # # not using precision_recall_curve because thresholds are binary (0 or 1); rather we are directly calculating precision and recall per pathway
             precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0)
             recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0)
             results.append({"Pathway": file, "Precision": precision, "Recall": recall})
@@ -152,7 +147,7 @@ def precision_and_recall_edge(file_paths: Iterable[Path], edge_table: pd.DataFra
                 plt.savefig(output_png)
 
     @staticmethod
-    def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, algorithms: list, output_file: str, output_png:str=None):
+    def precision_and_recall_node(file_paths: Iterable[Path], node_table: pd.DataFrame, algorithms: list, output_file: str, output_png:str=None):
         """
         Takes in file paths for a specific dataset and an associated gold standard node table.
         Calculates precision and recall for each pathway file