
Commit e28989d
Reduced plotter and logger debug output; bad run input now taken by the sse_to_roc macro
GluonicPenguin committed Aug 8, 2023
1 parent dd1018e commit e28989d
Showing 3 changed files with 28 additions and 34 deletions.
autodqm_ml/plotting/plot_tools.py (8 changes: 2 additions & 6 deletions)
@@ -78,17 +78,14 @@ def make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name,
     h_reco = []
     bbc = {reco: [] for reco in recos.keys()}
     nfp = {reco: [] for reco in recos.keys()}
-    print("Run: "+str(run))
     for reco, info in recos.items():
         h = Hist1D(info["reco"], bins = bins, label = "%s [sse : %.2E]" % (reco, info["score"]))
         h._counts = info["reco"]
         h_reco.append(h)
         count_bad_bin = np.maximum(np.abs(original - info["reco"]),0.001)
         bad_bin_size = (np.asarray(count_bad_bin) > 0.001).sum()
-        print("ALGORITHM: "+reco)
-        if bad_bin_size > 0.1*len(info["reco"]):
+        #if bad_bin_size > 0.1*len(info["reco"]):
             #print("ANOMALY IN BBC: Bad bin count greater than 10 ("+str(bad_bin_size)+") in run "+str(run))
-            print("BBC: "+str(bad_bin_size))
         bbc[reco] = bad_bin_size

         reconozero = np.asarray(info["reco"])
@@ -98,9 +95,8 @@ def make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name,
             if reconozero[i]-0.01*reconozero[i] < original[i] < reconozero[i]+0.01*reconozero[i]:
                 nfp_vals[i] = 0
         nfp_tot = nfp_vals.sum()
-        if nfp_tot > 1e-4:
+        #if nfp_tot > 1e-4:
             #print("ANOMALY IN 95% DEVIATION: SSE counted at the 99% percentile greater than 1e-6 ("+str(nfp_tot)+") in run "+str(run))
-            print("OPD: "+str(nfp_tot))
         nfp[reco] = nfp_tot

     fig, (ax1,ax2) = plt.subplots(2, sharex=True, figsize=(8,6), gridspec_kw=dict(height_ratios=[3, 1]))
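A note on the two checks above: because np.maximum(np.abs(original - info["reco"]), 0.001) clamps at 0.001, the subsequent > 0.001 comparison simply counts bins where the reconstruction deviates from the original by more than 0.001, and a run was flagged when that count exceeded 10% of the bins. A minimal standalone sketch of this bad-bin-count (BBC) logic, assuming original and reco are equal-length 1D arrays (names and numbers here are illustrative, not from the repository):

    import numpy as np

    def bad_bin_count(original, reco, tol=0.001):
        # Count bins where the reconstruction deviates from the original
        # histogram content by more than tol.
        deviation = np.abs(np.asarray(original) - np.asarray(reco))
        return int((deviation > tol).sum())

    original = np.array([0.10, 0.25, 0.40, 0.25])  # toy normalized histogram
    reco = np.array([0.10, 0.30, 0.35, 0.25])      # toy reconstruction

    bbc = bad_bin_count(original, reco)
    flagged = bbc > 0.1 * len(reco)  # the threshold this commit comments out

The second check follows the same pattern, zeroing the per-bin score wherever the original lies within plus or minus 1% of the reconstructed value before summing.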
scripts/assess.py (43 changes: 20 additions & 23 deletions)
@@ -124,10 +124,9 @@ def main(args):
         for algorithm, algorithm_info in info["algorithms"].items():
             #runs_sorted = runs[awkward.argsort(runs[algorithm_info["score"]], ascending=False)]
             runs_sorted = runs
-            #print(runs_sorted)
-            #logger.info("[assess.py] For histogram '%s', algorithm '%s', the mean +/- std anomaly score is: %.2e +/- %.2e." % (h, algorithm, awkward.mean(runs[algorithm_info["score"]]), awkward.std(runs[algorithm_info["score"]])))
-            #logger.info("[assess.py] For histogram '%s', algorithm '%s', the runs with the highest anomaly scores are: " % (h, algorithm))
-            #logger.info("\t The runs with the highest anomaly scores are:")
+            logger.info("[assess.py] For histogram '%s', algorithm '%s', the mean +/- std anomaly score is: %.2e +/- %.2e." % (h, algorithm, awkward.mean(runs[algorithm_info["score"]]), awkward.std(runs[algorithm_info["score"]])))
+            logger.info("[assess.py] For histogram '%s', algorithm '%s', the runs with the highest anomaly scores are: " % (h, algorithm))
+            logger.info("\t The runs with the highest anomaly scores are:")

             if N == 5:
                 sse_df_ae = pd.DataFrame(runs_sorted.run_number)
@@ -145,11 +144,10 @@ def main(args):
             if any(x in algorithm.lower() for x in ["pca"]):
                 sse_df_pca[h] = runs_sorted[algorithm_info["score"]]

-            #for i in range(N):
-            #    logger.info("\t Run number : %d, Anomaly Score : %.2e" % (runs_sorted.run_number[i], runs_sorted[algorithm_info["score"]][i]))
+            for i in range(N):
+                logger.info("\t Run number : %d, Anomaly Score : %.2e" % (runs_sorted.run_number[i], runs_sorted[algorithm_info["score"]][i]))

     sse_df = pd.concat([sse_df_ae,sse_df_pca]).reset_index(drop=True)
-    print(sse_df['algo'])
     sse_df.to_csv(args.output_dir + "/bad_runs_sse_scores.csv",index=False)

     # Histogram of sse for algorithms
@@ -163,25 +161,22 @@ def main(args):
         for name, id in split_info:
             runs_set = runs[runs[split] == id]
             if len(runs_set) == 0:
-                #logger.warning("[assess.py] For histogram '%s', no runs belong to the set '%s', skipping making a histogram of SSE for this." % (h, name))
+                logger.warning("[assess.py] For histogram '%s', no runs belong to the set '%s', skipping making a histogram of SSE for this." % (h, name))
                 continue
             recos = {}
-            #print(runs_set)
-            #print(info['algorithms'].items())
             for algorithm, algorithm_info in info["algorithms"].items():
                 recos[algorithm] = { "score" : runs_set[algorithm_info["score"]] }
                 recos_by_label[algorithm][name] = { "score" : runs_set[algorithm_info["score"]] }

             h_name = h.replace("/", "").replace(" ", "")
             save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (split, name)
-            #print(histograms.items('original'))
-            #make_sse_plot(h_name, recos, save_name)
+            make_sse_plot(h_name, recos, save_name)

         for algorithm, recos_alg in recos_by_label.items():
             if not recos_alg:
                 continue
-            #save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (algorithm, split)
-            #make_sse_plot(h_name, recos_alg, save_name)
+            save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (algorithm, split)
+            make_sse_plot(h_name, recos_alg, save_name)

     # ROC curves (if there are labeled runs)
     has_labeled_runs = True
@@ -199,13 +194,13 @@ def main(args):
         roc_results[h] = {}
         for algorithm, algorithm_info in info["algorithms"].items():
             pred = labeled_runs[algorithm_info["score"]]
-            #roc_results[h][algorithm] = calc_roc_and_unc(labeled_runs.label, pred)
+            roc_results[h][algorithm] = calc_roc_and_unc(labeled_runs.label, pred)

-        #h_name = h.replace("/", "").replace(" ", "")
-        #save_name = args.output_dir + "/" + h_name + "_roc.pdf"
-        #plot_roc_curve(h_name, roc_results[h], save_name)
-        #plot_roc_curve(h_name, roc_results[h], save_name.replace(".pdf", "_log.pdf"), log = True)
-        #print_eff_table(h_name, roc_results[h])
+        h_name = h.replace("/", "").replace(" ", "")
+        save_name = args.output_dir + "/" + h_name + "_roc.pdf"
+        plot_roc_curve(h_name, roc_results[h], save_name)
+        plot_roc_curve(h_name, roc_results[h], save_name.replace(".pdf", "_log.pdf"), log = True)
+        print_eff_table(h_name, roc_results[h])


     # Plots of original/reconstructed histograms
@@ -220,7 +215,7 @@ def main(args):
         selected_runs_idx = runs.run_number < 0 # dummy all False
         for run in selected_runs:
             selected_runs_idx = selected_runs_idx | (runs.run_number == run)
-        #logger.debug("[assess.py] Will make plots for the %d specified runs: %s" % (len(selected_runs), str(selected_runs)))
+        logger.debug("[assess.py] Will make plots for the %d specified runs: %s" % (len(selected_runs), str(selected_runs)))

     runs_trim = runs[selected_runs_idx]
     for h, info in histograms.items():
@@ -230,14 +225,16 @@ def main(args):
             original = run[info["original"]]
             recos = {}
             for algorithm, algorithm_info in info["algorithms"].items():
+                if algorithm == "test_pca": algorithm = "PCA"
+                if algorithm == "test_ae": algorithm = "AE"
                 if algorithm_info["reco"] is None:
                     continue
                 recos[algorithm] = { "reco" : run[algorithm_info["reco"]], "score" : run[algorithm_info["score"]]}
             h_name = h.replace("/", "").replace(" ", "")
             save_name = args.output_dir + "/" + h_name + "_Run%d.pdf" % run_number
-            #make_original_vs_reconstructed_plot(h_name, original, recos, run_number, save_name)
+            make_original_vs_reconstructed_plot(h_name, original, recos, run_number, save_name)

-    #logger.info("[assess.py] Plots written to directory '%s'." % (args.output_dir))
+    logger.info("[assess.py] Plots written to directory '%s'." % (args.output_dir))

     if args.make_webpage:
         os.system("cp web/index.php %s" % args.output_dir)
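For orientation, the bad_runs_sse_scores.csv written above is exactly what scripts/sse_scores_to_roc.py (next file) reads back. A hedged sketch of its layout, inferred from the columns the two scripts touch (run_number, one SSE column per histogram, and an algo tag taking the values "ae" and "pca"); the histogram column name and all numbers are made up for illustration:

    import pandas as pd

    # Hypothetical layout: run_number and algo are referenced by the scripts;
    # histogram_1_sse stands in for the real per-histogram SSE columns.
    sse_df = pd.DataFrame({
        "run_number": [356000, 356001, 356000, 356001],
        "histogram_1_sse": [1.2e-4, 3.4e-3, 1.5e-4, 2.9e-3],
        "algo": ["ae", "ae", "pca", "pca"],
    })
    sse_df.to_csv("output_dir/bad_runs_sse_scores.csv", index=False)

    # sse_scores_to_roc.py then splits it per algorithm:
    df_pca = sse_df.loc[sse_df["algo"] == "pca"]
    df_ae = sse_df.loc[sse_df["algo"] == "ae"]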
scripts/sse_scores_to_roc.py (11 changes: 6 additions & 5 deletions)
@@ -28,15 +28,15 @@ def count_hists_above(Fdf, Fthreshold_list):
     return mean(bad_hist_array)


-def main(infile):
+def main(infile, bad_runs_string):

     #all_files = glob.glob(os.path.join("./", "scores*.csv"))
     #df = pd.concat(map(pd.read_csv, all_files), axis=1)

     df = pd.read_csv(infile + "/bad_runs_sse_scores.csv")
     df = df.loc[:,~df.columns.duplicated()].copy()
-    #bad_runs = [355989,355990,355991,355992,355993,355994,355995,355996,355997,356001,356002,356003,356046,356047,356048,356073,356162,356163,356164,356165,356170,356174,356175,356309,356321,356371,356375,356377,356378,356382,356383,356384,356385,356426,356427,356428,356431,356432,356436,356466,356467,356468,356469,356470,356471,356472,356473,356474,356475,356476,356478,356479,356481,356488,356489,356523,356524,356525,356526,356527,356528,356529,356530,356568,356576,356577,356581,356582,356613,356614,356709,356719,356720,356721,356722,356788,356789,356810,356825,356902,356906,356943,356944,356945,356950,356997,357059,357070,357076,357077,357078,357096,357098,357100]
-    bad_runs = [355865,356071,356074,356321,356375,356466,356467,356469,356472,356473,356476,356478,356481,356488,356489,356577,356581,356709,356719,356720,356721,356722,356788,356789,356815,356943,356944,356945,356997,356998,357077,357078,357100,357101,357103,357105,357110]
+
+    bad_runs = [int(run) for run in bad_runs_string.split(",")]

     df_pca = df.loc[df['algo'] == "pca"]
     df_ae = df.loc[df['algo'] == "ae"]
@@ -201,8 +201,9 @@ def main(infile):

 if __name__ == "__main__":
     parser = ArgumentParser(description=__doc__, formatter_class=ArgumentDefaultsHelpFormatter)
-    parser.add_argument("-i","--infile", type=str, help="Input directory where bad_runs_sse_scores.csv file is located (also output directory)")
+    parser.add_argument("-i","--infile", type=str, required=True, help="Input directory where bad_runs_sse_scores.csv file is located (also output directory)")
+    parser.add_argument("-br","--bad_runs", type=str, required=True, help="List of bad runs as determined by data certification reports or similar bodies (enter as comma separated numbers e.g. 356000,356002,...)")
     args = parser.parse_args()

-    main(infile=args.infile)
+    main(infile=args.infile, bad_runs_string=args.bad_runs)
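With the two now-required arguments, a typical invocation of the updated script would look like this (output_dir is a placeholder; the run numbers follow the comma-separated format from the help text):

    python scripts/sse_scores_to_roc.py --infile output_dir --bad_runs 356000,356002,356046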
