
Commit e28989d
Reduced plotter and logger debug output; bad run input now taken by the sse_to_roc macro
GluonicPenguin committed Aug 8, 2023
1 parent dd1018e commit e28989d
Showing 3 changed files with 28 additions and 34 deletions.
autodqm_ml/plotting/plot_tools.py (8 changes: 2 additions & 6 deletions)
@@ -78,17 +78,14 @@ def make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name,
     h_reco = []
     bbc = {reco: [] for reco in recos.keys()}
     nfp = {reco: [] for reco in recos.keys()}
-    print("Run: "+str(run))
     for reco, info in recos.items():
         h = Hist1D(info["reco"], bins = bins, label = "%s [sse : %.2E]" % (reco, info["score"]))
         h._counts = info["reco"]
         h_reco.append(h)
         count_bad_bin = np.maximum(np.abs(original - info["reco"]),0.001)
         bad_bin_size = (np.asarray(count_bad_bin) > 0.001).sum()
-        print("ALGORITHM: "+reco)
-        if bad_bin_size > 0.1*len(info["reco"]):
+        #if bad_bin_size > 0.1*len(info["reco"]):
             #print("ANOMALY IN BBC: Bad bin count greater than 10 ("+str(bad_bin_size)+") in run "+str(run))
-            print("BBC: "+str(bad_bin_size))
         bbc[reco] = bad_bin_size

         reconozero = np.asarray(info["reco"])
@@ -98,9 +95,8 @@ def make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name,
             if reconozero[i]-0.01*reconozero[i] < original[i] < reconozero[i]+0.01*reconozero[i]:
                 nfp_vals[i] = 0
         nfp_tot = nfp_vals.sum()
-        if nfp_tot > 1e-4:
+        #if nfp_tot > 1e-4:
             #print("ANOMALY IN 95% DEVIATION: SSE counted at the 99% percentile greater than 1e-6 ("+str(nfp_tot)+") in run "+str(run))
-            print("OPD: "+str(nfp_tot))
         nfp[reco] = nfp_tot

     fig, (ax1,ax2) = plt.subplots(2, sharex=True, figsize=(8,6), gridspec_kw=dict(height_ratios=[3, 1]))
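A note on the two checks above: because np.maximum(np.abs(original - info["reco"]), 0.001) clamps at 0.001, the subsequent > 0.001 comparison simply counts bins where the reconstruction deviates from the original by more than 0.001, and a run was flagged when that count exceeded 10% of the bins. A minimal standalone sketch of this bad-bin-count (BBC) logic, assuming original and reco are equal-length 1D arrays (names and numbers here are illustrative, not from the repository):

    import numpy as np

    def bad_bin_count(original, reco, tol=0.001):
        # Count bins where the reconstruction deviates from the original
        # histogram content by more than tol.
        deviation = np.abs(np.asarray(original) - np.asarray(reco))
        return int((deviation > tol).sum())

    original = np.array([0.10, 0.25, 0.40, 0.25])  # toy normalized histogram
    reco = np.array([0.10, 0.30, 0.35, 0.25])      # toy reconstruction

    bbc = bad_bin_count(original, reco)
    flagged = bbc > 0.1 * len(reco)  # the threshold this commit comments out

The second check follows the same pattern, zeroing the per-bin score wherever the original lies within plus or minus 1% of the reconstructed value before summing.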
scripts/assess.py (43 changes: 20 additions & 23 deletions)
@@ -124,10 +124,9 @@ def main(args):
         for algorithm, algorithm_info in info["algorithms"].items():
             #runs_sorted = runs[awkward.argsort(runs[algorithm_info["score"]], ascending=False)]
             runs_sorted = runs
-            #print(runs_sorted)
-            #logger.info("[assess.py] For histogram '%s', algorithm '%s', the mean +/- std anomaly score is: %.2e +/- %.2e." % (h, algorithm, awkward.mean(runs[algorithm_info["score"]]), awkward.std(runs[algorithm_info["score"]])))
-            #logger.info("[assess.py] For histogram '%s', algorithm '%s', the runs with the highest anomaly scores are: " % (h, algorithm))
-            #logger.info("\t The runs with the highest anomaly scores are:")
+            logger.info("[assess.py] For histogram '%s', algorithm '%s', the mean +/- std anomaly score is: %.2e +/- %.2e." % (h, algorithm, awkward.mean(runs[algorithm_info["score"]]), awkward.std(runs[algorithm_info["score"]])))
+            logger.info("[assess.py] For histogram '%s', algorithm '%s', the runs with the highest anomaly scores are: " % (h, algorithm))
+            logger.info("\t The runs with the highest anomaly scores are:")

             if N == 5:
                 sse_df_ae = pd.DataFrame(runs_sorted.run_number)
@@ -145,11 +144,10 @@ def main(args):
             if any(x in algorithm.lower() for x in ["pca"]):
                 sse_df_pca[h] = runs_sorted[algorithm_info["score"]]

-            #for i in range(N):
-            #    logger.info("\t Run number : %d, Anomaly Score : %.2e" % (runs_sorted.run_number[i], runs_sorted[algorithm_info["score"]][i]))
+            for i in range(N):
+                logger.info("\t Run number : %d, Anomaly Score : %.2e" % (runs_sorted.run_number[i], runs_sorted[algorithm_info["score"]][i]))

     sse_df = pd.concat([sse_df_ae,sse_df_pca]).reset_index(drop=True)
-    print(sse_df['algo'])
     sse_df.to_csv(args.output_dir + "/bad_runs_sse_scores.csv",index=False)

     # Histogram of sse for algorithms
@@ -163,25 +161,22 @@ def main(args):
         for name, id in split_info:
             runs_set = runs[runs[split] == id]
             if len(runs_set) == 0:
-                #logger.warning("[assess.py] For histogram '%s', no runs belong to the set '%s', skipping making a histogram of SSE for this." % (h, name))
+                logger.warning("[assess.py] For histogram '%s', no runs belong to the set '%s', skipping making a histogram of SSE for this." % (h, name))
                 continue
             recos = {}
-            #print(runs_set)
-            #print(info['algorithms'].items())
             for algorithm, algorithm_info in info["algorithms"].items():
                 recos[algorithm] = { "score" : runs_set[algorithm_info["score"]] }
                 recos_by_label[algorithm][name] = { "score" : runs_set[algorithm_info["score"]] }

             h_name = h.replace("/", "").replace(" ", "")
             save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (split, name)
-            #print(histograms.items('original'))
-            #make_sse_plot(h_name, recos, save_name)
+            make_sse_plot(h_name, recos, save_name)

         for algorithm, recos_alg in recos_by_label.items():
             if not recos_alg:
                 continue
-            #save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (algorithm, split)
-            #make_sse_plot(h_name, recos_alg, save_name)
+            save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (algorithm, split)
+            make_sse_plot(h_name, recos_alg, save_name)

     # ROC curves (if there are labeled runs)
     has_labeled_runs = True
@@ -199,13 +194,13 @@ def main(args):
         roc_results[h] = {}
         for algorithm, algorithm_info in info["algorithms"].items():
             pred = labeled_runs[algorithm_info["score"]]
-            #roc_results[h][algorithm] = calc_roc_and_unc(labeled_runs.label, pred)
+            roc_results[h][algorithm] = calc_roc_and_unc(labeled_runs.label, pred)

-        #h_name = h.replace("/", "").replace(" ", "")
-        #save_name = args.output_dir + "/" + h_name + "_roc.pdf"
-        #plot_roc_curve(h_name, roc_results[h], save_name)
-        #plot_roc_curve(h_name, roc_results[h], save_name.replace(".pdf", "_log.pdf"), log = True)
-        #print_eff_table(h_name, roc_results[h])
+        h_name = h.replace("/", "").replace(" ", "")
+        save_name = args.output_dir + "/" + h_name + "_roc.pdf"
+        plot_roc_curve(h_name, roc_results[h], save_name)
+        plot_roc_curve(h_name, roc_results[h], save_name.replace(".pdf", "_log.pdf"), log = True)
+        print_eff_table(h_name, roc_results[h])


     # Plots of original/reconstructed histograms
@@ -220,7 +215,7 @@ def main(args):
         selected_runs_idx = runs.run_number < 0 # dummy all False
         for run in selected_runs:
             selected_runs_idx = selected_runs_idx | (runs.run_number == run)
-        #logger.debug("[assess.py] Will make plots for the %d specified runs: %s" % (len(selected_runs), str(selected_runs)))
+        logger.debug("[assess.py] Will make plots for the %d specified runs: %s" % (len(selected_runs), str(selected_runs)))

     runs_trim = runs[selected_runs_idx]
     for h, info in histograms.items():
@@ -230,14 +225,16 @@ def main(args):
             original = run[info["original"]]
             recos = {}
             for algorithm, algorithm_info in info["algorithms"].items():
+                if algorithm == "test_pca": algorithm = "PCA"
+                if algorithm == "test_ae": algorithm = "AE"
                 if algorithm_info["reco"] is None:
                     continue
                 recos[algorithm] = { "reco" : run[algorithm_info["reco"]], "score" : run[algorithm_info["score"]]}
             h_name = h.replace("/", "").replace(" ", "")
             save_name = args.output_dir + "/" + h_name + "_Run%d.pdf" % run_number
-            #make_original_vs_reconstructed_plot(h_name, original, recos, run_number, save_name)
+            make_original_vs_reconstructed_plot(h_name, original, recos, run_number, save_name)

-    #logger.info("[assess.py] Plots written to directory '%s'." % (args.output_dir))
+    logger.info("[assess.py] Plots written to directory '%s'." % (args.output_dir))

     if args.make_webpage:
         os.system("cp web/index.php %s" % args.output_dir)
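For orientation, the bad_runs_sse_scores.csv written above is exactly what scripts/sse_scores_to_roc.py (next file) reads back. A hedged sketch of its layout, inferred from the columns the two scripts touch (run_number, one SSE column per histogram, and an algo tag taking the values "ae" and "pca"); the histogram column name and all numbers are made up for illustration:

    import pandas as pd

    # Hypothetical layout: run_number and algo are referenced by the scripts;
    # histogram_1_sse stands in for the real per-histogram SSE columns.
    sse_df = pd.DataFrame({
        "run_number": [356000, 356001, 356000, 356001],
        "histogram_1_sse": [1.2e-4, 3.4e-3, 1.5e-4, 2.9e-3],
        "algo": ["ae", "ae", "pca", "pca"],
    })
    sse_df.to_csv("output_dir/bad_runs_sse_scores.csv", index=False)

    # sse_scores_to_roc.py then splits it per algorithm:
    df_pca = sse_df.loc[sse_df["algo"] == "pca"]
    df_ae = sse_df.loc[sse_df["algo"] == "ae"]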
scripts/sse_scores_to_roc.py (11 changes: 6 additions & 5 deletions)
@@ -28,15 +28,15 @@ def count_hists_above(Fdf, Fthreshold_list):
     return mean(bad_hist_array)


-def main(infile):
+def main(infile, bad_runs_string):

     #all_files = glob.glob(os.path.join("./", "scores*.csv"))
     #df = pd.concat(map(pd.read_csv, all_files), axis=1)

     df = pd.read_csv(infile + "/bad_runs_sse_scores.csv")
     df = df.loc[:,~df.columns.duplicated()].copy()
-    #bad_runs = [355989,355990,355991,355992,355993,355994,355995,355996,355997,356001,356002,356003,356046,356047,356048,356073,356162,356163,356164,356165,356170,356174,356175,356309,356321,356371,356375,356377,356378,356382,356383,356384,356385,356426,356427,356428,356431,356432,356436,356466,356467,356468,356469,356470,356471,356472,356473,356474,356475,356476,356478,356479,356481,356488,356489,356523,356524,356525,356526,356527,356528,356529,356530,356568,356576,356577,356581,356582,356613,356614,356709,356719,356720,356721,356722,356788,356789,356810,356825,356902,356906,356943,356944,356945,356950,356997,357059,357070,357076,357077,357078,357096,357098,357100]
-    bad_runs = [355865,356071,356074,356321,356375,356466,356467,356469,356472,356473,356476,356478,356481,356488,356489,356577,356581,356709,356719,356720,356721,356722,356788,356789,356815,356943,356944,356945,356997,356998,357077,357078,357100,357101,357103,357105,357110]
+
+    bad_runs = [int(run) for run in bad_runs_string.split(",")]

     df_pca = df.loc[df['algo'] == "pca"]
     df_ae = df.loc[df['algo'] == "ae"]
@@ -201,8 +201,9 @@ def main(infile):

 if __name__ == "__main__":
     parser = ArgumentParser(description=__doc__, formatter_class=ArgumentDefaultsHelpFormatter)
-    parser.add_argument("-i","--infile", type=str, help="Input directory where bad_runs_sse_scores.csv file is located (also output directory)")
+    parser.add_argument("-i","--infile", type=str, required=True, help="Input directory where bad_runs_sse_scores.csv file is located (also output directory)")
+    parser.add_argument("-br","--bad_runs", type=str, required=True, help="List of bad runs as determined by data certification reports or similar bodies (enter as comma separated numbers e.g. 356000,356002,...)")
     args = parser.parse_args()

-    main(infile=args.infile)
+    main(infile=args.infile, bad_runs_string=args.bad_runs)
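With the two now-required arguments, a typical invocation of the updated script would look like this (output_dir is a placeholder; the run numbers follow the comma-separated format from the help text):

    python scripts/sse_scores_to_roc.py --infile output_dir --bad_runs 356000,356002,356046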
