diff --git a/README.md b/README.md
index 13e6c99..eb661be 100644
--- a/README.md
+++ b/README.md
@@ -42,15 +42,10 @@ and then rerunning the command to create the `conda` env. The resulting `conda e
 
 **3. Install autodqm-ml**
 
-**Users** can install with:
-```
-python setup.py install
-```
-**Developers** are suggested to install with:
+Install with:
 ```
 pip install -e .
 ```
-to avoid rerunning the whole installation every time there is a change.
 
 Once your setup is installed, you can activate your python environment with
 ```
diff --git a/autodqm_ml/algorithms/anomaly_detection_algorithm.py b/autodqm_ml/algorithms/anomaly_detection_algorithm.py
index f2c364e..cad13a9 100644
--- a/autodqm_ml/algorithms/anomaly_detection_algorithm.py
+++ b/autodqm_ml/algorithms/anomaly_detection_algorithm.py
@@ -2,6 +2,7 @@
 import pandas
 import numpy
 import awkward
+import json
 
 from autodqm_ml import utils
 from autodqm_ml.data_formats.histogram import Histogram
@@ -72,6 +73,7 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s
 
         if histograms:
             self.histograms = histograms
+        self.histogram_name_map = {} # we replace "/" and spaces in input histogram names to play nicely with other packages; this map lets you convert between the sanitized and original names
 
         logger.debug("[AnomalyDetectionAlgorithm : load_data] Loading training data from file '%s'" % (self.input_file))
@@ -81,6 +83,7 @@
         # Set helpful metadata
         for histogram, histogram_info in self.histograms.items():
             self.histograms[histogram]["name"] = histogram.replace("/", "").replace(" ","")
+            self.histogram_name_map[self.histograms[histogram]["name"]] = histogram
 
             a = awkward.to_numpy(df[histogram][0])
             self.histograms[histogram]["shape"] = a.shape
@@ -134,9 +137,9 @@
         self.n_train = awkward.sum(df.train_label == 0)
         self.n_test = awkward.sum(df.train_label == 1)
         self.df = df
+        self.n_histograms = len(list(self.histograms.keys()))
 
-
-        logger.debug("[AnomalyDetectionAlgorithm : load_data] Loaded data for %d histograms with %d events in training set and %d events in testing set." % (len(list(self.histograms.keys())), self.n_train, self.n_test))
+        logger.debug("[AnomalyDetectionAlgorithm : load_data] Loaded data for %d histograms with %d events in training set and %d events in testing set." % (self.n_histograms, self.n_train, self.n_test))
 
         self.data_is_loaded = True
 
@@ -160,3 +163,13 @@ def save(self):
             self.output_file = "%s/%s.parquet" % (self.output_dir, self.input_file.split("/")[-1].replace(".parquet", ""))
         logger.info("[AnomalyDetectionAlgorithm : save] Saving output with additional fields to file '%s'." % (self.output_file))
         awkward.to_parquet(self.df, self.output_file)
+
+        self.config_file = "%s/%s_%s.json" % (self.output_dir, self.name, self.tag)
+        config = {}
+        for k,v in vars(self).items():
+            if utils.is_json_serializable(v):
+                config[k] = v
+
+        logger.info("[AnomalyDetectionAlgorithm : save] Saving AnomalyDetectionAlgorithm config to file '%s'." % (self.config_file))
+        with open(self.config_file, "w") as f_out:
+            json.dump(config, f_out, sort_keys = True, indent = 4)
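+
+        # The config json can be reloaded later to check exactly which settings
+        # produced a given set of outputs, e.g. (hypothetical file name):
+        #   with open("output/autoencoder_test.json", "r") as f_in:
+        #       saved_config = json.load(f_in)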
diff --git a/autodqm_ml/algorithms/autoencoder.py b/autodqm_ml/algorithms/autoencoder.py
index 1ee8399..b4226e6 100644
--- a/autodqm_ml/algorithms/autoencoder.py
+++ b/autodqm_ml/algorithms/autoencoder.py
@@ -14,26 +14,54 @@ from autodqm_ml import utils
 
 DEFAULT_OPT = {
+    "batch_size" : 128,
+    "val_batch_size" : 1024,
+    "learning_rate" : 0.001,
+    "n_epochs" : 1000,
+    "early_stopping" : True,
+    "early_stopping_rounds" : 3,
     "n_hidden_layers" : 2,
-    "n_nodes" : 25,
+    "n_nodes" : 50,
     "n_components" : 3,
     "kernel_1d" : 3,
     "kernel_2d" : 3,
-    "n_filters" : 8
+    "strides_1d" : 1,
+    "strides_2d" : 1,
+    "dropout" : 0.0,
+    "batch_norm" : False,
+    "n_filters" : 12
 }
 
 class AutoEncoder(MLAlgorithm):
     """
     Autoencoder base class.
+
+    :param config: dictionary with hyperparameters for autoencoder training. Any hyperparameters not specified will be taken from the default values in `DEFAULT_OPT`
+    :type config: dict
+    :param autoencoder_mode: string to specify whether you want to train an autoencoder for each histogram ("individual") or a single autoencoder on all histograms ("simultaneous")
+    :type autoencoder_mode: str
     """
     def __init__(self, **kwargs):
         super(AutoEncoder, self).__init__(**kwargs)
 
         self.config = utils.update_dict(
             original = DEFAULT_OPT,
-            new = self.__dict__
+            new = kwargs.get('config', {})
         )
+        self.mode = kwargs.get('autoencoder_mode', 'individual')
+        if self.mode is None:
+            self.mode = "individual"
+
+        if not self.mode in ["individual", "simultaneous"]:
+            logger.exception("[AutoEncoder : __init__] mode '%s' is not a recognized option for AutoEncoder. Currently available modes are 'individual' (default) and 'simultaneous'." % (self.mode))
+            raise ValueError()
+        self.models = {}
+
+        logger.debug("[AutoEncoder : __init__] Constructing AutoEncoder with the following training options and hyperparameters:")
+        for param, value in self.config.items():
+            logger.debug("\t %s : %s" % (param, str(value)))
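+
+    # Example construction (hypothetical values; anything not given in `config`
+    # falls back to DEFAULT_OPT):
+    #   autoencoder = AutoEncoder(
+    #       tag = "test",
+    #       autoencoder_mode = "simultaneous",
+    #       config = { "n_epochs" : 200, "learning_rate" : 5e-4 }
+    #   )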
 
     def load_model(self, model_file):
         """
@@ -47,66 +76,103 @@ def save_model(self, model, model_file):
         """
 
         """
+        logger.debug("[AutoEncoder : save_model] Saving trained autoencoder to file '%s'." % (model_file))
         model.save(model_file)
 
-    def train(self, n_epochs = 1000, batch_size = 128):
+    def train(self):
         """
 
         """
-        model_file = "%s/autoencoder_%s.h5" % (self.output_dir, self.tag)
-        if os.path.exists(model_file):
-            logger.warning("[AutoEncoder : train] A trained AutoEncoder alread exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file))
-            self.model = self.load_model(model_file)
-            return
-
-        inputs, outputs = self.make_inputs(split = "train")
-        inputs_val, outputs_val = self.make_inputs(split = "test")
-
-        self.model = AutoEncoder_DNN(self.histograms, **self.config).model()
-
-        self.model.compile(
-            optimizer = keras.optimizers.Adam(),
-            loss = keras.losses.MeanSquaredError()
-        )
+        if self.mode == "simultaneous":
+            self.models = { None : None }
+            logger.debug("[AutoEncoder : train] Mode selected as 'simultaneous', meaning a single autoencoder will be trained simultaneously on all histograms. Use 'individual' if you wish to train one autoencoder for each histogram.")
+        elif self.mode == "individual":
+            self.models = { k : None for k in self.histograms }
+            logger.debug("[AutoEncoder : train] Mode selected as 'individual', meaning one autoencoder will be trained for each histogram. Use 'simultaneous' if you wish to train a single autoencoder for all histograms.")
+
+        for histogram in self.models:
+            if histogram is None:
+                model_file = "%s/autoencoder_%s.h5" % (self.output_dir, self.tag)
+            else:
+                model_file = "%s/autoencoder_%s_%s.h5" % (self.output_dir, histogram, self.tag)
+
+            if os.path.exists(model_file):
+                logger.warning("[AutoEncoder : train] A trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file))
+                self.models[histogram] = self.load_model(model_file)
+                continue
 
-        self.model.fit(
-            inputs,
-            outputs,
-            validation_data = (inputs_val, outputs_val),
-            callbacks = [keras.callbacks.EarlyStopping(patience = 3)],
-            epochs = n_epochs,
-            batch_size = batch_size
-        )
-        self.save_model(self.model, model_file)
-
-    def predict(self, batch_size = 1024):
-        inputs, outputs = self.make_inputs(split = "all")
-        pred = self.model.predict(inputs, batch_size = batch_size)
-
-        idx = 0
-        for histogram, histogram_info in self.histograms.items():
-            original_hist = self.df[histogram]
-            if len(self.histograms.items()) >= 2:
-                reconstructed_hist = awkward.flatten(awkward.from_numpy(pred[idx]), axis = -1)
-            else:
-                reconstructed_hist = awkward.flatten(awkward.from_numpy(pred), axis = -1)
 
-            sse = awkward.sum(
-                (original_hist - reconstructed_hist) ** 2,
-                axis = -1
+            inputs, outputs = self.make_inputs(split = "train", histogram_name = histogram)
+            inputs_val, outputs_val = self.make_inputs(split = "test", histogram_name = histogram)
 
+            if histogram is None:
+                hist_name = str(list(self.histograms.keys()))
+            else:
+                hist_name = histogram
+            logger.debug("[AutoEncoder : train] Training autoencoder with %d dimensions in latent space for histogram(s) '%s' with %d training examples." % (self.config["n_components"], hist_name, len(list(inputs.values())[0])))
+
+            if self.mode == "simultaneous":
+                histograms = self.histograms
+            elif self.mode == "individual":
+                histograms = { histogram : self.histograms[histogram] }
+
+            model = AutoEncoder_DNN(histograms, **self.config).model()
+
+            model.compile(
+                optimizer = keras.optimizers.Adam(learning_rate = self.config["learning_rate"]),
+                loss = keras.losses.MeanSquaredError()
             )
-            # For 2d histograms, we need to sum over one more axis to get a single SSE score for each run
-            if histogram_info["n_dim"] == 2:
-                sse = awkward.sum(sse, axis = -1)
-
-            self.add_prediction(histogram, sse, reconstructed_hist)
-            idx += 1
 
+            callbacks = []
+            if self.config["early_stopping"]:
+                callbacks.append(keras.callbacks.EarlyStopping(patience = self.config["early_stopping_rounds"]))
+
+            model.fit(
+                inputs,
+                outputs,
+                validation_data = (inputs_val, outputs_val),
+                callbacks = callbacks,
+                epochs = self.config["n_epochs"],
+                batch_size = self.config["batch_size"]
+            )
+
+            self.save_model(model, model_file)
+            self.models[histogram] = model
+
+    def predict(self, batch_size = 1024):
+        for histogram, model in self.models.items():
+            inputs, outputs = self.make_inputs(split = "all", histogram_name = histogram)
+            predictions = model.predict(inputs, batch_size = batch_size)
 
-    def make_inputs(self, split = None):
+            if self.mode == "simultaneous" and self.n_histograms >= 2:
+                predictions = { name : pred for name, pred in zip(model.output_names, predictions) }
+            else:
+                predictions = { model.output_names[0] : predictions }
+
+            for name, pred in predictions.items(): # pred shape: [n_runs, histogram dimensions, 1]
+                hist_name = self.histogram_name_map[name.replace("output_", "")]
+                original_hist = self.df[hist_name] # shape [n_runs, histogram dimensions]
+
+                reconstructed_hist = awkward.flatten( # change shape from [n_runs, histogram dimensions, 1] -> [n_runs, histogram dimensions]
+                    awkward.from_numpy(pred),
+                    axis = -1
+                )
+
+                sse = awkward.sum( # sum along the innermost (last) histogram dimension
+                    (original_hist - reconstructed_hist) ** 2,
+                    axis = -1
+                )
+
+                # For 2d histograms, we need to sum over one more axis to get a single SSE score for each run
+                if self.histograms[hist_name]["n_dim"] == 2:
+                    sse = awkward.sum(sse, axis = -1) # sum over the remaining histogram dimension
+
+                self.add_prediction(hist_name, sse, reconstructed_hist)
+
+
+    def make_inputs(self, split = None, histogram_name = None):
         """
 
         """
@@ -118,27 +184,29 @@ def make_inputs(self, split = None):
         elif split == "test":
             cut = self.df.train_label == 1
         else:
-            cut = self.df.train_label >= 0
+            cut = self.df.run_number >= 0 # dummy all True cut
 
         df = self.df[cut]
 
         for histogram, info in self.histograms.items():
+            if histogram_name is not None: # self.mode == "individual", i.e. separate autoencoder for each histogram
+                if histogram != histogram_name: # only grab the relevant histogram for this autoencoder
+                    continue
+
             data = tf.convert_to_tensor(df[histogram])
             inputs["input_" + info["name"]] = data
             outputs["output_" + info["name"]] = data
+
+        return inputs, outputs
 
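+# Note: the anomaly score stored by predict() for each run is the sum of squared
+# errors (SSE) between the original and reconstructed histogram, summed over all bins.
+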
""" def __init__(self, histograms, **kwargs): - super(AutoEncoder_DNN, self).__init__() - self.n_histograms = len(histograms.keys()) self.__dict__.update(kwargs) @@ -179,7 +247,11 @@ def __init__(self, histograms, **kwargs): def model(self): - model = keras.models.Model(inputs = self.inputs, outputs = self.outputs) + model = keras.models.Model( + inputs = self.inputs, + outputs = self.outputs, + name = "autoencoder" + ) model.summary() return model @@ -209,7 +281,12 @@ def build_encoder(self, histogram, info): activation = "relu", name = name )(layer) - + if self.batch_norm: + layer = keras.layers.BatchNormalization(name = name + "_batch_norm") + if self.dropout > 0: + layer = keras.layers.Dropout(self.dropout, name = name + "_dropout") + + encoder = keras.layers.Flatten()(layer) return input, encoder @@ -229,10 +306,14 @@ def build_decoder(self, histogram, info, input): activation = "relu" n_filters = 1 name = "output_%s" % (info["name"]) + batch_norm = False + dropout = 0 else: activation = "relu" n_filters = self.n_filters name = "decoder_%d_%s" % (i, info["name"]) + batch_norm = self.batch_norm + dropout = self.dropout if info["n_dim"] == 1: layer = keras.layers.Conv1DTranspose( @@ -252,6 +333,10 @@ def build_decoder(self, histogram, info, input): activation = activation, name = name )(layer) + if batch_norm: + layer = keras.layers.BatchNormalization(name = name + "_batch_norm") + if dropout > 0: + layer = keras.layers.Dropout(self.dropout, name = name + "_dropout") output = layer return output diff --git a/autodqm_ml/algorithms/pca.py b/autodqm_ml/algorithms/pca.py index 54c73b5..cbc60eb 100644 --- a/autodqm_ml/algorithms/pca.py +++ b/autodqm_ml/algorithms/pca.py @@ -64,6 +64,8 @@ def save_model(self, pca, model_file): :param model_file: folder name to place trained PCA pickles :type model_file: str """ + logger.debug("[PCA : save_model] Saving trained PCA to file '%s'." % (model_file)) + os.system("mkdir -p %s" % self.output_dir) pcaParams = { 'name' : model_file.split("/")[-1].replace(".json", ""), @@ -141,7 +143,6 @@ def train(self): pca.fit(input) self.model[histogram] = pca - logger.debug("[PCA : train] Saving trained PCA to file '%s'." % (model_file)) self.save_model(pca, model_file) diff --git a/autodqm_ml/evaluation/roc_tools.py b/autodqm_ml/evaluation/roc_tools.py new file mode 100644 index 0000000..729e3f8 --- /dev/null +++ b/autodqm_ml/evaluation/roc_tools.py @@ -0,0 +1,143 @@ +import numpy +import random +from sklearn import metrics +from tqdm import tqdm + +import logging +logger = logging.getLogger(__name__) + +def calc_auc(y, pred, sample_weight = None, interp = 10000): + """ + Make interpolated roc curve and calculate AUC. 
+ Keyword arguments: + y -- array of labels + pred -- array of mva scores + sample_weight -- array of per-event weights + interp -- number of points in resulting fpr and tpr arrays + """ + + if sample_weight is None: + sample_weight = numpy.ones_like(y) + + fpr, tpr, thresh = metrics.roc_curve( + y, + pred, + pos_label = 1, + sample_weight = sample_weight + ) + + fpr = sorted(fpr) + tpr = sorted(tpr) + + fpr_interp = numpy.linspace(0, 1, interp) + tpr_interp = numpy.interp(fpr_interp, fpr, tpr) # recalculate tprs at each fpr + + auc = metrics.auc(fpr, tpr) + + results = { + "fpr" : fpr_interp, + "tpr" : tpr_interp, + "auc" : auc + } + return results + +def bootstrap_indices(x): + """ + Return array of indices of len(x) to make bootstrap resamples + """ + + return numpy.random.randint(0, len(x), len(x)) + + +def calc_roc_and_unc(y, pred, sample_weight = None, n_bootstrap = 100, interp = 10000): + """ + Calculates tpr and fpr arrays (with uncertainty for tpr) and auc and uncertainty + Keyword arguments: + y -- array of labels + pred -- array of mva scores + sample_weight -- array of per-event weights + n_bootstrap -- number of bootstrap resamples to use for calculating uncs + interp -- number of points in resulting fpr and tpr arrays + """ + + y = numpy.array(y) + pred = numpy.array(pred) + + if sample_weight is None: + sample_weight = numpy.ones_like(y) + else: + sample_weight = numpy.array(sample_weight) + + logger.debug("[roc_tools.py : calc_roc_and_unc] Calculating AUC and uncertainty with %d bootstrap samples." % (n_bootstrap)) + results = calc_auc(y, pred, sample_weight) + fpr, tpr, auc = results["fpr"], results["tpr"], results["auc"] + + fprs = [fpr] + tprs = [tpr] + aucs = [auc] + + for i in tqdm(range(n_bootstrap)): + idx = bootstrap_indices(y) + + label_bootstrap = y[idx] + pred_bootstrap = pred[idx] + weights_bootstrap = sample_weight[idx] + + results_bootstrap = calc_auc(label_bootstrap, pred_bootstrap, weights_bootstrap, interp) + fpr_b, tpr_b, auc_b = results_bootstrap["fpr"], results_bootstrap["tpr"], results_bootstrap["auc"] + fprs.append(fpr_b) + tprs.append(tpr_b) + aucs.append(auc_b) + + unc = numpy.std(aucs) + tpr_mean = numpy.mean(tprs, axis=0) + tpr_unc = numpy.std(tprs, axis=0) + fpr_mean = numpy.mean(fprs, axis=0) + + results = { + "auc" : auc, + "auc_unc" : unc, + "fpr" : fpr_mean, + "tpr" : tpr_mean, + "tpr_unc" : tpr_unc + } + + return results + + +def print_eff_table(name, results): + logger.debug("[roc_tools.py : print_eff_table] Printing anomaly detection efficiencies LaTeX table for histogram '%s'." 
% name)
+
+    anom_effs = [0.5, 0.9, 0.99]
+    effs = {}
+    for algorithm, res in results.items():
+        tprs = []
+        fprs = []
+
+        for eff in anom_effs:
+            sig_eff, idx = find_nearest(res["tpr"], eff)
+            tprs.append(res["tpr"][idx])
+            fprs.append(res["fpr"][idx])
+
+        effs[algorithm] = {
+            "fpr" : fprs,
+            "tpr" : tprs,
+            "auc" : res["auc"],
+            "auc_unc" : res["auc_unc"]
+        }
+
+    print("\\begin{center} \\scriptsize")
+    print("\\begin{tabular}{r|c|l|l|l}")
+    print("\\multicolumn{5}{c}{%s} \\\\ \\hline \\hline" % name)
+    print("\\multirow{3}{*}{Algorithm} & \\multicolumn{4}{c}{Metric (\\%)} \\\\ \\cline{2-5}")
+    print(" & \\multirow{2}{*}{AUC} & \\multicolumn{3}{c}{False Alarm Rate ($\\alpha_{\\text{far}}$) at fixed $\\epsilon_{\\text{anom}}$} \\\\ \\cline{3-5}")
+    print(" & & $\\alpha_{\\text{far}} (\\epsilon_{\\text{anom}} = %d\\%%)$ & $\\alpha_{\\text{far}} (\\epsilon_{\\text{anom}} = %d\\%%)$ & $\\alpha_{\\text{far}} (\\epsilon_{\\text{anom}} = %d\\%%)$ \\\\ \\hline" % (int(anom_effs[0] * 100.), int(anom_effs[1] * 100.), int(anom_effs[2] * 100.)))
+    for algo, eff in effs.items():
+        print("%s & %.3f $\\pm$ %.3f & %.1f\\%% & %.1f\\%% & %.1f\\%% \\\\ \\hline" % (algo, eff["auc"], eff["auc_unc"], eff["fpr"][0] * 100., eff["fpr"][1] * 100., eff["fpr"][2] * 100.))
+    print("\\end{tabular} \\end{center}")
+
+
+def find_nearest(array, value):
+    val = numpy.ones_like(array) * value
+    idx = (numpy.abs(array - val)).argmin()
+    return array[idx], idx
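+
+# Example usage (hypothetical labels and scores; 1 = anomalous, 0 = good):
+#   y = [0, 0, 1, 1]
+#   scores = [0.1, 0.3, 0.9, 0.7]  # e.g. per-run SSE from an algorithm
+#   results = calc_roc_and_unc(y, scores, n_bootstrap = 50)
+#   results["auc"], results["auc_unc"]  # AUC with bootstrap uncertainty
+#   print_eff_table("some_histogram", { "autoencoder" : results })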
diff --git a/autodqm_ml/plotting/plot_tools.py b/autodqm_ml/plotting/plot_tools.py
index 8855cbf..ee81c7f 100644
--- a/autodqm_ml/plotting/plot_tools.py
+++ b/autodqm_ml/plotting/plot_tools.py
@@ -210,3 +210,48 @@ def plotMSESummary(original_hists, reconstructed_hists, threshold, hist_paths, r
 
         ax.text(1.1*max(mse), 0.5*max(hist), text, wrap=True, bbox=props)
     fig.savefig(f'plots/{algo}/MSE_Summary.png', bbox_inches='tight')
+
+
+def plot_roc_curve(h_name, results, save_name, **kwargs):
+    fig = plt.figure()
+    ax1 = fig.add_subplot(111)
+    ax1.yaxis.set_ticks_position('both')
+    ax1.grid(True)
+
+    log = kwargs.get("log", False)
+
+    idx = 0
+    for algo, res in results.items():
+        ax1.plot(
+            res["fpr"],
+            res["tpr"],
+            color = "C%d" % (idx+1),
+            label = algo + " [AUC = %.3f +/- %.3f]" % (res["auc"], res["auc_unc"])
+        )
+        ax1.fill_between(
+            res["fpr"],
+            res["tpr"] - (res["tpr_unc"] / 2.),
+            res["tpr"] + (res["tpr_unc"] / 2.),
+            color = "C%d" % (idx+1),
+            alpha = 0.25
+        )
+        idx += 1
+
+    if not log:
+        plt.ylim(0,1)
+        plt.xlim(0,1)
+    else:
+        plt.xlim(0.005, 1)
+        plt.ylim(0,1)
+        ax1.set_xscale("log")
+
+    plt.xlabel("False Alarm Rate (FPR)")
+    plt.ylabel("Anomaly Detection Efficiency (TPR)")
+
+    ax1.legend(loc = 'lower right')
+
+    logger.debug("[plot_tools.py : plot_roc_curve] Writing plot to file '%s'." % (save_name))
+    plt.savefig(save_name)
+    plt.savefig(save_name.replace(".pdf", ".png"))
+    plt.clf()
+
diff --git a/autodqm_ml/utils.py b/autodqm_ml/utils.py
index a235ffd..35c959d 100644
--- a/autodqm_ml/utils.py
+++ b/autodqm_ml/utils.py
@@ -6,6 +6,7 @@
 import os
 import copy
 import subprocess
+import json
 
 import logging
 from rich.logging import RichHandler
@@ -146,3 +147,12 @@ def check_proxy():
 
     return proxy
 
+def is_json_serializable(x):
+    """
+    Returns True if `x` is json serializable, False if not
+    """
+    try:
+        json.dumps(x)
+        return True
+    except (TypeError, ValueError):
+        return False
diff --git a/scripts/assess.py b/scripts/assess.py
index 8083e22..8ad7c36 100644
--- a/scripts/assess.py
+++ b/scripts/assess.py
@@ -6,7 +6,8 @@
 from autodqm_ml.utils import setup_logger
 from autodqm_ml.utils import expand_path
-from autodqm_ml.plotting.plot_tools import make_original_vs_reconstructed_plot, make_sse_plot
+from autodqm_ml.plotting.plot_tools import make_original_vs_reconstructed_plot, make_sse_plot, plot_roc_curve
+from autodqm_ml.evaluation.roc_tools import calc_roc_and_unc, print_eff_table
 from autodqm_ml.constants import kANOMALOUS, kGOOD
 
 def parse_arguments():
@@ -53,6 +54,12 @@
         required = False,
         default = None
     )
+    parser.add_argument(
+        "--make_webpage",
+        required = False,
+        action = "store_true",
+        help = "make a nicely browsable web page"
+    )
     parser.add_argument(
         "--debug",
         help = "run logger in DEBUG mode (INFO is default)",
@@ -123,12 +130,14 @@ def main(args):
             logger.info("\t Run number : %d, Anomaly Score : %.2e" % (runs_sorted.run_number[i], runs_sorted[algorithm_info["score"]][i]))
 
     # Histogram of sse for algorithms
+    splits = {
+        "train_label" : [("train", 0), ("test", 1)],
+        "label" : [("anomalous", kANOMALOUS), ("good", kGOOD)]
+    }
+
     for h, info in histograms.items():
-        splits = {
-            "train_label" : [("train", 0), ("test", 1)],
-            "label" : [("anomalous", kANOMALOUS), ("good", kGOOD)]
-        }
         for split, split_info in splits.items():
+            recos_by_label = { k : {} for k in info["algorithms"] }
             for name, id in split_info:
                 runs_set = runs[runs[split] == id]
                 if len(runs_set) == 0:
@@ -137,19 +146,43 @@
                 recos = {}
                 for algorithm, algorithm_info in info["algorithms"].items():
                     recos[algorithm] = { "score" : runs_set[algorithm_info["score"]] }
+                    recos_by_label[algorithm][name] = { "score" : runs_set[algorithm_info["score"]] }
+
+                h_name = h.replace("/", "").replace(" ", "")
                 save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (split, name)
                 make_sse_plot(h_name, recos, save_name)
 
+            for algorithm, recos_alg in recos_by_label.items():
+                if not recos_alg:
+                    continue
+                save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (algorithm, split)
+                make_sse_plot(h_name, recos_alg, save_name)
+
+
+    # ROC curves (if there are labeled runs)
+    has_labeled_runs = True
+    labeled_runs_cut = runs.run_number < 0 # dummy all False cut
+    for name, id in splits["label"]:
+        cut = runs.label == id
+        labeled_runs_cut = labeled_runs_cut | cut
+        runs_set = runs[cut]
+        has_labeled_runs = has_labeled_runs and (len(runs_set) > 0)
+
+    if has_labeled_runs:
+        labeled_runs = runs[labeled_runs_cut]
+        roc_results = {}
+        for h, info in histograms.items():
+            roc_results[h] = {}
+            for algorithm, algorithm_info in info["algorithms"].items():
+                pred = labeled_runs[algorithm_info["score"]]
+                roc_results[h][algorithm] = calc_roc_and_unc(labeled_runs.label, pred)
 
-    #for set, id in zip(["train", "test"], [0, 1]):
-    #    runs_set = runs[runs.train_label == id]
-    #    recos = {}
-    #    for algorithm, algorithm_info in info["algorithms"].items():
-    #        recos[algorithm] = { "score" : runs_set[algorithm_info["score"]] }
-    #    h_name = h.replace("/", "").replace(" ", "")
-    #    save_name = args.output_dir + "/" + h_name + "_sse_%s.pdf" % set
-    #    make_sse_plot(h_name, recos, save_name)
+            h_name = h.replace("/", "").replace(" ", "")
+            save_name = args.output_dir + "/" + h_name + "_roc.pdf"
+            plot_roc_curve(h_name, roc_results[h], save_name)
+            plot_roc_curve(h_name, roc_results[h], save_name.replace(".pdf", "_log.pdf"), log = True)
+            print_eff_table(h_name, roc_results[h])
+
 
     # Plots of original/reconstructed histograms
     if args.runs is None:
@@ -180,6 +213,12 @@
             save_name = args.output_dir + "/" + h_name + "_Run%d.pdf" % run_number
             make_original_vs_reconstructed_plot(h_name, original, recos, run_number, save_name)
 
+    logger.info("[assess.py] Plots written to directory '%s'." % (args.output_dir))
+
+    if args.make_webpage:
+        os.system("cp web/index.php %s" % args.output_dir)
+        os.system("chmod 755 %s" % args.output_dir)
+        os.system("chmod 755 %s/*" % args.output_dir)
 
 if __name__ == "__main__":
     args = parse_arguments()
diff --git a/scripts/train.py b/scripts/train.py
index 943fddd..111900f 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -10,25 +10,29 @@ from autodqm_ml.utils import expand_path
 
 parser = argparse.ArgumentParser()
+
+# Required arguments
+parser.add_argument(
+    "--algorithm",
+    help = "name of algorithm ('PCA' or 'Autoencoder' or 'StatisticalTester') to train with default options OR path to a json file specifying particular options for training a given algorithm.",
+    type = str,
+    required = True
+)
+
+# Optional arguments
 parser.add_argument(
     "--output_dir",
     help = "output directory to place files in",
     type = str,
     required = False,
-    default = "output"
+    default = None
 )
 parser.add_argument(
     "--tag",
     help = "tag to identify output files",
     type = str,
     required = False,
-    default = "test"
-)
-parser.add_argument(
-    "--algorithm",
-    help = "name of algorithm ('PCA' or 'Autoencoder' or 'StatisticalTester') to train with default options OR path to json filed specifying particular options for training a given algorithm.",
-    type = str,
-    required = True
+    default = None
 )
 parser.add_argument(
     "--input_file",
@@ -58,6 +62,13 @@
     required = False,
     default = None
 )
+parser.add_argument(
+    "--autoencoder_mode",
+    help = "specify whether you want to train an autoencoder for each histogram ('individual') or a single autoencoder on all histograms ('simultaneous')",
+    type = str,
+    required = False,
+    default = None
+)
 parser.add_argument(
     "--debug",
     help = "run logger in DEBUG mode (INFO is default)",
@@ -76,10 +87,16 @@
     if not os.path.exists(args.algorithm):
         algorithm_config_file = expand_path(args.algorithm)
     else:
-        algorithm_config_file = algo
+        algorithm_config_file = args.algorithm
 
     with open(algorithm_config_file, "r") as f_in:
         config = json.load(f_in)
 
+    # Add command line arguments to config
+    for k,v in vars(args).items():
+        if v is not None:
+            config[k] = v # if an argument is given both on the command line and in the json, the command line value takes precedence
+
 else:
     config = vars(args)
     config["name"] = args.algorithm.lower()
@@ -113,8 +130,13 @@
 if args.histograms is not None:
     histograms = {x : { "normalize" : True} for x in args.histograms.split(",")}
-else:
+elif isinstance(config["histograms"], str):
+    histograms = {x : { "normalize" : True} for x in config["histograms"].split(",")}
+elif isinstance(config["histograms"], dict):
+    histograms = config["histograms"]
+else:
+    logger.exception("[train.py] The `histograms` argument should either be a csv list of histogram names (str) or a dictionary (if provided through a json config).")
+    raise RuntimeError()
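+
+# Example json config for `--algorithm` (illustrative values; the histogram
+# name is a placeholder):
+#   {
+#       "name" : "autoencoder",
+#       "autoencoder_mode" : "simultaneous",
+#       "config" : { "n_epochs" : 200, "batch_size" : 64 },
+#       "histograms" : { "path/to/histogram" : { "normalize" : true } }
+#   }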
 
 # Load data
 algorithm.load_data(
diff --git a/scripts/web/README.md b/scripts/web/README.md
new file mode 100644
index 0000000..fcca79a
--- /dev/null
+++ b/scripts/web/README.md
@@ -0,0 +1 @@
+The pretty web browser created from `index.php` is taken from Nick Amin's repo [niceplots](https://github.com/aminnj/niceplots).
diff --git a/scripts/web/index.php b/scripts/web/index.php
new file mode 100644
index 0000000..d5109e4
--- /dev/null
+++ b/scripts/web/index.php
@@ -0,0 +1,642 @@
+[... opening HTML boilerplate did not survive extraction; the page title is set to the current directory name via: ...]
+<?php
+$cwd = explode("/",getcwd());
+$folder = array_pop($cwd);
+echo $folder;
+?>
+[... css/js includes did not survive extraction ...]
+<?php
+// build a nested file/directory listing for the sidebar tree
+function fillArrayWithFileNodes( DirectoryIterator $dir, $theParent = "#" )
+{
+    global $data;
+    foreach ( $dir as $node )
+    {
+        if( strpos($node->getFilename(), '.php') !== false ) continue;
+        if( $node->isDot() ) continue;
+        if ( $node->isDir()) fillArrayWithFileNodes( new DirectoryIterator( $node->getPathname() ), $node->getPathname() );
+
+        $tmp = array(
+            "id" => $node->getPathname(),
+            "parent" => $theParent,
+            "text" => $node->getFilename(),
+        );
+        if ($node->isFile()) $tmp["icon"] = "file"; // can be path to icon file
+        $data[] = $tmp;
+    }
+}
+fillArrayWithFileNodes( new DirectoryIterator( '.' ) );
+
+// get all files in flat list
+$iter = new RecursiveIteratorIterator(
+    new RecursiveDirectoryIterator('.', RecursiveDirectoryIterator::SKIP_DOTS),
+    RecursiveIteratorIterator::SELF_FIRST,
+    RecursiveIteratorIterator::CATCH_GET_CHILD
+);
+$paths = array('.');
+foreach ($iter as $path => $dir) $paths[] = $path;
+
+// get number of directories
+$num_directories = 0;
+foreach ( (new DirectoryIterator('.')) as $node ) {
+    if( $node->isDot() ) continue;
+    if ( $node->isDir()) $num_directories += 1;
+}
+?>
+[... remaining markup of index.php did not survive extraction: HTML/JS for the jstree sidebar, search/filter box, image grid, an optional per-directory "Description" text box, and a keybindings help overlay (g/G scroll to top/bottom, / focus the search box, y copy the filter state as a URL, s/S sort A-Z/Z-A, b toggle super-saturation mode, m toggle dark mode, x toggle image visibility) ...]