diff --git a/analysis/mc_validation/README.md b/analysis/mc_validation/README.md
index e31ffc4d6..687167322 100644
--- a/analysis/mc_validation/README.md
+++ b/analysis/mc_validation/README.md
@@ -16,5 +16,17 @@ This directory contains scripts from the validation studies of the FullR2 privat
 - Should be run on the output of the topeft processor
 - Was used during the June 2022 MC validation studies (for TOP-22-006 pre approval checks)
+:memo: The scripts below were updated in August 2025
+* `gen_processor.py`:
+  - An updated processor that produces gen-level histograms for comparing various samples
+* `gen_hist_eventweights_processor.py`:
+  - Produces distributions of the MG weights (if they are saved in the samples)
+* `gen_hist_eventweights_plotter.py`:
+  - Produces plots of the MG weights
+  - Should be run on the output of the `gen_hist_eventweights_processor.py` processor
+
+* `comp_norm.py`:
+  - Plots the same variables from two pkl files to compare shapes and normalizations
+  - Should be run on the output of the `gen_processor.py` processor
diff --git a/analysis/mc_validation/comp_norm.py b/analysis/mc_validation/comp_norm.py
new file mode 100644
index 000000000..4a76b15bb
--- /dev/null
+++ b/analysis/mc_validation/comp_norm.py
@@ -0,0 +1,374 @@
+'''
+This script produces plots comparing two input pkl files made with `gen_processor.py`.
+The minimum set of inputs is the path to each pkl file and the json for one of them.
+If the json file contains the starting points ("StPt"), a green curve reweighted to those values is drawn;
+otherwise all WCs are set to 1 for the green curve.
+You can get the official starting points from the GridpackGeneration repo,
+e.g. Run 3 dim6top: https://github.com/TopEFT/GridpackGeneration/blob/dim6top_run3/mcgeneration/scanfiles/startpts_scale_by_1p1_Run3_dim6top.json
+
+Variables:
+You can specify a single variable with `--var`,
+otherwise the script will loop over _all_ variables listed at the bottom.
+
+Comparing two private EFT files:
+By default the script expects the first pkl file to be an EFT file and the second to be an SM central file.
+If you pass `--private`, the _second_ pkl file is also treated as a private sample.
+The special flag `--skip` tells the script _not_ to draw a reweighted histogram for the second pkl file.
+This is useful when comparing Run 3 to Run 2 (at the SM) since some of the WC names have changed.
+If you pass `--central`, _both_ pkl files are assumed to be central samples.
+By default, any WCs missing from the starting point are set to `100`, since this is what we do when making the gridpacks.
+Use `--zero` to keep these missing WCs fixed at `0`.
+
+The flag `--abs` skips normalizing the ratios.
+
+
+Labels:
+You can modify the labels with `--str1` and/or `--str2`.
+
+Density and flow bins:
+The flag `--density` normalizes the main plots to unity and draws them on a log scale.
+This is useful if your normalization is off but you want a larger plot of the shapes.
+The flag `--flow` draws the under- and overflow bins.
+
+Lumi:
+The script will try to infer the luminosity of each file based on the file names.
+If the names don't contain `UL1x` or `202x` it will _not_ rescale anything.
+If it detects _different_ lumis, it will scale to the larger one.
+Use `--no-lumi` to skip this check/scaling.
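+
+For reference, a minimal sketch of the intended rescaling (it mirrors the logic in the `plot()`
+function below; the file names here are made-up examples):
+
+    year1 = extract_year("histos/UL17_ttH.pkl.gz")   # -> "2017"
+    year2 = extract_year("histos/2022_ttH.pkl.gz")   # -> "2022"
+    lumi1 = 1000.0 * get_tc_param(f"lumi_{year1}")   # lumi in pb^-1
+    lumi2 = 1000.0 * get_tc_param(f"lumi_{year2}")
+    if lumi2 > lumi1:
+        hists1[var] *= lumi2 / lumi1                 # scale the smaller-lumi file up to the larger one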
+
+
+Example:
+python comp_norm.py histos/2022_tllq_NewStPt4_zero.pkl.gz histos/2022_tllq_fixed0p1.pkl.gz ../../input_samples/sample_jsons/signal_samples/private_UL/2022_tllq_fixed0p1_nanoGEN.json --var t_pt --private --skip --str2 "nanoGEN gen-level fixed +/-1"
+'''
+import os
+import re
+import pickle
+import gzip
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import argparse
+import json
+import math
+import mplhep as hep
+from topcoffea.modules.get_param_from_jsons import GetParam
+from topcoffea.modules.paths import topcoffea_path
+get_tc_param = GetParam(topcoffea_path("params/params.json"))
+
+#Load hists from the pkl files created by gen_processor.py
+hists1={}
+hists2={}
+
+parser = argparse.ArgumentParser(description='Compare two pkl files produced by gen_processor.py')
+parser.add_argument('fin1'     , default='analysis/topEFT/histos/mar03_central17_pdf_np.pkl.gz' , help = 'First pkl file (EFT by default)')
+parser.add_argument('fin2'     , default='analysis/topEFT/histos/mar03_central17_pdf_np.pkl.gz' , help = 'Second pkl file (SM central by default)')
+parser.add_argument('--str1'   , default='nanoGEN gen-level EFT'     , help = 'Label for the first file')
+parser.add_argument('--str2'   , default='nanoAOD gen-level central' , help = 'Label for the second file')
+parser.add_argument('--density', action='store_true' , help = 'Normalize the main plots to unity')
+parser.add_argument('--flow'   , action='store_true' , help = 'Draw the under/overflow bins')
+parser.add_argument('--var'    , default=None        , help = 'Variable to plot')
+parser.add_argument('--private', action='store_true' , help = 'Use private key for second hist')
+parser.add_argument('--central', action='store_true' , help = 'Use central key for first hist')
+parser.add_argument('--abs'    , action='store_true' , help = 'Use absolute scale for ratio')
+parser.add_argument('--skip'   , action='store_true' , help = 'Skip plotting EFT points')
+parser.add_argument('--zero'   , action='store_true' , help = 'Set missing WCs to 0')
+parser.add_argument('json'     , default='', help = 'Json file(s) containing files and metadata')
+parser.add_argument('--small'  , action='store_true', help = 'Remove all |WCs| >100')
+parser.add_argument('--no-lumi', action='store_true', help = 'Don\'t rescale the lumi')
+parser.add_argument('--info'   , action='store_true', help = 'Print summary info')
+parser.add_argument('--start'  , action='store_true', help = 'Use starting point')
+args = parser.parse_args()
+fin1 = args.fin1
+fin2 = args.fin2
+
+assert not args.private or not args.central, 'Please use either `--private` OR `--central`, not both'
+
+with gzip.open(fin1) as fin1:
+    hin = pickle.load(fin1)
+    for k in hin.keys():
+        if k in hists1: hists1[k]+=hin[k]
+        else: hists1[k]=hin[k]
+with gzip.open(fin2) as fin2:
+    hin = pickle.load(fin2)
+    for k in hin.keys():
+        if k in hists2: hists2[k]+=hin[k]
+        else: hists2[k]=hin[k]
+
+'''
+h = hists['met']         #grab the MET plot, others are available
+h = hists['njets']       #grab the njets plot, others are available
+h = hists['njetsnbtags'] #grab the njets,nbtags plot, others are available
+ch3l = ['eemSSonZ', 'eemSSoffZ', 'mmeSSonZ', 'mmeSSoffZ','eeeSSonZ', 'eeeSSoffZ', 'mmmSSonZ', 'mmmSSoffZ'] #define 3l channel
+fig, ax = plt.subplots(1, 1, figsize=(7,7)) #1 panel plot, 700x700
+hist.plot1d(h.integrate('channel', ch3l).sum('sample'), overlay='cut', stack=False) #create a 1D plot
+plt.show() #took me longer than I'd like to admit to get Python to draw the canvas!
+ +#last = np.array(list(h.values().values()))[np.arange(65).reshape(1,13,5)[0][12][4]] + +ch2lss = ['eeSSonZ', 'eeSSoffZ', 'mmSSonZ', 'mmSSoffZ', 'emSS'] +h = hists['njets'].integrate('sample', 'ttHnobb').integrate('channel', ch2lss).integrate('cut', '4j2b') +ax = hist.Bin("njets", "Jet multiplicity ", [4,5,6,7]) +print(h.values()[()]) +h.rebin('njets', ax) +print(h.values()[()]) +''' +#fname = '/afs/crc.nd.edu/user/{user[0]}/{user}/new_topcoffea/topeft/ttHJet_UL17_R1B14_NAOD-00000_10194_NDSkim.root' +#events = NanoEventsFactory.from_root(fname, schemaclass=NanoAODSchema).events() +#print(events) +var = 'l0pt' +var = 'lhe_photon_pt' +var = 'photon_pt' +var = args.var + +density=True +density=False +density=args.density +flow='none' +flow='show' +flow = 'show' if args.flow else 'none' +printout = True + +def plot(var=None, fin1=None, fin2=None, flow=None, private=False, hists1=None, hists2=None, hists3=None): + chan = '2lss_3j' + appl = 'isSR_2lSS' + chan = 'incl_sfos30_dral0p4_0j' + chan = 'incl_draj0p5_dral0p4_0j' + chan = 'incl_2los_ph_0j' + chan = 'incl_2los_ph_dral0p4_0j' + chan = 'incl_2los_ph_sfos30_dral0p4_0j' + chan = 'incl_draj_dral0p4_0j' + chan = '3l_1j' + appl = 'isAR_3l' + chan = '2los_ph_1j' + appl = 'isSR_2lOS' + chan = '2lss_2j' + appl = 'isSR_2lSS' + chan = 'incl_0j' + appl = 'isAR_incl' + + hep.style.use("CMS") + + fig, (ax, rax) = plt.subplots( + nrows=2, + ncols=1, + figsize=(12,12), + gridspec_kw={"height_ratios": (3, 1)}, + sharex=True + ) + fig.subplots_adjust(hspace=.07) + + if var == 'njets': + chan = 'incl' + + proc2 = 'private' + proc = 'TLLQ_2022_nanoGEN' + proc = 'THQ_2022_nanoGEN' + proc = 'private' + proc2 = 'private' + proc = '' + proc2 = '' + str2 = args.str2 + if args.private: + str2 = str2.replace('central', 'TOP-22-006') + if 'fixed' in args.fin2: + str2 = 'nanoGEN gen-level fixed +/-1' + + lumi = 1000*41.48 + + # Infer lumi from file name + def extract_year(s): + match = re.search(r'(?:UL(\d{2})|(\d{4}))(?:[A-Za-z]+)?', s) + if match: + if match.group(1): # Matched ULxx + return "20" + match.group(1) # Convert UL17 -> 2017 + if match.group(0): # Matched ULxx + return match.group(0) + else: # Matched yyyy + return match.group(2) + return None + lumi1, lumi2 = 0, 0 + year1 = extract_year(args.fin1) + year2 = extract_year(args.fin2) + if year1 is not None: + lumi1 = 1000.0*get_tc_param(f"lumi_{year1}") + if year2 is not None: + lumi2 = 1000.0*get_tc_param(f"lumi_{year2}") + if lumi1 > 0 and lumi2 > 0 and not args.no_lumi: + if lumi1 > lumi2: + print(f'Scaling {args.fin2} from {round(lumi2/1000)} pb^-1 to {round(lumi1/1000)} pb^-1') + hists2[var] *= lumi1/lumi2 + elif lumi2 > lumi1: + print(f'Scaling {args.fin1} from {round(lumi1/1000)} pb^-1 to {round(lumi2/1000)} pb^-1') + hists1[var] *= lumi2/lumi1 + elif not args.density: + print(f'\n\nWARNING: Could not infer the luminosity.\n Make sure {args.fin1} and {args.fin2} are normalized the same!\n\n') + + if args.private: sm = hists2[var][{'process': [s for s in hists2[var].axes['process'] if proc2 in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}) + else: sm = hists2[var][{'process': [s for s in hists2[var].axes['process'] if 'central' in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}) + err = np.sqrt(sm.variances()[()])/np.sum(sm.values()[()]) + err = np.sqrt(sm.variances()[()])#/np.sum(sm.values()[()]) + err = np.sqrt(sm.variances(flow=(flow=='show'))[()])#/np.sum(sm.values()[()]) + if flow=='show': + err = err[1:] + if 
not args.private: hists2[var][{'process': [s for s in hists2[var].axes['process'] if 'central' in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).plot1d(yerr=err, label=str2 + ' SM', ax=ax, density=density, flow=flow) + else: hists2[var][{'process': [s for s in hists2[var].axes['process'] if proc2 in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).plot1d(yerr=err, label=str2 + ' @ SM', ax=ax, density=density, flow=flow) + + global printout + if printout and var=='dral': + printout = False + print('hist1 / lumi', np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)) / lumi) + print('hist1 / lumi / xsec / k', np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)) / 2.2471 / lumi) + print('hist1 / lumi / 3.75', np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)) / 3.75 / lumi) + print('hist1 / lumi / xsec', np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)) / 1.513 / lumi) + print('hist2 / lumi / xsec / k', np.sum(sm.values(flow=True)) / 2.2471 / lumi) + print('hist2 / lumi / xsec', np.sum(sm.values(flow=True)) / 1.513 / lumi) + print('Removing dral<0.1: hist1 / (hist>0.1) / xsec', np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)) / np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)[10:]) / 2.2471) + print('Removing dral<0.1: hist1 / (hist>0.1)', np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)) / np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)[10:])) + print('Removing dral<0.1: (hist>0.1)', np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)[10:]) / lumi) + print('Removing dral<0.1: (hist2>0.1)/(hist1>0.1)', np.sum(hists2[var][{'process': [s for s in hists2[var].axes['process'] if 'central' in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)[10:]) / np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)[10:])) + print('Removing dral<0.1: (hist2>0.1)', np.sum(hists2[var][{'process': [s for s in hists2[var].axes['process'] if 'central' in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)[10:]) / lumi) + print('Removing 
dral<0.4: (hist>0.4)', np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)[40:]) / lumi) + if printout and var=='photon_pt':# and args.private: + if args.private: c = hists2[var][{'process': [s for s in hists2[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True) / hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True) + else: c = hists2[var][{'process': [s for s in hists2[var].axes['process'] if 'central' in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True) / hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True) + print('photon pT corrections (shape only)', c / (np.sum(hists2[var][{'process': [s for s in hists2[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)) / np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)))) + if printout and var=='dral': + print('Removing lj0pt<10: hist1 / (hist>10) / xsec', np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)) / np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)[10:]) / 2.2471) + if printout: + print(f'{var} int(hist1)=', np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values()), 'int(hist2)=', np.sum(sm.values())) + s = np.sum(sm.values(flow=True)) / np.sum(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).values(flow=True)) + print(f'Ratio {s}') + eft = hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}] + err = np.sqrt(eft.as_hist({}).variances()[()])#/np.sum(eft.values()[()]) + + max_lumi = max(lumi1, lumi2) + if args.info: + tab = ' ' + print('\n\nSummary information:') + if args.no_lumi: + print(f'{tab}{args.fin1} with lumi removed {np.round(np.sum(eft.eval({})[()]) / lumi1, 3)}') + else: + print(f'{tab}{args.fin1} with lumi removed {np.round(np.sum(eft.eval({})[()]) / max_lumi, 3)}') + if args.no_lumi: + print(f'{tab}{args.fin2} with lumi removed {np.round(np.sum(sm.values(flow=True)[()]) / lumi2, 3)}') + else: + print(f'{tab}{args.fin2} with lumi removed {np.round(np.sum(sm.values(flow=True)[()]) / max_lumi, 3)}') + print('\n\n') + + hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}).plot1d(yerr=False, label=args.str1 + ' @ SM', ax=ax, density=density, 
flow=flow) + + st_pt = None + + if args.json != '': + with open(args.json) as fin: + print(f'Loading {args.json}') + j = json.load(fin) + wc = j['WCnames'] + if 'StPt' in j: + st_pt = j['StPt'] + for wc in hists1[var].wc_names: + if st_pt is not None and wc not in st_pt and not args.zero: + st_pt[wc] = 100. + + if not args.start: + val = [1.0] * len(wc) + + lab = 'st pt.' + if st_pt is None: + lab = 'pt.' + wcs = hists1[var].wc_names + val = [1.0] * len(wcs) + st_pt = dict(zip(wcs, val)) + if args.small: + lab = 'non-SM pt.' + st_pt = {wc:(val*.1 if abs(val) < 100 else 0) for wc,val in st_pt.items()} + + if not args.central: print(f'Using {st_pt=}') + if not args.central and not args.skip: eft_err = np.sqrt(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist(st_pt).variances(flow=(flow=='show'))) + eft_err = False + #if flow=='show': eft_err = eft_err[1:] # FIXME fix overflow, some vars need the extra bin + if not args.central and not args.skip: hep.histplot(hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist(st_pt), label=f'{args.str1} {lab}', ax=ax, density=density, flow=flow, ls='--', yerr=eft_err) + if args.private and not args.skip and not args.central: #FIXME + hep.histplot(hists2[var][{'process': [s for s in hists2[var].axes['process'] if proc2 in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist(st_pt), label=f'{str2} {lab}', ax=ax, density=density, flow=flow, yerr=False, ls='-.') + yerr = hists2[var][{'process': sum, 'channel': chan, 'systematic': 'nominal', 'appl': appl}].as_hist({}).values()[()] + + err = np.sqrt(sm.variances(flow=(flow=='show'))[()])/sm.values(flow=(flow=='show'))[()] + if flow=='show': err = err[1:] + eft = hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist({}) + plt.sca(rax) + (sm/sm).plot1d(yerr=err, ax=rax, flow=flow) + norm = np.sum(sm.values()) / np.sum(eft.values()) + if args.abs: norm = 1 + (eft/sm * norm).plot1d(yerr=False, ax=rax, flow=flow) + + ax2 = fig.add_axes([0.7, 0.55, 0.15, 0.15]) + eb1 = ax2.errorbar([1], 1, xerr=0.05, yerr=np.sqrt(np.sum(sm.values(flow=True)))/np.sum(sm.values(flow=True))) + eft_sm_norm = np.sum(eft.values(flow=True)[()]) #/ sm_scale + eb2 = ax2.errorbar([1], eft_sm_norm / np.sum(sm.values(flow=True)), xerr=0.05) + plt.gca().set_xticks([]) + + if not args.central and not args.skip: eft = hists1[var][{'process': [s for s in hists1[var].axes['process'] if proc in s], 'channel': chan, 'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist(st_pt) + eft_err = np.sqrt(eft.variances(flow=(flow=='show')))/np.sum(eft.values(flow=(flow=='show'))) + if flow=='show': eft_err = eft_err[1:] + norm = np.sum(sm.values()) / np.sum(eft.values()) + if args.abs: norm = 1 + if not args.central and not args.skip: (eft/sm * norm).plot1d(yerr=eft_err, ax=rax, flow=flow, ls='-.') + + eft_start_norm = np.sum(eft.values(flow=True)[()]) #/ sm_scale + if args.abs: norm = 1 + if 'fixed' in args.fin2: + eb3 = ax2.errorbar([1], eft_start_norm / np.sum(sm.values(flow=True)), xerr=0.05, linestyle='--') + + if args.private and wc and not args.skip and not args.central: #FIXME + eft = hists2[var][{'process': [s for s in hists2[var].axes['process'] if proc2 in s], 'channel': chan, 
'systematic': 'nominal', 'appl': appl}][{'process': sum}].as_hist(st_pt) + norm = np.sum(sm.values()) / np.sum(eft.values()) + if args.abs: norm = 1 + (eft/sm * norm).plot1d(yerr=False, ax=rax, flow=flow, ls='--') + + if density: + ax.set_yscale('log') + ax.set_ylim(1e-4, 5e-2) + rax.set_ylim(0.5, 1.5) + ax.set_xlabel('') + ax.set_ylabel('Yield') + var_label = var + if '_pt' in var_label and 'lhe' in var_label: + var_label = var_label.split('_')[1] + elif '_pt' in var_label: + var_label = var_label.split('_')[0] + else: + var_label = var_label.split('pt')[0] + if args.abs: rax.set_ylabel(r'$\frac{dN_{\rm{EFT}}}{d p_{\rm{T}}} / \frac{dN_{\rm{ref}}}{d p_{\rm{T}}}$') + else: rax.set_ylabel(r'$(\frac{1}{N_{\rm{EFT}}} \frac{dN_{\rm{EFT}}}{d p_{\rm{T}}}) / (\frac{1}{N_{\rm{ref}}} \frac{dN_{\rm{ref}}}{d p_{\rm{T}}})$') + plt.sca(ax) + plt.legend() + plt.show() + user = os.getlogin() + com = '13' if 'UL' in args.fin1 else '13.6' + com = '13.6 vs 13' if 'UL' in args.fin2 and 'UL' not in args.fin1 else com + n_dec = 3 - math.ceil(np.log10(max_lumi/1000.)) + hep.cms.label(lumi=np.round(max_lumi/1000., n_dec), com=com) + plt.savefig(f'/afs/crc.nd.edu/user/{user[0]}/{user}/www/comp_{var}.png') + plt.savefig(f'/afs/crc.nd.edu/user/{user[0]}/{user}/www/comp_{var}.pdf') + plt.close() + +if __name__ == '__main__': + if args.var is None: + plot('photon_pt', fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('lhe_t_pt' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('t_pt' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('l0pt' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('j0pt' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('bj0pt' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('lj0pt' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('dral' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('draj' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('mll' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('invm' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('invm_ttX' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('invm_tX' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('invm_4t' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + plot('njets' , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) + else: + plot(var , fin1=fin1, fin2=fin2, flow=flow, private=args.private, hists1=hists1, hists2=hists2) diff --git a/analysis/mc_validation/djr.py b/analysis/mc_validation/djr.py new file mode 100644 index 000000000..b4a9a118e --- /dev/null +++ b/analysis/mc_validation/djr.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +''' +This script produces the DJR plots from nanoGEN files +(assuming the DJR were also saved) +Example Run +python djr.py \ +--input /cms/cephfs/data/store/user/byates/tttt/nanoGEN_Run3/2022/tttt_LO_EFT/crab_tttt_nanoGEN_Run3/250715_223705/0000/ \ +--output /users/byates2/afs/www/EFT/tttt_Run3/weights/weights.pdf +''' + +import uproot +import os 
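+# Interpretation note: the differential jet rates (DJR i -> i+1) are the clustering scales at
+# which an event goes from i to i+1 jets. In an MLM-matched sample the curves for the different
+# parton multiplicities should hand over smoothly around the matching scale (qCut); a kink or
+# discontinuity, as in the "bad example" referenced below, usually indicates a poorly chosen
+# qCut/xqcut for the sample.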
+import hist +import awkward as ak +import numpy as np +np.seterr(invalid='ignore') +import matplotlib.pyplot as plt +import mplhep as hep +import warnings +plt.style.use(hep.style.CMS) + +if __name__ == '__main__': + ''' + good example: + root://cmseos.fnal.gov//store/user/dspitzba/EFT/qcut30.root + + bad example: + root://cmseos.fnal.gov//store/user/cmsdas/2023/short_exercises/Generators/wjets_2j/w2jets_qcut10.root + ''' + + import argparse + + argParser = argparse.ArgumentParser(description = "Argument parser") + argParser.add_argument('--input', action='store', default='root://cmseos.fnal.gov//store/user/dspitzba/EFT/qcut80.root', help="Input file") + argParser.add_argument('--output', action='store', default='./djr.pdf', help="Output file") + argParser.add_argument('--nevents', action='store', default=50e3, help="Number of events generated") + args = argParser.parse_args() + + n_events_in = int(args.nevents) + + djr_axis = hist.axis.Regular(40, -0.5, 3.5, name="djr", label=r"$\Delta JR$") + parton_axis = hist.axis.Integer(0, 4, name="n", label="Number of partons") + #parton_axis = hist.axis.Regular(5, 0, 4, name="n", label="Number of partons") + transition_axis = hist.axis.Integer(0, 6, name="t", label="DJR X->Y") + djr = hist.Hist(djr_axis, parton_axis, transition_axis) + + files = [f for f in os.listdir(args.input) if '.root' in f and 'COPYING' not in f] + #n_events_in *= len(files) + tot = 0 + for fin in files: + #print(f"Loading input file {args.input}/{fin}") + fin = uproot.open(args.input+fin) + events = fin["Events"] + #ar = events["GenEventInfoProduct_generator__GEN./GenEventInfoProduct_generator__GEN.obj"].arrays() + #djr_values = ar['GenEventInfoProduct_generator__GEN.obj']['DJRValues_'] + #nMEPartons = ar['GenEventInfoProduct_generator__GEN.obj']['nMEPartons_'] + djr_values10 = ak.Array(events['LHEWeight_DJR10'].array()) + djr_values21 = ak.Array(events['LHEWeight_DJR21'].array()) + djr_values32 = ak.Array(events['LHEWeight_DJR32'].array()) + #djr_values = ak.concatenate([events['LHEWeight_DJR10'].array(), events['LHEWeight_DJR21'].array(), events['LHEWeight_DJR32'].array()]) + #nMEPartons = events['LHEWeight_nMEPartons'].array() + nMEPartons = events['Generator_nMEPartons'].array() + + djr.fill( + djr = np.log10(ak.flatten(djr_values10, axis=0)), + n = ak.values_astype(ak.flatten(ak.ones_like(djr_values10)*nMEPartons, axis=0), np.int32), + #n = ak.flatten(ak.ones_like(djr_values10)*nMEPartons, axis=0), + t = ak.flatten(ak.local_index(djr_values10), axis=0), + ) + djr.fill( + djr = np.log10(ak.flatten(djr_values21, axis=0)), + n = ak.values_astype(ak.flatten(ak.ones_like(djr_values10)*nMEPartons, axis=0), np.int32), + #n = ak.flatten(ak.ones_like(djr_values21)*nMEPartons, axis=0), + t = ak.flatten(ak.local_index(djr_values21), axis=0), + ) + djr.fill( + djr = np.log10(ak.flatten(djr_values32, axis=0)), + n = ak.values_astype(ak.flatten(ak.ones_like(djr_values10)*nMEPartons, axis=0), np.int32), + #n = ak.flatten(ak.ones_like(djr_values32)*nMEPartons, axis=0), + t = ak.flatten(ak.local_index(djr_values32), axis=0), + ) + n_events = ak.num(djr_values10, axis=0) + print(f"Efficiency is {n_events/n_events_in}, assuming {n_events_in} where simulated") + + print("Plotting...") + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + fig, axs = plt.subplots(3,2, figsize=(15,21)) + + for i in range(3): + for j in range(2): + transition = 2*i+j + djr[:, :, transition].plot1d( + overlay='n', + ax=axs[i][j], + label= [f'{k} partons' for k in range(4)] + ) + djr[:, :, 
transition][{'n':sum}].plot1d( + ax=axs[i][j], + label = ['total'], + color = 'gray', + ) + + axs[i][j].set_xlabel(r'$DJR\ %s \to %s$'%(transition, transition+1)) + axs[i][j].set_yscale('log') + axs[i][j].set_ylim(0.3,n_events*1000) + axs[i][j].legend( + loc='upper right', + bbox_to_anchor=(0.03, 0.88, 0.90, .11), + mode="expand", + ncol=2, + ) + + fig.savefig(args.output) + print(f"Figure saved in {args.output}") diff --git a/analysis/mc_validation/eft_weights.py b/analysis/mc_validation/eft_weights.py new file mode 100644 index 000000000..73a547d65 --- /dev/null +++ b/analysis/mc_validation/eft_weights.py @@ -0,0 +1,135 @@ +''' +This script produces the distributions of weights from nanoGEN files +(assuming the MG weights were also saved) +Example Run +python eft_weights.py \ +--input /cms/cephfs/data/store/user/byates/tttt/nanoGEN_Run3/2022/tttt_LO_EFT/crab_tttt_nanoGEN_Run3/250715_223705/0000/ \ +--output /users/byates2/afs/www/EFT/tttt_Run3/weights/weights.pdf +''' + +#!/usr/bin/env python3 + +import hist +import os +import awkward as ak +from coffea.nanoevents import NanoEventsFactory, NanoAODSchema +import numpy as np +import matplotlib.pyplot as plt +import mplhep as hep +import topcoffea.modules.utils as utils +plt.style.use(hep.style.CMS) + +NanoAODSchema.warn_missing_crossrefs = False + +if __name__ == '__main__': + + import argparse + # 'EFTrwgt66_ctW_0.0_ctq1_0.0_cQq81_0.0_ctZ_0.0_cQq83_0.0_ctG_0.0_ctq8_0.0_cQq13_0.0_cQq11_0.0' + + argParser = argparse.ArgumentParser(description = "Argument parser") + argParser.add_argument('--input', action='store', default='root://cmseos.fnal.gov//store/user/dspitzba/EFT/nanogen_small.root', help="Input file") + argParser.add_argument('--output', action='store', default='./weights.pdf', help="Output file") + args = argParser.parse_args() + + path = args.input + #files = ['nanogen_12.root', 'nanogen_49.root', 'nanogen_51.root', 'nanogen_54.root', 'nanogen_5.root', 'nanogen_82.root', 'nanogen_13.root', 'nanogen_4.root', 'nanogen_52.root', 'nanogen_55.root', 'nanogen_63.root', 'nanogen_9.root', 'nanogen_2.root', 'nanogen_50.root', 'nanogen_53.root', 'nanogen_56.root', 'nanogen_7.root'] + files = [f for f in os.listdir(path) if '.root' in f] + #files = [files[0]] # Only process a single file + #files = [files[x] for x in range(10)] # Only process the first 10 files + #files = ['nanogen_123_220.root'] + + weight_axis = hist.axis.Regular(22, -20, 2, name="weight_ax", label="weight", underflow=True, overflow=True) + #weight_axis = hist.axis.Regular(100, -1, 1e1, name="weight_ax", label="weight", underflow=True, overflow=True) + h_SM = hist.Hist(weight_axis) + events = NanoEventsFactory.from_root( + #args.input, + path+files[0], + schemaclass=NanoAODSchema, + ).events() + + w = events.LHEWeight + eft_weight_names = [ x for x in w.fields if x.startswith('EFTrwgt') ] + #h_ttG = hist.Hist(weight_axis) + h_ttG = [] + eft_coeffs = ak.to_numpy(events["EFTfitCoefficients"]) if hasattr(events, "EFTfitCoefficients") else None + #h_ttG_rwgt = HistEFT(weight_axis, wc_names=['ctZ', 'cpt', 'cpQM', 'cpQ3', 'ctW', 'ctp', 'ctG'], label=r"Events") + for weight in eft_weight_names: + h_ttG.append(hist.Hist(weight_axis)) + + print('[', end='') + for fin in files: + events = NanoEventsFactory.from_root( + #args.input, + path+fin, + schemaclass=NanoAODSchema, + ).events() + + w = events.LHEWeight + eft_weight_names = [ x for x in w.fields if x.startswith('EFTrwgt') ] + if not eft_weight_names: + eft_weight_names = utils.get_list_of_wc_names(path+fin) + eft_weight_names 
= ['_0.0_'.join(eft_weight_names)] + print(fin, eft_weight_names) + + sm_wgt = getattr(events.LHEWeight, eft_weight_names[-1]) + print('.', end='') + #print(f'Smallest non-zero SM weight: {np.min(sm_wgt[sm_wgt != 0.0])}') + h_SM.fill(weight_ax=np.log10(getattr(events.LHEWeight, eft_weight_names[-1]))) + #h_SM.fill(weight_ax=getattr(events.LHEWeight, eft_weight_names[-1])) + + #h_ttG = [(w,hist.Hist(weight_axis)) for w in ttG] + #h_ctg1 = hist.Hist(weight_axis) + #h_ctg1.fill(weight_ax=getattr(events.LHEWeight, 'EFTrwgt66_ctW_0.0_ctq1_0.0_cQq81_0.0_ctZ_0.0_cQq83_0.0_ctG_0.0_ctq8_0.0_cQq13_0.0_cQq11_0.0')) + + # EFTrwgt10_ctGRe_2.0_ctGIm_0.0_ctWRe_0.0_ctWIm_0.0_ctBRe_0.0_ctBIm_0.0_cHtbRe_0.0_cHtbIm_0.0_cHt_0.0 + #h_ctg2 = hist.Hist(weight_axis) + #h_ctg2.fill(weight_ax=getattr(events.LHEWeight, 'EFTrwgt0_ctW_-1.722436_ctq1_1.171197_cQq81_1.34397_ctZ_-6.408086_cQq83_1.555205_ctG_0.2893_ctq8_-0.625025_cQq13_-1.305265_cQq11_1.762244')) + #for (weight,h) in h_ttG: + # h.fill(weight_ax=getattr(events.LHEWeight, weight)) + + + #h_ctg1.plot1d(ax=ax, label=r'$C_{tG}=1$') + #h_ctg2.plot1d(ax=ax, label=r'$C_{tG}=2$') + #for i,(_,h) in enumerate(h_ttG): + # h.plot1d(ax=ax, label=f'wgt_{i}') + + for nw,weight in enumerate(eft_weight_names[:-1]): + h_ttG[nw].fill(weight_ax=np.log10(getattr(events.LHEWeight, weight))) + #h_ttG[nw].fill(weight_ax=getattr(events.LHEWeight, weight)) + #wgts = getattr(events.LHEWeight, 'EFTrwgt_ctZ_0.0_cpt_0.0_cpQM_0.0_cpQ3_0.0_ctW_0.0_ctp_0.0_ctG_0.0') + #h_ttG_rwgt.fill(weight_ax=wgts, eft_coeff=eft_coeffs) + print(']') + + for iweight,weight in enumerate(eft_weight_names): + fig, ax = plt.subplots() + + hep.histplot(h_SM, ax=ax, label=r'$SM=0$', flow='show', histtype='errorbar', yerr=False) + label = '$tt\gamma$' + label = 'EFT' + hep.histplot(h_ttG[iweight], ax=ax, label=label, flow='show', histtype='errorbar', yerr=False) + # 'EFTrwgt66_ctW_0.0_ctq1_0.0_cQq81_0.0_ctZ_0.0_cQq83_0.0_ctG_0.0_ctq8_0.0_cQq13_0.0_cQq11_0.0' + eft_weight = weight.split('_')[1:] + wcs = eft_weight[:-1:2] + vals = eft_weight[1::2] + vals = [float(v) for v in vals] + eft_pt = dict(zip(wcs,vals)) + #hep.histplot(h_ttG_rwgt.as_hist(eft_pt), ax=ax, label=label+' reweight', flow='show', histtype='errorbar', yerr=False) + ax.set_ylabel(r'# Events') + ax.set_xlabel(r'log(weight)') + + #ax.set_xscale("log") + ax.set_yscale("log") + #ax.set_xlim([1e-2, 1e2]) + #ax.set_xlim([1e-6, 1e2]) + #ax.set_xlim([1e-2, 1e2]) + + plt.legend() + + fig.savefig(args.output) + fig.savefig(args.output.replace('.pdf', '_rwgt{}.pdf'.format(iweight))) + fig.savefig(args.output.replace('.pdf', '_rwgt{}.png'.format(iweight))) + print(f"Figure saved in {args.output.replace('.pdf', '_rwgt{}.pdf'.format(iweight))}") + #fig.savefig(args.output.replace('.pdf', '_{}.pdf'.format(eft_weight_names[iweight]))) + #fig.savefig(args.output.replace('.pdf', '_{}.png'.format(eft_weight_names[iweight]))) + #print(f"Figure saved in {args.output.replace('.pdf', '_rwgt{}.pdf'.format(iweight))}") + plt.close() diff --git a/analysis/mc_validation/gen_hist_eventweights_plotter.py b/analysis/mc_validation/gen_hist_eventweights_plotter.py new file mode 100644 index 000000000..a21e8325d --- /dev/null +++ b/analysis/mc_validation/gen_hist_eventweights_plotter.py @@ -0,0 +1,55 @@ +''' +This script plots weights produced by `gen_hist_eventweights_processor.py` +Example: +python gen_hist_eventweights_plotter.py 2022_tllq_NewStPt4.pkl.gz /users/byates2/afs/www/EFT/tllq_NewStPt4_Run3/weights/weights.pdf +''' +import os +import pickle +import gzip +import 
matplotlib.pyplot as plt +import argparse +from topeft.modules import axes +BINNING = {k: v['variable'] for k,v in axes.info.items() if 'variable' in v} + +#Load hists from pickle file created by TopCoffea +hists={} + +parser = argparse.ArgumentParser(description='You can select which file to run over') +parser.add_argument('fin' , default='analysis/topEFT/histos/mar03_central17_pdf_np.pkl.gz' , help = 'File to run over') +parser.add_argument('output' , default='/users/byates2/afs/www/EFT/tllq_NewStPt4_Run3/weights/' , help = 'Output path') +args = parser.parse_args() +fin = args.fin + +with gzip.open(fin) as fin: + hin = pickle.load(fin) + for k in hin.keys(): + if isinstance(hin[k], dict): + continue + if k in hists: hists[k]+=hin[k] + else: hists[k]=hin[k] + +for h_name in hists: + ls = '-' + if 'coeff' in h_name: continue + if 'efth' in h_name: continue + if 'SM' in h_name and False: + label = 'SM' + elif 'neg' in h_name: + ls = '--' + elif 'abs' in h_name: + ls = '-.' + if 'pt' in h_name: + label = 'EFT' + h_name.split('_')[1] + else: + label = h_name.split('_')[1] + hists[h_name].plot1d(label=label, yerr=False, ls=ls, flow='show') + if 'coeff' in h_name: continue + elif 'efth' in h_name: continue + +plt.legend(ncol=3) +plt.gca().set_yscale('log') +plt.gca().set_xlabel('log(event weights)') +plt.tight_layout() +os.makedirs(f'{args.output}', exist_ok=True) +plt.savefig(f'{args.output}/weights.pdf') +plt.savefig(f'{args.output}/weights.png') diff --git a/analysis/mc_validation/gen_hist_eventweights_processor.py b/analysis/mc_validation/gen_hist_eventweights_processor.py new file mode 100644 index 000000000..c78cbb2bc --- /dev/null +++ b/analysis/mc_validation/gen_hist_eventweights_processor.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +''' +This script produces histograms of the log of the event weights +It assumes the MG weights are saved in the nanoGEN file. + +Example: +python run_gen_hist_eventweights_processor.py ../../input_samples/sample_jsons/signal_samples/private_UL/2022_tllq_NewStPt4_nanoGEN.json -o 2022_tllq_NewStPt4 -x futures -r file:///cms/cephfs/data/ -p gen_hist_eventweights_processor.py +''' +import awkward as ak +import numpy as np +np.seterr(divide='ignore', invalid='ignore', over='ignore') +import hist +from hist import Hist +import topcoffea.modules.eft_helper as efth + +from coffea import processor + +# Get the lumi for the given year +def get_lumi(year): + lumi_dict = { + "2016APV": 19.52, + "2016": 16.81, + "2017": 41.48, + "2018": 59.83 + } + if year not in lumi_dict.keys(): + raise Exception(f"(ERROR: Unknown year \"{year}\".") + else: + return lumi_dict[year] + +# Clean the objects +def is_clean(obj_A, obj_B, drmin=0.4): + objB_near, objB_DR = obj_A.nearest(obj_B, return_metric=True) + mask = ak.fill_none(objB_DR > drmin, True) + return (mask) + +# Create list of wc values in the correct order +def order_wc_values(wcs, ref_pts): + '''Returns list of wc values in the same order as the list of wc names based on a dictionary + ''' + wc_names = wcs + ref_pts = ref_pts + + wc_values = [] + for i in wc_names: + wc_values.append(ref_pts[i]) + + return wc_values + +# Calculate event weights from wc values and eft fit coefficients +def calc_event_weights(eft_coeffs, wc_vals): + '''Returns an array that contains the event weight for each event. 
+ eft_coeffs: Array of eft fit coefficients for each event + wc_vals: wilson coefficient values desired for the event weight calculation, listed in the same order as the wc_lst + such that the multiplication with eft_coeffs is correct + The correct ordering can be achieved with the order_wc_values function + ''' + event_weight = np.empty_like(eft_coeffs) + + wcs = np.hstack((np.ones(1),wc_vals)) + wc_cross_terms = [] + index = 0 + for j in range(len(wcs)): + for k in range (j+1): + term = wcs[j]*wcs[k] + wc_cross_terms.append(term) + event_weight = np.sum(np.multiply(wc_cross_terms, eft_coeffs), axis=1) + + return event_weight + +class AnalysisProcessor(processor.ProcessorABC): + def __init__(self, samples, wc_names_lst=[], hist_lst = None, dtype=np.float32, do_errors=False): + self._samples = samples + self._wc_names_lst = wc_names_lst + + self._dtype = dtype + self._do_errors = do_errors + + print("self._samples", self._samples) + print("self._wc_names_lst", self._wc_names_lst) + + # Create the histograms with new scikit hist + self._low = -12 + self._high = 3 + self._histo_dict = { + "weights_SM_log" : Hist(hist.axis.Regular(bins=120, start=self._low, stop=self._high, name="weights_SM_log", label='log(event weights at the SM)' ), storage="weight"), + "weights_SMneg_log" : Hist(hist.axis.Regular(bins=120, start=self._low, stop=self._high, name="weights_SMneg_log", label='log(event weights at the SMneg)' ), storage="weight"), + "weights_SMabs_log" : Hist(hist.axis.Regular(bins=120, start=self._low, stop=self._high, name="weights_SMabs_log", label='log(event weights at the SMabs)' ), storage="weight"), + "weights_SMefth_log" : Hist(hist.axis.Regular(bins=120, start=self._low, stop=self._high, name="weights_SMefth_log", label='log(event weights at the SMefth)' ), storage="weight"), + "weights_SMcoeff_log" : Hist(hist.axis.Regular(bins=120, start=self._low, stop=self._high, name="weights_SMcoeff_log", label='log(event weights at the SM)' ), storage="weight"), + "weights_pt1_log" : Hist(hist.axis.Regular(bins=120, start=self._low, stop=self._high, name="weights_pt1_log", label='log(event weights at the pt1)'), storage="weight"), + "weights_pt2_log" : Hist(hist.axis.Regular(bins=120, start=self._low, stop=self._high, name="weights_pt2_log", label='log(event weights at the pt2)'), storage="weight"), + "weights_pt3_log" : Hist(hist.axis.Regular(bins=120, start=self._low, stop=self._high, name="weights_ptself._high_log", label='log(event weights at the ptself._high)'), storage="weight"), + "weights_pt4_log" : Hist(hist.axis.Regular(bins=120, start=self._low, stop=self._high, name="weights_pt4_log", label='log(event weights at the pt4)'), storage="weight"), + "weights_pt5_log" : Hist(hist.axis.Regular(bins=120, start=self._low, stop=self._high, name="weights_pt5_log", label='log(event weights at the pt5)'), storage="weight"), + #"events_100" : {}, + #"events" : {}, + } + + # Set the list of hists to to fill + if hist_lst is None: + self._hist_lst = list(self._histo_dict.keys()) + else: + for h in hist_lst: + if h not in self._histo_dict.keys(): + raise Exception(f"Error: Cannot specify hist \"{h}\", it is not defined in self._histo_dict") + self._hist_lst = hist_lst + + print("hist_lst: ", self._hist_lst) + + @property + def columns(self): + return self._columns + + def process(self, events): + + # Dataset parameters + dataset = events.metadata['dataset'] + hist_axis_name = self._samples[dataset]["histAxisName"] + + year = self._samples[dataset]['year'] + xsec = self._samples[dataset]['xsec'] + 
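+        # Worked illustration of the quadratic weight evaluation performed by calc_event_weights()
+        # above (symbols only, no real numbers): for WC values c = (c1, c2) and w = (1, c1, c2),
+        #   weight = sum_{j<=k} s_jk * w_j * w_k
+        #          = s_00 + s_10*c1 + s_11*c1^2 + s_20*c2 + s_21*c1*c2 + s_22*c2^2,
+        # where the s_jk are the per-event entries of EFTfitCoefficients in the same
+        # lower-triangular order. Setting all WCs to 0 therefore returns the SM weight s_00,
+        # which is why eft_coeffs[:,0] is used directly for the SM-coefficient histogram below.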
sow = self._samples[dataset]['nSumOfWeights'] + + # Extract the EFT quadratic coefficients and optionally use them to calculate the coefficients on the w**2 quartic function + # eft_coeffs is never Jagged so convert immediately to numpy for ease of use. + eft_coeffs = ak.to_numpy(events['EFTfitCoefficients']) if hasattr(events, "EFTfitCoefficients") else None + + # else: + SM_pt = {wc: 0.0 for wc in self._wc_names_lst} + #wc_lst_SM = order_wc_values(self._wc_names_lst, SM_pt) + wc_lst_SM = np.zeros(len(self._wc_names_lst)) + event_weights_SM = calc_event_weights(eft_coeffs,wc_lst_SM) + event_weights_SMcoeff = eft_coeffs[:,0] + #wc_lst_SM = np.zeros(len(self._wc_names_lst)) + #event_weights_SM = efth.calc_eft_weights(eft_coeffs,wc_lst_SM) + event_weights_SMefth = efth.calc_eft_weights(eft_coeffs,np.zeros(len(self._wc_names_lst))) + + eft_weight_names = [ x for x in events['LHEWeight'].fields if x.startswith('EFTrwgt') ][:10] + if not eft_weight_names: + wc_lst_pt1 = np.random.uniform(-10, 10, len(self._wc_names_lst)) + #wc_lst_pt1 = order_wc_values(self._wc_names_lst, rwgt1) + event_weights_pt1 = calc_event_weights(eft_coeffs, wc_lst_pt1) + else: + wc_vals = eft_weight_names[0].split('_')[1:] + wc_vals = np.array([float(wc) for wc in wc_vals[1::2]]) + event_weights_pt1 = efth.calc_eft_weights(eft_coeffs,wc_vals) + + #wc_lst_pt2 = order_wc_values(self._wc_names_lst, rwgt2) + if not eft_weight_names: + wc_lst_pt2 = np.random.uniform(-10, 10, len(self._wc_names_lst)) + event_weights_pt2 = calc_event_weights(eft_coeffs, wc_lst_pt2) + else: + wc_vals = eft_weight_names[1].split('_')[1:] + wc_vals = np.array([float(wc) for wc in wc_vals[1::2]]) + event_weights_pt2 = efth.calc_eft_weights(eft_coeffs,wc_vals) + + #wc_lst_pt3 = order_wc_values(self._wc_names_lst, rwgt3) + if not eft_weight_names: + wc_lst_pt3 = np.random.uniform(-10, 10, len(self._wc_names_lst)) + event_weights_pt3 = calc_event_weights(eft_coeffs, wc_lst_pt3) + else: + wc_vals = eft_weight_names[2].split('_')[1:] + wc_vals = np.array([float(wc) for wc in wc_vals[1::2]]) + event_weights_pt3 = efth.calc_eft_weights(eft_coeffs,wc_vals) + + #wc_lst_pt4 = order_wc_values(self._wc_names_lst, rwgt4) + if not eft_weight_names: + wc_lst_pt4 = np.random.uniform(-10, 10, len(self._wc_names_lst)) + event_weights_pt4 = calc_event_weights(eft_coeffs, wc_lst_pt4) + else: + wc_vals = eft_weight_names[3].split('_')[1:] + wc_vals = np.array([float(wc) for wc in wc_vals[1::2]]) + event_weights_pt4 = efth.calc_eft_weights(eft_coeffs,wc_vals) + + #wc_lst_pt5 = order_wc_values(self._wc_names_lst, rwgt5) + if not eft_weight_names: + wc_lst_pt5 = np.random.uniform(-10, 10, len(self._wc_names_lst)) + event_weights_pt5 = calc_event_weights(eft_coeffs, wc_lst_pt5) + else: + wc_vals = eft_weight_names[4].split('_')[1:] + wc_vals = np.array([float(wc) for wc in wc_vals[1::2]]) + event_weights_pt5 = efth.calc_eft_weights(eft_coeffs,wc_vals) + + counts = np.ones_like(event_weights_SM) + + ######## Fill histos ######## + hout = self._histo_dict + variables_to_fill = { + "weights_SM_log" : np.nan_to_num(np.log10(event_weights_SM ), nan=self._low-1), + "weights_SMneg_log" : np.nan_to_num(np.log10(event_weights_SM[event_weights_SM < 0] ), nan=self._low-1), + "weights_SMabs_log" : np.nan_to_num(np.log10(np.abs(event_weights_SM) ), nan=self._low-1), + "weights_SMefth_log" : np.nan_to_num(np.log10(event_weights_SMefth ), nan=self._low-1), + "weights_SMcoeff_log" : np.nan_to_num(np.log10(event_weights_SMcoeff ), nan=self._low-1), + "weights_pt1_log": 
np.nan_to_num(np.log10(event_weights_pt1), nan=self._low-1), + "weights_pt2_log": np.nan_to_num(np.log10(event_weights_pt2), nan=self._low-1), + "weights_pt3_log": np.nan_to_num(np.log10(event_weights_pt3), nan=self._low-1), + "weights_pt4_log": np.nan_to_num(np.log10(event_weights_pt4), nan=self._low-1), + "weights_pt5_log": np.nan_to_num(np.log10(event_weights_pt5), nan=self._low-1), + } + + for var_name, var_values in variables_to_fill.items(): + if var_name not in self._hist_lst: + print(f"Skipping \"{var_name}\", it is not in the list of hists to include") + continue + + if 'neg' in var_name: hout[var_name].fill(var_values, weight=counts[event_weights_SM < 0]) + else: hout[var_name].fill(var_values, weight=counts) + #if ak.any(event_weights_SM[event_weights_SM > 100]): + # hout['events_100'][events.metadata['filename']] = event_weights_SM[event_weights_SM > 100] + #hout['events'][events.metadata['filename']] = event_weights_SM.tolist() + + return hout + + def postprocess(self, accumulator): + return accumulator + + +if __name__ == '__main__': + raise Exception('Please use `run_gen_hist_eventweights_processor.py` to run this processor!') diff --git a/analysis/mc_validation/gen_processor.py b/analysis/mc_validation/gen_processor.py new file mode 100644 index 000000000..31ad7cb9b --- /dev/null +++ b/analysis/mc_validation/gen_processor.py @@ -0,0 +1,549 @@ +#!/usr/bin/env python +import copy +import coffea +#from coffea.nanoevents.methods import candidate +import warnings +warnings.filterwarnings("ignore", message="Missing cross-reference index") +import numpy as np +import awkward as ak +np.seterr(divide='ignore', invalid='ignore', over='ignore') +from coffea import processor +from coffea.analysis_tools import PackedSelection +import hist + +import topeft.modules.object_selection as te_os +from topcoffea.modules.histEFT import HistEFT +import topcoffea.modules.eft_helper as efth +import topcoffea.modules.event_selection as tc_es +from topeft.modules.axes import info as axes_info +from topcoffea.modules.get_param_from_jsons import GetParam +from topcoffea.modules.paths import topcoffea_path +get_tc_param = GetParam(topcoffea_path("params/params.json")) + +#import topcoffea.modules.corrections as tc_cor + +def construct_cat_name(chan_str,nlep_cat,njet_str=None,flav_str=None): + + # Get the component strings + nlep_str = chan_str.split("_")[0] # Assumes n leps comes first in the str + chan_str = "_".join(chan_str.split("_")[1:]) # The rest of the channel name is everything that comes after nlep + if chan_str == "": chan_str = None # So that we properly skip this in the for loop below + if flav_str is not None: + flav_str = flav_str + if njet_str is not None: + njet_str = njet_str[-2:] # Assumes number of n jets comes at the end of the string + if "j" not in njet_str: + # The njet string should really have a "j" in it + raise Exception(f"Something when wrong while trying to consturct channel name, is \"{njet_str}\" an njet string?") + + # Put the component strings into the channel name + ret_str = nlep_str + ret_str = nlep_cat + for component in [flav_str,chan_str,njet_str]: + if component is None: continue + ret_str = "_".join([ret_str,component]) + ret_str = ret_str.replace('ph_ph', 'ph') + return ret_str + + +class AnalysisProcessor(processor.ProcessorABC): + + def __init__(self, samples, wc_names_lst=[], hist_lst=None, ecut_threshold=None, do_errors=False, do_systematics=False, split_by_lepton_flavor=False, skip_signal_regions=False, skip_control_regions=False, dtype=np.float32): + + 
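+        '''
+        samples:        sample-metadata dict taken from the input json files
+        wc_names_lst:   Wilson-coefficient names used to book the HistEFT axes
+        hist_lst:       optional subset of histograms to fill (fill all if None)
+        do_errors:      whether to calculate and store the w**2 coefficients
+        do_systematics: whether to process systematic samples
+        split_by_lepton_flavor / skip_signal_regions / skip_control_regions:
+                        control which channel categories get filled
+        '''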
self._samples = samples + self._wc_names_lst = wc_names_lst + self._dtype = dtype + self._do_systematics = do_systematics # Whether to process systematic samples + self._skip_signal_regions = skip_signal_regions # Whether to skip the SR categories + self._skip_control_regions = skip_control_regions # Whether to skip the CR categories + self._split_by_lepton_flavor = split_by_lepton_flavor # Whether to keep track of lepton flavors individually + + # Create the histograms + axes_info["photon_pt"] = { + "regular": (20, 0, 400), + "variable": [20,35,50,70,100,170,200,250,300], + "label": "$p_{T}$ $\gamma$ (GeV)" + } + proc_axis = hist.axis.StrCategory([], name="process", growth=True) + chan_axis = hist.axis.StrCategory([], name="channel", growth=True) + syst_axis = hist.axis.StrCategory([], name="systematic", growth=True) + appl_axis = hist.axis.StrCategory([], name="appl", label=r"AR/SR", growth=True) + self._accumulator = { + "mll_fromzg_e" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_e", label=r"invmass ee from z/gamma"), wc_names=wc_names_lst, rebin=False), + "mll_fromzg_m" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_m", label=r"invmass mm from z/gamma"), wc_names=wc_names_lst, rebin=False), + "mll_fromzg_t" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_t", label=r"invmass tautau from z/gamma"), wc_names=wc_names_lst, rebin=False), + "mll" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(30, 0, 300, name="mll", label=r"Invmass l0l1"), wc_names=wc_names_lst, rebin=False), + "invm" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Variable([300, 310, 320, 350, 400, 500, 700, 1000], name="invm", label=r"Invmass of system"), wc_names=wc_names_lst, rebin=False), + "invm_ttX" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Variable([400, 410, 420, 450, 500, 550, 600, 650, 700, 750, 800, 900, 1000], name="invm_ttX", label=r"Invmass of system"), wc_names=wc_names_lst, rebin=False), + "invm_tX" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Variable([200, 210, 220, 250, 300, 350, 400, 450, 500, 600, 700, 1000], name="invm_tX", label=r"Invmass of system"), wc_names=wc_names_lst, rebin=False), + "invm_4t" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Variable([700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, 4000], name="invm_4t", label=r"Invmass of system"), wc_names=wc_names_lst, rebin=False), + "ht" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(100, 0, 1000, name="ht", label=r"Scalar sum of genjet pt"), wc_names=wc_names_lst, rebin=False), + "ht_clean" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(100, 0, 1000, name="ht_clean", label=r"Scalar sum of clean genjet pt"), wc_names=wc_names_lst, rebin=False), + "lhe_t_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(50, 0, 500, name="lhe_t_pt", label=r"Pt of the leading LHE t"), wc_names=wc_names_lst, rebin=False), + "t_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(10, 0, 500, name="t_pt", label=r"Pt of the leading t"), wc_names=wc_names_lst, rebin=False), + "z_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(50, 0, 500, name="z_pt", label=r"Pt of the leading t"), wc_names=wc_names_lst, rebin=False), + "H_pt" : HistEFT(proc_axis, chan_axis, syst_axis, 
appl_axis, hist.axis.Regular(50, 0, 500, name="H_pt", label=r"Pt of the leading t"), wc_names=wc_names_lst, rebin=False), + "tops_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(50, 0, 500, name="tops_pt", label=r"Pt of the sum of the tops"), wc_names=wc_names_lst, rebin=False), + "dral" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(10,0,1, name="dral", label=r"$\Delta R(\gamma, \ell)$"), wc_names=wc_names_lst, rebin=False), + "dral_sec" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(100,0,1, name="dral_sec", label=r"Sub $\Delta R(\gamma, \ell)$"), wc_names=wc_names_lst, rebin=False), + "draj" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(10,0,1, name="draj", label=r"$\Delta R(\gamma, j)$"), wc_names=wc_names_lst, rebin=False), + "photon_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Variable(axes_info["photon_pt"]["variable"], name="photon_pt", label=r"$p_{\mathrm{T}}$ $\gamma$ (GeV)"), wc_names=wc_names_lst, rebin=False), + "photon_pt_gen" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Variable([20,35,50,70,100,170,200,250,300], name="photon_pt", label=r"$p_{\mathrm{T}}$ $\gamma$ (GeV)"), wc_names=wc_names_lst, rebin=False), + "photon_pt_cnt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(28,20,300, name="photon_pt", label=r"$p_{\mathrm{T}}$ $\gamma$ (GeV) count"), wc_names=wc_names_lst, rebin=False), + "photon_eta" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(30, -1.5, 1.5, name="photon_eta", label=r"$\eta$ $\gamma$ (GeV)"), wc_names=wc_names_lst, rebin=False), + "SumOfWeights": HistEFT(proc_axis, hist.axis.Regular(bins=1, start=0, stop=2, name="SumOfWeights", label="SumOfWeights"), wc_names=wc_names_lst), + "SumOfWeights_eft": HistEFT(proc_axis, hist.axis.Regular(bins=1, start=0, stop=2, name="SumOfWeights", label="SumOfWeights"), wc_names=wc_names_lst), + "lhe_l0pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(25, 0, 250, name="lhe_l0pt", label=r"Pt of leading LHE lepton"), wc_names=wc_names_lst, rebin=False), + "l0pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(25, 0, 250, name="l0pt", label=r"Pt of leading lepton"), wc_names=wc_names_lst, rebin=False), + "j0pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(15, 0, 500, name="j0pt", label=r"Pt of leading jet"), wc_names=wc_names_lst, rebin=False), + "lj0pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(15, 0, 500, name="lj0pt", label=r"Pt of leading light jet"), wc_names=wc_names_lst, rebin=False), + "bj0pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(15, 0, 500, name="bj0pt", label=r"Pt of leading b jet"), wc_names=wc_names_lst, rebin=False), + "tX_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(40, 0, 400, name="tX_pt", label=r"Pt of the t(t)X system"), wc_names=wc_names_lst, rebin=False), + "njets" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(10, 0, 10, name="njets", label=r"njets"), wc_names=wc_names_lst, rebin=False), + } + self._accumulator['gen_reco'] = hist.Hist( + proc_axis, + chan_axis, + syst_axis, + appl_axis, + hist.axis.Variable([20,35,50,70,100,170,200,250,300], name='ptgen', label=r'gen $p_{\rm{T}}$'), + hist.axis.Variable([20,35,50,70,100,170,200,250,300], name='ptreco', label=r'reco $p_{\rm{T}}$'), + label=r"Events" + ) + + # Set the list 
of hists to fill + if hist_lst is None: + # If the hist list is none, assume we want to fill all hists + self._hist_lst = list(self._accumulator.keys()) + else: + # Otherwise, just fill the specified subset of hists + for hist_to_include in hist_lst: + if hist_to_include not in self._accumulator.keys(): + raise Exception(f"Error: Cannot specify hist \"{hist_to_include}\", it is not defined in the processor.") + self._hist_lst = hist_lst # Which hists to fill + + # Set the booleans + self._do_errors = do_errors # Whether to calculate and store the w**2 coefficients + + + @property + def accumulator(self): + return self._accumulator + + @property + def columns(self): + return self._columns + + # Main function: run on a given dataset + def process(self, events): + + ### Dataset parameters ### + dataset = events.metadata["dataset"] + + isData = self._samples[dataset]["isData"] + histAxisName = self._samples[dataset]["histAxisName"] + year = self._samples[dataset]["year"] + xsec = self._samples[dataset]["xsec"] + sow = self._samples[dataset]["nSumOfWeights"] + post_mortem = self._samples[dataset]["post_mortem"] + sow_renormUp = self._samples[dataset]["nSumOfWeights"] + sow_renormDown = self._samples[dataset]["nSumOfWeights"] + sow_factUp = self._samples[dataset]["nSumOfWeights"] + sow_factDown = self._samples[dataset]["nSumOfWeights"] + + if isData: raise Exception("Error: This processor is not for data") + + selections = PackedSelection(dtype='uint64') + + ### Get gen particles collection ### + genpart = events.GenPart + lhepart = events.LHEPart + #photon = events.Photon + genjet = events.GenJet + + ### Lepton object selection ### + + gen_top = ak.pad_none(genpart[(abs(genpart.pdgId) == 6)],2) + gen_Z = ak.pad_none(genpart[(abs(genpart.pdgId) == 23)],2) + gen_H = ak.pad_none(genpart[(abs(genpart.pdgId) == 25)],2) + gen_bos = ak.pad_none(genpart[(abs(genpart.pdgId) == 23) | (abs(genpart.pdgId) == 24) | abs(genpart.pdgId) == 25],2) + + ############################################################################################### + # Be careful with the LHE information. Not every process has the particles in the same order! 
# + ############################################################################################### + lhe_top = lhepart[(np.abs(lhepart.pdgId) == 6)] + lhe_top = ak.max(lhe_top.pt, axis=1) + lhe_l = lhepart[(np.abs(lhepart.pdgId) == 11) | (np.abs(lhepart.pdgId) == 13) | (np.abs(lhepart.pdgId) == 15)] + lhe_l = ak.max(lhe_l.pt, axis=1) + + gen_l = genpart[((abs(genpart.pdgId) == 11) | (abs(genpart.pdgId) == 13) | (abs(genpart.pdgId) == 15))] + gen_e = genpart[(abs(genpart.pdgId) == 11)] + gen_m = genpart[(abs(genpart.pdgId) == 13)] + gen_t = genpart[(abs(genpart.pdgId) == 15)] + + # Dressed leptons + gen_l = events.GenDressedLepton + gen_e = gen_l[(abs(gen_l.pdgId) == 11)] + gen_m = gen_l[(abs(gen_l.pdgId) == 13)] + gen_t = gen_l[(abs(gen_l.pdgId) == 15)] + + gen_p_l = genpart[(abs(genpart.pdgId) == 22)] + gen_p = events.GenIsolatedPhoton + ######### Systematics ########### + + # Define the lists of systematics we include + wgt_correction_syst_lst = [ + "phoSFUp","phoSFDown", # Exp systs + "renormUp","renormDown", # Exp systs + "factUp","factDown", # Exp systs + #"CMS_effUp","CMS_effDown" # Exp systs + ] + + gen_l = gen_l[ak.argsort(gen_l.pt, axis=-1, ascending=False)] + gen_l = ak.pad_none(gen_l,2) + gen_e = gen_e[ak.argsort(gen_e.pt, axis=-1, ascending=False)] + gen_m = gen_m[ak.argsort(gen_m.pt, axis=-1, ascending=False)] + gen_t = gen_t[ak.argsort(gen_t.pt, axis=-1, ascending=False)] + + # Object selection after padding + is_em = (np.abs(gen_l[:,0].pdgId)==11) & (np.abs(gen_l[:,1].pdgId)==13) + is_me = (np.abs(gen_l[:,1].pdgId)==11) & (np.abs(gen_l[:,0].pdgId)==13) + is2lOS = (gen_l[:,0].pdgId+gen_l[:,1].pdgId == 0) & ((ak.num(gen_e) + ak.num(gen_m))==2) + is2lSS = (gen_l[:,0].pdgId+gen_l[:,1].pdgId != 0) & ((ak.num(gen_e) + ak.num(gen_m))==2) + is2lOS_em = is2lOS & ak.any(is_em | is_me) + is3l = (ak.num(gen_e) + ak.num(gen_m))==3 + + gen_p = ak.pad_none(gen_p,1) + gen_p = ak.with_name(gen_p, 'PtEtaPhiMCandidate') + + # Jet object selection + genjet = genjet + genjet_clean_near, genjet_clean_DR = gen_p.nearest(genjet, return_metric=True) + genlep_clean_near, genlep_clean_DR = gen_p.nearest(gen_l, return_metric=True) + no_colinear = ak.all(genjet_clean_DR > 0.1, axis=1) + is_clean_jet = te_os.isClean(genjet, gen_e, drmin=0.4) & te_os.isClean(genjet, gen_m, drmin=0.4) & te_os.isClean(genjet, gen_t, drmin=0.4) & te_os.isClean(genjet, gen_p, drmin=0.1) + is_clean_ph = te_os.isClean(gen_p, gen_e, drmin=0.1) & te_os.isClean(gen_p, gen_m, drmin=0.1) + gen_p_clean = gen_p + genjet_clean = genjet + genbjet_clean = genjet + njets = ak.num(genjet_clean) + nbjets = ak.num(genbjet_clean) + atleast_1ph = ak.num(gen_p_clean)>0 + exactly_1ph = ak.num(gen_p_clean)==1 + + + selections.add("2los_CRtt", is2lOS_em & (nbjets==2) & (~atleast_1ph)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("incl", np.ones(len(events), dtype=bool)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("incl_l0pt", (ak.firsts(gen_l.pt)>25)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("incl_l0pt_dral0p4", (ak.firsts(gen_l.pt)>25) & ak.fill_none(ak.firsts(genlep_clean_DR)>0.4, False)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor 
turned on to enforce this requirement + selections.add("incl_dral0p1", (ak.firsts(genlep_clean_DR)>0.1)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("incl_draj_dral0p1", (ak.firsts(genlep_clean_DR)>0.1) & (ak.firsts(genjet_clean_DR)>0.1)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("incl_draj_dral0p4", (ak.firsts(genlep_clean_DR)>0.4) & (ak.firsts(genjet_clean_DR)>0.4)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("incl_draj0p5_dral0p4", (ak.firsts(genlep_clean_DR)>0.4) & (ak.firsts(genjet_clean_DR)>0.5)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("2lss", is2lSS & (nbjets>=1) & (ak.firsts(genjet_clean.pt)>30)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("2los_ph", is2lOS & (nbjets==2) & (exactly_1ph) & (ak.firsts(gen_p).pt>20)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("2los_ph_dral0p1", is2lOS & (ak.firsts(genlep_clean_DR)>0.1) & (nbjets==2) & (atleast_1ph) & (ak.firsts(gen_p).pt>20)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("2los_ph_dral0p4", is2lOS & (ak.firsts(genlep_clean_DR)>0.4) & (nbjets==2) & (atleast_1ph) & (ak.firsts(gen_p).pt>20)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("2los_ph_l", is2lOS & (nbjets==2) & (ak.firsts(genjet_clean.pt)>30) & (ak.num(gen_p_l[(np.abs(gen_p_l.distinctParent.pdgId)==11) | (np.abs(gen_p_l.distinctParent.pdgId)==13)])==1) & (ak.firsts(gen_p_l[(np.abs(gen_p_l.distinctParent.pdgId)==11) | (np.abs(gen_p_l.distinctParent.pdgId)==13)]).pt>20)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + sfosz_2los_ttg_mask = tc_es.get_Z_peak_mask(gen_l[:,0:2],pt_window=15.0) + Zllgamma_SF_mask = (abs( (gen_l[:,0] + gen_l[:,1] + gen_p[:,0]).mass -91.2) > 15) + selections.add("2los_sf_Zg_CR_ULttg", (is2lOS_em & ~sfosz_2los_ttg_mask & Zllgamma_SF_mask & (nbjets==2) & atleast_1ph)) + selections.add("3l", is3l & (nbjets>=0) & (ak.firsts(genjet_clean.pt)>30)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + + + # Njets selection + selections.add("exactly_0j", (njets==0)) + selections.add("exactly_1j", (njets==1)) + selections.add("exactly_2j", (njets==2)) + selections.add("exactly_3j", (njets==3)) + selections.add("exactly_4j", (njets==4)) + selections.add("exactly_5j", (njets==5)) + selections.add("exactly_6j", (njets==6)) + selections.add("atleast_1j", (njets>=1)) + selections.add("atleast_2j", (njets>=2)) + selections.add("atleast_3j", (njets>=3)) + selections.add("atleast_4j", (njets>=4)) + selections.add("atleast_5j", (njets>=5)) + 
selections.add("atleast_7j", (njets>=7)) + selections.add("atleast_0j", (njets>=0)) + selections.add("atmost_1j" , (njets<=1)) + selections.add("atmost_3j" , (njets<=3)) + + + # AR/SR categories + selections.add("isSR_2lSS", is2lSS)#( events.is2l_SR) & charge2l_0) + selections.add("isSR_2lOS", is2lOS)#( events.is2l_SR) & charge2l_0) + selections.add("isSR_3l", is3l)#(~events.is2l_SR) & charge2l_0) + selections.add("isAR_2lSS", is2lSS)#(~events.is2l_SR) & charge2l_0) + selections.add("isAR_2lOS", is2lOS)#(~events.is2l_SR) & charge2l_0) + selections.add("isAR_3l", is3l)#(~events.is2l_SR) & charge2l_0) + selections.add("isAR_incl", np.ones(len(events), dtype=bool))#(~events.is2l_SR) & charge2l_0) + + + ### Get dense axis variables ### + + ht = ak.sum(genjet.pt,axis=-1) + ht_clean = ak.sum(genjet_clean.pt,axis=-1) + + tops_pt = gen_top.sum().pt + alltX = gen_top.sum() + ak.firsts(gen_bos) + tX = ak.firsts(gen_top) + ak.firsts(gen_bos) + ttX = gen_top[:,0:2].sum() + ak.firsts(gen_bos) + tttt = gen_top[:,0:5].sum()# + ak.firsts(gen_bos) + + # Invmass distributions + mll_l0l1 = (gen_l[:,0] + gen_l[:,1]).mass + + sfos_mask = gen_l[:,0].pdgId == -gen_l[:,1].pdgId + mll_l0l1 = ak.where(sfos_mask, mll_l0l1, ak.ones_like(mll_l0l1)*-1) + sfos30 = (gen_l[:,0] + gen_l[:,1]).mass >= 30 + + selections.add("incl_sfos30", sfos30) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("incl_sfos30_dral0p4", sfos30 & (ak.firsts(genlep_clean_DR)>0.4)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("2los_ph_sfos30", is2lOS & sfos30 & (nbjets==2) & (atleast_1ph) & (ak.firsts(gen_p).pt>20)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("2los_ph_sfos30_dral0p4", is2lOS & sfos30 & (ak.firsts(genlep_clean_DR)>0.4) & (nbjets==2) & (atleast_1ph) & (ak.firsts(gen_p).pt>20)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("incl_0p_sfos30_dral0p4", (sfos30 & ((events.LHE.Nc + events.LHE.Nuds + events.LHE.Nglu)==0) & ak.fill_none(ak.firsts(genlep_clean_DR)>0.4, False))) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("ee", ((np.abs(gen_l[:,0].pdgId)==11) & (np.abs(gen_l[:,1].pdgId)==11))) + selections.add("em", (((np.abs(gen_l[:,0].pdgId)==11) & (np.abs(gen_l[:,1].pdgId)==13)) | ((np.abs(gen_l[:,0].pdgId)==13) & (np.abs(gen_l[:,1].pdgId)==11)))) + selections.add("mm", ((np.abs(gen_l[:,0].pdgId)==13) & (np.abs(gen_l[:,1].pdgId)==13))) + + gen_p_pt = ak.fill_none(ak.firsts(gen_p).pt, -1) + gen_p_eta = ak.fill_none(ak.firsts(gen_p).eta, -999) + + # Dictionary of dense axis values + dense_axis_dict = { + "photon_pt" : gen_p_pt, + "photon_eta" : gen_p_eta, + "mll" : ak.fill_none(mll_l0l1, -1), + "invm" : ak.fill_none(alltX.mass, -1), + "invm_ttX" : ak.fill_none(ttX.mass, -1), + "invm_tX" : ak.fill_none(tX.mass, -1), + "invm_4t" : ak.fill_none(tttt.mass, -1), + "t_pt" : ak.fill_none(ak.firsts(gen_top).pt, -1), + "z_pt" : ak.fill_none(ak.firsts(gen_Z).pt, -1), + "H_pt" : ak.fill_none(ak.firsts(gen_H).pt, -1), + "lhe_t_pt" : ak.fill_none(lhe_top, -1), + "l0pt" : 
ak.fill_none(ak.firsts(gen_l.pt), -1), + "j0pt" : ak.fill_none(ak.firsts(genjet_clean.pt), -1), + "lj0pt" : ak.fill_none(ak.firsts(genjet_clean[np.abs(genjet_clean.partonFlavour)!=5].pt), -1), + "bj0pt" : ak.fill_none(ak.firsts(genbjet_clean.pt), -1), + "draj" : ak.fill_none(ak.firsts(genjet_clean_DR), -1), + "dral" : ak.fill_none(ak.firsts(genlep_clean_DR), -1), + "dral_sec" : ak.fill_none(ak.pad_none(genlep_clean_DR, 2), -1)[:,1], + "njets" : njets, + } + cat_dict = {} + sr_cat_dict = { + "2los_ph" : { + "atleast_1j" : { + "lep_chan_lst" : ["2los_ph", "2los_ph_l"], + "lep_flav_lst" : ["ee", "em", "mm"],# no split so only run once, "mm", "em"], + "appl_lst" : ["isSR_2lOS", "isAR_2lOS"], + }, + "exactly_2j" : { + "lep_chan_lst" : ["2los_ph", "2los_ph_l"], + "lep_flav_lst" : ["ee", "em", "mm"],# no split so only run once, "mm", "em"], + "appl_lst" : ["isSR_2lOS", "isAR_2lOS"], + }, + "atleast_3j" : { + "lep_chan_lst" : ["2los_ph", "2los_ph_l"], + "lep_flav_lst" : ["ee", "em", "mm"],# no split so only run once, "mm", "em"], + "appl_lst" : ["isSR_2lOS", "isAR_2lOS"], + }, + }, + "2lss" : { + "exactly_2j" : { + "lep_chan_lst" : ["2lss"], + "lep_flav_lst" : ["ee", "em", "mm"],# no split so only run once, "mm", "em"], + "appl_lst" : ["isSR_2lSS", "isAR_2lSS"], + }, + "atleast_3j" : { + "lep_chan_lst" : ["2lss"], + "lep_flav_lst" : ["ee", "em", "mm"],# no split so only run once, "mm", "em"], + "appl_lst" : ["isSR_2lSS", "isAR_2lSS"], + }, + }, + "3l" : { + "exactly_2j" : { + "lep_chan_lst" : ["3l"], + "lep_flav_lst" : ["ee"],# no split so only run once, "mm", "em"], + "appl_lst" : ["isSR_3l", "isAR_3l"], + }, + "atleast_1j" : { + "lep_chan_lst" : ["3l"], + "lep_flav_lst" : ["ee"],# no split so only run once, "mm", "em"], + "appl_lst" : ["isSR_3l", "isAR_3l"], + }, + }, + "incl": { + "atleast_0j" : { + "lep_chan_lst" : ["incl", "incl_sfos30", "incl_draj_dral0p4", "incl_draj0p5_dral0p4", "incl_sfos30_dral0p4", "2los_ph", "2los_ph_dral0p4", "2los_ph_sfos30", "2los_ph_sfos30_dral0p4", "incl_0p", "incl_1p", "incl_0p_dral0p4", "incl_1p_dral0p4", "incl_0p_sfos30_dral0p4", "incl_l0pt", "incl_l0pt_dral0p4"], + "lep_flav_lst" : ["em", "ee", "mm"], + "appl_lst" : ["isAR_incl", "isSR_2lOS"], + }, + "atleast_1j" : { + "lep_chan_lst" : ["2los_ph", "2los_ph_dral0p4"], + "lep_flav_lst" : ["em", "ee", "mm"], + "appl_lst" : ["isSR_2lOS"], + } + } + } + cr_cat_dict = { + "2los_CRtt" : { + "atmost_3j" : { + "lep_chan_lst" : ["2los_ph"], + "lep_flav_lst" : ["em", "ee", "mm"], + "appl_lst" : ["isSR_2lOS" , "isAR_2lOS"], + }, + }, + "2los_CR_Zg_ULttg" : { + "atleast_2j" : { + "lep_chan_lst" : ["2los_sf_Zg_CR_ULttg"], + "lep_flav_lst" : ["em", "ee", "mm"], + "appl_lst" : ["isSR_2lOS", "isAR_2lOS"], + }, + }, + } + + + ### Get weights ### + + # Extract the EFT quadratic coefficients and optionally use them to calculate the coefficients on the w**2 quartic function + # eft_coeffs is never Jagged so convert immediately to numpy for ease of use. 
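+ # For a sample generated with n WCs, each event weight is a quadratic polynomial in the WCs,
+ # w(c) = s0 + sum_i s1_i*c_i + sum_{i<=j} s2_ij*c_i*c_j, so EFTfitCoefficients should carry
+ # (n+1)(n+2)/2 numbers per event; HistEFT uses them to evaluate the yield at any WC point.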
+ eft_coeffs = None + if hasattr(events, "EFTfitCoefficients") or post_mortem: + eft_coeffs = ak.to_numpy(events["EFTfitCoefficients"]) + eft_coeffs_fit = ak.to_numpy(events["EFTfitCoefficients"]) + if post_mortem: + eft_coeffs = ak.to_numpy(events["EFTPostMortem"]) + LHEWeight_originalXWGTUP = ak.broadcast_arrays( + events.LHEWeight.originalXWGTUP[:, None], + eft_coeffs + )[0] + eft_coeffs = eft_coeffs*LHEWeight_originalXWGTUP + + if eft_coeffs is not None: + # Check to see if the ordering of WCs for this sample matches what want + if post_mortem and self._samples[dataset]["PMWCnames"] != self._wc_names_lst: + eft_coeffs = efth.remap_coeffs(self._samples[dataset]["PMWCnames"], self._wc_names_lst, eft_coeffs) + elif self._samples[dataset]["WCnames"] != self._wc_names_lst and not post_mortem: + eft_coeffs = efth.remap_coeffs(self._samples[dataset]["WCnames"], self._wc_names_lst, eft_coeffs) + + eft_w2_coeffs = efth.calc_w2_coeffs(eft_coeffs,self._dtype) if (self._do_errors and eft_coeffs is not None) else None + + # If this is not an eft sample, get the genWeight + if eft_coeffs is None: genw = events["genWeight"] + else: genw = np.ones_like(events["event"]) + lumi = 1000.0*get_tc_param(f"lumi_{year}") + event_weight = lumi*xsec*genw/sow + + weights_dict = {} + # For both data and MC + weights_obj_base = coffea.analysis_tools.Weights(len(events),storeIndividual=True) + weights_obj_base.add("norm",event_weight) + + events.nom = np.ones(len(events)) + + for ch_name in ["2los_CRtt", "2lss", "2los_ph", "2los_ph_l", "2los_CR_Zg_ULttg", "3l", "incl"]: + weights_dict[ch_name] = copy.deepcopy(weights_obj_base) + + ### Loop over the hists we want to fill ### + + hout = self.accumulator + counts = np.ones_like(events['event']) + + for dense_axis_name, dense_axis_vals in dense_axis_dict.items(): + if dense_axis_name not in self._hist_lst or '_cnt' in dense_axis_name: + continue + if not self._skip_signal_regions: + cat_dict.update(sr_cat_dict) + if not self._skip_control_regions: + cat_dict.update(cr_cat_dict) + if (not self._skip_signal_regions and not self._skip_control_regions): + for k in sr_cat_dict: + if k in cr_cat_dict: + raise Exception(f"The key {k} is in both CR and SR dictionaries.") + + # Set up the list of syst wgt variations to loop over + wgt_var_lst = ["nominal"] + if self._do_systematics: + if not isData: + wgt_var_lst = wgt_var_lst + wgt_correction_syst_lst + + for wgt_fluct in wgt_var_lst: + for nlep_cat in cat_dict.keys(): + flav_ch = None + njet_ch = None + njets_any_mask = selections.any(*cat_dict[nlep_cat].keys()) + weights_object = weights_dict[nlep_cat] + if (wgt_fluct == "nominal"): + # In the case of "nominal", or the jet energy systematics, no weight systematic variation is used + weight = weights_object.weight(None) + else: + # Otherwise get the weight from the Weights object + if wgt_fluct in weights_object.variations: + weight = weights_object.weight(wgt_fluct) + else: + # Note in this case there is no up/down fluct for this cateogry, so we don't want to fill a hist for it + continue + # Loop over the njets list for each channel + for njet_val in cat_dict[nlep_cat]: + + # Loop over the appropriate AR and SR for this channel + for appl in cat_dict[nlep_cat][njet_val]["appl_lst"]: + # Loop over the channels in each nlep cat (e.g. 
"3l_m_offZ_1b") + for lep_chan in cat_dict[nlep_cat][njet_val]["lep_chan_lst"]: + + # Loop over the lep flavor list for each channel + for lep_flav in cat_dict[nlep_cat][njet_val]["lep_flav_lst"]: + cuts_lst = [appl,lep_chan] + if 'incl' in nlep_cat: + cuts_lst = [lep_chan] + if self._split_by_lepton_flavor: + flav_ch = lep_flav + cuts_lst.append(lep_flav) + if dense_axis_name == "njets": + all_cuts_mask = (selections.all(*cuts_lst) & njets_any_mask) + else: + njet_ch = njet_val + all_cuts_mask = (selections.all(*cuts_lst)) + ch_name = construct_cat_name(lep_chan,nlep_cat,njet_str=njet_ch,flav_str=flav_ch) + + # Mask out the none values + isnotnone_mask = (ak.fill_none((dense_axis_vals != None),False)) + isnotnone_mask = isnotnone_mask & all_cuts_mask + dense_axis_vals_cut = dense_axis_vals[isnotnone_mask] + event_weight_cut = weight[isnotnone_mask] + dense_axis_vals_cut = dense_axis_vals[all_cuts_mask]#[isnotnone_mask] + event_weight_cut = weight[all_cuts_mask]#[isnotnone_mask] + eft_coeffs_cut = eft_coeffs + if eft_coeffs is not None: eft_coeffs_cut = eft_coeffs[all_cuts_mask] + + # Fill the histos + axes_fill_info_dict = { + dense_axis_name : dense_axis_vals_cut, + "process" : histAxisName, + "appl" : appl, + "channel" : ch_name, + "systematic" : wgt_fluct, + "weight" : event_weight_cut, + "eft_coeff" : eft_coeffs_cut, + } + + hout[dense_axis_name].fill(**axes_fill_info_dict) + + axes_fill_info_dict[dense_axis_name] = gen_p_pt[all_cuts_mask] + if 'photon_pt' in dense_axis_name and 'lhe' not in dense_axis_name: hout[dense_axis_name+'_gen'].fill(**axes_fill_info_dict) + + axes_fill_info_dict['weight'] = np.ones_like(event_weight_cut) + + # Do not loop over lep flavors if not self._split_by_lepton_flavor, it's a waste of time and also we'd fill the hists too many times + if not self._split_by_lepton_flavor: break + return hout + + def postprocess(self, accumulator): + return accumulator + + +if __name__ == '__main__': + raise Exception('Please use `run_gen_analysis.py` to run this processor!') diff --git a/analysis/mc_validation/quad_curves.py b/analysis/mc_validation/quad_curves.py new file mode 100644 index 000000000..ec21141b4 --- /dev/null +++ b/analysis/mc_validation/quad_curves.py @@ -0,0 +1,410 @@ +''' +This script produces the quadratic curves based on the SumOfWeights (inclusive cross-section). +As a minimum it takes in the SumOfWeights pkl file and a path to save the plots. +The path defaults to `~/afs/www/EFT/` which is the web are on `glados`. +Modify the script if you're using another system like `lxplus`. +The script finds and prints the WC values where they each scale the SM by 10%. + +By default the script skips any WCs that have complex roots +or ones that need a |value|>100 to scale the SM by 10%. +To save these, use the `--save-all` flag, which will create a sub directory called `bad`. + +Starting Points: +You can also pass the json file for the process which may have optional fields. +If you have the starting point used for the process, add it to the json file as +` +"StPt": { + "WC1": st_pt, + "WC2": st_pt, + ... + "WCn": stpt +}, +` +then the script will draw the orange dots at the starting point. +Otherwise it draws them at zero + +MadGraph dedicated point validation: +If you have the MadGraph cross-section for dedicated points, add them to the json file as +` + "mg_weights": { + "cHQ3": {"-15": 0.332, "-5": 0.299, "-1": 0.30, "1": 0.300, "5": 0.3296}, + ... + }, +` +then the script will draw the red stars and fit a quadratic to them as well. 
+ + +Example: +python quad_curves.py ../topeft_run2/histos/2022_ttlnuJet_Run3.pkl.gz --json ../../input_samples/sample_jsons/signal_samples/private_UL/2022_ttlnuJet_nanoGEN.json --dout ttlnu_NewStPt_Run3 +''' +import os +import pickle +import gzip +import numpy as np +import matplotlib.pyplot as plt +import argparse +import json +from topcoffea.scripts.make_html import make_html + +#Load hists from pickle file created by TopCoffea +hists={} + +parser = argparse.ArgumentParser(description='You can select which file to run over') +parser.add_argument('fin' , default='analysis/topEFT/histos/mar03_central17_pdf_np.pkl.gz' , help = 'Variable to run over') +parser.add_argument('--dout' , default='ttgamma' , help = 'Where to save') +parser.add_argument('--save-all', action='store_true' , help = 'Where to save') +parser.add_argument('--json', default='', help = 'Json file(s) containing files and metadata') +parser.add_argument("--fixed" , action="extend", nargs="+", help="Fixed WC versions (sum of weights pkl file)" ) +parser.add_argument("--fixed-wcs" , action="extend", nargs="+", help="Fixed WC values (`wc:val`)" ) +parser.add_argument("--do-grad" , action="store_true", help="Do full gradient (defaults to single derivative for each WC)" ) +parser.add_argument('--scale' , default="1.1" , help = 'Where to stop') +args = parser.parse_args() +fin = args.fin +scale = float(args.scale) +assert args.fixed is None or len(args.fixed) == len(args.fixed_wcs), print(f'{len(args.fixed)} must match {len(args.fixed_wcs)}!') + +with gzip.open(fin) as fin: + hin = pickle.load(fin) + for k in hin.keys(): + if k in hists: hists[k]+=hin[k] + else: hists[k]=hin[k] + +fixed_points = [] +if args.fixed is not None: + for ifin,fin in enumerate(args.fixed): + print(f'Loading {ifin} {fin}') + with gzip.open(fin) as fin: + hin = pickle.load(fin) + fixed_points.append(hin['SumOfWeights']) + print('Done loading fixed points') + + +sow = hists['SumOfWeights'] +wcs = sow._wc_names +#wcs = ['ctW', 'ctZ'] +step = 0.01 # Small step size +wc_start = np.random.normal(0, .1, len(wcs)) # Pick some random values +wc_pt = dict(zip(wcs, wc_start)) +wc_start = wc_pt +sm = sow[{'process': sum}].eval(wc_pt)[()][1] +sm = sow[{'process': sum}].eval({})[()][1] + +for ifixed,fixed in enumerate(fixed_points): + print(args.fixed_wcs[ifixed], fixed_points[ifixed][{'process': sum}].eval({})[()], sm, sow[{'process': sum}].eval({'cHbox': 0.81})) + fixed_points[ifixed] = (args.fixed_wcs[ifixed], fixed_points[ifixed][{'process': sum}].eval({})[()][1] / sm) + print(fixed_points[ifixed]) + +wc_vals = {} + +wc_best = wc_pt + +for key in wc_start: + wc_vals[key] = [0] +wc_pt = {wc: 0 for wc in wcs} # Start at SM +#wc_pt = wc_start + +wc_start = wc_pt +#wc_pt = dict(zip(wcs, [0]*len(wcs))) +print(wc_pt) +# Gradient "ascent" in 26D +old = wc_pt # This restarts each WC to 0 to just compute the derivative, not the gradient +for n in range(1000): + val = sow[{'process': sum}].eval(wc_pt)[()][1] + # 10% of SM? 
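+ # Stop once the yield ratio is within 0.01 of the requested `scale`; otherwise the WC values are updated below and the loop tries again (at most 1000 iterations).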
+ #if np.abs(val - 54688) < 1e-3: + if np.abs(val / sm - scale) < 1e-2: + print("Reached 10%", val/sm, f'in {n} steps') + wc_best = wc_pt + break + #grad = np.array([]) + grad = np.empty(len(wc_pt)) + if not args.do_grad: + old = wc_pt # This restarts each WC to 0 to just compute the derivative, not the gradient + # Compute partial derivatives + i = 0 + for wc,wc_val in wc_pt.items(): + tmp = old + prev_val = val + h_val = val + ''' + if val/sm > 1.1: + h_val = wc_val - step + else: + h_val = wc_val + step + ''' + h_val = wc_val + step + wc_pt[wc] = h_val + tmp[wc] = h_val + val = sow[{'process': sum}].eval(tmp)[()][1] + # Approximate d(xsec)/d(wc) = slope + # Shift WC value by slope * step + ''' + if args.do_grad: + grad = np.append(grad, (step*(1.1 - val/sm) / h_val + wc_val)) + #if n % 200 == 0: + # print(f'{wc=}, {val/sm=}, {h_val=}, {wc_val=}') + # print('Will step by', step*(1.1 - val/sm) / h_val + wc_val) + else: + grad = np.append(grad, (1.1 - val/sm) / h_val + wc_val) + ''' + #grad = np.append(grad, step*(1.1 - val/sm) / h_val + wc_val) + #FIXME grad = np.append(grad, (1.1 - val/sm) / h_val + wc_val) + grad[i] = step*(scale - val/sm) / h_val + wc_val + i += 1 + #grad = np.append(grad, (54688 - val/sm) / h_val * step + wc_val) + #print(wc, tmp, val, grad[-1], val/sm, (1.1 - val/sm) / h_val + wc_val) + #print(wc, grad, (1.1 - val/sm) / h_val * step + wc_val) + #grad = np.append(grad, (1.1 - val/sm) / h_val * step + wc_val) + if args.do_grad: + old = tmp # Update so the other WCs aren't replaced with 0 + #if n % 200 == 0: print(old) + wc_pt = dict(zip(wcs, grad)) + # Save current WC values for later processing + for wc,wc_val in wc_pt.items(): + wc_vals[wc].append(wc_val) + +# Set insensitive WCs to inf +#print('delta', {wc: np.abs(wc_best[wc] - wc_start[wc]) for wc in wc_best}) +for wc in wc_best: + if np.abs(wc_best[wc] - wc_start[wc]) < 1e-3 or (np.abs(wc_best[wc]) > 100): + print('Dropping', wc) + wc_best[wc] = np.inf +#print(f'{wc_best=}') +# Drop insensitive WCs +for wc in wc_best: + if wc_best[wc] == np.inf: + wc_vals.pop(wc) + wc_best[wc] = 0 + +good_wcs = wc_vals.keys() +# Redo WC values +wc_vals = {wc: [] for wc in wc_vals} +wc_pt = {wc: -2 for wc in wc_vals} +val = sow[{'process': sum}].eval({})[()][1] +# Probably overkill if not computing yields here +for n in range(100): + for wc,wc_val in wc_pt.items(): + h_val = wc_val + 0.1 + prev = val + wc_pt[wc] = np.real(h_val) + wc_vals[wc].append(h_val) + #for wc in wc_vals: + #val = sow[{'process': sum}].eval(wc_pt)[()][1] + #wc_vals[wc].append(wc_pt[wc]) +wc_1d = {} + +# Make 1D plots +user = os.getlogin() +os.makedirs(f'/afs/crc.nd.edu/user/{user[0]}/{user}/www/EFT/{args.dout}/', exist_ok=True) +#wc_start = {"ctW": 1.580000, "cpQM": 62.660000, "ctq1": 1.190000, "cQq81": 2.430000, "ctZ": 2.560000, "cQq83": 2.780000, "ctG": 0.310000, "ctq8": 2.020000, "cQq13": 1.340000, "cQq11": 1.350000, "cpt": 32.930000} +#wc_start = {"ctW": 3.160000, "ctq1": 2.380000, "cQq81": 4.860000, "ctZ": 5.120000, "cQq83": 5.560000, "ctG": 0.620000, "ctq8": 4.040000, "cQq13": 2.680000, "cQq11": 2.700000} +# tttt +#wc_start = {"cbGRe": 23.360000, "ctj1": 0.640000, "cQj31": 0.750000, "ctGRe": 1.810000, "ctj8": 0.880000, "ctHRe": 11.050000, "cQj11": 0.760000, "ctu1": 0.870000, "cHtbRe": 52.960000, "cQj18": 1.280000, "clj1": 100.000000, "cleQt1Re22": 100.000000, "ctu8": 1.430000, "cQQ1": 0.800000, "cQt1": 0.860000, "cQj38": 1.320000, "ctb8": 69.420000, "ctd8": 1.990000, "cQt8": 1.660000, "cleQt3Re11": 100.000000, "ctd1": 1.170000, "cleQt3Re33": 100.000000, 
"cbWRe": 17.450000, "cHbox": 7.430000, "cld": 100.000000, "cleQt3Re22": 100.000000, "cQu1": 0.890000, "cQe": 100.000000, "cQd1": 1.160000, "cHt": 3.220000, "ctWRe": 2.980000, "clu": 100.000000, "cQd8": 1.790000, "cleQt1Re33": 100.000000, "cleQt1Re11": 100.000000, "ctt": 0.650000, "cHQ3": 2.960000, "cQb8": 69.850000, "cHQ1": 10.400000, "ctBRe": 3.720000, "cQu8": 1.290000, "cte": 100.000000, "ctl": 100.000000, "cQl1": 100.000000, "cbBRe": 100.000000, "cQl3": 100.000000} +#wc_start = {"cbGRe": 16.730000, "ctj1": -0.670000, "cQj31": 0.770000, "ctGRe": -0.190000, "ctj8": 1.140000, "ctHRe": -5.150000, "cQj11": -0.770000, "ctu1": -0.870000, "cHtbRe": 25.840000, "cQj18": 1.400000, "clj1": 100.000000, "cleQt1Re22": 100.000000, "ctu8": 1.790000, "cQQ1": -0.260000, "cQt1": 0.480000, "cQj38": 1.900000, "ctb8": 46.520000, "ctd8": 2.620000, "cQt8": -1.060000, "cleQt3Re11": 100.000000, "ctd1": -1.160000, "cleQt3Re33": 100.000000, "cbWRe": 13.780000, "cHbox": 5.190000, "cld": 100.000000, "cleQt3Re22": 100.000000, "cQu1": -0.900000, "cQe": 100.000000, "cQd1": -1.160000, "cHt": 1.990000, "ctWRe": 2.260000, "clu": 100.000000, "cQd8": 2.240000, "cleQt1Re33": 100.000000, "cleQt1Re11": 100.000000, "ctt": -0.280000 , "cHQ3": 1.970000, "cQb8": 44.360000, "cHQ1": -2.010000, "ctBRe": 2.670000, "cQu8": 1.650000, "cte": 100.000000, "ctl": 100.000000, "cQl1": 100.000000, "cbBRe": 64.600000, "cQl3": 100.000000} +mg_weights = {} +sm_xsec = 1 +if args.json != '': + with open(args.json) as fin: + print(f'Loading {args.json}') + j = json.load(fin) + if 'StPt' in j: + wc_start = j['StPt'] + print(f'Using {wc_start=}') + if 'mg_weights' in j: + mg_weights = j['mg_weights'] + if 'sm_xsec' in j: + sm_xsec = j['sm_xsec'] + print(mg_weights) +wc_bad = [] +if args.save_all: + os.makedirs(f'/afs/crc.nd.edu/user/{user[0]}/{user}/www/EFT/{args.dout}/bad/', exist_ok=True) +for wc in wc_vals: + has_mg = False + vals = wc_vals[wc] + vmin = np.min(wc_vals[wc]) + vmin = np.min([np.min(wc_vals[wc]), 0.9]) + vmax = np.max(wc_vals[wc]) + vmax = np.max([np.max(wc_vals[wc]), 6]) + vmin,vmax = [-6,6] + yields = [] + # Compute yields along saved points + for val in vals: + yields.append(sow[{'process': sum}].eval({wc: val})[()][1]/sm) + if val < vmin: vmin = val + if val > vmax: vmax = val + #if np.abs(np.max(yields[-1]) - 1) < 1e-2: + # wc_best.pop(wc) + # print(f'Removing {wc}') + # if args.save_all: + # wc_bad.append(wc) + # else: + # continue + # Fit 1D curve + poly = np.polyfit(vals, yields, 2) + roots = np.roots(poly - np.array([0,0,scale])) + if np.any(np.iscomplex(roots)):# or ((np.abs(poly[0]) < 1e-3) and (np.abs(poly[1]) < 1e-3)): + print('Skipping', wc, poly, '(complex roots)') + if not args.save_all: + continue + if wc in wc_best: + wc_best.pop(wc) + if wc not in wc_bad: + wc_bad.append(wc) + # Make smooth plot + if wc not in wc_start: + wc_start[wc] = 100. 
+ vmin = min(min(np.min(roots)*1.1, -6), wc_start[wc]) + vmax = max(max(np.max(roots)*1.1, 6), wc_start[wc]) + polymin = min(np.polyval(poly, np.linspace(vmin,vmax,100))) + if vmin < 100 and vmax > 100: + print('Skipping', wc, poly, f'10% * SM @ {wc} > 100') + if not args.save_all: + continue + if wc in wc_best: + wc_best.pop(wc) + if wc not in wc_bad: + wc_bad.append(wc) + + plt.plot(np.linspace(vmin,vmax,100), np.polyval(poly, np.linspace(vmin,vmax,100)), label=f'1D curve\n{np.round(poly[0], 2)} * {wc}^2 + {np.round(poly[1], 2)} * {wc} + {np.round(poly[2], 2)}') + # 10% larger than SM + plt.axhline(scale, linestyle='--', color='b') + #plt.plot(vals, yields, 'o', color='r') + #plt.plot(vals[::10], yields[::10], 'o', color='r') + wc_1d[wc] = roots + for iroot,root in enumerate(roots): + best = sow[{'process': sum}].eval({wc: root})[()][1] + if iroot==0: + plt.plot(root, best/sm, color='k', marker='o', label='$\sigma='+args.scale+'\sigma_{SM}$') + else: + plt.plot(root, best/sm, color='k', marker='o') + #best = sow[{'process': sum}].eval(wc_best)[()][1] + #plt.plot(wc_best[wc], best/sm, color='k', marker='o', label='26D $\sigma=1.1\sigma_{SM}$') + # Plot starting point + plt.plot(wc_start[wc], sow[{'process': sum}].eval({wc: wc_start[wc]})[()][1]/sm, '*', label='Starting point') + if 'mg_weights' in hists: + plt.plot([-1, 0, 1], hists['mg_weights'][wc]/hists['mg_weights'][wc][1], marker='*', ls='None', markersize=10, color='red') + if mg_weights: + mgv = [] + mgw = [] + #for mg_wc in mg_weights: + # if wc == mg_wc.split('_')[0]: + # if has_mg: plt.plot(mg_weights[mg_wc][0], mg_weights[mg_wc][1]/sm_xsec, marker='*', ls='None', markersize=10, color='red') + # else: plt.plot(mg_weights[mg_wc][0], mg_weights[mg_wc][1]/sm_xsec, marker='*', ls='None', markersize=10, color='red', label='MadGraph') + # #if has_mg: plt.plot(mg_weights[mg_wc][0], mg_weights[mg_wc][1]/mg_weights[mg_wc][2], marker='*', ls='None', markersize=10, color='red') + # #else: plt.plot(mg_weights[mg_wc][0], mg_weights[mg_wc][1]/mg_weights[mg_wc][2], marker='*', ls='None', markersize=10, color='red', label='MadGraph') + # has_mg = True + # mgv.append(mg_weights[mg_wc][0]) + # #mgw.append(mg_weights[mg_wc][1]/mg_weights[mg_wc][2]) + # mgw.append(mg_weights[mg_wc][1]/sm_xsec) + if wc in mg_weights: + sm_wc_xsec = sm_xsec + if '0' in mg_weights[wc]: + sm_wc_xsec = mg_weights[wc]['0'] + for wgt in mg_weights[wc]: + #if float(wgt) == 0: continue + if has_mg: plt.plot(float(wgt), mg_weights[wc][wgt]/sm_wc_xsec, marker='*', ls='None', markersize=10, color='red') + else: plt.plot(float(wgt), mg_weights[wc][wgt]/sm_wc_xsec, marker='*', ls='None', markersize=10, color='red', label='MadGraph') + #if has_mg: plt.plot(wgt, mg_weights[wc][wgt]/wgt[2], marker='*', ls='None', markersize=10, color='red') + #else: plt.plot(wgt, mg_weights[wc][wgt]/wgt[2], marker='*', ls='None', markersize=10, color='red', label='MadGraph') + has_mg = True + mgv.append(float(wgt)) + #mgw.append(mg_weights[wc][wgt]/wgt[2]) + mgw.append(mg_weights[wc][wgt]/sm_wc_xsec) + ''' + if wc == 'ctt': + mgpoly = np.polyfit(mgv, mgw, 2) + print(np.roots(mgpoly - np.array([0,0,1.1]))) + ''' + if len(mgv) > 0 and len(mgv) == len(mgw): + plt.plot(np.linspace(vmin,vmax,100), np.polyval(np.polyfit(mgv, mgw, 2), np.linspace(vmin,vmax,100)), color='r', ls='--') + if fixed_points: + for fixed in fixed_points: + fix_wc, fix_value = fixed[0].split('=') + if wc != fix_wc: + continue + plt.plot(float(fix_value), fixed[1], marker='*', ls='None', markersize=10, color='red') + plt.grid() + 
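+ # At this point the fitted curve, the horizontal target line at `scale`, its crossings, the starting point, and any MadGraph points have been drawn; the code below sets the axis ranges and labels and saves the figure.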
#plt.xlim(vmin*1.1,vmax*1.1) + #plt.xlim(-1, 1) + #plt.ylim(np.min(yields)-0.1,1.6) + #plt.ylim(np.min(yields)-0.1,max(1.5, sow[{'process': sum}].eval({wc: wc_start[wc]})[()][1]/sm * 1.1)) + plt.ylim(min(np.min(yields), polymin)-0.2,max(1.6, sow[{'process': sum}].eval({wc: wc_start[wc]})[()][1]/sm * 1.1)) + #plt.ylim(np.min(yields)-0.8,max(1.5, sow[{'process': sum}].eval({wc: wc_start[wc]})[()][1]/sm * 1.1)) + #if 'box' in wc: plt.ylim(0, 1.5) + plt.xlabel(wc) + plt.ylabel('$\sigma_{EFT} / \sigma_{SM}$') + plt.legend() + scale_post = args.scale.replace('.', 'p') + 'SM' + if wc in wc_best: plt.savefig(f'/afs/crc.nd.edu/user/{user[0]}/{user}//www/EFT/{args.dout}/{wc}_quad_{scale_post}.png') + else: plt.savefig(f'/afs/crc.nd.edu/user/{user[0]}/{user}/www/EFT/{args.dout}/bad/{wc}_quad_{scale_post}.png') + #print(wc, vals, yields) + plt.close() +#print(wc_best) +print("All roots:", {k:wc_1d[k] for k in wcs if k in wc_1d}) +''' +for wc in wc_1d: + if np.iscomplex(wc_1d[wc][0]): + print(wc, wc_1d[wc], len(wc_1d[wc])) + wc_1d[wc][0] = 100 + print(wc, wc_1d[wc]) + if np.iscomplex(wc_1d[wc][1]): + print(wc, wc_1d[wc]) + wc_1d[wc][1] = 100 + print(wc, wc_1d[wc]) +''' +print("Smallest root:", {k:(100.0 if np.any(np.iscomplex(wc_1d[k])) else wc_1d[k][np.argmin(np.abs(wc_1d[k]))]) for k in wcs if k in wc_1d}) +#print({k:(np.max(wc_1d[k]) if k in wc_1d else 100) for k in wcs}) +for wc in wc_1d: + if np.any(np.iscomplex(wc_1d[wc])) or np.min(np.abs(wc_1d[wc]) > 100): + wc_1d[wc] = [100.0, 100.0] +print({k:np.round(wc_1d[k][np.argmin(np.abs(wc_1d[k]))], 2) for k in wcs if k in wc_1d}) +#print({k:(100.0 if np.any(np.iscomplex(wc_1d[k]) | (np.min(np.abs(wc_1d[k]) > 100)) else np.round(wc_1d[k][np.argmin(np.abs(wc_1d[k]))], 2)) for k in wcs if k in wc_1d}) +#print({k:np.round(np.(wc_1d[k]), 2) for k in wcs if k in wc_1d}) +print('Good wcs', [wc for wc in wc_1d]) + +# 2D plots +wcs = ['ctW', 'ctZ'] +wcs = ['ctZ'] +wc_pt = {wc: -2 for wc in wcs} +wc_vals = {wc: [] for wc in wcs} +grads = {(wc1,wc2): [] for iwc,wc1 in enumerate(wcs) for wc2 in wcs[iwc:] if wc1 != wc2} +yields = {(wc1,wc2): [] for iwc,wc1 in enumerate(wcs) for wc2 in wcs[iwc:] if wc1 != wc2} +num = 10 +r = 10 +for iwc,wc1 in enumerate(wcs): + for wc2 in wcs[iwc+1:]: + for v1 in np.linspace(-r, r, num+1): + tmp = wc_pt + prev = val + tmp[wc1] = v1 + val1 = sow[{'process': sum}].eval(tmp)[()][1] + for v2 in np.linspace(-r, r, num+1): + tmp = wc_pt + tmp[wc2] = v2 + val2 = sow[{'process': sum}].eval(tmp)[()][1] + tmp[wc1] = v1 + tmp[wc2] = v2 + val = sow[{'process': sum}].eval(tmp)[()][1] + grads[(wc1,wc2)].append(((val1 - prev) / (2*r/num), (val2-prev) / (2*r/num))) + wc_vals[wc1].append(v1) + wc_vals[wc2].append(v2) + wc_pt[wc1] = v1 + wc_pt[wc2] = v2 + val = sow[{'process': sum}].eval(wc_pt)[()][1] + yields[(wc1,wc2)].append(val/sm) +for wc1,wc2 in yields: + color = np.linspace(-1, 5, len(yields[(wc1,wc2)])) + grad1 = np.array([g[0] for g in grads[(wc1,wc2)]]) + grad2 = np.array([g[1] for g in grads[(wc1,wc2)]]) + grad1 /= np.sqrt(np.square(grad1) + np.square(grad2)) + grad2 /= np.sqrt(np.square(grad1) + np.square(grad2)) + #grad1 *= step + #grad2 *= step + c=plt.quiver(wc_vals[wc1], wc_vals[wc2], grad1, grad2, color, cmap=plt.cm.jet) + plt.colorbar(c, cmap=plt.cm.jet) + plt.xlabel(wc1) + plt.ylabel(wc2) + plt.savefig(f'/afs/crc.nd.edu/user/{user[0]}/{user}/www/EFT/{args.dout}/{wc1}_{wc2}_quad.png') + +make_html(f'/afs/crc.nd.edu/user/{user[0]}/{user}/www/EFT/{args.dout}/') +if args.save_all: 
make_html(f'/afs/crc.nd.edu/user/{user[0]}/{user}/www/EFT/{args.dout}/bad/') diff --git a/analysis/mc_validation/run_gen_analysis.py b/analysis/mc_validation/run_gen_analysis.py new file mode 100644 index 000000000..7f8a9f9fe --- /dev/null +++ b/analysis/mc_validation/run_gen_analysis.py @@ -0,0 +1,374 @@ +#!/usr/bin/env python +''' +This script runs the gen processor +Example: +python run_gen_analysis.py ../../input_samples/sample_jsons/signal_samples/private_UL/2022_tllq_NewStPt4_zero_nanoGEN.json -o 2022_tllq_NewStPt4_zero -x futures -r file:///cms/cephfs/data/ +''' + +import argparse +import json +import time +import cloudpickle +import gzip +import os + +from coffea import processor +from coffea.nanoevents import NanoAODSchema + +import topcoffea.modules.utils as utils +import topcoffea.modules.remote_environment as remote_environment + +from topeft.modules.dataDrivenEstimation import DataDrivenProducer +from topeft.modules.get_renormfact_envelope import get_renormfact_envelope +import gen_processor + +LST_OF_KNOWN_EXECUTORS = ["futures","work_queue"] + +WGT_VAR_LST = [ + #"nSumOfWeights_ISRUp", + #"nSumOfWeights_ISRDown", + #"nSumOfWeights_FSRUp", + #"nSumOfWeights_FSRDown", + #"nSumOfWeights_renormUp", + #"nSumOfWeights_renormDown", + #"nSumOfWeights_factUp", + #"nSumOfWeights_factDown", + #"nSumOfWeights_renormfactUp", + #"nSumOfWeights_renormfactDown", +] + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='You can customize your run') + parser.add_argument('jsonFiles' , nargs='?', default='', help = 'Json file(s) containing files and metadata') + parser.add_argument('--executor','-x' , default='work_queue', help = 'Which executor to use') + parser.add_argument('--prefix', '-r' , nargs='?', default='', help = 'Prefix or redirector to look for the files') + parser.add_argument('--test','-t' , action='store_true' , help = 'To perform a test, run over a few events in a couple of chunks') + parser.add_argument('--pretend' , action='store_true', help = 'Read json files but, not execute the analysis') + parser.add_argument('--nworkers','-n' , default=8 , help = 'Number of workers') + parser.add_argument('--chunksize','-s' , default=100000, help = 'Number of events per chunk') + parser.add_argument('--nchunks','-c' , default=None, help = 'You can choose to run only a number of chunks') + parser.add_argument('--outname','-o' , default='plotsTopEFT', help = 'Name of the output file with histograms') + parser.add_argument('--outpath','-p' , default='histos', help = 'Name of the output directory') + parser.add_argument('--treename' , default='Events', help = 'Name of the tree inside the files') + parser.add_argument('--do-errors' , action='store_true', help = 'Save the w**2 coefficients') + parser.add_argument('--do-systs', action='store_true', help = 'Compute systematic variations') + parser.add_argument('--split-lep-flavor', action='store_true', help = 'Split up categories by lepton flavor') + parser.add_argument('--skip-sr', action='store_true', help = 'Skip all signal region categories') + parser.add_argument('--skip-cr', action='store_true', help = 'Skip all control region categories') + parser.add_argument('--do-np' , action='store_true', help = 'Perform nonprompt estimation on the output hist, and save a new hist with the np contribution included. 
Note that signal, background and data samples should all be processed together in order for this option to make sense.') + parser.add_argument('--do-renormfact-envelope', action='store_true', help = 'Perform renorm/fact envelope calculation on the output hist (saves the modified hist with the same name as the original).') + parser.add_argument('--wc-list', action='extend', nargs='+', help = 'Specify a list of Wilson coefficients to use in filling histograms.') + parser.add_argument('--hist-list', action='extend', nargs='+', help = 'Specify a list of histograms to fill.') + parser.add_argument('--ecut', default=None , help = 'Energy cut threshold i.e. throw out events above this (GeV)') + parser.add_argument('--port', default='9123-9130', help = 'Specify the Work Queue port. An integer PORT or an integer range PORT_MIN-PORT_MAX.') + + + args = parser.parse_args() + jsonFiles = args.jsonFiles + prefix = args.prefix + executor = args.executor + dotest = args.test + nworkers = int(args.nworkers) + chunksize = int(args.chunksize) + nchunks = int(args.nchunks) if not args.nchunks is None else args.nchunks + outname = args.outname + outpath = args.outpath + pretend = args.pretend + treename = args.treename + do_errors = args.do_errors + do_systs = args.do_systs + split_lep_flavor = args.split_lep_flavor + skip_sr = args.skip_sr + skip_cr = args.skip_cr + do_np = args.do_np + do_renormfact_envelope = args.do_renormfact_envelope + wc_lst = args.wc_list if args.wc_list is not None else [] + + # Check if we have valid options + if executor not in LST_OF_KNOWN_EXECUTORS: + raise Exception(f"The \"{executor}\" executor is not known. Please specify an executor from the known executors ({LST_OF_KNOWN_EXECUTORS}). Exiting.") + if do_renormfact_envelope: + if not do_systs: + raise Exception("Error: Cannot specify do_renormfact_envelope if we are not including systematics.") + if not do_np: + raise Exception("Error: Cannot specify do_renormfact_envelope if we have not already done the integration across the appl axis that occurs in the data driven estimator step.") + if dotest: + if executor == "futures": + nchunks = 2 + chunksize = 10000 + nworkers = 1 + print('Running a fast test with %i workers, %i chunks of %i events'%(nworkers, nchunks, chunksize)) + else: + raise Exception(f"The \"test\" option is not set up to work with the {executor} executor.
Exiting.") + + + # Set the threshold for the ecut (if not applying a cut, should be None) + ecut_threshold = args.ecut + if ecut_threshold is not None: ecut_threshold = float(args.ecut) + + if executor == "work_queue": + # construct wq port range + port = list(map(int, args.port.split('-'))) + if len(port) < 1: + raise ValueError("At least one port value should be specified.") + if len(port) > 2: + raise ValueError("More than one port range was specified.") + if len(port) == 1: + # convert single values into a range of one element + port.append(port[0]) + + # Figure out which hists to include + if args.hist_list == ["ana"]: + # Here we hardcode a list of hists used for the analysis + hist_lst = ["cosZ", "cosq", "azim", "tops_pt", "tX_pt", "njets"] + elif args.hist_list == ["cr"]: + # Here we hardcode a list of hists used for the CRs + hist_lst = ["lj0pt", "ptz", "met", "ljptsum", "l0pt", "l0eta", "l1pt", "l1eta", "j0pt", "j0eta", "njets", "nbtagsl", "invmass"] + else: + # We want to specify a custom list + # If we don't specify this argument, it will be None, and the processor will fill all hists + hist_lst = args.hist_list + + + ### Load samples from json + samplesdict = {} + allInputFiles = [] + + def LoadJsonToSampleName(jsonFile, prefix): + sampleName = jsonFile if not '/' in jsonFile else jsonFile[jsonFile.rfind('/')+1:] + if sampleName.endswith('.json'): sampleName = sampleName[:-5] + with open(jsonFile) as jf: + samplesdict[sampleName] = json.load(jf) + samplesdict[sampleName]['redirector'] = prefix + + if isinstance(jsonFiles, str) and ',' in jsonFiles: + jsonFiles = jsonFiles.replace(' ', '').split(',') + elif isinstance(jsonFiles, str): + jsonFiles = [jsonFiles] + for jsonFile in jsonFiles: + if os.path.isdir(jsonFile): + if not jsonFile.endswith('/'): jsonFile+='/' + for f in os.path.listdir(jsonFile): + if f.endswith('.json'): allInputFiles.append(jsonFile+f) + else: + allInputFiles.append(jsonFile) + + # Read from cfg files + for f in allInputFiles: + if not os.path.isfile(f): + raise Exception(f'[ERROR] Input file {f} not found!') + # This input file is a json file, not a cfg + if f.endswith('.json'): + LoadJsonToSampleName(f, prefix) + # Open cfg files + else: + with open(f) as fin: + print(' >> Reading json from cfg file...') + lines = fin.readlines() + for l in lines: + if '#' in l: + l=l[:l.find('#')] + l = l.replace(' ', '').replace('\n', '') + if l == '': continue + if ',' in l: + l = l.split(',') + for nl in l: + if not os.path.isfile(l): + prefix = nl + else: + LoadJsonToSampleName(nl, prefix) + else: + if not os.path.isfile(l): + prefix = l + else: + LoadJsonToSampleName(l, prefix) + + flist = {} + nevts_total = 0 + for sname in samplesdict.keys(): + redirector = samplesdict[sname]['redirector'] + flist[sname] = [(redirector+f) for f in samplesdict[sname]['files']] + samplesdict[sname]['year'] = samplesdict[sname]['year'] + samplesdict[sname]['xsec'] = float(samplesdict[sname]['xsec']) + samplesdict[sname]['nEvents'] = int(samplesdict[sname]['nEvents']) + nevts_total += samplesdict[sname]['nEvents'] + samplesdict[sname]['nGenEvents'] = int(samplesdict[sname]['nGenEvents']) + samplesdict[sname]['nSumOfWeights'] = float(samplesdict[sname]['nSumOfWeights']) + samplesdict[sname]['post_mortem'] = samplesdict[sname]['post_mortem'] if 'post_mortem' in samplesdict[sname] else False + if not samplesdict[sname]["isData"]: + for wgt_var in WGT_VAR_LST: + # Check that MC samples have all needed weight sums (only needed if doing systs) + if do_systs: + if (wgt_var not in 
samplesdict[sname]): + raise Exception(f"Missing weight variation \"{wgt_var}\".") + else: + samplesdict[sname][wgt_var] = float(samplesdict[sname][wgt_var]) + # Print file info + print('>> '+sname) + print(' - isData? : %s' %('YES' if samplesdict[sname]['isData'] else 'NO')) + print(' - year : %s' %samplesdict[sname]['year']) + print(' - xsec : %f' %samplesdict[sname]['xsec']) + print(' - histAxisName : %s' %samplesdict[sname]['histAxisName']) + print(' - options : %s' %samplesdict[sname]['options']) + print(' - tree : %s' %samplesdict[sname]['treeName']) + print(' - nEvents : %i' %samplesdict[sname]['nEvents']) + print(' - nGenEvents : %i' %samplesdict[sname]['nGenEvents']) + print(' - SumWeights : %i' %samplesdict[sname]['nSumOfWeights']) + print(' - post_mortem : %s' %samplesdict[sname]['post_mortem']) + if not samplesdict[sname]["isData"]: + for wgt_var in WGT_VAR_LST: + if wgt_var in samplesdict[sname]: + print(f' - {wgt_var}: {samplesdict[sname][wgt_var]}') + print(' - Prefix : %s' %samplesdict[sname]['redirector']) + print(' - nFiles : %i' %len(samplesdict[sname]['files'])) + for fname in samplesdict[sname]['files']: print(' %s'%fname) + + if pretend: + print('pretending...') + exit() + + # Extract the list of all WCs, as long as we haven't already specified one. + if len(wc_lst) == 0: + for k in samplesdict.keys(): + if samplesdict[k]['post_mortem']: + for wc in samplesdict[k]['PMWCnames']: + if wc not in wc_lst: + wc_lst.append(wc) + else: + for wc in samplesdict[k]['WCnames']: + if wc not in wc_lst: + wc_lst.append(wc) + + if len(wc_lst) > 0: + # Yes, why not have the output be in correct English? + if len(wc_lst) == 1: + wc_print = wc_lst[0] + elif len(wc_lst) == 2: + wc_print = wc_lst[0] + ' and ' + wc_lst[1] + else: + wc_print = ', '.join(wc_lst[:-1]) + ', and ' + wc_lst[-1] + print('Wilson Coefficients: {}.'.format(wc_print)) + else: + print('No Wilson coefficients specified') + + processor_instance = gen_processor.AnalysisProcessor(samplesdict,wc_lst,hist_lst,ecut_threshold,do_errors,do_systs,split_lep_flavor,skip_sr,skip_cr) + + if executor == "work_queue": + executor_args = { + 'master_name': '{}-work_queue-coffea'.format(os.environ['USER']), + + # find a port to run work queue in this range: + 'port': port, + + 'debug_log': 'debug.log', + 'transactions_log': 'tr.log', + 'stats_log': 'stats.log', + 'tasks_accum_log': 'tasks.log', + + 'environment_file': remote_environment.get_environment( + extra_pip_local = {"topeft": ["topeft", "setup.py"]}, + ), + #'filepath': f'/project01/ndcms/{os.environ["USER"]}', + 'extra_input_files': ["gen_processor.py"], + + 'retries': 5, + + # use mid-range compression for chunks results. 9 is the default for work + # queue in coffea. Valid values are 0 (minimum compression, less memory + # usage) to 16 (maximum compression, more memory usage). + 'compression': 0, + + "filepath": f'/tmp/{os.environ["USER"]}', + # automatically find an adequate resource allocation for tasks. + # tasks are first tried using the maximum resources seen of previously ran + # tasks. on resource exhaustion, they are retried with the maximum resource + # values, if specified below. if a maximum is not specified, the task waits + # forever until a larger worker connects. + 'resource_monitor': True, + 'resources_mode': 'auto', + + # this resource values may be omitted when using + # resources_mode: 'auto', but they do make the initial portion + # of a workflow run a little bit faster. 
+ # Rather than using whole workers in the exploratory mode of + # resources_mode: auto, tasks are forever limited to a maximum + # of 8GB of mem and disk. + # + # NOTE: The very first tasks in the exploratory + # mode will use the values specified here, so workers need to be at least + # this large. If left unspecified, tasks will use whole workers in the + # exploratory mode. + # 'cores': 1, + # 'disk': 8000, #MB + # 'memory': 10000, #MB + + # control the size of accumulation tasks. Results are + # accumulated in groups of size chunks_per_accum, keeping at + # most chunks_per_accum at the same time in memory per task. + 'chunks_per_accum': 25, + 'chunks_accum_in_mem': 2, + + # terminate workers on which tasks have been running longer than average. + # This is useful for temporary conditions on worker nodes where a task will + # be finish faster is ran in another worker. + # the time limit is computed by multipliying the average runtime of tasks + # by the value of 'fast_terminate_workers'. Since some tasks can be + # legitimately slow, no task can trigger the termination of workers twice. + # + # warning: small values (e.g. close to 1) may cause the workflow to misbehave, + # as most tasks will be terminated. + # + # Less than 1 disables it. + 'fast_terminate_workers': 0, + + # print messages when tasks are submitted, finished, etc., + # together with their resource allocation and usage. If a task + # fails, its standard output is also printed, so we can turn + # off print_stdout for all tasks. + 'verbose': True, + 'print_stdout': False, + } + + # Run the processor and get the output + tstart = time.time() + + if executor == "futures": + exec_instance = processor.futures_executor(workers=nworkers) + runner = processor.Runner(exec_instance, schema=NanoAODSchema, chunksize=chunksize, maxchunks=nchunks) + elif executor == "work_queue": + executor = processor.WorkQueueExecutor(**executor_args) + runner = processor.Runner(executor, schema=NanoAODSchema, chunksize=chunksize, maxchunks=nchunks, skipbadfiles=False, xrootdtimeout=180) + + output = runner(flist, treename, processor_instance) + + dt = time.time() - tstart + + if executor == "work_queue": + print('Processed {} events in {} seconds ({:.2f} evts/sec).'.format(nevts_total,dt,nevts_total/dt)) + + if executor == "futures": + print("Processing time: %1.2f s with %i workers (%.2f s cpu overall)" % (dt, nworkers, dt*nworkers, )) + + # Save the output + if not os.path.isdir(outpath): os.system("mkdir -p %s"%outpath) + out_pkl_file = os.path.join(outpath,outname+".pkl.gz") + print(f"\nSaving output in {out_pkl_file}...") + with gzip.open(out_pkl_file, "wb") as fout: + cloudpickle.dump(output, fout) + print("Done!") + + # Run the data driven estimation, save the output + if do_np: + print("\nDoing the nonprompt estimation...") + out_pkl_file_name_np = os.path.join(outpath,outname+"_np.pkl.gz") + ddp = DataDrivenProducer(out_pkl_file,out_pkl_file_name_np) + print(f"Saving output in {out_pkl_file_name_np}...") + ddp.dumpToPickle() + print("Done!") + # Run the renorm fact envelope calculation + if do_renormfact_envelope: + print("\nDoing the renorm. fact. 
envelope calculation...") + dict_of_histos = utils.get_hist_from_pkl(out_pkl_file_name_np,allow_empty=False) + dict_of_histos_after_applying_envelope = get_renormfact_envelope(dict_of_histos) + utils.dump_to_pkl(out_pkl_file_name_np,dict_of_histos_after_applying_envelope) diff --git a/analysis/mc_validation/run_gen_hist_eventweights_processor.py b/analysis/mc_validation/run_gen_hist_eventweights_processor.py new file mode 100644 index 000000000..28a6ef2ea --- /dev/null +++ b/analysis/mc_validation/run_gen_hist_eventweights_processor.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python +''' +This script runs the gen weights processor +Example: +python run_gen_hist_eventweights_processor.py ../../input_samples/sample_jsons/signal_samples/private_UL/2022_tllq_fixed0p1_nanoGEN.json -o 2022_tllq_fixed0p1 -x futures -r file:///cms/cephfs/data/ +''' +import argparse +import json +import time +import cloudpickle +import gzip +import os +import importlib + +from coffea import processor +from coffea.nanoevents import NanoAODSchema + +import topcoffea.modules.remote_environment as remote_environment + +LST_OF_KNOWN_EXECUTORS = ["futures","work_queue"] + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='You can customize your run') + parser.add_argument('jsonFiles' , nargs='?', help = 'Json file(s) containing files and metadata') + parser.add_argument('--executor','-x' , default='work_queue', help = 'Which executor to use') + parser.add_argument('--prefix', '-r' , nargs='?', default='', help = 'Prefix or redirector to look for the files') + parser.add_argument('--pretend' , action='store_true', help = 'Read json files but, not execute the analysis') + parser.add_argument('--nworkers','-n' , default=8 , help = 'Number of workers') + parser.add_argument('--chunksize','-s', default=100000 , help = 'Number of events per chunk') + parser.add_argument('--nchunks','-c' , default=None , help = 'You can choose to run only a number of chunks') + parser.add_argument('--outname','-o' , default='histos', help = 'Name of the output file with histograms') + parser.add_argument('--treename' , default='Events', help = 'Name of the tree inside the files') + parser.add_argument('--wc-list', action='extend', nargs='+', help = 'Specify a list of Wilson coefficients to use in filling histograms.') + parser.add_argument('--hist-list', action='extend', nargs='+', help = 'Specify a list of histograms to fill.') + parser.add_argument('--port', default='9123-9130', help = 'Specify the Work Queue port. An integer PORT or an integer range PORT_MIN-PORT_MAX.') + parser.add_argument('--processor', '-p', default='gen_hist_eventweights_processor.py', help='Specify processor file name') + + args = parser.parse_args() + jsonFiles = args.jsonFiles + executor = args.executor + prefix = args.prefix + pretend = args.pretend + nworkers = int(args.nworkers) + chunksize = int(args.chunksize) + nchunks = int(args.nchunks) if not args.nchunks is None else args.nchunks + outname = args.outname + treename = args.treename + wc_lst = args.wc_list if args.wc_list is not None else [] + proc_file = args.processor + proc_name = args.processor[:-3] + + print("\n running with processor: ", proc_file, '\n') + print("\n jsonFiles argument: ", jsonFiles, '\n') + + analysis_processor = importlib.import_module(proc_name) + + # Check if we have valid options + if executor not in LST_OF_KNOWN_EXECUTORS: + raise Exception(f"The \"{executor}\" executor is not known. Please specify an executor from the known executors ({LST_OF_KNOWN_EXECUTORS}). 
Exiting.") + + if executor == "work_queue": + # construct wq port range + port = list(map(int, args.port.split('-'))) + if len(port) < 1: + raise ValueError("At least one port value should be specified.") + if len(port) > 2: + raise ValueError("More than one port range was specified.") + if len(port) == 1: + # convert single values into a range of one element + port.append(port[0]) + + if args.hist_list == ["weights"]: + hist_lst = ["weights_SM_log" , + "weights_pt0_log", + "weights_pt1_log", + "weights_pt2_log", + "weights_pt3_log", + "weights_pt4_log", + "weights_pt5_log"] + elif args.hist_list == ["kinematics"]: + hist_lst = ["tops_pt", "avg_top_pt", "l0pt", "dr_leps", "ht", "jets_pt", "j0pt", "njets", "mtt", "mll"] + else: + hist_lst = args.hist_list + + # Load samples from json and setup the inputs to the processor + samplesdict = {} + allInputFiles = [] + + nevts_total = 0 + def LoadJsonToSampleName(jsonFile, prefix): + global nevts_total + sampleName = jsonFile if not '/' in jsonFile else jsonFile[jsonFile.rfind('/')+1:] + if sampleName.endswith('.json'): sampleName = sampleName[:-5] + with open(jsonFile) as jf: + samplesdict[sampleName] = json.load(jf) + samplesdict[sampleName]['redirector'] = prefix + nevts_total += samplesdict[sampleName]["nEvents"] + + if isinstance(jsonFiles, str) and ',' in jsonFiles: + jsonFiles = jsonFiles.replace(' ', '').split(',') + elif isinstance(jsonFiles, str): + jsonFiles = [jsonFiles] + + for jsonFile in jsonFiles: + if os.path.isdir(jsonFile): + if not jsonFile.endswith('/'): jsonFile+='/' + for f in os.path.listdir(jsonFile): + if f.endswith('.json'): allInputFiles.append(jsonFile+f) + else: + allInputFiles.append(jsonFile) + + # Read from cfg files + for f in allInputFiles: + if not os.path.isfile(f): + raise Exception(f'[ERROR] Input file {f} not found!') + # This input file is a json file, not a cfg + if f.endswith('.json'): + LoadJsonToSampleName(f, prefix) + # Open cfg files + else: + with open(f) as fin: + print(' >> Reading json from cfg file...') + lines = fin.readlines() + for l in lines: + if '#' in l: + l=l[:l.find('#')] + l = l.replace(' ', '').replace('\n', '') + if l == '': continue + if ',' in l: + l = l.split(',') + for nl in l: + if not os.path.isfile(l): + prefix = nl + else: + LoadJsonToSampleName(nl, prefix) + else: + if not os.path.isfile(l): + prefix = l + else: + LoadJsonToSampleName(l, prefix) + # else: + # if not l.endswith('.json'): + # prefix = l + # else: + # LoadJsonToSampleName(l, prefix) + + flist = {} + for sname in samplesdict.keys(): + #flist[sname] = samplesdict[sname]['files'] + redirector = samplesdict[sname]['redirector'] + flist[sname] = [(redirector+f) for f in samplesdict[sname]['files']] + + # Extract the list of all WCs, as long as we haven't already specified one. + if len(wc_lst) == 0: + for k in samplesdict.keys(): + for wc in samplesdict[k]['WCnames']: + if wc not in wc_lst: + wc_lst.append(wc) + + if len(wc_lst) > 0: + # Yes, why not have the output be in correct English? 
+ if len(wc_lst) == 1: + wc_print = wc_lst[0] + elif len(wc_lst) == 2: + wc_print = wc_lst[0] + ' and ' + wc_lst[1] + else: + wc_print = ', '.join(wc_lst[:-1]) + ', and ' + wc_lst[-1] + print('Wilson Coefficients: {}.'.format(wc_print)) + + else: + print('No Wilson coefficients specified') + + # Run the processor and get the output + processor_instance = analysis_processor.AnalysisProcessor(samplesdict,wc_lst,hist_lst) + + if executor == "work_queue": + executor_args = { + 'master_name': '{}-work_queue-coffea'.format(os.environ['USER']), + + # find a port to run work queue in this range: + 'port': port, + + 'debug_log': 'debug.log', + 'transactions_log': 'tr.log', + 'stats_log': 'stats.log', + 'tasks_accum_log': 'tasks.log', + + 'environment_file': remote_environment.get_environment( + extra_pip_local = {"topeft": ["topeft", "setup.py"]}, + ), + #'filepath': f'/project01/ndcms/{os.environ["USER"]}', + 'extra_input_files' : [proc_file], + + 'retries': 5, + + # compression level for chunk results. 9 is the default for work + # queue in coffea. Valid values are 0 (minimum compression, less memory + # usage) to 16 (maximum compression, more memory usage). + 'compression': 0, + + "filepath": f'/tmp/{os.environ["USER"]}', + # automatically find an adequate resource allocation for tasks. + # tasks are first tried using the maximum resources seen in previously run + # tasks. on resource exhaustion, they are retried with the maximum resource + # values, if specified below. if a maximum is not specified, the task waits + # forever until a larger worker connects. + 'resource_monitor': True, + 'resources_mode': 'auto', + + # these resource values may be omitted when using + # resources_mode: 'auto', but they do make the initial portion + # of a workflow run a little bit faster. + # Rather than using whole workers in the exploratory mode of + # resources_mode: auto, tasks are forever limited to a maximum + # of 8GB of mem and disk. + # + # NOTE: The very first tasks in the exploratory + # mode will use the values specified here, so workers need to be at least + # this large. If left unspecified, tasks will use whole workers in the + # exploratory mode. + # 'cores': 1, + # 'disk': 8000, #MB + # 'memory': 10000, #MB + + # control the size of accumulation tasks. Results are + # accumulated in groups of size chunks_per_accum, keeping at + # most chunks_accum_in_mem at the same time in memory per task. + 'chunks_per_accum': 25, + 'chunks_accum_in_mem': 2, + + # terminate workers on which tasks have been running longer than average. + # This is useful for temporary conditions on worker nodes where a task would + # finish faster if run on another worker. + # the time limit is computed by multiplying the average runtime of tasks + # by the value of 'fast_terminate_workers'. Since some tasks can be + # legitimately slow, no task can trigger the termination of workers twice. + # + # warning: small values (e.g. close to 1) may cause the workflow to misbehave, + # as most tasks will be terminated. + # + # Less than 1 disables it. + 'fast_terminate_workers': 0, + + # print messages when tasks are submitted, finished, etc., + # together with their resource allocation and usage. If a task + # fails, its standard output is also printed, so we can turn + # off print_stdout for all tasks.
+ 'verbose': True, + 'print_stdout': False, + } + + + # Run the processor and get the output + tstart = time.time() + + if executor == "futures": + exec_instance = processor.FuturesExecutor(workers=nworkers) + runner = processor.Runner(exec_instance, schema=NanoAODSchema, chunksize=chunksize, maxchunks=nchunks) + elif executor == "work_queue": + executor = processor.WorkQueueExecutor(**executor_args) + runner = processor.Runner(executor, schema=NanoAODSchema, chunksize=chunksize, maxchunks=nchunks, skipbadfiles=False, xrootdtimeout=180) + + output = runner(flist, treename, processor_instance) + + dt = time.time() - tstart + + if executor == "work_queue": + print('Processed {} events in {} seconds ({:.2f} evts/sec).'.format(nevts_total,dt,nevts_total/dt)) + elif executor == "futures": + print("Processing time: %1.2f s with %i workers (%.2f s cpu overall)" % (dt, nworkers, dt*nworkers, )) + + + # Save the output + outpath = "." + if not os.path.isdir(outpath): os.system("mkdir -p %s"%outpath) + out_pkl_file = os.path.join(outpath,outname+".pkl.gz") + print(f"\nSaving output in {out_pkl_file}...") + with gzip.open(out_pkl_file, "wb") as fout: + cloudpickle.dump(output, fout) + print("Done!") + diff --git a/analysis/mc_validation/scan_wcs.sh b/analysis/mc_validation/scan_wcs.sh new file mode 100644 index 000000000..2396172e4 --- /dev/null +++ b/analysis/mc_validation/scan_wcs.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +wcs=("cHQ1" "cHQ3" "cHt" "cHtbRe" "cleQt1Re11" "cleQt1Re22" "cleQt1Re33" "cleQt3Re11" "cleQt3Re22" "cleQt3Re33" "cQe1" "cQe2" "cQe3" "cQj11" "cQj18" "cQj31" "cQj38" "cQl11" "cQl12" "cQl31" "cQl32" "cQl33" "ctBRe" "cte1" "cte2" "cte3" "ctGRe" "ctHRe" "ctj1" "ctj8" "ctl1" "ctl2" "ctWRe") +proc="ttll" + +wcs=("cQu8" "cte1" "cQu1" "ctt" "cQd8" "clu" "ctWRe" "cHt" "cHQ1" "cbBRe" "cte3" "cte2" "ctHRe" "cleQt1Re11" "cQj31" "cQe3" "cbWRe" "cHtbRe" "cleQt1Re33" "cQl32" "cQj11" "cQb8" "cQe1" "cQt1" "cQd1" "cQl31" "cHbox" "cleQt1Re22" "cQj38" "ctu8" "cQl33" "ctGRe" "ctj1" "cQl11" "cQe2" "ctb8" "ctl2" "ctl1" "cQl12" "ctj8" "ctu1" "cQj18" "cleQt3Re22" "ctd8" "cQt8" "cleQt3Re33" "clj1" "cQQ1" "cld" "cHQ3") +proc="tttt" + + +wsc=("cbBRe" "cbWRe" "cHbox" "cHQ1" "cHQ3" "cHt" "cHtbRe" "cld" "cleQt1Re11" "cleQt1Re22" "cleQt1Re33" "cleQt3Re22" "cleQt3Re33" "clj1" "clu" "cQb8" "cQd1" "cQd8" "cQe1" "cQe2" "cQe3" "cQj11" "cQj18" "cQj31" "cQj38" "cQl11" "cQl12" "cQl31" "cQl32" "cQl33" "cQQ1" "cQt1" "cQt8" "cQu1" "cQu8" "ctb8" "ctd8" "cte1" "cte2" "cte3" "ctGRe" "ctj1" "ctj8" "ctl1" "ctl2" "ctt" "ctu1" "ctu8" "ctWRe") +wsc=("cbBRe" "cbWRe" "cHbox" "cHQ1" "cHQ3" "cHt" "cHtbRe" "cld" "cleQt1Re11" "cleQt1Re22" "cleQt1Re33" "cleQt3Re22" "cleQt3Re33" "clj1" "clu" "cQb8" "cQd1" "cQd8" "cQe1" "cQe2" "cQe3" "cQj11" "cQj18" "cQj31" "cQj38" "cQl11" "cQl12" "cQl31" "cQl32" "cQl33" "cQu1" "cQu8" "ctb8" "ctd8" "cte1" "cte2" "cte3" "ctGRe" "ctj1" "ctj8" "ctl1" "ctl2" "ctu1" "ctu8" "ctWRe" "ctd1" "ctd8") + +# Current list of SMEFTSim WCs we'd want to use +wcs=("ctHRe" "cHQ1" "ctWRe" "ctBRe" "ctGRe" "cbWRe" "cHQ3" "cHtbRe" "cHt" "cQl31" "cQl32" "cQl33" "cQl11" "cQl12" "cQl12" "cQe1" "cQe2" "cQe3" "ctl1" "ctl2" "ctl2" "cte1" "cte2" "cte3" "cleQt3Re11" "cleQt3Re22" "cleQt3Re33" "cleQt1Re11" "cleQt1Re22" "cleQt1Re33" "cQj31" "cQj38" "cQj11" "ctj1" "cQj18" "ctj8" "clj1" "cHbox" "ctu1" "ctb8" "clu" "cld" "cQb8" "ctd8" "cQd1" "cQd8" "ctd1" "cQu1" "cbBRe" "ctu8" "cQu8") + +four_heavy=("ctt" "cQQ1" "cQt1" "cQt8") + + +# Current list of dim6top WCs used in TOP-22-006 +wcs=("ctW" "ctZ" "ctp" "cpQM" "ctG" "cbW" "cpQ3" "cptb" "cpt" 
"cQl3i" "cQlMi" "cQei" "ctli" "ctei" "ctlSi" "ctlTi" "cQq13" "cQq83" "cQq11" "ctq1" "cQq81" "ctq8" "ctt1" "cQQ1" "cQt8" "cQt1") + +four_heavy=("ctt1" "cQQ1" "cQt1" "cQt8") + +# Current list of dim6top WCs including new WCs not in TOP-22-006 that will be in the Run 2 + Run 3 analysis +wcs=("ctW" "ctZ" "ctp" "cpQM" "ctG" "cbW" "cpQ3" "cptb" "cpt" "cQl3i" "cQlMi" "cQei" "ctli" "ctei" "ctlSi" "ctlTi" "cQq13" "cQq83" "cQq11" "ctq1" "cQq81" "ctq8" "ctt1" "cQQ1" "cQt8" "cQt1" "ctu1" "ctb8" "cQb8" "ctd8" "cQd1" "cQd8" "ctd1" "cQu1" "ctu8" "cQu8") + +four_heavy=("ctt1" "cQQ1" "cQt1" "cQt8") + +if [ -n "$1" ]; then + proc=$1 +fi + +if [[ "tttt" == *"$proc"* ]]; then + wcs=("${wcs[@]}" "${four_heavy}") + echo "Parsing tttt, adding four heavy qurak WCs" +fi + +tag="Run3Dim6TopWithTOP22006AxisScan" +wc_tag="7pts_500" + +rm log_wcs_${proc} +for wc in ${wcs[@]} +do + #python validate_eft_wc.py --tolerance 3 --wc $wc --proc $proc --tag Run3With52WCsSMEFTsimTopMasslessAxisScan 2>&1 | tee -a log_wcs_${proc} + #python validate_eft_wc.py --tolerance 3 --wc $wc --proc $proc --tag Run3With52WCsSMEFTsimTopMasslessTOP22006AxisScan --wc-tag 7pts 2>&1 | tee -a log_wcs_${proc} + python validate_eft_wc.py --tolerance 3 --wc $wc --proc $proc --tag $tag --wc-tag $wc_tag 2>&1 | tee -a log_wcs_${proc} +done diff --git a/analysis/mc_validation/validate_eft_wc.py b/analysis/mc_validation/validate_eft_wc.py new file mode 100644 index 000000000..1a025bca1 --- /dev/null +++ b/analysis/mc_validation/validate_eft_wc.py @@ -0,0 +1,484 @@ +''' +This script runs over a collection of outputs for a particular process +and WC and checks if each gridpack produced consistent reweighting. + +wc: the WC to run over +proc: the process to run over +tag: the tag used when making the gridpack +wc_tag: an optional tag used when making the nanoGEN samples (defaults to blank/None if not given) +tolerance: adjust the threshold for agreement +debug: print out more messages + +This script first looks for the customize cards in `InputCards`. +If it can't find that folder, it will try to extract it from the gridpacks. +To run properly, either ensure you have the gridpacks on the same servery you'll run from, +or simply copy the customize cards with a strucutre like: +`InputCards/ttllNuNuJetNoHiggs/ttllNuNuJetNoHiggs_Run3Dim6TopWithTOP22006AxisScan_run0_customizecards.dat` + +Example: +`python validate_eft_wc.py --proc ttH --tag Run3Dim6TopWithTOP22006AxisScan --wc-tag 7pts_500 --wc ctp` +''' +import os +import sys +import json +import argparse +import numpy as np +import matplotlib.pyplot as plt + +from topcoffea.modules.utils import get_hist_from_pkl +from topcoffea.scripts.make_html import make_html + +parser = argparse.ArgumentParser(description='You can select which file to run over') +parser.add_argument('--tolerance', '-t' , default='3' , help = 'Tolerance level (e.g. 
3 = 1e-3)') +parser.add_argument('--wc', required=True, help = 'WC to use') +parser.add_argument('--proc', required=True, help = 'Process to use') +parser.add_argument('--tag', required=True, help = 'Gridpack tag (example: Run3With52WCsSMEFTsimTopMasslessTOP22006AxisScan)') +parser.add_argument('--wc-tag', default=None, help = 'Gridpack wc-tag (example: Run3With52WCsSMEFTsimTopMasslessTOP22006AxisScan)') +parser.add_argument('--debug', '-v', action="store_true", help = 'Debug printouts') + +if len(sys.argv) < 7: + parser.print_help(sys.stderr) + exit(1) +args = parser.parse_args() + +TOLERANCE = int(args.tolerance) + +process_map = { + 'tllq': 'tllq4fNoSchanWNoHiggs0p', + 'ttlnu': 'ttlnuJet', + 'tHq': 'tHq4f', + 'ttH': 'ttHJet', + 'ttll': 'ttllNuNuJetNoHiggs', + 'tttt': 'tttt', + 'tWZ': 'tWZll', + 'tWZll': 'tWZll', +} + +def get_sow(fname, run='run0'): + hname = f'{fname}_{run}.pkl.gz' + if 'run0' in run: + hname = f'{fname}.pkl.gz' + if not os.path.exists(hname): + print(f'WARNING: {hname} does not exist! Skipping!') + return None + hists = get_hist_from_pkl(hname) + return hists['SumOfWeights'] + + +def get_orig_xsec(fname, run='run0'): + hname = f'{fname}_{run}.pkl.gz' + if 'run0' in run: + hname = f'{fname}.pkl.gz' + if not os.path.exists(hname): + print(f'WARNING: {hname} does not exist! Skipping!') + return None + hists = get_hist_from_pkl(hname) + if 'originalXWGTUP' in hists: + if args.debug: print(f'Returning for {run}: {hists["originalXWGTUP"]=}') + return hists['originalXWGTUP'] + else: + print(f'WARNING! originalXWGTUP not found in {fname}') + return 0 + + +def get_start(path, fname, run='run0'): + #/hadoop/store/user/byates2/ttgamma_dilep_ttgamma_run0_slc7_amd64_gcc630_CMSSW_9_3_16_tarball.tar.xz + #gridpack = f'{path}/{fname}_{run}_slc7_amd64_gcc630_CMSSW_9_3_16_tarball.tar.xz' + card_path = 'InputCards' + card_name = f'{fname}_{run}_customizecards.dat' + card = f'{card_path}/{process_map[args.proc]}/{card_name}' + if not os.path.exists(card): + tcard = f'{card_path}/{card_name}' + gridpack = f'{path}/{fname}_{run}_slc7_amd64_gcc10_CMSSW_12_4_8_tarball.tar.xz' + if not os.path.exists(gridpack): + print(f'WARNING: Gridpack {gridpack} not found!') + return None + print(f'Untaring {card} from {gridpack}') + os.system(f'tar xf {gridpack} {tcard}') + #os.system(f'ls -lrth InputCards') + os.system(f'mkdir -p InputCards/{process_map[args.proc]}') + os.system(f'mv {tcard} {card}') + with open(card) as fin: + lines = [x.strip() for x in fin.readlines()] + lines = (l.replace('set param_card ', '') for l in lines) + wcs = (l.split() for l in lines if l[0] == 'c') + wcs = ((l[0], float(l[1])) for l in wcs) + return dict(wcs) + + +def get_json(fname, run='run0'): + #../../input_samples/sample_jsons/signal_samples/private_UL/UL16_TTGamma_Dilept_run0.json + jname = f'../../input_samples/sample_jsons/signal_samples/private_UL/2022_{fname}_{run}.json' + if 'run0' in run: + jname = f'../../input_samples/sample_jsons/signal_samples/private_UL/2022_{fname}.json' + if not os.path.exists(jname): + print(f'WARNING: {jname} does not exist! 
Skipping!') + return None + with open(jname) as fin: + j = json.load(fin) + return j + + +def get_tasks(j): + return len(j['files']) + + +def rename_wcs(wc_pt, wcs): + tmp = {} + if wc_pt is None: return tmp + for wc, pt in wc_pt.items(): + if wc in wcs: + tmp[wc] = pt + elif wc[:-1] + 'i' in wcs: + tmp[wc[:-1]+'i'] = pt + print(tmp) + else: + raise Exception(f'Not sure how to handle {wc}!') + return tmp + + +def get_xsec(hist, wc_pt={}, sow=1, debug=False): + num = np.sum(hist[{'process': sum}].eval(wc_pt)[()]) + if debug: print(f'{num=} / {sow=}') + return np.sum(hist[{'process': sum}].eval(wc_pt)[()]) / sow + + +def get_xsec_error(hist, wc_pt={}, sow=1, debug=False): + num = np.sum(hist[{'process': sum}].eval(wc_pt)[()]) + var = np.sum(hist[{'process': sum}].as_hist(wc_pt).variances()[()]) + return get_xsec(hist, wc_pt, sow) * np.sqrt(sow) / sow # at the starting point the weights are equal, so error = xsec * sqrt(n_tasks) / n_tasks + return np.sqrt(var) / sow + + +def get_mg_weight(fname, run='run0', runs=[1,2,3], xsec=1, sow=1): + hname = f'{fname}_{run}.pkl.gz' + if 'run0' in run: + hname = f'{fname}.pkl.gz' + if not os.path.exists(hname): + print(f'WARNING: {hname} does not exist! Skipping!') + return None + hists = get_hist_from_pkl(hname) + run = run[3:] + weights = [] + skip = False + for irun in runs: #[0, 1, 2, 3]: + int_run = (int(irun[3:])) + if irun == f'run{int(run)}' and int_run == 0: + skip = True + weights.append(0) + continue + if f'EFT_weights{int_run-skip}' in hists: + #weights.append(np.sum(hists[f'EFT_weights{int_run-skip}'].values()) / hists['originalXWGTUP']) + #weights.append(np.sum(hists[f'EFT_weights{int_run-skip}'].values()) / np.sum(hists['SumOfWeights'].values()) / sow) + weights.append(np.sum(hists[f'EFT_weights{int_run-skip}'].values()) / sow) + #weights.append(hists[f'EFT_weights{irun-skip}'] / sow) + key = f"EFT_weights{int_run-skip}" + if args.debug: + print( + f"Adding run{run} wgt{int_run-skip} " + f"{np.sum(hists[key].values())} -> " + f"{np.sum(hists[key].values()) / sow} " + f"{sow=} {xsec=}" + ) + #weights.append(hists[f'EFT_weights{irun}'] / sow * xsec) + else: + print(irun, f'EFT_weights{int_run-skip} not found, adding 0') + weights.append(0) + if 'SM_weights' in hists: + if args.debug: + val = np.sum(hists["SM_weights"].values()) + print( + f"Adding run{run} SM {val} -> {val / sow} {sow=} {xsec=}" + ) + weights.append(np.sum(hists['SM_weights'].values()) / sow) + else: + weights.append(0) + return weights + + +def make_plots(runs, wc_pts, jobs, h, orig_xsecs, mg_weights, mg_sm, sample, rel=True): + user = os.getlogin() + xsec_points = [] + wc_points = [] + poly = [] + + mg_draw = True + mgw_draw = True + if args.debug: print(f'HERE {runs=} {jobs=}') + for irun,run in enumerate(runs): + wc_points.append([]) + xsec_points.append([]) + if run is None: continue + for ipt, wc_pt in enumerate(wc_pts): + h = get_sow(dname + '/' + fname, run) + if h is None: continue + if wc_pt is None: continue + + # Catch edge case where first file is missing _and_ the WC has a lepotn flavor + # (e.g. `cte1` -> `ctei`) + bad = False + for wc in wc_pt: + if wc not in h.wc_names: bad = True + if bad: continue + + xsec = get_xsec(h, wc_pt, jobs[irun]) / (get_xsec(h, {}, jobs[irun]) if rel else 1.) 
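+ # The block below draws the reweighted prediction at each requested WC point: the point that matches this run's own starting point gets a legend entry ('st. pt. <wc>=<value>'), and the cross section MG reported for that gridpack is overlaid as a black star for comparison
+ # Each run's set of points is later fit with a 2nd-order polynomial (np.polyfit below), since for a single WC the EFT cross section is quadratic in the coefficient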
+ # if irun == ipt: print(f'{rel=} {np.sum(h.as_hist({}).values())} {run=} {xsec=} ({get_xsec(h, wc_pt, jobs[irun])} / {get_xsec(h, {}, jobs[irun]) if rel else 1.} = {get_xsec(h, wc_pt, jobs[irun]) / get_xsec(h, {}, jobs[irun]) if rel else 1.})') + if irun==ipt and args.debug: print(f'{xsec=} {jobs[irun]=} {mg_weights[irun]=}') + xsec_err = get_xsec_error(h, wc_pts[irun], jobs[irun]) # Errors at starting points + xsec_points[irun].append(xsec) + wc_points[irun].append(list(wc_pt.values())[0]) + #if xsec > 100: # Don't plot extreme values + #continue + # xsec_err = False # FIXME + if xsec_err > 10: xsec_err = False # FIXME + # if irun==ipt and not rel: print(f'HERE {run} {xsec_err=} {xsec_err / xsec =}') + if ipt == irun: + label = f'st. pt. {list(wc_pts[irun].keys())[0]}={np.round(list(wc_pts[irun].values())[0], 3)}' + plt.errorbar(list(wc_pt.values())[0], xsec, yerr=xsec_err, color=colors[irun], markerfacecolor='none', markeredgecolor=colors[irun], label=label, marker='o') + #plt.errorbar(list(wc_pt.values())[0], xsec, yerr=xsec_err, color=colors[irun], markerfacecolor='none', markeredgecolor=colors[irun], label=f'st. pt. {wc_pts[irun]}', marker='o') + if orig_xsecs[irun] is not None and orig_xsecs[irun] > 0: + if args.debug: print(f'{run} {wc_pt} {orig_xsecs[irun]=} / ' + str(get_xsec(h, {}, jobs[irun]) if rel else 1.) + ' -> ' + str(orig_xsecs[irun] / (get_xsec(h, {}, jobs[irun]) if rel else 1.))) + if mg_draw: + plt.plot(list(wc_pt.values())[0], orig_xsecs[irun] / (get_xsec(h, {}, jobs[irun]) if rel else 1.), marker='*', color='k', label='MG xsec', zorder=100, markersize=10, fillstyle='none') + mg_draw = False + else: + plt.plot(list(wc_pt.values())[0], orig_xsecs[irun] / (get_xsec(h, {}, jobs[irun]) if rel else 1.), marker='*', color='k', zorder=100, markersize=10, fillstyle='none') + else: + if len(xsecs) <= ipt or xsecs[ipt] is None: + plt.errorbar(list(wc_pt.values())[0], xsec, yerr=xsec_err, color=colors[irun], markerfacecolor='none', markeredgecolor=colors[irun]) + else: + plt.errorbar(list(wc_pt.values())[0], xsec, yerr=xsec_err, color=colors[irun], markerfacecolor='none', markeredgecolor=colors[irun], marker='o') + if mg_weights[irun] is None: continue + if len(mg_weights[irun]) <= ipt: continue + if mg_weights[irun][ipt] is None: continue + #if args.debug: print(f"normalizing {run}") + #if args.debug: print(f"{wc_pt}") + #if args.debug: print(f"{mg_weights[irun][ipt]}") + #if args.debug: print(f"to {(get_xsec(h, {}, jobs[irun]) if rel else 1.)}") + #if args.debug: print(f"-> {mg_weights[irun][ipt] / (get_xsec(h, {}, jobs[irun]) if rel else 1.)}") + #if args.debug: print(f"normalizing {run} {wc_pt} {mg_weights[irun][ipt]} to {(get_xsec(h, {}, jobs[irun]) if rel else 1.)} -> {mg_weights[irun][ipt] / (get_xsec(h, {}, jobs[irun]) if rel else 1.)}") + #if args.debug: print(f"For {irun} {wc_pt} hist predicts {get_xsec(h, wc_pt, jobs[irun])} and MG gave {mg_weights[irun][ipt]}.") + if args.debug and np.abs(get_xsec(h, wc_pt, jobs[irun]) - mg_weights[irun][ipt]) / mg_weights[irun][ipt] > 10**(-1*TOLERANCE): print(f"For {irun} {wc_pt} hist predicts {get_xsec(h, wc_pt, jobs[irun])} and MG gave {mg_weights[irun][ipt]}.") + if len(mg_weights) > irun and mg_weights[irun] is not None and len(mg_weights[irun]) > ipt and mg_weights[irun][ipt] is not None and mg_weights[irun][ipt] > 0: # and False: # FIXME + # TODO understand why MG SM is sometimes _very_ off, using hist predicted SM for now + #if args.debug: print(f'Drawing {mg_weights[irun][ipt]} -> {mg_weights[irun][ipt] * (mg_sm[irun] if 
not rel else 1.)}') + if args.debug: print(f'Drawing {mg_weights[irun][ipt]} -> {mg_weights[irun][ipt] * (get_xsec(h, {}, jobs[irun]) if not rel else 1.)}') + if mgw_draw: + plt.plot(list(wc_pt.values())[0], mg_weights[irun][ipt] * (get_xsec(h, {}, jobs[irun]) if not rel else 1.), marker='s', color=colors[irun], label='MG weight', zorder=100, markersize=10, fillstyle='none', linestyle='--') + #plt.plot(list(wc_pt.values())[0], mg_weights[irun][ipt] * (mg_sm[irun] if not rel else 1.), marker='s', color=colors[irun], label='MG weight', zorder=100, markersize=10, fillstyle='none', linestyle='--') + mgw_draw = False + else: + plt.plot(list(wc_pt.values())[0], mg_weights[irun][ipt] * (get_xsec(h, {}, jobs[irun]) if not rel else 1.), marker='s', color=colors[irun], zorder=100, markersize=10, fillstyle='none', linestyle='--') + #plt.plot(list(wc_pt.values())[0], mg_weights[irun][ipt] * (mg_sm[irun] if not rel else 1.), marker='s', color=colors[irun], zorder=100, markersize=10, fillstyle='none', linestyle='--') + if len(wc_points[irun])==0: + poly.append(None) + continue + #if any(xsec_pt > 100 for xsec_pt in xsec_points[irun]): + # continue + poly.append(np.polyfit(wc_points[irun], xsec_points[irun], 2)) + + #print(f'{poly=}') + for irun,run in enumerate(runs): + if len(wc_points[irun]) == 0: continue + if len(poly) < irun+1: continue + if len(wc_points) < irun+1: continue + if poly[irun] is None: continue + if wc_points[irun] is None: continue + xaxis = np.linspace(min(wc_points[irun]),max(wc_points[irun]),100) + plt.plot(xaxis, np.polyval(poly[irun], xaxis), color=colors[irun], ls=styles[irun % 4]) + + if poly is None: return + plt.legend(ncol=2) + wc = args.wc + plt.xlabel(wc) + if rel: + plt.ylabel(f'xsec({wc}) / xsec(SM)') + else: + plt.ylabel(f'xsec({wc})') + postfix = '_rel' if rel else '' + ax = plt.gca() + # ax.set_yscale('log') + xsec_points = [x if x is not None else 1. for x in xsec_points] + max_xsec = np.max(np.array(xsec_points)) + if isinstance(max_xsec, list): # TODO undertand why this is different for tttt + max_xsec = max(max_xsec) + if max_xsec is None: + max_xsec = 1. 
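+ # Pad the y-axis range by 50% and cap it at 20 so a few extreme reweighted points don't dominate the plot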
+ max_xsec *= 1.5 + plt.ylim(0, min(max_xsec, 20)) + if rel: + ax.set_yticks(np.arange(0, min(max_xsec, 20), 1)) + plt.grid() + if args.wc_tag is not None: + sample = sample.replace(f'_{args.wc_tag}', '') + if args.debug: print(f'/users/{user}/afs/www/EFT/1D/mg_xsec/{args.proc}/{sample}{postfix}.png') + plt.savefig(f'/users/{user}/afs/www/EFT/1D/mg_xsec/{args.proc}/{sample}{postfix}.png') + plt.close() + + +if __name__ == '__main__': + dname = f'/scratch365/{os.environ["USER"]}/wc_validation/1D/' + # dname = f'/afs/crc.nd.edu/user/y/ywan2/Public/forBrent/1D_gridpack_tllq/' # FIXME this is for Wynona's pkl files + fname = 'sow_ttG_di_eft_histEFT' + fname = f'2022_ttH_{args.wc}.pkl.gz' + #fname = 'UL16_TTGamma_Dilept_nanoGEN' + proc = args.proc.replace('NuNuJetNoHiggs', '')#.replace('Jet', '') + fname = f'2022_{proc}_{args.wc}' + # fname = f'2022_nanoGEN_{proc}_{args.wc}' # FIXME this is for Wynona's pkl files + sample = f"{proc}_{args.wc}" + if args.wc_tag is not None: + sample += f'_{args.wc_tag}' + fname += f'_{args.wc_tag}' + if args.debug: print(sample) + default_points = [-20, -6.67, 6.67, 20] + ipt = 0 + #sample = 'TTGamma_Dilept_nanoGEN' + #runs = ['run0', 'full11', '9_run0'] + runs = ['run0', 'run1'] + #runs = ['run0', 'run1', 'run2', 'run3'] + #runs = ['run1', 'run2'] + runs = [f'run{irun}' for irun in range(7)] + lumi = 137 * 1000 + #runs = [run for run in runs if '3' not in run] + #runs = ['run0', 'run1'] + good_runs = [] + wc_pts = [] + xsecs = [] + xsec_errs = [] + orig_xsecs = [] + mg_weights = [] + mg_sm = [] + sows = [] + files = [] + jobs = [] + BLUE = '\033[94m' + GREEN = '\033[92m' + RED = '\033[91m' + ENDC = '\033[0m' + match = GREEN+'GOOD'+ENDC + bad = RED+'BAD'+ENDC + fail = [False] * len(runs) + for irun,run in enumerate(runs): + j = get_json(sample, run) + h = get_sow(dname + '/' + fname, run) + wc_pt = get_start('/cms/cephfs/data/store/user/' + os.environ['USER'] + '/Run3_gridpacks', f'{process_map[args.proc]}_{args.wc}{args.tag}', run) + # wc_pt = get_start('/cms/cephfs/data/store/user/ywan2/Run3_gridpacks/tllq/', f'{process_map[args.proc]}_{args.wc}{args.tag}', run) # FIXME Wynona's gridpacks + wc_pts.append(wc_pt) + if h is None or j is None or wc_pt is None: # or run == 'run3' or run == 'run4' or run == 'run2': + jobs.append(-1) + #wc_pts.append({args.wc: default_points[ipt]}) + ipt += 1 + xsecs.append(None) + xsec_errs.append(None) + orig_xsecs.append(None) + mg_weights.append(None) + mg_sm.append(None) + good_runs.append(None) + fail[irun] = True + continue + # ttHJet_cHQ1Run3With52WCsSMEFTsimTopMasslessTOP22006AxisScan_run1_slc7_amd64_gcc10_CMSSW_12_4_8_tarball.tar.xz + #wc_pt = {"cpQM": 62.000000, "ctW": 1.580000, "ctq1": 1.190000, "cQq81": 2.430000, "ctZ": 2.560000, "cQq83": 2.780000, "ctG": 0.310000, "ctq8": 2.020000, "cQq13": 1.340000, "cQq11": 1.350000, "cpt": 32.930000} + #xsec = get_xsec(h, wc_pt, j['nSumOfWeights'])# * jobs[irun]) + #jobs.append(j['nTasks']) + wc_pt = get_start('/cms/cephfs/data/store/user/' + os.environ['USER'] + '/Run3_gridpacks', f'{process_map[args.proc]}_{args.wc}{args.tag}', run) + # wc_pt = get_start('/cms/cephfs/data/store/user/ywan2/Run3_gridpacks/tllq/', f'{process_map[args.proc]}_{args.wc}{args.tag}', run) # FIXME Wynona's gridpacks + wc_pt = rename_wcs(wc_pt, h.wc_names) + wc_pts[-1] = wc_pt + #wc_pts.append(wc_pt) + jobs.append(get_tasks(j)) + #jobs.append(100) + xsec = get_xsec(h, wc_pt, jobs[irun]) + #xsec = lumi * get_xsec(h, wc_pt, jobs[irun]) / get_xsec(h, {}, jobs[irun]) + orig_xsecs.append(get_orig_xsec(dname + '/' 
+ fname, run) / jobs[irun]) + mgw = get_mg_weight(dname + '/' + fname, run, runs, orig_xsecs[-1], jobs[irun]) + mg_sm.append(mgw.pop()) + mg_sm[-1] = 1 if mg_sm[-1] == 0 else mg_sm[-1] + mg_weights.append([w / mg_sm[-1] for w in mgw]) + if '0.0' in wc_pt: + mg_sm[-1] = mg_weights[-1] + if args.debug: print(f'{run=} {wc_pt=} {xsec=} {orig_xsecs[-1]=} {mg_weights[-1]=} {mg_sm=}') + #mg_weights.append(get_mg_weight(dname + '/' + fname, run, runs, orig_xsecs[-1], jobs[irun]) / get_xsec(h, {}, jobs[irun])) + xsec_err = get_xsec_error(h, wc_pts[irun], jobs[irun]) # Errors at starting points + print(f'Run {run} starts at:\n{wc_pt}\nwith {xsec} pb ({jobs[irun]} tasks)') + sows.append(j['nSumOfWeights']) + xsecs.append(np.round(xsec, TOLERANCE)) + xsec_errs.append(np.round(xsec_err, TOLERANCE)) + good_runs.append(run) + if args.debug: print(f'Added {xsecs[-1]} {xsec_errs[-1]} {orig_xsecs[-1]}') + files.append(len(j['files'])) + print(f'{BLUE}{sample} {run}: {xsec}{ENDC}') + if all(fail): + print(f'WARNING: No pkl files found for {sample}!') + exit() + colors = ['red', 'green', 'blue', 'orange', 'yellow', 'brown', 'magenta'] + styles = ['-', '--', '-.', ':'] + ''' + for irun,run in enumerate(runs): + wc_points.append([]) + xsec_points.append([]) + for ipt, wc_pt in enumerate(wc_pts): + h = get_sow(dname + '/' + fname, run) + xsec = get_xsec(h, wc_pt, jobs[irun]) / get_xsec(h, {}, jobs[irun]) + xsec_err = get_xsec_error(h, wc_pts[irun], jobs[irun]) # Errors at starting points + xsec_points[irun].append(xsec) + wc_points[irun].append(list(wc_pt.values())[0]) + if ipt == irun: + plt.errorbar(list(wc_pt.values())[0], xsec, yerr=xsec_err, color=colors[irun], markerfacecolor='none', markeredgecolor=colors[irun], label=f'st. pt. {wc_pts[irun]}', marker='o') + else: + plt.errorbar(list(wc_pt.values())[0], xsec, yerr=xsec_err, color=colors[irun], markerfacecolor='none', markeredgecolor=colors[irun], marker='o') + poly.append(np.polyfit(wc_points[irun], xsec_points[irun], 2)) + for irun,run in enumerate(runs): + xaxis = np.linspace(min(wc_points[irun]),max(wc_points[irun]),100) + plt.plot(xaxis, np.polyval(poly[irun], xaxis), color=colors[irun], ls=styles[irun]) + ''' + + user = os.getlogin() + os.makedirs(f'/users/{user}/afs/www/EFT/1D/mg_xsec/{args.proc}/', exist_ok=True) + make_plots(good_runs, wc_pts, jobs, h, orig_xsecs, mg_weights, mg_sm, sample, rel=True) + make_plots(good_runs, wc_pts, jobs, h, orig_xsecs, mg_weights, mg_sm, sample, rel=False) + make_html(f'/users/{user}/afs/www/EFT/1D/mg_xsec/{args.proc}') + + for irun,run in enumerate(runs): + for ipt, wc_pt in enumerate(wc_pts): + if irun == ipt: + continue + h = get_sow(dname + '/' + fname, run) + if h is None: continue + if wc_pt is None: continue + if sows is None: continue + if len(xsecs) < ipt+1: continue + if xsecs[ipt] is None: continue + if xsecs[irun] is None: continue + if len(xsec_errs) < ipt+1: continue + if xsec_errs[ipt] is None: continue + if xsec_errs[irun] is None: continue + if len(sows) < ipt+1: continue + if len(runs) < ipt+1: continue + if runs[ipt] is None: continue + if xsecs[irun] == 0.: # Skim the SM since it can't reweight to some points reliably + continue + # sow = sows[irun] + #xsec = get_xsec(h, wc_pt, sow)# * jobs[irun]) + xsec = get_xsec(h, wc_pt, jobs[irun]) + if xsec is None: + continue + #print(wc_pt, f'{xsec=}') + #print(f'Reweighting {run} to {runs[ipt]} ({wc_pt})') + pdiff = np.round(100 * (1 - xsecs[ipt] / xsec), TOLERANCE) + if np.abs(xsec - xsecs[ipt]) < 10**(-1*TOLERANCE) or np.abs(xsecs[ipt] - xsec) < 
xsec_errs[ipt]: + print(f'{sample} {run}: {np.round(xsecs[irun], TOLERANCE)} -> {np.round(xsec, TOLERANCE)} {runs[ipt]}: {np.round(xsecs[ipt], TOLERANCE)} +/- {xsec_errs[ipt]} {match} ({pdiff}% diff) (MG xsec={orig_xsecs[irun]})') + else: + print(f'{sample} {run}: {np.round(xsecs[irun], TOLERANCE)} -> {np.round(xsec, TOLERANCE)} {runs[ipt]}: {np.round(xsecs[ipt], TOLERANCE)} +/- {xsec_errs[ipt]} {bad} {pdiff}% diff (expected {xsecs[ipt]}) (MG xsec={orig_xsecs[irun]})') + ''' + plt.legend() + plt.savefig(f'/users/byates2/afs/www/mg_xsec_{sample}.png') + ''' + for irun,run in enumerate(runs): + h = get_sow(dname + '/' + fname, run) + if h is None: continue + if len(xsecs) < irun+1: continue + if xsecs[irun] is None: continue + #sow = sows[irun] + #xsec = get_xsec(h, wc_pt, sow)# * jobs[irun]) + xsec = get_xsec(h, {}, jobs[irun]) + print(f'{sample} {run}: {np.round(xsecs[irun], TOLERANCE)} @SM: {BLUE}{np.round(xsec, TOLERANCE)}{ENDC}') diff --git a/analysis/topeft_run2/make_jsons.py b/analysis/topeft_run2/make_jsons.py index 700560b76..f5f1bf7c9 100644 --- a/analysis/topeft_run2/make_jsons.py +++ b/analysis/topeft_run2/make_jsons.py @@ -1713,11 +1713,11 @@ def main(): ######### Make/remake JSONs ######### # Private UL samples - #make_jsons_for_dict_of_samples(test_private_UL17_dict,"/hadoop","2017",out_dir_test_private_UL) - #make_jsons_for_dict_of_samples(private_UL17_dict,"/hadoop","2017",out_dir_private_UL) - #make_jsons_for_dict_of_samples(private_UL18_dict,"/hadoop","2018",out_dir_private_UL) - #make_jsons_for_dict_of_samples(private_UL16_dict,"/hadoop","2016",out_dir_private_UL) - #make_jsons_for_dict_of_samples(private_UL16APV_dict,"/hadoop","2016APV",out_dir_private_UL) # Not sure what we need here for the year, can remake the JSONs later to update when we have SFs etc set up for 2016 stuff (right now I think it's mostly just 18) + #make_jsons_for_dict_of_samples(test_private_UL17_dict,"/cms/cephfs/data","2017",out_dir_test_private_UL) + make_jsons_for_dict_of_samples(private_UL17_dict,"/cms/cephfs/data","2017",out_dir_private_UL) + make_jsons_for_dict_of_samples(private_UL18_dict,"/cms/cephfs/data","2018",out_dir_private_UL) + make_jsons_for_dict_of_samples(private_UL16_dict,"/cms/cephfs/data","2016",out_dir_private_UL) + make_jsons_for_dict_of_samples(private_UL16APV_dict,"/cms/cephfs/data","2016APV",out_dir_private_UL) # Not sure what we need here for the year, can remake the JSONs later to update when we have SFs etc set up for 2016 stuff (right now I think it's mostly just 18) # Subsets of files for small debugging tests local files (scratch365 at ND) #make_jsons_for_dict_of_samples(private_2017_dict,"","2017",out_dir_top19001_local) @@ -1728,21 +1728,21 @@ def main(): #make_jsons_for_dict_of_samples(central_2016APV_dict,"root://ndcms.crc.nd.edu/","2016APV",out_dir_central_2016APV,on_das=True) #make_jsons_for_dict_of_samples(central_2017_correctnPartonsInBorn_dict,"root://ndcms.crc.nd.edu/","2017",out_dir_central_2017,on_das=True) #make_jsons_for_dict_of_samples(central_2017_dict,"root://ndcms.crc.nd.edu/","2017",out_dir_central_2017,on_das=True) - #make_jsons_for_dict_of_samples(central_2017_dict,"/hadoop","2017",out_dir_central_2017,on_das=False) # ttH, ttW, ttZ, and tZq are at ND + #make_jsons_for_dict_of_samples(central_2017_dict,"/cms/cephfs/data","2017",out_dir_central_2017,on_das=False) # ttH, ttW, ttZ, and tZq are at ND #make_jsons_for_dict_of_samples(sync_dict,"root://ndcms.crc.nd.edu/","2017",out_dir_central_sync) - #make_jsons_for_dict_of_samples(central_UL16_dict, 
"/hadoop","2016", out_dir_central_UL,on_das=False) # Central signal samples ar at ND now - #make_jsons_for_dict_of_samples(central_UL16APV_dict, "/hadoop","2016APV", out_dir_central_UL,on_das=False) # Central signal samples ar at ND now - #make_jsons_for_dict_of_samples(central_UL17_dict, "/hadoop","2017", out_dir_central_UL,on_das=False) # Central signal samples ar at ND now - #make_jsons_for_dict_of_samples(central_UL18_dict, "/hadoop","2018", out_dir_central_UL,on_das=False) # Central signal samples ar at ND now + #make_jsons_for_dict_of_samples(central_UL16_dict, "/cms/cephfs/data","2016", out_dir_central_UL,on_das=False) # Central signal samples ar at ND now + #make_jsons_for_dict_of_samples(central_UL16APV_dict, "/cms/cephfs/data","2016APV", out_dir_central_UL,on_das=False) # Central signal samples ar at ND now + #make_jsons_for_dict_of_samples(central_UL17_dict, "/cms/cephfs/data","2017", out_dir_central_UL,on_das=False) # Central signal samples ar at ND now + #make_jsons_for_dict_of_samples(central_UL18_dict, "/cms/cephfs/data","2018", out_dir_central_UL,on_das=False) # Central signal samples ar at ND now # Central background samples # Note: Some of the bkg dicts have both a "path" and a "path_local" (these are samples that generated JSONs for after moving the samples to ND), # while the others only have a "path" (i.e. dataset name), these were the ones we produced JSONs for prior to moving the samples to ND, but # these should also be located at ND, so if the samples need to be remade, you can add a "local_path" with the path starting with /store - make_jsons_for_dict_of_samples(central_UL17_bkg_dict, "/hadoop","2017", out_dir_central_bkg_UL,on_das=False) # Background samples are at ND now - make_jsons_for_dict_of_samples(central_UL18_bkg_dict, "/hadoop","2018", out_dir_central_bkg_UL,on_das=False) # Background samples are at ND now - make_jsons_for_dict_of_samples(central_UL16_bkg_dict, "/hadoop","2016", out_dir_central_bkg_UL,on_das=False) # Background samples are at ND now - make_jsons_for_dict_of_samples(central_UL16APV_bkg_dict,"/hadoop","2016APV",out_dir_central_bkg_UL,on_das=False) # Background samples are at ND now + #make_jsons_for_dict_of_samples(central_UL17_bkg_dict, "/cms/cephfs/data","2017", out_dir_central_bkg_UL,on_das=False) # Background samples are at ND now + #make_jsons_for_dict_of_samples(central_UL18_bkg_dict, "/cms/cephfs/data","2018", out_dir_central_bkg_UL,on_das=False) # Background samples are at ND now + #make_jsons_for_dict_of_samples(central_UL16_bkg_dict, "/cms/cephfs/data","2016", out_dir_central_bkg_UL,on_das=False) # Background samples are at ND now + #make_jsons_for_dict_of_samples(central_UL16APV_bkg_dict,"/cms/cephfs/data","2016APV",out_dir_central_bkg_UL,on_das=False) # Background samples are at ND now # Data samples #make_jsons_for_dict_of_samples(data_2016APV_dict,"root://ndcms.crc.nd.edu/","2016APV",out_dir_data_2016,on_das=True) diff --git a/analysis/topeft_run2/make_jsons_fast.py b/analysis/topeft_run2/make_jsons_fast.py new file mode 100644 index 000000000..572466ec6 --- /dev/null +++ b/analysis/topeft_run2/make_jsons_fast.py @@ -0,0 +1,384 @@ +''' +This script is used to make sets of json files much faster (using the updates added in TopCoffea in PR #72 quick_tools_and_post_mortem) +It makes use of the `just_write` option skip the slow step of loading the number of events (a field we don't use) + +Run with: +python make_jsons_fast.py + +The basics: +Modify `make_jsons_for_dict_of_samples` to point to the dictionary you want to process. 
+e.g.: +make_jsons_for_dict_of_samples(ttH,"/cms/cephfs/data/",out_dir_private_UL,"2017") +The first argument is the dictionary, the second is the prefix where the files are stored, +the third is the year of the sample, and the fourth is where you want the jsons to be saved. +There is an optional `on_das` flag that's only used for central samples (e.g., NOT our private EFT samples) + +The dictionary's structure is + +[NAME] = { + "[PROC_NAME]: { + path_to_files + histAxisName + xsecName + post_mortem (leave off if not using post-mortem reweighted samples (usually the case) + list of post-mortem WCs PMWCnames (again leave off unless you're using them) + } +} + +Example without post-mortem: +tllq = { + "UL17_tllq_b1" : { + "path": "/store/user/byates2/post-mortem/test/lobster_test_20250501_1233/UL17_tllq_b1/", + #"histAxisName" : "tllq_privateUL17", + "histAxisName" : "tllq_privateUL17", + "xsecName": "tZq", + } +} + +Example with post-mortem: +tllq = { + "UL17_tllq_b1" : { + "path": "/store/user/byates2/post-mortem/test/lobster_test_20250501_1233/UL17_tllq_b1/", + #"histAxisName" : "tllq_privateUL17", + "histAxisName" : "tllq_privateUL17", + "xsecName": "tZq", + "post_mortem": True, + "PMWCnames": ["cQq11", "cptb", "ctlTi", "ctZ", "ctq1", "cQl3i", "cQlMi", "cpQ3", "ctW", "ctp", "cQq13", "cbB", "cbG", "cpt", "ctlSi", "cbW", "cpQM", "ctq8", "ctG", "ctei", "cQq81", "cQei", "ctli", "cQq83"], + } +} +''' + +import os +import re +import subprocess + +import topcoffea.modules.sample_lst_jsons_tools as sjt +from topeft.modules.combine_json_ext import combine_json_ext +from topeft.modules.combine_json_batch import combine_json_batch + +######################################################### +# This is the complete set of UL samples for TOP-22-006 # +######################################################### + +private_UL17_dict = { + + + "UL17_ttHJet_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch1/naodOnly_step/v4/nAOD_step_ttHJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttHJet_privateUL17", + "xsecName": "ttHnobb", + }, + "UL17_ttHJet_b2" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch2/naodOnly_step/v3/nAOD_step_ttHJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttHJet_privateUL17", + "xsecName": "ttHnobb", + }, + "UL17_ttHJet_b3" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch3/naodOnly_step/v4/nAOD_step_ttHJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttHJet_privateUL17", + "xsecName": "ttHnobb", + }, + + "UL17_ttlnuJet_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch1/naodOnly_step/v4/nAOD_step_ttlnuJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttlnuJet_privateUL17", + "xsecName": "TTWJetsToLNu", + }, + "UL17_ttlnuJet_b2" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch2/naodOnly_step/v3/nAOD_step_ttlnuJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttlnuJet_privateUL17", + "xsecName": "TTWJetsToLNu", + }, + "UL17_ttlnuJet_b3" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch3/naodOnly_step/v4/nAOD_step_ttlnuJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttlnuJet_privateUL17", + "xsecName": "TTWJetsToLNu", + }, + + "UL17_ttllJet_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch1/naodOnly_step/v4/nAOD_step_ttllNuNuJetNoHiggs_all22WCsStartPtCheckdim6TopMay20GST_run0", + 
"histAxisName": "ttllJet_privateUL17", + "xsecName": "TTZToLLNuNu_M_10", + }, + "UL17_ttllJet_b2" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch2/naodOnly_step/v3/nAOD_step_ttllNuNuJetNoHiggs_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttllJet_privateUL17", + "xsecName": "TTZToLLNuNu_M_10", + }, + "UL17_ttllJet_b3" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch3/naodOnly_step/v4/nAOD_step_ttllNuNuJetNoHiggs_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttllJet_privateUL17", + "xsecName": "TTZToLLNuNu_M_10", + }, + + "UL17_tllq_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch1/naodOnly_step/v4/nAOD_step_tllq4fNoSchanWNoHiggs0p_all22WCsStartPtCheckV2dim6TopMay20GST_run0", + "histAxisName" : "tllq_privateUL17", + "xsecName": "tZq", + }, + "UL17_tllq_b2" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch2/naodOnly_step/v3/nAOD_step_tllq4fNoSchanWNoHiggs0p_all22WCsStartPtCheckV2dim6TopMay20GST_run0", + "histAxisName": "tllq_privateUL17", + "xsecName": "tZq", + }, + "UL17_tllq_b3" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch3/naodOnly_step/v4/nAOD_step_tllq4fNoSchanWNoHiggs0p_all22WCsStartPtCheckV2dim6TopMay20GST_run0", + "histAxisName": "tllq_privateUL17", + "xsecName": "tZq", + }, + + "UL17_tHq_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch1/naodOnly_step/v4/nAOD_step_tHq4f_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "tHq_privateUL17", + "xsecName": "tHq", + }, + "UL17_tHq_b2" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch2/naodOnly_step/v3/nAOD_step_tHq4f_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "tHq_privateUL17", + "xsecName": "tHq", + }, + "UL17_tHq_b3" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch3/naodOnly_step/v4/nAOD_step_tHq4f_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "tHq_privateUL17", + "xsecName": "tHq", + }, + + "UL17_tttt_b4" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL17/Round1/Batch4/naodOnly_step/v2/nAOD_step_tttt_FourtopsMay3v1_run0", + "histAxisName": "tttt_privateUL17", + "xsecName": "tttt", + }, +} + +private_UL18_dict = { + + "UL18_ttHJet_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch1/naodOnly_step/v5/nAOD_step_ttHJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttHJet_privateUL18", + "xsecName": "ttHnobb", + }, + "UL18_ttHJet_b2" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch2/naodOnly_step/v2/nAOD_step_ttHJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttHJet_privateUL18", + "xsecName": "ttHnobb", + }, + "UL18_ttHJet_b3" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch3/naodOnly_step/v2/nAOD_step_ttHJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttHJet_privateUL18", + "xsecName": "ttHnobb", + }, + + "UL18_ttlnuJet_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch1/naodOnly_step/v5/nAOD_step_ttlnuJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttlnuJet_privateUL18", + "xsecName": "TTWJetsToLNu", + }, + "UL18_ttlnuJet_b2" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch2/naodOnly_step/v2/nAOD_step_ttlnuJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttlnuJet_privateUL18", + "xsecName": 
"TTWJetsToLNu", + }, + "UL18_ttlnuJet_b3" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch3/naodOnly_step/v2/nAOD_step_ttlnuJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttlnuJet_privateUL18", + "xsecName": "TTWJetsToLNu", + }, + + "UL18_ttllJet_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch1/naodOnly_step/v5/nAOD_step_ttllNuNuJetNoHiggs_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttllJet_privateUL18", + "xsecName": "TTZToLLNuNu_M_10", + }, + "UL18_ttllJet_b2" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch2/naodOnly_step/v2/nAOD_step_ttllNuNuJetNoHiggs_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttllJet_privateUL18", + "xsecName": "TTZToLLNuNu_M_10", + }, + "UL18_ttllJet_b3" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch3/naodOnly_step/v2/nAOD_step_ttllNuNuJetNoHiggs_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttllJet_privateUL18", + "xsecName": "TTZToLLNuNu_M_10", + }, + + "UL18_tllq_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch1/naodOnly_step/v5/nAOD_step_tllq4fNoSchanWNoHiggs0p_all22WCsStartPtCheckV2dim6TopMay20GST_run0", + "histAxisName": "tllq_privateUL18", + "xsecName": "tZq", + }, + "UL18_tllq_b2" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch2/naodOnly_step/v2/nAOD_step_tllq4fNoSchanWNoHiggs0p_all22WCsStartPtCheckV2dim6TopMay20GST_run0", + "histAxisName": "tllq_privateUL18", + "xsecName": "tZq", + }, + "UL18_tllq_b3" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch3/naodOnly_step/v2/nAOD_step_tllq4fNoSchanWNoHiggs0p_all22WCsStartPtCheckV2dim6TopMay20GST_run0", + "histAxisName": "tllq_privateUL18", + "xsecName": "tZq", + }, + + "UL18_tHq_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch1/naodOnly_step/v5/nAOD_step_tHq4f_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "tHq_privateUL18", + "xsecName": "tHq", + }, + "UL18_tHq_b2" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch2/naodOnly_step/v2/nAOD_step_tHq4f_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "tHq_privateUL18", + "xsecName": "tHq", + }, + "UL18_tHq_b3" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch3/naodOnly_step/v2/nAOD_step_tHq4f_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "tHq_privateUL18", + "xsecName": "tHq", + }, + + "UL18_tttt_b4" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL18/Round1/Batch4/naodOnly_step/v2/nAOD_step_tttt_FourtopsMay3v1_run0", + "histAxisName": "tttt_privateUL18", + "xsecName": "tttt", + }, +} + +private_UL16_dict = { + + "UL16_ttHJet_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL16/Round1/Batch1/naodOnly_step/v2/nAOD_step_ttHJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttHJet_privateUL16", + "xsecName": "ttHnobb", + }, + "UL16_ttlnuJet_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL16/Round1/Batch1/naodOnly_step/v2/nAOD_step_ttlnuJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttlnuJet_privateUL16", + "xsecName": "TTWJetsToLNu", + }, + "UL16_ttllJet_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL16/Round1/Batch1/naodOnly_step/v2/nAOD_step_ttllNuNuJetNoHiggs_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttllJet_privateUL16", + "xsecName": "TTZToLLNuNu_M_10", 
+ }, + "UL16_tllq_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL16/Round1/Batch1/naodOnly_step/v2/nAOD_step_tllq4fNoSchanWNoHiggs0p_all22WCsStartPtCheckV2dim6TopMay20GST_run0", + "histAxisName": "tllq_privateUL16", + "xsecName": "tZq", + }, + "UL16_tHq_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL16/Round1/Batch1/naodOnly_step/v2/nAOD_step_tHq4f_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "tHq_privateUL16", + "xsecName": "tHq", + }, + "UL16_tttt_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL16/Round1/Batch1/naodOnly_step/v2/nAOD_step_tttt_FourtopsMay3v1_run0", + "histAxisName": "tttt_privateUL16", + "xsecName": "tttt", + }, +} + +private_UL16APV_dict = { + + "UL16APV_ttHJet_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL16APV/Round1/Batch1/naodOnly_step/v2/nAOD_step_ttHJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttHJet_privateUL16APV", + "xsecName": "ttHnobb", + }, + "UL16APV_ttlnuJet_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL16APV/Round1/Batch1/naodOnly_step/v2/nAOD_step_ttlnuJet_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttlnuJet_privateUL16APV", + "xsecName": "TTWJetsToLNu", + }, + "UL16APV_ttllJet_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL16APV/Round1/Batch1/naodOnly_step/v2/nAOD_step_ttllNuNuJetNoHiggs_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "ttllJet_privateUL16APV", + "xsecName": "TTZToLLNuNu_M_10", + }, + "UL16APV_tllq_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL16APV/Round1/Batch1/naodOnly_step/v2/nAOD_step_tllq4fNoSchanWNoHiggs0p_all22WCsStartPtCheckV2dim6TopMay20GST_run0", + "histAxisName": "tllq_privateUL16APV", + "xsecName": "tZq", + }, + "UL16APV_tHq_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL16APV/Round1/Batch1/naodOnly_step/v2/nAOD_step_tHq4f_all22WCsStartPtCheckdim6TopMay20GST_run0", + "histAxisName": "tHq_privateUL16APV", + "xsecName": "tHq", + }, + "UL16APV_tttt_b1" : { + "path" : "/store/user/kmohrman/FullProduction/FullR2/UL16APV/Round1/Batch1/naodOnly_step/v2/nAOD_step_tttt_FourtopsMay3v1_run0", + "histAxisName": "tttt_privateUL16APV", + "xsecName": "tttt", + }, +} + + +def make_jsons_for_dict_of_samples(samples_dict,prefix,year,out_dir,on_das=False): + failed = [] + #samples_dict = {k:v for k,v in samples_dict.items() if year in k} + for sample_name,sample_info in sorted(samples_dict.items()): + if year is None: + year = sample_name.split('_')[0] + if 'UL' in sample_name: + year = year.replace('UL', '20') + print(f"\n\nMaking JSON for {sample_name}...") + path = sample_info["path"] + if not on_das and "path_local" in sample_info: + # The bkg samples are now at ND, but we wanted to leave the dataset names in the dictionaries as well (in case we want to access remotely) + # So for these samples we have a "path" (i.e. dataset name to be used when on_das=True), as well as a "path_local" for accessing locally + # Note, it would probably make more sense to call "path" something like "path_das" (and "path_local" just "path"), but we did not want to change the existing names.
+ path = sample_info["path_local"] + hist_axis_name = sample_info["histAxisName"] + xsec_name = sample_info["xsecName"] + postfix = '' + postmortem = False + if 'post_mortem' in sample_info and sample_info['post_mortem']: + postfix = '_post-mortem' + postmortem = sample_info['PMWCnames'] + else: + postmortem=None + sjt.make_json( + sample_dir = path, + sample_name = sample_name, + prefix = prefix, + sample_yr = year, + xsec_name = xsec_name, + hist_axis_name = hist_axis_name, + on_das = on_das, + just_write = True, + postfix = postfix, + post_mortem = postmortem + ) + out_name = sample_name+postfix+".json" + if not os.path.exists(out_name): + failed.append(sample_name) + + subprocess.run(["mv",out_name,out_dir]) + if '_ext' in out_name: + combine_json_ext(out_dir+'/'+out_name) # Merge with non-ext version + os.remove(out_dir+'/'+out_name) # Remove (now) outdated ext version + # Only run if more than one file exists (differentiates between `*_b2.json` and `*_b2_atPSI.json` + r = re.compile(re.sub(r'_b[1-9]', '_b[1-9]', out_name)) + matches = [b for b in str(subprocess.check_output(["ls",'.'], shell=True)).split('\\n') if bool(r.match(b))] + if re.search('_b[2-9]', out_name) and len(matches)>1: + combine_json_batch(out_dir+'/'+out_name) # Merge batches + os.remove(out_dir+'/'+out_name) # Remove (now) outdated batch version + + print("sample name:",sample_name) + print("\tpath:",path,"\n\thistAxisName:",hist_axis_name,"\n\txsecName",xsec_name,"\n\tout name:",out_name,"\n\tout dir:",out_dir) + args = ['python', 'run_sow2.py', out_dir+'/'+out_name, '-o', sample_name, '-x', 'futures', '--xrd', prefix] + subprocess.run(args) + + args = ['python', 'update_json_sow2.py', 'histos/'+sample_name+'.pkl.gz', '--json-dir', out_dir, '--match-files', out_name] + subprocess.run(args) + + if len(failed): + print("Failed:") + for l in failed: + print(f"\t{l}") + else: + print("Failed: None") + +jsons_path = "../../input_samples/sample_jsons/" +out_dir_private_UL = os.path.join(jsons_path,"signal_samples/private_UL/") +make_jsons_for_dict_of_samples(private_UL17_dict,"/hadoop","2017",out_dir_private_UL) +make_jsons_for_dict_of_samples(private_UL18_dict,"/hadoop","2018",out_dir_private_UL) +make_jsons_for_dict_of_samples(private_UL16_dict,"/hadoop","2016",out_dir_private_UL) +make_jsons_for_dict_of_samples(private_UL16APV_dict,"/hadoop","2016APV",out_dir_private_UL) # Not sure what we need here for the year, can remake the JSONs later to update when we have SFs etc set up for 2016 stuff (right now I think it's mostly just 18) +# Post-mortem reweighting example +#make_jsons_for_dict_of_samples(tllq,"/cms/cephfs/data/",out_dir_private_UL,year="2017",postmortem=["cQq11", "cptb", "ctlTi", "ctZ", "ctq1", "cQl3i", "cQlMi", "cpQ3", "ctW", "ctp", "cQq13", "cbB", "cbG", "cpt", "ctlSi", "cbW", "cpQM", "ctq8", "ctG", "ctei", "cQq81", "cQei", "ctli", "cQq83"]) diff --git a/input_samples/sample_jsons/signal_samples/central_2022/2022_TTLNu.json b/input_samples/sample_jsons/signal_samples/central_2022/2022_TTLNu.json index 1923f04d1..8a1e6ff40 100644 --- a/input_samples/sample_jsons/signal_samples/central_2022/2022_TTLNu.json +++ b/input_samples/sample_jsons/signal_samples/central_2022/2022_TTLNu.json @@ -2,7 +2,7 @@ "xsec": 0.2505, "year": "2022", "treeName": "Events", - "histAxisName": "ttLNu_cental2022", + "histAxisName": "ttlnu_central2022", "options": "", "WCnames": [], "files": [ @@ -25,4 +25,4 @@ "nSumOfWeights_renormDown_factUp": 50239.482219815254, "nSumOfWeights_renormfactDown": 50239.482219815254, 
"nSumOfWeights_renormfactUp": 50239.482219815254 -} \ No newline at end of file +} diff --git a/input_samples/sample_jsons/signal_samples/central_2022/2022_ttHnobb-1Jets.json b/input_samples/sample_jsons/signal_samples/central_2022/2022_ttHnobb-1Jets.json index cfd510340..1c2d484fe 100644 --- a/input_samples/sample_jsons/signal_samples/central_2022/2022_ttHnobb-1Jets.json +++ b/input_samples/sample_jsons/signal_samples/central_2022/2022_ttHnobb-1Jets.json @@ -1,5 +1,5 @@ { - "xsec": 0.5742, + "xsec": 0.24111, "year": "2022", "treeName": "Events", "histAxisName": "ttHnobb-1Jets_central2022", @@ -189,4 +189,4 @@ "nSumOfWeights_renormDown_factUp": 7042940.23423746, "nSumOfWeights_renormfactDown": 6537410.671014825, "nSumOfWeights_renormfactUp": 7042940.234237459 -} \ No newline at end of file +}