From c93b95817602ebd5109d5632723e534356887145 Mon Sep 17 00:00:00 2001
From: Andrew42 <awightma@nd.edu>
Date: Thu, 3 Nov 2022 18:22:28 -0500
Subject: [PATCH 1/5] Implement new clipping to avoid t2w crash

---
 topcoffea/modules/datacard_tools.py | 30 +++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/topcoffea/modules/datacard_tools.py b/topcoffea/modules/datacard_tools.py
index 546f0dae4..c227096b5 100644
--- a/topcoffea/modules/datacard_tools.py
+++ b/topcoffea/modules/datacard_tools.py
@@ -15,7 +15,11 @@
 from topcoffea.modules.paths import topcoffea_path
 import topcoffea.modules.eft_helper as efth
 
-PRECISION = 6   # Decimal point precision in the text datacard output
+PRECISION = 6           # Decimal point precision in the text datacard output
+NOM_CLIP_SCALE = 1e-3   # When clipping negative yield bins, this is the ratio to the nominal yield used
+
+# np.set_printoptions(precision=8,sign=' ',floatmode='fixed')
+np.set_printoptions(linewidth=100,formatter={'float': lambda x: f"{x:>+12.8f}"})
 
 def prune_axis(h,axis,to_keep):
     """ Convenience method to remove all categories except for a selected subset."""
@@ -181,7 +185,8 @@ class DatacardMaker():
         "o0pt":    [0,100,200,400],
         "bl0pt":   [0,100,200,400],
         "l0pt":    [0,50,100,200],
-        "lj0pt":   [0,150,250,500]
+        # "lj0pt":   [0,150,250,500],
+        "lj0pt": [0,500],
     }
 
     YEARS = ["UL16","UL16APV","UL17","UL18"]
@@ -807,22 +812,31 @@ def analyze(self,km_dist,ch,selected_wcs, crop_negative_bins):
                         for sp_key,arr in data_sm.items():
                             data_obs += arr
                 for base,v in decomposed_templates.items():
+                    # There should be only 1 sparse axis at this point, the systematics axis
                     proc_name = f"{p}_{base}"
                     col_width = max(len(proc_name),col_width)
                     text_card_info[proc_name] = {
                         "shapes": set(),
                         "rate": -1
                     }
-                    # There should be only 1 sparse axis at this point, the systematics axis
+                    # Construct a positive non-zero scaled down version of the nominal yields
+                    if len(v):
+                        nz_nom_arr = np.abs(v[('nominal',)][0]*NOM_CLIP_SCALE)
                     for sp_key,arr in v.items():
                         if crop_negative_bins:
-                            negative_bin_mask = np.where( arr[0] < 0) # see where bins are negative
-                            arr[0][negative_bin_mask] = np.zeros_like( arr[0][negative_bin_mask] )  # set those to zero
+                            bin_mask = np.where( arr[0] < 0)        # see where bins are negative
+                            if self.verbose and np.sum(nz_nom_arr[bin_mask] > 0):
+                                print(f"{' '*2}{proc_name}_{sp_key[0]}: {arr[0][bin_mask]} -> {nz_nom_arr[bin_mask]}")
+                                print(f"{' '*6}{'Before:':<7} {arr[0]}")
+                            arr[0][bin_mask] = nz_nom_arr[bin_mask] # replace negative values with non-zero values
                             if arr[1] is not None:
-                                arr[1][negative_bin_mask] = np.zeros_like( arr[1][negative_bin_mask] )  # if there's a sumw2 defined, that one's set to zero as well. Otherwise we will get 0 +/- something, which is compatible with negative 
-
+                                # If there's a sumw2 defined, clip it as well by setting it to the
+                                #   square of the replacement yield. Otherwise the bin would keep
+                                #   its old uncertainty, which is still compatible with a negative yield
+                                arr[1][bin_mask] = nz_nom_arr[bin_mask]**2
+                            if self.verbose and np.sum(nz_nom_arr[bin_mask] > 0):
+                                print(f"{' '*6}{'After:':<7} {arr[0]}")
                         syst = sp_key[0]
-
                         sum_arr = sum(arr[0])
                         if syst == "nominal" and base == "sm":
                             if self.verbose:

From 88cd668169d417f6633945627cb57d874edc72ff Mon Sep 17 00:00:00 2001
From: Andrew42 <awightma@nd.edu>
Date: Thu, 3 Nov 2022 18:24:35 -0500
Subject: [PATCH 2/5] restore proper lj0pt binning

---
 topcoffea/modules/datacard_tools.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/topcoffea/modules/datacard_tools.py b/topcoffea/modules/datacard_tools.py
index c227096b5..f0734af4c 100644
--- a/topcoffea/modules/datacard_tools.py
+++ b/topcoffea/modules/datacard_tools.py
@@ -185,8 +185,7 @@ class DatacardMaker():
         "o0pt":    [0,100,200,400],
         "bl0pt":   [0,100,200,400],
         "l0pt":    [0,50,100,200],
-        # "lj0pt":   [0,150,250,500],
-        "lj0pt": [0,500],
+        "lj0pt":   [0,150,250,500],
     }
 
     YEARS = ["UL16","UL16APV","UL17","UL18"]

From 0e1069bbe39a39c12e54864f52548e8a385c272e Mon Sep 17 00:00:00 2001
From: Andrew42 <awightma@nd.edu>
Date: Fri, 4 Nov 2022 12:01:38 -0500
Subject: [PATCH 3/5] Minor change to variable naming

---
 topcoffea/modules/datacard_tools.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/topcoffea/modules/datacard_tools.py b/topcoffea/modules/datacard_tools.py
index f0734af4c..72f50f760 100644
--- a/topcoffea/modules/datacard_tools.py
+++ b/topcoffea/modules/datacard_tools.py
@@ -810,9 +810,9 @@ def analyze(self,km_dist,ch,selected_wcs, crop_negative_bins):
                             raise RuntimeError("filling obs data more than once!")
                         for sp_key,arr in data_sm.items():
                             data_obs += arr
-                for base,v in decomposed_templates.items():
+                for eft_term,v in decomposed_templates.items():
                     # There should be only 1 sparse axis at this point, the systematics axis
-                    proc_name = f"{p}_{base}"
+                    proc_name = f"{p}_{eft_term}"
                     col_width = max(len(proc_name),col_width)
                     text_card_info[proc_name] = {
                         "shapes": set(),
@@ -837,7 +837,7 @@ def analyze(self,km_dist,ch,selected_wcs, crop_negative_bins):
                                 print(f"{' '*6}{'After:':<7} {arr[0]}")
                         syst = sp_key[0]
                         sum_arr = sum(arr[0])
-                        if syst == "nominal" and base == "sm":
+                        if syst == "nominal" and eft_term == "sm":
                             if self.verbose:
                                 print(f"\t{proc_name:<12}: {sum_arr:.4f} {arr[0]}")
                             if not self.use_real_data:

From 30bec25e61cdd5d475b4fe64a620f0cd6fd11dae Mon Sep 17 00:00:00 2001
From: Andrew42 <awightma@nd.edu>
Date: Thu, 8 Dec 2022 13:45:40 -0600
Subject: [PATCH 4/5] Add handling of appl axis and modify clipping procedure

---
 topcoffea/modules/datacard_tools.py | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/topcoffea/modules/datacard_tools.py b/topcoffea/modules/datacard_tools.py
index 72f50f760..8ffa68aef 100644
--- a/topcoffea/modules/datacard_tools.py
+++ b/topcoffea/modules/datacard_tools.py
@@ -431,6 +431,10 @@ def read(self,fpath):
                         continue
             h = h.remove(to_remove,"sample")
 
+            # Integrate out the application region axis if it's present
+            if "appl" in [x.name for x in h.sparse_axes()]:
+                h = h.integrate("appl",["isSR_2lSS","isSR_3l","isSR_4l"])   # This is pretty hardcoded right now, might want to fix
+
             if not self.do_nuisance:
                 # Remove all shape systematics
                 h = prune_axis(h,"systematic",["nominal"])
@@ -794,6 +798,10 @@ def analyze(self,km_dist,ch,selected_wcs, crop_negative_bins):
         text_card_info = {}
         outf_root_name = os.path.join(self.out_dir,outf_root_name)
         with uproot.recreate(outf_root_name) as f:
+            # Get a reference for how many total events (ignoring data and signal processes) are in a given bin
+            ch_hist.set_sm()
+            ref_bins,ref_stats = ch_hist.remove(["data"]+list(self.SIGNALS),"sample").integrate("sample").integrate("systematic",["nominal"]).values(sumw2=True,overflow='all')[()]
+            np.sqrt(ref_stats,out=ref_stats)
             for p,wcs in selected_wcs.items():
                 proc_hist = ch_hist.integrate("sample",[p])
                 if self.verbose:
@@ -818,9 +826,26 @@ def analyze(self,km_dist,ch,selected_wcs, crop_negative_bins):
                         "shapes": set(),
                         "rate": -1
                     }
-                    # Construct a positive non-zero scaled down version of the nominal yields
+                    # Construct a non-negative, scaled down version of the nominal yields and check if
+                    #   any negative yield bins are 'large', i.e. exceed sqrt(sumw2) of the reference
                     if len(v):
-                        nz_nom_arr = np.abs(v[('nominal',)][0]*NOM_CLIP_SCALE)
+                        nom_arr = v[('nominal',)][0]
+                        bin_mask = np.where( nom_arr < 0)
+                        chk_arr = np.zeros_like(nom_arr)
+                        np.divide(nom_arr,ref_bins,out=chk_arr,where=ref_bins != 0)
+
+                        # if np.sum(np.where(np.abs(chk_arr[bin_mask]) > 0.01)):
+                        if np.sum(np.where(np.abs(nom_arr[bin_mask]) > ref_stats[bin_mask])):
+                            diff_arr = ref_stats - np.abs(nom_arr)
+                            print(f"ERROR: {proc_name} has bin with large negative contribution")
+                            print(f"{' '*6}Reference: {ref_bins}")
+                            print(f"{' '*6}Ref stats: {ref_stats}")
+                            print(f"{' '*6}Nominal:   {nom_arr}")
+                            print(f"{' '*6}Diff:      {diff_arr}")
+                            # print(f"{' '*6}Ratio:     {chk_arr}")
+
+                        nz_nom_arr = np.abs(nom_arr*NOM_CLIP_SCALE)
+
                     for sp_key,arr in v.items():
                         if crop_negative_bins:
                             bin_mask = np.where( arr[0] < 0)        # see where bins are negative

From 69a7f29eee11debba37f9e678135a7dd93aa2504 Mon Sep 17 00:00:00 2001
From: Kelci Mohrman <kmohrman@nd.edu>
Date: Wed, 28 Dec 2022 23:17:22 -0500
Subject: [PATCH 5/5] It seems the name of loop variable changed

---
 topcoffea/modules/datacard_tools.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topcoffea/modules/datacard_tools.py b/topcoffea/modules/datacard_tools.py
index 87a616a84..a1cdc2d4f 100644
--- a/topcoffea/modules/datacard_tools.py
+++ b/topcoffea/modules/datacard_tools.py
@@ -957,7 +957,7 @@ def analyze(self,km_dist,ch,selected_wcs, crop_negative_bins):
                                 hist_name = hist_name.replace(syst_base,split_syst)
                                 all_shapes.add(split_syst)
                                 text_card_info[proc_name]["shapes"].add(split_syst)
-                                if base == "sm" and self.verbose:
+                                if eft_term == "sm" and self.verbose:
                                     print(f"\tDecorrelate {p} for {syst_base} into {split_syst} ({syst.replace(syst_base,'')})")
                             else:
                                 all_shapes.add(syst_base)