Merge pull request #493 from anpicci/add-canonicalize_process_name-helper

anpicci · web-flow · commit 0fb6e71cb064 · 2025-11-11T23:02:49.000+01:00
Canonicalize data-driven histogram keys
diff --git a/topeft/modules/dataDrivenEstimation.py b/topeft/modules/dataDrivenEstimation.py
@@ -1,14 +1,20 @@
 import argparse
-import topcoffea.modules.utils as utils
-import cloudpickle
-from collections import defaultdict
-import re
 import gzip
+import logging
+import re
+from collections import defaultdict
+
+import cloudpickle
+import topcoffea.modules.utils as utils
 
 from topeft.modules.paths import topeft_path
+from topeft.modules.utils import canonicalize_process_name
 from topcoffea.modules.get_param_from_jsons import GetParam
 get_te_param = GetParam(topeft_path("params/params.json"))
 
+
+logger = logging.getLogger(__name__)
+
 class DataDrivenProducer:
     def __init__(self, inputHist, outputName):
         if isinstance(inputHist, str) and inputHist.endswith('.pkl.gz'): # we are plugging a pickle file
@@ -74,11 +80,14 @@ def DDFakes(self):
                             sampleName=match.group('process')
                             year=match.group('year')
                             if year.startswith("202"):
-                                nonPromptName='flips%s'%year
+                                raw_flips_name = f"flips{year}"
                             else:
-                                nonPromptName='flipsUL%s'%year
+                                raw_flips_name = f"flipsUL{year}"
+                            flips_name = canonicalize_process_name(raw_flips_name)
+                            if raw_flips_name == flips_name:
+                                logger.debug("Process name '%s' already canonical", raw_flips_name)
                             if self.dataName==sampleName:
-                                newNameDictData[nonPromptName].append(process)
+                                newNameDictData[flips_name].append(process)
                         hFlips=hAR.group('process', newNameDictData)
 
                         # remove any up/down FF variations from the flip histo since we don't use that info
@@ -107,13 +116,16 @@ def DDFakes(self):
                             year=match.group('year')
 
                             if "2022" in year or "2023" in year:
-                                nonPromptName='nonprompt%s'%year
+                                raw_nonprompt_name = f"nonprompt{year}"
                             else:
-                                nonPromptName='nonpromptUL%s'%year
+                                raw_nonprompt_name = f"nonpromptUL{year}"
+                            nonprompt_name = canonicalize_process_name(raw_nonprompt_name)
+                            if raw_nonprompt_name == nonprompt_name:
+                                logger.debug("Process name '%s' already canonical", raw_nonprompt_name)
                             if self.dataName==sampleName:
-                                newNameDictData[nonPromptName].append(process)
+                                newNameDictData[nonprompt_name].append(process)
                             elif sampleName in self.promptSubtractionSamples:
-                                newNameDictNoData[nonPromptName].append(process)
+                                newNameDictNoData[nonprompt_name].append(process)
                             else:
                                 print(f"We won't consider {sampleName} for the prompt subtraction in the appl. region")
                         hFakes=hAR.group('process', newNameDictData)
diff --git a/topeft/modules/utils.py b/topeft/modules/utils.py
@@ -37,6 +37,40 @@ def get_pdiff(a,b,in_percent=False):
 
 ############## Strings manipulations and tools ##############
 
+
+def canonicalize_process_name(process_name):
+    """Lowercase the leading alphabetic token of *process_name*.
+
+    A trailing run of 2+ capitals in that token survives when lowercase letters
+    precede it (``NonPromptUL16`` -> ``nonpromptUL16``; ``Flips2023BPix`` ->
+    ``flips2023BPix``). Text after a newline is dropped (``.`` stops there).
+
+    Args:
+        process_name (str): The process identifier to canonicalize.
+
+    Returns:
+        str: The canonicalized process name. Strings without a leading
+        alphabetic token are returned unchanged.
+    """
+
+    match = re.match(r"([A-Za-z]+)(.*)", process_name)
+    if not match:
+        return process_name
+
+    prefix, remainder = match.groups()
+
+    # Keep an all-caps tail of the prefix (e.g. ``UL``) only when at least one
+    # lowercase letter comes before it; a prefix with no lowercase letter ahead
+    # of its capital tail (e.g. ``UL`` alone) is lowercased in full.
+    suffix_match = re.search(r"([A-Z]{2,})$", prefix)
+    if suffix_match and any(ch.islower() for ch in prefix[:suffix_match.start()]):
+        lowered = prefix[:suffix_match.start()].lower() + prefix[suffix_match.start():]
+    else:
+        lowered = prefix.lower()
+
+    return lowered + remainder
+
+
 # Match strings using one or more regular expressions
 def regex_match(lst,regex_lst):
     # NOTE: For the regex_lst patterns, we use the raw string to generate the regular expression.