Skip to content

Commit 0fb6e71

Browse files
authored
Merge pull request #493 from anpicci/add-canonicalize_process_name-helper
Canonicalize data-driven histogram keys
2 parents df2d334 + b648cb9 commit 0fb6e71

File tree

2 files changed

+57
-11
lines changed

2 files changed

+57
-11
lines changed

topeft/modules/dataDrivenEstimation.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,20 @@
11
import argparse
2-
import topcoffea.modules.utils as utils
3-
import cloudpickle
4-
from collections import defaultdict
5-
import re
62
import gzip
3+
import logging
4+
import re
5+
from collections import defaultdict
6+
7+
import cloudpickle
8+
import topcoffea.modules.utils as utils
79

810
from topeft.modules.paths import topeft_path
11+
from topeft.modules.utils import canonicalize_process_name
912
from topcoffea.modules.get_param_from_jsons import GetParam
1013
get_te_param = GetParam(topeft_path("params/params.json"))
1114

15+
16+
logger = logging.getLogger(__name__)
17+
1218
class DataDrivenProducer:
1319
def __init__(self, inputHist, outputName):
1420
if isinstance(inputHist, str) and inputHist.endswith('.pkl.gz'): # we are plugging a pickle file
@@ -74,11 +80,14 @@ def DDFakes(self):
7480
sampleName=match.group('process')
7581
year=match.group('year')
7682
if year.startswith("202"):
77-
nonPromptName='flips%s'%year
83+
raw_flips_name = f"flips{year}"
7884
else:
79-
nonPromptName='flipsUL%s'%year
85+
raw_flips_name = f"flipsUL{year}"
86+
flips_name = canonicalize_process_name(raw_flips_name)
87+
if raw_flips_name == flips_name:
88+
logger.debug("Process name '%s' already canonical", raw_flips_name)
8089
if self.dataName==sampleName:
81-
newNameDictData[nonPromptName].append(process)
90+
newNameDictData[flips_name].append(process)
8291
hFlips=hAR.group('process', newNameDictData)
8392

8493
# remove any up/down FF variations from the flip histo since we don't use that info
@@ -107,13 +116,16 @@ def DDFakes(self):
107116
year=match.group('year')
108117

109118
if "2022" in year or "2023" in year:
110-
nonPromptName='nonprompt%s'%year
119+
raw_nonprompt_name = f"nonprompt{year}"
111120
else:
112-
nonPromptName='nonpromptUL%s'%year
121+
raw_nonprompt_name = f"nonpromptUL{year}"
122+
nonprompt_name = canonicalize_process_name(raw_nonprompt_name)
123+
if raw_nonprompt_name == nonprompt_name:
124+
logger.debug("Process name '%s' already canonical", raw_nonprompt_name)
113125
if self.dataName==sampleName:
114-
newNameDictData[nonPromptName].append(process)
126+
newNameDictData[nonprompt_name].append(process)
115127
elif sampleName in self.promptSubtractionSamples:
116-
newNameDictNoData[nonPromptName].append(process)
128+
newNameDictNoData[nonprompt_name].append(process)
117129
else:
118130
print(f"We won't consider {sampleName} for the prompt subtraction in the appl. region")
119131
hFakes=hAR.group('process', newNameDictData)

topeft/modules/utils.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,40 @@ def get_pdiff(a,b,in_percent=False):
3737

3838
############## Strings manipulations and tools ##############
3939

40+
41+
def canonicalize_process_name(process_name):
    """Return *process_name* with its leading alphabetic token lowercased.

    The leading token is the longest run of ASCII letters at the start of
    the string. It is lowercased, except that a trailing run of two or more
    uppercase letters is kept as-is when the part before it contains at least
    one lowercase letter. Everything after the leading token is returned
    untouched.

    Examples:
        ``NonPromptUL16`` becomes ``nonpromptUL16`` (the ``UL`` tag is kept),
        ``Flips2023BPix`` becomes ``flips2023BPix``, and an all-uppercase
        token such as ``TTZ`` becomes ``ttz``.

    Args:
        process_name (str): The process identifier to canonicalize.

    Returns:
        str: The canonicalized process name. Strings without a leading
        alphabetic token (including the empty string) are returned unchanged.
    """
    match = re.match(r"[A-Za-z]+", process_name)
    if not match:
        return process_name

    prefix = match.group(0)
    # Slice the remainder from the original string rather than capturing it
    # with ``(.*)``: ``.`` does not match a newline, so a capture group would
    # silently drop everything from the first line break onward.
    remainder = process_name[match.end():]

    # Preserve a trailing all-caps segment (e.g. ``UL``) that follows a
    # mixed-case head so that ``NonPromptUL16`` becomes ``nonpromptUL16``
    # instead of ``nonpromptul16``.
    suffix_match = re.search(r"[A-Z]{2,}$", prefix)
    if suffix_match:
        head = prefix[:suffix_match.start()]
        if any(ch.islower() for ch in head):
            return head.lower() + suffix_match.group(0) + remainder

    # No preserved suffix: the whole leading token is lowercased.
    return prefix.lower() + remainder
72+
73+
4074
# Match strings using one or more regular expressions
4175
def regex_match(lst,regex_lst):
4276
# NOTE: For the regex_lst patterns, we use the raw string to generate the regular expression.

0 commit comments

Comments
 (0)