Skip to content

Commit 2d0becd

Browse files
committed
HPCC-34568 Improve the roxie complete line extraction tool
Signed-off-by: Gavin Halliday <[email protected]>
1 parent 41ef25f commit 2d0becd

File tree

1 file changed

+112
-4
lines changed

1 file changed

+112
-4
lines changed

tools/roxie/extract-roxie-timings.py

Lines changed: 112 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,16 @@
2525
import argparse
2626
import datetime
2727

28+
def parse_time_string(time_str, context="time"):
    """Parse a time-of-day string into a ``datetime.time``.

    Two layouts are accepted: 'HH:MM:SS.fff' (with fractional seconds) and
    'HH:MM:SS'.

    Args:
        time_str: The text to parse.
        context: Name of the value being parsed; only used in the error message.

    Returns:
        The parsed ``datetime.time``.

    Raises:
        SystemExit: If neither layout matches. The message is carried by the
            exception rather than printed here, so an uncaught failure writes
            it to stderr and exits with status 1, while a caller that catches
            SystemExit (to skip a bad value) produces no output at all.
    """
    # Try the more specific layout first; fall back to whole seconds.
    for time_format in ('%H:%M:%S.%f', '%H:%M:%S'):
        try:
            return datetime.datetime.strptime(time_str, time_format).time()
        except ValueError:
            continue
    sys.exit(f"Error: Invalid {context} format. Use 'HH:MM:SS.fff' or 'HH:MM:SS'")
37+
2838
def calculateDerivedStats(curRow):
2939

3040
timeElapsed = float(curRow.get("elapsed", 0.0))
@@ -192,6 +202,7 @@ def calculateSummaryStats(curRow, numCpus, numRows):
192202
curRow["CpuLoad@10q/s"] = 10 / perCpuTransactionsPerSecond
193203

194204
def calculateAverageStats(avgRow, totalRow, numRows):
205+
avgRow['time'] = numRows
195206
for statName in allStats:
196207
if statName in totalRow and type(totalRow[statName]) != str:
197208
avgRow[statName] = float(totalRow[statName]) / numRows if numRows else 0
@@ -210,7 +221,7 @@ def printRow(curRow):
210221

211222

212223
if __name__ == "__main__":
213-
allStats = dict(time=1, elapsed=1)
224+
seenStats = dict(time=1, elapsed=1)
214225
allServices = dict()
215226

216227
minTimeStamp = ''
@@ -234,6 +245,9 @@ def printRow(curRow):
234245
parser.add_argument("--nosummary", "-n", help="Avoid including a summary", action='store_true')
235246
parser.add_argument("--summaryonly", "-s", help="Only generate a summary", action='store_true')
236247
parser.add_argument("--avgonly", "-v", help="Only generate average summary", action='store_true')
248+
parser.add_argument("--starttime", help="Start time for filtering within a day (HH:MM:SS or HH:MM:SS.fff)")
249+
parser.add_argument("--endtime", help="End time for filtering within a day (HH:MM:SS or HH:MM:SS.fff)")
250+
parser.add_argument("--elapsed", type=float, help="Elapsed time in seconds (used with --starttime to calculate endtime)")
237251
args = parser.parse_args()
238252
combineServices = args.all
239253
averageOnly = args.avgonly
@@ -242,6 +256,31 @@ def printRow(curRow):
242256
ignoreQueryCase = args.ignorecase
243257
cpus = args.cpu
244258

259+
# Process time filtering arguments (time within a day).
# Each filter stays None when the corresponding option was not supplied.
startTimeFilter = parse_time_string(args.starttime, "starttime") if args.starttime else None
endTimeFilter = parse_time_string(args.endtime, "endtime") if args.endtime else None

# --elapsed only has a meaning relative to --starttime, and an explicit
# --endtime takes precedence over it. Report the cases where it is ignored
# instead of silently dropping the option.
if args.elapsed:
    if startTimeFilter is None:
        print("Warning: --elapsed is ignored unless --starttime is also given", file=sys.stderr)
    elif endTimeFilter is not None:
        print("Warning: --elapsed is ignored because --endtime was given", file=sys.stderr)
    else:
        # datetime.time does not support arithmetic, so go via a datetime:
        # combine with an arbitrary date, add the elapsed seconds, then take
        # the time-of-day back out (this wraps if the result passes midnight).
        base_datetime = datetime.datetime.combine(datetime.date.today(), startTimeFilter)
        endTimeFilter = (base_datetime + datetime.timedelta(seconds=args.elapsed)).time()

# Validate that endtime is after starttime. An earlier endtime is not rejected
# because it could mean the window runs into the next day; it is only reported.
# Diagnostics go to stderr so they are kept apart from anything printed on stdout.
if startTimeFilter and endTimeFilter and endTimeFilter <= startTimeFilter:
    print("Warning: endtime is before or equal to starttime - this may indicate day wraparound", file=sys.stderr)
283+
245284
csv.field_size_limit(0x100000)
246285
with open(args.filename, encoding='latin1') as csv_file:
247286
csv_reader = csv.reader(csv_file, delimiter=' ')
@@ -276,14 +315,40 @@ def printRow(curRow):
276315
elapsed = int(elapsedMatch.group(1)) if elapsedMatch else 0
277316
curRow["elapsed"] = elapsed
278317

279-
#MORE: Unimplemented - allow timestamp filtering
280318
timestamp = ''
319+
thisTime = ''
281320
for i in range(len(row)):
282321
if yearMonthDayPattern.match(row[i]):
283322
timestamp = row[i] + ' ' + row[i+1]
284-
curRow["time"] = row[i+1]
323+
thisTime = row[i+1]
324+
curRow["time"] = thisTime
285325
break
286326

327+
# Apply time filtering if specified
328+
if startTimeFilter or endTimeFilter:
329+
if thisTime:
330+
try:
331+
rowTime = parse_time_string(thisTime, "row time")
332+
except SystemExit:
333+
# Skip rows with unparseable times
334+
continue
335+
336+
# Skip row if time is before starttime
337+
if startTimeFilter and rowTime < startTimeFilter:
338+
continue
339+
340+
# Skip row if time is after endtime
341+
# Handle potential day wraparound: if endtime < starttime, assume endtime is next day
342+
if endTimeFilter:
343+
if startTimeFilter and endTimeFilter < startTimeFilter:
344+
# Day wraparound case: accept times >= starttime OR <= endtime
345+
if not (rowTime >= startTimeFilter or rowTime <= endTimeFilter):
346+
continue
347+
else:
348+
# Normal case: accept times <= endtime
349+
if rowTime > endTimeFilter:
350+
continue
351+
287352
if minTimeStamp == '' or timestamp < minTimeStamp:
288353
minTimeStamp = timestamp
289354
minElapsed = elapsed
@@ -313,7 +378,7 @@ def printRow(curRow):
313378
else:
314379
if suppress > 0:
315380
continue
316-
allStats[name] = 1
381+
seenStats[name] = 1
317382
castValue = -1
318383
#Remove any trailing comma that should not be present
319384
if value[-1] == ',':
@@ -369,6 +434,49 @@ def printRow(curRow):
369434
allServices[serviceName].append(curRow)
370435
line_count += 1
371436

437+
# Sort all the statistics that were found in the file into alphabetical order, and insert them into allStats.
# The dictionary initializer fixes any columns that should be at the start:
# first general important stats, then network related stats, index stats, and finally the rest.
# Re-assigning a key that is already present does not move it, so the pinned columns keep their place.
allStats = dict(
    time=1, elapsed=1,
    TimeSoapcall=1, TimeLocalExecute=1,
    Net=1,
    duplicatePackets=1,
    resentPackets=1,
    NumAckRetries=1,
    NumAgentRequests=1,
    SizeAgentRequests=1,
    SizeAgentReply=1,
    SizeContinuationData=1,
    Ag=1,
    TimeAgentWait=1,
    TimeAgentQueue=1,
    TimeAgentProcess=1,
    TimeIBYTIDelay=1,
    Key=1,
    NumNodeCacheHits=1,
    NumNodeCacheAdds=1,
    TimeNodeLoad=1,
    TimeNodeRead=1,
    NumNodeDiskFetches=1,
    TimeNodeFetch=1,
    NumLeafCacheHits=1,
    NumLeafCacheAdds=1,
    TimeLeafLoad=1,
    TimeLeafRead=1,
    NumLeafDiskFetches=1,
    TimeLeafFetch=1,
)

# Sort once and reuse the result for both passes. startswith() is used rather
# than indexing [0] so that an empty stat name cannot raise IndexError.
sortedStatNames = sorted(seenStats)

#Add all non-function stats
for statName in sortedStatNames:
    if not statName.startswith('f'):
        allStats[statName] = 1

#Add all function stats (e.g. regex)
for statName in sortedStatNames:
    if statName.startswith('f'):
        allStats[statName] = 1
479+
372480
allStats[' '] = 1
373481
allStats["%BranchMiss"]=1
374482
allStats["%LeafMiss"] = 1

0 commit comments

Comments
 (0)