Skip to content

Commit 703f715

Browse files
fix: append -- sleep to toplev so it exits after collection window
In -C (CPU-pinned) or -a (system-wide) mode, toplev runs indefinitely unless it is given a workload command. The previous code relied on sending SIGINT after a timeout, but this caused cascading TimeoutExpired exceptions on 96-core systems, where toplev takes extra time to start. This commit appends '-- sleep <duration>' to the toplev command, so perf stat runs for exactly the requested duration and toplev exits cleanly.
1 parent ac6550f commit 703f715

2 files changed

Lines changed: 12 additions & 20 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "topdown-profiler"
3-
version = "0.1.4"
3+
version = "0.1.5"
44
description = "Intel Top-Down Microarchitecture Analysis collector with MCP server, label-based querying, and pluggable SQL backends."
55
authors = ["redis-performance"]
66
readme = "README.md"

topdown/collector/toplev.py

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -51,16 +51,16 @@ def build_command(self) -> list[str]:
5151
def run(self, duration_seconds: int) -> tuple[str, str]:
5252
"""Run toplev for a duration, return (stdout, stderr).
5353
54-
toplev needs extra time beyond the collection duration for:
55-
- Initial PMU event list download (first run on a new CPU)
56-
- perf stat startup and calibration
57-
- Output flushing after SIGINT
58-
59-
We use duration + 60s as the timeout buffer to handle slow starts
60-
on many-core systems (96+ cores on Sapphire Rapids, etc.).
54+
Appends ``-- sleep <duration>`` so toplev exits naturally after the
55+
collection window. The timeout buffer (duration + 120s) handles
56+
slow PMU event list downloads on first run and perf stat startup
57+
on many-core systems (96+ cores on Sapphire Rapids).
6158
"""
6259
cmd = self.build_command()
63-
logger.info("Running: %s (duration=%ds)", " ".join(cmd), duration_seconds)
60+
# Tell toplev how long to collect by running a sleep workload.
61+
# Without this, toplev in -C / -a mode runs indefinitely.
62+
cmd.extend(["--", "sleep", str(duration_seconds)])
63+
logger.info("Running: %s", " ".join(cmd))
6464

6565
try:
6666
proc = subprocess.Popen(
@@ -71,7 +71,9 @@ def run(self, duration_seconds: int) -> tuple[str, str]:
7171
)
7272

7373
try:
74-
stdout, stderr = proc.communicate(timeout=duration_seconds + 60)
74+
stdout, stderr = proc.communicate(
75+
timeout=duration_seconds + 120,
76+
)
7577
except subprocess.TimeoutExpired:
7678
proc.send_signal(signal.SIGINT)
7779
try:
@@ -80,16 +82,6 @@ def run(self, duration_seconds: int) -> tuple[str, str]:
8082
proc.kill()
8183
stdout, stderr = proc.communicate(timeout=5)
8284

83-
# toplev uses SIGINT to stop collection normally
84-
time.sleep(0.5)
85-
if proc.poll() is None:
86-
proc.send_signal(signal.SIGINT)
87-
try:
88-
stdout, stderr = proc.communicate(timeout=10)
89-
except subprocess.TimeoutExpired:
90-
proc.kill()
91-
stdout, stderr = proc.communicate(timeout=5)
92-
9385
return stdout, stderr
9486

9587
except FileNotFoundError:

0 commit comments

Comments
 (0)