From e5c8cafacb3ad4b77df77fd8f533c5ff9c7a2b9f Mon Sep 17 00:00:00 2001 From: slicklash Date: Tue, 8 Oct 2024 23:50:56 +0200 Subject: [PATCH 1/3] add pdeathsigger --- executable.Dockerfile | 6 +- gprofiler/main.py | 30 +-------- gprofiler/metadata/system_metadata.py | 7 +- gprofiler/metadata/versions.py | 4 +- gprofiler/profilers/java.py | 1 + gprofiler/profilers/python.py | 1 + gprofiler/utils/__init__.py | 92 +++++++++++++-------------- gprofiler/utils/fs.py | 2 +- scripts/pdeathsigger.c | 28 ++++++++ tests/test_java.py | 22 ++++--- tests/test_perf.py | 3 + tests/utils.py | 13 +++- 12 files changed, 119 insertions(+), 90 deletions(-) create mode 100644 scripts/pdeathsigger.c diff --git a/executable.Dockerfile b/executable.Dockerfile index 7b181c6b6..11dc41419 100644 --- a/executable.Dockerfile +++ b/executable.Dockerfile @@ -74,11 +74,12 @@ RUN ./phpspy_build.sh # async-profiler glibc FROM centos${AP_BUILDER_CENTOS} AS async-profiler-builder-glibc WORKDIR /tmp -COPY scripts/async_profiler_env_glibc.sh scripts/fix_centos7.sh ./ +COPY scripts/async_profiler_env_glibc.sh scripts/fix_centos7.sh scripts/pdeathsigger.c ./ RUN if grep -q "CentOS Linux 7" /etc/os-release ; then \ ./fix_centos7.sh; \ fi -RUN ./async_profiler_env_glibc.sh +RUN ./async_profiler_env_glibc.sh && \ + gcc -static -o pdeathsigger pdeathsigger.c COPY scripts/async_profiler_build_shared.sh . RUN ./async_profiler_build_shared.sh @@ -256,6 +257,7 @@ COPY --from=async-profiler-builder-glibc /usr/bin/xargs gprofiler/resources/php/ COPY --from=async-profiler-builder-glibc /tmp/async-profiler/build/bin/asprof gprofiler/resources/java/asprof COPY --from=async-profiler-builder-glibc /tmp/async-profiler/build/async-profiler-version gprofiler/resources/java/async-profiler-version +COPY --from=async-profiler-builder-glibc /tmp/pdeathsigger gprofiler/resources/pdeathsigger COPY --from=async-profiler-centos-min-test-glibc /libasyncProfiler.so gprofiler/resources/java/glibc/libasyncProfiler.so COPY --from=async-profiler-builder-musl /tmp/async-profiler/build/lib/libasyncProfiler.so gprofiler/resources/java/musl/libasyncProfiler.so COPY --from=node-package-builder-musl /tmp/module_build gprofiler/resources/node/module/musl diff --git a/gprofiler/main.py b/gprofiler/main.py index 9e845e891..597f85820 100644 --- a/gprofiler/main.py +++ b/gprofiler/main.py @@ -20,13 +20,12 @@ import logging.handlers import os import shutil -import signal import sys import time import traceback from pathlib import Path from threading import Event -from types import FrameType, TracebackType +from types import TracebackType from typing import Iterable, List, Optional, Type, cast import configargparse @@ -74,6 +73,7 @@ reset_umask, resource_path, run_process, + setup_signals, ) from gprofiler.utils.fs import escape_filename, mkdir_owned_root from gprofiler.utils.proxy import get_https_proxy @@ -98,21 +98,6 @@ UPLOAD_FILE_SUBCOMMAND = "upload-file" -# 1 KeyboardInterrupt raised per this many seconds, no matter how many SIGINTs we get. -SIGINT_RATELIMIT = 0.5 - -last_signal_ts: Optional[float] = None - - -def sigint_handler(sig: int, frame: Optional[FrameType]) -> None: - global last_signal_ts - ts = time.monotonic() - # no need for atomicity here: we can't get another SIGINT before this one returns. - # https://www.gnu.org/software/libc/manual/html_node/Signals-in-Handler.html#Signals-in-Handler - if last_signal_ts is None or ts > last_signal_ts + SIGINT_RATELIMIT: - last_signal_ts = ts - raise KeyboardInterrupt - class GProfiler: def __init__( @@ -939,17 +924,6 @@ def verify_preconditions(args: configargparse.Namespace, processes_to_profile: O sys.exit(1) -def setup_signals() -> None: - # When we run under staticx & PyInstaller, both of them forward (some of the) signals to gProfiler. - # We catch SIGINTs and ratelimit them, to avoid being interrupted again during the handling of the - # first INT. - # See my commit message for more information. - signal.signal(signal.SIGINT, sigint_handler) - # handle SIGTERM in the same manner - gracefully stop gProfiler. - # SIGTERM is also forwarded by staticx & PyInstaller, so we need to ratelimit it. - signal.signal(signal.SIGTERM, sigint_handler) - - def log_system_info() -> None: system_info = get_static_system_info() logger.info(f"gProfiler Python version: {system_info.python_version}") diff --git a/gprofiler/metadata/system_metadata.py b/gprofiler/metadata/system_metadata.py index 285ea3859..f2d818929 100644 --- a/gprofiler/metadata/system_metadata.py +++ b/gprofiler/metadata/system_metadata.py @@ -48,7 +48,12 @@ def decode_libc_version(version: bytes) -> str: try: ldd_version = run_process( - ["ldd", "--version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, suppress_log=True, check=False + ["ldd", "--version"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + suppress_log=True, + check=False, + pdeathsigger=False, ).stdout except FileNotFoundError: ldd_version = b"ldd not found" diff --git a/gprofiler/metadata/versions.py b/gprofiler/metadata/versions.py index a0a3b55bd..5a0e8b700 100644 --- a/gprofiler/metadata/versions.py +++ b/gprofiler/metadata/versions.py @@ -35,7 +35,9 @@ def get_exe_version( exe_path = f"/proc/{get_process_nspid(process.pid)}/exe" def _run_get_version() -> "CompletedProcess[bytes]": - return run_process([exe_path, version_arg], stop_event=stop_event, timeout=get_version_timeout) + return run_process( + [exe_path, version_arg], stop_event=stop_event, timeout=get_version_timeout, pdeathsigger=False + ) try: cp = run_in_ns(["pid", "mnt"], _run_get_version, process.pid) diff --git a/gprofiler/profilers/java.py b/gprofiler/profilers/java.py index a5737541f..fbba33856 100644 --- a/gprofiler/profilers/java.py +++ b/gprofiler/profilers/java.py @@ -350,6 +350,7 @@ def _run_java_version() -> "CompletedProcess[bytes]": ], stop_event=stop_event, timeout=_JAVA_VERSION_TIMEOUT, + pdeathsigger=False, ) # doesn't work without changing PID NS as well (I'm getting ENOENT for libjli.so) diff --git a/gprofiler/profilers/python.py b/gprofiler/profilers/python.py index 12027cfc6..404e1f3c3 100644 --- a/gprofiler/profilers/python.py +++ b/gprofiler/profilers/python.py @@ -137,6 +137,7 @@ def _run_python_process_in_ns() -> "CompletedProcess[bytes]": [python_path, "-S", "-c", "import sys; print(sys.maxunicode)"], stop_event=self._stop_event, timeout=self._PYTHON_TIMEOUT, + pdeathsigger=False, ) return run_in_ns(["pid", "mnt"], _run_python_process_in_ns, process.pid).stdout.decode().strip() diff --git a/gprofiler/utils/__init__.py b/gprofiler/utils/__init__.py index e9823286c..b8db9c83f 100644 --- a/gprofiler/utils/__init__.py +++ b/gprofiler/utils/__init__.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import atexit import ctypes import datetime import glob @@ -33,6 +34,7 @@ from subprocess import CompletedProcess, Popen, TimeoutExpired from tempfile import TemporaryDirectory from threading import Event +from types import FrameType from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union, cast import importlib_resources @@ -70,6 +72,12 @@ gprofiler_mutex: Optional[socket.socket] = None +# 1 KeyboardInterrupt raised per this many seconds, no matter how many SIGINTs we get. +SIGINT_RATELIMIT = 0.5 + +_last_signal_ts: Optional[float] = None +_processes: List[Popen] = [] + @lru_cache(maxsize=None) def resource_path(relative_path: str = "") -> str: @@ -90,52 +98,20 @@ def is_root() -> bool: return os.geteuid() == 0 -libc: Optional[ctypes.CDLL] = None - - -def prctl(*argv: Any) -> int: - global libc - if libc is None: - libc = ctypes.CDLL("libc.so.6", use_errno=True) - return cast(int, libc.prctl(*argv)) - - -PR_SET_PDEATHSIG = 1 - - -def set_child_termination_on_parent_death() -> int: - ret = prctl(PR_SET_PDEATHSIG, signal.SIGTERM) - if ret != 0: - errno = ctypes.get_errno() - logger.warning( - f"Failed to set parent-death signal on child process. errno: {errno}, strerror: {os.strerror(errno)}" - ) - return ret - - -def wrap_callbacks(callbacks: List[Callable]) -> Callable: - # Expects array of callback. - # Returns one callback that call each one of them, and returns the retval of last callback - def wrapper() -> Any: - ret = None - for cb in callbacks: - ret = cb() - - return ret - - return wrapper - - def start_process( cmd: Union[str, List[str]], via_staticx: bool = False, - term_on_parent_death: bool = True, tmpdir: Optional[Path] = None, **kwargs: Any, ) -> Popen: + global _processes + if isinstance(cmd, str): cmd = [cmd] + if kwargs.pop("pdeathsigger", True) and is_linux(): + cmd = [resource_path("pdeathsigger")] + cmd if is_linux() else cmd + logger.debug("Running command", command=cmd) env = kwargs.pop("env", None) @@ -161,23 +137,17 @@ def start_process( # explicitly remove our directory from LD_LIBRARY_PATH env["LD_LIBRARY_PATH"] = "" - if is_windows(): - cur_preexec_fn = None # preexec_fn is not supported on Windows platforms. subprocess.py reports this. - else: - cur_preexec_fn = kwargs.pop("preexec_fn", os.setpgrp) - if term_on_parent_death: - cur_preexec_fn = wrap_callbacks([set_child_termination_on_parent_death, cur_preexec_fn]) - - popen = Popen( + process = Popen( cmd, stdout=kwargs.pop("stdout", subprocess.PIPE), stderr=kwargs.pop("stderr", subprocess.PIPE), stdin=subprocess.PIPE, - preexec_fn=cur_preexec_fn, + start_new_session=is_linux(), # TODO: change to "process_group" after upgrade to Python 3.11+ env=env, **kwargs, ) - return popen + _processes.append(process) + return process def wait_event(timeout: float, stop_event: Event, condition: Callable[[], bool], interval: float = 0.1) -> None: @@ -361,6 +331,7 @@ def pgrep_maps(match: str) -> List[Process]: shell=True, suppress_log=True, check=False, + pdeathsigger=False, ) # 0 - found # 1 - not found @@ -563,3 +534,30 @@ def merge_dicts(source: Dict[str, Any], dest: Dict[str, Any]) -> Dict[str, Any]: def is_profiler_disabled(profile_mode: str) -> bool: return profile_mode in ("none", "disabled") + + +def _exit_handler() -> None: + for process in _processes: + process.kill() + + +def _sigint_handler(sig: int, frame: Optional[FrameType]) -> None: + global _last_signal_ts + ts = time.monotonic() + # no need for atomicity here: we can't get another SIGINT before this one returns. + # https://www.gnu.org/software/libc/manual/html_node/Signals-in-Handler.html#Signals-in-Handler + if _last_signal_ts is None or ts > _last_signal_ts + SIGINT_RATELIMIT: + _last_signal_ts = ts + raise KeyboardInterrupt + + +def setup_signals() -> None: + atexit.register(_exit_handler) + # When we run under staticx & PyInstaller, both of them forward (some of the) signals to gProfiler. + # We catch SIGINTs and ratelimit them, to avoid being interrupted again during the handling of the + # first INT. + # See my commit message for more information. + signal.signal(signal.SIGINT, _sigint_handler) + # handle SIGTERM in the same manner - gracefully stop gProfiler. + # SIGTERM is also forwarded by staticx & PyInstaller, so we need to ratelimit it. + signal.signal(signal.SIGTERM, _sigint_handler) diff --git a/gprofiler/utils/fs.py b/gprofiler/utils/fs.py index 974ebab14..40a50e074 100644 --- a/gprofiler/utils/fs.py +++ b/gprofiler/utils/fs.py @@ -57,7 +57,7 @@ def is_rw_exec_dir(path: Path) -> bool: # try executing try: - run_process([str(test_script)], suppress_log=True) + run_process([str(test_script)], suppress_log=True, pdeathsigger=False) except PermissionError: # noexec return False diff --git a/scripts/pdeathsigger.c b/scripts/pdeathsigger.c new file mode 100644 index 000000000..fe8f5b98e --- /dev/null +++ b/scripts/pdeathsigger.c @@ -0,0 +1,28 @@ +#include +#include +#include +#include +#include + +/* + preexec_fn is not safe to use in the presence of threads, + child process could deadlock before exec is called. + this little shim is a workaround to avoid using preexe_fn and + still get the desired behavior (PR_SET_PDEATHSIG). +*/ +int main(int argc, char *argv[]) { + if (argc < 2) { + fprintf(stderr, "Usage: %s /path/to/binary [args...]\n", argv[0]); + return 1; + } + + if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) { + perror("prctl"); + return 1; + } + + execvp(argv[1], &argv[1]); + + perror("execvp"); + return 1; +} diff --git a/tests/test_java.py b/tests/test_java.py index 620d7352a..9569ea9f5 100644 --- a/tests/test_java.py +++ b/tests/test_java.py @@ -226,6 +226,7 @@ def test_java_safemode_version_check( monkeypatch.setitem(JavaProfiler.MINIMAL_SUPPORTED_VERSIONS, 8, (Version("8.999"), 0)) with make_java_profiler(profiler_state) as profiler: + profiler._profiler_state.get_container_name(application_pid) process = profiler._select_processes_to_profile()[0] jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) jvm_version = parse_jvm_version(jvm_version_str) @@ -245,6 +246,7 @@ def test_java_safemode_build_number_check( profiler_state: ProfilerState, ) -> None: with make_java_profiler(profiler_state) as profiler: + profiler._profiler_state.get_container_name(application_pid) process = profiler._select_processes_to_profile()[0] jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) jvm_version = parse_jvm_version(jvm_version_str) @@ -417,6 +419,7 @@ def test_sanity_other_jvms( frequency=99, java_async_profiler_mode="cpu", ) as profiler: + profiler._profiler_state.get_container_name(application_pid) process = psutil.Process(application_pid) assert search_for in cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) process_collapsed = snapshot_pid_collapsed(profiler, application_pid) @@ -438,6 +441,7 @@ def test_sanity_latest_jvms( """ with make_java_profiler(profiler_state) as profiler: + profiler._profiler_state.get_container_name(application_pid) # sanity check that this is the correct JVM we're targeting assert search_for in cast_away_optional( get_java_version(psutil.Process(application_pid), profiler._profiler_state.stop_event) @@ -482,11 +486,12 @@ def _filter_record(r: LogRecord) -> bool: # find the log record of # Running command (command=['/app/gprofiler/resources/java/apsprof', '', 'load', # '/path/to/libasyncProfiler.so', 'true', 'start,...']) + command = log_record_extra(r).get("command", []) return ( r.message == "Running command" - and len(log_record_extra(r)["command"]) == 6 - and log_record_extra(r)["command"][2] == "load" - and any(map(lambda k: k in log_record_extra(r)["command"][5], ["start,", "stop,"])) + and len(command) == 7 + and command[3] == "load" + and any(map(lambda k: k in command[6], ["start,", "stop,"])) ) return list(filter(_filter_record, records)) @@ -563,7 +568,7 @@ def test_java_noexec_or_ro_dirs( assert len(jattach_loads) == 2 # 3rd part of commandline to AP - shall begin with POSSIBLE_AP_DIRS[1] assert all( - log_record_extra(jl)["command"][3].startswith(f"{gprofiler.profilers.java.POSSIBLE_AP_DIRS[1]}/async-profiler-") + log_record_extra(jl)["command"][4].startswith(f"{gprofiler.profilers.java.POSSIBLE_AP_DIRS[1]}/async-profiler-") for jl in jattach_loads ) @@ -612,7 +617,7 @@ def test_java_symlinks_in_paths( # 2 entries - start and stop assert len(jattach_loads) == 2 # 3rd part of commandline to AP - shall begin with the final, resolved path. - assert all(log_record_extra(jl)["command"][3].startswith("/run/final_tmp/gprofiler_tmp/") for jl in jattach_loads) + assert all(log_record_extra(jl)["command"][4].startswith("/run/final_tmp/gprofiler_tmp/") for jl in jattach_loads) @pytest.mark.parametrize("in_container", [True]) # only in container is enough @@ -1246,10 +1251,9 @@ def test_collect_flags_unsupported_filtered_out( f"exec java {java_cli_flags} -jar Fibonacci.jar", ], ) as container: - assert ( - profiler._metadata.get_jvm_flags_serialized(psutil.Process(container.attrs["State"]["Pid"])) - == expected_flags - ) + pid = container.attrs["State"]["Pid"] + profiler._profiler_state.get_container_name(pid) + assert profiler._metadata.get_jvm_flags_serialized(psutil.Process(pid)) == expected_flags log_record = next(filter(lambda r: r.message == "Missing requested flags:", caplog.records)) # use slicing to remove the leading -XX: instead of removeprefix as it's not available in python 3.8 assert ( diff --git a/tests/test_perf.py b/tests/test_perf.py index 04b144a05..cbdb62f2b 100644 --- a/tests/test_perf.py +++ b/tests/test_perf.py @@ -15,6 +15,7 @@ # import logging +import time from threading import Event from typing import Dict, cast @@ -144,6 +145,7 @@ def test_perf_comm_change( I'm not sure it can be done, i.e is this info even kept anywhere). """ with system_profiler as profiler: + time.sleep(2) # first run - we get the changed name, because the app started before perf began recording. _assert_comm_in_profile(profiler, application_pid, False) @@ -170,6 +172,7 @@ def test_perf_thread_comm_is_process_comm( starts after perf, the exec comm of the process should be used (see test_perf_comm_change) """ with system_profiler as profiler: + time.sleep(2) # running perf & script now with --show-task-events would show: # pative 1925947 [010] 987095.272656: PERF_RECORD_COMM: pative:1925904/1925947 # our perf will prefer to use the exec comm, OR oldest comm available if exec diff --git a/tests/utils.py b/tests/utils.py index dc22f136e..b1203dd66 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -141,6 +141,7 @@ def run_privileged_container( def _no_errors(logs: str) -> None: # example line: [2021-06-12 10:13:57,528] ERROR: gprofiler: ruby profiling failed assert "] ERROR: " not in logs, f"found ERRORs in gProfiler logs!: {logs}" + assert "Could not acquire gProfiler's lock" not in logs, f"found lock error in gProfiler logs!: {logs}" def run_gprofiler_in_container(docker_client: DockerClient, image: Image, command: List[str], **kwargs: Any) -> None: @@ -205,7 +206,17 @@ def assert_ldd_version_container(container: Container, version: str) -> None: def snapshot_pid_profile(profiler: ProfilerInterface, pid: int) -> ProfileData: - return profiler.snapshot()[pid] + last_snapshot = profiler.snapshot() + + def has_profile() -> bool: + nonlocal last_snapshot + if pid in last_snapshot: + return True + last_snapshot = profiler.snapshot() + return pid in last_snapshot + + wait_event(timeout=5, stop_event=Event(), condition=has_profile, interval=0.1) + return last_snapshot[pid] def snapshot_pid_collapsed(profiler: ProfilerInterface, pid: int) -> StackToSampleCount: From 22665e8ddbf0417aa1c550ed21ffe8ef4a455a3f Mon Sep 17 00:00:00 2001 From: slicklash Date: Wed, 9 Oct 2024 00:51:11 +0200 Subject: [PATCH 2/3] up --- tests/test_java.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/test_java.py b/tests/test_java.py index 9569ea9f5..2f22ec188 100644 --- a/tests/test_java.py +++ b/tests/test_java.py @@ -226,7 +226,6 @@ def test_java_safemode_version_check( monkeypatch.setitem(JavaProfiler.MINIMAL_SUPPORTED_VERSIONS, 8, (Version("8.999"), 0)) with make_java_profiler(profiler_state) as profiler: - profiler._profiler_state.get_container_name(application_pid) process = profiler._select_processes_to_profile()[0] jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) jvm_version = parse_jvm_version(jvm_version_str) @@ -246,7 +245,6 @@ def test_java_safemode_build_number_check( profiler_state: ProfilerState, ) -> None: with make_java_profiler(profiler_state) as profiler: - profiler._profiler_state.get_container_name(application_pid) process = profiler._select_processes_to_profile()[0] jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) jvm_version = parse_jvm_version(jvm_version_str) @@ -419,7 +417,6 @@ def test_sanity_other_jvms( frequency=99, java_async_profiler_mode="cpu", ) as profiler: - profiler._profiler_state.get_container_name(application_pid) process = psutil.Process(application_pid) assert search_for in cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) process_collapsed = snapshot_pid_collapsed(profiler, application_pid) @@ -441,7 +438,6 @@ def test_sanity_latest_jvms( """ with make_java_profiler(profiler_state) as profiler: - profiler._profiler_state.get_container_name(application_pid) # sanity check that this is the correct JVM we're targeting assert search_for in cast_away_optional( get_java_version(psutil.Process(application_pid), profiler._profiler_state.stop_event) From 97575da7631ebacd012c13f61fb6f4239067f612 Mon Sep 17 00:00:00 2001 From: slicklash Date: Tue, 15 Oct 2024 16:31:47 +0200 Subject: [PATCH 3/3] 2s interval for snapshot_pid_profile --- tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index b1203dd66..c8b3cde31 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -215,7 +215,7 @@ def has_profile() -> bool: last_snapshot = profiler.snapshot() return pid in last_snapshot - wait_event(timeout=5, stop_event=Event(), condition=has_profile, interval=0.1) + wait_event(timeout=5, stop_event=Event(), condition=has_profile, interval=2.0) return last_snapshot[pid]