Skip to content

add pdeathsigger #931

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions executable.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,12 @@ RUN ./phpspy_build.sh
# async-profiler glibc
FROM centos${AP_BUILDER_CENTOS} AS async-profiler-builder-glibc
WORKDIR /tmp
COPY scripts/async_profiler_env_glibc.sh scripts/fix_centos7.sh ./
COPY scripts/async_profiler_env_glibc.sh scripts/fix_centos7.sh scripts/pdeathsigger.c ./
RUN if grep -q "CentOS Linux 7" /etc/os-release ; then \
./fix_centos7.sh; \
fi
RUN ./async_profiler_env_glibc.sh
RUN ./async_profiler_env_glibc.sh && \
gcc -static -o pdeathsigger pdeathsigger.c

COPY scripts/async_profiler_build_shared.sh .
RUN ./async_profiler_build_shared.sh
Expand Down Expand Up @@ -256,6 +257,7 @@ COPY --from=async-profiler-builder-glibc /usr/bin/xargs gprofiler/resources/php/

COPY --from=async-profiler-builder-glibc /tmp/async-profiler/build/bin/asprof gprofiler/resources/java/asprof
COPY --from=async-profiler-builder-glibc /tmp/async-profiler/build/async-profiler-version gprofiler/resources/java/async-profiler-version
COPY --from=async-profiler-builder-glibc /tmp/pdeathsigger gprofiler/resources/pdeathsigger
COPY --from=async-profiler-centos-min-test-glibc /libasyncProfiler.so gprofiler/resources/java/glibc/libasyncProfiler.so
COPY --from=async-profiler-builder-musl /tmp/async-profiler/build/lib/libasyncProfiler.so gprofiler/resources/java/musl/libasyncProfiler.so
COPY --from=node-package-builder-musl /tmp/module_build gprofiler/resources/node/module/musl
Expand Down
30 changes: 2 additions & 28 deletions gprofiler/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,12 @@
import logging.handlers
import os
import shutil
import signal
import sys
import time
import traceback
from pathlib import Path
from threading import Event
from types import FrameType, TracebackType
from types import TracebackType
from typing import Iterable, List, Optional, Type, cast

import configargparse
Expand Down Expand Up @@ -74,6 +73,7 @@
reset_umask,
resource_path,
run_process,
setup_signals,
)
from gprofiler.utils.fs import escape_filename, mkdir_owned_root
from gprofiler.utils.proxy import get_https_proxy
Expand All @@ -98,21 +98,6 @@

UPLOAD_FILE_SUBCOMMAND = "upload-file"

# 1 KeyboardInterrupt raised per this many seconds, no matter how many SIGINTs we get.
SIGINT_RATELIMIT = 0.5

last_signal_ts: Optional[float] = None


def sigint_handler(sig: int, frame: Optional[FrameType]) -> None:
global last_signal_ts
ts = time.monotonic()
# no need for atomicity here: we can't get another SIGINT before this one returns.
# https://www.gnu.org/software/libc/manual/html_node/Signals-in-Handler.html#Signals-in-Handler
if last_signal_ts is None or ts > last_signal_ts + SIGINT_RATELIMIT:
last_signal_ts = ts
raise KeyboardInterrupt


class GProfiler:
def __init__(
Expand Down Expand Up @@ -939,17 +924,6 @@ def verify_preconditions(args: configargparse.Namespace, processes_to_profile: O
sys.exit(1)


def setup_signals() -> None:
# When we run under staticx & PyInstaller, both of them forward (some of the) signals to gProfiler.
# We catch SIGINTs and ratelimit them, to avoid being interrupted again during the handling of the
# first INT.
# See my commit message for more information.
signal.signal(signal.SIGINT, sigint_handler)
# handle SIGTERM in the same manner - gracefully stop gProfiler.
# SIGTERM is also forwarded by staticx & PyInstaller, so we need to ratelimit it.
signal.signal(signal.SIGTERM, sigint_handler)


def log_system_info() -> None:
system_info = get_static_system_info()
logger.info(f"gProfiler Python version: {system_info.python_version}")
Expand Down
7 changes: 6 additions & 1 deletion gprofiler/metadata/system_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,12 @@ def decode_libc_version(version: bytes) -> str:

try:
ldd_version = run_process(
["ldd", "--version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, suppress_log=True, check=False
["ldd", "--version"],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
suppress_log=True,
check=False,
pdeathsigger=False,
).stdout
except FileNotFoundError:
ldd_version = b"ldd not found"
Expand Down
4 changes: 3 additions & 1 deletion gprofiler/metadata/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def get_exe_version(
exe_path = f"/proc/{get_process_nspid(process.pid)}/exe"

def _run_get_version() -> "CompletedProcess[bytes]":
return run_process([exe_path, version_arg], stop_event=stop_event, timeout=get_version_timeout)
return run_process(
[exe_path, version_arg], stop_event=stop_event, timeout=get_version_timeout, pdeathsigger=False
)

try:
cp = run_in_ns(["pid", "mnt"], _run_get_version, process.pid)
Expand Down
1 change: 1 addition & 0 deletions gprofiler/profilers/java.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ def _run_java_version() -> "CompletedProcess[bytes]":
],
stop_event=stop_event,
timeout=_JAVA_VERSION_TIMEOUT,
pdeathsigger=False,
)

# doesn't work without changing PID NS as well (I'm getting ENOENT for libjli.so)
Expand Down
1 change: 1 addition & 0 deletions gprofiler/profilers/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def _run_python_process_in_ns() -> "CompletedProcess[bytes]":
[python_path, "-S", "-c", "import sys; print(sys.maxunicode)"],
stop_event=self._stop_event,
timeout=self._PYTHON_TIMEOUT,
pdeathsigger=False,
)

return run_in_ns(["pid", "mnt"], _run_python_process_in_ns, process.pid).stdout.decode().strip()
Expand Down
92 changes: 45 additions & 47 deletions gprofiler/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import atexit
import ctypes
import datetime
import glob
Expand All @@ -33,6 +34,7 @@
from subprocess import CompletedProcess, Popen, TimeoutExpired
from tempfile import TemporaryDirectory
from threading import Event
from types import FrameType
from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union, cast

import importlib_resources
Expand Down Expand Up @@ -70,6 +72,12 @@

gprofiler_mutex: Optional[socket.socket] = None

# 1 KeyboardInterrupt raised per this many seconds, no matter how many SIGINTs we get.
SIGINT_RATELIMIT = 0.5

_last_signal_ts: Optional[float] = None
_processes: List[Popen] = []


@lru_cache(maxsize=None)
def resource_path(relative_path: str = "") -> str:
Expand All @@ -90,52 +98,20 @@ def is_root() -> bool:
return os.geteuid() == 0


libc: Optional[ctypes.CDLL] = None


def prctl(*argv: Any) -> int:
global libc
if libc is None:
libc = ctypes.CDLL("libc.so.6", use_errno=True)
return cast(int, libc.prctl(*argv))


PR_SET_PDEATHSIG = 1


def set_child_termination_on_parent_death() -> int:
ret = prctl(PR_SET_PDEATHSIG, signal.SIGTERM)
if ret != 0:
errno = ctypes.get_errno()
logger.warning(
f"Failed to set parent-death signal on child process. errno: {errno}, strerror: {os.strerror(errno)}"
)
return ret


def wrap_callbacks(callbacks: List[Callable]) -> Callable:
# Expects array of callback.
# Returns one callback that call each one of them, and returns the retval of last callback
def wrapper() -> Any:
ret = None
for cb in callbacks:
ret = cb()

return ret

return wrapper


def start_process(
cmd: Union[str, List[str]],
via_staticx: bool = False,
term_on_parent_death: bool = True,
tmpdir: Optional[Path] = None,
**kwargs: Any,
) -> Popen:
global _processes

if isinstance(cmd, str):
cmd = [cmd]

if kwargs.pop("pdeathsigger", True) and is_linux():
cmd = [resource_path("pdeathsigger")] + cmd if is_linux() else cmd

logger.debug("Running command", command=cmd)

env = kwargs.pop("env", None)
Expand All @@ -161,23 +137,17 @@ def start_process(
# explicitly remove our directory from LD_LIBRARY_PATH
env["LD_LIBRARY_PATH"] = ""

if is_windows():
cur_preexec_fn = None # preexec_fn is not supported on Windows platforms. subprocess.py reports this.
else:
cur_preexec_fn = kwargs.pop("preexec_fn", os.setpgrp)
if term_on_parent_death:
cur_preexec_fn = wrap_callbacks([set_child_termination_on_parent_death, cur_preexec_fn])

popen = Popen(
process = Popen(
cmd,
stdout=kwargs.pop("stdout", subprocess.PIPE),
stderr=kwargs.pop("stderr", subprocess.PIPE),
stdin=subprocess.PIPE,
preexec_fn=cur_preexec_fn,
start_new_session=is_linux(), # TODO: change to "process_group" after upgrade to Python 3.11+
env=env,
**kwargs,
)
return popen
_processes.append(process)
return process


def wait_event(timeout: float, stop_event: Event, condition: Callable[[], bool], interval: float = 0.1) -> None:
Expand Down Expand Up @@ -361,6 +331,7 @@ def pgrep_maps(match: str) -> List[Process]:
shell=True,
suppress_log=True,
check=False,
pdeathsigger=False,
)
# 0 - found
# 1 - not found
Expand Down Expand Up @@ -563,3 +534,30 @@ def merge_dicts(source: Dict[str, Any], dest: Dict[str, Any]) -> Dict[str, Any]:

def is_profiler_disabled(profile_mode: str) -> bool:
return profile_mode in ("none", "disabled")


def _exit_handler() -> None:
for process in _processes:
process.kill()


def _sigint_handler(sig: int, frame: Optional[FrameType]) -> None:
global _last_signal_ts
ts = time.monotonic()
# no need for atomicity here: we can't get another SIGINT before this one returns.
# https://www.gnu.org/software/libc/manual/html_node/Signals-in-Handler.html#Signals-in-Handler
if _last_signal_ts is None or ts > _last_signal_ts + SIGINT_RATELIMIT:
_last_signal_ts = ts
raise KeyboardInterrupt


def setup_signals() -> None:
atexit.register(_exit_handler)
# When we run under staticx & PyInstaller, both of them forward (some of the) signals to gProfiler.
# We catch SIGINTs and ratelimit them, to avoid being interrupted again during the handling of the
# first INT.
# See my commit message for more information.
signal.signal(signal.SIGINT, _sigint_handler)
# handle SIGTERM in the same manner - gracefully stop gProfiler.
# SIGTERM is also forwarded by staticx & PyInstaller, so we need to ratelimit it.
signal.signal(signal.SIGTERM, _sigint_handler)
2 changes: 1 addition & 1 deletion gprofiler/utils/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def is_rw_exec_dir(path: Path) -> bool:

# try executing
try:
run_process([str(test_script)], suppress_log=True)
run_process([str(test_script)], suppress_log=True, pdeathsigger=False)
except PermissionError:
# noexec
return False
Expand Down
28 changes: 28 additions & 0 deletions scripts/pdeathsigger.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <signal.h>

/*
preexec_fn is not safe to use in the presence of threads,
child process could deadlock before exec is called.
this little shim is a workaround to avoid using preexe_fn and
still get the desired behavior (PR_SET_PDEATHSIG).
*/
int main(int argc, char *argv[]) {
if (argc < 2) {
fprintf(stderr, "Usage: %s /path/to/binary [args...]\n", argv[0]);
return 1;
}

if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) {
perror("prctl");
return 1;
}

execvp(argv[1], &argv[1]);

perror("execvp");
return 1;
}
18 changes: 9 additions & 9 deletions tests/test_java.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,11 +482,12 @@ def _filter_record(r: LogRecord) -> bool:
# find the log record of
# Running command (command=['/app/gprofiler/resources/java/apsprof', '<PID>', 'load',
# '/path/to/libasyncProfiler.so', 'true', 'start,...'])
command = log_record_extra(r).get("command", [])
return (
r.message == "Running command"
and len(log_record_extra(r)["command"]) == 6
and log_record_extra(r)["command"][2] == "load"
and any(map(lambda k: k in log_record_extra(r)["command"][5], ["start,", "stop,"]))
and len(command) == 7
and command[3] == "load"
and any(map(lambda k: k in command[6], ["start,", "stop,"]))
)

return list(filter(_filter_record, records))
Expand Down Expand Up @@ -563,7 +564,7 @@ def test_java_noexec_or_ro_dirs(
assert len(jattach_loads) == 2
# 3rd part of commandline to AP - shall begin with POSSIBLE_AP_DIRS[1]
assert all(
log_record_extra(jl)["command"][3].startswith(f"{gprofiler.profilers.java.POSSIBLE_AP_DIRS[1]}/async-profiler-")
log_record_extra(jl)["command"][4].startswith(f"{gprofiler.profilers.java.POSSIBLE_AP_DIRS[1]}/async-profiler-")
for jl in jattach_loads
)

Expand Down Expand Up @@ -612,7 +613,7 @@ def test_java_symlinks_in_paths(
# 2 entries - start and stop
assert len(jattach_loads) == 2
# 3rd part of commandline to AP - shall begin with the final, resolved path.
assert all(log_record_extra(jl)["command"][3].startswith("/run/final_tmp/gprofiler_tmp/") for jl in jattach_loads)
assert all(log_record_extra(jl)["command"][4].startswith("/run/final_tmp/gprofiler_tmp/") for jl in jattach_loads)


@pytest.mark.parametrize("in_container", [True]) # only in container is enough
Expand Down Expand Up @@ -1246,10 +1247,9 @@ def test_collect_flags_unsupported_filtered_out(
f"exec java {java_cli_flags} -jar Fibonacci.jar",
],
) as container:
assert (
profiler._metadata.get_jvm_flags_serialized(psutil.Process(container.attrs["State"]["Pid"]))
== expected_flags
)
pid = container.attrs["State"]["Pid"]
profiler._profiler_state.get_container_name(pid)
assert profiler._metadata.get_jvm_flags_serialized(psutil.Process(pid)) == expected_flags
log_record = next(filter(lambda r: r.message == "Missing requested flags:", caplog.records))
# use slicing to remove the leading -XX: instead of removeprefix as it's not available in python 3.8
assert (
Expand Down
3 changes: 3 additions & 0 deletions tests/test_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#

import logging
import time
from threading import Event
from typing import Dict, cast

Expand Down Expand Up @@ -144,6 +145,7 @@ def test_perf_comm_change(
I'm not sure it can be done, i.e is this info even kept anywhere).
"""
with system_profiler as profiler:
time.sleep(2)
# first run - we get the changed name, because the app started before perf began recording.
_assert_comm_in_profile(profiler, application_pid, False)

Expand All @@ -170,6 +172,7 @@ def test_perf_thread_comm_is_process_comm(
starts after perf, the exec comm of the process should be used (see test_perf_comm_change)
"""
with system_profiler as profiler:
time.sleep(2)
# running perf & script now with --show-task-events would show:
# pative 1925947 [010] 987095.272656: PERF_RECORD_COMM: pative:1925904/1925947
# our perf will prefer to use the exec comm, OR oldest comm available if exec
Expand Down
Loading
Loading