From b01cabe60ce8f67b47a0190af36a5f68cc0837af Mon Sep 17 00:00:00 2001 From: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> Date: Wed, 2 Oct 2024 14:56:53 +0000 Subject: [PATCH 1/9] feat: directly signal subprocesses before using sudo Signed-off-by: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> --- src/openjd/sessions/_runner_base.py | 16 +--- .../_scripts/_posix/_signal_subprocess.sh | 21 ----- src/openjd/sessions/_subprocess.py | 91 ++++++++++++++++--- 3 files changed, 80 insertions(+), 48 deletions(-) diff --git a/src/openjd/sessions/_runner_base.py b/src/openjd/sessions/_runner_base.py index bced5d4b..17178448 100644 --- a/src/openjd/sessions/_runner_base.py +++ b/src/openjd/sessions/_runner_base.py @@ -376,18 +376,7 @@ def _run(self, args: Sequence[str], time_limit: Optional[timedelta] = None) -> N def _generate_command_shell_script(self, args: Sequence[str]) -> str: """Generate a shell script for running a command given by the args.""" script = list[str]() - script.append( - ( - "#!/bin/sh\n" - "_term() {\n" - " echo 'Caught SIGTERM'\n" - ' test "${CHILD_PID:-}" != "" && echo "Sending SIGTERM to ${CHILD_PID}" && kill -s TERM "${CHILD_PID}"\n' - ' wait "${CHILD_PID}"\n' - " exit $?\n" # The wait returns the exit code of the waited-for process - "}\n" - "trap _term TERM" - ) - ) + script.append("#!/bin/sh\n") if self._os_env_vars: for name, value in self._os_env_vars.items(): if value is None: @@ -398,8 +387,7 @@ def _generate_command_shell_script(self, args: Sequence[str]) -> str: # Note: Single quotes around the path as it may have spaces, and we don't want to # process any shell commands in the path. script.append(f"cd '{self._startup_directory}'") - script.append(shlex.join(args) + " &") - script.append(("CHILD_PID=$!\n" 'wait "$CHILD_PID"\n' "exit $?\n")) + script.append("exec " + shlex.join(args)) return "\n".join(script) def _materialize_files( diff --git a/src/openjd/sessions/_scripts/_posix/_signal_subprocess.sh b/src/openjd/sessions/_scripts/_posix/_signal_subprocess.sh index eaa609e0..75b3565b 100755 --- a/src/openjd/sessions/_scripts/_posix/_signal_subprocess.sh +++ b/src/openjd/sessions/_scripts/_posix/_signal_subprocess.sh @@ -31,35 +31,14 @@ set -x PID="$1" SIG="$2" -SIGNAL_CHILD="${3:-False}" -INCL_SUBPROCS="${4:-False}" [ -f /bin/kill ] && KILL=/bin/kill [ ! -n "${KILL:-}" ] && [ -f /usr/bin/kill ] && KILL=/usr/bin/kill -[ -f /bin/pgrep ] && PGREP=/bin/pgrep -[ ! -n "${PGREP:-}" ] && [ -f /usr/bin/pgrep ] && PGREP=/usr/bin/pgrep - if [ ! -n "${KILL:-}" ] then echo "ERROR - Could not find the 'kill' command under /bin or /usr/bin. Please install it." exit 1 fi -if [ ! -n "${PGREP:-}" ] -then - echo "ERROR - Could not find the 'pgrep' command under /bin or /usr/bin. Please install it." - exit 1 -fi - -if test "${SIGNAL_CHILD}" = "True" -then - PID=$( "${PGREP}" -P "${PID}" ) -fi - -if test "${INCL_SUBPROCS}" = "True" -then - PID=-"${PID}" -fi - exec "$KILL" -s "$SIG" -- "$PID" diff --git a/src/openjd/sessions/_subprocess.py b/src/openjd/sessions/_subprocess.py index 3242669e..f9af2df2 100644 --- a/src/openjd/sessions/_subprocess.py +++ b/src/openjd/sessions/_subprocess.py @@ -2,6 +2,7 @@ import os import shlex +import signal import time from ._os_checker import is_posix, is_windows @@ -14,7 +15,7 @@ from threading import Event, Thread from ._logging import LoggerAdapter, LogContent, LogExtraInfo from subprocess import DEVNULL, PIPE, STDOUT, Popen, list2cmdline, run -from typing import Callable, Optional, Sequence, cast +from typing import Callable, Literal, Optional, Sequence, cast from pathlib import Path from datetime import timedelta import sys @@ -200,7 +201,7 @@ def notify(self) -> None: """ if self._process is not None and self._process.poll() is None: if is_posix(): - self._posix_signal_subprocess(signal="term", signal_subprocesses=False) + self._posix_signal_subprocess(signal_name="term") else: self._windows_notify_subprocess() @@ -216,7 +217,7 @@ def terminate(self) -> None: """ if self._process is not None and self._process.poll() is None: if is_posix(): - self._posix_signal_subprocess(signal="kill", signal_subprocesses=True) + self._posix_signal_subprocess(signal_name="kill") else: self._logger.info( f"INTERRUPT: Start killing the process tree with the root pid: {self._process.pid}", @@ -298,7 +299,7 @@ def _start_subprocess(self) -> Optional[Popen]: ) return None - def _log_subproc_stdout(self): + def _log_subproc_stdout(self) -> None: """ Blocking call which logs the STDOUT of the running subproc until the subprocess exits. @@ -423,7 +424,10 @@ def _tosigned(n: int) -> int: extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), ) - def _posix_signal_subprocess(self, signal: str, signal_subprocesses: bool = False) -> None: + def _posix_signal_subprocess( + self, + signal_name: Literal["term", "kill"], + ) -> None: """Send a given named signal, via pkill, to the subprocess when it is running as a different user than this process. """ @@ -450,23 +454,84 @@ def _posix_signal_subprocess(self, signal: str, signal_subprocesses: bool = Fals # b. When we run the command using `sudo` then we need to either run code that does the whole # algorithm as the other user, or `sudo` to send every process signal. + numeric_signal = 0 + if signal_name == "term": + numeric_signal = signal.SIGTERM + elif signal_name == "kill": + numeric_signal = signal.SIGKILL + else: + raise NotImplementedError(f"Unsupported signal: {signal_name}") + cmd = list[str]() - signal_child = False + pgid = os.getpgid(process.pid) + sudo_subproc_pid: Optional[int] = None if self._user is not None: user = cast(PosixSessionUser, self._user) # Only sudo if the user to run as is not the same as the current user. if not user.is_process_user(): cmd.extend(["sudo", "-u", user.user, "-i"]) - signal_child = True + pgrep_result = run( + ["pgrep", "-P", str(process.pid)], + stdout=PIPE, + stderr=STDOUT, + stdin=DEVNULL, + text=True, + ) + if pgrep_result.returncode != 0: + self._logger.warning( + f"Failed to send signal '{signal_name}' to subprocess {process.pid}: Unable to query child processes of sudo process" + ) + return + results = pgrep_result.stdout.splitlines() + if len(results) != 1: + self._logger.warning( + f"Failed to send signal '{signal_name}' to subprocess {process.pid}: More than one child proccess of sudo process" + ) + return + sudo_subproc_pid = int(results[0]) + pgid = os.getpgid(sudo_subproc_pid) + + # Try directly signaling the process(es) first + try: + if signal_name == "kill": + self._logger.info( + f'INTERRUPT: Sending signal "{signal_name}" to process group {pgid}', + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + os.killpg(pgid, numeric_signal) + else: + self._logger.info( + f'INTERRUPT: Sending signal "{signal_name}" to process {process.pid}', + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + os.kill(sudo_subproc_pid or process.pid, numeric_signal) + except OSError: + self._logger.info( + "Could not directly send signal {signal_name} to {process.pid}, trying sudo.", + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + else: + return + + signal_pid_str: str + if signal_name == "kill": + signal_pid_str = f"-{pgid}" + elif sudo_subproc_pid: + signal_pid_str = str(sudo_subproc_pid) + else: + signal_pid_str = str(process.pid) + + # pstree_result = run(["pstree", "-p"], stdout=PIPE, stderr=STDOUT, stdin=DEVNULL, text=True) + # self._logger.info("Pstree output: %s", pstree_result.stdout.read()) cmd.extend( [ - str(POSIX_SIGNAL_SUBPROC_SCRIPT_PATH), - str(process.pid), - signal, - str(signal_child), - str(signal_subprocesses), + "kill", + "-s", + signal_name, + "--", + signal_pid_str, ] ) self._logger.info( @@ -481,7 +546,7 @@ def _posix_signal_subprocess(self, signal: str, signal_subprocesses: bool = Fals ) if result.returncode != 0: self._logger.warning( - f"Failed to send signal '{signal}' to subprocess {process.pid}: %s", + f"Failed to send signal '{signal_name}' to subprocess {signal_pid_str}: %s", result.stdout.decode("utf-8"), extra=LogExtraInfo( openjd_log_content=LogContent.PROCESS_CONTROL | LogContent.EXCEPTION_INFO From 52f62a0d6b5299796c14cc1c7480c88e9e65719d Mon Sep 17 00:00:00 2001 From: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> Date: Thu, 19 Sep 2024 15:59:02 +0000 Subject: [PATCH 2/9] feat: CAP_KILL detection Signed-off-by: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> --- pyproject.toml | 1 + src/openjd/sessions/_linux/__init__.py | 1 + src/openjd/sessions/_linux/_capabilities.py | 242 ++++++++++++++++++++ 3 files changed, 244 insertions(+) create mode 100644 src/openjd/sessions/_linux/__init__.py create mode 100644 src/openjd/sessions/_linux/_capabilities.py diff --git a/pyproject.toml b/pyproject.toml index 58797c09..8dee984b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -169,6 +169,7 @@ fail_under = 79 # https://github.com/wemake-services/coverage-conditional-plugin [tool.coverage.coverage_conditional_plugin.omit] "sys_platform != 'win32'" = [ + "src/openjd/sessions/_linux/*.py", "src/openjd/sessions/_win32/*.py", "src/openjd/sessions/_scripts/_windows/*.py", "src/openjd/sessions/_windows*.py" diff --git a/src/openjd/sessions/_linux/__init__.py b/src/openjd/sessions/_linux/__init__.py new file mode 100644 index 00000000..8d929cc8 --- /dev/null +++ b/src/openjd/sessions/_linux/__init__.py @@ -0,0 +1 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/src/openjd/sessions/_linux/_capabilities.py b/src/openjd/sessions/_linux/_capabilities.py new file mode 100644 index 00000000..890217e3 --- /dev/null +++ b/src/openjd/sessions/_linux/_capabilities.py @@ -0,0 +1,242 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + +"""This module contains code for interacting with Linux capabilities. The module uses the ctypes +module from the Python standard library to wrap the libcap library. + +See https://man7.org/linux/man-pages/man7/capabilities.7.html for details on this Linux kernel +feature. +""" + +import ctypes +import os +import sys +from contextlib import contextmanager +from ctypes.util import find_library +from enum import Enum +from functools import cache +from typing import Any, Generator, Tuple, TYPE_CHECKING + + +from .._logging import LOG + + +# Capability sets +CAP_EFFECTIVE = 0 +CAP_PERMITTED = 1 +CAP_INHERITABLE = 2 + +# Capability bit numbers +CAP_KILL = 5 + +# Values for cap_flag_value_t arguments +CAP_CLEAR = 0 +CAP_SET = 1 + +cap_flag_t = ctypes.c_int +cap_flag_value_t = ctypes.c_int +cap_value_t = ctypes.c_int + + +class CapabilitySetType(Enum): + INHERITABLE = CAP_INHERITABLE + PERMITTED = CAP_PERMITTED + EFFECTIVE = CAP_EFFECTIVE + + +class UserCapHeader(ctypes.Structure): + _fields_ = [ + ("version", ctypes.c_uint32), + ("pid", ctypes.c_int), + ] + + +class UserCapData(ctypes.Structure): + _fields_ = [ + ("effective", ctypes.c_uint32), + ("permitted", ctypes.c_uint32), + ("inheritable", ctypes.c_uint32), + ] + + +class Cap(ctypes.Structure): + _fields_ = [ + ("head", UserCapHeader), + ("data", UserCapData), + ] + + +if TYPE_CHECKING: + cap_t = ctypes._Pointer[Cap] + cap_flag_value_ptr = ctypes._Pointer[cap_flag_value_t] + cap_value_ptr = ctypes._Pointer[cap_value_t] + ssize_ptr_t = ctypes._Pointer[ctypes.c_ssize_t] +else: + cap_t = ctypes.POINTER(Cap) + cap_flag_value_ptr = ctypes.POINTER(cap_flag_value_t) + cap_value_ptr = ctypes.POINTER(cap_value_t) + ssize_ptr_t = ctypes.POINTER(ctypes.c_ssize_t) + + +def _cap_set_err_check( + result: ctypes.c_int, + func: Any, + args: Tuple[Any, ...], +) -> ctypes.c_int: + if result != 0: + errno = ctypes.get_errno() + raise OSError(errno, os.strerror(errno)) + return result + + +def _cap_get_proc_err_check( + result: cap_t, + func: Any, + args: Tuple[cap_t, cap_flag_t, ctypes.c_int, cap_value_ptr, cap_flag_value_t], +) -> cap_t: + if not result: + errno = ctypes.get_errno() + raise OSError(errno, os.strerror(errno)) + return result + + +def _cap_get_flag_errcheck( + result: ctypes.c_int, func: Any, args: Tuple[cap_t, cap_value_t, cap_flag_t, cap_flag_value_ptr] +) -> ctypes.c_int: + if result != 0: + errno = ctypes.get_errno() + raise OSError(errno, os.strerror(errno)) + return result + + +@cache +def _get_libcap() -> ctypes.CDLL: + if not sys.platform.startswith("linux"): + raise OSError(f"libcap is only available on Linux, but found platform: {sys.platform}") + + libcap = ctypes.CDLL(find_library("cap"), use_errno=True) + + # https://man7.org/linux/man-pages/man3/cap_set_proc.3.html + libcap.cap_set_proc.restype = ctypes.c_int + libcap.cap_set_proc.argtypes = [ + ctypes.POINTER(Cap), + ] + libcap.cap_set_proc.errcheck = _cap_set_err_check # type: ignore + + # https://man7.org/linux/man-pages/man3/cap_get_proc.3.html + libcap.cap_get_proc.restype = cap_t + libcap.cap_get_proc.argtypes = [] + libcap.cap_get_proc.errcheck = _cap_get_proc_err_check # type: ignore + + # https://man7.org/linux/man-pages/man3/cap_set_flag.3.html + libcap.cap_set_flag.restype = ctypes.c_int + libcap.cap_set_flag.argtypes = [ + cap_t, + cap_flag_t, + ctypes.c_int, + cap_value_ptr, + cap_flag_value_t, + ] + + # https://man7.org/linux/man-pages/man3/cap_get_flag.3.html + libcap.cap_get_flag.restype = ctypes.c_int + libcap.cap_get_flag.argtypes = ( + cap_t, + cap_value_t, + cap_flag_t, + cap_flag_value_ptr, + ) + libcap.cap_get_flag.errcheck = _cap_get_flag_errcheck # type: ignore + + return libcap + + +def _has_capability( + *, + caps: cap_t, + capability: int, + capability_set_type: CapabilitySetType, +) -> bool: + libcap = _get_libcap() + flag_value = cap_flag_value_t() + libcap.cap_get_flag(caps, capability, capability_set_type.value, ctypes.byref(flag_value)) + return flag_value.value == CAP_SET + + +@contextmanager +def try_use_cap_kill() -> Generator[bool, None, None]: + """ + A context-manager that attempts to leverage the CAP_KILL Linux capability. + + If CAP_KILL is in the current thread's effective set, this context-manager takes no action and + yields True. + + If CAP_KILL is not in the effective set but is in the permitted set, the context-manager: + 1. adds CAP_KILL to the effective set before entering the context-manager + 2. yields True + 3. clears CAP_KILL from the effective set when exiting the context-manager + + Otherwise, the context-manager does nothing and yields False + + Returns: + A context manager that yields a bool. See above for details. + """ + if not sys.platform.startswith("linux"): + raise OSError(f"Only Linux is supported, but platform is {sys.platform}") + + libcap = _get_libcap() + caps = libcap.cap_get_proc() + + if _has_capability( + caps=caps, capability=CAP_KILL, capability_set_type=CapabilitySetType.EFFECTIVE + ): + LOG.debug("CAP_KILL is in the thread's effective set") + # CAP_KILL is already in the effective set + yield True + elif _has_capability( + caps=caps, + capability=CAP_KILL, + capability_set_type=CapabilitySetType.PERMITTED, + ): + # CAP_KILL is in the permitted set. We will temporarily add it to the effective set + LOG.debug("CAP_KILL is in the thread's permitted set. Temporarily adding to effective set") + cap_value_arr_t = cap_value_t * 1 + cap_value_arr = cap_value_arr_t() + cap_value_arr[0] = CAP_KILL + libcap.cap_set_flag( + caps, + CAP_EFFECTIVE, + 1, + cap_value_arr, + CAP_SET, + ) + libcap.cap_set_proc(caps) + try: + yield True + finally: + # Clear CAP_KILL from the effective set + LOG.debug("Clearing CAP_KILL from the thread's effective set") + libcap.cap_set_flag( + caps, + CAP_EFFECTIVE, + 1, + cap_value_arr, + CAP_CLEAR, + ) + libcap.cap_set_proc(caps) + else: + yield False + + +def main() -> None: + """A developer debugging entrypoint for testing the try_use_cap_kill() behaviour""" + import logging + + logging.basicConfig(level=logging.DEBUG) + logging.getLogger("openjd.sessions").setLevel(logging.DEBUG) + + with try_use_cap_kill() as has_cap_kill: + LOG.info("Has CAP_KILL: %s", has_cap_kill) + + +if __name__ == "__main__": + main() From 458b687af527e4254339dcfda2883d977176f498 Mon Sep 17 00:00:00 2001 From: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> Date: Wed, 2 Oct 2024 14:50:08 +0000 Subject: [PATCH 3/9] test: send cancel signals directly on Linux when CAP_KILL detected Signed-off-by: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> --- pyproject.toml | 3 + scripts/run_sudo_tests.sh | 23 +++++ test/openjd/sessions/conftest.py | 19 ++++ .../support_files/output_signal_sender.c | 43 +++++++++ test/openjd/sessions/test_runner_base.py | 91 +++++++++++++++++++ .../localuser_sudo_environment/Dockerfile | 6 +- 6 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 test/openjd/sessions/support_files/output_signal_sender.c diff --git a/pyproject.toml b/pyproject.toml index 8dee984b..a930b8b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -148,6 +148,9 @@ addopts = [ "--numprocesses=auto", "--timeout=30" ] +markers = [ + "requires_cap_kill: tests that require CAP_KILL Linux capability", +] [tool.coverage.run] diff --git a/scripts/run_sudo_tests.sh b/scripts/run_sudo_tests.sh index d9bc9a47..6bbb5f7c 100755 --- a/scripts/run_sudo_tests.sh +++ b/scripts/run_sudo_tests.sh @@ -67,3 +67,26 @@ if test "${BUILD_ONLY}" == "True"; then fi docker run --name test_openjd_sudo --rm ${ARGS} "${CONTAINER_IMAGE_TAG}:latest" + +if test "${USE_LDAP}" != "True"; then + # Run capability tests + # First with CAP_KILL in effective and permitted capability sets + docker run --name test_openjd_sudo --user root --rm ${ARGS} "${CONTAINER_IMAGE_TAG}:latest" \ + capsh \ + --caps='cap_setuid,cap_setgid,cap_setpcap=ep cap_kill=eip' \ + --keep=1 \ + --user=hostuser \ + --addamb=cap_kill \ + -- \ + -c 'capsh --noamb --caps=cap_kill=ep -- -c "hatch run test --no-cov -m requires_cap_kill"' + # Second with CAP_KILL in permitted capability set but not effective capability set + # this tests that OpenJD will add CAP_KILL to the effective capability set if needed + docker run --name test_openjd_sudo --user root --rm ${ARGS} "${CONTAINER_IMAGE_TAG}:latest" \ + capsh \ + --caps='cap_setuid,cap_setgid,cap_setpcap=ep cap_kill=eip' \ + --keep=1 \ + --user=hostuser \ + --addamb=cap_kill \ + -- \ + -c 'capsh --noamb --caps=cap_kill=p -- -c "hatch run test --no-cov -m requires_cap_kill"' +fi diff --git a/test/openjd/sessions/conftest.py b/test/openjd/sessions/conftest.py index 28bec725..659e0dab 100644 --- a/test/openjd/sessions/conftest.py +++ b/test/openjd/sessions/conftest.py @@ -36,6 +36,25 @@ POSIX_SET_DISJOINT_USER_ENV_VARS_MESSAGE = f"Must define environment vars {POSIX_DISJOINT_USER_ENV_VAR} and {POSIX_DISJOINT_GROUP_ENV_VAR} to run target-user impersonation tests on posix." +def pytest_collection_modifyitems(config, items): + """This is a pytest hook that provides a default mark expression if one was not provided. By + default, we want to de-select tests that require the CAP_KILL Linux capability. + + Those tests should only be selected when running the Docker container test workflow + described in DEVELOPMENT.md which grant the necessary capabilities and specify a + mark expression. + + See: + - https://docs.pytest.org/en/8.3.x/reference/reference.html#pytest.hookspec.pytest_collection_modifyitems + - https://docs.pytest.org/en/8.3.x/reference/reference.html#command-line-flags + """ + mark_expr = config.getoption("markexpr", False) + if not mark_expr: + config.option.markexpr = "not requires_cap_kill" + else: + config.option.markexpr = mark_expr + + def build_logger(handler: QueueHandler) -> LoggerAdapter: charset = string.ascii_letters + string.digits + string.punctuation name_suffix = "".join(random.choices(charset, k=32)) diff --git a/test/openjd/sessions/support_files/output_signal_sender.c b/test/openjd/sessions/support_files/output_signal_sender.c new file mode 100644 index 00000000..87a0adc1 --- /dev/null +++ b/test/openjd/sessions/support_files/output_signal_sender.c @@ -0,0 +1,43 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + +// This is a minimal C program that sleeps until it receives a SIGTERM signal +// and outputs the process ID of the sender. + +#include +#include +#include +#include +#include +#include +#include +#include + +static bool received_signal = false; + +static void signal_handler(int sig, siginfo_t *siginfo, void *context) { + // Output PID of the sending process + int pid_of_sending_process = (int) siginfo->si_pid; + printf("%d\n", pid_of_sending_process); + + // Tell main loop to exit + received_signal = true; +} + +int main(int argc, char *argv[]) { + // register signal handler + struct sigaction signal_action; + signal_action.sa_sigaction = *signal_handler; + // get details about the signal + signal_action.sa_flags |= SA_SIGINFO; + if(sigaction(SIGTERM, &signal_action, NULL) != 0) { + printf("Could not register signal handler\n"); + return errno; + } + + // sleep until SIGINT received + while(!received_signal) { + sleep(1); + } + + return 0; +} diff --git a/test/openjd/sessions/test_runner_base.py b/test/openjd/sessions/test_runner_base.py index 55bab0eb..c2a789f7 100644 --- a/test/openjd/sessions/test_runner_base.py +++ b/test/openjd/sessions/test_runner_base.py @@ -787,6 +787,97 @@ def test_cancel_notify( delta_t = time_end - now assert timedelta(seconds=1) < delta_t < timedelta(seconds=3) + @pytest.mark.skipif(not is_posix(), reason="posix-only test") + @pytest.mark.xfail( + not has_posix_target_user(), + reason=POSIX_SET_TARGET_USER_ENV_VARS_MESSAGE, + ) + @pytest.mark.requires_cap_kill + def test_cancel_notify_direct_signal_with_cap_kill( + self, + tmp_path: Path, + message_queue: SimpleQueue, + queue_handler: QueueHandler, + ) -> None: + # Test for Linux hosts, that when CAP_KILL is in the permitted (and possibly effective) + # capability set(s), that the runner will: + # 1. directly signal the subprocess to notify + # 2. retain the status of CAP_KILL in the thread's effective capability set + + # GIVEN + logger = build_logger(queue_handler) + + from openjd.sessions._linux._capabilities import ( + _has_capability, + _get_libcap, + CAP_KILL, + CapabilitySetType, + ) + + # Record whether CAP_KILL is in the effective capability set before + # notifying the subprocess + libcap = _get_libcap() + caps = libcap.cap_get_proc() + cap_kill_was_effective = _has_capability( + caps=caps, + capability=CAP_KILL, + capability_set_type=CapabilitySetType.EFFECTIVE, + ) + + with NotifyingRunner(logger=logger, session_working_directory=tmp_path) as runner: + # Path to compiled C program that outputs the PID of the process sending the signal + output_signal_sender_app_loc = ( + Path(__file__).parent / "support_files" / "output_signal_sender" + ).resolve() + assert output_signal_sender_app_loc.exists(), "output_signal_sender is not compiled." + runner._run([str(output_signal_sender_app_loc)]) + + # WHEN + secs = 2 if not is_windows() else 5 + time.sleep(secs) # Give the process a little time to do something + runner.cancel(time_limit=timedelta(seconds=2)) + + # THEN + for _ in range(10): + if runner.state == ScriptRunnerState.CANCELED: + break + time.sleep(1) + else: + # Terminate the subprocess + runner.cancel() + assert False, "output_signal_sender process did not exit when sent SIGTERM" + assert runner.exit_code == 0 + + # Collect stdout lines. Based on the code of output_signal_sender.c, only a single + # line should be output with the PID of the process that sent the SIGINT signal. + # Extracting the log line requires finding the preceeding log line emitted by the runner, + # then taking the following line and parsing it as an integer + messages = collect_queue_messages(message_queue) + for i, message in enumerate(messages): + if message.startswith('INTERRUPT: Sending signal "term" to process '): + break + else: + assert False, "Could not find log line before stdout" + pid_line = messages[i + 1] + signal_sender_pid = int(pid_line) + + current_pid = os.getpid() + assert ( + current_pid == signal_sender_pid + ), "The runner's subprocess was not directly signalled" + + # Assert that the presence/absence of CAP_KILL in the effective capability set + # is unchanged after calling Runner.cancel() + caps = libcap.cap_get_proc() + cap_kill_effective_after_cancel = _has_capability( + caps=caps, + capability=CAP_KILL, + capability_set_type=CapabilitySetType.EFFECTIVE, + ) + assert ( + cap_kill_was_effective == cap_kill_effective_after_cancel + ), "CAP_KILL added/removed from effetive set and persisted after cancelation" + @pytest.mark.usefixtures("message_queue", "queue_handler") def test_cancel_double_cancel_notify( self, diff --git a/testing_containers/localuser_sudo_environment/Dockerfile b/testing_containers/localuser_sudo_environment/Dockerfile index 5ef00112..d02f35e0 100644 --- a/testing_containers/localuser_sudo_environment/Dockerfile +++ b/testing_containers/localuser_sudo_environment/Dockerfile @@ -20,7 +20,7 @@ ENV PIP_INDEX_URL=$PIP_INDEX_URL # hostuser: hostuser, sharedgroup # targetuser: targetuser, sharedgroup # disjointuser: disjointuser, disjointgroup -RUN apt-get update && apt-get install -y sudo && \ +RUN apt-get update && apt-get install -y sudo psmisc libcap2-bin libcap2 libcap-dev gcc && \ # Clean up apt cache rm -rf /var/lib/apt/lists/* && \ apt-get clean && \ @@ -39,6 +39,8 @@ USER hostuser COPY --chown=hostuser:hostuser . /code/ WORKDIR /code -RUN hatch env create +RUN hatch env create && \ + # compile the output_signal_sender program which outputs the PID of a process sending a SIGTERM signal \ + gcc -Wall /code/test/openjd/sessions/support_files/output_signal_sender.c -o /code/test/openjd/sessions/support_files/output_signal_sender CMD ["hatch", "run", "test", "--no-cov"] \ No newline at end of file From ce5ca7a6f79e65ff900ee8d77efae9925ee9265a Mon Sep 17 00:00:00 2001 From: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> Date: Thu, 14 Nov 2024 19:44:57 +0000 Subject: [PATCH 4/9] feat: optimize sudo subprocess PID retrieval Signed-off-by: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> --- src/openjd/sessions/_os_checker.py | 7 ++ src/openjd/sessions/_subprocess.py | 185 ++++++++++++++++++++++++----- 2 files changed, 162 insertions(+), 30 deletions(-) diff --git a/src/openjd/sessions/_os_checker.py b/src/openjd/sessions/_os_checker.py index 6166326d..c42c2dda 100644 --- a/src/openjd/sessions/_os_checker.py +++ b/src/openjd/sessions/_os_checker.py @@ -1,11 +1,18 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. import os +import sys +LINUX = "linux" +MACOS = "darwin" POSIX = "posix" WINDOWS = "nt" +def is_linux() -> bool: + return sys.platform == LINUX + + def is_posix() -> bool: return os.name == POSIX diff --git a/src/openjd/sessions/_subprocess.py b/src/openjd/sessions/_subprocess.py index f9af2df2..b2a99c84 100644 --- a/src/openjd/sessions/_subprocess.py +++ b/src/openjd/sessions/_subprocess.py @@ -1,10 +1,11 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +import glob import os import shlex import signal import time -from ._os_checker import is_posix, is_windows +from ._os_checker import is_linux, is_posix, is_windows if is_windows(): from subprocess import CREATE_NEW_PROCESS_GROUP, CREATE_NO_WINDOW # type: ignore @@ -15,7 +16,7 @@ from threading import Event, Thread from ._logging import LoggerAdapter, LogContent, LogExtraInfo from subprocess import DEVNULL, PIPE, STDOUT, Popen, list2cmdline, run -from typing import Callable, Literal, Optional, Sequence, cast +from typing import Callable, Literal, NamedTuple, Optional, Sequence, cast from pathlib import Path from datetime import timedelta import sys @@ -50,6 +51,20 @@ class LoggingSubprocess(object): """A process whose stdout/stderr lines are sent to a given Logger.""" + class _PosixSignalTarget(NamedTuple): + """A target for sending signals on POSIX hosts""" + + pid: int + """The process ID""" + + pgid: int + """The process group ID""" + + class _FindSignalTargetError(Exception): + """Exception when unable to detect the signal target""" + + pass + _logger: LoggerAdapter _process: Optional[Popen] _args: Sequence[str] @@ -64,6 +79,8 @@ class LoggingSubprocess(object): _pid: Optional[int] _returncode: Optional[int] + _posix_signal_target: Optional[_PosixSignalTarget] + def __init__( self, *, @@ -94,12 +111,11 @@ def __init__( self._has_started = Event() self._pid = None self._returncode = None + self._posix_signal_target = None @property def pid(self) -> Optional[int]: - if self._pid is not None: - return self._pid - return None + return self._pid @property def exit_code(self) -> Optional[int]: @@ -166,6 +182,9 @@ def run(self) -> None: self._pid = self._process.pid + if is_posix(): + self._posix_signal_target = self._find_posix_signal_target() + self._logger.info( f"Command started as pid: {self._process.pid}", extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), @@ -189,6 +208,121 @@ def run(self) -> None: self._process = None del proc + def _find_posix_signal_target( + self, + *, + timeout_seconds: float = 1, + ) -> Optional[_PosixSignalTarget]: + process = self._process + if process is None: + raise ValueError("Process not launched") + process_pid = process.pid + if not is_posix(): + raise NotImplementedError(f"Only POSIX supported, but running on {sys.platform}") + if timeout_seconds <= 0: + raise ValueError( + f"Expected positive value for timeout_seconds but got {timeout_seconds}" + ) + + # If a user is not specified or is the same as the host process user, we signal the + # immediate child + if not self._user or self._user.is_process_user(): + return LoggingSubprocess._PosixSignalTarget( + pid=process_pid, + pgid=os.getpgid(process_pid), + ) + + # For cross-user support, we use sudo which creates an intermediate process: + # + # openjd-process + # | + # +-- sudo + # | + # +-- subprocess + # + # Sudo forwards signals that it is able to handle, but in the case of SIGKILL sudo cannot + # handle the signal and the kernel will kill it leaving the child orphaned. We need to + # send SIGKILL signals to the subprocess of sudo + start = time.monotonic() + now = start + + # Repeatedly scan for child processes + # + # This is put in a retry loop, because it takes a non-zero amount of time before sudo and + # the kernel finish creating the subprocess. We cap this because the process may exit + # quickly and we may never find the child process. + try: + while now - start < timeout_seconds: + if is_linux(): + if signal_target := self._find_linux_signal_target(sudo_pid=process_pid): + return signal_target + else: + if signal_target := self._find_nonlinux_signal_target(sudo_pid=process_pid): + return signal_target + + # If we did not find any child processes yet, sleep for some time and retry + time.sleep(min(0.05, now - start)) + now = time.monotonic() + except LoggingSubprocess._FindSignalTargetError as e: + self._logger.warning( + f"Unable to determine signal target: {e}", + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + + return None + + def _find_linux_signal_target(self, *, sudo_pid: int) -> Optional[_PosixSignalTarget]: + # Look for the child process of sudo using procfs. See + # https://docs.kernel.org/filesystems/proc.html#proc-pid-task-tid-children-information-about-task-children + children_pids: set[int] = set() + for task_children_path in glob.glob(f"/proc/{sudo_pid}/task/**/children"): + with open(task_children_path, "r") as f: + children_pids.update(int(pid_str) for pid_str in f.read().split()) + + # If we found exactly one child, we return it + if len(children_pids) == 1: + child_pid = children_pids.pop() + self._logger.info( + f"Command process (sudo child) PID is {child_pid}", + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + return LoggingSubprocess._PosixSignalTarget( + pid=child_pid, + pgid=os.getpgid(child_pid), + ) + # If we found multiple child processes, this violates our assumptions about how sudo + # works. We will fall-back to using pkill for signalling the process + elif len(children_pids) > 1: + raise LoggingSubprocess._FindSignalTargetError( + f"Expected single child processes of sudo, but found {children_pids}" + ) + return None + + def _find_nonlinux_signal_target(self, *, sudo_pid: int) -> Optional[_PosixSignalTarget]: + pgrep_result = run( + ["pgrep", "-P", str(sudo_pid)], + stdout=PIPE, + stderr=STDOUT, + stdin=DEVNULL, + text=True, + ) + if pgrep_result.returncode != 0: + raise LoggingSubprocess._FindSignalTargetError( + "Unable to query child processes of sudo process" + ) + results = pgrep_result.stdout.splitlines() + if len(results) > 1: + raise LoggingSubprocess._FindSignalTargetError( + f"Expected a single child process of sudo, but found {results}" + ) + elif len(results) == 0: + return None + sudo_subproc_pid = int(results[0]) + return LoggingSubprocess._PosixSignalTarget( + pid=sudo_subproc_pid, + pgid=os.getpgid(sudo_subproc_pid), + ) + def notify(self) -> None: """The 'Notify' part of Open Job Description's subprocess cancelation method. On Linux/macOS: @@ -462,7 +596,7 @@ def _posix_signal_subprocess( else: raise NotImplementedError(f"Unsupported signal: {signal_name}") - cmd = list[str]() + kill_cmd = list[str]() pgid = os.getpgid(process.pid) sudo_subproc_pid: Optional[int] = None @@ -470,27 +604,18 @@ def _posix_signal_subprocess( user = cast(PosixSessionUser, self._user) # Only sudo if the user to run as is not the same as the current user. if not user.is_process_user(): - cmd.extend(["sudo", "-u", user.user, "-i"]) - pgrep_result = run( - ["pgrep", "-P", str(process.pid)], - stdout=PIPE, - stderr=STDOUT, - stdin=DEVNULL, - text=True, - ) - if pgrep_result.returncode != 0: - self._logger.warning( - f"Failed to send signal '{signal_name}' to subprocess {process.pid}: Unable to query child processes of sudo process" - ) - return - results = pgrep_result.stdout.splitlines() - if len(results) != 1: - self._logger.warning( - f"Failed to send signal '{signal_name}' to subprocess {process.pid}: More than one child proccess of sudo process" - ) - return - sudo_subproc_pid = int(results[0]) - pgid = os.getpgid(sudo_subproc_pid) + kill_cmd = ["sudo", "-u", user.user, "-i"] + + # If we were unable to detect sudo's child process PID after launching the + # subprocess, we try again now + if not self._posix_signal_target: + self._posix_signal_target = self._find_posix_signal_target() + + if not self._posix_signal_target: + self._logger.warning( + f"Failed to send signal '{signal_name}' to subprocess {process.pid}: Unable to query child processes of sudo process" + ) + return # Try directly signaling the process(es) first try: @@ -525,7 +650,7 @@ def _posix_signal_subprocess( # pstree_result = run(["pstree", "-p"], stdout=PIPE, stderr=STDOUT, stdin=DEVNULL, text=True) # self._logger.info("Pstree output: %s", pstree_result.stdout.read()) - cmd.extend( + kill_cmd.extend( [ "kill", "-s", @@ -535,11 +660,11 @@ def _posix_signal_subprocess( ] ) self._logger.info( - f"INTERRUPT: Running: {shlex.join(cmd)}", + f"INTERRUPT: Running: {shlex.join(kill_cmd)}", extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), ) result = run( - cmd, + kill_cmd, stdout=PIPE, stderr=STDOUT, stdin=DEVNULL, From c9186bbbb2ca45e3b020a923e8f5339ac91e233a Mon Sep 17 00:00:00 2001 From: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> Date: Fri, 15 Nov 2024 05:09:27 +0000 Subject: [PATCH 5/9] fix: wait for sudo to create a process group Signed-off-by: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> --- src/openjd/sessions/_subprocess.py | 244 +++++++++++++++++------------ 1 file changed, 146 insertions(+), 98 deletions(-) diff --git a/src/openjd/sessions/_subprocess.py b/src/openjd/sessions/_subprocess.py index b2a99c84..e77fb207 100644 --- a/src/openjd/sessions/_subprocess.py +++ b/src/openjd/sessions/_subprocess.py @@ -4,24 +4,26 @@ import os import shlex import signal +import sys import time +from contextlib import nullcontext +from datetime import timedelta +from pathlib import Path +from queue import Queue, Empty +from subprocess import DEVNULL, PIPE, STDOUT, Popen, list2cmdline, run +from threading import Event, Thread +from typing import Any +from typing import Callable, Literal, NamedTuple, Optional, Sequence, cast + +from ._linux._capabilities import try_use_cap_kill +from ._logging import LoggerAdapter, LogContent, LogExtraInfo from ._os_checker import is_linux, is_posix, is_windows +from ._session_user import PosixSessionUser, WindowsSessionUser, SessionUser if is_windows(): from subprocess import CREATE_NEW_PROCESS_GROUP, CREATE_NO_WINDOW # type: ignore from ._win32._popen_as_user import PopenWindowsAsUser # type: ignore from ._windows_process_killer import kill_windows_process_tree -from queue import Queue, Empty -from typing import Any -from threading import Event, Thread -from ._logging import LoggerAdapter, LogContent, LogExtraInfo -from subprocess import DEVNULL, PIPE, STDOUT, Popen, list2cmdline, run -from typing import Callable, Literal, NamedTuple, Optional, Sequence, cast -from pathlib import Path -from datetime import timedelta -import sys - -from ._session_user import PosixSessionUser, WindowsSessionUser, SessionUser __all__ = ("LoggingSubprocess",) @@ -48,22 +50,24 @@ STDOUT_END_GRACETIME_SECONDS = 5 -class LoggingSubprocess(object): - """A process whose stdout/stderr lines are sent to a given Logger.""" +class SudoChildProcessIDs(NamedTuple): + """Process IDs for the sudo child process when the process is cross-user on POSIX""" + + pid: int + """The process ID""" - class _PosixSignalTarget(NamedTuple): - """A target for sending signals on POSIX hosts""" + pgid: int + """The process group ID""" - pid: int - """The process ID""" - pgid: int - """The process group ID""" +class FindSignalTargetError(Exception): + """Exception when unable to detect the signal target""" - class _FindSignalTargetError(Exception): - """Exception when unable to detect the signal target""" + pass - pass + +class LoggingSubprocess(object): + """A process whose stdout/stderr lines are sent to a given Logger.""" _logger: LoggerAdapter _process: Optional[Popen] @@ -77,10 +81,9 @@ class _FindSignalTargetError(Exception): _working_dir: Optional[str] _pid: Optional[int] + _sudo_child_process_ids: Optional[SudoChildProcessIDs] _returncode: Optional[int] - _posix_signal_target: Optional[_PosixSignalTarget] - def __init__( self, *, @@ -111,7 +114,7 @@ def __init__( self._has_started = Event() self._pid = None self._returncode = None - self._posix_signal_target = None + self._sudo_child_process_ids = None @property def pid(self) -> Optional[int]: @@ -183,7 +186,7 @@ def run(self) -> None: self._pid = self._process.pid if is_posix(): - self._posix_signal_target = self._find_posix_signal_target() + self._sudo_child_process_ids = self._find_sudo_child_process_ids() self._logger.info( f"Command started as pid: {self._process.pid}", @@ -208,11 +211,11 @@ def run(self) -> None: self._process = None del proc - def _find_posix_signal_target( + def _find_sudo_child_process_ids( self, *, timeout_seconds: float = 1, - ) -> Optional[_PosixSignalTarget]: + ) -> Optional[SudoChildProcessIDs]: process = self._process if process is None: raise ValueError("Process not launched") @@ -227,9 +230,10 @@ def _find_posix_signal_target( # If a user is not specified or is the same as the host process user, we signal the # immediate child if not self._user or self._user.is_process_user(): - return LoggingSubprocess._PosixSignalTarget( + pgid = os.getpgid(process_pid) + return SudoChildProcessIDs( pid=process_pid, - pgid=os.getpgid(process_pid), + pgid=pgid, ) # For cross-user support, we use sudo which creates an intermediate process: @@ -245,60 +249,90 @@ def _find_posix_signal_target( # send SIGKILL signals to the subprocess of sudo start = time.monotonic() now = start + sudo_pgid = os.getpgid(process_pid) # Repeatedly scan for child processes # # This is put in a retry loop, because it takes a non-zero amount of time before sudo and # the kernel finish creating the subprocess. We cap this because the process may exit # quickly and we may never find the child process. + sudo_child_pid: Optional[int] = None + sudo_child_pgid: Optional[int] = None try: while now - start < timeout_seconds: - if is_linux(): - if signal_target := self._find_linux_signal_target(sudo_pid=process_pid): - return signal_target - else: - if signal_target := self._find_nonlinux_signal_target(sudo_pid=process_pid): - return signal_target + if not sudo_child_pid: + if is_linux(): + sudo_child_pid = self._find_sudo_child_process_id_procfs( + sudo_pid=process_pid + ) + else: + sudo_child_pid = self._find_child_process_id_pgrep(sudo_pid=process_pid) + + if sudo_child_pid: + try: + sudo_child_pgid = os.getpgid(sudo_child_pid) + except ProcessLookupError: + # If the process has exited, we short-circuit + return None + # sudo first forks, then creates a new process group. There is a race condition + # where the process group ID we observe has not yet changed. If the PGID detected + # matches the PGID of sudo, then we retry again in the loop + if sudo_child_pgid == sudo_pgid: + sudo_child_pgid = None + else: + break # If we did not find any child processes yet, sleep for some time and retry time.sleep(min(0.05, now - start)) now = time.monotonic() - except LoggingSubprocess._FindSignalTargetError as e: + if not sudo_child_pid or not sudo_child_pgid: + raise FindSignalTargetError("unable to detect subprocess before timeout") + except FindSignalTargetError as e: self._logger.warning( f"Unable to determine signal target: {e}", extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), ) - return None + if sudo_child_pid and sudo_child_pgid: + self._logger.debug( + f"Signal target = PID {sudo_child_pid} / PGID {sudo_child_pgid}", + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + return SudoChildProcessIDs( + pid=sudo_child_pid, + pgid=sudo_child_pgid, + ) + else: + return None - def _find_linux_signal_target(self, *, sudo_pid: int) -> Optional[_PosixSignalTarget]: + def _find_sudo_child_process_id_procfs(self, *, sudo_pid: int) -> Optional[int]: # Look for the child process of sudo using procfs. See # https://docs.kernel.org/filesystems/proc.html#proc-pid-task-tid-children-information-about-task-children - children_pids: set[int] = set() + + child_pids: set[int] = set() for task_children_path in glob.glob(f"/proc/{sudo_pid}/task/**/children"): with open(task_children_path, "r") as f: - children_pids.update(int(pid_str) for pid_str in f.read().split()) + child_pids.update(int(pid_str) for pid_str in f.read().split()) # If we found exactly one child, we return it - if len(children_pids) == 1: - child_pid = children_pids.pop() - self._logger.info( - f"Command process (sudo child) PID is {child_pid}", + if len(child_pids) == 1: + + child_pid = child_pids.pop() + + self._logger.debug( + f"Session action process (sudo child) PID is {child_pid}", extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), ) - return LoggingSubprocess._PosixSignalTarget( - pid=child_pid, - pgid=os.getpgid(child_pid), - ) + return child_pid # If we found multiple child processes, this violates our assumptions about how sudo # works. We will fall-back to using pkill for signalling the process - elif len(children_pids) > 1: - raise LoggingSubprocess._FindSignalTargetError( - f"Expected single child processes of sudo, but found {children_pids}" + elif len(child_pids) > 1: + raise FindSignalTargetError( + f"Expected single child processes of sudo, but found {child_pids}" ) return None - def _find_nonlinux_signal_target(self, *, sudo_pid: int) -> Optional[_PosixSignalTarget]: + def _find_child_process_id_pgrep(self, *, sudo_pid: int) -> Optional[int]: pgrep_result = run( ["pgrep", "-P", str(sudo_pid)], stdout=PIPE, @@ -307,21 +341,16 @@ def _find_nonlinux_signal_target(self, *, sudo_pid: int) -> Optional[_PosixSigna text=True, ) if pgrep_result.returncode != 0: - raise LoggingSubprocess._FindSignalTargetError( - "Unable to query child processes of sudo process" - ) + raise FindSignalTargetError("Unable to query child processes of sudo process") results = pgrep_result.stdout.splitlines() if len(results) > 1: - raise LoggingSubprocess._FindSignalTargetError( + raise FindSignalTargetError( f"Expected a single child process of sudo, but found {results}" ) elif len(results) == 0: return None sudo_subproc_pid = int(results[0]) - return LoggingSubprocess._PosixSignalTarget( - pid=sudo_subproc_pid, - pgid=os.getpgid(sudo_subproc_pid), - ) + return sudo_subproc_pid def notify(self) -> None: """The 'Notify' part of Open Job Description's subprocess cancelation method. @@ -562,9 +591,8 @@ def _posix_signal_subprocess( self, signal_name: Literal["term", "kill"], ) -> None: - """Send a given named signal, via pkill, to the subprocess when it is running - as a different user than this process. - """ + """Send a given named signal to the subprocess.""" + # We can run into a race condition where the process exits (and another thread sets self._process to None) # before the cancellation happens, so we swap to a local variable to ensure a cancellation that is not needed, # does not raise an exception here. @@ -591,14 +619,28 @@ def _posix_signal_subprocess( numeric_signal = 0 if signal_name == "term": numeric_signal = signal.SIGTERM + # SIGTERM is the simpler case. In the cross-user sudo case, we can send a signal to the + # sudo process and it will forward the signal. For the same-user case, the subprocess + # is the one we want to signal. + self._logger.info( + f'INTERRUPT: Sending signal "{signal_name}" to process {process.pid}', + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + try: + os.kill(process.pid, numeric_signal) + except OSError: + self._logger.warning( + f"INTERRUPT: Unable to send {signal_name} to {process.pid}", + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + finally: + return elif signal_name == "kill": numeric_signal = signal.SIGKILL else: raise NotImplementedError(f"Unsupported signal: {signal_name}") kill_cmd = list[str]() - pgid = os.getpgid(process.pid) - sudo_subproc_pid: Optional[int] = None if self._user is not None: user = cast(PosixSessionUser, self._user) @@ -608,47 +650,40 @@ def _posix_signal_subprocess( # If we were unable to detect sudo's child process PID after launching the # subprocess, we try again now - if not self._posix_signal_target: - self._posix_signal_target = self._find_posix_signal_target() + if not self._sudo_child_process_ids: + self._sudo_child_process_ids = self._find_sudo_child_process_ids() - if not self._posix_signal_target: + if not self._sudo_child_process_ids: self._logger.warning( - f"Failed to send signal '{signal_name}' to subprocess {process.pid}: Unable to query child processes of sudo process" + f"Failed to send signal '{signal_name}': Unable to determine child process of sudo" ) return # Try directly signaling the process(es) first - try: - if signal_name == "kill": - self._logger.info( - f'INTERRUPT: Sending signal "{signal_name}" to process group {pgid}', - extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), - ) - os.killpg(pgid, numeric_signal) + ctx_mgr = try_use_cap_kill() if is_linux() else nullcontext(enter_result=False) + with ctx_mgr as has_cap_kill: + if has_cap_kill or not self._user or self._user.is_process_user(): + try: + self._logger.info( + f'INTERRUPT: Sending signal "{signal_name}" to process group {self._sudo_child_process_ids.pgid}', + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + os.killpg(self._sudo_child_process_ids.pgid, numeric_signal) + except OSError: + self._logger.info( + "Could not directly send signal {signal_name} to {self._posix_signal_target.pid}, trying sudo.", + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + else: + return else: self._logger.info( - f'INTERRUPT: Sending signal "{signal_name}" to process {process.pid}', + "Could not directly send signal {signal_name} to {process.pid}, trying sudo.", extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), ) - os.kill(sudo_subproc_pid or process.pid, numeric_signal) - except OSError: - self._logger.info( - "Could not directly send signal {signal_name} to {process.pid}, trying sudo.", - extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), - ) - else: - return - signal_pid_str: str - if signal_name == "kill": - signal_pid_str = f"-{pgid}" - elif sudo_subproc_pid: - signal_pid_str = str(sudo_subproc_pid) - else: - signal_pid_str = str(process.pid) - - # pstree_result = run(["pstree", "-p"], stdout=PIPE, stderr=STDOUT, stdin=DEVNULL, text=True) - # self._logger.info("Pstree output: %s", pstree_result.stdout.read()) + # Uncomment to visualize process tree when debugging tests + # self._log_process_tree() kill_cmd.extend( [ @@ -656,7 +691,7 @@ def _posix_signal_subprocess( "-s", signal_name, "--", - signal_pid_str, + f"-{self._sudo_child_process_ids.pgid}", ] ) self._logger.info( @@ -671,13 +706,26 @@ def _posix_signal_subprocess( ) if result.returncode != 0: self._logger.warning( - f"Failed to send signal '{signal_name}' to subprocess {signal_pid_str}: %s", + f"Failed to send signal '{signal_name}' to PGID {self._sudo_child_process_ids.pgid}: %s", result.stdout.decode("utf-8"), extra=LogExtraInfo( openjd_log_content=LogContent.PROCESS_CONTROL | LogContent.EXCEPTION_INFO ), ) + def _log_process_tree(self) -> None: + """A developer method to visualize the process tree including PIDs and PGIDs when debuging tests""" + pstree_result = run(["pstree", "-pg"], stdout=PIPE, stderr=STDOUT, stdin=DEVNULL, text=True) + self._logger.debug( + f"pstree -pg output: {pstree_result.stdout}", + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + ps_result = run(["ps", "-ejH"], stdout=PIPE, stderr=STDOUT, stdin=DEVNULL, text=True) + self._logger.debug( + f"ps -ejH output:\n{ps_result.stdout}", + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + def _windows_notify_subprocess(self) -> None: """Sends a CTRL_BREAK_EVENT signal to the subprocess""" # Convince the type checker that accessing _process is okay From 621232c591f258d862ad0930b6b2d11fd30ca82e Mon Sep 17 00:00:00 2001 From: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> Date: Fri, 15 Nov 2024 05:46:24 +0000 Subject: [PATCH 6/9] chore: fix linting errors on Windows and CI checks Signed-off-by: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> --- pyproject.toml | 4 +++- src/openjd/sessions/_subprocess.py | 10 ++++++++-- .../localuser_sudo_environment/Dockerfile | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a930b8b2..a750e944 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -172,11 +172,13 @@ fail_under = 79 # https://github.com/wemake-services/coverage-conditional-plugin [tool.coverage.coverage_conditional_plugin.omit] "sys_platform != 'win32'" = [ - "src/openjd/sessions/_linux/*.py", "src/openjd/sessions/_win32/*.py", "src/openjd/sessions/_scripts/_windows/*.py", "src/openjd/sessions/_windows*.py" ] +"sys_platform != 'linux'" = [ + "src/openjd/sessions/_linux/*.py", +] [tool.coverage.coverage_conditional_plugin.rules] # This cannot be empty otherwise coverage-conditional-plugin crashes with: diff --git a/src/openjd/sessions/_subprocess.py b/src/openjd/sessions/_subprocess.py index e77fb207..cc70fb98 100644 --- a/src/openjd/sessions/_subprocess.py +++ b/src/openjd/sessions/_subprocess.py @@ -216,6 +216,9 @@ def _find_sudo_child_process_ids( *, timeout_seconds: float = 1, ) -> Optional[SudoChildProcessIDs]: + # Hint to mypy to not raise module attribute errors (e.g. missing os.getpgid) + if sys.platform == "win32": + raise NotImplementedError("This method is for POSIX hosts only") process = self._process if process is None: raise ValueError("Process not launched") @@ -593,6 +596,10 @@ def _posix_signal_subprocess( ) -> None: """Send a given named signal to the subprocess.""" + # Hint to mypy to not raise module attribute errors (e.g. missing os.getpgid) + if sys.platform == "win32": + raise NotImplementedError("This method is for POSIX hosts only") + # We can run into a race condition where the process exits (and another thread sets self._process to None) # before the cancellation happens, so we swap to a local variable to ensure a cancellation that is not needed, # does not raise an exception here. @@ -633,8 +640,7 @@ def _posix_signal_subprocess( f"INTERRUPT: Unable to send {signal_name} to {process.pid}", extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), ) - finally: - return + return elif signal_name == "kill": numeric_signal = signal.SIGKILL else: diff --git a/testing_containers/localuser_sudo_environment/Dockerfile b/testing_containers/localuser_sudo_environment/Dockerfile index d02f35e0..89d4e1be 100644 --- a/testing_containers/localuser_sudo_environment/Dockerfile +++ b/testing_containers/localuser_sudo_environment/Dockerfile @@ -20,7 +20,7 @@ ENV PIP_INDEX_URL=$PIP_INDEX_URL # hostuser: hostuser, sharedgroup # targetuser: targetuser, sharedgroup # disjointuser: disjointuser, disjointgroup -RUN apt-get update && apt-get install -y sudo psmisc libcap2-bin libcap2 libcap-dev gcc && \ +RUN apt-get update && apt-get install -y gcc libcap2-bin libcap2 libcap-dev psmisc sudo && \ # Clean up apt cache rm -rf /var/lib/apt/lists/* && \ apt-get clean && \ From d098f25433ddc535c269c4e43e67a3d7e9deb20d Mon Sep 17 00:00:00 2001 From: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> Date: Mon, 18 Nov 2024 16:22:13 +0000 Subject: [PATCH 7/9] chore: move sudo child process code to Linux dir for conditional coverage Signed-off-by: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> --- src/openjd/sessions/_linux/_sudo.py | 154 +++++++++++++++++++++ src/openjd/sessions/_subprocess.py | 199 ++++------------------------ 2 files changed, 179 insertions(+), 174 deletions(-) create mode 100644 src/openjd/sessions/_linux/_sudo.py diff --git a/src/openjd/sessions/_linux/_sudo.py b/src/openjd/sessions/_linux/_sudo.py new file mode 100644 index 00000000..40a57843 --- /dev/null +++ b/src/openjd/sessions/_linux/_sudo.py @@ -0,0 +1,154 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + +import glob +import os +import sys +import time +from subprocess import Popen, DEVNULL, PIPE, STDOUT, run +from typing import Optional + +from .._logging import LoggerAdapter, LogContent, LogExtraInfo +from .._os_checker import is_posix, is_linux + + +class FindSignalTargetError(Exception): + """Exception when unable to detect the signal target""" + + pass + + +def find_sudo_child_process_group_id( + *, + logger: LoggerAdapter, + sudo_process: Popen, + timeout_seconds: float = 1, +) -> Optional[int]: + # Hint to mypy to not raise module attribute errors (e.g. missing os.getpgid) + if sys.platform == "win32": + raise NotImplementedError("This method is for POSIX hosts only") + if not is_posix(): + raise NotImplementedError(f"Only POSIX supported, but running on {sys.platform}") + if timeout_seconds <= 0: + raise ValueError(f"Expected positive value for timeout_seconds but got {timeout_seconds}") + + # For cross-user support, we use sudo which creates an intermediate process: + # + # openjd-process + # | + # +-- sudo + # | + # +-- subprocess + # + # Sudo forwards signals that it is able to handle, but in the case of SIGKILL sudo cannot + # handle the signal and the kernel will kill it leaving the child orphaned. We need to + # send SIGKILL signals to the subprocess of sudo + start = time.monotonic() + now = start + sudo_pgid = os.getpgid(sudo_process.pid) + + # Repeatedly scan for child processes + # + # This is put in a retry loop, because it takes a non-zero amount of time before sudo and + # the kernel finish creating the subprocess. We cap this because the process may exit + # quickly and we may never find the child process. + sudo_child_pid: Optional[int] = None + sudo_child_pgid: Optional[int] = None + try: + while now - start < timeout_seconds: + if not sudo_child_pid: + if is_linux(): + sudo_child_pid = find_sudo_child_process_id_procfs( + sudo_pid=sudo_process.pid, + logger=logger, + ) + else: + sudo_child_pid = find_child_process_id_pgrep( + sudo_pid=sudo_process.pid, + ) + + if sudo_child_pid: + try: + sudo_child_pgid = os.getpgid(sudo_child_pid) + except ProcessLookupError: + # If the process has exited, we short-circuit + return None + # sudo first forks, then creates a new process group. There is a race condition + # where the process group ID we observe has not yet changed. If the PGID detected + # matches the PGID of sudo, then we retry again in the loop + if sudo_child_pgid == sudo_pgid: + sudo_child_pgid = None + else: + break + + # If we did not find any child processes yet, sleep for some time and retry + time.sleep(min(0.05, now - start)) + now = time.monotonic() + if not sudo_child_pid or not sudo_child_pgid: + raise FindSignalTargetError("unable to detect subprocess before timeout") + except FindSignalTargetError as e: + logger.warning( + f"Unable to determine signal target: {e}", + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + + if sudo_child_pgid: + logger.debug( + f"Signal target PGID = {sudo_child_pgid}", + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + + return sudo_child_pgid + + +def find_sudo_child_process_id_procfs( + *, + logger: LoggerAdapter, + sudo_pid: int, +) -> Optional[int]: + # Look for the child process of sudo using procfs. See + # https://docs.kernel.org/filesystems/proc.html#proc-pid-task-tid-children-information-about-task-children + + child_pids: set[int] = set() + for task_children_path in glob.glob(f"/proc/{sudo_pid}/task/**/children"): + with open(task_children_path, "r") as f: + child_pids.update(int(pid_str) for pid_str in f.read().split()) + + # If we found exactly one child, we return it + if len(child_pids) == 1: + + child_pid = child_pids.pop() + + logger.debug( + f"Session action process (sudo child) PID is {child_pid}", + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), + ) + return child_pid + # If we found multiple child processes, this violates our assumptions about how sudo + # works. We will fall-back to using pkill for signalling the process + elif len(child_pids) > 1: + raise FindSignalTargetError( + f"Expected single child processes of sudo, but found {child_pids}" + ) + return None + + +def find_child_process_id_pgrep( + *, + sudo_pid: int, +) -> Optional[int]: + pgrep_result = run( + ["pgrep", "-P", str(sudo_pid)], + stdout=PIPE, + stderr=STDOUT, + stdin=DEVNULL, + text=True, + ) + if pgrep_result.returncode != 0: + raise FindSignalTargetError("Unable to query child processes of sudo process") + results = pgrep_result.stdout.splitlines() + if len(results) > 1: + raise FindSignalTargetError(f"Expected a single child process of sudo, but found {results}") + elif len(results) == 0: + return None + sudo_subproc_pid = int(results[0]) + return sudo_subproc_pid diff --git a/src/openjd/sessions/_subprocess.py b/src/openjd/sessions/_subprocess.py index cc70fb98..d36716cd 100644 --- a/src/openjd/sessions/_subprocess.py +++ b/src/openjd/sessions/_subprocess.py @@ -1,6 +1,5 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -import glob import os import shlex import signal @@ -13,14 +12,15 @@ from subprocess import DEVNULL, PIPE, STDOUT, Popen, list2cmdline, run from threading import Event, Thread from typing import Any -from typing import Callable, Literal, NamedTuple, Optional, Sequence, cast +from typing import Callable, Literal, Optional, Sequence, cast from ._linux._capabilities import try_use_cap_kill +from ._linux._sudo import find_sudo_child_process_group_id from ._logging import LoggerAdapter, LogContent, LogExtraInfo from ._os_checker import is_linux, is_posix, is_windows from ._session_user import PosixSessionUser, WindowsSessionUser, SessionUser -if is_windows(): +if is_windows(): # pragma: nocover from subprocess import CREATE_NEW_PROCESS_GROUP, CREATE_NO_WINDOW # type: ignore from ._win32._popen_as_user import PopenWindowsAsUser # type: ignore from ._windows_process_killer import kill_windows_process_tree @@ -50,22 +50,6 @@ STDOUT_END_GRACETIME_SECONDS = 5 -class SudoChildProcessIDs(NamedTuple): - """Process IDs for the sudo child process when the process is cross-user on POSIX""" - - pid: int - """The process ID""" - - pgid: int - """The process group ID""" - - -class FindSignalTargetError(Exception): - """Exception when unable to detect the signal target""" - - pass - - class LoggingSubprocess(object): """A process whose stdout/stderr lines are sent to a given Logger.""" @@ -81,7 +65,7 @@ class LoggingSubprocess(object): _working_dir: Optional[str] _pid: Optional[int] - _sudo_child_process_ids: Optional[SudoChildProcessIDs] + _sudo_child_process_group_id: Optional[int] _returncode: Optional[int] def __init__( @@ -114,7 +98,7 @@ def __init__( self._has_started = Event() self._pid = None self._returncode = None - self._sudo_child_process_ids = None + self._sudo_child_process_group_id = None @property def pid(self) -> Optional[int]: @@ -185,8 +169,16 @@ def run(self) -> None: self._pid = self._process.pid - if is_posix(): - self._sudo_child_process_ids = self._find_sudo_child_process_ids() + # Would use is_posix(), but it doesn't short-circuit mypy which then complains + # about os.getpgid not being a valid attribute. + if not sys.platform == "win32": + if not self._user or self._user.is_process_user(): + self._sudo_child_process_group_id = os.getpgid(self._process.pid) + else: + self._sudo_child_process_group_id = find_sudo_child_process_group_id( + logger=self._logger, + sudo_process=self._process, + ) self._logger.info( f"Command started as pid: {self._process.pid}", @@ -211,150 +203,6 @@ def run(self) -> None: self._process = None del proc - def _find_sudo_child_process_ids( - self, - *, - timeout_seconds: float = 1, - ) -> Optional[SudoChildProcessIDs]: - # Hint to mypy to not raise module attribute errors (e.g. missing os.getpgid) - if sys.platform == "win32": - raise NotImplementedError("This method is for POSIX hosts only") - process = self._process - if process is None: - raise ValueError("Process not launched") - process_pid = process.pid - if not is_posix(): - raise NotImplementedError(f"Only POSIX supported, but running on {sys.platform}") - if timeout_seconds <= 0: - raise ValueError( - f"Expected positive value for timeout_seconds but got {timeout_seconds}" - ) - - # If a user is not specified or is the same as the host process user, we signal the - # immediate child - if not self._user or self._user.is_process_user(): - pgid = os.getpgid(process_pid) - return SudoChildProcessIDs( - pid=process_pid, - pgid=pgid, - ) - - # For cross-user support, we use sudo which creates an intermediate process: - # - # openjd-process - # | - # +-- sudo - # | - # +-- subprocess - # - # Sudo forwards signals that it is able to handle, but in the case of SIGKILL sudo cannot - # handle the signal and the kernel will kill it leaving the child orphaned. We need to - # send SIGKILL signals to the subprocess of sudo - start = time.monotonic() - now = start - sudo_pgid = os.getpgid(process_pid) - - # Repeatedly scan for child processes - # - # This is put in a retry loop, because it takes a non-zero amount of time before sudo and - # the kernel finish creating the subprocess. We cap this because the process may exit - # quickly and we may never find the child process. - sudo_child_pid: Optional[int] = None - sudo_child_pgid: Optional[int] = None - try: - while now - start < timeout_seconds: - if not sudo_child_pid: - if is_linux(): - sudo_child_pid = self._find_sudo_child_process_id_procfs( - sudo_pid=process_pid - ) - else: - sudo_child_pid = self._find_child_process_id_pgrep(sudo_pid=process_pid) - - if sudo_child_pid: - try: - sudo_child_pgid = os.getpgid(sudo_child_pid) - except ProcessLookupError: - # If the process has exited, we short-circuit - return None - # sudo first forks, then creates a new process group. There is a race condition - # where the process group ID we observe has not yet changed. If the PGID detected - # matches the PGID of sudo, then we retry again in the loop - if sudo_child_pgid == sudo_pgid: - sudo_child_pgid = None - else: - break - - # If we did not find any child processes yet, sleep for some time and retry - time.sleep(min(0.05, now - start)) - now = time.monotonic() - if not sudo_child_pid or not sudo_child_pgid: - raise FindSignalTargetError("unable to detect subprocess before timeout") - except FindSignalTargetError as e: - self._logger.warning( - f"Unable to determine signal target: {e}", - extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), - ) - - if sudo_child_pid and sudo_child_pgid: - self._logger.debug( - f"Signal target = PID {sudo_child_pid} / PGID {sudo_child_pgid}", - extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), - ) - return SudoChildProcessIDs( - pid=sudo_child_pid, - pgid=sudo_child_pgid, - ) - else: - return None - - def _find_sudo_child_process_id_procfs(self, *, sudo_pid: int) -> Optional[int]: - # Look for the child process of sudo using procfs. See - # https://docs.kernel.org/filesystems/proc.html#proc-pid-task-tid-children-information-about-task-children - - child_pids: set[int] = set() - for task_children_path in glob.glob(f"/proc/{sudo_pid}/task/**/children"): - with open(task_children_path, "r") as f: - child_pids.update(int(pid_str) for pid_str in f.read().split()) - - # If we found exactly one child, we return it - if len(child_pids) == 1: - - child_pid = child_pids.pop() - - self._logger.debug( - f"Session action process (sudo child) PID is {child_pid}", - extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), - ) - return child_pid - # If we found multiple child processes, this violates our assumptions about how sudo - # works. We will fall-back to using pkill for signalling the process - elif len(child_pids) > 1: - raise FindSignalTargetError( - f"Expected single child processes of sudo, but found {child_pids}" - ) - return None - - def _find_child_process_id_pgrep(self, *, sudo_pid: int) -> Optional[int]: - pgrep_result = run( - ["pgrep", "-P", str(sudo_pid)], - stdout=PIPE, - stderr=STDOUT, - stdin=DEVNULL, - text=True, - ) - if pgrep_result.returncode != 0: - raise FindSignalTargetError("Unable to query child processes of sudo process") - results = pgrep_result.stdout.splitlines() - if len(results) > 1: - raise FindSignalTargetError( - f"Expected a single child process of sudo, but found {results}" - ) - elif len(results) == 0: - return None - sudo_subproc_pid = int(results[0]) - return sudo_subproc_pid - def notify(self) -> None: """The 'Notify' part of Open Job Description's subprocess cancelation method. On Linux/macOS: @@ -656,10 +504,13 @@ def _posix_signal_subprocess( # If we were unable to detect sudo's child process PID after launching the # subprocess, we try again now - if not self._sudo_child_process_ids: - self._sudo_child_process_ids = self._find_sudo_child_process_ids() + if not self._sudo_child_process_group_id: + self._sudo_child_process_group_id = find_sudo_child_process_group_id( + logger=self._logger, + sudo_process=process, + ) - if not self._sudo_child_process_ids: + if not self._sudo_child_process_group_id: self._logger.warning( f"Failed to send signal '{signal_name}': Unable to determine child process of sudo" ) @@ -671,10 +522,10 @@ def _posix_signal_subprocess( if has_cap_kill or not self._user or self._user.is_process_user(): try: self._logger.info( - f'INTERRUPT: Sending signal "{signal_name}" to process group {self._sudo_child_process_ids.pgid}', + f'INTERRUPT: Sending signal "{signal_name}" to process group {self._sudo_child_process_group_id}', extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), ) - os.killpg(self._sudo_child_process_ids.pgid, numeric_signal) + os.killpg(self._sudo_child_process_group_id, numeric_signal) except OSError: self._logger.info( "Could not directly send signal {signal_name} to {self._posix_signal_target.pid}, trying sudo.", @@ -697,7 +548,7 @@ def _posix_signal_subprocess( "-s", signal_name, "--", - f"-{self._sudo_child_process_ids.pgid}", + f"-{self._sudo_child_process_group_id}", ] ) self._logger.info( @@ -712,7 +563,7 @@ def _posix_signal_subprocess( ) if result.returncode != 0: self._logger.warning( - f"Failed to send signal '{signal_name}' to PGID {self._sudo_child_process_ids.pgid}: %s", + f"Failed to send signal '{signal_name}' to PGID {self._sudo_child_process_group_id}: %s", result.stdout.decode("utf-8"), extra=LogExtraInfo( openjd_log_content=LogContent.PROCESS_CONTROL | LogContent.EXCEPTION_INFO From 1f6ba8bae8b94fdcc0ac08a0de01691c9bf66f5b Mon Sep 17 00:00:00 2001 From: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> Date: Mon, 18 Nov 2024 23:48:58 +0000 Subject: [PATCH 8/9] feat: error handling when libcap not found Signed-off-by: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> --- src/openjd/sessions/_linux/_capabilities.py | 19 ++++++++++++++++--- test/openjd/sessions/test_runner_base.py | 1 + .../localuser_sudo_environment/Dockerfile | 2 +- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/openjd/sessions/_linux/_capabilities.py b/src/openjd/sessions/_linux/_capabilities.py index 890217e3..f702ecd0 100644 --- a/src/openjd/sessions/_linux/_capabilities.py +++ b/src/openjd/sessions/_linux/_capabilities.py @@ -14,7 +14,7 @@ from ctypes.util import find_library from enum import Enum from functools import cache -from typing import Any, Generator, Tuple, TYPE_CHECKING +from typing import Any, Generator, Optional, Tuple, TYPE_CHECKING from .._logging import LOG @@ -109,11 +109,18 @@ def _cap_get_flag_errcheck( @cache -def _get_libcap() -> ctypes.CDLL: +def _get_libcap() -> Optional[ctypes.CDLL]: if not sys.platform.startswith("linux"): raise OSError(f"libcap is only available on Linux, but found platform: {sys.platform}") - libcap = ctypes.CDLL(find_library("cap"), use_errno=True) + libcap_path = find_library("cap") + if libcap_path is None: + LOG.info( + "Unable to locate libcap. Session action cancelation signals will be sent using sudo" + ) + return None + + libcap = ctypes.CDLL(libcap_path, use_errno=True) # https://man7.org/linux/man-pages/man3/cap_set_proc.3.html libcap.cap_set_proc.restype = ctypes.c_int @@ -157,6 +164,7 @@ def _has_capability( capability_set_type: CapabilitySetType, ) -> bool: libcap = _get_libcap() + assert libcap is not None flag_value = cap_flag_value_t() libcap.cap_get_flag(caps, capability, capability_set_type.value, ctypes.byref(flag_value)) return flag_value.value == CAP_SET @@ -184,6 +192,11 @@ def try_use_cap_kill() -> Generator[bool, None, None]: raise OSError(f"Only Linux is supported, but platform is {sys.platform}") libcap = _get_libcap() + # If libcap is not found, we yield False indicating we are not aware of having CAP_KILL + if not libcap: + yield False + return + caps = libcap.cap_get_proc() if _has_capability( diff --git a/test/openjd/sessions/test_runner_base.py b/test/openjd/sessions/test_runner_base.py index c2a789f7..8ce21158 100644 --- a/test/openjd/sessions/test_runner_base.py +++ b/test/openjd/sessions/test_runner_base.py @@ -817,6 +817,7 @@ def test_cancel_notify_direct_signal_with_cap_kill( # Record whether CAP_KILL is in the effective capability set before # notifying the subprocess libcap = _get_libcap() + assert libcap is not None, "Libcap not found" caps = libcap.cap_get_proc() cap_kill_was_effective = _has_capability( caps=caps, diff --git a/testing_containers/localuser_sudo_environment/Dockerfile b/testing_containers/localuser_sudo_environment/Dockerfile index 89d4e1be..140f8da2 100644 --- a/testing_containers/localuser_sudo_environment/Dockerfile +++ b/testing_containers/localuser_sudo_environment/Dockerfile @@ -20,7 +20,7 @@ ENV PIP_INDEX_URL=$PIP_INDEX_URL # hostuser: hostuser, sharedgroup # targetuser: targetuser, sharedgroup # disjointuser: disjointuser, disjointgroup -RUN apt-get update && apt-get install -y gcc libcap2-bin libcap2 libcap-dev psmisc sudo && \ +RUN apt-get update && apt-get install -y gcc libcap2-bin psmisc sudo && \ # Clean up apt cache rm -rf /var/lib/apt/lists/* && \ apt-get clean && \ From 00c40ffc3267cc55a870bee2718d36b5c938bb99 Mon Sep 17 00:00:00 2001 From: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> Date: Tue, 19 Nov 2024 00:33:07 +0000 Subject: [PATCH 9/9] chore: apply code review feedback Signed-off-by: Josh Usiskin <56369778+jusiskin@users.noreply.github.com> --- src/openjd/sessions/_linux/_capabilities.py | 11 +++++++---- src/openjd/sessions/_linux/_sudo.py | 2 +- src/openjd/sessions/_subprocess.py | 3 ++- test/openjd/sessions/test_runner_base.py | 2 ++ 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/openjd/sessions/_linux/_capabilities.py b/src/openjd/sessions/_linux/_capabilities.py index f702ecd0..0a7983a1 100644 --- a/src/openjd/sessions/_linux/_capabilities.py +++ b/src/openjd/sessions/_linux/_capabilities.py @@ -125,7 +125,7 @@ def _get_libcap() -> Optional[ctypes.CDLL]: # https://man7.org/linux/man-pages/man3/cap_set_proc.3.html libcap.cap_set_proc.restype = ctypes.c_int libcap.cap_set_proc.argtypes = [ - ctypes.POINTER(Cap), + cap_t, ] libcap.cap_set_proc.errcheck = _cap_set_err_check # type: ignore @@ -159,12 +159,11 @@ def _get_libcap() -> Optional[ctypes.CDLL]: def _has_capability( *, + libcap: ctypes.CDLL, caps: cap_t, capability: int, capability_set_type: CapabilitySetType, ) -> bool: - libcap = _get_libcap() - assert libcap is not None flag_value = cap_flag_value_t() libcap.cap_get_flag(caps, capability, capability_set_type.value, ctypes.byref(flag_value)) return flag_value.value == CAP_SET @@ -200,12 +199,16 @@ def try_use_cap_kill() -> Generator[bool, None, None]: caps = libcap.cap_get_proc() if _has_capability( - caps=caps, capability=CAP_KILL, capability_set_type=CapabilitySetType.EFFECTIVE + libcap=libcap, + caps=caps, + capability=CAP_KILL, + capability_set_type=CapabilitySetType.EFFECTIVE, ): LOG.debug("CAP_KILL is in the thread's effective set") # CAP_KILL is already in the effective set yield True elif _has_capability( + libcap=libcap, caps=caps, capability=CAP_KILL, capability_set_type=CapabilitySetType.PERMITTED, diff --git a/src/openjd/sessions/_linux/_sudo.py b/src/openjd/sessions/_linux/_sudo.py index 40a57843..43a1fa8c 100644 --- a/src/openjd/sessions/_linux/_sudo.py +++ b/src/openjd/sessions/_linux/_sudo.py @@ -81,7 +81,7 @@ def find_sudo_child_process_group_id( break # If we did not find any child processes yet, sleep for some time and retry - time.sleep(min(0.05, now - start)) + time.sleep(min(0.05, timeout_seconds - (now - start))) now = time.monotonic() if not sudo_child_pid or not sudo_child_pgid: raise FindSignalTargetError("unable to detect subprocess before timeout") diff --git a/src/openjd/sessions/_subprocess.py b/src/openjd/sessions/_subprocess.py index d36716cd..ea29011c 100644 --- a/src/openjd/sessions/_subprocess.py +++ b/src/openjd/sessions/_subprocess.py @@ -512,7 +512,8 @@ def _posix_signal_subprocess( if not self._sudo_child_process_group_id: self._logger.warning( - f"Failed to send signal '{signal_name}': Unable to determine child process of sudo" + f"Failed to send signal '{signal_name}': Unable to determine child process of sudo", + extra=LogExtraInfo(openjd_log_content=LogContent.PROCESS_CONTROL), ) return diff --git a/test/openjd/sessions/test_runner_base.py b/test/openjd/sessions/test_runner_base.py index 8ce21158..03037874 100644 --- a/test/openjd/sessions/test_runner_base.py +++ b/test/openjd/sessions/test_runner_base.py @@ -820,6 +820,7 @@ def test_cancel_notify_direct_signal_with_cap_kill( assert libcap is not None, "Libcap not found" caps = libcap.cap_get_proc() cap_kill_was_effective = _has_capability( + libcap=libcap, caps=caps, capability=CAP_KILL, capability_set_type=CapabilitySetType.EFFECTIVE, @@ -871,6 +872,7 @@ def test_cancel_notify_direct_signal_with_cap_kill( # is unchanged after calling Runner.cancel() caps = libcap.cap_get_proc() cap_kill_effective_after_cancel = _has_capability( + libcap=libcap, caps=caps, capability=CAP_KILL, capability_set_type=CapabilitySetType.EFFECTIVE,