diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index 34836cb2f..935273e86 100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -25,7 +25,7 @@ timeout_minutes: 240 # label is defined at jenkins slave configuration, we want to run the job on a gpu agent and be able to esaly replace it without having to change this file runs_on_agents: - {nodeLabel: 'H100'} - # - {nodeLabel: 'DGX'} + - {nodeLabel: 'DGX'} matrix: axes: diff --git a/src/core/meson.build b/src/core/meson.build index 54e6344d2..7fbf8d121 100644 --- a/src/core/meson.build +++ b/src/core/meson.build @@ -54,12 +54,13 @@ if libtransfer_engine.found() and not disable_mooncake_backend and 'Mooncake' in endif nixl_lib = library('nixl', + 'signalhandler.cpp', 'nixl_agent.cpp', 'nixl_plugin_manager.cpp', 'nixl_listener.cpp', 'telemetry.cpp', include_directories: [ nixl_inc_dirs, utils_inc_dirs ], - link_args: ['-lstdc++fs'], + link_args: ['-lstdc++fs', '-lbacktrace'], dependencies: nixl_lib_deps, install: true) diff --git a/src/core/signalhandler.cpp b/src/core/signalhandler.cpp new file mode 100644 index 000000000..a219fda23 --- /dev/null +++ b/src/core/signalhandler.cpp @@ -0,0 +1,89 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <cerrno>
+#include <csignal>
+#include <cstdio>
+#include <cstring>
+#include <execinfo.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#define MAX_BACKTRACE_DEPTH 100
+
+// Writes the calling thread's raw backtrace (at most MAX_BACKTRACE_DEPTH frames) to stderr
+// through backtrace_symbols_fd(), which emits straight to the file descriptor.
+void
+print_backtrace() {
+    void *frames[MAX_BACKTRACE_DEPTH];
+    const int depth = backtrace(frames, MAX_BACKTRACE_DEPTH);
+    backtrace_symbols_fd(frames, depth, STDERR_FILENO);
+}
+
+// Fatal-signal handler: prints the faulting thread's backtrace, forks a gdb that attaches to
+// this process and dumps every thread, waits for it, then re-raises sig (the signal number
+// being handled) with the default disposition restored.
+void
+gdb_signal_handler(int sig) {
+    // Restore the default action first: a nested fault or the raise() below must terminate.
+    signal(sig, SIG_DFL);
+
+    static const char header[] = "\n!!! Caught signal. Generating backtrace: !!!\n";
+    ssize_t ignored __attribute__((unused)) = write(STDERR_FILENO, header, sizeof(header) - 1);
+
+    print_backtrace();
+
+    const pid_t child = fork();
+    if (child == 0) {
+        // Child: gdb attaches to the faulting parent, so pass the parent's pid.
+        char pid_buf[30] = {0};
+        snprintf(pid_buf, sizeof(pid_buf), "%d", static_cast<int>(getppid()));
+
+        char exe_path_buf[1024];
+        const ssize_t len = readlink("/proc/self/exe", exe_path_buf, sizeof(exe_path_buf) - 1);
+        if (len != -1) {
+            exe_path_buf[len] = '\0'; // readlink() does not NUL-terminate
+        } else {
+            strcpy(exe_path_buf, "UNKNOWN_EXE");
+        }
+
+        // Replace the child with a batch-mode gdb.
+        execlp("gdb", "gdb", "-q", exe_path_buf, pid_buf, "--batch",
+               "-ex", "thread apply all bt full", "-ex", "quit",
+               static_cast<char *>(nullptr));
+
+        _exit(1); // reached only if gdb could not be executed
+    } else if (child > 0) {
+        // Parent: wait for gdb, retrying when another signal interrupts the wait so the
+        // process is not torn down while gdb is still attached.
+        while (waitpid(child, nullptr, 0) == -1 && errno == EINTR) {
+        }
+    }
+
+    // Re-raise signal to get core dump
+    raise(sig);
+}
+
+// Runs at load time (constructor attribute): installs gdb_signal_handler for the signals below.
+__attribute__((constructor)) void
+setup_gdb_handler() {
+    signal(SIGSEGV, gdb_signal_handler);
+    signal(SIGABRT, gdb_signal_handler);
+    signal(SIGFPE, gdb_signal_handler);
+    signal(SIGILL, gdb_signal_handler);
+    signal(SIGBUS, gdb_signal_handler);
+}
diff --git a/test/unit/plugins/ucx/ucx_backend_test.cpp b/test/unit/plugins/ucx/ucx_backend_test.cpp
index 30b207c0b..9f18b233f 100644
--- a/test/unit/plugins/ucx/ucx_backend_test.cpp
+++ b/test/unit/plugins/ucx/ucx_backend_test.cpp
@@ -558,7 +558,7 @@ test_inter_agent_transfer(bool p_thread,
                           nixl_mem_t dst_mem_type,
                           int dst_dev_id) {
     int ret;
-    int iter = 10;
+    int iter = 1;
 
     std::cout << std::endl << std::endl;
     std::cout << "****************************************************" << std::endl;
@@ -671,7 +671,7 @@
test_inter_agent_transfer(bool p_thread, // As well as all the remote notes, asking to remove them one by one // need to provide list of descs ucx1->unloadMD (rmd1); - //ucx2->unloadMD (rmd2); + ucx2->unloadMD (rmd2); // Release memory regions deallocateAndDeregister(ucx1, src_dev_id, src_mem_type, addr1, lmd1); @@ -681,7 +681,7 @@ test_inter_agent_transfer(bool p_thread, ucx1->disconnect(agent2); // TODO: Causes race condition - investigate conn management implementation - //ucx2->disconnect(agent1); + ucx2->disconnect(agent1); } int main()