Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/jenkins/lib/test-matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ timeout_minutes: 240
# label is defined at jenkins slave configuration, we want to run the job on a gpu agent and be able to easily replace it without having to change this file
runs_on_agents:
- {nodeLabel: 'H100'}
# - {nodeLabel: 'DGX'}
- {nodeLabel: 'DGX'}

matrix:
axes:
Expand Down
3 changes: 2 additions & 1 deletion src/core/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,13 @@ if libtransfer_engine.found() and not disable_mooncake_backend and 'Mooncake' in
endif

nixl_lib = library('nixl',
'signalhandler.cpp',
'nixl_agent.cpp',
'nixl_plugin_manager.cpp',
'nixl_listener.cpp',
'telemetry.cpp',
include_directories: [ nixl_inc_dirs, utils_inc_dirs ],
link_args: ['-lstdc++fs'],
link_args: ['-lstdc++fs', '-lbacktrace'],
dependencies: nixl_lib_deps,
install: true)

Expand Down
89 changes: 89 additions & 0 deletions src/core/signalhandler.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <execinfo.h>
#include <sys/wait.h>
#include <unistd.h>

#include <cerrno>
#include <csignal>
#include <cstdlib>
#include <cstring>
#include <iostream>

#define MAX_BACKTRACE_DEPTH 100

void
print_backtrace() {
void *buffer[MAX_BACKTRACE_DEPTH];
int nptrs = backtrace(buffer, MAX_BACKTRACE_DEPTH);
backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
}

/**
 * Fatal-signal handler that prints diagnostics before letting the signal take effect.
 *
 * Steps, in order:
 *   1. Reset the disposition of @p sig to SIG_DFL, so the re-raise at the end (or a
 *      second fault while in here) is handled by the default action, not this handler.
 *   2. Write a banner and the in-process backtrace to stderr.
 *   3. Fork a child that execs `gdb` in batch mode against this process's PID to print
 *      "thread apply all bt full"; the parent blocks until that child has exited.
 *   4. Re-raise @p sig.
 *
 * Only low-level calls are used for the work done directly in this function: the
 * printf family is avoided because it is not async-signal-safe.
 *
 * NOTE(review): on hosts that restrict ptrace (e.g. Yama ptrace_scope >= 1) a child may
 * not be allowed to attach to its parent — confirm whether PR_SET_PTRACER is needed.
 *
 * @param sig Number of the signal being handled.
 */
void
gdb_signal_handler(int sig) {
    signal(sig, SIG_DFL);

    static constexpr char header[] = "\n!!! Caught signal. Generating backtrace: !!!\n";
    ssize_t ignored __attribute__((unused)) = write(STDERR_FILENO, header, sizeof(header) - 1);

    print_backtrace();

    const pid_t child = fork();
    if (child == 0) {
        // Child process: the PID gdb must attach to is our parent's.
        // Render it in decimal by hand, filling the buffer from the back; the last
        // byte stays zero and acts as the string terminator.
        char pid_buf[30] = {0};
        char *pid_str = pid_buf + sizeof(pid_buf) - 1;
        unsigned long remaining = static_cast<unsigned long>(getppid());
        do {
            *--pid_str = static_cast<char>('0' + remaining % 10);
            remaining /= 10;
        } while (remaining != 0);

        // Path of the running executable, passed to gdb as the program argument.
        // readlink() does not NUL-terminate, hence the reserved byte.
        char exe_path_buf[1024];
        const ssize_t len = readlink("/proc/self/exe", exe_path_buf, sizeof(exe_path_buf) - 1);
        if (len != -1) {
            exe_path_buf[len] = '\0';
        } else {
            strcpy(exe_path_buf, "UNKNOWN_EXE");
        }

        // Replace child process with GDB
        execlp("gdb",
               "gdb",
               "-q",
               exe_path_buf,
               pid_str,
               "--batch",
               "-ex",
               "thread apply all bt full",
               "-ex",
               "quit",
               static_cast<char *>(nullptr));

        // Only reached when the exec itself failed.
        _exit(1);
    } else if (child > 0) {
        // Parent process: wait for gdb to finish. Retry when the wait is interrupted
        // by another signal, otherwise we would re-raise while gdb is still working.
        int status;
        while (waitpid(child, &status, 0) == -1 && errno == EINTR) {
        }
    }
    // A failed fork() (child < 0) skips the gdb step and falls through.

    // Re-raise the signal; its disposition is SIG_DFL now, so the default action
    // (intended to produce a core dump) applies.
    raise(sig);
}

/**
 * Install gdb_signal_handler for the fatal signals listed below.
 *
 * Marked __attribute__((constructor)), so it is invoked automatically at load
 * time rather than being called explicitly.
 */
__attribute__((constructor)) void
setup_gdb_handler() {
    const int fatal_signals[] = {SIGSEGV, SIGABRT, SIGFPE, SIGILL, SIGBUS};

    for (const int signo : fatal_signals) {
        signal(signo, gdb_signal_handler);
    }
}
6 changes: 3 additions & 3 deletions test/unit/plugins/ucx/ucx_backend_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,7 @@ test_inter_agent_transfer(bool p_thread,
nixl_mem_t dst_mem_type,
int dst_dev_id) {
int ret;
int iter = 10;
int iter = 1;

std::cout << std::endl << std::endl;
std::cout << "****************************************************" << std::endl;
Expand Down Expand Up @@ -671,7 +671,7 @@ test_inter_agent_transfer(bool p_thread,
// As well as all the remote notes, asking to remove them one by one
// need to provide list of descs
ucx1->unloadMD (rmd1);
//ucx2->unloadMD (rmd2);
ucx2->unloadMD (rmd2);

// Release memory regions
deallocateAndDeregister(ucx1, src_dev_id, src_mem_type, addr1, lmd1);
Expand All @@ -681,7 +681,7 @@ test_inter_agent_transfer(bool p_thread,
ucx1->disconnect(agent2);

// TODO: Causes race condition - investigate conn management implementation
//ucx2->disconnect(agent1);
ucx2->disconnect(agent1);
}

int main()
Expand Down
Loading