
Commit 6bffe05

fix(libfabric): Use PCI bus ID for GPU-to-EFA mapping
Fix incorrect EFA device selection when CUDA_VISIBLE_DEVICES is set by using PCI bus IDs instead of enumeration order. Query physical GPU via cuPointerGetAttributes(), map to hwloc topology index, and select correct EFA devices based on PCIe proximity. Fixes GPU device ID mismatch between CUDA and hwloc enumeration that caused wrong EFA rails to be selected in vLLM and multi-GPU workloads.
1 parent 0f64414 commit 6bffe05

6 files changed: +141, -58 lines

src/plugins/libfabric/libfabric_backend.cpp

Lines changed: 33 additions & 3 deletions
@@ -56,7 +56,7 @@
 
 #ifdef HAVE_CUDA
 static int
-cudaQueryAddr(void *address, bool &is_dev, CUdevice &dev, CUcontext &ctx) {
+cudaQueryAddr(void *address, bool &is_dev, CUdevice &dev, CUcontext &ctx, std::string &pci_bus_id) {
     CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
     uint32_t is_managed = 0;
     CUpointer_attribute attr_type[4];
@@ -75,6 +75,19 @@ cudaQueryAddr(void *address, bool &is_dev, CUdevice &dev, CUcontext &ctx) {
     result = cuPointerGetAttributes(4, attr_type, attr_data, (CUdeviceptr)address);
     is_dev = (mem_type == CU_MEMORYTYPE_DEVICE);
 
+    // Get PCI bus ID if device memory
+    if (result == CUDA_SUCCESS && is_dev) {
+        char pci_buf[32];
+        CUresult pci_result = cuDeviceGetPCIBusId(pci_buf, sizeof(pci_buf), dev);
+        if (pci_result == CUDA_SUCCESS) {
+            pci_bus_id = std::string(pci_buf);
+        } else {
+            pci_bus_id = "";
+        }
+    } else {
+        pci_bus_id = "";
+    }
+
     return (CUDA_SUCCESS != result);
 }
 
@@ -89,14 +102,15 @@ nixlLibfabricCudaCtx::cudaUpdateCtxPtr(void *address, int expected_dev, bool &wa
     bool is_dev;
     CUdevice dev;
     CUcontext ctx;
+    std::string pci_bus_id; // Not used here, but required by cudaQueryAddr
     int ret;
 
     was_updated = false;
 
     if (expected_dev == -1) return -1;
     if (myDevId_ != -1 && expected_dev != myDevId_) return -1;
 
-    ret = cudaQueryAddr(address, is_dev, dev, ctx);
+    ret = cudaQueryAddr(address, is_dev, dev, ctx, pci_bus_id);
     if (ret) return ret;
     if (!is_dev) return 0;
     if (dev != expected_dev) return -1;
@@ -734,6 +748,7 @@ nixlLibfabricEngine::registerMem(const nixlBlobDesc &mem,
     priv->length_ = mem.len;
     priv->gpu_device_id_ = mem.devId; // Store GPU device ID
 
+    std::string pci_bus_id = "";
 #ifdef HAVE_CUDA
     // Handle CUDA memory registration with GPU Direct RDMA support
     if (nixl_mem == VRAM_SEG) {
@@ -760,6 +775,19 @@ nixlLibfabricEngine::registerMem(const nixlBlobDesc &mem,
             }
             NIXL_DEBUG << "Set CUDA device context to GPU " << mem.devId;
         }
+
+        // Query PCI bus ID from memory address (AFTER setting context)
+        bool is_dev;
+        CUdevice dev;
+        CUcontext ctx;
+
+        int ret = cudaQueryAddr((void *)mem.addr, is_dev, dev, ctx, pci_bus_id);
+        if (ret || !is_dev) {
+            NIXL_ERROR << "Failed to query device from memory " << (void *)mem.addr;
+            return NIXL_ERR_BACKEND;
+        }
+
+        NIXL_DEBUG << "Queried PCI bus ID: " << pci_bus_id << " for GPU " << mem.devId;
     }
 #endif
 
@@ -777,12 +805,14 @@ nixlLibfabricEngine::registerMem(const nixlBlobDesc &mem,
 
     // Use Rail Manager for centralized memory registration with GPU Direct RDMA support
     NIXL_TRACE << "Registering memory: addr=" << (void *)mem.addr << " len=" << mem.len
-               << " mem_type=" << nixl_mem << " devId=" << mem.devId;
+               << " mem_type=" << nixl_mem << " devId=" << mem.devId
+               << (nixl_mem == VRAM_SEG ? " pci_bus_id=" + pci_bus_id : "");
 
     nixl_status_t status = rail_manager.registerMemory((void *)mem.addr,
                                                        mem.len,
                                                        nixl_mem,
                                                        mem.devId,
+                                                       pci_bus_id,
                                                        priv->rail_mr_list_,
                                                        priv->rail_key_list_,
                                                        priv->selected_rails_);
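
For orientation, a minimal standalone sketch of the pointer-to-PCI query pattern the new cudaQueryAddr() relies on: cuPointerGetAttributes() resolves a device pointer to its owning CUdevice, and cuDeviceGetPCIBusId() returns a bus ID that stays stable regardless of CUDA_VISIBLE_DEVICES ordering. The helper name and the reduced error handling are illustrative only, not part of the commit; it assumes the driver is initialized and a context is current.

#include <cuda.h>
#include <string>

// Illustrative helper (not in the commit): returns true if `address` is device
// memory, in which case `pci_bus_id` receives e.g. "0000:59:00.0".
static bool queryPciBusId(void *address, std::string &pci_bus_id) {
    CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
    CUdevice dev = 0; // CUdevice is an int ordinal in the driver API
    CUpointer_attribute attrs[2] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
                                    CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL};
    void *data[2] = {&mem_type, &dev};

    if (cuPointerGetAttributes(2, attrs, data, (CUdeviceptr)address) != CUDA_SUCCESS ||
        mem_type != CU_MEMORYTYPE_DEVICE)
        return false; // query failed or not device memory

    char buf[32];
    if (cuDeviceGetPCIBusId(buf, sizeof(buf), dev) != CUDA_SUCCESS)
        return false;

    pci_bus_id = buf;
    return true;
}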

src/utils/libfabric/libfabric_rail_manager.cpp

Lines changed: 27 additions & 11 deletions
@@ -21,6 +21,7 @@
 #include "libfabric/libfabric_topology.h"
 #include "common/nixl_log.h"
 #include "serdes/serdes.h"
+#include <sstream>
 
 // Forward declaration for LibfabricUtils namespace
 namespace LibfabricUtils {
@@ -46,6 +47,7 @@ nixlLibfabricRailManager::nixlLibfabricRailManager(size_t striping_threshold)
 
     // Get network devices from topology and create rails automatically
     std::vector<std::string> all_devices = topology->getAllDevices();
+
     std::string selected_provider_name = topology->getProviderName();
 
     NIXL_DEBUG << "Got " << all_devices.size()
@@ -321,16 +323,25 @@ nixlLibfabricRailManager::prepareAndSubmitTransfer(
 std::vector<size_t>
 nixlLibfabricRailManager::selectRailsForMemory(void *mem_addr,
                                                nixl_mem_t mem_type,
-                                               int gpu_id) const {
+                                               int gpu_id,
+                                               const std::string &gpu_pci_bus_id) const {
     if (mem_type == VRAM_SEG) {
 #ifdef HAVE_CUDA
         if (gpu_id < 0) {
             NIXL_ERROR << "Invalid GPU ID " << gpu_id << " for VRAM memory " << mem_addr;
             return {}; // Return empty vector to indicate failure
         }
-        std::vector<std::string> gpu_efa_devices = topology->getEfaDevicesForGpu(gpu_id);
+
+        // Use PCI bus ID provided by caller (queried in backend layer)
+        if (gpu_pci_bus_id.empty()) {
+            NIXL_ERROR << "Empty PCI bus ID provided for VRAM memory " << mem_addr;
+            return {}; // Return empty vector to indicate failure
+        }
+
+        // Get EFA devices for this PCI bus ID
+        std::vector<std::string> gpu_efa_devices = topology->getEfaDevicesForGPUPci(gpu_pci_bus_id);
         if (gpu_efa_devices.empty()) {
-            NIXL_ERROR << "No EFA devices found for GPU " << gpu_id;
+            NIXL_ERROR << "No EFA devices found for PCI " << gpu_pci_bus_id;
             return {}; // Return empty vector to indicate failure
         }
         std::vector<size_t> gpu_rails;
@@ -340,26 +351,26 @@ nixlLibfabricRailManager::selectRailsForMemory(void *mem_addr,
                 // Bounds check: ensure rail index is valid
                 if (it->second < data_rails_.size()) {
                     gpu_rails.push_back(it->second);
-                    NIXL_DEBUG << "VRAM memory " << mem_addr << " on GPU " << gpu_id
+                    NIXL_DEBUG << "VRAM memory " << mem_addr << " on GPU-PCI " << gpu_pci_bus_id
                                << " mapped to rail " << it->second << " (EFA device=" << efa_device
                                << ")";
                 } else {
                     NIXL_WARN << "EFA device " << efa_device << " maps to rail " << it->second
                               << " but only " << data_rails_.size() << " rails available";
                 }
             } else {
-                NIXL_WARN << "EFA device " << efa_device << " not found in rail mapping for GPU "
-                          << gpu_id;
+                NIXL_WARN << "EFA device " << efa_device
+                          << " not found in rail mapping for GPU-PCI " << gpu_pci_bus_id;
             }
         }
 
         if (gpu_rails.empty()) {
-            NIXL_ERROR << "No valid rail mapping found for GPU " << gpu_id << " (checked "
-                       << gpu_efa_devices.size() << " EFA devices)";
+            NIXL_ERROR << "No valid rail mapping found for GPU-PCI " << gpu_pci_bus_id
+                       << " (checked " << gpu_efa_devices.size() << " EFA devices)";
             return {};
         }
 
-        NIXL_DEBUG << "VRAM memory " << mem_addr << " on GPU " << gpu_id << " will use "
+        NIXL_DEBUG << "VRAM memory " << mem_addr << " on GPU-PCI " << gpu_pci_bus_id << " will use "
                    << gpu_rails.size() << " rails total";
         return gpu_rails;
 #else
@@ -390,6 +401,7 @@ nixlLibfabricRailManager::registerMemory(void *buffer,
                                          size_t length,
                                          nixl_mem_t mem_type,
                                          int gpu_id,
+                                         const std::string &gpu_pci_bus_id,
                                          std::vector<struct fid_mr *> &mr_list_out,
                                          std::vector<uint64_t> &key_list_out,
                                          std::vector<size_t> &selected_rails_out) {
@@ -398,8 +410,11 @@ nixlLibfabricRailManager::registerMemory(void *buffer,
         return NIXL_ERR_INVALID_PARAM;
     }
 
-    // Use internal rail selection with explicit GPU ID
-    std::vector<size_t> selected_rails = selectRailsForMemory(buffer, mem_type, gpu_id);
+    // Select rails based on memory type and PCI bus ID
+    // For VRAM: uses PCI bus ID provided by backend to map to topology-aware rails
+    // For DRAM: uses all available rails
+    std::vector<size_t> selected_rails =
+        selectRailsForMemory(buffer, mem_type, gpu_id, gpu_pci_bus_id);
     if (selected_rails.empty()) {
         NIXL_ERROR << "No rails selected for memory type " << mem_type;
         return NIXL_ERR_NOT_SUPPORTED;
@@ -429,6 +444,7 @@ nixlLibfabricRailManager::registerMemory(void *buffer,
 
         struct fid_mr *mr;
         uint64_t key;
+        // Pass gpu_id parameter to individual rail's registerMemory calls
         nixl_status_t status =
             data_rails_[rail_idx]->registerMemory(buffer, length, mem_type, gpu_id, &mr, &key);
         if (status != NIXL_SUCCESS) {
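
As a simplified sketch (hypothetical container types and names, not the commit's code), the lookup chain selectRailsForMemory() now follows for VRAM is: PCI bus ID → EFA device names (from the topology) → rail indices (from the rail manager's own mapping), keeping only indices that fall within the available rails.

#include <map>
#include <string>
#include <vector>

// Hypothetical, simplified illustration of the VRAM rail-selection chain.
std::vector<size_t>
railsForPci(const std::string &pci_bus_id,
            const std::map<std::string, std::vector<std::string>> &pci_to_efa, // topology: PCI -> EFA names
            const std::map<std::string, size_t> &efa_to_rail,                  // EFA name -> rail index
            size_t num_rails) {
    std::vector<size_t> rails;
    auto it = pci_to_efa.find(pci_bus_id);
    if (it == pci_to_efa.end()) return rails; // unknown PCI ID: caller treats this as a failure

    for (const auto &efa : it->second) {
        auto rail_it = efa_to_rail.find(efa);
        // Keep only EFA devices that map to a valid rail index
        if (rail_it != efa_to_rail.end() && rail_it->second < num_rails)
            rails.push_back(rail_it->second);
    }
    return rails;
}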

src/utils/libfabric/libfabric_rail_manager.h

Lines changed: 6 additions & 1 deletion
@@ -110,6 +110,7 @@ class nixlLibfabricRailManager {
      * @param length Buffer size in bytes
      * @param mem_type Memory type (DRAM_SEG or VRAM_SEG)
      * @param gpu_id GPU device ID (used for VRAM_SEG, ignored for DRAM_SEG)
+     * @param gpu_pci_bus_id PCI bus ID for VRAM-GPU (queried in backend layer), empty for DRAM
      * @param mr_list_out Memory registration handles, indexed by rail ID
      * @param key_list_out Remote access keys, indexed by rail ID
      * @param selected_rails_out List of rail IDs where memory was registered
@@ -120,6 +121,7 @@ class nixlLibfabricRailManager {
                    size_t length,
                    nixl_mem_t mem_type,
                    int gpu_id,
+                   const std::string &gpu_pci_bus_id,
                    std::vector<struct fid_mr *> &mr_list_out,
                    std::vector<uint64_t> &key_list_out,
                    std::vector<size_t> &selected_rails_out);
@@ -316,7 +318,10 @@ class nixlLibfabricRailManager {
 
     // Internal rail selection method
     std::vector<size_t>
-    selectRailsForMemory(void *mem_addr, nixl_mem_t mem_type, int gpu_id) const;
+    selectRailsForMemory(void *mem_addr,
+                         nixl_mem_t mem_type,
+                         int gpu_id,
+                         const std::string &pci_bus_id = "") const;
 
     // Helper functions for connection SerDes
     void

src/utils/libfabric/libfabric_topology.cpp

Lines changed: 52 additions & 32 deletions
@@ -135,18 +135,37 @@ nixlLibfabricTopology::discoverEfaDevices() {
 }
 
 std::vector<std::string>
-nixlLibfabricTopology::getEfaDevicesForGpu(int gpu_id) const {
-    auto it = gpu_to_efa_devices.find(gpu_id);
-    if (it != gpu_to_efa_devices.end()) {
-        return it->second;
+nixlLibfabricTopology::getEfaDevicesForGPUPci(const std::string &pci_bus_id) const {
+    // Normalize PCI bus ID format to match hwloc format
+    // CUDA format: "0000:59:00.0" → hwloc format: "0:59:00.0"
+    unsigned int domain, bus, device, function;
+    if (sscanf(pci_bus_id.c_str(), "%x:%x:%x.%x", &domain, &bus, &device, &function) == 4) {
+        char normalized_pci[32];
+        snprintf(normalized_pci,
+                 sizeof(normalized_pci),
+                 "%x:%02x:%02x.%x",
+                 domain,
+                 bus,
+                 device,
+                 function);
+        std::string normalized_id(normalized_pci);
+
+        auto it = pci_to_efa_devices.find(normalized_id);
+        if (it != pci_to_efa_devices.end()) {
+            NIXL_DEBUG << "Found EFA devices for PCI " << pci_bus_id << " (normalized to "
+                       << normalized_id << ")";
+            return it->second;
+        }
+        // PCI ID parsed successfully but not found in mapping
+        NIXL_WARN << "PCI bus ID " << pci_bus_id << " (normalized to " << normalized_id
+                  << ") not found in GPU-EFA mapping, returning all devices";
+    } else {
+        // Failed to parse PCI bus ID format
+        NIXL_WARN << "Failed to parse PCI bus ID format: " << pci_bus_id
+                  << ", returning all devices";
     }
-    NIXL_WARN << "No EFA devices found for GPU " << gpu_id << ", returning all devices";
-    return all_devices;
-}
 
-bool
-nixlLibfabricTopology::isValidGpuId(int gpu_id) const {
-    return gpu_id >= 0 && gpu_id < num_gpus;
+    return all_devices;
 }
 
 bool
@@ -165,10 +184,10 @@ nixlLibfabricTopology::printTopologyInfo() const {
     for (size_t i = 0; i < all_devices.size(); ++i) {
         NIXL_TRACE << " [" << i << "] " << all_devices[i];
     }
-    NIXL_TRACE << "GPU → EFA mapping:";
-    for (const auto &pair : gpu_to_efa_devices) {
+    NIXL_TRACE << "GPU-PCI → EFA mapping:";
+    for (const auto &pair : pci_to_efa_devices) {
         std::stringstream ss;
-        ss << " GPU " << pair.first << " → [";
+        ss << " GPU-PCI " << pair.first << " → [";
         for (size_t i = 0; i < pair.second.size(); ++i) {
             if (i > 0) ss << ", ";
             ss << pair.second[i];
@@ -423,15 +442,15 @@ nixlLibfabricTopology::buildPcieToLibfabricMapping() {
 
 nixl_status_t
 nixlLibfabricTopology::buildGpuToEfaMapping() {
-    gpu_to_efa_devices.clear();
+    pci_to_efa_devices.clear();
     // Implement NIXL's topology-aware GPU-EFA grouping algorithm
     nixl_status_t status = buildTopologyAwareGrouping();
     if (status != NIXL_SUCCESS) {
         NIXL_WARN << "Topology-aware grouping failed, using fallback to use all available devices";
         return buildFallbackMapping();
     }
 
-    NIXL_TRACE << "Built GPU→EFA mapping for " << gpu_to_efa_devices.size()
+    NIXL_TRACE << "Built PCI→EFA mapping for " << pci_to_efa_devices.size()
                << " GPUs using topology-aware algorithm";
 
     return NIXL_SUCCESS;
@@ -527,13 +546,17 @@ nixlLibfabricTopology::buildTopologyAwareGrouping() {
             }
 
             if (gpu_index >= 0) {
-                gpu_to_efa_devices[gpu_index] = gpu_efa_devices;
-
-                NIXL_TRACE << "GPU " << gpu_index << " (" << std::hex << group.closest_gpu.domain_id
-                           << ":" << static_cast<int>(group.closest_gpu.bus_id) << ":"
-                           << static_cast<int>(group.closest_gpu.device_id) << "."
-                           << static_cast<int>(group.closest_gpu.function_id) << std::dec << ") → "
-                           << gpu_efa_devices.size() << " EFA devices";
+                // Store mapping using PCI bus ID as key
+                std::string pci_bus_id = getPcieAddressFromHwlocObj(group.closest_gpu.hwloc_node);
+                pci_to_efa_devices[pci_bus_id] = gpu_efa_devices;
+
+                NIXL_TRACE << "PCI " << pci_bus_id << " (GPU " << gpu_index << ") → "
+                           << gpu_efa_devices.size() << " EFA devices: [";
+                for (size_t i = 0; i < gpu_efa_devices.size(); ++i) {
+                    if (i > 0) NIXL_TRACE << ", ";
+                    NIXL_TRACE << gpu_efa_devices[i];
+                }
+                NIXL_TRACE << "]";
             }
         }
     }
@@ -543,15 +566,12 @@
 nixl_status_t
 nixlLibfabricTopology::buildFallbackMapping() {
     // Fallback: if specific mapping failed, use simple approach
-    gpu_to_efa_devices.clear();
-    // Give all devices to all GPUs (not optimal but functional)
-    for (int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) {
-        gpu_to_efa_devices[gpu_id] = all_devices;
-    }
+    // We can't build PCI-based mapping without topology, so just return success
+    // getEfaDevicesForPci() will return all_devices when no mapping is found
+    NIXL_WARN << "Using fallback: all GPUs will use all available EFA devices";
     return NIXL_SUCCESS;
 }
 
-
 // hwloc helper methods
 
 std::string
@@ -607,8 +627,8 @@ nixlLibfabricTopology::groupNicsWithGpus(const std::vector<NicInfo> &discovered_
     // Implement NIXL's topology-aware NIC grouping algorithm
 
     // Step 1: Mark topology nodes that have NICs in their subtree
-    std::map<hwloc_obj_t, int> node_group_counts;
-    std::map<hwloc_obj_t, std::vector<NicInfo>> node_nics;
+    std::unordered_map<hwloc_obj_t, int> node_group_counts;
+    std::unordered_map<hwloc_obj_t, std::vector<NicInfo>> node_nics;
     std::set<hwloc_obj_t> nic_subtree_nodes;
     // Mark all nodes that have NICs in their subtree and collect NICs per node
     for (const auto &nic : discovered_nics) {
@@ -621,7 +641,7 @@ nixlLibfabricTopology::groupNicsWithGpus(const std::vector<NicInfo> &discovered_
     }
 
     // Step 2: For each GPU, walk up until finding a NIC subtree node and increment its count
-    std::map<hwloc_obj_t, std::vector<GpuInfo>> node_gpus;
+    std::unordered_map<hwloc_obj_t, std::vector<GpuInfo>> node_gpus;
 
     for (const auto &gpu : discovered_gpus) {
         hwloc_obj_t node = gpu.hwloc_node;
@@ -637,7 +657,7 @@ nixlLibfabricTopology::groupNicsWithGpus(const std::vector<NicInfo> &discovered_
     }
 
     // Step 3: Collect all NICs that need to be grouped and assign them to ancestor nodes
-    std::map<hwloc_obj_t, std::vector<NicInfo>> ancestor_nics;
+    std::unordered_map<hwloc_obj_t, std::vector<NicInfo>> ancestor_nics;
 
     for (const auto &pair : node_nics) {
         hwloc_obj_t nic_node = pair.first;
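
The normalization step in getEfaDevicesForGPUPci() is small enough to illustrate on its own. A standalone sketch under the same assumption the diff makes (CUDA reports a four-digit domain such as "0000:59:00.0" while the hwloc-derived map keys use "0:59:00.0"); the function name here is illustrative, not the commit's.

#include <cstdio>
#include <string>

// Illustrative helper: reprint a CUDA-style PCI bus ID with an unpadded domain
// so it matches the hwloc-style keys stored in pci_to_efa_devices.
std::string normalizePciBusId(const std::string &cuda_pci) {
    unsigned int domain, bus, device, function;
    if (sscanf(cuda_pci.c_str(), "%x:%x:%x.%x", &domain, &bus, &device, &function) != 4)
        return cuda_pci; // leave unparsable IDs untouched

    char buf[32];
    snprintf(buf, sizeof(buf), "%x:%02x:%02x.%x", domain, bus, device, function);
    return std::string(buf); // "0000:59:00.0" -> "0:59:00.0"
}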
