
Commit 6bffe05

fix(libfabric): Use PCI bus ID for GPU-to-EFA mapping
Fix incorrect EFA device selection when CUDA_VISIBLE_DEVICES is set by using PCI bus IDs instead of enumeration order. Query physical GPU via cuPointerGetAttributes(), map to hwloc topology index, and select correct EFA devices based on PCIe proximity. Fixes GPU device ID mismatch between CUDA and hwloc enumeration that caused wrong EFA rails to be selected in vLLM and multi-GPU workloads.
1 parent 0f64414 commit 6bffe05

6 files changed: +141, -58 lines

src/plugins/libfabric/libfabric_backend.cpp

Lines changed: 33 additions & 3 deletions
@@ -56,7 +56,7 @@
 
 #ifdef HAVE_CUDA
 static int
-cudaQueryAddr(void *address, bool &is_dev, CUdevice &dev, CUcontext &ctx) {
+cudaQueryAddr(void *address, bool &is_dev, CUdevice &dev, CUcontext &ctx, std::string &pci_bus_id) {
     CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
     uint32_t is_managed = 0;
     CUpointer_attribute attr_type[4];
@@ -75,6 +75,19 @@ cudaQueryAddr(void *address, bool &is_dev, CUdevice &dev, CUcontext &ctx) {
     result = cuPointerGetAttributes(4, attr_type, attr_data, (CUdeviceptr)address);
     is_dev = (mem_type == CU_MEMORYTYPE_DEVICE);
 
+    // Get PCI bus ID if device memory
+    if (result == CUDA_SUCCESS && is_dev) {
+        char pci_buf[32];
+        CUresult pci_result = cuDeviceGetPCIBusId(pci_buf, sizeof(pci_buf), dev);
+        if (pci_result == CUDA_SUCCESS) {
+            pci_bus_id = std::string(pci_buf);
+        } else {
+            pci_bus_id = "";
+        }
+    } else {
+        pci_bus_id = "";
+    }
+
     return (CUDA_SUCCESS != result);
 }
 
@@ -89,14 +102,15 @@ nixlLibfabricCudaCtx::cudaUpdateCtxPtr(void *address, int expected_dev, bool &wa
     bool is_dev;
     CUdevice dev;
     CUcontext ctx;
+    std::string pci_bus_id; // Not used here, but required by cudaQueryAddr
     int ret;
 
     was_updated = false;
 
     if (expected_dev == -1) return -1;
     if (myDevId_ != -1 && expected_dev != myDevId_) return -1;
 
-    ret = cudaQueryAddr(address, is_dev, dev, ctx);
+    ret = cudaQueryAddr(address, is_dev, dev, ctx, pci_bus_id);
     if (ret) return ret;
     if (!is_dev) return 0;
     if (dev != expected_dev) return -1;
@@ -734,6 +748,7 @@ nixlLibfabricEngine::registerMem(const nixlBlobDesc &mem,
     priv->length_ = mem.len;
     priv->gpu_device_id_ = mem.devId; // Store GPU device ID
 
+    std::string pci_bus_id = "";
 #ifdef HAVE_CUDA
     // Handle CUDA memory registration with GPU Direct RDMA support
     if (nixl_mem == VRAM_SEG) {
@@ -760,6 +775,19 @@ nixlLibfabricEngine::registerMem(const nixlBlobDesc &mem,
             }
             NIXL_DEBUG << "Set CUDA device context to GPU " << mem.devId;
         }
+
+        // Query PCI bus ID from memory address (AFTER setting context)
+        bool is_dev;
+        CUdevice dev;
+        CUcontext ctx;
+
+        int ret = cudaQueryAddr((void *)mem.addr, is_dev, dev, ctx, pci_bus_id);
+        if (ret || !is_dev) {
+            NIXL_ERROR << "Failed to query device from memory " << (void *)mem.addr;
+            return NIXL_ERR_BACKEND;
+        }
+
+        NIXL_DEBUG << "Queried PCI bus ID: " << pci_bus_id << " for GPU " << mem.devId;
     }
 #endif
 
@@ -777,12 +805,14 @@ nixlLibfabricEngine::registerMem(const nixlBlobDesc &mem,
 
     // Use Rail Manager for centralized memory registration with GPU Direct RDMA support
     NIXL_TRACE << "Registering memory: addr=" << (void *)mem.addr << " len=" << mem.len
-               << " mem_type=" << nixl_mem << " devId=" << mem.devId;
+               << " mem_type=" << nixl_mem << " devId=" << mem.devId
+               << (nixl_mem == VRAM_SEG ? " pci_bus_id=" + pci_bus_id : "");
 
     nixl_status_t status = rail_manager.registerMemory((void *)mem.addr,
                                                        mem.len,
                                                        nixl_mem,
                                                        mem.devId,
+                                                       pci_bus_id,
                                                        priv->rail_mr_list_,
                                                        priv->rail_key_list_,
                                                        priv->selected_rails_);
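
For orientation, a minimal standalone sketch of the pointer-to-PCI query pattern the new cudaQueryAddr() relies on: cuPointerGetAttributes() resolves a device pointer to its owning CUdevice, and cuDeviceGetPCIBusId() returns a bus ID that stays stable regardless of CUDA_VISIBLE_DEVICES ordering. The helper name and the reduced error handling are illustrative only, not part of the commit; it assumes the driver is initialized and a context is current.

#include <cuda.h>
#include <string>

// Illustrative helper (not in the commit): returns true if `address` is device
// memory, in which case `pci_bus_id` receives e.g. "0000:59:00.0".
static bool queryPciBusId(void *address, std::string &pci_bus_id) {
    CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
    CUdevice dev = 0; // CUdevice is an int ordinal in the driver API
    CUpointer_attribute attrs[2] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
                                    CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL};
    void *data[2] = {&mem_type, &dev};

    if (cuPointerGetAttributes(2, attrs, data, (CUdeviceptr)address) != CUDA_SUCCESS ||
        mem_type != CU_MEMORYTYPE_DEVICE)
        return false; // query failed or not device memory

    char buf[32];
    if (cuDeviceGetPCIBusId(buf, sizeof(buf), dev) != CUDA_SUCCESS)
        return false;

    pci_bus_id = buf;
    return true;
}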

src/utils/libfabric/libfabric_rail_manager.cpp

Lines changed: 27 additions & 11 deletions
@@ -21,6 +21,7 @@
 #include "libfabric/libfabric_topology.h"
 #include "common/nixl_log.h"
 #include "serdes/serdes.h"
+#include <sstream>
 
 // Forward declaration for LibfabricUtils namespace
 namespace LibfabricUtils {
@@ -46,6 +47,7 @@ nixlLibfabricRailManager::nixlLibfabricRailManager(size_t striping_threshold)
 
     // Get network devices from topology and create rails automatically
     std::vector<std::string> all_devices = topology->getAllDevices();
+
     std::string selected_provider_name = topology->getProviderName();
 
     NIXL_DEBUG << "Got " << all_devices.size()
@@ -321,16 +323,25 @@ nixlLibfabricRailManager::prepareAndSubmitTransfer(
 std::vector<size_t>
 nixlLibfabricRailManager::selectRailsForMemory(void *mem_addr,
                                                nixl_mem_t mem_type,
-                                               int gpu_id) const {
+                                               int gpu_id,
+                                               const std::string &gpu_pci_bus_id) const {
     if (mem_type == VRAM_SEG) {
 #ifdef HAVE_CUDA
         if (gpu_id < 0) {
             NIXL_ERROR << "Invalid GPU ID " << gpu_id << " for VRAM memory " << mem_addr;
             return {}; // Return empty vector to indicate failure
         }
-        std::vector<std::string> gpu_efa_devices = topology->getEfaDevicesForGpu(gpu_id);
+
+        // Use PCI bus ID provided by caller (queried in backend layer)
+        if (gpu_pci_bus_id.empty()) {
+            NIXL_ERROR << "Empty PCI bus ID provided for VRAM memory " << mem_addr;
+            return {}; // Return empty vector to indicate failure
+        }
+
+        // Get EFA devices for this PCI bus ID
+        std::vector<std::string> gpu_efa_devices = topology->getEfaDevicesForGPUPci(gpu_pci_bus_id);
         if (gpu_efa_devices.empty()) {
-            NIXL_ERROR << "No EFA devices found for GPU " << gpu_id;
+            NIXL_ERROR << "No EFA devices found for PCI " << gpu_pci_bus_id;
             return {}; // Return empty vector to indicate failure
         }
         std::vector<size_t> gpu_rails;
@@ -340,26 +351,26 @@ nixlLibfabricRailManager::selectRailsForMemory(void *mem_addr,
                 // Bounds check: ensure rail index is valid
                 if (it->second < data_rails_.size()) {
                     gpu_rails.push_back(it->second);
-                    NIXL_DEBUG << "VRAM memory " << mem_addr << " on GPU " << gpu_id
+                    NIXL_DEBUG << "VRAM memory " << mem_addr << " on GPU-PCI " << gpu_pci_bus_id
                                << " mapped to rail " << it->second << " (EFA device=" << efa_device
                                << ")";
                 } else {
                     NIXL_WARN << "EFA device " << efa_device << " maps to rail " << it->second
                               << " but only " << data_rails_.size() << " rails available";
                 }
             } else {
-                NIXL_WARN << "EFA device " << efa_device << " not found in rail mapping for GPU "
-                          << gpu_id;
+                NIXL_WARN << "EFA device " << efa_device
+                          << " not found in rail mapping for GPU-PCI " << gpu_pci_bus_id;
             }
         }
 
         if (gpu_rails.empty()) {
-            NIXL_ERROR << "No valid rail mapping found for GPU " << gpu_id << " (checked "
-                       << gpu_efa_devices.size() << " EFA devices)";
+            NIXL_ERROR << "No valid rail mapping found for GPU-PCI " << gpu_pci_bus_id
+                       << " (checked " << gpu_efa_devices.size() << " EFA devices)";
             return {};
         }
 
-        NIXL_DEBUG << "VRAM memory " << mem_addr << " on GPU " << gpu_id << " will use "
+        NIXL_DEBUG << "VRAM memory " << mem_addr << " on GPU-PCI " << gpu_pci_bus_id << " will use "
                    << gpu_rails.size() << " rails total";
         return gpu_rails;
 #else
@@ -390,6 +401,7 @@ nixlLibfabricRailManager::registerMemory(void *buffer,
                                          size_t length,
                                          nixl_mem_t mem_type,
                                          int gpu_id,
+                                         const std::string &gpu_pci_bus_id,
                                          std::vector<struct fid_mr *> &mr_list_out,
                                          std::vector<uint64_t> &key_list_out,
                                          std::vector<size_t> &selected_rails_out) {
@@ -398,8 +410,11 @@ nixlLibfabricRailManager::registerMemory(void *buffer,
         return NIXL_ERR_INVALID_PARAM;
     }
 
-    // Use internal rail selection with explicit GPU ID
-    std::vector<size_t> selected_rails = selectRailsForMemory(buffer, mem_type, gpu_id);
+    // Select rails based on memory type and PCI bus ID
+    // For VRAM: uses PCI bus ID provided by backend to map to topology-aware rails
+    // For DRAM: uses all available rails
+    std::vector<size_t> selected_rails =
+        selectRailsForMemory(buffer, mem_type, gpu_id, gpu_pci_bus_id);
     if (selected_rails.empty()) {
         NIXL_ERROR << "No rails selected for memory type " << mem_type;
         return NIXL_ERR_NOT_SUPPORTED;
@@ -429,6 +444,7 @@ nixlLibfabricRailManager::registerMemory(void *buffer,
 
         struct fid_mr *mr;
         uint64_t key;
+        // Pass gpu_id parameter to individual rail's registerMemory calls
         nixl_status_t status =
             data_rails_[rail_idx]->registerMemory(buffer, length, mem_type, gpu_id, &mr, &key);
         if (status != NIXL_SUCCESS) {
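
As a simplified sketch (hypothetical container types and names, not the commit's code), the lookup chain selectRailsForMemory() now follows for VRAM is: PCI bus ID → EFA device names (from the topology) → rail indices (from the rail manager's own mapping), keeping only indices that fall within the available rails.

#include <map>
#include <string>
#include <vector>

// Hypothetical, simplified illustration of the VRAM rail-selection chain.
std::vector<size_t>
railsForPci(const std::string &pci_bus_id,
            const std::map<std::string, std::vector<std::string>> &pci_to_efa, // topology: PCI -> EFA names
            const std::map<std::string, size_t> &efa_to_rail,                  // EFA name -> rail index
            size_t num_rails) {
    std::vector<size_t> rails;
    auto it = pci_to_efa.find(pci_bus_id);
    if (it == pci_to_efa.end()) return rails; // unknown PCI ID: caller treats this as a failure

    for (const auto &efa : it->second) {
        auto rail_it = efa_to_rail.find(efa);
        // Keep only EFA devices that map to a valid rail index
        if (rail_it != efa_to_rail.end() && rail_it->second < num_rails)
            rails.push_back(rail_it->second);
    }
    return rails;
}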

src/utils/libfabric/libfabric_rail_manager.h

Lines changed: 6 additions & 1 deletion
@@ -110,6 +110,7 @@ class nixlLibfabricRailManager {
      * @param length Buffer size in bytes
      * @param mem_type Memory type (DRAM_SEG or VRAM_SEG)
      * @param gpu_id GPU device ID (used for VRAM_SEG, ignored for DRAM_SEG)
+     * @param gpu_pci_bus_id PCI bus ID for VRAM-GPU (queried in backend layer), empty for DRAM
      * @param mr_list_out Memory registration handles, indexed by rail ID
      * @param key_list_out Remote access keys, indexed by rail ID
      * @param selected_rails_out List of rail IDs where memory was registered
@@ -120,6 +121,7 @@ class nixlLibfabricRailManager {
                    size_t length,
                    nixl_mem_t mem_type,
                    int gpu_id,
+                   const std::string &gpu_pci_bus_id,
                    std::vector<struct fid_mr *> &mr_list_out,
                    std::vector<uint64_t> &key_list_out,
                    std::vector<size_t> &selected_rails_out);
@@ -316,7 +318,10 @@ class nixlLibfabricRailManager {
 
     // Internal rail selection method
     std::vector<size_t>
-    selectRailsForMemory(void *mem_addr, nixl_mem_t mem_type, int gpu_id) const;
+    selectRailsForMemory(void *mem_addr,
+                         nixl_mem_t mem_type,
+                         int gpu_id,
+                         const std::string &pci_bus_id = "") const;
 
     // Helper functions for connection SerDes
     void

src/utils/libfabric/libfabric_topology.cpp

Lines changed: 52 additions & 32 deletions
@@ -135,18 +135,37 @@ nixlLibfabricTopology::discoverEfaDevices() {
 }
 
 std::vector<std::string>
-nixlLibfabricTopology::getEfaDevicesForGpu(int gpu_id) const {
-    auto it = gpu_to_efa_devices.find(gpu_id);
-    if (it != gpu_to_efa_devices.end()) {
-        return it->second;
+nixlLibfabricTopology::getEfaDevicesForGPUPci(const std::string &pci_bus_id) const {
+    // Normalize PCI bus ID format to match hwloc format
+    // CUDA format: "0000:59:00.0" → hwloc format: "0:59:00.0"
+    unsigned int domain, bus, device, function;
+    if (sscanf(pci_bus_id.c_str(), "%x:%x:%x.%x", &domain, &bus, &device, &function) == 4) {
+        char normalized_pci[32];
+        snprintf(normalized_pci,
+                 sizeof(normalized_pci),
+                 "%x:%02x:%02x.%x",
+                 domain,
+                 bus,
+                 device,
+                 function);
+        std::string normalized_id(normalized_pci);
+
+        auto it = pci_to_efa_devices.find(normalized_id);
+        if (it != pci_to_efa_devices.end()) {
+            NIXL_DEBUG << "Found EFA devices for PCI " << pci_bus_id << " (normalized to "
+                       << normalized_id << ")";
+            return it->second;
+        }
+        // PCI ID parsed successfully but not found in mapping
+        NIXL_WARN << "PCI bus ID " << pci_bus_id << " (normalized to " << normalized_id
+                  << ") not found in GPU-EFA mapping, returning all devices";
+    } else {
+        // Failed to parse PCI bus ID format
+        NIXL_WARN << "Failed to parse PCI bus ID format: " << pci_bus_id
+                  << ", returning all devices";
     }
-    NIXL_WARN << "No EFA devices found for GPU " << gpu_id << ", returning all devices";
-    return all_devices;
-}
 
-bool
-nixlLibfabricTopology::isValidGpuId(int gpu_id) const {
-    return gpu_id >= 0 && gpu_id < num_gpus;
+    return all_devices;
 }
 
 bool
@@ -165,10 +184,10 @@ nixlLibfabricTopology::printTopologyInfo() const {
     for (size_t i = 0; i < all_devices.size(); ++i) {
         NIXL_TRACE << " [" << i << "] " << all_devices[i];
     }
-    NIXL_TRACE << "GPU → EFA mapping:";
-    for (const auto &pair : gpu_to_efa_devices) {
+    NIXL_TRACE << "GPU-PCI → EFA mapping:";
+    for (const auto &pair : pci_to_efa_devices) {
         std::stringstream ss;
-        ss << " GPU " << pair.first << " → [";
+        ss << " GPU-PCI " << pair.first << " → [";
         for (size_t i = 0; i < pair.second.size(); ++i) {
             if (i > 0) ss << ", ";
             ss << pair.second[i];
@@ -423,15 +442,15 @@ nixlLibfabricTopology::buildPcieToLibfabricMapping() {
 
 nixl_status_t
 nixlLibfabricTopology::buildGpuToEfaMapping() {
-    gpu_to_efa_devices.clear();
+    pci_to_efa_devices.clear();
     // Implement NIXL's topology-aware GPU-EFA grouping algorithm
     nixl_status_t status = buildTopologyAwareGrouping();
     if (status != NIXL_SUCCESS) {
         NIXL_WARN << "Topology-aware grouping failed, using fallback to use all available devices";
         return buildFallbackMapping();
     }
 
-    NIXL_TRACE << "Built GPU→EFA mapping for " << gpu_to_efa_devices.size()
+    NIXL_TRACE << "Built PCI→EFA mapping for " << pci_to_efa_devices.size()
                << " GPUs using topology-aware algorithm";
 
     return NIXL_SUCCESS;
@@ -527,13 +546,17 @@ nixlLibfabricTopology::buildTopologyAwareGrouping() {
             }
 
             if (gpu_index >= 0) {
-                gpu_to_efa_devices[gpu_index] = gpu_efa_devices;
-
-                NIXL_TRACE << "GPU " << gpu_index << " (" << std::hex << group.closest_gpu.domain_id
-                           << ":" << static_cast<int>(group.closest_gpu.bus_id) << ":"
-                           << static_cast<int>(group.closest_gpu.device_id) << "."
-                           << static_cast<int>(group.closest_gpu.function_id) << std::dec << ") → "
-                           << gpu_efa_devices.size() << " EFA devices";
+                // Store mapping using PCI bus ID as key
+                std::string pci_bus_id = getPcieAddressFromHwlocObj(group.closest_gpu.hwloc_node);
+                pci_to_efa_devices[pci_bus_id] = gpu_efa_devices;
+
+                NIXL_TRACE << "PCI " << pci_bus_id << " (GPU " << gpu_index << ") → "
+                           << gpu_efa_devices.size() << " EFA devices: [";
+                for (size_t i = 0; i < gpu_efa_devices.size(); ++i) {
+                    if (i > 0) NIXL_TRACE << ", ";
+                    NIXL_TRACE << gpu_efa_devices[i];
+                }
+                NIXL_TRACE << "]";
             }
         }
     }
@@ -543,15 +566,12 @@
 nixl_status_t
 nixlLibfabricTopology::buildFallbackMapping() {
     // Fallback: if specific mapping failed, use simple approach
-    gpu_to_efa_devices.clear();
-    // Give all devices to all GPUs (not optimal but functional)
-    for (int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) {
-        gpu_to_efa_devices[gpu_id] = all_devices;
-    }
+    // We can't build PCI-based mapping without topology, so just return success
+    // getEfaDevicesForPci() will return all_devices when no mapping is found
+    NIXL_WARN << "Using fallback: all GPUs will use all available EFA devices";
     return NIXL_SUCCESS;
 }
 
-
 // hwloc helper methods
 
 std::string
@@ -607,8 +627,8 @@ nixlLibfabricTopology::groupNicsWithGpus(const std::vector<NicInfo> &discovered_
     // Implement NIXL's topology-aware NIC grouping algorithm
 
     // Step 1: Mark topology nodes that have NICs in their subtree
-    std::map<hwloc_obj_t, int> node_group_counts;
-    std::map<hwloc_obj_t, std::vector<NicInfo>> node_nics;
+    std::unordered_map<hwloc_obj_t, int> node_group_counts;
+    std::unordered_map<hwloc_obj_t, std::vector<NicInfo>> node_nics;
     std::set<hwloc_obj_t> nic_subtree_nodes;
     // Mark all nodes that have NICs in their subtree and collect NICs per node
     for (const auto &nic : discovered_nics) {
@@ -621,7 +641,7 @@ nixlLibfabricTopology::groupNicsWithGpus(const std::vector<NicInfo> &discovered_
     }
 
     // Step 2: For each GPU, walk up until finding a NIC subtree node and increment its count
-    std::map<hwloc_obj_t, std::vector<GpuInfo>> node_gpus;
+    std::unordered_map<hwloc_obj_t, std::vector<GpuInfo>> node_gpus;
 
     for (const auto &gpu : discovered_gpus) {
         hwloc_obj_t node = gpu.hwloc_node;
@@ -637,7 +657,7 @@ nixlLibfabricTopology::groupNicsWithGpus(const std::vector<NicInfo> &discovered_
     }
 
     // Step 3: Collect all NICs that need to be grouped and assign them to ancestor nodes
-    std::map<hwloc_obj_t, std::vector<NicInfo>> ancestor_nics;
+    std::unordered_map<hwloc_obj_t, std::vector<NicInfo>> ancestor_nics;
 
     for (const auto &pair : node_nics) {
         hwloc_obj_t nic_node = pair.first;
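
The normalization step in getEfaDevicesForGPUPci() is small enough to illustrate on its own. A standalone sketch under the same assumption the diff makes (CUDA reports a four-digit domain such as "0000:59:00.0" while the hwloc-derived map keys use "0:59:00.0"); the function name here is illustrative, not the commit's.

#include <cstdio>
#include <string>

// Illustrative helper: reprint a CUDA-style PCI bus ID with an unpadded domain
// so it matches the hwloc-style keys stored in pci_to_efa_devices.
std::string normalizePciBusId(const std::string &cuda_pci) {
    unsigned int domain, bus, device, function;
    if (sscanf(cuda_pci.c_str(), "%x:%x:%x.%x", &domain, &bus, &device, &function) != 4)
        return cuda_pci; // leave unparsable IDs untouched

    char buf[32];
    snprintf(buf, sizeof(buf), "%x:%02x:%02x.%x", domain, bus, device, function);
    return std::string(buf); // "0000:59:00.0" -> "0:59:00.0"
}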
