diff --git a/.gitlab/test_cpp.sh b/.gitlab/test_cpp.sh
index d119cbd0bb..074efb0891 100755
--- a/.gitlab/test_cpp.sh
+++ b/.gitlab/test_cpp.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/.gitlab/test_nixlbench.sh b/.gitlab/test_nixlbench.sh
index 575602726f..116a6e666c 100755
--- a/.gitlab/test_nixlbench.sh
+++ b/.gitlab/test_nixlbench.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/.gitlab/test_python.sh b/.gitlab/test_python.sh
index 3aec4c2795..fe51cff457 100755
--- a/.gitlab/test_python.sh
+++ b/.gitlab/test_python.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/benchmark/nixlbench/src/runtime/etcd/etcd_rt.cpp b/benchmark/nixlbench/src/runtime/etcd/etcd_rt.cpp
index 9a840954d5..872c44e07f 100644
--- a/benchmark/nixlbench/src/runtime/etcd/etcd_rt.cpp
+++ b/benchmark/nixlbench/src/runtime/etcd/etcd_rt.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/examples/cpp/nixl_etcd_example.cpp b/examples/cpp/nixl_etcd_example.cpp
index 07f383a2b7..db04edd2f2 100644
--- a/examples/cpp/nixl_etcd_example.cpp
+++ b/examples/cpp/nixl_etcd_example.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/examples/cpp/telemetry_reader.cpp b/examples/cpp/telemetry_reader.cpp
index facad1f668..344ce4724d 100644
--- a/examples/cpp/telemetry_reader.cpp
+++ b/examples/cpp/telemetry_reader.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/examples/python/remote_storage_example/nixl_p2p_storage_example.py b/examples/python/remote_storage_example/nixl_p2p_storage_example.py
index a0b19c71de..01366d3468 100644
--- a/examples/python/remote_storage_example/nixl_p2p_storage_example.py
+++ b/examples/python/remote_storage_example/nixl_p2p_storage_example.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/examples/python/telemetry_reader.py b/examples/python/telemetry_reader.py
index 150897bf67..e730b861bd 100755
--- a/examples/python/telemetry_reader.py
+++ b/examples/python/telemetry_reader.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/src/plugins/gpunetio/gpunetio_backend.cpp b/src/plugins/gpunetio/gpunetio_backend.cpp
index d0e44138bc..901313c1f1 100644
--- a/src/plugins/gpunetio/gpunetio_backend.cpp
+++ b/src/plugins/gpunetio/gpunetio_backend.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,15 +17,18 @@
 
 #include "gpunetio_backend.h"
 #include "serdes/serdes.h"
-#include <arpa/inet.h>
 #include <cassert>
 #include <stdexcept>
+#include <cstdlib>
+#include <errno.h>
 #include <unistd.h>
 #include "common/nixl_log.h"
 #include <arpa/inet.h>
 
 const char info_delimiter = '-';
 
+extern "C" int
+doca_query_mkey_swapped();
 /****************************************
  * Constructor/Destructor
  *****************************************/
@@ -47,6 +50,21 @@ nixlDocaEngine::nixlDocaEngine(const nixlBackendInitParams *init_params)
     result = doca_log_backend_set_sdk_level(sdk_log, DOCA_LOG_LEVEL_ERROR);
     if (result != DOCA_SUCCESS) throw std::invalid_argument("Can't initialize doca log");
 
+    // Detect device-side expectation for key swapping, allow env override
+    int device_mkey_swapped = -1;
+    device_mkey_swapped = doca_query_mkey_swapped();
+    if (device_mkey_swapped >= 0) swap_keys_config = (device_mkey_swapped == 1);
+    const char *env_swap = std::getenv("NIXL_GPUNETIO_SWAP_KEYS");
+    if (env_swap != nullptr) {
+        if (std::string(env_swap) == "0" || std::string(env_swap) == "false")
+            swap_keys_config = false;
+        else
+            swap_keys_config = true;
+    }
+    NIXL_DEBUG << "GPUNETIO key byte-order swap (htonl): "
+               << (swap_keys_config ? "enabled" : "disabled") << ", device expects swapped: "
+               << (device_mkey_swapped >= 0 ? device_mkey_swapped : -1);
+
     NIXL_INFO << "DOCA network devices ";
     // Temporary: will extend to more GPUs in a dedicated PR
     if (custom_params->count("network_devices") > 1)
@@ -179,6 +197,15 @@ nixlDocaEngine::nixlDocaEngine(const nixlBackendInitParams *init_params)
     } else {
         doca_devinfo_get_ipv4_addr(
             doca_dev_as_devinfo(ddev), (uint8_t *)ipv4_addr, DOCA_DEVINFO_IPV4_ADDR_SIZE);
+        if (const char *override_ip = std::getenv("NIXL_DOCASIM_IPV4_OVERRIDE")) {
+            struct in_addr addr_override = {};
+            if (inet_pton(AF_INET, override_ip, &addr_override) == 1) {
+                std::memcpy(ipv4_addr, &addr_override.s_addr, sizeof(ipv4_addr));
+                NIXL_INFO << "DOCA IP override applied from env " << override_ip;
+            } else {
+                NIXL_WARN << "Invalid IPv4 override in NIXL_DOCASIM_IPV4_OVERRIDE: " << override_ip;
+            }
+        }
         NIXL_DEBUG << "DOCA IP address " << static_cast<unsigned>(ipv4_addr[0]) << " "
                    << static_cast<unsigned>(ipv4_addr[1]) << " "
                    << static_cast<unsigned>(ipv4_addr[2]) << " "
@@ -448,7 +475,10 @@ nixlDocaEngine::nixlDocaInitNotif(const std::string &remote_agent, doca_dev *dev
     // Ensure notif list is not added twice for the same peer
     notifMap[remote_agent] = notif;
     ((volatile struct docaNotif *)notif_fill_cpu)->msg_buf = (uintptr_t)notif->recv_addr;
-    ((volatile struct docaNotif *)notif_fill_cpu)->msg_lkey = notif->recv_mr->get_lkey();
+    ((volatile struct docaNotif *)notif_fill_cpu)->msg_lkey =
+        swap_keys_config ? htonl(notif->recv_mr->get_lkey()) : notif->recv_mr->get_lkey();
+    ((volatile struct docaNotif *)notif_fill_cpu)->keys_are_swapped = swap_keys_config ? 1 : 0;
+
     ((volatile struct docaNotif *)notif_fill_cpu)->msg_size = notif->elems_size;
     std::atomic_thread_fence(std::memory_order_seq_cst);
     ((volatile struct docaNotif *)notif_fill_cpu)->qp_gpu =
@@ -513,8 +543,10 @@ nixlDocaEngine::progressThreadStart() {
         /* Set port and IP: */
         server_addr.sin_family = AF_INET;
         server_addr.sin_port = htons(DOCA_RDMA_CM_LOCAL_PORT_SERVER);
-        server_addr.sin_addr.s_addr = INADDR_ANY; /* listen on any interface */
-
+        //        server_addr.sin_addr.s_addr = INADDR_ANY;
+        /* listen on any interface */
+        std::memcpy(&server_addr.sin_addr.s_addr, ipv4_addr, sizeof(ipv4_addr));
+        if (server_addr.sin_addr.s_addr == 0) server_addr.sin_addr.s_addr = INADDR_ANY;
         /* Bind to the set port and IP: */
         if (bind(oob_sock_server, (struct sockaddr *)&server_addr, sizeof(server_addr)) < 0) {
             NIXL_ERROR << "Couldn't bind to the port " << DOCA_RDMA_CM_LOCAL_PORT_SERVER;
@@ -929,6 +961,8 @@ nixlDocaEngine::getConnInfo(std::string &str) const {
     ss << (int)ipv4_addr[0] << "." << (int)ipv4_addr[1] << "." << (int)ipv4_addr[2] << "."
        << (int)ipv4_addr[3];
     str = ss.str();
+    NIXL_DEBUG << "getConnInfo DOCA: " << str;
+
     return NIXL_SUCCESS;
 }
 
@@ -1025,6 +1059,16 @@ nixlDocaEngine::registerMem(const nixlBlobDesc &mem,
        << info_delimiter << ((size_t)priv->mr->get_tot_size());
     priv->remoteMrStr = ss.str();
 
+    uint32_t lkey = priv->mr->get_lkey();
+    uint32_t rkey = priv->mr->get_rkey();
+    NIXL_INFO << "GPUNETIO registerMem publish dev " << priv->devId << " addr " << std::showbase
+              << std::hex << std::uppercase << (uintptr_t)priv->mr->get_addr() << " len "
+              << (uint64_t)priv->mr->get_tot_size() << " lkey " << lkey << " rkey " << rkey
+              << std::noshowbase << std::dec;
+    NIXL_DEBUG << "[dbg] publish raw: dev=" << priv->devId
+               << " addr_dec=" << (uintptr_t)priv->mr->get_addr()
+               << " len_dec=" << (uint64_t)priv->mr->get_tot_size() << " lkey_dec=" << lkey
+               << " rkey_dec=" << rkey;
     out = (nixlBackendMD *)priv;
 
     return NIXL_SUCCESS;
@@ -1043,6 +1087,8 @@ nixl_status_t
 nixlDocaEngine::getPublicData(const nixlBackendMD *meta, std::string &str) const {
     const nixlDocaPrivateMetadata *priv = (nixlDocaPrivateMetadata *)meta;
     str = priv->remoteMrStr;
+    NIXL_TRACE << "[dbg] getPublicData remoteMrStr=" << str;
+
     return NIXL_SUCCESS;
 }
 
@@ -1071,10 +1117,27 @@ nixlDocaEngine::loadRemoteMD(const nixlBlobDesc &input,
     std::stringstream ss(input.metaInfo.data());
     while (std::getline(ss, token, info_delimiter))
         tokens.push_back(token);
-
-    uint32_t rkey = static_cast<uint32_t>(atoi(tokens[0].c_str()));
-    uintptr_t addr = static_cast<uintptr_t>(atol(tokens[1].c_str()));
-    size_t tot_size = static_cast<size_t>(atol(tokens[2].c_str()));
+    // Parse as unsigned to avoid overflow/truncation (rkeys often exceed INT_MAX)
+    NIXL_TRACE << "[dbg] loadRemoteMD tokens size=" << tokens.size();
+    for (size_t i = 0; i < tokens.size(); ++i)
+        NIXL_TRACE << "[dbg] token[" << i << "]=" << tokens[i];
+    const char *p0 = tokens[0].c_str();
+    const char *p1 = tokens[1].c_str();
+    const char *p2 = tokens[2].c_str();
+    errno = 0;
+    unsigned long rkey_ul = strtoul(p0, nullptr, 10);
+    unsigned long long addr_ull = strtoull(p1, nullptr, 10);
+    unsigned long long size_ull = strtoull(p2, nullptr, 10);
+    if (errno != 0) {
+        NIXL_ERROR << "Failed to parse remote metadata (errno=" << errno << ")";
+        return NIXL_ERR_INVALID_PARAM;
+    }
+    uint32_t rkey = static_cast<uint32_t>(rkey_ul);
+    uintptr_t addr = static_cast<uintptr_t>(addr_ull);
+    size_t tot_size = static_cast<size_t>(size_ull);
+    NIXL_TRACE << "[dbg] parsed remote MD rkey=" << std::showbase << std::hex << std::uppercase
+               << rkey << " addr=" << addr << " size=" << (uint64_t)tot_size << std::noshowbase
+               << std::dec;
 
     // Empty mmap, filled with imported data
     try {
@@ -1145,6 +1208,8 @@ nixlDocaEngine::prepXfer(const nixl_xfer_op_t &operation,
     pos = treq->start_pos;
 
     do {
+        xferReqRingCpu[pos].keys_are_swapped = swap_keys_config ? 1 : 0;
+
         for (uint32_t idx = 0; idx < lcnt && idx < DOCA_XFER_REQ_SIZE; idx++) {
             size_t lsize = local[idx].len;
             size_t rsize = remote[idx].len;
@@ -1153,12 +1218,31 @@ nixlDocaEngine::prepXfer(const nixl_xfer_op_t &operation,
             lmd = (nixlDocaPrivateMetadata *)local[idx].metadataP;
             rmd = (nixlDocaPublicMetadata *)remote[idx].metadataP;
 
-            xferReqRingCpu[pos].lbuf[idx] = (uintptr_t)lmd->mr->get_addr();
-            xferReqRingCpu[pos].lkey[idx] = (uintptr_t)lmd->mr->get_lkey();
-            xferReqRingCpu[pos].rbuf[idx] = (uintptr_t)rmd->mr->get_addr();
-            xferReqRingCpu[pos].rkey[idx] = (uintptr_t)rmd->mr->get_rkey();
+            uint32_t lkey_host = lmd->mr->get_lkey();
+            uint32_t rkey_host = rmd->mr->get_rkey();
+            uint32_t lkey_be = swap_keys_config ? htonl(lkey_host) : lkey_host;
+            uint32_t rkey_be = swap_keys_config ? htonl(rkey_host) : rkey_host;
+
+            // Local buffer: use the descriptor address (within the local MR)
+            xferReqRingCpu[pos].lbuf[idx] = (uintptr_t)local[idx].addr;
+            xferReqRingCpu[pos].lkey[idx] = lkey_be;
+            uintptr_t dbg_rbuf_desc = (uintptr_t)remote[idx].addr;
+            uintptr_t dbg_rbuf_mr = (uintptr_t)rmd->mr->get_addr();
+            // Use the published remote MR address (not the initiator-side descriptor address)
+            xferReqRingCpu[pos].rbuf[idx] = dbg_rbuf_mr;
+            xferReqRingCpu[pos].rkey[idx] = rkey_be;
             xferReqRingCpu[pos].size[idx] = lsize;
             xferReqRingCpu[pos].num++;
+
+            NIXL_INFO << "GPUNETIO prepXfer queue_pos " << pos << " idx " << idx << " laddr "
+                      << std::showbase << std::hex << std::uppercase
+                      << xferReqRingCpu[pos].lbuf[idx] << " lkey " << lkey_host << " lkey_be "
+                      << lkey_be << " raddr " << xferReqRingCpu[pos].rbuf[idx] << " rkey "
+                      << rkey_host << " rkey_be " << rkey_be << " size "
+                      << (uint64_t)xferReqRingCpu[pos].size[idx] << std::noshowbase << std::dec;
+            NIXL_TRACE << "[dbg] remote_desc_addr=" << std::showbase << std::hex << std::uppercase
+                       << dbg_rbuf_desc << " remote_mr_addr=" << dbg_rbuf_mr << " used=mr_addr"
+                       << std::noshowbase << std::dec;
         }
 
         xferReqRingCpu[pos].last_rsvd = last_rsvd_flags;
@@ -1199,7 +1283,9 @@ nixlDocaEngine::prepXfer(const nixl_xfer_op_t &operation,
             (notif->send_pi.fetch_add(1) & (notif->elems_num - 1));
         xferReqRingCpu[treq->end_pos - 1].msg_sz = newMsg.size();
         xferReqRingCpu[treq->end_pos - 1].lbuf_notif = notif_addr;
-        xferReqRingCpu[treq->end_pos - 1].lkey_notif = notif->send_mr->get_lkey();
+        uint32_t notif_lkey_host = notif->send_mr->get_lkey();
+        uint32_t notif_lkey_be = swap_keys_config ? htonl(notif_lkey_host) : notif_lkey_host;
+        xferReqRingCpu[treq->end_pos - 1].lkey_notif = notif_lkey_be;
 
         memcpy((void *)notif_addr, newMsg.c_str(), newMsg.size());
 
@@ -1378,7 +1464,10 @@ nixlDocaEngine::genNotif(const std::string &remote_agent, const std::string &msg
 
     std::lock_guard<std::mutex> lock(notifSendLock);
     ((volatile struct docaNotif *)notif_send_cpu)->msg_buf = msg_buf;
-    ((volatile struct docaNotif *)notif_send_cpu)->msg_lkey = notif->send_mr->get_lkey();
+    ((volatile struct docaNotif *)notif_send_cpu)->msg_lkey =
+        swap_keys_config ? htonl(notif->send_mr->get_lkey()) : notif->send_mr->get_lkey();
+    ((volatile struct docaNotif *)notif_send_cpu)->keys_are_swapped = swap_keys_config ? 1 : 0;
+
     ((volatile struct docaNotif *)notif_send_cpu)->msg_size = newMsg.size();
     std::atomic_thread_fence(std::memory_order_seq_cst);
     ((volatile struct docaNotif *)notif_send_cpu)->qp_gpu =
diff --git a/src/plugins/gpunetio/gpunetio_backend.h b/src/plugins/gpunetio/gpunetio_backend.h
index 9a17efc9f2..915e3c646d 100644
--- a/src/plugins/gpunetio/gpunetio_backend.h
+++ b/src/plugins/gpunetio/gpunetio_backend.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -185,6 +185,10 @@ class nixlDocaEngine : public nixlBackendEngine {
     std::unordered_map<std::string, int> connMap;
     std::unordered_map<std::string, struct nixlDocaNotif *> notifMap;
 
+    // Whether to swap (htonl) mkeys before handing to GPU. Default: true.
+    bool swap_keys_config = true;
+    // Extra debug dump controls (env-driven)
+
     pthread_t server_thread_id;
 
     class nixlDocaBckndReq : public nixlBackendReqH {
diff --git a/src/plugins/gpunetio/gpunetio_backend_aux.h b/src/plugins/gpunetio/gpunetio_backend_aux.h
index 8fc897f50b..3392adc3f2 100644
--- a/src/plugins/gpunetio/gpunetio_backend_aux.h
+++ b/src/plugins/gpunetio/gpunetio_backend_aux.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -87,6 +87,7 @@ struct docaXferReqGpu {
     uint32_t rkey[DOCA_XFER_REQ_SIZE];
     uint16_t num;
     uint8_t in_use;
+    uint8_t keys_are_swapped;
     uint32_t conn_idx;
     uint32_t has_notif_msg_idx;
     uint32_t msg_sz;
@@ -119,6 +120,8 @@ struct docaXferCompletion {
 struct docaNotif {
     doca_gpu_dev_verbs_qp *qp_gpu;
     uint32_t msg_lkey;
+    uint8_t keys_are_swapped;
+    uint8_t _pad_keys[3];
     uintptr_t msg_buf;
     size_t msg_size;
     uint32_t msg_num;
diff --git a/src/plugins/gpunetio/gpunetio_kernels.cu b/src/plugins/gpunetio/gpunetio_kernels.cu
index 7e97cc7ae4..60bc4a42d4 100644
--- a/src/plugins/gpunetio/gpunetio_kernels.cu
+++ b/src/plugins/gpunetio/gpunetio_kernels.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,6 +24,16 @@
 
 #define ENABLE_DEBUG 0
 
+// Expose at runtime whether device code was compiled to expect pre-swapped mkeys
+__device__ __constant__ int __doca_mkey_swapped = DOCA_GPUNETIO_VERBS_MKEY_SWAPPED;
+
+extern "C" int
+doca_query_mkey_swapped() {
+    int v = 0;
+    if (cudaMemcpyFromSymbol(&v, __doca_mkey_swapped, sizeof(v)) != cudaSuccess) return -1;
+    return v;
+}
+
 __device__ inline void
 nixl_gpunetio_dev_cq_print_cqe_err(struct mlx5_cqe64 *cqe64) {
     struct mlx5_err_cqe_ex *err_cqe = (struct mlx5_err_cqe_ex *)cqe64;
@@ -39,6 +49,15 @@ nixl_gpunetio_dev_cq_print_cqe_err(struct mlx5_cqe64 *cqe64) {
            err_cqe->s_wqe_opcode_qpn);
 }
 
+__device__ static __forceinline__ uint32_t
+nixl_prepare_mkey(uint32_t key, uint8_t stored_swapped) {
+#if DOCA_GPUNETIO_VERBS_MKEY_SWAPPED == 1
+    return stored_swapped ? key : doca_gpu_dev_verbs_bswap32(key);
+#else
+    return stored_swapped ? doca_gpu_dev_verbs_bswap32(key) : key;
+#endif
+}
+
 /**
  * @brief [Internal] Poll the Completion Queue (CQ) at a specific index respecting NIXL
  * requirements. Non-blocking polling, just one-time CQE check.
@@ -118,15 +137,30 @@ kernel_read(doca_gpu_dev_verbs_qp *qp, struct docaXferReqGpu *xferReqRing, uint3
         wqe_idx = base_wqe_idx + idx;
         wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(qp, wqe_idx);
 
+        uint32_t rkey =
+            nixl_prepare_mkey(xferReqRing[pos].rkey[idx], xferReqRing[pos].keys_are_swapped);
+        uint32_t lkey =
+            nixl_prepare_mkey(xferReqRing[pos].lkey[idx], xferReqRing[pos].keys_are_swapped);
+
         doca_gpu_dev_verbs_wqe_prepare_read(qp,
                                             wqe_ptr,
                                             wqe_idx,
                                             cflag,
                                             (uint64_t)(xferReqRing[pos].rbuf[idx]),
-                                            xferReqRing[pos].rkey[idx],
+                                            rkey,
                                             (uint64_t)(xferReqRing[pos].lbuf[idx]),
-                                            xferReqRing[pos].lkey[idx],
+                                            lkey,
                                             xferReqRing[pos].size[idx]);
+#if ENABLE_DEBUG == 1
+        if (qp->need_dump == false) {
+            printf("prepare_read raddr %lx rkey %x laddr %lx lkey %x size %ld\n",
+                   (uint64_t)(xferReqRing[pos].rbuf[idx]),
+                   rkey,
+                   (uint64_t)(xferReqRing[pos].lbuf[idx]),
+                   lkey,
+                   (uint64_t)xferReqRing[pos].size[idx]);
+        }
+#endif
     }
     __syncthreads();
 
@@ -135,12 +169,15 @@ kernel_read(doca_gpu_dev_verbs_qp *qp, struct docaXferReqGpu *xferReqRing, uint3
             wqe_idx++;
             wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(qp, wqe_idx);
 
+            uint32_t dump_lkey = nixl_prepare_mkey(xferReqRing[pos].lkey[tot_wqe - 1],
+                                                   xferReqRing[pos].keys_are_swapped);
+
             doca_gpu_dev_verbs_wqe_prepare_dump(qp,
                                                 wqe_ptr,
                                                 wqe_idx,
                                                 DOCA_GPUNETIO_MLX5_WQE_CTRL_CQ_UPDATE,
                                                 (uint64_t)(xferReqRing[pos].lbuf[tot_wqe - 1]),
-                                                xferReqRing[pos].lkey[tot_wqe - 1],
+                                                dump_lkey,
                                                 1);
         }
         doca_gpu_dev_verbs_mark_wqes_ready(qp, base_wqe_idx, wqe_idx);
@@ -183,12 +220,16 @@ kernel_write(doca_gpu_dev_verbs_qp *qp, struct docaXferReqGpu *xferReqRing, uint
         wqe_idx = base_wqe_idx + idx;
         wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(qp, wqe_idx);
 
+        uint32_t rkey =
+            nixl_prepare_mkey(xferReqRing[pos].rkey[idx], xferReqRing[pos].keys_are_swapped);
+        uint32_t lkey =
+            nixl_prepare_mkey(xferReqRing[pos].lkey[idx], xferReqRing[pos].keys_are_swapped);
 #if ENABLE_DEBUG == 1
         printf("prepare_write radd %lx rkey %x ladd %lx lkey %x size %ld\n",
                (uint64_t)(xferReqRing[pos].rbuf[idx]),
-               xferReqRing[pos].rkey[idx],
+               rkey,
                (uint64_t)(xferReqRing[pos].lbuf[idx]),
-               xferReqRing[pos].lkey[idx],
+               lkey,
                (uint64_t)xferReqRing[pos].size[idx]);
 #endif
         doca_gpu_dev_verbs_wqe_prepare_write(qp,
@@ -198,9 +239,9 @@ kernel_write(doca_gpu_dev_verbs_qp *qp, struct docaXferReqGpu *xferReqRing, uint
                                              cflag,
                                              0,
                                              (uint64_t)(xferReqRing[pos].rbuf[idx]),
-                                             xferReqRing[pos].rkey[idx],
+                                             rkey,
                                              (uint64_t)(xferReqRing[pos].lbuf[idx]),
-                                             xferReqRing[pos].lkey[idx],
+                                             lkey,
                                              xferReqRing[pos].size[idx]);
     }
     __syncthreads();
@@ -273,12 +314,15 @@ kernel_progress(struct docaXferCompletion *completion_list,
                                        completion_list[index].xferReqRingGpu->has_notif_msg_idx),
                                    (int)completion_list[index].xferReqRingGpu->msg_sz);
 #endif
+                            uint32_t notif_lkey = nixl_prepare_mkey(
+                                completion_list[index].xferReqRingGpu->lkey_notif,
+                                completion_list[index].xferReqRingGpu->keys_are_swapped);
                             doca_gpu_dev_verbs_send(
                                 completion_list[index].xferReqRingGpu->qp_notif,
                                 doca_gpu_dev_verbs_addr{
                                     .addr = (uint64_t)(completion_list[index]
                                                            .xferReqRingGpu->lbuf_notif),
-                                    .key = completion_list[index].xferReqRingGpu->lkey_notif},
+                                    .key = notif_lkey},
                                 completion_list[index].xferReqRingGpu->msg_sz,
                                 &out_ticket);
 
@@ -344,11 +388,13 @@ kernel_progress(struct docaXferCompletion *completion_list,
                 for (int idx = 0; idx < DOCA_MAX_NOTIF_INFLIGHT; idx++) {
                     struct mlx5_wqe_data_seg *rwqe_ptr =
                         doca_gpu_dev_verbs_get_rwqe_ptr(notif_fill->qp_gpu, idx);
+                    uint32_t fill_lkey =
+                        nixl_prepare_mkey(notif_fill->msg_lkey, notif_fill->keys_are_swapped);
                     doca_gpu_dev_verbs_wqe_prepare_recv(
                         notif_fill->qp_gpu,
                         rwqe_ptr,
                         (uint64_t)(notif_fill->msg_buf + (notif_fill->msg_size * idx)),
-                        notif_fill->msg_lkey,
+                        fill_lkey,
                         notif_fill->msg_size);
                 }
 
@@ -367,10 +413,12 @@ kernel_progress(struct docaXferCompletion *completion_list,
     if (blockIdx.x == 2) {
         while (DOCA_GPUNETIO_VOLATILE(*exit_flag) == 0) {
             if (DOCA_GPUNETIO_VOLATILE(notif_send_gpu->qp_gpu) != nullptr) {
+                uint32_t send_lkey =
+                    nixl_prepare_mkey(notif_send_gpu->msg_lkey, notif_send_gpu->keys_are_swapped);
                 doca_gpu_dev_verbs_send(
                     notif_send_gpu->qp_gpu,
                     doca_gpu_dev_verbs_addr{.addr = (uint64_t)notif_send_gpu->msg_buf,
-                                            .key = notif_send_gpu->msg_lkey},
+                                            .key = send_lkey},
                     notif_send_gpu->msg_size,
                     &out_ticket);
 
@@ -379,7 +427,7 @@ kernel_progress(struct docaXferCompletion *completion_list,
                 printf("Notif correctly sent %ld addr %lx msg_lkey %x qp %p size %d\n",
                        out_ticket,
                        notif_send_gpu->msg_buf,
-                       notif_send_gpu->msg_lkey,
+                       send_lkey,
                        (void *)notif_send_gpu->qp_gpu,
                        (int)notif_send_gpu->msg_size);
 #endif
@@ -414,7 +462,6 @@ doca_kernel_write(cudaStream_t stream,
 
     return DOCA_SUCCESS;
 }
-
 doca_error_t
 doca_kernel_read(cudaStream_t stream,
                  doca_gpu_dev_verbs_qp *qp,
diff --git a/src/plugins/gpunetio/verbs/verbs.cpp b/src/plugins/gpunetio/verbs/verbs.cpp
index 11add6a3c2..829309a729 100644
--- a/src/plugins/gpunetio/verbs/verbs.cpp
+++ b/src/plugins/gpunetio/verbs/verbs.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -463,8 +463,14 @@ mr::mr(doca_gpu *gpu_dev_, void *addr_, uint32_t elem_num_, size_t elem_size_, s
         if (ibmr == nullptr) throw std::invalid_argument("Failed to create mr");
     }
 
-    lkey = htobe32(ibmr->lkey);
-    rkey = htobe32(ibmr->rkey);
+    lkey = ibmr->lkey;
+    rkey = ibmr->rkey;
+
+    NIXL_DEBUG << "[dbg] MR path: " << (dmabuf_fd >= 0 ? "dmabuf" : "peermem/ibv_reg_mr")
+               << ", addr 0x" << std::hex << std::uppercase << (uintptr_t)addr << std::dec
+               << " len 0x" << std::hex << (uint64_t)tot_size << std::dec << " lkey 0x" << std::hex
+               << (uint32_t)lkey << std::dec << " rkey 0x" << std::hex << (uint32_t)rkey
+               << std::dec;
 }
 
 mr::mr(void *addr_, size_t tot_size_, uint32_t rkey_)
diff --git a/subprojects/.wraplock b/subprojects/.wraplock
new file mode 100644
index 0000000000..a4d636226b
--- /dev/null
+++ b/subprojects/.wraplock
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
diff --git a/test/gtest/device_api/utils.cu b/test/gtest/device_api/utils.cu
index 208d2fbdb5..46fc93d9ea 100644
--- a/test/gtest/device_api/utils.cu
+++ b/test/gtest/device_api/utils.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 
diff --git a/test/gtest/telemetry_test.cpp b/test/gtest/telemetry_test.cpp
index 63c5f0c020..1b95bd2081 100644
--- a/test/gtest/telemetry_test.cpp
+++ b/test/gtest/telemetry_test.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/test/gtest/test_transfer.cpp b/test/gtest/test_transfer.cpp
index c6f5d1a929..0493090051 100644
--- a/test/gtest/test_transfer.cpp
+++ b/test/gtest/test_transfer.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/test/nixl/nixl_test.cpp b/test/nixl/nixl_test.cpp
index ce6d81cc16..055496d889 100644
--- a/test/nixl/nixl_test.cpp
+++ b/test/nixl/nixl_test.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/test/unit/plugins/hf3fs/nixl_hf3fs_mt_test.cpp b/test/unit/plugins/hf3fs/nixl_hf3fs_mt_test.cpp
index 191c461b4d..4aa0d64cbc 100644
--- a/test/unit/plugins/hf3fs/nixl_hf3fs_mt_test.cpp
+++ b/test/unit/plugins/hf3fs/nixl_hf3fs_mt_test.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/test/unit/plugins/hf3fs/nixl_hf3fs_test.cpp b/test/unit/plugins/hf3fs/nixl_hf3fs_test.cpp
index 54a2b36b3d..c928516856 100644
--- a/test/unit/plugins/hf3fs/nixl_hf3fs_test.cpp
+++ b/test/unit/plugins/hf3fs/nixl_hf3fs_test.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");