Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmark/kvbench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ These arguments are used by both `plan` and `profile` commands:
| -------- | ----------- |
| `--source` | Source of the nixl descriptors [file, memory, gpu] (default: file) |
| `--destination` | Destination of the nixl descriptors [file, memory, gpu] (default: memory) |
| `--backend` | Communication backend [UCX, UCX_MO, GDS, GDS_MT, POSIX, GPUNETIO, Mooncake, HF3FS, OBJ] (default: UCX) |
| `--backend` | Communication backend [UCX, UCX_MO, GDS, GDS_MT, POSIX, GPUNETIO, Mooncake, HF3FS, OBJ, LIBFABRIC] (default: UCX) |
| `--worker_type` | Worker to use to transfer data [nixl, nvshmem] (default: nixl) |
| `--initiator_seg_type` | Memory segment type for initiator [DRAM, VRAM] (default: DRAM) |
| `--target_seg_type` | Memory segment type for target [DRAM, VRAM] (default: DRAM) |
Expand Down
2 changes: 1 addition & 1 deletion benchmark/kvbench/commands/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def nixl_bench_args(func):
func = click.option(
"--backend",
type=str,
help="Communication backend [UCX, UCX_MO, GDS, GDS_MT, POSIX, GPUNETIO, Mooncake, HF3FS, OBJ] (default: UCX)",
help="Communication backend [UCX, UCX_MO, GDS, GDS_MT, POSIX, GPUNETIO, Mooncake, HF3FS, OBJ, LIBFABRIC] (default: UCX)",
)(func)
func = click.option(
"--worker_type",
Expand Down
15 changes: 14 additions & 1 deletion benchmark/kvbench/test/custom_traffic_perftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ def __init__(
self.nixl_agent = nixl_agent
if mem_type in ("cuda", "vram"):
device = torch.device("cuda")
elif mem_type == "hpu":
device = torch.device("hpu")
elif mem_type in ("cpu", "dram"):
device = torch.device("cpu")
else:
Expand Down Expand Up @@ -95,6 +97,8 @@ def destroy(self):
if hasattr(self.buf, "is_cuda") and self.buf.is_cuda:
del self.buf
torch.cuda.empty_cache()
elif hasattr(self.buf, "is_hpu") and self.buf.is_hpu:
del self.buf


class CTPerftest:
Expand Down Expand Up @@ -122,6 +126,15 @@ def __init__(
logger.warning(
"Cuda buffers detected, but the env var CUDA_VISIBLE_DEVICES is not set, this will cause every process in the same host to use the same GPU device."
)

if (
not os.environ.get("HABANA_VISIBLE_MODULES")
and self.traffic_pattern.mem_type == "hpu"
):
logger.warning(
"HPU buffers detected, but the env var HABANA_VISIBLE_MODULES is not set, this will cause every process in the same host to use the same HPU device."
)


"""Initialize the buffers, one big send and recv buffer is used for all the transfers
it has to be chunked inside each transfer to get buffers per ranks
Expand Down Expand Up @@ -250,7 +263,7 @@ def _warmup(
self,
iters=15,
fill_value: int = 100000,
mem_type: Literal["cuda", "vram", "cpu", "dram"] = "cuda",
mem_type: Literal["cuda", "vram", "cpu", "dram", "hpu"] = "cuda",
):
full_matrix = np.full((self.world_size, self.world_size), fill_value=fill_value)
tp = TrafficPattern(matrix=full_matrix, mem_type=mem_type)
Expand Down
9 changes: 9 additions & 0 deletions benchmark/kvbench/test/sequential_custom_traffic_perftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,15 @@ def __init__(
logger.warning(
"Cuda buffers detected, but the env var CUDA_VISIBLE_DEVICES is not set, this will cause every process in the same host to use the same GPU device."
)

if (
not os.environ.get("HABANA_VISIBLE_MODULES")
and self.traffic_pattern.mem_type == "hpu"
):
logger.warning(
"HPU buffers detected, but the env var HABANA_VISIBLE_MODULES is not set, this will cause every process in the same host to use the same HPU device."
)

assert "UCX" in self.nixl_agent.get_plugin_list(), "UCX plugin is not loaded"

# NixlBuffer caches buffers and reuse them if they are big enough, let's initialize them once, with the largest needed size
Expand Down
2 changes: 1 addition & 1 deletion benchmark/kvbench/test/traffic_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class TrafficPattern:
"""

matrix: np.ndarray
mem_type: Literal["cuda", "vram", "cpu", "dram"]
mem_type: Literal["cuda", "vram", "cpu", "dram", "hpu"]
xfer_op: Literal["WRITE", "READ"] = "WRITE"
shards: int = 1
dtype: torch.dtype = torch.int8
Expand Down
120 changes: 107 additions & 13 deletions benchmark/nixlbench/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,87 @@ if cuda_available
endif
endif

# SynapseAI (Habana Gaudi) dependency detection
synapse_inc_path = get_option('synapsepath_inc')
synapse_lib_path = get_option('synapsepath_lib')

if synapse_lib_path == ''
#use default path
# Try to find both libSynapse and hl-thunk libraries
synapse_lib = cpp.find_library('Synapse',
dirs: ['/usr/lib/habanalabs', '/usr/local/lib/habanalabs'],
required: false)
hlthunk_lib = cpp.find_library('hl-thunk',
dirs: ['/usr/lib/habanalabs', '/usr/local/lib/habanalabs'],
required: false)
else
synapse_lib = cpp.find_library('Synapse',
dirs: [synapse_lib_path],
required: false)
hlthunk_lib = cpp.find_library('hl-thunk',
dirs: [synapse_lib_path],
required: false)
endif

if synapse_inc_path == ''
#use default path
synapse_inc_path = '/usr/include/habanalabs/'
endif

# SynapseAI support requires both libraries
synapseai_dep = dependency('', required: false) # Initialize as not found
if synapse_lib.found() and hlthunk_lib.found()
synapseai_dep = declare_dependency(dependencies: [synapse_lib, hlthunk_lib])
elif hlthunk_lib.found()
# Fallback to just hl-thunk if libSynapse not available
synapseai_dep = hlthunk_lib
endif

if synapseai_dep.found()
# Create proper dependency with include paths (including DRM path for habanalabs headers)
synapseai_dep = declare_dependency(
dependencies: synapseai_dep,
include_directories: [
include_directories('/usr/include/drm'),
include_directories(synapse_inc_path)
]
)
message('Found SynapseAI support for Habana Gaudi devices')
synapseai_available = true
else
synapseai_available = false
warning('SynapseAI not found. Habana Gaudi device support will be disabled.')
endif

# GFlags
gflags_dep = dependency('gflags', required: true)

# OpenMP
openmp_dep = dependency('openmp', required: true)

# Check for etcd-cpp-api - use multiple methods for discovery
# Try pkg-config first
etcd_dep = dependency('etcd-cpp-api', required : false)
if not etcd_dep.found()
# Fallback: manual configuration
# message('etcd-cpp-api not found via pkg-config, using manual configuration')

# Ensure etcd is available
# Check if we have the library files
etcd_lib = meson.get_compiler('cpp').find_library('etcd-cpp-api',
dirs: ['/usr/local/lib'],
required: false)

if etcd_lib.found()
etcd_dep = declare_dependency(
include_directories: include_directories('/usr/local/include'),
dependencies: [etcd_lib],
# Add any required dependencies for etcd-cpp-api
link_args: [] # Add any additional link args if needed
)
message('etcd-cpp-api found manually in /usr/local/lib')
else
etcd_dep = disabler()
message('etcd-cpp-api not found anywhere')
endif
endif
etcd_available = etcd_dep.found()
if etcd_available
add_project_arguments('-DHAVE_ETCD', language: 'cpp')
Expand Down Expand Up @@ -148,7 +219,14 @@ if cuda_fabric_available
add_project_arguments('-DHAVE_CUDA_FABRIC', language: 'cpp')
endif

if synapseai_available
add_project_arguments('-DHAVE_SYNAPSEAI', language: 'cpp')
endif

# Subprojects
if synapseai_available
subdir('src/synapseai')
endif
subdir('src/utils')
subdir('src/runtime')
subdir('src/worker')
Expand All @@ -161,6 +239,7 @@ configure_file(
'HAVE_NVSHMEM': nvshmem_available ? '1' : '0',
'HAVE_CUDA': cuda_available ? '1' : '0',
'HAVE_CUDA_FABRIC': cuda_fabric_available ? '1' : '0',
'HAVE_SYNAPSEAI': synapseai_available ? '1' : '0',
},
install: true,
install_dir: get_option('includedir') / 'nixlbench'
Expand All @@ -174,6 +253,11 @@ endif
if cuda_available
deps += [cuda_dep]
endif

if synapseai_available
deps += [synapseai_dep]
message('add synapseai_dep')
endif
if nvshmem_available
deps += [nvshmem_lib]
args += [
Expand All @@ -185,9 +269,9 @@ if nvshmem_available
]
endif

if not etcd_available
error('No runtime available or not found')
endif
#if not etcd_available
# error('No runtime available or not found')
#endif

if nvshmem_available
# Use nvcc directly for compilation and linking
Expand Down Expand Up @@ -240,11 +324,21 @@ if nvshmem_available
install_dir: get_option('bindir'),
depends: [nixlbench_runtimes, utils_lib, worker_libs])
else
executable('nixlbench', 'src/main.cpp',
include_directories: inc_dir,
link_with: [nixlbench_runtimes, utils_lib, worker_libs],
dependencies: deps,
link_args: args,
install: true,
install_dir: get_option('bindir'))
if synapseai_available
executable('nixlbench', 'src/main.cpp',
include_directories: inc_dir,
link_with: [nixlbench_runtimes, utils_lib, worker_libs, synapseaiutils_lib],
dependencies: deps,
link_args: args,
install: true,
install_dir: get_option('bindir'))
else
executable('nixlbench', 'src/main.cpp',
include_directories: inc_dir,
link_with: [nixlbench_runtimes, utils_lib, worker_libs],
dependencies: deps,
link_args: args,
install: true,
install_dir: get_option('bindir'))
endif
endif
2 changes: 2 additions & 0 deletions benchmark/nixlbench/meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ option('etcd_lib_path', type: 'string', value: '', description: 'Path to ETCD C+
option('nixl_path', type: 'string', value: '/usr/local', description: 'Path to NiXL')
option('nvshmem_inc_path', type: 'string', value: '', description: 'Path to NVSHMEM include directory')
option('nvshmem_lib_path', type: 'string', value: '', description: 'Path to NVSHMEM library directory')
option('synapsepath_inc', type: 'string', value: '', description: 'Include path for Intel Gaudi/ HPU')
option('synapsepath_lib', type: 'string', value: '', description: 'Library path for Intel Gaudi/ HPU')
34 changes: 34 additions & 0 deletions benchmark/nixlbench/src/synapseai/meson.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Helper library wrapping SynapseAI (Intel Gaudi / HPU) support for nixlbench.
# NOTE(review): this subdir is only entered when synapseai_available is true,
# so `synapseai_dep` and `inc_dir` are assumed to be defined by the parent
# meson.build — confirm against benchmark/nixlbench/meson.build.
synapseaiutils_sources = [
'synapse_utils.cpp',
'synapse_utils.h',
]

# Propagates the SynapseAI libraries (libSynapse / hl-thunk) and include paths
# detected by the top-level build.
synapseaiutils_deps = [
synapseai_dep
]

# Static helper library; the top-level build links it into the nixlbench
# executable via link_with when SynapseAI is available.
synapseaiutils_lib = static_library('synapseaiutils',
synapseaiutils_sources,
dependencies: synapseaiutils_deps,
include_directories: inc_dir
)
# Convenience dependency so other targets can consume the helpers (library,
# transitive deps, and headers) as a single unit.
synapseaiutils_dep = declare_dependency(
link_with: synapseaiutils_lib,
dependencies: synapseaiutils_deps,
include_directories: inc_dir
)
Loading