// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "insert_nested_NVTX_range_guard.h"

#include <cub/device/dispatch/dispatch_batch_memcpy.cuh>

#include <thrust/detail/raw_pointer_cast.h>

#include <cuda/std/array>
#include <cuda/std/cstdint>

#include <c2h/catch2_test_helper.h>

using namespace cub;

// TODO(bgruber): drop this test with CCCL 4.0 when we drop the batch memcpy dispatcher after publishing the tuning API
// Hand-rolled policy hub handed to DispatchBatchMemcpy by the test below, proving that a
// user-supplied hub is honored in place of CUB's built-in batch-memcpy tunings.
// BufferOffsetT / BlockOffsetT parameterize the delay constructors used by the two agents.
template <class BufferOffsetT, class BlockOffsetT>
struct my_policy_hub
{
  // Tuning knobs for the small-buffer agent
  static constexpr uint32_t BLOCK_THREADS         = 128U;
  static constexpr uint32_t BUFFERS_PER_THREAD    = 4U;
  static constexpr uint32_t TLEV_BYTES_PER_THREAD = 8U;

  // Tuning knobs for the large-buffer agent
  static constexpr uint32_t LARGE_BUFFER_BLOCK_THREADS   = 256U;
  static constexpr uint32_t LARGE_BUFFER_BYTES_PER_THREAD = 32U;

  // Byte-size cutoffs separating the copy strategies; fed into AgentBatchMemcpyPolicy below.
  // NOTE(review): the exact semantics of each threshold are interpreted by the agent — confirm there.
  static constexpr uint32_t WARP_LEVEL_THRESHOLD  = 128;
  static constexpr uint32_t BLOCK_LEVEL_THRESHOLD = 8 * 1024;

  // Delay constructors for the decoupled look-back of the buffer and block scans, respectively
  using buff_delay_constructor_t  = cub::detail::default_delay_constructor_t<BufferOffsetT>;
  using block_delay_constructor_t = cub::detail::default_delay_constructor_t<BlockOffsetT>;

  // from Policy500 of the CUB batch memcpy tunings
  // Single-entry policy chain: MaxPolicy is its own predecessor, so it applies to all architectures.
  struct MaxPolicy : ChainedPolicy<500, MaxPolicy, MaxPolicy>
  {
    // Policy for buffers handled at thread/warp level
    using AgentSmallBufferPolicyT = cub::detail::batch_memcpy::AgentBatchMemcpyPolicy<
      BLOCK_THREADS,
      BUFFERS_PER_THREAD,
      TLEV_BYTES_PER_THREAD,
      /* PREFER_POW2_BITS */ true,
      LARGE_BUFFER_BLOCK_THREADS * LARGE_BUFFER_BYTES_PER_THREAD,
      WARP_LEVEL_THRESHOLD,
      BLOCK_LEVEL_THRESHOLD,
      buff_delay_constructor_t,
      block_delay_constructor_t>;

    // Policy for buffers large enough to be copied by whole thread blocks
    using AgentLargeBufferPolicyT =
      cub::detail::batch_memcpy::agent_large_buffer_policy<LARGE_BUFFER_BLOCK_THREADS, LARGE_BUFFER_BYTES_PER_THREAD>;
  };
};
| 53 | + |
C2H_TEST("DispatchBatchMemcpy::Dispatch: custom policy hub", "[device][memcpy]")
{
  // Runs a batch memcpy through DispatchBatchMemcpy with the custom policy hub above and
  // checks every output buffer matches its input buffer.
  using byte_t          = cuda::std::uint8_t;
  using size_type       = cuda::std::uint32_t;
  using block_offset_t  = cuda::std::uint32_t;
  using buffer_offset_t = cub::detail::batch_memcpy::per_invocation_buffer_offset_t;

  // Mix of buffer sizes around the policy's 128-byte and 8 KiB thresholds
  const cuda::std::array<size_type, 5> sizes{3, 128, 512, 4096, 9000};
  const size_t num_buffers = sizes.size();

  c2h::host_vector<c2h::device_vector<byte_t>> src_buffers(num_buffers);
  c2h::host_vector<c2h::device_vector<byte_t>> dst_buffers(num_buffers);

  // Host-side staging for the pointer/size arrays the dispatcher consumes
  c2h::host_vector<byte_t*> host_src_ptrs(num_buffers);
  c2h::host_vector<byte_t*> host_dst_ptrs(num_buffers);
  c2h::host_vector<size_type> host_sizes(num_buffers);

  for (size_t i = 0; i < num_buffers; ++i)
  {
    const size_type num_bytes = sizes[i];
    src_buffers[i].resize(num_bytes);
    dst_buffers[i].resize(num_bytes);
    c2h::gen(C2H_SEED(1), src_buffers[i]); // fill the source with random bytes

    host_src_ptrs[i] = thrust::raw_pointer_cast(src_buffers[i].data());
    host_dst_ptrs[i] = thrust::raw_pointer_cast(dst_buffers[i].data());
    host_sizes[i]    = num_bytes;
  }

  // Move the pointer/size arrays to the device
  c2h::device_vector<byte_t*> dev_src_ptrs   = host_src_ptrs;
  c2h::device_vector<byte_t*> dev_dst_ptrs   = host_dst_ptrs;
  c2h::device_vector<size_type> dev_sizes    = host_sizes;

  using policy_hub_t = my_policy_hub<buffer_offset_t, block_offset_t>;
  using dispatch_t =
    cub::detail::DispatchBatchMemcpy<byte_t**, byte_t**, size_type*, block_offset_t, CopyAlg::Memcpy, policy_hub_t>;

  // First call: query the temporary storage requirement (null storage pointer)
  size_t temp_size = 0;
  dispatch_t::Dispatch(
    nullptr,
    temp_size,
    thrust::raw_pointer_cast(dev_src_ptrs.data()),
    thrust::raw_pointer_cast(dev_dst_ptrs.data()),
    thrust::raw_pointer_cast(dev_sizes.data()),
    static_cast<cuda::std::int64_t>(num_buffers),
    /* stream */ nullptr);
  c2h::device_vector<::cuda::std::uint8_t> temp_storage(temp_size, thrust::no_init);
  // Second call: perform the actual batched copy
  dispatch_t::Dispatch(
    thrust::raw_pointer_cast(temp_storage.data()),
    temp_size,
    thrust::raw_pointer_cast(dev_src_ptrs.data()),
    thrust::raw_pointer_cast(dev_dst_ptrs.data()),
    thrust::raw_pointer_cast(dev_sizes.data()),
    static_cast<cuda::std::int64_t>(num_buffers),
    /* stream */ nullptr);

  // Copy each pair back to the host and compare byte-for-byte
  for (size_t i = 0; i < num_buffers; ++i)
  {
    c2h::host_vector<byte_t> expected(src_buffers[i]);
    c2h::host_vector<byte_t> actual(dst_buffers[i]);
    REQUIRE(actual == expected);
  }
}