
Commit 4e6119d

Merge branch 'main' into stf_fix_place_equality
2 parents b012af2 + 7942d8c commit 4e6119d

File tree: 6 files changed (+227 / -47 lines)


cub/cub/device/device_transform.cuh

Lines changed: 103 additions & 0 deletions
@@ -108,7 +108,110 @@ private:
   }
   }

+  // TODO(bgruber): we want to eventually forward the output tuple to the kernel and optimize writing multiple streams
+  template <detail::transform::requires_stable_address StableAddress = detail::transform::requires_stable_address::no,
+            typename... RandomAccessIteratorsIn,
+            typename... RandomAccessIteratorsOut,
+            typename NumItemsT,
+            typename Predicate,
+            typename TransformOp,
+            typename Env>
+  CUB_RUNTIME_FUNCTION static cudaError_t TransformInternal(
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    ::cuda::std::tuple<RandomAccessIteratorsOut...> outputs,
+    NumItemsT num_items,
+    Predicate predicate,
+    TransformOp transform_op,
+    Env env)
+  {
+    return TransformInternal<StableAddress>(
+      ::cuda::std::move(inputs),
+      ::cuda::make_zip_iterator(::cuda::std::move(outputs)),
+      num_items,
+      ::cuda::std::move(predicate),
+      ::cuda::std::move(transform_op),
+      ::cuda::std::move(env));
+  }
+
 public:
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //! Transforms many input sequences into many output sequences, by applying a transformation operation on corresponding
+  //! input elements and writing the tuple result to the corresponding output elements. No guarantee is given on the
+  //! identity (i.e. address) of the objects passed to the call operator of the transformation operation.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
+  //!    :language: c++
+  //!    :dedent:
+  //!    :start-after: example-begin transform-many-many
+  //!    :end-before: example-end transform-many-many
+  //!
+  //! @endrst
+  //!
+  //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The
+  //! iterators' value types must be trivially relocatable.
+  //! @param outputs A tuple of iterators to the output sequences where num_items results are written to each. Each
+  //! sequence may point to the beginning of one of the input sequences, performing the transformation in place. Any
+  //! output sequence must not overlap with any of the input sequences in any other way.
+  //! @param num_items The number of elements in each input and output sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be a tuple where each tuple element is assignable to the corresponding dereferenced output
+  //! iterator.
+  //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
+  //! stream\ :sub:`0`
+  template <typename... RandomAccessIteratorsIn,
+            typename... RandomAccessIteratorsOut,
+            typename NumItemsT,
+            typename TransformOp,
+            typename Env = ::cuda::std::execution::env<>>
+  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    ::cuda::std::tuple<RandomAccessIteratorsOut...> outputs,
+    NumItemsT num_items,
+    TransformOp transform_op,
+    Env env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
+    return TransformInternal(
+      ::cuda::std::move(inputs),
+      ::cuda::std::move(outputs),
+      num_items,
+      detail::transform::always_true_predicate{},
+      ::cuda::std::move(transform_op),
+      ::cuda::std::move(env));
+  }
+
+#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
+  // Overload with additional parameters to specify temporary storage. Provided for compatibility with other CUB APIs.
+  template <typename... RandomAccessIteratorsIn,
+            typename... RandomAccessIteratorsOut,
+            typename NumItemsT,
+            typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    ::cuda::std::tuple<RandomAccessIteratorsOut...> outputs,
+    NumItemsT num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return Transform(
+      ::cuda::std::move(inputs), ::cuda::std::move(outputs), num_items, ::cuda::std::move(transform_op), stream);
+  }
+#endif // _CCCL_DOXYGEN_INVOKED
+
   //! @rst
   //! Overview
   //! +++++++++++++++++++++++++++++++++++++++++++++
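
For orientation, a call to the new many-to-many overload has the following shape. This is a minimal sketch adapted from the transform-many-many example test added in this commit (see catch2_test_device_transform_api.cu below); variable and function names are illustrative only, and the `__device__` lambda needs nvcc's extended-lambda mode:

#include <cub/device/device_transform.cuh>

#include <thrust/device_vector.h>

#include <cuda/std/tuple>

// Sketch only: two input sequences feed one functor whose tuple result is
// scattered to two output sequences in a single pass.
void transform_many_many_sketch()
{
  thrust::device_vector<int> in1{0, -1, 2, -3, 4, -5};
  thrust::device_vector<double> in2{5.2, 3.1, -1.1, 3.0, 3.2, 0.0};
  thrust::device_vector<double> out1(in1.size());
  thrust::device_vector<bool> out2(in1.size());

  // One parameter per input sequence; one tuple element per output sequence.
  auto op = [] __device__(int a, double b) -> cuda::std::tuple<double, bool> {
    const double product = a * b;
    return {product, product < 0};
  };

  cub::DeviceTransform::Transform(
    cuda::std::tuple{in1.begin(), in2.begin()},   // inputs
    cuda::std::tuple{out1.begin(), out2.begin()}, // outputs
    in1.size(),
    op);
}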

cub/test/catch2_test_device_transform.cu

Lines changed: 27 additions & 7 deletions
@@ -435,22 +435,42 @@ C2H_TEST("DeviceTransform::Transform fancy output iterator type with void value
   REQUIRE(result == c2h::device_vector<type>(num_items, 3));
 }

-C2H_TEST("DeviceTransform::Transform mixed input iterator types", "[device][transform]")
+struct plus_mul_neg
 {
-  using type = int;
+  template <typename T>
+  __host__ __device__ auto operator()(T a, T b) const
+  {
+    return cuda::std::tuple{a + b, a * b, -a};
+  }
+};
+
+C2H_TEST("DeviceTransform::Transform mixed iterator types 2 -> 3", "[device][transform]")
+{
+  using type = unsigned; // overflow is defined
   const int num_items = GENERATE(100, 100'000); // try to hit the small and full tile code paths
   cuda::counting_iterator<type> a{0};
   c2h::device_vector<type> b(num_items, thrust::no_init);
   c2h::gen(C2H_SEED(1), b);

-  c2h::device_vector<type> result(num_items, thrust::no_init);
-  transform_many(cuda::std::make_tuple(a, b.begin()), result.begin(), num_items, cuda::std::plus<type>{});
+  c2h::device_vector<type> result_a(num_items, thrust::no_init);
+  c2h::device_vector<type> result_b(num_items, thrust::no_init);
+  c2h::device_vector<type> result_c(num_items, thrust::no_init);
+  transform_many(
+    cuda::std::make_tuple(a, b.begin()),
+    cuda::std::make_tuple(
+      result_a.begin(), result_b.begin(), thrust::make_transform_output_iterator(result_c.begin(), cuda::std::negate{})),
+    num_items,
+    plus_mul_neg{});

   // compute reference and verify
   c2h::host_vector<type> b_h = b;
-  c2h::host_vector<type> reference_h(num_items);
-  std::transform(a, a + num_items, b_h.begin(), reference_h.begin(), std::plus<type>{});
-  REQUIRE(reference_h == result);
+  c2h::host_vector<type> reference_a_h(num_items, thrust::no_init);
+  std::transform(a, a + num_items, b_h.begin(), reference_a_h.begin(), cuda::std::plus<type>{});
+  c2h::host_vector<type> reference_b_h(num_items, thrust::no_init);
+  std::transform(a, a + num_items, b_h.begin(), reference_b_h.begin(), cuda::std::multiplies<type>{});
+  CHECK(reference_a_h == result_a);
+  CHECK(reference_b_h == result_b);
+  CHECK(thrust::equal(a, a + num_items, result_c.begin()));
 }

 struct plus_needs_stable_address
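
A note on the third output stream in the test above: the functor returns -a, and the corresponding output iterator is a thrust::transform_output_iterator that applies cuda::std::negate on assignment, so result_c ends up holding a again, which is why it is compared directly against the counting iterator. A standalone sketch of that write-side composition (illustrative only, not part of the commit):

#include <thrust/device_vector.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/transform.h>

#include <cuda/std/functional>

// transform_output_iterator applies its functor to every value written through
// it, so negating an already negated value stores the original value again.
void negate_on_write_sketch()
{
  thrust::device_vector<int> in{1, 2, 3};
  thrust::device_vector<int> out(in.size());

  auto negate_on_write = thrust::make_transform_output_iterator(out.begin(), cuda::std::negate<int>{});
  thrust::transform(in.begin(), in.end(), negate_on_write, cuda::std::negate<int>{});
  // out now equals in: the two negations cancel.
}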

cub/test/catch2_test_device_transform_api.cu

Lines changed: 33 additions & 2 deletions
@@ -10,6 +10,37 @@

 #include <c2h/catch2_test_helper.h>

+// need a separate function because the ext. lambda needs to be enclosed by a function with external linkage on Windows
+void test_transform_many_many_api()
+{
+  // example-begin transform-many-many
+  auto input1 = thrust::device_vector<int>{0, -1, 2, -3, 4, -5};
+  auto input2 = thrust::device_vector<double>{5.2, 3.1, -1.1, 3.0, 3.2, 0.0};
+  auto op = [] __device__(int a, double b) -> cuda::std::tuple<double, bool> {
+    const double product = a * b;
+    return {product, product < 0};
+  };
+
+  auto result1 = thrust::device_vector<double>(input1.size(), thrust::no_init);
+  auto result2 = thrust::device_vector<bool>(input1.size(), thrust::no_init);
+  cub::DeviceTransform::Transform(
+    cuda::std::tuple{input1.begin(), input2.begin()},
+    cuda::std::tuple{result1.begin(), result2.begin()},
+    input1.size(),
+    op);
+
+  const auto expected1 = thrust::host_vector<double>{0, -3.1, -2.2, -9, 12.8, -0};
+  const auto expected2 = thrust::host_vector<bool>{false, true, true, true, false, false};
+  // example-end transform-many-many
+  CHECK(result1 == expected1);
+  CHECK(result2 == expected2);
+}
+
+C2H_TEST("DeviceTransform::Transform many->many API example", "[device][device_transform]")
+{
+  test_transform_many_many_api();
+}
+
 // need a separate function because the ext. lambda needs to be enclosed by a function with external linkage on Windows
 void test_transform_api()
 {

@@ -21,7 +52,7 @@ void test_transform_api()
     return (a + b) * c;
   };

-  auto result = thrust::device_vector<int>(input1.size());
+  auto result = thrust::device_vector<int>(input1.size(), thrust::no_init);
   cub::DeviceTransform::Transform(
     cuda::std::tuple{input1.begin(), input2.begin(), input3}, result.begin(), input1.size(), op);

@@ -74,7 +105,7 @@ void test_transform_stable_api()
     return a + input2_ptr[i];
   };

-  auto result = thrust::device_vector<int>(input1.size());
+  auto result = thrust::device_vector<int>(input1.size(), thrust::no_init);
   cub::DeviceTransform::TransformStableArgumentAddresses(
     cuda::std::tuple{input1_ptr}, result.begin(), input1.size(), op);

libcudacxx/include/cuda/std/__pstl/cuda/reduce.h

Lines changed: 52 additions & 33 deletions
@@ -41,14 +41,18 @@ _CCCL_DIAG_POP
 # include <cuda/std/__exception/cuda_error.h>
 # include <cuda/std/__execution/env.h>
 # include <cuda/std/__execution/policy.h>
+# include <cuda/std/__functional/invoke.h>
 # include <cuda/std/__iterator/distance.h>
 # include <cuda/std/__iterator/iterator_traits.h>
+# include <cuda/std/__iterator/next.h>
 # include <cuda/std/__memory/addressof.h>
 # include <cuda/std/__memory/construct_at.h>
 # include <cuda/std/__new/bad_alloc.h>
 # include <cuda/std/__numeric/reduce.h>
 # include <cuda/std/__pstl/dispatch.h>
 # include <cuda/std/__type_traits/always_false.h>
+# include <cuda/std/__type_traits/is_nothrow_constructible.h>
+# include <cuda/std/__utility/forward.h>
 # include <cuda/std/__utility/move.h>

 # include <cuda_runtime.h>

@@ -76,7 +80,8 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
     {}

     template <class _Index, class _Up>
-    _CCCL_DEVICE_API void operator()(_Index, _Up&& __value)
+    _CCCL_DEVICE_API _CCCL_FORCEINLINE void
+    operator()(_Index, _Up&& __value) noexcept(is_nothrow_constructible_v<_Tp, _Up>)
     {
       ::cuda::std::__construct_at(__ptr_, ::cuda::std::forward<_Up>(__value));
     }

@@ -97,57 +102,66 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
     _CCCL_HOST_API ~__allocation_guard()
     {
       __resource_.deallocate(__stream_, __ptr_, __num_bytes_, alignof(_Tp));
-      __stream_.sync();
     }

-    [[nodiscard]] _CCCL_HOST_API auto __get_result_iter()
+    [[nodiscard]] _CCCL_HOST_API auto __get_result_iter() noexcept
     {
       if constexpr (::cuda::std::__detail::__can_optimize_construct_at<_Tp, _AccumT>)
       {
-        return reinterpret_cast<_Tp*>(__ptr_);
+        return __ptr_;
       }
       else
       {
-        return ::cuda::tabulate_output_iterator{__construct_result{reinterpret_cast<_Tp*>(__ptr_)}};
+        return ::cuda::tabulate_output_iterator{__construct_result{__ptr_}};
       }
     }

-    [[nodiscard]] _CCCL_HOST_API void* __get_temp_storage()
+    [[nodiscard]] _CCCL_HOST_API void* __get_temp_storage() noexcept
     {
-      return static_cast<void*>(reinterpret_cast<unsigned char*>(__ptr_) + sizeof(_Tp));
+      return static_cast<void*>(__ptr_ + 1);
     }
   };

-  template <class _Policy, class _Iter, class _Tp, class _BinaryOp>
+  template <class _Policy, class _Iter, class _Size, class _Tp, class _BinaryOp>
   [[nodiscard]] _CCCL_HOST_API static _Tp
-  __par_impl([[maybe_unused]] const _Policy& __policy, _Iter __first, _Iter __last, _Tp __init, _BinaryOp __func)
+  __par_impl([[maybe_unused]] const _Policy& __policy, _Iter __first, _Size __count, _Tp __init, _BinaryOp __func)
   {
     _Tp __ret;

+    // We need to know the accumulator type to determine whether we need construct_at for the return value
+    using _AccumT = __accumulator_t<_BinaryOp, iter_reference_t<_Iter>, _Tp>;
+
+    // Determine temporary device storage requirements for reduce
+    void* __temp_storage = nullptr;
+    size_t __num_bytes = 0;
+    _CCCL_TRY_CUDA_API(
+      ::cub::DeviceReduce::Reduce,
+      "__pstl_cuda_reduce: determination of device storage for cub::DeviceReduce::Reduce failed",
+      __temp_storage,
+      __num_bytes,
+      __first,
+      static_cast<_Tp*>(nullptr),
+      __count,
+      __func,
+      __init);
+
+    // Allocate memory for result
+    auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy);
+    auto __resource = ::cuda::__call_or(
+      ::cuda::mr::get_memory_resource, ::cuda::device_default_memory_pool(__stream.device()), __policy);
+
     {
-      // We need to know the accumulator type to determine whether we need construct_at for the return value
-      using _AccumT = __accumulator_t<_BinaryOp, iter_reference_t<_Iter>, _Tp>;
-
-      //! // Determine temporary device storage requirements for reduce
-      void* __temp_storage = nullptr;
-      size_t __num_bytes = 0;
-      const auto __num_items = ::cuda::std::distance(__first, __last);
-      ::cub::DeviceReduce::Reduce(
-        __temp_storage, __num_bytes, __first, static_cast<_Tp*>(nullptr), __num_items, __func, __init);
-
-      // Allocate memory for result
-      auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy);
-      auto __resource = ::cuda::__call_or(
-        ::cuda::mr::get_memory_resource, ::cuda::device_default_memory_pool(__stream.device()), __policy);
       __allocation_guard<_Tp, _AccumT, decltype(__resource)> __guard{__stream, __resource, __num_bytes};

       // Run the reduction
-      ::cub::DeviceReduce::Reduce(
+      _CCCL_TRY_CUDA_API(
+        ::cub::DeviceReduce::Reduce,
+        "__pstl_cuda_reduce: kernel launch of cub::DeviceReduce::Reduce failed",
         __guard.__get_temp_storage(),
         __num_bytes,
         ::cuda::std::move(__first),
         __guard.__get_result_iter(),
-        __num_items,
+        __count,
         ::cuda::std::move(__func),
         ::cuda::std::move(__init),
         __stream.get());

@@ -163,23 +177,20 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
        __stream.get());
     }

+    __stream.sync();
     return __ret;
   }

-  template <class _Policy, class _Iter, class _Tp, class _BinaryOp>
+  template <class _Policy, class _Iter, class _Size, class _Tp, class _BinaryOp>
   [[nodiscard]] _CCCL_HOST_API _Tp
-  operator()([[maybe_unused]] const _Policy& __policy, _Iter __first, _Iter __last, _Tp __init, _BinaryOp __func) const
+  operator()([[maybe_unused]] const _Policy& __policy, _Iter __first, _Size __count, _Tp __init, _BinaryOp __func) const
   {
     if constexpr (::cuda::std::__has_random_access_traversal<_Iter>)
     {
       try
       {
         return __par_impl(
-          __policy,
-          ::cuda::std::move(__first),
-          ::cuda::std::move(__last),
-          ::cuda::std::move(__init),
-          ::cuda::std::move(__func));
+          __policy, ::cuda::std::move(__first), __count, ::cuda::std::move(__init), ::cuda::std::move(__func));
       }
       catch (const ::cuda::cuda_error& __err)
       {

@@ -198,9 +209,17 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
       static_assert(__always_false_v<_Policy>,
                     "__pstl_dispatch: CUDA backend of cuda::std::reduce requires at least random access iterators");
       return ::cuda::std::reduce(
-        ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__init), ::cuda::std::move(__func));
+        __first, ::cuda::std::next(__first, __count), ::cuda::std::move(__init), ::cuda::std::move(__func));
     }
   }
+
+  template <class _Policy, class _Iter, class _Tp, class _BinaryOp>
+  [[nodiscard]] _CCCL_HOST_API _Tp
+  operator()([[maybe_unused]] const _Policy& __policy, _Iter __first, _Iter __last, _Tp __init, _BinaryOp __func) const
+  {
+    const auto __count = ::cuda::std::distance(__first, __last);
+    return (*this)(__policy, ::cuda::std::move(__first), __count, ::cuda::std::move(__init), ::cuda::std::move(__func));
+  }
 };

 _CCCL_END_NAMESPACE_ARCH_DEPENDENT
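
Both _CCCL_TRY_CUDA_API call sites above follow CUB's usual two-phase convention for cub::DeviceReduce::Reduce: a first call with a null temporary-storage pointer only reports the required byte count, and a second call with allocated storage launches the reduction. A hedged standalone sketch of that pattern (buffer handling simplified, names illustrative, not the commit's code):

#include <cub/device/device_reduce.cuh>

#include <thrust/device_vector.h>

#include <cuda/std/functional>

// Two-phase CUB pattern: query the temporary-storage size, allocate it, then run.
int sum_on_device(const thrust::device_vector<int>& in, cudaStream_t stream)
{
  thrust::device_vector<int> out(1);
  const int* d_in = thrust::raw_pointer_cast(in.data());
  int* d_out = thrust::raw_pointer_cast(out.data());

  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;

  // Phase 1: d_temp_storage == nullptr, only temp_storage_bytes is written.
  cub::DeviceReduce::Reduce(
    d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<int>(in.size()), cuda::std::plus<int>{}, 0, stream);

  // Phase 2: allocate the requested storage and launch the reduction.
  thrust::device_vector<unsigned char> temp(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp.data());
  cub::DeviceReduce::Reduce(
    d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<int>(in.size()), cuda::std::plus<int>{}, 0, stream);

  cudaStreamSynchronize(stream);
  return out[0]; // device-to-host copy of the single result
}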
