diff --git a/CMakeLists.txt b/CMakeLists.txt index 6ca37091c3c..6fcd2a00087 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -606,6 +606,20 @@ if(HPX_WITH_CUDA AND HPX_WITH_HIP) ) endif() +# HPX THRUST configuration +set(HPX_WITH_THRUST_DEFAULT OFF) +if(HPX_WITH_CUDA) + set(HPX_WITH_THRUST_DEFAULT ON) +endif() + +hpx_option( + HPX_WITH_THRUST BOOL + "Enable support for NVIDIA Thrust integration (default: ON when CUDA is enabled, OFF otherwise)" + ${HPX_WITH_THRUST_DEFAULT} + ADVANCED + CATEGORY "Build Targets" +) + # ## HPX STDEXEC configuration ## hpx_option( diff --git a/libs/core/CMakeLists.txt b/libs/core/CMakeLists.txt index 3e6550e93f2..bff87d65898 100644 --- a/libs/core/CMakeLists.txt +++ b/libs/core/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 The STE||AR-Group +# Copyright (c) 2020-2023 The STE||AR-Group # # SPDX-License-Identifier: BSL-1.0 # Distributed under the Boost Software License, Version 1.0. (See accompanying @@ -79,6 +79,7 @@ set(_hpx_core_modules threading threading_base threadmanager + thrust timed_execution timing topology @@ -98,4 +99,4 @@ endforeach() set(_hpx_core_modules ${_hpx_core_modules} PARENT_SCOPE -) +) \ No newline at end of file diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/fill.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/fill.hpp index 9cd17aa0476..6445df1c6c3 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/fill.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/fill.hpp @@ -384,4 +384,4 @@ namespace hpx { } fill_n{}; } // namespace hpx -#endif // DOXYGEN +#endif // DOXYGEN \ No newline at end of file diff --git a/libs/core/execution_base/include/hpx/execution_base/any_sender.hpp b/libs/core/execution_base/include/hpx/execution_base/any_sender.hpp index d31ee22ef51..c3a13fcf117 100644 --- a/libs/core/execution_base/include/hpx/execution_base/any_sender.hpp +++ b/libs/core/execution_base/include/hpx/execution_base/any_sender.hpp @@ -36,7 +36,7 @@ namespace hpx::detail { template using empty_vtable_t = typename empty_vtable_type::type; -#if !defined(HPX_MSVC) && defined(HPX_HAVE_CXX20_TRIVIAL_VIRTUAL_DESTRUCTOR) + #if !defined(HPX_MSVC) && defined(HPX_HAVE_CXX20_TRIVIAL_VIRTUAL_DESTRUCTOR) && !defined(__CUDACC__) && !defined(HPX_COMPUTE_DEVICE_CODE) template inline constexpr empty_vtable_t empty_vtable{}; diff --git a/libs/core/modules.rst b/libs/core/modules.rst index 46dbbe21dec..9b834fe8310 100644 --- a/libs/core/modules.rst +++ b/libs/core/modules.rst @@ -81,4 +81,4 @@ Core modules /libs/core/topology/docs/index.rst /libs/core/type_support/docs/index.rst /libs/core/util/docs/index.rst - /libs/core/version/docs/index.rst + /libs/core/version/docs/index.rst \ No newline at end of file diff --git a/libs/core/thrust/CMakeLists.txt b/libs/core/thrust/CMakeLists.txt new file mode 100644 index 00000000000..97f12813515 --- /dev/null +++ b/libs/core/thrust/CMakeLists.txt @@ -0,0 +1,41 @@ +# Copyright (c) 2025 The STE||AR-Group +# +# SPDX-License-Identifier: BSL-1.0 +# Distributed under the Boost Software License, Version 1.0. (See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +if(NOT (HPX_WITH_CUDA AND HPX_WITH_THRUST)) + return() +endif() + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") + +set(thrust_headers + hpx/thrust/policy.hpp + hpx/thrust/algorithms.hpp + hpx/thrust/detail/algorithm_map.hpp +) + +# cmake-format: off +set(thrust_compat_headers) +# cmake-format: on + +set(thrust_sources) + +include(HPX_AddModule) +add_hpx_module( + core thrust + GLOBAL_HEADER_GEN ON + SOURCES ${thrust_sources} + HEADERS ${thrust_headers} + COMPAT_HEADERS ${thrust_compat_headers} + MODULE_DEPENDENCIES + hpx_algorithms + hpx_concepts + hpx_execution + hpx_execution_base + hpx_functional + hpx_futures + hpx_async_cuda + CMAKE_SUBDIRS examples tests +) diff --git a/libs/core/thrust/docs/index.rst b/libs/core/thrust/docs/index.rst new file mode 100644 index 00000000000..0bdafc866e2 --- /dev/null +++ b/libs/core/thrust/docs/index.rst @@ -0,0 +1,90 @@ +.. + Copyright (c) 2025 The STE||AR-Group + + SPDX-License-Identifier: BSL-1.0 + Distributed under the Boost Software License, Version 1.0. (See accompanying + file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +.. _modules_thrust: + +====== +thrust +====== + +The thrust module integrates |hpx| parallel algorithms with NVIDIA Thrust, +enabling GPU acceleration using familiar |hpx| algorithm syntax. + +Execution Policies +================== + +``hpx::thrust::thrust_host_policy`` + CPU execution using optimized parallel implementations + +``hpx::thrust::thrust_device_policy`` + Synchronous GPU execution + +``hpx::thrust::thrust_task_policy`` + Asynchronous GPU execution returning |hpx| futures + +Policy Mappings +=============== + +The |hpx| thrust policies map directly to Thrust execution policies: + +.. code-block:: c++ + + // Policy mappings + hpx::thrust::thrust_host_policy -> thrust::host + hpx::thrust::thrust_device_policy -> thrust::device + hpx::thrust::thrust_task_policy -> thrust::par_nosync + +Usage +===== + +Synchronous Device Execution +----------------------------- + +.. code-block:: c++ + + #include + #include + #include + + hpx::thrust::thrust_device_policy device{}; + thrust::device_vector d_vec(1000, 0); + + hpx::fill(device, d_vec.begin(), d_vec.end(), 42); + int sum = hpx::reduce(device, d_vec.begin(), d_vec.end(), 0); + +Asynchronous Execution with Futures +------------------------------------ + +.. code-block:: c++ + + #include + #include + + int hpx_main() { + // Required for async operations + hpx::cuda::experimental::enable_user_polling polling_guard("default"); + + hpx::thrust::thrust_task_policy task{}; + thrust::device_vector d_vec(1000, 1); + + auto fill_future = hpx::fill(task, d_vec.begin(), d_vec.end(), 42); + fill_future.get(); // Standard HPX future operations + + } + +Build Requirements +================== + +* CUDA Toolkit 12.4.0+ +* |hpx| with ``HPX_WITH_CUDA=ON`` +* Enable with: ``cmake -DHPX_WITH_CUDA=ON ...`` + +See Also +======== + +* :ref:`modules_async_cuda` - CUDA integration +* :ref:`modules_execution` - |hpx| execution policies diff --git a/libs/core/thrust/examples/CMakeLists.txt b/libs/core/thrust/examples/CMakeLists.txt new file mode 100644 index 00000000000..b10e440a70f --- /dev/null +++ b/libs/core/thrust/examples/CMakeLists.txt @@ -0,0 +1,34 @@ +# Copyright (c) 2025 Aditya Sapra +# +# SPDX-License-Identifier: BSL-1.0 +# Distributed under the Boost Software License, Version 1.0. (See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +if(HPX_WITH_EXAMPLES) + add_hpx_pseudo_target(examples.modules.thrust) + add_hpx_pseudo_dependencies(examples.modules examples.modules.thrust) + if(HPX_WITH_CUDA AND HPX_WITH_THRUST) + set(thrust_examples device_fill host_fill device_copy device_transform async_fill_n reverse_reduce) + + foreach(example ${thrust_examples}) + set(source ${example}.cu) + source_group("Source Files" FILES ${source}) + + add_hpx_executable( + ${example} INTERNAL_FLAGS + SOURCES ${source} + EXCLUDE_FROM_ALL + HPX_PREFIX ${HPX_BUILD_PREFIX} + FOLDER "Examples/Modules/Core/Thrust" + ) + + add_hpx_pseudo_dependencies(examples.modules.thrust ${example}) + endforeach() + endif() + if(HPX_WITH_TESTS AND HPX_WITH_TESTS_EXAMPLES) + add_hpx_pseudo_target(tests.examples.modules.thrust) + add_hpx_pseudo_dependencies( + tests.examples.modules tests.examples.modules.thrust + ) + endif() +endif() diff --git a/libs/core/thrust/examples/async_fill_n.cu b/libs/core/thrust/examples/async_fill_n.cu new file mode 100644 index 00000000000..e5072f64e7d --- /dev/null +++ b/libs/core/thrust/examples/async_fill_n.cu @@ -0,0 +1,35 @@ +// Copyright (c) 2025 Aditya Sapra +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// + +#include +#include +#include + +#include +#include + +#include +#include +#include + +int hpx_main(int, char**) +{ + // Enable CUDA event polling on the "default" pool for HPX <-> CUDA future bridging + hpx::cuda::experimental::enable_user_polling polling_guard("default"); + + hpx::thrust::thrust_task_policy task{}; + thrust::device_vector v(1024, 0); + auto f = hpx::fill_n(task, v.begin(), 512, 9); + f.get(); + return hpx::local::finalize(); +} + +int main(int argc, char** argv) +{ + hpx::local::init_params init_args; + return hpx::local::init(hpx_main, argc, argv, init_args); +} diff --git a/libs/core/thrust/examples/device_copy.cu b/libs/core/thrust/examples/device_copy.cu new file mode 100644 index 00000000000..950f165c940 --- /dev/null +++ b/libs/core/thrust/examples/device_copy.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2025 Aditya Sapra +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// + +#include +#include +#include + +#include +#include + +#include + +int hpx_main(int, char**) +{ + hpx::thrust::thrust_device_policy device{}; + thrust::device_vector src(1024, 5); + thrust::device_vector dst(1024, 0); + hpx::copy(device, src.begin(), src.end(), dst.begin()); + return hpx::local::finalize(); +} + +int main(int argc, char** argv) +{ + hpx::local::init_params init_args; + return hpx::local::init(hpx_main, argc, argv, init_args); +} diff --git a/libs/core/thrust/examples/device_fill.cu b/libs/core/thrust/examples/device_fill.cu new file mode 100644 index 00000000000..521df44075d --- /dev/null +++ b/libs/core/thrust/examples/device_fill.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2025 Aditya Sapra +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// + +#include +#include +#include + +#include +#include + +#include + +int hpx_main(int, char**) +{ + hpx::thrust::thrust_device_policy device{}; + thrust::device_vector v(1024, 0); + hpx::fill(device, v.begin(), v.end(), 1); + return hpx::local::finalize(); +} + +int main(int argc, char** argv) +{ + hpx::local::init_params init_args; + return hpx::local::init(hpx_main, argc, argv, init_args); +} diff --git a/libs/core/thrust/examples/device_transform.cu b/libs/core/thrust/examples/device_transform.cu new file mode 100644 index 00000000000..621bdf91240 --- /dev/null +++ b/libs/core/thrust/examples/device_transform.cu @@ -0,0 +1,33 @@ +// Copyright (c) 2025 Aditya Sapra +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// + +#include +#include +#include + +#include +#include + +#include + +int hpx_main(int, char**) +{ + hpx::thrust::thrust_device_policy device{}; + thrust::device_vector a(1024, 2); + thrust::device_vector b(1024, 3); + thrust::device_vector out(1024, 0); + + hpx::transform(device, a.begin(), a.end(), b.begin(), out.begin(), + [] __device__(int x, int y) { return x + y; }); + return hpx::local::finalize(); +} + +int main(int argc, char** argv) +{ + hpx::local::init_params init_args; + return hpx::local::init(hpx_main, argc, argv, init_args); +} diff --git a/libs/core/thrust/examples/host_fill.cu b/libs/core/thrust/examples/host_fill.cu new file mode 100644 index 00000000000..9da1759c2a2 --- /dev/null +++ b/libs/core/thrust/examples/host_fill.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2025 Aditya Sapra +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// + +#include +#include +#include + +#include +#include + +#include + +int hpx_main(int, char**) +{ + hpx::thrust::thrust_host_policy host{}; + std::vector v(1024, 0); + hpx::fill(host, v.begin(), v.end(), 42); + return hpx::local::finalize(); +} + +int main(int argc, char** argv) +{ + hpx::local::init_params init_args; + return hpx::local::init(hpx_main, argc, argv, init_args); +} diff --git a/libs/core/thrust/examples/reverse_reduce.cu b/libs/core/thrust/examples/reverse_reduce.cu new file mode 100644 index 00000000000..e69e21a0f51 --- /dev/null +++ b/libs/core/thrust/examples/reverse_reduce.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2025 Aditya Sapra +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// + +#include +#include +#include + +#include +#include + +#include + +int hpx_main(int, char**) +{ + hpx::thrust::thrust_device_policy device{}; + thrust::device_vector v(10); + for (int i = 0; i < 10; ++i) + v[i] = i + 1; + + hpx::reverse(device, v.begin(), v.end()); + int sum = hpx::reduce(device, v.begin(), v.end(), 0); + (void) sum; + return hpx::local::finalize(); +} + +int main(int argc, char** argv) +{ + hpx::local::init_params init_args; + return hpx::local::init(hpx_main, argc, argv, init_args); +} diff --git a/libs/core/thrust/include/hpx/thrust/algorithms.hpp b/libs/core/thrust/include/hpx/thrust/algorithms.hpp new file mode 100644 index 00000000000..7c935a5b1f9 --- /dev/null +++ b/libs/core/thrust/include/hpx/thrust/algorithms.hpp @@ -0,0 +1,58 @@ +// Copyright (c) 2025 Aditya Sapra +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace hpx::thrust { + + template + requires( + is_thrust_execution_policy_v> && + detail::is_algorithm_mapped) + decltype(auto) tag_invoke(HPXTag tag, ThrustPolicy&& policy, Args&&... args) + { + if constexpr (hpx::is_async_execution_policy_v< + std::decay_t>) + { + if constexpr (std::is_void_v::invoke(std::declval(), + std::declval()...))>) + { + detail::algorithm_map::invoke( + HPX_FORWARD(ThrustPolicy, policy), + HPX_FORWARD(Args, args)...); + return policy.get_future(); // future + } + else + { + auto result = detail::algorithm_map::invoke( + HPX_FORWARD(ThrustPolicy, policy), + HPX_FORWARD(Args, args)...); + return policy.get_future().then( + [result = HPX_MOVE(result)](auto&& f) mutable { + f.get(); + return HPX_MOVE(result); + }); // future + } + } + else + { + return detail::algorithm_map::invoke( + HPX_FORWARD(ThrustPolicy, policy), HPX_FORWARD(Args, args)...); + } + } +} // namespace hpx::thrust diff --git a/libs/core/thrust/include/hpx/thrust/detail/algorithm_map.hpp b/libs/core/thrust/include/hpx/thrust/detail/algorithm_map.hpp new file mode 100644 index 00000000000..1008f4ef103 --- /dev/null +++ b/libs/core/thrust/include/hpx/thrust/detail/algorithm_map.hpp @@ -0,0 +1,688 @@ +// Copyright (c) 2025 Aditya Sapra +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// + +#pragma once + +// HPX algorithm headers for tag types +#include +#include +#include + +// Centralized Thrust algorithm headers +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include // HPX_FORWARD +#include +#include + +namespace hpx::thrust::detail { + + template + struct + algorithm_map; // No definition = compilation error for unmapped algorithms + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::fill(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::fill_n(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::copy(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::transform( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::for_each(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::reduce(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::sort(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::find(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::count(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::unique(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::reverse(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::generate(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::generate_n( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::remove(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::remove_if( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::replace(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::replace_if( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::merge(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::partition( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::transform_reduce( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::copy_n(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::copy_if(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::count_if(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::find_if(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::find_if_not( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::stable_sort( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::stable_partition( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::partition_copy( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::remove_copy( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::remove_copy_if( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::replace_copy( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::replace_copy_if( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::reverse_copy( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::unique_copy( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::adjacent_difference( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::transform_inclusive_scan( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::transform_exclusive_scan( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::equal(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::mismatch(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::min_element( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::max_element( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::minmax_element( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::all_of(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::any_of(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::none_of(policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::swap_ranges( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::uninitialized_copy( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::uninitialized_copy_n( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::uninitialized_fill( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::uninitialized_fill_n( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::set_union( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::set_intersection( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::set_difference( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::set_symmetric_difference( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::for_each_n( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::is_sorted( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::is_partitioned( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::inclusive_scan( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + template <> + struct algorithm_map + { + template + static constexpr decltype(auto) invoke(Policy&& policy, Args&&... args) + { + return ::thrust::exclusive_scan( + policy.get(), HPX_FORWARD(Args, args)...); + } + }; + + // boolean predicate indicating whether a mapping exists + template + concept is_algorithm_mapped = requires { + algorithm_map::invoke( + std::declval(), std::declval()...); + }; + +} // namespace hpx::thrust::detail diff --git a/libs/core/thrust/include/hpx/thrust/policy.hpp b/libs/core/thrust/include/hpx/thrust/policy.hpp new file mode 100644 index 00000000000..914e5abb969 --- /dev/null +++ b/libs/core/thrust/include/hpx/thrust/policy.hpp @@ -0,0 +1,603 @@ +// Copyright (c) 2025 Aditya Sapra +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +// Required HPX execution headers +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace hpx::thrust { + + struct thrust_task_policy; //for async ops + template + struct thrust_task_policy_shim; + + struct thrust_policy; + template + struct thrust_policy_shim; + + struct thrust_host_policy; + struct thrust_device_policy; + + struct thrust_task_policy + { + using executor_type = hpx::execution::parallel_executor; + using executor_parameters_type = + hpx::execution::experimental::extract_executor_parameters< + executor_type>::type; + using execution_category = hpx::execution::parallel_execution_tag; + + template + struct rebind + { + using type = thrust_task_policy_shim; + }; + + constexpr thrust_task_policy() {} + + thrust_task_policy operator()( + hpx::execution::experimental::to_task_t) const + { + return *this; + } + + thrust_task_policy_shim on( + hpx::cuda::experimental::target const& t) const; + + // HPX execution policy interface + executor_type executor() const + { + return executor_type{}; + } + + executor_parameters_type& parameters() + { + return params_; + } + constexpr executor_parameters_type const& parameters() const + { + return params_; + } + + // Async helpers with default-target fallback for base policy + bool has_target() const + { + return false; + } + + hpx::cuda::experimental::target const& target_or_default() const + { + return hpx::cuda::experimental::get_default_target(); + } + + cudaStream_t stream() const + { + return target_or_default().native_handle().get_stream(); + } + + auto get() const + { + return ::thrust::cuda::par_nosync.on(stream()); + } + + hpx::future get_future() const + { + return target_or_default().get_future_with_event(); + } + + private: + executor_parameters_type params_{}; + }; + + template + struct thrust_task_policy_shim : thrust_task_policy + { + using executor_type = Executor; + using executor_parameters_type = Parameters; + using execution_category = + typename hpx::traits::executor_execution_category< + executor_type>::type; + + template + struct rebind + { + using type = thrust_task_policy_shim; + }; + + thrust_task_policy_shim operator()( + hpx::execution::experimental::to_task_t) const + { + return *this; + } + + // Bind a CUDA target explicitly for async GPU execution (returns a new shim) + thrust_task_policy_shim on( + hpx::cuda::experimental::target const& t) const + { + thrust_task_policy_shim copy = *this; + copy.bound_target_ = + std::make_shared(t); + return copy; + } + + // Async helpers with default-target fallback + bool has_target() const + { + return static_cast(bound_target_); + } + + hpx::cuda::experimental::target const& target_or_default() const + { + return bound_target_ ? + *bound_target_ : + hpx::cuda::experimental::get_default_target(); + } + + cudaStream_t stream() const + { + return target_or_default().native_handle().get_stream(); + } + + auto get() const + { + return ::thrust::cuda::par_nosync.on(stream()); + } + + hpx::future get_future() const + { + return target_or_default().get_future_with_event(); + } + + // HPX execution policy interface for shim + Executor& executor() + { + return exec_; + } + Executor const& executor() const + { + return exec_; + } + + Parameters& parameters() + { + return params_; + } + Parameters const& parameters() const + { + return params_; + } + + template ::value && + std::is_constructible::value, + Dependent>::type> + constexpr thrust_task_policy_shim() + { + } + + template + constexpr thrust_task_policy_shim( + Executor_&& exec, Parameters_&& params) + : exec_(std::forward(exec)) + , params_(std::forward(params)) + { + } + + // Construct with an already bound CUDA target + explicit thrust_task_policy_shim( + std::shared_ptr tgt) + : bound_target_(std::move(tgt)) + { + } + + private: + Executor exec_{}; + Parameters params_{}; + std::shared_ptr bound_target_{}; + }; + + inline thrust_task_policy_shim + thrust_task_policy::on(hpx::cuda::experimental::target const& t) const + { + using shim_type = + thrust_task_policy_shim; + return shim_type(std::make_shared(t)); + } + + // Base thrust_policy + struct thrust_policy + { + using executor_type = hpx::execution::parallel_executor; + using executor_parameters_type = + hpx::execution::experimental::extract_executor_parameters< + executor_type>::type; + using execution_category = hpx::execution::parallel_execution_tag; + + template + struct rebind + { + using type = thrust_policy_shim; + }; + + constexpr thrust_policy() {} + + thrust_task_policy operator()( + hpx::execution::experimental::to_task_t) const + { + return thrust_task_policy(); + } + + template + typename hpx::execution::experimental::rebind_executor::type + on(Executor_&& exec) const + { + using executor_type = typename std::decay::type; + static_assert(hpx::traits::is_executor_any_v, + "hpx::traits::is_executor_any_v"); + + return hpx::execution::experimental::create_rebound_policy( + thrust_policy(), HPX_FORWARD(Executor_, exec), parameters()); + } + + template ::type> + typename hpx::execution::experimental::rebind_executor::type + with(Parameters_&&... params) const + { + return hpx::execution::experimental::create_rebound_policy( + thrust_policy(), executor(), + hpx::execution::experimental::join_executor_parameters( + HPX_FORWARD(Parameters_, params)...)); + } + + executor_type executor() const + { + return executor_type{}; + } + + executor_parameters_type& parameters() + { + return params_; + } + constexpr executor_parameters_type const& parameters() const + { + return params_; + } + + private: + executor_parameters_type params_{}; + }; + + template + struct thrust_policy_shim : thrust_policy + { + using executor_type = Executor; + using executor_parameters_type = Parameters; + using execution_category = + typename hpx::traits::executor_execution_category< + executor_type>::type; + + template + struct rebind + { + using type = thrust_policy_shim; + }; + + thrust_task_policy_shim operator()( + hpx::execution::experimental::to_task_t) const + { + return thrust_task_policy_shim( + exec_, params_); + } + + template + typename hpx::execution::experimental::rebind_executor< + thrust_policy_shim, Executor_, executor_parameters_type>::type + on(Executor_&& exec) const + { + using executor_type = typename std::decay::type; + static_assert(hpx::traits::is_executor_any_v, + "hpx::traits::is_executor_any_v"); + + return hpx::execution::experimental::create_rebound_policy( + thrust_policy_shim(), HPX_FORWARD(Executor_, exec), + parameters()); + } + + template ::type> + typename hpx::execution::experimental::rebind_executor< + thrust_policy_shim, executor_type, ParametersType>::type + with(Parameters_&&... params) const + { + return hpx::execution::experimental::create_rebound_policy( + thrust_policy_shim(), executor(), + hpx::execution::experimental::join_executor_parameters( + HPX_FORWARD(Parameters_, params)...)); + } + + Executor& executor() + { + return exec_; + } + + Executor const& executor() const + { + return exec_; + } + + Parameters& parameters() + { + return params_; + } + + Parameters const& parameters() const + { + return params_; + } + + template ::value && + std::is_constructible::value, + Dependent>::type> + constexpr thrust_policy_shim() + { + } + + template + constexpr thrust_policy_shim(Executor_&& exec, Parameters_&& params) + : exec_(std::forward(exec)) + , params_(std::forward(params)) + { + } + + private: + Executor exec_; + Parameters params_; + }; + + // Host-specific policy that inherits from thrust_policy + struct thrust_host_policy : thrust_policy + { + constexpr thrust_host_policy() = default; + + // Return thrust::host execution policy + constexpr auto get() const + { + return ::thrust::host; + } + }; + + // Device-specific policy that inherits from thrust_policy + struct thrust_device_policy : thrust_policy + { + constexpr thrust_device_policy() = default; + + // Return thrust::device execution policy + constexpr auto get() const + { + return ::thrust::device; + } + }; + + // Global policy instances + inline constexpr thrust_host_policy thrust_host{}; + inline constexpr thrust_device_policy thrust_device{}; + + // Legacy support - default thrust policy (keep for backward compatibility) + static constexpr thrust_policy thrust; + + template + struct is_thrust_execution_policy : std::false_type + { + }; + + template <> + struct is_thrust_execution_policy + : std::true_type + { + }; + + template + struct is_thrust_execution_policy< + hpx::thrust::thrust_policy_shim> : std::true_type + { + }; + + template <> + struct is_thrust_execution_policy + : std::true_type + { + }; + + template <> + struct is_thrust_execution_policy + : std::true_type + { + }; + + template <> + struct is_thrust_execution_policy + : std::true_type + { + }; + + template + struct is_thrust_execution_policy< + hpx::thrust::thrust_task_policy_shim> + : std::true_type + { + }; + + template + inline constexpr bool is_thrust_execution_policy_v = + is_thrust_execution_policy::value; + + namespace detail { + template + struct get_policy_result; + + template + struct get_policy_result>>> + { + static_assert(is_thrust_execution_policy< + std::decay_t>::value, + "get_policy_result can only be used with Thrust execution " + "policies"); + + using type = hpx::future; + + template + static constexpr decltype(auto) call(Future&& future) + { + return std::forward(future); + } + }; + + template + struct get_policy_result>>> + { + static_assert(is_thrust_execution_policy< + std::decay_t>::value, + "get_policy_result can only be used with Thrust execution " + "policies"); + + template + static constexpr decltype(auto) call(Future&& future) + { + return std::forward(future).get(); + } + }; + } // namespace detail + +} // namespace hpx::thrust + +namespace hpx::detail { + template + struct is_rebound_execution_policy< + hpx::thrust::thrust_policy_shim> : std::true_type + { + }; + + template + struct is_rebound_execution_policy< + hpx::thrust::thrust_task_policy_shim> + : std::true_type + { + }; + + template <> + struct is_execution_policy : std::true_type + { + }; + + template + struct is_execution_policy< + hpx::thrust::thrust_policy_shim> : std::true_type + { + }; + + template <> + struct is_execution_policy : std::true_type + { + }; + + template <> + struct is_execution_policy + : std::true_type + { + }; + + template <> + struct is_execution_policy : std::true_type + { + }; + + template + struct is_execution_policy< + hpx::thrust::thrust_task_policy_shim> + : std::true_type + { + }; + + template <> + struct is_parallel_execution_policy + : std::true_type + { + }; + + template + struct is_parallel_execution_policy< + hpx::thrust::thrust_policy_shim> : std::true_type + { + }; + + template <> + struct is_parallel_execution_policy + : std::true_type + { + }; + + template <> + struct is_parallel_execution_policy + : std::true_type + { + }; + + template <> + struct is_parallel_execution_policy + : std::true_type + { + }; + + template + struct is_parallel_execution_policy< + hpx::thrust::thrust_task_policy_shim> + : std::true_type + { + }; + + template <> + struct is_async_execution_policy + : std::true_type + { + }; + + template + struct is_async_execution_policy< + hpx::thrust::thrust_task_policy_shim> + : std::true_type + { + }; + +} // namespace hpx::detail diff --git a/libs/core/thrust/tests/CMakeLists.txt b/libs/core/thrust/tests/CMakeLists.txt new file mode 100644 index 00000000000..36a3b6dace7 --- /dev/null +++ b/libs/core/thrust/tests/CMakeLists.txt @@ -0,0 +1,42 @@ +# Copyright (c) 2025 The STE||AR-Group +# +# SPDX-License-Identifier: BSL-1.0 +# Distributed under the Boost Software License, Version 1.0. (See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +include(HPX_Message) + +if(HPX_WITH_TESTS) + if(HPX_WITH_TESTS_UNIT) + add_hpx_pseudo_target(tests.unit.modules.thrust) + add_hpx_pseudo_dependencies( + tests.unit.modules tests.unit.modules.thrust + ) + add_subdirectory(unit) + endif() + + if(HPX_WITH_TESTS_REGRESSIONS) + add_hpx_pseudo_target(tests.regressions.modules.thrust) + add_hpx_pseudo_dependencies( + tests.regressions.modules tests.regressions.modules.thrust + ) + add_subdirectory(regressions) + endif() + + if(HPX_WITH_TESTS_BENCHMARKS) + add_hpx_pseudo_target(tests.performance.modules.thrust) + add_hpx_pseudo_dependencies( + tests.performance.modules tests.performance.modules.thrust + ) + add_subdirectory(performance) + endif() + + if(HPX_WITH_TESTS_HEADERS) + add_hpx_header_tests( + modules.thrust + HEADERS ${thrust_headers} + HEADER_ROOT ${PROJECT_SOURCE_DIR}/include + DEPENDENCIES hpx_thrust + ) + endif() +endif() diff --git a/libs/core/thrust/tests/performance/CMakeLists.txt b/libs/core/thrust/tests/performance/CMakeLists.txt new file mode 100644 index 00000000000..349c21ef09e --- /dev/null +++ b/libs/core/thrust/tests/performance/CMakeLists.txt @@ -0,0 +1,5 @@ +# Copyright (c) 2025 The STE||AR-Group +# +# SPDX-License-Identifier: BSL-1.0 +# Distributed under the Boost Software License, Version 1.0. (See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/libs/core/thrust/tests/regressions/CMakeLists.txt b/libs/core/thrust/tests/regressions/CMakeLists.txt new file mode 100644 index 00000000000..349c21ef09e --- /dev/null +++ b/libs/core/thrust/tests/regressions/CMakeLists.txt @@ -0,0 +1,5 @@ +# Copyright (c) 2025 The STE||AR-Group +# +# SPDX-License-Identifier: BSL-1.0 +# Distributed under the Boost Software License, Version 1.0. (See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/libs/core/thrust/tests/unit/CMakeLists.txt b/libs/core/thrust/tests/unit/CMakeLists.txt new file mode 100644 index 00000000000..f06180949d0 --- /dev/null +++ b/libs/core/thrust/tests/unit/CMakeLists.txt @@ -0,0 +1,34 @@ +# Copyright (c) 2025 The STE||AR-Group +# +# SPDX-License-Identifier: BSL-1.0 +# Distributed under the Boost Software License, Version 1.0. (See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +if(NOT (HPX_WITH_CUDA AND HPX_WITH_THRUST)) + return() +endif() + +set(tests thrust_policy_test) +set(thrust_policy_test_PARAMETERS THREADS_PER_LOCALITY 4) +set(thrust_policy_test_CUDA ON) + +foreach(test ${tests}) + if(${${test}_CUDA}) + set(sources ${test}.cu) + else() + set(sources ${test}.cpp) + endif() + + source_group("Source Files" FILES ${sources}) + + add_hpx_executable( + ${test}_test INTERNAL_FLAGS + SOURCES ${sources} ${${test}_FLAGS} + DEPENDENCIES ${${test_program}_FLAGS} + EXCLUDE_FROM_ALL + HPX_PREFIX ${HPX_BUILD_PREFIX} + FOLDER "Tests/Unit/Modules/Core/Thrust" + ) + + add_hpx_unit_test("modules.thrust" ${test} ${${test}_PARAMETERS}) +endforeach() diff --git a/libs/core/thrust/tests/unit/thrust_policy_test.cu b/libs/core/thrust/tests/unit/thrust_policy_test.cu new file mode 100644 index 00000000000..bb01b87524d --- /dev/null +++ b/libs/core/thrust/tests/unit/thrust_policy_test.cu @@ -0,0 +1,281 @@ +// Copyright (c) 2025 Aditya Sapra +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +int hpx_main(hpx::program_options::variables_map&) +{ + // Enable CUDA event polling on the "default" pool for HPX <-> CUDA future bridging + hpx::cuda::experimental::enable_user_polling polling_guard("default"); + + std::size_t size = 10000; + int fill_value = 42; + + std::cout << "=== HPX-Thrust Universal Algorithm Dispatch Test ===" + << std::endl; + std::cout << "Vector size: " << size << " elements" << std::endl; + + // Test 1: CPU Sequential Fill + std::cout << "\n--- Test 1: CPU Sequential Fill (hpx::execution::seq) ---" + << std::endl; + std::vector cpu_vec(size, 0); + auto start_cpu = std::chrono::high_resolution_clock::now(); + hpx::fill(hpx::execution::seq, cpu_vec.begin(), cpu_vec.end(), fill_value); + auto stop_cpu = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_cpu = + stop_cpu - start_cpu; + HPX_TEST(duration_cpu.count() >= 0.0); + bool success_cpu = + (cpu_vec[0] == fill_value) && (cpu_vec[size - 1] == fill_value); + HPX_TEST(success_cpu); + + // Test 2: Thrust Host Policy + std::cout << "\n--- Test 2: Thrust Host Policy ---" << std::endl; + hpx::thrust::thrust_host_policy host_policy{}; + std::vector host_vec(size, 0); + std::vector host_vec2(size, 1); + std::vector host_result(size, 0); + + std::cout << "Testing hpx::fill with thrust_host_policy..." << std::endl; + auto start_host = std::chrono::high_resolution_clock::now(); + hpx::fill(host_policy, host_vec.begin(), host_vec.end(), fill_value); + auto stop_host = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_host = + stop_host - start_host; + HPX_TEST(duration_host.count() >= 0.0); + + std::cout << "Testing hpx::copy with thrust_host_policy..." << std::endl; + hpx::copy( + host_policy, host_vec.begin(), host_vec.end(), host_result.begin()); + + std::cout << " Testing hpx::transform with thrust_host_policy..." + << std::endl; + hpx::transform(host_policy, host_vec.begin(), host_vec.end(), + host_vec2.begin(), host_result.begin(), + [](int a, int b) { return a + b; }); + + bool success_host = (host_vec[0] == fill_value) && + (host_vec[size - 1] == fill_value) && + (host_result[0] == fill_value + 1) && + (host_result[size - 1] == fill_value + 1); + HPX_TEST(success_host); + + // Test 3: Thrust Device Policy - Multiple Algorithms + std::cout + << "\n--- Test 3: Thrust Device Policy (GPU Multi-Algorithm Test) ---" + << std::endl; + hpx::thrust::thrust_device_policy device_policy{}; + thrust::device_vector device_vec(size, 0); + thrust::device_vector device_vec2(size, 2); + thrust::device_vector device_result(size, 0); + + std::cout << " Testing hpx::fill with thrust_device_policy..." << std::endl; + + // Warm-up run to account for CUDA initialization overhead + hpx::fill(device_policy, device_vec.begin(), device_vec.end(), 1); + + auto start_device = std::chrono::high_resolution_clock::now(); + hpx::fill(device_policy, device_vec.begin(), device_vec.end(), fill_value); + auto stop_device = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_device = + stop_device - start_device; + HPX_TEST(duration_device.count() >= 0.0); + + std::cout << " Testing hpx::copy with thrust_device_policy..." << std::endl; + hpx::copy(device_policy, device_vec.begin(), device_vec.end(), + device_result.begin()); + + std::cout << " Testing hpx::transform with thrust_device_policy..." + << std::endl; + hpx::transform(device_policy, device_vec.begin(), device_vec.end(), + device_vec2.begin(), device_result.begin(), + [] __device__(int a, int b) { return a * b; }); + + // Verify GPU results by copying back to host + int first_val = device_vec[0]; + int last_val = device_vec[size - 1]; + int first_result = device_result[0]; + int last_result = device_result[size - 1]; + bool success_device = (first_val == fill_value) && + (last_val == fill_value) && (first_result == fill_value * 2) && + (last_result == fill_value * 2); + HPX_TEST(success_device); + + // Test 4: Test NEW algorithms that weren't available before + std::cout << "\n--- Test 4: Testing Additional Algorithms ---" << std::endl; + + // Initialize test data + thrust::host_vector numbers(10); + std::iota( + numbers.begin(), numbers.end(), 1); // Fill with 1, 2, 3, ..., 10 + thrust::device_vector d_numbers = numbers; + + std::cout << " Testing hpx::reverse with thrust_device_policy..." + << std::endl; + hpx::reverse(device_policy, d_numbers.begin(), d_numbers.end()); + + // Copy back to see result + thrust::host_vector reversed_result = d_numbers; + + std::cout << " Testing hpx::reduce with thrust_device_policy..." + << std::endl; + int sum = hpx::reduce(device_policy, d_numbers.begin(), d_numbers.end(), 0); + HPX_TEST_EQ(sum, 55); + + // Test 5: Global Instance Test + std::cout << "\n--- Test 5: Global Instance Test ---" << std::endl; + std::cout << "Testing global thrust_host and thrust_device instances..." + << std::endl; + + // Test global thrust_host instance + std::vector global_host_vec(1000, 0); + hpx::fill(hpx::thrust::thrust_host, global_host_vec.begin(), + global_host_vec.end(), 99); + bool global_host_success = + (global_host_vec[0] == 99) && (global_host_vec[999] == 99); + HPX_TEST(global_host_success); + + // Test global thrust_device instance + thrust::device_vector global_device_vec(1000, 0); + hpx::fill(hpx::thrust::thrust_device, global_device_vec.begin(), + global_device_vec.end(), 88); + int first_device = global_device_vec[0]; + int last_device = global_device_vec[999]; + bool global_device_success = (first_device == 88) && (last_device == 88); + HPX_TEST(global_device_success); + + // Performance Comparison + std::cout << "\n=== Performance Comparison ===" << std::endl; + std::cout << "CPU Sequential: " << duration_cpu.count() << " ms" + << std::endl; + std::cout << "Thrust Host: " << duration_host.count() << " ms" + << std::endl; + std::cout << "Thrust Device: " << duration_device.count() << " ms" + << std::endl; + + bool all_tests_passed = success_cpu && success_host && success_device && + global_host_success && global_device_success && + (sum == 55); // Check reduce result + + // Test 6: Task policy with explicit target (async: fill) + std::cout << "\n--- Test 6: Task policy (explicit target) async fill ---" + << std::endl; + hpx::cuda::experimental::target tgt = + hpx::cuda::experimental::get_default_target(); + hpx::thrust::thrust_task_policy task_policy{}; + auto p_task = task_policy.on(tgt); + thrust::device_vector dev_task_vec(size, 0); + auto f_fill = + hpx::fill(p_task, dev_task_vec.begin(), dev_task_vec.end(), 7); + static_assert(std::is_same>::value, + "async fill should return future"); + f_fill.get(); + bool task_explicit_ok = + (dev_task_vec[0] == 7) && (dev_task_vec[size - 1] == 7); + HPX_TEST(task_explicit_ok); + all_tests_passed = all_tests_passed && task_explicit_ok; + + // Test 7: Task policy default-target fallback (async: reverse) + std::cout << "\n--- Test 7: Task policy (default target) async reverse ---" + << std::endl; + thrust::host_vector h_small(10); + std::iota(h_small.begin(), h_small.end(), 1); + thrust::device_vector d_small = h_small; + hpx::thrust::thrust_task_policy + default_task{}; // no .on(target) -> uses default target + auto f_rev = hpx::reverse(default_task, d_small.begin(), d_small.end()); + f_rev.get(); + thrust::host_vector h_after = d_small; + bool task_default_ok = (h_after.front() == 10) && (h_after.back() == 1); + HPX_TEST(task_default_ok); + all_tests_passed = all_tests_passed && task_default_ok; + + // Test 8: Same-stream ordering (reuse bound target): fill 1 then fill 2 + std::cout << "\n--- Test 8: Same-stream ordering (fill->fill) ---" + << std::endl; + thrust::device_vector order_vec(size, 0); + auto f1 = hpx::fill(p_task, order_vec.begin(), order_vec.end(), 1); + auto f2 = hpx::fill(p_task, order_vec.begin(), order_vec.end(), 2); + hpx::when_all(f1, f2).get(); + bool order_ok = (order_vec[0] == 2) && (order_vec[size - 1] == 2); + HPX_TEST(order_ok); + all_tests_passed = all_tests_passed && order_ok; + + // Test 9: Async algorithms with return values (fill_n) + std::cout << "\n--- Test 9: Async fill_n with return value ---" + << std::endl; + thrust::device_vector fill_n_vec(size, 0); + + // Test async fill_n - should return future + auto f_fill_n = hpx::fill_n(default_task, fill_n_vec.begin(), size / 2, 99); + static_assert(std::is_same_v::iterator>>, + "async fill_n should return future"); + + // Get the result iterator when operation completes + auto result_iter = f_fill_n.get(); + + // Verify the iterator points to the correct position + auto expected_iter = fill_n_vec.begin() + size / 2; + bool iter_correct = (result_iter == expected_iter); + + // Verify the actual fill operation worked + bool first_half_filled = + (fill_n_vec[0] == 99) && (fill_n_vec[size / 2 - 1] == 99); + bool second_half_untouched = + (fill_n_vec[size / 2] == 0) && (fill_n_vec[size - 1] == 0); + + bool fill_n_success = + iter_correct && first_half_filled && second_half_untouched; + HPX_TEST(fill_n_success); + + all_tests_passed = all_tests_passed && fill_n_success; + + std::cout << "\n=== Test Results ===" << std::endl; + std::cout + << "Algorithms tested: fill, fill_n, copy, transform, reverse, reduce" + << std::endl; + std::cout << "Policies tested: thrust_host_policy, thrust_device_policy" + << std::endl; + std::cout << "Universal tag_invoke: " + << (all_tests_passed ? "Working" : "Some issues detected") + << std::endl; + std::cout << "Overall result: " + << (all_tests_passed ? "ALL TESTS PASSED" : "SOME TESTS FAILED") + << std::endl; + + (void) all_tests_passed; + return hpx::local::finalize(); +} + +int main(int argc, char** argv) +{ + hpx::local::init_params init_args; + auto result = hpx::local::init(hpx_main, argc, argv, init_args); + return result; +}