diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 07a4314c76d..e61189cd0e9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -809,6 +809,7 @@ add_library( src/utilities/cuda.cpp src/utilities/cuda_memcpy.cu src/utilities/default_stream.cpp + src/utilities/grid_1d.cu src/utilities/host_memory.cpp src/utilities/host_worker_pool.cpp src/utilities/linked_column.cpp diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index b31f76d67a1..448d623e8ef 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index 0c41b22fb92..5e30a0cfde0 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -18,6 +18,7 @@ #include #include +#include #include #include #include diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index f99dc303ad2..766d30bdfeb 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 3aff75d840e..ac29acb78d7 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 448e5195b56..1a3f88f69de 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 022c5c40ea0..0d56d2e6a15 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 67adeb8c411..ffb00e78ece 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index d7d3a247943..31b658de64f 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index b518e003c20..4758b874642 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -16,10 +16,8 @@ #pragma once -#include #include #include -#include #include @@ -28,120 +26,12 @@ namespace cudf { namespace detail { + /** * @brief Size of a warp in a CUDA kernel. */ static constexpr size_type warp_size{32}; -/** - * @brief A kernel grid configuration construction gadget for simple - * one-dimensional kernels, with protection against integer overflow. - */ -class grid_1d { - public: - thread_index_type const num_threads_per_block; - thread_index_type const num_blocks; - /** - * @param overall_num_elements The number of elements the kernel needs to - * handle/process, in its main, one-dimensional/linear input (e.g. one or more - * cuDF columns) - * @param num_threads_per_block The grid block size, determined according to - * the kernel's specific features (amount of shared memory necessary, SM - * functional units use pattern etc.); this can't be determined - * generically/automatically (as opposed to the number of blocks) - * @param elements_per_thread Typically, a single kernel thread processes more - * than a single element; this affects the number of threads the grid must - * contain - */ - grid_1d(thread_index_type overall_num_elements, - thread_index_type num_threads_per_block, - thread_index_type elements_per_thread = 1) - : num_threads_per_block(num_threads_per_block), - num_blocks(util::div_rounding_up_safe(overall_num_elements, - elements_per_thread * num_threads_per_block)) - { - CUDF_EXPECTS(num_threads_per_block > 0, "num_threads_per_block must be > 0"); - CUDF_EXPECTS(num_blocks > 0, "num_blocks must be > 0"); - } - - /** - * @brief Returns the global thread index in a 1D grid. - * - * The returned index is unique across the entire grid. - * - * @param thread_id The thread index within the block - * @param block_id The block index within the grid - * @param num_threads_per_block The number of threads per block - * @return thread_index_type The global thread index - */ - __device__ static constexpr thread_index_type global_thread_id( - thread_index_type thread_id, - thread_index_type block_id, - thread_index_type num_threads_per_block) - { - return thread_id + block_id * num_threads_per_block; - } - - /** - * @brief Returns the global thread index of the current thread in a 1D grid. - * - * @return thread_index_type The global thread index - */ - static __device__ thread_index_type global_thread_id() - { - return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x); - } - - /** - * @brief Returns the global thread index of the current thread in a 1D grid. - * - * @tparam num_threads_per_block The number of threads per block - * - * @return thread_index_type The global thread index - */ - template - static __device__ thread_index_type global_thread_id() - { - return global_thread_id(threadIdx.x, blockIdx.x, num_threads_per_block); - } - - /** - * @brief Returns the stride of a 1D grid. - * - * The returned stride is the total number of threads in the grid. - * - * @param thread_id The thread index within the block - * @param block_id The block index within the grid - * @param num_threads_per_block The number of threads per block - * @return thread_index_type The global thread index - */ - __device__ static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, - thread_index_type num_blocks_per_grid) - { - return num_threads_per_block * num_blocks_per_grid; - } - - /** - * @brief Returns the stride of the current 1D grid. - * - * @return thread_index_type The number of threads in the grid. - */ - static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); } - - /** - * @brief Returns the stride of the current 1D grid. - * - * @tparam num_threads_per_block The number of threads per block - * - * @return thread_index_type The number of threads in the grid. - */ - template - static __device__ thread_index_type grid_stride() - { - return grid_stride(num_threads_per_block, gridDim.x); - } -}; - /** * @brief Performs a sum reduction of values from the same lane across all * warps in a thread block and returns the result on thread 0 of the block. diff --git a/cpp/include/cudf/detail/utilities/grid_1d.cuh b/cpp/include/cudf/detail/utilities/grid_1d.cuh new file mode 100644 index 00000000000..33aa19b3c7a --- /dev/null +++ b/cpp/include/cudf/detail/utilities/grid_1d.cuh @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +/** + * @brief A kernel grid configuration construction gadget for simple + * one-dimensional kernels, with protection against integer overflow. + */ +class grid_1d { + public: + thread_index_type const num_threads_per_block; + thread_index_type const num_blocks; + + /** + * @param overall_num_elements The number of elements the kernel needs to + * handle/process, in its main, one-dimensional/linear input (e.g. one or more + * cuDF columns) + * @param num_threads_per_block The grid block size, determined according to + * the kernel's specific features (amount of shared memory necessary, SM + * functional units use pattern etc.); this can't be determined + * generically/automatically (as opposed to the number of blocks) + * @param elements_per_thread Typically, a single kernel thread processes more + * than a single element; this affects the number of threads the grid must + * contain + */ + grid_1d(thread_index_type overall_num_elements, + thread_index_type num_threads_per_block, + thread_index_type elements_per_thread = 1); + + /** + * @brief Returns the global thread index in a 1D grid. + * + * The returned index is unique across the entire grid. + * + * @param thread_id The thread index within the block + * @param block_id The block index within the grid + * @param num_threads_per_block The number of threads per block + * @return thread_index_type The global thread index + */ + __device__ static constexpr thread_index_type global_thread_id( + thread_index_type thread_id, + thread_index_type block_id, + thread_index_type num_threads_per_block) + { + return thread_id + block_id * num_threads_per_block; + } + + /** + * @brief Returns the global thread index of the current thread in a 1D grid. + * + * @return thread_index_type The global thread index + */ + static __device__ thread_index_type global_thread_id() + { + return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x); + } + + /** + * @brief Returns the global thread index of the current thread in a 1D grid. + * + * @tparam num_threads_per_block The number of threads per block + * + * @return thread_index_type The global thread index + */ + template + static __device__ thread_index_type global_thread_id() + { + return global_thread_id(threadIdx.x, blockIdx.x, num_threads_per_block); + } + + /** + * @brief Returns the stride of a 1D grid. + * + * The returned stride is the total number of threads in the grid. + * + * @param thread_id The thread index within the block + * @param block_id The block index within the grid + * @param num_threads_per_block The number of threads per block + * @return thread_index_type The global thread index + */ + __device__ static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, + thread_index_type num_blocks_per_grid) + { + return num_threads_per_block * num_blocks_per_grid; + } + + /** + * @brief Returns the stride of the current 1D grid. + * + * @return thread_index_type The number of threads in the grid. + */ + static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); } + + /** + * @brief Returns the stride of the current 1D grid. + * + * @tparam num_threads_per_block The number of threads per block + * + * @return thread_index_type The number of threads in the grid. + */ + template + static __device__ thread_index_type grid_stride() + { + return grid_stride(num_threads_per_block, gridDim.x); + } +}; + +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index c4e1d81824f..30af77dd8ef 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 0393e9914a1..73b53dbe9be 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index cd386ea886f..3402abd9d36 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu index 1133e9ac22e..dc021ea99a6 100644 --- a/cpp/src/binaryop/jit/kernel.cu +++ b/cpp/src/binaryop/jit/kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -18,6 +18,7 @@ * limitations under the License. */ +#include #include #include #include @@ -51,8 +52,8 @@ CUDF_KERNEL void kernel_v_v(cudf::size_type size, TypeLhs* lhs_data, TypeRhs* rhs_data) { - auto const start = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; - auto const step = static_cast(blockDim.x) * gridDim.x; + auto const start = cudf::detail::grid_1d::global_thread_id(); + auto const step = cudf::detail::grid_1d::grid_stride(); for (auto i = start; i < size; i += step) { out_data[i] = TypeOpe::template operate(lhs_data[i], rhs_data[i]); @@ -70,8 +71,8 @@ CUDF_KERNEL void kernel_v_v_with_validity(cudf::size_type size, cudf::bitmask_type const* rhs_mask, cudf::size_type rhs_offset) { - auto const start = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; - auto const step = static_cast(blockDim.x) * gridDim.x; + auto const start = cudf::detail::grid_1d::global_thread_id(); + auto const step = cudf::detail::grid_1d::grid_stride(); for (auto i = start; i < size; i += step) { bool output_valid = false; diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 72a7a2a8437..29b63182a61 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 79dcbd4c934..10735cfd8bf 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index f6506c80e1b..08656a0052e 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index cd14eb96ec4..9330fabe535 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh index 50c74f29ab8..bd043671174 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cuh +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index efabfe69a8f..878126bbea5 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu index 79425f6190a..728d75a008f 100644 --- a/cpp/src/interop/to_arrow_host.cu +++ b/cpp/src/interop/to_arrow_host.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 25538e010b6..f370bed41c6 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -18,6 +18,8 @@ #include "io/utilities/block_utils.cuh" #include +#include +#include #include diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 71d20f46ea3..53557fc9d40 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -21,6 +21,8 @@ #include "io/utilities/trie.cuh" #include +#include +#include #include #include #include diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index d115a28b399..efa6e44e6ac 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index e1ec384c8d4..d462f08a763 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 6cd13e44260..f472f8d54bb 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index c68029725f4..abcdc31aa8b 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -19,6 +19,7 @@ #include "parquet_gpu.hpp" #include +#include #include diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index c34f5fb314a..50df6a36463 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 6e74bebbd73..31e6a3a7a8a 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index 62769862f54..db249f7335f 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 9c527359f94..2109dc8951d 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index 4565626edad..830c783f2e1 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/join/mixed_join_kernel.hpp b/cpp/src/join/mixed_join_kernel.hpp index cc92e9d8ba4..8187ad03aca 100644 --- a/cpp/src/join/mixed_join_kernel.hpp +++ b/cpp/src/join/mixed_join_kernel.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include "join/mixed_join_common_utils.cuh" #include +#include #include #include diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index b08298e64e4..5d46d450b2e 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include "mixed_join_common_utils.cuh" #include +#include #include #include diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 3b3464c86a0..4926c6b2792 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index c0261de0190..07d9bee4aff 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/join/mixed_join_size_kernel.hpp b/cpp/src/join/mixed_join_size_kernel.hpp index 676e0903749..776229f11c1 100644 --- a/cpp/src/join/mixed_join_size_kernel.hpp +++ b/cpp/src/join/mixed_join_size_kernel.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index ebecf161ada..23523d0a0a4 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 0ce111d9f4b..f4653cc6892 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 7117283a56a..7f8e80d0230 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index b79818d1f90..e81b89f4ed5 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 9029e7f9ee8..ce36ce2a190 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 937425bd703..c32d831a4f1 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu index 466f120022b..12219a286f7 100644 --- a/cpp/src/rolling/jit/kernel.cu +++ b/cpp/src/rolling/jit/kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include "rolling/detail/rolling_jit.hpp" #include "rolling/jit/operation.hpp" +#include #include #include @@ -51,8 +52,8 @@ CUDF_KERNEL void gpu_rolling_new(cudf::size_type nrows, FollowingWindowType following_window_begin, cudf::size_type min_periods) { - cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x; - cudf::thread_index_type const stride = blockDim.x * gridDim.x; + auto i = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); cudf::size_type warp_valid_count{0}; diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index e91ed58297e..ac1f32160ed 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 632a7f407c3..490d8ce8a5c 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 174e51910cc..b5dd46f8461 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 2b86518d3c8..7cbb208ebb3 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/cpp/src/strings/like.cu b/cpp/src/strings/like.cu index 41525fd03b4..508f5055b86 100644 --- a/cpp/src/strings/like.cu +++ b/cpp/src/strings/like.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh index 2594fd7b6da..4ed4fcc996f 100644 --- a/cpp/src/strings/regex/utilities.cuh +++ b/cpp/src/strings/regex/utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index ffcefbe11b6..11ccb0df194 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 2632d6099c8..c61c4f60980 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index 58083541164..06d9b42b093 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 7d01498234c..cb862bcdeda 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index fba1f815c4d..ea4542b61cd 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 5abac368f8a..197e54aa646 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 8e00a29f8e9..4af220b2a41 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index 3368289f64a..1f2340141d8 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 95cbdfd0fda..5fd0b790521 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index bc0c08fa7d7..94819801b05 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 6debbac9094..94d376be63a 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 08af56968c0..701056e44f2 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 4c54409c41a..d9943fb781b 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -20,6 +20,8 @@ #include #include +#include +#include #include #include #include diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 0e1c4941208..58653be7dd7 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index 6e7a2a14235..a2de52b5659 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -19,6 +19,7 @@ #include "text/subword/detail/wordpiece_tokenizer.hpp" #include +#include #include #include diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 77a01d555db..56c00723b77 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index f25d1484b4c..e8e65848913 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/text/wordpiece_tokenize.cu b/cpp/src/text/wordpiece_tokenize.cu index fb44d671902..02f9d38940c 100644 --- a/cpp/src/text/wordpiece_tokenize.cu +++ b/cpp/src/text/wordpiece_tokenize.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu index 2f3cb2ae7b6..ec6b6182299 100644 --- a/cpp/src/transform/compute_column.cu +++ b/cpp/src/transform/compute_column.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/transform/compute_column_kernel.cuh b/cpp/src/transform/compute_column_kernel.cuh index a8eb30c19eb..3b194df0a5f 100644 --- a/cpp/src/transform/compute_column_kernel.cuh +++ b/cpp/src/transform/compute_column_kernel.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/transform/compute_column_kernel.hpp b/cpp/src/transform/compute_column_kernel.hpp index be23920469c..c5816c3eff7 100644 --- a/cpp/src/transform/compute_column_kernel.hpp +++ b/cpp/src/transform/compute_column_kernel.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu index 7117b84cd9b..1784f2d2428 100644 --- a/cpp/src/transform/jit/kernel.cu +++ b/cpp/src/transform/jit/kernel.cu @@ -17,6 +17,7 @@ #include "jit/span.cuh" #include +#include #include #include #include @@ -127,12 +128,9 @@ CUDF_KERNEL void kernel(cudf::mutable_column_device_view_core const* outputs, // inputs to JITIFY kernels have to be either sized-integral types or pointers. Structs or // references can't be passed directly/correctly as they will be crossing an ABI boundary - // cannot use global_thread_id utility due to a JIT build issue by including - // the `cudf/detail/utilities/cuda.cuh` header - auto const block_size = static_cast(blockDim.x); - thread_index_type const start = threadIdx.x + blockIdx.x * block_size; - thread_index_type const stride = block_size * gridDim.x; - thread_index_type const size = outputs[0].size(); + auto const start = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + auto const size = outputs[0].size(); for (auto i = start; i < size; i += stride) { if (Out::is_null(outputs, i)) { continue; } @@ -150,13 +148,10 @@ CUDF_KERNEL void fixed_point_kernel(cudf::mutable_column_device_view_core const* cudf::column_device_view_core const* inputs, void* user_data) { - // cannot use global_thread_id utility due to a JIT build issue by including - // the `cudf/detail/utilities/cuda.cuh` header - auto const block_size = static_cast(blockDim.x); - thread_index_type const start = threadIdx.x + blockIdx.x * block_size; - thread_index_type const stride = block_size * gridDim.x; - thread_index_type const size = outputs[0].size(); - auto const output_scale = static_cast(outputs[0].type().scale()); + auto const start = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + auto const size = outputs[0].size(); + auto const output_scale = static_cast(outputs[0].type().scale()); for (auto i = start; i < size; i += stride) { typename Out::type result{numeric::scaled_integer{0, output_scale}}; @@ -178,12 +173,9 @@ CUDF_KERNEL void span_kernel(cudf::jit::device_optional_span cudf::column_device_view_core const* inputs, void* user_data) { - // cannot use global_thread_id utility due to a JIT build issue by including - // the `cudf/detail/utilities/cuda.cuh` header - auto const block_size = static_cast(blockDim.x); - thread_index_type const start = threadIdx.x + blockIdx.x * block_size; - thread_index_type const stride = block_size * gridDim.x; - thread_index_type const size = outputs[0].size(); + auto const start = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + auto const size = outputs[0].size(); for (auto i = start; i < size; i += stride) { if (Out::is_null(outputs, i)) { continue; } diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 4039defed8c..ded16ef958d 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index c0af27a1748..29355d30a61 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include #include +#include +#include #include #include diff --git a/cpp/src/utilities/grid_1d.cu b/cpp/src/utilities/grid_1d.cu new file mode 100644 index 00000000000..ed00d7527f2 --- /dev/null +++ b/cpp/src/utilities/grid_1d.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +namespace cudf::detail { + +grid_1d::grid_1d(thread_index_type overall_num_elements, + thread_index_type num_threads_per_block, + thread_index_type elements_per_thread) + : num_threads_per_block(num_threads_per_block), + num_blocks( + util::div_rounding_up_safe(overall_num_elements, elements_per_thread * num_threads_per_block)) +{ + CUDF_EXPECTS(num_threads_per_block > 0, "num_threads_per_block must be > 0"); + CUDF_EXPECTS(num_blocks > 0, "num_blocks must be > 0"); +} + +} // namespace cudf::detail