Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,7 @@ add_library(
src/utilities/cuda.cpp
src/utilities/cuda_memcpy.cu
src/utilities/default_stream.cpp
src/utilities/grid_1d.cu
src/utilities/host_memory.cpp
src/utilities/host_worker_pool.cpp
src/utilities/linked_column.cpp
Expand Down
1 change: 1 addition & 0 deletions cpp/benchmarks/common/generate_input.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <cudf/copying.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/detail/valid_if.cuh>
#include <cudf/filling.hpp>
#include <cudf/null_mask.hpp>
Expand Down
1 change: 1 addition & 0 deletions cpp/benchmarks/join/generate_input_tables.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/cuda.hpp>
#include <cudf/detail/utilities/grid_1d.cuh>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
Expand Down
1 change: 1 addition & 0 deletions cpp/benchmarks/ndsh/utilities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/groupby.hpp>
#include <cudf/join/join.hpp>
#include <cudf/reduction.hpp>
Expand Down
3 changes: 2 additions & 1 deletion cpp/benchmarks/type_dispatcher/type_dispatcher.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -20,6 +20,7 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/grid_1d.cuh>
#include <cudf/filling.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/table/table_device_view.cuh>
Expand Down
1 change: 1 addition & 0 deletions cpp/include/cudf/detail/copy_if_else.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/grid_1d.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/utilities/memory_resource.hpp>

Expand Down
4 changes: 3 additions & 1 deletion cpp/include/cudf/detail/copy_range.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -20,6 +20,8 @@
#include <cudf/copying.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/grid_1d.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand Down
1 change: 1 addition & 0 deletions cpp/include/cudf/detail/gather.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/assert.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/grid_1d.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/detail/valid_if.cuh>
#include <cudf/dictionary/dictionary_column_view.hpp>
Expand Down
1 change: 1 addition & 0 deletions cpp/include/cudf/detail/null_mask.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/grid_1d.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/detail/valid_if.cuh>
#include <cudf/null_mask.hpp>
Expand Down
112 changes: 1 addition & 111 deletions cpp/include/cudf/detail/utilities/cuda.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,8 @@

#pragma once

#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>

Expand All @@ -28,120 +26,12 @@

namespace cudf {
namespace detail {

/**
* @brief Size of a warp in a CUDA kernel.
*/
static constexpr size_type warp_size{32};

/**
* @brief A kernel grid configuration construction gadget for simple
* one-dimensional kernels, with protection against integer overflow.
*/
class grid_1d {
public:
thread_index_type const num_threads_per_block;
thread_index_type const num_blocks;
/**
* @param overall_num_elements The number of elements the kernel needs to
* handle/process, in its main, one-dimensional/linear input (e.g. one or more
* cuDF columns)
* @param num_threads_per_block The grid block size, determined according to
* the kernel's specific features (amount of shared memory necessary, SM
* functional units use pattern etc.); this can't be determined
* generically/automatically (as opposed to the number of blocks)
* @param elements_per_thread Typically, a single kernel thread processes more
* than a single element; this affects the number of threads the grid must
* contain
*/
grid_1d(thread_index_type overall_num_elements,
thread_index_type num_threads_per_block,
thread_index_type elements_per_thread = 1)
: num_threads_per_block(num_threads_per_block),
num_blocks(util::div_rounding_up_safe(overall_num_elements,
elements_per_thread * num_threads_per_block))
{
CUDF_EXPECTS(num_threads_per_block > 0, "num_threads_per_block must be > 0");
CUDF_EXPECTS(num_blocks > 0, "num_blocks must be > 0");
}

/**
* @brief Returns the global thread index in a 1D grid.
*
* The returned index is unique across the entire grid.
*
* @param thread_id The thread index within the block
* @param block_id The block index within the grid
* @param num_threads_per_block The number of threads per block
* @return thread_index_type The global thread index
*/
__device__ static constexpr thread_index_type global_thread_id(
thread_index_type thread_id,
thread_index_type block_id,
thread_index_type num_threads_per_block)
{
return thread_id + block_id * num_threads_per_block;
}

/**
* @brief Returns the global thread index of the current thread in a 1D grid.
*
* @return thread_index_type The global thread index
*/
static __device__ thread_index_type global_thread_id()
{
return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x);
}

/**
* @brief Returns the global thread index of the current thread in a 1D grid.
*
* @tparam num_threads_per_block The number of threads per block
*
* @return thread_index_type The global thread index
*/
template <thread_index_type num_threads_per_block>
static __device__ thread_index_type global_thread_id()
{
return global_thread_id(threadIdx.x, blockIdx.x, num_threads_per_block);
}

/**
* @brief Returns the stride of a 1D grid.
*
* The returned stride is the total number of threads in the grid.
*
* @param thread_id The thread index within the block
* @param block_id The block index within the grid
* @param num_threads_per_block The number of threads per block
* @return thread_index_type The global thread index
*/
__device__ static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block,
thread_index_type num_blocks_per_grid)
{
return num_threads_per_block * num_blocks_per_grid;
}

/**
* @brief Returns the stride of the current 1D grid.
*
* @return thread_index_type The number of threads in the grid.
*/
static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); }

/**
* @brief Returns the stride of the current 1D grid.
*
* @tparam num_threads_per_block The number of threads per block
*
* @return thread_index_type The number of threads in the grid.
*/
template <thread_index_type num_threads_per_block>
static __device__ thread_index_type grid_stride()
{
return grid_stride(num_threads_per_block, gridDim.x);
}
};

/**
* @brief Performs a sum reduction of values from the same lane across all
* warps in a thread block and returns the result on thread 0 of the block.
Expand Down
128 changes: 128 additions & 0 deletions cpp/include/cudf/detail/utilities/grid_1d.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/types.hpp>

namespace CUDF_EXPORT cudf {
namespace detail {

/**
* @brief A kernel grid configuration construction gadget for simple
* one-dimensional kernels, with protection against integer overflow.
*/
class grid_1d {
public:
thread_index_type const num_threads_per_block;
thread_index_type const num_blocks;

/**
* @param overall_num_elements The number of elements the kernel needs to
* handle/process, in its main, one-dimensional/linear input (e.g. one or more
* cuDF columns)
* @param num_threads_per_block The grid block size, determined according to
* the kernel's specific features (amount of shared memory necessary, SM
* functional units use pattern etc.); this can't be determined
* generically/automatically (as opposed to the number of blocks)
* @param elements_per_thread Typically, a single kernel thread processes more
* than a single element; this affects the number of threads the grid must
* contain
*/
grid_1d(thread_index_type overall_num_elements,
thread_index_type num_threads_per_block,
thread_index_type elements_per_thread = 1);

/**
* @brief Returns the global thread index in a 1D grid.
*
* The returned index is unique across the entire grid.
*
* @param thread_id The thread index within the block
* @param block_id The block index within the grid
* @param num_threads_per_block The number of threads per block
* @return thread_index_type The global thread index
*/
__device__ static constexpr thread_index_type global_thread_id(
thread_index_type thread_id,
thread_index_type block_id,
thread_index_type num_threads_per_block)
{
return thread_id + block_id * num_threads_per_block;
}

/**
* @brief Returns the global thread index of the current thread in a 1D grid.
*
* @return thread_index_type The global thread index
*/
static __device__ thread_index_type global_thread_id()
{
return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x);
}

/**
* @brief Returns the global thread index of the current thread in a 1D grid.
*
* @tparam num_threads_per_block The number of threads per block
*
* @return thread_index_type The global thread index
*/
template <thread_index_type num_threads_per_block>
static __device__ thread_index_type global_thread_id()
{
return global_thread_id(threadIdx.x, blockIdx.x, num_threads_per_block);
}

/**
* @brief Returns the stride of a 1D grid.
*
* The returned stride is the total number of threads in the grid.
*
* @param thread_id The thread index within the block
* @param block_id The block index within the grid
* @param num_threads_per_block The number of threads per block
* @return thread_index_type The global thread index
*/
__device__ static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block,
thread_index_type num_blocks_per_grid)
{
return num_threads_per_block * num_blocks_per_grid;
}

/**
* @brief Returns the stride of the current 1D grid.
*
* @return thread_index_type The number of threads in the grid.
*/
static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); }

/**
* @brief Returns the stride of the current 1D grid.
*
* @tparam num_threads_per_block The number of threads per block
*
* @return thread_index_type The number of threads in the grid.
*/
template <thread_index_type num_threads_per_block>
static __device__ thread_index_type grid_stride()
{
return grid_stride(num_threads_per_block, gridDim.x);
}
};

} // namespace detail
} // namespace CUDF_EXPORT cudf
1 change: 1 addition & 0 deletions cpp/include/cudf/detail/valid_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/grid_1d.cuh>
#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand Down
1 change: 1 addition & 0 deletions cpp/include/cudf/strings/detail/gather.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cudf/copying.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/grid_1d.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>
Expand Down
1 change: 1 addition & 0 deletions cpp/include/cudf/strings/detail/strings_children.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/grid_1d.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/utilities.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand Down
Loading
Loading