Fix OOB memory access in Orc and Parquet stacks from fixed-width unaligned loads #20458
Merged: rapids-bot merged 20 commits into rapidsai:main from mhaseeb123:fix/oob-access-hybrid-scan-dicts on Nov 5, 2025.

Commits (20), all authored by mhaseeb123:
- f36396b  Fix for OOB memory access in hybrid scan dictionary
- 3be50e7  Use the correct unaligned load in ORC as well
- 0d5629a  style fix
- ea3bf21  Use unaligned_load
- ebbc708  style fix
- db495d1  Fix OOB access for PARQUET_TEST
- 9ddc7af  Use static_assert to make sure our block size is big enough
- ac46598  style
- e0965ff  Modernize
- 52e7bab  Minor comments update
- 30bb183  Minor comments
- 21f966b  Merge branch 'main' into fix/oob-access-hybrid-scan-dicts
- 28f62b3  Use `cuda::std::is_same_v` instead
- 58a0697  Remove unused util and modernize `warp_reduce_or`
- cc4722d  Minor improvements
- 95a9100  Modernize more warp utils
- 34ecf92  Apply suggestion from @mhaseeb123
- 8bd293e  Merge branch 'main' of https://github.com/rapidsai/cudf into fix/oob-…
- abb20dd  Merge branch 'main' into fix/oob-access-hybrid-scan-dicts
- bc10cdb  Merge branch 'fix/oob-access-hybrid-scan-dicts' of https://github.com…
```diff
@@ -1,10 +1,15 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */

 #pragma once
-#include <cstdint>
+#include <cudf/detail/utilities/cuda.cuh>
+
+#include <cooperative_groups.h>
+#include <cub/cub.cuh>
+#include <cuda/std/cstdint>
+#include <cuda/std/cstring>

 namespace cudf {
 namespace io {
```
```diff
@@ -21,39 +26,20 @@ inline __device__ T shuffle_xor(T var, uint32_t delta)
   return __shfl_xor_sync(~0, var, delta);
 }

 inline __device__ void syncwarp() { __syncwarp(); }

 inline __device__ uint32_t ballot(int pred) { return __ballot_sync(~0, pred); }

 // Warp reduction helpers
-template <typename T>
-inline __device__ T WarpReduceOr2(T acc)
-{
-  return acc | shuffle_xor(acc, 1);
-}
-template <typename T>
-inline __device__ T WarpReduceOr4(T acc)
-{
-  acc = WarpReduceOr2(acc);
-  return acc | shuffle_xor(acc, 2);
-}
-template <typename T>
-inline __device__ T WarpReduceOr8(T acc)
-{
-  acc = WarpReduceOr4(acc);
-  return acc | shuffle_xor(acc, 4);
-}
-template <typename T>
-inline __device__ T WarpReduceOr16(T acc)
-{
-  acc = WarpReduceOr8(acc);
-  return acc | shuffle_xor(acc, 8);
-}
-template <typename T>
-inline __device__ T WarpReduceOr32(T acc)
-{
-  acc = WarpReduceOr16(acc);
-  return acc | shuffle_xor(acc, 16);
-}
+template <cudf::size_type size, typename T>
+inline __device__ T warp_reduce_or(T acc)
+{
+  static_assert(size >= 2 and size <= cudf::detail::warp_size and (size & (size - 1)) == 0,
+                "Size must be a power of 2 and less than or equal to the warp size");
+  if constexpr (size == 2) {
+    return acc | shuffle_xor(acc, 1);
+  } else {
+    acc = warp_reduce_or<size / 2>(acc);
+    return acc | shuffle_xor(acc, size / 2);
+  }
+}

 template <typename T>
```

Review comment: Modernized these into one.
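For readers less familiar with the pattern, the sketch below (illustrative only, not cudf code; `warp_size`, the kernel, and the launch are made up) shows how the recursive `if constexpr` template unrolls into the same shuffle ladder the removed `WarpReduceOr2` through `WarpReduceOr32` helpers spelled out by hand: `warp_reduce_or<32>` expands to XOR shuffles with lane offsets 1, 2, 4, 8, 16, after which every lane holds the OR of all 32 inputs.

```cuda
// Standalone sketch of the recursive warp-OR reduction pattern (names are
// illustrative; this mirrors the shape of the new warp_reduce_or helper).
#include <cstdint>
#include <cstdio>

constexpr int warp_size = 32;  // assumption: standard CUDA warp size

template <int size, typename T>
__device__ T warp_reduce_or_demo(T acc)
{
  static_assert(size >= 2 && size <= warp_size && (size & (size - 1)) == 0,
                "size must be a power of 2 and <= the warp size");
  if constexpr (size == 2) {
    return acc | __shfl_xor_sync(0xffffffffu, acc, 1);
  } else {
    acc = warp_reduce_or_demo<size / 2>(acc);
    return acc | __shfl_xor_sync(0xffffffffu, acc, size / 2);
  }
}

__global__ void demo(uint32_t* out)
{
  // Each lane contributes one distinct bit; after the reduction every lane
  // holds the OR of all 32 contributions, i.e. 0xffffffff.
  uint32_t const mine = 1u << (threadIdx.x % warp_size);
  out[threadIdx.x]    = warp_reduce_or_demo<warp_size>(mine);
}

int main()
{
  uint32_t* d_out;
  cudaMalloc(&d_out, warp_size * sizeof(uint32_t));
  demo<<<1, warp_size>>>(d_out);
  uint32_t h_out[warp_size];
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  std::printf("lane 0 result: 0x%08x\n", h_out[0]);  // expected: 0xffffffff
  cudaFree(d_out);
  return 0;
}
```

The `static_assert` added in the PR plays the same role as the one here: sizes that are not powers of two would leave lanes out of the reduction, so they are rejected at compile time.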
```diff
@@ -113,66 +99,56 @@ inline __device__ double Int128ToDouble_rn(uint64_t lo, int64_t hi)
   return sign * __fma_rn(__ll2double_rn(hi), 4294967296.0 * 4294967296.0, __ull2double_rn(lo));
 }

-inline __device__ uint32_t unaligned_load32(uint8_t const* p)
-{
-  uint32_t ofs = 3 & reinterpret_cast<uintptr_t>(p);
-  auto const* p32 = reinterpret_cast<uint32_t const*>(p - ofs);
-  uint32_t v = p32[0];
-  return (ofs) ? __funnelshift_r(v, p32[1], ofs * 8) : v;
-}
-
-inline __device__ uint64_t unaligned_load64(uint8_t const* p)
+template <typename T>
+  requires(cuda::std::is_same_v<T, uint32_t> or cuda::std::is_same_v<T, uint64_t>)
+inline __device__ T unaligned_load(uint8_t const* p)
 {
-  uint32_t ofs = 3 & reinterpret_cast<uintptr_t>(p);
-  auto const* p32 = reinterpret_cast<uint32_t const*>(p - ofs);
-  uint32_t v0 = p32[0];
-  uint32_t v1 = p32[1];
-  if (ofs) {
-    v0 = __funnelshift_r(v0, v1, ofs * 8);
-    v1 = __funnelshift_r(v1, p32[2], ofs * 8);
-  }
-  return (((uint64_t)v1) << 32) | v0;
+  T value;
+  cuda::std::memcpy(&value, p, sizeof(T));
+  return value;
 }
```
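This hunk is the heart of the fix described in the PR title. The removed helpers round `p` down to a 4-byte boundary and then read whole aligned words, funnel-shifting the wanted bytes into place; when `p` is unaligned, the extra word (`p32[1]`, or `p32[2]` in the 64-bit case) extends up to three bytes past the end of the value being read, so a load near the end of a buffer can touch memory outside the allocation. A `cuda::std::memcpy` of exactly `sizeof(T)` bytes never reads outside `[p, p + sizeof(T))`, and the compiler can lower it to byte loads (or a single load where alignment is provable). A minimal sketch of the two behaviours (illustrative only, not cudf code; the buffer size and names are made up):

```cuda
// Why the word-aligned funnel-shift load can run past the end of a buffer
// while a memcpy-based load cannot (illustrative sketch).
#include <cuda/std/cstring>
#include <cstdint>

__device__ uint32_t load_via_aligned_words(uint8_t const* p)
{
  // Old-style pattern: round down to a 4-byte boundary and read two words.
  uint32_t const ofs = 3 & reinterpret_cast<uintptr_t>(p);
  auto const* p32    = reinterpret_cast<uint32_t const*>(p - ofs);
  uint32_t const v   = p32[0];
  // When ofs != 0, p32[1] covers bytes up to (p - ofs) + 8, i.e. beyond p + 4.
  // If the allocation ends at p + 4, this word read is out of bounds.
  return ofs ? __funnelshift_r(v, p32[1], ofs * 8) : v;
}

__device__ uint32_t load_via_memcpy(uint8_t const* p)
{
  // New-style pattern: copy exactly the 4 bytes [p, p + 4).
  uint32_t v;
  cuda::std::memcpy(&v, p, sizeof(v));
  return v;
}

__global__ void read_last_word(uint8_t const* buf, uint32_t len, uint32_t* out)
{
  // Read the last 4 bytes of the buffer from an unaligned offset. The memcpy
  // form stays inside the allocation; the aligned-word form may be reported
  // by compute-sanitizer when len is not a multiple of 4.
  out[0] = load_via_memcpy(buf + len - 4);
  // out[1] = load_via_aligned_words(buf + len - 4);
}

int main()
{
  constexpr uint32_t len = 10;  // deliberately not a multiple of 4
  uint8_t* d_buf;
  uint32_t* d_out;
  cudaMalloc(&d_buf, len);
  cudaMalloc(&d_out, 2 * sizeof(uint32_t));
  cudaMemset(d_buf, 0xab, len);
  read_last_word<<<1, 1>>>(d_buf, len, d_out);
  cudaDeviceSynchronize();
  cudaFree(d_buf);
  cudaFree(d_out);
  return 0;
}
```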
```diff
-template <unsigned int nthreads, bool sync_before_store>
-inline __device__ void memcpy_block(void* dstv, void const* srcv, uint32_t len, uint32_t t)
-{
+template <uint32_t nthreads, bool sync_before_store>
+inline __device__ void memcpy_block(void* dstv,
+                                    void const* srcv,
+                                    uint32_t len,
+                                    cooperative_groups::thread_block const& block)
+{
+  static_assert(
+    nthreads >= sizeof(uint32_t),
+    "The kernel block size (nthreads) must be greater than or equal to the size of uint32_t");
+  auto const t = block.thread_rank();
   auto* dst = static_cast<uint8_t*>(dstv);
   auto const* src = static_cast<uint8_t const*>(srcv);
-  uint32_t dst_align_bytes, src_align_bytes, src_align_bits;
   // Align output to 32-bit
-  dst_align_bytes = 3 & -reinterpret_cast<intptr_t>(dst);
+  auto const dst_align_bytes = static_cast<uint32_t>(0x3 & -reinterpret_cast<intptr_t>(dst));
   if (dst_align_bytes != 0) {
-    uint32_t align_len = min(dst_align_bytes, len);
-    uint8_t b;
-    if (t < align_len) { b = src[t]; }
-    if constexpr (sync_before_store) { __syncthreads(); }
-    if (t < align_len) { dst[t] = b; }
+    auto const align_len = cuda::std::min<uint32_t>(dst_align_bytes, len);
+    uint8_t byte;
+    if (t < align_len) { byte = src[t]; }
+    if constexpr (sync_before_store) { block.sync(); }
+    if (t < align_len) { dst[t] = byte; }
     src += align_len;
     dst += align_len;
     len -= align_len;
   }
-  src_align_bytes = (uint32_t)(3 & reinterpret_cast<uintptr_t>(src));
-  src_align_bits = src_align_bytes * 8;
-  while (len >= 4) {
-    auto const* src32 = reinterpret_cast<uint32_t const*>(src - src_align_bytes);
-    uint32_t copy_cnt = min(len >> 2, nthreads);
-    uint32_t v;
-    if (t < copy_cnt) {
-      v = src32[t];
-      if (src_align_bits != 0) { v = __funnelshift_r(v, src32[t + 1], src_align_bits); }
-    }
-    if constexpr (sync_before_store) { __syncthreads(); }
-    if (t < copy_cnt) { reinterpret_cast<uint32_t*>(dst)[t] = v; }
-    src += copy_cnt * 4;
-    dst += copy_cnt * 4;
-    len -= copy_cnt * 4;
+  // Copy 32-bit chunks
+  while (len >= sizeof(uint32_t)) {
+    auto const copy_cnt = cuda::std::min<uint32_t>(len / sizeof(uint32_t), nthreads);
+    uint32_t value;
+    if (t < copy_cnt) { value = unaligned_load<uint32_t>(src + (t * sizeof(uint32_t))); }
+    if constexpr (sync_before_store) { block.sync(); }
+    if (t < copy_cnt) { reinterpret_cast<uint32_t*>(dst)[t] = value; }
+    src += copy_cnt * sizeof(uint32_t);
+    dst += copy_cnt * sizeof(uint32_t);
+    len -= copy_cnt * sizeof(uint32_t);
   }
   // Copy the remaining bytes
   if (len != 0) {
-    uint8_t b;
-    if (t < len) { b = src[t]; }
-    if constexpr (sync_before_store) { __syncthreads(); }
-    if (t < len) { dst[t] = b; }
+    uint8_t byte;
+    if (t < len) { byte = src[t]; }
+    if constexpr (sync_before_store) { block.sync(); }
+    if (t < len) { dst[t] = byte; }
   }
 }
```
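Downstream of the load change, `memcpy_block` itself also changes shape: callers now hand over a `cooperative_groups::thread_block` instead of a raw thread index, the helper derives the index via `thread_rank()`, and the new `static_assert` rejects block sizes smaller than a 32-bit word at compile time. A hedged usage sketch of the new calling convention (the stand-in helper, kernel, block size, and buffers below are illustrative, not cudf code; only the call shape follows the diff above):

```cuda
// How a caller drives the updated memcpy_block-style signature (sketch only).
#include <cooperative_groups.h>
#include <cstdint>

namespace cg = cooperative_groups;

constexpr uint32_t block_size = 128;  // assumption: a typical kernel block size

// Stand-in with the same shape as the updated helper, simplified to a plain
// byte-wise strided copy so this sketch compiles on its own.
template <uint32_t nthreads, bool sync_before_store>
__device__ void memcpy_block_like(void* dstv,
                                  void const* srcv,
                                  uint32_t len,
                                  cg::thread_block const& block)
{
  auto const t    = block.thread_rank();
  auto* dst       = static_cast<uint8_t*>(dstv);
  auto const* src = static_cast<uint8_t const*>(srcv);
  for (uint32_t i = t; i < len; i += nthreads) {
    dst[i] = src[i];
  }
}

__global__ void copy_kernel(uint8_t* dst, uint8_t const* src, uint32_t len)
{
  // Before this PR a caller passed a raw thread index (the old `uint32_t t`
  // parameter); now it passes the cooperative-groups thread block instead.
  memcpy_block_like<block_size, false>(dst, src, len, cg::this_thread_block());
}

int main()
{
  constexpr uint32_t len = 1000;
  uint8_t *d_src, *d_dst;
  cudaMalloc(&d_src, len);
  cudaMalloc(&d_dst, len);
  cudaMemset(d_src, 0x42, len);
  copy_kernel<<<1, block_size>>>(d_dst, d_src, len);
  cudaDeviceSynchronize();
  cudaFree(d_src);
  cudaFree(d_dst);
  return 0;
}
```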
Review comment: Removed as not being used anywhere.