Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions cpp/src/io/avro/avro_gpu.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#include "avro_gpu.hpp"
Expand Down Expand Up @@ -182,7 +182,7 @@ avro_decode_row(schemadesc_s const* schema,
if (dataptr != nullptr && dst_row >= 0) {
uint32_t v;
if (cur + 3 < end) {
v = unaligned_load32(cur);
v = unaligned_load<uint32_t>(cur);
cur += 4;
} else {
v = 0;
Expand All @@ -198,7 +198,7 @@ avro_decode_row(schemadesc_s const* schema,
if (dataptr != nullptr && dst_row >= 0) {
uint64_t v;
if (cur + 7 < end) {
v = unaligned_load64(cur);
v = unaligned_load<uint64_t>(cur);
cur += 8;
} else {
v = 0;
Expand Down
17 changes: 3 additions & 14 deletions cpp/src/io/comp/snap.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,6 @@ static inline __device__ uint32_t snap_hash(uint32_t v)
return (v * ((1 << 20) + (0x2a00) + (0x6a) + 1)) >> (32 - hash_bits);
}

/**
* @brief Fetches four consecutive bytes
*/
static inline __device__ uint32_t fetch4(uint8_t const* src)
{
uint32_t src_align = 3 & reinterpret_cast<uintptr_t>(src);
auto const* src32 = reinterpret_cast<uint32_t const*>(src - src_align);
uint32_t v = src32[0];
return (src_align) ? __funnelshift_r(v, src32[1], src_align * 8) : v;
}

/**
* @brief Outputs a snappy literal symbol
*
Expand Down Expand Up @@ -170,7 +159,7 @@ static __device__ uint32_t FindFourByteMatch(snap_state_s* s,
if (t == 0) { s->copy_length = 0; }
do {
bool valid4 = (pos + t + 4 <= len);
uint32_t data32 = (valid4) ? fetch4(src + pos + t) : 0;
uint32_t data32 = (valid4) ? cudf::io::unaligned_load<uint32_t>(src + pos + t) : 0;
uint32_t hash = (valid4) ? snap_hash(data32) : 0;
uint32_t local_match = HashMatchAny(hash, t);
uint32_t local_match_lane = 31 - __clz(local_match & ((1 << t) - 1));
Expand All @@ -183,8 +172,8 @@ static __device__ uint32_t FindFourByteMatch(snap_state_s* s,
} else {
offset = (pos & ~0xffff) | s->hash_map[hash];
if (offset >= pos) { offset = (offset >= 0x1'0000) ? offset - 0x1'0000 : pos; }
match =
(offset < pos && offset + max_copy_distance >= pos + t && fetch4(src + offset) == data32);
match = (offset < pos && offset + max_copy_distance >= pos + t &&
cudf::io::unaligned_load<uint32_t>(src + offset) == data32);
}
} else {
match = 0;
Expand Down
18 changes: 9 additions & 9 deletions cpp/src/io/parquet/experimental/dictionary_page_filter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,8 @@ __device__ __forceinline__ void decode_int96timestamp(uint8_t const* int96_ptr,
// Note: This function has been modified from the original at
// https://github.com/rapidsai/cudf/blob/c89c83c00c729a86c56570693b627f31408bc2c9/cpp/src/io/parquet/page_data.cuh#L133-L198

int64_t nanos = cudf::io::unaligned_load64(int96_ptr);
int64_t days = cudf::io::unaligned_load32(int96_ptr + sizeof(int64_t));
int64_t nanos = cudf::io::unaligned_load<uint64_t>(int96_ptr);
int64_t days = cudf::io::unaligned_load<uint32_t>(int96_ptr + sizeof(int64_t));

// Convert from Julian day at noon to UTC seconds
cudf::duration_D duration_days{
Expand Down Expand Up @@ -489,7 +489,7 @@ __device__ T decode_fixed_width_value(PageInfo const& page,
// Handle timestamps
if constexpr (cudf::is_timestamp<T>()) {
auto const timestamp =
cudf::io::unaligned_load32(page_data + (value_idx * sizeof(uint32_t)));
cudf::io::unaligned_load<uint32_t>(page_data + (value_idx * sizeof(uint32_t)));
if (timestamp_scale != 0) {
decoded_value = T{typename T::duration(static_cast<typename T::rep>(timestamp))};
} else {
Expand All @@ -513,7 +513,7 @@ __device__ T decode_fixed_width_value(PageInfo const& page,
}

auto const duration =
cudf::io::unaligned_load32(page_data + (value_idx * sizeof(uint32_t)));
cudf::io::unaligned_load<uint32_t>(page_data + (value_idx * sizeof(uint32_t)));
decoded_value = T{static_cast<typename T::rep>(duration)};
}
// Handle other int32 encoded values including smaller bitwidths and decimal32
Expand All @@ -523,8 +523,8 @@ __device__ T decode_fixed_width_value(PageInfo const& page,
set_error(error, decode_error::INVALID_DATA_TYPE);
return {};
}
decoded_value =
static_cast<T>(cudf::io::unaligned_load32(page_data + (value_idx * sizeof(uint32_t))));
decoded_value = static_cast<T>(
cudf::io::unaligned_load<uint32_t>(page_data + (value_idx * sizeof(uint32_t))));
}
break;
}
Expand All @@ -538,7 +538,7 @@ __device__ T decode_fixed_width_value(PageInfo const& page,
// Handle timestamps
if constexpr (cudf::is_timestamp<T>()) {
int64_t const timestamp =
cudf::io::unaligned_load64(page_data + (value_idx * sizeof(int64_t)));
cudf::io::unaligned_load<uint64_t>(page_data + (value_idx * sizeof(int64_t)));
if (timestamp_scale != 0) {
decoded_value = T{typename T::duration(
static_cast<typename T::rep>(convert_to_timestamp64(timestamp, timestamp_scale)))};
Expand All @@ -548,8 +548,8 @@ __device__ T decode_fixed_width_value(PageInfo const& page,
}
// Handle durations and other int64 encoded values including decimal64
else {
decoded_value =
static_cast<T>(cudf::io::unaligned_load64(page_data + (value_idx * sizeof(uint64_t))));
decoded_value = static_cast<T>(
cudf::io::unaligned_load<uint64_t>(page_data + (value_idx * sizeof(uint64_t))));
}
break;
}
Expand Down
13 changes: 2 additions & 11 deletions cpp/src/io/parquet/page_data.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -64,20 +64,11 @@ inline __device__ void gpuStoreOutput(uint32_t* dst,
uint32_t dict_pos,
uint32_t dict_size)
{
uint32_t bytebuf;
unsigned int ofs = 3 & reinterpret_cast<size_t>(src8);
src8 -= ofs; // align to 32-bit boundary
ofs <<= 3; // bytes -> bits
if (dict_pos < dict_size) {
bytebuf = *reinterpret_cast<uint32_t const*>(src8 + dict_pos);
if (ofs) {
uint32_t bytebufnext = *reinterpret_cast<uint32_t const*>(src8 + dict_pos + 4);
bytebuf = __funnelshift_r(bytebuf, bytebufnext, ofs);
}
*dst = cudf::io::unaligned_load<uint32_t>(src8 + dict_pos);
} else {
bytebuf = 0;
*dst = 0;
}
*dst = bytebuf;
}

/**
Expand Down
29 changes: 9 additions & 20 deletions cpp/src/io/utilities/block_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2019-2023, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once
#include <cstdint>
#include <cuda/std/cstdint>
#include <cuda/std/cstring>

namespace cudf {
namespace io {
Expand Down Expand Up @@ -113,25 +114,13 @@ inline __device__ double Int128ToDouble_rn(uint64_t lo, int64_t hi)
return sign * __fma_rn(__ll2double_rn(hi), 4294967296.0 * 4294967296.0, __ull2double_rn(lo));
}

inline __device__ uint32_t unaligned_load32(uint8_t const* p)
{
uint32_t ofs = 3 & reinterpret_cast<uintptr_t>(p);
auto const* p32 = reinterpret_cast<uint32_t const*>(p - ofs);
uint32_t v = p32[0];
return (ofs) ? __funnelshift_r(v, p32[1], ofs * 8) : v;
}

inline __device__ uint64_t unaligned_load64(uint8_t const* p)
template <typename T>
requires(std::is_same_v<T, uint32_t> or std::is_same_v<T, uint64_t>)
inline __device__ T unaligned_load(uint8_t const* p)
{
uint32_t ofs = 3 & reinterpret_cast<uintptr_t>(p);
auto const* p32 = reinterpret_cast<uint32_t const*>(p - ofs);
uint32_t v0 = p32[0];
uint32_t v1 = p32[1];
if (ofs) {
v0 = __funnelshift_r(v0, v1, ofs * 8);
v1 = __funnelshift_r(v1, p32[2], ofs * 8);
}
return (((uint64_t)v1) << 32) | v0;
T value;
cuda::std::memcpy(&value, p, sizeof(T));
return value;
}

template <unsigned int nthreads, bool sync_before_store>
Expand Down
Loading