Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 3 additions & 14 deletions cpp/src/io/comp/snap.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,6 @@ static inline __device__ uint32_t snap_hash(uint32_t v)
return (v * ((1 << 20) + (0x2a00) + (0x6a) + 1)) >> (32 - hash_bits);
}

/**
* @brief Fetches four consecutive bytes
*/
static inline __device__ uint32_t fetch4(uint8_t const* src)
{
uint32_t src_align = 3 & reinterpret_cast<uintptr_t>(src);
auto const* src32 = reinterpret_cast<uint32_t const*>(src - src_align);
uint32_t v = src32[0];
return (src_align) ? __funnelshift_r(v, src32[1], src_align * 8) : v;
}

/**
* @brief Outputs a snappy literal symbol
*
Expand Down Expand Up @@ -170,7 +159,7 @@ static __device__ uint32_t FindFourByteMatch(snap_state_s* s,
if (t == 0) { s->copy_length = 0; }
do {
bool valid4 = (pos + t + 4 <= len);
uint32_t data32 = (valid4) ? fetch4(src + pos + t) : 0;
uint32_t data32 = (valid4) ? cudf::io::unaligned_load32(src + pos + t) : 0;
uint32_t hash = (valid4) ? snap_hash(data32) : 0;
uint32_t local_match = HashMatchAny(hash, t);
uint32_t local_match_lane = 31 - __clz(local_match & ((1 << t) - 1));
Expand All @@ -183,8 +172,8 @@ static __device__ uint32_t FindFourByteMatch(snap_state_s* s,
} else {
offset = (pos & ~0xffff) | s->hash_map[hash];
if (offset >= pos) { offset = (offset >= 0x1'0000) ? offset - 0x1'0000 : pos; }
match =
(offset < pos && offset + max_copy_distance >= pos + t && fetch4(src + offset) == data32);
match = (offset < pos && offset + max_copy_distance >= pos + t &&
cudf::io::unaligned_load32(src + offset) == data32);
}
} else {
match = 0;
Expand Down
24 changes: 9 additions & 15 deletions cpp/src/io/utilities/block_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2019-2023, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once
#include <cstdint>
#include <cuda/std/cstdint>
#include <cuda/std/cstring>

namespace cudf {
namespace io {
Expand Down Expand Up @@ -115,23 +116,16 @@ inline __device__ double Int128ToDouble_rn(uint64_t lo, int64_t hi)

inline __device__ uint32_t unaligned_load32(uint8_t const* p)
{
uint32_t ofs = 3 & reinterpret_cast<uintptr_t>(p);
auto const* p32 = reinterpret_cast<uint32_t const*>(p - ofs);
uint32_t v = p32[0];
return (ofs) ? __funnelshift_r(v, p32[1], ofs * 8) : v;
uint32_t value;
cuda::std::memcpy(&value, p, sizeof(uint32_t));
return value;
}

inline __device__ uint64_t unaligned_load64(uint8_t const* p)
{
uint32_t ofs = 3 & reinterpret_cast<uintptr_t>(p);
auto const* p32 = reinterpret_cast<uint32_t const*>(p - ofs);
uint32_t v0 = p32[0];
uint32_t v1 = p32[1];
if (ofs) {
v0 = __funnelshift_r(v0, v1, ofs * 8);
v1 = __funnelshift_r(v1, p32[2], ofs * 8);
}
return (((uint64_t)v1) << 32) | v0;
uint64_t value;
cuda::std::memcpy(&value, p, sizeof(uint64_t));
return value;
}

template <unsigned int nthreads, bool sync_before_store>
Expand Down