Skip to content

Commit 50884d0

Browse files
committed
fix use .getValue() for binary arrays
1 parent 3242099 commit 50884d0

File tree

2 files changed

+47
-38
lines changed

2 files changed

+47
-38
lines changed

cpp/src/parquet/chunker_internal.cc

+41-35
Original file line numberDiff line numberDiff line change
@@ -69,30 +69,33 @@ void ContentDefinedChunker::Roll(const bool value) {
6969
has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0);
7070
}
7171

72-
template <typename T>
73-
void ContentDefinedChunker::Roll(const T* value) {
74-
constexpr size_t BYTE_WIDTH = sizeof(T);
75-
chunk_size_ += BYTE_WIDTH;
72+
template <int ByteWidth>
73+
void ContentDefinedChunker::Roll(const uint8_t* value) {
74+
chunk_size_ += ByteWidth;
7675
if (chunk_size_ < min_size_) {
7776
// short-circuit if we haven't reached the minimum chunk size, this speeds up the
7877
// chunking process since the gearhash doesn't need to be updated
7978
return;
8079
}
81-
auto bytes = reinterpret_cast<const uint8_t*>(value);
82-
for (size_t i = 0; i < BYTE_WIDTH; ++i) {
83-
rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][bytes[i]];
80+
for (size_t i = 0; i < ByteWidth; ++i) {
81+
rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value[i]];
8482
has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0);
8583
}
8684
}
8785

88-
void ContentDefinedChunker::Roll(const uint8_t* value, int64_t num_bytes) {
89-
chunk_size_ += num_bytes;
86+
template <typename T>
87+
void ContentDefinedChunker::Roll(const T* value) {
88+
return Roll<sizeof(T)>(reinterpret_cast<const uint8_t*>(value));
89+
}
90+
91+
void ContentDefinedChunker::Roll(const uint8_t* value, int64_t length) {
92+
chunk_size_ += length;
9093
if (chunk_size_ < min_size_) {
9194
// short-circuit if we haven't reached the minimum chunk size, this speeds up the
9295
// chunking process since the gearhash doesn't need to be updated
9396
return;
9497
}
95-
for (int64_t i = 0; i < num_bytes; ++i) {
98+
for (auto i = 0; i < length; ++i) {
9699
rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value[i]];
97100
has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0);
98101
}
@@ -202,21 +205,22 @@ const std::vector<Chunk> ContentDefinedChunker::Calculate(const int16_t* def_lev
202205
return chunks;
203206
}
204207

205-
#define FIXED_WIDTH_CASE(CType) \
206-
{ \
207-
const auto raw_values = values.data()->GetValues<CType>(1); \
208-
return Calculate(def_levels, rep_levels, num_levels, \
209-
[&](int64_t i) { return Roll(raw_values + i); }); \
208+
#define FIXED_WIDTH_CASE(ByteWidth) \
209+
{ \
210+
const auto raw_values = values.data()->GetValues<uint8_t>(1); \
211+
return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { \
212+
return Roll<ByteWidth>(raw_values + i * ByteWidth); \
213+
}); \
210214
}
211215

212-
#define BINARY_LIKE_CASE(OffsetCType) \
216+
#define BINARY_LIKE_CASE(ArrayType) \
213217
{ \
214-
const auto raw_offsets = values.data()->GetValues<OffsetCType>(1); \
215-
const auto raw_values = values.data()->GetValues<uint8_t>(2); \
218+
const auto& array = static_cast<const ArrayType&>(values); \
219+
const uint8_t* value; \
220+
ArrayType::offset_type length; \
216221
return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { \
217-
const OffsetCType pos = raw_offsets[i]; \
218-
const OffsetCType length = raw_offsets[i + 1] - pos; \
219-
Roll(raw_values + pos, length); \
222+
value = array.GetValue(i, &length); \
223+
Roll(value, length); \
220224
}); \
221225
}
222226

@@ -235,40 +239,42 @@ const std::vector<Chunk> ContentDefinedChunker::GetBoundaries(
235239
}
236240
case ::arrow::Type::INT8:
237241
case ::arrow::Type::UINT8:
238-
FIXED_WIDTH_CASE(uint8_t)
242+
FIXED_WIDTH_CASE(1)
239243
case ::arrow::Type::INT16:
240244
case ::arrow::Type::UINT16:
241245
case ::arrow::Type::HALF_FLOAT:
242-
FIXED_WIDTH_CASE(uint16_t)
246+
FIXED_WIDTH_CASE(2)
243247
case ::arrow::Type::INT32:
244248
case ::arrow::Type::UINT32:
245249
case ::arrow::Type::FLOAT:
246250
case ::arrow::Type::DATE32:
247251
case ::arrow::Type::TIME32:
248-
FIXED_WIDTH_CASE(uint32_t)
252+
FIXED_WIDTH_CASE(4)
249253
case ::arrow::Type::INT64:
250254
case ::arrow::Type::UINT64:
251255
case ::arrow::Type::DOUBLE:
252256
case ::arrow::Type::DATE64:
253257
case ::arrow::Type::TIME64:
254258
case ::arrow::Type::TIMESTAMP:
255259
case ::arrow::Type::DURATION:
256-
FIXED_WIDTH_CASE(uint64_t)
260+
FIXED_WIDTH_CASE(8)
261+
case ::arrow::Type::DECIMAL128:
262+
FIXED_WIDTH_CASE(16)
263+
case ::arrow::Type::DECIMAL256:
264+
FIXED_WIDTH_CASE(32)
257265
case ::arrow::Type::BINARY:
266+
BINARY_LIKE_CASE(::arrow::BinaryArray)
258267
case ::arrow::Type::STRING:
259-
BINARY_LIKE_CASE(int32_t)
268+
BINARY_LIKE_CASE(::arrow::StringArray)
260269
case ::arrow::Type::LARGE_BINARY:
270+
BINARY_LIKE_CASE(::arrow::LargeBinaryArray)
261271
case ::arrow::Type::LARGE_STRING:
262-
BINARY_LIKE_CASE(int64_t)
263-
case ::arrow::Type::DECIMAL128:
264-
case ::arrow::Type::DECIMAL256:
272+
BINARY_LIKE_CASE(::arrow::LargeStringArray)
265273
case ::arrow::Type::FIXED_SIZE_BINARY: {
266-
const auto raw_values = values.data()->GetValues<uint8_t>(1);
267-
const auto byte_width =
268-
static_cast<const ::arrow::FixedSizeBinaryArray&>(values).byte_width();
269-
return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) {
270-
return Roll(raw_values + i * byte_width, byte_width);
271-
});
274+
const auto& array = static_cast<const ::arrow::FixedSizeBinaryArray&>(values);
275+
const auto byte_width = array.byte_width();
276+
return Calculate(def_levels, rep_levels, num_levels,
277+
[&](int64_t i) { Roll(array.GetValue(i), byte_width); });
272278
}
273279
case ::arrow::Type::DICTIONARY:
274280
return GetBoundaries(

cpp/src/parquet/chunker_internal.h

+6-3
Original file line numberDiff line numberDiff line change
@@ -132,16 +132,19 @@ class ContentDefinedChunker {
132132
const ::arrow::Array& values);
133133

134134
private:
135-
void Roll(const bool value);
135+
inline void Roll(const bool value);
136136

137137
// Update the rolling hash with a compile-time known sized value, set has_matched_ to
138138
// true if the hash matches the mask.
139+
template <int ByteWidth>
140+
void inline Roll(const uint8_t* value);
141+
139142
template <typename T>
140-
void Roll(const T* value);
143+
inline void Roll(const T* value);
141144

142145
// Update the rolling hash with a binary-like value, set has_matched_ to true if the
143146
// hash matches the mask.
144-
void Roll(const uint8_t* value, int64_t num_bytes);
147+
inline void Roll(const uint8_t* value, int64_t length);
145148

146149
// Evaluate whether a new chunk should be created based on the has_matched_, nth_run_
147150
// and chunk_size_ state.

0 commit comments

Comments
 (0)