@@ -69,30 +69,33 @@ void ContentDefinedChunker::Roll(const bool value) {
69
69
has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0 );
70
70
}
71
71
72
- template <typename T>
73
- void ContentDefinedChunker::Roll (const T* value) {
74
- constexpr size_t BYTE_WIDTH = sizeof (T);
75
- chunk_size_ += BYTE_WIDTH;
72
+ template <int ByteWidth>
73
+ void ContentDefinedChunker::Roll (const uint8_t * value) {
74
+ chunk_size_ += ByteWidth;
76
75
if (chunk_size_ < min_size_) {
77
76
// short-circuit if we haven't reached the minimum chunk size, this speeds up the
78
77
// chunking process since the gearhash doesn't need to be updated
79
78
return ;
80
79
}
81
- auto bytes = reinterpret_cast <const uint8_t *>(value);
82
- for (size_t i = 0 ; i < BYTE_WIDTH; ++i) {
83
- rolling_hash_ = (rolling_hash_ << 1 ) + kGearhashTable [nth_run_][bytes[i]];
80
+ for (size_t i = 0 ; i < ByteWidth; ++i) {
81
+ rolling_hash_ = (rolling_hash_ << 1 ) + kGearhashTable [nth_run_][value[i]];
84
82
has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0 );
85
83
}
86
84
}
87
85
88
- void ContentDefinedChunker::Roll (const uint8_t * value, int64_t num_bytes) {
89
- chunk_size_ += num_bytes;
86
+ template <typename T>
87
+ void ContentDefinedChunker::Roll (const T* value) {
88
+ return Roll<sizeof (T)>(reinterpret_cast <const uint8_t *>(value));
89
+ }
90
+
91
+ void ContentDefinedChunker::Roll (const uint8_t * value, int64_t length) {
92
+ chunk_size_ += length;
90
93
if (chunk_size_ < min_size_) {
91
94
// short-circuit if we haven't reached the minimum chunk size, this speeds up the
92
95
// chunking process since the gearhash doesn't need to be updated
93
96
return ;
94
97
}
95
- for (int64_t i = 0 ; i < num_bytes ; ++i) {
98
+ for (auto i = 0 ; i < length ; ++i) {
96
99
rolling_hash_ = (rolling_hash_ << 1 ) + kGearhashTable [nth_run_][value[i]];
97
100
has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0 );
98
101
}
@@ -202,21 +205,22 @@ const std::vector<Chunk> ContentDefinedChunker::Calculate(const int16_t* def_lev
202
205
return chunks;
203
206
}
204
207
205
// Expands to the body of a `case` for types whose values each occupy exactly
// ByteWidth bytes in buffer 1 of `values`; every value is rolled through
// Roll<ByteWidth>.
//
// The buffer is read with an absolute offset of 0 and the array offset is
// applied by hand in bytes. GetValues<uint8_t>(1) would advance the pointer by
// `offset` bytes rather than `offset` elements, so a sliced array with
// ByteWidth > 1 would be hashed starting from the wrong position.
#define FIXED_WIDTH_CASE(ByteWidth)                                          \
  {                                                                          \
    const uint8_t* raw_values = values.data()->GetValues<uint8_t>(1, 0) +    \
                                values.offset() * (ByteWidth);               \
    return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) {    \
      return Roll<ByteWidth>(raw_values + i * (ByteWidth));                  \
    });                                                                      \
  }
211
215
212
// Expands to the body of a `case` for variable-length binary/string arrays.
// `values` is downcast to ArrayType; for each index handed over by Calculate
// the value's bytes and length are fetched through GetValue and passed to the
// variable-length Roll overload.
#define BINARY_LIKE_CASE(ArrayType)                                          \
  {                                                                          \
    const auto& typed_array = static_cast<const ArrayType&>(values);         \
    return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) {    \
      ArrayType::offset_type num_bytes;                                      \
      const uint8_t* bytes = typed_array.GetValue(i, &num_bytes);            \
      Roll(bytes, num_bytes);                                                \
    });                                                                      \
  }
222
226
@@ -235,40 +239,42 @@ const std::vector<Chunk> ContentDefinedChunker::GetBoundaries(
235
239
}
236
240
case ::arrow::Type::INT8:
237
241
case ::arrow::Type::UINT8:
238
- FIXED_WIDTH_CASE (uint8_t )
242
+ FIXED_WIDTH_CASE (1 )
239
243
case ::arrow::Type::INT16:
240
244
case ::arrow::Type::UINT16:
241
245
case ::arrow::Type::HALF_FLOAT:
242
- FIXED_WIDTH_CASE (uint16_t )
246
+ FIXED_WIDTH_CASE (2 )
243
247
case ::arrow::Type::INT32:
244
248
case ::arrow::Type::UINT32:
245
249
case ::arrow::Type::FLOAT:
246
250
case ::arrow::Type::DATE32:
247
251
case ::arrow::Type::TIME32:
248
- FIXED_WIDTH_CASE (uint32_t )
252
+ FIXED_WIDTH_CASE (4 )
249
253
case ::arrow::Type::INT64:
250
254
case ::arrow::Type::UINT64:
251
255
case ::arrow::Type::DOUBLE:
252
256
case ::arrow::Type::DATE64:
253
257
case ::arrow::Type::TIME64:
254
258
case ::arrow::Type::TIMESTAMP:
255
259
case ::arrow::Type::DURATION:
256
- FIXED_WIDTH_CASE (uint64_t )
260
+ FIXED_WIDTH_CASE (8 )
261
+ case ::arrow::Type::DECIMAL128:
262
+ FIXED_WIDTH_CASE (16 )
263
+ case ::arrow::Type::DECIMAL256:
264
+ FIXED_WIDTH_CASE (32 )
257
265
case ::arrow::Type::BINARY:
266
+ BINARY_LIKE_CASE (::arrow::BinaryArray)
258
267
case ::arrow::Type::STRING:
259
- BINARY_LIKE_CASE (int32_t )
268
+ BINARY_LIKE_CASE (::arrow::StringArray )
260
269
case ::arrow::Type::LARGE_BINARY:
270
+ BINARY_LIKE_CASE (::arrow::LargeBinaryArray)
261
271
case ::arrow::Type::LARGE_STRING:
262
- BINARY_LIKE_CASE (int64_t )
263
- case ::arrow::Type::DECIMAL128:
264
- case ::arrow::Type::DECIMAL256:
272
+ BINARY_LIKE_CASE (::arrow::LargeStringArray)
265
273
case ::arrow::Type::FIXED_SIZE_BINARY: {
266
- const auto raw_values = values.data ()->GetValues <uint8_t >(1 );
267
- const auto byte_width =
268
- static_cast <const ::arrow::FixedSizeBinaryArray&>(values).byte_width ();
269
- return Calculate (def_levels, rep_levels, num_levels, [&](int64_t i) {
270
- return Roll (raw_values + i * byte_width, byte_width);
271
- });
274
+ const auto & array = static_cast <const ::arrow::FixedSizeBinaryArray&>(values);
275
+ const auto byte_width = array.byte_width ();
276
+ return Calculate (def_levels, rep_levels, num_levels,
277
+ [&](int64_t i) { Roll (array.GetValue (i), byte_width); });
272
278
}
273
279
case ::arrow::Type::DICTIONARY:
274
280
return GetBoundaries (
0 commit comments