Skip to content

Commit 534c8e3

Browse files
committed
Address review comments
1 parent d64ee0b commit 534c8e3

File tree

2 files changed

+11
-10
lines changed

2 files changed

+11
-10
lines changed

cpp/src/parquet/chunker_internal.cc

+9-8
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ namespace parquet::internal {
4545
/// This implementation uses cut-point skipping because it improves the overall
4646
/// performance and a more accurate alternative to have less skewed chunk size
4747
/// distribution. Instead of using two different masks (one with a lower and one with a
48-
/// probability of matching and switching them based on the actual chunk size), we rather
49-
/// use 8 different gear hash tables and require having 8 consecutive matches while
48+
/// higher probability of matching and switching them based on the actual chunk size), we
49+
/// rather use 8 different gear hash tables and require having 8 consecutive matches while
5050
/// switching between the used hashtables. This approach is based on central limit theorem
5151
/// and approximates normal distribution of the chunk sizes.
5252
//
@@ -139,8 +139,9 @@ bool ContentDefinedChunker::NeedNewChunk() {
139139
has_matched_ = false;
140140
// in order to have a normal distribution of chunk sizes, we only create a new chunk
141141
// if the adjused mask matches the rolling hash 8 times in a row, each run uses a
142-
// different gearhash table (gearhash's chunk size has exponential distribution, and
143-
// we use central limit theorem to approximate normal distribution)
142+
// different gearhash table (gearhash's chunk size has geometric distribution, and
143+
// we use central limit theorem to approximate normal distribution, see section 6.2.1
144+
// in paper https://www.cidrdb.org/cidr2023/papers/p43-low.pdf)
144145
if (ARROW_PREDICT_FALSE(++nth_run_ >= 7)) {
145146
nth_run_ = 0;
146147
chunk_size_ = 0;
@@ -158,10 +159,10 @@ bool ContentDefinedChunker::NeedNewChunk() {
158159
}
159160

160161
template <typename RollFunc>
161-
const std::vector<Chunk> ContentDefinedChunker::Calculate(const int16_t* def_levels,
162-
const int16_t* rep_levels,
163-
int64_t num_levels,
164-
const RollFunc& RollValue) {
162+
std::vector<Chunk> ContentDefinedChunker::Calculate(const int16_t* def_levels,
163+
const int16_t* rep_levels,
164+
int64_t num_levels,
165+
const RollFunc& RollValue) {
165166
std::vector<Chunk> chunks;
166167
int64_t offset;
167168
int64_t prev_offset = 0;

cpp/src/parquet/chunker_internal.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -152,8 +152,8 @@ class ContentDefinedChunker {
152152

153153
// Calculate the chunk boundaries for typed Arrow arrays.
154154
template <typename RollFunc>
155-
const std::vector<Chunk> Calculate(const int16_t* def_levels, const int16_t* rep_levels,
156-
int64_t num_levels, const RollFunc& RollValue);
155+
std::vector<Chunk> Calculate(const int16_t* def_levels, const int16_t* rep_levels,
156+
int64_t num_levels, const RollFunc& RollValue);
157157

158158
// Reference to the column's level information
159159
const internal::LevelInfo& level_info_;

0 commit comments

Comments
 (0)