@@ -45,8 +45,8 @@ namespace parquet::internal {
45
45
// / This implementation uses cut-point skipping because it improves the overall
46
46
// / performance, and uses a more accurate alternative to get a less skewed chunk size
47
47
// / distribution. Instead of using two different masks (one with a lower and one with a
48
- // / probability of matching and switching them based on the actual chunk size), we rather
49
- // / use 8 different gear hash tables and require having 8 consecutive matches while
48
+ // / higher probability of matching and switching them based on the actual chunk size), we
49
+ // / rather use 8 different gear hash tables and require having 8 consecutive matches while
50
50
// / switching between the used hashtables. This approach is based on the central limit theorem
51
51
// / and approximates a normal distribution of the chunk sizes.
52
52
//
@@ -139,8 +139,9 @@ bool ContentDefinedChunker::NeedNewChunk() {
139
139
has_matched_ = false ;
140
140
// in order to have a normal distribution of chunk sizes, we only create a new chunk
141
141
// if the adjusted mask matches the rolling hash 8 times in a row; each run uses a
142
- // different gearhash table (gearhash's chunk size has exponential distribution, and
143
- // we use central limit theorem to approximate normal distribution)
142
+ // different gearhash table (gearhash's chunk size has a geometric distribution, and
143
+ // we use the central limit theorem to approximate a normal distribution; see section 6.2.1
144
+ // in paper https://www.cidrdb.org/cidr2023/papers/p43-low.pdf)
144
145
if (ARROW_PREDICT_FALSE (++nth_run_ >= 7 )) {
145
146
nth_run_ = 0 ;
146
147
chunk_size_ = 0 ;
@@ -158,10 +159,10 @@ bool ContentDefinedChunker::NeedNewChunk() {
158
159
}
159
160
160
161
template <typename RollFunc>
161
- const std::vector<Chunk> ContentDefinedChunker::Calculate (const int16_t * def_levels,
162
- const int16_t * rep_levels,
163
- int64_t num_levels,
164
- const RollFunc& RollValue) {
162
+ std::vector<Chunk> ContentDefinedChunker::Calculate (const int16_t * def_levels,
163
+ const int16_t * rep_levels,
164
+ int64_t num_levels,
165
+ const RollFunc& RollValue) {
165
166
std::vector<Chunk> chunks;
166
167
int64_t offset;
167
168
int64_t prev_offset = 0 ;
0 commit comments