28 | 28 |
29 | 29 | namespace parquet::internal {
30 | 30 |
| 31 | +/// Calculate the mask to use for the rolling hash. The mask is used to determine if a
| 32 | +/// new chunk should be created based on the rolling hash value, and is calculated
| 33 | +/// from the min_size, max_size and norm_factor parameters.
| 34 | +///
| 35 | +/// Assuming that the gear hash produces random values with a uniform distribution, each
| 36 | +/// bit in the actual value of rolling_hash_ has an equal probability of being set, so a
| 37 | +/// mask with the top N bits set has a probability of 1/2^N of matching the rolling hash.
| 38 | +/// This is the judgment criterion for the original gear hash based content-defined
| 39 | +/// chunking. Its main drawback is the non-uniform distribution of the chunk sizes.
| 40 | +///
| 41 | +/// FastCDC later improved on this process by introducing:
| 42 | +/// - sub-minimum chunk cut-point skipping (not hashing the first `min_size` bytes)
| 43 | +/// - chunk size normalization (using two masks)
| 44 | +///
| 45 | +/// This implementation uses cut-point skipping because it improves the overall
| 46 | +/// performance, and a more accurate alternative to chunk size normalization in order to
| 47 | +/// get a less skewed chunk size distribution. Instead of using two different masks (one
| 48 | +/// with a lower and one with a higher probability of matching, switched based on the
| 49 | +/// actual chunk size), we use 8 different gear hash tables and require 8 consecutive
| 50 | +/// matches while switching between the hash tables. This approach is based on the
| 51 | +/// central limit theorem and approximates a normal distribution of the chunk sizes.
| 52 | +//
| 53 | +// @param min_size The minimum chunk size (default 256KiB)
| 54 | +// @param max_size The maximum chunk size (default 1MiB)
| 55 | +// @param norm_factor Normalization factor (default 0)
| 56 | +// @return The mask used to compare against the rolling hash
31 | 57 | static uint64_t GetMask(int64_t min_size, int64_t max_size, uint8_t norm_factor) {
32 | | - // we aim for gaussian-like distribution of chunk sizes between min_size and max_size
| 58 | + // calculate the average size of the chunks |
33 | 59 | int64_t avg_size = (min_size + max_size) / 2;
34 | | - // we skip calculating gearhash for the first `min_size` bytes, so we are looking for
35 | | - // a smaller chunk as the average size
| 60 | + // since we are skipping the first `min_size` bytes for each chunk, we need to
| 61 | + // target a smaller chunk size so that the average size is still reached after
| 62 | + // accounting for the skipped bytes
36 | 63 | int64_t target_size = avg_size - min_size;
| 64 | + // assuming that the gear hash has a uniform distribution, we can calculate the mask |
| 65 | + // by taking the log2 of the target size |
37 | 66 | size_t mask_bits = static_cast<size_t>(std::floor(std::log2(target_size)));
38 | | - // -3 because we are using 8 hash tables to have more gaussian-like distribution
39 | | - // `norm_factor` narrows the chunk size distribution aroun avg_size
| 67 | + // -3 because we are using 8 hash tables to get a more gaussian-like distribution;
| 68 | + // a user-defined `norm_factor` can be used to adjust the mask size and hence the
| 69 | + // matching probability: increasing norm_factor increases the probability of matching
| 70 | + // the mask, forcing the distribution closer to the average size
40 | 71 | size_t effective_bits = mask_bits - 3 - norm_factor;
41 | 72 | return std::numeric_limits<uint64_t>::max() << (64 - effective_bits);
42 | 73 | }
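
For the default parameters this works out as follows. The snippet below is a minimal standalone sketch (not part of the patch) that just replays the arithmetic of `GetMask` with the stated defaults:

```cpp
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>

int main() {
  const int64_t min_size = 256 * 1024;   // 256 KiB
  const int64_t max_size = 1024 * 1024;  // 1 MiB
  const uint8_t norm_factor = 0;

  const int64_t avg_size = (min_size + max_size) / 2;  // 655360
  const int64_t target_size = avg_size - min_size;     // 393216
  // floor(log2(393216)) = 18, so 18 - 3 - 0 = 15 effective bits
  const size_t mask_bits = static_cast<size_t>(std::floor(std::log2(target_size)));
  const size_t effective_bits = mask_bits - 3 - norm_factor;

  // top 15 bits set -> prints fffe000000000000
  const uint64_t mask = std::numeric_limits<uint64_t>::max() << (64 - effective_bits);
  std::cout << std::hex << mask << std::endl;
  return 0;
}
```

With `norm_factor = 1` the same computation yields 14 effective bits, doubling the per-byte match probability and pulling chunk sizes closer to the average.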
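
To make the "8 consecutive matches while switching gear tables" judgment from the doc comment concrete, here is an illustrative sketch. The table contents, the function name `FindChunkBoundary`, and the `(rolling_hash & mask) == 0` match condition are assumptions for illustration only, not the actual parquet::internal implementation:

```cpp
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>

// Hypothetical gear tables: 8 tables of 256 random 64-bit constants. A simple
// xorshift64 fill keeps the sketch self-contained; real tables are precomputed.
using GearTables = std::array<std::array<uint64_t, 256>, 8>;

GearTables MakeGearTables() {
  GearTables tables{};
  uint64_t state = 0x9E3779B97F4A7C15ULL;
  for (auto& table : tables) {
    for (auto& value : table) {
      state ^= state << 13;
      state ^= state >> 7;
      state ^= state << 17;
      value = state;
    }
  }
  return tables;
}

// Scan for a chunk boundary: skip the first min_size bytes entirely, then update the
// rolling hash from one of the 8 tables in turn and cut only after 8 consecutive
// mask matches; fall back to a forced cut at max_size.
size_t FindChunkBoundary(const uint8_t* data, size_t size, uint64_t mask,
                         size_t min_size, size_t max_size, const GearTables& tables) {
  uint64_t rolling_hash = 0;
  size_t matches = 0;
  for (size_t i = std::min(min_size, size); i < std::min(max_size, size); ++i) {
    rolling_hash = (rolling_hash << 1) + tables[matches][data[i]];
    if ((rolling_hash & mask) == 0) {
      if (++matches == tables.size()) return i + 1;  // 8 consecutive matches
    } else {
      matches = 0;  // a miss restarts the run of matches
    }
  }
  return std::min(max_size, size);
}
```

Requiring a run of 8 matches (rather than a single match against a stricter mask) is what sums 8 roughly independent waiting times, which by the central limit theorem concentrates chunk sizes around the average.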