Turbopack: Refactor compression dictionary #82336
base: sokra/block-in-place-read
@@ -1,6 +1,6 @@
 use std::{
     borrow::Cow,
-    cmp::min,
+    cmp::{max, min},
     fs::File,
     io::{BufWriter, Seek, Write},
     path::Path,
@@ -35,11 +35,9 @@ const AMQF_FALSE_POSITIVE_RATE: f64 = 0.01;
 const KEY_COMPRESSION_DICTIONARY_SIZE: usize = 64 * 1024 - 1;
 /// The maximum bytes that should be selected as key samples to create a compression dictionary
 const KEY_COMPRESSION_SAMPLES_SIZE: usize = 256 * 1024;
-/// The minimum bytes that should be selected as keys samples. Below that no compression dictionary
+/// The minimum bytes that should be selected as key samples. Below that no compression dictionary
 /// is used.
 const MIN_KEY_COMPRESSION_SAMPLES_SIZE: usize = 1024;
-/// The bytes that are used per key entry for a sample.
-const COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY: usize = 100;
 /// The minimum bytes that are used per key entry for a sample.
 const MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY: usize = 16;
@@ -147,12 +145,10 @@ pub fn write_static_stored_file<E: Entry>(
 }

 fn get_compression_buffer_capacity(total_key_size: usize) -> usize {
-    let mut size = 0;
-    if total_key_size >= MIN_KEY_COMPRESSION_SAMPLES_SIZE {
-        let key_compression_samples_size = min(KEY_COMPRESSION_SAMPLES_SIZE, total_key_size / 16);
-        size = key_compression_samples_size;
+    if total_key_size < MIN_KEY_COMPRESSION_SAMPLES_SIZE {
+        return 0;
     }
-    size
+    min(KEY_COMPRESSION_SAMPLES_SIZE, total_key_size / 16)
 }

 /// Computes compression dictionaries from keys of all entries
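For reference, the refactored helper maps total key size to a sample-buffer capacity as follows. This is a minimal standalone sketch with the constants from the hunk above inlined, not the crate's actual module:

```rust
const KEY_COMPRESSION_SAMPLES_SIZE: usize = 256 * 1024;
const MIN_KEY_COMPRESSION_SAMPLES_SIZE: usize = 1024;

fn get_compression_buffer_capacity(total_key_size: usize) -> usize {
    // Below the minimum sample size, no compression dictionary is built at all.
    if total_key_size < MIN_KEY_COMPRESSION_SAMPLES_SIZE {
        return 0;
    }
    // Sample roughly 1/16 of the total key bytes, capped at 256 KiB.
    KEY_COMPRESSION_SAMPLES_SIZE.min(total_key_size / 16)
}

fn main() {
    assert_eq!(get_compression_buffer_capacity(512), 0); // below the minimum
    assert_eq!(get_compression_buffer_capacity(5000), 312); // 5000 / 16
    assert_eq!(get_compression_buffer_capacity(16 * 1024 * 1024), 256 * 1024); // capped
}
```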
@@ -162,23 +158,28 @@ fn compute_key_compression_dictionary<E: Entry>(
     total_key_size: usize,
     buffer: &mut Vec<u8>,
 ) -> Result<Vec<u8>> {
-    if total_key_size < MIN_KEY_COMPRESSION_SAMPLES_SIZE {
+    let key_compression_samples_size = get_compression_buffer_capacity(total_key_size);
+    if key_compression_samples_size == 0 {
         return Ok(Vec::new());
     }
-    let key_compression_samples_size = min(KEY_COMPRESSION_SAMPLES_SIZE, total_key_size / 16);
+
+    let max_sample_size = max(
+        MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY,
+        key_compression_samples_size / 1024,
+    );

     let mut sample_sizes = Vec::new();

-    // Limit the number of iterations to avoid infinite loops
-    let max_iterations = total_key_size / COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY * 2;
-    for i in 0..max_iterations {
-        let entry = &entries[i % entries.len()];
+    for entry in entries {
         let key_remaining = key_compression_samples_size - buffer.len();
         if key_remaining < MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
             break;
         }
         let len = entry.key_len();
         if len >= MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
-            let used_len = min(key_remaining, COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY);
+            let optimal_len =
+                (len / 8).clamp(MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY, max_sample_size);
+            let used_len = min(key_remaining, optimal_len);
             if len <= used_len {
                 sample_sizes.push(len);
                 entry.write_key_to(buffer);
@@ -193,10 +194,12 @@ fn compute_key_compression_dictionary<E: Entry>(
             }
         }
     }
+    debug_assert!(buffer.len() == sample_sizes.iter().sum::<usize>());

-    let result = if buffer.len() > MIN_KEY_COMPRESSION_SAMPLES_SIZE && sample_sizes.len() > 5 {
-        zstd::dict::from_continuous(buffer, &sample_sizes, KEY_COMPRESSION_DICTIONARY_SIZE)
-            .context("Key dictionary creation failed")?
+    /// The zlib dict builder requires at least 7 samples
+    const MIN_SAMPLE_SIZE: usize = 7;
+    let result = if buffer.len() > MIN_KEY_COMPRESSION_SAMPLES_SIZE
+        && sample_sizes.len() > MIN_SAMPLE_SIZE
+    {
+        zstd::dict::from_continuous(buffer, &sample_sizes, KEY_COMPRESSION_DICTIONARY_SIZE)?
     } else {
         Vec::new()
     };

Review comment on the new doc comment:

The comment incorrectly refers to the "zlib dict builder" when the code actually uses the zstd library: the guarded call is `zstd::dict::from_continuous`. Update the comment to reference the correct library so the documented API requirement stays accurate.

Suggested change:
-    /// The zlib dict builder requires at least 7 samples
+    /// The zstd dict builder requires at least 7 samples

Review comment on lines +197 to +200:

The comment says "at least 7 samples", but the code checks for more than 7 samples?
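To make the reviewer's boundary question concrete: with `MIN_SAMPLE_SIZE = 7`, the strict `>` comparison only passes once there are 8 or more samples, so if zstd genuinely accepts 7 samples, `>=` would presumably be the intended operator. A tiny standalone check (illustrative only, not the PR's code):

```rust
fn main() {
    const MIN_SAMPLE_SIZE: usize = 7;
    for n in [6, 7, 8] {
        // Strict `>` rejects exactly 7 samples; `>=` matches the
        // "at least 7 samples" wording of the doc comment.
        println!(
            "{n} samples: `> MIN_SAMPLE_SIZE` = {}, `>= MIN_SAMPLE_SIZE` = {}",
            n > MIN_SAMPLE_SIZE,
            n >= MIN_SAMPLE_SIZE
        );
    }
}
```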
Review comment on the sampling loop:

The compression dictionary sampling algorithm was changed from cycling through entries multiple times to a single pass, which can dramatically under-sample keys when there are few entries relative to the desired sample buffer size.

Analysis

The `compute_key_compression_dictionary` function has undergone a significant algorithmic change that can result in poor-quality compression dictionaries.

Before: used `max_iterations = total_key_size / COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY * 2` to cycle through entries multiple times via `entries[i % entries.len()]`, collecting 100-byte samples until the buffer was adequately filled or the iterations were exhausted.

After: changed to `for entry in entries`, a single pass through all entries with dynamic sample sizes of `(len / 8).clamp(16, max_sample_size)`.

Critical issue: when there are few entries with large keys, the new algorithm severely under-samples.

Example scenario (5 entries with 1000-byte keys, so `total_key_size = 5000`):
- `key_compression_samples_size = min(262144, 5000 / 16) = 312` bytes available for samples
- `max_sample_size = max(16, 312 / 1024) = 16` bytes per sample
- Before: up to ~100 iterations through the 5 entries, potentially filling the 312-byte buffer
- After: a single pass through the 5 entries collects only 5 × 16 = 80 bytes (about 25% of capacity)

This under-sampling will produce inferior compression dictionaries, significantly impacting storage compression efficiency.

Recommendation

Restore the iteration logic to ensure adequate sampling when entries are few but large. Consider one of the following (a sketch of the third option follows this list):
1. Restore the multi-pass approach: keep the `max_iterations` calculation and the `entries[i % entries.len()]` pattern so the loop can cycle through entries multiple times when needed.
2. Hybrid approach: use a single pass when there are many entries, but fall back to multi-pass when `entries.len() * average_sample_size < key_compression_samples_size * threshold` (e.g., threshold = 0.8).
3. Adaptive sampling: instead of fixed sample sizes, derive the per-entry sample size from `key_compression_samples_size / entries.len()` so the buffer is adequately filled.
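For illustration, here is a rough, hypothetical sketch of the adaptive-sampling option. It mirrors the names in the diff, but `Entry` and `write_key_prefix_to` are stand-ins invented for this example, not the crate's actual trait:

```rust
const MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY: usize = 16;

// Hypothetical stand-in for the crate's entry type.
trait Entry {
    fn key_len(&self) -> usize;
    fn write_key_prefix_to(&self, len: usize, buffer: &mut Vec<u8>);
}

fn collect_samples<E: Entry>(
    entries: &[E],
    key_compression_samples_size: usize,
    buffer: &mut Vec<u8>,
) -> Vec<usize> {
    let mut sample_sizes = Vec::new();
    if entries.is_empty() || key_compression_samples_size == 0 {
        return sample_sizes;
    }
    // Derive the per-entry sample size from the budget, so a single pass
    // can still fill the buffer even when there are only a few large keys.
    let per_entry = (key_compression_samples_size / entries.len())
        .max(MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY);
    for entry in entries {
        let key_remaining = key_compression_samples_size.saturating_sub(buffer.len());
        if key_remaining < MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
            break;
        }
        let used_len = per_entry.min(key_remaining).min(entry.key_len());
        if used_len >= MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
            sample_sizes.push(used_len);
            entry.write_key_prefix_to(used_len, buffer);
        }
    }
    sample_sizes
}

struct VecEntry(Vec<u8>);

impl Entry for VecEntry {
    fn key_len(&self) -> usize {
        self.0.len()
    }
    fn write_key_prefix_to(&self, len: usize, buffer: &mut Vec<u8>) {
        buffer.extend_from_slice(&self.0[..len]);
    }
}

fn main() {
    // The scenario from the analysis: 5 entries, 1000-byte keys, 312-byte budget.
    let entries: Vec<VecEntry> = (0..5u8).map(|i| VecEntry(vec![i; 1000])).collect();
    let mut buffer = Vec::new();
    let sizes = collect_samples(&entries, 312, &mut buffer);
    // per_entry = max(312 / 5, 16) = 62, so one pass collects 5 × 62 = 310 bytes
    // instead of the 5 × 16 = 80 bytes the single-pass logic in the PR would take.
    println!("samples: {sizes:?}, buffer: {} bytes", buffer.len());
}
```

Under this sketch the buffer ends up nearly full in the problem scenario, while still degrading to small per-entry samples when there are many entries.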