Skip to content

Commit 531e938

Browse files
authored
Turbopack: improve compression dictionary generation (#80061)
### What? Improve the generation of the compression dictionary.
1 parent 5083e60 commit 531e938

File tree

1 file changed

+54
-44
lines changed

1 file changed

+54
-44
lines changed

turbopack/crates/turbo-persistence/src/static_sorted_file_builder.rs

Lines changed: 54 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::{
22
borrow::Cow,
3-
cmp::min,
3+
cmp::{max, min},
44
fs::File,
55
io::{self, BufWriter, Seek, Write},
66
path::Path,
@@ -45,6 +45,8 @@ const MIN_VALUE_COMPRESSION_SAMPLES_SIZE: usize = 1024;
4545
const MIN_KEY_COMPRESSION_SAMPLES_SIZE: usize = 1024;
4646
/// The bytes that are used per key/value entry for a sample.
4747
const COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY: usize = 100;
48+
/// The minimum size in bytes a key/value must have to be used as a sample (smaller ones are skipped).
49+
const MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY: usize = 16;
4850

4951
/// Trait for entries from which SST files can be created
5052
pub trait Entry {
@@ -153,58 +155,75 @@ impl<'a> StaticSortedFileBuilder<'a> {
153155
{
154156
return Ok(());
155157
}
156-
let key_compression_samples_size = min(KEY_COMPRESSION_SAMPLES_SIZE, total_key_size / 10);
158+
let key_compression_samples_size = min(KEY_COMPRESSION_SAMPLES_SIZE, total_key_size / 16);
157159
let value_compression_samples_size =
158-
min(VALUE_COMPRESSION_SAMPLES_SIZE, total_value_size / 10);
160+
min(VALUE_COMPRESSION_SAMPLES_SIZE, total_value_size / 16);
159161
let mut value_samples = Vec::with_capacity(value_compression_samples_size);
160162
let mut value_sample_sizes = Vec::new();
161163
let mut key_samples = Vec::with_capacity(key_compression_samples_size);
162164
let mut key_sample_sizes = Vec::new();
163-
let mut i = 12345678 % entries.len();
164-
let mut j = 0;
165-
loop {
166-
let entry = &entries[i];
165+
166+
// Limit the number of iterations to avoid infinite loops
167+
let max_iterations =
168+
max(total_key_size, total_value_size) / COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY * 2;
169+
for i in 0..max_iterations {
170+
let entry = &entries[i % entries.len()];
167171
let value_remaining = value_compression_samples_size - value_samples.len();
168-
let key_remaining = key_compression_samples_size - key_samples.len();
169-
if value_remaining > 0
170-
&& let EntryValue::Small { value } | EntryValue::Medium { value } = entry.value()
171-
{
172-
let value = if value.len() <= COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
173-
value
174-
} else {
175-
j = (j + 12345678) % (value.len() - COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY);
176-
&value[j..j + COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY]
177-
};
178-
if value.len() <= value_remaining {
179-
value_sample_sizes.push(value.len());
180-
value_samples.extend_from_slice(value);
181-
} else {
182-
value_sample_sizes.push(value_remaining);
183-
value_samples.extend_from_slice(&value[..value_remaining]);
172+
if value_remaining < MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
173+
break;
174+
}
175+
if let EntryValue::Small { value } | EntryValue::Medium { value } = entry.value() {
176+
let len = value.len();
177+
if len >= MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
178+
let used_len = min(value_remaining, COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY);
179+
if len <= used_len {
180+
value_sample_sizes.push(len);
181+
value_samples.extend_from_slice(value);
182+
} else {
183+
value_sample_sizes.push(used_len);
184+
let p = value_samples.len() % (len - used_len);
185+
value_samples.extend_from_slice(&value[p..p + used_len]);
186+
};
184187
}
185188
}
186-
if key_remaining > 0 {
189+
}
190+
assert!(value_samples.len() == value_sample_sizes.iter().sum::<usize>());
191+
if value_samples.len() > MIN_VALUE_COMPRESSION_SAMPLES_SIZE && value_sample_sizes.len() > 5
192+
{
193+
self.value_compression_dictionary = zstd::dict::from_continuous(
194+
&value_samples,
195+
&value_sample_sizes,
196+
VALUE_COMPRESSION_DICTIONARY_SIZE,
197+
)
198+
.context("Value dictionary creation failed")?;
199+
} else {
200+
self.value_compression_dictionary = Vec::new();
201+
}
202+
203+
for i in 0..max_iterations {
204+
let entry = &entries[i % entries.len()];
205+
let key_remaining = key_compression_samples_size - key_samples.len();
206+
if key_remaining < MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
207+
break;
208+
}
209+
let len = entry.key_len();
210+
if len >= MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
187211
let used_len = min(key_remaining, COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY);
188-
if entry.key_len() <= used_len {
189-
key_sample_sizes.push(entry.key_len());
212+
if len <= used_len {
213+
key_sample_sizes.push(len);
190214
entry.write_key_to(&mut key_samples);
191215
} else {
192-
let mut temp = Vec::with_capacity(entry.key_len());
216+
let mut temp = Vec::with_capacity(len);
193217
entry.write_key_to(&mut temp);
194-
debug_assert!(temp.len() == entry.key_len());
218+
debug_assert!(temp.len() == len);
195219

196-
j = (j + 12345678) % (temp.len() - used_len);
220+
let p = key_samples.len() % (len - used_len);
197221
key_sample_sizes.push(used_len);
198-
key_samples.extend_from_slice(&temp[j..j + used_len]);
222+
key_samples.extend_from_slice(&temp[p..p + used_len]);
199223
}
200224
}
201-
if key_remaining == 0 && value_remaining == 0 {
202-
break;
203-
}
204-
i = (i + 12345678) % entries.len();
205225
}
206226
assert!(key_samples.len() == key_sample_sizes.iter().sum::<usize>());
207-
assert!(value_samples.len() == value_sample_sizes.iter().sum::<usize>());
208227
if key_samples.len() > MIN_KEY_COMPRESSION_SAMPLES_SIZE && key_sample_sizes.len() > 5 {
209228
self.key_compression_dictionary = zstd::dict::from_continuous(
210229
&key_samples,
@@ -213,15 +232,6 @@ impl<'a> StaticSortedFileBuilder<'a> {
213232
)
214233
.context("Key dictionary creation failed")?;
215234
}
216-
if value_samples.len() > MIN_VALUE_COMPRESSION_SAMPLES_SIZE && value_sample_sizes.len() > 5
217-
{
218-
self.value_compression_dictionary = zstd::dict::from_continuous(
219-
&value_samples,
220-
&value_sample_sizes,
221-
VALUE_COMPRESSION_DICTIONARY_SIZE,
222-
)
223-
.context("Value dictionary creation failed")?;
224-
}
225235
Ok(())
226236
}
227237

0 commit comments

Comments
 (0)