diff --git a/Cargo.lock b/Cargo.lock index 20569deda8a29..35ca8c2c56741 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5093,9 +5093,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.26" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "plotters" @@ -9228,7 +9228,6 @@ dependencies = [ "parking_lot", "pot", "rand 0.9.0", - "rayon", "regex", "ringmap", "rstest", @@ -11680,9 +11679,9 @@ dependencies = [ [[package]] name = "zstd" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" dependencies = [ "zstd-safe", ] @@ -11698,9 +11697,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.10+zstd.1.5.6" +version = "2.0.15+zstd.1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" +checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" dependencies = [ "cc", "pkg-config", diff --git a/packages/next/src/server/dev/hot-reloader-turbopack.ts b/packages/next/src/server/dev/hot-reloader-turbopack.ts index 031623b54c6bc..54aa9b6d7099d 100644 --- a/packages/next/src/server/dev/hot-reloader-turbopack.ts +++ b/packages/next/src/server/dev/hot-reloader-turbopack.ts @@ -258,7 +258,7 @@ export async function createHotReloaderTurbopack( } ) backgroundLogCompilationEvents(project, { - eventTypes: ['StartupCacheInvalidationEvent'], + eventTypes: ['StartupCacheInvalidationEvent', 'TimingEvent'], }) setBundlerFindSourceMapImplementation( getSourceMapFromTurbopack.bind(null, project, projectPath) diff --git a/turbopack/crates/turbo-persistence-tools/src/main.rs b/turbopack/crates/turbo-persistence-tools/src/main.rs index 25fbe6d31201b..a1b2bb15a1f09 100644 --- a/turbopack/crates/turbo-persistence-tools/src/main.rs +++ b/turbopack/crates/turbo-persistence-tools/src/main.rs @@ -3,7 +3,7 @@ use std::path::PathBuf; use anyhow::{Context, Result, bail}; -use turbo_persistence::{MetaFileEntryInfo, TurboPersistence}; +use turbo_persistence::{MetaFileEntryInfo, SerialScheduler, TurboPersistence}; fn main() -> Result<()> { // Get CLI argument @@ -16,7 +16,7 @@ fn main() -> Result<()> { bail!("The provided path does not exist: {}", path.display()); } - let db = TurboPersistence::open_read_only(path)?; + let db: TurboPersistence = TurboPersistence::open_read_only(path)?; let meta_info = db .meta_info() .context("Failed to retrieve meta information")?; @@ -35,7 +35,6 @@ fn main() -> Result<()> { amqf_entries, sst_size, key_compression_dictionary_size, - value_compression_dictionary_size, block_count, } in meta_file.entries { @@ -45,15 +44,11 @@ fn main() -> Result<()> { ); println!(" AMQF {amqf_entries} entries = {} KiB", amqf_size / 1024); println!( - " {} KiB = {} kiB key compression dict + {} KiB value compression dict + \ - {block_count} blocks (avg {} bytes/block)", + " {} KiB = {} kiB key compression dict + {block_count} blocks (avg {} \ + bytes/block)", sst_size / 1024, key_compression_dictionary_size / 1024, - value_compression_dictionary_size / 1024, - (sst_size - - key_compression_dictionary_size 
as u64 - - value_compression_dictionary_size as u64) - / block_count as u64 + (sst_size - key_compression_dictionary_size as u64) / block_count as u64 ); } if !meta_file.obsolete_sst_files.is_empty() { diff --git a/turbopack/crates/turbo-persistence/Cargo.toml b/turbopack/crates/turbo-persistence/Cargo.toml index c3e26489251ba..27583be17c893 100644 --- a/turbopack/crates/turbo-persistence/Cargo.toml +++ b/turbopack/crates/turbo-persistence/Cargo.toml @@ -22,16 +22,16 @@ memmap2 = "0.9.5" parking_lot = { workspace = true } qfilter = { version = "0.2.4", features = ["serde"] } quick_cache = { workspace = true } -rayon = { workspace = true } rustc-hash = { workspace = true } smallvec = { workspace = true} thread_local = { workspace = true } tracing = { workspace = true } twox-hash = { version = "2.0.1", features = ["xxhash64"] } -zstd = { version = "0.13.2", features = ["zdict_builder"] } +zstd = { version = "0.13.3", features = ["zdict_builder"] } [dev-dependencies] rand = { workspace = true, features = ["small_rng"] } +rayon = { workspace = true } tempfile = { workspace = true } [lints] diff --git a/turbopack/crates/turbo-persistence/README.md b/turbopack/crates/turbo-persistence/README.md index 93d97ab052ad6..51baa4c5a457c 100644 --- a/turbopack/crates/turbo-persistence/README.md +++ b/turbopack/crates/turbo-persistence/README.md @@ -45,7 +45,6 @@ A meta file can contain metadata about multiple SST files. The metadata is store - foreach described SST file - 4 bytes sequence number of the SST file - 2 bytes key Compression Dictionary length - - 2 bytes value Compression Dictionary length - 2 bytes block count - 8 bytes min hash - 8 bytes max hash @@ -59,7 +58,6 @@ A meta file can contain metadata about multiple SST files. The metadata is store The SST file contains only data without any header. - serialized key Compression Dictionary -- serialized value Compression Dictionary - foreach block - 4 bytes uncompressed block length - compressed data diff --git a/turbopack/crates/turbo-persistence/src/collector.rs b/turbopack/crates/turbo-persistence/src/collector.rs index ea8b04ab16e70..b955d6102bec1 100644 --- a/turbopack/crates/turbo-persistence/src/collector.rs +++ b/turbopack/crates/turbo-persistence/src/collector.rs @@ -1,3 +1,5 @@ +use std::mem::take; + use crate::{ ValueBuffer, collector_entry::{CollectorEntry, CollectorEntryValue, EntryKey}, @@ -90,11 +92,11 @@ impl Collector { self.entries.push(entry); } - /// Sorts the entries and returns them along with the total key and value sizes. This doesn't + /// Sorts the entries and returns them along with the total key size. This doesn't /// clear the entries. - pub fn sorted(&mut self) -> (&[CollectorEntry], usize, usize) { + pub fn sorted(&mut self) -> (&[CollectorEntry], usize) { self.entries.sort_unstable_by(|a, b| a.key.cmp(&b.key)); - (&self.entries, self.total_key_size, self.total_value_size) + (&self.entries, self.total_key_size) } /// Clears the collector. @@ -111,4 +113,11 @@ impl Collector { self.total_value_size = 0; self.entries.drain(..) 
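+        // Note: `drain` keeps the collector's allocated capacity for reuse; `drop_contents`
+        // below also releases the allocation.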
}
+
+    /// Clears the collector and drops the capacity
+    pub fn drop_contents(&mut self) {
+        drop(take(&mut self.entries));
+        self.total_key_size = 0;
+        self.total_value_size = 0;
+    }
 }
diff --git a/turbopack/crates/turbo-persistence/src/compaction/selector.rs b/turbopack/crates/turbo-persistence/src/compaction/selector.rs
index 814a4aa87a69e..f5bc91687f2b1 100644
--- a/turbopack/crates/turbo-persistence/src/compaction/selector.rs
+++ b/turbopack/crates/turbo-persistence/src/compaction/selector.rs
@@ -136,8 +136,8 @@ impl Default for CompactConfig {
             optimal_merge_count: 8,
             max_merge_count: 32,
             max_merge_bytes: 500 * MB,
-            min_merge_duplication_bytes: MB,
-            optimal_merge_duplication_bytes: 10 * MB,
+            min_merge_duplication_bytes: 50 * MB,
+            optimal_merge_duplication_bytes: 100 * MB,
             max_merge_segment_count: 8,
         }
     }
@@ -233,13 +233,20 @@ pub fn get_merge_segments(
             // We have reached the maximum number of merge jobs, so we stop here.
             break;
         }
-        let mut current_range = start_compactable.range();
+        let start_compactable_range = start_compactable.range();
+        let start_compactable_size = start_compactable.size();
+        let mut current_range = start_compactable_range.clone();
         // We might need to restart the search if we need to extend the range.
         'search: loop {
             let mut current_set = smallvec![start_index];
-            let mut current_size = start_compactable.size();
+            let mut current_size = start_compactable_size;
             let mut duplication = IntervalMap::<Option<DuplicationInfo>>::new();
+            duplication.update(start_compactable_range.clone(), |dup_info| {
+                dup_info
+                    .get_or_insert_default()
+                    .add(start_compactable_size, &start_compactable_range);
+            });
             let mut current_skip = 0;
 
             // We will capture compactables in the current_range until we find an optimal merge
@@ -609,8 +616,8 @@ mod tests {
             min_merge_count: 2,
             optimal_merge_count: 4,
             max_merge_bytes: 5000,
-            min_merge_duplication_bytes: 200,
-            optimal_merge_duplication_bytes: 500,
+            min_merge_duplication_bytes: 500,
+            optimal_merge_duplication_bytes: 1000,
             max_merge_segment_count: 4,
         };
         let jobs = get_merge_segments(&containers, &config);
@@ -653,7 +660,7 @@ mod tests {
         println!("Number of compactions: {number_of_compactions}");
         let metrics = compute_metrics(&containers, 0..=KEY_RANGE);
 
-        assert!(number_of_compactions < 40);
+        assert!(number_of_compactions < 30);
         assert!(containers.len() < 30);
         assert!(metrics.duplication < 0.5);
     }
diff --git a/turbopack/crates/turbo-persistence/src/compression.rs b/turbopack/crates/turbo-persistence/src/compression.rs
new file mode 100644
index 0000000000000..093ac87a99ad5
--- /dev/null
+++ b/turbopack/crates/turbo-persistence/src/compression.rs
@@ -0,0 +1,56 @@
+use std::{mem::MaybeUninit, sync::Arc};
+
+use anyhow::{Context, Result};
+use lzzzz::lz4::{ACC_LEVEL_DEFAULT, decompress, decompress_with_dict};
+
+#[tracing::instrument(level = "trace", skip_all)]
+pub fn decompress_into_arc(
+    uncompressed_length: u32,
+    block: &[u8],
+    compression_dictionary: Option<&[u8]>,
+    _long_term: bool,
+) -> Result<Arc<[u8]>> {
+    // We allocate the buffer directly in an Arc to avoid copying it into an Arc afterwards and
+    // to avoid double indirection. This is a dynamically sized arc.
+    let buffer: Arc<[MaybeUninit<u8>]> = Arc::new_zeroed_slice(uncompressed_length as usize);
+    // Assume that the buffer is initialized.
+    let buffer = Arc::into_raw(buffer);
+    // Safety: Assuming that the buffer is initialized is safe because we just created it as a
+    // zeroed slice and u8 doesn't require initialization.
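+    // (`Arc<[MaybeUninit<u8>]>` and `Arc<[u8]>` have identical layout, so the raw-pointer
+    // round-trip below is sound.)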
+    let mut buffer = unsafe { Arc::from_raw(buffer as *mut [u8]) };
+    // Safety: We know that the buffer is not shared yet.
+    let decompressed = unsafe { Arc::get_mut_unchecked(&mut buffer) };
+    let bytes_written = if let Some(dict) = compression_dictionary {
+        // Safety: decompress_with_dict will only write to `decompressed` and not read from it.
+        decompress_with_dict(block, decompressed, dict)?
+    } else {
+        // Safety: decompress will only write to `decompressed` and not read from it.
+        decompress(block, decompressed)?
+    };
+    assert_eq!(
+        bytes_written, uncompressed_length as usize,
+        "Decompressed length does not match expected length"
+    );
+    // Safety: The buffer is now fully initialized and can be used.
+    Ok(buffer)
+}
+
+#[tracing::instrument(level = "trace", skip_all)]
+pub fn compress_into_buffer(
+    block: &[u8],
+    dict: Option<&[u8]>,
+    _long_term: bool,
+    buffer: &mut Vec<u8>,
+) -> Result<()> {
+    let mut compressor = if let Some(dict) = dict {
+        lzzzz::lz4::Compressor::with_dict(dict)
+    } else {
+        lzzzz::lz4::Compressor::new()
+    }
+    .context("LZ4 compressor creation failed")?;
+    let acc_factor = ACC_LEVEL_DEFAULT;
+    compressor
+        .next_to_vec(block, buffer, acc_factor)
+        .context("Compression failed")?;
+    Ok(())
+}
diff --git a/turbopack/crates/turbo-persistence/src/db.rs b/turbopack/crates/turbo-persistence/src/db.rs
index 4ba703fa75ea5..86c256d2bcac3 100644
--- a/turbopack/crates/turbo-persistence/src/db.rs
+++ b/turbopack/crates/turbo-persistence/src/db.rs
@@ -3,29 +3,24 @@ use std::{
     collections::HashSet,
     fs::{self, File, OpenOptions, ReadDir},
     io::{BufWriter, Write},
-    mem::{MaybeUninit, swap, transmute},
+    mem::swap,
     ops::RangeInclusive,
     path::{Path, PathBuf},
-    sync::{
-        Arc,
-        atomic::{AtomicBool, AtomicU32, Ordering},
-    },
+    sync::atomic::{AtomicBool, AtomicU32, Ordering},
 };
 
 use anyhow::{Context, Result, bail};
 use byteorder::{BE, ReadBytesExt, WriteBytesExt};
 use jiff::Timestamp;
-use lzzzz::lz4::decompress;
 use memmap2::Mmap;
 use parking_lot::{Mutex, RwLock};
-use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
-use tracing::Span;
 
 pub use crate::compaction::selector::CompactConfig;
 use crate::{
     QueryKey,
     arc_slice::ArcSlice,
     compaction::selector::{Compactable, compute_metrics, get_merge_segments},
+    compression::decompress_into_arc,
     constants::{
         AMQF_AVG_SIZE, AMQF_CACHE_SIZE, DATA_THRESHOLD_PER_COMPACTED_FILE, KEY_BLOCK_AVG_SIZE,
         KEY_BLOCK_CACHE_SIZE, MAX_ENTRIES_PER_COMPACTED_FILE, VALUE_BLOCK_AVG_SIZE,
@@ -36,6 +31,7 @@ use crate::{
     merge_iter::MergeIter,
     meta_file::{AmqfCache, MetaFile, MetaLookupResult, StaticSortedFileRange},
     meta_file_builder::MetaFileBuilder,
+    parallel_scheduler::{ParallelScheduler, SerialScheduler},
     sst_filter::SstFilter,
     static_sorted_file::{BlockCache, SstLookupResult},
     static_sorted_file_builder::{StaticSortedFileBuilderMeta, write_static_stored_file},
@@ -108,7 +104,8 @@ struct TrackedStats {
 
 /// TurboPersistence is a persistent key-value store. It is limited to a single writer at a time
 /// using a single write batch. It allows for concurrent reads.
-pub struct TurboPersistence {
+pub struct TurboPersistence<S: ParallelScheduler = SerialScheduler> {
+    parallel_scheduler: S,
     /// The path to the directory where the database is stored
     path: PathBuf,
     /// If true, the database is opened in read-only mode. In this mode, no writes are allowed and
@@ -148,9 +145,26 @@ pub struct CommitOptions {
     keys_written: u64,
 }
 
-impl TurboPersistence {
-    fn new(path: PathBuf, read_only: bool) -> Self {
+impl TurboPersistence {
+    /// Open a TurboPersistence database at the given path.
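+    /// Uses the default [`SerialScheduler`], which runs all scheduler hooks inline on the
+    /// calling thread.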
+    /// This will read the directory and might perform cleanup when the database was not closed
+    /// properly. Cleanup only requires reading a few bytes from a few files and deleting
+    /// files, so it's fast.
+    pub fn open(path: PathBuf) -> Result<Self> {
+        Self::open_with_parallel_scheduler(path, Default::default())
+    }
+
+    /// Open a TurboPersistence database at the given path in read only mode.
+    /// This will read the directory. No cleanup is performed.
+    pub fn open_read_only(path: PathBuf) -> Result<Self> {
+        Self::open_read_only_with_parallel_scheduler(path, Default::default())
+    }
+}
+
+impl<S: ParallelScheduler> TurboPersistence<S> {
+    fn new(path: PathBuf, read_only: bool, parallel_scheduler: S) -> Self {
         Self {
+            parallel_scheduler,
             path,
             read_only,
             inner: RwLock::new(Inner {
@@ -188,16 +202,19 @@ impl TurboPersistence {
     /// This will read the directory and might perform cleanup when the database was not closed
     /// properly. Cleanup only requires reading a few bytes from a few files and deleting
     /// files, so it's fast.
-    pub fn open(path: PathBuf) -> Result<Self> {
-        let mut db = Self::new(path, false);
+    pub fn open_with_parallel_scheduler(path: PathBuf, parallel_scheduler: S) -> Result<Self> {
+        let mut db = Self::new(path, false, parallel_scheduler);
         db.open_directory(false)?;
         Ok(db)
     }
 
     /// Open a TurboPersistence database at the given path in read only mode.
     /// This will read the directory. No cleanup is performed.
-    pub fn open_read_only(path: PathBuf) -> Result<Self> {
-        let mut db = Self::new(path, true);
+    pub fn open_read_only_with_parallel_scheduler(
+        path: PathBuf,
+        parallel_scheduler: S,
+    ) -> Result<Self> {
+        let mut db = Self::new(path, true, parallel_scheduler);
         db.open_directory(false)?;
         Ok(db)
     }
@@ -341,16 +358,12 @@ impl TurboPersistence {
         meta_files.retain(|seq| !deleted_files.contains(seq));
         meta_files.sort_unstable();
 
-        let span = Span::current();
-        let mut meta_files = meta_files
-            .into_par_iter()
-            .with_min_len(1)
-            .map(|seq| {
-                let _span = span.enter();
+        let mut meta_files = self
+            .parallel_scheduler
+            .parallel_map_collect::<_, _, Result<Vec<_>>>(&meta_files, |&seq| {
                 let meta_file = MetaFile::open(&self.path, seq)?;
                 Ok(meta_file)
-            })
-            .collect::<Result<Vec<_>>>()?;
+            })?;
 
         let mut sst_filter = SstFilter::new();
         for meta_file in meta_files.iter_mut().rev() {
@@ -376,14 +389,9 @@ impl TurboPersistence {
             #[cfg(target_os = "linux")]
             mmap.advise(memmap2::Advice::Unmergeable)?;
             let mut compressed = &mmap[..];
-            let uncompressed_length = compressed.read_u32::<BE>()? as usize;
-
-            let buffer = Arc::new_zeroed_slice(uncompressed_length);
-            // Safety: MaybeUninit can be safely transmuted to u8.
-            let mut buffer = unsafe { transmute::<Arc<[MaybeUninit<u8>]>, Arc<[u8]>>(buffer) };
-            // Safety: We know that the buffer is not shared yet.
-            let decompressed = unsafe { Arc::get_mut_unchecked(&mut buffer) };
-            decompress(compressed, decompressed)?;
+            let uncompressed_length = compressed.read_u32::<BE>()?;
+
+            let buffer = decompress_into_arc(uncompressed_length, compressed, None, true)?;
             Ok(ArcSlice::from(buffer))
         }
 
@@ -398,7 +406,7 @@ impl TurboPersistence {
     /// This data will only become visible after the WriteBatch is committed.
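+    ///
+    /// A rough usage sketch (hypothetical key type and family count; the exact generic
+    /// parameters of `WriteBatch` are chosen by the caller):
+    ///
+    /// ```ignore
+    /// let batch = db.write_batch::<Vec<u8>, 1>()?;
+    /// // ... stage puts/deletes on the batch ...
+    /// db.commit_write_batch(batch)?;
+    /// ```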
pub fn write_batch( &self, - ) -> Result> { + ) -> Result> { if self.read_only { bail!("Cannot write to a read-only database"); } @@ -413,7 +421,11 @@ impl TurboPersistence { ); } let current = self.inner.read().current_sequence_number; - Ok(WriteBatch::new(self.path.clone(), current)) + Ok(WriteBatch::new( + self.path.clone(), + current, + self.parallel_scheduler.clone(), + )) } fn open_log(&self) -> Result> { @@ -432,7 +444,7 @@ impl TurboPersistence { /// visible to readers. pub fn commit_write_batch( &self, - mut write_batch: WriteBatch, + mut write_batch: WriteBatch, ) -> Result<()> { if self.read_only { unreachable!("It's not possible to create a write batch for a read-only database"); @@ -475,27 +487,31 @@ impl TurboPersistence { new_meta_files.sort_unstable_by_key(|(seq, _)| *seq); - let mut new_meta_files = new_meta_files - .into_par_iter() - .with_min_len(1) - .map(|(seq, file)| { - file.sync_all()?; - let meta_file = MetaFile::open(&self.path, seq)?; - Ok(meta_file) - }) - .collect::>>()?; + let mut new_meta_files = self + .parallel_scheduler + .vec_into_parallel_map_collect::<_, _, Result>>( + new_meta_files, + |(seq, file)| { + file.sync_all()?; + let meta_file = MetaFile::open(&self.path, seq)?; + Ok(meta_file) + }, + )?; let mut sst_filter = SstFilter::new(); for meta_file in new_meta_files.iter_mut().rev() { sst_filter.apply_filter(meta_file); } - for (_, file) in new_sst_files.iter() { - file.sync_all()?; - } - for (_, file) in new_blob_files.iter() { - file.sync_all()?; - } + self.parallel_scheduler.block_in_place(|| { + for (_, file) in new_sst_files.iter() { + file.sync_all()?; + } + for (_, file) in new_blob_files.iter() { + file.sync_all()?; + } + anyhow::Ok(()) + })?; let new_meta_info = new_meta_files .iter() @@ -548,86 +564,88 @@ impl TurboPersistence { inner.current_sequence_number = seq; } - if has_delete_file { - sst_seq_numbers_to_delete.sort_unstable(); - meta_seq_numbers_to_delete.sort_unstable(); - blob_seq_numbers_to_delete.sort_unstable(); - // Write *.del file, marking the selected files as to delete - let mut buf = Vec::with_capacity( - (sst_seq_numbers_to_delete.len() - + meta_seq_numbers_to_delete.len() - + blob_seq_numbers_to_delete.len()) - * size_of::(), - ); - for seq in sst_seq_numbers_to_delete.iter() { - buf.write_u32::(*seq)?; - } - for seq in meta_seq_numbers_to_delete.iter() { - buf.write_u32::(*seq)?; - } - for seq in blob_seq_numbers_to_delete.iter() { - buf.write_u32::(*seq)?; - } - let mut file = File::create(self.path.join(format!("{seq:08}.del")))?; - file.write_all(&buf)?; - file.sync_all()?; - } - - let mut current_file = OpenOptions::new() - .write(true) - .truncate(false) - .read(false) - .open(self.path.join("CURRENT"))?; - current_file.write_u32::(seq)?; - current_file.sync_all()?; - - for seq in sst_seq_numbers_to_delete.iter() { - fs::remove_file(self.path.join(format!("{seq:08}.sst")))?; - } - for seq in meta_seq_numbers_to_delete.iter() { - fs::remove_file(self.path.join(format!("{seq:08}.meta")))?; - } - for seq in blob_seq_numbers_to_delete.iter() { - fs::remove_file(self.path.join(format!("{seq:08}.blob")))?; - } - - { - let mut log = self.open_log()?; - writeln!(log, "Time {time}")?; - let span = time.until(Timestamp::now())?; - writeln!(log, "Commit {seq:08} {keys_written} keys in {span:#}")?; - for (seq, family, ssts, obsolete) in new_meta_info { - writeln!(log, "{seq:08} META family:{family}",)?; - for (seq, min, max, size) in ssts { - writeln!( - log, - " {seq:08} SST {min:016x}-{max:016x} {} MiB", - size / 1024 / 1024 
- )?; + self.parallel_scheduler.block_in_place(|| { + if has_delete_file { + sst_seq_numbers_to_delete.sort_unstable(); + meta_seq_numbers_to_delete.sort_unstable(); + blob_seq_numbers_to_delete.sort_unstable(); + // Write *.del file, marking the selected files as to delete + let mut buf = Vec::with_capacity( + (sst_seq_numbers_to_delete.len() + + meta_seq_numbers_to_delete.len() + + blob_seq_numbers_to_delete.len()) + * size_of::(), + ); + for seq in sst_seq_numbers_to_delete.iter() { + buf.write_u32::(*seq)?; } - for seq in obsolete { - writeln!(log, " {seq:08} OBSOLETE SST")?; + for seq in meta_seq_numbers_to_delete.iter() { + buf.write_u32::(*seq)?; } + for seq in blob_seq_numbers_to_delete.iter() { + buf.write_u32::(*seq)?; + } + let mut file = File::create(self.path.join(format!("{seq:08}.del")))?; + file.write_all(&buf)?; + file.sync_all()?; } - new_sst_files.sort_unstable_by_key(|(seq, _)| *seq); - for (seq, _) in new_sst_files.iter() { - writeln!(log, "{seq:08} NEW SST")?; - } - new_blob_files.sort_unstable_by_key(|(seq, _)| *seq); - for (seq, _) in new_blob_files.iter() { - writeln!(log, "{seq:08} NEW BLOB")?; - } + + let mut current_file = OpenOptions::new() + .write(true) + .truncate(false) + .read(false) + .open(self.path.join("CURRENT"))?; + current_file.write_u32::(seq)?; + current_file.sync_all()?; + for seq in sst_seq_numbers_to_delete.iter() { - writeln!(log, "{seq:08} SST DELETED")?; + fs::remove_file(self.path.join(format!("{seq:08}.sst")))?; } for seq in meta_seq_numbers_to_delete.iter() { - writeln!(log, "{seq:08} META DELETED")?; + fs::remove_file(self.path.join(format!("{seq:08}.meta")))?; } for seq in blob_seq_numbers_to_delete.iter() { - writeln!(log, "{seq:08} BLOB DELETED")?; + fs::remove_file(self.path.join(format!("{seq:08}.blob")))?; } - } + { + let mut log = self.open_log()?; + writeln!(log, "Time {time}")?; + let span = time.until(Timestamp::now())?; + writeln!(log, "Commit {seq:08} {keys_written} keys in {span:#}")?; + for (seq, family, ssts, obsolete) in new_meta_info { + writeln!(log, "{seq:08} META family:{family}",)?; + for (seq, min, max, size) in ssts { + writeln!( + log, + " {seq:08} SST {min:016x}-{max:016x} {} MiB", + size / 1024 / 1024 + )?; + } + for seq in obsolete { + writeln!(log, " {seq:08} OBSOLETE SST")?; + } + } + new_sst_files.sort_unstable_by_key(|(seq, _)| *seq); + for (seq, _) in new_sst_files.iter() { + writeln!(log, "{seq:08} NEW SST")?; + } + new_blob_files.sort_unstable_by_key(|(seq, _)| *seq); + for (seq, _) in new_blob_files.iter() { + writeln!(log, "{seq:08} NEW BLOB")?; + } + for seq in sst_seq_numbers_to_delete.iter() { + writeln!(log, "{seq:08} SST DELETED")?; + } + for seq in meta_seq_numbers_to_delete.iter() { + writeln!(log, "{seq:08} META DELETED")?; + } + for seq in blob_seq_numbers_to_delete.iter() { + writeln!(log, "{seq:08} BLOB DELETED")?; + } + } + anyhow::Ok(()) + })?; Ok(()) } @@ -650,7 +668,7 @@ impl TurboPersistence { /// files is above the given threshold. The coverage is the average number of SST files that /// need to be read to find a key. It also limits the maximum number of SST files that are /// merged at once, which is the main factor for the runtime of the compaction. 
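+    /// Returns `true` if the compaction produced any changes, i.e. at least one new meta file
+    /// was committed.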
- pub fn compact(&self, compact_config: &CompactConfig) -> Result<()> { + pub fn compact(&self, compact_config: &CompactConfig) -> Result { if self.read_only { bail!("Compaction is not allowed on a read only database"); } @@ -689,7 +707,8 @@ impl TurboPersistence { .context("Failed to compact database")?; } - if !new_meta_files.is_empty() { + let has_changes = !new_meta_files.is_empty(); + if has_changes { self.commit(CommitOptions { new_meta_files, new_sst_files, @@ -704,7 +723,7 @@ impl TurboPersistence { self.active_write_operation.store(false, Ordering::Release); - Ok(()) + Ok(has_changes) } /// Internal function to perform a compaction. @@ -777,7 +796,6 @@ impl TurboPersistence { let path = &self.path; let log_mutex = Mutex::new(()); - let span = Span::current(); struct PartialResultPerFamily { new_meta_file: Option<(u32, File)>, @@ -789,335 +807,339 @@ impl TurboPersistence { let mut compact_config = compact_config.clone(); let merge_jobs = sst_by_family - .iter() - .map(|ssts_with_ranges| { + .into_iter() + .enumerate() + .filter_map(|(family, ssts_with_ranges)| { if compact_config.max_merge_segment_count == 0 { - return Vec::new(); + return None; } - let merge_jobs = get_merge_segments(ssts_with_ranges, &compact_config); + let merge_jobs = get_merge_segments(&ssts_with_ranges, &compact_config); compact_config.max_merge_segment_count -= merge_jobs.len(); - merge_jobs + Some((family, ssts_with_ranges, merge_jobs)) }) .collect::>(); - let result = sst_by_family - .into_par_iter() - .zip(merge_jobs.into_par_iter()) - .with_min_len(1) - .enumerate() - .map(|(family, (ssts_with_ranges, merge_jobs))| { - let family = family as u32; - let _span = span.clone().entered(); - - if merge_jobs.is_empty() { - return Ok(PartialResultPerFamily { - new_meta_file: None, - new_sst_files: Vec::new(), - sst_seq_numbers_to_delete: Vec::new(), - blob_seq_numbers_to_delete: Vec::new(), - keys_written: 0, - }); - } - - { - let metrics = compute_metrics(&ssts_with_ranges, 0..=u64::MAX); - let guard = log_mutex.lock(); - let mut log = self.open_log()?; - writeln!( - log, - "Compaction for family {family} (coverage: {}, overlap: {}, duplication: \ - {} / {} MiB):", - metrics.coverage, - metrics.overlap, - metrics.duplication, - metrics.duplicated_size / 1024 / 1024 - )?; - for job in merge_jobs.iter() { - writeln!(log, " merge")?; - for i in job.iter() { - let seq = ssts_with_ranges[*i].seq; - let (min, max) = ssts_with_ranges[*i].range().into_inner(); - writeln!(log, " {seq:08} {min:016x}-{max:016x}")?; - } + let result = self + .parallel_scheduler + .vec_into_parallel_map_collect::<_, _, Result>>( + merge_jobs, + |(family, ssts_with_ranges, merge_jobs)| { + let family = family as u32; + + if merge_jobs.is_empty() { + return Ok(PartialResultPerFamily { + new_meta_file: None, + new_sst_files: Vec::new(), + sst_seq_numbers_to_delete: Vec::new(), + blob_seq_numbers_to_delete: Vec::new(), + keys_written: 0, + }); } - drop(guard); - } - - // Later we will remove the merged files - let sst_seq_numbers_to_delete = merge_jobs - .iter() - .filter(|l| l.len() > 1) - .flat_map(|l| l.iter().copied()) - .map(|index| ssts_with_ranges[index].seq) - .collect::>(); - // Merge SST files - let span = tracing::trace_span!("merge files"); - enum PartialMergeResult<'l> { - Merged { - new_sst_files: Vec<(u32, File, StaticSortedFileBuilderMeta<'static>)>, - blob_seq_numbers_to_delete: Vec, - keys_written: u64, - }, - Move { - seq: u32, - meta: StaticSortedFileBuilderMeta<'l>, - }, - } - let merge_result = merge_jobs - 
.into_par_iter() - .with_min_len(1) - .map(|indices| { - let _span = span.clone().entered(); - if indices.len() == 1 { - // If we only have one file, we can just move it - let index = indices[0]; - let meta_index = ssts_with_ranges[index].meta_index; - let index_in_meta = ssts_with_ranges[index].index_in_meta; - let meta_file = &meta_files[meta_index]; - let entry = meta_file.entry(index_in_meta); - let amqf = Cow::Borrowed(entry.raw_amqf(meta_file.amqf_data())); - let meta = StaticSortedFileBuilderMeta { - min_hash: entry.min_hash(), - max_hash: entry.max_hash(), - amqf, - key_compression_dictionary_length: entry - .key_compression_dictionary_length(), - value_compression_dictionary_length: entry - .value_compression_dictionary_length(), - block_count: entry.block_count(), - size: entry.size(), - entries: 0, - }; - return Ok(PartialMergeResult::Move { - seq: entry.sequence_number(), - meta, - }); + self.parallel_scheduler.block_in_place(|| { + let metrics = compute_metrics(&ssts_with_ranges, 0..=u64::MAX); + let guard = log_mutex.lock(); + let mut log = self.open_log()?; + writeln!( + log, + "Compaction for family {family} (coverage: {}, overlap: {}, \ + duplication: {} / {} MiB):", + metrics.coverage, + metrics.overlap, + metrics.duplication, + metrics.duplicated_size / 1024 / 1024 + )?; + for job in merge_jobs.iter() { + writeln!(log, " merge")?; + for i in job.iter() { + let seq = ssts_with_ranges[*i].seq; + let (min, max) = ssts_with_ranges[*i].range().into_inner(); + writeln!(log, " {seq:08} {min:016x}-{max:016x}")?; + } } + drop(guard); + anyhow::Ok(()) + })?; - fn create_sst_file( - entries: &[LookupEntry], - total_key_size: usize, - total_value_size: usize, - path: &Path, + // Later we will remove the merged files + let sst_seq_numbers_to_delete = merge_jobs + .iter() + .filter(|l| l.len() > 1) + .flat_map(|l| l.iter().copied()) + .map(|index| ssts_with_ranges[index].seq) + .collect::>(); + + // Merge SST files + let span = tracing::trace_span!("merge files"); + enum PartialMergeResult<'l> { + Merged { + new_sst_files: Vec<(u32, File, StaticSortedFileBuilderMeta<'static>)>, + blob_seq_numbers_to_delete: Vec, + keys_written: u64, + }, + Move { seq: u32, - ) -> Result<(u32, File, StaticSortedFileBuilderMeta<'static>)> - { - let _span = tracing::trace_span!("write merged sst file").entered(); - let (meta, file) = write_static_stored_file( - entries, - total_key_size, - total_value_size, - &path.join(format!("{seq:08}.sst")), - )?; - Ok((seq, file, meta)) - } + meta: StaticSortedFileBuilderMeta<'l>, + }, + } + let merge_result = self + .parallel_scheduler + .vec_into_parallel_map_collect::<_, _, Result>>( + merge_jobs, + |indices| { + let _span = span.clone().entered(); + if indices.len() == 1 { + // If we only have one file, we can just move it + let index = indices[0]; + let meta_index = ssts_with_ranges[index].meta_index; + let index_in_meta = ssts_with_ranges[index].index_in_meta; + let meta_file = &meta_files[meta_index]; + let entry = meta_file.entry(index_in_meta); + let amqf = Cow::Borrowed(entry.raw_amqf(meta_file.amqf_data())); + let meta = StaticSortedFileBuilderMeta { + min_hash: entry.min_hash(), + max_hash: entry.max_hash(), + amqf, + key_compression_dictionary_length: entry + .key_compression_dictionary_length(), + block_count: entry.block_count(), + size: entry.size(), + entries: 0, + }; + return Ok(PartialMergeResult::Move { + seq: entry.sequence_number(), + meta, + }); + } + + fn create_sst_file<'l, S: ParallelScheduler>( + parallel_scheduler: &S, + entries: 
&[LookupEntry<'l>], + total_key_size: usize, + path: &Path, + seq: u32, + ) -> Result<(u32, File, StaticSortedFileBuilderMeta<'static>)> + { + let _span = + tracing::trace_span!("write merged sst file").entered(); + let (meta, file) = parallel_scheduler.block_in_place(|| { + write_static_stored_file( + entries, + total_key_size, + &path.join(format!("{seq:08}.sst")), + ) + })?; + Ok((seq, file, meta)) + } - let mut new_sst_files = Vec::new(); - - // Iterate all SST files - let iters = indices - .iter() - .map(|&index| { - let meta_index = ssts_with_ranges[index].meta_index; - let index_in_meta = ssts_with_ranges[index].index_in_meta; - let meta = &meta_files[meta_index]; - meta.entry(index_in_meta) - .sst(meta)? - .iter(key_block_cache, value_block_cache) - }) - .collect::>>()?; - - let iter = MergeIter::new(iters.into_iter())?; - - // TODO figure out how to delete blobs when they are no longer - // referenced - let blob_seq_numbers_to_delete: Vec = Vec::new(); - - let mut keys_written = 0; - - let mut total_key_size = 0; - let mut total_value_size = 0; - let mut current: Option = None; - let mut entries = Vec::new(); - let mut last_entries = Vec::new(); - let mut last_entries_total_sizes = (0, 0); - for entry in iter { - let entry = entry?; - - // Remove duplicates - if let Some(current) = current.take() { - if current.key != entry.key { - let key_size = current.key.len(); - let value_size = current.value.size_in_sst(); - total_key_size += key_size; - total_value_size += value_size; - - if total_key_size + total_value_size - > DATA_THRESHOLD_PER_COMPACTED_FILE - || entries.len() >= MAX_ENTRIES_PER_COMPACTED_FILE - { - let (selected_total_key_size, selected_total_value_size) = - last_entries_total_sizes; - swap(&mut entries, &mut last_entries); - last_entries_total_sizes = ( - total_key_size - key_size, - total_value_size - value_size, - ); - total_key_size = key_size; - total_value_size = value_size; - - if !entries.is_empty() { - let seq = - sequence_number.fetch_add(1, Ordering::SeqCst) + 1; - - keys_written += entries.len() as u64; - new_sst_files.push(create_sst_file( - &entries, - selected_total_key_size, - selected_total_value_size, - path, - seq, - )?); - - entries.clear(); + let mut new_sst_files = Vec::new(); + + // Iterate all SST files + let iters = indices + .iter() + .map(|&index| { + let meta_index = ssts_with_ranges[index].meta_index; + let index_in_meta = ssts_with_ranges[index].index_in_meta; + let meta = &meta_files[meta_index]; + meta.entry(index_in_meta) + .sst(meta)? 
+ .iter(key_block_cache, value_block_cache) + }) + .collect::>>()?; + + let iter = MergeIter::new(iters.into_iter())?; + + // TODO figure out how to delete blobs when they are no longer + // referenced + let blob_seq_numbers_to_delete: Vec = Vec::new(); + + let mut keys_written = 0; + + let mut total_key_size = 0; + let mut total_value_size = 0; + let mut current: Option> = None; + let mut entries = Vec::new(); + let mut last_entries = Vec::new(); + let mut last_entries_total_key_size = 0; + for entry in iter { + let entry = entry?; + + // Remove duplicates + if let Some(current) = current.take() { + if current.key != entry.key { + let key_size = current.key.len(); + let value_size = + current.value.uncompressed_size_in_sst(); + total_key_size += key_size; + total_value_size += value_size; + + if total_key_size + total_value_size + > DATA_THRESHOLD_PER_COMPACTED_FILE + || entries.len() >= MAX_ENTRIES_PER_COMPACTED_FILE + { + let selected_total_key_size = + last_entries_total_key_size; + swap(&mut entries, &mut last_entries); + last_entries_total_key_size = + total_key_size - key_size; + total_key_size = key_size; + total_value_size = value_size; + + if !entries.is_empty() { + let seq = sequence_number + .fetch_add(1, Ordering::SeqCst) + + 1; + + keys_written += entries.len() as u64; + new_sst_files.push(create_sst_file( + &self.parallel_scheduler, + &entries, + selected_total_key_size, + path, + seq, + )?); + + entries.clear(); + } + } + + entries.push(current); + } else { + // Override value } } + current = Some(entry); + } + if let Some(entry) = current { + total_key_size += entry.key.len(); + // Obsolete as we no longer need total_value_size + // total_value_size += entry.value.uncompressed_size_in_sst(); + entries.push(entry); + } - entries.push(current); - } else { - // Override value + // If we have one set of entries left, write them to a new SST file + if last_entries.is_empty() && !entries.is_empty() { + let seq = sequence_number.fetch_add(1, Ordering::SeqCst) + 1; + + keys_written += entries.len() as u64; + new_sst_files.push(create_sst_file( + &self.parallel_scheduler, + &entries, + total_key_size, + path, + seq, + )?); + } else + // If we have two sets of entries left, merge them and + // split it into two SST files, to avoid having a + // single SST file that is very small. 
+ if !last_entries.is_empty() { + last_entries.append(&mut entries); + + last_entries_total_key_size += total_key_size; + + let (part1, part2) = + last_entries.split_at(last_entries.len() / 2); + + let seq1 = sequence_number.fetch_add(1, Ordering::SeqCst) + 1; + let seq2 = sequence_number.fetch_add(1, Ordering::SeqCst) + 1; + + keys_written += part1.len() as u64; + new_sst_files.push(create_sst_file( + &self.parallel_scheduler, + part1, + // We don't know the exact sizes so we estimate them + last_entries_total_key_size / 2, + path, + seq1, + )?); + + keys_written += part2.len() as u64; + new_sst_files.push(create_sst_file( + &self.parallel_scheduler, + part2, + last_entries_total_key_size / 2, + path, + seq2, + )?); } + Ok(PartialMergeResult::Merged { + new_sst_files, + blob_seq_numbers_to_delete, + keys_written, + }) + }, + ) + .with_context(|| { + format!("Failed to merge database files for family {family}") + })?; + + let Some((sst_files_len, blob_delete_len)) = merge_result + .iter() + .map(|r| { + if let PartialMergeResult::Merged { + new_sst_files, + blob_seq_numbers_to_delete, + keys_written: _, + } = r + { + (new_sst_files.len(), blob_seq_numbers_to_delete.len()) + } else { + (0, 0) } - current = Some(entry); - } - if let Some(entry) = current { - total_key_size += entry.key.len(); - total_value_size += entry.value.size_in_sst(); - entries.push(entry); - } - - // If we have one set of entries left, write them to a new SST file - if last_entries.is_empty() && !entries.is_empty() { - let seq = sequence_number.fetch_add(1, Ordering::SeqCst) + 1; - - keys_written += entries.len() as u64; - new_sst_files.push(create_sst_file( - &entries, - total_key_size, - total_value_size, - path, - seq, - )?); - } else - // If we have two sets of entries left, merge them and - // split it into two SST files, to avoid having a - // single SST file that is very small. 
- if !last_entries.is_empty() { - last_entries.append(&mut entries); - - last_entries_total_sizes.0 += total_key_size; - last_entries_total_sizes.1 += total_value_size; - - let (part1, part2) = last_entries.split_at(last_entries.len() / 2); - - let seq1 = sequence_number.fetch_add(1, Ordering::SeqCst) + 1; - let seq2 = sequence_number.fetch_add(1, Ordering::SeqCst) + 1; - - keys_written += part1.len() as u64; - new_sst_files.push(create_sst_file( - part1, - // We don't know the exact sizes so we estimate them - last_entries_total_sizes.0 / 2, - last_entries_total_sizes.1 / 2, - path, - seq1, - )?); - - keys_written += part2.len() as u64; - new_sst_files.push(create_sst_file( - part2, - last_entries_total_sizes.0 / 2, - last_entries_total_sizes.1 / 2, - path, - seq2, - )?); - } - Ok(PartialMergeResult::Merged { - new_sst_files, - blob_seq_numbers_to_delete, - keys_written, }) - }) - .collect::>>() - .with_context(|| { - format!("Failed to merge database files for family {family}") - })?; - - let Some((sst_files_len, blob_delete_len)) = merge_result - .iter() - .map(|r| { - if let PartialMergeResult::Merged { - new_sst_files, - blob_seq_numbers_to_delete, - keys_written: _, - } = r - { - (new_sst_files.len(), blob_seq_numbers_to_delete.len()) - } else { - (0, 0) - } - }) - .reduce(|(a1, a2), (b1, b2)| (a1 + b1, a2 + b2)) - else { - unreachable!() - }; - - let mut new_sst_files = Vec::with_capacity(sst_files_len); - let mut blob_seq_numbers_to_delete = Vec::with_capacity(blob_delete_len); - - let mut meta_file_builder = MetaFileBuilder::new(family); - - let mut keys_written = 0; - for result in merge_result { - match result { - PartialMergeResult::Merged { - new_sst_files: merged_new_sst_files, - blob_seq_numbers_to_delete: merged_blob_seq_numbers_to_delete, - keys_written: merged_keys_written, - } => { - for (seq, file, meta) in merged_new_sst_files { + .reduce(|(a1, a2), (b1, b2)| (a1 + b1, a2 + b2)) + else { + unreachable!() + }; + + let mut new_sst_files = Vec::with_capacity(sst_files_len); + let mut blob_seq_numbers_to_delete = Vec::with_capacity(blob_delete_len); + + let mut meta_file_builder = MetaFileBuilder::new(family); + + let mut keys_written = 0; + for result in merge_result { + match result { + PartialMergeResult::Merged { + new_sst_files: merged_new_sst_files, + blob_seq_numbers_to_delete: merged_blob_seq_numbers_to_delete, + keys_written: merged_keys_written, + } => { + for (seq, file, meta) in merged_new_sst_files { + meta_file_builder.add(seq, meta); + new_sst_files.push((seq, file)); + } + blob_seq_numbers_to_delete + .extend(merged_blob_seq_numbers_to_delete); + keys_written += merged_keys_written; + } + PartialMergeResult::Move { seq, meta } => { meta_file_builder.add(seq, meta); - new_sst_files.push((seq, file)); } - blob_seq_numbers_to_delete.extend(merged_blob_seq_numbers_to_delete); - keys_written += merged_keys_written; - } - PartialMergeResult::Move { seq, meta } => { - meta_file_builder.add(seq, meta); } } - } - for &seq in sst_seq_numbers_to_delete.iter() { - meta_file_builder.add_obsolete_sst_file(seq); - } + for &seq in sst_seq_numbers_to_delete.iter() { + meta_file_builder.add_obsolete_sst_file(seq); + } - let seq = sequence_number.fetch_add(1, Ordering::SeqCst) + 1; - let meta_file = { - let _span = tracing::trace_span!("write meta file").entered(); - meta_file_builder.write(&self.path, seq)? 
- }; - - Ok(PartialResultPerFamily { - new_meta_file: Some((seq, meta_file)), - new_sst_files, - sst_seq_numbers_to_delete, - blob_seq_numbers_to_delete, - keys_written, - }) - }) - .collect::>>()?; + let seq = sequence_number.fetch_add(1, Ordering::SeqCst) + 1; + let meta_file = { + let _span = tracing::trace_span!("write meta file").entered(); + self.parallel_scheduler + .block_in_place(|| meta_file_builder.write(&self.path, seq))? + }; + + Ok(PartialResultPerFamily { + new_meta_file: Some((seq, meta_file)), + new_sst_files, + sst_seq_numbers_to_delete, + blob_seq_numbers_to_delete, + keys_written, + }) + }, + )?; for PartialResultPerFamily { new_meta_file: inner_new_meta_file, @@ -1237,8 +1259,6 @@ impl TurboPersistence { amqf_entries: amqf.len(), key_compression_dictionary_size: entry .key_compression_dictionary_length(), - value_compression_dictionary_size: entry - .value_compression_dictionary_length(), block_count: entry.block_count(), } }) @@ -1276,6 +1296,5 @@ pub struct MetaFileEntryInfo { pub amqf_entries: usize, pub sst_size: u64, pub key_compression_dictionary_size: u16, - pub value_compression_dictionary_size: u16, pub block_count: u16, } diff --git a/turbopack/crates/turbo-persistence/src/lib.rs b/turbopack/crates/turbo-persistence/src/lib.rs index 70c87199f396c..b95f481c9002a 100644 --- a/turbopack/crates/turbo-persistence/src/lib.rs +++ b/turbopack/crates/turbo-persistence/src/lib.rs @@ -8,24 +8,27 @@ mod arc_slice; mod collector; mod collector_entry; mod compaction; +mod compression; mod constants; mod db; mod key; mod lookup_entry; mod merge_iter; +mod meta_file; +mod meta_file_builder; +mod parallel_scheduler; +mod sst_filter; mod static_sorted_file; mod static_sorted_file_builder; +mod value_buf; mod write_batch; -mod meta_file; -mod meta_file_builder; -mod sst_filter; #[cfg(test)] mod tests; -mod value_buf; pub use arc_slice::ArcSlice; pub use db::{CompactConfig, MetaFileEntryInfo, MetaFileInfo, TurboPersistence}; pub use key::{KeyBase, QueryKey, StoreKey}; +pub use parallel_scheduler::{ParallelScheduler, SerialScheduler}; pub use value_buf::ValueBuffer; pub use write_batch::WriteBatch; diff --git a/turbopack/crates/turbo-persistence/src/lookup_entry.rs b/turbopack/crates/turbo-persistence/src/lookup_entry.rs index 0a1612b05520a..c55adca31eaea 100644 --- a/turbopack/crates/turbo-persistence/src/lookup_entry.rs +++ b/turbopack/crates/turbo-persistence/src/lookup_entry.rs @@ -14,28 +14,42 @@ pub enum LookupValue { Blob { sequence_number: u32 }, } -impl LookupValue { +/// A value from a SST file lookup. +pub enum LazyLookupValue<'l> { + /// A LookupValue + Eager(LookupValue), + /// A medium sized value that is still compressed. + Medium { + uncompressed_size: u32, + block: &'l [u8], + }, +} + +impl LazyLookupValue<'_> { /// Returns the size of the value in the SST file. - pub fn size_in_sst(&self) -> usize { + pub fn uncompressed_size_in_sst(&self) -> usize { match self { - LookupValue::Slice { value } => value.len(), - LookupValue::Deleted => 0, - LookupValue::Blob { .. } => 0, + LazyLookupValue::Eager(LookupValue::Slice { value }) => value.len(), + LazyLookupValue::Eager(LookupValue::Deleted) => 0, + LazyLookupValue::Eager(LookupValue::Blob { .. }) => 0, + LazyLookupValue::Medium { + uncompressed_size, .. + } => *uncompressed_size as usize, } } } /// An entry from a SST file lookup. -pub struct LookupEntry { +pub struct LookupEntry<'l> { /// The hash of the key. pub hash: u64, /// The key. pub key: ArcSlice, /// The value. 
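+    /// For medium-sized values this may still be the compressed block; see [`LazyLookupValue`].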
-    pub value: LookupValue,
+    pub value: LazyLookupValue<'l>,
 }
 
-impl Entry for LookupEntry {
+impl Entry for LookupEntry<'_> {
     fn key_hash(&self) -> u64 {
         self.hash
     }
@@ -50,17 +64,24 @@ impl Entry for LookupEntry {
     fn value(&self) -> EntryValue<'_> {
         match &self.value {
-            LookupValue::Deleted => EntryValue::Deleted,
-            LookupValue::Slice { value } => {
+            LazyLookupValue::Eager(LookupValue::Deleted) => EntryValue::Deleted,
+            LazyLookupValue::Eager(LookupValue::Slice { value }) => {
                 if value.len() > MAX_SMALL_VALUE_SIZE {
                     EntryValue::Medium { value }
                 } else {
                     EntryValue::Small { value }
                 }
             }
-            LookupValue::Blob { sequence_number } => EntryValue::Large {
+            LazyLookupValue::Eager(LookupValue::Blob { sequence_number }) => EntryValue::Large {
                 blob: *sequence_number,
             },
+            LazyLookupValue::Medium {
+                uncompressed_size,
+                block,
+            } => EntryValue::MediumCompressed {
+                uncompressed_size: *uncompressed_size,
+                block,
+            },
         }
     }
 }
diff --git a/turbopack/crates/turbo-persistence/src/merge_iter.rs b/turbopack/crates/turbo-persistence/src/merge_iter.rs
index 251ef32c26db5..087484db4ca3b 100644
--- a/turbopack/crates/turbo-persistence/src/merge_iter.rs
+++ b/turbopack/crates/turbo-persistence/src/merge_iter.rs
@@ -6,27 +6,27 @@ use crate::lookup_entry::LookupEntry;
 
 /// An active iterator that is being merged. It has peeked the next element and can be compared
 /// according to that element. The `order` is used when multiple iterators have the same key.
-struct ActiveIterator<T: Iterator<Item = Result<LookupEntry>>> {
+struct ActiveIterator<'l, T: Iterator<Item = Result<LookupEntry<'l>>>> {
     iter: T,
     order: usize,
-    entry: LookupEntry,
+    entry: LookupEntry<'l>,
 }
 
-impl<T: Iterator<Item = Result<LookupEntry>>> PartialEq for ActiveIterator<T> {
+impl<'l, T: Iterator<Item = Result<LookupEntry<'l>>>> PartialEq for ActiveIterator<'l, T> {
     fn eq(&self, other: &Self) -> bool {
         self.entry.hash == other.entry.hash && *self.entry.key == *other.entry.key
     }
 }
 
-impl<T: Iterator<Item = Result<LookupEntry>>> Eq for ActiveIterator<T> {}
+impl<'l, T: Iterator<Item = Result<LookupEntry<'l>>>> Eq for ActiveIterator<'l, T> {}
 
-impl<T: Iterator<Item = Result<LookupEntry>>> PartialOrd for ActiveIterator<T> {
+impl<'l, T: Iterator<Item = Result<LookupEntry<'l>>>> PartialOrd for ActiveIterator<'l, T> {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl<T: Iterator<Item = Result<LookupEntry>>> Ord for ActiveIterator<T> {
+impl<'l, T: Iterator<Item = Result<LookupEntry<'l>>>> Ord for ActiveIterator<'l, T> {
     fn cmp(&self, other: &Self) -> Ordering {
         self.entry
             .hash
@@ -39,11 +39,11 @@ impl<T: Iterator<Item = Result<LookupEntry>>> Ord for ActiveIterator<T> {
 
 /// An iterator that merges multiple sorted iterators into a single sorted iterator. Internally
 /// it uses a heap of iterators to iterate them in order.
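+/// Each `next` pops the smallest entry and re-pushes its source iterator, so merging `n`
+/// iterators costs O(log n) per yielded entry.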
-pub struct MergeIter<T: Iterator<Item = Result<LookupEntry>>> {
-    heap: BinaryHeap<ActiveIterator<T>>,
+pub struct MergeIter<'l, T: Iterator<Item = Result<LookupEntry<'l>>>> {
+    heap: BinaryHeap<ActiveIterator<'l, T>>,
 }
 
-impl<T: Iterator<Item = Result<LookupEntry>>> MergeIter<T> {
+impl<'l, T: Iterator<Item = Result<LookupEntry<'l>>>> MergeIter<'l, T> {
     pub fn new(iters: impl Iterator<Item = T>) -> Result<Self> {
         let mut heap = BinaryHeap::new();
         for (order, mut iter) in iters.enumerate() {
@@ -56,8 +56,8 @@ impl<T: Iterator<Item = Result<LookupEntry>>> MergeIter<T> {
     }
 }
 
-impl<T: Iterator<Item = Result<LookupEntry>>> Iterator for MergeIter<T> {
-    type Item = Result<LookupEntry>;
+impl<'l, T: Iterator<Item = Result<LookupEntry<'l>>>> Iterator for MergeIter<'l, T> {
+    type Item = Result<LookupEntry<'l>>;
 
     fn next(&mut self) -> Option<Self::Item> {
         let ActiveIterator {
diff --git a/turbopack/crates/turbo-persistence/src/meta_file.rs b/turbopack/crates/turbo-persistence/src/meta_file.rs
index 871cccd3cd512..3c0f1b3aa755e 100644
--- a/turbopack/crates/turbo-persistence/src/meta_file.rs
+++ b/turbopack/crates/turbo-persistence/src/meta_file.rs
@@ -144,10 +144,6 @@ impl MetaEntry {
         self.sst_data.key_compression_dictionary_length
     }
 
-    pub fn value_compression_dictionary_length(&self) -> u16 {
-        self.sst_data.value_compression_dictionary_length
-    }
-
     pub fn block_count(&self) -> u16 {
         self.sst_data.block_count
     }
@@ -222,7 +218,6 @@ impl MetaFile {
                 sst_data: StaticSortedFileMetaData {
                     sequence_number: file.read_u32::<BE>()?,
                     key_compression_dictionary_length: file.read_u16::<BE>()?,
-                    value_compression_dictionary_length: file.read_u16::<BE>()?,
                     block_count: file.read_u16::<BE>()?,
                 },
                 family,
diff --git a/turbopack/crates/turbo-persistence/src/meta_file_builder.rs b/turbopack/crates/turbo-persistence/src/meta_file_builder.rs
index afa402ac68473..6783175368371 100644
--- a/turbopack/crates/turbo-persistence/src/meta_file_builder.rs
+++ b/turbopack/crates/turbo-persistence/src/meta_file_builder.rs
@@ -58,7 +58,6 @@ impl<'a> MetaFileBuilder<'a> {
         for (sequence_number, sst) in &self.entries {
             file.write_u32::<BE>(*sequence_number)?;
             file.write_u16::<BE>(sst.key_compression_dictionary_length)?;
-            file.write_u16::<BE>(sst.value_compression_dictionary_length)?;
             file.write_u16::<BE>(sst.block_count)?;
             file.write_u64::<BE>(sst.min_hash)?;
             file.write_u64::<BE>(sst.max_hash)?;
diff --git a/turbopack/crates/turbo-persistence/src/parallel_scheduler.rs b/turbopack/crates/turbo-persistence/src/parallel_scheduler.rs
new file mode 100644
index 0000000000000..b2415b54aa423
--- /dev/null
+++ b/turbopack/crates/turbo-persistence/src/parallel_scheduler.rs
@@ -0,0 +1,148 @@
+pub trait ParallelScheduler: Clone + Sync + Send {
+    fn block_in_place<R>(&self, f: impl FnOnce() -> R + Send) -> R
+    where
+        R: Send;
+
+    fn parallel_for_each<T>(&self, items: &[T], f: impl Fn(&T) + Send + Sync)
+    where
+        T: Sync;
+
+    fn try_parallel_for_each<'l, T, E>(
+        &self,
+        items: &'l [T],
+        f: impl (Fn(&'l T) -> Result<(), E>) + Send + Sync,
+    ) -> Result<(), E>
+    where
+        T: Sync,
+        E: Send + 'static;
+
+    fn try_parallel_for_each_mut<'l, T, E>(
+        &self,
+        items: &'l mut [T],
+        f: impl (Fn(&'l mut T) -> Result<(), E>) + Send + Sync,
+    ) -> Result<(), E>
+    where
+        T: Send + Sync,
+        E: Send + 'static;
+
+    fn try_vec_into_parallel_for_each<T, E>(
+        &self,
+        items: Vec<T>,
+        f: impl (Fn(T) -> Result<(), E>) + Send + Sync,
+    ) -> Result<(), E>
+    where
+        T: Send + Sync,
+        E: Send + 'static;
+
+    fn parallel_map_collect<'l, T, I, R>(
+        &self,
+        items: &'l [T],
+        f: impl Fn(&'l T) -> I + Send + Sync,
+    ) -> R
+    where
+        T: Sync,
+        I: Send + Sync + 'l,
+        R: FromIterator<I>;
+
+    fn vec_into_parallel_map_collect<T, I, R>(
+        &self,
+        items: Vec<T>,
+        f: impl Fn(T) -> I + Send + Sync,
+    ) -> R
+    where
+        T: Send + Sync,
+        I: Send + Sync,
+        R: FromIterator<I>;
+}
+
+#[derive(Clone, Copy, Default)]
+pub struct SerialScheduler;
+
+impl ParallelScheduler for SerialScheduler {
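+    // `SerialScheduler` is the zero-dependency default: every hook simply runs inline on the
+    // calling thread. Callers that want actual parallelism (e.g. a rayon-backed scheduler;
+    // rayon is now only a dev-dependency of this crate) supply their own implementation.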
+    fn block_in_place<R>(&self, f: impl FnOnce() -> R + Send) -> R
+    where
+        R: Send,
+    {
+        f()
+    }
+
+    fn parallel_for_each<T>(&self, items: &[T], f: impl Fn(&T) + Send + Sync)
+    where
+        T: Sync,
+    {
+        for item in items {
+            f(item);
+        }
+    }
+
+    fn try_parallel_for_each<'l, T, E>(
+        &self,
+        items: &'l [T],
+        f: impl (Fn(&'l T) -> Result<(), E>) + Send + Sync,
+    ) -> Result<(), E>
+    where
+        T: Sync,
+        E: Send,
+    {
+        for item in items {
+            f(item)?;
+        }
+        Ok(())
+    }
+
+    fn try_parallel_for_each_mut<'l, T, E>(
+        &self,
+        items: &'l mut [T],
+        f: impl (Fn(&'l mut T) -> Result<(), E>) + Send + Sync,
+    ) -> Result<(), E>
+    where
+        T: Sync,
+        E: Send,
+    {
+        for item in items {
+            f(item)?;
+        }
+        Ok(())
+    }
+
+    fn try_vec_into_parallel_for_each<T, E>(
+        &self,
+        items: Vec<T>,
+        f: impl (Fn(T) -> Result<(), E>) + Send + Sync,
+    ) -> Result<(), E>
+    where
+        T: Sync,
+        E: Send,
+    {
+        for item in items {
+            f(item)?;
+        }
+        Ok(())
+    }
+
+    fn parallel_map_collect<'l, T, I, R>(
+        &self,
+        items: &'l [T],
+        f: impl Fn(&'l T) -> I + Send + Sync,
+    ) -> R
+    where
+        T: Sync,
+        I: Send + Sync + 'l,
+        R: FromIterator<I>,
+    {
+        items.iter().map(f).collect()
+    }
+
+    fn vec_into_parallel_map_collect<T, I, R>(
+        &self,
+        items: Vec<T>,
+        f: impl Fn(T) -> I + Send + Sync,
+    ) -> R
+    where
+        T: Send + Sync,
+        I: Send + Sync,
+        R: FromIterator<I>,
+    {
+        items.into_iter().map(f).collect()
+    }
+}
diff --git a/turbopack/crates/turbo-persistence/src/static_sorted_file.rs b/turbopack/crates/turbo-persistence/src/static_sorted_file.rs
index 59b199a6248bf..eac0b9b33d97a 100644
--- a/turbopack/crates/turbo-persistence/src/static_sorted_file.rs
+++ b/turbopack/crates/turbo-persistence/src/static_sorted_file.rs
@@ -2,15 +2,12 @@ use std::{
     cmp::Ordering,
     fs::File,
     hash::BuildHasherDefault,
-    mem::{MaybeUninit, transmute},
     ops::Range,
     path::{Path, PathBuf},
-    sync::Arc,
 };
 
 use anyhow::{Context, Result, bail};
 use byteorder::{BE, ReadBytesExt};
-use lzzzz::lz4::decompress_with_dict;
 use memmap2::Mmap;
 use quick_cache::sync::GuardResult;
 use rustc_hash::FxHasher;
@@ -18,7 +15,8 @@ use rustc_hash::FxHasher;
 use crate::{
     QueryKey,
     arc_slice::ArcSlice,
-    lookup_entry::{LookupEntry, LookupValue},
+    compression::decompress_into_arc,
+    lookup_entry::{LazyLookupValue, LookupEntry, LookupValue},
 };
 
 /// The block header for an index block.
@@ -67,8 +65,6 @@ pub struct StaticSortedFileMetaData {
     pub sequence_number: u32,
     /// The length of the key compression dictionary.
     pub key_compression_dictionary_length: u16,
-    /// The length of the value compression dictionary.
-    pub value_compression_dictionary_length: u16,
     /// The number of blocks in the SST file.
     pub block_count: u16,
 }
@@ -81,8 +77,7 @@ impl StaticSortedFileMetaData {
 
     pub fn blocks_start(&self) -> usize {
         let k: usize = self.key_compression_dictionary_length.into();
-        let v: usize = self.value_compression_dictionary_length.into();
-        k + v
+        k
     }
 
     pub fn key_compression_dictionary_range(&self) -> Range<usize> {
         let start: usize = 0;
         let end: usize = self.key_compression_dictionary_length.into();
         start..end
     }
-
-    pub fn value_compression_dictionary_range(&self) -> Range<usize> {
-        let start = self.key_compression_dictionary_length as usize;
-        let end = start + self.value_compression_dictionary_length as usize;
-        start..end
-    }
 }
 
 /// A memory mapped SST file.
@@ -312,7 +301,7 @@ impl StaticSortedFile {
         match value_block_cache.get_value_or_guard(&(self.meta.sequence_number, block), None) {
             GuardResult::Value(block) => block,
             GuardResult::Guard(guard) => {
-                let block = self.read_value_block(block)?;
+                let block = self.read_small_value_block(block)?;
                 let _ = guard.insert(block.clone());
                 block
             }
@@ -325,20 +314,41 @@ impl StaticSortedFile {
     fn read_key_block(&self, block_index: u16) -> Result<ArcSlice<u8>> {
         self.read_block(
             block_index,
-            &self.mmap[self.meta.key_compression_dictionary_range()],
+            Some(&self.mmap[self.meta.key_compression_dictionary_range()]),
+            false,
         )
     }
 
+    /// Reads a value block from the file.
+    fn read_small_value_block(&self, block_index: u16) -> Result<ArcSlice<u8>> {
+        self.read_block(block_index, None, false)
+    }
+
     /// Reads a value block from the file.
     fn read_value_block(&self, block_index: u16) -> Result<ArcSlice<u8>> {
-        self.read_block(
-            block_index,
-            &self.mmap[self.meta.value_compression_dictionary_range()],
-        )
+        self.read_block(block_index, None, true)
     }
 
     /// Reads a block from the file.
-    fn read_block(&self, block_index: u16, compression_dictionary: &[u8]) -> Result<ArcSlice<u8>> {
+    fn read_block(
+        &self,
+        block_index: u16,
+        compression_dictionary: Option<&[u8]>,
+        long_term: bool,
+    ) -> Result<ArcSlice<u8>> {
+        let (uncompressed_length, block) = self.get_compressed_block(block_index)?;
+
+        let buffer = decompress_into_arc(
+            uncompressed_length,
+            block,
+            compression_dictionary,
+            long_term,
+        )?;
+        Ok(ArcSlice::from(buffer))
+    }
+
+    /// Gets the slice of the compressed block from the memory mapped file.
+    fn get_compressed_block(&self, block_index: u16) -> Result<(u32, &[u8])> {
         #[cfg(feature = "strict_checks")]
         if block_index >= self.meta.block_count {
             bail!(
@@ -386,17 +396,9 @@ impl StaticSortedFile {
                 self.meta.blocks_start()
             );
         }
-        let uncompressed_length =
-            (&self.mmap[block_start..block_start + 4]).read_u32::<BE>()? as usize;
-        let block = self.mmap[block_start + 4..block_end].to_vec();
-
-        let buffer = Arc::new_zeroed_slice(uncompressed_length);
-        // Safety: MaybeUninit can be safely transmuted to u8.
-        let mut buffer = unsafe { transmute::<Arc<[MaybeUninit<u8>]>, Arc<[u8]>>(buffer) };
-        // Safety: We know that the buffer is not shared yet.
-        let decompressed = unsafe { Arc::get_mut_unchecked(&mut buffer) };
-        decompress_with_dict(&block, decompressed, compression_dictionary)?;
-        Ok(ArcSlice::from(buffer))
+        let uncompressed_length = (&self.mmap[block_start..block_start + 4]).read_u32::<BE>()?;
+        let block = &self.mmap[block_start + 4..block_end];
+        Ok((uncompressed_length, block))
     }
 }
 
@@ -423,15 +425,15 @@ struct CurrentIndexBlock {
     index: usize,
 }
 
-impl Iterator for StaticSortedFileIter<'_> {
-    type Item = Result<LookupEntry>;
+impl<'l> Iterator for StaticSortedFileIter<'l> {
+    type Item = Result<LookupEntry<'l>>;
 
     fn next(&mut self) -> Option<Self::Item> {
         self.next_internal().transpose()
     }
 }
 
-impl StaticSortedFileIter<'_> {
+impl<'l> StaticSortedFileIter<'l> {
     /// Enters a block at the given index.
     fn enter_block(&mut self, block_index: u16) -> Result<()> {
         let block_arc = self.this.get_key_block(block_index, self.key_block_cache)?;
@@ -468,7 +470,7 @@ impl StaticSortedFileIter<'_> {
     }
 
     /// Gets the next entry in the file and moves the cursor.
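+    /// Medium-sized values are yielded as [`LazyLookupValue::Medium`]: the compressed block is
+    /// borrowed from the mmap and only decompressed when the value is actually used.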
-    fn next_internal(&mut self) -> Result<Option<LookupEntry>> {
+    fn next_internal(&mut self) -> Result<Option<LookupEntry<'l>>> {
         loop {
             if let Some(CurrentKeyBlock {
                 offsets,
                 entries,
                 entry_count,
                 index,
             }) =
             {
                 let GetKeyEntryResult { hash, key, ty, val } =
                     get_key_entry(&offsets, &entries, entry_count, index)?;
-                let value = self
-                    .this
-                    .handle_key_match(ty, val, self.value_block_cache)?;
+                let value = if ty == KEY_BLOCK_ENTRY_TYPE_MEDIUM {
+                    let mut val = val;
+                    let block = val.read_u16::<BE>()?;
+                    let (uncompressed_size, block) = self.this.get_compressed_block(block)?;
+                    LazyLookupValue::Medium {
+                        uncompressed_size,
+                        block,
+                    }
+                } else {
+                    let value = self
+                        .this
+                        .handle_key_match(ty, val, self.value_block_cache)?;
+                    LazyLookupValue::Eager(value)
+                };
                 let entry = LookupEntry {
                     hash,
                     // Safety: The key is a valid slice of the entries.
diff --git a/turbopack/crates/turbo-persistence/src/static_sorted_file_builder.rs b/turbopack/crates/turbo-persistence/src/static_sorted_file_builder.rs
index 74f3776797263..257ebcb4836b5 100644
--- a/turbopack/crates/turbo-persistence/src/static_sorted_file_builder.rs
+++ b/turbopack/crates/turbo-persistence/src/static_sorted_file_builder.rs
@@ -1,6 +1,6 @@
 use std::{
     borrow::Cow,
-    cmp::min,
+    cmp::{max, min},
     fs::File,
     io::{BufWriter, Seek, Write},
     path::Path,
@@ -8,43 +8,36 @@
 use anyhow::{Context, Result};
 use byteorder::{BE, ByteOrder, WriteBytesExt};
-use lzzzz::lz4::{ACC_LEVEL_DEFAULT, max_compressed_size};
 
-use crate::static_sorted_file::{
-    BLOCK_TYPE_INDEX, BLOCK_TYPE_KEY, KEY_BLOCK_ENTRY_TYPE_BLOB, KEY_BLOCK_ENTRY_TYPE_DELETED,
-    KEY_BLOCK_ENTRY_TYPE_MEDIUM, KEY_BLOCK_ENTRY_TYPE_SMALL,
+use crate::{
+    compression::compress_into_buffer,
+    static_sorted_file::{
+        BLOCK_TYPE_INDEX, BLOCK_TYPE_KEY, KEY_BLOCK_ENTRY_TYPE_BLOB, KEY_BLOCK_ENTRY_TYPE_DELETED,
+        KEY_BLOCK_ENTRY_TYPE_MEDIUM, KEY_BLOCK_ENTRY_TYPE_SMALL,
+    },
 };
 
 /// The maximum number of entries that should go into a single key block
-const MAX_KEY_BLOCK_ENTRIES: usize = 100 * 1024;
+const MAX_KEY_BLOCK_ENTRIES: usize = MAX_KEY_BLOCK_SIZE / KEY_BLOCK_ENTRY_META_OVERHEAD;
 /// The maximum bytes that should go into a single key block
 // Note this must fit into 3 bytes length
 const MAX_KEY_BLOCK_SIZE: usize = 16 * 1024;
 /// Overhead of bytes that should be counted for entries in a key block in addition to the key size
 const KEY_BLOCK_ENTRY_META_OVERHEAD: usize = 8;
 /// The maximum number of entries that should go into a single small value block
-const MAX_SMALL_VALUE_BLOCK_ENTRIES: usize = 100 * 1024;
+const MAX_SMALL_VALUE_BLOCK_ENTRIES: usize = MAX_SMALL_VALUE_BLOCK_SIZE;
 /// The maximum bytes that should go into a single small value block
-const MAX_SMALL_VALUE_BLOCK_SIZE: usize = 16 * 1024;
+const MAX_SMALL_VALUE_BLOCK_SIZE: usize = 64 * 1024;
 
 /// The aimed false positive rate for the AMQF
 const AMQF_FALSE_POSITIVE_RATE: f64 = 0.01;
 
-/// The maximum compression dictionary size for value blocks
-const VALUE_COMPRESSION_DICTIONARY_SIZE: usize = 64 * 1024 - 1;
 /// The maximum compression dictionary size for key and index blocks
 const KEY_COMPRESSION_DICTIONARY_SIZE: usize = 64 * 1024 - 1;
-/// The maximum bytes that should be selected as value samples to create a compression dictionary
-const VALUE_COMPRESSION_SAMPLES_SIZE: usize = 256 * 1024;
 /// The maximum bytes that should be selected as key samples to create a compression dictionary
 const KEY_COMPRESSION_SAMPLES_SIZE: usize = 256 * 1024;
-/// The minimum bytes that should be selected as value samples.
diff --git a/turbopack/crates/turbo-persistence/src/static_sorted_file_builder.rs b/turbopack/crates/turbo-persistence/src/static_sorted_file_builder.rs
index 74f3776797263..257ebcb4836b5 100644
--- a/turbopack/crates/turbo-persistence/src/static_sorted_file_builder.rs
+++ b/turbopack/crates/turbo-persistence/src/static_sorted_file_builder.rs
@@ -1,6 +1,6 @@
 use std::{
     borrow::Cow,
-    cmp::min,
+    cmp::{max, min},
     fs::File,
     io::{BufWriter, Seek, Write},
     path::Path,
@@ -8,43 +8,36 @@ use std::{

 use anyhow::{Context, Result};
 use byteorder::{BE, ByteOrder, WriteBytesExt};
-use lzzzz::lz4::{ACC_LEVEL_DEFAULT, max_compressed_size};

-use crate::static_sorted_file::{
-    BLOCK_TYPE_INDEX, BLOCK_TYPE_KEY, KEY_BLOCK_ENTRY_TYPE_BLOB, KEY_BLOCK_ENTRY_TYPE_DELETED,
-    KEY_BLOCK_ENTRY_TYPE_MEDIUM, KEY_BLOCK_ENTRY_TYPE_SMALL,
+use crate::{
+    compression::compress_into_buffer,
+    static_sorted_file::{
+        BLOCK_TYPE_INDEX, BLOCK_TYPE_KEY, KEY_BLOCK_ENTRY_TYPE_BLOB, KEY_BLOCK_ENTRY_TYPE_DELETED,
+        KEY_BLOCK_ENTRY_TYPE_MEDIUM, KEY_BLOCK_ENTRY_TYPE_SMALL,
+    },
 };

 /// The maximum number of entries that should go into a single key block
-const MAX_KEY_BLOCK_ENTRIES: usize = 100 * 1024;
+const MAX_KEY_BLOCK_ENTRIES: usize = MAX_KEY_BLOCK_SIZE / KEY_BLOCK_ENTRY_META_OVERHEAD;
 /// The maximum bytes that should go into a single key block
 // Note this must fit into 3 bytes length
 const MAX_KEY_BLOCK_SIZE: usize = 16 * 1024;
 /// Overhead of bytes that should be counted for entries in a key block in addition to the key size
 const KEY_BLOCK_ENTRY_META_OVERHEAD: usize = 8;
 /// The maximum number of entries that should go into a single small value block
-const MAX_SMALL_VALUE_BLOCK_ENTRIES: usize = 100 * 1024;
+const MAX_SMALL_VALUE_BLOCK_ENTRIES: usize = MAX_SMALL_VALUE_BLOCK_SIZE;
 /// The maximum bytes that should go into a single small value block
-const MAX_SMALL_VALUE_BLOCK_SIZE: usize = 16 * 1024;
+const MAX_SMALL_VALUE_BLOCK_SIZE: usize = 64 * 1024;
 /// The aimed false positive rate for the AMQF
 const AMQF_FALSE_POSITIVE_RATE: f64 = 0.01;
-/// The maximum compression dictionary size for value blocks
-const VALUE_COMPRESSION_DICTIONARY_SIZE: usize = 64 * 1024 - 1;
 /// The maximum compression dictionary size for key and index blocks
 const KEY_COMPRESSION_DICTIONARY_SIZE: usize = 64 * 1024 - 1;
-/// The maximum bytes that should be selected as value samples to create a compression dictionary
-const VALUE_COMPRESSION_SAMPLES_SIZE: usize = 256 * 1024;
 /// The maximum bytes that should be selected as key samples to create a compression dictionary
 const KEY_COMPRESSION_SAMPLES_SIZE: usize = 256 * 1024;
-/// The minimum bytes that should be selected as value samples. Below that no compression dictionary
-/// is used.
-const MIN_VALUE_COMPRESSION_SAMPLES_SIZE: usize = 1024;
 /// The minimum bytes that should be selected as key samples. Below that no compression dictionary
 /// is used.
 const MIN_KEY_COMPRESSION_SAMPLES_SIZE: usize = 1024;
-/// The bytes that are used per key/value entry for a sample.
-const COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY: usize = 100;
 /// The minimum bytes that are used per key/value entry for a sample.
 const MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY: usize = 16;
@@ -68,6 +61,11 @@ pub enum EntryValue<'l> {
     Small { value: &'l [u8] },
     /// Medium-sized value. They are stored in their own value block.
     Medium { value: &'l [u8] },
+    /// Medium-sized value. They are stored in their own value block. Precompressed.
+    MediumCompressed {
+        uncompressed_size: u32,
+        block: &'l [u8],
+    },
     /// Large-sized value. They are stored in a blob file.
     Large { blob: u32 },
     /// Tombstone. The value was removed.
@@ -84,8 +82,6 @@ pub struct StaticSortedFileBuilderMeta<'a> {
     pub amqf: Cow<'a, [u8]>,
     /// The key compression dictionary
     pub key_compression_dictionary_length: u16,
-    /// The value compression dictionary
-    pub value_compression_dictionary_length: u16,
     /// The number of blocks in the SST file
     pub block_count: u16,
     /// The file size of the SST file
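`MediumCompressed` extends the existing size classes: small values share a block, medium values get a block of their own (possibly passed through precompressed), and large values live in blob files. An illustrative classifier under assumed thresholds (the crate's real cutoffs are `MAX_SMALL_VALUE_BLOCK_SIZE` and `MAX_MEDIUM_VALUE_SIZE`):

```rust
/// Illustration only: the thresholds here are assumptions, not the crate's
/// constants, and a real blob id would come from writing the blob file first.
fn classify(value: &[u8]) -> EntryValue<'_> {
    const ASSUMED_SMALL_MAX: usize = 1024; // real code: must fit a shared block
    const ASSUMED_MEDIUM_MAX: usize = 16 * 1024 * 1024; // real code: MAX_MEDIUM_VALUE_SIZE
    if value.len() <= ASSUMED_SMALL_MAX {
        EntryValue::Small { value }
    } else if value.len() <= ASSUMED_MEDIUM_MAX {
        EntryValue::Medium { value }
    } else {
        EntryValue::Large { blob: 0 /* hypothetical blob sequence number */ }
    }
}
```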
@@ -97,21 +93,18 @@
 pub fn write_static_stored_file<E: Entry>(
     entries: &[E],
     total_key_size: usize,
-    total_value_size: usize,
     file: &Path,
 ) -> Result<(StaticSortedFileBuilderMeta<'static>, File)> {
     debug_assert!(entries.iter().map(|e| e.key_hash()).is_sorted());

     let mut file = BufWriter::new(File::create(file)?);

-    let capacity = get_compression_buffer_capacity(total_key_size, total_value_size);
+    let capacity = get_compression_buffer_capacity(total_key_size);
     // We use a shared buffer for all operations to avoid excessive allocations
     let mut buffer = Vec::with_capacity(capacity);

     let key_dict = compute_key_compression_dictionary(entries, total_key_size, &mut buffer)?;
-    let value_dict = compute_value_compression_dictionary(entries, total_value_size, &mut buffer)?;

     file.write_all(&key_dict)?;
-    file.write_all(&value_dict)?;

     let mut block_writer = BlockWriter::new(&mut file, &mut buffer);
@@ -121,7 +114,7 @@ pub fn write_static_stored_file(
     let mut buffer = Vec::new();

     let min_hash = entries.first().map_or(u64::MAX, |e| e.key_hash());
-    let value_locations = write_value_blocks(entries, &value_dict, &mut block_writer, &mut buffer)
+    let value_locations = write_value_blocks(entries, &mut block_writer, &mut buffer)
         .context("Failed to write value blocks")?;
     let amqf = write_key_blocks_and_compute_amqf(
         entries,
@@ -144,7 +137,6 @@ pub fn write_static_stored_file(
         max_hash,
         amqf: Cow::Owned(amqf),
         key_compression_dictionary_length: key_dict.len().try_into().unwrap(),
-        value_compression_dictionary_length: value_dict.len().try_into().unwrap(),
         block_count,
         size: file.stream_position()?,
         entries: entries.len() as u64,
@@ -152,18 +144,11 @@ pub fn write_static_stored_file(
     Ok((meta, file.into_inner()?))
 }

-fn get_compression_buffer_capacity(total_key_size: usize, total_value_size: usize) -> usize {
-    let mut size = 0;
-    if total_key_size >= MIN_KEY_COMPRESSION_SAMPLES_SIZE {
-        let key_compression_samples_size = min(KEY_COMPRESSION_SAMPLES_SIZE, total_key_size / 16);
-        size = key_compression_samples_size;
-    }
-    if total_value_size >= MIN_VALUE_COMPRESSION_SAMPLES_SIZE {
-        let value_compression_samples_size =
-            min(VALUE_COMPRESSION_SAMPLES_SIZE, total_value_size / 16);
-        size = size.max(value_compression_samples_size);
+fn get_compression_buffer_capacity(total_key_size: usize) -> usize {
+    if total_key_size < MIN_KEY_COMPRESSION_SAMPLES_SIZE {
+        return 0;
     }
-    size
+    min(KEY_COMPRESSION_SAMPLES_SIZE, total_key_size / 16)
 }

 /// Computes compression dictionaries from keys of all entries
@@ -173,23 +158,28 @@ fn compute_key_compression_dictionary(
     total_key_size: usize,
     buffer: &mut Vec<u8>,
 ) -> Result<Vec<u8>> {
-    if total_key_size < MIN_KEY_COMPRESSION_SAMPLES_SIZE {
+    let key_compression_samples_size = get_compression_buffer_capacity(total_key_size);
+    if key_compression_samples_size == 0 {
         return Ok(Vec::new());
     }
-    let key_compression_samples_size = min(KEY_COMPRESSION_SAMPLES_SIZE, total_key_size / 16);
+
+    let max_sample_size = max(
+        MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY,
+        key_compression_samples_size / 1024,
+    );
+
     let mut sample_sizes = Vec::new();
-    // Limit the number of iterations to avoid infinite loops
-    let max_iterations = total_key_size / COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY * 2;
-    for i in 0..max_iterations {
-        let entry = &entries[i % entries.len()];
+    for entry in entries {
        let key_remaining = key_compression_samples_size - buffer.len();
         if key_remaining < MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
             break;
         }
         let len = entry.key_len();
         if len >= MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
-            let used_len = min(key_remaining, COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY);
+            let optimal_len =
+                (len / 8).clamp(MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY, max_sample_size);
+            let used_len = min(key_remaining, optimal_len);
             if len <= used_len {
                 sample_sizes.push(len);
                 entry.write_key_to(buffer);
@@ -204,57 +194,12 @@ fn compute_key_compression_dictionary(
             }
         }
     }
-    debug_assert!(buffer.len() == sample_sizes.iter().sum::<usize>());
-    let result = if buffer.len() > MIN_KEY_COMPRESSION_SAMPLES_SIZE && sample_sizes.len() > 5 {
-        zstd::dict::from_continuous(buffer, &sample_sizes, KEY_COMPRESSION_DICTIONARY_SIZE)
-            .context("Key dictionary creation failed")?
-    } else {
-        Vec::new()
-    };
-    buffer.clear();
-    Ok(result)
-}
-
-/// Computes compression dictionaries from values of all entries
-#[tracing::instrument(level = "trace", skip(entries))]
-fn compute_value_compression_dictionary<E: Entry>(
-    entries: &[E],
-    total_value_size: usize,
-    buffer: &mut Vec<u8>,
-) -> Result<Vec<u8>> {
-    if total_value_size < MIN_VALUE_COMPRESSION_SAMPLES_SIZE {
-        return Ok(Vec::new());
-    }
-    let value_compression_samples_size = min(VALUE_COMPRESSION_SAMPLES_SIZE, total_value_size / 16);
-    let mut sample_sizes = Vec::new();
-
-    // Limit the number of iterations to avoid infinite loops
-    let max_iterations = total_value_size / COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY * 2;
-    for i in 0..max_iterations {
-        let entry = &entries[i % entries.len()];
-        let remaining = value_compression_samples_size - buffer.len();
-        if remaining < MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
-            break;
-        }
-        if let EntryValue::Small { value } | EntryValue::Medium { value } = entry.value() {
-            let len = value.len();
-            if len >= MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
-                let used_len = min(remaining, COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY);
-                if len <= used_len {
-                    sample_sizes.push(len);
-                    buffer.extend_from_slice(value);
-                } else {
-                    sample_sizes.push(used_len);
-                    let p = buffer.len() % (len - used_len);
-                    buffer.extend_from_slice(&value[p..p + used_len]);
-                };
-            }
-        }
-    }
-    debug_assert!(buffer.len() == sample_sizes.iter().sum::<usize>());
-    let result = if buffer.len() > MIN_VALUE_COMPRESSION_SAMPLES_SIZE && sample_sizes.len() > 5 {
-        zstd::dict::from_continuous(buffer, &sample_sizes, VALUE_COMPRESSION_DICTIONARY_SIZE)
-            .context("Value dictionary creation failed")?
+    /// The zstd dict builder requires at least 7 samples
+    const MIN_SAMPLE_SIZE: usize = 7;
+    let result = if buffer.len() > MIN_KEY_COMPRESSION_SAMPLES_SIZE
+        && sample_sizes.len() > MIN_SAMPLE_SIZE
+    {
+        zstd::dict::from_continuous(buffer, &sample_sizes, KEY_COMPRESSION_DICTIONARY_SIZE)?
     } else {
         Vec::new()
     };
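Dictionary training goes through `zstd::dict::from_continuous`, which takes one continuous sample buffer plus each sample's length. A self-contained illustration of that call (sample data invented here):

```rust
use anyhow::{Context, Result};

/// Train a shared compression dictionary from key samples, mirroring the
/// continuous-buffer sampling scheme above.
fn train_key_dictionary(keys: &[&[u8]], max_dict_size: usize) -> Result<Vec<u8>> {
    let mut buffer = Vec::new();
    let mut sample_sizes = Vec::new();
    for key in keys {
        buffer.extend_from_slice(key);
        sample_sizes.push(key.len());
    }
    // zstd needs a handful of samples to train anything useful, hence the
    // MIN_SAMPLE_SIZE guard above.
    zstd::dict::from_continuous(&buffer, &sample_sizes, max_dict_size)
        .context("Key dictionary creation failed")
}
```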
@@ -293,25 +238,31 @@ impl<'l> BlockWriter<'l> {
     #[tracing::instrument(level = "trace", skip_all)]
     fn write_key_block(&mut self, block: &[u8], dict: &[u8]) -> Result<()> {
-        self.write_block(block, dict)
+        self.write_block(block, Some(dict), false)
             .context("Failed to write key block")
     }

     #[tracing::instrument(level = "trace", skip_all)]
     fn write_index_block(&mut self, block: &[u8], dict: &[u8]) -> Result<()> {
-        self.write_block(block, dict)
+        self.write_block(block, Some(dict), false)
             .context("Failed to write index block")
     }

     #[tracing::instrument(level = "trace", skip_all)]
-    fn write_value_block(&mut self, block: &[u8], dict: &[u8]) -> Result<()> {
-        self.write_block(block, dict)
+    fn write_small_value_block(&mut self, block: &[u8]) -> Result<()> {
+        self.write_block(block, None, false)
+            .context("Failed to write small value block")
+    }
+
+    #[tracing::instrument(level = "trace", skip_all)]
+    fn write_value_block(&mut self, block: &[u8]) -> Result<()> {
+        self.write_block(block, None, true)
             .context("Failed to write value block")
     }

-    fn write_block(&mut self, block: &[u8], dict: &[u8]) -> Result<()> {
+    fn write_block(&mut self, block: &[u8], dict: Option<&[u8]>, long_term: bool) -> Result<()> {
         let uncompressed_size = block.len().try_into().unwrap();
-        self.compress_block_into_buffer(block, dict);
+        self.compress_block_into_buffer(block, dict, long_term)?;
         let len = (self.buffer.len() + 4).try_into().unwrap();
         let offset = self
             .block_offsets
@@ -332,15 +283,34 @@ impl<'l> BlockWriter<'l> {
         Ok(())
     }

+    fn write_compressed_block(&mut self, uncompressed_size: u32, block: &[u8]) -> Result<()> {
+        let len = (block.len() + 4).try_into().unwrap();
+        let offset = self
+            .block_offsets
+            .last()
+            .copied()
+            .unwrap_or_default()
+            .checked_add(len)
+            .expect("Block offset overflow");
+        self.block_offsets.push(offset);
+
+        self.writer
+            .write_u32::<BE>(uncompressed_size)
+            .context("Failed to write uncompressed size")?;
+        self.writer
+            .write_all(block)
+            .context("Failed to write compressed block")?;
+        Ok(())
+    }
+
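Whether freshly compressed (`write_block`) or passed through (`write_compressed_block`), every block uses the same framing: a 4-byte big-endian uncompressed length followed by the compressed payload, which is exactly what `get_compressed_block` slices back out of the mmap. A minimal round trip of that framing:

```rust
use std::io::{self, Write};

use byteorder::{BE, ByteOrder, WriteBytesExt};

/// Write one framed block: u32 BE uncompressed length + compressed bytes.
fn frame_block(uncompressed_len: u32, compressed: &[u8], out: &mut Vec<u8>) -> io::Result<()> {
    out.write_u32::<BE>(uncompressed_len)?;
    out.write_all(compressed)
}

/// Split a framed block back into (uncompressed length, compressed payload).
fn read_frame(frame: &[u8]) -> (u32, &[u8]) {
    (BE::read_u32(&frame[..4]), &frame[4..])
}
```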
     /// Compresses a block with a compression dictionary.
-    #[tracing::instrument(level = "trace", skip_all)]
-    fn compress_block_into_buffer(&mut self, block: &[u8], dict: &[u8]) {
-        let mut compressor =
-            lzzzz::lz4::Compressor::with_dict(dict).expect("LZ4 compressor creation failed");
-        self.buffer.reserve(max_compressed_size(block.len()));
-        compressor
-            .next_to_vec(block, self.buffer, ACC_LEVEL_DEFAULT)
-            .expect("Compression failed");
+    fn compress_block_into_buffer(
+        &mut self,
+        block: &[u8],
+        dict: Option<&[u8]>,
+        long_term: bool,
+    ) -> Result<()> {
+        compress_into_buffer(block, dict, long_term, self.buffer)
     }
 }

@@ -348,7 +318,6 @@ impl<'l> BlockWriter<'l> {
 #[tracing::instrument(level = "trace", skip_all)]
 fn write_value_blocks(
     entries: &[impl Entry],
-    value_compression_dictionary: &[u8],
     writer: &mut BlockWriter<'_>,
     buffer: &mut Vec<u8>,
 ) -> Result<Vec<(u16, u32)>> {
@@ -371,7 +340,7 @@ fn write_value_blocks(
                     value_locations[j].0 = block_index;
                 }
             }
-            writer.write_value_block(buffer, value_compression_dictionary)?;
+            writer.write_small_value_block(buffer)?;
             buffer.clear();
             current_block_start = i;
             current_block_size = 0;
@@ -384,9 +353,17 @@ fn write_value_blocks(
             EntryValue::Medium { value } => {
                 let block_index = writer.next_block_index();
                 value_locations.push((block_index, 0));
-                writer.write_value_block(value, value_compression_dictionary)?;
+                writer.write_value_block(value)?;
+            }
+            EntryValue::MediumCompressed {
+                uncompressed_size,
+                block,
+            } => {
+                let block_index = writer.next_block_index();
+                value_locations.push((block_index, 0));
+                writer.write_compressed_block(uncompressed_size, block)?;
             }
-            _ => {
+            EntryValue::Deleted | EntryValue::Large { .. } => {
                 value_locations.push((0, 0));
             }
         }
@@ -400,7 +377,7 @@ fn write_value_blocks(
             value_locations[j].0 = block_index;
         }
     }
-    writer.write_value_block(buffer, value_compression_dictionary)?;
+    writer.write_small_value_block(buffer)?;
     buffer.clear();
 }

@@ -438,7 +415,7 @@ fn write_key_blocks_and_compute_amqf(
                     value.len().try_into().unwrap(),
                 );
             }
-            EntryValue::Medium { .. } => {
+            EntryValue::Medium { .. } | EntryValue::MediumCompressed { .. } => {
                 block.put_medium(entry, value_location.0);
             }
             EntryValue::Large { blob } => {
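The tests below drive the database through a rayon-backed implementation of the crate's new `ParallelScheduler` abstraction, which is how the library itself drops its hard rayon dependency. Reconstructed from the impls in this diff, the trait is roughly the following (exact bounds and the remaining methods are assumptions):

```rust
/// Assumed shape of `turbo_persistence::ParallelScheduler`, reconstructed from
/// the implementations in this diff; not the crate's verbatim definition.
pub trait ParallelScheduler: Clone + Send + Sync {
    fn block_in_place<R: Send>(&self, f: impl FnOnce() -> R + Send) -> R;

    fn parallel_for_each<T: Sync>(&self, items: &[T], f: impl Fn(&T) + Send + Sync);

    fn try_vec_into_parallel_for_each<T: Send + Sync, E: Send>(
        &self,
        items: Vec<T>,
        f: impl (Fn(T) -> Result<(), E>) + Send + Sync,
    ) -> Result<(), E>;

    // ...plus try_parallel_for_each, try_parallel_for_each_mut,
    // parallel_map_collect and vec_into_parallel_map_collect, as implemented below.
}
```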
diff --git a/turbopack/crates/turbo-persistence/src/tests.rs b/turbopack/crates/turbo-persistence/src/tests.rs
index 5c123611d8759..dc9e7edb36374 100644
--- a/turbopack/crates/turbo-persistence/src/tests.rs
+++ b/turbopack/crates/turbo-persistence/src/tests.rs
@@ -6,28 +6,123 @@
 use rayon::iter::{IntoParallelIterator, ParallelIterator};

 use crate::{
     constants::MAX_MEDIUM_VALUE_SIZE,
     db::{CompactConfig, TurboPersistence},
+    parallel_scheduler::ParallelScheduler,
     write_batch::WriteBatch,
 };

+#[derive(Clone, Copy)]
+struct RayonParallelScheduler;
+
+impl ParallelScheduler for RayonParallelScheduler {
+    fn block_in_place<R>(&self, f: impl FnOnce() -> R + Send) -> R
+    where
+        R: Send,
+    {
+        f()
+    }
+
+    fn parallel_for_each<T>(&self, items: &[T], f: impl Fn(&T) + Send + Sync)
+    where
+        T: Sync,
+    {
+        items.into_par_iter().for_each(f);
+    }
+
+    fn try_parallel_for_each<'l, T, E>(
+        &self,
+        items: &'l [T],
+        f: impl (Fn(&'l T) -> Result<(), E>) + Send + Sync,
+    ) -> Result<(), E>
+    where
+        T: Sync,
+        E: Send,
+    {
+        items.into_par_iter().try_for_each(f)
+    }
+
+    fn try_parallel_for_each_mut<'l, T, E>(
+        &self,
+        items: &'l mut [T],
+        f: impl (Fn(&'l mut T) -> Result<(), E>) + Send + Sync,
+    ) -> Result<(), E>
+    where
+        T: Send + Sync,
+        E: Send,
+    {
+        items.into_par_iter().try_for_each(f)
+    }
+
+    fn try_vec_into_parallel_for_each<T, E>(
+        &self,
+        items: Vec<T>,
+        f: impl (Fn(T) -> Result<(), E>) + Send + Sync,
+    ) -> Result<(), E>
+    where
+        T: Send + Sync,
+        E: Send,
+    {
+        items.into_par_iter().try_for_each(f)
+    }
+
+    fn parallel_map_collect<'l, T, I, R>(
+        &self,
+        items: &'l [T],
+        f: impl Fn(&'l T) -> I + Send + Sync,
+    ) -> R
+    where
+        T: Sync,
+        I: Send + Sync,
+        R: FromIterator<I>,
+    {
+        items
+            .into_par_iter()
+            .map(f)
+            .collect_vec_list()
+            .into_iter()
+            .flatten()
+            .collect()
+    }
+
+    fn vec_into_parallel_map_collect<T, I, R>(
+        &self,
+        items: Vec<T>,
+        f: impl Fn(T) -> I + Send + Sync,
+    ) -> R
+    where
+        T: Send + Sync,
+        I: Send + Sync,
+        R: FromIterator<I>,
+    {
+        items
+            .into_par_iter()
+            .map(f)
+            .collect_vec_list()
+            .into_iter()
+            .flatten()
+            .collect()
+    }
+}
+
 #[test]
 fn full_cycle() -> Result<()> {
     let mut test_cases = Vec::new();
     type TestCases = Vec<(
         &'static str,
-        Box<dyn Fn(&mut WriteBatch<Vec<u8>, 16>) -> Result<()>>,
-        Box<dyn Fn(&TurboPersistence) -> Result<()>>,
+        Box<dyn Fn(&mut WriteBatch<Vec<u8>, RayonParallelScheduler, 16>) -> Result<()>>,
+        Box<dyn Fn(&TurboPersistence<RayonParallelScheduler>) -> Result<()>>,
     )>;

     fn test_case(
         test_cases: &mut TestCases,
         name: &'static str,
-        write: impl Fn(&mut WriteBatch<Vec<u8>, 16>) -> Result<()> + 'static,
-        read: impl Fn(&TurboPersistence) -> Result<()> + 'static,
+        write: impl Fn(&mut WriteBatch<Vec<u8>, RayonParallelScheduler, 16>) -> Result<()> + 'static,
+        read: impl Fn(&TurboPersistence<RayonParallelScheduler>) -> Result<()> + 'static,
     ) {
         test_cases.push((
             name,
-            Box::new(write) as Box<dyn Fn(&mut WriteBatch<Vec<u8>, 16>) -> Result<()>>,
-            Box::new(read) as Box<dyn Fn(&TurboPersistence) -> Result<()>>,
+            Box::new(write)
+                as Box<dyn Fn(&mut WriteBatch<Vec<u8>, RayonParallelScheduler, 16>) -> Result<()>>,
+            Box::new(read) as Box<dyn Fn(&TurboPersistence<RayonParallelScheduler>) -> Result<()>>,
         ));
     }

@@ -215,7 +310,10 @@ fn full_cycle() -> Result<()> {

     {
         let start = Instant::now();
-        let db = TurboPersistence::open(path.to_path_buf())?;
+        let db = TurboPersistence::open_with_parallel_scheduler(
+            path.to_path_buf(),
+            RayonParallelScheduler,
+        )?;
         let mut batch = db.write_batch()?;
         write(&mut batch)?;
         db.commit_write_batch(batch)?;
@@ -231,7 +329,10 @@ fn full_cycle() -> Result<()> {
     }
     {
         let start = Instant::now();
-        let db = TurboPersistence::open(path.to_path_buf())?;
+        let db = TurboPersistence::open_with_parallel_scheduler(
+            path.to_path_buf(),
+            RayonParallelScheduler,
+
)?; println!("{name} restore time: {:?}", start.elapsed()); let start = Instant::now(); read(&db)?; @@ -257,7 +358,10 @@ fn full_cycle() -> Result<()> { } { let start = Instant::now(); - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; println!("{name} restore time after compact: {:?}", start.elapsed()); let start = Instant::now(); read(&db)?; @@ -291,7 +395,10 @@ fn full_cycle() -> Result<()> { { let start = Instant::now(); - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; let mut batch = db.write_batch()?; for (_, write, _) in test_cases.iter() { write(&mut batch)?; @@ -311,7 +418,10 @@ fn full_cycle() -> Result<()> { } { let start = Instant::now(); - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; println!("All restore time: {:?}", start.elapsed()); for (name, _, read) in test_cases.iter() { let start = Instant::now(); @@ -343,7 +453,10 @@ fn full_cycle() -> Result<()> { { let start = Instant::now(); - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; println!("All restore time after compact: {:?}", start.elapsed()); for (name, _, read) in test_cases.iter() { @@ -383,13 +496,17 @@ fn persist_changes() -> Result<()> { let path = tempdir.path(); const READ_COUNT: u32 = 2_000; // we'll read every 10th value, so writes are 10x this value - fn put(b: &WriteBatch<(u8, [u8; 4]), 1>, key: u8, value: u8) -> Result<()> { + fn put( + b: &WriteBatch<(u8, [u8; 4]), RayonParallelScheduler, 1>, + key: u8, + value: u8, + ) -> Result<()> { for i in 0..(READ_COUNT * 10) { b.put(0, (key, i.to_be_bytes()), vec![value].into())?; } Ok(()) } - fn check(db: &TurboPersistence, key: u8, value: u8) -> Result<()> { + fn check(db: &TurboPersistence, key: u8, value: u8) -> Result<()> { for i in 0..READ_COUNT { // read every 10th item let i = i * 10; @@ -402,7 +519,10 @@ fn persist_changes() -> Result<()> { } { - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; let b = db.write_batch::<_, 1>()?; put(&b, 1, 11)?; put(&b, 2, 21)?; @@ -418,7 +538,10 @@ fn persist_changes() -> Result<()> { println!("---"); { - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; let b = db.write_batch::<_, 1>()?; put(&b, 1, 12)?; put(&b, 2, 22)?; @@ -432,7 +555,10 @@ fn persist_changes() -> Result<()> { } { - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; let b = db.write_batch::<_, 1>()?; put(&b, 1, 13)?; db.commit_write_batch(b)?; @@ -446,7 +572,10 @@ fn persist_changes() -> Result<()> { println!("---"); { - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; check(&db, 1, 13)?; check(&db, 2, 22)?; @@ -457,7 +586,10 @@ fn persist_changes() -> Result<()> { println!("---"); { - let db = 
TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; db.compact(&CompactConfig { optimal_merge_count: 4, @@ -475,7 +607,10 @@ fn persist_changes() -> Result<()> { println!("---"); { - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; check(&db, 1, 13)?; check(&db, 2, 22)?; @@ -493,13 +628,17 @@ fn partial_compaction() -> Result<()> { let path = tempdir.path(); const READ_COUNT: u32 = 2_000; // we'll read every 10th value, so writes are 10x this value - fn put(b: &WriteBatch<(u8, [u8; 4]), 1>, key: u8, value: u8) -> Result<()> { + fn put( + b: &WriteBatch<(u8, [u8; 4]), RayonParallelScheduler, 1>, + key: u8, + value: u8, + ) -> Result<()> { for i in 0..(READ_COUNT * 10) { b.put(0, (key, i.to_be_bytes()), vec![value].into())?; } Ok(()) } - fn check(db: &TurboPersistence, key: u8, value: u8) -> Result<()> { + fn check(db: &TurboPersistence, key: u8, value: u8) -> Result<()> { for i in 0..READ_COUNT { // read every 10th item let i = i * 10; @@ -516,7 +655,10 @@ fn partial_compaction() -> Result<()> { println!("--- Iteration {i} ---"); println!("Add more entries"); { - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; let b = db.write_batch::<_, 1>()?; put(&b, i, i)?; put(&b, i + 1, i)?; @@ -535,7 +677,10 @@ fn partial_compaction() -> Result<()> { println!("Compaction"); { - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; db.compact(&CompactConfig { optimal_merge_count: 4, @@ -556,7 +701,10 @@ fn partial_compaction() -> Result<()> { println!("Restore check"); { - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; for j in 0..i { check(&db, j, j)?; @@ -580,7 +728,11 @@ fn merge_file_removal() -> Result<()> { let _ = fs::remove_dir_all(path); const READ_COUNT: u32 = 2_000; // we'll read every 10th value, so writes are 10x this value - fn put(b: &WriteBatch<(u8, [u8; 4]), 1>, key: u8, value: u32) -> Result<()> { + fn put( + b: &WriteBatch<(u8, [u8; 4]), RayonParallelScheduler, 1>, + key: u8, + value: u32, + ) -> Result<()> { for i in 0..(READ_COUNT * 10) { b.put( 0, @@ -590,7 +742,7 @@ fn merge_file_removal() -> Result<()> { } Ok(()) } - fn check(db: &TurboPersistence, key: u8, value: u32) -> Result<()> { + fn check(db: &TurboPersistence, key: u8, value: u32) -> Result<()> { for i in 0..READ_COUNT { // read every 10th item let i = i * 10; @@ -608,7 +760,10 @@ fn merge_file_removal() -> Result<()> { { println!("--- Init ---"); - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; let b = db.write_batch::<_, 1>()?; for j in 0..=255 { put(&b, j, 0)?; @@ -624,7 +779,10 @@ fn merge_file_removal() -> Result<()> { let i = i * 37; println!("Add more entries"); { - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; let b = db.write_batch::<_, 1>()?; for j in iter_bits(i) { println!("Put {j} = {i}"); @@ -642,7 +800,10 
@@ fn merge_file_removal() -> Result<()> { println!("Compaction"); { - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; db.compact(&CompactConfig { optimal_merge_count: 4, @@ -660,7 +821,10 @@ fn merge_file_removal() -> Result<()> { println!("Restore check"); { - let db = TurboPersistence::open(path.to_path_buf())?; + let db = TurboPersistence::open_with_parallel_scheduler( + path.to_path_buf(), + RayonParallelScheduler, + )?; for j in 0..32 { check(&db, j, expected_values[j as usize])?; diff --git a/turbopack/crates/turbo-persistence/src/write_batch.rs b/turbopack/crates/turbo-persistence/src/write_batch.rs index 490cf38e88a90..085b1f14a0ccd 100644 --- a/turbopack/crates/turbo-persistence/src/write_batch.rs +++ b/turbopack/crates/turbo-persistence/src/write_batch.rs @@ -9,23 +9,20 @@ use std::{ use anyhow::{Context, Result}; use byteorder::{BE, WriteBytesExt}; -use lzzzz::lz4::{self, ACC_LEVEL_DEFAULT}; +use either::Either; use parking_lot::Mutex; -use rayon::{ - iter::{Either, IndexedParallelIterator, IntoParallelIterator, ParallelIterator}, - scope, -}; use smallvec::SmallVec; use thread_local::ThreadLocal; -use tracing::Span; use crate::{ ValueBuffer, collector::Collector, collector_entry::CollectorEntry, + compression::compress_into_buffer, constants::{MAX_MEDIUM_VALUE_SIZE, THREAD_LOCAL_SIZE_SHIFT}, key::StoreKey, meta_file_builder::MetaFileBuilder, + parallel_scheduler::ParallelScheduler, static_sorted_file_builder::{StaticSortedFileBuilderMeta, write_static_stored_file}, }; @@ -68,7 +65,9 @@ enum GlobalCollectorState { } /// A write batch. -pub struct WriteBatch { +pub struct WriteBatch { + /// Parallel scheduler + parallel_scheduler: S, /// The database path db_path: PathBuf, /// The current sequence number counter. Increased for every new SST file or blob file. @@ -84,13 +83,16 @@ pub struct WriteBatch { new_sst_files: Mutex>, } -impl WriteBatch { +impl + WriteBatch +{ /// Creates a new write batch for a database. - pub(crate) fn new(path: PathBuf, current: u32) -> Self { + pub(crate) fn new(path: PathBuf, current: u32, parallel_scheduler: S) -> Self { const { assert!(FAMILIES <= usize_from_u32(u32::MAX)); }; Self { + parallel_scheduler, db_path: path, current_sequence_number: AtomicU32::new(current), thread_locals: ThreadLocal::new(), @@ -223,13 +225,12 @@ impl WriteBatch { } } - let span = Span::current(); - collectors.into_par_iter().try_for_each(|mut collector| { - let _span = span.clone().entered(); - self.flush_thread_local_collector(family, &mut collector)?; - drop(collector); - anyhow::Ok(()) - })?; + self.parallel_scheduler + .try_vec_into_parallel_for_each(collectors, |mut collector| { + self.flush_thread_local_collector(family, &mut collector)?; + drop(collector); + anyhow::Ok(()) + })?; // Now we flush the global collector(s). 
let mut collector_state = self.collectors[usize_from_u32(family)].lock(); @@ -242,22 +243,22 @@ impl WriteBatch { } } GlobalCollectorState::Sharded(_) => { - let GlobalCollectorState::Sharded(shards) = replace( + let GlobalCollectorState::Sharded(mut shards) = replace( &mut *collector_state, GlobalCollectorState::Unsharded(Collector::new()), ) else { unreachable!(); }; - shards.into_par_iter().try_for_each(|mut collector| { - let _span = span.clone().entered(); - if !collector.is_empty() { - let sst = self.create_sst_file(family, collector.sorted())?; - collector.clear(); - self.new_sst_files.lock().push(sst); - drop(collector); - } - anyhow::Ok(()) - })?; + self.parallel_scheduler + .try_parallel_for_each_mut(&mut shards, |collector| { + if !collector.is_empty() { + let sst = self.create_sst_file(family, collector.sorted())?; + collector.clear(); + self.new_sst_files.lock().push(sst); + collector.drop_contents(); + } + anyhow::Ok(()) + })?; } } @@ -269,10 +270,9 @@ impl WriteBatch { #[tracing::instrument(level = "trace", skip(self))] pub(crate) fn finish(&mut self) -> Result { let mut new_blob_files = Vec::new(); - let shared_error = Mutex::new(Ok(())); // First, we flush all thread local collectors to the global collectors. - scope(|scope| { + { let _span = tracing::trace_span!("flush thread local collectors").entered(); let mut collectors = [const { Vec::new() }; FAMILIES]; for cell in self.thread_locals.iter_mut() { @@ -286,23 +286,24 @@ impl WriteBatch { } } } - for (family, thread_local_collectors) in collectors.into_iter().enumerate() { - for mut collector in thread_local_collectors { - let this = &self; - let shared_error = &shared_error; - let span = Span::current(); - scope.spawn(move |_| { - let _span = span.entered(); - if let Err(err) = - this.flush_thread_local_collector(family as u32, &mut collector) - { - *shared_error.lock() = Err(err); - } - drop(collector); - }); - } - } - }); + let to_flush = collectors + .into_iter() + .enumerate() + .flat_map(|(family, collector)| { + collector + .into_iter() + .map(move |collector| (family as u32, collector)) + }) + .collect::>(); + self.parallel_scheduler.try_vec_into_parallel_for_each( + to_flush, + |(family, mut collector)| { + self.flush_thread_local_collector(family, &mut collector)?; + drop(collector); + anyhow::Ok(()) + }, + )?; + } let _span = tracing::trace_span!("flush collectors").entered(); @@ -313,25 +314,24 @@ impl WriteBatch { let new_collectors = [(); FAMILIES].map(|_| Mutex::new(GlobalCollectorState::Unsharded(Collector::new()))); let collectors = replace(&mut self.collectors, new_collectors); - let span = Span::current(); - collectors - .into_par_iter() + let collectors = collectors + .into_iter() .enumerate() .flat_map(|(family, state)| { let collector = state.into_inner(); match collector { GlobalCollectorState::Unsharded(collector) => { - Either::Left([(family, collector)].into_par_iter()) + Either::Left([(family, collector)].into_iter()) + } + GlobalCollectorState::Sharded(shards) => { + Either::Right(shards.into_iter().map(move |collector| (family, collector))) } - GlobalCollectorState::Sharded(shards) => Either::Right( - shards - .into_par_iter() - .map(move |collector| (family, collector)), - ), } }) - .try_for_each(|(family, mut collector)| { - let _span = span.clone().entered(); + .collect::>(); + self.parallel_scheduler.try_vec_into_parallel_for_each( + collectors, + |(family, mut collector)| { let family = family as u32; if !collector.is_empty() { let sst = self.create_sst_file(family, 
collector.sorted())?; @@ -340,33 +340,37 @@ impl WriteBatch { shared_new_sst_files.lock().push(sst); } anyhow::Ok(()) - })?; - - shared_error.into_inner()?; + }, + )?; // Not we need to write the new meta files. let new_meta_collectors = [(); FAMILIES].map(|_| Mutex::new(Vec::new())); let meta_collectors = replace(&mut self.meta_collectors, new_meta_collectors); let keys_written = AtomicU64::new(0); - let new_meta_files = meta_collectors - .into_par_iter() + let file_to_write = meta_collectors + .into_iter() .map(|mutex| mutex.into_inner()) .enumerate() .filter(|(_, sst_files)| !sst_files.is_empty()) - .map(|(family, sst_files)| { - let family = family as u32; - let mut entries = 0; - let mut builder = MetaFileBuilder::new(family); - for (seq, sst) in sst_files { - entries += sst.entries; - builder.add(seq, sst); - } - keys_written.fetch_add(entries, Ordering::Relaxed); - let seq = self.current_sequence_number.fetch_add(1, Ordering::SeqCst) + 1; - let file = builder.write(&self.db_path, seq)?; - Ok((seq, file)) - }) - .collect::>>()?; + .collect::>(); + let new_meta_files = self + .parallel_scheduler + .vec_into_parallel_map_collect::<_, _, Result>>( + file_to_write, + |(family, sst_files)| { + let family = family as u32; + let mut entries = 0; + let mut builder = MetaFileBuilder::new(family); + for (seq, sst) in sst_files { + entries += sst.entries; + builder.add(seq, sst); + } + keys_written.fetch_add(entries, Ordering::Relaxed); + let seq = self.current_sequence_number.fetch_add(1, Ordering::SeqCst) + 1; + let file = builder.write(&self.db_path, seq)?; + Ok((seq, file)) + }, + )?; // Finally we return the new files and sequence number. let seq = self.current_sequence_number.load(Ordering::SeqCst); @@ -386,7 +390,7 @@ impl WriteBatch { let seq = self.current_sequence_number.fetch_add(1, Ordering::SeqCst) + 1; let mut buffer = Vec::new(); buffer.write_u32::(value.len() as u32)?; - lz4::compress_to_vec(value, &mut buffer, ACC_LEVEL_DEFAULT) + compress_into_buffer(value, None, true, &mut buffer) .context("Compression of value for blob file failed")?; let file = self.db_path.join(format!("{seq:08}.blob")); @@ -403,15 +407,16 @@ impl WriteBatch { fn create_sst_file( &self, family: u32, - collector_data: (&[CollectorEntry], usize, usize), + collector_data: (&[CollectorEntry], usize), ) -> Result<(u32, File)> { - let (entries, total_key_size, total_value_size) = collector_data; + let (entries, total_key_size) = collector_data; let seq = self.current_sequence_number.fetch_add(1, Ordering::SeqCst) + 1; let path = self.db_path.join(format!("{seq:08}.sst")); - let (meta, file) = - write_static_stored_file(entries, total_key_size, total_value_size, &path) - .with_context(|| format!("Unable to write SST file {seq:08}.sst"))?; + let (meta, file) = self + .parallel_scheduler + .block_in_place(|| write_static_stored_file(entries, total_key_size, &path)) + .with_context(|| format!("Unable to write SST file {seq:08}.sst"))?; #[cfg(feature = "verify_sst_content")] { @@ -433,7 +438,6 @@ impl WriteBatch { StaticSortedFileMetaData { sequence_number: seq, key_compression_dictionary_length: meta.key_compression_dictionary_length, - value_compression_dictionary_length: meta.value_compression_dictionary_length, block_count: meta.block_count, }, )?; diff --git a/turbopack/crates/turbo-tasks-backend/Cargo.toml b/turbopack/crates/turbo-tasks-backend/Cargo.toml index eb3ee57b72093..3554ce1f31b20 100644 --- a/turbopack/crates/turbo-tasks-backend/Cargo.toml +++ b/turbopack/crates/turbo-tasks-backend/Cargo.toml @@ -40,7 
+40,6 @@ once_cell = { workspace = true } parking_lot = { workspace = true } pot = "3.0.0" rand = { workspace = true } -rayon = { workspace = true } ringmap = { workspace = true, features = ["serde"] } rustc-hash = { workspace = true } serde = { workspace = true } diff --git a/turbopack/crates/turbo-tasks-backend/src/backend/mod.rs b/turbopack/crates/turbo-tasks-backend/src/backend/mod.rs index addfc48bdca8f..27d66a2ad86e1 100644 --- a/turbopack/crates/turbo-tasks-backend/src/backend/mod.rs +++ b/turbopack/crates/turbo-tasks-backend/src/backend/mod.rs @@ -64,7 +64,8 @@ use crate::{ InProgressCellState, InProgressState, InProgressStateInner, OutputValue, RootType, }, utils::{ - bi_map::BiMap, chunked_vec::ChunkedVec, ptr_eq_arc::PtrEqArc, sharded::Sharded, swap_retain, + bi_map::BiMap, chunked_vec::ChunkedVec, dash_map_drop_contents::drop_contents, + ptr_eq_arc::PtrEqArc, sharded::Sharded, swap_retain, }, }; @@ -1216,6 +1217,9 @@ impl TurboTasksBackendInner { self.is_idle.store(false, Ordering::Release); self.verify_aggregation_graph(turbo_tasks, false); } + self.task_cache.drop_contents(); + drop_contents(&self.transient_tasks); + self.storage.drop_contents(); if let Err(err) = self.backing_storage.shutdown() { println!("Shutting down failed: {err}"); } @@ -1267,7 +1271,6 @@ impl TurboTasksBackendInner { return task_id; } - self.track_cache_miss(&task_type); let tx = self .should_restore() .then(|| self.backing_storage.start_read_transaction()) @@ -1279,6 +1282,7 @@ impl TurboTasksBackendInner { .forward_lookup_task_cache(tx.as_ref(), &task_type) .expect("Failed to lookup task id") } { + self.track_cache_hit(&task_type); let _ = self.task_cache.try_insert(Arc::new(task_type), task_id); task_id } else { @@ -1287,12 +1291,14 @@ impl TurboTasksBackendInner { let task_id = if let Err(existing_task_id) = self.task_cache.try_insert(task_type.clone(), task_id) { + self.track_cache_hit(&task_type); // Safety: We just created the id and failed to insert it. unsafe { self.persisted_task_id_factory.reuse(task_id); } existing_task_id } else { + self.track_cache_miss(&task_type); task_id }; if let Some(log) = &self.persisted_task_cache_log { @@ -1327,10 +1333,10 @@ impl TurboTasksBackendInner { return task_id; } - self.track_cache_miss(&task_type); let task_type = Arc::new(task_type); let task_id = self.transient_task_id_factory.get(); - if let Err(existing_task_id) = self.task_cache.try_insert(task_type, task_id) { + if let Err(existing_task_id) = self.task_cache.try_insert(task_type.clone(), task_id) { + self.track_cache_hit(&task_type); // Safety: We just created the id and failed to insert it. 
unsafe { self.transient_task_id_factory.reuse(task_id); @@ -1339,6 +1345,7 @@ impl TurboTasksBackendInner { return existing_task_id; } + self.track_cache_miss(&task_type); self.connect_child(parent_task, task_id, turbo_tasks); task_id @@ -2172,7 +2179,7 @@ impl TurboTasksBackendInner { } let this = self.clone(); - let snapshot = turbo_tasks::spawn_blocking(move || this.snapshot()).await; + let snapshot = this.snapshot(); if let Some((snapshot_start, new_data)) = snapshot { last_snapshot = snapshot_start; if new_data { diff --git a/turbopack/crates/turbo-tasks-backend/src/backend/storage.rs b/turbopack/crates/turbo-tasks-backend/src/backend/storage.rs index ceab626298854..60da5545b8579 100644 --- a/turbopack/crates/turbo-tasks-backend/src/backend/storage.rs +++ b/turbopack/crates/turbo-tasks-backend/src/backend/storage.rs @@ -6,9 +6,8 @@ use std::{ }; use bitfield::bitfield; -use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; use smallvec::SmallVec; -use turbo_tasks::{FxDashMap, TaskId}; +use turbo_tasks::{FxDashMap, TaskId, parallel}; use crate::{ backend::dynamic_storage::DynamicStorage, @@ -17,7 +16,10 @@ use crate::{ CachedDataItemValue, CachedDataItemValueRef, CachedDataItemValueRefMut, OutputValue, }, data_storage::{AutoMapStorage, OptionStorage}, - utils::dash_map_multi::{RefMut, get_multiple_mut}, + utils::{ + dash_map_drop_contents::drop_contents, + dash_map_multi::{RefMut, get_multiple_mut}, + }, }; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -664,48 +666,43 @@ impl Storage { // The number of shards is much larger than the number of threads, so the effect of the // locks held is negligible. - self.modified - .shards() - .par_iter() - .with_max_len(1) - .map(|shard| { - let mut direct_snapshots: Vec<(TaskId, Box)> = Vec::new(); - let mut modified: SmallVec<[TaskId; 4]> = SmallVec::new(); - { - // Take the snapshots from the modified map - let guard = shard.write(); - // Safety: guard must outlive the iterator. - for bucket in unsafe { guard.iter() } { - // Safety: the guard guarantees that the bucket is not removed and the ptr - // is valid. - let (key, shared_value) = unsafe { bucket.as_mut() }; - let modified_state = shared_value.get_mut(); - match modified_state { - ModifiedState::Modified => { - modified.push(*key); - } - ModifiedState::Snapshot(snapshot) => { - if let Some(snapshot) = snapshot.take() { - direct_snapshots.push((*key, snapshot)); - } + parallel::map_collect::<_, _, Vec<_>>(self.modified.shards(), |shard| { + let mut direct_snapshots: Vec<(TaskId, Box)> = Vec::new(); + let mut modified: SmallVec<[TaskId; 4]> = SmallVec::new(); + { + // Take the snapshots from the modified map + let guard = shard.write(); + // Safety: guard must outlive the iterator. + for bucket in unsafe { guard.iter() } { + // Safety: the guard guarantees that the bucket is not removed and the ptr + // is valid. + let (key, shared_value) = unsafe { bucket.as_mut() }; + let modified_state = shared_value.get_mut(); + match modified_state { + ModifiedState::Modified => { + modified.push(*key); + } + ModifiedState::Snapshot(snapshot) => { + if let Some(snapshot) = snapshot.take() { + direct_snapshots.push((*key, snapshot)); } } } - // Safety: guard must outlive the iterator. - drop(guard); } + // Safety: guard must outlive the iterator. 
+ drop(guard); + } - SnapshotShard { - direct_snapshots, - modified, - storage: self, - guard: Some(guard.clone()), - process, - preprocess, - process_snapshot, - } - }) - .collect::>() + SnapshotShard { + direct_snapshots, + modified, + storage: self, + guard: Some(guard.clone()), + process, + preprocess, + process_snapshot, + } + }) } /// Start snapshot mode. @@ -812,6 +809,11 @@ impl Storage { }, ) } + + pub fn drop_contents(&self) { + drop_contents(&self.map); + drop_contents(&self.modified); + } } pub struct StorageWriteGuard<'a> { diff --git a/turbopack/crates/turbo-tasks-backend/src/database/turbo.rs b/turbopack/crates/turbo-tasks-backend/src/database/turbo/mod.rs similarity index 73% rename from turbopack/crates/turbo-tasks-backend/src/database/turbo.rs rename to turbopack/crates/turbo-tasks-backend/src/database/turbo/mod.rs index 82e972f268d66..146a5f5e56952 100644 --- a/turbopack/crates/turbo-tasks-backend/src/database/turbo.rs +++ b/turbopack/crates/turbo-tasks-backend/src/database/turbo/mod.rs @@ -1,34 +1,33 @@ -use std::{ - cmp::max, - path::PathBuf, - sync::Arc, - thread::{JoinHandle, available_parallelism, spawn}, -}; +use std::{cmp::max, path::PathBuf, sync::Arc, thread::available_parallelism, time::Instant}; -use anyhow::Result; +use anyhow::{Ok, Result}; use parking_lot::Mutex; use turbo_persistence::{ ArcSlice, CompactConfig, KeyBase, StoreKey, TurboPersistence, ValueBuffer, }; +use turbo_tasks::{JoinHandle, block_for_future, message_queue::TimingEvent, spawn, turbo_tasks}; use crate::database::{ key_value_database::{KeySpace, KeyValueDatabase}, + turbo::parallel_scheduler::TurboTasksParallelScheduler, write_batch::{BaseWriteBatch, ConcurrentWriteBatch, WriteBatch, WriteBuffer}, }; +mod parallel_scheduler; + const MB: u64 = 1024 * 1024; const COMPACT_CONFIG: CompactConfig = CompactConfig { min_merge_count: 3, optimal_merge_count: 8, max_merge_count: 64, max_merge_bytes: 512 * MB, - min_merge_duplication_bytes: MB, + min_merge_duplication_bytes: 50 * MB, optimal_merge_duplication_bytes: 100 * MB, max_merge_segment_count: 16, }; pub struct TurboKeyValueDatabase { - db: Arc, + db: Arc>, compact_join_handle: Mutex>>>, is_ci: bool, is_short_session: bool, @@ -37,24 +36,12 @@ pub struct TurboKeyValueDatabase { impl TurboKeyValueDatabase { pub fn new(versioned_path: PathBuf, is_ci: bool, is_short_session: bool) -> Result { let db = Arc::new(TurboPersistence::open(versioned_path)?); - let mut this = Self { + Ok(Self { db: db.clone(), compact_join_handle: Mutex::new(None), is_ci, is_short_session, - }; - // start compaction in background if the database is not empty - if !db.is_empty() { - let handle = spawn(move || { - db.compact(&CompactConfig { - max_merge_segment_count: available_parallelism() - .map_or(4, |c| max(4, c.get() / 4)), - ..COMPACT_CONFIG - }) - }); - this.compact_join_handle.get_mut().replace(handle); - } - Ok(this) + }) } } @@ -96,7 +83,7 @@ impl KeyValueDatabase for TurboKeyValueDatabase { ) -> Result, Self::ConcurrentWriteBatch<'_>>> { // Wait for the compaction to finish if let Some(join_handle) = self.compact_join_handle.lock().take() { - join_handle.join().unwrap()?; + block_for_future(join_handle)?; } // Start a new write batch Ok(WriteBatch::concurrent(TurboWriteBatch { @@ -112,26 +99,47 @@ impl KeyValueDatabase for TurboKeyValueDatabase { fn shutdown(&self) -> Result<()> { // Wait for the compaction to finish if let Some(join_handle) = self.compact_join_handle.lock().take() { - join_handle.join().unwrap()?; + block_for_future(join_handle)?; } // Compact 
the database on shutdown - self.db.compact(&CompactConfig { - max_merge_segment_count: if self.is_ci { - // Fully compact in CI to reduce cache size - usize::MAX - } else { - available_parallelism().map_or(4, |c| max(4, c.get())) - }, - ..COMPACT_CONFIG - })?; + if self.is_ci { + // Fully compact in CI to reduce cache size + do_compact(&self.db, "Finished database compaction", usize::MAX)?; + } else { + // Compact with a reasonable limit in non-CI environments + do_compact( + &self.db, + "Finished database compaction", + available_parallelism().map_or(4, |c| max(4, c.get())), + )?; + } // Shutdown the database self.db.shutdown() } } +fn do_compact( + db: &TurboPersistence, + message: &'static str, + max_merge_segment_count: usize, +) -> Result<()> { + let start = Instant::now(); + // Compact the database with the given max merge segment count + let ran = db.compact(&CompactConfig { + max_merge_segment_count, + ..COMPACT_CONFIG + })?; + if ran { + let elapsed = start.elapsed(); + turbo_tasks() + .send_compilation_event(Arc::new(TimingEvent::new(message.to_string(), elapsed))); + } + Ok(()) +} + pub struct TurboWriteBatch<'a> { - batch: turbo_persistence::WriteBatch, 5>, - db: &'a Arc, + batch: turbo_persistence::WriteBatch, TurboTasksParallelScheduler, 5>, + db: &'a Arc>, compact_join_handle: Option<&'a Mutex>>>>, } @@ -156,12 +164,12 @@ impl<'a> BaseWriteBatch<'a> for TurboWriteBatch<'a> { if let Some(compact_join_handle) = self.compact_join_handle { // Start a new compaction in the background let db = self.db.clone(); - let handle = spawn(move || { - db.compact(&CompactConfig { - max_merge_segment_count: available_parallelism() - .map_or(4, |c| max(4, c.get() / 2)), - ..COMPACT_CONFIG - }) + let handle = spawn(async move { + do_compact( + &db, + "Finished database compaction", + available_parallelism().map_or(4, |c| max(4, c.get() / 2)), + ) }); compact_join_handle.lock().replace(handle); } diff --git a/turbopack/crates/turbo-tasks-backend/src/database/turbo/parallel_scheduler.rs b/turbopack/crates/turbo-tasks-backend/src/database/turbo/parallel_scheduler.rs new file mode 100644 index 0000000000000..c4e137c20f146 --- /dev/null +++ b/turbopack/crates/turbo-tasks-backend/src/database/turbo/parallel_scheduler.rs @@ -0,0 +1,83 @@ +use turbo_persistence::ParallelScheduler; +use turbo_tasks::{block_in_place, parallel}; + +#[derive(Clone, Copy, Default)] +pub struct TurboTasksParallelScheduler; + +impl ParallelScheduler for TurboTasksParallelScheduler { + fn block_in_place(&self, f: impl FnOnce() -> R + Send) -> R + where + R: Send, + { + block_in_place(f) + } + + fn parallel_for_each(&self, items: &[T], f: impl Fn(&T) + Send + Sync) + where + T: Sync, + { + parallel::for_each(items, f); + } + + fn try_parallel_for_each<'l, T, E>( + &self, + items: &'l [T], + f: impl (Fn(&'l T) -> Result<(), E>) + Send + Sync, + ) -> Result<(), E> + where + T: Sync, + E: Send + 'static, + { + parallel::try_for_each(items, f) + } + + fn try_parallel_for_each_mut<'l, T, E>( + &self, + items: &'l mut [T], + f: impl (Fn(&'l mut T) -> Result<(), E>) + Send + Sync, + ) -> Result<(), E> + where + T: Send + Sync, + E: Send + 'static, + { + parallel::try_for_each_mut(items, f) + } + + fn try_vec_into_parallel_for_each( + &self, + items: Vec, + f: impl (Fn(T) -> Result<(), E>) + Send + Sync, + ) -> Result<(), E> + where + T: Send + Sync, + E: Send + 'static, + { + parallel::try_into_for_each(items, f) + } + + fn parallel_map_collect<'l, T, I, R>( + &self, + items: &'l [T], + f: impl Fn(&'l T) -> I + Send + Sync, + ) -> R 
+ where + T: Sync, + I: Send + Sync + 'l, + R: FromIterator, + { + parallel::map_collect(items, f) + } + + fn vec_into_parallel_map_collect( + &self, + items: Vec, + f: impl Fn(T) -> I + Send + Sync, + ) -> R + where + T: Send + Sync, + I: Send + Sync, + R: FromIterator, + { + parallel::vec_into_map_collect(items, f) + } +} diff --git a/turbopack/crates/turbo-tasks-backend/src/kv_backing_storage.rs b/turbopack/crates/turbo-tasks-backend/src/kv_backing_storage.rs index c4b84310d651f..77cc7e15e580d 100644 --- a/turbopack/crates/turbo-tasks-backend/src/kv_backing_storage.rs +++ b/turbopack/crates/turbo-tasks-backend/src/kv_backing_storage.rs @@ -1,21 +1,18 @@ use std::{ borrow::Borrow, - cmp::max, env, path::PathBuf, sync::{Arc, LazyLock, Mutex, PoisonError, Weak}, }; use anyhow::{Context, Result, anyhow}; -use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; use serde::{Deserialize, Serialize}; use smallvec::SmallVec; -use tracing::Span; use turbo_tasks::{ SessionId, TaskId, backend::CachedTaskType, panic_hooks::{PanicHookGuard, register_panic_hook}, - turbo_tasks_scope, + parallel, }; use crate::{ @@ -331,14 +328,15 @@ impl BackingStorageSealed let _span = tracing::trace_span!("update task data").entered(); process_task_data(snapshots, Some(batch))?; let span = tracing::trace_span!("flush task data").entered(); - [KeySpace::TaskMeta, KeySpace::TaskData] - .into_par_iter() - .try_for_each(|key_space| { + parallel::try_for_each( + &[KeySpace::TaskMeta, KeySpace::TaskData], + |&key_space| { let _span = span.clone().entered(); // Safety: We already finished all processing of the task data and task // meta unsafe { batch.flush(key_space) } - })?; + }, + )?; } let mut next_task_id = get_next_free_task_id::< @@ -352,10 +350,9 @@ impl BackingStorageSealed items = task_cache_updates.iter().map(|m| m.len()).sum::() ) .entered(); - let result = task_cache_updates - .into_par_iter() - .with_max_len(1) - .map(|updates| { + let result = parallel::vec_into_map_collect::<_, _, Result>>( + task_cache_updates, + |updates| { let _span = _span.clone().entered(); let mut max_task_id = 0; @@ -390,15 +387,11 @@ impl BackingStorageSealed } Ok(max_task_id) - }) - .reduce( - || Ok(0), - |a, b| -> anyhow::Result<_> { - let a_max = a?; - let b_max = b?; - Ok(max(a_max, b_max)) - }, - )?; + }, + )? 
+ .into_iter() + .max() + .unwrap_or(0); next_task_id = next_task_id.max(result); } @@ -410,64 +403,11 @@ impl BackingStorageSealed )?; } WriteBatch::Serial(batch) => { - let mut task_items_result = Ok(Vec::new()); - turbo_tasks::scope(|s| { - s.spawn(|_| { - task_items_result = - process_task_data(snapshots, None::<&T::ConcurrentWriteBatch<'_>>); - }); - - let mut next_task_id = - get_next_free_task_id::< - T::SerialWriteBatch<'_>, - T::ConcurrentWriteBatch<'_>, - >(&mut WriteBatchRef::serial(batch))?; - - { - let _span = tracing::trace_span!( - "update task cache", - items = task_cache_updates.iter().map(|m| m.len()).sum::() - ) - .entered(); - let mut task_type_bytes = Vec::new(); - for (task_type, task_id) in task_cache_updates.into_iter().flatten() { - let task_id = *task_id; - serialize_task_type(&task_type, &mut task_type_bytes, task_id)?; - - batch - .put( - KeySpace::ForwardTaskCache, - WriteBuffer::Borrowed(&task_type_bytes), - WriteBuffer::Borrowed(&task_id.to_le_bytes()), - ) - .with_context(|| { - anyhow!("Unable to write task cache {task_type:?} => {task_id}") - })?; - batch - .put( - KeySpace::ReverseTaskCache, - WriteBuffer::Borrowed(IntKey::new(task_id).as_ref()), - WriteBuffer::Borrowed(&task_type_bytes), - ) - .with_context(|| { - anyhow!("Unable to write task cache {task_id} => {task_type:?}") - })?; - next_task_id = next_task_id.max(task_id + 1); - } - } - - save_infra::, T::ConcurrentWriteBatch<'_>>( - &mut WriteBatchRef::serial(batch), - next_task_id, - session_id, - operations, - )?; - anyhow::Ok(()) - })?; - { let _span = tracing::trace_span!("update tasks").entered(); - for (task_id, meta, data) in task_items_result?.into_iter().flatten() { + let task_items = + process_task_data(snapshots, None::<&T::ConcurrentWriteBatch<'_>>)?; + for (task_id, meta, data) in task_items.into_iter().flatten() { let key = IntKey::new(*task_id); let key = key.as_ref(); if let Some(meta) = meta { @@ -485,7 +425,54 @@ impl BackingStorageSealed })?; } } + batch.flush(KeySpace::TaskMeta)?; + batch.flush(KeySpace::TaskData)?; + } + + let mut next_task_id = get_next_free_task_id::< + T::SerialWriteBatch<'_>, + T::ConcurrentWriteBatch<'_>, + >(&mut WriteBatchRef::serial(batch))?; + + { + let _span = tracing::trace_span!( + "update task cache", + items = task_cache_updates.iter().map(|m| m.len()).sum::() + ) + .entered(); + let mut task_type_bytes = Vec::new(); + for (task_type, task_id) in task_cache_updates.into_iter().flatten() { + let task_id = *task_id; + serialize_task_type(&task_type, &mut task_type_bytes, task_id)?; + + batch + .put( + KeySpace::ForwardTaskCache, + WriteBuffer::Borrowed(&task_type_bytes), + WriteBuffer::Borrowed(&task_id.to_le_bytes()), + ) + .with_context(|| { + anyhow!("Unable to write task cache {task_type:?} => {task_id}") + })?; + batch + .put( + KeySpace::ReverseTaskCache, + WriteBuffer::Borrowed(IntKey::new(task_id).as_ref()), + WriteBuffer::Borrowed(&task_type_bytes), + ) + .with_context(|| { + anyhow!("Unable to write task cache {task_id} => {task_type:?}") + })?; + next_task_id = next_task_id.max(task_id + 1); + } } + + save_infra::, T::ConcurrentWriteBatch<'_>>( + &mut WriteBatchRef::serial(batch), + next_task_id, + session_id, + operations, + )?; } } @@ -703,48 +690,38 @@ where > + Send + Sync, { - let span = Span::current(); - let turbo_tasks = turbo_tasks::turbo_tasks(); - let handle = tokio::runtime::Handle::current(); - tasks - .into_par_iter() - .map(|tasks| { - let _span = span.clone().entered(); - let _guard = handle.clone().enter(); - 
turbo_tasks_scope(turbo_tasks.clone(), || { - let mut result = Vec::new(); - for (task_id, meta, data) in tasks { - if let Some(batch) = batch { - let key = IntKey::new(*task_id); - let key = key.as_ref(); - if let Some(meta) = meta { - batch.put( - KeySpace::TaskMeta, - WriteBuffer::Borrowed(key), - WriteBuffer::SmallVec(meta), - )?; - } - if let Some(data) = data { - batch.put( - KeySpace::TaskData, - WriteBuffer::Borrowed(key), - WriteBuffer::SmallVec(data), - )?; - } - } else { - // Store the new task data - result.push(( - task_id, - meta.map(WriteBuffer::SmallVec), - data.map(WriteBuffer::SmallVec), - )); - } + parallel::vec_into_map_collect::<_, _, Result>>(tasks, |tasks| { + let mut result = Vec::new(); + for (task_id, meta, data) in tasks { + if let Some(batch) = batch { + let key = IntKey::new(*task_id); + let key = key.as_ref(); + if let Some(meta) = meta { + batch.put( + KeySpace::TaskMeta, + WriteBuffer::Borrowed(key), + WriteBuffer::SmallVec(meta), + )?; + } + if let Some(data) = data { + batch.put( + KeySpace::TaskData, + WriteBuffer::Borrowed(key), + WriteBuffer::SmallVec(data), + )?; } + } else { + // Store the new task data + result.push(( + task_id, + meta.map(WriteBuffer::SmallVec), + data.map(WriteBuffer::SmallVec), + )); + } + } - Ok(result) - }) - }) - .collect::>>() + Ok(result) + }) } fn serialize(task: TaskId, data: &Vec) -> Result> { diff --git a/turbopack/crates/turbo-tasks-backend/src/utils/bi_map.rs b/turbopack/crates/turbo-tasks-backend/src/utils/bi_map.rs index 038494cb97b9b..bf912a248613d 100644 --- a/turbopack/crates/turbo-tasks-backend/src/utils/bi_map.rs +++ b/turbopack/crates/turbo-tasks-backend/src/utils/bi_map.rs @@ -3,6 +3,8 @@ use std::{borrow::Borrow, hash::Hash}; use dashmap::mapref::entry::Entry; use turbo_tasks::FxDashMap; +use crate::utils::dash_map_drop_contents::drop_contents; + /// A bidirectional [`FxDashMap`] that allows lookup by key or value. 
/// /// As keys and values are stored twice, they should be small types, such as @@ -53,3 +55,14 @@ where } } } + +impl BiMap +where + K: Eq + Hash + Send + Sync, + V: Eq + Hash + Send + Sync, +{ + pub fn drop_contents(&self) { + drop_contents(&self.forward); + drop_contents(&self.reverse); + } +} diff --git a/turbopack/crates/turbo-tasks-backend/src/utils/dash_map_drop_contents.rs b/turbopack/crates/turbo-tasks-backend/src/utils/dash_map_drop_contents.rs new file mode 100644 index 0000000000000..480f9b20cb423 --- /dev/null +++ b/turbopack/crates/turbo-tasks-backend/src/utils/dash_map_drop_contents.rs @@ -0,0 +1,17 @@ +use std::{ + hash::{BuildHasher, Hash}, + mem::take, +}; + +use dashmap::DashMap; +use turbo_tasks::parallel; + +pub fn drop_contents( + map: &DashMap, +) { + let shards = map.shards(); + parallel::for_each(shards, |shard| { + let table = take(&mut *shard.write()); + drop(table); + }); +} diff --git a/turbopack/crates/turbo-tasks-backend/src/utils/mod.rs b/turbopack/crates/turbo-tasks-backend/src/utils/mod.rs index 1cb30f53a1ff6..784ec44fe564a 100644 --- a/turbopack/crates/turbo-tasks-backend/src/utils/mod.rs +++ b/turbopack/crates/turbo-tasks-backend/src/utils/mod.rs @@ -1,5 +1,6 @@ pub mod bi_map; pub mod chunked_vec; +pub mod dash_map_drop_contents; pub mod dash_map_multi; pub mod ptr_eq_arc; pub mod sharded; diff --git a/turbopack/crates/turbo-tasks-backend/tests/all_in_one.rs b/turbopack/crates/turbo-tasks-backend/tests/all_in_one.rs index f9321cfd797fb..add31c32ecd35 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/all_in_one.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/all_in_one.rs @@ -9,7 +9,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn all_in_one() { run(®ISTRATION, || async { let a: Vc = Vc::cell(4242); diff --git a/turbopack/crates/turbo-tasks-backend/tests/basic.rs b/turbopack/crates/turbo-tasks-backend/tests/basic.rs index a12da0b8578d8..a22cb96ade456 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/basic.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/basic.rs @@ -8,7 +8,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn basic() { run(®ISTRATION, || async { let output1 = func_without_args(); diff --git a/turbopack/crates/turbo-tasks-backend/tests/bug.rs b/turbopack/crates/turbo-tasks-backend/tests/bug.rs index f7e8097a1b7aa..5d225bdb8c48e 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/bug.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/bug.rs @@ -24,7 +24,7 @@ struct TaskSpec { #[turbo_tasks::value(transparent)] struct TasksSpec(Vec); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn graph_bug() { // see https://github.com/vercel/next.js/pull/79451 run(®ISTRATION, || async { diff --git a/turbopack/crates/turbo-tasks-backend/tests/bug2.rs b/turbopack/crates/turbo-tasks-backend/tests/bug2.rs index df3115b8aa3da..a1495eeeca91b 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/bug2.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/bug2.rs @@ -33,7 +33,7 @@ pub struct TaskSpec { #[turbo_tasks::value(transparent)] struct Iteration(State); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn graph_bug() { run(®ISTRATION, move || async move { let spec = vec![ diff --git a/turbopack/crates/turbo-tasks-backend/tests/call_types.rs 
b/turbopack/crates/turbo-tasks-backend/tests/call_types.rs index 17875d2630d78..f06430ada2bd0 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/call_types.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/call_types.rs @@ -8,7 +8,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn functions() { run(®ISTRATION, || async { assert_eq!(*fn_plain().await?, 42); @@ -53,7 +53,7 @@ async fn async_fn_vc_arg(n: Vc) -> Result> { Ok(Vc::cell(*n.await?)) } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn methods() { run(®ISTRATION, || async { assert_eq!(*Value::static_method().await?, 42); @@ -106,7 +106,7 @@ impl Value { } } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn trait_methods() { run(®ISTRATION, || async { assert_eq!(*Value::static_trait_method().await?, 42); diff --git a/turbopack/crates/turbo-tasks-backend/tests/collectibles.rs b/turbopack/crates/turbo-tasks-backend/tests/collectibles.rs index a86c0e09343d0..945845a86e3a2 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/collectibles.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/collectibles.rs @@ -14,7 +14,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn transitive_emitting() { run(®ISTRATION, || async { let result_op = my_transitive_emitting_function(rcstr!(""), rcstr!("")); @@ -32,7 +32,7 @@ async fn transitive_emitting() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn transitive_emitting_indirect() { run(®ISTRATION, || async { let result_op = my_transitive_emitting_function(rcstr!(""), rcstr!("")); @@ -50,7 +50,7 @@ async fn transitive_emitting_indirect() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn multi_emitting() { run(®ISTRATION, || async { let result_op = my_multi_emitting_function(); @@ -68,7 +68,7 @@ async fn multi_emitting() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn taking_collectibles() { run(®ISTRATION, || async { let result_op = my_collecting_function(); @@ -84,7 +84,7 @@ async fn taking_collectibles() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn taking_collectibles_extra_layer() { run(®ISTRATION, || async { let result_op = my_collecting_function_indirect(); @@ -100,7 +100,7 @@ async fn taking_collectibles_extra_layer() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn taking_collectibles_parallel() { run(®ISTRATION, || async { let result_op = my_transitive_emitting_function(rcstr!(""), rcstr!("a")); @@ -142,7 +142,7 @@ async fn taking_collectibles_parallel() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn taking_collectibles_with_resolve() { run(®ISTRATION, || async { let result_op = my_transitive_emitting_function_with_resolve(rcstr!("resolve")); diff --git a/turbopack/crates/turbo-tasks-backend/tests/debug.rs b/turbopack/crates/turbo-tasks-backend/tests/debug.rs index 854d57b234395..ccc833eeb85d8 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/debug.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/debug.rs @@ -9,7 +9,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn primitive_debug() { 
run(®ISTRATION, || async { let a: Vc = Vc::cell(42); @@ -20,7 +20,7 @@ async fn primitive_debug() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn transparent_debug() { run(®ISTRATION, || async { let a: Vc = Transparent(42).cell(); @@ -32,7 +32,7 @@ async fn transparent_debug() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn enum_none_debug() { run(®ISTRATION, || async { let a: Vc = Enum::None.cell(); @@ -44,7 +44,7 @@ async fn enum_none_debug() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn enum_transparent_debug() { run(®ISTRATION, || async { let a: Vc = Enum::Transparent(Transparent(42).resolved_cell()).cell(); @@ -60,7 +60,7 @@ async fn enum_transparent_debug() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn enum_inner_vc_debug() { run(®ISTRATION, || async { let a: Vc = Enum::Enum(Enum::None.resolved_cell()).cell(); @@ -76,7 +76,7 @@ async fn enum_inner_vc_debug() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn struct_unit_debug() { run(®ISTRATION, || async { let a: Vc = StructUnit.cell(); @@ -87,7 +87,7 @@ async fn struct_unit_debug() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn struct_transparent_debug() { run(®ISTRATION, || async { let a: Vc = StructWithTransparent { @@ -106,7 +106,7 @@ async fn struct_transparent_debug() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn struct_vec_debug() { run(®ISTRATION, || async { let a: Vc = StructWithVec { vec: vec![] }.cell(); @@ -135,7 +135,7 @@ async fn struct_vec_debug() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn struct_ignore_debug() { run(®ISTRATION, || async { let a: Vc = StructWithIgnore { diff --git a/turbopack/crates/turbo-tasks-backend/tests/detached.rs b/turbopack/crates/turbo-tasks-backend/tests/detached.rs index c76c23590f8ab..b1c80929fad6a 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/detached.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/detached.rs @@ -15,7 +15,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_spawns_detached() -> anyhow::Result<()> { run(®ISTRATION, || async { // HACK: The watch channel we use has an incorrect implementation of `TraceRawVcs`, just @@ -82,7 +82,7 @@ async fn spawns_detached( Vc::cell(()) } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_spawns_detached_changing() -> anyhow::Result<()> { run(®ISTRATION, || async { // HACK: The watch channel we use has an incorrect implementation of `TraceRawVcs` diff --git a/turbopack/crates/turbo-tasks-backend/tests/dirty_in_progress.rs b/turbopack/crates/turbo-tasks-backend/tests/dirty_in_progress.rs index 8171cead7dd40..89aa8998fae80 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/dirty_in_progress.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/dirty_in_progress.rs @@ -11,7 +11,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn dirty_in_progress() { run(®ISTRATION, || async { let cases = [ diff --git a/turbopack/crates/turbo-tasks-backend/tests/emptied_cells.rs b/turbopack/crates/turbo-tasks-backend/tests/emptied_cells.rs index 4a3ddce3bfa73..87c2d6672e468 100644 --- 
a/turbopack/crates/turbo-tasks-backend/tests/emptied_cells.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/emptied_cells.rs @@ -8,7 +8,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn recompute() { run(®ISTRATION, || async { let input = ChangingInput { diff --git a/turbopack/crates/turbo-tasks-backend/tests/filter_unused_args.rs b/turbopack/crates/turbo-tasks-backend/tests/filter_unused_args.rs index b7081174940c6..3193382110215 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/filter_unused_args.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/filter_unused_args.rs @@ -8,7 +8,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn filtered_trait_method_args() -> Result<()> { run(®ISTRATION, || async { let uses_arg = UsesArg.cell(); diff --git a/turbopack/crates/turbo-tasks-backend/tests/immutable.rs b/turbopack/crates/turbo-tasks-backend/tests/immutable.rs index d90a4cb2f78de..0c716c7544744 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/immutable.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/immutable.rs @@ -8,7 +8,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn hidden_mutate() { run(®ISTRATION, || async { let input = create_input().resolve().await?; diff --git a/turbopack/crates/turbo-tasks-backend/tests/local_tasks.rs b/turbopack/crates/turbo-tasks-backend/tests/local_tasks.rs index e2a6a7abdfa74..f66363d374635 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/local_tasks.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/local_tasks.rs @@ -8,7 +8,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_local_task_id() -> Result<()> { run(®ISTRATION, || async { let local_vc = get_local_task_id(); diff --git a/turbopack/crates/turbo-tasks-backend/tests/operation_vc.rs b/turbopack/crates/turbo-tasks-backend/tests/operation_vc.rs index 8000ddc8b26e3..457971d0667c7 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/operation_vc.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/operation_vc.rs @@ -26,7 +26,7 @@ fn use_operations() -> Vc { forty_two.connect() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_use_operations() -> Result<()> { run(®ISTRATION, || async { assert_eq!(*use_operations().await?, 42); diff --git a/turbopack/crates/turbo-tasks-backend/tests/panics.rs b/turbopack/crates/turbo-tasks-backend/tests/panics.rs index d321e825f1430..8b9458ab4f046 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/panics.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/panics.rs @@ -25,7 +25,7 @@ static FILE_PATH_REGEX: LazyLock = // // This test depends on the process-wide global panic handler. This test must be run in its own // process in isolation of any other tests. 
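Each of these tests moves from tokio's default current-thread test runtime to the multi-thread flavor. A plausible reading (not stated in the diff itself): the new `turbo_tasks::parallel` helpers introduced later in this change call `tokio::task::block_in_place`, which panics when invoked on a current-thread runtime, so any test that reaches those code paths needs a multi-thread runtime. A minimal sketch of the pattern; the test name and body here are illustrative, not from this change:

```rust
// The default #[tokio::test] runs on a current-thread runtime, where
// tokio::task::block_in_place panics. The multi_thread flavor avoids that.
#[tokio::test(flavor = "multi_thread")]
async fn exercises_parallel_helpers() {
    let items = vec![1, 2, 3];
    // turbo_tasks::parallel::for_each calls block_in_place internally.
    turbo_tasks::parallel::for_each(&items, |_item| { /* ... */ });
}
```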
-#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_panic_hook() { let prev_hook = take_hook(); set_hook(Box::new(move |info| { diff --git a/turbopack/crates/turbo-tasks-backend/tests/performance.rs b/turbopack/crates/turbo-tasks-backend/tests/performance.rs index 904843fad2a63..13b76582af633 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/performance.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/performance.rs @@ -142,7 +142,7 @@ fn check_skip() -> bool { false } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn many_calls_to_many_children() { if check_skip() { return; @@ -157,7 +157,7 @@ async fn many_calls_to_many_children() { .unwrap(); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn many_calls_to_uncached_many_children() { if check_skip() { return; @@ -189,7 +189,7 @@ fn run_big_graph_test(counts: Vec) -> impl Future> + Se ) } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn many_calls_to_big_graph_1() { if check_skip() { return; @@ -199,7 +199,7 @@ async fn many_calls_to_big_graph_1() { .unwrap(); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn many_calls_to_big_graph_2() { if check_skip() { return; @@ -211,7 +211,7 @@ async fn many_calls_to_big_graph_2() { .unwrap(); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn many_calls_to_big_graph_3() { if check_skip() { return; @@ -221,7 +221,7 @@ async fn many_calls_to_big_graph_3() { .unwrap(); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn many_calls_to_big_graph_4() { if check_skip() { return; @@ -231,7 +231,7 @@ async fn many_calls_to_big_graph_4() { .unwrap(); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn many_calls_to_big_graph_5() { if check_skip() { return; @@ -243,7 +243,7 @@ async fn many_calls_to_big_graph_5() { .unwrap(); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn many_calls_to_big_graph_6() { if check_skip() { return; @@ -255,7 +255,7 @@ async fn many_calls_to_big_graph_6() { .unwrap(); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn many_calls_to_big_graph_7() { if check_skip() { return; @@ -270,7 +270,7 @@ async fn many_calls_to_big_graph_7() { .unwrap(); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn many_calls_to_big_graph_8() { if check_skip() { return; @@ -282,7 +282,7 @@ async fn many_calls_to_big_graph_8() { .unwrap(); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn many_calls_to_big_graph_9() { if check_skip() { return; diff --git a/turbopack/crates/turbo-tasks-backend/tests/random_change.rs b/turbopack/crates/turbo-tasks-backend/tests/random_change.rs index 841c4564af444..089490ab1c79c 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/random_change.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/random_change.rs @@ -9,7 +9,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn random_change() { run(®ISTRATION, || async { let state = make_state(); diff --git a/turbopack/crates/turbo-tasks-backend/tests/read_ref_cell.rs b/turbopack/crates/turbo-tasks-backend/tests/read_ref_cell.rs index d7ccf3b37b6cf..66c51c9e4f1ad 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/read_ref_cell.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/read_ref_cell.rs @@ -10,7 +10,7 @@ use turbo_tasks_testing::{Registration, register, run}; static 
REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn read_ref() { run(®ISTRATION, || async { let counter = Counter::cell(Counter { diff --git a/turbopack/crates/turbo-tasks-backend/tests/recompute.rs b/turbopack/crates/turbo-tasks-backend/tests/recompute.rs index 17a69e9c151d3..dcad783b06e08 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/recompute.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/recompute.rs @@ -8,7 +8,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn recompute() { run(®ISTRATION, || async { let input = ChangingInput { @@ -58,7 +58,7 @@ async fn recompute() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn immutable_analysis() { run(®ISTRATION, || async { let input = ChangingInput { diff --git a/turbopack/crates/turbo-tasks-backend/tests/recompute_collectibles.rs b/turbopack/crates/turbo-tasks-backend/tests/recompute_collectibles.rs index 54074af628add..d7c0be301ac70 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/recompute_collectibles.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/recompute_collectibles.rs @@ -9,7 +9,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn recompute() { run(®ISTRATION, || async { let input = ChangingInput::new(1).resolve().await?; diff --git a/turbopack/crates/turbo-tasks-backend/tests/resolved_vc.rs b/turbopack/crates/turbo-tasks-backend/tests/resolved_vc.rs index da3a69ca62dce..a0b9914b7f8bb 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/resolved_vc.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/resolved_vc.rs @@ -23,7 +23,7 @@ fn assert_resolved(input: ResolvedVc) { assert!(input_vc.is_resolved()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_conversion() -> Result<()> { run(®ISTRATION, || async { let unresolved: Vc = Vc::cell(42); @@ -38,7 +38,7 @@ async fn test_conversion() -> Result<()> { .await } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_cell_construction() -> Result<()> { run(®ISTRATION, || async { let a: ResolvedVc = ResolvedVc::cell(42); @@ -50,7 +50,7 @@ async fn test_cell_construction() -> Result<()> { .await } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_resolved_vc_as_arg() -> Result<()> { run(®ISTRATION, || async { let unresolved: Vc = returns_int(42); @@ -62,7 +62,7 @@ async fn test_resolved_vc_as_arg() -> Result<()> { .await } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_into_future() -> Result<()> { run(®ISTRATION, || async { let mut resolved = ResolvedVc::cell(42); @@ -78,7 +78,7 @@ async fn test_into_future() -> Result<()> { .await } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_sidecast() -> Result<()> { run(®ISTRATION, || async { let concrete_value = ImplementsAAndB.resolved_cell(); diff --git a/turbopack/crates/turbo-tasks-backend/tests/shrink_to_fit.rs b/turbopack/crates/turbo-tasks-backend/tests/shrink_to_fit.rs index 524a78950acf2..dc82e82174de5 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/shrink_to_fit.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/shrink_to_fit.rs @@ -11,7 +11,7 @@ static REGISTRATION: Registration = register!(); #[turbo_tasks::value(transparent)] struct Wrapper(Vec); -#[tokio::test] 
+#[tokio::test(flavor = "multi_thread")] async fn test_shrink_to_fit() -> Result<()> { run(®ISTRATION, || async { // `Vec::shrink_to_fit` is implicitly called when a cell is constructed. diff --git a/turbopack/crates/turbo-tasks-backend/tests/task_statistics.rs b/turbopack/crates/turbo-tasks-backend/tests/task_statistics.rs index 8a391ace095aa..869c944bcb5c7 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/task_statistics.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/task_statistics.rs @@ -13,7 +13,7 @@ use turbo_tasks_testing::{Registration, register, run_without_cache_check}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_simple_task() -> Result<()> { run_without_cache_check(®ISTRATION, async move { enable_stats(); @@ -39,7 +39,7 @@ async fn test_simple_task() -> Result<()> { .await } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_await_same_vc_multiple_times() -> Result<()> { run_without_cache_check(®ISTRATION, async move { enable_stats(); @@ -61,7 +61,7 @@ async fn test_await_same_vc_multiple_times() -> Result<()> { .await } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_vc_receiving_task() -> Result<()> { run_without_cache_check(®ISTRATION, async move { enable_stats(); @@ -93,7 +93,7 @@ async fn test_vc_receiving_task() -> Result<()> { .await } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_trait_methods() -> Result<()> { run_without_cache_check(®ISTRATION, async move { enable_stats(); @@ -130,7 +130,7 @@ async fn test_trait_methods() -> Result<()> { .await } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_dyn_trait_methods() -> Result<()> { run_without_cache_check(®ISTRATION, async move { enable_stats(); @@ -174,7 +174,7 @@ async fn test_dyn_trait_methods() -> Result<()> { } // creates Vcs, but doesn't ever execute them -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_no_execution() -> Result<()> { run_without_cache_check(®ISTRATION, async move { enable_stats(); diff --git a/turbopack/crates/turbo-tasks-backend/tests/trace_transient.rs b/turbopack/crates/turbo-tasks-backend/tests/trace_transient.rs index 74c21fcaebb65..f553a83a52c5b 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/trace_transient.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/trace_transient.rs @@ -18,7 +18,7 @@ Adder::add_method (read cell of type turbo-tasks@TODO::::primitives::u64) unknown transient task (read cell of type turbo-tasks@TODO::::primitives::u16) unknown transient task (read cell of type turbo-tasks@TODO::::primitives::u32)"; -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_trace_transient() { let result = run_without_cache_check(®ISTRATION, async { read_incorrect_task_input_operation(IncorrectTaskInput( diff --git a/turbopack/crates/turbo-tasks-backend/tests/trait_ref_cell.rs b/turbopack/crates/turbo-tasks-backend/tests/trait_ref_cell.rs index c556e8d422489..2372947303360 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/trait_ref_cell.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/trait_ref_cell.rs @@ -10,7 +10,7 @@ use turbo_tasks_testing::{Registration, register, run}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn trait_ref() { run(®ISTRATION, || async { let counter = Counter::cell(Counter { diff --git a/turbopack/crates/turbo-tasks-backend/tests/trait_ref_cell_mode.rs 
b/turbopack/crates/turbo-tasks-backend/tests/trait_ref_cell_mode.rs index 15917f62563bf..3b8d1cb15c02a 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/trait_ref_cell_mode.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/trait_ref_cell_mode.rs @@ -9,7 +9,7 @@ static REGISTRATION: Registration = register!(); // Test that with `cell = "shared"`, the cell will be re-used as long as the // value is equal. -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_trait_ref_shared_cell_mode() { run(®ISTRATION, || async { let input = CellIdSelector { @@ -44,7 +44,7 @@ async fn test_trait_ref_shared_cell_mode() { // Test that with `cell = "new"`, the cell will is never re-used, even if the // value is equal. -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_trait_ref_new_cell_mode() { run(®ISTRATION, || async { let input = CellIdSelector { diff --git a/turbopack/crates/turbo-tasks-backend/tests/transient_collectible.rs b/turbopack/crates/turbo-tasks-backend/tests/transient_collectible.rs index 216e8a285dbf8..b144319ed4763 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/transient_collectible.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/transient_collectible.rs @@ -10,7 +10,7 @@ static REGISTRATION: Registration = register!(); const EXPECTED_MSG: &str = "Collectible is transient, transient collectibles cannot be emitted from persistent tasks"; -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_transient_emit_from_persistent() { let result = run_without_cache_check(®ISTRATION, async { emit_incorrect_task_input_operation(IncorrectTaskInput(U32Wrapper(123).resolved_cell())) diff --git a/turbopack/crates/turbo-tasks-backend/tests/transient_vc.rs b/turbopack/crates/turbo-tasks-backend/tests/transient_vc.rs index 7db072310c915..100008c755c5c 100644 --- a/turbopack/crates/turbo-tasks-backend/tests/transient_vc.rs +++ b/turbopack/crates/turbo-tasks-backend/tests/transient_vc.rs @@ -7,7 +7,7 @@ use turbo_tasks_testing::{Registration, register, run_without_cache_check}; static REGISTRATION: Registration = register!(); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_transient_vc() -> Result<()> { run_without_cache_check(®ISTRATION, async { test_transient_operation(TransientValue::new(123)) diff --git a/turbopack/crates/turbo-tasks-fetch/tests/fetch.rs b/turbopack/crates/turbo-tasks-fetch/tests/fetch.rs index a325eefa0f445..b44f2a3a00522 100644 --- a/turbopack/crates/turbo-tasks-fetch/tests/fetch.rs +++ b/turbopack/crates/turbo-tasks-fetch/tests/fetch.rs @@ -18,7 +18,7 @@ static REGISTRATION: Registration = register!(turbo_tasks_fetch::register); /// acquire and hold this lock to prevent potential flakiness. static GLOBAL_TEST_LOCK: TokioMutex<()> = TokioMutex::const_new(()); -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn basic_get() { let _guard = GLOBAL_TEST_LOCK.lock().await; run(®ISTRATION, || async { @@ -49,7 +49,7 @@ async fn basic_get() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn sends_user_agent() { let _guard = GLOBAL_TEST_LOCK.lock().await; run(®ISTRATION, || async { @@ -85,7 +85,7 @@ async fn sends_user_agent() { // This is temporary behavior. // TODO: Implement invalidation that respects Cache-Control headers. 
-#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn invalidation_does_not_invalidate() { let _guard = GLOBAL_TEST_LOCK.lock().await; run(®ISTRATION, || async { @@ -130,7 +130,7 @@ fn get_issue_context() -> Vc { DiskFileSystem::new(rcstr!("root"), rcstr!("/")).root() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn errors_on_failed_connection() { let _guard = GLOBAL_TEST_LOCK.lock().await; run(®ISTRATION, || async { @@ -161,7 +161,7 @@ async fn errors_on_failed_connection() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn errors_on_404() { let _guard = GLOBAL_TEST_LOCK.lock().await; run(®ISTRATION, || async { @@ -196,7 +196,7 @@ async fn errors_on_404() { .unwrap() } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn client_cache() { // a simple fetch that should always succeed async fn simple_fetch(path: &str, client: FetchClient) -> anyhow::Result<()> { diff --git a/turbopack/crates/turbo-tasks-fs/src/lib.rs b/turbopack/crates/turbo-tasks-fs/src/lib.rs index 59205def86e6a..6e24d176978b2 100644 --- a/turbopack/crates/turbo-tasks-fs/src/lib.rs +++ b/turbopack/crates/turbo-tasks-fs/src/lib.rs @@ -46,7 +46,6 @@ use dunce::simplified; use indexmap::IndexSet; use jsonc_parser::{ParseOptions, parse_to_serde_value}; use mime::Mime; -use rayon::iter::{IntoParallelIterator, ParallelIterator}; use rustc_hash::FxHashSet; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -56,7 +55,7 @@ use turbo_rcstr::{RcStr, rcstr}; use turbo_tasks::{ ApplyEffectsContext, Completion, InvalidationReason, Invalidator, NonLocalValue, ReadRef, ResolvedVc, TaskInput, ValueToString, Vc, debug::ValueDebugFormat, effect, - mark_session_dependent, mark_stateful, trace::TraceRawVcs, + mark_session_dependent, mark_stateful, parallel, trace::TraceRawVcs, }; use turbo_tasks_hash::{DeterministicHash, DeterministicHasher, hash_xxh3_hash64}; @@ -314,19 +313,14 @@ impl DiskFileSystemInner { fn invalidate(&self) { let _span = tracing::info_span!("invalidate filesystem", name = &*self.root).entered(); - let span = tracing::Span::current(); - let handle = tokio::runtime::Handle::current(); let invalidator_map = take(&mut *self.invalidator_map.lock().unwrap()); let dir_invalidator_map = take(&mut *self.dir_invalidator_map.lock().unwrap()); - let iter = invalidator_map - .into_par_iter() - .chain(dir_invalidator_map.into_par_iter()) - .flat_map(|(_, invalidators)| invalidators.into_par_iter()); - iter.for_each(|(i, _)| { - let _span = span.clone().entered(); - let _guard = handle.enter(); - i.invalidate() - }); + let invalidators = invalidator_map + .into_iter() + .chain(dir_invalidator_map) + .flat_map(|(_, invalidators)| invalidators.into_keys()) + .collect::>(); + parallel::vec_into_for_each(invalidators, |invalidator| invalidator.invalidate()); } /// Invalidates every tracked file in the filesystem. 
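The hunk above replaces rayon's `into_par_iter` chain, where every callback had to re-enter the tokio runtime handle and tracing span by hand, with an eager `collect` into a `Vec` followed by `parallel::vec_into_for_each`, which captures both contexts once. A condensed sketch of the before/after shape, with identifiers abbreviated from the surrounding hunk:

```rust
// Before (rayon): each worker restores the span and runtime handle itself.
// invalidators.into_par_iter().for_each(|(i, _)| {
//     let _span = span.clone().entered();
//     let _guard = handle.enter();
//     i.invalidate()
// });

// After: collect once, then fan out on the tokio pool; the parallel helper
// carries the turbo-tasks and tracing context for every chunk.
let invalidators: Vec<Invalidator> = invalidator_map
    .into_iter()
    .flat_map(|(_, invalidators)| invalidators.into_keys())
    .collect();
parallel::vec_into_for_each(invalidators, |invalidator| invalidator.invalidate());
```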
@@ -337,23 +331,19 @@ impl DiskFileSystemInner { reason: impl Fn(&Path) -> R + Sync, ) { let _span = tracing::info_span!("invalidate filesystem", name = &*self.root).entered(); - let span = tracing::Span::current(); - let handle = tokio::runtime::Handle::current(); let invalidator_map = take(&mut *self.invalidator_map.lock().unwrap()); let dir_invalidator_map = take(&mut *self.dir_invalidator_map.lock().unwrap()); - let iter = invalidator_map - .into_par_iter() - .chain(dir_invalidator_map.into_par_iter()) + let invalidators = invalidator_map + .into_iter() + .chain(dir_invalidator_map) .flat_map(|(path, invalidators)| { - let _span = span.clone().entered(); let reason_for_path = reason(&path); invalidators - .into_par_iter() + .into_keys() .map(move |i| (reason_for_path.clone(), i)) - }); - iter.for_each(|(reason, (invalidator, _))| { - let _span = span.clone().entered(); - let _guard = handle.enter(); + }) + .collect::>(); + parallel::vec_into_for_each(invalidators, |(reason, invalidator)| { invalidator.invalidate_with_reason(reason) }); } diff --git a/turbopack/crates/turbo-tasks-fs/src/watcher.rs b/turbopack/crates/turbo-tasks-fs/src/watcher.rs index bc429eb6cb16c..2c7c6ca98ac6f 100644 --- a/turbopack/crates/turbo-tasks-fs/src/watcher.rs +++ b/turbopack/crates/turbo-tasks-fs/src/watcher.rs @@ -16,13 +16,12 @@ use notify::{ Config, EventKind, PollWatcher, RecommendedWatcher, RecursiveMode, Watcher, event::{MetadataKind, ModifyKind, RenameMode}, }; -use rayon::iter::{IntoParallelIterator, ParallelIterator}; use rustc_hash::FxHashSet; use serde::{Deserialize, Serialize}; use tracing::instrument; use turbo_rcstr::RcStr; use turbo_tasks::{ - FxIndexSet, InvalidationReason, InvalidationReasonKind, Invalidator, spawn_thread, + FxIndexSet, InvalidationReason, InvalidationReasonKind, Invalidator, parallel, spawn_thread, util::StaticOrArc, }; @@ -271,40 +270,30 @@ impl DiskWatcher { // We need to invalidate all reads that happened before watching // Best is to start_watching before starting to read { - let span = tracing::info_span!("invalidate filesystem"); - let _span = span.clone().entered(); + let _span = tracing::info_span!("invalidate filesystem").entered(); let invalidator_map = take(&mut *fs_inner.invalidator_map.lock().unwrap()); let dir_invalidator_map = take(&mut *fs_inner.dir_invalidator_map.lock().unwrap()); - let iter = invalidator_map - .into_par_iter() - .chain(dir_invalidator_map.into_par_iter()); - let handle = tokio::runtime::Handle::current(); + let iter = invalidator_map.into_iter().chain(dir_invalidator_map); if report_invalidation_reason { - iter.flat_map(|(path, invalidators)| { - let _span = span.clone().entered(); - let reason = WatchStart { - name: fs_inner.name.clone(), - // this path is just used for display purposes - path: RcStr::from(path.to_string_lossy()), - }; - invalidators - .into_par_iter() - .map(move |i| (reason.clone(), i)) - }) - .for_each(|(reason, (invalidator, _))| { - let _span = span.clone().entered(); - let _guard = handle.enter(); - invalidator.invalidate_with_reason(reason) + let invalidators = iter + .flat_map(|(path, invalidators)| { + let reason = WatchStart { + name: fs_inner.name.clone(), + // this path is just used for display purposes + path: RcStr::from(path.to_string_lossy()), + }; + invalidators.into_iter().map(move |i| (reason.clone(), i)) + }) + .collect::>(); + parallel::vec_into_for_each(invalidators, |(reason, (invalidator, _))| { + invalidator.invalidate_with_reason(reason); }); } else { - iter.flat_map(|(_, invalidators)| { - let 
_span = span.clone().entered(); - invalidators.into_par_iter().map(move |i| i) - }) - .for_each(|(invalidator, _)| { - let _span = span.clone().entered(); - let _guard = handle.enter(); - invalidator.invalidate() + let invalidators = iter + .flat_map(|(_, invalidators)| invalidators.into_keys()) + .collect::>(); + parallel::vec_into_for_each(invalidators, |invalidator| { + invalidator.invalidate(); }); } } diff --git a/turbopack/crates/turbo-tasks-malloc/src/lib.rs b/turbopack/crates/turbo-tasks-malloc/src/lib.rs index 5f2df85ee6282..194d2796d843b 100644 --- a/turbopack/crates/turbo-tasks-malloc/src/lib.rs +++ b/turbopack/crates/turbo-tasks-malloc/src/lib.rs @@ -3,6 +3,7 @@ mod counter; use std::{ alloc::{GlobalAlloc, Layout}, marker::PhantomData, + ops::{Add, AddAssign}, }; use self::counter::{add, flush, get, remove, update}; @@ -16,12 +17,45 @@ pub struct AllocationInfo { } impl AllocationInfo { + pub const ZERO: Self = Self { + allocations: 0, + deallocations: 0, + allocation_count: 0, + deallocation_count: 0, + }; + pub fn is_empty(&self) -> bool { self.allocations == 0 && self.deallocations == 0 && self.allocation_count == 0 && self.deallocation_count == 0 } + + pub fn memory_usage(&self) -> usize { + self.allocations.saturating_sub(self.deallocations) + } +} + +impl Add for AllocationInfo { + type Output = Self; + + fn add(self, other: Self) -> Self { + Self { + allocations: self.allocations + other.allocations, + deallocations: self.deallocations + other.deallocations, + allocation_count: self.allocation_count + other.allocation_count, + deallocation_count: self.deallocation_count + other.deallocation_count, + } + } +} + +impl AddAssign for AllocationInfo { + fn add_assign(&mut self, other: Self) { + self.allocations += other.allocations; + self.deallocations += other.deallocations; + self.allocation_count += other.allocation_count; + self.deallocation_count += other.deallocation_count; + } } #[derive(Default, Clone, Debug)] diff --git a/turbopack/crates/turbo-tasks/src/capture_future.rs b/turbopack/crates/turbo-tasks/src/capture_future.rs index 6bd646f942794..4f791869dbab4 100644 --- a/turbopack/crates/turbo-tasks/src/capture_future.rs +++ b/turbopack/crates/turbo-tasks/src/capture_future.rs @@ -31,8 +31,7 @@ pin_project! 
{ #[pin] future: F, duration: Duration, - allocations: usize, - deallocations: usize, + allocations: AllocationInfo, } } @@ -41,8 +40,7 @@ impl> CaptureFuture { Self { future, duration: Duration::ZERO, - allocations: 0, - deallocations: 0, + allocations: AllocationInfo::ZERO, } } } @@ -77,6 +75,17 @@ pub struct TurboTasksPanic { pub location: Option, } +impl TurboTasksPanic { + pub fn into_panic(self) -> Box { + Box::new(format!( + "{} at {}", + self.message, + self.location + .unwrap_or_else(|| "unknown location".to_string()) + )) + } +} + impl Display for TurboTasksPanic { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.message) @@ -84,7 +93,7 @@ impl Display for TurboTasksPanic { } impl> Future for CaptureFuture { - type Output = (Result, Duration, usize); + type Output = (Result, Duration, AllocationInfo); fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let this = self.project(); @@ -127,17 +136,10 @@ impl> Future for CaptureFuture { let elapsed = start.elapsed(); let allocations = start_allocations.until_now(); *this.duration += elapsed + data.duration; - *this.allocations += allocations.allocations + data.allocations; - *this.deallocations += allocations.deallocations + data.deallocations; + *this.allocations += allocations; match result { - Err(err) => { - let memory_usage = this.allocations.saturating_sub(*this.deallocations); - Poll::Ready((Err(err), *this.duration, memory_usage)) - } - Ok(Poll::Ready(r)) => { - let memory_usage = this.allocations.saturating_sub(*this.deallocations); - Poll::Ready((Ok(r), *this.duration, memory_usage)) - } + Err(err) => Poll::Ready((Err(err), *this.duration, this.allocations.clone())), + Ok(Poll::Ready(r)) => Poll::Ready((Ok(r), *this.duration, this.allocations.clone())), Ok(Poll::Pending) => Poll::Pending, } } diff --git a/turbopack/crates/turbo-tasks/src/effect.rs b/turbopack/crates/turbo-tasks/src/effect.rs index 0b893b2a128ad..6373306853477 100644 --- a/turbopack/crates/turbo-tasks/src/effect.rs +++ b/turbopack/crates/turbo-tasks/src/effect.rs @@ -1,6 +1,5 @@ use std::{ any::{Any, TypeId}, - borrow::Cow, future::Future, mem::replace, panic, @@ -8,20 +7,20 @@ use std::{ sync::Arc, }; -use anyhow::{Result, anyhow}; +use anyhow::Result; use auto_hash_map::AutoSet; use futures::{StreamExt, TryStreamExt}; use parking_lot::Mutex; use rustc_hash::{FxHashMap, FxHashSet}; use tokio::task_local; -use tracing::{Instrument, Span}; +use tracing::Instrument; use crate::{ self as turbo_tasks, CollectiblesSource, NonLocalValue, ReadRef, ResolvedVc, TryJoinIterExt, debug::ValueDebugFormat, emit, event::{Event, EventListener}, - manager::turbo_tasks_future_scope, + spawn, trace::TraceRawVcs, util::SharedError, }; @@ -98,28 +97,10 @@ impl EffectInstance { listener.await; } State::NotStarted(EffectInner { future }) => { - let join_handle = tokio::spawn(ApplyEffectsContext::in_current_scope( - turbo_tasks_future_scope(turbo_tasks::turbo_tasks(), future) - .instrument(Span::current()), - )); + let join_handle = spawn(ApplyEffectsContext::in_current_scope(future)); let result = match join_handle.await { - Ok(Err(err)) => Err(SharedError::new(err)), - Err(err) => { - let any = err.into_panic(); - let panic = match any.downcast::() { - Ok(owned) => Some(Cow::Owned(*owned)), - Err(any) => match any.downcast::<&'static str>() { - Ok(str) => Some(Cow::Borrowed(*str)), - Err(_) => None, - }, - }; - Err(SharedError::new(if let Some(panic) = panic { - anyhow!("Task effect panicked: {panic}") - } else { - anyhow!("Task 
effect panicked") - })) - } - Ok(Ok(())) => Ok(()), + Err(err) => Err(SharedError::new(err)), + Ok(()) => Ok(()), }; let event = { let mut guard = self.inner.lock(); diff --git a/turbopack/crates/turbo-tasks/src/lib.rs b/turbopack/crates/turbo-tasks/src/lib.rs index 2c8494d6afb89..d5aafe26593ea 100644 --- a/turbopack/crates/turbo-tasks/src/lib.rs +++ b/turbopack/crates/turbo-tasks/src/lib.rs @@ -64,15 +64,16 @@ mod no_move_vec; mod once_map; mod output; pub mod panic_hooks; +pub mod parallel; pub mod persisted_graph; pub mod primitives; mod raw_vc; mod read_options; mod read_ref; pub mod registry; -mod scope; mod serialization_invalidation; pub mod small_duration; +mod spawn; mod state; pub mod task; mod task_execution_reason; @@ -107,17 +108,18 @@ pub use manager::{ CurrentCellRef, ReadConsistency, TaskPersistence, TurboTasks, TurboTasksApi, TurboTasksBackendApi, TurboTasksBackendApiExt, TurboTasksCallApi, Unused, UpdateInfo, dynamic_call, emit, mark_finished, mark_root, mark_session_dependent, mark_stateful, - prevent_gc, run_once, run_once_with_reason, spawn_blocking, spawn_thread, trait_call, - turbo_tasks, turbo_tasks_scope, + prevent_gc, run_once, run_once_with_reason, trait_call, turbo_tasks, turbo_tasks_scope, }; pub use output::OutputContent; pub use raw_vc::{CellId, RawVc, ReadRawVcFuture, ResolveTypeError}; pub use read_options::ReadCellOptions; pub use read_ref::ReadRef; use rustc_hash::FxHasher; -pub use scope::scope; pub use serialization_invalidation::SerializationInvalidator; pub use shrink_to_fit::ShrinkToFit; +pub use spawn::{ + JoinHandle, block_for_future, block_in_place, spawn, spawn_blocking, spawn_thread, +}; pub use state::{State, TransientState}; pub use task::{SharedReference, TypedSharedReference, task_input::TaskInput}; pub use task_execution_reason::TaskExecutionReason; diff --git a/turbopack/crates/turbo-tasks/src/manager.rs b/turbopack/crates/turbo-tasks/src/manager.rs index eaa31f420ed5d..68e3a784fb62e 100644 --- a/turbopack/crates/turbo-tasks/src/manager.rs +++ b/turbopack/crates/turbo-tasks/src/manager.rs @@ -8,7 +8,6 @@ use std::{ Arc, Mutex, RwLock, Weak, atomic::{AtomicBool, AtomicUsize, Ordering}, }, - thread, time::{Duration, Instant}, }; @@ -17,10 +16,9 @@ use auto_hash_map::AutoMap; use rustc_hash::FxHasher; use serde::{Deserialize, Serialize}; use smallvec::SmallVec; -use tokio::{runtime::Handle, select, sync::mpsc::Receiver, task_local}; +use tokio::{select, sync::mpsc::Receiver, task_local}; use tokio_util::task::TaskTracker; -use tracing::{Instrument, Level, Span, info_span, instrument, trace_span}; -use turbo_tasks_malloc::TurboMalloc; +use tracing::{Instrument, Level, instrument, trace_span}; use crate::{ Completion, InvalidationReason, InvalidationReasonSet, OutputContent, ReadCellOptions, @@ -30,7 +28,7 @@ use crate::{ Backend, CachedTaskType, CellContent, TaskCollectiblesMap, TaskExecutionSpec, TransientTaskType, TurboTasksExecutionError, TypedCellContent, }, - capture_future::{self, CaptureFuture}, + capture_future::CaptureFuture, event::{Event, EventListener}, id::{BackendJobId, ExecutionId, LocalTaskId, TRANSIENT_TASK_BIT, TraitTypeId}, id_factory::IdFactoryWithReuse, @@ -718,7 +716,7 @@ impl TurboTasks { }; async { - let (result, duration, memory_usage) = CaptureFuture::new(future).await; + let (result, duration, alloc_info) = CaptureFuture::new(future).await; // wait for all spawned local tasks using `local` to finish let ltt = CURRENT_TASK_STATE @@ -742,7 +740,7 @@ impl TurboTasks { let schedule_again = 
this.backend.task_execution_completed(
     task_id,
     duration,
-    memory_usage,
+    alloc_info.memory_usage(),
     &cell_counters,
     stateful,
     has_invalidator,
@@ -1060,27 +1058,30 @@ impl<B: Backend + 'static> TurboTasks<B> {
     }
 
     pub async fn stop_and_wait(&self) {
-        self.backend.stopping(self);
-        self.stopped.store(true, Ordering::Release);
-        {
-            let listener = self
-                .event
-                .listen_with_note(|| || "wait for stop".to_string());
-            if self.currently_scheduled_tasks.load(Ordering::Acquire) != 0 {
-                listener.await;
+        turbo_tasks_future_scope(self.pin(), async move {
+            self.backend.stopping(self);
+            self.stopped.store(true, Ordering::Release);
+            {
+                let listener = self
+                    .event
+                    .listen_with_note(|| || "wait for stop".to_string());
+                if self.currently_scheduled_tasks.load(Ordering::Acquire) != 0 {
+                    listener.await;
+                }
             }
-        }
-        {
-            let listener = self.event_background.listen();
-            if self
-                .currently_scheduled_background_jobs
-                .load(Ordering::Acquire)
-                != 0 {
-                listener.await;
+            let listener = self.event_background.listen();
+            if self
+                .currently_scheduled_background_jobs
+                .load(Ordering::Acquire)
+                != 0
+            {
+                listener.await;
+            }
             }
-        }
-        self.backend.stop(self);
+            self.backend.stop(self);
+        })
+        .await;
     }
 
     #[track_caller]
@@ -1677,6 +1678,10 @@ pub fn turbo_tasks() -> Arc<dyn TurboTasksApi> {
     TURBO_TASKS.with(|arc| arc.clone())
 }
 
+pub fn try_turbo_tasks() -> Option<Arc<dyn TurboTasksApi>> {
+    TURBO_TASKS.try_with(|arc| arc.clone()).ok()
+}
+
 pub fn with_turbo_tasks<T>(func: impl FnOnce(&Arc<dyn TurboTasksApi>) -> T) -> T {
     TURBO_TASKS.with(|arc| func(arc))
 }
@@ -1685,6 +1690,14 @@ pub fn turbo_tasks_scope<T>(tt: Arc<dyn TurboTasksApi>, f: impl FnOnce() -> T) -> T {
     TURBO_TASKS.sync_scope(tt, f)
 }
 
+pub fn turbo_tasks_try_scope<T>(tt: Option<Arc<dyn TurboTasksApi>>, f: impl FnOnce() -> T) -> T {
+    if let Some(tt) = tt {
+        TURBO_TASKS.sync_scope(tt, f)
+    } else {
+        f()
+    }
+}
+
 pub fn turbo_tasks_future_scope<T>(
     tt: Arc<dyn TurboTasksApi>,
     f: impl Future<Output = T>,
 )
@@ -1787,35 +1800,6 @@ pub fn emit(collectible: ResolvedVc) {
     })
 }
 
-pub async fn spawn_blocking<T: Send + 'static>(func: impl FnOnce() -> T + Send + 'static) -> T {
-    let turbo_tasks = turbo_tasks();
-    let span = Span::current();
-    let (result, duration, alloc_info) = tokio::task::spawn_blocking(|| {
-        let _guard = span.entered();
-        let start = Instant::now();
-        let start_allocations = TurboMalloc::allocation_counters();
-        let r = turbo_tasks_scope(turbo_tasks, func);
-        (r, start.elapsed(), start_allocations.until_now())
-    })
-    .await
-    .unwrap();
-    capture_future::add_duration(duration);
-    capture_future::add_allocation_info(alloc_info);
-    result
-}
-
-pub fn spawn_thread(func: impl FnOnce() + Send + 'static) {
-    let handle = Handle::current();
-    let span = info_span!("thread").or_current();
-    thread::spawn(move || {
-        let span = span.entered();
-        let guard = handle.enter();
-        func();
-        drop(guard);
-        drop(span);
-    });
-}
-
 pub(crate) async fn read_task_output(
     this: &dyn TurboTasksApi,
     id: TaskId,
diff --git a/turbopack/crates/turbo-tasks/src/parallel.rs b/turbopack/crates/turbo-tasks/src/parallel.rs
new file mode 100644
index 0000000000000..751d3098e5a61
--- /dev/null
+++ b/turbopack/crates/turbo-tasks/src/parallel.rs
@@ -0,0 +1,535 @@
+//! Parallel for-each and map operations running on the current tokio thread pool while
+//! maintaining the turbo-tasks and tracing context.
+//!
+//! This avoids the problem of sleeping threads with mimalloc when using rayon in combination with
+//! tokio. It also avoids having multiple thread pools.
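The module's public helpers mirror rayon's `par_iter().for_each()` and `map().collect()` shapes. A small usage sketch, based on the tests further down in this file; it works with or without an ambient turbo-tasks context, since the module falls back through `try_turbo_tasks`:

```rust
// Inside a multi-thread tokio runtime:
let input = vec![1, 2, 3, 4, 5];

// Fan chunks out onto the tokio pool, blocking the current task in place.
parallel::for_each(&input, |&x| {
    // Work here runs with the caller's tracing span and
    // turbo-tasks context restored.
    let _ = x;
});

// Parallel map that collects into any FromIterator target.
let doubled: Vec<i32> = parallel::map_collect(&input, |&x| x * 2);
assert_eq!(doubled, vec![2, 4, 6, 8, 10]);
```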
+
+use std::{
+    mem::{ManuallyDrop, transmute},
+    panic,
+    sync::{Arc, LazyLock},
+    thread::available_parallelism,
+};
+
+use tokio::{
+    runtime::Handle,
+    task::{JoinHandle, block_in_place},
+};
+
+use tracing::{Instrument, Span};
+
+use crate::{
+    TurboTasksApi,
+    manager::{try_turbo_tasks, turbo_tasks_try_scope},
+};
+
+/// Calculates a good chunk size for parallel processing based on the number of available threads.
+/// This is used to ensure that the workload is evenly distributed across the threads.
+fn good_chunk_size(len: usize) -> usize {
+    static GOOD_CHUNK_COUNT: LazyLock<usize> =
+        LazyLock::new(|| available_parallelism().map_or(16, |c| c.get() * 4));
+    let min_chunk_count = *GOOD_CHUNK_COUNT;
+    len.div_ceil(min_chunk_count)
+}
+
+/// Context to allow spawning a task with a limited lifetime.
+///
+/// ## Safety
+///
+/// This context must not be dropped before all tasks spawned with it have been awaited.
+struct ProcessInParallelContext<'l, R: Send + 'l> {
+    results: Box<[Option<R>]>,
+    index: usize,
+    handle: Handle,
+    turbo_tasks: Option<Arc<dyn TurboTasksApi>>,
+    span: Span,
+    phantom: std::marker::PhantomData<&'l ()>,
+}
+
+impl<'l, R: Send + 'l> ProcessInParallelContext<'l, R> {
+    fn new(len: usize) -> Self {
+        let mut results = Vec::with_capacity(len);
+        for _ in 0..len {
+            results.push(None);
+        }
+        Self {
+            results: results.into_boxed_slice(),
+            index: 0,
+            handle: Handle::current(),
+            turbo_tasks: try_turbo_tasks(),
+            span: Span::current(),
+            phantom: std::marker::PhantomData,
+        }
+    }
+
+    fn task<F>(&mut self, f: F) -> JoinHandle<()>
+    where
+        F: FnOnce() -> R + Send + 'l,
+    {
+        struct SendablePtr<R>(*mut Option<R>);
+        unsafe impl<R> Send for SendablePtr<R> {}
+        unsafe impl<R> Sync for SendablePtr<R> {}
+        impl<R> SendablePtr<R> {
+            fn new(reference: &mut Option<R>) -> Self {
+                SendablePtr(reference as *mut Option<R>)
+            }
+
+            unsafe fn get_mut(&mut self) -> &mut Option<R> {
+                // SAFETY: This is a valid pointer, as we got this pointer from a reference.
+                unsafe { &mut *self.0 }
+            }
+        }
+
+        let mut result_cell = SendablePtr::new(&mut self.results[self.index]);
+        self.index += 1;
+
+        let f: Box<dyn FnOnce() + Send + 'l> = Box::new(move || {
+            let result = f();
+            // SAFETY: This is a valid pointer, as we got this pointer from a reference.
+            let result_cell = unsafe { result_cell.get_mut() };
+            *result_cell = Some(result);
+        });
+        // SAFETY: In `process_in_parallel` we ensure that the spawned tasks are awaited before
+        // the lifetime `'l` ends.
+        let f: Box<dyn FnOnce() + Send + 'static> = unsafe {
+            transmute::<Box<dyn FnOnce() + Send + 'l>, Box<dyn FnOnce() + Send + 'static>>(f)
+        };
+        let turbo_tasks = self.turbo_tasks.clone();
+        let span = self.span.clone();
+        self.handle.spawn(async move {
+            turbo_tasks_try_scope(turbo_tasks, || {
+                let _guard = span.entered();
+                f();
+            })
+        })
+    }
+
+    /// Converts the context into a vector of results.
+    ///
+    /// ## Safety
+    ///
+    /// The caller must ensure that all tasks have been awaited before calling this method.
+    unsafe fn into_results(self) -> Vec<Option<R>> {
+        self.results.into_vec()
+    }
+}
+
+/// Helper method to spawn tasks in parallel, ensuring that all tasks are awaited and errors are
+/// handled. Also ensures turbo tasks and tracing context are maintained across the tasks.
+///
+/// ## Safety
+///
+/// The caller must ensure that all references in `inner` are valid for the lifetime `'l`.
+unsafe fn process_in_parallel<'l, I, R>(len: usize, inner: I) -> Vec<Option<R>>
+where
+    R: Send + 'l,
+    I: FnOnce(&mut ProcessInParallelContext<'l, R>) -> Vec<JoinHandle<()>> + 'l,
+{
+    let mut process_context = ProcessInParallelContext::new(len);
+    block_in_place(|| {
+        let tasks = inner(&mut process_context);
+        process_context.handle.block_on(
+            async {
+                let mut first_err = None;
+                for task in tasks {
+                    match task.await {
+                        Ok(()) => {}
+                        Err(err) if first_err.is_none() => {
+                            // SAFETY: We need to finish all tasks before panicking.
+                            first_err = Some(err);
+                        }
+                        Err(_) => {
+                            // Ignore subsequent errors
+                        }
+                    }
+                }
+                if let Some(err) = first_err {
+                    panic::resume_unwind(err.into_panic());
+                }
+            }
+            .instrument(process_context.span.clone()),
+        );
+    });
+    // SAFETY: We ensure that all tasks have been awaited before calling this method.
+    unsafe { process_context.into_results() }
+}
+
+pub fn for_each<'l, T, F>(items: &'l [T], f: F)
+where
+    T: Sync,
+    F: Fn(&'l T) + Send + Sync,
+{
+    let len = items.len();
+    if len == 0 {
+        return;
+    }
+    let chunk_size = good_chunk_size(len);
+    let f = &f;
+    // SAFETY: We ensured that references in the closure are valid for the whole lifetime of this
+    // function.
+    unsafe {
+        process_in_parallel(len.div_ceil(chunk_size), |ctx| {
+            items
+                .chunks(chunk_size)
+                .map(|chunk| {
+                    ctx.task(move || {
+                        for item in chunk {
+                            f(item);
+                        }
+                    })
+                })
+                .collect::<Vec<_>>()
+        })
+    };
+    // SAFETY: Ensure references are kept until here
+    let _ = items;
+    let _ = f;
+}
+
+pub fn vec_into_for_each<T>(items: Vec<T>, f: impl Fn(T) + Send + Sync)
+where
+    T: Send + Sync,
+{
+    let len = items.len();
+    if len == 0 {
+        return;
+    }
+    let chunk_size = good_chunk_size(len);
+    let f = &f;
+    // SAFETY: transmuting to ManuallyDrop is always safe. We just need to make sure to not leak
+    // memory.
+    let mut items = unsafe { transmute::<Vec<T>, Vec<ManuallyDrop<T>>>(items) };
+    // SAFETY: We ensured that references in the closure are valid for the whole lifetime of this
+    // function.
+    unsafe {
+        process_in_parallel(len.div_ceil(chunk_size), |ctx| {
+            items
+                .chunks_mut(chunk_size)
+                .map(|chunk| {
+                    ctx.task(move || {
+                        // SAFETY: Even when f() panics we drop all items in the chunk.
+                        for item in MapEvenWhenDropped::new(chunk.iter_mut(), |item| {
+                            ManuallyDrop::take(item)
+                        }) {
+                            f(item);
+                        }
+                    })
+                })
+                .collect::<Vec<_>>()
+        })
+    };
+    // SAFETY: Ensure references are kept until here
+    drop(items);
+    let _ = f;
+}
+
+pub fn try_for_each<'l, T, E>(
+    items: &'l [T],
+    f: impl (Fn(&'l T) -> Result<(), E>) + Send + Sync,
+) -> Result<(), E>
+where
+    T: Sync,
+    E: Send + 'static,
+{
+    let len = items.len();
+    if len == 0 {
+        return Ok(()); // No items to process, return early
+    }
+    let chunk_size = good_chunk_size(len);
+    let f = &f;
+    // SAFETY: We ensured that references in the closure are valid for the whole lifetime of this
+    // function.
+    let results = unsafe {
+        process_in_parallel(len.div_ceil(chunk_size), |ctx| {
+            items
+                .chunks(chunk_size)
+                .map(|chunk| {
+                    ctx.task(move || {
+                        for item in chunk {
+                            f(item)?;
+                        }
+                        Ok(())
+                    })
+                })
+                .collect::<Vec<_>>()
+        })
+    };
+    let result = results.into_iter().flatten().collect::<Result<(), E>>();
+    // SAFETY: Ensure references are kept until here
+    let _ = items;
+    let _ = f;
+    result
+}
+
+pub fn try_for_each_mut<'l, T, E>(
+    items: &'l mut [T],
+    f: impl (Fn(&'l mut T) -> Result<(), E>) + Send + Sync,
+) -> Result<(), E>
+where
+    T: Send + Sync,
+    E: Send + 'static,
+{
+    let len = items.len();
+    if len == 0 {
+        return Ok(()); // No items to process, return early
+    }
+    let chunk_size = good_chunk_size(len);
+    let f = &f;
+    // SAFETY: We ensured that references in the closure are valid for the whole lifetime of this
+    // function.
+    let results = unsafe {
+        process_in_parallel(len.div_ceil(chunk_size), |ctx| {
+            items
+                .chunks_mut(chunk_size)
+                .map(|chunk| {
+                    ctx.task(move || {
+                        for item in chunk {
+                            f(item)?;
+                        }
+                        Ok(())
+                    })
+                })
+                .collect::<Vec<_>>()
+        })
+    };
+    let result = results.into_iter().flatten().collect::<Result<(), E>>();
+    // SAFETY: Ensure references are kept until here
+    let _ = items;
+    let _ = f;
+    result
+}
+
+pub fn try_into_for_each<T, E>(
+    items: Vec<T>,
+    f: impl (Fn(T) -> Result<(), E>) + Send + Sync,
+) -> Result<(), E>
+where
+    T: Send + Sync,
+    E: Send + 'static,
+{
+    let len = items.len();
+    if len == 0 {
+        return Ok(()); // No items to process, return early
+    }
+    let chunk_size = good_chunk_size(len);
+    let f = &f;
+    // SAFETY: transmuting to ManuallyDrop is always safe. We just need to make sure to not leak
+    // memory.
+    let mut items = unsafe { transmute::<Vec<T>, Vec<ManuallyDrop<T>>>(items) };
+    // SAFETY: We ensured that references in the closure are valid for the whole lifetime of this
+    // function.
+    let results = unsafe {
+        process_in_parallel(len.div_ceil(chunk_size), |ctx| {
+            items
+                .chunks_mut(chunk_size)
+                .map(|chunk| {
+                    ctx.task(move || {
+                        // SAFETY: Even when f() panics we drop all items in the chunk.
+                        for item in MapEvenWhenDropped::new(chunk.iter_mut(), |item| {
+                            ManuallyDrop::take(item)
+                        }) {
+                            f(item)?;
+                        }
+                        Ok(())
+                    })
+                })
+                .collect::<Vec<_>>()
+        })
+    };
+    let result = results.into_iter().flatten().collect::<Result<(), E>>();
+    // SAFETY: Ensure references are kept until here
+    let _ = items;
+    let _ = f;
+    result
+}
+
+pub fn map_collect<'l, T, I, R>(items: &'l [T], f: impl Fn(&'l T) -> I + Send + Sync) -> R
+where
+    T: Sync,
+    I: Send + Sync + 'l,
+    R: FromIterator<I>,
+{
+    let len = items.len();
+    if len == 0 {
+        return R::from_iter(std::iter::empty()); // No items to process, return empty collection
+    }
+    let chunk_size = good_chunk_size(len);
+    let f = &f;
+    // SAFETY: We ensured that references in the closure are valid for the whole lifetime of this
+    // function.
+    let results = unsafe {
+        process_in_parallel(len.div_ceil(chunk_size), |ctx| {
+            items
+                .chunks(chunk_size)
+                .map(|chunk| ctx.task(move || chunk.iter().map(f).collect::<Vec<_>>()))
+                .collect::<Vec<_>>()
+        })
+    };
+    let result = results.into_iter().flatten().flatten().collect();
+    // SAFETY: Ensure references are kept until here
+    let _ = items;
+    let _ = f;
+    result
+}
+
+pub fn vec_into_map_collect<'l, T, I, R>(items: Vec<T>, f: impl Fn(T) -> I + Send + Sync) -> R
+where
+    T: Send + Sync,
+    I: Send + Sync + 'l,
+    R: FromIterator<I>,
+{
+    let len = items.len();
+    if len == 0 {
+        return R::from_iter(std::iter::empty()); // No items to process, return empty collection
+    }
+    let chunk_size = good_chunk_size(len);
+    let f = &f;
+    let mut items = unsafe { transmute::<Vec<T>, Vec<ManuallyDrop<T>>>(items) };
+    // SAFETY: We ensured that references in the closure are valid for the whole lifetime of this
+    // function.
+    let results = unsafe {
+        process_in_parallel(len.div_ceil(chunk_size), |ctx| {
+            items
+                .chunks_mut(chunk_size)
+                .map(|chunk| {
+                    ctx.task(move || {
+                        // SAFETY: Even when f() panics we drop all items in the chunk.
+                        MapEvenWhenDropped::new(chunk.iter_mut(), |item| ManuallyDrop::take(item))
+                            .map(f)
+                            .collect::<Vec<_>>()
+                    })
+                })
+                .collect::<Vec<_>>()
+        })
+    };
+    let result = results.into_iter().flatten().flatten().collect();
+    // SAFETY: Ensure references are kept until here
+    let _ = items;
+    let _ = f;
+    result
+}
+
+struct MapEvenWhenDropped<B, I, F>
+where
+    I: Iterator,
+    F: FnMut(I::Item) -> B,
+{
+    iter: I,
+    f: F,
+}
+
+impl<B, I, F> MapEvenWhenDropped<B, I, F>
+where
+    I: Iterator,
+    F: FnMut(I::Item) -> B,
+{
+    fn new(iter: I, f: F) -> Self {
+        Self { iter, f }
+    }
+}
+
+impl<B, I, F> Iterator for MapEvenWhenDropped<B, I, F>
+where
+    I: Iterator,
+    F: FnMut(I::Item) -> B,
+{
+    type Item = B;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.iter.next().map(&mut self.f)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.iter.size_hint()
+    }
+}
+
+impl<B, I, F> Drop for MapEvenWhenDropped<B, I, F>
+where
+    I: Iterator,
+    F: FnMut(I::Item) -> B,
+{
+    fn drop(&mut self) {
+        // Ensure that the mapping function is called even when the iterator is dropped.
+        for item in &mut self.iter {
+            drop((self.f)(item));
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::atomic::{AtomicI32, Ordering};
+
+    use super::*;
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_parallel_for_each() {
+        let input = vec![1, 2, 3, 4, 5];
+        let sum = AtomicI32::new(0);
+        for_each(&input, |&x| {
+            sum.fetch_add(x, Ordering::SeqCst);
+        });
+        assert_eq!(sum.load(Ordering::SeqCst), 15);
+    }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_parallel_try_for_each() {
+        let input = vec![1, 2, 3, 4, 5];
+        let result = try_for_each(&input, |&x| {
+            if x % 2 == 0 {
+                Ok(())
+            } else {
+                Err(format!("Odd number {x} encountered"))
+            }
+        });
+        assert!(result.is_err());
+        assert_eq!(result.unwrap_err(), "Odd number 1 encountered");
+    }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_parallel_try_for_each_mut() {
+        let mut input = vec![1, 2, 3, 4, 5];
+        let result = try_for_each_mut(&mut input, |x| {
+            *x += 10;
+            if *x % 2 == 0 {
+                Ok(())
+            } else {
+                Err(format!("Odd number {} encountered", *x))
+            }
+        });
+        assert!(result.is_err());
+        assert_eq!(result.unwrap_err(), "Odd number 11 encountered");
+        assert_eq!(input, vec![11, 12, 13, 14, 15]);
+    }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_parallel_vec_into_for_each() {
+        let input = vec![1, 2, 3, 4, 5];
+        let sum = AtomicI32::new(0);
+        vec_into_for_each(input, |x| {
+            sum.fetch_add(x, Ordering::SeqCst);
+        });
+        assert_eq!(sum.load(Ordering::SeqCst), 15);
+    }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_parallel_map_collect() {
+        let input = vec![1, 2, 3, 4, 5];
+        let result: Vec<_> = map_collect(&input, |&x| x * 2);
+        assert_eq!(result, vec![2, 4, 6, 8, 10]);
+    }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_parallel_into_map_collect() {
+        let input = vec![1, 2, 3, 4, 5];
+        let result: Vec<_> = vec_into_map_collect(input, |x| x * 2);
+        assert_eq!(result, vec![2, 4, 6, 8, 10]);
+    }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_parallel_vec_into_map_collect_many() {
+        let input = vec![1; 1000];
+        let result: Vec<_> = vec_into_map_collect(input, |x| x * 2);
+        assert_eq!(result, vec![2; 1000]);
+    }
+}
diff --git a/turbopack/crates/turbo-tasks/src/scope.rs b/turbopack/crates/turbo-tasks/src/scope.rs
deleted file mode 100644
index bfe5e355df358..0000000000000
--- a/turbopack/crates/turbo-tasks/src/scope.rs
+++ /dev/null
@@ -1,52 +0,0 @@
-use std::sync::Arc;
-
-use crate::{TurboTasksApi, turbo_tasks, turbo_tasks_scope};
-
-/// A wrapper around [`rayon::Scope`] that preserves the [`turbo_tasks_scope`].
-pub struct Scope<'scope, 'a> {
-    scope: &'a rayon::Scope<'scope>,
-    handle: tokio::runtime::Handle,
-    turbo_tasks: Arc<dyn TurboTasksApi>,
-    span: tracing::Span,
-}
-
-impl<'scope> Scope<'scope, '_> {
-    pub fn spawn<Body>(&self, body: Body)
-    where
-        Body: FnOnce(&Scope<'scope, '_>) + Send + 'scope,
-    {
-        let span = self.span.clone();
-        let handle = self.handle.clone();
-        let turbo_tasks = self.turbo_tasks.clone();
-        self.scope.spawn(|scope| {
-            let _span = span.clone().entered();
-            let _guard = handle.enter();
-            turbo_tasks_scope(turbo_tasks.clone(), || {
-                body(&Scope {
-                    scope,
-                    span,
-                    handle,
-                    turbo_tasks,
-                })
-            })
-        });
-    }
-}
-
-/// A wrapper around [`rayon::in_place_scope`] that preserves the [`turbo_tasks_scope`].
-pub fn scope<'scope, Op, R>(op: Op) -> R
-where
-    Op: FnOnce(&Scope<'scope, '_>) -> R,
-{
-    let span = tracing::Span::current();
-    let handle = tokio::runtime::Handle::current();
-    let turbo_tasks = turbo_tasks();
-    rayon::in_place_scope(|scope| {
-        op(&Scope {
-            scope,
-            span,
-            handle,
-            turbo_tasks,
-        })
-    })
-}
diff --git a/turbopack/crates/turbo-tasks/src/spawn.rs b/turbopack/crates/turbo-tasks/src/spawn.rs
new file mode 100644
index 0000000000000..fb0eaf932e919
--- /dev/null
+++ b/turbopack/crates/turbo-tasks/src/spawn.rs
@@ -0,0 +1,106 @@
+use std::{
+    panic::resume_unwind,
+    pin::Pin,
+    task::{Context, Poll},
+    thread,
+    time::{Duration, Instant},
+};
+
+use anyhow::Result;
+use futures::{FutureExt, ready};
+use tokio::runtime::Handle;
+use tracing::{Instrument, Span, info_span};
+use turbo_tasks_malloc::{AllocationInfo, TurboMalloc};
+
+use crate::{
+    TurboTasksPanic,
+    capture_future::{self, CaptureFuture},
+    manager::turbo_tasks_future_scope,
+    turbo_tasks, turbo_tasks_scope,
+};
+
+pub struct JoinHandle<T> {
+    join_handle: tokio::task::JoinHandle<(Result<T, TurboTasksPanic>, Duration, AllocationInfo)>,
+}
+
+impl<T> Future for JoinHandle<T> {
+    type Output = T;
+
+    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        let this = self.get_mut();
+        match ready!(this.join_handle.poll_unpin(cx)) {
+            Ok((res, duration, alloc_info)) => {
+                capture_future::add_duration(duration);
+                capture_future::add_allocation_info(alloc_info);
+                match res {
+                    Ok(res) => Poll::Ready(res),
+                    Err(e) => resume_unwind(e.into_panic()),
+                }
+            }
+            Err(e) => resume_unwind(e.into_panic()),
+        }
+    }
+}
+
+/// Spawns a future as a separate task and returns a JoinHandle which can be used to await the
+/// result. The future has access to the current TurboTasks context and runs in the same tracing
+/// span. Allocations and CPU time are accounted to the current turbo-tasks function.
+pub fn spawn<T: Send + 'static>(
+    future: impl Future<Output = T> + Send + 'static,
+) -> JoinHandle<T> {
+    let turbo_tasks = turbo_tasks();
+    let span = Span::current();
+    let join_handle = tokio::task::spawn(
+        turbo_tasks_future_scope(turbo_tasks, CaptureFuture::new(future)).instrument(span),
+    );
+    JoinHandle { join_handle }
+}
+
+/// Spawns a blocking function in a separate task using the blocking pool and returns a JoinHandle
+/// which can be used to await the result. The function has access to the current TurboTasks
+/// context and runs in the same tracing span. Allocations and CPU time are accounted to the
+/// current turbo-tasks function.
+pub fn spawn_blocking<T: Send + 'static>(
+    func: impl FnOnce() -> T + Send + 'static,
+) -> JoinHandle<T> {
+    let turbo_tasks = turbo_tasks();
+    let span = Span::current();
+    let join_handle = tokio::task::spawn_blocking(|| {
+        let _guard = span.entered();
+        let start = Instant::now();
+        let start_allocations = TurboMalloc::allocation_counters();
+        let r = turbo_tasks_scope(turbo_tasks, func);
+        (Ok(r), start.elapsed(), start_allocations.until_now())
+    });
+    JoinHandle { join_handle }
+}
+
+/// Spawns a thread which runs in the background. It has access to the current TurboTasks context,
+/// but is not accounted towards the current turbo-tasks function.
+pub fn spawn_thread(func: impl FnOnce() + Send + 'static) {
+    let handle = Handle::current();
+    let span = info_span!("thread").or_current();
+    let turbo_tasks = turbo_tasks();
+    thread::spawn(move || {
+        let _span = span.entered();
+        turbo_tasks_scope(turbo_tasks, || {
+            let _guard = handle.enter();
+            func();
+        })
+    });
+}
+
+/// Tells the scheduler about blocking work happening in the current thread.
diff --git a/turbopack/crates/turbopack/tests/node-file-trace.rs b/turbopack/crates/turbopack/tests/node-file-trace.rs
index 4670a3905dddb..8f4b0aa3d00be 100644
--- a/turbopack/crates/turbopack/tests/node-file-trace.rs
+++ b/turbopack/crates/turbopack/tests/node-file-trace.rs
@@ -272,7 +272,7 @@ fn test_cases() {}
 
 #[apply(test_cases)]
 fn node_file_trace_noop_backing_storage(#[case] input: CaseInput) {
-    node_file_trace(input, "noop_backing_storage", false, 1, 120, |_| {
+    node_file_trace(input, "noop_backing_storage", 1, 120, |_| {
         TurboTasks::new(TurboTasksBackend::new(
             turbo_tasks_backend::BackendOptions::default(),
             turbo_tasks_backend::noop_backing_storage(),
@@ -282,7 +282,7 @@ fn node_file_trace_noop_backing_storage(#[case] input: CaseInput) {
 
 #[apply(test_cases)]
 fn node_file_trace_persistent(#[case] input: CaseInput) {
-    node_file_trace(input, "persistent_cache", false, 2, 240, |directory_path| {
+    node_file_trace(input, "persistent_cache", 2, 240, |directory_path| {
         TurboTasks::new(TurboTasksBackend::new(
             turbo_tasks_backend::BackendOptions::default(),
             turbo_tasks_backend::default_backing_storage(
@@ -302,31 +302,18 @@ fn node_file_trace_persistent(#[case] input: CaseInput) {
 
 #[cfg(feature = "bench_against_node_nft")]
 #[apply(test_cases)]
-fn bench_against_node_nft_st(#[case] input: CaseInput) {
-    bench_against_node_nft_inner(input, false);
+fn bench_against_node_nft(#[case] input: CaseInput) {
+    bench_against_node_nft_inner(input);
 }
 
 #[cfg(feature = "bench_against_node_nft")]
-#[apply(test_cases)]
-fn bench_against_node_nft_mt(#[case] input: CaseInput) {
-    bench_against_node_nft_inner(input, true);
-}
-
-#[cfg(feature = "bench_against_node_nft")]
-fn bench_against_node_nft_inner(input: CaseInput, multi_threaded: bool) {
-    node_file_trace(
-        input,
-        "noop_backing_storage",
-        multi_threaded,
-        1,
-        120,
-        |_| {
-            TurboTasks::new(TurboTasksBackend::new(
-                turbo_tasks_backend::BackendOptions::default(),
-                turbo_tasks_backend::noop_backing_storage(),
-            ))
-        },
-    );
+fn bench_against_node_nft_inner(input: CaseInput) {
+    node_file_trace(input, "noop_backing_storage", 1, 120, |_| {
+        TurboTasks::new(TurboTasksBackend::new(
+            turbo_tasks_backend::BackendOptions::default(),
+            turbo_tasks_backend::noop_backing_storage(),
+        ))
+    });
 }
 
 #[turbo_tasks::function(operation)]
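The remaining hunks of this test drop the multi_threaded flag and always build a multi-thread runtime. That is more than a simplification: tokio::task::block_in_place, which the new spawn.rs helpers above rely on, panics when called from a current_thread runtime, so the harness must run multi-threaded. A minimal sketch of the resulting setup, with a placeholder async body:

    // Mirrors the harness configuration in the hunk below.
    let r = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .max_blocking_threads(20)
        .build()
        .unwrap();
    r.block_on(async move { /* run the traced test case */ });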
@@ -401,7 +388,6 @@ fn node_file_trace<B: Backend + 'static>(
         expected_stderr,
     }: CaseInput,
     mode: &str,
-    multi_threaded: bool,
     run_count: i32,
     timeout_len: u64,
     create_turbo_tasks: impl Fn(&Path) -> Arc<TurboTasks<B>>,
 ) {
     static BENCH_SUITES: LazyLock<Arc<Mutex<Vec<BenchSuite>>>> =
         LazyLock::new(|| Arc::new(Mutex::new(Vec::new())));
 
     let r = &mut {
-        let mut builder = if multi_threaded {
-            tokio::runtime::Builder::new_multi_thread()
-        } else {
-            tokio::runtime::Builder::new_current_thread()
-        };
+        let mut builder = tokio::runtime::Builder::new_multi_thread();
         builder.enable_all();
-        if !multi_threaded {
-            builder.max_blocking_threads(20);
-        }
+        builder.max_blocking_threads(20);
         builder.build().unwrap()
     };
     r.block_on(async move {
@@ -490,12 +470,7 @@
             bench_suites_lock.push(BenchSuite {
                 suite: input
                     .trim_start_matches("node-file-trace/integration/")
-                    .to_string()
-                    + (if multi_threaded {
-                        " (multi-threaded)"
-                    } else {
-                        ""
-                    }),
+                    .to_string(),
                 is_faster,
                 rust_duration,
                 node_duration,