Skip to content

Commit 4100492

Browse files
Merge branch 'main' into shashank/deny-unknown-fields-rpc
2 parents 230f370 + 188c99b commit 4100492

23 files changed

Lines changed: 336 additions & 93 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535

3636
### Changed
3737

38+
- [#6939](https://github.com/ChainSafe/forest/pull/6939): Refactored snapshot export and garbage collection logic to use a disk-backed hash set for de-duplicating reachable blocks. This results in less RAM usage (~6-7GiB) and more disk usage (~7-8GiB on mainnet).
39+
3840
### Removed
3941

4042
- [#6948](https://github.com/ChainSafe/forest/pull/6948): Removed the `FOREST_FEES_FIP0115HEIGHT` environment variable. The `FIP-0115` will be automatically activated at `FireHorse` network upgrade.

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ num-traits = "0.2"
169169
num_cpus = "1"
170170
nunny = { version = "0.2", features = ["serde", "quickcheck", "schemars1"] }
171171
openrpc-types = "0.5"
172-
parity-db = { version = "0.5" }
172+
parity-db = { version = "0.5", features = ["bytes"] }
173173
parking_lot = { version = "0.12", features = ["deadlock_detection"] }
174174
pastey = "0.2"
175175
pathfinding = "4"

docs/docs/users/guides/gc.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ Always remember to enable GC when moving back to production or long-term testing
5454

5555
During the GC process, Forest consumes extra RAM and disk space temporarily:
5656

57-
- While traversing reachable blocks, it uses 32 bytes of RAM per reachable block.
57+
- While traversing reachable blocks, it uses ~80 MiB of RAM and ~8 GiB of disk space on mainnet (and ~2 GiB on calibnet) for de-duplicating reachable blocks.
5858
- While exporting a lite snapshot, it uses extra disk space before cleaning up parity-db and stale CAR snapshots.
5959

60-
For a typical ~80 GiB mainnet snapshot, this results in ~2.5 GiB of additional RAM and ~80 GiB disk space usage.
60+
For a typical ~80 GiB mainnet snapshot, this results in ~80 MiB of additional RAM and ~90 GiB disk space usage.
6161

6262
### Syncing Pauses or Performance Overheads
6363

src/chain/mod.rs

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ mod weight;
1111
pub use self::{snapshot_format::*, store::*, weight::*};
1212

1313
use crate::blocks::{Tipset, TipsetKey};
14-
use crate::cid_collections::CidHashSet;
14+
use crate::cid_collections::CidHashSetLike;
1515
use crate::db::car::forest::{self, ForestCarFrame, finalize_frame};
1616
use crate::db::{SettingsStore, SettingsStoreExt};
1717
use crate::ipld::stream_chain;
@@ -32,50 +32,61 @@ use std::io::{Read, Seek, SeekFrom};
3232
use std::sync::Arc;
3333
use tokio::io::{AsyncWrite, AsyncWriteExt, BufWriter};
3434

35-
#[derive(Debug, Clone, Default)]
36-
pub struct ExportOptions {
35+
pub struct ExportOptions<S> {
3736
pub skip_checksum: bool,
3837
pub include_receipts: bool,
3938
pub include_events: bool,
4039
pub include_tipset_keys: bool,
41-
pub seen: CidHashSet,
40+
pub seen: S,
4241
}
4342

44-
pub async fn export_from_head<D: Digest>(
43+
impl<S: Default> Default for ExportOptions<S> {
44+
fn default() -> Self {
45+
Self {
46+
skip_checksum: Default::default(),
47+
include_receipts: Default::default(),
48+
include_events: Default::default(),
49+
include_tipset_keys: Default::default(),
50+
seen: Default::default(),
51+
}
52+
}
53+
}
54+
55+
pub async fn export_from_head<D: Digest, S: CidHashSetLike + Send + Sync + 'static>(
4556
db: &Arc<impl Blockstore + SettingsStore + Send + Sync + 'static>,
4657
lookup_depth: ChainEpochDelta,
4758
writer: impl AsyncWrite + Unpin,
48-
options: Option<ExportOptions>,
59+
options: ExportOptions<S>,
4960
) -> anyhow::Result<(Tipset, Option<digest::Output<D>>)> {
5061
let head_key = SettingsStoreExt::read_obj::<TipsetKey>(db, crate::db::setting_keys::HEAD_KEY)?
5162
.context("chain head key not found")?;
5263
let head_ts = Tipset::load_required(&db, &head_key)?;
53-
let digest = export::<D>(db, &head_ts, lookup_depth, writer, options).await?;
64+
let digest = export::<D, S>(db, &head_ts, lookup_depth, writer, options).await?;
5465
Ok((head_ts, digest))
5566
}
5667

5768
/// Exports a Filecoin snapshot in v1 format
5869
/// See <https://github.com/filecoin-project/FIPs/blob/98e33b9fa306959aa0131519eb4cc155522b2081/FRCs/frc-0108.md#v1-specification>
59-
pub async fn export<D: Digest>(
70+
pub async fn export<D: Digest, S: CidHashSetLike + Send + Sync + 'static>(
6071
db: &Arc<impl Blockstore + Send + Sync + 'static>,
6172
tipset: &Tipset,
6273
lookup_depth: ChainEpochDelta,
6374
writer: impl AsyncWrite + Unpin,
64-
options: Option<ExportOptions>,
75+
options: ExportOptions<S>,
6576
) -> anyhow::Result<Option<digest::Output<D>>> {
6677
let roots = tipset.key().to_cids();
67-
export_to_forest_car::<D>(roots, None, db, tipset, lookup_depth, writer, options).await
78+
export_to_forest_car::<D, S>(roots, None, db, tipset, lookup_depth, writer, options).await
6879
}
6980

7081
/// Exports a Filecoin snapshot in v2 format
7182
/// See <https://github.com/filecoin-project/FIPs/blob/98e33b9fa306959aa0131519eb4cc155522b2081/FRCs/frc-0108.md#v2-specification>
72-
pub async fn export_v2<D: Digest, F: Seek + Read>(
83+
pub async fn export_v2<D: Digest, F: Seek + Read, S: CidHashSetLike + Send + Sync + 'static>(
7384
db: &Arc<impl Blockstore + Send + Sync + 'static>,
7485
mut f3: Option<(Cid, F)>,
7586
tipset: &Tipset,
7687
lookup_depth: ChainEpochDelta,
7788
writer: impl AsyncWrite + Unpin,
78-
options: Option<ExportOptions>,
89+
options: ExportOptions<S>,
7990
) -> anyhow::Result<Option<digest::Output<D>>> {
8091
// validate f3 data
8192
if let Some((f3_cid, f3_data)) = &mut f3 {
@@ -121,7 +132,7 @@ pub async fn export_v2<D: Digest, F: Seek + Read>(
121132
});
122133
}
123134

124-
export_to_forest_car::<D>(
135+
export_to_forest_car::<D, S>(
125136
roots,
126137
Some(prefix_data_frames),
127138
db,
@@ -134,23 +145,21 @@ pub async fn export_v2<D: Digest, F: Seek + Read>(
134145
}
135146

136147
#[allow(clippy::too_many_arguments)]
137-
async fn export_to_forest_car<D: Digest>(
148+
async fn export_to_forest_car<D: Digest, S: CidHashSetLike + Send + Sync + 'static>(
138149
roots: NonEmpty<Cid>,
139150
prefix_data_frames: Option<Vec<anyhow::Result<ForestCarFrame>>>,
140151
db: &Arc<impl Blockstore + Send + Sync + 'static>,
141152
tipset: &Tipset,
142153
lookup_depth: ChainEpochDelta,
143154
writer: impl AsyncWrite + Unpin,
144-
options: Option<ExportOptions>,
145-
) -> anyhow::Result<Option<digest::Output<D>>> {
146-
let ExportOptions {
155+
ExportOptions {
147156
skip_checksum,
148157
include_receipts,
149158
include_events,
150159
include_tipset_keys,
151160
seen,
152-
} = options.unwrap_or_default();
153-
161+
}: ExportOptions<S>,
162+
) -> anyhow::Result<Option<digest::Output<D>>> {
154163
if include_events && !include_receipts {
155164
anyhow::bail!("message receipts must be included when events are included");
156165
}
@@ -171,8 +180,8 @@ async fn export_to_forest_car<D: Digest>(
171180
db.shallow_clone(),
172181
tipset.shallow_clone().chain_owned(db.shallow_clone()),
173182
stateroot_lookup_limit,
183+
seen,
174184
)
175-
.with_seen(seen)
176185
.with_message_receipts(include_receipts)
177186
.with_events(include_events)
178187
.with_tipset_keys(include_tipset_keys)

src/chain/tests.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
use super::*;
55
use crate::{
66
blocks::{CachingBlockHeader, Chain4U, Tipset, TipsetKey, chain4u},
7+
cid_collections::CidHashSet,
78
db::{MemoryDB, car::ForestCar},
89
utils::db::CborStoreExt,
910
};
@@ -60,10 +61,25 @@ async fn test_export_inner(version: FilecoinSnapshotVersion) -> anyhow::Result<(
6061

6162
let checksum = match version {
6263
FilecoinSnapshotVersion::V1 => {
63-
export::<Sha256>(&db, &head, 0, &mut car_bytes, None).await?
64+
export::<Sha256, _>(
65+
&db,
66+
&head,
67+
0,
68+
&mut car_bytes,
69+
ExportOptions::<CidHashSet>::default(),
70+
)
71+
.await?
6472
}
6573
FilecoinSnapshotVersion::V2 => {
66-
export_v2::<Sha256, File>(&db, None, &head, 0, &mut car_bytes, None).await?
74+
export_v2::<Sha256, File, _>(
75+
&db,
76+
None,
77+
&head,
78+
0,
79+
&mut car_bytes,
80+
ExportOptions::<CidHashSet>::default(),
81+
)
82+
.await?
6783
}
6884
};
6985

src/cid_collections/hash_set.rs

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,20 @@
22
// SPDX-License-Identifier: Apache-2.0, MIT
33

44
use super::*;
5+
use anyhow::Context as _;
6+
use bytes::Bytes;
57
use cid::Cid;
68

79
#[cfg(doc)]
810
use std::collections::HashSet;
11+
use std::{path::Path, sync::LazyLock};
12+
13+
pub trait CidHashSetLike {
14+
/// Adds a value to the set.
15+
///
16+
/// Returns whether the value was newly inserted.
17+
fn insert(&mut self, cid: Cid) -> anyhow::Result<bool>;
18+
}
919

1020
/// A hash set implemented as a `HashMap` where the value is `()`.
1121
///
@@ -56,6 +66,12 @@ impl CidHashSet {
5666
}
5767
}
5868

69+
impl CidHashSetLike for CidHashSet {
70+
fn insert(&mut self, cid: Cid) -> anyhow::Result<bool> {
71+
Ok(self.insert(cid))
72+
}
73+
}
74+
5975
////////////////////
6076
// Collection Ops //
6177
////////////////////
@@ -73,3 +89,129 @@ impl FromIterator<Cid> for CidHashSet {
7389
this
7490
}
7591
}
92+
93+
/// A file-backed CID hash set.
94+
/// This is intended to be used for large sets of CIDs that may not fit in memory, such as when tracking seen CIDs during a chain export.
95+
pub struct FileBackedCidHashSet {
96+
db: parity_db::Db,
97+
// for dropping the temporary directory when the set is dropped
98+
_dir: tempfile::TempDir,
99+
lru: hashlink::LruCache<SmallCid, ()>,
100+
}
101+
102+
impl FileBackedCidHashSet {
103+
pub fn new(temp_dir_root: impl AsRef<Path>) -> anyhow::Result<Self> {
104+
let dir = tempfile::tempdir_in(temp_dir_root.as_ref()).with_context(|| {
105+
format!(
106+
"failed to create temp dir in {}",
107+
temp_dir_root.as_ref().display(),
108+
)
109+
})?;
110+
let options = parity_db::Options {
111+
path: dir.path().to_path_buf(),
112+
sync_wal: false,
113+
sync_data: false,
114+
stats: false,
115+
salt: None,
116+
columns: vec![
117+
parity_db::ColumnOptions {
118+
uniform: true,
119+
append_only: true,
120+
..Default::default()
121+
},
122+
parity_db::ColumnOptions {
123+
append_only: true,
124+
..Default::default()
125+
},
126+
],
127+
compression_threshold: Default::default(),
128+
};
129+
let db = parity_db::Db::open_or_create(&options).with_context(|| {
130+
format!(
131+
"failed to create temp parity-db at {}",
132+
options.path.display()
133+
)
134+
})?;
135+
Ok(Self {
136+
db,
137+
_dir: dir,
138+
#[allow(clippy::disallowed_methods)]
139+
lru: hashlink::LruCache::new(2 << 19), // ~80MiB for 1M entries
140+
})
141+
}
142+
143+
pub fn new_in_temp_dir() -> anyhow::Result<Self> {
144+
Self::new(std::env::temp_dir())
145+
}
146+
}
147+
148+
impl CidHashSetLike for FileBackedCidHashSet {
149+
fn insert(&mut self, cid: Cid) -> anyhow::Result<bool> {
150+
static EMPTY_VALUE: LazyLock<Bytes> = LazyLock::new(|| Bytes::from_static(&[]));
151+
152+
let small = SmallCid::from(cid);
153+
if self.lru.get(&small).is_some() {
154+
return Ok(false);
155+
}
156+
157+
let (col, key) = match &small {
158+
SmallCid::Inline(c) => (0, c.digest().to_vec()),
159+
SmallCid::Indirect(u) => (1, u.inner().to_bytes()),
160+
};
161+
if self.db.get(col, &key).ok().flatten().is_some() {
162+
self.lru.insert(small, ());
163+
Ok(false)
164+
} else {
165+
self.db.commit_changes_bytes([(
166+
col,
167+
parity_db::Operation::Set(key, EMPTY_VALUE.clone()),
168+
)])?;
169+
self.lru.insert(small, ());
170+
Ok(true)
171+
}
172+
}
173+
}
174+
175+
#[cfg(test)]
impl Default for FileBackedCidHashSet {
    /// Test-only convenience constructor built on `new_in_temp_dir`.
    ///
    /// # Panics
    ///
    /// Panics if the set cannot be created.
    fn default() -> Self {
        let created = Self::new_in_temp_dir();
        created.expect("failed to create FileBackedCidHashSet")
    }
}
181+
182+
#[cfg(test)]
mod tests {
    use super::*;
    use ahash::HashSet;

    // In-memory set: the first insertion of each CID reports "new", the
    // second reports "already present".
    #[quickcheck_macros::quickcheck]
    fn test_cid_hashset(cids: HashSet<Cid>) {
        let mut set = CidHashSet::default();
        for &cid in cids.iter() {
            all_asserts::assert_true!(set.insert(cid), "expected CID to be newly inserted");
        }
        for &cid in cids.iter() {
            all_asserts::assert_false!(set.insert(cid), "expected CID to be present in the set");
        }
    }

    // File-backed set: same insertion contract, and dropping the set removes
    // its temporary directory.
    #[quickcheck_macros::quickcheck]
    fn test_file_backed_cid_hashset(cids: HashSet<Cid>) {
        let mut set = FileBackedCidHashSet::default();
        let backing_dir = set._dir.path().to_path_buf();
        for &cid in cids.iter() {
            let newly_inserted = set.insert(cid).unwrap();
            all_asserts::assert_true!(newly_inserted, "expected CID to be newly inserted");
        }
        for &cid in cids.iter() {
            let newly_inserted = set.insert(cid).unwrap();
            all_asserts::assert_false!(newly_inserted, "expected CID to be present in the set");
        }
        drop(set);
        all_asserts::assert_false!(
            backing_dir.exists(),
            "expected temporary directory to be deleted"
        );
    }
}

0 commit comments

Comments
 (0)