Skip to content

Commit c934f0f

Browse files
authored
fix: no reboot after GC (#6841)
1 parent 7e7196b commit c934f0f

19 files changed

Lines changed: 686 additions & 310 deletions

File tree

docs/docs/users/guides/gc.md

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,8 @@ GC can be triggered manually with `forest-cli chain prune snap`, regardless of whether
2121
Garbage Collection (GC) runs on a regular schedule and follows these steps:
2222

2323
- Export an effective standard lite snapshot in `.forest.car.zst` format.
24-
- Stop the node.
2524
- Purge parity-db columns that serve as non-persistent blockstore.
2625
- Purge old CAR database files.
27-
- Restart the node.
2826

2927
This process keeps the system clean by regularly removing old, unused data.
3028

@@ -67,7 +65,6 @@ While GC runs in the background, it can cause some delays or pauses, particularly
6765

6866
- **Syncing Pauses**: There may be brief interruptions in syncing as resources are allocated for the GC process.
6967
- **Performance Overhead**: While relatively efficient, the chain traversal algorithm could slow down operations slightly.
70-
- **Reboot pauses**: The GC stops the node before cleaning up parity-db and CAR snapshots and then restarts the node, which could take `~10s-~30s` on mainnet
7168

7269
## Disk Usage
7370

src/chain/store/chain_store.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ pub struct ChainStore<DB> {
8585
genesis_block_header: CachingBlockHeader,
8686

8787
/// validated blocks
88-
validated_blocks: Mutex<HashSet<Cid>>,
88+
pub(crate) validated_blocks: Mutex<HashSet<Cid>>,
8989

9090
/// Ethereum mappings store
9191
eth_mappings: Arc<dyn EthMappingsStore + Sync + Send>,

src/chain_sync/bad_block_cache.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ impl BadBlockCache {
4343
pub fn peek(&self, c: &Cid) -> Option<()> {
4444
self.cache.peek_cloned(&(*c).into())
4545
}
46+
47+
pub fn clear(&self) {
48+
self.cache.clear()
49+
}
4650
}
4751

4852
/// Thread-safe LRU cache for tracking recently seen gossip block CIDs.

src/chain_sync/chain_follower.rs

Lines changed: 71 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,17 @@ use tokio::{sync::Notify, task::JoinSet};
4646
use tracing::{debug, error, info, trace, warn};
4747

4848
pub struct ChainFollower<DB> {
49+
/// Tasks
50+
tasks: Arc<Mutex<HashSet<SyncTask>>>,
51+
52+
/// State machine
53+
state_machine: Arc<Mutex<SyncStateMachine<DB>>>,
54+
4955
/// Syncing status of the chain
5056
pub sync_status: SyncStatus,
5157

5258
/// manages retrieving and updates state objects
53-
state_manager: Arc<StateManager<DB>>,
59+
pub state_manager: Arc<StateManager<DB>>,
5460

5561
/// Context to be able to send requests to P2P network
5662
pub network: SyncNetworkContext<DB>,
@@ -93,17 +99,26 @@ impl<DB: Blockstore + Sync + Send + 'static> ChainFollower<DB> {
9399
) -> Self {
94100
crate::def_is_env_truthy!(cache_disabled, "FOREST_DISABLE_BAD_BLOCK_CACHE");
95101
let (tipset_sender, tipset_receiver) = flume::bounded(20);
102+
let tasks: Arc<Mutex<HashSet<SyncTask>>> = Arc::new(Mutex::new(HashSet::default()));
103+
let bad_blocks = if cache_disabled() {
104+
tracing::warn!("bad block cache is disabled by `FOREST_DISABLE_BAD_BLOCK_CACHE`");
105+
None
106+
} else {
107+
Some(Default::default())
108+
};
109+
let state_machine = Arc::new(Mutex::new(SyncStateMachine::new(
110+
state_manager.chain_store().clone(),
111+
bad_blocks.clone(),
112+
stateless_mode,
113+
)));
96114
Self {
115+
tasks,
116+
state_machine,
97117
sync_status: Arc::new(RwLock::new(SyncStatusReport::init())),
98118
state_manager,
99119
network,
100120
genesis,
101-
bad_blocks: if cache_disabled() {
102-
tracing::warn!("bad block cache is disabled by `FOREST_DISABLE_BAD_BLOCK_CACHE`");
103-
None
104-
} else {
105-
Some(Default::default())
106-
},
121+
bad_blocks,
107122
net_handler,
108123
tipset_sender,
109124
tipset_receiver,
@@ -112,16 +127,37 @@ impl<DB: Blockstore + Sync + Send + 'static> ChainFollower<DB> {
112127
}
113128
}
114129

115-
pub async fn run(self) -> anyhow::Result<()> {
130+
/// Reset inner states
131+
pub fn reset(&self) {
132+
let start = Instant::now();
133+
self.tasks.lock().clear();
134+
self.state_manager
135+
.chain_store()
136+
.validated_blocks
137+
.lock()
138+
.clear();
139+
self.state_machine.lock().tipsets.clear();
140+
if let Some(bad_blocks) = &self.bad_blocks {
141+
bad_blocks.clear();
142+
}
143+
tracing::info!(
144+
"chain follower reset, took {}",
145+
humantime::format_duration(start.elapsed())
146+
);
147+
}
148+
149+
pub async fn run(&self) -> anyhow::Result<()> {
116150
chain_follower(
117-
self.state_manager,
118-
self.bad_blocks,
119-
self.net_handler,
120-
self.tipset_receiver,
121-
self.network,
122-
self.mem_pool,
123-
self.sync_status,
124-
self.genesis,
151+
&self.tasks,
152+
&self.state_machine,
153+
&self.state_manager,
154+
self.bad_blocks.clone(),
155+
self.net_handler.clone(),
156+
self.tipset_receiver.clone(),
157+
&self.network,
158+
&self.mem_pool,
159+
&self.sync_status,
160+
&self.genesis,
125161
self.stateless_mode,
126162
)
127163
.await
@@ -130,24 +166,21 @@ impl<DB: Blockstore + Sync + Send + 'static> ChainFollower<DB> {
130166

131167
#[allow(clippy::too_many_arguments)]
132168
// We receive new full tipsets from the p2p swarm, and from miners that use Forest as their frontend.
133-
pub async fn chain_follower<DB: Blockstore + Sync + Send + 'static>(
134-
state_manager: Arc<StateManager<DB>>,
169+
async fn chain_follower<DB: Blockstore + Sync + Send + 'static>(
170+
tasks: &Arc<Mutex<HashSet<SyncTask>>>,
171+
state_machine: &Arc<Mutex<SyncStateMachine<DB>>>,
172+
state_manager: &Arc<StateManager<DB>>,
135173
bad_block_cache: Option<Arc<BadBlockCache>>,
136174
network_rx: flume::Receiver<NetworkEvent>,
137175
tipset_receiver: flume::Receiver<FullTipset>,
138-
network: SyncNetworkContext<DB>,
139-
mem_pool: Arc<MessagePool<Arc<ChainStore<DB>>>>,
140-
sync_status: SyncStatus,
141-
genesis: Tipset,
176+
network: &SyncNetworkContext<DB>,
177+
mem_pool: &Arc<MessagePool<Arc<ChainStore<DB>>>>,
178+
sync_status: &SyncStatus,
179+
genesis: &Tipset,
142180
stateless_mode: bool,
143181
) -> anyhow::Result<()> {
144182
let state_changed = Arc::new(Notify::new());
145-
let state_machine = Arc::new(Mutex::new(SyncStateMachine::new(
146-
state_manager.chain_store().clone(),
147-
bad_block_cache.clone(),
148-
stateless_mode,
149-
)));
150-
let tasks: Arc<Mutex<HashSet<SyncTask>>> = Arc::new(Mutex::new(HashSet::default()));
183+
151184
let seen_block_cache = SeenBlockCache::default();
152185

153186
let mut set = JoinSet::new();
@@ -158,6 +191,8 @@ pub async fn chain_follower<DB: Blockstore + Sync + Send + 'static>(
158191
let state_changed = state_changed.shallow_clone();
159192
let state_machine = state_machine.shallow_clone();
160193
let network = network.shallow_clone();
194+
let mem_pool = mem_pool.shallow_clone();
195+
let genesis = genesis.shallow_clone();
161196
let bad_block_cache = bad_block_cache.shallow_clone();
162197
let seen_block_cache = seen_block_cache.shallow_clone();
163198
async move {
@@ -244,11 +279,13 @@ pub async fn chain_follower<DB: Blockstore + Sync + Send + 'static>(
244279

245280
// When the state machine is updated, we need to update the sync status and spawn tasks
246281
set.spawn({
247-
let state_manager = state_manager.clone();
248-
let state_machine = state_machine.clone();
249-
let state_changed = state_changed.clone();
250-
let tasks = tasks.clone();
251-
let bad_block_cache = bad_block_cache.clone();
282+
let state_manager = state_manager.shallow_clone();
283+
let state_machine = state_machine.shallow_clone();
284+
let network = network.shallow_clone();
285+
let sync_status = sync_status.shallow_clone();
286+
let state_changed = state_changed.shallow_clone();
287+
let tasks = tasks.shallow_clone();
288+
let bad_block_cache = bad_block_cache.shallow_clone();
252289
async move {
253290
loop {
254291
state_changed.notified().await;
@@ -726,7 +763,7 @@ impl<DB: Blockstore> SyncStateMachine<DB> {
726763

727764
fn mark_validated_tipset(&mut self, tipset: FullTipset, is_proposed_head: bool) {
728765
if !self.is_parent_validated(&tipset) {
729-
tracing::error!(epoch = %tipset.epoch(), tsk = %tipset.key(), "Tipset must be validated");
766+
tracing::error!(epoch = %tipset.epoch(), tsk = %tipset.key(), parent_state = %tipset.parent_state(), "Parent tipset must be validated");
730767
return;
731768
}
732769

src/chain_sync/tipset_syncer.rs

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,9 @@ pub async fn validate_tipset<DB: Blockstore + Send + Sync + 'static>(
111111
let timer = metrics::TIPSET_PROCESSING_TIME.start_timer();
112112

113113
let epoch = full_tipset.epoch();
114-
let full_tipset_key = full_tipset.key().clone();
115-
trace!("Tipset keys: {full_tipset_key}");
114+
let parent_state = *full_tipset.parent_state();
115+
let tipset_key = full_tipset.key();
116+
trace!("Tipset keys: {tipset_key}");
116117
let blocks = full_tipset.into_blocks();
117118
let mut validations = JoinSet::new();
118119
for b in blocks {
@@ -127,14 +128,20 @@ pub async fn validate_tipset<DB: Blockstore + Send + Sync + 'static>(
127128
.add_to_tipset_tracker(block.header());
128129
}
129130
Err((cid, why)) => {
130-
warn!("Validating block [CID = {cid}] in EPOCH = {epoch} failed: {why}");
131+
warn!(
132+
"Validating block [CID = {cid}, PARENT_STATE = {parent_state}] in EPOCH = {epoch} failed: {why}",
133+
);
131134
match &why {
132135
TipsetSyncerError::TimeTravellingBlock(_, _) => {
133136
// Do not mark a block as bad for temporary errors.
134137
// See <https://github.com/filecoin-project/lotus/blob/v1.34.1/chain/sync.go#L602> in Lotus
135138
}
136139
_ => {
137-
if let Some(bad_block_cache) = bad_block_cache {
140+
// Do not mark block as bad if the parent state tree does not exist
141+
if StateTree::new_from_root(state_manager.blockstore_owned(), &parent_state)
142+
.is_ok()
143+
&& let Some(bad_block_cache) = bad_block_cache
144+
{
138145
bad_block_cache.push(cid);
139146
}
140147
}

src/daemon/context.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ use crate::daemon::asyncify;
99
use crate::daemon::bundle::load_actor_bundles;
1010
use crate::daemon::db_util::load_all_forest_cars_with_cleanup;
1111
use crate::db::car::ManyCar;
12-
use crate::db::db_engine::{db_root, open_db};
13-
use crate::db::parity_db::ParityDb;
12+
use crate::db::db_engine::db_root;
13+
use crate::db::parity_db::{GarbageCollectableParityDb, ParityDb};
1414
use crate::db::{CAR_DB_DIR_NAME, DummyStore, EthMappingsStore};
1515
use crate::genesis::read_genesis_header;
1616
use crate::libp2p::{Keypair, PeerId};
@@ -178,7 +178,7 @@ fn maybe_migrate_db(config: &Config) {
178178
}
179179
}
180180

181-
pub type DbType = ManyCar<Arc<ParityDb>>;
181+
pub type DbType = ManyCar<Arc<GarbageCollectableParityDb>>;
182182

183183
pub(crate) struct DbMetadata {
184184
db_root_dir: PathBuf,
@@ -204,7 +204,10 @@ async fn setup_db(opts: &CliOpts, config: &Config) -> anyhow::Result<(Arc<DbType
204204
maybe_migrate_db(config);
205205
let chain_data_path = chain_path(config);
206206
let db_root_dir = db_root(&chain_data_path)?;
207-
let db_writer = Arc::new(open_db(db_root_dir.clone(), config.db_config())?);
207+
let db_writer = Arc::new(GarbageCollectableParityDb::new(ParityDb::to_options(
208+
db_root_dir.clone(),
209+
config.db_config(),
210+
))?);
208211
let db = Arc::new(ManyCar::new(db_writer.clone()));
209212
let forest_car_db_dir = db_root_dir.join(CAR_DB_DIR_NAME);
210213
load_all_forest_cars_with_cleanup(&db, &forest_car_db_dir)?;

0 commit comments

Comments
 (0)