Skip to content

Commit f5fee69

Browse files
authored
fix(consensus): Improve error handling for quorum cert retrieval (#443)
## PR Description [Briefly describe your changes] Issue Number: closes #xxx ## Tested? - [ ] Yes - [ ] No
1 parent ee7b447 commit f5fee69

File tree

7 files changed

+35
-8
lines changed

7 files changed

+35
-8
lines changed

aptos-core/consensus/consensus-types/src/block_retrieval.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ pub enum BlockRetrievalStatus {
104104
NotEnoughBlocks,
105105
// Successfully found the target,
106106
SucceededWithTarget,
107+
// Can not find the quorum certificate for the block.
108+
QuorumCertNotFound,
107109
}
108110

109111
/// Carries the returned blocks and the retrieval status.

aptos-core/consensus/src/block_storage/block_store.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ impl BlockStore {
223223
info!("sending qc {} to execution, current commit round {}", qc.commit_info().round(), self.commit_root().round());
224224
if let Err(e) = self.send_for_execution(qc.into_wrapped_ledger_info(), true).await {
225225
error!("Error in try-committing blocks. {}", e.to_string());
226+
break;
226227
}
227228
}
228229
}

aptos-core/consensus/src/block_storage/sync_manager.rs

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,7 @@ impl BlockStore {
397397
self.rebuild(root, blocks, quorum_certs).await;
398398
storage.consensus_db().ledger_db.metadata_db().set_latest_ledger_info(ledger_infos.last().unwrap().clone());
399399

400-
if ledger_infos.last().unwrap().ledger_info().ends_epoch() {
400+
if !ledger_infos.is_empty() && ledger_infos.last().unwrap().ledger_info().ends_epoch() {
401401
retriever
402402
.network
403403
.send_epoch_change(EpochChangeProof::new(
@@ -625,7 +625,14 @@ impl BlockStore {
625625
let mut parent_id = HashValue::zero();
626626
let mut is_last_block = false;
627627
if let Some(executed_block) = self.get_block(id) {
628-
let qc = self.get_quorum_cert_for_block(id).unwrap();
628+
let qc = match self.get_quorum_cert_for_block(id) {
629+
Some(qc) => qc,
630+
None => {
631+
info!("Cannot find quorum cert for block id {}", id);
632+
status = BlockRetrievalStatus::QuorumCertNotFound;
633+
break;
634+
}
635+
};
629636
is_last_block = qc.commit_info().round() == 0;
630637
quorum_certs.push((*qc).clone());
631638
let randomness = match executed_block.block().block_number() {
@@ -637,7 +644,19 @@ impl BlockStore {
637644
} else if let Ok(Some(executed_block)) =
638645
self.storage.consensus_db().get_block(retrieval_epoch, id)
639646
{
640-
let qc = self.storage.consensus_db().get_qc(retrieval_epoch, id).unwrap().unwrap();
647+
let qc = match self.storage.consensus_db().get_qc(retrieval_epoch, id) {
648+
Ok(Some(qc)) => qc,
649+
Ok(None) => {
650+
info!("Cannot find quorum cert for block id {} in epoch {}", id, retrieval_epoch);
651+
status = BlockRetrievalStatus::QuorumCertNotFound;
652+
break;
653+
}
654+
Err(e) => {
655+
error!("Error retrieving quorum cert for block id {} in epoch {}: {:?}", id, retrieval_epoch, e);
656+
status = BlockRetrievalStatus::QuorumCertNotFound;
657+
break;
658+
}
659+
};
641660
is_last_block = qc.commit_info().round() == 0;
642661
quorum_certs.push(qc);
643662
let randomness = self.storage.consensus_db().get_randomness(executed_block.block_number().unwrap()).unwrap();

aptos-core/consensus/src/consensusdb/mod.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,8 @@ impl ConsensusDB {
129129
&self,
130130
latest_block_number: u64,
131131
epoch: u64,
132-
) -> Result<(Option<Vec<u8>>, Option<Vec<u8>>, Vec<Block>, Vec<QuorumCert>)> {
132+
) -> Result<(Option<Vec<u8>>, Option<Vec<u8>>, Vec<Block>, Vec<QuorumCert>, bool)> {
133+
let mut has_root = false;
133134
let last_vote = self.get_last_vote()?;
134135
let highest_2chain_timeout_certificate = self.get_highest_2chain_timeout_certificate()?;
135136
let start_key = (epoch, HashValue::zero());
@@ -143,6 +144,7 @@ impl ConsensusDB {
143144
let (start_epoch, start_round, start_block_id) = if block_number_to_block_id.contains_key(&latest_block_number) {
144145
let block = self.get::<BlockSchema>(&(epoch, block_number_to_block_id[&latest_block_number]))?
145146
.unwrap();
147+
has_root = true;
146148
(block.epoch(), block.round(), block.id())
147149
} else {
148150
(epoch, 0, HashValue::zero())
@@ -170,7 +172,7 @@ impl ConsensusDB {
170172
.collect();
171173
info!("consensus_blocks size : {}, consensus_qcs size : {}, block_number_to_block_id size : {}, start_round : {}",
172174
consensus_blocks.len(), consensus_qcs.len(), block_number_to_block_id.len(), start_round);
173-
Ok((last_vote, highest_2chain_timeout_certificate, consensus_blocks, consensus_qcs))
175+
Ok((last_vote, highest_2chain_timeout_certificate, consensus_blocks, consensus_qcs, has_root))
174176
}
175177

176178
pub fn save_highest_2chain_timeout_certificate(&self, tc: Vec<u8>) -> Result<(), DbError> {

aptos-core/consensus/src/epoch_manager.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1597,7 +1597,7 @@ impl<P: OnChainConfigProvider> EpochManager<P> {
15971597
| ConsensusMsg::SignedBatchInfo(_)
15981598
| ConsensusMsg::ProofOfStoreMsg(_) => {
15991599
let event: UnverifiedEvent = msg.into();
1600-
info!("event epoch: {:?} and self epoch {:?}", event.epoch(), self.epoch());
1600+
debug!("event epoch: {:?} and self epoch {:?}", event.epoch(), self.epoch());
16011601
if event.epoch()? == self.epoch() {
16021602
return Ok(Some(event));
16031603
} else {

aptos-core/consensus/src/persistent_liveness_storage.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,12 +276,13 @@ impl RecoveryData {
276276
mut quorum_certs: Vec<QuorumCert>,
277277
highest_2chain_timeout_cert: Option<TwoChainTimeoutCertificate>,
278278
order_vote_enabled: bool,
279+
has_root: bool,
279280
) -> Result<Self> {
280281
info!("blocks in db: {:?}", blocks.len());
281282
info!("quorum certs in db: {:?}", quorum_certs.len());
282283

283284
let root;
284-
if !blocks.is_empty() && execution_latest_block_num != ledger_recovery_data.storage_ledger.ledger_info().block_number() {
285+
if has_root {
285286
root = Self::find_root_by_block_number(
286287
execution_latest_block_num,
287288
&mut blocks,
@@ -463,6 +464,7 @@ impl PersistentLivenessStorage for StorageWriteProxy {
463464
// Use original method
464465
self.aptos_db.get_latest_ledger_info().unwrap()
465466
};
467+
info!("is_last_block_of_prev_epoch: {}, latest_ledger_info: {:?}, has_root: {}", is_last_block_of_prev_epoch, latest_ledger_info, raw_data.4);
466468

467469
let ledger_recovery_data = LedgerRecoveryData::new(latest_ledger_info);
468470
match RecoveryData::new(
@@ -473,6 +475,7 @@ impl PersistentLivenessStorage for StorageWriteProxy {
473475
quorum_certs,
474476
highest_2chain_timeout_cert,
475477
order_vote_enabled,
478+
raw_data.4,
476479
) {
477480
Ok(initial_data) => {
478481
// TODO(gravity_lightman)

aptos-core/consensus/src/util/db_tool.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ impl Command {
4141
let all_batches = quorum_store_db.get_all_batches().unwrap();
4242

4343
let consensus_db = ConsensusDB::new(self.db_dir.clone(), &PathBuf::new());
44-
let (_, _, blocks, _) = consensus_db.get_data(0, 0)?;
44+
let (_, _, blocks, _, _) = consensus_db.get_data(0, 0)?;
4545

4646
let mut txns = Vec::new();
4747
for block in blocks {

0 commit comments

Comments
 (0)