diff --git a/magicblock-accounts-db/src/error.rs b/magicblock-accounts-db/src/error.rs
index 5154fd4c1..a213b0dd6 100644
--- a/magicblock-accounts-db/src/error.rs
+++ b/magicblock-accounts-db/src/error.rs
@@ -1,5 +1,7 @@
 use std::io;
 
+use log::error;
+
 #[derive(Debug, thiserror::Error)]
 pub enum AccountsDbError {
     #[error("requested account doesn't exist in adb")]
@@ -11,7 +13,7 @@ pub enum AccountsDbError {
     #[error("snapshot for slot {0} doesn't exist")]
     SnapshotMissing(u64),
     #[error("internal accountsdb error: {0}")]
-    Internal(&'static str),
+    Internal(String),
 }
 
 impl From<io::Error> for AccountsDbError {
@@ -23,12 +25,28 @@ impl From<io::Error> for AccountsDbError {
     }
 }
 
-#[macro_export]
-macro_rules! log_err {
-    ($msg: expr) => {
-        |err| ::log::error!("{} error: {err}", $msg)
-    };
-    ($msg: expr, $($ctx:expr),* $(,)?) => {
-        |err| ::log::error!("{} error: {err}", format!($msg, $($ctx),*))
-    };
+/// Extension trait to easily log errors in Result chains.
+pub trait LogErr<T, E> {
+    /// Logs the error if the result is `Err`, then returns the result unmodified.
+    fn log_err<F, S>(self, msg: F) -> Result<T, E>
+    where
+        F: FnOnce() -> S,
+        S: std::fmt::Display;
+}
+
+impl<T, E> LogErr<T, E> for Result<T, E>
+where
+    E: std::fmt::Display,
+{
+    #[track_caller]
+    fn log_err<F, S>(self, msg: F) -> Result<T, E>
+    where
+        F: FnOnce() -> S,
+        S: std::fmt::Display,
+    {
+        if let Err(e) = &self {
+            error!("{}: {}", msg(), e);
+        }
+        self
+    }
 }
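Reviewer note: for anyone skimming the new extension trait, this is how it reads at a call site. A minimal standalone sketch (my illustration; `eprintln!` stands in for `log::error!` so the snippet runs without a logger):

```rust
use std::fmt::Display;

// Standalone re-sketch of the LogErr pattern from error.rs above.
trait LogErr<T, E> {
    fn log_err<F, S>(self, msg: F) -> Result<T, E>
    where
        F: FnOnce() -> S,
        S: Display;
}

impl<T, E: Display> LogErr<T, E> for Result<T, E> {
    fn log_err<F, S>(self, msg: F) -> Result<T, E>
    where
        F: FnOnce() -> S,
        S: Display,
    {
        if let Err(e) = &self {
            // stand-in for log::error!; the real impl logs instead of printing
            eprintln!("{}: {}", msg(), e);
        }
        self
    }
}

fn main() {
    // logs "parsing port: invalid digit found in string" and keeps the Err
    let port = "not-a-number".parse::<u16>().log_err(|| "parsing port");
    assert!(port.is_err());
}
```

Because the message is a closure, formatting only happens on the error path, which is the main ergonomic win over sprinkling `inspect_err` with eagerly built strings.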
diff --git a/magicblock-accounts-db/src/index.rs b/magicblock-accounts-db/src/index.rs
index fe7c3e5ca..18a2c9746 100644
--- a/magicblock-accounts-db/src/index.rs
+++ b/magicblock-accounts-db/src/index.rs
@@ -1,17 +1,14 @@
 use std::path::Path;
 
 use iterator::OffsetPubkeyIter;
-use lmdb::{
-    Cursor, DatabaseFlags, Environment, RwTransaction, Transaction, WriteFlags,
-};
+use lmdb::{Cursor, DatabaseFlags, Environment, RwTransaction, Transaction};
 use log::warn;
 use solana_pubkey::Pubkey;
 use table::Table;
 use utils::*;
 
 use crate::{
-    error::AccountsDbError,
-    log_err,
+    error::{AccountsDbError, LogErr},
     storage::{Allocation, ExistingAllocation},
     AccountsDbResult,
 };
@@ -19,93 +16,76 @@ use crate::{
 pub type Offset = u32;
 pub type Blocks = u32;
 
-const WEMPTY: WriteFlags = WriteFlags::empty();
-
 const ACCOUNTS_INDEX: &str = "accounts-idx";
 const PROGRAMS_INDEX: &str = "programs-idx";
 const DEALLOCATIONS_INDEX: &str = "deallocations-idx";
 const OWNERS_INDEX: &str = "owners-idx";
 
-/// LMDB Index manager
+/// LMDB Index manager.
+///
+/// Handles secondary indices for mapping Pubkeys to storage offsets,
+/// tracking program ownership, and managing deallocated space.
 #[cfg_attr(test, derive(Debug))]
 pub(crate) struct AccountsDbIndex {
-    /// Accounts Index, used for searching accounts by offset in the main storage
-    ///
-    /// the key is the account's pubkey (32 bytes)
-    /// the value is a concatenation of:
-    /// 1. offset in the storage (4 bytes)
-    /// 2. number of allocated blocks (4 bytes)
+    /// Maps Account Pubkey -> (Storage Offset, Block Count).
     accounts: Table,
-    /// Programs Index, used to keep track of owner->accounts
-    /// mapping, significantly speeds up program accounts retrieval
-    ///
-    /// the key is the owner's pubkey (32 bytes)
-    /// the value is a concatenation of:
-    /// 1. offset in the storage (4 bytes)
-    /// 2. account pubkey (32 bytes)
+    /// Maps Owner Pubkey -> (Storage Offset, Account Pubkey).
+    /// Used for `get_program_accounts`.
     programs: Table,
-    /// Deallocation Index, used to keep track of allocation size of deallocated
-    /// accounts, this is further utilized when defragmentation is required, by
-    /// matching new accounts' size and already present "holes" in the database
-    ///
-    /// the key is the allocation size in blocks (4 bytes)
-    /// the value is a concatenation of:
-    /// 1. offset in the storage (4 bytes)
-    /// 2. number of allocated blocks (4 bytes)
+    /// Maps Allocation Size (Blocks) -> (Storage Offset, Block Count).
+    /// Used for finding recyclable "holes" in storage.
     deallocations: Table,
-    /// Index map from accounts' pubkeys to their current owners, the index is
-    /// used primarily for cleanup purposes when owner change occures and we need
-    /// to cleanup programs index, so that old owner -> account mapping doesn't dangle
-    ///
-    /// the key is the account's pubkey (32 bytes)
-    /// the value is owner's pubkey (32 bytes)
+    /// Maps Account Pubkey -> Owner Pubkey.
+    /// Used for cleaning up the `programs` index when an account changes owner.
     owners: Table,
-    /// Common envorinment for all of the tables
+    /// The LMDB Environment.
    env: Environment,
 }
 
-/// Helper macro to pack(merge) two types into single buffer of similar
-/// combined length or to unpack(unmerge) them back into original types
+/// Helper macro to pack/unpack types into/from byte buffers.
+/// Uses unaligned writes/reads for performance and compactness.
 macro_rules! bytes {
-    (#pack, $hi: expr, $t1: ty, $low: expr, $t2: ty) => {{
-        const S1: usize = size_of::<$t1>();
-        const S2: usize = size_of::<$t2>();
+    (#pack, $val1: expr, $type1: ty, $val2: expr, $type2: ty) => {{
+        const S1: usize = std::mem::size_of::<$type1>();
+        const S2: usize = std::mem::size_of::<$type2>();
         let mut buffer = [0_u8; S1 + S2];
         let ptr = buffer.as_mut_ptr();
-        // SAFETY:
-        // we made sure that buffer contains exact space required by both writes
-        unsafe { (ptr as *mut $t1).write_unaligned($hi) };
-        unsafe { (ptr.add(S1) as *mut $t2).write_unaligned($low) };
+        // SAFETY: Buffer is exactly S1 + S2 bytes.
+        unsafe {
+            (ptr as *mut $type1).write_unaligned($val1);
+            (ptr.add(S1) as *mut $type2).write_unaligned($val2);
+        }
         buffer
     }};
-    (#unpack, $packed: expr, $t1: ty, $t2: ty) => {{
+    (#unpack, $packed: expr, $type1: ty, $type2: ty) => {{
         let ptr = $packed.as_ptr();
-        const S1: usize = size_of::<$t1>();
-        // SAFETY:
-        // this macro branch is called on values previously packed by first branch
-        // so we essentially undo the packing on buffer of valid length
-        let t1 = unsafe { (ptr as *const $t1).read_unaligned() };
-        let t2 = unsafe { (ptr.add(S1) as *const $t2).read_unaligned() };
-        (t1, t2)
+        const S1: usize = std::mem::size_of::<$type1>();
+        // SAFETY: Macro caller ensures $packed is of valid length.
+        unsafe {
+            let v1 = (ptr as *const $type1).read_unaligned();
+            let v2 = (ptr.add(S1) as *const $type2).read_unaligned();
+            (v1, v2)
+        }
     }};
 }
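As a cross-check on the macro's layout, here is a safe, macro-free sketch of the `(u32, u32)` case (my illustration; `write_unaligned`/`read_unaligned` store and load the native-endian in-memory representation, which `to_ne_bytes`/`from_ne_bytes` mirror):

```rust
// Equivalent of bytes!(#pack, a, u32, b, u32), without raw pointers.
fn pack_u32_pair(a: u32, b: u32) -> [u8; 8] {
    let mut buf = [0u8; 8];
    buf[..4].copy_from_slice(&a.to_ne_bytes());
    buf[4..].copy_from_slice(&b.to_ne_bytes());
    buf
}

// Equivalent of bytes!(#unpack, buf, u32, u32).
fn unpack_u32_pair(buf: [u8; 8]) -> (u32, u32) {
    (
        u32::from_ne_bytes(buf[..4].try_into().unwrap()),
        u32::from_ne_bytes(buf[4..].try_into().unwrap()),
    )
}

fn main() {
    let packed = pack_u32_pair(7, 42);
    assert_eq!(unpack_u32_pair(packed), (7, 42));
}
```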
 
 impl AccountsDbIndex {
-    /// Creates new index manager for AccountsDB, by
-    /// opening/creating necessary lmdb environments
     pub(crate) fn new(size: usize, directory: &Path) -> AccountsDbResult<Self> {
-        // create an environment for all the tables
-        let env = lmdb_env(directory, size).inspect_err(log_err!(
-            "main index env creation at {}",
-            directory.display()
-        ))?;
+        let env = utils::create_lmdb_env(directory, size).log_err(|| {
+            format!("main index env creation at {}", directory.display())
+        })?;
+
         let accounts =
             Table::new(&env, ACCOUNTS_INDEX, DatabaseFlags::empty())?;
+
         let programs = Table::new(
             &env,
             PROGRAMS_INDEX,
             DatabaseFlags::DUP_SORT | DatabaseFlags::DUP_FIXED,
         )?;
+
+        // DEALLOCATIONS: Allow duplicates (multiple holes of the same size),
+        // integer keys (block size) for range searches.
         let deallocations = Table::new(
             &env,
             DEALLOCATIONS_INDEX,
@@ -116,6 +96,7 @@ impl AccountsDbIndex {
         )?;
         let owners = Table::new(&env, OWNERS_INDEX, DatabaseFlags::empty())?;
+
         Ok(Self {
             accounts,
             programs,
@@ -125,45 +106,32 @@ impl AccountsDbIndex {
         })
     }
 
-    /// Retrieve the offset at which account can be read from main storage
+    /// Retrieves the storage offset for a given account.
     #[inline(always)]
-    pub(crate) fn get_account_offset(
+    pub(crate) fn get_offset(
         &self,
         pubkey: &Pubkey,
     ) -> AccountsDbResult<Offset> {
         let txn = self.env.begin_ro_txn()?;
-        let Some(offset) = self.accounts.get(&txn, pubkey)? else {
+        let Some(value_bytes) = self.accounts.get(&txn, pubkey)? else {
             return Err(AccountsDbError::NotFound);
         };
+
+        // We only need the first 4 bytes (Offset), ignoring the Blocks count.
+        // SAFETY: Accounts index values are always created by `bytes!(#pack, ...)`,
+        // which guarantees the layout [Offset(4), Blocks(4)].
         let offset =
-            // SAFETY:
-            // The accounts index stores two u32 values (offset and blocks)
-            // serialized into 8 byte long slice. Here we are interested only in the first 4 bytes
-            // (offset). The memory used by lmdb to store the serialization might not be u32
-            // aligned, so we make use `read_unaligned`.
-            //
-            // We read the data stored by corresponding put in `insert_account`,
-            // thus it should be of valid length and contain valid value
-            unsafe { (offset.as_ptr() as *const Offset).read_unaligned() };
+            unsafe { (value_bytes.as_ptr() as *const Offset).read_unaligned() };
         Ok(offset)
     }
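For readers uneasy about the raw pointer cast, a safe equivalent of that prefix read looks like this (illustrative sketch, not part of the patch; the unsafe version simply skips the bounds checks):

```rust
type Offset = u32;

// Safe equivalent of reading the first 4 bytes of an index value.
// The value layout is [offset: u32 | blocks: u32], native-endian.
fn offset_prefix(value_bytes: &[u8]) -> Option<Offset> {
    let head: [u8; 4] = value_bytes.get(..4)?.try_into().ok()?;
    Some(Offset::from_ne_bytes(head))
}

fn main() {
    let mut value = [0u8; 8];
    value[..4].copy_from_slice(&7u32.to_ne_bytes());
    assert_eq!(offset_prefix(&value), Some(7));
}
```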
 
-    /// Retrieve the offset and the size (number of blocks) given account occupies
-    fn get_allocation<T: Transaction>(
-        &self,
-        txn: &T,
-        pubkey: &Pubkey,
-    ) -> AccountsDbResult<ExistingAllocation> {
-        let Some(slice) = self.accounts.get(txn, pubkey)? else {
-            return Err(AccountsDbError::NotFound);
-        };
-        let (offset, blocks) = bytes!(#unpack, slice, u32, u32);
-        Ok(ExistingAllocation { offset, blocks })
-    }
-
-    /// Insert account's allocation information into various indices, if
-    /// account is already present, necessary bookkeeping will take place
-    pub(crate) fn insert_account(
+    /// Inserts or updates an account's allocation in the indices.
+    ///
+    /// If the account already exists, it handles the "move":
+    /// 1. Marks the old space as deallocated (recyclable).
+    /// 2. Updates the main index.
+    /// 3. Updates secondary indices (programs, owners).
+    pub(crate) fn upsert_account(
         &self,
         pubkey: &Pubkey,
         owner: &Pubkey,
@@ -171,170 +139,217 @@ impl AccountsDbIndex {
         txn: &mut RwTransaction,
     ) -> AccountsDbResult<Option<ExistingAllocation>> {
         let Allocation { offset, blocks, .. } = allocation;
+        let mut old_allocation = None;
 
-        let mut dealloc = None;
-
-        // merge offset and block count into one single u64 and cast it to [u8; 8]
         let index_value = bytes!(#pack, offset, Offset, blocks, Blocks);
-        // concatenate offset where account is stored with pubkey of that account
         let offset_and_pubkey = bytes!(#pack, offset, Offset, *pubkey, Pubkey);
 
-        // optimisitically try to insert account to index, assuming that it doesn't exist
-        let inserted =
-            self.accounts.put_if_not_exists(txn, pubkey, index_value)?;
-        // if the account does exist, then it already occupies space in main storage
+        // 1. Optimistic insert (returns true if inserted, false if it existed)
+        let inserted = self.accounts.insert(txn, pubkey, index_value)?;
+
+        // 2. Handle the update if it already existed
        if !inserted {
-            // in which case we just move the account to a new allocation
-            // adjusting all of the offsets and cleaning up the older ones
             let previous = self.reallocate_account(pubkey, txn, &index_value)?;
-            dealloc.replace(previous);
-        };
+            old_allocation = Some(previous);
+        }
 
-        // track the account via programs' index as well
-        self.programs.put(txn, owner, offset_and_pubkey)?;
-        // track the reverse relation between account and its owner
-        self.owners.put(txn, pubkey, owner)?;
+        // 3. Update secondary indices
+        self.programs.upsert(txn, owner, offset_and_pubkey)?;
+        self.owners.upsert(txn, pubkey, owner)?;
 
-        Ok(dealloc)
+        Ok(old_allocation)
     }
 
-    /// Helper method to change the allocation for a given account
+    /// Handles the logistics of moving an existing account to a new location.
+    /// Returns the old allocation so it can be added to the free list.
     fn reallocate_account(
         &self,
         pubkey: &Pubkey,
         txn: &mut RwTransaction,
-        index_value: &[u8],
+        new_index_value: &[u8],
     ) -> AccountsDbResult<ExistingAllocation> {
-        // retrieve the size and offset for allocation
-        let allocation = self.get_allocation(txn, pubkey)?;
-        // and put it into deallocation index, so the space can be recycled later
-        let key = allocation.blocks.to_le_bytes();
+        // Retrieve the old location
+        let old_alloc = self.get_allocation(txn, pubkey)?;
+
+        // Mark the old space as free (Deallocations Index)
+        // Key: Block Count (for size-based lookup), Value: {Offset, Block Count}
+        let key = old_alloc.blocks.to_le_bytes();
         let value =
-            bytes!(#pack, allocation.offset, Offset, allocation.blocks, Blocks);
-        self.deallocations.put(txn, key, value)?;
+            bytes!(#pack, old_alloc.offset, Offset, old_alloc.blocks, Blocks);
+        self.deallocations.upsert(txn, key, value)?;
+
+        // Update the Main Index with the new location
+        self.accounts.upsert(txn, pubkey, new_index_value)?;
 
-        // now we can overwrite the index record
-        self.accounts.put(txn, pubkey, index_value)?;
-
-        // we also need to delete old entry from `programs` index
-        self.remove_programs_index_entry(pubkey, None, txn, allocation.offset)?;
-        Ok(allocation)
+        // Clean up the Programs Index (dangling pointer to the old offset)
+        self.remove_program_index_entry(pubkey, None, txn, old_alloc.offset)?;
+
+        Ok(old_alloc)
     }
 
-    /// Removes account from the database and marks its backing storage for recycling
-    /// this method also performs various cleanup operations on the secondary indexes
-    pub(crate) fn remove_account(
-        &self,
-        pubkey: &Pubkey,
-    ) -> AccountsDbResult<()> {
+    /// Removes an account from all indices and marks its space as recyclable.
+    pub(crate) fn remove(&self, pubkey: &Pubkey) -> AccountsDbResult<()> {
         let mut txn = self.env.begin_rw_txn()?;
-        let mut cursor = self.accounts.cursor_rw(&mut txn)?;
-
-        // locate the account entry
-        let result = cursor
-            .get(Some(pubkey.as_ref()), None, MDB_SET_OP)
-            .map(|(_, v)| bytes!(#unpack, v, Offset, Blocks));
-        let (offset, blocks) = match result {
-            Ok(r) => r,
-            Err(lmdb::Error::NotFound) => return Ok(()),
-            Err(err) => Err(err)?,
+
+        // Get the allocation to know what to free
+        let allocation = match self.get_allocation(&txn, pubkey) {
+            Ok(a) => a,
+            Err(AccountsDbError::NotFound) => return Ok(()), // Idempotent
+            Err(e) => return Err(e),
         };
 
-        // and delete it
-        cursor.del(WriteFlags::empty())?;
-        drop(cursor);
+        // Remove from the Main Index
+        self.accounts.remove(&mut txn, pubkey, None)?;
+
+        // Add to the Deallocations Index
+        let key = allocation.blocks.to_le_bytes();
+        let val =
+            bytes!(#pack, allocation.offset, Offset, allocation.blocks, Blocks);
+        self.deallocations.upsert(&mut txn, key, val)?;
 
-        // mark the allocation for future recycling
-        self.deallocations.put(
+        // Remove from the Secondary Indices
+        self.owners.remove(&mut txn, pubkey, None)?;
+        self.remove_program_index_entry(
+            pubkey,
+            None,
             &mut txn,
-            blocks.to_le_bytes(),
-            bytes!(#pack, offset, Offset, blocks, Blocks),
+            allocation.offset,
         )?;
-        // we also need to cleanup `programs` index
-        self.remove_programs_index_entry(pubkey, None, &mut txn, offset)?;
 
         txn.commit()?;
         Ok(())
     }
 
-    /// Ensures that current owner of account matches the one recorded in index,
-    /// if not, the index cleanup will be performed and new entry inserted to
-    /// match the current state
+    /// Reconciles the `owners` and `programs` indices if the account's owner changed.
     pub(crate) fn ensure_correct_owner(
         &self,
         pubkey: &Pubkey,
-        owner: &Pubkey,
+        new_owner: &Pubkey,
         txn: &mut RwTransaction,
     ) -> AccountsDbResult<()> {
-        let old_owner = match self.owners.get(txn, pubkey)? {
-            // if current owner matches with that stored in index, then we are all set
-            Some(val) if owner.as_ref() == val => {
-                return Ok(());
-            }
-            None => return Ok(()),
-            // if they don't match, then we have to remove old entries and create new ones
-            Some(val) => Pubkey::try_from(val).ok(),
+        let stored_owner_bytes = self.owners.get(txn, pubkey)?;
+
+        let needs_update = match stored_owner_bytes {
+            Some(bytes) => bytes != new_owner.as_ref(),
+            None => true,
         };
+
+        if !needs_update {
+            return Ok(());
+        }
+
+        // Falls back to Pubkey::default() if the index holds a malformed value
+        let old_owner =
+            stored_owner_bytes.map(|b| Pubkey::try_from(b).unwrap_or_default());
+
         let allocation = self.get_allocation(txn, pubkey)?;
-        // cleanup `programs` and `owners` index
-        self.remove_programs_index_entry(
+
+        // 1. Clean up the old program index entry
+        self.remove_program_index_entry(
             pubkey,
             old_owner,
             txn,
             allocation.offset,
         )?;
-        // track new owner of the account via programs' index
+
+        // 2. Insert the new program index entry
         let offset_and_pubkey =
             bytes!(#pack, allocation.offset, Offset, *pubkey, Pubkey);
-        self.programs.put(txn, owner, offset_and_pubkey)?;
-        // track the reverse relation between account and its owner
-        self.owners.put(txn, pubkey, owner)?;
+        self.programs.upsert(txn, new_owner, offset_and_pubkey)?;
+
+        // 3. Update the owners index
+        self.owners.upsert(txn, pubkey, new_owner)?;
+
         Ok(())
     }
 
-    fn remove_programs_index_entry(
+    /// Internal helper to retrieve full allocation info (Offset + Size).
+    fn get_allocation<T: Transaction>(
+        &self,
+        txn: &T,
+        pubkey: &Pubkey,
+    ) -> AccountsDbResult<ExistingAllocation> {
+        let Some(slice) = self.accounts.get(txn, pubkey)? else {
+            return Err(AccountsDbError::NotFound);
+        };
+        let (offset, blocks) = bytes!(#unpack, slice, Offset, Blocks);
+        Ok(ExistingAllocation { offset, blocks })
+    }
+
+    /// Finds and removes a `programs` index entry.
+    /// If `old_owner` is not provided, it is looked up in the `owners` index.
+    fn remove_program_index_entry(
         &self,
         pubkey: &Pubkey,
         old_owner: Option<Pubkey>,
         txn: &mut RwTransaction,
         offset: Offset,
     ) -> lmdb::Result<()> {
-        let val = bytes!(#pack, offset, Offset, *pubkey, Pubkey);
-        if let Some(owner) = old_owner {
-            return self.programs.del(txn, owner, Some(&val));
-        }
-        // in order to delete the old entry from `programs` index, we consult
-        // the `owners` index to fetch the previous owner of the account
-        let mut owners = self.owners.cursor_rw(txn)?;
-        let owner = match owners.get(Some(pubkey.as_ref()), None, MDB_SET_OP) {
-            Ok((_, val)) => {
-                let pk = Pubkey::try_from(val).inspect_err(log_err!(
-                    "owners index contained invalid value for pubkey of len {}",
-                    val.len()
-                ));
-                let Ok(owner) = pk else {
+        // We need the owner to find the key in the programs index (Key=Owner).
+        let owner = match old_owner {
+            Some(pk) => pk,
+            None => match self.owners.get(txn, pubkey)? {
+                Some(val) => {
+                    Pubkey::try_from(val).map_err(|_| lmdb::Error::Invalid)?
+                }
+                None => {
+                    warn!("account {pubkey} missing from owners index during cleanup");
                     return Ok(());
-                };
-                owner
-            }
-            Err(lmdb::Error::NotFound) => {
-                warn!("account {pubkey} didn't have owners index entry");
-                return Ok(());
-            }
-            Err(e) => {
-                return Err(e);
-            }
+                }
+            },
         };
-        owners.del(WEMPTY)?;
-        drop(owners);
-        self.programs.del(txn, owner, Some(&val))?;
-        Ok(())
+
+        // The value in the programs index is {Offset, Pubkey}.
+        // We need an exact match to delete from a DUPSORT db.
+        let val = bytes!(#pack, offset, Offset, *pubkey, Pubkey);
+        self.programs.remove(txn, owner, Some(&val))
     }
 
-    /// Returns an iterator over offsets and pubkeys of accounts for given
-    /// program offsets can be used to retrieve the account from storage
+    /// Tries to find a free block of `needed` size in the deallocations table.
+    /// If found, splits it if too large, and returns the recycled allocation.
+    pub(crate) fn try_recycle_allocation(
+        &self,
+        needed: Blocks,
+        txn: &mut RwTransaction,
+    ) -> AccountsDbResult<ExistingAllocation> {
+        let mut cursor = self.deallocations.cursor_rw(txn)?;
+
+        // MDB_SET_RANGE: Position the cursor at the first key >= `needed`.
+        // Since keys are sorted by size, this yields the smallest sufficient
+        // hole, i.e. a "Best Fit" strategy.
+        let (_key_bytes, val_bytes) = cursor.get(
+            Some(&needed.to_le_bytes()),
+            None,
+            utils::MDB_SET_RANGE_OP,
+        )?;
+
+        let (offset, available_blocks) =
+            bytes!(#unpack, val_bytes, Offset, Blocks);
+
+        // Remove this allocation from the free list
+        cursor.del(lmdb::WriteFlags::empty())?;
+
+        // If we found a block larger than needed, split it and put the remainder back.
+        let remainder = available_blocks.saturating_sub(needed);
+        if remainder > 0 {
+            let new_hole_offset = offset.saturating_add(needed);
+
+            let new_key = remainder.to_le_bytes();
+            let new_val =
+                bytes!(#pack, new_hole_offset, Offset, remainder, Blocks);
+
+            drop(cursor);
+            // Insert the remainder back into deallocations
+            self.deallocations.upsert(txn, new_key, new_val)?;
+        }
+
+        Ok(ExistingAllocation {
+            offset,
+            blocks: needed,
+        })
+    }
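To make the split arithmetic concrete, here is a standalone model of what happens to a hole (plain `(offset, blocks)` tuples, my illustration of the same math):

```rust
/// Model of the hole-splitting logic in `try_recycle_allocation`:
/// given a free hole and a request, return the recycled allocation
/// plus the leftover hole (if any) that goes back on the free list.
fn split_hole(
    hole: (u32, u32), // (offset, blocks) of the free hole
    needed: u32,      // blocks requested
) -> ((u32, u32), Option<(u32, u32)>) {
    let (offset, available) = hole;
    let remainder = available.saturating_sub(needed);
    let leftover =
        (remainder > 0).then(|| (offset.saturating_add(needed), remainder));
    ((offset, needed), leftover)
}

fn main() {
    // A 10-block hole at offset 100, request for 4 blocks:
    // the caller gets blocks 100..104, and a 6-block hole at 104 is re-listed.
    assert_eq!(split_hole((100, 10), 4), ((100, 4), Some((104, 6))));
    // An exact fit consumes the hole entirely.
    assert_eq!(split_hole((100, 4), 4), ((100, 4), None));
}
```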
 
+    // --- Iterators & Accessors ---
+
     pub(crate) fn get_program_accounts_iter(
         &self,
         program: &Pubkey,
@@ -343,8 +358,6 @@ impl AccountsDbIndex {
         OffsetPubkeyIter::new(&self.programs, txn, Some(program))
     }
 
-    /// Returns an iterator over offsets and pubkeys of all accounts in database
-    /// offsets can be used further to retrieve the account from storage
     pub(crate) fn get_all_accounts(
         &self,
     ) -> AccountsDbResult<OffsetPubkeyIter<'_>> {
@@ -352,7 +365,6 @@ impl AccountsDbIndex {
         OffsetPubkeyIter::new(&self.programs, txn, None)
     }
 
-    /// Obtain a wrapped cursor to query account offsets repeatedly
     pub(crate) fn offset_finder(
         &self,
     ) -> AccountsDbResult<AccountOffsetFinder<'_>> {
@@ -360,87 +372,33 @@ impl AccountsDbIndex {
         AccountOffsetFinder::new(&self.accounts, txn)
     }
 
-    /// Returns the number of accounts in the database
     pub(crate) fn get_accounts_count(&self) -> usize {
         let Ok(txn) = self.env.begin_ro_txn() else {
-            warn!("failed to start transaction for stats retrieval");
             return 0;
         };
         self.owners.entries(&txn)
     }
 
-    /// Check whether allocation of given size (in blocks) exists.
-    /// These are the allocations which are leftovers from
-    /// accounts' reallocations due to their resizing/removal
-    pub(crate) fn try_recycle_allocation(
-        &self,
-        space: Blocks,
-        txn: &mut RwTransaction,
-    ) -> AccountsDbResult<ExistingAllocation> {
-        let mut cursor = self.deallocations.cursor_rw(txn)?;
-        // this is a neat lmdb trick where we can search for entry with matching
-        // or greater key since we are interested in any allocation of at least
-        // `blocks` size or greater, this works perfectly well for this case
-
-        let (_, val) =
-            cursor.get(Some(&space.to_le_bytes()), None, MDB_SET_RANGE_OP)?;
-
-        let (offset, mut blocks) = bytes!(#unpack, val, Offset, Blocks);
-        // delete the allocation record from recycleable list
-        cursor.del(WEMPTY)?;
-        // check whether the found allocation contains more space than necessary
-        let remainder = blocks.saturating_sub(space);
-        if remainder > 0 {
-            // split the allocation, to maximize the efficiency of block reuse
-            blocks = space;
-            let new_offset = offset.saturating_add(blocks);
-            let index_value = bytes!(#pack, new_offset, u32, remainder, u32);
-            cursor.put(&remainder.to_le_bytes(), &index_value, WEMPTY)?;
-        }
-
-        Ok(ExistingAllocation { offset, blocks })
-    }
-
     pub(crate) fn flush(&self) {
-        // it's ok to ignore potential error here, as it will only happen if something
-        // utterly terrible happened at OS level, in which case we most likely won't even
-        // reach this code in any case there's no meaningful way to handle these errors
         let _ = self
             .env
             .sync(true)
-            .inspect_err(log_err!("main index flushing"));
+            .log_err(|| "main index flushing".to_string());
     }
 
-    /// Reopen the index databases from a different directory at provided path
-    ///
-    /// NOTE: this is a very cheap operation, as fast as opening a few files
     pub(crate) fn reload(&mut self, dbpath: &Path) -> AccountsDbResult<()> {
-        // set it to default lmdb map size, it will be
-        // ignored if smaller than currently occupied
         const DEFAULT_SIZE: usize = 1024 * 1024;
         *self = Self::new(DEFAULT_SIZE, dbpath)?;
         Ok(())
     }
 
-    /// Returns the number of deallocations in the database
-    #[cfg(test)]
-    pub(crate) fn get_delloactions_count(&self) -> usize {
-        let Ok(txn) = self.env.begin_ro_txn() else {
-            warn!("failed to start transaction for stats retrieval");
-            return 0;
-        };
-        self.deallocations.entries(&txn)
-    }
-
-    /// Initiate RW Transaction
     pub(crate) fn rwtxn(&self) -> lmdb::Result<RwTransaction<'_>> {
         self.env.begin_rw_txn()
     }
 }
 
 pub(crate) mod iterator;
-pub(super) mod utils;
-//mod standalone;
 mod table;
 #[cfg(test)]
 mod tests;
+pub(super) mod utils;
diff --git a/magicblock-accounts-db/src/index/iterator.rs b/magicblock-accounts-db/src/index/iterator.rs
index 569d3950c..f7e80f472 100644
--- a/magicblock-accounts-db/src/index/iterator.rs
+++ b/magicblock-accounts-db/src/index/iterator.rs
@@ -1,14 +1,22 @@
-use lmdb::{Cursor, RoCursor, RoTransaction};
+use lmdb::{Cursor, Error as LmdbError, Iter, RoCursor, RoTransaction};
 use solana_pubkey::Pubkey;
 
 use super::{table::Table, MDB_SET_OP};
 use crate::{index::Offset, AccountsDbResult};
 
-/// Iterator over pubkeys and offsets, where accounts
-/// for those pubkeys can be found in the database
+/// Iterator over pubkeys and offsets.
+///
+/// This struct bundles the transaction, cursor, and iterator to ensure
+/// correct lifetime management for iterating over the index tables.
 pub struct OffsetPubkeyIter<'env> {
-    iter: lmdb::Iter<'env>,
+    /// The underlying LMDB iterator.
+    /// Note: Must be dropped before `_cursor` and `_txn`.
+    iter: Iter<'env>,
+    /// The cursor used by the iterator.
+    /// Kept alive to maintain the validity of the iterator.
     _cursor: RoCursor<'env>,
+    /// The read-only transaction.
+    /// Kept alive to maintain the validity of the cursor.
     _txn: RoTransaction<'env>,
 }
 
@@ -19,36 +27,52 @@ impl<'env> OffsetPubkeyIter<'env> {
     pub(crate) fn new(
         table: &Table,
         txn: RoTransaction<'env>,
         pubkey: Option<&Pubkey>,
     ) -> AccountsDbResult<Self> {
         let cursor = table.cursor_ro(&txn)?;
+
         // SAFETY:
-        // nasty/neat trick for lifetime erasure, but we are upholding
-        // the rust's ownership contracts by keeping txn around as well
-        let mut cursor: RoCursor = unsafe { std::mem::transmute(cursor) };
+        // We are constructing a self-referential struct here:
+        // 1. `txn` is moved into the struct.
+        // 2. `cursor` borrows `txn`.
+        // 3. `iter` borrows `cursor`.
+        //
+        // Normally, moving `txn` would invalidate the `cursor`. However, LMDB handles
+        // (internal C pointers) are stable in memory. We transmute the cursor to
+        // extend its lifetime to `'env`, treating it as if it borrows from the
+        // environment (which outlives this struct) rather than the local `txn`.
+        //
+        // Correctness relies on the struct drop order: `iter` -> `_cursor` -> `_txn`.
+        let mut cursor: RoCursor<'env> = unsafe { std::mem::transmute(cursor) };
+
         let iter = if let Some(pubkey) = pubkey {
-            // NOTE: there's a bug in the LMDB, which ignores NotFound error when
-            // iterating on DUPSORT databases, where it just jumps to the next key,
-            // here we check for the error explicitly to prevent this behavior
-            if let Err(lmdb::Error::NotFound) =
-                cursor.get(Some(pubkey.as_ref()), None, MDB_SET_OP)
-            {
-                lmdb::Iter::Err(lmdb::Error::NotFound)
-            } else {
-                cursor.iter_dup_of(pubkey)
+            // Workaround for an LMDB bug with DUPSORT databases, where NotFound
+            // is ignored during `iter_dup_of` and iteration just jumps to the
+            // next key. We explicitly check for existence first.
+            let key_bytes = pubkey.as_ref();
+            match cursor.get(Some(key_bytes), None, MDB_SET_OP) {
+                Ok(_) => cursor.iter_dup_of(pubkey),
+                Err(LmdbError::NotFound) => Iter::Err(LmdbError::NotFound),
+                Err(e) => return Err(e.into()),
             }
         } else {
             cursor.iter_start()
         };
+
         Ok(Self {
-            _txn: txn,
-            _cursor: cursor,
             iter,
+            _cursor: cursor,
+            _txn: txn,
         })
     }
 }
 
 impl Iterator for OffsetPubkeyIter<'_> {
     type Item = (Offset, Pubkey);
+
     fn next(&mut self) -> Option<Self::Item> {
-        let record = self.iter.next()?.ok();
-        record.map(|entry| bytes!(#unpack, entry.1, Offset, Pubkey))
+        // Retrieve the next record from the LMDB iterator.
+        // The iterator yields `(&[u8], &[u8])` pairs of (key, value).
+        let (_, value_bytes) = self.iter.next()?.ok()?;
+
+        // Unpack the value, which contains the Offset and Pubkey.
+        // The layout matches the `programs` index: [Offset (4 bytes) | Pubkey (32 bytes)]
+        Some(bytes!(#unpack, value_bytes, Offset, Pubkey))
     }
 }
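The drop-order argument used by both `OffsetPubkeyIter` and `AccountOffsetFinder` rests on a language guarantee: struct fields are dropped in declaration order. A tiny standalone demonstration (illustrative only):

```rust
struct Noisy(&'static str);

impl Drop for Noisy {
    fn drop(&mut self) {
        println!("dropping {}", self.0);
    }
}

// Mirrors the field order of OffsetPubkeyIter: iter, then cursor, then txn.
#[allow(dead_code)]
struct Bundle {
    iter: Noisy,
    cursor: Noisy,
    txn: Noisy,
}

fn main() {
    let _bundle = Bundle {
        iter: Noisy("iter"),
        cursor: Noisy("cursor"),
        txn: Noisy("txn"),
    };
    // Prints "dropping iter", "dropping cursor", "dropping txn" in that
    // order: fields drop in declaration order, so each borrower goes away
    // before the thing it borrows from.
}
```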
diff --git a/magicblock-accounts-db/src/index/table.rs b/magicblock-accounts-db/src/index/table.rs
index dd2ae7697..56134a64d 100644
--- a/magicblock-accounts-db/src/index/table.rs
+++ b/magicblock-accounts-db/src/index/table.rs
@@ -3,14 +3,14 @@ use lmdb::{
     Transaction, WriteFlags,
 };
 
-use super::WEMPTY;
-
+/// Wrapper around LMDB Database providing a safe, typed interface.
 #[cfg_attr(test, derive(Debug))]
 pub(super) struct Table {
     db: Database,
 }
 
 impl Table {
+    /// Opens or creates a database within the given environment.
     pub(super) fn new(
         env: &Environment,
         name: &str,
@@ -20,6 +20,7 @@ impl Table {
         Ok(Self { db })
     }
 
+    /// Retrieves a value by key. Returns `None` if the key does not exist.
     #[inline]
     pub(super) fn get<'txn, T: Transaction, K: AsRef<[u8]>>(
         &self,
@@ -33,44 +34,50 @@ impl Table {
         }
     }
 
+    /// Inserts a key-value pair, **overwriting** any existing value.
     #[inline]
-    pub(super) fn put<K: AsRef<[u8]>, V: AsRef<[u8]>>(
+    pub(super) fn upsert<K: AsRef<[u8]>, V: AsRef<[u8]>>(
         &self,
         txn: &mut RwTransaction,
         key: K,
         value: V,
     ) -> lmdb::Result<()> {
-        txn.put(self.db, &key, &value, WEMPTY)
+        txn.put(self.db, &key, &value, WriteFlags::empty())
     }
 
+    /// Inserts a key-value pair **only if the key does not already exist**.
+    /// Returns `true` if inserted, `false` if the key already existed.
     #[inline]
-    pub(super) fn del<K: AsRef<[u8]>>(
+    pub(super) fn insert<K: AsRef<[u8]>, V: AsRef<[u8]>>(
         &self,
         txn: &mut RwTransaction,
         key: K,
-        value: Option<&[u8]>,
-    ) -> lmdb::Result<()> {
-        match txn.del(self.db, &key, value) {
-            Ok(_) | Err(lmdb::Error::NotFound) => Ok(()),
+        value: V,
+    ) -> lmdb::Result<bool> {
+        match txn.put(self.db, &key, &value, WriteFlags::NO_OVERWRITE) {
+            Ok(_) => Ok(true),
+            Err(lmdb::Error::KeyExist) => Ok(false),
             Err(e) => Err(e),
         }
     }
 
+    /// Removes a key. Returns `Ok(())` even if the key was not found (idempotent).
+    ///
+    /// If `value` is provided, the deletion only occurs if the data in the DB matches.
     #[inline]
-    pub(super) fn put_if_not_exists<K: AsRef<[u8]>, V: AsRef<[u8]>>(
+    pub(super) fn remove<K: AsRef<[u8]>>(
         &self,
         txn: &mut RwTransaction,
         key: K,
-        value: V,
-    ) -> lmdb::Result<bool> {
-        let result = txn.put(self.db, &key, &value, WriteFlags::NO_OVERWRITE);
-        match result {
-            Ok(_) => Ok(true),
-            Err(lmdb::Error::KeyExist) => Ok(false),
-            Err(err) => Err(err),
+        value: Option<&[u8]>,
+    ) -> lmdb::Result<()> {
+        match txn.del(self.db, &key, value) {
+            Ok(_) | Err(lmdb::Error::NotFound) => Ok(()),
+            Err(e) => Err(e),
         }
     }
 
+    /// Opens a read-only cursor.
     #[inline]
     pub(super) fn cursor_ro<'txn, T: Transaction>(
         &self,
@@ -79,6 +86,7 @@ impl Table {
         txn.open_ro_cursor(self.db)
     }
 
+    /// Opens a read-write cursor.
     #[inline]
     pub(super) fn cursor_rw<'txn>(
         &self,
@@ -87,6 +95,7 @@ impl Table {
         txn.open_rw_cursor(self.db)
     }
 
+    /// Returns the number of entries in the database.
     pub(super) fn entries<T: Transaction>(&self, txn: &T) -> usize {
         txn.stat(self.db).map(|s| s.entries()).unwrap_or_default()
    }
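The `insert`/`upsert` split maps directly onto LMDB's write flags. A standalone sketch of the underlying behavior against the same `lmdb` crate (my illustration; assumes `tempfile` is available as a dev-dependency):

```rust
use lmdb::{Environment, Transaction, WriteFlags};

fn main() -> lmdb::Result<()> {
    let dir = tempfile::tempdir().expect("tempdir");
    let env = Environment::new().open(dir.path())?;
    let db = env.open_db(None)?;

    let mut txn = env.begin_rw_txn()?;
    // upsert-style: empty flags, last write wins
    txn.put(db, b"key", b"v1", WriteFlags::empty())?;
    txn.put(db, b"key", b"v2", WriteFlags::empty())?;

    // insert-style: NO_OVERWRITE reports an existing key instead of clobbering it
    let res = txn.put(db, b"key", b"v3", WriteFlags::NO_OVERWRITE);
    assert!(matches!(res, Err(lmdb::Error::KeyExist)));

    assert_eq!(txn.get(db, b"key")?, b"v2".as_slice());
    txn.commit()
}
```

Mapping `KeyExist` to `Ok(false)` rather than an error keeps the hot path in `upsert_account` branch-friendly: the caller learns "new vs. existing" from a boolean instead of error handling.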
diff --git a/magicblock-accounts-db/src/index/tests.rs b/magicblock-accounts-db/src/index/tests.rs
index d13caa843..116c04c19 100644
--- a/magicblock-accounts-db/src/index/tests.rs
+++ b/magicblock-accounts-db/src/index/tests.rs
@@ -1,459 +1,296 @@
 use std::{
-    fs, ops::Deref, path::PathBuf, ptr::NonNull, sync::atomic::AtomicU32,
+    ops::Deref,
+    ptr::NonNull,
+    sync::atomic::{AtomicU32, Ordering},
 };
 
 use lmdb::Transaction;
-use magicblock_config::config::{AccountsDbConfig, BlockSize};
+use magicblock_config::config::AccountsDbConfig;
 use solana_pubkey::Pubkey;
+use tempfile::TempDir;
 
 use super::{AccountsDbIndex, Allocation};
 use crate::error::AccountsDbError;
 
-const INSERT_ACCOUNT_TXN_ERR: &str =
-    "failed to create transaction for account insertion";
-
+/// Verifies that `upsert_account` correctly handles both new insertions
+/// and updates to existing accounts, returning the old allocation when applicable.
 #[test]
-fn test_insert_account() {
-    let tenv = setup();
-    let IndexAccount {
-        pubkey,
-        owner,
-        allocation,
-    } = tenv.account();
-
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    let result = tenv.insert_account(&pubkey, &owner, allocation, &mut txn);
-    assert!(result.is_ok(), "failed to insert account into index");
+fn test_upsert_account() {
+    let env = IndexTestEnv::new();
+    let account = env.new_account();
+
+    // 1. First Insertion (New Account)
+    let mut txn = env.rw_txn();
+    let result = env.upsert_account(&account, &mut txn);
+    txn.commit().expect("commit failed");
+
+    assert!(result.is_ok(), "failed to insert account");
     assert!(
         result.unwrap().is_none(),
-        "new account should not be reallocated"
+        "new account should not have a previous allocation"
     );
-    txn.commit().expect("failed to commit transaction");
 
-    let reallocation = tenv.allocation();
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    let result = tenv.insert_account(&pubkey, &owner, reallocation, &mut txn);
-    assert!(result.is_ok(), "failed to RE-insert account into index");
-    let previous_allocation = allocation.into();
+    // 2. Re-insertion (Update Account)
+    let mut txn = env.rw_txn();
+    let new_allocation = env.new_allocation();
+
+    // We manually call the internal method to inject a specific new allocation
+    let result = env.index.upsert_account(
+        &account.pubkey,
+        &account.owner,
+        new_allocation,
+        &mut txn,
+    );
+    txn.commit().expect("commit failed");
+
+    assert!(result.is_ok(), "failed to update account");
+    let previous = result.unwrap();
     assert_eq!(
-        result.unwrap(),
-        Some(previous_allocation),
-        "account RE-insertion should return previous allocation"
+        previous,
+        Some(account.allocation.into()),
+        "update should return the old allocation for recycling"
     );
-    txn.commit().expect("failed to commit transaction");
 }
 
+/// Verifies we can retrieve the correct storage offset for an account
+/// using both the convenience method and raw transaction lookup.
 #[test]
-fn test_get_account_offset() {
-    let tenv = setup();
-    let IndexAccount {
-        pubkey,
-        owner,
-        allocation,
-    } = tenv.account();
-
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    tenv.insert_account(&pubkey, &owner, allocation, &mut txn)
-        .expect("failed to insert account");
-    txn.commit().expect("failed to commit transaction");
-    let result = tenv.get_account_offset(&pubkey);
-    assert!(result.is_ok(), "failed to read offset for inserted account");
-    assert_eq!(
-        result.unwrap(),
-        allocation.offset,
-        "offset of read account doesn't match that of written one"
-    );
+fn test_get_offset() {
+    let env = IndexTestEnv::new();
+    let account = env.new_account();
 
-    let txn = tenv
-        .env
-        .begin_rw_txn()
-        .expect("failed to start new RW transaction");
+    env.persist_account(&account);
 
-    let result = tenv.get_allocation(&txn, &pubkey);
+    // Test the high-level convenience method
+    let offset = env.get_offset(&account.pubkey);
+    assert_eq!(offset.unwrap(), account.allocation.offset);
 
-    assert!(
-        result.is_ok(),
-        "failed to read an allocation for inserted account"
-    );
-    assert_eq!(
-        result.unwrap(),
-        allocation.into(),
-        "allocation of account doesn't match one which was defined during insertion"
-    );
+    // Test the low-level internal retrieval
+    let txn = env.rw_txn();
+    let allocation = env.get_allocation(&txn, &account.pubkey);
+    assert_eq!(allocation.unwrap(), account.allocation.into());
 }
 
+/// Ensures that `reallocate_account` moves the account record, updates the index,
+/// and marks the old space as deallocated.
 #[test]
 fn test_reallocate_account() {
-    let tenv = setup();
-    let IndexAccount {
-        pubkey,
-        owner,
-        allocation,
-    } = tenv.account();
-
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    tenv.insert_account(&pubkey, &owner, allocation, &mut txn)
-        .expect("failed to insert account");
-    txn.commit().expect("failed to commit transaction");
-
-    let mut txn = tenv
-        .env
-        .begin_rw_txn()
-        .expect("failed to start new RW transaction");
-
-    let new_allocation = tenv.allocation();
-    let index_value =
-        bytes!(#pack, new_allocation.offset, u32, new_allocation.blocks, u32);
-    let result = tenv.reallocate_account(&pubkey, &mut txn, &index_value);
+    let env = IndexTestEnv::new();
+    let account = env.new_account();
+    env.persist_account(&account);
 
-    txn.commit().expect("failed to commit transaction");
+    let mut txn = env.rw_txn();
+    let new_allocation = env.new_allocation();
 
-    assert!(result.is_ok(), "failed to reallocate account");
-    assert_eq!(
-        result.unwrap(),
-        allocation.into(),
-        "allocation of account doesn't match one, which was defined during insertion"
+    let new_index_value = bytes!(
+        #pack,
+        new_allocation.offset, u32,
+        new_allocation.blocks, u32
     );
-    let result = tenv
-        .get_account_offset(&pubkey)
-        .expect("failed to read reallocated account");
+
+    let result =
+        env.reallocate_account(&account.pubkey, &mut txn, &new_index_value);
+    txn.commit().expect("commit failed");
+
+    assert!(result.is_ok());
     assert_eq!(
-        result, new_allocation.offset,
-        "reallocated account's offset doesn't match new allocation"
+        result.unwrap(),
+        account.allocation.into(),
+        "should return old allocation"
     );
+
+    // Verify persistence of the new offset
+    let stored_offset = env.get_offset(&account.pubkey).unwrap();
+    assert_eq!(stored_offset, new_allocation.offset);
 }
 
 #[test]
 fn test_remove_account() {
-    let tenv = setup();
-    let IndexAccount {
-        pubkey,
-        owner,
-        allocation,
-    } = tenv.account();
-
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    tenv.insert_account(&pubkey, &owner, allocation, &mut txn)
-        .expect("failed to insert account");
-    txn.commit().expect("failed to commit transaction");
-
-    let result = tenv.remove_account(&pubkey);
-
-    assert!(result.is_ok(), "failed to remove account");
-    let offset = tenv.get_account_offset(&pubkey);
-    assert!(
-        matches!(offset, Err(AccountsDbError::NotFound)),
-        "removed account offset is still present in index"
-    );
-    assert_eq!(
-        tenv.get_delloactions_count(),
-        1,
-        "the number of deallocations should have increased after account removal"
-    );
+    let env = IndexTestEnv::new();
+    let account = env.new_account();
+    env.persist_account(&account);
+
+    // Perform removal
+    let result = env.remove(&account.pubkey);
+    assert!(result.is_ok());
+
+    // Verify the Index Entry is gone
+    let offset = env.get_offset(&account.pubkey);
+    assert!(matches!(offset, Err(AccountsDbError::NotFound)));
+
+    // Verify the Space was Deallocated
+    assert_eq!(env.count_deallocations(), 1, "deallocations count mismatch");
 }
 
+/// Tests the owner consistency check.
+/// If an account's owner changes, the `programs` index must be updated
+/// to reflect the move from the old owner to the new owner.
 #[test]
 fn test_ensure_correct_owner() {
-    let tenv = setup();
-    let IndexAccount {
-        pubkey,
-        owner,
-        allocation,
-    } = tenv.account();
-
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    tenv.insert_account(&pubkey, &owner, allocation, &mut txn)
-        .expect("failed to insert account");
-    txn.commit().expect("failed to commit transaction");
-    let iter = tenv.get_program_accounts_iter(&owner);
-    assert!(
-        iter.is_ok(),
-        "failed to get iterator for newly inserted program account"
-    );
-    let mut iter = iter.unwrap();
-    assert_eq!(
-        iter.next(),
-        Some((allocation.offset, pubkey)),
-        "account returned by program iterator is invalid one"
-    );
-    assert_eq!(
-        iter.next(),
-        None,
-        "program iterator returned more than the number of inserted accounts"
-    );
-    drop(iter);
-
+    let env = IndexTestEnv::new();
+    let account = env.new_account();
+    env.persist_account(&account);
+
+    // Verify the initial state
+    let programs: Vec<_> = env
+        .get_program_accounts_iter(&account.owner)
+        .unwrap()
+        .collect();
+    assert_eq!(programs.len(), 1);
+    assert_eq!(programs[0], (account.allocation.offset, account.pubkey));
+
+    // Change the Owner
     let new_owner = Pubkey::new_unique();
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    assert!(
-        tenv.ensure_correct_owner(&pubkey, &new_owner, &mut txn)
-            .is_ok(),
-        "failed to ensure correct account owner"
-    );
-    txn.commit().expect("failed to commit transaction");
-    let result = tenv.get_program_accounts_iter(&owner);
-    assert!(
-        matches!(result.map(|i| i.count()), Ok(0)),
-        "programs index still has record of account after owner change"
-    );
-
-    let mut iter = tenv
-        .get_program_accounts_iter(&new_owner)
-        .expect("failed to get iterator for newly inserted program account");
-    assert_eq!(
-        iter.next(),
-        Some((allocation.offset, pubkey)),
-        "account returned by program iterator is invalid one"
-    );
+    let mut txn = env.rw_txn();
+    let result =
+        env.ensure_correct_owner(&account.pubkey, &new_owner, &mut txn);
+    txn.commit().expect("commit failed");
+    assert!(result.is_ok());
+
+    // Verify the old owner is empty
+    let old_programs_count = env
+        .get_program_accounts_iter(&account.owner)
+        .unwrap()
+        .count();
+    assert_eq!(old_programs_count, 0);
+
+    // Verify the new owner has the account
+    let new_programs: Vec<_> =
+        env.get_program_accounts_iter(&new_owner).unwrap().collect();
+    assert_eq!(new_programs.len(), 1);
+    assert_eq!(new_programs[0], (account.allocation.offset, account.pubkey));
 }
 
 #[test]
 fn test_program_index_cleanup() {
-    let tenv = setup();
-    let IndexAccount {
-        pubkey,
-        owner,
-        allocation,
-    } = tenv.account();
-
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    tenv.insert_account(&pubkey, &owner, allocation, &mut txn)
-        .expect("failed to insert account");
-    txn.commit().expect("failed to commit transaction");
-
-    let mut txn = tenv
-        .env
-        .begin_rw_txn()
-        .expect("failed to start new RW transaction");
-    let result = tenv.remove_programs_index_entry(
-        &pubkey,
+    let env = IndexTestEnv::new();
+    let account = env.new_account();
+    env.persist_account(&account);
+
+    let mut txn = env.rw_txn();
+    let result = env.remove_program_index_entry(
+        &account.pubkey,
         None,
         &mut txn,
-        allocation.offset,
-    );
-    assert!(result.is_ok(), "failed to remove entry from programs index");
-    txn.commit().expect("failed to commit transaction");
-
-    let result = tenv.get_program_accounts_iter(&owner);
-    assert!(
-        matches!(result.map(|i| i.count()), Ok(0)),
-        "programs index still has record of account after cleanup"
+        account.allocation.offset,
     );
+    txn.commit().expect("commit failed");
+    assert!(result.is_ok());
+
+    // Verify cleanup
+    let count = env
+        .get_program_accounts_iter(&account.owner)
+        .unwrap()
+        .count();
+    assert_eq!(count, 0);
 }
 
+/// Verifies the full cycle of:
+/// Insert -> Reallocate (creates hole) -> Insert New (fills hole).
 #[test]
-fn test_recycle_allocation_after_realloc() {
-    let tenv = setup();
-    let IndexAccount {
-        pubkey,
-        owner,
-        allocation,
-    } = tenv.account();
-
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    tenv.insert_account(&pubkey, &owner, allocation, &mut txn)
-        .expect("failed to insert account");
-    txn.commit().expect("failed to commit transaction");
-
-    let mut txn = tenv
-        .env
-        .begin_rw_txn()
-        .expect("failed to start new RW transaction");
-    let new_allocation = tenv.allocation();
-    let index_value =
+fn test_recycle_allocation_flow() {
+    let env = IndexTestEnv::new();
+    let account = env.new_account();
+
+    // 1. Insert
+    env.persist_account(&account);
+
+    // 2. Reallocate (moves the account, creating a hole at `account.allocation`)
+    let mut txn = env.rw_txn();
+    let new_allocation = env.new_allocation();
+    let index_val =
         bytes!(#pack, new_allocation.offset, u32, new_allocation.blocks, u32);
-    tenv.reallocate_account(&pubkey, &mut txn, &index_value)
-        .expect("failed to reallocate account");
-    txn.commit().expect("failed to commit transaction");
-    assert_eq!(
-        tenv.get_delloactions_count(),
-        1,
-        "the number of deallocations should have increased after account realloc"
-    );
+    env.reallocate_account(&account.pubkey, &mut txn, &index_val)
+        .unwrap();
+    txn.commit().expect("commit failed");
 
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    let result = tenv.try_recycle_allocation(new_allocation.blocks, &mut txn);
-    assert_eq!(
-        result.expect("failed to recycle allocation"),
-        allocation.into()
-    );
-    txn.commit().expect("failed to commit transaction");
-    assert_eq!(
-        tenv.get_delloactions_count(),
-        0,
-        "the number of deallocations should have decreased after recycling"
-    );
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    let result = tenv.try_recycle_allocation(new_allocation.blocks, &mut txn);
-    assert!(
-        matches!(result, Err(AccountsDbError::NotFound)),
-        "deallocations index should have run out of existing allocations"
-    );
-    txn.commit().expect("failed to commit transaction");
-    tenv.remove_account(&pubkey)
-        .expect("failed to remove account");
-    assert_eq!(
-        tenv.get_delloactions_count(),
-        1,
-        "the number of deallocations should have increased after account removal"
-    );
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    let result = tenv.try_recycle_allocation(new_allocation.blocks, &mut txn);
-    assert_eq!(
-        result.expect("failed to recycle allocation after account removal"),
-        new_allocation.into()
-    );
-    txn.commit().expect("failed to commit transaction");
-    assert_eq!(
-        tenv.get_delloactions_count(),
-        0,
-        "the number of deallocations should have decreased after recycling"
-    );
+    assert_eq!(env.count_deallocations(), 1, "hole created");
+
+    // 3. Recycle an exact fit
+    // We request a block size exactly matching the hole we just made.
+    let mut txn = env.rw_txn();
+    let recycled =
+        env.try_recycle_allocation(account.allocation.blocks, &mut txn);
+    txn.commit().expect("commit failed");
+
+    assert!(recycled.is_ok());
+    assert_eq!(
+        recycled.unwrap(),
+        account.allocation.into(),
+        "should reuse the exact hole"
+    );
+    assert_eq!(env.count_deallocations(), 0, "hole consumed");
+
+    // 4. Recycle when nothing is free (should fail)
+    let mut txn = env.rw_txn();
+    let missing = env.try_recycle_allocation(100, &mut txn);
+    assert!(matches!(missing, Err(AccountsDbError::NotFound)));
 }
 
+/// Verifies that requesting a smaller size than an available hole
+/// correctly splits the hole and returns the remainder to the free list.
 #[test]
 fn test_recycle_allocation_split() {
-    let tenv = setup();
-    let IndexAccount {
-        pubkey,
-        owner,
-        allocation,
-    } = tenv.account();
-
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    tenv.insert_account(&pubkey, &owner, allocation, &mut txn)
-        .expect("failed to insert account");
-    txn.commit().expect("failed to commit transaction");
-    tenv.remove_account(&pubkey).unwrap();
-    assert_eq!(
-        tenv.get_delloactions_count(),
-        1,
-        "the number of deallocations should have increased after account removal"
-    );
+    let env = IndexTestEnv::new();
+    let account = env.new_account();
+    env.persist_account(&account);
+    env.remove(&account.pubkey).unwrap();
 
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    let result = tenv
-        .try_recycle_allocation(allocation.blocks / 2, &mut txn)
-        .expect("failed to recycle allocation");
-    assert_eq!(result.blocks, allocation.blocks / 2);
-    assert_eq!(result.offset, allocation.offset);
-    txn.commit().expect("failed to commit transaction");
-
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    let result = tenv
-        .try_recycle_allocation(allocation.blocks / 2, &mut txn)
-        .expect("failed to recycle allocation");
-    assert_eq!(result.blocks, allocation.blocks / 2);
-    assert!(result.offset > allocation.offset);
-    txn.commit().expect("failed to commit transaction");
+    assert_eq!(env.count_deallocations(), 1);
 
-    assert_eq!(
-        tenv.get_delloactions_count(),
-        0,
-        "the number of deallocations should have decreased after recycling"
-    );
-    let mut txn = tenv.env.begin_rw_txn().expect(INSERT_ACCOUNT_TXN_ERR);
-    let result = tenv.try_recycle_allocation(allocation.blocks, &mut txn);
-    assert!(
-        matches!(result, Err(AccountsDbError::NotFound)),
-        "deallocations index should have run out of existing allocations"
-    );
-    txn.commit().expect("failed to commit transaction");
+    let half_size = account.allocation.blocks / 2;
+
+    // 1. Recycle the first half
+    let mut txn = env.rw_txn();
+    let part1 = env.try_recycle_allocation(half_size, &mut txn).unwrap();
+    txn.commit().expect("commit failed");
+
+    assert_eq!(part1.blocks, half_size);
+    assert_eq!(part1.offset, account.allocation.offset);
+
+    // Should still have one hole (the remainder)
+    assert_eq!(env.count_deallocations(), 1);
+
+    // 2. Recycle the second half (the remainder)
+    let mut txn = env.rw_txn();
+    let part2 = env.try_recycle_allocation(half_size, &mut txn).unwrap();
+    txn.commit().expect("commit failed");
+
+    assert_eq!(part2.blocks, half_size);
+    // The new offset should be shifted by the size of the first part
+    assert!(part2.offset > account.allocation.offset);
+
+    // Empty now
+    assert_eq!(env.count_deallocations(), 0);
 }
 
 #[test]
 fn test_byte_pack_unpack_macro() {
-    macro_rules! check {
-        ($v1: expr, $t1: ty, $v2: expr, $t2: ty, $tranformer: ident) => {
-            check!($v1, $t1, $v2, $t2, $tranformer, $tranformer);
-        };
-        ($v1: expr, $t1: ty, $v2: expr, $t2: ty, $tranformer1: ident, $tranformer2: ident) => {{
-            // get the cummulative size of value 1 and value 2, as they are laid out in memory
-            const S1: usize = size_of::<$t1>();
-            const S2: usize = size_of::<$t2>();
-            // create a buffer array to hold both values in concatenated form
-            let mut expected = [0_u8; S1 + S2];
-
-            // put the first value to S1 bytes of buffer, by using type to bytes transformer
-            expected[..S1].copy_from_slice(<$t1>::$tranformer1($v1).as_slice());
-            // put the second value to S2 bytes of buffer following S1 bytes, by using type to bytes transformer
-            expected[S1..].copy_from_slice(<$t2>::$tranformer2($v2).as_slice());
-
-            // pack/concatenate the values together
-            let result = bytes!(#pack, $v1, $t1, $v2, $t2);
-
-            // manually serialized buffer array should match the array produced by bytes! macro
-            assert_eq!(
-                result,
-                expected,
-                "invalid byte packing of {} ({}) and {} ({})",
-                $v1, stringify!($t1), $v2, stringify!($t2)
-            );
-            // now, undo the whole thing by unpacking the array back to constituent types
-            let (v1, v2) = bytes!(#unpack, result, $t1, $t2);
-
-            // we should get exactly the same values and types for the first value
-            assert_eq!(
-                $v1, v1, "unpacked value 1 doesn't match with initial {} <> {v1} ({})",
-                $v1, stringify!($t1)
-            );
-            // same goes for the second value
-            assert!(
-                $v2.eq(&v2), "unpacked value 2 doesn't match with initial {} <> {v2} ({})",
-                $v2, stringify!($t2)
-            );
+    macro_rules! check_pack {
+        ($v1: expr, $t1: ty, $v2: expr, $t2: ty) => {{
+            let packed = bytes!(#pack, $v1, $t1, $v2, $t2);
+            let (u1, u2) = bytes!(#unpack, packed, $t1, $t2);
+            assert_eq!($v1, u1);
+            assert_eq!($v2, u2);
         }};
     }
 
-    check!(13, u8, 42, i64, to_le_bytes);
-    check!(13, i8, 42, u8, to_le_bytes);
-    check!(13, u16, 42, i8, to_le_bytes);
-    check!(13, i16, 42, u16, to_le_bytes);
-    check!(13, u32, 42, i16, to_le_bytes);
-    check!(13, i32, 42, u32, to_le_bytes);
-    check!(13, u64, 42, i32, to_le_bytes);
-    check!(13, i64, 42, u64, to_le_bytes);
+    check_pack!(13u8, u8, 42u64, u64);
+    check_pack!(12345u32, u32, 67890u32, u32);
 
     let pubkey = Pubkey::new_unique();
-
-    check!(13, u8, pubkey, Pubkey, to_le_bytes, to_bytes);
-    check!(13, i8, pubkey, Pubkey, to_le_bytes, to_bytes);
-    check!(13, u16, pubkey, Pubkey, to_le_bytes, to_bytes);
-    check!(13, i16, pubkey, Pubkey, to_le_bytes, to_bytes);
-    check!(13, u32, pubkey, Pubkey, to_le_bytes, to_bytes);
-    check!(13, i32, pubkey, Pubkey, to_le_bytes, to_bytes);
-    check!(13, u64, pubkey, Pubkey, to_le_bytes, to_bytes);
-    check!(13, i64, pubkey, Pubkey, to_le_bytes, to_bytes);
-
-    check!(pubkey, Pubkey, 13, u8, to_bytes, to_le_bytes);
-    check!(pubkey, Pubkey, 13, i8, to_bytes, to_le_bytes);
-    check!(pubkey, Pubkey, 13, u16, to_bytes, to_le_bytes);
-    check!(pubkey, Pubkey, 13, i16, to_bytes, to_le_bytes);
-    check!(pubkey, Pubkey, 13, u32, to_bytes, to_le_bytes);
-    check!(pubkey, Pubkey, 13, i32, to_bytes, to_le_bytes);
-    check!(pubkey, Pubkey, 13, u64, to_bytes, to_le_bytes);
-    check!(pubkey, Pubkey, 13, i64, to_bytes, to_le_bytes);
+    check_pack!(100u32, u32, pubkey, Pubkey);
+    check_pack!(pubkey, Pubkey, 255u8, u8);
 }
 
 // ==============================================================
+// TEST UTILITIES
 // ==============================================================
-// UTILITY CODE BELOW
-// ==============================================================
-// ==============================================================
-
-fn setup() -> IndexTestEnv {
-    let config = AccountsDbConfig::default();
-    let directory = tempfile::tempdir()
-        .expect("failed to create temp directory for index tests")
-        .keep();
-    let index = AccountsDbIndex::new(config.index_size, &directory)
-        .expect("failed to create accountsdb index in temp dir");
-    IndexTestEnv { index, directory }
-}
 
 struct IndexTestEnv {
     index: AccountsDbIndex,
-    directory: PathBuf,
+    // Kept to ensure directory deletion on drop
+    _temp_dir: TempDir,
 }
 
 struct IndexAccount {
@@ -463,42 +300,80 @@ struct IndexAccount {
 impl IndexTestEnv {
-    fn allocation(&self) -> Allocation {
-        static ALLOCATION: AtomicU32 =
-            AtomicU32::new(BlockSize::Block256 as u32);
+    fn new() -> Self {
+        let _temp_dir = tempfile::tempdir().expect("failed to create temp dir");
+        let index = AccountsDbIndex::new(
+            AccountsDbConfig::default().index_size,
+            _temp_dir.path(),
+        )
+        .expect("failed to create index");
+
+        Self { index, _temp_dir }
+    }
+
+    /// Opens a read-write transaction. Panics on failure.
+    fn rw_txn(&self) -> lmdb::RwTransaction<'_> {
+        self.index
+            .env
+            .begin_rw_txn()
+            .expect("failed to begin rw txn")
+    }
+
+    /// Generates a new dummy allocation with a unique offset.
+    fn new_allocation(&self) -> Allocation {
+        static OFFSET_COUNTER: AtomicU32 = AtomicU32::new(0);
         let blocks = 4;
-        let offset = ALLOCATION.fetch_add(
-            (BlockSize::Block256 as u32) * blocks,
-            std::sync::atomic::Ordering::Relaxed,
-        );
+        // Mocking an offset allocator
+        let offset = OFFSET_COUNTER.fetch_add(blocks, Ordering::Relaxed);
         Allocation {
-            storage: NonNull::dangling(),
+            ptr: NonNull::dangling(),
            offset,
             blocks,
         }
     }
-    fn account(&self) -> IndexAccount {
-        let pubkey = Pubkey::new_unique();
-        let owner = Pubkey::new_unique();
-        let allocation = self.allocation();
+
+    /// Generates a random account with a dummy allocation.
+    fn new_account(&self) -> IndexAccount {
         IndexAccount {
-            pubkey,
-            owner,
-            allocation,
+            pubkey: Pubkey::new_unique(),
+            owner: Pubkey::new_unique(),
+            allocation: self.new_allocation(),
         }
     }
+
+    /// Helper to insert an account into the index immediately.
+    /// Useful for setting up test state.
+    fn persist_account(&self, acc: &IndexAccount) {
+        let mut txn = self.rw_txn();
+        self.index
+            .upsert_account(&acc.pubkey, &acc.owner, acc.allocation, &mut txn)
+            .expect("persist failed");
+        txn.commit().expect("persist commit failed");
+    }
+
+    /// Helper to count entries in the private deallocations table.
+    fn count_deallocations(&self) -> usize {
+        let txn = self.index.env.begin_ro_txn().unwrap();
+        self.index.deallocations.entries(&txn)
+    }
+
+    /// Helper wrapper to call upsert on the internal account struct.
+    fn upsert_account(
+        &self,
+        acc: &IndexAccount,
+        txn: &mut lmdb::RwTransaction,
+    ) -> crate::AccountsDbResult<Option<super::ExistingAllocation>>
+    {
+        self.index
+            .upsert_account(&acc.pubkey, &acc.owner, acc.allocation, txn)
+    }
 }
 
+// Enable calling Index methods directly on the test Env
 impl Deref for IndexTestEnv {
     type Target = AccountsDbIndex;
     fn deref(&self) -> &Self::Target {
         &self.index
     }
 }
-
-impl Drop for IndexTestEnv {
-    fn drop(&mut self) {
-        let _ = fs::remove_dir_all(&self.directory);
-    }
-}
diff --git a/magicblock-accounts-db/src/index/utils.rs b/magicblock-accounts-db/src/index/utils.rs
index d04d04611..066ee6db7 100644
--- a/magicblock-accounts-db/src/index/utils.rs
+++ b/magicblock-accounts-db/src/index/utils.rs
@@ -1,68 +1,93 @@
 use std::{fs, path::Path};
 
-use lmdb::{Cursor, Environment, EnvironmentFlags, RoCursor, RoTransaction};
+use lmdb::{
+    Cursor, Environment, EnvironmentFlags, Error as LmdbError, RoCursor,
+    RoTransaction,
+};
 use solana_pubkey::Pubkey;
 
 use super::{table::Table, Offset};
 use crate::{index::Blocks, AccountsDbResult};
 
-// Below is the list of LMDB cursor operation consts, which were copy
-// pasted since they are not exposed in the public API of LMDB
-// See https://github.com/mozilla/lmdb-rs/blob/946167603dd6806f3733e18f01a89cee21888468/lmdb-sys/src/bindings.rs#L158
-
-#[doc = "Position at first key greater than or equal to specified key."]
+// Redefining LMDB cursor-op constants that are not exposed by the safe wrapper crate.
+// Values correspond to MDB_SET_RANGE and MDB_SET in lmdb.h
 pub(super) const MDB_SET_RANGE_OP: u32 = 17;
-#[doc = "Position at specified key"]
 pub(super) const MDB_SET_OP: u32 = 15;
 
-const TABLES_COUNT: u32 = 4;
+const MAX_DBS: u32 = 4;
 
-pub(super) fn lmdb_env(dir: &Path, size: usize) -> lmdb::Result<Environment> {
-    let lmdb_env_flags: EnvironmentFlags =
-        // allows to manually trigger flush syncs, but OS initiated flushes are somewhat beyond our control
-        EnvironmentFlags::NO_SYNC
-        // don't bother with copy on write and mutate the memory
-        // directly, saves CPU cycles and memory access
+/// Configures and opens the LMDB environment.
+///
+/// Renamed from `lmdb_env` for clarity.
+pub(super) fn create_lmdb_env(
+    dir: &Path,
+    map_size: usize,
+) -> lmdb::Result<Environment> {
+    // Flag choices:
+    // - NO_SYNC: Let the OS manage flushing (durability comes from snapshots).
+    // - WRITE_MAP: Use a writable mmap (avoids double buffering).
+    // - NO_MEM_INIT: Don't zero out memory (we overwrite it anyway).
+    // - NO_READAHEAD: Access is random, so readahead is wasted work.
+    let flags = EnvironmentFlags::NO_SYNC
         | EnvironmentFlags::WRITE_MAP
-        // we never read uninit memory, so there's no point in paying for meminit
         | EnvironmentFlags::NO_MEM_INIT
-        // accounts' access is pretty much random, so read ahead might be doing unecessary work
         | EnvironmentFlags::NO_READAHEAD;
 
-    let path = dir.join("index");
-    let _ = fs::create_dir_all(&path);
+    let index_dir = dir.join("index");
+    if !index_dir.exists() {
+        fs::create_dir_all(&index_dir)
+            .map_err(|e| LmdbError::Other(e.raw_os_error().unwrap_or(0)))?;
+    }
+
     Environment::new()
-        .set_map_size(size)
-        .set_max_dbs(TABLES_COUNT)
-        .set_flags(lmdb_env_flags)
-        .open_with_permissions(&path, 0o644)
+        .set_map_size(map_size)
+        .set_max_dbs(MAX_DBS)
+        .set_flags(flags)
+        .open_with_permissions(&index_dir, 0o644)
 }
 
-/// A wrapper around a cursor on the accounts table
+/// A self-contained cursor wrapper for efficient account lookups.
+///
+/// Holds both the transaction and the cursor to ensure safe lifetime management
+/// while exposing a simple API.
-/// A wrapper around a cursor on the accounts table
+/// A self-contained cursor wrapper for efficient account lookups.
+///
+/// Holds both the transaction and the cursor to ensure safe lifetime management
+/// while exposing a simple API.
 pub struct AccountOffsetFinder<'env> {
     cursor: RoCursor<'env>,
+    // Kept alive to support the cursor's lifetime.
+    // Note: Field order matters for Drop! `cursor` depends on `_txn`, so `cursor` must be dropped first.
+    // Rust drops fields in declaration order (top to bottom).
     _txn: RoTransaction<'env>,
 }
 
 impl<'env> AccountOffsetFinder<'env> {
-    /// Set up a new cursor
+    /// Creates a new finder instance.
     pub(super) fn new(
         table: &Table,
         txn: RoTransaction<'env>,
     ) -> AccountsDbResult<Self> {
         let cursor = table.cursor_ro(&txn)?;
+
         // SAFETY:
-        // nasty/neat trick for lifetime erasure, but we are upholding
-        // the rust's ownership contracts by keeping txn around as well
-        let cursor: RoCursor = unsafe { std::mem::transmute(cursor) };
+        // We are constructing a self-referential struct.
+        // 1. `txn` is moved into the struct.
+        // 2. `cursor` is created from `txn` but we must extend its lifetime to `'env`
+        //    to store it alongside `txn`.
+        //
+        // This is safe because:
+        // - LMDB cursors are valid as long as the transaction is open.
+        // - We ensure `cursor` is dropped before `_txn` via struct field order.
+        let cursor: RoCursor<'env> = unsafe { std::mem::transmute(cursor) };
+
         Ok(Self { cursor, _txn: txn })
     }
 
-    /// Find a storage offset for the given account
+    /// Finds the storage offset for a given account public key.
     pub(crate) fn find(&self, pubkey: &Pubkey) -> Option<Offset> {
+        let key_bytes = pubkey.as_ref();
+
+        // MDB_SET_OP: Position at specified key
         self.cursor
-            .get(Some(pubkey.as_ref()), None, MDB_SET_OP)
+            .get(Some(key_bytes), None, MDB_SET_OP)
             .ok()
-            .map(|(_, v)| bytes!(#unpack, v, Offset, Blocks).0)
+            .map(|(_, val_bytes)| bytes!(#unpack, val_bytes, Offset, Blocks).0)
     }
 }
diff --git a/magicblock-accounts-db/src/lib.rs b/magicblock-accounts-db/src/lib.rs
index 8a04ec2ca..7657a0770 100644
--- a/magicblock-accounts-db/src/lib.rs
+++ b/magicblock-accounts-db/src/lib.rs
@@ -1,84 +1,107 @@
-use std::{collections::HashSet, path::Path, sync::Arc};
+use std::{path::Path, sync::Arc, thread};
 
-use error::AccountsDbError;
+use error::{AccountsDbError, LogErr};
 use index::{
     iterator::OffsetPubkeyIter, utils::AccountOffsetFinder, AccountsDbIndex,
 };
 use lmdb::{RwTransaction, Transaction};
-use log::{error, warn};
+use log::{error, info, warn};
 use magicblock_config::config::AccountsDbConfig;
 use magicblock_core::traits::AccountsBank;
 use parking_lot::RwLock;
-use snapshot::SnapshotEngine;
 use solana_account::{
     cow::AccountBorrowed, AccountSharedData, ReadableAccount,
 };
 use solana_pubkey::Pubkey;
 use storage::AccountsStorage;
 
+// Use the refactored manager
+use crate::snapshot::SnapshotManager;
+
 pub type AccountsDbResult<T> = Result<T, AccountsDbError>;
-/// Stop the World Lock, used to halt all writes to the accountsdb
-/// while some critical operation is in action, e.g. snapshotting
-pub type StWLock = Arc<RwLock<()>>;
+
+/// A global lock used to suspend all write operations during critical
+/// sections (like snapshots).
+pub type GlobalWriteLock = Arc<RwLock<()>>;
 
 pub const ACCOUNTSDB_DIR: &str = "accountsdb";
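A short sketch of how the lock type above is meant to be used (the alias is restated locally so the snippet stands alone; the scenario is invented):

```rust
use std::sync::Arc;
use parking_lot::RwLock;

type GlobalWriteLock = Arc<RwLock<()>>; // restated for the demo

fn snapshot_critical_section(lock: &GlobalWriteLock) {
    // Taking the exclusive guard stalls all writers until the
    // critical operation (e.g. a snapshot) completes.
    let _guard = lock.write();
    // ... flush + snapshot happens here ...
} // guard dropped: writes resume
```

+/// The main Accounts Database.
+///
+/// Coordinates:
+/// 1. **Storage**: Append-only memory-mapped log (`AccountsStorage`).
+/// 2. **Indexing**: LMDB-based key-value store (`AccountsDbIndex`).
+/// 3. **Persistence**: Snapshot management (`SnapshotManager`).
 #[cfg_attr(test, derive(Debug))]
 pub struct AccountsDb {
-    /// Main accounts storage, where actual account records are kept
+    /// Underlying append-only storage for account data.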
storage: AccountsStorage, - /// Index manager, used for various lookup operations + /// Fast index for account lookups (Pubkey -> Offset). index: AccountsDbIndex, - /// Snapshots manager - snapshot_engine: Arc, - /// Synchronization lock, employed for preventing other threads from - /// writing to accountsdb, currently used for snapshotting only - synchronizer: StWLock, - /// Slot wise frequency at which snapshots should be taken + /// Manages snapshots and state restoration. + snapshot_manager: Arc, + /// Global lock ensures atomic snapshots by pausing writes. + /// Note: Reads are generally wait-free/lock-free via mmap, + /// unless they require index cursor stability. + write_lock: GlobalWriteLock, + /// Configured interval (in slots) for creating snapshots. snapshot_frequency: u64, } impl AccountsDb { - /// Open or create accounts database + /// Initializes the Accounts Database. + /// + /// This handles directory creation and potentially resets the state + /// if `config.reset` is true. pub fn new( config: &AccountsDbConfig, - directory: &Path, + root_dir: &Path, max_slot: u64, ) -> AccountsDbResult { - let directory = directory.join(format!("{ACCOUNTSDB_DIR}/main")); - let lock = StWLock::default(); + let db_dir = root_dir.join(ACCOUNTSDB_DIR).join("main"); - if config.reset && std::fs::exists(&directory)? { - std::fs::remove_dir_all(&directory).inspect_err(log_err!( - "failed to reset accountsdb root directory" - ))?; + if config.reset && db_dir.exists() { + info!("Resetting AccountsDb at {}", db_dir.display()); + std::fs::remove_dir_all(&db_dir).log_err(|| { + "Failed to reset accountsdb directory".to_string() + })?; + } + std::fs::create_dir_all(&db_dir) + .log_err(|| "Failed to create accountsdb directory".to_string())?; + + let storage = AccountsStorage::new(config, &db_dir) + .log_err(|| "Failed to initialize storage".to_string())?; + + let index = AccountsDbIndex::new(config.index_size, &db_dir) + .log_err(|| "Failed to initialize index".to_string())?; + + let snapshot_manager = + SnapshotManager::new(db_dir.clone(), config.max_snapshots as usize) + .log_err(|| { + "Failed to initialize snapshot manager".to_string() + })?; + + if config.snapshot_frequency == 0 { + return Err(AccountsDbError::Internal( + "Snapshot frequency cannot be zero".to_string(), + )); } - std::fs::create_dir_all(&directory).inspect_err(log_err!( - "ensuring existence of accountsdb directory" - ))?; - let storage = AccountsStorage::new(config, &directory) - .inspect_err(log_err!("storage creation"))?; - let index = AccountsDbIndex::new(config.index_size, &directory) - .inspect_err(log_err!("index creation"))?; - let snapshot_engine = - SnapshotEngine::new(directory, config.max_snapshots as usize) - .inspect_err(log_err!("snapshot engine creation"))?; - let snapshot_frequency = config.snapshot_frequency; - assert_ne!(snapshot_frequency, 0, "snapshot frequency cannot be zero"); let mut this = Self { storage, index, - snapshot_engine, - synchronizer: lock, - snapshot_frequency, + snapshot_manager, + write_lock: GlobalWriteLock::default(), + snapshot_frequency: config.snapshot_frequency, }; - this.ensure_at_most(max_slot)?; + + // Recover state if the requested slot is older than our current state + this.restore_state_if_needed(max_slot)?; + Ok(this) } - /// Opens existing database with given snapshot_frequency, used for tests and tools - /// most likely you want to use [new](AccountsDb::new) method + /// Opens an existing database (helper for tooling/tests). 
pub fn open(directory: &Path) -> AccountsDbResult { let config = AccountsDbConfig { snapshot_frequency: u64::MAX, @@ -87,242 +110,146 @@ impl AccountsDb { Self::new(&config, directory, 0) } - /// Insert account with given pubkey into the database - /// Note: this method removes zero lamport account from database + /// Inserts or updates a single account. pub fn insert_account( &self, pubkey: &Pubkey, account: &AccountSharedData, ) -> AccountsDbResult<()> { - let mut txn = Option::::None; - macro_rules! txn { - () => { - match txn.as_mut() { - Some(t) => t, - None => txn.insert(self.index.rwtxn()?), - } - }; + let mut txn = self.index.rwtxn()?; + self.perform_account_upsert(pubkey, account, &mut txn)?; + txn.commit()?; + Ok(()) + } + + /// Inserts multiple accounts atomically. + /// + /// If any insertion fails, the entire batch is rolled back (index changes are aborted). + pub fn insert_batch<'a>( + &self, + accounts: impl Iterator + Clone, + ) -> AccountsDbResult<()> { + let mut txn = self.index.rwtxn()?; + let mut processed_count = 0; + + // Optimistic execution + for (pubkey, account) in accounts.clone() { + let result = self.perform_account_upsert(pubkey, account, &mut txn); + if let Err(e) = result { + error!("Batch insert failed at {pubkey}: {e}"); + // Rollback in-memory Cow changes for accounts processed so far + self.rollback_borrowed_accounts(accounts, processed_count); + // Abort LMDB transaction (implicit on drop, but explicit return here) + return Err(e); + } + processed_count += 1; + } + + if let Err(e) = txn.commit() { + error!("Batch insert commit failed {e}"); + // Rollback in-memory CoW changes for all processed accounts + self.rollback_borrowed_accounts(accounts, processed_count); + return Err(e.into()); + } + Ok(()) + } + + fn rollback_borrowed_accounts<'a>( + &self, + accounts: impl Iterator, + count: usize, + ) { + for (_, account) in accounts.take(count) { + // SAFETY: + // We invoke rollback on the specific `AccountSharedData` instances + // that were modified in-memory during this failed transaction. + // This method is called only for accounts previously modified by + // perform_account_upsert. + unsafe { account.rollback() }; } - // NOTE: we don't check for non-zero lamports since we allow to store zero-lamport accounts - // for the following two cases: - // - when we clone a compressed account we reflect the exact lamports it has which maybe - // zero since compressed accounts don't need to be rent-exempt - // - when we clone an account to signal that we fetched it from chain already but did not - // find it, i.e. in the case of an escrow account to avoid doing that over and over + } + + /// Core upsert logic: Storage Allocation -> Index Update -> Data Write. + fn perform_account_upsert( + &self, + pubkey: &Pubkey, + account: &AccountSharedData, + txn: &mut RwTransaction, + ) -> AccountsDbResult<()> { match account { AccountSharedData::Borrowed(acc) => { - // check whether the account's owner has changed + // Borrowed accounts are updated in-place in RAM. + // We only touch the index if ownership changes to ensure consistency. 
if acc.owner_changed() { self.index - .ensure_correct_owner(pubkey, account.owner(), txn!()) - .inspect_err(log_err!( - "failed to ensure correct account owner for {}", - pubkey - ))?; + .ensure_correct_owner(pubkey, account.owner(), txn) + .log_err(|| { + "Failed to update owner index".to_string() + })?; } - // and perform some index bookkeeping to ensure a correct owner - // For borrowed variants everything is already written and we just increment the - // atomic counter. New readers will see the latest update. acc.commit(); } AccountSharedData::Owned(acc) => { - let txn = txn!(); - let datalen = account.data().len() as u32; + // 1. Calculate size requirements + let data_len = account.data().len() as u32; let block_size = self.storage.block_size() as u32; - let size = AccountSharedData::serialized_size_aligned( - datalen, block_size, + let size_bytes = AccountSharedData::serialized_size_aligned( + data_len, block_size, ) as usize; - let blocks = self.storage.get_block_count(size); - // TODO(bmuddha) perf optimization: use reallocs sparringly - // https://github.com/magicblock-labs/magicblock-validator/issues/327 - let allocation = match self - .index - .try_recycle_allocation(blocks, txn) - { - // if we could recycle some "hole" in the database, use it - Ok(recycled) => { - // bookkeeping for the deallocated (free hole) space - self.storage.decrement_deallocations(recycled.blocks); - self.storage.recycle(recycled) - } - // otherwise allocate from the end of memory map - Err(AccountsDbError::NotFound) => self.storage.alloc(size), - Err(err) => { - // This can only happen if we have catastrophic system mulfunction - error!("failed to insert account, index allocation check error: {err}"); - return Err(err); - } - }; + let blocks_needed = self.storage.blocks_required(size_bytes); - // update accounts index - let dealloc = self - .index - .insert_account(pubkey, account.owner(), allocation, txn) - .inspect_err(log_err!("account index insertion"))?; - if let Some(dealloc) = dealloc { - // bookkeeping for deallocated (free hole) space - self.storage.increment_deallocations(dealloc.blocks); - } - // SAFETY: - // Allocation object is constructed by obtaining a valid offset from storage, which - // is unoccupied by other accounts, points to valid memory within mmap and is - // properly aligned to 8 bytes, so the contract of serialize_to_mmap is satisfied - unsafe { - AccountSharedData::serialize_to_mmap( - acc, - allocation.storage.as_ptr(), - block_size * allocation.blocks, - ) - }; - } - } - if let Some(txn) = txn { - txn.commit()?; - } - Ok(()) - } - - pub fn insert_batch<'a>( - &self, - accounts: impl Iterator + Clone, - ) -> AccountsDbResult<()> { - let mut txn = Option::::None; - macro_rules! txn { - () => { - match txn.as_mut() { - Some(t) => t, - None => txn.insert(self.index.rwtxn()?), - } - }; - } - let mut error = Option::::None; - macro_rules! break_error { - ($error: expr, $msg: expr) => { - error!($msg, $error); - error.replace($error); - break; - }; - } - - let mut commited = 0; - for (pubkey, account) in accounts.clone() { - match account { - AccountSharedData::Borrowed(acc) => { - // check whether the account's owner has changed - if acc.owner_changed() { - if let Err(err) = self.index.ensure_correct_owner( - pubkey, - account.owner(), - txn!(), - ) { - break_error!(err, "account owner index update: {}"); - } - } - // and perform some index bookkeeping to ensure a correct owner - // For borrowed variants everything is already written and we just increment the - // atomic counter. 
New readers will see the latest update. - acc.commit(); - } - AccountSharedData::Owned(acc) => { - let txn = txn!(); - let datalen = account.data().len() as u32; - let block_size = self.storage.block_size() as u32; - let size = AccountSharedData::serialized_size_aligned( - datalen, block_size, - ) as usize; - - let blocks = self.storage.get_block_count(size); - // TODO(bmuddha) perf optimization: use reallocs sparringly - // https://github.com/magicblock-labs/magicblock-validator/issues/327 - let allocation = match self - .index - .try_recycle_allocation(blocks, txn) + // 2. Allocate (Recycle or New) + let allocation = + match self.index.try_recycle_allocation(blocks_needed, txn) { - // if we could recycle some "hole" in the database, use it Ok(recycled) => { - // bookkeeping for the deallocated (free hole) space - self.storage - .decrement_deallocations(recycled.blocks); + self.storage.dec_recycled_count(recycled.blocks); self.storage.recycle(recycled) } - // otherwise allocate from the end of memory map Err(AccountsDbError::NotFound) => { - self.storage.alloc(size) - } - Err(err) => { - // This can only happen if we have catastrophic system mulfunction - break_error!(err, "failed to insert account, index allocation check error: {}"); + self.storage.allocate(size_bytes)? } + Err(e) => return Err(e), }; - // update accounts index - let result = self.index.insert_account( - pubkey, - account.owner(), - allocation, - txn, - ); - match result { - Ok(Some(dealloc)) => { - // bookkeeping for deallocated (free hole) space - self.storage - .increment_deallocations(dealloc.blocks); - } - Ok(None) => (), - Err(err) => { - break_error!( - err, - "account index insertion failed: {}" - ); - } - } - // SAFETY: - // Allocation object is constructed by obtaining a valid offset from storage, which - // is unoccupied by other accounts, points to valid memory within mmap and is - // properly aligned to 8 bytes, so the contract of serialize_to_mmap is satisfied - unsafe { - AccountSharedData::serialize_to_mmap( - acc, - allocation.storage.as_ptr(), - block_size * allocation.blocks, - ) - }; + // 3. Update Index + let old_allocation = self + .index + .upsert_account(pubkey, account.owner(), allocation, txn) + .log_err(|| "Index update failed".to_string())?; + + if let Some(old) = old_allocation { + self.storage.inc_recycled_count(old.blocks); } + + // 4. Write Data + // SAFETY: `allocation` provides a valid, exclusive pointer range managed by `AccountsStorage`. + unsafe { + AccountSharedData::serialize_to_mmap( + acc, + allocation.ptr.as_ptr(), + block_size * allocation.blocks, + ) + }; } - commited += 1; - } - if let Some(error) = error { - for (_, account) in accounts.take(commited) { - // # Safety - // we are only rolling back account which we have just committed, - // thus it is guaranteed that the rollback buffer is initialized - unsafe { account.rollback() }; - } - return Err(error); - } - if let Some(txn) = txn { - txn.commit()?; } Ok(()) } - /// Check whether given account is owned by any of the programs in the provided list + /// Checks if any of the `owners` own the `account`. 
pub fn account_matches_owners( &self, account: &Pubkey, owners: &[Pubkey], ) -> Option { - let offset = self.index.get_account_offset(account).ok()?; - let memptr = self.storage.offset(offset); - // SAFETY: - // memptr is obtained from the storage directly, which maintains - // the integrity of account records, by making sure, that they are - // initialized and laid out properly along with the shadow buffer - unsafe { AccountBorrowed::any_owner_matches(memptr.as_ptr(), owners) } + let offset = self.index.get_offset(account).ok()?; + let ptr = self.storage.resolve_ptr(offset); + // SAFETY: `ptr` is guaranteed valid by `AccountsStorage` for the duration of the map. + unsafe { AccountBorrowed::any_owner_matches(ptr.as_ptr(), owners) } } - /// Scans the database accounts of given program, satisfying the provided filter + /// Returns a filterable iterator over accounts owned by `program`. pub fn get_program_accounts( &self, program: &Pubkey, @@ -331,13 +258,11 @@ impl AccountsDb { where F: Fn(&AccountSharedData) -> bool + 'static, { - // TODO(bmuddha): perf optimization in scanning logic - // https://github.com/magicblock-labs/magicblock-validator/issues/328 + let iterator = + self.index.get_program_accounts_iter(program).log_err(|| { + "Failed to create program accounts iterator".to_string() + })?; - let iterator = self - .index - .get_program_accounts_iter(program) - .inspect_err(log_err!("program accounts retrieval"))?; Ok(AccountsScanner { iterator, storage: &self.storage, @@ -345,6 +270,8 @@ impl AccountsDb { }) } + /// An optimized accountsdb accessor, which can be used for multiple reads, + /// without incurring the overhead of repeated creation of index transaction pub fn reader(&self) -> AccountsDbResult> { let offset = self.index.offset_finder()?; Ok(AccountsReader { @@ -353,171 +280,170 @@ impl AccountsDb { }) } - /// Check whether account with given pubkey exists in the database + /// Check whether pubkey is present in the AccountsDB pub fn contains_account(&self, pubkey: &Pubkey) -> bool { - match self.index.get_account_offset(pubkey) { - Ok(_) => true, - Err(AccountsDbError::NotFound) => false, - Err(err) => { - warn!("failed to check {pubkey} existence: {err}"); - false - } - } + self.index.get_offset(pubkey).is_ok() } - /// Get the number of accounts in the database - pub fn get_accounts_count(&self) -> usize { + /// Return total number of accounts present in the database + pub fn account_count(&self) -> usize { self.index.get_accounts_count() } - /// Get latest observed slot + /// Return the last slot written to AccountsDB #[inline(always)] pub fn slot(&self) -> u64 { - self.storage.get_slot() - } - - /// Temporary hack for overriding accountsdb slot without snapshot checks - // TODO(bmuddha): remove with the ledger rewrite - pub fn override_slot(&self, slot: u64) { - self.storage.set_slot(slot); + self.storage.slot() } - /// Set latest observed slot + /// Updates the current slot. Triggers a background snapshot if the schedule matches. #[inline(always)] pub fn set_slot(self: &Arc, slot: u64) { - self.storage.set_slot(slot); + self.storage.update_slot(slot); - if !slot.is_multiple_of(self.snapshot_frequency) { - return; + if slot > 0 && slot.is_multiple_of(self.snapshot_frequency) { + self.trigger_background_snapshot(slot); } + } + + /// Spawns a background thread to take a snapshot. + fn trigger_background_snapshot(self: &Arc, slot: u64) { let this = self.clone(); - // Since `set_slot` is usually invoked in async context, we don't want to - // ever block it. 
Here we move the whole lock acquisition and snapshotting - // to a separate thread, considering that snapshot taking is extremely rare - // operation, the overhead should be negligible - std::thread::spawn(move || { - // acquire the lock, effectively stopping the world, nothing should be able - // to modify underlying accounts database while this lock is active - let locked = this.synchronizer.write(); - // flush everything before taking the snapshot, in order to ensure consistent state + + thread::spawn(move || { + // Acquire write lock to ensure consistent state capture + let write_guard = this.write_lock.write(); this.flush(); - let used_storage = this.storage.utilized_mmap(); - if let Err(err) = - this.snapshot_engine.snapshot(slot, used_storage, locked) - { - warn!( - "failed to take snapshot at {}, slot {slot}: {err}", - this.snapshot_engine.database_path().display() - ); + // Capture the active memory map region for the snapshot + let used_storage = this.storage.active_segment(); + + if let Err(err) = this.snapshot_manager.create_snapshot( + slot, + used_storage, + write_guard, + ) { + warn!("Snapshot failed for slot {}: {}", slot, err); } }); } - /// Checks whether AccountsDB has "freshness", not exceeding given slot - /// Returns current slot if true, otherwise tries to rollback to the - /// most recent snapshot, which is older than the provided slot + /// Ensures the database state is at most `slot`. /// - /// Note: this will delete the current database state upon rollback. - /// But in most cases, the ledger slot and adb slot will match and - /// no rollback will take place, in any case use with care! - pub fn ensure_at_most(&mut self, slot: u64) -> AccountsDbResult { - // if this is a fresh start or we just match, then there's nothing to ensure - if slot >= self.slot().saturating_sub(1) { + /// If the current state is newer than `slot`, this performs a **rollback** + /// to the nearest valid snapshot. + pub fn restore_state_if_needed( + &mut self, + target_slot: u64, + ) -> AccountsDbResult { + // Allow slot-1 because we might be in the middle of processing the current slot + if target_slot >= self.slot().saturating_sub(1) { return Ok(self.slot()); } - // make sure that no one is reading the database - let _locked = self.synchronizer.write(); - - let rb_slot = self - .snapshot_engine - .try_switch_to_snapshot(slot) - .inspect_err(log_err!( - "switching to snapshot before slot {}", - slot - ))?; - let path = self.snapshot_engine.database_path(); + warn!( + "Current slot {} is ahead of target {}. 
Rolling back...", + self.slot(), + target_slot + ); + + // Block all writes during restoration + let _guard = self.write_lock.write(); + + let restored_slot = self + .snapshot_manager + .restore_from_snapshot(target_slot) + .log_err(|| { + format!( + "Snapshot restoration failed for target {}", + target_slot + ) + })?; + + // Reload components to reflect new FS state + let path = self.snapshot_manager.database_path(); self.storage.reload(path)?; self.index.reload(path)?; - Ok(rb_slot) + + info!("Successfully rolled back to slot {}", restored_slot); + Ok(restored_slot) } - /// Get the total number of bytes in storage pub fn storage_size(&self) -> u64 { - self.storage.size() + self.storage.size_bytes() } - /// Returns an iterator over all accounts in the database, pub fn iter_all( &self, ) -> impl Iterator + '_ { - let iter = self - .index + self.index .get_all_accounts() - .inspect_err(log_err!("iterating all over all account keys")) - .ok(); - iter.into_iter() + .ok() + .into_iter() .flatten() .map(|(offset, pk)| (pk, self.storage.read_account(offset))) } - /// Flush primary storage and indexes to disk pub fn flush(&self) { self.storage.flush(); self.index.flush(); } - /// Get a clone of synchronization lock, to suspend all the writes, - /// while some critical operation, like snapshotting is in progress - pub fn synchronizer(&self) -> StWLock { - self.synchronizer.clone() + pub fn write_lock(&self) -> GlobalWriteLock { + self.write_lock.clone() } } impl AccountsBank for AccountsDb { - /// Read account from with given pubkey from the database (if exists) #[inline(always)] fn get_account(&self, pubkey: &Pubkey) -> Option { - let offset = self.index.get_account_offset(pubkey).ok()?; + let offset = self.index.get_offset(pubkey).ok()?; Some(self.storage.read_account(offset)) } fn remove_account(&self, pubkey: &Pubkey) { let _ = self .index - .remove_account(pubkey) - .inspect_err(log_err!("removing an account {}", pubkey)); + .remove(pubkey) + .log_err(|| format!("Failed to remove account {pubkey}")); } - /// Remove all accounts matching the provided predicate - /// NOTE: accounts are not locked while this operation is in progress, - /// thus this should only be performed before the validator starts processing - /// transactions + /// Efficiently removes accounts matching a predicate. fn remove_where( &self, predicate: impl Fn(&Pubkey, &AccountSharedData) -> bool, ) -> usize { - let to_remove = self - .iter_all() - .filter(|(pk, acc)| predicate(pk, acc)) - .map(|(pk, _)| pk) - .collect::>(); - let removed = to_remove.len(); - for pk in to_remove { - self.remove_account(&pk); + let mut count = 0; + let batch_size = 1024; + let mut keys_to_remove = Vec::with_capacity(batch_size); + + // First pass: Identify keys (using iter_all which is read-only) + for (pk, acc) in self.iter_all() { + if predicate(&pk, &acc) { + keys_to_remove.push(pk); + if keys_to_remove.len() >= batch_size { + for k in keys_to_remove.drain(..) { + self.remove_account(&k); + count += 1; + } + } + } } - removed + + // Final flush + for k in keys_to_remove { + self.remove_account(&k); + count += 1; + } + + count } } -// SAFETY: -// We only ever use AccountsDb within the Arc and all -// write access to it is synchronized via atomic operations +// SAFETY: AccountsDb uses internal locking (LMDB + RwLock) to ensure thread safety. 
unsafe impl Sync for AccountsDb {} unsafe impl Send for AccountsDb {} -/// Iterator to scan program accounts applying filtering logic on them pub struct AccountsScanner<'db, F> { storage: &'db AccountsStorage, filter: F, @@ -530,31 +456,25 @@ where { type Item = (Pubkey, AccountSharedData); fn next(&mut self) -> Option { - loop { - let (offset, pubkey) = self.iterator.next()?; + for (offset, pubkey) in self.iterator.by_ref() { let account = self.storage.read_account(offset); if (self.filter)(&account) { - break Some((pubkey, account)); + return Some((pubkey, account)); } } + None } } -/// Versatile and reusable account reader, can be used to perform multiple account queries -/// from the database more efficiently, avoiding the cost of index/cursor setups pub struct AccountsReader<'db> { offset: AccountOffsetFinder<'db>, storage: &'db AccountsStorage, } -// SAFETY: -// AccountsReader is only ever used to get readable access to the -// underlying database, and never outlives the the backing storage unsafe impl Send for AccountsReader<'_> {} unsafe impl Sync for AccountsReader<'_> {} impl AccountsReader<'_> { - /// Find the account specified by the pubkey and pass it to the reader function pub fn read(&self, pubkey: &Pubkey, reader: F) -> Option where F: Fn(AccountSharedData) -> R, @@ -564,7 +484,6 @@ impl AccountsReader<'_> { Some(reader(account)) } - /// Check whether given account exists in the AccountsDB pub fn contains(&self, pubkey: &Pubkey) -> bool { self.offset.find(pubkey).is_some() } @@ -573,7 +492,7 @@ impl AccountsReader<'_> { #[cfg(test)] impl AccountsDb { pub fn snapshot_exists(&self, slot: u64) -> bool { - self.snapshot_engine.snapshot_exists(slot) + self.snapshot_manager.snapshot_exists(slot) } } diff --git a/magicblock-accounts-db/src/snapshot.rs b/magicblock-accounts-db/src/snapshot.rs index f4fae0cd2..7089b7c1a 100644 --- a/magicblock-accounts-db/src/snapshot.rs +++ b/magicblock-accounts-db/src/snapshot.rs @@ -2,296 +2,367 @@ use std::{ collections::VecDeque, ffi::OsStr, fs::{self, File}, - io::{self, Write}, + io::{self, BufWriter, Write}, path::{Path, PathBuf}, sync::Arc, }; -use log::{info, warn}; -use memmap2::MmapMut; +use log::{error, info, warn}; use parking_lot::{Mutex, RwLockWriteGuard}; -use reflink::reflink; use crate::{ - error::AccountsDbError, log_err, storage::ADB_FILE, AccountsDbResult, + error::{AccountsDbError, LogErr}, + storage::ACCOUNTS_DB_FILENAME, + AccountsDbResult, }; -#[cfg_attr(test, derive(Debug))] -pub struct SnapshotEngine { - /// directory path where database files are kept - dbpath: PathBuf, - /// indicator flag for Copy on Write support on host file system - is_cow_supported: bool, - /// List of existing snapshots - /// Note: as it's locked only when slot is incremented - /// this is basically a contention free Mutex we use it - /// for the convenience of interior mutability - snapshots: Mutex>, - /// max number of snapshots to keep alive - max_count: usize, +/// Defines the mechanism used to persist snapshots to disk. +#[derive(Debug, Clone, Copy)] +enum SnapshotStrategy { + /// Utilizes filesystem CoW (Copy-on-Write) features (e.g., `ioctl_ficlonerange` on Linux). + /// This is an O(1) operation regarding data size. + Reflink, + /// Fallback for standard filesystems. Performs a deep recursive copy of the directory. + /// Requires capturing the `mmap` state into RAM before writing to ensure consistency. 
+    LegacyCopy,
 }
 
-impl SnapshotEngine {
-    pub(crate) fn new(
-        dbpath: PathBuf,
-        max_count: usize,
+impl SnapshotStrategy {
+    /// Probes the filesystem at `dir` to determine if CoW operations are supported.
+    fn detect(dir: &Path) -> AccountsDbResult<Self> {
+        if fs_backend::supports_reflink(dir)
+            .log_err(|| "CoW check failed".to_string())?
+        {
+            info!("Snapshot Strategy: Reflink (Fast/CoW)");
+            Ok(Self::Reflink)
+        } else {
+            warn!("Snapshot Strategy: Deep Copy (Slow)");
+            Ok(Self::LegacyCopy)
+        }
+    }
+
+    /// Executes the snapshot operation based on the active strategy.
+    ///
+    /// # Arguments
+    /// * `memory_state` - Required only for `LegacyCopy`. Contains the byte-consistent view
+    ///   of the main database file.
+    /// * `lock` - Write lock, which prevents any accountsdb modifications during snapshotting
+    fn execute(
+        &self,
+        src: &Path,
+        dst: &Path,
+        memory_state: Vec<u8>,
+        lock: RwLockWriteGuard<()>,
+    ) -> io::Result<()> {
+        match self {
+            Self::Reflink => fs_backend::reflink_dir(src, dst),
+            Self::LegacyCopy => {
+                // Drop lock for slow copy to avoid stalling the system
+                drop(lock);
+                fs_backend::deep_copy_dir(src, dst, &memory_state)
+            }
+        }
+    }
+}
+
+/// Manages the lifecycle, creation, and restoration of database snapshots.
+///
+/// This type handles the complexity of filesystem capabilities
+/// (CoW vs Copy) and ensures atomic restoration of state.
+#[derive(Debug)]
+pub struct SnapshotManager {
+    /// Path to the active (hot) database directory.
+    db_path: PathBuf,
+    /// Directory where snapshots are stored.
+    snapshots_dir: PathBuf,
+    /// The persistence strategy chosen during initialization.
+    strategy: SnapshotStrategy,
+    /// Ordered registry of valid snapshot paths (oldest to newest).
+    registry: Mutex<VecDeque<PathBuf>>,
+    /// Maximum number of snapshots to retain before rotating out old ones.
+    max_snapshots: usize,
+}
+
+impl SnapshotManager {
+    /// Initializes the manager, detecting filesystem capabilities and recovering
+    /// existing snapshots from disk.
+    pub fn new(
+        db_path: PathBuf,
+        max_snapshots: usize,
     ) -> AccountsDbResult<Arc<Self>> {
-        let is_cow_supported = Self::supports_cow(&dbpath)
-            .inspect_err(log_err!("cow support check"))?;
-        let snapshots = Self::read_snapshots(&dbpath, max_count)?.into();
+        let snapshots_dir = db_path
+            .parent()
+            .ok_or_else(|| {
+                io::Error::new(io::ErrorKind::InvalidInput, "Invalid db_path")
+            })
+            .log_err(|| "Failed to resolve snapshots directory".to_string())?
+            .to_path_buf();
+
+        // 1. Determine capabilities (CoW or Deep Copy)
+        let strategy = SnapshotStrategy::detect(&snapshots_dir)?;
+
+        // 2. Load and sort existing snapshots from the filesystem
+        let registry = Self::recover_registry(&snapshots_dir, max_snapshots)
+            .log_err(|| "Failed to load snapshot registry".to_string())?;
 
         Ok(Arc::new(Self {
-            dbpath,
-            is_cow_supported,
-            snapshots,
-            max_count,
+            db_path,
+            snapshots_dir,
+            strategy,
+            registry: Mutex::new(registry),
+            max_snapshots,
         }))
     }
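At this point the manager is self-contained; a hedged wiring sketch (the path is invented, the parent-directory behavior comes from `new` above):

```rust
use std::{path::PathBuf, sync::Arc};

fn open_manager() -> AccountsDbResult<Arc<SnapshotManager>> {
    // "<root>/accountsdb/main" holds the live DB, so its snapshots
    // are created as siblings under "<root>/accountsdb".
    SnapshotManager::new(PathBuf::from("/data/accountsdb/main"), 4)
}
```

 
-    /// Take snapshot of database directory, this operation
-    /// assumes that no writers are currently active
-    pub(crate) fn snapshot(
+    /// Creates a durable snapshot of the current database state for the given `slot`.
+    ///
+    /// # Locking & Consistency
+    /// * **CoW (Reflink)**: The `_write_lock` is held during the reflink operation.
+    ///   Since reflink is a metadata operation, this is extremely fast.
+    /// * **Legacy Copy**: The `_write_lock` is dropped *after* capturing the `active_mem`
+    ///   state to RAM, but *before* the slow disk I/O.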
This prevents blocking the validator + /// during the deep copy. + pub fn create_snapshot( &self, slot: u64, - mmap: &[u8], + active_mem: &[u8], lock: RwLockWriteGuard<()>, ) -> AccountsDbResult<()> { - let slot = SnapSlot(slot); - // this lock is always free, as we take StWLock higher up in the call stack and - // only one thread can take snapshots, namely the one that advances the slot - let mut snapshots = self.snapshots.lock(); - if snapshots.len() == self.max_count { - if let Some(old) = snapshots.pop_front() { - let _ = fs::remove_dir_all(&old) - .inspect_err(log_err!("error during old snapshot removal")); - } - } - let snapout = slot.as_path(Self::snapshots_dir(&self.dbpath)); + let snap_path = self.generate_path(slot); - if self.is_cow_supported { - self.reflink_dir(&snapout)?; - } else { - let source = self.dbpath.clone(); - let destination = snapout.clone(); - let mmap = mmap.to_vec(); - drop(lock); - rcopy_dir(&source, &destination, &mmap)?; - } - snapshots.push_back(snapout); + // 1. Maintain retention policy + self.prune_registry(); + + // 2. Prepare Data Capture + // If legacy copy, we must capture state while lock is held. + let memory_capture = + matches!(self.strategy, SnapshotStrategy::LegacyCopy) + .then(|| active_mem.to_vec()) + .unwrap_or_default(); + + // 3. Execute Snapshot + self.strategy + .execute(&self.db_path, &snap_path, memory_capture, lock) + .log_err(|| "Snapshot failed".to_string())?; + + // 4. Register success + self.registry.lock().push_back(snap_path); Ok(()) } - /// Try to rollback to snapshot which is the most recent one before given slot + /// Atomically restores the database to the snapshot nearest to `target_slot`. /// - /// NOTE: In case of success, this deletes the primary - /// database, and all newer snapshots, use carefully! - pub(crate) fn try_switch_to_snapshot( + /// # Critical Operation + /// This function replaces the active database directory. + /// 1. Finds the best candidate snapshot (<= target_slot). + /// 2. Prunes all snapshots newer than the candidate (invalidated history). + /// 3. Performs an atomic swap: Active -> Backup, Snapshot -> Active. + /// + /// Returns the actual slot number of the restored snapshot. + pub fn restore_from_snapshot( &self, - mut slot: u64, + target_slot: u64, ) -> AccountsDbResult { - let mut spath = - SnapSlot(slot).as_path(Self::snapshots_dir(&self.dbpath)); - let mut snapshots = self.snapshots.lock(); // free lock + let mut registry = self.registry.lock(); - // paths to snapshots are strictly ordered, so we can b-search - let index = match snapshots.binary_search(&spath) { + // 1. 
Locate Snapshot (Binary Search) + let search_key = self.generate_path(target_slot); + let index = match registry.binary_search(&search_key) { Ok(i) => i, - // if we have snapshot older than the slot, use it - Err(i) if i != 0 => i - 1, - // otherwise we don't have any snapshot before the given slot - Err(_) => return Err(AccountsDbError::SnapshotMissing(slot)), + Err(i) if i > 0 => i - 1, + _ => return Err(AccountsDbError::SnapshotMissing(target_slot)), }; - // SAFETY: - // we just checked the index above, so this cannot fail - spath = snapshots.swap_remove_back(index).unwrap(); - info!( - "rolling back to snapshot before {slot} using {}", - spath.display() - ); - - // remove all newer snapshots - while let Some(path) = snapshots.swap_remove_back(index) { - warn!("removing snapshot at {}", path.display()); - // if this operation fails (which is unlikely), then it most likely failed due to - // the path being invalid, which is fine by us, since we wanted to remove it anyway - let _ = fs::remove_dir_all(path) - .inspect_err(log_err!("error removing snapshot")); + let chosen_path = registry.remove(index).unwrap(); + let chosen_slot = Self::parse_slot(&chosen_path) + .ok_or(AccountsDbError::SnapshotMissing(target_slot))?; + + info!("Restoring snapshot {} (req: {})", chosen_slot, target_slot); + + // 2. Prune Invalidated Futures + // Any snapshot strictly newer than the chosen one is now on a diverging timeline. + for invalidated in registry.drain(index..) { + warn!("Pruning invalidated snapshot: {}", invalidated.display()); + let _ = fs::remove_dir_all(&invalidated); } - // SAFETY: - // infallible, all entries in `snapshots` are - // created with SnapSlot naming conventions - slot = SnapSlot::try_from_path(&spath).unwrap().0; - - // we perform database swap, thus removing - // latest state and rolling back to snapshot - fs::remove_dir_all(&self.dbpath).inspect_err(log_err!( - "failed to remove current database at {}", - self.dbpath.display() - ))?; - fs::rename(&spath, &self.dbpath).inspect_err(log_err!( - "failed to rename snapshot dir {} -> {}", - spath.display(), - self.dbpath.display() - ))?; - - Ok(slot) - } + // 3. Atomic Swapping via Rename + let backup = self.db_path.with_extension("bak"); - #[inline] - pub(crate) fn database_path(&self) -> &Path { - &self.dbpath - } + // Stage current DB as backup + if self.db_path.exists() { + fs::rename(&self.db_path, &backup) + .log_err(|| "Failed to stage backup".to_string())?; + } - /// Perform test to find out whether file system - /// supports CoW operations (btrfs, xfs, zfs, apfs) - fn supports_cow(dir: &Path) -> io::Result { - let tmp = dir.join("__tempfile.fs"); - let mut file = File::create(&tmp)?; - file.set_len(64)?; - file.write_all(&[42; 64])?; - file.flush()?; - let tmpsnap = dir.join("__tempfile_snap.fs"); - // reflink will fail if CoW is not supported by FS - let supported = reflink(&tmp, &tmpsnap).is_ok(); - if supported { - info!("Host file system supports CoW, will use reflinking (fast)"); - } else { - warn!( - "Host file system doesn't support CoW, will use regular (slow) file copy, OK for development environments" + // Promote snapshot to active + if let Err(e) = fs::rename(&chosen_path, &self.db_path) { + error!( + "Restore failed during promote: {}. 
Attempting rollback.",
                e
            );
+            // Attempt to restore the backup if promotion fails
+            if backup.exists() {
+                let _ = fs::rename(&backup, &self.db_path);
+            }
+            return Err(e.into());
+        }
-        // if we failed to create the file then the below operation will fail,
-        // but since we wanted to remove it anyway, just ignore the error
-        let _ = fs::remove_file(tmpsnap);
-        Ok(supported)
+
+        // Success: remove backup
+        let _ = fs::remove_dir_all(&backup);
+        Ok(chosen_slot)
     }
 
-    fn snapshots_dir(dbpath: &Path) -> &Path {
-        dbpath
-            .parent()
-            .expect("accounts database directory should have a parent")
+    /// Checks if a snapshot for the exact `slot` exists in the registry.
+    #[cfg(test)]
+    pub fn snapshot_exists(&self, slot: u64) -> bool {
+        let path = self.generate_path(slot);
+        self.registry.lock().binary_search(&path).is_ok()
     }
 
-    /// Reads the list of snapshots directories from disk, this
-    /// is necessary to restore last state after restart
-    fn read_snapshots(
-        dbpath: &Path,
-        max_count: usize,
-    ) -> io::Result<VecDeque<PathBuf>> {
-        let snapdir = Self::snapshots_dir(dbpath);
-        let mut snapshots = VecDeque::with_capacity(max_count);
+    // --- Private Helpers ---
 
-        if !snapdir.exists() {
-            fs::create_dir_all(snapdir)?;
-            return Ok(snapshots);
-        }
-        for entry in fs::read_dir(snapdir)? {
-            let snap = entry?.path();
-            if snap.is_dir() && SnapSlot::try_from_path(&snap).is_some() {
-                snapshots.push_back(snap);
+    fn generate_path(&self, slot: u64) -> PathBuf {
+        // Padding ensures standard string sorting aligns with numeric sorting
+        self.snapshots_dir.join(format!("snapshot-{:0>12}", slot))
+    }
+
+    fn parse_slot(path: &Path) -> Option<u64> {
+        path.file_name()
+            .and_then(OsStr::to_str)
+            .and_then(|s| s.strip_prefix("snapshot-"))
+            .and_then(|s| s.parse().ok())
+    }
+
+    /// Removes the oldest snapshots until `registry.len() < max_snapshots`.
+    fn prune_registry(&self) {
+        let mut registry = self.registry.lock();
+        while registry.len() >= self.max_snapshots {
+            if let Some(path) = registry.pop_front() {
+                if let Err(e) = fs::remove_dir_all(&path) {
+                    warn!("Failed to prune {}: {}", path.display(), e);
+                }
             }
         }
-        // sorting is required for correct ordering (slot-wise) of snapshots
-        snapshots.make_contiguous().sort();
+    }
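The zero-padding used by `generate_path` is what makes the plain `sort()` in `recover_registry` (below) slot-ordered; a quick standalone check:

```rust
fn main() {
    let low = format!("snapshot-{:0>12}", 900_u64);
    let high = format!("snapshot-{:0>12}", 12_000_u64);
    // Lexicographic order matches numeric order thanks to the padding;
    // without it, "snapshot-900" would sort after "snapshot-12000".
    assert!(low < high);
    println!("{low} < {high}");
}
```

 
-        while snapshots.len() > max_count {
-            snapshots.pop_front();
+    /// Scans the disk for existing snapshot directories and builds the registry.
+    fn recover_registry(
+        dir: &Path,
+        max: usize,
+    ) -> io::Result<VecDeque<PathBuf>> {
+        if !dir.exists() {
+            fs::create_dir_all(dir)?;
+            return Ok(VecDeque::new());
         }
-        Ok(snapshots)
-    }
 
-    /// Fast reference linking based directory copy, only works
-    /// on Copy on Write filesystems like btrfs/xfs/apfs/refs, this
-    /// operation is essentially a filesystem metadata update, so it usually
-    /// takes a few milliseconds irrespective of the target directory size
-    #[inline(always)]
-    fn reflink_dir(&self, dst: &Path) -> io::Result<()> {
-        reflink::reflink(&self.dbpath, dst)
-    }
-}
 
+        let mut paths: Vec<_> = fs::read_dir(dir)?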
+ .filter_map(|e| e.ok()) + .map(|e| e.path()) + .filter(|p| p.is_dir() && Self::parse_slot(p).is_some()) + .collect(); -#[derive(Eq, PartialEq, PartialOrd, Ord)] -pub(crate) struct SnapSlot(u64); + paths.sort(); // Sorting works due to zero-padded filenames -impl SnapSlot { - /// parse snapshot path to extract slot number - pub(crate) fn try_from_path(path: &Path) -> Option { - path.file_name() - .and_then(|s| s.to_str()) - .and_then(|s| s.split('-').nth(1)) - .and_then(|s| s.parse::().ok()) - .map(Self) + // Enforce limit strictly on startup + let offset = paths.len().saturating_sub(max); + Ok(paths.into_iter().skip(offset).collect()) } - fn as_path(&self, ppath: &Path) -> PathBuf { - // enforce strict alphanumeric ordering by introducing extra padding - ppath.join(format!("snapshot-{:0>12}", self.0)) + /// Return the path to main accountsdb directory + pub(crate) fn database_path(&self) -> &Path { + &self.db_path } } -/// Conventional byte to byte recursive directory copy, -/// works on all filesystems. Ideally this should only -/// be used for development purposes, and performance -/// sensitive instances of validator should run with -/// CoW supported file system for the storage needs -fn rcopy_dir(src: &Path, dst: &Path, mmap: &[u8]) -> io::Result<()> { - fs::create_dir_all(dst).inspect_err(log_err!( - "creating snapshot destination dir: {:?}", - dst - ))?; - for entry in fs::read_dir(src)? { - let entry = entry?; - let src = entry.path(); - let dst = dst.join(entry.file_name()); +mod fs_backend { + use super::*; + + /// Writes a test file and attempts a reflink to determine CoW support. + pub(crate) fn supports_reflink(dir: &Path) -> io::Result { + let src = dir.join(".tmp_cow_probe_src"); + let dst = dir.join(".tmp_cow_probe_dst"); + + // Clean slate + let _ = fs::remove_file(&src); + let _ = fs::remove_file(&dst); + { + let mut f = File::create(&src)?; + f.write_all(&[0u8; 64])?; + f.sync_data()?; + } + + let result = reflink::reflink(&src, &dst).is_ok(); + + let _ = fs::remove_file(src); + let _ = fs::remove_file(dst); + + Ok(result) + } + + pub(crate) fn reflink_dir(src: &Path, dst: &Path) -> io::Result<()> { + // If src is a directory, manually copy entries if src.is_dir() { - rcopy_dir(&src, &dst, mmap)?; - } else if src.file_name().and_then(OsStr::to_str) == Some(ADB_FILE) { - // for main accounts db file we have an exceptional handling logic, as this file - // is usually huge on disk, but only a small fraction of it is actually used - let dst = File::options() - .write(true) - .create(true) - .truncate(true) - .read(true) - .open(dst) - .inspect_err(log_err!( - "creating a snapshot of main accounts db file" - ))?; - // we copy this file via mmap, only writing used portion of it, ignoring zeroes - // NOTE: upon snapshot reload, the size will be readjusted back to the original - // value, but for the storage purposes, we only keep actual data, ignoring slack space - dst.set_len(mmap.len() as u64)?; - // SAFETY: - // we just opened and resized the file to correct length, and we will close - // it immediately after byte copy, so no one can access it concurrently - let mut dst = - unsafe { MmapMut::map_mut(&dst) }.inspect_err(log_err!( - "memory mapping the snapshot file for the accountsdb file", - ))?; - dst.copy_from_slice(mmap); - dst.flush().inspect_err(log_err!( - "flushing accounts.db file after mmap copy" - ))?; + fs::create_dir_all(dst)?; + for entry in fs::read_dir(src)? 
{ + let entry = entry?; + let src_path = entry.path(); + let dst_path = dst.join(entry.file_name()); + + if entry.file_type()?.is_dir() { + reflink_dir(&src_path, &dst_path)?; + } else { + reflink::reflink(&src_path, &dst_path)? + } + } + Ok(()) } else { - std::fs::copy(&src, &dst)?; + reflink::reflink(src, dst) } } - Ok(()) -} -#[cfg(test)] -impl SnapshotEngine { - pub fn snapshot_exists(&self, slot: u64) -> bool { - let spath = SnapSlot(slot).as_path(Self::snapshots_dir(&self.dbpath)); - let snapshots = self.snapshots.lock(); // free lock + /// Recursively copies a directory. + /// + /// Special Handling: + /// If `ACCOUNTS_DB_FILENAME` is encountered, it writes the provided `mem_dump` + /// instead of copying the file from disk. This ensures we write the state + /// consistent with the capture time, ignoring any subsequent disk modifications. + pub(crate) fn deep_copy_dir( + src: &Path, + dst: &Path, + mem_dump: &[u8], + ) -> io::Result<()> { + fs::create_dir_all(dst)?; + for entry in fs::read_dir(src)? { + let entry = entry?; + let ty = entry.file_type()?; + let src_path = entry.path(); + let dst_path = dst.join(entry.file_name()); - // paths to snapshots are strictly ordered, so we can b-search - snapshots.binary_search(&spath).is_ok() + if ty.is_dir() { + deep_copy_dir(&src_path, &dst_path, mem_dump)?; + } else if src_path.file_name().and_then(OsStr::to_str) + == Some(ACCOUNTS_DB_FILENAME) + { + write_dump_file(&dst_path, mem_dump)?; + } else { + fs::copy(&src_path, &dst_path)?; + } + } + Ok(()) + } + + /// Writes the memory capture to disk using buffered I/O. + fn write_dump_file(path: &Path, data: &[u8]) -> io::Result<()> { + let f = File::create(path)?; + // Pre-allocate space if OS supports it to reduce fragmentation + f.set_len(data.len() as u64)?; + + let mut writer = BufWriter::new(f); + writer.write_all(data)?; + writer.flush()?; + writer.get_mut().sync_all()?; + Ok(()) } } diff --git a/magicblock-accounts-db/src/storage.rs b/magicblock-accounts-db/src/storage.rs index 2dee0b7f1..48eeb80d3 100644 --- a/magicblock-accounts-db/src/storage.rs +++ b/magicblock-accounts-db/src/storage.rs @@ -1,9 +1,10 @@ use std::{ fs::File, io::{self, Write}, + mem::size_of, path::Path, ptr::NonNull, - sync::atomic::{AtomicU32, AtomicU64, Ordering::*}, + sync::atomic::{AtomicU32, AtomicU64, Ordering}, }; use log::error; @@ -12,403 +13,452 @@ use memmap2::MmapMut; use solana_account::AccountSharedData; use crate::{ - error::AccountsDbError, + error::{AccountsDbError, LogErr}, index::{Blocks, Offset}, - log_err, AccountsDbResult, + AccountsDbResult, }; -/// Extra space in database storage file reserved for metadata -/// Currently most of it is unused, but still reserved for future extensions +/// The reserved size in bytes at the beginning of the file for the `StorageHeader`. +/// This area is excluded from the data region used for account storage. const METADATA_STORAGE_SIZE: usize = 256; -pub(crate) const ADB_FILE: &str = "accounts.db"; -/// Different offsets into memory mapped file where various metadata fields are stored -const SLOT_OFFSET: usize = size_of::(); -const BLOCKSIZE_OFFSET: usize = SLOT_OFFSET + size_of::(); -const TOTALBLOCKS_OFFSET: usize = BLOCKSIZE_OFFSET + size_of::(); -const DEALLOCATED_OFFSET: usize = TOTALBLOCKS_OFFSET + size_of::(); +// Size calculation: +// write_cursor(8) + slot(8) + block_size(4) + capacity_blocks(4) + recycled_count(4) = 28 bytes used. 
+const METADATA_HEADER_USED: usize = 28;
+const METADATA_PADDING_SIZE: usize =
+    METADATA_STORAGE_SIZE - METADATA_HEADER_USED;
 
-#[cfg_attr(test, derive(Debug))]
-pub(crate) struct AccountsStorage {
-    meta: StorageMeta,
-    /// a mutable pointer into memory mapped region
-    store: NonNull<u8>,
-    /// underlying memory mapped region, but we cannot use it directly as Rust
-    /// borrowing rules prevent us from mutably accessing it concurrently
-    mmap: MmapMut,
+/// The standard filename for the accounts database file.
+pub(crate) const ACCOUNTS_DB_FILENAME: &str = "accounts.db";
+
+/// The persistent memory layout of the storage metadata.
+///
+/// This struct maps directly to the first `METADATA_STORAGE_SIZE` bytes of the
+/// memory-mapped file. It uses atomic types to allow concurrent access/updates
+/// from multiple threads without external locking.
+#[repr(C)]
+struct StorageHeader {
+    /// The current write position (high-water mark) in blocks.
+    /// This is an index: `actual_offset = write_cursor * block_size`.
+    ///
+    /// This acts as the "head" of the append-only log. Atomic fetch_add is used
+    /// here to reserve space for new allocations.
+    write_cursor: AtomicU64,
+
+    /// The latest slot number that has been written to this storage.
+    /// Used for snapshotting and recovery to know the age of the data.
+    slot: AtomicU64,
+
+    /// The size of a single block in bytes (e.g., 128, 256, 512).
+    /// This is immutable once the database is created.
+    block_size: u32,
+
+    /// The total capacity of the database in blocks.
+    /// Derived from: `(file_size - METADATA_STORAGE_SIZE) / block_size`.
+    capacity_blocks: u32,
+
+    /// A counter of blocks that have been marked as dead/recycled.
+    /// This is purely for metrics or fragmentation estimation; it does not
+    /// automatically reclaim space (this is an append-only structure).
+    recycled_count: AtomicU32,
+
+    /// Padding to ensure the struct size exactly matches `METADATA_STORAGE_SIZE`
+    /// and maintains alignment for future extensions.
+    _padding: [u8; METADATA_PADDING_SIZE],
 }
 
-// TODO(bmuddha/tacopaco): use Unique pointer types
-// from core::ptr once stable instead of raw pointers
+// Compile-time assertion to ensure the header definition matches the reserved space.
+const _: () = assert!(size_of::<StorageHeader>() == METADATA_STORAGE_SIZE);
+// Keep the overall size a multiple of 8 so the 64-bit atomics stay aligned.
+const _: () = assert!(size_of::<StorageHeader>().is_multiple_of(8));
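The header arithmetic is easy to sanity-check in isolation; a minimal sketch with the constants restated locally (not imported from the crate):

```rust
const METADATA_STORAGE_SIZE: usize = 256;
// write_cursor(8) + slot(8) + block_size(4) + capacity_blocks(4) + recycled_count(4)
const METADATA_HEADER_USED: usize = 8 + 8 + 4 + 4 + 4;
const METADATA_PADDING_SIZE: usize = METADATA_STORAGE_SIZE - METADATA_HEADER_USED;

fn main() {
    assert_eq!(METADATA_HEADER_USED, 28);
    assert_eq!(METADATA_PADDING_SIZE, 228);
    // Padded out, the header fills the reserved 256-byte region exactly.
    assert_eq!(METADATA_HEADER_USED + METADATA_PADDING_SIZE, 256);
}
```

 
-/// Storage metadata manager
-///
-/// Metadata is persisted along with the actual accounts and is used to track various control
-/// mechanisms of underlying storage
+/// A handle to the memory-mapped accounts database.
 ///
-/// ----------------------------------------------------------
-/// |                 Metadata In Memory Layout               |
-/// ----------------------------------------------------------
-/// | field         | description             | size in bytes|
-/// |---------------|-------------------------|---------------
-/// | head          | offset into storage     | 8            |
-/// | slot          | latest slot observed    | 8            |
-/// | block size    | size of block           | 4            |
-/// | total blocks  | total number of blocks  | 4            |
-/// | deallocated   | deallocated block count | 4            |
-/// ----------------------------------------------------------
+/// This struct provides safe wrappers around the raw memory map, handling
+/// atomic allocation, bounds checking, and pointer arithmetic.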
#[cfg_attr(test, derive(Debug))] -struct StorageMeta { - /// offset into memory map, where next allocation will be served - head: &'static AtomicU64, - /// latest slot written to this account - slot: &'static AtomicU64, - /// size of the block (indivisible unit of allocation) - block_size: u32, - /// total number of blocks in database - total_blocks: u32, - /// blocks that were deallocated and now require defragmentation - deallocated: &'static AtomicU32, +pub(crate) struct AccountsStorage { + /// The underlying memory-mapped file. + /// Kept alive here to ensure the memory region remains valid. + mmap: MmapMut, + + /// A raw pointer to the start of the *data* segment (skipping the header). + /// + /// # Safety + /// This pointer is derived from `mmap` and is guaranteed to be valid + /// as long as `mmap` is alive. + data_region: NonNull, + + /// A cached copy of `header.block_size` as a `usize`. + /// Optimization to avoid reading from the mmap (potential cache miss/atomic read) + /// on every allocation or read. + block_size: usize, } impl AccountsStorage { - /// Open (or create if doesn't exist) an accountsdb storage + /// Opens an existing accounts database or creates a new one if it doesn't exist. + /// + /// # Arguments + /// * `config` - Configuration defining block size and initial file size. + /// * `directory` - The directory path where `accounts.db` will be located. /// - /// NOTE: - /// passed config is partially ignored if the database file already - /// exists at the supplied path, for example, the size of main database - /// file can be adjusted only up, the blocksize cannot be changed at all + /// # Returns + /// * `AccountsDbResult` - The initialized storage handle. pub(crate) fn new( config: &AccountsDbConfig, directory: &Path, ) -> AccountsDbResult { - let dbpath = directory.join(ADB_FILE); + let db_path = directory.join(ACCOUNTS_DB_FILENAME); + let exists = db_path.exists(); + let mut file = File::options() .create(true) .truncate(false) .write(true) .read(true) - .open(&dbpath) - .inspect_err(log_err!( - "opening adb file at {}", - dbpath.display() - ))?; - - if file.metadata()?.len() == 0 { - // database is being created for the first time, resize the file and write metadata - StorageMeta::init_adb_file(&mut file, config).inspect_err( - log_err!("initializing new adb at {}", dbpath.display()), - )?; + .open(&db_path) + .log_err(|| { + format!("opening accounts db file at {}", db_path.display()) + })?; + + // If the file is new or empty, we must initialize the header and resize it. + if !exists || file.metadata()?.len() == 0 { + initialize_db_file(&mut file, config).log_err(|| { + format!("initializing new accounts db at {}", db_path.display()) + })?; } else { - let db_size = calculate_db_size(config); - adjust_database_file_size(&mut file, db_size as u64)?; + // If it exists, ensure it is at least the expected size from config. + let target_size = calculate_file_size(config); + ensure_file_size(&mut file, target_size as u64)?; } + Self::map_file(file) + } + + /// Internal helper to memory-map the file and validate the header. + fn map_file(file: File) -> AccountsDbResult { // SAFETY: - // Only accountsdb from validator process is modifying the file contents - // through memory map, so the contract of MmapMut is upheld + // We are the exclusive owner of the File object here. + // We rely on the OS to ensure the mmap is valid. 
let mut mmap = unsafe { MmapMut::map_mut(&file) }?; - if mmap.len() <= METADATA_STORAGE_SIZE { + + if mmap.len() < METADATA_STORAGE_SIZE { return Err(AccountsDbError::Internal( - "memory map length is less than metadata requirement", + "memory map length is less than metadata requirement" + .to_string(), )); + } + + // Validate and fixup metadata. + // We scope the mutable borrow of the header to this block. + let block_size = { + // SAFETY: We verified mmap.len() >= METADATA_STORAGE_SIZE above. + // Casting the first bytes to StorageHeader is safe because StorageHeader is #[repr(C)]. + let header = + unsafe { &mut *(mmap.as_mut_ptr() as *mut StorageHeader) }; + Self::validate_header(header)?; + + // Check if the file was resized (e.g. by external tool or config change) + // and update capacity_blocks to reflect reality. + let actual_capacity = (mmap.len() - METADATA_STORAGE_SIZE) + / header.block_size as usize; + + if actual_capacity as u32 != header.capacity_blocks { + header.capacity_blocks = actual_capacity as u32; + } + + header.block_size as usize }; - let meta = StorageMeta::new(&mut mmap); - // SAFETY: - // StorageMeta::init_adb_file made sure that the mmap is large enough to hold the metadata, - // so jumping to the end of that segment still lands us within the mmap region - let store = unsafe { - let pointer = mmap.as_mut_ptr().add(METADATA_STORAGE_SIZE); - // as mmap points to non-null memory, the `pointer` also points to non-null address - NonNull::new_unchecked(pointer) + // Calculate the pointer to the data region (just past the header). + // SAFETY: We verified mmap.len() >= METADATA_STORAGE_SIZE. + // The pointer arithmetic remains within the valid mmap region. + let data_region = unsafe { + NonNull::new_unchecked(mmap.as_mut_ptr().add(METADATA_STORAGE_SIZE)) }; - Ok(Self { mmap, meta, store }) + + Ok(Self { + mmap, + data_region, + block_size, + }) } - pub(crate) fn alloc(&self, size: usize) -> Allocation { - let blocks = self.get_block_count(size) as u64; + /// Validates the consistency of the storage header. + fn validate_header(header: &StorageHeader) -> AccountsDbResult<()> { + let block_size_valid = [ + BlockSize::Block128, + BlockSize::Block256, + BlockSize::Block512, + ] + .iter() + .any(|&bs| bs as u32 == header.block_size); - let head = self.head(); + if header.capacity_blocks == 0 || !block_size_valid { + error!( + "AccountsDB corrupt: Block Size: {}, Total Blocks: {}", + header.block_size, header.capacity_blocks + ); + return Err(AccountsDbError::Internal( + "AccountsDB file corrupted".to_string(), + )); + } + Ok(()) + } - let offset = head.fetch_add(blocks, Relaxed) as usize; + /// Reserves space for a new allocation in the storage. + /// + /// This method is thread-safe and lock-free. It uses atomic arithmetic to + /// advance the write cursor. + /// + /// # Arguments + /// * `size_bytes` - The number of bytes required for the account data. + /// + /// # Returns + /// * `Ok(Allocation)` - A handle containing the raw pointer and offset. + /// * `Err` - If the database is full. + pub(crate) fn allocate( + &self, + size_bytes: usize, + ) -> AccountsDbResult { + let blocks_needed = self.blocks_required(size_bytes) as u64; + let header = self.header(); + + // Atomic fetch_add reserves a range of blocks for this thread. + // `Relaxed` ordering is sufficient because we only care about uniqueness + // of the index, not synchronization with other memory operations yet. 
+ let start_index = header + .write_cursor + .fetch_add(blocks_needed, Ordering::Relaxed); + + let end_index = start_index as usize + blocks_needed as usize; + let capacity = header.capacity_blocks as usize; + + // Check for overflow (Database Full). + if end_index > capacity { + // Note: We don't roll back the atomic here. The space is technically "leaked" + // at the end of the file, but since the DB is full/unusable anyway, this is acceptable. + return Err(AccountsDbError::Internal(format!( + "Database full: required {} blocks, available {}", + blocks_needed, + capacity.saturating_sub(start_index as usize) + ))); + } - // Ideally we should always have enough space to store accounts, 500 GB - // should be enough to store every single account in solana and more, - // but given that we operate on a tiny subset of that account pool, even - // 10GB should be more than enough. - // - // Here we check that we haven't overflown the memory map and backing - // file's size (and panic if we did), probably we need to implement - // remapping with file growth, but considering that disk is limited, - // this too can fail - // https://github.com/magicblock-labs/magicblock-validator/issues/334 - let size = self.meta.total_blocks as usize; - assert!(offset < size, "database is full: {offset} > {size}",); + // Calculate the raw memory address for this allocation. + // SAFETY: We validated `end_index <= capacity` above, so this pointer + // is guaranteed to be within the mapped data region. + let ptr = unsafe { + self.data_region.add(start_index as usize * self.block_size) + }; - // SAFETY: - // we have validated above that we are within bounds of mmap and fetch_add - // on head, reserved the offset number of blocks for our exclusive use - let storage = unsafe { self.store.add(offset * self.block_size()) }; - Allocation { - storage, - offset: offset as Offset, - blocks: blocks as Blocks, - } + Ok(Allocation { + ptr, + offset: start_index as Offset, + blocks: blocks_needed as Blocks, + }) } + /// Reads an account from the storage at the given offset. + /// + /// # Arguments + /// * `offset` - The block index where the account starts. + /// + /// # Safety + /// This assumes `offset` points to a valid, previously allocated account. + /// The Index system ensures we only request valid offsets. #[inline(always)] pub(crate) fn read_account(&self, offset: Offset) -> AccountSharedData { - let memptr = self.offset(offset).as_ptr(); + let ptr = self.resolve_ptr(offset).as_ptr(); // SAFETY: - // offset is obtained from index and later transformed by storage (to translate to actual - // address) always points to valid account allocation, as it's only possible to insert - // something in database going in reverse, i.e. obtaining valid offset from storage - // and then inserting it into index. So memory location pointed to memptr is valid. - unsafe { AccountSharedData::deserialize_from_mmap(memptr) }.into() + // 1. `resolve_ptr` ensures the pointer is within bounds if `offset` is valid. + // 2. `deserialize_from_mmap` must handle potentially untrusted bytes safely. + unsafe { AccountSharedData::deserialize_from_mmap(ptr) }.into() } + /// Reconstructs an `Allocation` handle from a previously stored offset. + /// + /// Used when reusing an existing slot in the index (though this append-only + /// implementation generally doesn't overwrite data in place, this supports + /// designs that might). 
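+    ///
+    /// Illustrative sketch (field values are made up; real callers obtain the
+    /// `ExistingAllocation` from the deallocations index):
+    /// ```ignore
+    /// let hole = ExistingAllocation { offset: 7, blocks: 2 };
+    /// let alloc = storage.recycle(hole);
+    /// assert_eq!((alloc.offset, alloc.blocks), (7, 2));
+    /// ```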
     pub(crate) fn recycle(&self, recycled: ExistingAllocation) -> Allocation {
-        let offset = recycled.offset as usize * self.block_size();
-        // SAFETY:
-        // offset is calculated from existing allocation within the map, thus
-        // jumping to that offset will land us somewhere within those bounds
-        let storage = unsafe { self.store.add(offset) };
+        let ptr = self.resolve_ptr(recycled.offset);
         Allocation {
             offset: recycled.offset,
             blocks: recycled.blocks,
-            storage,
+            ptr,
         }
     }
 
-    pub(crate) fn offset(&self, offset: Offset) -> NonNull<u8> {
+    /// Translates an abstract `Offset` (block index) to a raw memory pointer.
+    #[inline]
+    pub(crate) fn resolve_ptr(&self, offset: Offset) -> NonNull<u8> {
+        let offset_bytes = offset as usize * self.block_size;
         // SAFETY:
-        // offset is calculated from existing allocation within the map, thus
-        // jumping to that offset will land us somewhere within those bounds
-        let offset = (offset * self.meta.block_size) as usize;
-        unsafe { self.store.add(offset) }
+        // The caller is responsible for ensuring `offset` is within valid bounds.
+        // In the context of `AccountsStorage`, offsets come from the Index which
+        // tracks valid allocations.
+        unsafe { self.data_region.add(offset_bytes) }
+    }
+
+    // --- Metadata Accessors ---
+
+    /// Helper to get a reference to the header structure.
+    #[inline(always)]
+    fn header(&self) -> &StorageHeader {
+        // SAFETY: `mmap` is guaranteed to be at least METADATA_STORAGE_SIZE large
+        // and properly aligned for `StorageHeader`.
+        unsafe { &*(self.mmap.as_ptr() as *const StorageHeader) }
+    }
+
+    pub(crate) fn slot(&self) -> u64 {
+        self.header().slot.load(Ordering::Relaxed)
     }
 
-    pub(crate) fn get_slot(&self) -> u64 {
-        self.meta.slot.load(Relaxed)
+    pub(crate) fn update_slot(&self, val: u64) {
+        self.header().slot.store(val, Ordering::Relaxed)
     }
 
-    pub(crate) fn set_slot(&self, val: u64) {
-        self.meta.slot.store(val, Relaxed)
+    pub(crate) fn inc_recycled_count(&self, val: Blocks) {
+        self.header()
+            .recycled_count
+            .fetch_add(val, Ordering::Relaxed);
     }
 
-    pub(crate) fn increment_deallocations(&self, val: Blocks) {
-        self.meta.deallocated.fetch_add(val, Relaxed);
+    pub(crate) fn dec_recycled_count(&self, val: Blocks) {
+        self.header()
+            .recycled_count
+            .fetch_sub(val, Ordering::Relaxed);
     }
 
-    pub(crate) fn decrement_deallocations(&self, val: Blocks) {
-        self.meta.deallocated.fetch_sub(val, Relaxed);
+    /// Calculates how many blocks are needed to store `size_bytes`.
+    pub(crate) fn blocks_required(&self, size_bytes: usize) -> Blocks {
+        size_bytes.div_ceil(self.block_size) as Blocks
     }
 
-    pub(crate) fn get_block_count(&self, size: usize) -> Blocks {
-        let block_size = self.block_size();
-        let blocks = size.div_ceil(block_size);
-        blocks as Blocks
+    /// Returns the slice of memory currently containing valid data (up to the write cursor).
+    pub(crate) fn active_segment(&self) -> &[u8] {
+        let cursor =
+            self.header().write_cursor.load(Ordering::Relaxed) as usize;
+        // Calculate length: Header Size + (Written Blocks * Block Size)
+        let len = (cursor * self.block_size + METADATA_STORAGE_SIZE)
+            .min(self.mmap.len());
+        &self.mmap[..len]
     }
 
+    /// Flushes changes to disk.
     pub(crate) fn flush(&self) {
         let _ = self
             .mmap
             .flush()
-            .inspect_err(log_err!("failed to sync flush the mmap"));
+            .log_err(|| "failed to sync flush the mmap".to_string());
     }
 
-    /// Reopen database from a different directory
+    /// Reloads the database from a different path (used for snapshots).
/// - /// NOTE: this is a very cheap operation, as fast as opening a file - pub(crate) fn reload(&mut self, dbpath: &Path) -> AccountsDbResult<()> { + /// This drops the current mmap and opens a new one at `db_path`. + pub(crate) fn reload(&mut self, db_path: &Path) -> AccountsDbResult<()> { let mut file = File::options() .write(true) .read(true) - .open(dbpath.join(ADB_FILE)) - .inspect_err(log_err!( - "opening adb file from snapshot at {}", - dbpath.display() - ))?; - // snapshot files might be truncated, and contain only the actual - // data with no extra space to grow the database, so we readjust the - // file's length to the preconfigured value before performing mmap - adjust_database_file_size(&mut file, self.size())?; - - // Only accountsdb from the validator process is modifying the file contents - // through memory map, so the contract of MmapMut is upheld - let mut mmap = unsafe { MmapMut::map_mut(&file) }?; - let meta = StorageMeta::new(&mut mmap); - // SAFETY: - // Snapshots are created from the same file used by the primary memory mapped file - // and it's already large enough to contain metadata and possibly some accounts - // so jumping to the end of that segment still lands us within the mmap region - let store = unsafe { - NonNull::new_unchecked(mmap.as_mut_ptr().add(METADATA_STORAGE_SIZE)) - }; - self.mmap = mmap; - self.meta = meta; - self.store = store; - Ok(()) - } - - /// Returns the utilized segment (containing written data) of internal memory map - pub(crate) fn utilized_mmap(&self) -> &[u8] { - // get the last byte where data was written in storage segment and add the size - // of metadata storage, this will give us the used storage in backing file - let head = self.meta.head.load(Relaxed) as usize; - let mut end = head * self.block_size() + METADATA_STORAGE_SIZE; - end = end.min(self.mmap.len()); + .open(db_path.join(ACCOUNTS_DB_FILENAME)) + .log_err(|| { + format!("opening adb file for reload at {}", db_path.display()) + })?; - &self.mmap[..end] + ensure_file_size(&mut file, self.size_bytes())?; + *self = Self::map_file(file)?; + Ok(()) } - /// total number of bytes occupied by storage - pub(crate) fn size(&self) -> u64 { - (self.meta.total_blocks as u64 * self.meta.block_size as u64) + /// Returns the total expected size of the file in bytes (Header + Data). 
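+    ///
+    /// E.g. (illustrative numbers only): with 256-byte blocks and a capacity of
+    /// 1024 blocks this yields `1024 * 256 + METADATA_STORAGE_SIZE` bytes.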
+ pub(crate) fn size_bytes(&self) -> u64 { + (self.header().capacity_blocks as u64 * self.block_size as u64) + METADATA_STORAGE_SIZE as u64 } pub(crate) fn block_size(&self) -> usize { - self.meta.block_size as usize - } - - #[inline(always)] - fn head(&self) -> &AtomicU64 { - self.meta.head + self.block_size } } -/// NOTE!: any change in metadata format should be reflected here -impl StorageMeta { - fn init_adb_file( - file: &mut File, - config: &AccountsDbConfig, - ) -> AccountsDbResult<()> { - // Somewhat arbitrary min size for database, should be good enough for most test - // cases, and prevent accidental creation of few kilobyte large or 0 sized databases - const MIN_DB_SIZE: usize = 16 * 1024 * 1024; - assert!( - config.database_size > MIN_DB_SIZE, - "database file should be larger than {MIN_DB_SIZE} bytes in length" - ); - let db_size = calculate_db_size(config); - let total_blocks = (db_size / config.block_size as usize) as Blocks; - // grow the backing file as necessary - adjust_database_file_size(file, db_size as u64)?; - - // the storage itself starts immediately after metadata section - let head = 0_u64; - file.write_all(&head.to_le_bytes())?; - - // fresh Accountsdb starts at slot 0 - let slot = 0_u64; - file.write_all(&slot.to_le_bytes())?; - - // write blocksize - file.write_all(&(config.block_size as u32).to_le_bytes())?; - - file.write_all(&total_blocks.to_le_bytes())?; - // number of deallocated blocks, obviously it's zero in a new database - let deallocated = 0_u32; - file.write_all(&deallocated.to_le_bytes())?; - - Ok(file.flush()?) +/// Initializes a fresh accounts database file with the header. +fn initialize_db_file( + file: &mut File, + config: &AccountsDbConfig, +) -> AccountsDbResult<()> { + const MIN_DB_SIZE: usize = 16 * 1024 * 1024; + if config.database_size < MIN_DB_SIZE { + return Err(AccountsDbError::Internal(format!( + "database file should be larger than {} bytes", + MIN_DB_SIZE + ))); } - fn new(store: &mut MmapMut) -> Self { - // SAFETY: - // All pointer arithmethic operations are safe because they are performed - // on the metadata segment of the backing MmapMut, which is guarranteed to - // be large enough, due to previous call to Self::init_adb_file - // - // The pointer to static reference conversion is also sound, because the - // memmap is kept in the AccountsDb for the entirety of its lifecycle - - let ptr = store.as_mut_ptr(); - - // first element is the head - let head = unsafe { &*(ptr as *const AtomicU64) }; - // second element is the slot - let slot = unsafe { &*(ptr.add(SLOT_OFFSET) as *const AtomicU64) }; - // third is the blocks size - let block_size = - unsafe { (ptr.add(BLOCKSIZE_OFFSET) as *const u32).read() }; - - let block_size_is_initialized = [ - BlockSize::Block128, - BlockSize::Block256, - BlockSize::Block512, - ] - .iter() - .any(|&bs| bs as u32 == block_size); - // fourth is the total blocks count - let mut total_blocks = - unsafe { (ptr.add(TOTALBLOCKS_OFFSET) as *const u32).read() }; - // check whether the size of database file has been readjusted - let adjusted_total_blocks = - (store.len() / block_size as usize) as Blocks; - if adjusted_total_blocks != total_blocks { - // if so, use the adjusted number of total blocks - total_blocks = adjusted_total_blocks; - // and persist the new value to the disk via mmap - // SAFETY: - // we just read this value above, and now we are just overwriting it with new 4 bytes - unsafe { - (ptr.add(TOTALBLOCKS_OFFSET) as *mut u32) - .write(adjusted_total_blocks) - }; - } - - if !(total_blocks 
!= 0 && block_size_is_initialized) {
-            error!(
-                "AccountsDB file is not initialized properly. Block Size - \
-                {block_size} and Total Block Count is: {total_blocks}"
-            );
-            let _ = std::io::stdout().flush();
-            std::process::exit(1);
-        }
-        // fifth is the number of deallocated blocks so far
-        let deallocated =
-            unsafe { &*(ptr.add(DEALLOCATED_OFFSET) as *const AtomicU32) };
-
-        Self {
-            head,
-            slot,
-            block_size,
-            total_blocks,
-            deallocated,
-        }
-    }
+    let target_size = calculate_file_size(config);
+    // Determine capacity based on file size minus header.
+    let total_blocks =
+        (target_size - METADATA_STORAGE_SIZE) / config.block_size as usize;
+
+    ensure_file_size(file, target_size as u64)?;
+
+    // Prepare the initial header state.
+    let header = StorageHeader {
+        write_cursor: AtomicU64::new(0),
+        slot: AtomicU64::new(0),
+        block_size: config.block_size as u32,
+        capacity_blocks: total_blocks as u32,
+        recycled_count: AtomicU32::new(0),
+        _padding: [0u8; METADATA_PADDING_SIZE],
+    };
+
+    // Serialize the header directly to bytes.
+    // SAFETY: StorageHeader is `repr(C)` and contains only POD (Plain Old Data) types.
+    let header_bytes = unsafe {
+        std::slice::from_raw_parts(
+            &header as *const StorageHeader as *const u8,
+            size_of::<StorageHeader>(),
+        )
+    };
+
+    file.write_all(header_bytes)?;
+    Ok(file.flush()?)
 }
 
-/// Helper function to grow the size of the backing accounts db file
-/// NOTE: this function cannot be used to shrink the file, as the logic involved to
-/// ensure, that we don't accidentally truncate the written data, is a bit complex
-fn adjust_database_file_size(file: &mut File, size: u64) -> io::Result<()> {
-    if file.metadata()?.len() >= size {
-        return Ok(());
+/// Grows the file to `size` if it is currently smaller.
+fn ensure_file_size(file: &mut File, size: u64) -> io::Result<()> {
+    if file.metadata()?.len() < size {
+        file.set_len(size)?;
     }
-    file.set_len(size)
+    Ok(())
 }
 
-fn calculate_db_size(config: &AccountsDbConfig) -> usize {
+/// Calculates the target file size, ensuring alignment with block size.
+fn calculate_file_size(config: &AccountsDbConfig) -> usize {
     let block_size = config.block_size as usize;
-    let block_num = config.database_size.div_ceil(block_size);
-    let meta_blocks = METADATA_STORAGE_SIZE.div_ceil(block_size);
-    (block_num + meta_blocks) * block_size
+    // Align total size to block size + metadata
+    let blocks = config.database_size.div_ceil(block_size);
+    blocks * block_size + METADATA_STORAGE_SIZE
 }
 
+/// Represents a successful allocation within the storage.
 #[derive(Clone, Copy)]
 pub(crate) struct Allocation {
-    pub(crate) storage: NonNull<u8>,
+    /// Raw pointer to the start of the allocated memory.
+    pub(crate) ptr: NonNull<u8>,
+    /// The block index (offset) of this allocation.
     pub(crate) offset: Offset,
+    /// The number of blocks reserved.
     pub(crate) blocks: Blocks,
 }
 
+/// A struct representing a previously known allocation.
+/// Used for recycling or testing equality.
 #[cfg_attr(test, derive(Debug, Eq, PartialEq))]
 pub(crate) struct ExistingAllocation {
+    /// The block index (offset) of this allocation.
     pub(crate) offset: Offset,
+    /// The number of blocks reserved.
pub(crate) blocks: Blocks, } diff --git a/magicblock-accounts-db/src/tests.rs b/magicblock-accounts-db/src/tests.rs index 90aae2936..4d4cedec0 100644 --- a/magicblock-accounts-db/src/tests.rs +++ b/magicblock-accounts-db/src/tests.rs @@ -6,7 +6,7 @@ use solana_account::{AccountSharedData, ReadableAccount, WritableAccount}; use solana_pubkey::Pubkey; use tempfile::TempDir; -use crate::{storage::ADB_FILE, AccountsDb}; +use crate::{storage::ACCOUNTS_DB_FILENAME, AccountsDb}; const LAMPORTS: u64 = 4425; const SPACE: usize = 73; @@ -14,623 +14,459 @@ const OWNER: Pubkey = Pubkey::new_from_array([23; 32]); const ACCOUNT_DATA: &[u8] = b"hello world?"; const INIT_DATA_LEN: usize = ACCOUNT_DATA.len(); +/// Verifies basic account insertion and retrieval. #[test] fn test_get_account() { - let tenv = init_test_env(); - let AccountWithPubkey { pubkey, .. } = tenv.account(); - let acc = tenv.get_account(&pubkey); - assert!( - acc.is_some(), - "account was just inserted and should be in database" - ); - let acc = acc.unwrap(); + let env = TestEnv::new(); + let AccountWithPubkey { pubkey, .. } = env.create_and_insert_account(); + + let acc = env.get_account(&pubkey).expect("account should exist"); + assert_eq!(acc.lamports(), LAMPORTS); assert_eq!(acc.owner(), &OWNER); assert_eq!(&acc.data()[..INIT_DATA_LEN], ACCOUNT_DATA); assert_eq!(acc.data().len(), SPACE); } +/// Verifies Copy-on-Write semantics. +/// Modifying an account in memory should not affect the persistent store +/// until `upsert_account` is called. #[test] fn test_modify_account() { - let tenv = init_test_env(); + let env = TestEnv::new(); let AccountWithPubkey { pubkey, account: mut uncommitted, - } = tenv.account(); + } = env.create_and_insert_account(); let new_lamports = 42; + // Modify in memory assert_eq!(uncommitted.lamports(), LAMPORTS); uncommitted.set_lamports(new_lamports); assert_eq!(uncommitted.lamports(), new_lamports); - let mut committed = tenv - .get_account(&pubkey) - .expect("account should be in database"); - + // Verify DB is unchanged + let committed_before = env.get_account(&pubkey).unwrap(); assert_eq!( - committed.lamports(), + committed_before.lamports(), LAMPORTS, - "account from the main buffer should not be affected" + "database should retain old state before commit" ); - tenv.insert_account(&pubkey, &uncommitted) - .expect("failed to insert account"); - committed = tenv - .get_account(&pubkey) - .expect("account should be in database"); + // Commit changes + env.insert_account(&pubkey, &uncommitted).unwrap(); + // Verify DB is updated + let committed_after = env.get_account(&pubkey).unwrap(); assert_eq!( - committed.lamports(), + committed_after.lamports(), new_lamports, - "account's main buffer should have been switched after commit" + "database should reflect updates after commit" ); } +/// Verifies that accounts are correctly reallocated when their data size increases. 
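+///
+/// Note: `blocks_required` rounds byte sizes up to whole blocks via `div_ceil`,
+/// so growing past the originally reserved block count forces a new allocation.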
#[test] fn test_account_resize() { - let tenv = init_test_env(); + let env = TestEnv::new(); let huge_data = [42; SPACE * 4]; let AccountWithPubkey { pubkey, account: mut uncommitted, - } = tenv.account(); + } = env.create_and_insert_account(); + // Resize in memory uncommitted.set_data_from_slice(&huge_data); - assert!( - matches!(uncommitted, AccountSharedData::Owned(_),), - "account should have been promoted to Owned after resize" - ); - assert_eq!( - uncommitted.data().len(), - SPACE * 4, - "account should have been resized to double of SPACE" - ); + assert_eq!(uncommitted.data().len(), SPACE * 4); - let mut committed = tenv - .get_account(&pubkey) - .expect("account should be in database"); + // Verify DB still has old size + let committed_before = env.get_account(&pubkey).unwrap(); + assert_eq!(committed_before.data().len(), SPACE); - assert_eq!( - committed.data().len(), - SPACE, - "uncommitted account data len should not have changed" - ); - - tenv.insert_account(&pubkey, &uncommitted) - .expect("failed to insert account"); - - committed = tenv - .get_account(&pubkey) - .expect("account should be in database"); + // Update DB + env.insert_account(&pubkey, &uncommitted).unwrap(); + // Verify DB has new size and data + let committed_after = env.get_account(&pubkey).unwrap(); assert_eq!( - committed.data(), + committed_after.data(), huge_data, - "account should have been resized after insertion" + "account data should match resized buffer" ); } +/// Verifies that the storage allocator reuses space (holes) created by updates. #[test] fn test_alloc_reuse() { - let tenv = init_test_env(); + let env = TestEnv::new(); let AccountWithPubkey { - pubkey, + pubkey: pk1, account: mut acc1, - } = tenv.account(); - let huge_data = [42; SPACE * 4]; + } = env.create_and_insert_account(); - let old_addr = acc1.data().as_ptr(); + // Capture the pointer address of the first allocation + let old_ptr = env.get_account(&pk1).unwrap().data().as_ptr(); + // Resize acc1 significantly to force a move, freeing the old slot + let huge_data = [42; SPACE * 4]; acc1.set_data_from_slice(&huge_data); - tenv.insert_account(&pubkey, &acc1) - .expect("failed to insert account"); + env.insert_account(&pk1, &acc1).unwrap(); - let AccountWithPubkey { account: acc2, .. } = tenv.account(); + // Insert a new account that fits in the old slot + let AccountWithPubkey { pubkey: pk2, .. } = env.create_and_insert_account(); - assert_eq!( - acc2.data().as_ptr(), - old_addr, - "new account insertion should have reused the allocation" - ); + let new_ptr = env.get_account(&pk2).unwrap().data().as_ptr(); - let AccountWithPubkey { account: acc3, .. } = tenv.account(); - - assert!( - acc3.data().as_ptr() > acc2.data().as_ptr(), - "last account insertion should have been freshly allocated" + assert_eq!( + new_ptr, old_ptr, + "allocator should recycle the freed slot for the new account" ); } +/// Verifies complex reallocation reuse logic (holes split/merge behavior). 
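+///
+/// The deallocations index keys holes by their size in blocks, so the 4x slot
+/// freed in step 4 can satisfy the smaller 3x request in step 5; the split of
+/// leftover space is covered separately by `test_reallocation_split`.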
#[test] fn test_larger_alloc_reuse() { - let tenv = init_test_env(); - let mut acc = tenv.account(); - - let mut huge_data = vec![42; SPACE * 2]; - acc.account.set_data_from_slice(&huge_data); - tenv.insert_account(&acc.pubkey, &acc.account) - .expect("failed to insert account"); - - let mut acc2 = tenv.account(); - acc2.account.set_data_from_slice(&huge_data); - tenv.insert_account(&acc2.pubkey, &acc2.account) - .expect("failed to insert account"); - - let mut acc3 = tenv.account(); - huge_data = vec![42; SPACE * 4]; - acc3.account.set_data_from_slice(&huge_data); - tenv.insert_account(&acc3.pubkey, &acc3.account) - .expect("failed to insert account"); - acc3.account = tenv - .get_account(&acc3.pubkey) - .expect("third account should be in database"); - - let alloc_addr = acc3.account.data().as_ptr(); - huge_data = vec![42; SPACE * 5]; - acc3.account.set_data_from_slice(&huge_data); - tenv.insert_account(&acc3.pubkey, &acc3.account) - .expect("failed to insert account"); - - let mut acc4 = tenv.account(); - huge_data = vec![42; SPACE * 3]; - acc4.account.set_data_from_slice(&huge_data); - tenv.insert_account(&acc4.pubkey, &acc4.account) - .expect("failed to insert account"); - acc4.account = tenv - .get_account(&acc4.pubkey) - .expect("fourth account should be in database"); + let env = TestEnv::new(); + + // 1. Insert Account 1 + let mut acc1 = env.new_account_obj(SPACE); + let huge_data_2x = vec![42; SPACE * 2]; + acc1.account.set_data_from_slice(&huge_data_2x); + env.insert_account(&acc1.pubkey, &acc1.account).unwrap(); + + // 2. Insert Account 2 (same size) + let mut acc2 = env.new_account_obj(SPACE); + acc2.account.set_data_from_slice(&huge_data_2x); + env.insert_account(&acc2.pubkey, &acc2.account).unwrap(); + + // 3. Insert Account 3 (4x size) + let mut acc3 = env.new_account_obj(SPACE); + let huge_data_4x = vec![42; SPACE * 4]; + acc3.account.set_data_from_slice(&huge_data_4x); + env.insert_account(&acc3.pubkey, &acc3.account).unwrap(); + + // Read back Account 3 to get its pointer + let acc3_stored = env.get_account(&acc3.pubkey).unwrap(); + let acc3_ptr = acc3_stored.data().as_ptr(); + + // 4. Resize Account 3 to 5x (forces move, freeing the 4x slot) + let huge_data_5x = vec![42; SPACE * 5]; + acc3.account.set_data_from_slice(&huge_data_5x); + env.insert_account(&acc3.pubkey, &acc3.account).unwrap(); + + // 5. 
Insert Account 4 (3x size - fits in the 4x hole) + let mut acc4 = env.new_account_obj(SPACE); + let huge_data_3x = vec![42; SPACE * 3]; + acc4.account.set_data_from_slice(&huge_data_3x); + env.insert_account(&acc4.pubkey, &acc4.account).unwrap(); + + let acc4_stored = env.get_account(&acc4.pubkey).unwrap(); assert_eq!( - acc4.account.data().as_ptr(), - alloc_addr, - "fourth account should have reused the allocation from third one" + acc4_stored.data().as_ptr(), + acc3_ptr, + "account 4 should have reused account 3's old allocation" ); } #[test] fn test_get_program_accounts() { - let tenv = init_test_env(); - let acc = tenv.account(); - let accounts = tenv.get_program_accounts(&OWNER, |_| true); - assert!(accounts.is_ok(), "program account should be in database"); - let mut accounts = accounts.unwrap(); - assert_eq!( - accounts.next().unwrap().1, - acc.account, - "returned program account should match inserted one" - ); - assert_eq!( - accounts.next(), - None, - "only one program account should have been inserted" - ); + let env = TestEnv::new(); + let acc = env.create_and_insert_account(); + + let accounts = env.get_program_accounts(&OWNER, |_| true); + assert!(accounts.is_ok()); + + let mut iter = accounts.unwrap(); + let (pk, data) = iter.next().unwrap(); + + assert_eq!(pk, acc.pubkey); + assert_eq!(data, acc.account); + assert!(iter.next().is_none()); } #[test] fn test_get_all_accounts() { - let tenv = init_test_env(); - let acc = tenv.account(); - let mut pubkeys = HashSet::new(); - pubkeys.insert(acc.pubkey); - let acc2 = tenv.account(); - tenv.insert_account(&acc2.pubkey, &acc2.account) - .expect("failed to insert account"); - pubkeys.insert(acc2.pubkey); - let acc3 = tenv.account(); - tenv.insert_account(&acc3.pubkey, &acc3.account) - .expect("failed to insert account"); - pubkeys.insert(acc3.pubkey); - - let mut pks = tenv.iter_all(); - assert!(pks - .next() - .map(|(pk, _)| pubkeys.contains(&pk)) - .unwrap_or_default()); - assert!(pks - .next() - .map(|(pk, _)| pubkeys.contains(&pk)) - .unwrap_or_default()); - assert!(pks - .next() - .map(|(pk, _)| pubkeys.contains(&pk)) - .unwrap_or_default()); - assert!(pks.next().is_none()); + let env = TestEnv::new(); + let acc1 = env.create_and_insert_account(); + let acc2 = env.create_and_insert_account(); + let acc3 = env.create_and_insert_account(); + + let expected_pks: HashSet<_> = + [acc1.pubkey, acc2.pubkey, acc3.pubkey].into(); + + let stored_pks: HashSet<_> = env.iter_all().map(|(pk, _)| pk).collect(); + + assert_eq!(stored_pks, expected_pks); } #[test] fn test_take_snapshot() { - let tenv = init_test_env(); - let mut acc = tenv.account(); + let env = TestEnv::new(); + let mut acc = env.create_and_insert_account(); - assert_eq!(tenv.slot(), 0, "fresh accountsdb should have 0 slot"); - tenv.set_slot(tenv.snapshot_frequency); - assert_eq!( - tenv.slot(), - tenv.snapshot_frequency, - "adb slot must have been updated" - ); - assert!( - tenv.snapshot_exists(tenv.snapshot_frequency), - "first snapshot should have been created" - ); - acc.account.set_data(ACCOUNT_DATA.to_vec()); + assert_eq!(env.slot(), 0); - tenv.insert_account(&acc.pubkey, &acc.account) - .expect("failed to insert account"); + // Trigger Snapshot 1 + env.advance_slot(env.snapshot_frequency); + assert_eq!(env.slot(), env.snapshot_frequency); + assert!(env.snapshot_exists(env.snapshot_frequency)); - tenv.set_slot(2 * tenv.snapshot_frequency); - assert!( - tenv.snapshot_exists(2 * tenv.snapshot_frequency), - "second snapshot should have been created" - ); + // Update Account 
+ acc.account.set_data(ACCOUNT_DATA.to_vec()); + env.insert_account(&acc.pubkey, &acc.account).unwrap(); + + // Trigger Snapshot 2 + env.advance_slot(env.snapshot_frequency * 2); + assert!(env.snapshot_exists(env.snapshot_frequency * 2)); } #[test] fn test_restore_from_snapshot() { - let mut tenv = init_test_env(); - let mut acc = tenv.account(); - let new_lamports = 42; + let mut env = TestEnv::new(); + let mut acc = env.create_and_insert_account(); + let snap_freq = env.snapshot_frequency; + + // Create Base Snapshot + env.advance_slot(snap_freq); - tenv.set_slot(tenv.snapshot_frequency); // trigger snapshot - tenv.set_slot(tenv.snapshot_frequency + 1); + // Make changes after snapshot + env.advance_slot(snap_freq + 3); + let new_lamports = 999; acc.account.set_lamports(new_lamports); - tenv.insert_account(&acc.pubkey, &acc.account) - .expect("failed to insert account"); + env.insert_account(&acc.pubkey, &acc.account).unwrap(); + env.advance_slot(snap_freq + 3); - let acc_committed = tenv - .get_account(&acc.pubkey) - .expect("account should be in database"); + // Verify update persisted in current state assert_eq!( - acc_committed.lamports(), - new_lamports, - "account's lamports should have been updated after commit" + env.get_account(&acc.pubkey).unwrap().lamports(), + new_lamports ); - tenv.set_slot(tenv.snapshot_frequency * 3); - let snapshot_frequency = tenv.snapshot_frequency; - tenv = tenv.ensure_at_most(snapshot_frequency * 2); - assert_eq!( - tenv.slot(), - tenv.snapshot_frequency, - "slot should have been rolled back" - ); - let acc_rolledback = tenv - .get_account(&acc.pubkey) - .expect("account should be in database"); + // Rollback to before the update + env = env.restore_to_slot(snap_freq); + + let restored_acc = env.get_account(&acc.pubkey).unwrap(); assert_eq!( - acc_rolledback.lamports(), + restored_acc.lamports(), LAMPORTS, - "account's lamports should have been rolled back" + "account should be restored to state at snapshot" ); - assert_eq!(tenv.slot(), tenv.snapshot_frequency); } #[test] fn test_get_all_accounts_after_rollback() { - let mut tenv = init_test_env(); - let acc = tenv.account(); + let mut env = TestEnv::new(); + let acc = env.create_and_insert_account(); let mut pks = vec![acc.pubkey]; const ITERS: u64 = 1024; + + // Create initial state for i in 0..=ITERS { - let acc = tenv.account(); - tenv.insert_account(&acc.pubkey, &acc.account) - .expect("failed to insert account"); + let acc = env.create_and_insert_account(); pks.push(acc.pubkey); - tenv.set_slot(i); + env.advance_slot(i); } + // Add accounts after the restore point let mut post_snap_pks = vec![]; - for i in ITERS..ITERS + tenv.snapshot_frequency { - let acc = tenv.account(); - tenv.insert_account(&acc.pubkey, &acc.account) - .expect("failed to insert account"); - tenv.set_slot(i + 1); + for i in ITERS..ITERS + env.snapshot_frequency { + let acc = env.create_and_insert_account(); + env.advance_slot(i + 1); post_snap_pks.push(acc.pubkey); } - tenv = tenv.ensure_at_most(ITERS); - assert_eq!(tenv.slot(), ITERS, "failed to rollback to snapshot"); + // Rollback + env = env.restore_to_slot(ITERS); + assert_eq!(env.slot(), ITERS); - let asserter = |(pk, acc): (_, AccountSharedData)| { - assert_eq!( - acc.data().len(), - SPACE, - "account was incorrectly deserialized" - ); - assert_eq!( - &acc.data()[..INIT_DATA_LEN], - ACCOUNT_DATA, - "account data contains garbage" - ); - pk - }; - let pubkeys = tenv.iter_all().map(asserter).collect::>(); + // Verify State + let pubkeys: HashSet<_> = 
env.iter_all().map(|(pk, _)| pk).collect(); assert_eq!(pubkeys.len(), pks.len()); for pk in pks { - assert!(pubkeys.contains(&pk)); + assert!(pubkeys.contains(&pk), "Missing account {}", pk); } for pk in post_snap_pks { - assert!(!pubkeys.contains(&pk)); + assert!( + !pubkeys.contains(&pk), + "Account {} should have been rolled back", + pk + ); } } #[test] fn test_db_size_after_rollback() { - let mut tenv = init_test_env(); + let mut env = TestEnv::new(); let last_slot = 512; for i in 0..=last_slot { - let acc = tenv.account(); - tenv.insert_account(&acc.pubkey, &acc.account) - .expect("failed to insert account"); - tenv.set_slot(i); + env.create_and_insert_account(); + env.advance_slot(i); } - let pre_rollback_db_size = tenv.storage_size(); - let path = tenv.snapshot_engine.database_path(); - let adb_file = path.join(ADB_FILE); - let pre_rollback_file_size = adb_file - .metadata() - .expect("failed to get metadata for adb file") - .len(); - tenv = tenv.ensure_at_most(last_slot); + let pre_rollback_db_size = env.storage_size(); + let file_path = env + .snapshot_manager + .database_path() + .join(ACCOUNTS_DB_FILENAME); + let pre_rollback_file_size = file_path.metadata().unwrap().len(); + + env = env.restore_to_slot(last_slot); assert_eq!( - tenv.storage_size(), + env.storage_size(), pre_rollback_db_size, "database size mismatch after rollback" ); - let path = tenv.snapshot_engine.database_path(); - let adb_file = path.join(ADB_FILE); - let post_rollback_len = adb_file - .metadata() - .expect("failed to get metadata for adb file") - .len(); + + let post_rollback_file_size = file_path.metadata().unwrap().len(); assert_eq!( - post_rollback_len, pre_rollback_file_size, + post_rollback_file_size, pre_rollback_file_size, "adb file size mismatch after rollback" ); } #[test] fn test_zero_lamports_account() { - let tenv = init_test_env(); - let mut acc = tenv.account(); - let pk = acc.pubkey; - assert!( - tenv.get_account(&pk).is_some(), - "account should exists after init" - ); + let env = TestEnv::new(); + let mut acc = env.create_and_insert_account(); + // Explicitly set 0 lamports (simulating escrow or marker account) acc.account.set_lamports(0); + env.insert_account(&acc.pubkey, &acc.account).unwrap(); - tenv.insert_account(&pk, &acc.account) - .expect("failed to insert account"); - - // NOTE: we use empty accounts to mark escrow accounts that were not found on chain - let retained_account = tenv.get_account(&pk); - assert!( - retained_account.is_some(), - "account should be retained at 0 lamports as an empty escrow account" - ); - assert_eq!( - retained_account.unwrap().lamports(), - 0, - "retained escrow account should have 0 lamports" - ); + let stored = env.get_account(&acc.pubkey); + assert!(stored.is_some(), "zero lamport account should be retained"); + assert_eq!(stored.unwrap().lamports(), 0); } #[test] fn test_owner_change() { - let tenv = init_test_env(); - let mut acc = tenv.account(); - let result = tenv.account_matches_owners(&acc.pubkey, &[OWNER]); - assert!(matches!(result, Some(0))); - { - let mut accounts = tenv - .get_program_accounts(&OWNER, |_| true) - .expect("failed to get program accounts"); - let expected = (acc.pubkey, acc.account.clone()); - assert_eq!(accounts.next(), Some(expected)); - } + let env = TestEnv::new(); + let mut acc = env.create_and_insert_account(); + + // Verify index before change + assert!(matches!( + env.account_matches_owners(&acc.pubkey, &[OWNER]), + Some(0) + )); + + // Change owner let new_owner = Pubkey::new_unique(); 
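+    // The assertions below verify both directions of the index update: the
+    // stale OWNER -> pubkey mapping must be gone, and the account must now be
+    // reachable through new_owner.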
acc.account.set_owner(new_owner); - tenv.insert_account(&acc.pubkey, &acc.account) - .expect("failed to insert account"); - let result = tenv.account_matches_owners(&acc.pubkey, &[OWNER]); - assert!(result.is_none()); - let result = tenv.get_program_accounts(&OWNER, |_| true); - assert!(result.map(|pks| pks.count() == 0).unwrap_or_default()); - - let result = tenv.account_matches_owners(&acc.pubkey, &[OWNER, new_owner]); - assert!(matches!(result, Some(1))); - let mut accounts = tenv - .get_program_accounts(&new_owner, |_| true) - .expect("failed to get program accounts"); - assert_eq!(accounts.next().map(|(k, _)| k), Some(acc.pubkey)); -} + env.insert_account(&acc.pubkey, &acc.account).unwrap(); -#[test] -#[should_panic] -fn test_account_too_many_accounts() { - let tenv = init_test_env(); - for _ in 0..20 { - let acc = tenv.account(); - let mut oversized_account = acc.account; - oversized_account.extend_from_slice(&[42; 9_000_000]); - tenv.insert_account(&acc.pubkey, &oversized_account) - .expect("failed to insert account"); - } -} - -#[test] -fn test_account_shrinking() { - let tenv = init_test_env(); - let mut acc1 = tenv.account(); - - // ============================================== - // test set_data - acc1.account.set_data(b"".to_vec()); - tenv.insert_account(&acc1.pubkey, &acc1.account) - .expect("failed to insert account"); - acc1.account = tenv - .get_account(&acc1.pubkey) - .expect("account should be inserted"); + // Verify index after change + // Old owner should return nothing + assert!(env.account_matches_owners(&acc.pubkey, &[OWNER]).is_none()); assert_eq!( - acc1.account.data().len(), - 0, - "account data should have been truncated" + env.get_program_accounts(&OWNER, |_| true).unwrap().count(), + 0 ); - // ============================================== - // test set_data_from_slice - let mut acc2 = tenv.account(); - tenv.insert_account(&acc2.pubkey, &acc2.account) - .expect("failed to insert account"); - - acc2.account = tenv - .get_account(&acc2.pubkey) - .expect("account 2 should be inserted"); - - acc2.account.set_data_from_slice(b""); - - tenv.insert_account(&acc2.pubkey, &acc2.account) - .expect("failed to insert account"); - acc2.account = tenv - .get_account(&acc2.pubkey) - .expect("account should be inserted"); + // New owner should match + assert!(matches!( + env.account_matches_owners(&acc.pubkey, &[new_owner]), + Some(0) + )); assert_eq!( - acc2.account.data().len(), - 0, - "account data should have been truncated" + env.get_program_accounts(&new_owner, |_| true) + .unwrap() + .count(), + 1 ); +} - // ============================================== - // test set_data_from_slice - let mut acc3 = tenv.account(); - tenv.insert_account(&acc3.pubkey, &acc3.account) - .expect("failed to insert account"); +/// Verifies that we eventually hit a limit or handle capacity gracefully. 
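+///
+/// A sketch of the expectation (sizes are those used below; the exact capacity
+/// comes from `AccountsDbConfig::default()`): repeated ~9MB insertions should
+/// eventually make `allocate` surface its "Database full" `Internal` error.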
+#[test] +fn test_database_full_error() { + let env = TestEnv::new(); - acc3.account = tenv - .get_account(&acc3.pubkey) - .expect("account 2 should be inserted"); + // Fill DB with huge accounts + let huge_data = vec![42; 9_000_000]; // 9MB + let mut hit_limit = false; - acc3.account.resize(0, 0); + // Try to insert until failure + for _ in 0..50 { + let mut acc = env.new_account_obj(SPACE); + acc.account.set_data_from_slice(&huge_data); - tenv.insert_account(&acc3.pubkey, &acc3.account) - .expect("failed to insert account"); - acc3.account = tenv - .get_account(&acc3.pubkey) - .expect("account should be inserted"); - assert_eq!( - acc3.account.data().len(), - 0, - "account data should have been truncated" + if env.insert_account(&acc.pubkey, &acc.account).is_err() { + hit_limit = true; + break; + } + } + + assert!( + hit_limit, + "Database should eventually return error when full" ); } #[test] -fn test_many_insertions_to_accountsdb() { - const ACCOUNTNUM: usize = 16384; - const ITERS: usize = 2 << 16; - const THREADNUM: usize = 4; - - let tenv = init_test_env(); - - let mut pubkeys = Vec::with_capacity(ACCOUNTNUM); - for _ in 0..ACCOUNTNUM { - let acc = tenv.account(); - pubkeys.push(acc.pubkey); - tenv.insert_account(&acc.pubkey, &acc.account) - .expect("failed to insert account"); - } - // test whether frequent account reallocations effectively reuse free - // space in database without overflowing the database boundaries (100MB for test) - let tenv_arc = Arc::new(tenv); - let chunksize = ACCOUNTNUM / THREADNUM; - std::thread::scope(|s| { - for pks in pubkeys.chunks(chunksize) { - let tenv_arc = tenv_arc.clone(); - s.spawn(move || { - for i in 0..ITERS { - let pk = &pks[i % chunksize]; - let mut account = tenv_arc - .get_account(pk) - .expect("account should be in database"); - account - .set_data_from_slice(&vec![43; i % (SPACE * 20) + 13]); - tenv_arc - .insert_account(pk, &account) - .expect("failed to insert account"); - } - }); - } - }); +fn test_account_shrinking() { + let env = TestEnv::new(); + let mut acc = env.create_and_insert_account(); + + // Shrink via set_data + acc.account.set_data(vec![]); + env.insert_account(&acc.pubkey, &acc.account).unwrap(); + + let stored = env.get_account(&acc.pubkey).unwrap(); + assert_eq!(stored.data().len(), 0); } #[test] fn test_reallocation_split() { - let tenv = init_test_env(); + let env = TestEnv::new(); const SIZE: usize = 1024; - tenv.account(); - let account1 = tenv.account_with_size(SIZE * 2); - let data_ptr1 = account1.account.data().as_ptr(); - let account2 = tenv.account_with_size(SIZE); - let data_ptr2 = account2.account.data().as_ptr(); - tenv.remove_account(&account1.pubkey); - let account3 = tenv.account_with_size(SIZE / 4); - let account4 = tenv.account_with_size(SIZE / 4); - assert_eq!(account3.account.data().as_ptr(), data_ptr1); - assert!(account4.account.data().as_ptr() < data_ptr2); - assert!(account4.account.data().as_ptr() > data_ptr1); + + // Create a hole of size 2048 + let acc1 = env.create_account_with_size(SIZE * 2); + let ptr1 = env.get_account(&acc1.pubkey).unwrap().data().as_ptr(); + env.remove_account(&acc1.pubkey); // Creates hole + + // Create 2 accounts of size 256 + let acc2 = env.create_account_with_size(SIZE / 4); + let acc3 = env.create_account_with_size(SIZE / 4); + + let ptr2 = env.get_account(&acc2.pubkey).unwrap().data().as_ptr(); + let ptr3 = env.get_account(&acc3.pubkey).unwrap().data().as_ptr(); + + // Verify they reused the space (ptr2 should be exactly at ptr1) + assert_eq!(ptr2, ptr1, "First 
small account should take start of hole");
+    assert!(ptr3 > ptr2, "Second small account should follow first");
 }
 
 #[test]
 fn test_database_reset() {
-    // 1. Initialize DB and insert some data
-    let (adb, temp_dir) = init_db();
+    let (adb, temp_dir) = TestEnv::init_raw_db();
     let pubkey = Pubkey::new_unique();
     let account = AccountSharedData::new(LAMPORTS, SPACE, &OWNER);
 
-    adb.insert_account(&pubkey, &account)
-        .expect("failed to insert account");
-    assert!(
-        adb.get_account(&pubkey).is_some(),
-        "Account should exist before reset"
-    );
-    assert_eq!(adb.get_accounts_count(), 1);
+    adb.insert_account(&pubkey, &account).unwrap();
+    assert!(adb.get_account(&pubkey).is_some());
 
-    // 2. Drop the current instance to release any file locks or handles
+    // Explicitly drop to release locks
    drop(adb);
 
-    // 3. Re-initialize the DB with the same path but with reset = true
+    // Re-open with reset=true
     let config = AccountsDbConfig {
         reset: true,
         ..Default::default()
     };
-    // Use the path from the temp_dir (which is maintained alive by `temp_dir`)
-    let adb_reset = AccountsDb::new(&config, temp_dir.path(), 0)
-        .expect("should be able to re-open db with reset");
+    let adb_reset = AccountsDb::new(&config, temp_dir.path(), 0).unwrap();
 
-    // 4. Verify the database is empty
-    assert!(
-        adb_reset.get_account(&pubkey).is_none(),
-        "Account should be gone after reset"
-    );
-    assert_eq!(
-        adb_reset.get_accounts_count(),
-        0,
-        "Account count should be zero"
-    );
+    assert!(adb_reset.get_account(&pubkey).is_none());
+    assert_eq!(adb_reset.account_count(), 0);
 }
 
 // ==============================================================
-// ==============================================================
-// UTILITY CODE BELOW
-// ==============================================================
+// TEST UTILITIES
 // ==============================================================
 
 struct AccountWithPubkey {
@@ -638,69 +474,105 @@ struct AccountWithPubkey {
     account: AccountSharedData,
 }
 
-struct AdbTestEnv {
+struct TestEnv {
     adb: Arc<AccountsDb>,
+    // Kept to ensure temp dir is cleaned up on drop
     _directory: TempDir,
 }
 
-pub fn init_db() -> (Arc<AccountsDb>, TempDir) {
-    let _ = env_logger::builder()
-        .filter_level(log::LevelFilter::Warn)
-        .is_test(true)
-        .try_init();
-    let directory =
-        tempfile::tempdir().expect("failed to create temporary directory");
-    let config = AccountsDbConfig::default();
-
-    let adb = AccountsDb::new(&config, directory.path(), 0)
-        .expect("expected to initialize ADB")
-        .into();
-    (adb, directory)
-}
+impl TestEnv {
+    fn new() -> Self {
+        let (adb, _directory) = Self::init_raw_db();
+        Self { adb, _directory }
+    }
 
-fn init_test_env() -> AdbTestEnv {
-    let (adb, _directory) = init_db();
-    AdbTestEnv { adb, _directory }
-}
+    fn init_raw_db() -> (Arc<AccountsDb>, TempDir) {
+        let _ = env_logger::builder()
+            .filter_level(log::LevelFilter::Info)
+            .is_test(true)
+            .try_init();
+
+        let dir = tempfile::tempdir().expect("temp dir creation failed");
+        let config = AccountsDbConfig::default();
 
-impl AdbTestEnv {
-    fn account(&self) -> AccountWithPubkey {
-        self.account_with_size(SPACE)
+        let adb = AccountsDb::new(&config, dir.path(), 0)
+            .expect("ADB init failed")
+            .into();
+
+        (adb, dir)
     }
 
-    fn account_with_size(&self, size: usize) -> AccountWithPubkey {
+    fn new_account_obj(&self, size: usize) -> AccountWithPubkey {
         let pubkey = Pubkey::new_unique();
         let mut account = AccountSharedData::new(LAMPORTS, size, &OWNER);
-        account.data_as_mut_slice()[..INIT_DATA_LEN]
-            .copy_from_slice(ACCOUNT_DATA);
-        self.adb
-            .insert_account(&pubkey, &account)
- .expect("failed to insert account"); - let account = self - .get_account(&pubkey) - .expect("failed to refetch newly inserted account"); + // Fill with some data + if size >= INIT_DATA_LEN { + account.data_as_mut_slice()[..INIT_DATA_LEN] + .copy_from_slice(ACCOUNT_DATA); + } AccountWithPubkey { pubkey, account } } - fn set_slot(&self, slot: u64) { - self.adb.set_slot(slot); - while Arc::strong_count(&self.adb) > 1 { - std::thread::yield_now(); + fn create_and_insert_account(&self) -> AccountWithPubkey { + let acc = self.new_account_obj(SPACE); + self.adb.insert_account(&acc.pubkey, &acc.account).unwrap(); + // Re-fetch to ensure we have the stored state + let stored = self.adb.get_account(&acc.pubkey).unwrap(); + AccountWithPubkey { + pubkey: acc.pubkey, + account: stored, + } + } + + fn create_account_with_size(&self, size: usize) -> AccountWithPubkey { + let acc = self.new_account_obj(size); + self.adb.insert_account(&acc.pubkey, &acc.account).unwrap(); + AccountWithPubkey { + pubkey: acc.pubkey, + account: acc.account, } } - fn ensure_at_most(mut self, slot: u64) -> Self { - let mut accountsdb = Arc::try_unwrap(self.adb) - .expect("this is the only Arc reference to accountsdb"); - accountsdb - .ensure_at_most(slot) - .expect("failed to rollback accounts database"); - self.adb = Arc::new(accountsdb); + fn advance_slot(&self, target_slot: u64) { + self.adb.set_slot(target_slot); + + // Simple spin-wait if we expect a snapshot trigger. + // This ensures the background thread has started and possibly finished creating the file. + if target_slot.is_multiple_of(self.adb.snapshot_frequency) { + let mut retries = 0; + while !self.adb.snapshot_exists(target_slot) && retries < 50 { + std::thread::sleep(std::time::Duration::from_millis(10)); + retries += 1; + } + } + } + + fn restore_to_slot(mut self, slot: u64) -> Self { + // Robustly wait for background threads (snapshots) to release the Arc. 
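+        // `Arc::try_unwrap` only succeeds once every other clone (typically one
+        // held by a snapshot worker) has been dropped, hence the bounded
+        // sleep-and-retry loop rather than an immediate failure.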
+        let mut retries = 0;
+        let mut inner = loop {
+            match Arc::try_unwrap(self.adb) {
+                Ok(inner) => break inner,
+                Err(adb) => {
+                    if retries > 50 {
+                        // Panic if still shared after ~1 second
+                        panic!("Cannot restore: DB is shared (background snapshot thread likely still running)");
+                    }
+                    self.adb = adb; // Put it back to retry
+                    std::thread::sleep(std::time::Duration::from_millis(20));
+                    retries += 1;
+                }
+            }
+        };
+
+        inner.restore_state_if_needed(slot).unwrap();
+        self.adb = Arc::new(inner);
         self
     }
 }
 
-impl Deref for AdbTestEnv {
+// Allow calling AccountsDb methods directly on TestEnv
+impl Deref for TestEnv {
     type Target = Arc<AccountsDb>;
     fn deref(&self) -> &Self::Target {
         &self.adb
diff --git a/magicblock-api/src/tickers.rs b/magicblock-api/src/tickers.rs
index 25a4ac9af..d197fa0b2 100644
--- a/magicblock-api/src/tickers.rs
+++ b/magicblock-api/src/tickers.rs
@@ -120,10 +120,7 @@ pub fn init_system_metrics_ticker(
 }
 fn set_accounts_count(accounts_db: &AccountsDb) {
     metrics::set_accounts_count(
-        accounts_db
-            .get_accounts_count()
-            .try_into()
-            .unwrap_or(i64::MAX),
+        accounts_db.account_count().try_into().unwrap_or(i64::MAX),
     );
 }
diff --git a/magicblock-processor/src/executor/mod.rs b/magicblock-processor/src/executor/mod.rs
index dbd97d7a1..6a278ecff 100644
--- a/magicblock-processor/src/executor/mod.rs
+++ b/magicblock-processor/src/executor/mod.rs
@@ -1,7 +1,7 @@
 use std::sync::{Arc, RwLock};
 
 use log::info;
-use magicblock_accounts_db::{AccountsDb, StWLock};
+use magicblock_accounts_db::{AccountsDb, GlobalWriteLock};
 use magicblock_core::link::{
     accounts::AccountUpdateTx,
     transactions::{
@@ -57,7 +57,7 @@ pub(super) struct TransactionExecutor {
     ready_tx: Sender,
     /// A read lock held during a slot's processing to synchronize with critical global
     /// operations like `AccountsDb` snapshots.
-    sync: StWLock,
+    sync: GlobalWriteLock,
     /// Hacky temporary solution to allow automatic airdrops, the flag
     /// is tightly contolled and will be removed in the nearest future
     /// True when auto airdrop for fee payers is enabled (auto_airdrop_lamports > 0).
@@ -97,7 +97,7 @@ impl TransactionExecutor {
         });
         let this = Self {
             id,
-            sync: state.accountsdb.synchronizer(),
+            sync: state.accountsdb.write_lock(),
             processor,
             accountsdb: state.accountsdb.clone(),
             ledger: state.ledger.clone(),