diff --git a/docs/src/format/table/index/scalar/bitmap.md b/docs/src/format/table/index/scalar/bitmap.md
index 6bcd5aac8c..32df3d0b9f 100644
--- a/docs/src/format/table/index/scalar/bitmap.md
+++ b/docs/src/format/table/index/scalar/bitmap.md
@@ -15,10 +15,10 @@ The bitmap index consists of a single file `bitmap_page_lookup.lance` that store
 
 ### File Schema
 
-| Column    | Type       | Nullable | Description                                                         |
-|-----------|------------|----------|---------------------------------------------------------------------|
-| `keys`    | {DataType} | true     | The unique value from the indexed column                            |
-| `bitmaps` | Binary     | true     | Serialized RowIdTreeMap containing row IDs where this value appears |
+| Column    | Type       | Nullable | Description                                                             |
+|-----------|------------|----------|-------------------------------------------------------------------------|
+| `keys`    | {DataType} | true     | The unique value from the indexed column                                |
+| `bitmaps` | Binary     | true     | Serialized RowAddrTreeMap containing row addrs where this value appears |
 
 ## Accelerated Queries
diff --git a/docs/src/format/table/index/scalar/label_list.md b/docs/src/format/table/index/scalar/label_list.md
index 13c88d39d5..8d50f2638b 100644
--- a/docs/src/format/table/index/scalar/label_list.md
+++ b/docs/src/format/table/index/scalar/label_list.md
@@ -17,10 +17,10 @@ The label list index uses a bitmap index internally and stores its data in:
 
 ### File Schema
 
-| Column    | Type       | Nullable | Description                                                         |
-|-----------|------------|----------|---------------------------------------------------------------------|
-| `keys`    | {DataType} | true     | The unique label value from the indexed column                      |
-| `bitmaps` | Binary     | true     | Serialized RowIdTreeMap containing row IDs where this label appears |
+| Column    | Type       | Nullable | Description                                                              |
+|-----------|------------|----------|--------------------------------------------------------------------------|
+| `keys`    | {DataType} | true     | The unique label value from the indexed column                           |
+| `bitmaps` | Binary     | true     | Serialized RowAddrTreeMap containing row addrs where this label appears  |
 
 ## Accelerated Queries
diff --git a/rust/lance-core/src/utils/mask.rs b/rust/lance-core/src/utils/mask.rs
index f0bf8911de..595825d26e 100644
--- a/rust/lance-core/src/utils/mask.rs
+++ b/rust/lance-core/src/utils/mask.rs
@@ -28,9 +28,9 @@ use super::address::RowAddress;
 #[derive(Clone, Debug, Default, DeepSizeOf)]
 pub struct RowIdMask {
     /// If Some then only these row ids are selected
-    pub allow_list: Option<RowIdTreeMap>,
+    pub allow_list: Option<RowAddrTreeMap>,
     /// If Some then these row ids are not selected.
-    pub block_list: Option<RowIdTreeMap>,
+    pub block_list: Option<RowAddrTreeMap>,
 }
 
 impl RowIdMask {
@@ -42,13 +42,13 @@ impl RowIdMask {
     // Create a mask that doesn't allow anything
     pub fn allow_nothing() -> Self {
         Self {
-            allow_list: Some(RowIdTreeMap::new()),
+            allow_list: Some(RowAddrTreeMap::new()),
             block_list: None,
         }
     }
 
     // Create a mask from an allow list
-    pub fn from_allowed(allow_list: RowIdTreeMap) -> Self {
+    pub fn from_allowed(allow_list: RowAddrTreeMap) -> Self {
         Self {
             allow_list: Some(allow_list),
             block_list: None,
@@ -56,7 +56,7 @@ impl RowIdMask {
     }
 
     // Create a mask from a block list
-    pub fn from_block(block_list: RowIdTreeMap) -> Self {
+    pub fn from_block(block_list: RowAddrTreeMap) -> Self {
         Self {
             allow_list: None,
             block_list: Some(block_list),
@@ -128,7 +128,7 @@ impl RowIdMask {
     }
 
     /// Also block the given ids
-    pub fn also_block(self, block_list: RowIdTreeMap) -> Self {
+    pub fn also_block(self, block_list: RowAddrTreeMap) -> Self {
         if block_list.is_empty() {
             return self;
         }
@@ -146,7 +146,7 @@ impl RowIdMask {
     }
 
     /// Also allow the given ids
-    pub fn also_allow(self, allow_list: RowIdTreeMap) -> Self {
+    pub fn also_allow(self, allow_list: RowAddrTreeMap) -> Self {
         if let Some(existing) = self.allow_list {
             Self {
                 block_list: self.block_list,
@@ -207,14 +207,14 @@ impl RowIdMask {
         let block_list = if array.is_null(0) {
             None
         } else {
-            Some(RowIdTreeMap::deserialize_from(array.value(0)))
+            Some(RowAddrTreeMap::deserialize_from(array.value(0)))
         }
         .transpose()?;
         let allow_list = if array.is_null(1) {
             None
         } else {
-            Some(RowIdTreeMap::deserialize_from(array.value(1)))
+            Some(RowAddrTreeMap::deserialize_from(array.value(1)))
         }
         .transpose()?;
         Ok(Self {
@@ -245,9 +245,9 @@ impl RowIdMask {
     /// list contains "full fragment" blocks but that would require some
     /// extra logic.
     pub fn iter_ids(&self) -> Option<Box<dyn Iterator<Item = RowAddress> + '_>> {
-        if let Some(mut allow_iter) = self.allow_list.as_ref().and_then(|list| list.row_ids()) {
+        if let Some(mut allow_iter) = self.allow_list.as_ref().and_then(|list| list.row_addrs()) {
             if let Some(block_list) = &self.block_list {
-                if let Some(block_iter) = block_list.row_ids() {
+                if let Some(block_iter) = block_list.row_addrs() {
                     let mut block_iter = block_iter.peekable();
                     Some(Box::new(iter::from_fn(move || {
                         for allow_id in allow_iter.by_ref() {
@@ -360,7 +360,9 @@ impl std::ops::BitOr for RowIdMask {
     }
 }
 
-/// A collection of row ids.
+/// A collection of row addresses.
+///
+/// Note: For stable row id mode, this may be split into a separate structure in the future.
 ///
 /// These row ids may either be stable-style (where they can be an incrementing
 /// u64 sequence) or address style, where they are a fragment id and a row offset.
@@ -370,7 +372,7 @@ impl std::ops::BitOr for RowIdMask {
 /// This is similar to a [RoaringTreemap] but it is optimized for the case where
 /// entire fragments are selected or deselected.
 #[derive(Clone, Debug, Default, PartialEq, DeepSizeOf)]
-pub struct RowIdTreeMap {
+pub struct RowAddrTreeMap {
     /// The contents of the set. If there is a pair (k, Full) then the entire
     /// fragment k is selected. If there is a pair (k, Partial(v)) then the
     /// fragment k has the selected rows in v.
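To make the semantics of the renamed structure concrete, here is a minimal sketch built only from operations this patch itself exercises (`insert`, `insert_fragment`, `contains`, `row_addrs`); the address packing follows the struct's doc comment, and the behavior of `contains` on a full-fragment entry is inferred from that comment rather than confirmed by the patch:

```rust
use lance_core::utils::mask::RowAddrTreeMap;

fn main() {
    // A row address packs the fragment id into the upper 32 bits and the
    // row offset into the lower 32 bits of a u64.
    let addr = (3u64 << 32) + 7; // fragment 3, row 7

    let mut set = RowAddrTreeMap::new();
    set.insert(addr);
    assert!(set.contains(addr));

    // Selecting an entire fragment stores a "Full" entry instead of a
    // per-row bitmap, which is what makes this cheaper than a plain
    // RoaringTreemap for whole-fragment selections.
    set.insert_fragment(5);
    assert!(set.contains((5u64 << 32) + 123)); // any row in fragment 5

    // A full-fragment entry cannot be enumerated, so `row_addrs` now
    // returns None (the same reason `RowIdMask::iter_ids` can bail out).
    assert!(set.row_addrs().is_none());
}
```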
@@ -417,7 +419,7 @@ impl RowAddrSelection {
     }
 }
 
-impl RowIdTreeMap {
+impl RowAddrTreeMap {
     /// Create an empty set
     pub fn new() -> Self {
         Self::default()
@@ -440,11 +442,11 @@
             .try_fold(0_u64, |acc, next| next.map(|next| next + acc))
     }
 
-    /// An iterator of row ids
+    /// An iterator of row addrs
     ///
     /// If there are any "full fragment" items then this can't be calculated and None
     /// is returned
-    pub fn row_ids(&self) -> Option<impl Iterator<Item = RowAddress> + '_> {
+    pub fn row_addrs(&self) -> Option<impl Iterator<Item = RowAddress> + '_> {
         let inner_iters = self
             .inner
             .iter()
@@ -469,9 +471,9 @@
     /// Returns true if the value was not already in the set.
     ///
     /// ```rust
-    /// use lance_core::utils::mask::RowIdTreeMap;
+    /// use lance_core::utils::mask::RowAddrTreeMap;
     ///
-    /// let mut set = RowIdTreeMap::new();
+    /// let mut set = RowAddrTreeMap::new();
     /// assert_eq!(set.insert(10), true);
     /// assert_eq!(set.insert(10), false);
     /// assert_eq!(set.contains(10), true);
@@ -712,7 +714,7 @@
     }
 }
 
-impl std::ops::BitOr for RowIdTreeMap {
+impl std::ops::BitOr for RowAddrTreeMap {
     type Output = Self;
 
     fn bitor(mut self, rhs: Self) -> Self::Output {
@@ -721,7 +723,7 @@
     }
 }
 
-impl std::ops::BitOrAssign for RowIdTreeMap {
+impl std::ops::BitOrAssign for RowAddrTreeMap {
     fn bitor_assign(&mut self, rhs: Self) {
         for (fragment, rhs_set) in &rhs.inner {
             let lhs_set = self.inner.get_mut(fragment);
@@ -746,7 +748,7 @@
     }
 }
 
-impl std::ops::BitAnd for RowIdTreeMap {
+impl std::ops::BitAnd for RowAddrTreeMap {
     type Output = Self;
 
     fn bitand(mut self, rhs: Self) -> Self::Output {
@@ -755,7 +757,7 @@
     }
 }
 
-impl std::ops::BitAndAssign<&Self> for RowIdTreeMap {
+impl std::ops::BitAndAssign<&Self> for RowAddrTreeMap {
     fn bitand_assign(&mut self, rhs: &Self) {
         // Remove fragment that aren't on the RHS
         self.inner
@@ -784,7 +786,7 @@
     }
 }
 
-impl std::ops::Sub for RowIdTreeMap {
+impl std::ops::Sub for RowAddrTreeMap {
     type Output = Self;
 
     fn sub(mut self, rhs: Self) -> Self {
@@ -793,7 +795,7 @@
     }
 }
 
-impl std::ops::SubAssign<&Self> for RowIdTreeMap {
+impl std::ops::SubAssign<&Self> for RowAddrTreeMap {
     fn sub_assign(&mut self, rhs: &Self) {
         for (fragment, rhs_set) in &rhs.inner {
             match self.inner.get_mut(fragment) {
@@ -828,7 +830,7 @@
     }
 }
 
-impl FromIterator<u64> for RowIdTreeMap {
+impl FromIterator<u64> for RowAddrTreeMap {
     fn from_iter<T: IntoIterator<Item = u64>>(iter: T) -> Self {
         let mut inner = BTreeMap::new();
         for row_id in iter {
@@ -852,13 +854,13 @@
     }
 }
 
-impl<'a> FromIterator<&'a u64> for RowIdTreeMap {
+impl<'a> FromIterator<&'a u64> for RowAddrTreeMap {
     fn from_iter<T: IntoIterator<Item = &'a u64>>(iter: T) -> Self {
         Self::from_iter(iter.into_iter().copied())
     }
 }
 
-impl From<Range<u64>> for RowIdTreeMap {
+impl From<Range<u64>> for RowAddrTreeMap {
     fn from(range: Range<u64>) -> Self {
         let mut map = Self::default();
         map.insert_range(range);
@@ -866,7 +868,7 @@
     }
 }
 
-impl From<RoaringTreemap> for RowIdTreeMap {
+impl From<RoaringTreemap> for RowAddrTreeMap {
     fn from(roaring: RoaringTreemap) -> Self {
         let mut inner = BTreeMap::new();
         for (fragment, set) in roaring.bitmaps() {
@@ -876,7 +878,7 @@
     }
 }
 
-impl Extend<u64> for RowIdTreeMap {
+impl Extend<u64> for RowAddrTreeMap {
     fn extend<T: IntoIterator<Item = u64>>(&mut self, iter: T) {
         for row_id in iter {
            let upper = (row_id >> 32) as u32;
@@ -898,14 +900,14 @@ impl Extend<u64> for RowAddrTreeMap {
     }
 }
 
-impl<'a> Extend<&'a u64> for RowIdTreeMap {
+impl<'a> Extend<&'a u64> for RowAddrTreeMap {
     fn extend<T: IntoIterator<Item = &'a u64>>(&mut self, iter: T) {
         self.extend(iter.into_iter().copied())
     }
 }
 
-// Extending with RowIdTreeMap is basically a cumulative set union
-impl Extend<RowIdTreeMap> for RowIdTreeMap {
+// Extending with RowAddrTreeMap is basically a cumulative set union
+impl Extend<RowAddrTreeMap> for RowAddrTreeMap {
     fn extend<T: IntoIterator<Item = Self>>(&mut self, iter: T) {
         for other in iter {
             for (fragment, set) in other.inner {
@@ -940,38 +942,38 @@ mod tests {
         let mask = RowIdMask::default();
         assert!(mask.selected(1));
         assert!(mask.selected(5));
-        let block_list = mask.also_block(RowIdTreeMap::from_iter(&[0, 5, 15]));
+        let block_list = mask.also_block(RowAddrTreeMap::from_iter(&[0, 5, 15]));
         assert!(block_list.selected(1));
         assert!(!block_list.selected(5));
-        let allow_list = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[0, 2, 5]));
+        let allow_list = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[0, 2, 5]));
         assert!(!allow_list.selected(1));
         assert!(allow_list.selected(5));
         let combined = block_list & allow_list;
         assert!(combined.selected(2));
         assert!(!combined.selected(0));
         assert!(!combined.selected(5));
-        let other = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[3]));
+        let other = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[3]));
         let combined = combined | other;
         assert!(combined.selected(2));
         assert!(combined.selected(3));
         assert!(!combined.selected(0));
         assert!(!combined.selected(5));
-        let block_list = RowIdMask::from_block(RowIdTreeMap::from_iter(&[0]));
-        let allow_list = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[3]));
+        let block_list = RowIdMask::from_block(RowAddrTreeMap::from_iter(&[0]));
+        let allow_list = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[3]));
         let combined = block_list | allow_list;
         assert!(combined.selected(1));
     }
 
     #[test]
     fn test_logical_or() {
-        let allow1 = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[5, 6, 7, 8, 9]));
-        let block1 = RowIdMask::from_block(RowIdTreeMap::from_iter(&[5, 6]));
+        let allow1 = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[5, 6, 7, 8, 9]));
+        let block1 = RowIdMask::from_block(RowAddrTreeMap::from_iter(&[5, 6]));
         let mixed1 = allow1
             .clone()
             .also_block(block1.block_list.as_ref().unwrap().clone());
-        let allow2 = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[2, 3, 4, 5, 6, 7, 8]));
-        let block2 = RowIdMask::from_block(RowIdTreeMap::from_iter(&[4, 5]));
+        let allow2 = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[2, 3, 4, 5, 6, 7, 8]));
+        let block2 = RowIdMask::from_block(RowAddrTreeMap::from_iter(&[4, 5]));
         let mixed2 = allow2
             .clone()
             .also_block(block2.block_list.as_ref().unwrap().clone());
@@ -1018,7 +1020,7 @@ mod tests {
         ];
 
         for range in ranges {
-            let mut mask = RowIdTreeMap::default();
+            let mut mask = RowAddrTreeMap::default();
             let count = mask.insert_range(range.clone());
             let expected = range.end - range.start;
@@ -1032,7 +1034,7 @@ mod tests {
             assert_eq!(count, 5);
         }
 
-        let mut mask = RowIdTreeMap::default();
+        let mut mask = RowAddrTreeMap::default();
         let count = mask.insert_range(..10);
         assert_eq!(count, 10);
         assert!(mask.contains(0));
@@ -1047,7 +1049,7 @@ mod tests {
 
     #[test]
     fn test_map_remove() {
-        let mut mask = RowIdTreeMap::default();
+        let mut mask = RowAddrTreeMap::default();
 
         assert!(!mask.remove(20));
 
@@ -1073,7 +1075,7 @@ mod tests {
             0..10
         )
     ) {
-        let mut mask = RowIdTreeMap::default();
+        let mut mask = RowAddrTreeMap::default();
         for (fragment, rows) in values {
             if let Some(rows) = rows {
                 let bitmap = RoaringBitmap::from_iter(rows);
@@ -1085,7 +1087,7 @@ mod tests {
         let mut data = Vec::new();
         mask.serialize_into(&mut data).unwrap();
 
-        let deserialized = RowIdTreeMap::deserialize_from(data.as_slice()).unwrap();
+        let deserialized = RowAddrTreeMap::deserialize_from(data.as_slice()).unwrap();
 
         prop_assert_eq!(mask, deserialized);
     }
@@ -1096,19 +1098,19 @@ mod tests {
         right_full_fragments in proptest::collection::vec(0..u32::MAX, 0..10),
         right_rows in proptest::collection::vec(0..u64::MAX, 0..1000),
     ) {
-        let mut left = RowIdTreeMap::default();
+        let mut left = RowAddrTreeMap::default();
         for fragment in left_full_fragments.clone() {
             left.insert_fragment(fragment);
         }
         left.extend(left_rows.iter().copied());
 
-        let mut right = RowIdTreeMap::default();
+        let mut right = RowAddrTreeMap::default();
         for fragment in right_full_fragments.clone() {
             right.insert_fragment(fragment);
         }
         right.extend(right_rows.iter().copied());
 
-        let mut expected = RowIdTreeMap::default();
+        let mut expected = RowAddrTreeMap::default();
         for fragment in &left_full_fragments {
             if right_full_fragments.contains(fragment) {
                 expected.insert_fragment(*fragment);
@@ -1137,19 +1139,19 @@ mod tests {
         right_full_fragments in proptest::collection::vec(0..u32::MAX, 0..10),
         right_rows in proptest::collection::vec(0..u64::MAX, 0..1000),
     ) {
-        let mut left = RowIdTreeMap::default();
+        let mut left = RowAddrTreeMap::default();
         for fragment in left_full_fragments.clone() {
             left.insert_fragment(fragment);
         }
         left.extend(left_rows.iter().copied());
 
-        let mut right = RowIdTreeMap::default();
+        let mut right = RowAddrTreeMap::default();
         for fragment in right_full_fragments.clone() {
             right.insert_fragment(fragment);
         }
         right.extend(right_rows.iter().copied());
 
-        let mut expected = RowIdTreeMap::default();
+        let mut expected = RowAddrTreeMap::default();
         for fragment in left_full_fragments {
             expected.insert_fragment(fragment);
         }
@@ -1180,13 +1182,13 @@ mod tests {
         left_rows in proptest::collection::vec(0..u64::MAX, 0..1000),
         right_rows in proptest::collection::vec(0..u64::MAX, 0..1000),
     ) {
-        let mut left = RowIdTreeMap::default();
+        let mut left = RowAddrTreeMap::default();
         for fragment in left_full_fragments {
             left.insert_fragment(fragment);
         }
         left.extend(left_rows.iter().copied());
 
-        let mut right = RowIdTreeMap::default();
+        let mut right = RowAddrTreeMap::default();
         right.extend(right_rows.iter().copied());
 
         let mut expected = left.clone();
@@ -1204,13 +1206,13 @@ mod tests {
         right_full_fragments in proptest::collection::vec(0..u32::MAX, 0..10),
         left_rows in proptest::collection::vec(0..u64::MAX, 0..1000),
     ) {
-        let mut left = RowIdTreeMap::default();
+        let mut left = RowAddrTreeMap::default();
         for fragment in left_full_fragments {
             left.insert_fragment(fragment);
         }
         left.extend(left_rows.iter().copied());
 
-        let mut right = RowIdTreeMap::default();
+        let mut right = RowAddrTreeMap::default();
         for fragment in right_full_fragments.clone() {
             right.insert_fragment(fragment);
         }
@@ -1232,7 +1234,7 @@ mod tests {
         assert!(mask.iter_ids().is_none());
 
         // Test with just an allow list
-        let mut allow_list = RowIdTreeMap::default();
+        let mut allow_list = RowAddrTreeMap::default();
         allow_list.extend([1, 5, 10].iter().copied());
         mask.allow_list = Some(allow_list);
@@ -1247,7 +1249,7 @@ mod tests {
         );
 
         // Test with both allow list and block list
-        let mut block_list = RowIdTreeMap::default();
+        let mut block_list = RowAddrTreeMap::default();
         block_list.extend([5].iter().copied());
         mask.block_list = Some(block_list);
@@ -1261,14 +1263,14 @@ mod tests {
         );
 
         // Test with full fragment in block list
-        let mut block_list = RowIdTreeMap::default();
+        let mut block_list = RowAddrTreeMap::default();
         block_list.insert_fragment(0);
         mask.block_list = Some(block_list);
         assert!(mask.iter_ids().is_none());
 
         // Test with full fragment in allow list
         mask.block_list = None;
-        let mut allow_list = RowIdTreeMap::default();
+        let mut allow_list = RowAddrTreeMap::default();
         allow_list.insert_fragment(0);
         mask.allow_list = Some(allow_list);
         assert!(mask.iter_ids().is_none());
diff --git a/rust/lance-index/src/frag_reuse.rs b/rust/lance-index/src/frag_reuse.rs
index 658e784a7e..922a59776a 100644
--- a/rust/lance-index/src/frag_reuse.rs
+++ b/rust/lance-index/src/frag_reuse.rs
@@ -8,7 +8,7 @@ use arrow_array::{Array, ArrayRef, PrimitiveArray, RecordBatch, UInt64Array};
 use async_trait::async_trait;
 use deepsize::{Context, DeepSizeOf};
 use itertools::Itertools;
-use lance_core::utils::mask::RowIdTreeMap;
+use lance_core::utils::mask::RowAddrTreeMap;
 use lance_core::{Error, Result};
 use lance_table::format::pb::fragment_reuse_index_details::InlineContent;
 use lance_table::format::{pb, ExternalFile, Fragment};
@@ -245,8 +245,8 @@ impl FragReuseIndex {
         mapped_value
     }
 
-    pub fn remap_row_ids_tree_map(&self, row_ids: &RowIdTreeMap) -> RowIdTreeMap {
-        RowIdTreeMap::from_iter(row_ids.row_ids().unwrap().filter_map(|addr| {
+    pub fn remap_row_addrs_tree_map(&self, row_addrs: &RowAddrTreeMap) -> RowAddrTreeMap {
+        RowAddrTreeMap::from_iter(row_addrs.row_addrs().unwrap().filter_map(|addr| {
             let addr_as_u64 = u64::from(addr);
             self.remap_row_id(addr_as_u64)
         }))
diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs
index 69b5ee35cf..bc9e7efaf9 100644
--- a/rust/lance-index/src/scalar.rs
+++ b/rust/lance-index/src/scalar.rs
@@ -19,7 +19,7 @@ use datafusion_expr::expr::ScalarFunction;
 use datafusion_expr::Expr;
 use deepsize::DeepSizeOf;
 use inverted::query::{fill_fts_query_column, FtsQuery, FtsQueryNode, FtsSearchParams, MatchQuery};
-use lance_core::utils::mask::RowIdTreeMap;
+use lance_core::utils::mask::RowAddrTreeMap;
 use lance_core::{Error, Result};
 use serde::Serialize;
 use snafu::location;
@@ -684,24 +684,24 @@ impl AnyQuery for TokenQuery {
 #[derive(Debug, PartialEq)]
 pub enum SearchResult {
     /// The exact row ids that satisfy the query
-    Exact(RowIdTreeMap),
+    Exact(RowAddrTreeMap),
     /// Any row id satisfying the query will be in this set but not every
     /// row id in this set will satisfy the query, a further recheck step
     /// is needed
-    AtMost(RowIdTreeMap),
+    AtMost(RowAddrTreeMap),
     /// All of the given row ids satisfy the query but there may be more
     ///
     /// No scalar index actually returns this today but it can arise from
     /// boolean operations (e.g. NOT(AtMost(x)) == AtLeast(NOT(x)))
-    AtLeast(RowIdTreeMap),
+    AtLeast(RowAddrTreeMap),
 }
 
 impl SearchResult {
-    pub fn row_ids(&self) -> &RowIdTreeMap {
+    pub fn row_addrs(&self) -> &RowAddrTreeMap {
         match self {
-            Self::Exact(row_ids) => row_ids,
-            Self::AtMost(row_ids) => row_ids,
-            Self::AtLeast(row_ids) => row_ids,
+            Self::Exact(row_addrs) => row_addrs,
+            Self::AtMost(row_addrs) => row_addrs,
+            Self::AtLeast(row_addrs) => row_addrs,
         }
     }
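Since `NOT(AtMost(x)) == AtLeast(NOT(x))` is currently the only way an `AtLeast` can arise, the distinction between the three variants matters mostly to callers. A minimal consumer sketch (the module path `lance_index::scalar::SearchResult` is assumed from the test code in this patch, which refers to `crate::scalar::SearchResult`):

```rust
use lance_core::utils::mask::RowIdMask;
use lance_index::scalar::SearchResult;

/// Fold an index search result into a RowIdMask, tracking whether the
/// caller still has to re-evaluate the predicate on the surviving rows.
fn to_mask(result: SearchResult) -> (RowIdMask, /* needs recheck */ bool) {
    match result {
        // The index answered the query exactly; no recheck needed.
        SearchResult::Exact(addrs) => (RowIdMask::from_allowed(addrs), false),
        // Superset of the true answer: narrow to it, then recheck.
        SearchResult::AtMost(addrs) => (RowIdMask::from_allowed(addrs), true),
        // Subset of the true answer: nothing can be pruned, so keep the
        // default allow-everything mask and recheck all rows.
        SearchResult::AtLeast(_) => (RowIdMask::default(), true),
    }
}
```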
diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs
index 370f9ed8ef..b1e13409c9 100644
--- a/rust/lance-index/src/scalar/bitmap.rs
+++ b/rust/lance-index/src/scalar/bitmap.rs
@@ -21,7 +21,7 @@ use futures::{stream, StreamExt, TryStreamExt};
 use lance_core::{
     cache::{CacheKey, LanceCache, WeakLanceCache},
     error::LanceOptionExt,
-    utils::{mask::RowIdTreeMap, tokio::get_num_compute_intensive_cpus},
+    utils::{mask::RowAddrTreeMap, tokio::get_num_compute_intensive_cpus},
     Error, Result, ROW_ID,
 };
 use roaring::RoaringBitmap;
@@ -100,7 +100,7 @@ pub struct BitmapIndex {
     /// for quickly locating the row and reading it out
     index_map: BTreeMap<OrderableScalarValue, u64>,
 
-    null_map: Arc<RowIdTreeMap>,
+    null_map: Arc<RowAddrTreeMap>,
 
     value_type: DataType,
@@ -119,7 +119,7 @@ pub struct BitmapKey {
 }
 
 impl CacheKey for BitmapKey {
-    type ValueType = RowIdTreeMap;
+    type ValueType = RowAddrTreeMap;
 
     fn key(&self) -> std::borrow::Cow<'_, str> {
         format!("{}", self.value.0).into()
@@ -129,7 +129,7 @@ impl BitmapIndex {
     fn new(
         index_map: BTreeMap<OrderableScalarValue, u64>,
-        null_map: Arc<RowIdTreeMap>,
+        null_map: Arc<RowAddrTreeMap>,
         value_type: DataType,
         store: Arc<dyn IndexStore>,
         index_cache: WeakLanceCache,
@@ -160,7 +160,7 @@
             let data_type = schema.fields[0].data_type();
             return Ok(Arc::new(Self::new(
                 BTreeMap::new(),
-                Arc::new(RowIdTreeMap::default()),
+                Arc::new(RowAddrTreeMap::default()),
                 data_type,
                 store,
                 WeakLanceCache::from(index_cache),
@@ -169,7 +169,7 @@
         }
 
         let mut index_map: BTreeMap<OrderableScalarValue, u64> = BTreeMap::new();
-        let mut null_map = Arc::new(RowIdTreeMap::default());
+        let mut null_map = Arc::new(RowAddrTreeMap::default());
         let mut value_type: Option<DataType> = None;
         let mut null_location: Option<u64> = None;
         let mut row_offset = 0;
@@ -217,11 +217,11 @@
                 location: location!(),
             })?;
             let bitmap_bytes = binary_bitmaps.value(0);
-            let mut bitmap = RowIdTreeMap::deserialize_from(bitmap_bytes).unwrap();
+            let mut bitmap = RowAddrTreeMap::deserialize_from(bitmap_bytes).unwrap();
 
             // Apply fragment remapping if needed
             if let Some(fri) = &frag_reuse_index {
-                bitmap = fri.remap_row_ids_tree_map(&bitmap);
+                bitmap = fri.remap_row_addrs_tree_map(&bitmap);
             }
 
             null_map = Arc::new(bitmap);
@@ -243,7 +243,7 @@
         &self,
         key: &OrderableScalarValue,
         metrics: Option<&dyn MetricsCollector>,
-    ) -> Result<Arc<RowIdTreeMap>> {
+    ) -> Result<Arc<RowAddrTreeMap>> {
         if key.0.is_null() {
             return Ok(self.null_map.clone());
         }
@@ -261,7 +261,7 @@
         let row_offset = match self.index_map.get(key) {
             Some(loc) => *loc,
-            None => return Ok(Arc::new(RowIdTreeMap::default())),
+            None => return Ok(Arc::new(RowAddrTreeMap::default())),
         };
 
         let page_lookup_file = self.lazy_reader.get().await?;
@@ -278,10 +278,10 @@
             location: location!(),
         })?;
         let bitmap_bytes = binary_bitmaps.value(0); // First (and only) row
-        let mut bitmap = RowIdTreeMap::deserialize_from(bitmap_bytes).unwrap();
+        let mut bitmap = RowAddrTreeMap::deserialize_from(bitmap_bytes).unwrap();
 
         if let Some(fri) = &self.frag_reuse_index {
-            bitmap = fri.remap_row_ids_tree_map(&bitmap);
+            bitmap = fri.remap_row_addrs_tree_map(&bitmap);
         }
 
         self.index_cache
@@ -358,10 +358,10 @@ impl Index for BitmapIndex {
             }
 
             let bitmap_bytes = bitmap_binary_array.value(idx);
-            let mut bitmap = RowIdTreeMap::deserialize_from(bitmap_bytes).unwrap();
+            let mut bitmap = RowAddrTreeMap::deserialize_from(bitmap_bytes).unwrap();
 
             if let Some(frag_reuse_index_ref) = self.frag_reuse_index.as_ref() {
-                bitmap = frag_reuse_index_ref.remap_row_ids_tree_map(&bitmap);
+                bitmap = frag_reuse_index_ref.remap_row_addrs_tree_map(&bitmap);
             }
 
             let cache_key = BitmapKey { value: key };
@@ -436,7 +436,7 @@ impl ScalarIndex for BitmapIndex {
                 metrics.record_comparisons(keys.len());
 
                 if keys.is_empty() {
-                    RowIdTreeMap::default()
+                    RowAddrTreeMap::default()
                 } else {
                     let bitmaps: Vec<_> = stream::iter(keys.into_iter().map(|key| {
                         let this = self.clone();
@@ -447,7 +447,7 @@
                     .await?;
 
                     let bitmap_refs: Vec<_> = bitmaps.iter().map(|b| b.as_ref()).collect();
-                    RowIdTreeMap::union_all(&bitmap_refs)
+                    RowAddrTreeMap::union_all(&bitmap_refs)
                 }
             }
             SargableQuery::IsIn(values) => {
@@ -473,7 +473,7 @@
                     .collect();
 
                 if keys.is_empty() && (!has_null || self.null_map.is_empty()) {
-                    RowIdTreeMap::default()
+                    RowAddrTreeMap::default()
                 } else {
                     // Load bitmaps in parallel
                     let mut bitmaps: Vec<_> = stream::iter(keys.into_iter().map(|key| {
@@ -490,11 +490,11 @@
                     }
 
                     if bitmaps.is_empty() {
-                        RowIdTreeMap::default()
+                        RowAddrTreeMap::default()
                     } else {
-                        // Convert Arc<RowIdTreeMap> to &RowIdTreeMap for union_all
+                        // Convert Arc<RowAddrTreeMap> to &RowAddrTreeMap for union_all
                         let bitmap_refs: Vec<_> = bitmaps.iter().map(|b| b.as_ref()).collect();
-                        RowIdTreeMap::union_all(&bitmap_refs)
+                        RowAddrTreeMap::union_all(&bitmap_refs)
                     }
                 }
             }
@@ -528,7 +528,7 @@
         for key in self.index_map.keys() {
             let bitmap = self.load_bitmap(key, None).await?;
             let remapped_bitmap =
-                RowIdTreeMap::from_iter(bitmap.row_ids().unwrap().filter_map(|addr| {
+                RowAddrTreeMap::from_iter(bitmap.row_addrs().unwrap().filter_map(|addr| {
                     let addr_as_u64 = u64::from(addr);
                     mapping
                         .get(&addr_as_u64)
@@ -540,7 +540,7 @@
         if !self.null_map.is_empty() {
             let remapped_null =
-                RowIdTreeMap::from_iter(self.null_map.row_ids().unwrap().filter_map(|addr| {
+                RowAddrTreeMap::from_iter(self.null_map.row_addrs().unwrap().filter_map(|addr| {
                     let addr_as_u64 = u64::from(addr);
                     mapping
                         .get(&addr_as_u64)
@@ -616,7 +616,7 @@ impl BitmapIndexPlugin {
     }
 
     async fn write_bitmap_index(
-        state: HashMap<ScalarValue, RowIdTreeMap>,
+        state: HashMap<ScalarValue, RowAddrTreeMap>,
         index_store: &dyn IndexStore,
         value_type: &DataType,
     ) -> Result<()> {
@@ -680,7 +680,7 @@ impl BitmapIndexPlugin {
     async fn do_train_bitmap_index(
         mut data_source: SendableRecordBatchStream,
-        mut state: HashMap<ScalarValue, RowIdTreeMap>,
+        mut state: HashMap<ScalarValue, RowAddrTreeMap>,
         index_store: &dyn IndexStore,
     ) -> Result<()> {
         let value_type = data_source.schema().field(0).data_type().clone();
@@ -706,7 +706,7 @@ impl BitmapIndexPlugin {
         index_store: &dyn IndexStore,
     ) -> Result<()> {
         // mapping from item to list of the row ids where it is present
-        let dictionary: HashMap<ScalarValue, RowIdTreeMap> = HashMap::new();
+        let dictionary: HashMap<ScalarValue, RowAddrTreeMap> = HashMap::new();
         Self::do_train_bitmap_index(data, dictionary, index_store).await
     }
@@ -854,7 +854,7 @@
         // Verify results
         let expected_red_rows = vec![0u64, 3, 6, 10, 11];
         if let SearchResult::Exact(row_ids) = result {
-            let mut actual: Vec<u64> = row_ids.row_ids().unwrap().map(|id| id.into()).collect();
+            let mut actual: Vec<u64> = row_ids.row_addrs().unwrap().map(|id| id.into()).collect();
             actual.sort();
             assert_eq!(actual, expected_red_rows);
         } else {
@@ -864,7 +864,7 @@
         // Test 2: Search for "red" again - should hit cache
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
         if let SearchResult::Exact(row_ids) = result {
-            let mut actual: Vec<u64> = row_ids.row_ids().unwrap().map(|id| id.into()).collect();
+            let mut actual: Vec<u64> = row_ids.row_addrs().unwrap().map(|id| id.into()).collect();
             actual.sort();
             assert_eq!(actual, expected_red_rows);
         }
@@ -878,7 +878,7 @@
         let expected_range_rows = vec![1u64, 2, 5, 7, 8, 12, 13];
         if let SearchResult::Exact(row_ids) = result {
-            let mut actual: Vec<u64> = row_ids.row_ids().unwrap().map(|id| id.into()).collect();
+            let mut actual: Vec<u64> = row_ids.row_addrs().unwrap().map(|id| id.into()).collect();
             actual.sort();
             assert_eq!(actual, expected_range_rows);
         }
@@ -892,7 +892,7 @@
         let expected_in_rows = vec![0u64, 3, 4, 6, 9, 10, 11, 14];
         if let SearchResult::Exact(row_ids) = result {
-            let mut actual: Vec<u64> = row_ids.row_ids().unwrap().map(|id| id.into()).collect();
+            let mut actual: Vec<u64> = row_ids.row_addrs().unwrap().map(|id| id.into()).collect();
             actual.sort();
             assert_eq!(actual, expected_in_rows);
         }
@@ -909,7 +909,7 @@
     use arrow_schema::DataType;
     use datafusion_common::ScalarValue;
     use lance_core::cache::LanceCache;
-    use lance_core::utils::mask::RowIdTreeMap;
+    use lance_core::utils::mask::RowAddrTreeMap;
     use lance_io::object_store::ObjectStore;
     use std::collections::HashMap;
     use std::sync::Arc;
@@ -925,7 +925,7 @@
         let mut state = HashMap::new();
         for i in 0..m {
             // Create a bitmap that contains, say, 1000 row IDs.
-            let bitmap = RowIdTreeMap::from_iter(0..per_bitmap_size);
+            let bitmap = RowAddrTreeMap::from_iter(0..per_bitmap_size);
             let key = ScalarValue::UInt32(Some(i));
 
             state.insert(key, bitmap);
@@ -990,12 +990,12 @@
             .await
             .unwrap_or_else(|_| panic!("Key {} should exist", key_val));
 
-            // Convert RowIdTreeMap to a vector for easier assertion
-            let row_ids: Vec<u64> = bitmap.row_ids().unwrap().map(u64::from).collect();
+            // Convert RowAddrTreeMap to a vector for easier assertion
+            let row_addrs: Vec<u64> = bitmap.row_addrs().unwrap().map(u64::from).collect();
 
             // Verify length
             assert_eq!(
-                row_ids.len(),
+                row_addrs.len(),
                 per_bitmap_size as usize,
                 "Bitmap for key {} has wrong size",
                 key_val
@@ -1004,7 +1004,7 @@
             // Verify first few and last few elements
             for i in 0..5.min(per_bitmap_size) {
                 assert!(
-                    row_ids.contains(&i),
+                    row_addrs.contains(&i),
                     "Bitmap for key {} should contain row_id {}",
                     key_val,
                     i
@@ -1013,7 +1013,7 @@
             for i in (per_bitmap_size - 5)..per_bitmap_size {
                 assert!(
-                    row_ids.contains(&i),
+                    row_addrs.contains(&i),
                     "Bitmap for key {} should contain row_id {}",
                     key_val,
                     i
@@ -1023,7 +1023,7 @@
             // Verify exact range
             let expected_range: Vec<u64> = (0..per_bitmap_size).collect();
             assert_eq!(
-                row_ids, expected_range,
+                row_addrs, expected_range,
                 "Bitmap for key {} doesn't contain expected values",
                 key_val
             );
@@ -1031,7 +1031,7 @@
             tracing::info!(
                 "✓ Verified bitmap for key {}: {} rows as expected",
                 key_val,
-                row_ids.len()
+                row_addrs.len()
             );
         }
@@ -1121,7 +1121,7 @@
             .get_with_key::<BitmapKey>(&cache_key_red)
             .await
             .unwrap();
-        let red_rows: Vec<u64> = cached_red.row_ids().unwrap().map(u64::from).collect();
+        let red_rows: Vec<u64> = cached_red.row_addrs().unwrap().map(u64::from).collect();
         assert_eq!(red_rows, vec![0, 3, 6, 10, 11]);
 
         // Call prewarm again - should be idempotent
@@ -1132,7 +1132,7 @@ pub mod tests {
             .get_with_key::<BitmapKey>(&cache_key_red)
             .await
             .unwrap();
-        let red_rows_2: Vec<u64> = cached_red_2.row_ids().unwrap().map(u64::from).collect();
+        let red_rows_2: Vec<u64> = cached_red_2.row_addrs().unwrap().map(u64::from).collect();
         assert_eq!(red_rows_2, vec![0, 3, 6, 10, 11]);
     }
@@ -1247,7 +1247,7 @@
         ];
         let actual_null_addrs: Vec<u64> = reloaded_idx
             .null_map
-            .row_ids()
+            .row_addrs()
             .unwrap()
             .map(u64::from)
             .collect();
@@ -1263,7 +1263,7 @@
             .await
             .unwrap();
         if let crate::scalar::SearchResult::Exact(row_ids) = result {
-            let mut actual: Vec<u64> = row_ids.row_ids().unwrap().map(u64::from).collect();
+            let mut actual: Vec<u64> = row_ids.row_addrs().unwrap().map(u64::from).collect();
             actual.sort();
             let expected: Vec<u64> = vec![
                 RowAddress::new_from_parts(3, 2).into(),
@@ -1279,7 +1279,7 @@
             .await
             .unwrap();
         if let crate::scalar::SearchResult::Exact(row_ids) = result {
-            let mut actual: Vec<u64> = row_ids.row_ids().unwrap().map(u64::from).collect();
+            let mut actual: Vec<u64> = row_ids.row_addrs().unwrap().map(u64::from).collect();
             actual.sort();
             let expected: Vec<u64> = vec![
                 RowAddress::new_from_parts(3, 4).into(),
@@ -1295,7 +1295,7 @@
             .await
             .unwrap();
         if let crate::scalar::SearchResult::Exact(row_ids) = result {
-            let mut actual: Vec<u64> = row_ids.row_ids().unwrap().map(u64::from).collect();
+            let mut actual: Vec<u64> = row_ids.row_addrs().unwrap().map(u64::from).collect();
             actual.sort();
             assert_eq!(
                 actual, expected_null_addrs,
diff --git a/rust/lance-index/src/scalar/bloomfilter.rs b/rust/lance-index/src/scalar/bloomfilter.rs
index 6f38393a53..8b0c8a951a 100644
--- a/rust/lance-index/src/scalar/bloomfilter.rs
+++ b/rust/lance-index/src/scalar/bloomfilter.rs
@@ -18,7 +18,7 @@ use crate::scalar::{
 use crate::{pb, Any};
 use arrow_array::{Array, UInt64Array};
 use lance_core::utils::address::RowAddress;
-use lance_core::utils::mask::RowIdTreeMap;
+use lance_core::utils::mask::RowAddrTreeMap;
 use lance_core::ROW_ADDR;
 use lance_datafusion::chunker::chunk_concat_stream;
 mod as_bytes;
@@ -481,7 +481,7 @@ impl ScalarIndex for BloomFilterIndex {
         metrics.record_comparisons(self.zones.len());
         let query = query.as_any().downcast_ref::<BloomFilterQuery>().unwrap();
-        let mut row_id_tree_map = RowIdTreeMap::new();
+        let mut row_addr_tree_map = RowAddrTreeMap::new();
 
         // For each zone, check if it might contain the queried value
         for block in self.zones.iter() {
@@ -490,11 +490,11 @@
                 let zone_end_addr = zone_start_addr + block.zone_length as u64;
 
                 // Add all row addresses in this zone to the result
-                row_id_tree_map.insert_range(zone_start_addr..zone_end_addr);
+                row_addr_tree_map.insert_range(zone_start_addr..zone_end_addr);
             }
         }
 
-        Ok(SearchResult::AtMost(row_id_tree_map))
+        Ok(SearchResult::AtMost(row_addr_tree_map))
     }
 
     fn can_remap(&self) -> bool {
@@ -1350,7 +1350,7 @@
     use futures::{stream, StreamExt};
     use lance_core::{
         cache::LanceCache,
-        utils::{mask::RowIdTreeMap, tempfile::TempObjDir},
+        utils::{mask::RowAddrTreeMap, tempfile::TempObjDir},
         ROW_ADDR,
     };
     use lance_io::object_store::ObjectStore;
@@ -1426,7 +1426,7 @@
         // Equals query: null (should match nothing, as there are no nulls in empty index)
         let query = BloomFilterQuery::Equals(ScalarValue::Int32(None));
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
-        assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new()));
+        assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new()));
     }
 
     #[tokio::test]
@@ -1481,7 +1481,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should match the block since value 50 is in the range [0, 100)
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..100);
         assert_eq!(result, SearchResult::AtMost(expected));
@@ -1490,7 +1490,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should return empty result since bloom filter correctly filters out this value
-        assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new()));
+        assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new()));
 
         // Test calculate_included_frags
         assert_eq!(
@@ -1575,7 +1575,7 @@
         // Should only match fragment 1 blocks since bloom filter correctly filters
         // Value 150 is only in fragment 1 (values 100-199), not in fragment 0 (values 0-99)
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range((1u64 << 32) + 50..((1u64 << 32) + 100)); // Only the block containing 150
         assert_eq!(result, SearchResult::AtMost(expected));
@@ -1641,7 +1641,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should match all blocks since they all contain NaN values
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..500); // All rows since NaN is in every block
         assert_eq!(result, SearchResult::AtMost(expected));
@@ -1650,7 +1650,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should match only the first block since 5.0 only exists in rows 0-99
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..100);
         assert_eq!(result, SearchResult::AtMost(expected));
@@ -1659,7 +1659,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should match the third block since 250.0 would be in that range if it existed
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(200..300);
         assert_eq!(result, SearchResult::AtMost(expected));
@@ -1668,7 +1668,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should return empty since bloom filter correctly filters out this value
-        assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new()));
+        assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new()));
 
         // Test IsIn query with NaN and finite values
         let query = BloomFilterQuery::IsIn(vec![
@@ -1679,7 +1679,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should match all blocks since they all contain NaN values
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..500);
         assert_eq!(result, SearchResult::AtMost(expected));
     }
@@ -1740,7 +1740,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should match zone 2
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(2000..3000);
         assert_eq!(result, SearchResult::AtMost(expected));
@@ -1749,7 +1749,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should return empty since bloom filter correctly filters out this value
-        assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new()));
+        assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new()));
 
         // Test IsIn query with values from different zones
         let query = BloomFilterQuery::IsIn(vec![
@@ -1761,7 +1761,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should match zones 0, 2, and 7
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..1000); // Zone 0
         expected.insert_range(2000..3000); // Zone 2
         expected.insert_range(7000..8000); // Zone 7
@@ -1819,7 +1819,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should match the first zone
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..100);
         assert_eq!(result, SearchResult::AtMost(expected));
@@ -1828,7 +1828,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should match the second zone
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(100..200);
         assert_eq!(result, SearchResult::AtMost(expected));
@@ -1838,7 +1838,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should return empty since bloom filter correctly filters out this value
-        assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new()));
+        assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new()));
 
         // Test IsIn query with string values
         let query = BloomFilterQuery::IsIn(vec![
@@ -1849,7 +1849,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should match both zones
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..200);
         assert_eq!(result, SearchResult::AtMost(expected));
     }
@@ -1901,7 +1901,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should match the first zone
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..50);
         assert_eq!(result, SearchResult::AtMost(expected));
@@ -1910,7 +1910,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should match the second zone
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(50..100);
         assert_eq!(result, SearchResult::AtMost(expected));
@@ -1919,7 +1919,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should return empty since bloom filter correctly filters out this value
-        assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new()));
+        assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new()));
     }
 
     #[tokio::test]
@@ -1970,7 +1970,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should match the first zone
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..50);
         assert_eq!(result, SearchResult::AtMost(expected));
@@ -1981,7 +1981,7 @@
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
 
         // Should return empty since bloom filter correctly filters out this value
-        assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new()));
+        assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new()));
     }
 
     #[tokio::test]
@@ -2026,21 +2026,21 @@
         // Test search for Date32 value in first zone
         let query = BloomFilterQuery::Equals(ScalarValue::Date32(Some(25)));
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..50);
         assert_eq!(result, SearchResult::AtMost(expected));
 
         // Test search for Date32 value in second zone
         let query = BloomFilterQuery::Equals(ScalarValue::Date32(Some(75)));
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(50..100);
         assert_eq!(result, SearchResult::AtMost(expected));
 
         // Test search for Date32 value that doesn't exist
         let query = BloomFilterQuery::Equals(ScalarValue::Date32(Some(500)));
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
-        assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new()));
+        assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new()));
     }
 
     #[tokio::test]
@@ -2090,7 +2090,7 @@
             None,
         ));
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..50);
         assert_eq!(result, SearchResult::AtMost(expected));
@@ -2101,7 +2101,7 @@
             None,
         ));
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(50..100);
         assert_eq!(result, SearchResult::AtMost(expected));
@@ -2109,7 +2109,7 @@
         let query =
             BloomFilterQuery::Equals(ScalarValue::TimestampNanosecond(Some(999_999_999i64), None));
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
-        assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new()));
+        assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new()));
 
         // Test IsIn query with multiple timestamp values
         let query = BloomFilterQuery::IsIn(vec![
@@ -2118,7 +2118,7 @@
             ScalarValue::TimestampNanosecond(Some(999_999_999i64), None), // Not present
         ]);
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..100); // Should match both zones
         assert_eq!(result, SearchResult::AtMost(expected));
     }
@@ -2169,14 +2169,14 @@
         let first_time = time_values[10];
         let query = BloomFilterQuery::Equals(ScalarValue::Time64Microsecond(Some(first_time)));
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..25);
         assert_eq!(result, SearchResult::AtMost(expected));
 
         // Test search for Time64 value that doesn't exist
         let query = BloomFilterQuery::Equals(ScalarValue::Time64Microsecond(Some(999_999_999i64)));
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
-        assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new()));
+        assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new()));
     }
 
     #[tokio::test]
@@ -2220,14 +2220,14 @@
         // Test a specific equality query
         let query = BloomFilterQuery::Equals(ScalarValue::Int32(Some(500)));
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(500..750); // Should match the zone containing 500
         assert_eq!(result, SearchResult::AtMost(expected));
 
         // Test IsNull query
         let query = BloomFilterQuery::IsNull();
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
-        assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new())); // No nulls in the data
+        assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new())); // No nulls in the data
 
         // Test IsIn query
         let query = BloomFilterQuery::IsIn(vec![
@@ -2235,7 +2235,7 @@
             ScalarValue::Int32(Some(600)),
         ]);
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
-        let mut expected = RowIdTreeMap::new();
+        let mut expected = RowAddrTreeMap::new();
         expected.insert_range(0..250); // Zone containing 100
         expected.insert_range(500..750); // Zone containing 600
         assert_eq!(result, SearchResult::AtMost(expected));
diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs
index 9bdbee5841..291e32169a 100644
--- a/rust/lance-index/src/scalar/btree.rs
+++ b/rust/lance-index/src/scalar/btree.rs
@@ -44,7 +44,7 @@ use lance_core::{
     cache::{CacheKey, LanceCache, WeakLanceCache},
     error::LanceOptionExt,
     utils::{
-        mask::RowIdTreeMap,
+        mask::RowAddrTreeMap,
         tokio::get_num_compute_intensive_cpus,
         tracing::{IO_TYPE_LOAD_SCALAR_PART, TRACE_IO_EVENTS},
     },
@@ -832,7 +832,7 @@ impl BTreeIndex {
         page_number: u32,
         index_reader: LazyIndexReader,
         metrics: &dyn MetricsCollector,
-    ) -> Result<RowIdTreeMap> {
+    ) -> Result<RowAddrTreeMap> {
         let subindex = self.lookup_page(page_number, index_reader, metrics).await?;
         // TODO: If this is an IN query we can perhaps simplify the subindex query by restricting it to the
         // values that might be in the page. E.g. if we are searching for X IN [5, 3, 7] and five is in pages
@@ -1176,7 +1176,7 @@ impl ScalarIndex for BTreeIndex {
             // I/O and compute mixed here but important case is index in cache so
             // use compute intensive thread count
             .buffered(get_num_compute_intensive_cpus())
-            .try_collect::<RowIdTreeMap>()
+            .try_collect::<RowAddrTreeMap>()
             .await?;
         Ok(SearchResult::Exact(row_ids))
     }
@@ -2014,7 +2014,7 @@ mod tests {
     use deepsize::DeepSizeOf;
     use futures::TryStreamExt;
     use lance_core::utils::tempfile::TempObjDir;
-    use lance_core::{cache::LanceCache, utils::mask::RowIdTreeMap};
+    use lance_core::{cache::LanceCache, utils::mask::RowAddrTreeMap};
     use lance_datafusion::{chunker::break_stream, datagen::DatafusionDatagenExt};
     use lance_datagen::{array, gen_batch, ArrayGeneratorExt, BatchCount, RowCount};
     use lance_io::object_store::ObjectStore;
@@ -2165,7 +2165,7 @@ mod tests {
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
         assert_eq!(
             result,
-            SearchResult::Exact(RowIdTreeMap::from_iter(((idx as u64)..1000).step_by(7)))
+            SearchResult::Exact(RowAddrTreeMap::from_iter(((idx as u64)..1000).step_by(7)))
         );
     }
 }
diff --git a/rust/lance-index/src/scalar/flat.rs b/rust/lance-index/src/scalar/flat.rs
index 99fb263921..a350dcb470 100644
--- a/rust/lance-index/src/scalar/flat.rs
+++ b/rust/lance-index/src/scalar/flat.rs
@@ -15,7 +15,7 @@ use datafusion_physical_expr::expressions::{in_list, lit, Column};
 use deepsize::DeepSizeOf;
 use lance_core::error::LanceOptionExt;
 use lance_core::utils::address::RowAddress;
-use lance_core::utils::mask::RowIdTreeMap;
+use lance_core::utils::mask::RowAddrTreeMap;
 use lance_core::{Error, Result, ROW_ID};
 use roaring::RoaringBitmap;
 use snafu::location;
@@ -304,7 +304,7 @@ impl ScalarIndex for FlatIndex {
             .as_any()
             .downcast_ref::<UInt64Array>()
             .expect("Result of arrow_select::filter::filter did not match input type");
-        Ok(SearchResult::Exact(RowIdTreeMap::from_iter(
+        Ok(SearchResult::Exact(RowAddrTreeMap::from_iter(
             matching_ids.values(),
         )))
     }
@@ -372,7 +372,7 @@ mod tests {
         let SearchResult::Exact(actual_row_ids) = actual else {
             panic! {"Expected exact search result"}
         };
-        let expected = RowIdTreeMap::from_iter(expected);
+        let expected = RowAddrTreeMap::from_iter(expected);
         assert_eq!(actual_row_ids, expected);
     }
diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs
index c0896daaf6..4f30040fc3 100644
--- a/rust/lance-index/src/scalar/inverted/index.rs
+++ b/rust/lance-index/src/scalar/inverted/index.rs
@@ -35,7 +35,7 @@ use futures::{stream, FutureExt, StreamExt, TryStreamExt};
 use itertools::Itertools;
 use lance_arrow::{iter_str_array, RecordBatchExt};
 use lance_core::cache::{CacheKey, LanceCache, WeakLanceCache};
-use lance_core::utils::mask::RowIdTreeMap;
+use lance_core::utils::mask::RowAddrTreeMap;
 use lance_core::utils::{
     mask::RowIdMask,
     tracing::{IO_TYPE_LOAD_SCALAR_PART, TRACE_IO_EVENTS},
 };
@@ -547,7 +547,7 @@ impl ScalarIndex for InvertedIndex {
                     .downcast_ref::<UInt64Array>()
                     .unwrap();
                 let row_ids = row_ids.iter().flatten().collect_vec();
-                Ok(SearchResult::AtMost(RowIdTreeMap::from_iter(row_ids)))
+                Ok(SearchResult::AtMost(RowAddrTreeMap::from_iter(row_ids)))
             }
         }
     }
diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs
index cb96961849..08eb510615 100644
--- a/rust/lance-index/src/scalar/label_list.rs
+++ b/rust/lance-index/src/scalar/label_list.rs
@@ -13,7 +13,7 @@ use datafusion_common::ScalarValue;
 use deepsize::DeepSizeOf;
 use futures::{stream::BoxStream, StreamExt, TryStream, TryStreamExt};
 use lance_core::cache::LanceCache;
-use lance_core::{utils::mask::RowIdTreeMap, Error, Result};
+use lance_core::{utils::mask::RowAddrTreeMap, Error, Result};
 use roaring::RoaringBitmap;
 use snafu::location;
 use tracing::instrument;
@@ -41,7 +41,7 @@ trait LabelListSubIndex: ScalarIndex + DeepSizeOf {
         &self,
         query: &dyn AnyQuery,
         metrics: &dyn MetricsCollector,
-    ) -> Result<RowIdTreeMap> {
+    ) -> Result<RowAddrTreeMap> {
         let result = self.search(query, metrics).await?;
         match result {
             SearchResult::Exact(row_ids) => Ok(row_ids),
@@ -118,7 +118,7 @@ impl LabelListIndex {
         &'a self,
         values: &'a Vec<ScalarValue>,
         metrics: &'a dyn MetricsCollector,
-    ) -> BoxStream<'a, Result<RowIdTreeMap>> {
+    ) -> BoxStream<'a, Result<RowAddrTreeMap>> {
         futures::stream::iter(values)
             .then(move |value| {
                 let value_query = SargableQuery::Equals(value.clone());
@@ -129,9 +129,9 @@ impl LabelListIndex {
 
     async fn set_union<'a>(
         &'a self,
-        mut sets: impl TryStream<Ok = RowIdTreeMap, Error = Error> + 'a + Unpin,
+        mut sets: impl TryStream<Ok = RowAddrTreeMap, Error = Error> + 'a + Unpin,
         single_set: bool,
-    ) -> Result<RowIdTreeMap> {
+    ) -> Result<RowAddrTreeMap> {
         let mut union_bitmap = sets.try_next().await?.unwrap();
         if single_set {
             return Ok(union_bitmap);
@@ -144,9 +144,9 @@ impl LabelListIndex {
 
     async fn set_intersection<'a>(
         &'a self,
-        mut sets: impl TryStream<Ok = RowIdTreeMap, Error = Error> + 'a + Unpin,
+        mut sets: impl TryStream<Ok = RowAddrTreeMap, Error = Error> + 'a + Unpin,
         single_set: bool,
-    ) -> Result<RowIdTreeMap> {
+    ) -> Result<RowAddrTreeMap> {
         let mut intersect_bitmap = sets.try_next().await?.unwrap();
         if single_set {
             return Ok(intersect_bitmap);
diff --git a/rust/lance-index/src/scalar/lance_format.rs b/rust/lance-index/src/scalar/lance_format.rs
index 2d6703bbf0..ac63b89c7e 100644
--- a/rust/lance-index/src/scalar/lance_format.rs
+++ b/rust/lance-index/src/scalar/lance_format.rs
@@ -328,7 +328,7 @@ pub mod tests {
     use arrow_select::take::TakeOptions;
     use datafusion_common::ScalarValue;
     use futures::FutureExt;
-    use lance_core::utils::mask::RowIdTreeMap;
+    use lance_core::utils::mask::RowAddrTreeMap;
     use lance_core::utils::tempfile::TempDir;
     use lance_core::ROW_ID;
     use lance_datagen::{array, gen_batch, ArrayGeneratorExt, BatchCount, ByteCount, RowCount};
@@ -402,9 +402,9 @@ pub mod tests {
             .unwrap();
 
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
-        assert_eq!(Some(1), row_ids.len());
-        assert!(row_ids.contains(10000));
+        let row_addrs = result.row_addrs();
+        assert_eq!(Some(1), row_addrs.len());
+        assert!(row_addrs.contains(10000));
 
         let result = index
             .search(
@@ -418,9 +418,9 @@ pub mod tests {
             .unwrap();
 
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
+        let row_addrs = result.row_addrs();
 
-        assert_eq!(Some(0), row_ids.len());
+        assert_eq!(Some(0), row_addrs.len());
 
         let result = index
             .search(
@@ -434,9 +434,9 @@ pub mod tests {
             .unwrap();
 
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
+        let row_addrs = result.row_addrs();
 
-        assert_eq!(Some(100), row_ids.len());
+        assert_eq!(Some(100), row_addrs.len());
     }
 
     #[tokio::test]
@@ -494,10 +494,10 @@ pub mod tests {
             .unwrap();
 
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
+        let row_addrs = result.row_addrs();
 
-        assert_eq!(Some(1), row_ids.len());
-        assert!(row_ids.contains(10000));
+        assert_eq!(Some(1), row_addrs.len());
+        assert!(row_addrs.contains(10000));
 
         let result = updated_index
             .search(
@@ -508,17 +508,17 @@ pub mod tests {
             .unwrap();
 
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
+        let row_addrs = result.row_addrs();
 
-        assert_eq!(Some(1), row_ids.len());
-        assert!(row_ids.contains(500_000));
+        assert_eq!(Some(1), row_addrs.len());
+        assert!(row_addrs.contains(500_000));
     }
 
     async fn check(index: &Arc, query: SargableQuery, expected: &[u64]) {
         let results = index.search(&query, &NoOpMetricsCollector).await.unwrap();
         assert!(results.is_exact());
-        let expected_arr = RowIdTreeMap::from_iter(expected);
-        assert_eq!(results.row_ids(), &expected_arr);
+        let expected_arr = RowAddrTreeMap::from_iter(expected);
+        assert_eq!(results.row_addrs(), &expected_arr);
     }
 
     #[tokio::test]
@@ -823,13 +823,13 @@ pub mod tests {
             .unwrap();
 
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
+        let row_addrs = result.row_addrs();
 
         // The random data may have had duplicates so there might be more than 1 result
         // but even for boolean we shouldn't match the entire thing
-        assert!(!row_ids.is_empty());
-        assert!(row_ids.len().unwrap() < data.num_rows() as u64);
-        assert!(row_ids.contains(sample_row_id));
+        assert!(!row_addrs.is_empty());
+        assert!(row_addrs.len().unwrap() < data.num_rows() as u64);
+        assert!(row_addrs.contains(sample_row_id));
     }
 }
 
@@ -886,17 +886,17 @@ pub mod tests {
             .unwrap();
 
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
+        let row_addrs = result.row_addrs();
 
-        assert!(row_ids.is_empty());
+        assert!(row_addrs.is_empty());
 
         let result = index
             .search(&SargableQuery::IsNull(), &NoOpMetricsCollector)
             .await
             .unwrap();
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
-        assert_eq!(row_ids.len(), Some(4096));
+        let row_addrs = result.row_addrs();
+        assert_eq!(row_addrs.len(), Some(4096));
     }
 
     async fn train_bitmap(
@@ -962,9 +962,9 @@ pub mod tests {
             .unwrap();
 
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
-        assert_eq!(Some(1), row_ids.len());
-        assert!(row_ids.contains(2));
+        let row_addrs = result.row_addrs();
+        assert_eq!(Some(1), row_addrs.len());
+        assert!(row_addrs.contains(2));
 
         let result = index
             .search(
@@ -975,11 +975,11 @@ pub mod tests {
             .unwrap();
 
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
-        assert_eq!(Some(3), row_ids.len());
-        assert!(row_ids.contains(1));
-        assert!(row_ids.contains(3));
-        assert!(row_ids.contains(6));
+        let row_addrs = result.row_addrs();
+        assert_eq!(Some(3), row_addrs.len());
+        assert!(row_addrs.contains(1));
+        assert!(row_addrs.contains(3));
+        assert!(row_addrs.contains(6));
     }
 
     #[tokio::test]
@@ -1004,9 +1004,9 @@ pub mod tests {
             .unwrap();
 
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
-        assert_eq!(Some(1), row_ids.len());
-        assert!(row_ids.contains(10000));
+        let row_addrs = result.row_addrs();
+        assert_eq!(Some(1), row_addrs.len());
+        assert!(row_addrs.contains(10000));
 
         let result = index
             .search(
@@ -1020,8 +1020,8 @@ pub mod tests {
             .unwrap();
 
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
-        assert!(row_ids.is_empty());
+        let row_addrs = result.row_addrs();
+        assert!(row_addrs.is_empty());
 
         let result = index
             .search(
@@ -1035,15 +1035,15 @@ pub mod tests {
             .unwrap();
 
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
-        assert_eq!(Some(100), row_ids.len());
+        let row_addrs = result.row_addrs();
+        assert_eq!(Some(100), row_addrs.len());
     }
 
     async fn check_bitmap(index: &BitmapIndex, query: SargableQuery, expected: &[u64]) {
         let results = index.search(&query, &NoOpMetricsCollector).await.unwrap();
         assert!(results.is_exact());
-        let expected_arr = RowIdTreeMap::from_iter(expected);
-        assert_eq!(results.row_ids(), &expected_arr);
+        let expected_arr = RowAddrTreeMap::from_iter(expected);
+        assert_eq!(results.row_addrs(), &expected_arr);
     }
 
     #[tokio::test]
@@ -1307,9 +1307,9 @@ pub mod tests {
             .unwrap();
 
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
-        assert_eq!(Some(1), row_ids.len());
-        assert!(row_ids.contains(5000));
+        let row_addrs = result.row_addrs();
+        assert_eq!(Some(1), row_addrs.len());
+        assert!(row_addrs.contains(5000));
     }
 
     #[tokio::test]
@@ -1356,7 +1356,7 @@ pub mod tests {
             )
             .await
             .unwrap()
-            .row_ids()
+            .row_addrs()
             .contains(65));
         // Deleted
         assert!(remapped_index
@@ -1366,7 +1366,7 @@ pub mod tests {
             )
             .await
             .unwrap()
-            .row_ids()
+            .row_addrs()
             .is_empty());
         // Not remapped
         assert!(remapped_index
@@ -1376,7 +1376,7 @@ pub mod tests {
             )
            .await
            .unwrap()
-            .row_ids()
+            .row_addrs()
             .contains(3));
     }
 
@@ -1442,10 +1442,10 @@ pub mod tests {
             .unwrap();
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
         assert!(result.is_exact());
-        let row_ids = result.row_ids();
+        let row_addrs = result.row_addrs();
 
-        let row_ids_set = row_ids
-            .row_ids()
+        let row_addrs_set = row_addrs
+            .row_addrs()
             .unwrap()
             .map(u64::from)
             .collect::<HashSet<_>>();
@@ -1459,7 +1459,7 @@ pub mod tests {
             let list = list.unwrap();
             let row_id = row_id.unwrap();
             let vals = list.as_primitive::().values();
-            if row_ids_set.contains(&row_id) {
+            if row_addrs_set.contains(&row_id) {
                 assert!(match_fn(vals));
             } else {
                 assert!(no_match_fn(vals));
diff --git a/rust/lance-index/src/scalar/ngram.rs b/rust/lance-index/src/scalar/ngram.rs
index 15dfec35a6..aec9cc29dc 100644
--- a/rust/lance-index/src/scalar/ngram.rs
+++ b/rust/lance-index/src/scalar/ngram.rs
@@ -38,7 +38,7 @@ use lance_core::utils::address::RowAddress;
 use lance_core::utils::tempfile::TempDir;
 use lance_core::utils::tokio::get_num_compute_intensive_cpus;
 use lance_core::utils::tracing::{IO_TYPE_LOAD_SCALAR_PART, TRACE_IO_EVENTS};
-use lance_core::{utils::mask::RowIdTreeMap, Error};
+use lance_core::{utils::mask::RowAddrTreeMap, Error};
 use lance_core::{Result, ROW_ID};
 use lance_io::object_store::ObjectStore;
 use log::info;
@@ -451,7 +451,7 @@ impl ScalarIndex for NGramIndex {
             TextQuery::StringContains(substr) => {
                 if substr.len() < NGRAM_N {
                     // We know nothing on short searches, need to recheck all
-                    return Ok(SearchResult::AtLeast(RowIdTreeMap::new()));
+                    return Ok(SearchResult::AtLeast(RowAddrTreeMap::new()));
                 }
 
                 let mut row_offsets = Vec::with_capacity(substr.len() * 3);
@@ -466,7 +466,7 @@ impl ScalarIndex for NGramIndex {
                 });
                 // At least one token was missing, so we know there are zero results
                 if missing {
-                    return Ok(SearchResult::Exact(RowIdTreeMap::new()));
+                    return Ok(SearchResult::Exact(RowAddrTreeMap::new()));
                 }
                 let posting_lists = futures::stream::iter(
                     row_offsets
@@ -479,7 +479,7 @@ impl ScalarIndex for NGramIndex {
                 metrics.record_comparisons(posting_lists.len());
                 let list_refs = posting_lists.iter().map(|list| list.as_ref());
                 let row_ids = NGramPostingList::intersect(list_refs);
-                Ok(SearchResult::AtMost(RowIdTreeMap::from(row_ids)))
+                Ok(SearchResult::AtMost(RowAddrTreeMap::from(row_ids)))
             }
         }
     }
@@ -1341,7 +1341,7 @@ mod tests {
     use itertools::Itertools;
     use lance_core::{
         cache::LanceCache,
-        utils::{mask::RowIdTreeMap, tempfile::TempDir},
+        utils::{mask::RowAddrTreeMap, tempfile::TempDir},
         ROW_ID,
     };
     use lance_datagen::{BatchCount, ByteCount, RowCount};
@@ -1487,7 +1487,7 @@ mod tests {
             .await
             .unwrap();
 
-        let expected = SearchResult::AtMost(RowIdTreeMap::from_iter([0, 2, 3]));
+        let expected = SearchResult::AtMost(RowAddrTreeMap::from_iter([0, 2, 3]));
 
         assert_eq!(expected, res);
 
@@ -1499,7 +1499,7 @@ mod tests {
             )
             .await
             .unwrap();
-        let expected = SearchResult::AtMost(RowIdTreeMap::from_iter([8]));
+        let expected = SearchResult::AtMost(RowAddrTreeMap::from_iter([8]));
         assert_eq!(expected, res);
 
         // No matches
@@ -1510,7 +1510,7 @@ mod tests {
             )
             .await
            .unwrap();
-        let expected = SearchResult::Exact(RowIdTreeMap::new());
+        let expected = SearchResult::Exact(RowAddrTreeMap::new());
         assert_eq!(expected, res);
 
         // False positive
@@ -1521,7 +1521,7 @@ mod tests {
             )
             .await
             .unwrap();
-        let expected = SearchResult::AtMost(RowIdTreeMap::from_iter([8]));
+        let expected = SearchResult::AtMost(RowAddrTreeMap::from_iter([8]));
         assert_eq!(expected, res);
 
         // Too short, don't know anything
@@ -1532,7 +1532,7 @@ mod tests {
             )
             .await
             .unwrap();
-        let expected = SearchResult::AtLeast(RowIdTreeMap::new());
+        let expected = SearchResult::AtLeast(RowAddrTreeMap::new());
         assert_eq!(expected, res);
 
         // One short string but we still get at least one trigram, this is ok
@@ -1543,7 +1543,7 @@ mod tests {
             )
             .await
             .unwrap();
-        let expected = SearchResult::AtMost(RowIdTreeMap::from_iter([8]));
+        let expected = SearchResult::AtMost(RowAddrTreeMap::from_iter([8]));
         assert_eq!(expected, res);
     }
 
@@ -1582,7 +1582,7 @@ mod tests {
             )
             .await
             .unwrap();
-        let expected = SearchResult::AtMost(RowIdTreeMap::from_iter([0, 4]));
+        let expected = SearchResult::AtMost(RowAddrTreeMap::from_iter([0, 4]));
         assert_eq!(expected, res);
 
         let null_posting_list = get_null_posting_list(&index).await;
query = query.as_any().downcast_ref::().unwrap(); - let mut row_id_tree_map = RowIdTreeMap::new(); + let mut row_addr_tree_map = RowAddrTreeMap::new(); // Loop through zones and check each one for zone in self.zones.iter() { @@ -564,11 +564,11 @@ impl ScalarIndex for ZoneMapIndex { let zone_end_addr = zone_start_addr + zone.zone_length as u64; // Add all row addresses in this zone to the result - row_id_tree_map.insert_range(zone_start_addr..zone_end_addr); + row_addr_tree_map.insert_range(zone_start_addr..zone_end_addr); } } - Ok(SearchResult::AtMost(row_id_tree_map)) + Ok(SearchResult::AtMost(row_addr_tree_map)) } fn can_remap(&self) -> bool { @@ -1089,7 +1089,7 @@ mod tests { use datafusion_common::ScalarValue; use futures::{stream, StreamExt, TryStreamExt}; use lance_core::utils::tempfile::TempObjDir; - use lance_core::{cache::LanceCache, utils::mask::RowIdTreeMap, ROW_ADDR}; + use lance_core::{cache::LanceCache, utils::mask::RowAddrTreeMap, ROW_ADDR}; use lance_datafusion::datagen::DatafusionDatagenExt; use lance_datagen::ArrayGeneratorExt; use lance_datagen::{array, BatchCount, RowCount}; @@ -1170,7 +1170,7 @@ mod tests { // Equals query: null (should match nothing, as there are no nulls) let query = SargableQuery::Equals(ScalarValue::Int32(None)); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); - assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new())); + assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new())); } #[tokio::test] @@ -1219,8 +1219,8 @@ mod tests { let query = SargableQuery::Equals(ScalarValue::Int32(None)); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); - // Create expected RowIdTreeMap with all zones since they contain null values - let mut expected = RowIdTreeMap::new(); + // Create expected RowAddrTreeMap with all zones since they contain null values + let mut expected = RowAddrTreeMap::new(); for fragment_id in 0..10 { let start = (fragment_id as u64) << 32; let end = start + 5000; @@ -1278,7 +1278,7 @@ mod tests { .unwrap(); // Should match original 10 zones (with nulls) but not the new zone (no nulls) - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); for fragment_id in 0..10 { let start = (fragment_id as u64) << 32; let end = start + 5000; @@ -1294,7 +1294,7 @@ mod tests { .unwrap(); // Should match the new zone (fragment 10) - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); let start = 10u64 << 32; let end = start + 5000; expected.insert_range(start..end); @@ -1376,7 +1376,7 @@ mod tests { let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should match all zones since they all contain NaN values - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(0..500); // All rows since NaN is in every zone assert_eq!(result, SearchResult::AtMost(expected)); @@ -1385,7 +1385,7 @@ mod tests { let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should match only the first zone since 5.0 only exists in rows 0-99 - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(0..100); assert_eq!(result, SearchResult::AtMost(expected)); @@ -1395,7 +1395,7 @@ mod tests { // Since zones contain NaN values, their max will be NaN, so they will be included // as potential matches for any finite target (false positive, but acceptable for zone maps) - let mut expected = RowIdTreeMap::new(); + let mut 
expected = RowAddrTreeMap::new(); expected.insert_range(0..500); assert_eq!(result, SearchResult::AtMost(expected)); @@ -1407,7 +1407,7 @@ mod tests { let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should match the first three zones since they contain values in the range [0, 250] - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(0..300); assert_eq!(result, SearchResult::AtMost(expected)); @@ -1420,7 +1420,7 @@ mod tests { let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should match all zones since they all contain NaN values - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(0..500); assert_eq!(result, SearchResult::AtMost(expected)); @@ -1433,14 +1433,14 @@ mod tests { // Since zones contain NaN values, their max will be NaN, so they will be included // as potential matches for any range query (false positive, but acceptable for zone maps) - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(0..500); assert_eq!(result, SearchResult::AtMost(expected)); // Test IsNull query (should match nothing since there are no null values) let query = SargableQuery::IsNull(); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); - assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new())); + assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new())); // Test range queries with NaN bounds // Range with NaN as start bound (included) @@ -1450,7 +1450,7 @@ mod tests { ); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should match all zones since they all contain NaN values - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(0..500); assert_eq!(result, SearchResult::AtMost(expected)); @@ -1461,7 +1461,7 @@ mod tests { ); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should match all zones since they all contain NaN values - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(0..500); assert_eq!(result, SearchResult::AtMost(expected)); @@ -1472,7 +1472,7 @@ mod tests { ); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should match all zones since everything is less than NaN - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(0..500); assert_eq!(result, SearchResult::AtMost(expected)); @@ -1483,7 +1483,7 @@ mod tests { ); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should match nothing since nothing is greater than NaN - assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new())); + assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new())); // Test IsIn query with mixed float types (Float16, Float32, Float64) let query = SargableQuery::IsIn(vec![ @@ -1494,7 +1494,7 @@ mod tests { ]); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should match all zones since they all contain NaN values - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(0..500); assert_eq!(result, SearchResult::AtMost(expected)); } @@ -1620,7 +1620,7 @@ mod tests { let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); assert_eq!( result, - SearchResult::AtMost(RowIdTreeMap::from_iter(0..=100)) + 
SearchResult::AtMost(RowAddrTreeMap::from_iter(0..=100)) ); // 2. Range query: [0, 50] @@ -1631,7 +1631,7 @@ mod tests { let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); assert_eq!( result, - SearchResult::AtMost(RowIdTreeMap::from_iter(0..=99)) + SearchResult::AtMost(RowAddrTreeMap::from_iter(0..=99)) ); // 3. Range query: [101, 200] (should only match the second zone, which is row 100) @@ -1641,7 +1641,7 @@ mod tests { ); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Only row 100 is in the second zone, but its value is 100, so this should be empty - assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new())); + assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new())); // 4. Range query: [100, 100] (should match only the last row) let query = SargableQuery::Range( @@ -1651,7 +1651,7 @@ mod tests { let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); assert_eq!( result, - SearchResult::AtMost(RowIdTreeMap::from_iter(100..=100)) + SearchResult::AtMost(RowAddrTreeMap::from_iter(100..=100)) ); // 5. Equals query: 0 (should match first row) @@ -1659,7 +1659,7 @@ mod tests { let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); assert_eq!( result, - SearchResult::AtMost(RowIdTreeMap::from_iter(0..100)) + SearchResult::AtMost(RowAddrTreeMap::from_iter(0..100)) ); // 6. Equals query: 100 (should match only last row) @@ -1667,18 +1667,18 @@ mod tests { let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); assert_eq!( result, - SearchResult::AtMost(RowIdTreeMap::from_iter(100..=100)) + SearchResult::AtMost(RowAddrTreeMap::from_iter(100..=100)) ); // 7. Equals query: 101 (should match nothing) let query = SargableQuery::Equals(ScalarValue::Int32(Some(101))); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); - assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new())); + assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new())); // 8. IsNull query (no nulls in data, should match nothing) let query = SargableQuery::IsNull(); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); - assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new())); + assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new())); // 9. IsIn query: [0, 100, 101, 50] let query = SargableQuery::IsIn(vec![ @@ -1691,7 +1691,7 @@ mod tests { // 0 and 50 are in the first zone, 100 in the second, 101 is not present assert_eq!( result, - SearchResult::AtMost(RowIdTreeMap::from_iter(0..=100)) + SearchResult::AtMost(RowAddrTreeMap::from_iter(0..=100)) ); // 10. IsIn query: [101, 102] (should match nothing) @@ -1700,17 +1700,17 @@ mod tests { ScalarValue::Int32(Some(102)), ]); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); - assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new())); + assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new())); // 11. IsIn query: [null] (should match nothing, as there are no nulls) let query = SargableQuery::IsIn(vec![ScalarValue::Int32(None)]); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); - assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new())); + assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new())); // 12. 
Equals query: null (should match nothing, as there are no nulls) let query = SargableQuery::Equals(ScalarValue::Int32(None)); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); - assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new())); + assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new())); } #[tokio::test] @@ -1804,7 +1804,7 @@ mod tests { let query = SargableQuery::Equals(ScalarValue::Int64(Some(1000))); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should match row 1000 in fragment 0: row address = (0 << 32) + 1000 = 1000 - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(0..=8191); assert_eq!(result, SearchResult::AtMost(expected)); @@ -1812,14 +1812,14 @@ mod tests { let query = SargableQuery::Equals(ScalarValue::Int64(Some(9000))); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should match row 9000 in fragment 0: row address = (0 << 32) + 9000 = 9000 - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(8192..=16383); assert_eq!(result, SearchResult::AtMost(expected)); // Search for a value not present in any zone let query = SargableQuery::Equals(ScalarValue::Int64(Some(20000))); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); - assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new())); + assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new())); // Search for a range that spans multiple zones let query = SargableQuery::Range( @@ -1828,7 +1828,7 @@ mod tests { ); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should match all rows from 8000 to 16400 (inclusive) - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(8192..=16425); assert_eq!(result, SearchResult::AtMost(expected)); } @@ -2037,7 +2037,7 @@ mod tests { ); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should include zones from fragments 0 and 1 since they overlap with range 5000-12000 - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); // zone 1 expected.insert_range(5000..8192); // zone 2 @@ -2048,7 +2048,7 @@ mod tests { let query = SargableQuery::Equals(ScalarValue::Int64(Some(8192))); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should include zone 2 since it contains value 8192 - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range((1u64 << 32)..((1u64 << 32) + 5000)); assert_eq!(result, SearchResult::AtMost(expected)); @@ -2056,29 +2056,29 @@ mod tests { let query = SargableQuery::Equals(ScalarValue::Int64(Some(16385))); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); // Should include zone 4 since it contains value 16385 - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(2u64 << 32..((2u64 << 32) + 42)); assert_eq!(result, SearchResult::AtMost(expected)); // Test query that matches nothing let query = SargableQuery::Equals(ScalarValue::Int64(Some(99999))); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); - assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new())); + assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new())); // Test is_in query let query = SargableQuery::IsIn(vec![ScalarValue::Int64(Some(16385))]); let result = 
index.search(&query, &NoOpMetricsCollector).await.unwrap(); - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(2u64 << 32..((2u64 << 32) + 42)); assert_eq!(result, SearchResult::AtMost(expected)); // Test equals query with null let query = SargableQuery::Equals(ScalarValue::Int64(None)); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_range(0..=16425); // expected = {:?}", expected - assert_eq!(result, SearchResult::AtMost(RowIdTreeMap::new())); + assert_eq!(result, SearchResult::AtMost(RowAddrTreeMap::new())); } // Each fragment is its own batch diff --git a/rust/lance-table/src/rowids.rs b/rust/lance-table/src/rowids.rs index c7f97c5d0e..81671e871d 100644 --- a/rust/lance-table/src/rowids.rs +++ b/rust/lance-table/src/rowids.rs @@ -27,7 +27,7 @@ use deepsize::DeepSizeOf; pub use index::FragmentRowIdIndex; pub use index::RowIdIndex; use lance_core::{ - utils::mask::{RowIdMask, RowIdTreeMap}, + utils::mask::{RowAddrTreeMap, RowIdMask}, Error, Result, }; use lance_io::ReadBatchParams; @@ -369,7 +369,7 @@ impl RowIdSequence { for segment in &self.0 { match segment { U64Segment::Range(range) => { - let mut ids = RowIdTreeMap::from(range.clone()); + let mut ids = RowAddrTreeMap::from(range.clone()); ids.mask(mask); ranges.extend(GroupingIterator::new( unsafe { ids.into_addr_iter() }.map(|addr| addr - range.start + offset), @@ -378,7 +378,7 @@ impl RowIdSequence { } U64Segment::RangeWithHoles { range, holes } => { let offset_start = offset; - let mut ids = RowIdTreeMap::from(range.clone()); + let mut ids = RowAddrTreeMap::from(range.clone()); offset += range.end - range.start; for hole in holes.iter() { if ids.remove(hole) { @@ -407,7 +407,7 @@ impl RowIdSequence { ))); } U64Segment::RangeWithBitmap { range, bitmap } => { - let mut ids = RowIdTreeMap::from(range.clone()); + let mut ids = RowAddrTreeMap::from(range.clone()); let offset_start = offset; offset += range.end - range.start; for (i, val) in range.clone().enumerate() { @@ -490,7 +490,7 @@ impl> Iterator for GroupingIterator { } } -impl From<&RowIdSequence> for RowIdTreeMap { +impl From<&RowIdSequence> for RowAddrTreeMap { fn from(row_ids: &RowIdSequence) -> Self { let mut tree_map = Self::new(); for segment in &row_ids.0 { @@ -1003,13 +1003,13 @@ mod test { U64Segment::Range(40..50), ]); - let tree_map = RowIdTreeMap::from(&sequence); + let tree_map = RowAddrTreeMap::from(&sequence); let expected = vec![ 0, 1, 2, 3, 4, 7, 9, 10, 12, 14, 35, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 55, 56, 57, 58, 59, ] .into_iter() - .collect::(); + .collect::(); assert_eq!(tree_map, expected); } @@ -1108,17 +1108,17 @@ mod test { fn test_mask_to_offset_ranges() { // Tests with a simple range segment let sequence = RowIdSequence(vec![U64Segment::Range(0..10)]); - let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[0, 2, 4, 6, 8])); + let mask = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[0, 2, 4, 6, 8])); let ranges = sequence.mask_to_offset_ranges(&mask); assert_eq!(ranges, vec![0..1, 2..3, 4..5, 6..7, 8..9]); let sequence = RowIdSequence(vec![U64Segment::Range(40..60)]); - let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[54])); + let mask = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[54])); let ranges = sequence.mask_to_offset_ranges(&mask); assert_eq!(ranges, vec![14..15]); let sequence = 
RowIdSequence(vec![U64Segment::Range(40..60)]); - let mask = RowIdMask::from_block(RowIdTreeMap::from_iter(&[54])); + let mask = RowIdMask::from_block(RowAddrTreeMap::from_iter(&[54])); let ranges = sequence.mask_to_offset_ranges(&mask); assert_eq!(ranges, vec![0..14, 15..20]); @@ -1128,7 +1128,7 @@ mod test { range: 0..10, holes: vec![2, 6].into(), }]); - let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[0, 2, 4, 6, 8])); + let mask = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[0, 2, 4, 6, 8])); let ranges = sequence.mask_to_offset_ranges(&mask); assert_eq!(ranges, vec![0..1, 3..4, 6..7]); @@ -1136,7 +1136,7 @@ mod test { range: 40..60, holes: vec![47, 43].into(), }]); - let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[44])); + let mask = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[44])); let ranges = sequence.mask_to_offset_ranges(&mask); assert_eq!(ranges, vec![3..4]); @@ -1144,7 +1144,7 @@ mod test { range: 40..60, holes: vec![47, 43].into(), }]); - let mask = RowIdMask::from_block(RowIdTreeMap::from_iter(&[44])); + let mask = RowIdMask::from_block(RowAddrTreeMap::from_iter(&[44])); let ranges = sequence.mask_to_offset_ranges(&mask); assert_eq!(ranges, vec![0..3, 4..18]); @@ -1158,7 +1158,7 @@ mod test { .as_slice() .into(), }]); - let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[0, 2, 4, 6, 8])); + let mask = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[0, 2, 4, 6, 8])); let ranges = sequence.mask_to_offset_ranges(&mask); assert_eq!(ranges, vec![0..1, 2..3, 4..5]); @@ -1166,7 +1166,7 @@ mod test { range: 40..45, bitmap: [true, true, false, false, true].as_slice().into(), }]); - let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[44])); + let mask = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[44])); let ranges = sequence.mask_to_offset_ranges(&mask); assert_eq!(ranges, vec![2..3]); @@ -1174,18 +1174,18 @@ mod test { range: 40..45, bitmap: [true, true, false, false, true].as_slice().into(), }]); - let mask = RowIdMask::from_block(RowIdTreeMap::from_iter(&[44])); + let mask = RowIdMask::from_block(RowAddrTreeMap::from_iter(&[44])); let ranges = sequence.mask_to_offset_ranges(&mask); assert_eq!(ranges, vec![0..2]); // Test with a sorted array segment let sequence = RowIdSequence(vec![U64Segment::SortedArray(vec![0, 2, 4, 6, 8].into())]); - let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[0, 6, 8])); + let mask = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[0, 6, 8])); let ranges = sequence.mask_to_offset_ranges(&mask); assert_eq!(ranges, vec![0..1, 3..5]); let sequence = RowIdSequence(vec![U64Segment::Array(vec![8, 2, 6, 0, 4].into())]); - let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[0, 6, 8])); + let mask = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[0, 6, 8])); let ranges = sequence.mask_to_offset_ranges(&mask); assert_eq!(ranges, vec![0..1, 2..4]); @@ -1201,7 +1201,7 @@ mod test { }, U64Segment::SortedArray(vec![44, 46, 78].into()), ]); - let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[0, 2, 46, 100, 104])); + let mask = RowIdMask::from_allowed(RowAddrTreeMap::from_iter(&[0, 2, 46, 100, 104])); let ranges = sequence.mask_to_offset_ranges(&mask); assert_eq!(ranges, vec![0..1, 2..3, 5..6, 8..9, 10..11]); diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 8743c9e1c0..9a7a4ae488 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -48,7 +48,7 @@ use lance_core::datatypes::{ }; use 
lance_core::error::LanceOptionExt; use lance_core::utils::address::RowAddress; -use lance_core::utils::mask::{RowIdMask, RowIdTreeMap}; +use lance_core::utils::mask::{RowAddrTreeMap, RowIdMask}; use lance_core::utils::tokio::get_num_compute_intensive_cpus; use lance_core::{ROW_ADDR, ROW_ID, ROW_OFFSET}; use lance_datafusion::exec::{ @@ -2154,7 +2154,7 @@ impl Scanner { } fn u64s_as_take_input(&self, u64s: Vec) -> Result> { - let row_ids = RowIdTreeMap::from_iter(u64s); + let row_ids = RowAddrTreeMap::from_iter(u64s); let row_id_mask = RowIdMask::from_allowed(row_ids); let index_result = IndexExprResult::Exact(row_id_mask); let fragments_covered = diff --git a/rust/lance/src/dataset/write/commit.rs b/rust/lance/src/dataset/write/commit.rs index 0be1688bbc..f5ac12d055 100644 --- a/rust/lance/src/dataset/write/commit.rs +++ b/rust/lance/src/dataset/write/commit.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use std::sync::Arc; -use lance_core::utils::mask::RowIdTreeMap; +use lance_core::utils::mask::RowAddrTreeMap; use lance_file::version::LanceFileVersion; use lance_io::object_store::{ObjectStore, ObjectStoreParams}; use lance_table::{ @@ -46,7 +46,7 @@ pub struct CommitBuilder<'a> { session: Option>, detached: bool, commit_config: CommitConfig, - affected_rows: Option, + affected_rows: Option, transaction_properties: Option>>, } @@ -165,7 +165,7 @@ impl<'a> CommitBuilder<'a> { /// Provide the set of row addresses that were deleted or updated. This is /// used to perform fast conflict resolution. - pub fn with_affected_rows(mut self, affected_rows: RowIdTreeMap) -> Self { + pub fn with_affected_rows(mut self, affected_rows: RowAddrTreeMap) -> Self { self.affected_rows = Some(affected_rows); self } diff --git a/rust/lance/src/dataset/write/delete.rs b/rust/lance/src/dataset/write/delete.rs index 588f5248b7..eafa4ed223 100644 --- a/rust/lance/src/dataset/write/delete.rs +++ b/rust/lance/src/dataset/write/delete.rs @@ -10,7 +10,7 @@ use crate::{ use datafusion::logical_expr::Expr; use datafusion::scalar::ScalarValue; use futures::{StreamExt, TryStreamExt}; -use lance_core::utils::mask::RowIdTreeMap; +use lance_core::utils::mask::RowAddrTreeMap; use lance_core::{Error, Result, ROW_ID}; use lance_table::format::Fragment; use roaring::RoaringTreemap; @@ -150,7 +150,7 @@ struct DeleteJob { struct DeleteData { updated_fragments: Vec, deleted_fragment_ids: Vec, - affected_rows: Option, + affected_rows: Option, } impl RetryExecutor for DeleteJob { @@ -174,7 +174,7 @@ impl RetryExecutor for DeleteJob { Expr::Literal(ScalarValue::Boolean(Some(false)), _) ) { // Predicate evaluated to false - no deletions - (Vec::new(), Vec::new(), Some(RowIdTreeMap::new())) + (Vec::new(), Vec::new(), Some(RowAddrTreeMap::new())) } else if matches!( filter_expr, Expr::Literal(ScalarValue::Boolean(Some(true)), _) @@ -213,12 +213,12 @@ impl RetryExecutor for DeleteJob { let (fragments, deleted_ids) = apply_deletions(&self.dataset, &removed_row_addrs).await?; - let affected_rows = RowIdTreeMap::from(removed_row_addrs.as_ref().clone()); + let affected_rows = RowAddrTreeMap::from(removed_row_addrs.as_ref().clone()); (fragments, deleted_ids, Some(affected_rows)) } } else { // No filter was applied - this shouldn't happen but treat as delete nothing - (Vec::new(), Vec::new(), Some(RowIdTreeMap::new())) + (Vec::new(), Vec::new(), Some(RowAddrTreeMap::new())) }; Ok(DeleteData { diff --git a/rust/lance/src/dataset/write/merge_insert.rs b/rust/lance/src/dataset/write/merge_insert.rs index 69222e9090..bc315450e2 100644 --- 
a/rust/lance/src/dataset/write/merge_insert.rs +++ b/rust/lance/src/dataset/write/merge_insert.rs @@ -76,7 +76,7 @@ use lance_core::utils::address::RowAddress; use lance_core::{ datatypes::{OnMissing, OnTypeMismatch, SchemaCompareOptions}, error::{box_error, InvalidInputSnafu}, - utils::{futures::Capacity, mask::RowIdTreeMap, tokio::get_num_compute_intensive_cpus}, + utils::{futures::Capacity, mask::RowAddrTreeMap, tokio::get_num_compute_intensive_cpus}, Error, Result, ROW_ADDR, ROW_ADDR_FIELD, ROW_ID, ROW_ID_FIELD, }; use lance_datafusion::{ @@ -1321,7 +1321,7 @@ impl MergeInsertJob { async fn execute_uncommitted_v2( self, source: SendableRecordBatchStream, - ) -> Result<(Transaction, MergeStats, Option)> { + ) -> Result<(Transaction, MergeStats, Option)> { let plan = self.create_plan(source).await?; // Execute the plan @@ -1380,7 +1380,7 @@ impl MergeInsertJob { location: location!(), })?; - let affected_rows = merge_insert_exec.affected_rows().map(RowIdTreeMap::from); + let affected_rows = merge_insert_exec.affected_rows().map(RowAddrTreeMap::from); Ok((transaction, stats, affected_rows)) } @@ -1565,7 +1565,7 @@ impl MergeInsertJob { update_mode: Some(RewriteRows), }; - let affected_rows = Some(RowIdTreeMap::from(removed_row_addrs)); + let affected_rows = Some(RowAddrTreeMap::from(removed_row_addrs)); (operation, affected_rows) }; @@ -1748,7 +1748,7 @@ pub struct MergeStats { pub struct UncommittedMergeInsert { pub transaction: Transaction, - pub affected_rows: Option, + pub affected_rows: Option, pub stats: MergeStats, } diff --git a/rust/lance/src/dataset/write/update.rs b/rust/lance/src/dataset/write/update.rs index ea27b2d7cc..e301444ee4 100644 --- a/rust/lance/src/dataset/write/update.rs +++ b/rust/lance/src/dataset/write/update.rs @@ -25,7 +25,7 @@ use datafusion::scalar::ScalarValue; use futures::StreamExt; use lance_arrow::RecordBatchExt; use lance_core::error::{box_error, InvalidInputSnafu}; -use lance_core::utils::mask::RowIdTreeMap; +use lance_core::utils::mask::RowAddrTreeMap; use lance_core::utils::tokio::get_num_compute_intensive_cpus; use lance_datafusion::expr::safe_coerce_scalar; use lance_table::format::{Fragment, RowIdMeta}; @@ -241,7 +241,7 @@ pub struct UpdateData { removed_fragment_ids: Vec, old_fragments: Vec, new_fragments: Vec, - affected_rows: RowIdTreeMap, + affected_rows: RowAddrTreeMap, num_updated_rows: u64, } @@ -351,7 +351,7 @@ impl UpdateJob { let row_id_index = get_row_id_index(&self.dataset).await?; let row_addrs = removed_row_ids.row_addrs(row_id_index.as_deref()); let (old_fragments, removed_fragment_ids) = self.apply_deletions(&row_addrs).await?; - let affected_rows = RowIdTreeMap::from(row_addrs.as_ref().clone()); + let affected_rows = RowAddrTreeMap::from(row_addrs.as_ref().clone()); let num_updated_rows = new_fragments .iter() diff --git a/rust/lance/src/index/prefilter.rs b/rust/lance/src/index/prefilter.rs index 9c5c2ecc44..9ccf9c485c 100644 --- a/rust/lance/src/index/prefilter.rs +++ b/rust/lance/src/index/prefilter.rs @@ -19,8 +19,7 @@ use futures::FutureExt; use futures::StreamExt; use futures::TryStreamExt; use lance_core::utils::deletion::DeletionVector; -use lance_core::utils::mask::RowIdMask; -use lance_core::utils::mask::RowIdTreeMap; +use lance_core::utils::mask::{RowAddrTreeMap, RowIdMask}; use lance_core::utils::tokio::spawn_cpu; use lance_table::format::Fragment; use lance_table::format::IndexMetadata; @@ -107,7 +106,7 @@ impl DatasetPreFilter { let mut frag_id_deletion_vectors = stream::iter(frag_id_deletion_vectors) 
.buffer_unordered(dataset.object_store.io_parallelism()); - let mut deleted_ids = RowIdTreeMap::new(); + let mut deleted_ids = RowAddrTreeMap::new(); while let Some((id, deletion_vector)) = frag_id_deletion_vectors.try_next().await? { deleted_ids.insert_bitmap(id, deletion_vector); } @@ -152,16 +151,16 @@ impl DatasetPreFilter { // on a blocking thread. let allow_list = spawn_cpu(move || { Ok(row_ids_and_deletions.into_iter().fold( - RowIdTreeMap::new(), + RowAddrTreeMap::new(), |mut allow_list, (row_ids, deletion_vector)| { let seq = if let Some(deletion_vector) = deletion_vector { let mut row_ids = row_ids.as_ref().clone(); row_ids.mask(deletion_vector.iter()).unwrap(); - Cow::Owned(row_ids) + Cow::::Owned(row_ids) } else { - Cow::Borrowed(row_ids.as_ref()) + Cow::::Borrowed(row_ids.as_ref()) }; - let treemap = RowIdTreeMap::from(seq.as_ref()); + let treemap = RowAddrTreeMap::from(seq.as_ref()); allow_list |= treemap; allow_list }, @@ -360,7 +359,7 @@ mod test { ); assert!(mask.is_some()); let mask = mask.unwrap().await.unwrap(); - let mut expected = RowIdTreeMap::from_iter(vec![(2 << 32) + 2]); + let mut expected = RowAddrTreeMap::from_iter(vec![(2 << 32) + 2]); expected.insert_fragment(1); assert_eq!(&mask.block_list, &Some(expected)); @@ -380,7 +379,7 @@ mod test { ); assert!(mask.is_some()); let mask = mask.unwrap().await.unwrap(); - let mut expected = RowIdTreeMap::new(); + let mut expected = RowAddrTreeMap::new(); expected.insert_fragment(1); expected.insert_fragment(2); assert_eq!(&mask.block_list, &Some(expected)); @@ -405,7 +404,7 @@ mod test { ); assert!(mask.is_some()); let mask = mask.unwrap().await.unwrap(); - let expected = RowIdTreeMap::from_iter(0..8); + let expected = RowAddrTreeMap::from_iter(0..8); assert_eq!(mask.allow_list, Some(expected)); // There was just one row deleted. // If there are deletions and missing fragments, we should get an allow list diff --git a/rust/lance/src/io/commit.rs b/rust/lance/src/io/commit.rs index 70480e2177..f0064ad60c 100644 --- a/rust/lance/src/io/commit.rs +++ b/rust/lance/src/io/commit.rs @@ -26,7 +26,7 @@ use std::time::Instant; use conflict_resolver::TransactionRebase; use lance_core::utils::backoff::{Backoff, SlotBackoff}; -use lance_core::utils::mask::RowIdTreeMap; +use lance_core::utils::mask::RowAddrTreeMap; use lance_file::version::LanceFileVersion; use lance_index::metrics::NoOpMetricsCollector; use lance_io::utils::CachedFileSize; @@ -744,7 +744,7 @@ pub(crate) async fn commit_transaction( write_config: &ManifestWriteConfig, commit_config: &CommitConfig, manifest_naming_scheme: ManifestNamingScheme, - affected_rows: Option<&RowIdTreeMap>, + affected_rows: Option<&RowAddrTreeMap>, ) -> Result<(Manifest, ManifestLocation)> { // Note: object_store has been configured with WriteParams, but dataset.object_store() // has not necessarily. So for anything involving writing, use `object_store`. 
diff --git a/rust/lance/src/io/commit/conflict_resolver.rs b/rust/lance/src/io/commit/conflict_resolver.rs index 3d2946067d..5669608a4d 100644 --- a/rust/lance/src/io/commit/conflict_resolver.rs +++ b/rust/lance/src/io/commit/conflict_resolver.rs @@ -9,7 +9,7 @@ use crate::{ }; use futures::{StreamExt, TryStreamExt}; use lance_core::{ - utils::{deletion::DeletionVector, mask::RowIdTreeMap}, + utils::{deletion::DeletionVector, mask::RowAddrTreeMap}, Error, Result, }; use lance_index::frag_reuse::FRAG_REUSE_INDEX_NAME; @@ -31,7 +31,7 @@ pub struct TransactionRebase<'a> { initial_fragments: HashMap, /// Fragments that have been deleted or modified modified_fragment_ids: HashSet, - affected_rows: Option<&'a RowIdTreeMap>, + affected_rows: Option<&'a RowAddrTreeMap>, conflicting_frag_reuse_indices: Vec, } @@ -39,7 +39,7 @@ impl<'a> TransactionRebase<'a> { pub async fn try_new( dataset: &Dataset, transaction: Transaction, - affected_rows: Option<&'a RowIdTreeMap>, + affected_rows: Option<&'a RowAddrTreeMap>, ) -> Result { match &transaction.operation { // These operations add new fragments or don't modify any. @@ -1398,7 +1398,7 @@ impl<'a> TransactionRebase<'a> { .await?; // Check for row-level conflicts - let mut existing_deletions = RowIdTreeMap::new(); + let mut existing_deletions = RowAddrTreeMap::new(); for (fragment_id, deletion_vec) in existing_deletion_vecs { existing_deletions .insert_bitmap(fragment_id as u32, deletion_vec.as_ref().into()); @@ -1406,7 +1406,7 @@ impl<'a> TransactionRebase<'a> { let conflicting_rows = existing_deletions.clone() & affected_rows.clone(); if conflicting_rows.len().map(|v| v > 0).unwrap_or(true) { let sample_addressed = conflicting_rows - .row_ids() + .row_addrs() .unwrap() .take(5) .collect::>(); @@ -1945,7 +1945,7 @@ mod tests { for (i, transaction) in transactions.iter().enumerate() { let previous_transactions = transactions.iter().take(i).cloned().collect::>(); - let affected_rows = RowIdTreeMap::from_iter([i as u64]); + let affected_rows = RowAddrTreeMap::from_iter([i as u64]); let mut rebase = TransactionRebase::try_new(&dataset, transaction.clone(), Some(&affected_rows)) .await @@ -2112,7 +2112,7 @@ mod tests { .await .unwrap(); - let affected_rows = RowIdTreeMap::from_iter([0]); + let affected_rows = RowAddrTreeMap::from_iter([0]); dataset.object_store().io_stats_incremental(); // reset let mut rebase = TransactionRebase::try_new(&dataset, txn.clone(), Some(&affected_rows)) diff --git a/rust/lance/src/io/exec/scalar_index.rs b/rust/lance/src/io/exec/scalar_index.rs index 82ba17efe0..6e1718bdc9 100644 --- a/rust/lance/src/io/exec/scalar_index.rs +++ b/rust/lance/src/io/exec/scalar_index.rs @@ -28,7 +28,7 @@ use futures::{stream::BoxStream, Stream, StreamExt, TryFutureExt, TryStreamExt}; use lance_core::{ utils::{ address::RowAddress, - mask::{RowIdMask, RowIdTreeMap}, + mask::{RowAddrTreeMap, RowIdMask}, }, Error, Result, ROW_ID_FIELD, }; @@ -326,7 +326,7 @@ impl MapIndexExec { let allow_list = allow_list - .row_ids() + .row_addrs() .ok_or(datafusion::error::DataFusionError::External( "IndexedLookupExec: row addresses didn't have an iterable allow list" .into(), @@ -611,7 +611,7 @@ async fn row_ids_for_mask( (Some(mut allow_list), None) => { retain_fragments(&mut allow_list, fragments, dataset).await?; - if let Some(allow_list_iter) = allow_list.row_ids() { + if let Some(allow_list_iter) = allow_list.row_addrs() { Ok(allow_list_iter.map(u64::from).collect::>()) } else { // We shouldn't hit this branch if the row ids are stable. 
@@ -649,7 +649,7 @@ async fn row_ids_for_mask( // We need to filter out irrelevant fragments as well. retain_fragments(&mut allow_list, fragments, dataset).await?; - if let Some(allow_list_iter) = allow_list.row_ids() { + if let Some(allow_list_iter) = allow_list.row_addrs() { Ok(allow_list_iter .filter_map(|addr| { let row_id = u64::from(addr); @@ -672,14 +672,14 @@ async fn row_ids_for_mask( } async fn retain_fragments( - allow_list: &mut RowIdTreeMap, + allow_list: &mut RowAddrTreeMap, fragments: &[Fragment], dataset: &Dataset, ) -> Result<()> { if dataset.manifest.uses_stable_row_ids() { let fragment_ids = load_row_id_sequences(dataset, fragments) - .map_ok(|(_frag_id, sequence)| RowIdTreeMap::from(sequence.as_ref())) - .try_fold(RowIdTreeMap::new(), |mut acc, tree| async { + .map_ok(|(_frag_id, sequence)| RowAddrTreeMap::from(sequence.as_ref())) + .try_fold(RowAddrTreeMap::new(), |mut acc, tree| async { acc |= tree; Ok(acc) }) diff --git a/rust/lance/src/io/exec/utils.rs b/rust/lance/src/io/exec/utils.rs index c5b3753c5a..0cde560e88 100644 --- a/rust/lance/src/io/exec/utils.rs +++ b/rust/lance/src/io/exec/utils.rs @@ -27,7 +27,7 @@ use datafusion::physical_plan::{ use futures::{Stream, StreamExt, TryStreamExt}; use lance_core::error::{CloneableResult, Error}; use lance_core::utils::futures::{Capacity, SharedStreamExt}; -use lance_core::utils::mask::{RowIdMask, RowIdTreeMap}; +use lance_core::utils::mask::{RowAddrTreeMap, RowIdMask}; use lance_core::{Result, ROW_ID}; use lance_index::prefilter::FilterLoader; use snafu::location; @@ -76,7 +76,7 @@ pub(crate) struct FilteredRowIdsToPrefilter(pub SendableRecordBatchStream); #[async_trait] impl FilterLoader for FilteredRowIdsToPrefilter { async fn load(mut self: Box) -> Result { - let mut allow_list = RowIdTreeMap::new(); + let mut allow_list = RowAddrTreeMap::new(); while let Some(batch) = self.0.next().await { let batch = batch?; let row_ids = batch.column_by_name(ROW_ID).ok_or_else(|| Error::Internal {