Skip to content
Open
39 changes: 36 additions & 3 deletions rust/lance-index/src/scalar/bitmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ use crate::{metrics::MetricsCollector, Index, IndexType};
use crate::{scalar::expression::ScalarQueryParser, scalar::IndexReader};

pub const BITMAP_LOOKUP_NAME: &str = "bitmap_page_lookup.lance";
pub const INDEX_STATS_METADATA_KEY: &str = "lance:index_stats";

const MAX_BITMAP_ARRAY_LENGTH: usize = i32::MAX as usize - 1024 * 1024; // leave headroom

Expand Down Expand Up @@ -601,6 +602,7 @@ impl BitmapIndexPlugin {
index_store: &dyn IndexStore,
value_type: &DataType,
) -> Result<()> {
let num_bitmaps = state.len();
let schema = Arc::new(Schema::new(vec![
Field::new("keys", value_type.clone(), true),
Field::new("bitmaps", DataType::Binary, true),
Expand Down Expand Up @@ -653,8 +655,17 @@ impl BitmapIndexPlugin {
bitmap_index_file.write_record_batch(record_batch).await?;
}

// Finish file once at the end - this creates the file even if we wrote no batches
bitmap_index_file.finish().await?;
// Finish file with metadata that allows lightweight statistics reads
let stats_json = serde_json::to_string(&BitmapStatistics { num_bitmaps }).map_err(|e| {
Error::Internal {
message: format!("failed to serialize bitmap statistics: {e}"),
location: location!(),
}
})?;
let mut metadata = HashMap::new();
metadata.insert(INDEX_STATS_METADATA_KEY.to_string(), stats_json);

bitmap_index_file.finish_with_metadata(metadata).await?;

Ok(())
}
Expand Down Expand Up @@ -719,6 +730,10 @@ impl ScalarIndexPlugin for BitmapIndexPlugin {
true
}

fn index_type(&self) -> IndexType {
IndexType::Bitmap
}

fn version(&self) -> u32 {
BITMAP_INDEX_VERSION
}
Expand Down Expand Up @@ -763,6 +778,23 @@ impl ScalarIndexPlugin for BitmapIndexPlugin {
) -> Result<Arc<dyn ScalarIndex>> {
Ok(BitmapIndex::load(index_store, frag_reuse_index, cache).await? as Arc<dyn ScalarIndex>)
}

async fn load_statistics(
&self,
index_store: Arc<dyn IndexStore>,
_index_details: &prost_types::Any,
) -> Result<Option<serde_json::Value>> {
let reader = index_store.open_index_file(BITMAP_LOOKUP_NAME).await?;
if let Some(value) = reader.schema().metadata.get(INDEX_STATS_METADATA_KEY) {
let stats = serde_json::from_str(value).map_err(|e| Error::Internal {
message: format!("failed to parse bitmap statistics metadata: {e}"),
location: location!(),
})?;
Ok(Some(stats))
} else {
Ok(None)
}
}
}

#[cfg(test)]
Expand All @@ -771,11 +803,12 @@ pub mod tests {
use crate::metrics::NoOpMetricsCollector;
use crate::scalar::lance_format::LanceIndexStore;
use arrow_array::{RecordBatch, StringArray, UInt64Array};
use arrow_schema::{Field, Schema};
use arrow_schema::{DataType, Field, Schema};
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
use futures::stream;
use lance_core::utils::{address::RowAddress, tempfile::TempObjDir};
use lance_io::object_store::ObjectStore;
use std::collections::HashMap;

#[tokio::test]
async fn test_bitmap_lazy_loading_and_cache() {
Expand Down
12 changes: 12 additions & 0 deletions rust/lance-index/src/scalar/bloomfilter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1285,6 +1285,10 @@ impl ScalarIndexPlugin for BloomFilterIndexPlugin {
false
}

fn index_type(&self) -> IndexType {
IndexType::BloomFilter
}

fn version(&self) -> u32 {
BLOOMFILTER_INDEX_VERSION
}
Expand All @@ -1309,6 +1313,14 @@ impl ScalarIndexPlugin for BloomFilterIndexPlugin {
as Arc<dyn ScalarIndex>,
)
}

async fn load_statistics(
&self,
_index_store: Arc<dyn IndexStore>,
_index_details: &prost_types::Any,
) -> Result<Option<serde_json::Value>> {
Ok(None)
}
}

#[derive(Debug)]
Expand Down
4 changes: 4 additions & 0 deletions rust/lance-index/src/scalar/btree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1939,6 +1939,10 @@ impl ScalarIndexPlugin for BTreeIndexPlugin {
true
}

fn index_type(&self) -> IndexType {
IndexType::BTree
}

fn version(&self) -> u32 {
BTREE_INDEX_VERSION
}
Expand Down
5 changes: 5 additions & 0 deletions rust/lance-index/src/scalar/inverted.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ use crate::{
registry::{ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, TrainingRequest},
CreatedIndex, ScalarIndex,
},
IndexType,
};

use super::IndexStore;
Expand Down Expand Up @@ -137,6 +138,10 @@ impl ScalarIndexPlugin for InvertedIndexPlugin {
false
}

fn index_type(&self) -> IndexType {
IndexType::Inverted
}

fn version(&self) -> u32 {
INVERTED_INDEX_VERSION
}
Expand Down
4 changes: 4 additions & 0 deletions rust/lance-index/src/scalar/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -742,6 +742,10 @@ impl ScalarIndexPlugin for JsonIndexPlugin {
true
}

fn index_type(&self) -> IndexType {
IndexType::Scalar
}

fn attach_registry(&self, registry: Arc<IndexPluginRegistry>) {
let mut reg_ref = self.registry.lock().unwrap();
*reg_ref = Some(registry);
Expand Down
4 changes: 4 additions & 0 deletions rust/lance-index/src/scalar/label_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,10 @@ impl ScalarIndexPlugin for LabelListIndexPlugin {
true
}

fn index_type(&self) -> IndexType {
IndexType::LabelList
}

fn version(&self) -> u32 {
LABEL_LIST_INDEX_VERSION
}
Expand Down
4 changes: 4 additions & 0 deletions rust/lance-index/src/scalar/ngram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1278,6 +1278,10 @@ impl ScalarIndexPlugin for NGramIndexPlugin {
false
}

fn index_type(&self) -> IndexType {
IndexType::NGram
}

fn version(&self) -> u32 {
NGRAM_INDEX_VERSION
}
Expand Down
13 changes: 13 additions & 0 deletions rust/lance-index/src/scalar/registry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use crate::registry::IndexPluginRegistry;
use crate::{
frag_reuse::FragReuseIndex,
scalar::{expression::ScalarQueryParser, CreatedIndex, IndexStore, ScalarIndex},
IndexType,
};

pub const VALUE_COLUMN_NAME: &str = "value";
Expand Down Expand Up @@ -129,6 +130,9 @@ pub trait ScalarIndexPlugin: Send + Sync + std::fmt::Debug {
/// Returns true if the index returns an exact answer (e.g. not AtMost)
fn provides_exact_answer(&self) -> bool;

/// Returns the index type for this plugin
fn index_type(&self) -> IndexType;
Comment on lines +133 to +134
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Say someone creates a new custom index plugin, that doesn't fit any of the enum variant we have built into our library. What do they return here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of this, could we have something like:

impl IndexType {
    fn try_from_pb_name(pb_name: &str) -> Option<Self> { ... }
}

Seems like we should be able to derive from the name, right?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @westonpace in case you have any thoughts on this

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. We should be trying to move away from IndexType enums (because they cannot work with plugins). We should not add this method.

With the introduction of describe_indices every index has two names. The fully qualified type URL (unique, but not friendly, this is the name of the protobuf details message), for example /lance.index.pb.JsonIndexDetails and the short name (friendly, but not unique). The short name is provided by the ScalarIndexPlugin::name method.

The index statistics currently has a index_type. This should be updated to have both index_uri and index_typename. We can leave index_type for legacy / backwards compatibility. We can add a method near async fn index_statistics which maps from index_uri to index_type for the old indexes...

fn legacy_type_name(index_uri: &str) -> String {
  match index_uri {
    BTreeIndexDetails::name => IndexType::BTree.to_string(),
    ...
    _ => "N/A".to_string() // This is a new index type, we don't populate this field any longer
  }
}

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added, PTAL.


/// The version of the index plugin
///
/// We assume that indexes are not forwards compatible. If an index was written with a
Expand Down Expand Up @@ -156,6 +160,15 @@ pub trait ScalarIndexPlugin: Send + Sync + std::fmt::Debug {
cache: &LanceCache,
) -> Result<Arc<dyn ScalarIndex>>;

/// Optional hook allowing a plugin to provide statistics without loading the index.
async fn load_statistics(
&self,
_index_store: Arc<dyn IndexStore>,
_index_details: &prost_types::Any,
) -> Result<Option<serde_json::Value>> {
Ok(None)
}

/// Optional hook that plugins can use if they need to be aware of the registry
fn attach_registry(&self, _registry: Arc<IndexPluginRegistry>) {}

Expand Down
4 changes: 4 additions & 0 deletions rust/lance-index/src/scalar/zonemap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1023,6 +1023,10 @@ impl ScalarIndexPlugin for ZoneMapIndexPlugin {
false
}

fn index_type(&self) -> IndexType {
IndexType::ZoneMap
}

fn version(&self) -> u32 {
ZONEMAP_INDEX_VERSION
}
Expand Down
Loading
Loading