diff --git a/Cargo.toml b/Cargo.toml index 32d7bd990c..558f14378d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,6 +56,7 @@ itertools = "0.14.0" measure_time = "0.9.0" arc-swap = "1.5.0" bon = "3.3.1" +i_triangle = "0.38.0" columnar = { version = "0.6", path = "./columnar", package = "tantivy-columnar" } sstable = { version = "0.6", path = "./sstable", package = "tantivy-sstable", optional = true } @@ -70,6 +71,7 @@ futures-util = { version = "0.3.28", optional = true } futures-channel = { version = "0.3.28", optional = true } fnv = "1.0.7" typetag = "0.2.21" +geo-types = "0.7.17" [target.'cfg(windows)'.dependencies] winapi = "0.3.9" diff --git a/examples/geo_json.rs b/examples/geo_json.rs new file mode 100644 index 0000000000..5829c22d24 --- /dev/null +++ b/examples/geo_json.rs @@ -0,0 +1,66 @@ +use geo_types::Point; +use tantivy::collector::TopDocs; +use tantivy::query::SpatialQuery; +use tantivy::schema::{Schema, Value, SPATIAL, STORED, TEXT}; +use tantivy::spatial::point::GeoPoint; +use tantivy::{Index, IndexWriter, TantivyDocument}; +fn main() -> tantivy::Result<()> { + let mut schema_builder = Schema::builder(); + schema_builder.add_json_field("properties", STORED | TEXT); + schema_builder.add_spatial_field("geometry", STORED | SPATIAL); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + let mut index_writer: IndexWriter = index.writer(50_000_000)?; + let doc = TantivyDocument::parse_json( + &schema, + r#"{ + "type":"Feature", + "geometry":{ + "type":"Polygon", + "coordinates":[[[-99.483911,45.577697],[-99.483869,45.571457],[-99.481739,45.571461],[-99.474881,45.571584],[-99.473167,45.571615],[-99.463394,45.57168],[-99.463391,45.57883],[-99.463368,45.586076],[-99.48177,45.585926],[-99.48384,45.585953],[-99.483885,45.57873],[-99.483911,45.577697]]] + }, + "properties":{ + "admin_level":"8", + "border_type":"city", + "boundary":"administrative", + "gnis:feature_id":"1267426", + "name":"Hosmer", + "place":"city", + "source":"TIGER/Line® 2008 Place Shapefiles (http://www.census.gov/geo/www/tiger/)", + "wikidata":"Q2442118", + "wikipedia":"en:Hosmer, South Dakota" + } + }"#, + )?; + index_writer.add_document(doc)?; + index_writer.commit()?; + + let reader = index.reader()?; + let searcher = reader.searcher(); + let field = schema.get_field("geometry").unwrap(); + let query = SpatialQuery::new( + field, + [ + GeoPoint { + lon: -99.49, + lat: 45.56, + }, + GeoPoint { + lon: -99.45, + lat: 45.59, + }, + ], + tantivy::query::SpatialQueryType::Intersects, + ); + let hits = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?; + for (_score, doc_address) in &hits { + let retrieved_doc: TantivyDocument = searcher.doc(*doc_address)?; + if let Some(field_value) = retrieved_doc.get_first(field) { + if let Some(geometry_box) = field_value.as_value().into_geometry() { + println!("Retrieved geometry: {:?}", geometry_box); + } + } + } + assert_eq!(hits.len(), 1); + Ok(()) +} diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index ade65fa110..36c681b604 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -227,6 +227,9 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( ReferenceValueLeaf::IpAddr(_) => { unimplemented!("IP address support in dynamic fields is not yet implemented") } + ReferenceValueLeaf::Geometry(_) => { + unimplemented!("Geometry support in dynamic fields is not implemented") + } }, ReferenceValue::Array(elements) => { for val in elements { diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 
726b9b76a6..f01e891f50 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -683,7 +683,7 @@ mod tests { } #[test] - fn test_datefastfield() -> crate::Result<()> { + fn test_datefastfield() { let mut schema_builder = Schema::builder(); let date_field = schema_builder.add_date_field( "date", @@ -697,22 +697,28 @@ mod tests { ); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_for_tests()?; + let mut index_writer = index.writer_for_tests().unwrap(); index_writer.set_merge_policy(Box::new(NoMergePolicy)); - index_writer.add_document(doc!( - date_field => DateTime::from_u64(1i64.to_u64()), - multi_date_field => DateTime::from_u64(2i64.to_u64()), - multi_date_field => DateTime::from_u64(3i64.to_u64()) - ))?; - index_writer.add_document(doc!( - date_field => DateTime::from_u64(4i64.to_u64()) - ))?; - index_writer.add_document(doc!( - multi_date_field => DateTime::from_u64(5i64.to_u64()), - multi_date_field => DateTime::from_u64(6i64.to_u64()) - ))?; - index_writer.commit()?; - let reader = index.reader()?; + index_writer + .add_document(doc!( + date_field => DateTime::from_u64(1i64.to_u64()), + multi_date_field => DateTime::from_u64(2i64.to_u64()), + multi_date_field => DateTime::from_u64(3i64.to_u64()) + )) + .unwrap(); + index_writer + .add_document(doc!( + date_field => DateTime::from_u64(4i64.to_u64()) + )) + .unwrap(); + index_writer + .add_document(doc!( + multi_date_field => DateTime::from_u64(5i64.to_u64()), + multi_date_field => DateTime::from_u64(6i64.to_u64()) + )) + .unwrap(); + index_writer.commit().unwrap(); + let reader = index.reader().unwrap(); let searcher = reader.searcher(); assert_eq!(searcher.segment_readers().len(), 1); let segment_reader = searcher.segment_reader(0); @@ -746,7 +752,6 @@ mod tests { assert_eq!(dates[0].into_timestamp_nanos(), 5i64); assert_eq!(dates[1].into_timestamp_nanos(), 6i64); } - Ok(()) } #[test] diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index a1288e0ad1..39d81c10ee 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -189,6 +189,9 @@ impl FastFieldsWriter { .record_str(doc_id, field_name, &token.text); } } + ReferenceValueLeaf::Geometry(_) => { + panic!("Geometry fields should not be routed to fast field writer") + } }, ReferenceValue::Array(val) => { // TODO: Check this is the correct behaviour we want. @@ -320,6 +323,9 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>( "Pre-tokenized string support in dynamic fields is not yet implemented" ) } + ReferenceValueLeaf::Geometry(_) => { + unimplemented!("Geometry support in dynamic fields is not yet implemented") + } }, ReferenceValue::Array(elements) => { for el in elements { diff --git a/src/index/index_meta.rs b/src/index/index_meta.rs index 86eaa35d6c..7a5cb2649d 100644 --- a/src/index/index_meta.rs +++ b/src/index/index_meta.rs @@ -142,6 +142,7 @@ impl SegmentMeta { SegmentComponent::FastFields => ".fast".to_string(), SegmentComponent::FieldNorms => ".fieldnorm".to_string(), SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)), + SegmentComponent::Spatial => ".spatial".to_string(), }); PathBuf::from(path) } diff --git a/src/index/segment_component.rs b/src/index/segment_component.rs index 42ac1d178c..4e208ae1eb 100644 --- a/src/index/segment_component.rs +++ b/src/index/segment_component.rs @@ -28,12 +28,14 @@ pub enum SegmentComponent { /// Bitset describing which document of the segment is alive. 
/// (It was representing deleted docs but changed to represent alive docs from v0.17) Delete, + /// Spatial index data (block kd-tree over the segment's triangulated geometries). + Spatial, } impl SegmentComponent { /// Iterates through the components. pub fn iterator() -> slice::Iter<'static, SegmentComponent> { - static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [ + static SEGMENT_COMPONENTS: [SegmentComponent; 9] = [ SegmentComponent::Postings, SegmentComponent::Positions, SegmentComponent::FastFields, @@ -42,6 +44,7 @@ impl SegmentComponent { SegmentComponent::Store, SegmentComponent::TempStore, SegmentComponent::Delete, + SegmentComponent::Spatial, ]; SEGMENT_COMPONENTS.iter() } } diff --git a/src/index/segment_reader.rs b/src/index/segment_reader.rs index f5589a6902..1afe55b2ba 100644 --- a/src/index/segment_reader.rs +++ b/src/index/segment_reader.rs @@ -14,6 +14,7 @@ use crate::index::{InvertedIndexReader, Segment, SegmentComponent, SegmentId}; use crate::json_utils::json_path_sep_to_dot; use crate::schema::{Field, IndexRecordOption, Schema, Type}; use crate::space_usage::SegmentSpaceUsage; +use crate::spatial::reader::SpatialReaders; use crate::store::StoreReader; use crate::termdict::TermDictionary; use crate::{DocId, Opstamp}; @@ -43,6 +44,7 @@ pub struct SegmentReader { positions_composite: CompositeFile, fast_fields_readers: FastFieldReaders, fieldnorm_readers: FieldNormReaders, + spatial_readers: SpatialReaders, store_file: FileSlice, alive_bitset_opt: Option<AliveBitSet>, @@ -92,6 +94,11 @@ impl SegmentReader { &self.fast_fields_readers } + /// Accessor to the `SpatialReaders` of this segment. + pub fn spatial_fields(&self) -> &SpatialReaders { + &self.spatial_readers + } + /// Accessor to the `FacetReader` associated with a given `Field`. pub fn facet_reader(&self, field_name: &str) -> crate::Result<FacetReader> { let schema = self.schema(); @@ -173,6 +180,12 @@ impl SegmentReader { let fast_fields_readers = FastFieldReaders::open(fast_fields_data, schema.clone())?; let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?; let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; + let spatial_readers = if schema.contains_spatial_field() { + let spatial_data = segment.open_read(SegmentComponent::Spatial)?; + SpatialReaders::open(spatial_data)?
+ } else { + SpatialReaders::empty() + }; let original_bitset = if segment.meta().has_deletes() { let alive_doc_file_slice = segment.open_read(SegmentComponent::Delete)?; @@ -198,6 +211,7 @@ impl SegmentReader { postings_composite, fast_fields_readers, fieldnorm_readers, + spatial_readers, segment_id: segment.id(), delete_opstamp: segment.meta().delete_opstamp(), store_file, @@ -460,6 +474,7 @@ impl SegmentReader { self.positions_composite.space_usage(), self.fast_fields_readers.space_usage(self.schema())?, self.fieldnorm_readers.space_usage(), + self.spatial_readers.space_usage(), self.get_store_reader(0)?.space_usage(), self.alive_bitset_opt .as_ref() diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 47ac5a55b4..6af60d361e 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; +use std::io::{BufWriter, Write}; use std::sync::Arc; use columnar::{ @@ -6,6 +8,7 @@ use columnar::{ use common::ReadOnlyBitSet; use itertools::Itertools; use measure_time::debug_time; +use tempfile::NamedTempFile; use crate::directory::WritePtr; use crate::docset::{DocSet, TERMINATED}; @@ -17,6 +20,8 @@ use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping}; use crate::indexer::SegmentSerializer; use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings}; use crate::schema::{value_type_to_column_type, Field, FieldType, Schema}; +use crate::spatial::bkd::LeafPageIterator; +use crate::spatial::triangle::Triangle; use crate::store::StoreWriter; use crate::termdict::{TermMerger, TermOrdinal}; use crate::{DocAddress, DocId, InvertedIndexReader}; @@ -170,6 +175,7 @@ impl IndexMerger { let mut readers = vec![]; for (segment, new_alive_bitset_opt) in segments.iter().zip(alive_bitset_opt) { if segment.meta().num_docs() > 0 { + dbg!("segment"); let reader = SegmentReader::open_with_custom_alive_set(segment, new_alive_bitset_opt)?; readers.push(reader); @@ -520,6 +526,89 @@ impl IndexMerger { Ok(()) } + fn write_spatial_fields( + &self, + serializer: &mut SegmentSerializer, + doc_id_mapping: &SegmentDocIdMapping, + ) -> crate::Result<()> { + /// We need to rebuild a BKD-tree based off the list of triangles. + /// + /// Because the data can be large, we do this by writing the sequence of triangles to + /// disk, and mmapping it as mutable slice, and calling the same code as what + /// is done for the segment serialization. + /// + /// The OS is in charge of deciding how to handle its page cache. + /// This is the same as what would have happened with swapping, + /// except by explicitly mapping the file, the OS is more likely to + /// swap, the memory will not be accounted as anonymous memory, + /// swap space is reserved etc. + use crate::spatial::bkd::Segment; + + let Some(mut spatial_serializer) = serializer.extract_spatial_serializer() else { + // The schema does not contain any spatial field. 
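// Illustrative sketch of the spill-then-mmap strategy described in the comment above, with
// plain u32 records standing in for `Triangle`: write the records to a named temp file in
// native endianness, then map the file and reinterpret the bytes as a typed slice, letting
// the OS page the data in and out instead of holding it as anonymous memory.
fn spill_and_reload() -> std::io::Result<()> {
    use std::io::Write;
    use memmap2::MmapOptions;
    use tempfile::NamedTempFile;
    let mut tmp = NamedTempFile::new()?;
    for record in [1u32, 2, 3] {
        // Native endianness is fine: the temp file never leaves this host.
        tmp.write_all(&record.to_ne_bytes())?;
    }
    tmp.flush()?;
    let mmap = unsafe { MmapOptions::new().map(tmp.as_file())? };
    let count = mmap.len() / std::mem::size_of::<u32>();
    // The mapping is page-aligned, so the cast to a `u32` slice is properly aligned.
    let records = unsafe { std::slice::from_raw_parts(mmap.as_ptr() as *const u32, count) };
    assert_eq!(records, &[1u32, 2, 3]);
    Ok(())
}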
+ return Ok(()); + }; + + let mut segment_mappings: Vec>> = Vec::new(); + for reader in &self.readers { + let max_doc = reader.max_doc(); + segment_mappings.push(vec![None; max_doc as usize]); + } + for (new_doc_id, old_doc_addr) in doc_id_mapping.iter_old_doc_addrs().enumerate() { + segment_mappings[old_doc_addr.segment_ord as usize][old_doc_addr.doc_id as usize] = + Some(new_doc_id as DocId); + } + let mut temp_files: HashMap = HashMap::new(); + + for (field, field_entry) in self.schema.fields() { + if matches!(field_entry.field_type(), FieldType::Spatial(_)) { + temp_files.insert(field, NamedTempFile::new()?); + } + } + for (segment_ord, reader) in self.readers.iter().enumerate() { + for (field, temp_file) in &mut temp_files { + let mut buf_temp_file = BufWriter::new(temp_file); + let spatial_readers = reader.spatial_fields(); + let Some(spatial_reader) = spatial_readers.get_field(*field)? else { + continue; + }; + let segment = Segment::new(spatial_reader.get_bytes()); + for triangle_result in LeafPageIterator::new(&segment) { + let triangles = triangle_result?; + for triangle in triangles { + if let Some(new_doc_id) = + segment_mappings[segment_ord][triangle.doc_id as usize] + { + // This is really just a temporary file, not meant to be portable, so we + // use native endianness here. + for &word in &triangle.words { + buf_temp_file.write_all(&word.to_ne_bytes())?; + } + buf_temp_file.write_all(&new_doc_id.to_ne_bytes())?; + } + } + } + buf_temp_file.flush()?; + // No need to fsync here. This file is not here for persistency. + } + } + for (field, temp_file) in temp_files { + // Memory map the triangle file. + use memmap2::MmapOptions; + let mmap = unsafe { MmapOptions::new().map_mut(temp_file.as_file())? }; + // Cast to &[Triangle] slice + let triangle_count = mmap.len() / std::mem::size_of::(); + let triangles = unsafe { + std::slice::from_raw_parts_mut(mmap.as_ptr() as *mut Triangle, triangle_count) + }; + // Get spatial writer and rebuild block kd-tree. + spatial_serializer.serialize_field(field, triangles)?; + } + spatial_serializer.close()?; + + Ok(()) + } + /// Writes the merged segment by pushing information /// to the `SegmentSerializer`. 
/// @@ -544,9 +633,10 @@ impl IndexMerger { debug!("write-storagefields"); self.write_storable_fields(serializer.get_store_writer())?; + debug!("write-spatialfields"); + self.write_spatial_fields(&mut serializer, &doc_id_mapping)?; debug!("write-fastfields"); self.write_fast_fields(serializer.get_fast_field_write(), doc_id_mapping)?; - debug!("close-serializer"); serializer.close()?; Ok(self.max_doc) diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index 057a51b10c..a6143f1703 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -4,6 +4,7 @@ use crate::directory::WritePtr; use crate::fieldnorm::FieldNormsSerializer; use crate::index::{Segment, SegmentComponent}; use crate::postings::InvertedIndexSerializer; +use crate::spatial::serializer::SpatialSerializer; use crate::store::StoreWriter; /// Segment serializer is in charge of laying out on disk @@ -12,6 +13,7 @@ pub struct SegmentSerializer { segment: Segment, pub(crate) store_writer: StoreWriter, fast_field_write: WritePtr, + spatial_serializer: Option, fieldnorms_serializer: Option, postings_serializer: InvertedIndexSerializer, } @@ -35,11 +37,20 @@ impl SegmentSerializer { let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?; let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?; + let spatial_serializer: Option = + if segment.schema().contains_spatial_field() { + let spatial_write = segment.open_write(SegmentComponent::Spatial)?; + Some(SpatialSerializer::from_write(spatial_write)?) + } else { + None + }; + let postings_serializer = InvertedIndexSerializer::open(&mut segment)?; Ok(SegmentSerializer { segment, store_writer, fast_field_write, + spatial_serializer, fieldnorms_serializer: Some(fieldnorms_serializer), postings_serializer, }) @@ -64,6 +75,11 @@ impl SegmentSerializer { &mut self.fast_field_write } + /// Accessor to the `SpatialSerializer` + pub fn extract_spatial_serializer(&mut self) -> Option { + self.spatial_serializer.take() + } + /// Extract the field norm serializer. /// /// Note the fieldnorms serializer can only be extracted once. 
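// Sketch: like the fieldnorms serializer documented above, the spatial serializer is handed
// out at most once. `extract_spatial_serializer` uses `Option::take`, so a second call
// returns `None`, and `close` only closes it if nobody (e.g. the merger) took it first.
// (`serializer` stands for any `SegmentSerializer` built for a schema with a spatial field.)
let first = serializer.extract_spatial_serializer(); // Some(SpatialSerializer)
let second = serializer.extract_spatial_serializer(); // None: already taken
assert!(first.is_some() && second.is_none());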
@@ -81,6 +97,9 @@ impl SegmentSerializer { if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() { fieldnorms_serializer.close()?; } + if let Some(spatial_serializer) = self.extract_spatial_serializer() { + spatial_serializer.close()?; + } self.fast_field_write.terminate()?; self.postings_serializer.close()?; self.store_writer.close()?; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 30338187cd..630bd4fc51 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -16,6 +16,7 @@ use crate::postings::{ }; use crate::schema::document::{Document, Value}; use crate::schema::{FieldEntry, FieldType, Schema, DATE_TIME_PRECISION_INDEXED}; +use crate::spatial::writer::SpatialWriter; use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer}; use crate::{DocId, Opstamp, TantivyError}; @@ -52,6 +53,7 @@ pub struct SegmentWriter { pub(crate) segment_serializer: SegmentSerializer, pub(crate) fast_field_writers: FastFieldsWriter, pub(crate) fieldnorms_writer: FieldNormsWriter, + pub(crate) spatial_writer: SpatialWriter, pub(crate) json_path_writer: JsonPathWriter, pub(crate) json_positions_per_path: IndexingPositionsPerPath, pub(crate) doc_opstamps: Vec, @@ -104,6 +106,7 @@ impl SegmentWriter { ctx: IndexingContext::new(table_size), per_field_postings_writers, fieldnorms_writer: FieldNormsWriter::for_schema(&schema), + spatial_writer: SpatialWriter::default(), json_path_writer: JsonPathWriter::default(), json_positions_per_path: IndexingPositionsPerPath::default(), segment_serializer, @@ -130,6 +133,7 @@ impl SegmentWriter { self.ctx, self.fast_field_writers, &self.fieldnorms_writer, + &mut self.spatial_writer, self.segment_serializer, )?; Ok(self.doc_opstamps) @@ -142,6 +146,7 @@ impl SegmentWriter { + self.fieldnorms_writer.mem_usage() + self.fast_field_writers.mem_usage() + self.segment_serializer.mem_usage() + + self.spatial_writer.mem_usage() } fn index_document(&mut self, doc: &D) -> crate::Result<()> { @@ -338,6 +343,13 @@ impl SegmentWriter { self.fieldnorms_writer.record(doc_id, field, num_vals); } } + FieldType::Spatial(_) => { + for value in values { + if let Some(geometry) = value.as_geometry() { + self.spatial_writer.add_geometry(doc_id, field, *geometry); + } + } + } } } Ok(()) @@ -392,12 +404,16 @@ fn remap_and_write( ctx: IndexingContext, fast_field_writers: FastFieldsWriter, fieldnorms_writer: &FieldNormsWriter, + spatial_writer: &mut SpatialWriter, mut serializer: SegmentSerializer, ) -> crate::Result<()> { debug!("remap-and-write"); if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() { fieldnorms_writer.serialize(fieldnorms_serializer)?; } + if let Some(spatial_serializer) = serializer.extract_spatial_serializer() { + spatial_writer.serialize(spatial_serializer)?; + } let fieldnorm_data = serializer .segment() .open_read(SegmentComponent::FieldNorms)?; diff --git a/src/lib.rs b/src/lib.rs index 1027f4f46b..ba2d9aaab2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -191,6 +191,7 @@ pub mod fieldnorm; pub mod index; pub mod positions; pub mod postings; +pub mod spatial; /// Module containing the different query implementations. 
pub mod query; diff --git a/src/postings/per_field_postings_writer.rs b/src/postings/per_field_postings_writer.rs index f3d6d6534c..a9035ca84e 100644 --- a/src/postings/per_field_postings_writer.rs +++ b/src/postings/per_field_postings_writer.rs @@ -51,6 +51,7 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box Box::>::default(), FieldType::JsonObject(ref json_object_options) => { if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() { diff --git a/src/query/mod.rs b/src/query/mod.rs index d609a04021..8bc45ed2ed 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -24,6 +24,7 @@ mod reqopt_scorer; mod scorer; mod set_query; mod size_hint; +mod spatial_query; mod term_query; mod union; mod weight; @@ -62,6 +63,7 @@ pub use self::reqopt_scorer::RequiredOptionalScorer; pub use self::score_combiner::{DisjunctionMaxCombiner, ScoreCombiner, SumCombiner}; pub use self::scorer::Scorer; pub use self::set_query::TermSetQuery; +pub use self::spatial_query::{SpatialQuery, SpatialQueryType}; pub use self::term_query::TermQuery; pub use self::union::BufferedUnionScorer; #[cfg(test)] diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 7ebaf3a474..483aa36387 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -524,6 +524,9 @@ impl QueryParser { let ip_v6 = IpAddr::from_str(phrase)?.into_ipv6_addr(); Ok(Term::from_field_ip_addr(field, ip_v6)) } + FieldType::Spatial(_) => Err(QueryParserError::UnsupportedQuery( + "Spatial queries are not yet supported in text query parser".to_string(), + )), } } @@ -624,6 +627,10 @@ impl QueryParser { let term = Term::from_field_ip_addr(field, ip_v6); Ok(vec![LogicalLiteral::Term(term)]) } + FieldType::Spatial(_) => Err(QueryParserError::UnsupportedQuery(format!( + "Spatial queries are not yet supported for field '{}'", + field_name + ))), } } diff --git a/src/query/range_query/mod.rs b/src/query/range_query/mod.rs index e93bb40491..3297dae072 100644 --- a/src/query/range_query/mod.rs +++ b/src/query/range_query/mod.rs @@ -20,6 +20,6 @@ pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool { | Type::Date | Type::Json | Type::IpAddr => true, - Type::Facet | Type::Bytes => false, + Type::Facet | Type::Bytes | Type::Spatial => false, } } diff --git a/src/query/range_query/range_query_fastfield.rs b/src/query/range_query/range_query_fastfield.rs index 54cf0cad54..f4d7e970e1 100644 --- a/src/query/range_query/range_query_fastfield.rs +++ b/src/query/range_query/range_query_fastfield.rs @@ -128,12 +128,15 @@ impl Weight for FastFieldRangeWeight { BoundsRange::new(bounds.lower_bound, bounds.upper_bound), ) } - Type::Bool | Type::Facet | Type::Bytes | Type::Json | Type::IpAddr => { - Err(crate::TantivyError::InvalidArgument(format!( - "unsupported value bytes type in json term value_bytes {:?}", - term_value.typ() - ))) - } + Type::Bool + | Type::Facet + | Type::Bytes + | Type::Json + | Type::IpAddr + | Type::Spatial => Err(crate::TantivyError::InvalidArgument(format!( + "unsupported value bytes type in json term value_bytes {:?}", + term_value.typ() + ))), } } else if field_type.is_ip_addr() { let parse_ip_from_bytes = |term: &Term| { @@ -435,7 +438,7 @@ pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool { match typ { Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true, Type::IpAddr => false, - Type::Str | Type::Facet | Type::Bytes | Type::Json => false, + Type::Str | Type::Facet | Type::Bytes | 
Type::Json | Type::Spatial => false, } } diff --git a/src/query/spatial_query.rs b/src/query/spatial_query.rs new file mode 100644 index 0000000000..024b941534 --- /dev/null +++ b/src/query/spatial_query.rs @@ -0,0 +1,186 @@ +//! A query over a spatial field, answered with the segment's block kd-tree index. + +use common::BitSet; + +use crate::query::explanation::does_not_match; +use crate::query::{BitSetDocSet, Explanation, Query, Scorer, Weight}; +use crate::schema::Field; +use crate::spatial::bkd::{search_intersects, Segment}; +use crate::spatial::point::GeoPoint; +use crate::spatial::writer::as_point_i32; +use crate::{DocId, DocSet, Score, TantivyError, TERMINATED}; + +#[derive(Clone, Copy, Debug)] +/// The spatial relation a [`SpatialQuery`] tests between indexed geometries and the query box. +pub enum SpatialQueryType { + /// Matches documents whose geometry intersects the query bounding box. + Intersects, + // Within, + // Contains, +} + +#[derive(Clone, Copy, Debug)] +/// Query matching documents whose spatial field satisfies a relation with a bounding box. +pub struct SpatialQuery { + field: Field, + bounds: [(i32, i32); 2], + query_type: SpatialQueryType, +} + +impl SpatialQuery { + /// Creates a new `SpatialQuery` from the two opposite corners of a bounding box (minimum corner first). + pub fn new(field: Field, bounds: [GeoPoint; 2], query_type: SpatialQueryType) -> Self { + SpatialQuery { + field, + bounds: [as_point_i32(bounds[0]), as_point_i32(bounds[1])], + query_type, + } + } +} + +impl Query for SpatialQuery { + fn weight( + &self, + _enable_scoring: super::EnableScoring<'_>, + ) -> crate::Result<Box<dyn Weight>> { + Ok(Box::new(SpatialWeight::new( + self.field, + self.bounds, + self.query_type, + ))) + } +} + +pub struct SpatialWeight { + field: Field, + bounds: [(i32, i32); 2], + query_type: SpatialQueryType, +} + +impl SpatialWeight { + fn new(field: Field, bounds: [(i32, i32); 2], query_type: SpatialQueryType) -> Self { + SpatialWeight { + field, + bounds, + query_type, + } + } +} + +impl Weight for SpatialWeight { + fn scorer( + &self, + reader: &crate::SegmentReader, + boost: crate::Score, + ) -> crate::Result<Box<dyn Scorer>> { + let spatial_reader = reader + .spatial_fields() + .get_field(self.field)?
+ .ok_or_else(|| TantivyError::SchemaError(format!("No spatial data for field")))?; + let block_kd_tree = Segment::new(spatial_reader.get_bytes()); + match self.query_type { + SpatialQueryType::Intersects => { + let mut include = BitSet::with_max_value(reader.max_doc()); + search_intersects( + &block_kd_tree, + block_kd_tree.root_offset, + &[ + self.bounds[0].1, + self.bounds[0].0, + self.bounds[1].1, + self.bounds[1].0, + ], + &mut include, + )?; + Ok(Box::new(SpatialScorer::new(boost, include, None))) + } + } + } + fn explain( + &self, + reader: &crate::SegmentReader, + doc: DocId, + ) -> crate::Result { + let mut scorer = self.scorer(reader, 1.0)?; + if scorer.seek(doc) != doc { + return Err(does_not_match(doc)); + } + let query_type_desc = match self.query_type { + SpatialQueryType::Intersects => "SpatialQuery::Intersects", + }; + let score = scorer.score(); + let mut explanation = Explanation::new(query_type_desc, score); + explanation.add_context(format!( + "bounds: [({}, {}), ({}, {})]", + self.bounds[0].0, self.bounds[0].1, self.bounds[1].0, self.bounds[1].1, + )); + explanation.add_context(format!("field: {:?}", self.field)); + Ok(explanation) + } +} + +struct SpatialScorer { + include: BitSetDocSet, + exclude: Option, + doc_id: DocId, + score: Score, +} + +impl SpatialScorer { + pub fn new(score: Score, include: BitSet, exclude: Option) -> Self { + let mut scorer = SpatialScorer { + include: BitSetDocSet::from(include), + exclude, + doc_id: 0, + score, + }; + scorer.prime(); + scorer + } + fn prime(&mut self) { + self.doc_id = self.include.doc(); + while self.exclude() { + self.doc_id = self.include.advance(); + } + } + + fn exclude(&self) -> bool { + if self.doc_id == TERMINATED { + return false; + } + match &self.exclude { + Some(exclude) => exclude.contains(self.doc_id), + None => false, + } + } +} + +impl Scorer for SpatialScorer { + fn score(&mut self) -> Score { + self.score + } +} + +impl DocSet for SpatialScorer { + fn advance(&mut self) -> DocId { + if self.doc_id == TERMINATED { + return TERMINATED; + } + self.doc_id = self.include.advance(); + while self.exclude() { + self.doc_id = self.include.advance(); + } + self.doc_id + } + + fn size_hint(&self) -> u32 { + match &self.exclude { + Some(exclude) => self.include.size_hint() - exclude.len() as u32, + None => self.include.size_hint(), + } + } + + fn doc(&self) -> DocId { + self.doc_id + } +} diff --git a/src/schema/document/de.rs b/src/schema/document/de.rs index 02ee7a6a0c..9e6b6ecc03 100644 --- a/src/schema/document/de.rs +++ b/src/schema/document/de.rs @@ -22,6 +22,7 @@ use super::se::BinaryObjectSerializer; use super::{OwnedValue, Value}; use crate::schema::document::type_codes; use crate::schema::{Facet, Field}; +use crate::spatial::geometry::Geometry; use crate::store::DocStoreVersion; use crate::tokenizer::PreTokenizedString; @@ -129,6 +130,9 @@ pub trait ValueDeserializer<'de> { /// Attempts to deserialize a pre-tokenized string value from the deserializer. fn deserialize_pre_tokenized_string(self) -> Result; + /// HUSH + fn deserialize_geometry(self) -> Result; + /// Attempts to deserialize the value using a given visitor. fn deserialize_any(self, visitor: V) -> Result where V: ValueVisitor; @@ -166,6 +170,8 @@ pub enum ValueType { /// A JSON object value. Deprecated. #[deprecated(note = "We keep this for backwards compatibility, use Object instead")] JSONObject, + /// HUSH + Geometry, } /// A value visitor for deserializing a document value. 
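// Corner convention used by `SpatialQuery` above (illustrative; `field` is an existing
// spatial `Field`): the two `GeoPoint`s are the minimum (south-west) and maximum
// (north-east) corners, in that order. The weight forwards them to `search_intersects`
// as [min_y, min_x, max_y, max_x] without normalizing, so pass the minimum corner first.
// `as_point_i32` converts degrees to the index's integer coordinates; its scaling lives in
// src/spatial/writer.rs, outside this excerpt.
let query = SpatialQuery::new(
    field,
    [
        GeoPoint { lon: -99.49, lat: 45.56 }, // minimum corner (lower-left)
        GeoPoint { lon: -99.45, lat: 45.59 }, // maximum corner (upper-right)
    ],
    SpatialQueryType::Intersects,
);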
@@ -246,6 +252,12 @@ pub trait ValueVisitor { Err(DeserializeError::UnsupportedType(ValueType::PreTokStr)) } + #[inline] + /// Called when the deserializer visits a geometry value. + fn visit_geometry(&self, _val: Geometry) -> Result { + Err(DeserializeError::UnsupportedType(ValueType::Geometry)) + } + #[inline] /// Called when the deserializer visits an array. fn visit_array<'de, A>(&self, _access: A) -> Result @@ -380,6 +392,7 @@ where R: Read match ext_type_code { type_codes::TOK_STR_EXT_CODE => ValueType::PreTokStr, + type_codes::GEO_EXT_CODE => ValueType::Geometry, _ => { return Err(DeserializeError::from(io::Error::new( io::ErrorKind::InvalidData, @@ -495,6 +508,11 @@ where R: Read .map_err(DeserializeError::from) } + fn deserialize_geometry(self) -> Result { + self.validate_type(ValueType::Geometry)?; + ::deserialize(self.reader).map_err(DeserializeError::from) + } + fn deserialize_any(self, visitor: V) -> Result where V: ValueVisitor { match self.value_type { @@ -539,6 +557,10 @@ where R: Read let val = self.deserialize_pre_tokenized_string()?; visitor.visit_pre_tokenized_string(val) } + ValueType::Geometry => { + let val = self.deserialize_geometry()?; + visitor.visit_geometry(val) + } ValueType::Array => { let access = BinaryArrayDeserializer::from_reader(self.reader, self.doc_store_version)?; diff --git a/src/schema/document/default_document.rs b/src/schema/document/default_document.rs index 915b685aa7..2a28482720 100644 --- a/src/schema/document/default_document.rs +++ b/src/schema/document/default_document.rs @@ -13,6 +13,7 @@ use crate::schema::document::{ }; use crate::schema::field_type::ValueParsingError; use crate::schema::{Facet, Field, NamedFieldDocument, OwnedValue, Schema}; +use crate::spatial::geometry::Geometry; use crate::tokenizer::PreTokenizedString; #[repr(C, packed)] @@ -254,6 +255,7 @@ impl CompactDoc { } ReferenceValueLeaf::IpAddr(num) => write_into(&mut self.node_data, num.to_u128()), ReferenceValueLeaf::PreTokStr(pre_tok) => write_into(&mut self.node_data, *pre_tok), + ReferenceValueLeaf::Geometry(geometry) => write_into(&mut self.node_data, *geometry), }; ValueAddr { type_id, val_addr } } @@ -464,6 +466,12 @@ impl<'a> CompactDocValue<'a> { .map(Into::into) .map(ReferenceValueLeaf::PreTokStr) .map(Into::into), + ValueType::Geometry => self + .container + .read_from::(addr) + .map(Into::into) + .map(ReferenceValueLeaf::Geometry) + .map(Into::into), ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new( self.container, addr, @@ -542,6 +550,8 @@ pub enum ValueType { Object = 11, /// Pre-tokenized str type, Array = 12, + /// HUSH + Geometry = 13, } impl BinarySerializable for ValueType { @@ -587,6 +597,7 @@ impl<'a> From<&ReferenceValueLeaf<'a>> for ValueType { ReferenceValueLeaf::PreTokStr(_) => ValueType::PreTokStr, ReferenceValueLeaf::Facet(_) => ValueType::Facet, ReferenceValueLeaf::Bytes(_) => ValueType::Bytes, + ReferenceValueLeaf::Geometry(_) => ValueType::Geometry, } } } diff --git a/src/schema/document/mod.rs b/src/schema/document/mod.rs index 8168ee8112..ed4a03d6f4 100644 --- a/src/schema/document/mod.rs +++ b/src/schema/document/mod.rs @@ -273,4 +273,5 @@ pub(crate) mod type_codes { // Extended type codes pub const TOK_STR_EXT_CODE: u8 = 0; + pub const GEO_EXT_CODE: u8 = 1; } diff --git a/src/schema/document/owned_value.rs b/src/schema/document/owned_value.rs index 9fbf1f8c26..5977a83875 100644 --- a/src/schema/document/owned_value.rs +++ b/src/schema/document/owned_value.rs @@ -15,6 +15,7 @@ use crate::schema::document::{ 
ValueDeserializer, ValueVisitor, }; use crate::schema::Facet; +use crate::spatial::geometry::Geometry; use crate::tokenizer::PreTokenizedString; use crate::DateTime; @@ -49,6 +50,8 @@ pub enum OwnedValue { Object(Vec<(String, Self)>), /// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`. IpAddr(Ipv6Addr), + /// A GeoRust multi-polygon. + Geometry(Geometry), } impl AsRef for OwnedValue { @@ -77,6 +80,9 @@ impl<'a> Value<'a> for &'a OwnedValue { OwnedValue::IpAddr(val) => ReferenceValueLeaf::IpAddr(*val).into(), OwnedValue::Array(array) => ReferenceValue::Array(array.iter()), OwnedValue::Object(object) => ReferenceValue::Object(ObjectMapIter(object.iter())), + OwnedValue::Geometry(geometry) => { + ReferenceValueLeaf::Geometry(Box::new(geometry.clone())).into() + } } } } @@ -136,6 +142,10 @@ impl ValueDeserialize for OwnedValue { Ok(OwnedValue::PreTokStr(val)) } + fn visit_geometry(&self, val: Geometry) -> Result { + Ok(OwnedValue::Geometry(val)) + } + fn visit_array<'de, A>(&self, mut access: A) -> Result where A: ArrayAccess<'de> { let mut elements = Vec::with_capacity(access.size_hint()); @@ -198,6 +208,7 @@ impl serde::Serialize for OwnedValue { } } OwnedValue::Array(ref array) => array.serialize(serializer), + OwnedValue::Geometry(ref geometry) => geometry.to_geojson().serialize(serializer), } } } @@ -285,6 +296,7 @@ impl<'a, V: Value<'a>> From> for OwnedValue { ReferenceValueLeaf::IpAddr(val) => OwnedValue::IpAddr(val), ReferenceValueLeaf::Bool(val) => OwnedValue::Bool(val), ReferenceValueLeaf::PreTokStr(val) => OwnedValue::PreTokStr(*val.clone()), + ReferenceValueLeaf::Geometry(val) => OwnedValue::Geometry(*val.clone()), }, ReferenceValue::Array(val) => { OwnedValue::Array(val.map(|v| v.as_value().into()).collect()) diff --git a/src/schema/document/se.rs b/src/schema/document/se.rs index 528c8dffc3..d1b884f016 100644 --- a/src/schema/document/se.rs +++ b/src/schema/document/se.rs @@ -133,6 +133,10 @@ where W: Write self.write_type_code(type_codes::EXT_CODE)?; self.serialize_with_type_code(type_codes::TOK_STR_EXT_CODE, &*val) } + ReferenceValueLeaf::Geometry(val) => { + self.write_type_code(type_codes::EXT_CODE)?; + self.serialize_with_type_code(type_codes::GEO_EXT_CODE, &*val) + } }, ReferenceValue::Array(elements) => { self.write_type_code(type_codes::ARRAY_CODE)?; diff --git a/src/schema/document/value.rs b/src/schema/document/value.rs index de1067ce67..87d2593d47 100644 --- a/src/schema/document/value.rs +++ b/src/schema/document/value.rs @@ -3,6 +3,7 @@ use std::net::Ipv6Addr; use common::DateTime; +use crate::spatial::geometry::Geometry; use crate::tokenizer::PreTokenizedString; /// A single field value. @@ -108,6 +109,12 @@ pub trait Value<'a>: Send + Sync + Debug { None } } + + #[inline] + /// HUSH + fn as_geometry(&self) -> Option> { + self.as_leaf().and_then(|leaf| leaf.into_geometry()) + } } /// A enum representing a leaf value for tantivy to index. 
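// Sketch of the framing that se.rs/de.rs above use for a stored geometry value: the generic
// "extended type" marker byte, then the one-byte extension code added here, then the
// `BinarySerializable` payload of the `Geometry` (its encoding lives in
// src/spatial/geometry.rs, outside this excerpt). Pre-tokenized strings use the same
// envelope with `TOK_STR_EXT_CODE = 0`, which is why the deserializer can dispatch on the
// second byte alone.
let mut framed: Vec<u8> = Vec::new();
framed.push(type_codes::EXT_CODE);
framed.push(type_codes::GEO_EXT_CODE);
// geometry.serialize(&mut framed)?; // followed by the geometry payload bytes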
@@ -136,6 +143,8 @@ pub enum ReferenceValueLeaf<'a> { Bool(bool), /// Pre-tokenized str type, PreTokStr(Box), + /// HUSH + Geometry(Box), } impl From for ReferenceValueLeaf<'_> { @@ -220,6 +229,9 @@ impl<'a, T: Value<'a> + ?Sized> From> for ReferenceValue< ReferenceValueLeaf::PreTokStr(val) => { ReferenceValue::Leaf(ReferenceValueLeaf::PreTokStr(val)) } + ReferenceValueLeaf::Geometry(val) => { + ReferenceValue::Leaf(ReferenceValueLeaf::Geometry(val)) + } } } } @@ -331,6 +343,16 @@ impl<'a> ReferenceValueLeaf<'a> { None } } + + #[inline] + /// HUSH + pub fn into_geometry(self) -> Option> { + if let Self::Geometry(val) = self { + Some(val) + } else { + None + } + } } /// A enum representing a value for tantivy to index. @@ -448,4 +470,10 @@ where V: Value<'a> pub fn is_object(&self) -> bool { matches!(self, Self::Object(_)) } + + #[inline] + /// HUSH + pub fn into_geometry(self) -> Option> { + self.into_leaf().and_then(|leaf| leaf.into_geometry()) + } } diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 77e061bc61..d8325f7dcc 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -1,6 +1,7 @@ use serde::{Deserialize, Serialize}; use super::ip_options::IpAddrOptions; +use super::spatial_options::SpatialOptions; use crate::schema::bytes_options::BytesOptions; use crate::schema::{ is_valid_field_name, DateOptions, FacetOptions, FieldType, JsonObjectOptions, NumericOptions, @@ -80,6 +81,11 @@ impl FieldEntry { Self::new(field_name, FieldType::JsonObject(json_object_options)) } + /// Creates a field entry for a spatial field + pub fn new_spatial(field_name: String, spatial_options: SpatialOptions) -> FieldEntry { + Self::new(field_name, FieldType::Spatial(spatial_options)) + } + /// Returns the name of the field pub fn name(&self) -> &str { &self.name @@ -129,6 +135,7 @@ impl FieldEntry { FieldType::Bytes(ref options) => options.is_stored(), FieldType::JsonObject(ref options) => options.is_stored(), FieldType::IpAddr(ref options) => options.is_stored(), + FieldType::Spatial(ref options) => options.is_stored(), } } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 8b203f5b37..0caef80caf 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -9,6 +9,7 @@ use serde_json::Value as JsonValue; use thiserror::Error; use super::ip_options::IpAddrOptions; +use super::spatial_options::SpatialOptions; use super::IntoIpv6Addr; use crate::schema::bytes_options::BytesOptions; use crate::schema::facet_options::FacetOptions; @@ -16,6 +17,7 @@ use crate::schema::{ DateOptions, Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, OwnedValue, TextFieldIndexing, TextOptions, }; +use crate::spatial::geometry::Geometry; use crate::time::format_description::well_known::Rfc3339; use crate::time::OffsetDateTime; use crate::tokenizer::PreTokenizedString; @@ -71,6 +73,8 @@ pub enum Type { Json = b'j', /// IpAddr IpAddr = b'p', + /// Spatial + Spatial = b't', } impl From for Type { @@ -139,6 +143,7 @@ impl Type { Type::Bytes => "Bytes", Type::Json => "Json", Type::IpAddr => "IpAddr", + Type::Spatial => "Spatial", } } @@ -189,6 +194,8 @@ pub enum FieldType { JsonObject(JsonObjectOptions), /// IpAddr field IpAddr(IpAddrOptions), + /// Spatial field + Spatial(SpatialOptions), } impl FieldType { @@ -205,6 +212,7 @@ impl FieldType { FieldType::Bytes(_) => Type::Bytes, FieldType::JsonObject(_) => Type::Json, FieldType::IpAddr(_) => Type::IpAddr, + FieldType::Spatial(_) => Type::Spatial, } } @@ -241,6 +249,7 @@ impl FieldType { 
FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(), FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(), FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.is_indexed(), + FieldType::Spatial(ref _spatial_options) => true, } } @@ -278,6 +287,7 @@ impl FieldType { FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.is_fast(), FieldType::Facet(_) => true, FieldType::JsonObject(ref json_object_options) => json_object_options.is_fast(), + FieldType::Spatial(_) => false, } } @@ -297,6 +307,7 @@ impl FieldType { FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(), FieldType::JsonObject(ref _json_object_options) => false, FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.fieldnorms(), + FieldType::Spatial(_) => false, } } @@ -348,6 +359,8 @@ impl FieldType { None } } + FieldType::Spatial(_) => None, /* Geometry types cannot be indexed in the inverted + * index. */ } } @@ -449,6 +462,10 @@ impl FieldType { Ok(OwnedValue::IpAddr(ip_addr.into_ipv6_addr())) } + FieldType::Spatial(_) => Err(ValueParsingError::TypeError { + expected: "spatial field parsing not implemented", + json: JsonValue::String(field_text), + }), } } JsonValue::Number(field_val_num) => match self { @@ -508,6 +525,10 @@ impl FieldType { expected: "a string with an ip addr", json: JsonValue::Number(field_val_num), }), + FieldType::Spatial(_) => Err(ValueParsingError::TypeError { + expected: "spatial field parsing not implemented", + json: JsonValue::Number(field_val_num), + }), }, JsonValue::Object(json_map) => match self { FieldType::Str(_) => { @@ -523,6 +544,14 @@ impl FieldType { } } FieldType::JsonObject(_) => Ok(OwnedValue::from(json_map)), + FieldType::Spatial(_) => Ok(OwnedValue::Geometry( + Geometry::from_geojson(&json_map).map_err(|e| { + ValueParsingError::ParseError { + error: format!("{:?}", e), + json: JsonValue::Object(json_map), + } + })?, + )), _ => Err(ValueParsingError::TypeError { expected: self.value_type().name(), json: JsonValue::Object(json_map), diff --git a/src/schema/flags.rs b/src/schema/flags.rs index 4f8caa612c..9b03b214d5 100644 --- a/src/schema/flags.rs +++ b/src/schema/flags.rs @@ -1,6 +1,6 @@ use std::ops::BitOr; -use crate::schema::{DateOptions, NumericOptions, TextOptions}; +use crate::schema::{DateOptions, NumericOptions, SpatialOptions, TextOptions}; #[derive(Clone)] pub struct StoredFlag; @@ -95,6 +95,14 @@ impl> BitOr for SchemaFlagList } } +impl> BitOr for SchemaFlagList { + type Output = SpatialOptions; + + fn bitor(self, rhs: SpatialOptions) -> Self::Output { + self.head.into() | rhs + } +} + #[derive(Clone)] pub struct SchemaFlagList { pub head: Head, diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 1cd4b72436..eabd1c5a12 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -124,6 +124,7 @@ mod ip_options; mod json_object_options; mod named_field_document; mod numeric_options; +mod spatial_options; mod text_options; use columnar::ColumnType; @@ -144,6 +145,7 @@ pub use self::json_object_options::JsonObjectOptions; pub use self::named_field_document::NamedFieldDocument; pub use self::numeric_options::NumericOptions; pub use self::schema::{Schema, SchemaBuilder}; +pub use self::spatial_options::{SpatialOptions, SPATIAL}; pub use self::term::{Term, ValueBytes}; pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT}; @@ -168,6 +170,7 @@ pub(crate) fn value_type_to_column_type(typ: Type) -> Option { Type::Bytes => Some(ColumnType::Bytes), Type::IpAddr => Some(ColumnType::IpAddr), 
Type::Json => None, + Type::Spatial => None, } } diff --git a/src/schema/schema.rs b/src/schema/schema.rs index c1d22c0baa..24069870db 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -194,6 +194,16 @@ impl SchemaBuilder { self.add_field(field_entry) } + /// Adds a spatial entry to the schema in build. + pub fn add_spatial_field>( + &mut self, + field_name: &str, + field_options: T, + ) -> Field { + let field_entry = FieldEntry::new_spatial(field_name.to_string(), field_options.into()); + self.add_field(field_entry) + } + /// Adds a field entry to the schema in build. pub fn add_field(&mut self, field_entry: FieldEntry) -> Field { let field = Field::from_field_id(self.fields.len() as u32); @@ -208,9 +218,14 @@ impl SchemaBuilder { /// Finalize the creation of a `Schema` /// This will consume your `SchemaBuilder` pub fn build(self) -> Schema { + let contains_spatial_field = self + .fields + .iter() + .any(|field_entry| field_entry.field_type().value_type() == Type::Spatial); Schema(Arc::new(InnerSchema { fields: self.fields, fields_map: self.fields_map, + contains_spatial_field, })) } } @@ -218,6 +233,7 @@ impl SchemaBuilder { struct InnerSchema { fields: Vec, fields_map: HashMap, // transient + contains_spatial_field: bool, } impl PartialEq for InnerSchema { @@ -368,6 +384,11 @@ impl Schema { } Some((field, json_path)) } + + /// Returns true if the schema contains a spatial field. + pub(crate) fn contains_spatial_field(&self) -> bool { + self.0.contains_spatial_field + } } impl Serialize for Schema { @@ -395,16 +416,16 @@ impl<'de> Deserialize<'de> for Schema { fn visit_seq(self, mut seq: A) -> Result where A: SeqAccess<'de> { - let mut schema = SchemaBuilder { + let mut schema_builder = SchemaBuilder { fields: Vec::with_capacity(seq.size_hint().unwrap_or(0)), fields_map: HashMap::with_capacity(seq.size_hint().unwrap_or(0)), }; while let Some(value) = seq.next_element()? { - schema.add_field(value); + schema_builder.add_field(value); } - Ok(schema.build()) + Ok(schema_builder.build()) } } @@ -1020,4 +1041,33 @@ mod tests { Some((default, "foobar")) ); } + + #[test] + fn test_contains_spatial_field() { + // No spatial field + { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("title", TEXT); + let schema = schema_builder.build(); + assert!(!schema.contains_spatial_field()); + + // Serialization check + let schema_json = serde_json::to_string(&schema).unwrap(); + let schema_deserialized: Schema = serde_json::from_str(&schema_json).unwrap(); + assert!(!schema_deserialized.contains_spatial_field()); + } + // With spatial field + { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("title", TEXT); + schema_builder.add_spatial_field("location", SPATIAL); + let schema = schema_builder.build(); + assert!(schema.contains_spatial_field()); + + // Serialization check + let schema_json = serde_json::to_string(&schema).unwrap(); + let schema_deserialized: Schema = serde_json::from_str(&schema_json).unwrap(); + assert!(schema_deserialized.contains_spatial_field()); + } + } } diff --git a/src/schema/spatial_options.rs b/src/schema/spatial_options.rs new file mode 100644 index 0000000000..248a5500fc --- /dev/null +++ b/src/schema/spatial_options.rs @@ -0,0 +1,53 @@ +use std::ops::BitOr; + +use serde::{Deserialize, Serialize}; + +use crate::schema::flags::StoredFlag; + +/// Define how a spatial field should be handled by tantivy. 
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, Default)] +pub struct SpatialOptions { + #[serde(default)] + stored: bool, +} + +/// The field will be untokenized and indexed. +pub const SPATIAL: SpatialOptions = SpatialOptions { stored: false }; + +impl SpatialOptions { + /// Returns true if the geometry is to be stored. + #[inline] + pub fn is_stored(&self) -> bool { + self.stored + } +} + +impl> BitOr for SpatialOptions { + type Output = SpatialOptions; + + fn bitor(self, other: T) -> SpatialOptions { + let other = other.into(); + SpatialOptions { + stored: self.stored | other.stored, + } + } +} + +impl From for SpatialOptions { + fn from(_: StoredFlag) -> SpatialOptions { + SpatialOptions { stored: true } + } +} + +// #[cfg(test)] +// mod tests { +// use crate::schema::*; +// +// #[test] +// fn test_field_options() { +// let field_options = STORED | SPATIAL; +// assert!(field_options.is_stored()); +// let mut schema_builder = Schema::builder(); +// schema_builder.add_spatial_index("where", SPATIAL | STORED); +// } +// } diff --git a/src/schema/term.rs b/src/schema/term.rs index 2dd78b82ae..4414fe0890 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -503,6 +503,9 @@ where B: AsRef<[u8]> Type::IpAddr => { write_opt(f, self.as_ip_addr())?; } + Type::Spatial => { + write!(f, "")?; + } } Ok(()) } diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs index 70291153ad..118badd147 100644 --- a/src/space_usage/mod.rs +++ b/src/space_usage/mod.rs @@ -69,6 +69,7 @@ pub struct SegmentSpaceUsage { positions: PerFieldSpaceUsage, fast_fields: PerFieldSpaceUsage, fieldnorms: PerFieldSpaceUsage, + spatial: PerFieldSpaceUsage, store: StoreSpaceUsage, @@ -86,6 +87,7 @@ impl SegmentSpaceUsage { positions: PerFieldSpaceUsage, fast_fields: PerFieldSpaceUsage, fieldnorms: PerFieldSpaceUsage, + spatial: PerFieldSpaceUsage, store: StoreSpaceUsage, deletes: ByteCount, ) -> SegmentSpaceUsage { @@ -94,6 +96,7 @@ impl SegmentSpaceUsage { + positions.total() + fast_fields.total() + fieldnorms.total() + + spatial.total() + store.total() + deletes; SegmentSpaceUsage { @@ -103,6 +106,7 @@ impl SegmentSpaceUsage { positions, fast_fields, fieldnorms, + spatial, store, deletes, total, @@ -121,6 +125,7 @@ impl SegmentSpaceUsage { Positions => PerField(self.positions().clone()), FastFields => PerField(self.fast_fields().clone()), FieldNorms => PerField(self.fieldnorms().clone()), + Spatial => PerField(self.spatial().clone()), Terms => PerField(self.termdict().clone()), SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()), SegmentComponent::TempStore => ComponentSpaceUsage::Store(self.store().clone()), @@ -158,6 +163,11 @@ impl SegmentSpaceUsage { &self.fieldnorms } + /// Space usage for field norms + pub fn spatial(&self) -> &PerFieldSpaceUsage { + &self.spatial + } + /// Space usage for stored documents pub fn store(&self) -> &StoreSpaceUsage { &self.store diff --git a/src/spatial/bkd.rs b/src/spatial/bkd.rs new file mode 100644 index 0000000000..f4d0829cd3 --- /dev/null +++ b/src/spatial/bkd.rs @@ -0,0 +1,853 @@ +//! Block kd-tree spatial indexing for triangulated polygons. +//! +//! Implements an immutable bulk-loaded spatial index using recursive median partitioning on +//! bounding box dimensions. Each leaf stores up to 512 triangles with delta-compressed coordinates +//! and doc IDs. The tree provides three query types (intersects, within, contains) that use exact +//! integer arithmetic for geometric predicates and accumulate results in bit sets for efficient +//! 
deduplication across leaves. +//! +//! The serialized format stores compressed leaf pages followed by the tree structure (leaf and +//! branch nodes), enabling zero-copy access through memory-mapped segments without upfront +//! decompression. +use std::io; +use std::io::Write; + +use common::{BitSet, CountingWriter}; + +use crate::directory::WritePtr; +use crate::spatial::delta::{compress, decompress, Compressible}; +use crate::spatial::triangle::Triangle; + +#[derive(Clone, Copy)] +struct SpreadSurvey { + min: i32, + max: i32, +} + +impl SpreadSurvey { + fn survey(&mut self, value: i32) { + self.min = self.min.min(value); + self.max = self.max.max(value); + } + fn spread(&self) -> i32 { + self.max - self.min + } +} + +impl Default for SpreadSurvey { + fn default() -> Self { + SpreadSurvey { + min: i32::MAX, + max: i32::MIN, + } + } +} + +#[derive(Clone, Copy)] +struct BoundingBoxSurvey { + bbox: [i32; 4], +} + +impl BoundingBoxSurvey { + fn survey(&mut self, triangle: &Triangle) { + self.bbox[0] = triangle.words[0].min(self.bbox[0]); + self.bbox[1] = triangle.words[1].min(self.bbox[1]); + self.bbox[2] = triangle.words[2].max(self.bbox[2]); + self.bbox[3] = triangle.words[3].max(self.bbox[3]); + } + fn bbox(&self) -> [i32; 4] { + self.bbox.clone() + } +} + +impl Default for BoundingBoxSurvey { + fn default() -> Self { + BoundingBoxSurvey { + bbox: [i32::MAX, i32::MAX, i32::MIN, i32::MIN], + } + } +} + +enum BuildNode { + Branch { + bbox: [i32; 4], + left: Box, + right: Box, + }, + Leaf { + bbox: [i32; 4], + pos: u64, + len: u16, + }, +} + +struct CompressibleTriangleI32<'a> { + triangles: &'a [Triangle], + dimension: usize, +} + +impl<'a> CompressibleTriangleI32<'a> { + fn new(triangles: &'a [Triangle], dimension: usize) -> Self { + CompressibleTriangleI32 { + triangles, + dimension, + } + } +} + +impl<'a> Compressible for CompressibleTriangleI32<'a> { + type Value = i32; + fn len(&self) -> usize { + self.triangles.len() + } + fn get(&self, i: usize) -> i32 { + self.triangles[i].words[self.dimension] + } +} + +struct CompressibleTriangleDocID<'a> { + triangles: &'a [Triangle], +} + +impl<'a> CompressibleTriangleDocID<'a> { + fn new(triangles: &'a [Triangle]) -> Self { + CompressibleTriangleDocID { triangles } + } +} + +impl<'a> Compressible for CompressibleTriangleDocID<'a> { + type Value = u32; + fn len(&self) -> usize { + self.triangles.len() + } + fn get(&self, i: usize) -> u32 { + self.triangles[i].doc_id + } +} + +// Leaf pages are first the count of triangles, followed by delta encoded doc_ids, followed by +// the delta encoded words in order. We will then have the length of the page. We build a tree +// after the pages with leaf nodes and branch nodes. Leaf nodes will contain the bounding box +// of the leaf followed position and length of the page. The leaf node is a level of direction +// to store the position and length of the page in a format that is easy to read directly from +// the mapping. + +// We do not compress the tree nodes. We read them directly from the mapping. + +// +fn write_leaf_pages( + triangles: &mut [Triangle], + write: &mut CountingWriter, +) -> io::Result { + // If less than 512 triangles we are at a leaf, otherwise we still in the inner nodes. 
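// Worked example of the split-dimension choice made below: if `words[0]` across a node's
// triangles takes the values {3, 9, 4}, its spread is 9 - 3 = 6; the dimension with the
// largest spread is the one a leaf is sorted by and an inner node is partitioned on.
let mut survey = SpreadSurvey::default();
for value in [3, 9, 4] {
    survey.survey(value);
}
assert_eq!(survey.spread(), 6);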
+ if triangles.len() <= 512 { + let pos = write.written_bytes(); + let mut spreads = [SpreadSurvey::default(); 4]; + let mut bounding_box = BoundingBoxSurvey::default(); + for triangle in triangles.iter() { + for i in 0..4 { + spreads[i].survey(triangle.words[i]); + } + bounding_box.survey(triangle); + } + let mut max_spread = spreads[0].spread(); + let mut dimension = 0; + for i in 1..4 { + let current_spread = spreads[i].spread(); + if current_spread > max_spread { + dimension = i; + max_spread = current_spread; + } + } + write.write_all(&(triangles.len() as u16).to_le_bytes())?; + triangles.sort_by_key(|t| t.words[dimension]); + compress(&CompressibleTriangleDocID::new(triangles), write)?; + let compressible = [ + CompressibleTriangleI32::new(triangles, 0), + CompressibleTriangleI32::new(triangles, 1), + CompressibleTriangleI32::new(triangles, 2), + CompressibleTriangleI32::new(triangles, 3), + CompressibleTriangleI32::new(triangles, 4), + CompressibleTriangleI32::new(triangles, 5), + CompressibleTriangleI32::new(triangles, 6), + ]; + for i in 0..7 { + compress(&compressible[i], write)?; + } + let len = write.written_bytes() - pos; + Ok(BuildNode::Leaf { + bbox: bounding_box.bbox(), + pos, + len: len as u16, + }) + } else { + let mut spreads = [SpreadSurvey::default(); 4]; + let mut bounding_box = BoundingBoxSurvey::default(); + for triangle in triangles.iter() { + for i in 0..4 { + spreads[i].survey(triangle.words[i]); + } + bounding_box.survey(triangle); + } + let mut max_spread = spreads[0].spread(); + let mut dimension = 0; + for i in 0..4 { + let current_spread = spreads[i].spread(); + if current_spread > max_spread { + dimension = i; + max_spread = current_spread; + } + } + // Partition the triangles. + let mid = triangles.len() / 2; + triangles.select_nth_unstable_by_key(mid, |t| t.words[dimension]); + let partition = triangles[mid].words[dimension]; + let mut split_point = mid + 1; + while split_point < triangles.len() && triangles[split_point].words[dimension] == partition + { + split_point += 1; + } + // If we reached the end of triangles then all of the triangles share the partition value + // for the dimension. We handle this degeneracy by splitting at the midpoint so that we + // won't have a leaf with zero triangles. + if split_point == triangles.len() { + split_point = mid; // Force split at midpoint index + } else { + // Our partition does not sort the triangles, it only partitions. We have scan our right + // partition to find all the midpoint values and move them to the left partition. + let mut reverse = triangles.len() - 1; + loop { + // Scan backwards looking for the partition value. + while triangles[reverse].words[dimension] != partition { + reverse -= 1; + } + // If we have reached the split point then we are done. + if reverse <= split_point { + break; + } + // Swap the midpoint value with our current split point. + triangles.swap(split_point, reverse); + // Move the split point up one. + split_point += 1; + // We know that what was at the split point was not the midpoint value. + reverse -= 1; + } + } + // Split into left and write partitions and create child nodes. + let (left, right) = triangles.split_at_mut(split_point); + let left_node = write_leaf_pages(left, write)?; + let right_node = write_leaf_pages(right, write)?; + // Return an inner node. 
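// Worked example of the duplicate handling above, on plain integer keys:
// `select_nth_unstable` only guarantees "<= pivot" on the left and ">= pivot" on the
// right, so copies of the pivot can remain in the right half. The backwards sweep swaps
// them across `split_point` so every copy of the pivot lands in the left child and the
// right child only ever sees strictly larger keys.
let mut keys = [7, 5, 9, 5, 8, 5, 7];
let mid = keys.len() / 2; // 3
keys.select_nth_unstable(mid);
let pivot = keys[mid]; // 7; the second 7 may still sit anywhere in keys[4..]
// The sweep then moves that remaining 7 left, so the right partition is {8, 9}.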
+ Ok(BuildNode::Branch { + bbox: bounding_box.bbox(), + left: Box::new(left_node), + right: Box::new(right_node), + }) + } +} + +fn write_leaf_nodes(node: &BuildNode, write: &mut CountingWriter) -> io::Result<()> { + match node { + BuildNode::Branch { + bbox: _, + left, + right, + } => { + write_leaf_nodes(right, write)?; + write_leaf_nodes(left, write)?; + } + BuildNode::Leaf { bbox, pos, len } => { + for &dimension in bbox.iter() { + write.write_all(&dimension.to_le_bytes())?; + } + write.write_all(&pos.to_le_bytes())?; + write.write_all(&len.to_le_bytes())?; + write.write_all(&[0u8; 6])?; + } + } + Ok(()) +} + +fn write_branch_nodes( + node: &BuildNode, + branch_offset: &mut i32, + leaf_offset: &mut i32, + write: &mut CountingWriter, +) -> io::Result { + match node { + BuildNode::Leaf { .. } => { + let pos = *leaf_offset; + *leaf_offset -= 1; + Ok(pos * size_of::() as i32) + } + BuildNode::Branch { bbox, left, right } => { + let left = write_branch_nodes(left, branch_offset, leaf_offset, write)?; + let right = write_branch_nodes(right, branch_offset, leaf_offset, write)?; + for &val in bbox { + write.write_all(&val.to_le_bytes())?; + } + write.write_all(&left.to_le_bytes())?; + write.write_all(&right.to_le_bytes())?; + write.write_all(&[0u8; 8])?; + let pos = *branch_offset; + *branch_offset += 1; + Ok(pos * size_of::() as i32) + } + } +} + +const VERSION: u16 = 1u16; + +/// Builds and serializes a block kd-tree for spatial indexing of triangles. +/// +/// Takes a collection of triangles and constructs a complete block kd-tree, writing both the +/// compressed leaf pages and tree structure to the output. The tree uses recursive median +/// partitioning on the dimension with maximum spread, storing up to 512 triangles per leaf. +/// +/// The output format consists of: +/// - Version header (u16) +/// - Compressed leaf pages (delta-encoded doc_ids and triangle coordinates) +/// - 32-byte aligned tree structure (leaf nodes, then branch nodes) +/// - Footer with triangle count, root offset, and branch position +/// +/// The `triangles` slice will be reordered during tree construction as partitioning sorts by the +/// selected dimension at each level. +pub fn write_block_kd_tree( + triangles: &mut [Triangle], + write: &mut CountingWriter, +) -> io::Result<()> { + write.write_all(&VERSION.to_le_bytes())?; + + let tree = write_leaf_pages(triangles, write)?; + let current = write.written_bytes(); + let aligned = current.next_multiple_of(32); + let padding = aligned - current; + write.write_all(&vec![0u8; padding as usize])?; + + write_leaf_nodes(&tree, write)?; + let branch_position = write.written_bytes(); + let mut branch_offset: i32 = 0; + let mut leaf_offset: i32 = -1; + let root = write_branch_nodes(&tree, &mut branch_offset, &mut leaf_offset, write)?; + write.write_all(&[0u8; 12])?; + write.write_all(&triangles.len().to_le_bytes())?; + write.write_all(&root.to_le_bytes())?; + write.write_all(&branch_position.to_le_bytes())?; + Ok(()) +} + +fn decompress_leaf(mut data: &[u8]) -> io::Result> { + use common::BinarySerializable; + let triangle_count: usize = u16::deserialize(&mut data)? 
as usize; + let mut offset: usize = 0; + let mut triangles: Vec = Vec::with_capacity(triangle_count); + offset += decompress::(&data[offset..], triangle_count, |_, doc_id| { + triangles.push(Triangle::skeleton(doc_id)) + })?; + for i in 0..7 { + offset += decompress::(&data[offset..], triangle_count, |j, word| { + triangles[j].words[i] = word + })?; + } + Ok(triangles) +} + +#[repr(C)] +struct BranchNode { + bbox: [i32; 4], + left: i32, + right: i32, + pad: [u8; 8], +} + +#[repr(C)] +struct LeafNode { + bbox: [i32; 4], + pos: u64, + len: u16, + pad: [u8; 6], +} + +/// A read-only view into a serialized block kd-tree segment. +/// +/// Provides access to the tree structure and compressed leaf data through memory-mapped or +/// buffered byte slices. The segment contains compressed leaf pages followed by the tree structure +/// (leaf nodes and branch nodes), with a footer containing metadata for locating the root and +/// interpreting offsets. +pub struct Segment<'a> { + data: &'a [u8], + branch_position: u64, + /// Offset to the root of the tree, used as the starting point for traversal. + pub root_offset: i32, +} + +impl<'a> Segment<'a> { + /// Creates a new segment from serialized block kd-tree data. + /// + /// Reads the footer metadata from the last 12 bytes to locate the tree structure and root + /// node. + pub fn new(data: &'a [u8]) -> Self { + Segment { + data, + branch_position: u64::from_le_bytes(data[data.len() - 8..].try_into().unwrap()), + root_offset: i32::from_le_bytes( + data[data.len() - 12..data.len() - 8].try_into().unwrap(), + ), + } + } + #[inline(always)] + fn bounding_box(&self, offset: i32) -> [i32; 4] { + let byte_offset = (self.branch_position as i64 + offset as i64) as usize; + let bytes = &self.data[byte_offset..byte_offset + 16]; + [ + i32::from_le_bytes(bytes[0..4].try_into().unwrap()), + i32::from_le_bytes(bytes[4..8].try_into().unwrap()), + i32::from_le_bytes(bytes[8..12].try_into().unwrap()), + i32::from_le_bytes(bytes[12..16].try_into().unwrap()), + ] + } + #[inline(always)] + fn branch_node(&self, offset: i32) -> BranchNode { + let byte_offset = (self.branch_position as i64 + offset as i64) as usize; + let bytes = &self.data[byte_offset..byte_offset + 32]; + BranchNode { + bbox: [ + i32::from_le_bytes(bytes[0..4].try_into().unwrap()), + i32::from_le_bytes(bytes[4..8].try_into().unwrap()), + i32::from_le_bytes(bytes[8..12].try_into().unwrap()), + i32::from_le_bytes(bytes[12..16].try_into().unwrap()), + ], + left: i32::from_le_bytes(bytes[16..20].try_into().unwrap()), + right: i32::from_le_bytes(bytes[20..24].try_into().unwrap()), + pad: [0u8; 8], + } + } + #[inline(always)] + fn leaf_node(&self, offset: i32) -> LeafNode { + let byte_offset = (self.branch_position as i64 + offset as i64) as usize; + let bytes = &self.data[byte_offset..byte_offset + 32]; + LeafNode { + bbox: [ + i32::from_le_bytes(bytes[0..4].try_into().unwrap()), + i32::from_le_bytes(bytes[4..8].try_into().unwrap()), + i32::from_le_bytes(bytes[8..12].try_into().unwrap()), + i32::from_le_bytes(bytes[12..16].try_into().unwrap()), + ], + pos: u64::from_le_bytes(bytes[16..24].try_into().unwrap()), + len: u16::from_le_bytes(bytes[24..26].try_into().unwrap()), + pad: [0u8; 6], + } + } + fn leaf_page(&self, leaf_node: &LeafNode) -> &[u8] { + &self.data[(leaf_node.pos as usize)..(leaf_node.pos as usize + leaf_node.len as usize)] + } +} + +fn collect_all_docs(segment: &Segment, offset: i32, result: &mut BitSet) -> io::Result<()> { + if offset < 0 { + let leaf_node = segment.leaf_node(offset); + let 
data = segment.leaf_page(&leaf_node); + let count = u16::from_le_bytes([data[0], data[1]]) as usize; + decompress::(&data[2..], count, |_, doc_id| result.insert(doc_id))?; + } else { + let branch_node = segment.branch_node(offset); + collect_all_docs(segment, branch_node.left, result)?; + collect_all_docs(segment, branch_node.right, result)?; + } + Ok(()) +} + +fn bbox_within(bbox: &[i32; 4], query: &[i32; 4]) -> bool { + bbox[0] >= query[0] && // min_y >= query_min_y + bbox[1] >= query[1] && // min_x >= query_min_x + bbox[2] <= query[2] && // max_y <= query_max_y + bbox[3] <= query[3] // max_x <= query_max_x +} + +fn bbox_intersects(bbox: &[i32; 4], query: &[i32; 4]) -> bool { + !(bbox[2] < query[0] || bbox[0] > query[2] || bbox[3] < query[1] || bbox[1] > query[3]) +} + +/// Finds documents with triangles that intersect the query bounding box. +/// +/// Traverses the tree starting at `offset` (typically `segment.root_offset`), pruning subtrees +/// whose bounding boxes don't intersect the query. When a node's bbox is entirely within the +/// query, all its documents are bulk-collected. Otherwise, individual triangles are tested using +/// exact geometric predicates. +/// +/// The query is `[min_y, min_x, max_y, max_x]` in integer coordinates. Documents are inserted into +/// the `result` BitSet, which automatically deduplicates when the same document appears in +/// multiple leaves. +pub fn search_intersects( + segment: &Segment, + offset: i32, + query: &[i32; 4], + result: &mut BitSet, +) -> io::Result<()> { + let bbox = segment.bounding_box(offset); + // bbox doesn't intersect query → skip entire subtree + if !bbox_intersects(&bbox, query) { + } + // bbox entirely within query → all triangles intersect + else if bbox_within(&bbox, query) { + collect_all_docs(segment, offset, result)?; + } else if offset < 0 { + // bbox crosses query → test each triangle + let leaf_node = segment.leaf_node(offset); + let triangles = decompress_leaf(segment.leaf_page(&leaf_node))?; + for triangle in &triangles { + if triangle_intersects(triangle, query) { + result.insert(triangle.doc_id); // BitSet deduplicates + } + } + } else { + let branch_node = segment.branch_node(offset); + // bbox crosses query → must check children + search_intersects(segment, branch_node.left, query, result)?; + search_intersects(segment, branch_node.right, query, result)?; + } + Ok(()) +} + +#[expect(clippy::too_many_arguments)] +fn line_intersects_line( + x1: i32, + y1: i32, + x2: i32, + y2: i32, + x3: i32, + y3: i32, + x4: i32, + y4: i32, +) -> bool { + // Cast to i128 to prevent overflow in coordinate arithmetic + let x1 = x1 as i128; + let y1 = y1 as i128; + let x2 = x2 as i128; + let y2 = y2 as i128; + let x3 = x3 as i128; + let y3 = y3 as i128; + let x4 = x4 as i128; + let y4 = y4 as i128; + + // Proper segment-segment intersection test + let d = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4); + if d == 0 { + // parallel + return false; + } + + let t = (x1 - x3) * (y3 - y4) - (y1 - y3) * (x3 - x4); + let u = -((x1 - x2) * (y1 - y3) - (y1 - y2) * (x1 - x3)); + + if d > 0 { + t >= 0 && t <= d && u >= 0 && u <= d + } else { + t <= 0 && t >= d && u <= 0 && u >= d + } +} + +fn edge_intersects_bbox(x1: i32, y1: i32, x2: i32, y2: i32, bbox: &[i32; 4]) -> bool { + // Test against all 4 rectangle edges, bottom, right, top, left. 
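+    // bbox is [min_y, min_x, max_y, max_x]; each call below passes one rectangle edge
+    // as (x, y) endpoint pairs built from those corners.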
+ line_intersects_line(x1, y1, x2, y2, bbox[1], bbox[0], bbox[3], bbox[0]) + || line_intersects_line(x1, y1, x2, y2, bbox[3], bbox[0], bbox[3], bbox[2]) + || line_intersects_line(x1, y1, x2, y2, bbox[3], bbox[2], bbox[1], bbox[2]) + || line_intersects_line(x1, y1, x2, y2, bbox[1], bbox[2], bbox[1], bbox[0]) +} + +fn edge_crosses_bbox(x1: i32, y1: i32, x2: i32, y2: i32, bbox: &[i32; 4]) -> bool { + // Edge has endpoint outside while other is inside (crosses boundary) + let p1_inside = y1 >= bbox[0] && x1 >= bbox[1] && y1 <= bbox[2] && x1 <= bbox[3]; + let p2_inside = y2 >= bbox[0] && x2 >= bbox[1] && y2 <= bbox[2] && x2 <= bbox[3]; + p1_inside != p2_inside +} + +fn triangle_within(triangle: &Triangle, query: &[i32; 4]) -> bool { + let tri_bbox = &triangle.words[0..4]; + + // Triangle bbox entirely within query → WITHIN + if tri_bbox[0] >= query[0] + && tri_bbox[1] >= query[1] + && tri_bbox[2] <= query[2] + && tri_bbox[3] <= query[3] + { + return true; + } + + // Triangle bbox entirely outside → NOT WITHIN + if tri_bbox[2] < query[0] + || tri_bbox[3] < query[1] + || tri_bbox[0] > query[2] + || tri_bbox[1] > query[3] + { + return false; + } + + // Decode vertices. + let ([ay, ax, by, bx, cy, cx], [ab, bc, ca]) = triangle.decode(); + + // Check each edge - if boundary edge crosses query bbox, NOT WITHIN + if ab && edge_crosses_bbox(ax, ay, bx, by, query) { + return false; + } + if bc && edge_crosses_bbox(bx, by, cx, cy, query) { + return false; + } + if ca && edge_crosses_bbox(cx, cy, ax, ay, query) { + return false; + } + + // No boundary edges cross out + true +} + +#[expect(clippy::too_many_arguments)] +fn point_in_triangle( + px: i32, + py: i32, + ax: i32, + ay: i32, + bx: i32, + by: i32, + cx: i32, + cy: i32, +) -> bool { + let v0x = (cx - ax) as i128; + let v0y = (cy - ay) as i128; + let v1x = (bx - ax) as i128; + let v1y = (by - ay) as i128; + let v2x = (px - ax) as i128; + let v2y = (py - ay) as i128; + + let dot00 = v0x * v0x + v0y * v0y; + let dot01 = v0x * v1x + v0y * v1y; + let dot02 = v0x * v2x + v0y * v2y; + let dot11 = v1x * v1x + v1y * v1y; + let dot12 = v1x * v2x + v1y * v2y; + + let denom = dot00 * dot11 - dot01 * dot01; + if denom == 0 { + return false; + } + + let u = dot11 * dot02 - dot01 * dot12; + let v = dot00 * dot12 - dot01 * dot02; + + u >= 0 && v >= 0 && u + v <= denom +} + +fn triangle_intersects(triangle: &Triangle, query: &[i32; 4]) -> bool { + let tri_bbox = &triangle.words[0..4]; + + // Quick reject: bboxes don't overlap + if tri_bbox[2] < query[0] + || tri_bbox[3] < query[1] + || tri_bbox[0] > query[2] + || tri_bbox[1] > query[3] + { + return false; + } + + let ([ay, ax, by, bx, cy, cx], _) = triangle.decode(); + + // Any triangle vertex inside rectangle? + if (ax >= query[1] && ax <= query[3] && ay >= query[0] && ay <= query[2]) + || (bx >= query[1] && bx <= query[3] && by >= query[0] && by <= query[2]) + || (cx >= query[1] && cx <= query[3] && cy >= query[0] && cy <= query[2]) + { + return true; + } + + // Any rectangle corner inside triangle? + let corners = [ + (query[1], query[0]), // min_x, min_y + (query[3], query[0]), // max_x, min_y + (query[3], query[2]), // max_x, max_y + (query[1], query[2]), // min_x, max_y + ]; + for (x, y) in corners { + if point_in_triangle(x, y, ax, ay, bx, by, cx, cy) { + return true; + } + } + + // Any triangle edge intersect rectangle edges? 
+ edge_intersects_bbox(ax, ay, bx, by, query) + || edge_intersects_bbox(bx, by, cx, cy, query) + || edge_intersects_bbox(cx, cy, ax, ay, query) +} + +/// Finds documents where all triangles are within the query bounding box. +/// +/// Traverses the tree starting at `offset` (typically `segment.root_offset`), testing each +/// triangle to determine if it lies entirely within the query bounds. Uses two `BitSet` instances +/// to track state: `result` accumulates candidate documents, while `excluded` marks documents that +/// have at least one triangle extending outside the query. +/// +/// The query is `[min_y, min_x, max_y, max_x]` in integer coordinates. The final result is +/// documents in `result` that are NOT in `excluded` - the caller must compute this difference. +pub fn search_within( + segment: &Segment, + offset: i32, + query: &[i32; 4], // [min_y, min_x, max_y, max_x] + result: &mut BitSet, + excluded: &mut BitSet, +) -> io::Result<()> { + let bbox = segment.bounding_box(offset); + if !bbox_intersects(&bbox, query) { + } else if offset < 0 { + let leaf_node = segment.leaf_node(offset); + // bbox crosses query → test each triangle + let triangles = decompress_leaf(segment.leaf_page(&leaf_node))?; + for triangle in &triangles { + if triangle_intersects(triangle, query) { + if excluded.contains(triangle.doc_id) { + continue; // Already excluded + } + if triangle_within(triangle, query) { + result.insert(triangle.doc_id); + } else { + excluded.insert(triangle.doc_id); + } + } + } + } else { + let branch_node = segment.branch_node(offset); + search_within(segment, branch_node.left, query, result, excluded)?; + search_within(segment, branch_node.right, query, result, excluded)?; + } + Ok(()) +} + +enum ContainsRelation { + CANDIDATE, // Query might be contained + NOTWITHIN, // Query definitely not contained + DISJOINT, // Triangle doesn't overlap query +} + +fn triangle_contains_relation(triangle: &Triangle, query: &[i32; 4]) -> ContainsRelation { + let tri_bbox = &triangle.words[0..4]; + + if query[2] < tri_bbox[0] + || query[3] < tri_bbox[1] + || query[0] > tri_bbox[2] + || query[1] > tri_bbox[3] + { + return ContainsRelation::DISJOINT; + } + + let ([ay, ax, by, bx, cy, cx], [ab, bc, ca]) = triangle.decode(); + + let corners = [ + (query[1], query[0]), + (query[3], query[0]), + (query[3], query[2]), + (query[1], query[2]), + ]; + + let mut any_corner_inside = false; + for &(qx, qy) in &corners { + if point_in_triangle(qx, qy, ax, ay, bx, by, cx, cy) { + any_corner_inside = true; + break; + } + } + + let ab_intersects = edge_intersects_bbox(ax, ay, bx, by, query); + let bc_intersects = edge_intersects_bbox(bx, by, cx, cy, query); + let ca_intersects = edge_intersects_bbox(cx, cy, ax, ay, query); + + if (ab && edge_crosses_bbox(ax, ay, bx, by, query)) + || (bc && edge_crosses_bbox(bx, by, cx, cy, query)) + || (ca && edge_crosses_bbox(cx, cy, ax, ay, query)) + { + return ContainsRelation::NOTWITHIN; + } + + if any_corner_inside || ab_intersects || bc_intersects || ca_intersects { + return ContainsRelation::CANDIDATE; + } + + ContainsRelation::DISJOINT +} + +/// Finds documents whose polygons contain the query bounding box. +/// +/// Traverses the tree starting at `offset` (typically `segment.root_offset`), testing each +/// triangle using three-state logic: `CANDIDATE` (query might be contained), `NOTWITHIN` (boundary +/// edge crosses query), or `DISJOINT` (no overlap). Only boundary edges are tested for crossing - +/// internal tessellation edges are ignored. 
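+///
+/// A minimal caller sketch (illustrative only; `segment`, `query`, and `max_doc` are assumed to
+/// be in scope, and `BitSet::with_max_value` is used here for construction):
+///
+/// ```ignore
+/// let mut result = BitSet::with_max_value(max_doc);
+/// let mut excluded = BitSet::with_max_value(max_doc);
+/// search_contains(&segment, segment.root_offset, &query, &mut result, &mut excluded)?;
+/// // Matching documents are those present in `result` and absent from `excluded`.
+/// ```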
+/// +/// The query is `[min_y, min_x, max_y, max_x]` in integer coordinates. Uses two `BitSet` +/// instances: `result` accumulates candidates, `excluded` marks documents with disqualifying +/// boundary crossings. The final result is documents in `result` that are NOT in `excluded`. +pub fn search_contains( + segment: &Segment, + offset: i32, + query: &[i32; 4], + result: &mut BitSet, + excluded: &mut BitSet, +) -> io::Result<()> { + let bbox = segment.bounding_box(offset); + if !bbox_intersects(&bbox, query) { + } else if offset < 0 { + let leaf_node = segment.leaf_node(offset); + // bbox crosses query → test each triangle + let triangles = decompress_leaf(segment.leaf_page(&leaf_node))?; + for triangle in &triangles { + if triangle_intersects(triangle, query) { + let doc_id = triangle.doc_id; + if excluded.contains(doc_id) { + continue; + } + match triangle_contains_relation(triangle, query) { + ContainsRelation::CANDIDATE => result.insert(doc_id), + ContainsRelation::NOTWITHIN => excluded.insert(doc_id), + ContainsRelation::DISJOINT => {} + } + } + } + } else { + let branch_node = segment.branch_node(offset); + search_contains(segment, branch_node.left, query, result, excluded)?; + search_contains(segment, branch_node.right, query, result, excluded)?; + } + Ok(()) +} + +/// HUSH +pub struct LeafPageIterator<'a> { + segment: &'a Segment<'a>, + descent_stack: Vec, +} + +impl<'a> LeafPageIterator<'a> { + /// HUSH + pub fn new(segment: &'a Segment<'a>) -> Self { + Self { + segment, + descent_stack: vec![segment.root_offset], + } + } +} + +impl<'a> Iterator for LeafPageIterator<'a> { + type Item = io::Result>; + + fn next(&mut self) -> Option { + let offset = self.descent_stack.pop()?; + if offset < 0 { + let leaf_node = self.segment.leaf_node(offset); + let leaf_page = self.segment.leaf_page(&leaf_node); + match decompress_leaf(&leaf_page) { + Ok(triangles) => Some(Ok(triangles)), + Err(e) => Some(Err(e)), + } + } else { + let branch_node = self.segment.branch_node(offset); + self.descent_stack.push(branch_node.right); + self.descent_stack.push(branch_node.left); + self.next() + } + } +} diff --git a/src/spatial/delta.rs b/src/spatial/delta.rs new file mode 100644 index 0000000000..8a9d450459 --- /dev/null +++ b/src/spatial/delta.rs @@ -0,0 +1,300 @@ +//! Delta compression for block kd-tree leaves. +//! +//! Delta compression with dimension-major bit-packing for block kd-tree leaves. Each leaf contains +//! ≤512 triangles sorted by the split dimension (the dimension with maximum spread chosen during +//! tree construction). We store all 512 values for dimension 0, then all for dimension 1, etc., +//! enabling tight bit-packing per dimension and better cache locality during decode. +//! +//! The split dimension is already optimal for compression. Since triangles in a leaf are spatially +//! clustered, sorting by the max-spread dimension naturally orders them by proximity in all +//! dimensions. Testing multiple sort orders would be wasted effort. +//! +//! Our encoding uses ~214 units/meter for latitude, ~107 units/meter for longitude (millimeter +//! precision). A quarter-acre lot (32m × 32m) spans ~6,850 units across 512 sorted triangles = avg +//! delta ~13 units = 4 bits. A baseball field (100m × 100m) is ~42 unit deltas = 6 bits. Even +//! Russia-sized polygons (1000 km) average ~418,000 unit deltas = 19 bits. Time will tell if these +//! numbers are anything to go by in practice. +//! +//! 
Our format for use with leaf-page triangles: First a count of triangles in the page, then the +//! delta encoded doc_ids followed by delta encoding of each series of the triangle dimensions, +//! followed by delta encoding of the flags. Creates eight parallel arrays from which triangles can +//! be reconstructed. +//! +//! Note: Tantivy also has delta encoding in `sstable/src/delta.rs`, but that's for string +//! dictionary compression (prefix sharing + vint deltas). This module uses bit-packing with zigzag +//! encoding, which is optimal for our signed i32 spatial coordinates with small deltas. It uses +//! the same basic algorithm to compress u32 doc_ids. +use std::io::{self, Write}; + +fn zigzag_encode(x: i32) -> u32 { + ((x << 1) ^ (x >> 31)) as u32 +} + +fn zigzag_decode(x: u32) -> i32 { + ((x >> 1) ^ (0u32.wrapping_sub(x & 1))) as i32 +} + +/// Trait for reading values by index during compression. +/// +/// The `Compressible` trait allows `compress()` to work with two different data sources, +/// `Vec` when indexing and memory mapped `Triangle` when merging. The compress function +/// reads values on-demand via `get()`, computing deltas and bit-packing without intermediate +/// allocations. +pub trait Compressible { + /// The type of the values being compressed. + type Value: Copy; + /// Returns the number of values in this source. + fn len(&self) -> usize; + /// Returns the value at the given index. + fn get(&self, i: usize) -> Self::Value; +} + +/// Operations for types that can be delta-encoded and bit-packed into four-byte words. +pub trait DeltaEncoder: Copy { + /// Computes a zigzag-encoded delta between two values. + fn compute_delta(current: Self, previous: Self) -> u32; + /// Converts a value to little-endian bytes for storage. + fn to_le_bytes(value: Self) -> [u8; 4]; +} + +impl DeltaEncoder for i32 { + fn compute_delta(current: Self, previous: Self) -> u32 { + zigzag_encode(current.wrapping_sub(previous)) + } + fn to_le_bytes(value: Self) -> [u8; 4] { + value.to_le_bytes() + } +} + +// Delta encoding for u32 values using wrapping arithmetic and zigzag encoding. +// +// This handles arbitrary u32 document IDs that may be non-sequential or widely spaced. The +// strategy uses wrapping subtraction followed by zigzag encoding: +// +// 1. wrapping_sub computes the difference modulo 2^32, producing a u32 result +// 2. Cast to i32 reinterprets the bit pattern as signed (two's complement) +// 3. zigzag_encode maps signed values to unsigned for efficient bit-packing: +// - Positive deltas (0, 1, 2...) encode to even numbers (0, 2, 4...) +// - Negative deltas (-1, -2, -3...) encode to odd numbers (1, 3, 5...) +// +// Example with large jump (doc_id 0 → 4,000,000,000): +// delta = 4_000_000_000u32.wrapping_sub(0) = 4_000_000_000u32 +// as i32 = -294,967,296 (bit pattern preserved via two's complement) +// zigzag_encode(-294,967,296) = some u32 value +// +// During decompression, zigzag_decode returns the signed i32 delta, which is cast back to u32 and +// added with wrapping_add. The bit pattern round-trips correctly because wrapping_add and +// wrapping_sub are mathematical inverses modulo 2^32, making this encoding symmetric for the full +// u32 range. +impl DeltaEncoder for u32 { + fn compute_delta(current: Self, previous: Self) -> u32 { + zigzag_encode(current.wrapping_sub(previous) as i32) + } + fn to_le_bytes(value: Self) -> [u8; 4] { + value.to_le_bytes() + } +} + +/// Compresses values from a `Compressible` source using delta encoding and bit-packing. 
+/// +/// Computes signed deltas between consecutive values, zigzag encodes them, and determines the +/// minimum bit width needed to represent all deltas. Writes a header (1 byte for bit width + +/// 4 bytes for first value in little-endian), then bit-packs the remaining deltas. +pub fn compress(compressible: &T, write: &mut W) -> io::Result<()> +where + T: Compressible, + T::Value: DeltaEncoder, + W: Write, +{ + let mut max_delta = 0u32; + for i in 1..compressible.len() { + let delta = T::Value::compute_delta(compressible.get(i), compressible.get(i - 1)); + max_delta = max_delta.max(delta); + } + let bits = if max_delta == 0 { + 0u32 + } else { + 32 - max_delta.leading_zeros() as u32 + }; + let mask = if bits == 32 { + u32::MAX + } else { + (1u32 << bits) - 1 + }; + write.write_all(&[bits as u8])?; + write.write_all(&T::Value::to_le_bytes(compressible.get(0)))?; + let mut buffer = 0u64; + let mut buffer_bits = 0u32; + for i in 1..compressible.len() { + let delta = T::Value::compute_delta(compressible.get(i), compressible.get(i - 1)); + let value = delta & mask; + buffer = (buffer << bits) | (value as u64); + buffer_bits += bits; + while buffer_bits >= 8 { + buffer_bits -= 8; + write.write_all(&[(buffer >> buffer_bits) as u8])?; + } + } + if buffer_bits > 0 { + write.write_all(&[(buffer << (8 - buffer_bits)) as u8])?; + } + Ok(()) +} + +/// Operations needed to decompress delta-encoded values back to their original form. +pub trait DeltaDecoder: Copy + Sized { + /// Converts from little-endian bytes to a value. + fn from_le_bytes(bytes: [u8; 4]) -> Self; + /// Applies a zigzag-decoded delta to reconstruct the next value. + fn apply_delta(value: Self, delta: u32) -> Self; +} + +impl DeltaDecoder for i32 { + fn from_le_bytes(bytes: [u8; 4]) -> Self { + i32::from_le_bytes(bytes) + } + fn apply_delta(value: Self, delta: u32) -> Self { + value.wrapping_add(zigzag_decode(delta)) + } +} + +impl DeltaDecoder for u32 { + fn from_le_bytes(bytes: [u8; 4]) -> Self { + u32::from_le_bytes(bytes) + } + fn apply_delta(value: Self, delta: u32) -> Self { + value.wrapping_add(zigzag_decode(delta) as u32) + } +} + +/// Decompresses bit-packed delta-encoded values from a byte slice. +/// +/// Reads the header to get bit width and first value, then unpacks the bit-packed deltas, applies +/// zigzag decoding, and reconstructs the original values by accumulating deltas. +/// +/// Returns the count of bytes read from `data`. 
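+///
+/// A minimal round-trip sketch (illustrative; `CompressibleI32Vec` is the small test helper
+/// defined at the bottom of this module, standing in for any `Compressible` source of `i32`s):
+///
+/// ```ignore
+/// let values = vec![100_000, 99_975, 100_050, 99_980];
+/// let mut buffer = Vec::new();
+/// compress(&CompressibleI32Vec::new(values.clone()), &mut buffer)?;
+/// let mut decoded = Vec::new();
+/// let consumed = decompress::<i32, _>(&buffer, values.len(), |_, v| decoded.push(v))?;
+/// assert_eq!(decoded, values);
+/// // `consumed` reports how many bytes were read, which matters when several
+/// // delta-encoded series are laid out back to back in the same buffer.
+/// ```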
+pub fn decompress( + data: &[u8], + count: usize, + mut process: F, +) -> io::Result +where + F: FnMut(usize, T), +{ + if data.len() < 5 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "truncated header", + )); + } + let bits = data[0] as u32; + let first = T::from_le_bytes([data[1], data[2], data[3], data[4]]); + process(0, first); + let mut offset = 5; + if bits == 0 { + // All deltas are zero - all values same as first + for i in 1..count { + process(i, first); + } + return Ok(offset); + } + let mut buffer = 0u64; + let mut buffer_bits = 0u32; + let mut prev = first; + for i in 1..count { + // Refill buffer with bytes + while buffer_bits < bits { + if offset >= data.len() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!("expected {} values but only decoded {}", count, i - 1), + )); + } + buffer = (buffer << 8) | (data[offset] as u64); + offset += 1; + buffer_bits += 8; + } + if buffer_bits >= bits { + // Extract packed value + buffer_bits -= bits; + let encoded = ((buffer >> buffer_bits) & ((1u64 << bits) - 1)) as u32; + let value = T::apply_delta(prev, encoded); + process(i, value); + prev = value; + } else { + break; + } + } + Ok(offset) +} + +#[cfg(test)] +mod test { + use super::*; + + pub struct CompressibleI32Vec { + vec: Vec, + } + + impl CompressibleI32Vec { + fn new(vec: Vec) -> Self { + CompressibleI32Vec { vec } + } + } + + impl Compressible for CompressibleI32Vec { + type Value = i32; + fn len(&self) -> usize { + return self.vec.len(); + } + fn get(&self, i: usize) -> i32 { + return self.vec[i]; + } + } + + #[test] + fn test_spatial_delta_compress_decompress() { + let values = vec![ + 100000, 99975, 100050, 99980, 100100, 100025, 99950, 100150, 100075, 99925, 100200, + 100100, + ]; + let compressible = CompressibleI32Vec::new(values.clone()); + let mut buffer = Vec::new(); + compress(&compressible, &mut buffer).unwrap(); + let mut vec = Vec::new(); + decompress::(&buffer, values.len(), |_, value| vec.push(value)).unwrap(); + assert_eq!(vec, values); + } + + #[test] + fn test_spatial_delta_bad_header() { + let mut vec = Vec::new(); + let result = decompress::(&[1, 2], 1, |_, value| vec.push(value)); + assert!(result.is_err()); + } + + #[test] + fn test_spatial_delta_insufficient_data() { + let mut vec = Vec::new(); + let result = decompress::(&[5, 0, 0, 0, 1], 12, |_, value| vec.push(value)); + assert!(result.is_err()); + } + + #[test] + fn test_spatial_delta_single_item() { + let mut vec = Vec::new(); + decompress::(&[5, 1, 0, 0, 0], 1, |_, value| vec.push(value)).unwrap(); + assert_eq!(vec[0], 1); + } + + #[test] + fn test_spatial_delta_zero_length_delta() { + let values = vec![1, 1, 1]; + let compressible = CompressibleI32Vec::new(values.clone()); + let mut buffer = Vec::new(); + compress(&compressible, &mut buffer).unwrap(); + let mut vec = Vec::new(); + decompress::(&buffer, values.len(), |_, value| vec.push(value)).unwrap(); + assert_eq!(vec, values); + } +} diff --git a/src/spatial/geometry.rs b/src/spatial/geometry.rs new file mode 100644 index 0000000000..091f9688c9 --- /dev/null +++ b/src/spatial/geometry.rs @@ -0,0 +1,490 @@ +//! 
HUSH + +use std::io::{self, Read, Write}; + +use common::{BinarySerializable, VInt}; +use serde_json::{json, Map, Value}; + +use crate::spatial::point::GeoPoint; +use crate::spatial::xor::{compress_f64, decompress_f64}; + +/// HUSH +#[derive(Debug)] +pub enum GeometryError { + /// HUSH + MissingType, + /// HUSH + MissingField(String), // "expected array", "wrong nesting depth", etc + /// HUSH + UnsupportedType(String), + /// HUSH + InvalidCoordinate(String), // Can report the actual bad value + /// HUSH + InvalidStructure(String), // "expected array", "wrong nesting depth", etc +} + +/// HUSH +#[derive(Debug, Clone, PartialEq)] +pub enum Geometry { + /// HUSH + Point(GeoPoint), + /// HUSH + MultiPoint(Vec), + /// HUSH + LineString(Vec), + /// HUSH + MultiLineString(Vec>), + /// HUSH + Polygon(Vec>), + /// HUSH + MultiPolygon(Vec>>), + /// HUSH + GeometryCollection(Vec), +} + +impl Geometry { + /// HUSH + pub fn from_geojson(object: &Map) -> Result { + let geometry_type = object + .get("type") + .and_then(|v| v.as_str()) + .ok_or(GeometryError::MissingType)?; + match geometry_type { + "Point" => { + let coordinates = get_coordinates(object)?; + let point = to_point(coordinates)?; + Ok(Geometry::Point(point)) + } + "MultiPoint" => { + let coordinates = get_coordinates(object)?; + let multi_point = to_line_string(coordinates)?; + Ok(Geometry::MultiPoint(multi_point)) + } + "LineString" => { + let coordinates = get_coordinates(object)?; + let line_string = to_line_string(coordinates)?; + if line_string.len() < 2 { + return Err(GeometryError::InvalidStructure( + "a line string contains at least 2 points".to_string(), + )); + } + Ok(Geometry::LineString(line_string)) + } + "MultiLineString" => { + let coordinates = get_coordinates(object)?; + let multi_line_string = to_multi_line_string(coordinates)?; + for line_string in &multi_line_string { + if line_string.len() < 2 { + return Err(GeometryError::InvalidStructure( + "a line string contains at least 2 points".to_string(), + )); + } + } + Ok(Geometry::MultiLineString(multi_line_string)) + } + "Polygon" => { + let coordinates = get_coordinates(object)?; + let polygon = to_multi_line_string(coordinates)?; + for ring in &polygon { + if ring.len() < 3 { + return Err(GeometryError::InvalidStructure( + "a polygon ring contains at least 3 points".to_string(), + )); + } + } + Ok(Geometry::Polygon(polygon)) + } + "MultiPolygon" => { + let mut result = Vec::new(); + let multi_polygons = get_coordinates(object)?; + let multi_polygons = + multi_polygons + .as_array() + .ok_or(GeometryError::InvalidStructure( + "expected an array of polygons".to_string(), + ))?; + for polygon in multi_polygons { + let polygon = to_multi_line_string(polygon)?; + for ring in &polygon { + if ring.len() < 3 { + return Err(GeometryError::InvalidStructure( + "a polygon ring contains at least 3 points".to_string(), + )); + } + } + result.push(polygon); + } + Ok(Geometry::MultiPolygon(result)) + } + "GeometriesCollection" => { + let geometries = object + .get("geometries") + .ok_or(GeometryError::MissingField("geometries".to_string()))?; + let geometries = geometries + .as_array() + .ok_or(GeometryError::InvalidStructure( + "geometries is not an array".to_string(), + ))?; + let mut result = Vec::new(); + for geometry in geometries { + let object = geometry.as_object().ok_or(GeometryError::InvalidStructure( + "geometry is not an object".to_string(), + ))?; + result.push(Geometry::from_geojson(object)?); + } + Ok(Geometry::GeometryCollection(result)) + } + _ => 
Err(GeometryError::UnsupportedType(geometry_type.to_string())), + } + } + + /// Serialize the geometry to GeoJSON format. + /// https://fr.wikipedia.org/wiki/GeoJSON + pub fn to_geojson(&self) -> Map { + let mut map = Map::new(); + match self { + Geometry::Point(point) => { + map.insert("type".to_string(), Value::String("Point".to_string())); + let coords = json!([point.lon, point.lat]); + map.insert("coordinates".to_string(), coords); + } + Geometry::MultiPoint(points) => { + map.insert("type".to_string(), Value::String("MultiPoint".to_string())); + let coords: Vec = points.iter().map(|p| json!([p.lon, p.lat])).collect(); + map.insert("coordinates".to_string(), Value::Array(coords)); + } + Geometry::LineString(line) => { + map.insert("type".to_string(), Value::String("LineString".to_string())); + let coords: Vec = line.iter().map(|p| json!([p.lon, p.lat])).collect(); + map.insert("coordinates".to_string(), Value::Array(coords)); + } + Geometry::MultiLineString(lines) => { + map.insert( + "type".to_string(), + Value::String("MultiLineString".to_string()), + ); + let coords: Vec = lines + .iter() + .map(|line| Value::Array(line.iter().map(|p| json!([p.lon, p.lat])).collect())) + .collect(); + map.insert("coordinates".to_string(), Value::Array(coords)); + } + Geometry::Polygon(rings) => { + map.insert("type".to_string(), Value::String("Polygon".to_string())); + let coords: Vec = rings + .iter() + .map(|ring| Value::Array(ring.iter().map(|p| json!([p.lon, p.lat])).collect())) + .collect(); + map.insert("coordinates".to_string(), Value::Array(coords)); + } + Geometry::MultiPolygon(polygons) => { + map.insert( + "type".to_string(), + Value::String("MultiPolygon".to_string()), + ); + let coords: Vec = polygons + .iter() + .map(|polygon| { + Value::Array( + polygon + .iter() + .map(|ring| { + Value::Array( + ring.iter().map(|p| json!([p.lon, p.lat])).collect(), + ) + }) + .collect(), + ) + }) + .collect(); + map.insert("coordinates".to_string(), Value::Array(coords)); + } + Geometry::GeometryCollection(geometries) => { + map.insert( + "type".to_string(), + Value::String("GeometryCollection".to_string()), + ); + let geoms: Vec = geometries + .iter() + .map(|g| Value::Object(g.to_geojson())) + .collect(); + map.insert("geometries".to_string(), Value::Array(geoms)); + } + } + map + } +} + +fn get_coordinates(object: &Map) -> Result<&Value, GeometryError> { + let coordinates = object + .get("coordinates") + .ok_or(GeometryError::MissingField("coordinates".to_string()))?; + Ok(coordinates) +} + +fn to_point(value: &Value) -> Result { + let lonlat = value.as_array().ok_or(GeometryError::InvalidStructure( + "expected 2 element array pair of lon/lat".to_string(), + ))?; + if lonlat.len() != 2 { + return Err(GeometryError::InvalidStructure( + "expected 2 element array pair of lon/lat".to_string(), + )); + } + let lon = lonlat[0].as_f64().ok_or(GeometryError::InvalidCoordinate( + "longitude must be f64".to_string(), + ))?; + if !lon.is_finite() || !(-180.0..=180.0).contains(&lon) { + return Err(GeometryError::InvalidCoordinate(format!( + "invalid longitude: {}", + lon + ))); + } + let lat = lonlat[1].as_f64().ok_or(GeometryError::InvalidCoordinate( + "latitude must be f64".to_string(), + ))?; + if !lat.is_finite() || !(-90.0..=90.0).contains(&lat) { + return Err(GeometryError::InvalidCoordinate(format!( + "invalid latitude: {}", + lat + ))); + } + Ok(GeoPoint { lon, lat }) +} + +fn to_line_string(value: &Value) -> Result, GeometryError> { + let mut result = Vec::new(); + let coordinates = 
value.as_array().ok_or(GeometryError::InvalidStructure( + "expected an array of lon/lat arrays".to_string(), + ))?; + for coordinate in coordinates { + result.push(to_point(coordinate)?); + } + Ok(result) +} + +fn to_multi_line_string(value: &Value) -> Result>, GeometryError> { + let mut result = Vec::new(); + let coordinates = value.as_array().ok_or(GeometryError::InvalidStructure( + "expected an array of an array of lon/lat arrays".to_string(), + ))?; + for coordinate in coordinates { + result.push(to_line_string(coordinate)?); + } + Ok(result) +} + +impl BinarySerializable for Geometry { + fn serialize(&self, writer: &mut W) -> io::Result<()> { + match self { + Geometry::Point(point) => { + 0u8.serialize(writer)?; + point.lon.serialize(writer)?; + point.lat.serialize(writer)?; + Ok(()) + } + Geometry::MultiPoint(points) => { + 1u8.serialize(writer)?; + serialize_line_string(points, writer) + } + Geometry::LineString(line_string) => { + 2u8.serialize(writer)?; + serialize_line_string(line_string, writer) + } + Geometry::MultiLineString(multi_line_string) => { + 3u8.serialize(writer)?; + serialize_polygon(&multi_line_string[..], writer) + } + Geometry::Polygon(polygon) => { + 4u8.serialize(writer)?; + serialize_polygon(polygon, writer) + } + Geometry::MultiPolygon(multi_polygon) => { + 5u8.serialize(writer)?; + BinarySerializable::serialize(&VInt(multi_polygon.len() as u64), writer)?; + for polygon in multi_polygon { + BinarySerializable::serialize(&VInt(polygon.len() as u64), writer)?; + for ring in polygon { + BinarySerializable::serialize(&VInt(ring.len() as u64), writer)?; + } + } + let mut lon = Vec::new(); + let mut lat = Vec::new(); + for polygon in multi_polygon { + for ring in polygon { + for point in ring { + lon.push(point.lon); + lat.push(point.lat); + } + } + } + let lon = compress_f64(&lon); + let lat = compress_f64(&lat); + VInt(lon.len() as u64).serialize(writer)?; + writer.write_all(&lon)?; + VInt(lat.len() as u64).serialize(writer)?; + writer.write_all(&lat)?; + Ok(()) + } + Geometry::GeometryCollection(geometries) => { + 6u8.serialize(writer)?; + BinarySerializable::serialize(&VInt(geometries.len() as u64), writer)?; + for geometry in geometries { + geometry.serialize(writer)?; + } + Ok(()) + } + } + } + + fn deserialize(reader: &mut R) -> io::Result { + let discriminant: u8 = BinarySerializable::deserialize(reader)?; + match discriminant { + 0 => { + let lon = BinarySerializable::deserialize(reader)?; + let lat = BinarySerializable::deserialize(reader)?; + Ok(Geometry::Point(GeoPoint { lon, lat })) + } + 1 => Ok(Geometry::MultiPoint(deserialize_line_string(reader)?)), + 2 => Ok(Geometry::LineString(deserialize_line_string(reader)?)), + 3 => Ok(Geometry::MultiLineString(deserialize_polygon(reader)?)), + 4 => Ok(Geometry::Polygon(deserialize_polygon(reader)?)), + 5 => { + let polygon_count = VInt::deserialize(reader)?.0 as usize; + let mut polygons = Vec::new(); + let mut count = 0; + for _ in 0..polygon_count { + let ring_count = VInt::deserialize(reader)?.0 as usize; + let mut rings = Vec::new(); + for _ in 0..ring_count { + let point_count = VInt::deserialize(reader)?.0 as usize; + rings.push(point_count); + count += point_count; + } + polygons.push(rings); + } + let lon_bytes: Vec = BinarySerializable::deserialize(reader)?; + let lat_bytes: Vec = BinarySerializable::deserialize(reader)?; + let lon = decompress_f64(&lon_bytes, count); + let lat = decompress_f64(&lat_bytes, count); + let mut multi_polygon = Vec::new(); + let mut offset = 0; + for rings in polygons { + 
let mut polygon = Vec::new(); + for point_count in rings { + let mut ring = Vec::new(); + for _ in 0..point_count { + ring.push(GeoPoint { + lon: lon[offset], + lat: lat[offset], + }); + offset += 1; + } + polygon.push(ring); + } + multi_polygon.push(polygon); + } + Ok(Geometry::MultiPolygon(multi_polygon)) + } + 6 => { + let geometry_count = VInt::deserialize(reader)?.0 as usize; + let mut geometries = Vec::new(); + for _ in 0..geometry_count { + geometries.push(Geometry::deserialize(reader)?); + } + Ok(Geometry::GeometryCollection(geometries)) + } + _ => Err(io::Error::new( + io::ErrorKind::InvalidData, + "invalid geometry type", + )), + } + } +} + +fn serialize_line_string(line: &[GeoPoint], writer: &mut W) -> io::Result<()> { + BinarySerializable::serialize(&VInt(line.len() as u64), writer)?; + let mut lon = Vec::new(); + let mut lat = Vec::new(); + for point in line { + lon.push(point.lon); + lat.push(point.lat); + } + let lon = compress_f64(&lon); + let lat = compress_f64(&lat); + VInt(lon.len() as u64).serialize(writer)?; + writer.write_all(&lon)?; + VInt(lat.len() as u64).serialize(writer)?; + writer.write_all(&lat)?; + Ok(()) +} + +fn serialize_polygon( + line_string: &[Vec], + writer: &mut W, +) -> io::Result<()> { + BinarySerializable::serialize(&VInt(line_string.len() as u64), writer)?; + for ring in line_string { + BinarySerializable::serialize(&VInt(ring.len() as u64), writer)?; + } + let mut lon: Vec = Vec::new(); + let mut lat: Vec = Vec::new(); + for ring in line_string { + for point in ring { + lon.push(point.lon); + lat.push(point.lat); + } + } + let lon: Vec = compress_f64(&lon); + let lat: Vec = compress_f64(&lat); + VInt(lon.len() as u64).serialize(writer)?; + writer.write_all(&lon)?; + VInt(lat.len() as u64).serialize(writer)?; + writer.write_all(&lat)?; + Ok(()) +} + +fn deserialize_line_string(reader: &mut R) -> io::Result> { + let point_count = VInt::deserialize(reader)?.0 as usize; + let lon_bytes: Vec = BinarySerializable::deserialize(reader)?; + let lat_bytes: Vec = BinarySerializable::deserialize(reader)?; + let lon: Vec = decompress_f64(&lon_bytes, point_count); + let lat: Vec = decompress_f64(&lat_bytes, point_count); + let mut line_string: Vec = Vec::new(); + for offset in 0..point_count { + line_string.push(GeoPoint { + lon: lon[offset], + lat: lat[offset], + }); + } + Ok(line_string) +} + +fn deserialize_polygon(reader: &mut R) -> io::Result>> { + let ring_count = VInt::deserialize(reader)?.0 as usize; + let mut rings = Vec::new(); + let mut count = 0; + for _ in 0..ring_count { + let point_count = VInt::deserialize(reader)?.0 as usize; + rings.push(point_count); + count += point_count; + } + let lon_bytes: Vec = BinarySerializable::deserialize(reader)?; + let lat_bytes: Vec = BinarySerializable::deserialize(reader)?; + let lon: Vec = decompress_f64(&lon_bytes, count); + let lat: Vec = decompress_f64(&lat_bytes, count); + let mut polygon: Vec> = Vec::new(); + let mut offset = 0; + for point_count in rings { + let mut ring = Vec::new(); + for _ in 0..point_count { + ring.push(GeoPoint { + lon: lon[offset], + lat: lat[offset], + }); + offset += 1; + } + polygon.push(ring); + } + Ok(polygon) +} diff --git a/src/spatial/mod.rs b/src/spatial/mod.rs new file mode 100644 index 0000000000..4ac391b349 --- /dev/null +++ b/src/spatial/mod.rs @@ -0,0 +1,12 @@ +//! 
Spatial module (implements a block kd-tree) + +pub mod bkd; +pub mod delta; +pub mod geometry; +pub mod point; +pub mod radix_select; +pub mod reader; +pub mod serializer; +pub mod triangle; +pub mod writer; +pub mod xor; diff --git a/src/spatial/point.rs b/src/spatial/point.rs new file mode 100644 index 0000000000..93981b2ad2 --- /dev/null +++ b/src/spatial/point.rs @@ -0,0 +1,8 @@ +/// A point in the geographical coordinate system. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct GeoPoint { + /// Longitude + pub lon: f64, + /// Latitude + pub lat: f64, +} diff --git a/src/spatial/radix_select.rs b/src/spatial/radix_select.rs new file mode 100644 index 0000000000..2397c9eaa9 --- /dev/null +++ b/src/spatial/radix_select.rs @@ -0,0 +1,122 @@ +//! Radix selection for block kd-tree tree partitioning. +//! +//! Implements byte-wise histogram selection to find median values without comparisons, enabling +//! efficient partitioning of spatial data during block kd-tree construction. Processes values +//! through multiple passes, building histograms for each byte position after a common prefix, +//! avoiding the need to sort or compare elements directly. + +/// Performs radix selection to find the median value without comparisons by building byte-wise +/// histograms. +pub struct RadixSelect { + histogram: [usize; 256], + prefix: Vec, + offset: usize, + nth: usize, +} + +impl RadixSelect { + /// Creates a new radix selector for finding the nth element among values with a common prefix. + /// + /// The offset specifies how many matching elements appeared in previous buckets (from earlier + /// passes). The nth parameter is 0-indexed, so pass 31 to find the 32nd element (median of + /// 64). + pub fn new(prefix: Vec, offset: usize, nth: usize) -> Self { + RadixSelect { + histogram: [0; 256], + prefix, + offset, + nth, + } + } + /// Updates the histogram with a value if it matches the current prefix. + /// + /// Values that don't start with the prefix are ignored. For matching values, increments the + /// count for the byte at position `prefix.len()`. + pub fn update(&mut self, value: i32) { + let bytes = value.to_be_bytes(); + if !bytes.starts_with(&self.prefix) { + return; + } + let byte = bytes[self.prefix.len()]; + self.histogram[byte as usize] += 1; + } + /// Finds which bucket contains the nth element and returns the bucket value and offset. + /// + /// Returns a tuple of `(bucket_byte, count_before)` where bucket_byte is the value of the byte + /// that contains the nth element, and count_before is the number of elements in earlier + /// buckets (becomes the offset for the next pass). 
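+    ///
+    /// A minimal sketch of a single pass (illustrative values):
+    ///
+    /// ```ignore
+    /// let values = [0x1010_1010_i32, 0x1010_1022, 0x1010_1048, 0x2010_1030];
+    /// let mut select = RadixSelect::new(Vec::new(), 0, 1); // find the 2nd smallest
+    /// for &value in values.iter() {
+    ///     select.update(value);
+    /// }
+    /// let (byte, count) = select.nth();
+    /// assert_eq!(byte, 0x10); // leading byte of the bucket holding the 2nd smallest value
+    /// assert_eq!(count, 0);   // no elements fell into earlier buckets
+    /// // `byte` is appended to the prefix and `count` becomes the offset for the next pass.
+    /// ```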
+ pub fn nth(&self) -> (u8, usize) { + let mut count = self.offset; + for (bucket, &frequency) in self.histogram.iter().enumerate() { + if count + frequency > self.nth as usize { + return (bucket as u8, count); + } + count += frequency; + } + panic!("nth element {} not found in histogram", self.nth); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn radix_selection() { + let dimensions = [ + ( + vec![ + 0x10101010, 0x10101011, 0x10101012, 0x10101013, 0x10101014, 0x10101015, + 0x10101016, 0x10101017, 0x10101018, 0x10101019, 0x1010101A, 0x1010101B, + 0x1010101C, 0x1010101D, 0x1010101E, 0x1010101F, 0x10101020, 0x10101021, + 0x10101022, 0x10101023, 0x10101024, 0x10101025, 0x10101026, 0x10101027, + 0x10101028, 0x10101029, 0x1010102A, 0x1010102B, 0x1010102C, 0x1010102D, + 0x1010102E, 0x1010102F, 0x10101030, 0x10101031, 0x10101032, 0x10101033, + 0x10101034, 0x10101035, 0x10101036, 0x10101037, 0x10101038, 0x10101039, + 0x1010103A, 0x1010103B, 0x1010103C, 0x1010103D, 0x1010103E, 0x1010103F, + 0x10101040, 0x10101041, 0x10101042, 0x10101043, 0x10101044, 0x10101045, + 0x10101046, 0x10101047, 0x10101048, 0x10101049, 0x1010104A, 0x1010104B, + 0x1010104C, 0x1010104D, 0x1010104E, 0x1010104F, + ], + [(0x10, 0), (0x10, 0), (0x10, 0), (0x2F, 31)], + ), + ( + vec![ + 0x10101010, 0x10101011, 0x10101012, 0x10101013, 0x10101014, 0x10101015, + 0x10101016, 0x10101017, 0x10101018, 0x10101019, 0x1010101A, 0x1010101B, + 0x1010101C, 0x1010101D, 0x1010101E, 0x1010101F, 0x10101020, 0x10101021, + 0x10101022, 0x10101023, 0x10101024, 0x20101025, 0x20201026, 0x20301027, + 0x20401028, 0x20501029, 0x2060102A, 0x2070102B, 0x2080102C, 0x2090102D, + 0x20A0102E, 0x20B0102F, 0x20C01030, 0x20D01031, 0x20E01032, 0x20F01033, + 0x20F11034, 0x20F21035, 0x20F31036, 0x20F41037, 0x20F51038, 0x20F61039, + 0x3010103A, 0x3010103B, 0x3010103C, 0x3010103D, 0x3010103E, 0x3010103F, + 0x30101040, 0x30101041, 0x30101042, 0x30101043, 0x30101044, 0x30101045, + 0x30101046, 0x30101047, 0x30101048, 0x30101049, 0x3010104A, 0x3010104B, + 0x3010104C, 0x3010104D, 0x3010104E, 0x3010104F, + ], + [(0x20, 21), (0xB0, 31), (0x10, 31), (0x2F, 31)], + ), + ]; + for (numbers, expected) in dimensions { + let mut offset = 0; + let mut prefix = Vec::new(); + for i in 0..4 { + let mut radix_select = RadixSelect::new(prefix.clone(), offset, 31); + for &number in &numbers { + radix_select.update(number); + } + let (byte, count) = radix_select.nth(); + if i != 3 { + assert_eq!(expected[i].0, byte); + assert_eq!(expected[i].1, count); + } + prefix.push(byte); + offset = count; + } + let mut sorted = numbers.clone(); + sorted.sort(); + let radix_result = i32::from_be_bytes(prefix.as_slice().try_into().unwrap()); + assert_eq!(radix_result, sorted[31]); + } + } +} diff --git a/src/spatial/reader.rs b/src/spatial/reader.rs new file mode 100644 index 0000000000..7bf10e3295 --- /dev/null +++ b/src/spatial/reader.rs @@ -0,0 +1,70 @@ +//! HUSH +use std::io; +use std::sync::Arc; + +use common::file_slice::FileSlice; +use common::OwnedBytes; + +use crate::directory::CompositeFile; +use crate::schema::Field; +use crate::space_usage::PerFieldSpaceUsage; + +#[derive(Clone)] +pub struct SpatialReaders { + data: Arc, +} + +impl SpatialReaders { + pub fn empty() -> SpatialReaders { + SpatialReaders { + data: Arc::new(CompositeFile::empty()), + } + } + + /// Creates a field norm reader. 
+ pub fn open(file: FileSlice) -> crate::Result { + let data = CompositeFile::open(&file)?; + Ok(SpatialReaders { + data: Arc::new(data), + }) + } + + /// Returns the FieldNormReader for a specific field. + pub fn get_field(&self, field: Field) -> crate::Result> { + if let Some(file) = self.data.open_read(field) { + let spatial_reader = SpatialReader::open(file)?; + Ok(Some(spatial_reader)) + } else { + Ok(None) + } + } + + /// Return a break down of the space usage per field. + pub fn space_usage(&self) -> PerFieldSpaceUsage { + self.data.space_usage() + } + + /// Returns a handle to inner file + pub fn get_inner_file(&self) -> Arc { + self.data.clone() + } +} + +/// HUSH +#[derive(Clone)] +pub struct SpatialReader { + data: OwnedBytes, +} + +impl SpatialReader { + /// Opens the spatial reader from a `FileSlice`. Returns `None` if the file is empty (no + /// spatial fields indexed.) + pub fn open(spatial_file: FileSlice) -> io::Result { + let data = spatial_file.read_bytes()?; + Ok(SpatialReader { data }) + } + /// HUSH + pub fn get_bytes(&self) -> &[u8] { + self.data.as_ref() + } +} diff --git a/src/spatial/serializer.rs b/src/spatial/serializer.rs new file mode 100644 index 0000000000..8336d45c62 --- /dev/null +++ b/src/spatial/serializer.rs @@ -0,0 +1,37 @@ +//! HUSH +use std::io; +use std::io::Write; + +use crate::directory::{CompositeWrite, WritePtr}; +use crate::schema::Field; +use crate::spatial::bkd::write_block_kd_tree; +use crate::spatial::triangle::Triangle; + +/// The fieldnorms serializer is in charge of +/// the serialization of field norms for all fields. +pub struct SpatialSerializer { + composite_write: CompositeWrite, +} + +impl SpatialSerializer { + /// Create a composite file from the write pointer. + pub fn from_write(write: WritePtr) -> io::Result { + // just making room for the pointer to header. + let composite_write = CompositeWrite::wrap(write); + Ok(SpatialSerializer { composite_write }) + } + + /// Serialize the given field + pub fn serialize_field(&mut self, field: Field, triangles: &mut [Triangle]) -> io::Result<()> { + let write = self.composite_write.for_field(field); + write_block_kd_tree(triangles, write)?; + write.flush()?; + Ok(()) + } + + /// Clean up, flush, and close. + pub fn close(self) -> io::Result<()> { + self.composite_write.close()?; + Ok(()) + } +} diff --git a/src/spatial/triangle.rs b/src/spatial/triangle.rs new file mode 100644 index 0000000000..5136a683b4 --- /dev/null +++ b/src/spatial/triangle.rs @@ -0,0 +1,515 @@ +//! A triangle encoding with bounding box in the first four words for efficient spatial pruning. +//! +//! Encodes triangles with the bounding box in the first four words, enabling efficient spatial +//! pruning during tree traversal without reconstructing the full triangle. The remaining words +//! contain an additional vertex and packed reconstruction metadata, allowing exact triangle +//! recovery when needed. + +use i_triangle::advanced::delaunay::IntDelaunay; +use i_triangle::i_overlay::i_float::int::point::IntPoint; + +use crate::DocId; + +const MINY_MINX_MAXY_MAXX_Y_X: i32 = 0; +const MINY_MINX_Y_X_MAXY_MAXX: i32 = 1; +const MAXY_MINX_Y_X_MINY_MAXX: i32 = 2; +const MAXY_MINX_MINY_MAXX_Y_X: i32 = 3; +const Y_MINX_MINY_X_MAXY_MAXX: i32 = 4; +const Y_MINX_MINY_MAXX_MAXY_X: i32 = 5; +const MAXY_MINX_MINY_X_Y_MAXX: i32 = 6; +const MINY_MINX_Y_MAXX_MAXY_X: i32 = 7; + +/// Converts geographic coordinates (WGS84 lat/lon) to integer spatial coordinates. 
+/// +/// Maps the full globe to the i32 range using linear scaling: +/// - Latitude: -90° to +90° → -2³¹ to +2³¹-1 +/// - Longitude: -180° to +180° → -2³¹ to +2³¹-1 +/// +/// Provides approximately 214 units/meter for latitude and 107 units/meter for longitude, giving +/// millimeter-level precision. Uses `floor()` to ensure consistent quantization. +/// +/// Returns `(y, x)` where y=latitude coordinate, x=longitude coordinate. +pub fn latlon_to_point(lat: f64, lon: f64) -> (i32, i32) { + let y = (lat / (180.0 / (1i64 << 32) as f64)).floor() as i32; + let x = (lon / (360.0 / (1i64 << 32) as f64)).floor() as i32; + (y, x) +} + +/// Creates a bounding box from two lat/lon corner coordinates. +/// +/// Takes two arbitrary corner points and produces a normalized bounding box in the internal +/// coordinate system. Automatically computes min/max for each dimension. +/// +/// Returns `[min_y, min_x, max_y, max_x]` matching the internal storage format used throughout the +/// block kd-tree and triangle encoding. +pub fn latlon_to_bbox(lat1: f64, lon1: f64, lat2: f64, lon2: f64) -> [i32; 4] { + let (y1, x1) = latlon_to_point(lat1, lon1); + let (y2, x2) = latlon_to_point(lat2, lon2); + [y1.min(y2), x1.min(x2), y1.max(y2), x1.max(x2)] +} + +/// A triangle encoded with bounding box in the first four words for efficient spatial pruning. +/// +/// Encodes the bounding box, one vertex, boundary edge flags, and a reconstruction code that +/// together allow exact triangle recovery while optimizing for spatial query performance. Finally, +/// it contains the document id. +#[repr(C)] +#[derive(Debug)] +pub struct Triangle { + /// The bounding box, one vertex, followed by a packed integer containing boundary edge flags + /// and a reconstruction code. + pub words: [i32; 7], + /// The id of the document associated with this triangle. + pub doc_id: DocId, +} + +impl Triangle { + /// Encodes a triangle with the bounding box in the first four words for efficient spatial + /// pruning. + /// + /// Takes three vertices as `[y0, x0, y1, x1, y2, x2]` and edge boundary flags `[ab, bc, ca]` + /// indicating which edges are polygon boundaries. Returns a triangle struct with the bounding + /// box in the first four words as `[min_y, min_x, max_y, max_x]`. When decoded, the vertex + /// order may differ from the original input to `new()` due to normalized rotation. + /// + /// The edge boundary flags are here to express whether an edge is part of the boundaries + /// in the tesselation of the larger polygon it belongs to. 
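+    ///
+    /// A minimal round-trip sketch, mirroring one of the cases in the tests below:
+    ///
+    /// ```ignore
+    /// // CCW triangle whose minimum-x vertex already comes first; only edge CA is a boundary.
+    /// let triangle = Triangle::new(1, [1, 1, 2, 4, 3, 2], [false, false, true]);
+    /// let (coords, boundaries) = triangle.decode();
+    /// assert_eq!(coords, [1, 1, 2, 4, 3, 2]);
+    /// assert_eq!(boundaries, [false, false, true]);
+    /// assert_eq!(triangle.bbox(), &[1, 1, 3, 4]); // [min_y, min_x, max_y, max_x]
+    /// ```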
+ pub fn new(doc_id: u32, triangle: [i32; 6], boundaries: [bool; 3]) -> Self { + let mut ay = triangle[0]; + let mut ax = triangle[1]; + let mut by = triangle[2]; + let mut bx = triangle[3]; + let mut cy = triangle[4]; + let mut cx = triangle[5]; + let mut ab = boundaries[0]; + let mut bc = boundaries[1]; + let mut ca = boundaries[2]; + // rotate edges and place minX at the beginning + if bx < ax || cx < ax { + let temp_x = ax; + let temp_y = ay; + let temp_boundary = ab; + if bx < cx { + ax = bx; + ay = by; + ab = bc; + bx = cx; + by = cy; + bc = ca; + cx = temp_x; + cy = temp_y; + ca = temp_boundary; + } else { + ax = cx; + ay = cy; + ab = ca; + cx = bx; + cy = by; + ca = bc; + bx = temp_x; + by = temp_y; + bc = temp_boundary; + } + } else if ax == bx && ax == cx { + // degenerated case, all points with same longitude + // we need to prevent that ax is in the middle (not part of the MBS) + if by < ay || cy < ay { + let temp_x = ax; + let temp_y = ay; + let temp_boundary = ab; + if by < cy { + ax = bx; + ay = by; + ab = bc; + bx = cx; + by = cy; + bc = ca; + cx = temp_x; + cy = temp_y; + ca = temp_boundary; + } else { + ax = cx; + ay = cy; + ab = ca; + cx = bx; + cy = by; + ca = bc; + bx = temp_x; + by = temp_y; + bc = temp_boundary; + } + } + } + // change orientation if clockwise (CW) + if !is_counter_clockwise( + IntPoint { y: ay, x: ax }, + IntPoint { y: by, x: bx }, + IntPoint { y: cy, x: cx }, + ) { + // To change the orientation, we simply swap B and C. + let temp_x = bx; + let temp_y = by; + let temp_boundary = ab; + // ax and ay do not change, ab becomes bc + ab = ca; + bx = cx; + by = cy; + // bc does not change, ca becomes ab + cx = temp_x; + cy = temp_y; + ca = temp_boundary; + } + let min_x = ax; + let min_y = ay.min(by).min(cy); + let max_x = ax.max(bx).max(cx); + let max_y = ay.max(by).max(cy); + let (y, x, code) = if min_y == ay { + if max_y == by && max_x == bx { + (cy, cx, MINY_MINX_MAXY_MAXX_Y_X) + } else if max_y == cy && max_x == cx { + (by, bx, MINY_MINX_Y_X_MAXY_MAXX) + } else { + (by, cx, MINY_MINX_Y_MAXX_MAXY_X) + } + } else if max_y == ay { + if min_y == by && max_x == bx { + (cy, cx, MAXY_MINX_MINY_MAXX_Y_X) + } else if min_y == cy && max_x == cx { + (by, bx, MAXY_MINX_Y_X_MINY_MAXX) + } else { + (cy, bx, MAXY_MINX_MINY_X_Y_MAXX) + } + } else if max_x == bx && min_y == by { + (ay, cx, Y_MINX_MINY_MAXX_MAXY_X) + } else if max_x == cx && max_y == cy { + (ay, bx, Y_MINX_MINY_X_MAXY_MAXX) + } else { + panic!("Could not encode the provided triangle"); + }; + let boundaries_bits = (ab as i32) | ((bc as i32) << 1) | ((ca as i32) << 2); + let packed = code | (boundaries_bits << 3); + Triangle { + words: [min_y, min_x, max_y, max_x, y, x, packed], + doc_id: doc_id, + } + } + + /// Builds a degenerated triangle degenerating for a single point. + /// All vertices are that point, and all vertices are boundaries. + pub fn from_point(doc_id: DocId, point_x: i32, point_y: i32) -> Triangle { + Triangle::new( + doc_id, + [point_y, point_x, point_y, point_x, point_y, point_x], + [true, true, true], + ) + } + + /// Builds a degenerated triangle for a segment. + /// Line segment AB is represented as the triangle ABA. + pub fn from_line_segment(doc_id: DocId, a_x: i32, a_y: i32, b_x: i32, b_y: i32) -> Triangle { + Triangle::new(doc_id, [a_y, a_x, b_y, b_x, a_y, a_x], [true, true, true]) + } + + /// Create a triangle with only the doc_id and the words initialized to zero. 
+ /// + /// The doc_id and words in the field are delta-compressed as a series with the doc_id + /// serialized first. When we reconstruct the triangle we can first reconstruct skeleton + /// triangles with the doc_id series, then populate the words directly from the decompression + /// as we decompress each series. + /// + /// An immutable constructor would require that we decompress first into parallel `Vec` + /// instances, then loop through the count of triangles building triangles using a constructor + /// that takes all eight field values at once. This saves a copy, the triangle is the + /// decompression destination. + pub fn skeleton(doc_id: u32) -> Self { + Triangle { + doc_id: doc_id, + words: [0i32; 7], + } + } + + /// Decodes the triangle back to vertex coordinates and boundary flags. + /// + /// Returns vertices as `[y0, x0, y1, x1, y2, x2]` in CCW order and boundary flags `[ab, bc, + /// ca]`. The vertex order may differ from the original input to `new()` due to normalized CCW + /// rotation. + pub fn decode(&self) -> ([i32; 6], [bool; 3]) { + let packed = self.words[6]; + let code = packed & 7; // Lower 3 bits + let boundaries = [ + (packed & (1 << 3)) != 0, // bit 3 = ab + (packed & (1 << 4)) != 0, // bit 4 = bc + (packed & (1 << 5)) != 0, // bit 5 = ca + ]; + let (ay, ax, by, bx, cy, cx) = match code { + MINY_MINX_MAXY_MAXX_Y_X => ( + self.words[0], + self.words[1], + self.words[2], + self.words[3], + self.words[4], + self.words[5], + ), + MINY_MINX_Y_X_MAXY_MAXX => ( + self.words[0], + self.words[1], + self.words[4], + self.words[5], + self.words[2], + self.words[3], + ), + MAXY_MINX_Y_X_MINY_MAXX => ( + self.words[2], + self.words[1], + self.words[4], + self.words[5], + self.words[0], + self.words[3], + ), + MAXY_MINX_MINY_MAXX_Y_X => ( + self.words[2], + self.words[1], + self.words[0], + self.words[3], + self.words[4], + self.words[5], + ), + Y_MINX_MINY_X_MAXY_MAXX => ( + self.words[4], + self.words[1], + self.words[0], + self.words[5], + self.words[2], + self.words[3], + ), + Y_MINX_MINY_MAXX_MAXY_X => ( + self.words[4], + self.words[1], + self.words[0], + self.words[3], + self.words[2], + self.words[5], + ), + MAXY_MINX_MINY_X_Y_MAXX => ( + self.words[2], + self.words[1], + self.words[0], + self.words[5], + self.words[4], + self.words[3], + ), + MINY_MINX_Y_MAXX_MAXY_X => ( + self.words[0], + self.words[1], + self.words[4], + self.words[3], + self.words[2], + self.words[5], + ), + _ => panic!("Could not decode the provided triangle"), + }; + ([ay, ax, by, bx, cy, cx], boundaries) + } + + /// Returns the bounding box coordinates of the encoded triangle. + /// + /// Provides access to the bounding box `[min_y, min_x, max_y, max_x]` stored in the first four + /// words of the structure. The bounding box is stored first for efficient spatial pruning, + /// determining whether it is necessary to decode the triangle for precise intersection or + /// containment tests. + pub fn bbox(&self) -> &[i32] { + &self.words[..4] + } +} + +/// Encodes the triangles of a Delaunay triangulation into block kd-tree triangles. +/// +/// Takes the output of a Delaunay triangulation from `i_triangle` and encodes each triangle into +/// the normalized triangle used by the block kd-tree. Each triangle includes its bounding box, +/// vertex coordinates, and boundary edge flags that distinguish original polygon edges from +/// internal tessellation edges. 
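+///
+/// A minimal sketch, mirroring the `triangulate_box` test below (with the same `i_triangle`
+/// imports used by that test):
+///
+/// ```ignore
+/// let polygon = vec![vec![
+///     IntPoint::new(0, 0),
+///     IntPoint::new(10, 0),
+///     IntPoint::new(10, 10),
+///     IntPoint::new(0, 10),
+/// ]];
+/// let delaunay = polygon.triangulate().into_delaunay();
+/// let mut triangles = Vec::new();
+/// delaunay_to_triangles(1, &delaunay, &mut triangles);
+/// assert_eq!(triangles.len(), 2); // the rectangle tessellates into two triangles
+/// ```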
+/// +/// The boundary edge information provided by the `i_triangle` Delaunay triangulation is essential +/// for CONTAINS and WITHIN queries to work correctly. +pub fn delaunay_to_triangles(doc_id: u32, delaunay: &IntDelaunay, triangles: &mut Vec<Triangle>) { + for triangle in delaunay.triangles.iter() { + let bounds = [ + triangle.neighbors[0] == usize::MAX, + triangle.neighbors[1] == usize::MAX, + triangle.neighbors[2] == usize::MAX, + ]; + let v0 = &delaunay.points[triangle.vertices[0].index]; + let v1 = &delaunay.points[triangle.vertices[1].index]; + let v2 = &delaunay.points[triangle.vertices[2].index]; + triangles.push(Triangle::new( + doc_id, + [v0.y, v0.x, v1.y, v1.x, v2.y, v2.x], + bounds, + )) + } +} + +/// Returns true if the path A -> B -> C is Counter-Clockwise (CCW) or collinear. +/// Returns false if it is Clockwise (CW). +#[inline(always)] +fn is_counter_clockwise(a: IntPoint, b: IntPoint, c: IntPoint) -> bool { + // We calculate the 2D cross product (determinant) of vectors AB and AC. + // Formula: (bx - ax)(cy - ay) - (by - ay)(cx - ax) + + // We cast to i64 to prevent overflow, as multiplying two i32s can exceed i32::MAX. + let val = (b.x as i64 - a.x as i64) * (c.y as i64 - a.y as i64) + - (b.y as i64 - a.y as i64) * (c.x as i64 - a.x as i64); + + // If the result is positive, the triangle is CCW. + // If negative, it is CW. + // If zero, the points are collinear (we return true in that case). + val >= 0 +} + +#[cfg(test)] +mod tests { + use i_triangle::i_overlay::i_float::int::point::IntPoint; + use i_triangle::int::triangulatable::IntTriangulatable; + + use super::*; + + #[test] + fn encode_triangle() { + let test_cases = [ + ([1, 1, 3, 2, 2, 4], [true, false, false]), + ([1, 1, 2, 4, 3, 2], [false, false, true]), + ([2, 4, 1, 1, 3, 2], [false, true, false]), + ([2, 4, 3, 2, 1, 1], [false, true, false]), + ([3, 2, 1, 1, 2, 4], [true, false, false]), + ([3, 2, 2, 4, 1, 1], [false, false, true]), + ]; + let ccw_coords = [1, 1, 2, 4, 3, 2]; + let ccw_bounds = [false, false, true]; + for (coords, bounds) in test_cases { + let triangle = Triangle::new(1, coords, bounds); + let (decoded_coords, decoded_bounds) = triangle.decode(); + assert_eq!(decoded_coords, ccw_coords); + assert_eq!(decoded_bounds, ccw_bounds); + } + } + + #[test] + fn test_cw_triangle_boundary_and_coord_flip() { + // 1. Define distinct coordinates for a Clockwise triangle + // Visual layout: + // A(50,40): Top Center-ish + // B(10,60): Bottom Right + // C(20,10): Bottom Left (Has the Minimum X=10) + // Path A->B->C is Clockwise. + let input_coords = [ + 50, 40, // A (y, x) + 10, 60, // B + 20, 10, // C + ]; + + // 2. Define Boundaries [ab, bc, ca] + // We set BC=true and CA=false. + // The bug (ab=bc) would erroneously put 'true' into the first slot. + // The fix (ab=ca) should put 'false' into the first slot. + let input_bounds = [false, true, false]; + + // 3. Encode + let triangle = Triangle::new(1, input_coords, input_bounds); + let (decoded_coords, decoded_bounds) = triangle.decode(); + + // 4. Expected Coordinates + // The internal logic detects CW, swaps B/C to make it CCW: + // A(50,40) -> C(20,10) -> B(10,60) + // Then it rotates to put Min-X first. + // Min X is 10 (Vertex C). + // Final Sequence: C -> B -> A + let expected_coords = [ + 20, 10, // C + 10, 60, // B + 50, 40, // A + ]; + + // 5.
Expected Boundaries + // After Flip (A->C->B): + // Edge AC (was CA) = false + // Edge CB (was BC) = true + // Edge BA (was AB) = false + // Unrotated: [false, true, false] + // After Rotation (shifting to start at C): + // Shift left by 1: [true, false, false] + let expected_bounds = [true, false, false]; + + assert_eq!( + decoded_coords, expected_coords, + "Coordinates did not decode as expected" + ); + assert_eq!( + decoded_bounds, expected_bounds, + "Boundary flags were incorrect (likely swap bug)" + ); + } + + #[test] + fn degenerate_triangle() { + let test_cases = [ + ( + [1, 1, 2, 1, 3, 1], + [true, false, false], + [1, 1, 2, 1, 3, 1], + [true, false, false], + ), + ( + [2, 1, 1, 1, 3, 1], + [true, false, false], + [1, 1, 3, 1, 2, 1], + [false, false, true], + ), + ( + [2, 1, 3, 1, 1, 1], + [false, false, true], + [1, 1, 2, 1, 3, 1], + [true, false, false], + ), + ]; + for (coords, bounds, ccw_coords, ccw_bounds) in test_cases { + let triangle = Triangle::new(1, coords, bounds); + let (decoded_coords, decoded_bounds) = triangle.decode(); + assert_eq!(decoded_coords, ccw_coords); + assert_eq!(decoded_bounds, ccw_bounds); + } + } + + #[test] + fn decode_triangle() { + // distinct values for each coordinate to catch transposition + let test_cases = [ + [11, 10, 60, 80, 41, 40], + [1, 0, 11, 20, 31, 30], + [30, 0, 11, 10, 1, 20], + [30, 0, 1, 20, 21, 11], + [20, 0, 1, 30, 41, 40], + [20, 0, 1, 30, 31, 10], + [30, 0, 1, 10, 11, 20], + [1, 0, 10, 20, 21, 11], + ]; + for coords in test_cases { + let triangle = Triangle::new(1, coords, [true, true, true]); + let (decoded_coords, _) = triangle.decode(); + assert_eq!(decoded_coords, coords); + } + } + + #[test] + fn triangulate_box() { + let i_polygon = vec![vec![ + IntPoint::new(0, 0), + IntPoint::new(10, 0), + IntPoint::new(10, 10), + IntPoint::new(0, 10), + ]]; + let mut triangles = Vec::new(); + let delaunay = i_polygon.triangulate().into_delaunay(); + delaunay_to_triangles(1, &delaunay, &mut triangles); + assert_eq!(triangles.len(), 2); + } +} diff --git a/src/spatial/writer.rs b/src/spatial/writer.rs new file mode 100644 index 0000000000..5c8ff87e23 --- /dev/null +++ b/src/spatial/writer.rs @@ -0,0 +1,125 @@ +//! 
Writer that converts the geometries of spatial fields into triangles and buffers them, per field, until serialization. +use std::collections::HashMap; +use std::io; + +use i_triangle::i_overlay::i_float::int::point::IntPoint; +use i_triangle::int::triangulatable::IntTriangulatable; + +use crate::schema::Field; +use crate::spatial::geometry::Geometry; +use crate::spatial::point::GeoPoint; +use crate::spatial::serializer::SpatialSerializer; +use crate::spatial::triangle::{delaunay_to_triangles, Triangle}; +use crate::DocId; + +/// Collects the triangles generated for each spatial field during indexing. +pub struct SpatialWriter { + /// Map from field to its triangles buffer + triangles_by_field: HashMap<Field, Vec<Triangle>>, +} + +impl SpatialWriter { + /// Converts a geometry into triangles and appends them to the buffer of the given field. + pub fn add_geometry(&mut self, doc_id: DocId, field: Field, geometry: Geometry) { + let triangles = self.triangles_by_field.entry(field).or_default(); + match geometry { + Geometry::Point(point) => { + append_point(triangles, doc_id, point); + } + Geometry::MultiPoint(multi_point) => { + for point in multi_point { + append_point(triangles, doc_id, point); + } + } + Geometry::LineString(line_string) => { + append_line_string(triangles, doc_id, line_string); + } + Geometry::MultiLineString(multi_line_string) => { + for line_string in multi_line_string { + append_line_string(triangles, doc_id, line_string); + } + } + Geometry::Polygon(polygon) => { + append_polygon(triangles, doc_id, &polygon); + } + Geometry::MultiPolygon(multi_polygon) => { + for polygon in multi_polygon { + append_polygon(triangles, doc_id, &polygon); + } + } + Geometry::GeometryCollection(geometries) => { + for geometry in geometries { + self.add_geometry(doc_id, field, geometry); + } + } + } + } + + /// Returns an estimate of the memory used by the buffered triangles. + pub fn mem_usage(&self) -> usize { + self.triangles_by_field + .values() + .map(|triangles| triangles.len() * std::mem::size_of::<Triangle>()) + .sum() + } + + /// Serializes the buffered triangles of every spatial field. + pub fn serialize(&mut self, mut serializer: SpatialSerializer) -> io::Result<()> { + for (field, triangles) in &mut self.triangles_by_field { + serializer.serialize_field(*field, triangles)?; + } + serializer.close()?; + Ok(()) + } +} + +impl Default for SpatialWriter { + /// Creates an empty `SpatialWriter`. + fn default() -> Self { + SpatialWriter { + triangles_by_field: HashMap::new(), + } + } +} + +/// Converts a `(longitude, latitude)` point to an integer point.
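+/// +/// A minimal illustration, not part of the original change: one integer unit corresponds to +/// 360 / 2^32 degrees of longitude and 180 / 2^32 degrees of latitude, so 90 degrees of longitude +/// and 45 degrees of latitude both map exactly to `1 << 30`. +/// +/// ```ignore +/// let (x, y) = as_point_i32(GeoPoint { lon: 90.0, lat: 45.0 }); +/// assert_eq!((x, y), (1 << 30, 1 << 30)); +/// ```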
+pub fn as_point_i32(point: GeoPoint) -> (i32, i32) { + ( + (point.lon / (360.0 / (1i64 << 32) as f64)).floor() as i32, + (point.lat / (180.0 / (1i64 << 32) as f64)).floor() as i32, + ) +} + +fn append_point(triangles: &mut Vec<Triangle>, doc_id: DocId, point: GeoPoint) { + let point = as_point_i32(point); + triangles.push(Triangle::from_point(doc_id, point.0, point.1)); +} + +fn append_line_string(triangles: &mut Vec<Triangle>, doc_id: DocId, line_string: Vec<GeoPoint>) { + let mut previous = as_point_i32(line_string[0]); + for point in line_string.into_iter().skip(1) { + let point = as_point_i32(point); + triangles.push(Triangle::from_line_segment( + doc_id, previous.0, previous.1, point.0, point.1, + )); + previous = point; + } +} + +fn append_ring(i_polygon: &mut Vec<Vec<IntPoint>>, ring: &[GeoPoint]) { + let mut i_ring = Vec::with_capacity(ring.len() + 1); + for &point in ring { + let point = as_point_i32(point); + i_ring.push(IntPoint::new(point.0, point.1)); + } + i_polygon.push(i_ring); +} + +fn append_polygon(triangles: &mut Vec<Triangle>, doc_id: DocId, polygon: &[Vec<GeoPoint>]) { + let mut i_polygon: Vec<Vec<IntPoint>> = Vec::new(); + for ring in polygon { + append_ring(&mut i_polygon, ring); + } + let delaunay = i_polygon.triangulate().into_delaunay(); + delaunay_to_triangles(doc_id, &delaunay, triangles); +} diff --git a/src/spatial/xor.rs b/src/spatial/xor.rs new file mode 100644 index 0000000000..c9e6fb2804 --- /dev/null +++ b/src/spatial/xor.rs @@ -0,0 +1,136 @@ +//! XOR delta compression for f64 polygon coordinates. +//! +//! Lossless compression for floating-point lat/lon coordinates using XOR delta encoding on IEEE +//! 754 bit patterns with variable-length integer encoding. Designed for per-polygon random access +//! in the document store, where each polygon compresses independently without requiring sequential +//! decompression. +//! +//! Spatially local coordinates share most high-order bits. A municipal boundary spanning 1 km +//! typically has consecutive vertices within 100-500 meters, meaning their f64 bit patterns share +//! 30-40 bits. XOR reveals these common bits as zeros, which varint encoding then compresses +//! efficiently. +//! +//! The format stores the first coordinate as raw 8 bytes, then XOR deltas between consecutive +//! coordinates encoded as variable-length integers. When compression produces larger output than +//! the raw input (random data, compression-hostile patterns), the function automatically falls +//! back to storing coordinates as uncompressed 8-byte values. +//! +//! Unlike `delta.rs`, which uses arithmetic deltas for i32 spatial coordinates in the block +//! kd-tree, this module operates on f64 bit patterns directly to preserve exact floating-point +//! values for returning to users. +use std::io::Read; + +use common::VInt; + +/// Compresses f64 coordinates using XOR delta encoding with automatic raw fallback. +/// +/// Stores the first coordinate as raw bits, then computes XOR between consecutive coordinate bit +/// patterns and encodes as variable-length integers. If the compressed output would be larger than +/// raw storage (8 bytes per coordinate), automatically falls back to raw encoding. +/// +/// Returns a byte vector that can be decompressed with `decompress_f64()` to recover exact +/// original values. +pub fn compress_f64(values: &[f64]) -> Vec<u8> { + if values.is_empty() { + return Vec::new(); + } + let mut output: Vec<u8> = Vec::new(); + let mut previous: u64 = f64_to_le(values[0]); + output.extend_from_slice(&previous.to_le_bytes()); + for &value in &values[1..]
{ + let bits = value.to_bits(); + let xor = bits ^ previous; + VInt(xor).serialize_into_vec(&mut output); + previous = bits; + } + if output.len() >= values.len() * 8 { + let mut output = Vec::with_capacity(values.len() * 8); + for &value in values { + output.extend_from_slice(&f64_to_le(value).to_le_bytes()); + } + return output; + } + output +} + +fn f64_to_le(value: f64) -> u64 { + u64::from_le_bytes(value.to_le_bytes()) +} + +fn f64_from_le(value: u64) -> f64 { + f64::from_le_bytes(value.to_le_bytes()) +} + +/// Decompresses f64 coordinates from XOR delta or raw encoding. +/// +/// Detects compression format by byte length - if `bytes.len() == count * 8`, data is raw and +/// copied directly. Otherwise, reads first coordinate from 8 bytes, then XOR deltas as varints, +/// reconstructing the original sequence. +/// +/// Returns exact f64 values that were passed to `compress_f64()`. +pub fn decompress_f64(mut bytes: &[u8], count: usize) -> Vec<f64> { + let mut values = Vec::with_capacity(count); + if bytes.len() == count * 8 { + for i in 0..count { + let bits = u64::from_le_bytes(bytes[i * 8..(i + 1) * 8].try_into().unwrap()); + values.push(f64_from_le(bits)); + } + return values; + } + let mut cursor: &mut &[u8] = &mut bytes; + + // Read first value (raw 8 bytes) + let mut first_bytes = [0u8; 8]; + cursor.read_exact(&mut first_bytes).unwrap(); + let mut previous = u64::from_le_bytes(first_bytes); + values.push(f64::from_bits(previous)); + + // Read remaining values as VInt XORs + while values.len() < count { + let xor = VInt::deserialize_u64(&mut cursor).unwrap(); + let bits = previous ^ xor; + values.push(f64::from_bits(bits)); + previous = bits; + } + + values +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress_spatial_locality() { + // Small town polygon - longitude only. + let longitudes = vec![ + 40.7580, 40.7581, 40.7582, 40.7583, 40.7584, 40.7585, 40.7586, 40.7587, + ]; + let bytes = compress_f64(&longitudes); + // Should compress well - XOR deltas will be small + assert_eq!(bytes.len(), 46); + // Should decompress to exact original values + let decompressed = decompress_f64(&bytes, longitudes.len()); + assert_eq!(longitudes, decompressed); + } + #[test] + fn test_fallback_to_raw() { + // Random, widely scattered values - poor compression + let values = vec![ + 12345.6789, + -98765.4321, + 0.00001, + 999999.999, + -0.0, + std::f64::consts::PI, + std::f64::consts::E, + 42.0, + ]; + let bytes = compress_f64(&values); + // Should fall back to raw storage + assert_eq!(bytes.len(), values.len() * 8); + // Should still decompress correctly + let decompressed = decompress_f64(&bytes, values.len()); + assert_eq!(values, decompressed); + } +}
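+ +#[cfg(test)] +mod format_sketch { + use super::*; + + // A minimal sketch, not part of the original change: it only illustrates the layout described + // in the module docs (first coordinate as 8 raw little-endian bytes, then one varint-encoded + // XOR delta per remaining coordinate), so runs of identical values compress far below raw + // storage. + #[test] + fn xor_deltas_of_identical_values_stay_small() { + let values = vec![1.5, 1.5, 1.5, 1.5]; + let bytes = compress_f64(&values); + // Well under the 32 bytes raw storage would need, so the raw fallback is not triggered. + assert!(bytes.len() < values.len() * 8); + assert_eq!(decompress_f64(&bytes, values.len()), values); + } +}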