From 43b5f347217c47316b66574420213fbeeac7fd54 Mon Sep 17 00:00:00 2001
From: Alan Gutierrez
Date: Tue, 23 Sep 2025 18:20:41 -0500
Subject: [PATCH 01/22] Implement SPATIAL flag.

Implement a SPATIAL flag for use in creating a spatial field.
---
 src/schema/flags.rs           | 10 +++++++++-
 src/schema/mod.rs             |  2 ++
 src/schema/spatial_options.rs | 51 +++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 src/schema/spatial_options.rs

diff --git a/src/schema/flags.rs b/src/schema/flags.rs
index 4f8caa612c..9b03b214d5 100644
--- a/src/schema/flags.rs
+++ b/src/schema/flags.rs
@@ -1,6 +1,6 @@
 use std::ops::BitOr;
 
-use crate::schema::{DateOptions, NumericOptions, TextOptions};
+use crate::schema::{DateOptions, NumericOptions, SpatialOptions, TextOptions};
 
 #[derive(Clone)]
 pub struct StoredFlag;
@@ -95,6 +95,14 @@ impl<T: Clone + Into<DateOptions>> BitOr<DateOptions> for SchemaFlagList<T, ()>
     }
 }
 
+impl<T: Clone + Into<SpatialOptions>> BitOr<SpatialOptions> for SchemaFlagList<T, ()> {
+    type Output = SpatialOptions;
+
+    fn bitor(self, rhs: SpatialOptions) -> Self::Output {
+        self.head.into() | rhs
+    }
+}
+
 #[derive(Clone)]
 pub struct SchemaFlagList<Head: Clone, Tail: Clone> {
     pub head: Head,
diff --git a/src/schema/mod.rs b/src/schema/mod.rs
index 1cd4b72436..627774e545 100644
--- a/src/schema/mod.rs
+++ b/src/schema/mod.rs
@@ -124,6 +124,7 @@ mod ip_options;
 mod json_object_options;
 mod named_field_document;
 mod numeric_options;
+mod spatial_options;
 mod text_options;
 
 use columnar::ColumnType;
@@ -144,6 +145,7 @@ pub use self::json_object_options::JsonObjectOptions;
 pub use self::named_field_document::NamedFieldDocument;
 pub use self::numeric_options::NumericOptions;
 pub use self::schema::{Schema, SchemaBuilder};
+pub use self::spatial_options::{SpatialOptions, SPATIAL};
 pub use self::term::{Term, ValueBytes};
 pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
diff --git a/src/schema/spatial_options.rs b/src/schema/spatial_options.rs
new file mode 100644
index 0000000000..13cce1a6f0
--- /dev/null
+++ b/src/schema/spatial_options.rs
@@ -0,0 +1,51 @@
+use std::ops::BitOr;
+
+use serde::{Deserialize, Serialize};
+
+use crate::schema::flags::StoredFlag;
+
+/// Define how a spatial field should be handled by tantivy.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, Default)]
+pub struct SpatialOptions {
+    #[serde(default)]
+    stored: bool,
+}
+
+/// The field will be indexed as a spatial field.
+pub const SPATIAL: SpatialOptions = SpatialOptions { stored: false };
+
+impl SpatialOptions {
+    /// Returns true if the geometry is to be stored.
+    #[inline]
+    pub fn is_stored(&self) -> bool {
+        self.stored
+    }
+}
+
+impl<T: Into<SpatialOptions>> BitOr<T> for SpatialOptions {
+    type Output = SpatialOptions;
+
+    fn bitor(self, other: T) -> SpatialOptions {
+        let other = other.into();
+        SpatialOptions {
+            stored: self.stored | other.stored,
+        }
+    }
+}
+
+impl From<StoredFlag> for SpatialOptions {
+    fn from(_: StoredFlag) -> SpatialOptions {
+        SpatialOptions { stored: true }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::schema::*;
+
+    #[test]
+    fn test_field_options() {
+        let field_options = STORED | SPATIAL;
+        assert!(field_options.is_stored());
+    }
+}

From 558c99fa2d5a534ca3808dd9189a396df3871f26 Mon Sep 17 00:00:00 2001
From: Alan Gutierrez
Date: Sun, 12 Oct 2025 22:06:27 -0500
Subject: [PATCH 02/22] Triangle encoding for spatial indexing.

Encodes triangles with the bounding box in the first four words, enabling
efficient spatial pruning during tree traversal without reconstructing the
full triangle.
The remaining words contain an additional vertex and packed reconstruction metadata, allowing exact triangle recovery when needed. --- Cargo.toml | 1 + src/lib.rs | 1 + src/spatial/mod.rs | 3 + src/spatial/triangle.rs | 376 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 381 insertions(+) create mode 100644 src/spatial/mod.rs create mode 100644 src/spatial/triangle.rs diff --git a/Cargo.toml b/Cargo.toml index 32d7bd990c..977677d31b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,6 +56,7 @@ itertools = "0.14.0" measure_time = "0.9.0" arc-swap = "1.5.0" bon = "3.3.1" +robust = "1.2" columnar = { version = "0.6", path = "./columnar", package = "tantivy-columnar" } sstable = { version = "0.6", path = "./sstable", package = "tantivy-sstable", optional = true } diff --git a/src/lib.rs b/src/lib.rs index 1027f4f46b..ba2d9aaab2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -191,6 +191,7 @@ pub mod fieldnorm; pub mod index; pub mod positions; pub mod postings; +pub mod spatial; /// Module containing the different query implementations. pub mod query; diff --git a/src/spatial/mod.rs b/src/spatial/mod.rs new file mode 100644 index 0000000000..8d400578ca --- /dev/null +++ b/src/spatial/mod.rs @@ -0,0 +1,3 @@ +//! Spatial module (implements a block kd-tree) + +pub mod triangle; diff --git a/src/spatial/triangle.rs b/src/spatial/triangle.rs new file mode 100644 index 0000000000..689cdf2a74 --- /dev/null +++ b/src/spatial/triangle.rs @@ -0,0 +1,376 @@ +//! A triangle encoding with bounding box in the first four words for efficient spatial pruning. +//! +//! Encodes triangles with the bounding box in the first four words, enabling efficient spatial +//! pruning during tree traversal without reconstructing the full triangle. The remaining words +//! contain an additional vertex and packed reconstruction metadata, allowing exact triangle +//! recovery when needed. + +use robust::{orient2d, Coord}; + +const MINY_MINX_MAXY_MAXX_Y_X: i32 = 0; +const MINY_MINX_Y_X_MAXY_MAXX: i32 = 1; +const MAXY_MINX_Y_X_MINY_MAXX: i32 = 2; +const MAXY_MINX_MINY_MAXX_Y_X: i32 = 3; +const Y_MINX_MINY_X_MAXY_MAXX: i32 = 4; +const Y_MINX_MINY_MAXX_MAXY_X: i32 = 5; +const MAXY_MINX_MINY_X_Y_MAXX: i32 = 6; +const MINY_MINX_Y_MAXX_MAXY_X: i32 = 7; + +/// Converts geographic coordinates (WGS84 lat/lon) to integer spatial coordinates. +/// +/// Maps the full globe to the i32 range using linear scaling: +/// - Latitude: -90° to +90° → -2³¹ to +2³¹-1 +/// - Longitude: -180° to +180° → -2³¹ to +2³¹-1 +/// +/// Provides approximately 214 units/meter for latitude and 107 units/meter for longitude, giving +/// millimeter-level precision. Uses `floor()` to ensure consistent quantization. +/// +/// Returns `(y, x)` where y=latitude coordinate, x=longitude coordinate. +pub fn latlon_to_point(lat: f64, lon: f64) -> (i32, i32) { + let y = (lat / (180.0 / (1i64 << 32) as f64)).floor() as i32; + let x = (lon / (360.0 / (1i64 << 32) as f64)).floor() as i32; + (y, x) +} + +/// Creates a bounding box from two lat/lon corner coordinates. +/// +/// Takes two arbitrary corner points and produces a normalized bounding box in the internal +/// coordinate system. Automatically computes min/max for each dimension. +/// +/// Returns `[min_y, min_x, max_y, max_x]` matching the internal storage format used throughout the +/// block kd-tree and triangle encoding. 
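+///
+/// A usage sketch; the corners are hypothetical and the assertion only shows the normalization
+/// invariant, since the exact integers depend on the floor-based quantization above:
+///
+/// ```
+/// use tantivy::spatial::triangle::latlon_to_bbox;
+///
+/// // Corners may be passed in any order; the box is normalized.
+/// let bbox = latlon_to_bbox(40.8, -73.9, 40.6, -74.1);
+/// assert!(bbox[0] <= bbox[2] && bbox[1] <= bbox[3]);
+/// ```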
+pub fn latlon_to_bbox(lat1: f64, lon1: f64, lat2: f64, lon2: f64) -> [i32; 4] { + let (y1, x1) = latlon_to_point(lat1, lon1); + let (y2, x2) = latlon_to_point(lat2, lon2); + [y1.min(y2), x1.min(x2), y1.max(y2), x1.max(x2)] +} + +/// A triangle encoded with bounding box in the first four words for efficient spatial pruning. +/// +/// Encodes the bounding box, one vertex, boundary edge flags, and a reconstruction code that +/// together allow exact triangle recovery while optimizing for spatial query performance. Finally, +/// it contains the document id. +#[repr(C)] +#[derive(Debug)] +pub struct Triangle { + /// The bounding box, one vertex, followed by a packed integer containing boundary edge flags + /// and a reconstruction code. + pub words: [i32; 7], + /// The id of the document associated with this triangle. + pub doc_id: u32, +} + +impl Triangle { + /// Encodes a triangle with the bounding box in the first four words for efficient spatial + /// pruning. + /// + /// Takes three vertices as `[y0, x0, y1, x1, y2, x2]` and edge boundary flags `[ab, bc, ca]` + /// indicating which edges are polygon boundaries. Returns a triangle struct with the bounding + /// box in the first four words as `[min_y, min_x, max_y, max_x]`. When decoded, the vertex + /// order may differ from the original input to `new()` due to normalized rotation. + pub fn new(doc_id: u32, triangle: [i32; 6], boundaries: [bool; 3]) -> Self { + let mut ay = triangle[0]; + let mut ax = triangle[1]; + let mut by = triangle[2]; + let mut bx = triangle[3]; + let mut cy = triangle[4]; + let mut cx = triangle[5]; + let mut ab = boundaries[0]; + let mut bc = boundaries[1]; + let mut ca = boundaries[2]; + // rotate edges and place minX at the beginning + if bx < ax || cx < ax { + let temp_x = ax; + let temp_y = ay; + let temp_boundary = ab; + if bx < cx { + ax = bx; + ay = by; + ab = bc; + bx = cx; + by = cy; + bc = ca; + cx = temp_x; + cy = temp_y; + ca = temp_boundary; + } else { + ax = cx; + ay = cy; + ab = ca; + cx = bx; + cy = by; + ca = bc; + bx = temp_x; + by = temp_y; + bc = temp_boundary; + } + } else if ax == bx && ax == cx { + // degenerated case, all points with same longitude + // we need to prevent that ax is in the middle (not part of the MBS) + if by < ay || cy < ay { + let temp_x = ax; + let temp_y = ay; + let temp_boundary = ab; + if by < cy { + ax = bx; + ay = by; + ab = bc; + bx = cx; + by = cy; + bc = ca; + cx = temp_x; + cy = temp_y; + ca = temp_boundary; + } else { + ax = cx; + ay = cy; + ab = ca; + cx = bx; + cy = by; + ca = bc; + bx = temp_x; + by = temp_y; + bc = temp_boundary; + } + } + } + // change orientation if CW + if orient2d( + Coord { y: ay, x: ax }, + Coord { y: by, x: bx }, + Coord { y: cy, x: cx }, + ) < 0.0 + { + let temp_x = bx; + let temp_y = by; + let temp_boundary = ab; + // ax and ay do not change, ab becomes bc + ab = bc; + bx = cx; + by = cy; + // bc does not change, ca becomes ab + cx = temp_x; + cy = temp_y; + ca = temp_boundary; + } + let min_x = ax; + let min_y = ay.min(by).min(cy); + let max_x = ax.max(bx).max(cx); + let max_y = ay.max(by).max(cy); + let (y, x, code) = if min_y == ay { + if max_y == by && max_x == bx { + (cy, cx, MINY_MINX_MAXY_MAXX_Y_X) + } else if max_y == cy && max_x == cx { + (by, bx, MINY_MINX_Y_X_MAXY_MAXX) + } else { + (by, cx, MINY_MINX_Y_MAXX_MAXY_X) + } + } else if max_y == ay { + if min_y == by && max_x == bx { + (cy, cx, MAXY_MINX_MINY_MAXX_Y_X) + } else if min_y == cy && max_x == cx { + (by, bx, MAXY_MINX_Y_X_MINY_MAXX) + } else { + (cy, bx, 
MAXY_MINX_MINY_X_Y_MAXX) + } + } else if max_x == bx && min_y == by { + (ay, cx, Y_MINX_MINY_MAXX_MAXY_X) + } else if max_x == cx && max_y == cy { + (ay, bx, Y_MINX_MINY_X_MAXY_MAXX) + } else { + panic!("Could not encode the provided triangle"); + }; + let boundaries_bits = (ab as i32) | ((bc as i32) << 1) | ((ca as i32) << 2); + let packed = code | (boundaries_bits << 3); + Triangle { + words: [min_y, min_x, max_y, max_x, y, x, packed], + doc_id: doc_id, + } + } + + /// Create a triangle with only the doc_id and the words initialized to zero. + /// + /// The doc_id and words in the field are delta-compressed as a series with the doc_id + /// serialized first. When we reconstruct the triangle we can first reconstruct skeleton + /// triangles with the doc_id series, then populate the words directly from the decompression + /// as we decompress each series. + /// + /// An immutable constructor would require that we decompress first into parallel `Vec` + /// instances, then loop through the count of triangles building triangles using a constructor + /// that takes all eight field values at once. This saves a copy, the triangle is the + /// decompression destination. + pub fn skeleton(doc_id: u32) -> Self { + Triangle { + doc_id: doc_id, + words: [0i32; 7], + } + } + + /// Decodes the triangle back to vertex coordinates and boundary flags. + /// + /// Returns vertices as `[y0, x0, y1, x1, y2, x2]` in CCW order and boundary flags `[ab, bc, + /// ca]`. The vertex order may differ from the original input to `new()` due to normalized CCW + /// rotation. + pub fn decode(&self) -> ([i32; 6], [bool; 3]) { + let packed = self.words[6]; + let code = packed & 7; // Lower 3 bits + let boundaries = [ + (packed & (1 << 3)) != 0, // bit 3 = ab + (packed & (1 << 4)) != 0, // bit 4 = bc + (packed & (1 << 5)) != 0, // bit 5 = ca + ]; + let (ay, ax, by, bx, cy, cx) = match code { + MINY_MINX_MAXY_MAXX_Y_X => ( + self.words[0], + self.words[1], + self.words[2], + self.words[3], + self.words[4], + self.words[5], + ), + MINY_MINX_Y_X_MAXY_MAXX => ( + self.words[0], + self.words[1], + self.words[4], + self.words[5], + self.words[2], + self.words[3], + ), + MAXY_MINX_Y_X_MINY_MAXX => ( + self.words[2], + self.words[1], + self.words[4], + self.words[5], + self.words[0], + self.words[3], + ), + MAXY_MINX_MINY_MAXX_Y_X => ( + self.words[2], + self.words[1], + self.words[0], + self.words[3], + self.words[4], + self.words[5], + ), + Y_MINX_MINY_X_MAXY_MAXX => ( + self.words[4], + self.words[1], + self.words[0], + self.words[5], + self.words[2], + self.words[3], + ), + Y_MINX_MINY_MAXX_MAXY_X => ( + self.words[4], + self.words[1], + self.words[0], + self.words[3], + self.words[2], + self.words[5], + ), + MAXY_MINX_MINY_X_Y_MAXX => ( + self.words[2], + self.words[1], + self.words[0], + self.words[5], + self.words[4], + self.words[3], + ), + MINY_MINX_Y_MAXX_MAXY_X => ( + self.words[0], + self.words[1], + self.words[4], + self.words[3], + self.words[2], + self.words[5], + ), + _ => panic!("Could not decode the provided triangle"), + }; + ([ay, ax, by, bx, cy, cx], boundaries) + } + + /// Returns the bounding box coordinates of the encoded triangle. + /// + /// Provides access to the bounding box `[min_y, min_x, max_y, max_x]` stored in the first four + /// words of the structure. The bounding box is stored first for efficient spatial pruning, + /// determining whether it is necessary to decode the triangle for precise intersection or + /// containment tests. 
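+    ///
+    /// A small sketch, using the same vertices as the encoding tests in this module:
+    ///
+    /// ```
+    /// use tantivy::spatial::triangle::Triangle;
+    ///
+    /// let triangle = Triangle::new(7, [1, 1, 2, 4, 3, 2], [false, false, true]);
+    /// assert_eq!(triangle.bbox(), [1, 1, 3, 4]); // [min_y, min_x, max_y, max_x]
+    /// ```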
+ pub fn bbox(&self) -> &[i32] { + &self.words[..4] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn encode_triangle() { + let test_cases = [ + ([1, 1, 3, 2, 2, 4], [true, false, false]), + ([1, 1, 2, 4, 3, 2], [false, false, true]), + ([2, 4, 1, 1, 3, 2], [false, true, false]), + ([2, 4, 3, 2, 1, 1], [false, true, false]), + ([3, 2, 1, 1, 2, 4], [true, false, false]), + ([3, 2, 2, 4, 1, 1], [false, false, true]), + ]; + let ccw_coords = [1, 1, 2, 4, 3, 2]; + let ccw_bounds = [false, false, true]; + for (coords, bounds) in test_cases { + let triangle = Triangle::new(1, coords, bounds); + let (decoded_coords, decoded_bounds) = triangle.decode(); + assert_eq!(decoded_coords, ccw_coords); + assert_eq!(decoded_bounds, ccw_bounds); + } + } + + #[test] + fn degenerate_triangle() { + let test_cases = [ + ( + [1, 1, 2, 1, 3, 1], + [true, false, false], + [1, 1, 2, 1, 3, 1], + [true, false, false], + ), + ( + [2, 1, 1, 1, 3, 1], + [true, false, false], + [1, 1, 3, 1, 2, 1], + [false, false, true], + ), + ( + [2, 1, 3, 1, 1, 1], + [false, false, true], + [1, 1, 2, 1, 3, 1], + [true, false, false], + ), + ]; + for (coords, bounds, ccw_coords, ccw_bounds) in test_cases { + let triangle = Triangle::new(1, coords, bounds); + let (decoded_coords, decoded_bounds) = triangle.decode(); + assert_eq!(decoded_coords, ccw_coords); + assert_eq!(decoded_bounds, ccw_bounds); + } + } + + #[test] + fn decode_triangle() { + // distinct values for each coordinate to catch transposition + let test_cases = [ + [11, 10, 60, 80, 41, 40], + [1, 0, 11, 20, 31, 30], + [30, 0, 11, 10, 1, 20], + [30, 0, 1, 20, 21, 11], + [20, 0, 1, 30, 41, 40], + [20, 0, 1, 30, 31, 10], + [30, 0, 1, 10, 11, 20], + [1, 0, 10, 20, 21, 11], + ]; + for coords in test_cases { + let triangle = Triangle::new(1, coords, [true, true, true]); + let (decoded_coords, _) = triangle.decode(); + assert_eq!(decoded_coords, coords); + } + } +} From b2a9bb279d294270761c142f24b8b08dcf353d4c Mon Sep 17 00:00:00 2001 From: Alan Gutierrez Date: Sun, 12 Oct 2025 23:41:07 -0500 Subject: [PATCH 03/22] Implement polygon tessellation. The `triangulate` function takes a polygon with floating-point lat/lon coordinates, converts to integer coordinates with millimeter precision (using 2^32 scaling), performs constrained Delaunay triangulation, and encodes the resulting triangles with boundary edge information for block kd-tree spatial indexing. It handles polygons with holes correctly, preserving which triangle edges lie on the original polygon boundaries versus internal tessellation edges. --- Cargo.toml | 2 ++ src/spatial/triangle.rs | 71 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 977677d31b..2cdb25b1c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,6 +57,7 @@ measure_time = "0.9.0" arc-swap = "1.5.0" bon = "3.3.1" robust = "1.2" +i_triangle = "0.38.0" columnar = { version = "0.6", path = "./columnar", package = "tantivy-columnar" } sstable = { version = "0.6", path = "./sstable", package = "tantivy-sstable", optional = true } @@ -71,6 +72,7 @@ futures-util = { version = "0.3.28", optional = true } futures-channel = { version = "0.3.28", optional = true } fnv = "1.0.7" typetag = "0.2.21" +geo-types = "0.7.17" [target.'cfg(windows)'.dependencies] winapi = "0.3.9" diff --git a/src/spatial/triangle.rs b/src/spatial/triangle.rs index 689cdf2a74..e25d9734fb 100644 --- a/src/spatial/triangle.rs +++ b/src/spatial/triangle.rs @@ -5,6 +5,9 @@ //! 
contain an additional vertex and packed reconstruction metadata, allowing exact triangle
 //! recovery when needed.
 
+use geo_types::MultiPolygon;
+use i_triangle::i_overlay::i_float::int::point::IntPoint;
+use i_triangle::int::triangulatable::IntTriangulatable;
 use robust::{orient2d, Coord};
 
 const MINY_MINX_MAXY_MAXX_Y_X: i32 = 0;
@@ -300,8 +303,66 @@ impl Triangle {
     }
 }
 
+/// Triangulates a geographic polygon into encoded triangles for block kd-tree indexing.
+///
+/// This function is used by applications to convert a `MultiPolygon` into triangles for indexing
+/// in the block kd-tree. Once triangulated, the encoded triangles can be inserted into the block
+/// kd-tree and queries against the block kd-tree will return the associated `doc_id`.
+///
+/// Takes a polygon with floating-point lat/lon coordinates, converts to integer coordinates with
+/// millimeter precision (using 2^32 scaling), performs constrained Delaunay triangulation, and
+/// encodes the resulting triangles with boundary edge information.
+///
+/// Handles polygons with holes correctly, preserving which triangle edges lie on the original
+/// polygon boundaries versus internal tessellation edges.
+pub fn triangulate<G>(doc_id: u32, geometry: G, triangles: &mut Vec<Triangle>)
+where G: Into<MultiPolygon<f64>> {
+    let multi = geometry.into();
+    for polygon in multi {
+        let exterior: Vec<IntPoint> = polygon
+            .exterior()
+            .coords()
+            .map(|coord| {
+                let lat = (coord.y / (180.0 / (1i64 << 32) as f64)).floor() as i32;
+                let lon = (coord.x / (360.0 / (1i64 << 32) as f64)).floor() as i32;
+                IntPoint::new(lon, lat)
+            })
+            .collect();
+        let mut i_polygon = vec![exterior];
+        for interior in polygon.interiors() {
+            let hole: Vec<IntPoint> = interior
+                .coords()
+                .map(|coord| {
+                    let lat = (coord.y / (180.0 / (1i64 << 32) as f64)).floor() as i32;
+                    let lon = (coord.x / (360.0 / (1i64 << 32) as f64)).floor() as i32;
+                    IntPoint::new(lon, lat)
+                })
+                .collect();
+            i_polygon.push(hole);
+        }
+        let delaunay = i_polygon.triangulate().into_delaunay();
+        for triangle in delaunay.triangles.iter() {
+            let bounds = [
+                triangle.neighbors[0] == usize::MAX,
+                triangle.neighbors[1] == usize::MAX,
+                triangle.neighbors[2] == usize::MAX,
+            ];
+            let v0 = &delaunay.points[triangle.vertices[0].index];
+            let v1 = &delaunay.points[triangle.vertices[1].index];
+            let v2 = &delaunay.points[triangle.vertices[2].index];
+            triangles.push(Triangle::new(
+                doc_id,
+                [v0.y, v0.x, v1.y, v1.x, v2.y, v2.x],
+                bounds,
+            ))
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
+    use geo_types::polygon;
+
     use super::*;
 
     #[test]
@@ -373,4 +434,14 @@ mod tests {
             assert_eq!(decoded_coords, coords);
         }
     }
+
+    #[test]
+    fn triangulate_box() {
+        let polygon = polygon![
+            (x: 0.0, y: 0.0), (x: 10.0, y: 0.0), (x: 10.0, y: 10.0), (x: 0.0, y: 10.0)
+        ];
+        let mut triangles = Vec::new();
+        triangulate(1, polygon, &mut triangles);
+        assert_eq!(triangles.len(), 2);
+    }
 }

From 1c66567efcf0169c6c9edc0e78c914d8837036dd Mon Sep 17 00:00:00 2001
From: Alan Gutierrez
Date: Mon, 13 Oct 2025 01:54:04 -0500
Subject: [PATCH 04/22] Radix selection for block kd-tree partitioning.

Implemented byte-wise histogram selection to find median values without
comparisons, enabling efficient partitioning of spatial data during block
kd-tree construction. Processes values through multiple passes, building
histograms for each byte position after a common prefix, avoiding the need
to sort or compare elements directly.
---
 src/spatial/mod.rs          |   1 +
 src/spatial/radix_select.rs | 122 ++++++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+)
 create mode 100644 src/spatial/radix_select.rs

diff --git a/src/spatial/mod.rs b/src/spatial/mod.rs
index 8d400578ca..cdf5c89f4c 100644
--- a/src/spatial/mod.rs
+++ b/src/spatial/mod.rs
@@ -1,3 +1,4 @@
 //! Spatial module (implements a block kd-tree)
 
+pub mod radix_select;
 pub mod triangle;
diff --git a/src/spatial/radix_select.rs b/src/spatial/radix_select.rs
new file mode 100644
index 0000000000..2397c9eaa9
--- /dev/null
+++ b/src/spatial/radix_select.rs
@@ -0,0 +1,122 @@
+//! Radix selection for block kd-tree partitioning.
+//!
+//! Implements byte-wise histogram selection to find median values without comparisons, enabling
+//! efficient partitioning of spatial data during block kd-tree construction. Processes values
+//! through multiple passes, building histograms for each byte position after a common prefix,
+//! avoiding the need to sort or compare elements directly.
+
+/// Performs radix selection to find the median value without comparisons by building byte-wise
+/// histograms.
+pub struct RadixSelect {
+    histogram: [usize; 256],
+    prefix: Vec<u8>,
+    offset: usize,
+    nth: usize,
+}
+
+impl RadixSelect {
+    /// Creates a new radix selector for finding the nth element among values with a common prefix.
+    ///
+    /// The offset specifies how many matching elements appeared in previous buckets (from earlier
+    /// passes). The nth parameter is 0-indexed, so pass 31 to find the 32nd element (median of
+    /// 64).
+    pub fn new(prefix: Vec<u8>, offset: usize, nth: usize) -> Self {
+        RadixSelect {
+            histogram: [0; 256],
+            prefix,
+            offset,
+            nth,
+        }
+    }
+    /// Updates the histogram with a value if it matches the current prefix.
+    ///
+    /// Values that don't start with the prefix are ignored. For matching values, increments the
+    /// count for the byte at position `prefix.len()`.
+    pub fn update(&mut self, value: i32) {
+        let bytes = value.to_be_bytes();
+        if !bytes.starts_with(&self.prefix) {
+            return;
+        }
+        let byte = bytes[self.prefix.len()];
+        self.histogram[byte as usize] += 1;
+    }
+    /// Finds which bucket contains the nth element and returns the bucket value and offset.
+    ///
+    /// Returns a tuple of `(bucket_byte, count_before)` where bucket_byte is the value of the byte
+    /// that contains the nth element, and count_before is the number of elements in earlier
+    /// buckets (becomes the offset for the next pass).
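+    ///
+    /// A single-pass sketch over the most significant byte, with an empty prefix and no elements
+    /// seen in earlier buckets:
+    ///
+    /// ```
+    /// use tantivy::spatial::radix_select::RadixSelect;
+    ///
+    /// // Find the median (0-indexed nth = 1) of three values.
+    /// let mut select = RadixSelect::new(Vec::new(), 0, 1);
+    /// for value in [0x10101010i32, 0x30101010, 0x20101010] {
+    ///     select.update(value);
+    /// }
+    /// // The nth element lands in bucket 0x20; one element sorts before it.
+    /// assert_eq!(select.nth(), (0x20, 1));
+    /// ```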
+    pub fn nth(&self) -> (u8, usize) {
+        let mut count = self.offset;
+        for (bucket, &frequency) in self.histogram.iter().enumerate() {
+            if count + frequency > self.nth {
+                return (bucket as u8, count);
+            }
+            count += frequency;
+        }
+        panic!("nth element {} not found in histogram", self.nth);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn radix_selection() {
+        let dimensions = [
+            (
+                vec![
+                    0x10101010, 0x10101011, 0x10101012, 0x10101013, 0x10101014, 0x10101015,
+                    0x10101016, 0x10101017, 0x10101018, 0x10101019, 0x1010101A, 0x1010101B,
+                    0x1010101C, 0x1010101D, 0x1010101E, 0x1010101F, 0x10101020, 0x10101021,
+                    0x10101022, 0x10101023, 0x10101024, 0x10101025, 0x10101026, 0x10101027,
+                    0x10101028, 0x10101029, 0x1010102A, 0x1010102B, 0x1010102C, 0x1010102D,
+                    0x1010102E, 0x1010102F, 0x10101030, 0x10101031, 0x10101032, 0x10101033,
+                    0x10101034, 0x10101035, 0x10101036, 0x10101037, 0x10101038, 0x10101039,
+                    0x1010103A, 0x1010103B, 0x1010103C, 0x1010103D, 0x1010103E, 0x1010103F,
+                    0x10101040, 0x10101041, 0x10101042, 0x10101043, 0x10101044, 0x10101045,
+                    0x10101046, 0x10101047, 0x10101048, 0x10101049, 0x1010104A, 0x1010104B,
+                    0x1010104C, 0x1010104D, 0x1010104E, 0x1010104F,
+                ],
+                [(0x10, 0), (0x10, 0), (0x10, 0), (0x2F, 31)],
+            ),
+            (
+                vec![
+                    0x10101010, 0x10101011, 0x10101012, 0x10101013, 0x10101014, 0x10101015,
+                    0x10101016, 0x10101017, 0x10101018, 0x10101019, 0x1010101A, 0x1010101B,
+                    0x1010101C, 0x1010101D, 0x1010101E, 0x1010101F, 0x10101020, 0x10101021,
+                    0x10101022, 0x10101023, 0x10101024, 0x20101025, 0x20201026, 0x20301027,
+                    0x20401028, 0x20501029, 0x2060102A, 0x2070102B, 0x2080102C, 0x2090102D,
+                    0x20A0102E, 0x20B0102F, 0x20C01030, 0x20D01031, 0x20E01032, 0x20F01033,
+                    0x20F11034, 0x20F21035, 0x20F31036, 0x20F41037, 0x20F51038, 0x20F61039,
+                    0x3010103A, 0x3010103B, 0x3010103C, 0x3010103D, 0x3010103E, 0x3010103F,
+                    0x30101040, 0x30101041, 0x30101042, 0x30101043, 0x30101044, 0x30101045,
+                    0x30101046, 0x30101047, 0x30101048, 0x30101049, 0x3010104A, 0x3010104B,
+                    0x3010104C, 0x3010104D, 0x3010104E, 0x3010104F,
+                ],
+                [(0x20, 21), (0xB0, 31), (0x10, 31), (0x2F, 31)],
+            ),
+        ];
+        for (numbers, expected) in dimensions {
+            let mut offset = 0;
+            let mut prefix = Vec::new();
+            for i in 0..4 {
+                let mut radix_select = RadixSelect::new(prefix.clone(), offset, 31);
+                for &number in &numbers {
+                    radix_select.update(number);
+                }
+                let (byte, count) = radix_select.nth();
+                if i != 3 {
+                    assert_eq!(expected[i].0, byte);
+                    assert_eq!(expected[i].1, count);
+                }
+                prefix.push(byte);
+                offset = count;
+            }
+            let mut sorted = numbers.clone();
+            sorted.sort();
+            let radix_result = i32::from_be_bytes(prefix.as_slice().try_into().unwrap());
+            assert_eq!(radix_result, sorted[31]);
+        }
+    }
+}

From 0996bea7accadf0538b23590af8c102731ecfa9a Mon Sep 17 00:00:00 2001
From: Alan Gutierrez
Date: Thu, 16 Oct 2025 02:34:00 -0500
Subject: [PATCH 05/22] Add a surveyor to determine spread and prefix.

Implemented a `Surveyor` that evaluates the bounding boxes of a set of
triangles, determines the dimension with the maximum spread, and computes
the shared byte prefix of the values along that dimension.
---
 src/spatial/mod.rs      |   1 +
 src/spatial/surveyor.rs | 135 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+)
 create mode 100644 src/spatial/surveyor.rs

diff --git a/src/spatial/mod.rs b/src/spatial/mod.rs
index cdf5c89f4c..e14fb6d863 100644
--- a/src/spatial/mod.rs
+++ b/src/spatial/mod.rs
@@ -1,4 +1,5 @@
 //! Spatial module (implements a block kd-tree)
 
 pub mod radix_select;
+pub mod surveyor;
 pub mod triangle;
diff --git a/src/spatial/surveyor.rs b/src/spatial/surveyor.rs
new file mode 100644
index 0000000000..390d3913c1
--- /dev/null
+++ b/src/spatial/surveyor.rs
@@ -0,0 +1,135 @@
+//! Dimension analysis for block kd-tree construction.
+//!
+//! Surveys triangle collections to identify the optimal partition dimension by tracking min/max
+//! spreads and common byte prefixes across the four bounding box dimensions. The dimension with
+//! maximum spread is selected for partitioning, and its prefix is used to optimize radix selection
+//! by skipping shared leading bytes.
+use crate::spatial::triangle::Triangle;
+
+#[derive(Clone, Copy)]
+struct Spread {
+    min: i32,
+    max: i32,
+}
+
+impl Spread {
+    fn survey(&mut self, value: i32) {
+        if value < self.min {
+            self.min = value;
+        }
+        if value > self.max {
+            self.max = value;
+        }
+    }
+    fn spread(&self) -> i32 {
+        self.max - self.min
+    }
+}
+
+impl Default for Spread {
+    fn default() -> Self {
+        Spread {
+            min: i32::MAX,
+            max: i32::MIN,
+        }
+    }
+}
+
+#[derive(Debug)]
+struct Prefix {
+    words: Vec<u8>,
+}
+
+impl Prefix {
+    fn prime(value: i32) -> Self {
+        Prefix {
+            words: value.to_be_bytes().to_vec(),
+        }
+    }
+    fn survey(&mut self, value: i32) {
+        let bytes = value.to_be_bytes();
+        while !bytes.starts_with(&self.words) {
+            self.words.pop();
+        }
+    }
+}
+
+/// Tracks dimension statistics for block kd-tree partitioning decisions.
+///
+/// Accumulates min/max spreads and common byte prefixes across the four bounding box dimensions
+/// `(min_y, min_x, max_y, max_x)` of a triangle collection. Used during tree construction to
+/// select the optimal partition dimension (maximum spread) and identify shared prefixes for radix
+/// selection optimization.
+pub struct Surveyor {
+    spreads: [Spread; 4],
+    prefixes: [Prefix; 4],
+}
+
+impl Surveyor {
+    /// Creates a new surveyor using an initial triangle to establish byte prefixes.
+    ///
+    /// Prefixes require an initial value to begin comparison, so the first triangle's bounding box
+    /// dimensions seed each prefix. Spreads initialize to defaults and will be updated as
+    /// triangles are surveyed.
+    pub fn new(triangle: &Triangle) -> Self {
+        let mut prefixes = Vec::new();
+        for &value in triangle.bbox() {
+            prefixes.push(Prefix::prime(value));
+        }
+        Surveyor {
+            spreads: [Spread::default(); 4],
+            prefixes: prefixes.try_into().unwrap(),
+        }
+    }
+
+    /// Updates dimension statistics with values from another triangle.
+    ///
+    /// Expands the min/max spread for each bounding box dimension and shrinks prefixes to only the
+    /// bytes shared across all triangles surveyed so far.
+    pub fn survey(&mut self, triangle: &Triangle) {
+        for (i, &value) in triangle.bbox().iter().enumerate() {
+            self.spreads[i].survey(value);
+            self.prefixes[i].survey(value);
+        }
+    }
+
+    /// Returns the dimension with maximum spread and its common byte prefix.
+    ///
+    /// Selects the optimal bounding box dimension for partitioning by finding which has the
+    /// largest range (max - min). Returns both the dimension index and its prefix for use in radix
+    /// selection.
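+    ///
+    /// A sketch with two triangles whose bounding boxes differ only along the `min_x` and `max_x`
+    /// dimensions, so one of those is selected:
+    ///
+    /// ```
+    /// use tantivy::spatial::surveyor::Surveyor;
+    /// use tantivy::spatial::triangle::Triangle;
+    ///
+    /// let a = Triangle::new(1, [0, 0, 0, 1, 1, 0], [false; 3]);
+    /// let b = Triangle::new(2, [0, 100, 0, 101, 1, 100], [false; 3]);
+    /// let mut surveyor = Surveyor::new(&a);
+    /// surveyor.survey(&a);
+    /// surveyor.survey(&b);
+    /// let (dimension, _prefix) = surveyor.dimension();
+    /// assert_eq!(dimension, 1); // min_x, the first dimension with the widest spread
+    /// ```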
+    pub fn dimension(&self) -> (usize, &Vec<u8>) {
+        let mut dimension = 0;
+        let mut max_spread = self.spreads[0].spread();
+        for (i, spread) in self.spreads.iter().enumerate().skip(1) {
+            let current_spread = spread.spread();
+            if current_spread > max_spread {
+                dimension = i;
+                max_spread = current_spread;
+            }
+        }
+        (dimension, &self.prefixes[dimension].words)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::spatial::triangle::Triangle;
+
+    #[test]
+    fn survey_triangles() {
+        let triangles = [
+            Triangle::new(1, [0, 0, 8, 1, 10, 3], [false, false, false]),
+            Triangle::new(1, [0, 0, 0, 11, 9, 0], [false, false, false]),
+            Triangle::new(1, [0, 0, 5, 4, 5, 0], [false, false, false]),
+        ];
+        let mut surveyor = Surveyor::new(&triangles[0]);
+        for triangle in &triangles {
+            surveyor.survey(triangle);
+        }
+        let (dimension, prefix) = surveyor.dimension();
+        assert_eq!(dimension, 3);
+        assert_eq!(prefix.len(), 3);
+    }
+}

From f38140f72fb73b6b3a912e6588788f51693280e7 Mon Sep 17 00:00:00 2001
From: Alan Gutierrez
Date: Sat, 18 Oct 2025 03:53:54 -0500
Subject: [PATCH 06/22] Add delta compression for block kd-tree leaf nodes.

Implements dimension-major bit-packing with zigzag encoding for signed i32
deltas, enabling compression of spatially-clustered triangles from 32-bit
coordinates down to 4-19 bits per delta depending on spatial extent.
---
 src/spatial/delta.rs | 300 +++++++++++++++++++++++++++++++++++++++++++
 src/spatial/mod.rs   |   1 +
 2 files changed, 301 insertions(+)
 create mode 100644 src/spatial/delta.rs

diff --git a/src/spatial/delta.rs b/src/spatial/delta.rs
new file mode 100644
index 0000000000..8a9d450459
--- /dev/null
+++ b/src/spatial/delta.rs
@@ -0,0 +1,300 @@
+//! Delta compression for block kd-tree leaves.
+//!
+//! Delta compression with dimension-major bit-packing for block kd-tree leaves. Each leaf contains
+//! ≤512 triangles sorted by the split dimension (the dimension with maximum spread chosen during
+//! tree construction). We store all 512 values for dimension 0, then all for dimension 1, etc.,
+//! enabling tight bit-packing per dimension and better cache locality during decode.
+//!
+//! The split dimension is already optimal for compression. Since triangles in a leaf are spatially
+//! clustered, sorting by the max-spread dimension naturally orders them by proximity in all
+//! dimensions. Testing multiple sort orders would be wasted effort.
+//!
+//! Our encoding uses ~214 units/meter for latitude, ~107 units/meter for longitude (millimeter
+//! precision). A quarter-acre lot (32m × 32m) spans ~6,850 units across 512 sorted triangles = avg
+//! delta ~13 units = 4 bits. A baseball field (100m × 100m) is ~42 unit deltas = 6 bits. Even
+//! Russia-sized polygons (1000 km) average ~418,000 unit deltas = 19 bits. Time will tell if these
+//! numbers are anything to go by in practice.
+//!
+//! Our format for use with leaf-page triangles: First a count of triangles in the page, then the
+//! delta encoded doc_ids followed by delta encoding of each series of the triangle dimensions,
+//! followed by delta encoding of the flags. Creates eight parallel arrays from which triangles can
+//! be reconstructed.
+//!
+//! Note: Tantivy also has delta encoding in `sstable/src/delta.rs`, but that's for string
+//! dictionary compression (prefix sharing + vint deltas). This module uses bit-packing with zigzag
+//! encoding, which is optimal for our signed i32 spatial coordinates with small deltas. It uses
+//! the same basic algorithm to compress u32 doc_ids.
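+//!
+//! The zigzag step is what makes signed deltas pack well; a sketch of the mapping applied by the
+//! private helper below:
+//!
+//! ```
+//! fn zigzag(x: i32) -> u32 {
+//!     ((x << 1) ^ (x >> 31)) as u32
+//! }
+//! // Small magnitudes of either sign land on small unsigned codes.
+//! assert_eq!(zigzag(0), 0);
+//! assert_eq!(zigzag(-1), 1);
+//! assert_eq!(zigzag(1), 2);
+//! assert_eq!(zigzag(-2), 3);
+//! ```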
+use std::io::{self, Write}; + +fn zigzag_encode(x: i32) -> u32 { + ((x << 1) ^ (x >> 31)) as u32 +} + +fn zigzag_decode(x: u32) -> i32 { + ((x >> 1) ^ (0u32.wrapping_sub(x & 1))) as i32 +} + +/// Trait for reading values by index during compression. +/// +/// The `Compressible` trait allows `compress()` to work with two different data sources, +/// `Vec` when indexing and memory mapped `Triangle` when merging. The compress function +/// reads values on-demand via `get()`, computing deltas and bit-packing without intermediate +/// allocations. +pub trait Compressible { + /// The type of the values being compressed. + type Value: Copy; + /// Returns the number of values in this source. + fn len(&self) -> usize; + /// Returns the value at the given index. + fn get(&self, i: usize) -> Self::Value; +} + +/// Operations for types that can be delta-encoded and bit-packed into four-byte words. +pub trait DeltaEncoder: Copy { + /// Computes a zigzag-encoded delta between two values. + fn compute_delta(current: Self, previous: Self) -> u32; + /// Converts a value to little-endian bytes for storage. + fn to_le_bytes(value: Self) -> [u8; 4]; +} + +impl DeltaEncoder for i32 { + fn compute_delta(current: Self, previous: Self) -> u32 { + zigzag_encode(current.wrapping_sub(previous)) + } + fn to_le_bytes(value: Self) -> [u8; 4] { + value.to_le_bytes() + } +} + +// Delta encoding for u32 values using wrapping arithmetic and zigzag encoding. +// +// This handles arbitrary u32 document IDs that may be non-sequential or widely spaced. The +// strategy uses wrapping subtraction followed by zigzag encoding: +// +// 1. wrapping_sub computes the difference modulo 2^32, producing a u32 result +// 2. Cast to i32 reinterprets the bit pattern as signed (two's complement) +// 3. zigzag_encode maps signed values to unsigned for efficient bit-packing: +// - Positive deltas (0, 1, 2...) encode to even numbers (0, 2, 4...) +// - Negative deltas (-1, -2, -3...) encode to odd numbers (1, 3, 5...) +// +// Example with large jump (doc_id 0 → 4,000,000,000): +// delta = 4_000_000_000u32.wrapping_sub(0) = 4_000_000_000u32 +// as i32 = -294,967,296 (bit pattern preserved via two's complement) +// zigzag_encode(-294,967,296) = some u32 value +// +// During decompression, zigzag_decode returns the signed i32 delta, which is cast back to u32 and +// added with wrapping_add. The bit pattern round-trips correctly because wrapping_add and +// wrapping_sub are mathematical inverses modulo 2^32, making this encoding symmetric for the full +// u32 range. +impl DeltaEncoder for u32 { + fn compute_delta(current: Self, previous: Self) -> u32 { + zigzag_encode(current.wrapping_sub(previous) as i32) + } + fn to_le_bytes(value: Self) -> [u8; 4] { + value.to_le_bytes() + } +} + +/// Compresses values from a `Compressible` source using delta encoding and bit-packing. +/// +/// Computes signed deltas between consecutive values, zigzag encodes them, and determines the +/// minimum bit width needed to represent all deltas. Writes a header (1 byte for bit width + +/// 4 bytes for first value in little-endian), then bit-packs the remaining deltas. 
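+///
+/// A compression sketch over a small in-memory source; the `Vals` wrapper is illustrative and
+/// not part of this module:
+///
+/// ```
+/// use tantivy::spatial::delta::{compress, Compressible};
+///
+/// struct Vals(Vec<i32>);
+///
+/// impl Compressible for Vals {
+///     type Value = i32;
+///     fn len(&self) -> usize {
+///         self.0.len()
+///     }
+///     fn get(&self, i: usize) -> i32 {
+///         self.0[i]
+///     }
+/// }
+///
+/// let mut buffer = Vec::new();
+/// compress(&Vals(vec![100, 101, 99, 102]), &mut buffer).unwrap();
+/// // Deltas 1, -2, 3 zigzag to 2, 3, 6, so every delta fits in three bits.
+/// assert_eq!(buffer[0], 3); // bit width header
+/// assert_eq!(&buffer[1..5], &100i32.to_le_bytes()); // first value header
+/// ```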
+pub fn compress<T, W>(compressible: &T, write: &mut W) -> io::Result<()>
+where
+    T: Compressible,
+    T::Value: DeltaEncoder,
+    W: Write,
+{
+    let mut max_delta = 0u32;
+    for i in 1..compressible.len() {
+        let delta = T::Value::compute_delta(compressible.get(i), compressible.get(i - 1));
+        max_delta = max_delta.max(delta);
+    }
+    let bits = if max_delta == 0 {
+        0u32
+    } else {
+        32 - max_delta.leading_zeros()
+    };
+    let mask = if bits == 32 {
+        u32::MAX
+    } else {
+        (1u32 << bits) - 1
+    };
+    write.write_all(&[bits as u8])?;
+    write.write_all(&T::Value::to_le_bytes(compressible.get(0)))?;
+    let mut buffer = 0u64;
+    let mut buffer_bits = 0u32;
+    for i in 1..compressible.len() {
+        let delta = T::Value::compute_delta(compressible.get(i), compressible.get(i - 1));
+        let value = delta & mask;
+        buffer = (buffer << bits) | (value as u64);
+        buffer_bits += bits;
+        while buffer_bits >= 8 {
+            buffer_bits -= 8;
+            write.write_all(&[(buffer >> buffer_bits) as u8])?;
+        }
+    }
+    if buffer_bits > 0 {
+        write.write_all(&[(buffer << (8 - buffer_bits)) as u8])?;
+    }
+    Ok(())
+}
+
+/// Operations needed to decompress delta-encoded values back to their original form.
+pub trait DeltaDecoder: Copy + Sized {
+    /// Converts from little-endian bytes to a value.
+    fn from_le_bytes(bytes: [u8; 4]) -> Self;
+    /// Applies a zigzag-decoded delta to reconstruct the next value.
+    fn apply_delta(value: Self, delta: u32) -> Self;
+}
+
+impl DeltaDecoder for i32 {
+    fn from_le_bytes(bytes: [u8; 4]) -> Self {
+        i32::from_le_bytes(bytes)
+    }
+    fn apply_delta(value: Self, delta: u32) -> Self {
+        value.wrapping_add(zigzag_decode(delta))
+    }
+}
+
+impl DeltaDecoder for u32 {
+    fn from_le_bytes(bytes: [u8; 4]) -> Self {
+        u32::from_le_bytes(bytes)
+    }
+    fn apply_delta(value: Self, delta: u32) -> Self {
+        value.wrapping_add(zigzag_decode(delta) as u32)
+    }
+}
+
+/// Decompresses bit-packed delta-encoded values from a byte slice.
+///
+/// Reads the header to get bit width and first value, then unpacks the bit-packed deltas, applies
+/// zigzag decoding, and reconstructs the original values by accumulating deltas.
+///
+/// Returns the count of bytes read from `data`.
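+///
+/// A decoding sketch over a hand-built header whose bit width is zero, so every value repeats
+/// the first one:
+///
+/// ```
+/// use tantivy::spatial::delta::decompress;
+///
+/// // Header: bit width 0, then the first value 42 in little-endian.
+/// let data = [0u8, 42, 0, 0, 0];
+/// let mut values = Vec::new();
+/// let read = decompress::<i32, _>(&data, 3, |_, value| values.push(value)).unwrap();
+/// assert_eq!(values, vec![42, 42, 42]);
+/// assert_eq!(read, 5);
+/// ```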
+pub fn decompress<T: DeltaDecoder, F>(
+    data: &[u8],
+    count: usize,
+    mut process: F,
+) -> io::Result<usize>
+where
+    F: FnMut(usize, T),
+{
+    if data.len() < 5 {
+        return Err(io::Error::new(
+            io::ErrorKind::UnexpectedEof,
+            "truncated header",
+        ));
+    }
+    let bits = data[0] as u32;
+    let first = T::from_le_bytes([data[1], data[2], data[3], data[4]]);
+    process(0, first);
+    let mut offset = 5;
+    if bits == 0 {
+        // All deltas are zero - all values same as first
+        for i in 1..count {
+            process(i, first);
+        }
+        return Ok(offset);
+    }
+    let mut buffer = 0u64;
+    let mut buffer_bits = 0u32;
+    let mut prev = first;
+    for i in 1..count {
+        // Refill buffer with bytes
+        while buffer_bits < bits {
+            if offset >= data.len() {
+                return Err(io::Error::new(
+                    io::ErrorKind::UnexpectedEof,
+                    format!("expected {} values but only decoded {}", count, i - 1),
+                ));
+            }
+            buffer = (buffer << 8) | (data[offset] as u64);
+            offset += 1;
+            buffer_bits += 8;
+        }
+        // Extract packed value
+        buffer_bits -= bits;
+        let encoded = ((buffer >> buffer_bits) & ((1u64 << bits) - 1)) as u32;
+        let value = T::apply_delta(prev, encoded);
+        process(i, value);
+        prev = value;
+    }
+    Ok(offset)
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    pub struct CompressibleI32Vec {
+        vec: Vec<i32>,
+    }
+
+    impl CompressibleI32Vec {
+        fn new(vec: Vec<i32>) -> Self {
+            CompressibleI32Vec { vec }
+        }
+    }
+
+    impl Compressible for CompressibleI32Vec {
+        type Value = i32;
+        fn len(&self) -> usize {
+            self.vec.len()
+        }
+        fn get(&self, i: usize) -> i32 {
+            self.vec[i]
+        }
+    }
+
+    #[test]
+    fn test_spatial_delta_compress_decompress() {
+        let values = vec![
+            100000, 99975, 100050, 99980, 100100, 100025, 99950, 100150, 100075, 99925, 100200,
+            100100,
+        ];
+        let compressible = CompressibleI32Vec::new(values.clone());
+        let mut buffer = Vec::new();
+        compress(&compressible, &mut buffer).unwrap();
+        let mut vec = Vec::new();
+        decompress::<i32, _>(&buffer, values.len(), |_, value| vec.push(value)).unwrap();
+        assert_eq!(vec, values);
+    }
+
+    #[test]
+    fn test_spatial_delta_bad_header() {
+        let mut vec = Vec::new();
+        let result = decompress::<i32, _>(&[1, 2], 1, |_, value| vec.push(value));
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_spatial_delta_insufficient_data() {
+        let mut vec = Vec::new();
+        let result = decompress::<i32, _>(&[5, 0, 0, 0, 1], 12, |_, value| vec.push(value));
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_spatial_delta_single_item() {
+        let mut vec = Vec::new();
+        decompress::<i32, _>(&[5, 1, 0, 0, 0], 1, |_, value| vec.push(value)).unwrap();
+        assert_eq!(vec[0], 1);
+    }
+
+    #[test]
+    fn test_spatial_delta_zero_length_delta() {
+        let values = vec![1, 1, 1];
+        let compressible = CompressibleI32Vec::new(values.clone());
+        let mut buffer = Vec::new();
+        compress(&compressible, &mut buffer).unwrap();
+        let mut vec = Vec::new();
+        decompress::<i32, _>(&buffer, values.len(), |_, value| vec.push(value)).unwrap();
+        assert_eq!(vec, values);
+    }
+}
diff --git a/src/spatial/mod.rs b/src/spatial/mod.rs
index e14fb6d863..1c677015e3 100644
--- a/src/spatial/mod.rs
+++ b/src/spatial/mod.rs
@@ -1,5 +1,6 @@
 //! Spatial module (implements a block kd-tree)
 
+pub mod delta;
 pub mod radix_select;
 pub mod surveyor;
 pub mod triangle;

From 2dc46b235e59c85b30f51808b75b908b5517f6a7 Mon Sep 17 00:00:00 2001
From: Alan Gutierrez
Date: Sun, 19 Oct 2025 23:58:37 -0500
Subject: [PATCH 07/22] Implement block kd-tree.

Implement an immutable bulk-loaded spatial index using recursive median
partitioning on bounding box dimensions.
Each leaf stores up to 512 triangles with delta-compressed coordinates and doc IDs. The tree provides three query types (intersects, within, contains) that use exact integer arithmetic for geometric predicates and accumulate results in bit sets for efficient deduplication across leaves. The serialized format stores compressed leaf pages followed by the tree structure (leaf and branch nodes), enabling zero-copy access through memory-mapped segments without upfront decompression. --- Cargo.toml | 1 + src/spatial/bkd.rs | 760 ++++++++++++++++++++++++++++++++++++++++ src/spatial/mod.rs | 2 +- src/spatial/surveyor.rs | 135 ------- 4 files changed, 762 insertions(+), 136 deletions(-) create mode 100644 src/spatial/bkd.rs delete mode 100644 src/spatial/surveyor.rs diff --git a/Cargo.toml b/Cargo.toml index 2cdb25b1c4..cb0c2b8bb3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -93,6 +93,7 @@ time = { version = "0.3.10", features = ["serde-well-known", "macros"] } postcard = { version = "1.0.4", features = [ "use-std", ], default-features = false } +geojson = "0.24.2" [target.'cfg(not(windows))'.dev-dependencies] criterion = { version = "0.5", default-features = false } diff --git a/src/spatial/bkd.rs b/src/spatial/bkd.rs new file mode 100644 index 0000000000..99476e1c23 --- /dev/null +++ b/src/spatial/bkd.rs @@ -0,0 +1,760 @@ +//! Block kd-tree spatial indexing for triangulated polygons. +//! +//! Implements an immutable bulk-loaded spatial index using recursive median partitioning on +//! bounding box dimensions. Each leaf stores up to 512 triangles with delta-compressed coordinates +//! and doc IDs. The tree provides three query types (intersects, within, contains) that use exact +//! integer arithmetic for geometric predicates and accumulate results in bit sets for efficient +//! deduplication across leaves. +//! +//! The serialized format stores compressed leaf pages followed by the tree structure (leaf and +//! branch nodes), enabling zero-copy access through memory-mapped segments without upfront +//! decompression. 
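+//!
+//! A build-and-query sketch, in memory rather than memory-mapped; `BitSet::with_max_value` is
+//! assumed from tantivy's `common` crate:
+//!
+//! ```ignore
+//! use common::BitSet;
+//!
+//! let mut triangles = vec![
+//!     Triangle::new(0, [0, 0, 0, 10, 10, 0], [true, true, true]),
+//!     Triangle::new(1, [20, 20, 20, 30, 30, 20], [true, true, true]),
+//! ];
+//! let mut cursor = std::io::Cursor::new(Vec::new());
+//! write_block_kd_tree(&mut triangles, &mut cursor).unwrap();
+//! let data = cursor.into_inner();
+//!
+//! let segment = Segment::new(&data);
+//! let mut result = BitSet::with_max_value(2);
+//! search_intersects(&segment, segment.root_offset, &[0, 0, 5, 5], &mut result).unwrap();
+//! assert!(result.contains(0) && !result.contains(1));
+//! ```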
+use std::io;
+use std::io::{Seek, Write};
+
+use common::BitSet;
+
+use crate::spatial::delta::{compress, decompress, Compressible};
+use crate::spatial::triangle::Triangle;
+
+#[derive(Clone, Copy)]
+struct SpreadSurvey {
+    min: i32,
+    max: i32,
+}
+
+impl SpreadSurvey {
+    fn survey(&mut self, value: i32) {
+        self.min = self.min.min(value);
+        self.max = self.max.max(value);
+    }
+    fn spread(&self) -> i32 {
+        self.max - self.min
+    }
+}
+
+impl Default for SpreadSurvey {
+    fn default() -> Self {
+        SpreadSurvey {
+            min: i32::MAX,
+            max: i32::MIN,
+        }
+    }
+}
+
+#[derive(Clone, Copy)]
+struct BoundingBoxSurvey {
+    bbox: [i32; 4],
+}
+
+impl BoundingBoxSurvey {
+    fn survey(&mut self, triangle: &Triangle) {
+        self.bbox[0] = triangle.words[0].min(self.bbox[0]);
+        self.bbox[1] = triangle.words[1].min(self.bbox[1]);
+        self.bbox[2] = triangle.words[2].max(self.bbox[2]);
+        self.bbox[3] = triangle.words[3].max(self.bbox[3]);
+    }
+    fn bbox(&self) -> [i32; 4] {
+        self.bbox
+    }
+}
+
+impl Default for BoundingBoxSurvey {
+    fn default() -> Self {
+        BoundingBoxSurvey {
+            bbox: [i32::MAX, i32::MAX, i32::MIN, i32::MIN],
+        }
+    }
+}
+
+enum BuildNode {
+    Branch {
+        bbox: [i32; 4],
+        left: Box<BuildNode>,
+        right: Box<BuildNode>,
+    },
+    Leaf {
+        bbox: [i32; 4],
+        pos: u64,
+        len: u16,
+    },
+}
+
+struct CompressibleTriangleI32<'a> {
+    triangles: &'a [Triangle],
+    dimension: usize,
+}
+
+impl<'a> CompressibleTriangleI32<'a> {
+    fn new(triangles: &'a [Triangle], dimension: usize) -> Self {
+        CompressibleTriangleI32 {
+            triangles,
+            dimension,
+        }
+    }
+}
+
+impl<'a> Compressible for CompressibleTriangleI32<'a> {
+    type Value = i32;
+    fn len(&self) -> usize {
+        self.triangles.len()
+    }
+    fn get(&self, i: usize) -> i32 {
+        self.triangles[i].words[self.dimension]
+    }
+}
+
+struct CompressibleTriangleDocID<'a> {
+    triangles: &'a [Triangle],
+}
+
+impl<'a> CompressibleTriangleDocID<'a> {
+    fn new(triangles: &'a [Triangle]) -> Self {
+        CompressibleTriangleDocID { triangles }
+    }
+}
+
+impl<'a> Compressible for CompressibleTriangleDocID<'a> {
+    type Value = u32;
+    fn len(&self) -> usize {
+        self.triangles.len()
+    }
+    fn get(&self, i: usize) -> u32 {
+        self.triangles[i].doc_id
+    }
+}
+
+// Leaf pages are first the count of triangles, followed by delta encoded doc_ids, followed by
+// the delta encoded words in order. We will then have the length of the page. We build a tree
+// after the pages with leaf nodes and branch nodes. Leaf nodes will contain the bounding box
+// of the leaf followed by the position and length of the page. The leaf node is a level of
+// indirection that stores the position and length of the page in a format that is easy to read
+// directly from the mapping.
+//
+// We do not compress the tree nodes. We read them directly from the mapping.
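+//
+// A sketch of the layouts implied by the writers below; leaf pages are variable length, the
+// node records are fixed at 32 bytes:
+//
+//   leaf page:  count u16 | doc_id series | words[0] series | ... | words[6] series
+//   LeafNode:   bbox [i32; 4] | pos u64 | len u16 | pad [u8; 6]
+//   BranchNode: bbox [i32; 4] | left i32 | right i32 | pad [u8; 8]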
+fn write_leaf_pages<W: Write + Seek>(
+    triangles: &mut [Triangle],
+    write: &mut W,
+) -> io::Result<BuildNode> {
+    if triangles.len() <= 512 {
+        let pos = write.stream_position()?;
+        let mut spreads = [SpreadSurvey::default(); 4];
+        let mut bounding_box = BoundingBoxSurvey::default();
+        for triangle in triangles.iter() {
+            for i in 0..4 {
+                spreads[i].survey(triangle.words[i]);
+            }
+            bounding_box.survey(triangle);
+        }
+        let mut max_spread = spreads[0].spread();
+        let mut dimension = 0;
+        for i in 1..4 {
+            let current_spread = spreads[i].spread();
+            if current_spread > max_spread {
+                dimension = i;
+                max_spread = current_spread;
+            }
+        }
+        write.write_all(&(triangles.len() as u16).to_le_bytes())?;
+        triangles.sort_by_key(|t| t.words[dimension]);
+        compress(&CompressibleTriangleDocID::new(triangles), write)?;
+        let compressible = [
+            CompressibleTriangleI32::new(triangles, 0),
+            CompressibleTriangleI32::new(triangles, 1),
+            CompressibleTriangleI32::new(triangles, 2),
+            CompressibleTriangleI32::new(triangles, 3),
+            CompressibleTriangleI32::new(triangles, 4),
+            CompressibleTriangleI32::new(triangles, 5),
+            CompressibleTriangleI32::new(triangles, 6),
+        ];
+        for i in 0..7 {
+            compress(&compressible[i], write)?;
+        }
+        let len = write.stream_position()? - pos;
+        Ok(BuildNode::Leaf {
+            bbox: bounding_box.bbox(),
+            pos,
+            len: len as u16,
+        })
+    } else {
+        let mut spreads = [SpreadSurvey::default(); 4];
+        let mut bounding_box = BoundingBoxSurvey::default();
+        for triangle in triangles.iter() {
+            for i in 0..4 {
+                spreads[i].survey(triangle.words[i]);
+            }
+            bounding_box.survey(triangle);
+        }
+        let mut max_spread = spreads[0].spread();
+        let mut dimension = 0;
+        for i in 0..4 {
+            let current_spread = spreads[i].spread();
+            if current_spread > max_spread {
+                dimension = i;
+                max_spread = current_spread;
+            }
+        }
+        let mid = triangles.len() / 2;
+        triangles.select_nth_unstable_by_key(mid, |t| t.words[dimension]);
+        let partition = triangles[mid].words[dimension];
+        let mut split_point = mid + 1;
+        while split_point < triangles.len() && triangles[split_point].words[dimension] == partition
+        {
+            split_point += 1;
+        }
+        if split_point == triangles.len() {
+            split_point = mid; // Force split at midpoint index
+        }
+        let (left, right) = triangles.split_at_mut(split_point);
+        let left_node = write_leaf_pages(left, write)?;
+        let right_node = write_leaf_pages(right, write)?;
+        Ok(BuildNode::Branch {
+            bbox: bounding_box.bbox(),
+            left: Box::new(left_node),
+            right: Box::new(right_node),
+        })
+    }
+}
+
+fn write_leaf_nodes<W: Write>(node: &BuildNode, write: &mut W) -> io::Result<()> {
+    match node {
+        BuildNode::Branch {
+            bbox: _,
+            left,
+            right,
+        } => {
+            write_leaf_nodes(right, write)?;
+            write_leaf_nodes(left, write)?;
+        }
+        BuildNode::Leaf { bbox, pos, len } => {
+            for &dimension in bbox.iter() {
+                write.write_all(&dimension.to_le_bytes())?;
+            }
+            write.write_all(&pos.to_le_bytes())?;
+            write.write_all(&len.to_le_bytes())?;
+            write.write_all(&[0u8; 6])?;
+        }
+    }
+    Ok(())
+}
+
+fn write_branch_nodes<W: Write>(
+    node: &BuildNode,
+    branch_offset: &mut i32,
+    leaf_offset: &mut i32,
+    write: &mut W,
+) -> io::Result<i32> {
+    match node {
+        BuildNode::Leaf { .. } => {
+            let pos = *leaf_offset;
+            *leaf_offset -= 1;
+            Ok(pos * size_of::<LeafNode>() as i32)
+        }
+        BuildNode::Branch { bbox, left, right } => {
+            let left = write_branch_nodes(left, branch_offset, leaf_offset, write)?;
+            let right = write_branch_nodes(right, branch_offset, leaf_offset, write)?;
+            for &val in bbox {
+                write.write_all(&val.to_le_bytes())?;
+            }
+            write.write_all(&left.to_le_bytes())?;
+            write.write_all(&right.to_le_bytes())?;
+            write.write_all(&[0u8; 8])?;
+            let pos = *branch_offset;
+            *branch_offset += 1;
+            Ok(pos * size_of::<BranchNode>() as i32)
+        }
+    }
+}
+
+/// Builds and serializes a block kd-tree for spatial indexing of triangles.
+///
+/// Takes a collection of triangles and constructs a complete block kd-tree, writing both the
+/// compressed leaf pages and tree structure to the output. The tree uses recursive median
+/// partitioning on the dimension with maximum spread, storing up to 512 triangles per leaf.
+///
+/// The output format consists of:
+/// - Version header (u16)
+/// - Compressed leaf pages (delta-encoded doc_ids and triangle coordinates)
+/// - 32-byte aligned tree structure (leaf nodes, then branch nodes)
+/// - Footer with triangle count, root offset, and branch position
+///
+/// The `triangles` slice will be reordered during tree construction as partitioning sorts by the
+/// selected dimension at each level.
+pub fn write_block_kd_tree<W: Write + Seek>(
+    triangles: &mut [Triangle],
+    write: &mut W,
+) -> io::Result<()> {
+    write.write_all(&1u16.to_le_bytes())?;
+    let tree = write_leaf_pages(triangles, write)?;
+    let current = write.stream_position()?;
+    let aligned = (current + 31) & !31;
+    let padding = aligned - current;
+    write.write_all(&vec![0u8; padding as usize])?;
+    write_leaf_nodes(&tree, write)?;
+    let branch_position = write.stream_position()?;
+    let mut branch_offset: i32 = 0;
+    let mut leaf_offset: i32 = -1;
+    let root = write_branch_nodes(&tree, &mut branch_offset, &mut leaf_offset, write)?;
+    write.write_all(&(triangles.len() as u64).to_le_bytes())?;
+    write.write_all(&root.to_le_bytes())?;
+    write.write_all(&branch_position.to_le_bytes())?;
+    Ok(())
+}
+
+fn decompress_leaf(data: &[u8]) -> io::Result<Vec<Triangle>> {
+    let count = u16::from_le_bytes([data[0], data[1]]) as usize;
+    let mut offset = 2;
+    let mut triangles = Vec::with_capacity(count);
+    offset += decompress::<u32, _>(&data[offset..], count, |_, doc_id| {
+        triangles.push(Triangle::skeleton(doc_id))
+    })?;
+    for i in 0..7 {
+        offset += decompress::<i32, _>(&data[offset..], count, |j, word| {
+            triangles[j].words[i] = word
+        })?;
+    }
+    Ok(triangles)
+}
+
+#[repr(C)]
+struct BranchNode {
+    bbox: [i32; 4],
+    left: i32,
+    right: i32,
+    pad: [u8; 8],
+}
+
+#[repr(C)]
+struct LeafNode {
+    bbox: [i32; 4],
+    pos: u64,
+    len: u16,
+    pad: [u8; 6],
+}
+
+/// A read-only view into a serialized block kd-tree segment.
+///
+/// Provides access to the tree structure and compressed leaf data through memory-mapped or
+/// buffered byte slices. The segment contains compressed leaf pages followed by the tree structure
+/// (leaf nodes and branch nodes), with a footer containing metadata for locating the root and
+/// interpreting offsets.
+pub struct Segment<'a> {
+    data: &'a [u8],
+    branch_position: u64,
+    /// Offset to the root of the tree, used as the starting point for traversal.
+    pub root_offset: i32,
+}
+
+impl<'a> Segment<'a> {
+    /// Creates a new segment from serialized block kd-tree data.
+    ///
+    /// Reads the footer metadata from the last 12 bytes to locate the tree structure and root
+    /// node.
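+    ///
+    /// A usage sketch, where `data` holds bytes produced by `write_block_kd_tree` (for example
+    /// read back from a memory-mapped file):
+    ///
+    /// ```ignore
+    /// let segment = Segment::new(&data[..]);
+    /// let mut result = BitSet::with_max_value(max_doc);
+    /// search_intersects(&segment, segment.root_offset, &query, &mut result)?;
+    /// ```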
+    pub fn new(data: &'a [u8]) -> Self {
+        Segment {
+            data,
+            branch_position: u64::from_le_bytes(data[data.len() - 8..].try_into().unwrap()),
+            root_offset: i32::from_le_bytes(
+                data[data.len() - 12..data.len() - 8].try_into().unwrap(),
+            ),
+        }
+    }
+    fn bounding_box(&self, offset: i32) -> &[i32; 4] {
+        unsafe {
+            let byte_offset = (self.branch_position as i64 + offset as i64) as usize;
+            let ptr = self.data.as_ptr().add(byte_offset);
+            &*ptr.cast::<[i32; 4]>()
+        }
+    }
+    fn branch_node(&self, offset: i32) -> &BranchNode {
+        unsafe {
+            let byte_offset = (self.branch_position as i64 + offset as i64) as usize;
+            let ptr = self.data.as_ptr().add(byte_offset);
+            &*ptr.cast::<BranchNode>()
+        }
+    }
+    fn leaf_node(&self, offset: i32) -> &LeafNode {
+        unsafe {
+            let byte_offset = (self.branch_position as i64 + offset as i64) as usize;
+            let ptr = self.data.as_ptr().add(byte_offset);
+            &*ptr.cast::<LeafNode>()
+        }
+    }
+    fn leaf_page(&self, leaf_node: &LeafNode) -> &[u8] {
+        &self.data[(leaf_node.pos as usize)..(leaf_node.pos as usize + leaf_node.len as usize)]
+    }
+}
+
+fn collect_all_docs(segment: &Segment, offset: i32, result: &mut BitSet) -> io::Result<()> {
+    if offset < 0 {
+        let leaf_node = segment.leaf_node(offset);
+        let data = segment.leaf_page(leaf_node);
+        let count = u16::from_le_bytes([data[0], data[1]]) as usize;
+        decompress::<u32, _>(&data[2..], count, |_, doc_id| result.insert(doc_id))?;
+    } else {
+        let branch_node = segment.branch_node(offset);
+        collect_all_docs(segment, branch_node.left, result)?;
+        collect_all_docs(segment, branch_node.right, result)?;
+    }
+    Ok(())
+}
+
+fn bbox_within(bbox: &[i32; 4], query: &[i32; 4]) -> bool {
+    bbox[0] >= query[0] && // min_y >= query_min_y
+    bbox[1] >= query[1] && // min_x >= query_min_x
+    bbox[2] <= query[2] && // max_y <= query_max_y
+    bbox[3] <= query[3] // max_x <= query_max_x
+}
+
+fn bbox_intersects(bbox: &[i32; 4], query: &[i32; 4]) -> bool {
+    !(bbox[2] < query[0] || bbox[0] > query[2] || bbox[3] < query[1] || bbox[1] > query[3])
+}
+
+/// Finds documents with triangles that intersect the query bounding box.
+///
+/// Traverses the tree starting at `offset` (typically `segment.root_offset`), pruning subtrees
+/// whose bounding boxes don't intersect the query. When a node's bbox is entirely within the
+/// query, all its documents are bulk-collected. Otherwise, individual triangles are tested using
+/// exact geometric predicates.
+///
+/// The query is `[min_y, min_x, max_y, max_x]` in integer coordinates. Documents are inserted into
+/// the `result` BitSet, which automatically deduplicates when the same document appears in
+/// multiple leaves.
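+///
+/// A query sketch built from geographic corners; `max_doc` is an assumed upper bound on the doc
+/// ids stored in the segment:
+///
+/// ```ignore
+/// let query = latlon_to_bbox(40.6, -74.1, 40.8, -73.9);
+/// let mut result = BitSet::with_max_value(max_doc);
+/// search_intersects(&segment, segment.root_offset, &query, &mut result)?;
+/// ```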
+pub fn search_intersects( + segment: &Segment, + offset: i32, + query: &[i32; 4], + result: &mut BitSet, +) -> io::Result<()> { + let bbox = segment.bounding_box(offset); + // bbox doesn't intersect query → skip entire subtree + if !bbox_intersects(bbox, query) { + } + // bbox entirely within query → all triangles intersect + else if bbox_within(bbox, query) { + collect_all_docs(segment, offset, result)?; + } else if offset < 0 { + // bbox crosses query → test each triangle + let leaf_node = segment.leaf_node(offset); + let triangles = decompress_leaf(segment.leaf_page(leaf_node))?; + for triangle in &triangles { + if triangle_intersects(triangle, query) { + result.insert(triangle.doc_id); // BitSet deduplicates + } + } + } else { + let branch_node = segment.branch_node(offset); + // bbox crosses query → must check children + search_intersects(segment, branch_node.left, query, result)?; + search_intersects(segment, branch_node.right, query, result)?; + } + Ok(()) +} + +fn line_intersects_line( + x1: i32, + y1: i32, + x2: i32, + y2: i32, + x3: i32, + y3: i32, + x4: i32, + y4: i32, +) -> bool { + // Cast to i128 to prevent overflow in coordinate arithmetic + let x1 = x1 as i128; + let y1 = y1 as i128; + let x2 = x2 as i128; + let y2 = y2 as i128; + let x3 = x3 as i128; + let y3 = y3 as i128; + let x4 = x4 as i128; + let y4 = y4 as i128; + + // Proper segment-segment intersection test + let d = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4); + if d == 0 { + // parallel + return false; + } + + let t = (x1 - x3) * (y3 - y4) - (y1 - y3) * (x3 - x4); + let u = -((x1 - x2) * (y1 - y3) - (y1 - y2) * (x1 - x3)); + + if d > 0 { + t >= 0 && t <= d && u >= 0 && u <= d + } else { + t <= 0 && t >= d && u <= 0 && u >= d + } +} + +fn edge_intersects_bbox(x1: i32, y1: i32, x2: i32, y2: i32, bbox: &[i32; 4]) -> bool { + // Test against all 4 rectangle edges, bottom, right, top, left. + line_intersects_line(x1, y1, x2, y2, bbox[1], bbox[0], bbox[3], bbox[0]) + || line_intersects_line(x1, y1, x2, y2, bbox[3], bbox[0], bbox[3], bbox[2]) + || line_intersects_line(x1, y1, x2, y2, bbox[3], bbox[2], bbox[1], bbox[2]) + || line_intersects_line(x1, y1, x2, y2, bbox[1], bbox[2], bbox[1], bbox[0]) +} + +fn edge_crosses_bbox(x1: i32, y1: i32, x2: i32, y2: i32, bbox: &[i32; 4]) -> bool { + // Edge has endpoint outside while other is inside (crosses boundary) + let p1_inside = y1 >= bbox[0] && x1 >= bbox[1] && y1 <= bbox[2] && x1 <= bbox[3]; + let p2_inside = y2 >= bbox[0] && x2 >= bbox[1] && y2 <= bbox[2] && x2 <= bbox[3]; + p1_inside != p2_inside +} + +fn triangle_within(triangle: &Triangle, query: &[i32; 4]) -> bool { + let tri_bbox = &triangle.words[0..4]; + + // Triangle bbox entirely within query → WITHIN + if tri_bbox[0] >= query[0] + && tri_bbox[1] >= query[1] + && tri_bbox[2] <= query[2] + && tri_bbox[3] <= query[3] + { + return true; + } + + // Triangle bbox entirely outside → NOT WITHIN + if tri_bbox[2] < query[0] + || tri_bbox[3] < query[1] + || tri_bbox[0] > query[2] + || tri_bbox[1] > query[3] + { + return false; + } + + // Decode vertices. 
+ let ([ay, ax, by, bx, cy, cx], [ab, bc, ca]) = triangle.decode(); + + // Check each edge - if boundary edge crosses query bbox, NOT WITHIN + if ab && edge_crosses_bbox(ax, ay, bx, by, query) { + return false; + } + if bc && edge_crosses_bbox(bx, by, cx, cy, query) { + return false; + } + if ca && edge_crosses_bbox(cx, cy, ax, ay, query) { + return false; + } + + // No boundary edges cross out + true +} + +fn point_in_triangle( + px: i32, + py: i32, + ax: i32, + ay: i32, + bx: i32, + by: i32, + cx: i32, + cy: i32, +) -> bool { + let v0x = (cx - ax) as i128; + let v0y = (cy - ay) as i128; + let v1x = (bx - ax) as i128; + let v1y = (by - ay) as i128; + let v2x = (px - ax) as i128; + let v2y = (py - ay) as i128; + + let dot00 = v0x * v0x + v0y * v0y; + let dot01 = v0x * v1x + v0y * v1y; + let dot02 = v0x * v2x + v0y * v2y; + let dot11 = v1x * v1x + v1y * v1y; + let dot12 = v1x * v2x + v1y * v2y; + + let denom = dot00 * dot11 - dot01 * dot01; + if denom == 0 { + return false; + } + + let u = dot11 * dot02 - dot01 * dot12; + let v = dot00 * dot12 - dot01 * dot02; + + u >= 0 && v >= 0 && u + v <= denom +} + +fn triangle_intersects(triangle: &Triangle, query: &[i32; 4]) -> bool { + let tri_bbox = &triangle.words[0..4]; + + // Quick reject: bboxes don't overlap + if tri_bbox[2] < query[0] + || tri_bbox[3] < query[1] + || tri_bbox[0] > query[2] + || tri_bbox[1] > query[3] + { + return false; + } + + let ([ay, ax, by, bx, cy, cx], _) = triangle.decode(); + + // Any triangle vertex inside rectangle? + if (ax >= query[1] && ax <= query[3] && ay >= query[0] && ay <= query[2]) + || (bx >= query[1] && bx <= query[3] && by >= query[0] && by <= query[2]) + || (cx >= query[1] && cx <= query[3] && cy >= query[0] && cy <= query[2]) + { + return true; + } + + // Any rectangle corner inside triangle? + let corners = [ + (query[1], query[0]), // min_x, min_y + (query[3], query[0]), // max_x, min_y + (query[3], query[2]), // max_x, max_y + (query[1], query[2]), // min_x, max_y + ]; + for (x, y) in corners { + if point_in_triangle(x, y, ax, ay, bx, by, cx, cy) { + return true; + } + } + + // Any triangle edge intersect rectangle edges? + edge_intersects_bbox(ax, ay, bx, by, query) + || edge_intersects_bbox(bx, by, cx, cy, query) + || edge_intersects_bbox(cx, cy, ax, ay, query) +} + +/// Finds documents where all triangles are within the query bounding box. +/// +/// Traverses the tree starting at `offset` (typically `segment.root_offset`), testing each +/// triangle to determine if it lies entirely within the query bounds. Uses two `BitSet` instances +/// to track state: `result` accumulates candidate documents, while `excluded` marks documents that +/// have at least one triangle extending outside the query. +/// +/// The query is `[min_y, min_x, max_y, max_x]` in integer coordinates. The final result is +/// documents in `result` that are NOT in `excluded` - the caller must compute this difference. 
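+///
+/// # Example
+///
+/// A minimal sketch of the two-bitset protocol; `segment`, `query`, and `max_doc` are
+/// illustrative stand-ins:
+///
+/// ```ignore
+/// use common::BitSet;
+///
+/// let mut result = BitSet::with_max_value(max_doc);
+/// let mut excluded = BitSet::with_max_value(max_doc);
+/// search_within(&segment, segment.root_offset, &query, &mut result, &mut excluded)?;
+/// // Keep only documents with no triangle extending outside the query.
+/// let within: Vec<u32> = (0..max_doc)
+///     .filter(|&doc| result.contains(doc) && !excluded.contains(doc))
+///     .collect();
+/// ```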
+pub fn search_within( + segment: &Segment, + offset: i32, + query: &[i32; 4], // [min_y, min_x, max_y, max_x] + result: &mut BitSet, + excluded: &mut BitSet, +) -> io::Result<()> { + let bbox = segment.bounding_box(offset); + if !bbox_intersects(bbox, query) { + } else if offset < 0 { + let leaf_node = segment.leaf_node(offset); + // bbox crosses query → test each triangle + let triangles = decompress_leaf(segment.leaf_page(leaf_node))?; + for triangle in &triangles { + if triangle_intersects(triangle, query) { + if excluded.contains(triangle.doc_id) { + continue; // Already excluded + } + if triangle_within(triangle, query) { + result.insert(triangle.doc_id); + } else { + excluded.insert(triangle.doc_id); + } + } + } + } else { + let branch_node = segment.branch_node(offset); + search_within(segment, branch_node.left, query, result, excluded)?; + search_within(segment, branch_node.right, query, result, excluded)?; + } + Ok(()) +} + +enum ContainsRelation { + CANDIDATE, // Query might be contained + NOTWITHIN, // Query definitely not contained + DISJOINT, // Triangle doesn't overlap query +} + +fn triangle_contains_relation(triangle: &Triangle, query: &[i32; 4]) -> ContainsRelation { + let tri_bbox = &triangle.words[0..4]; + + if query[2] < tri_bbox[0] + || query[3] < tri_bbox[1] + || query[0] > tri_bbox[2] + || query[1] > tri_bbox[3] + { + return ContainsRelation::DISJOINT; + } + + let ([ay, ax, by, bx, cy, cx], [ab, bc, ca]) = triangle.decode(); + + let corners = [ + (query[1], query[0]), + (query[3], query[0]), + (query[3], query[2]), + (query[1], query[2]), + ]; + + let mut any_corner_inside = false; + for &(qx, qy) in &corners { + if point_in_triangle(qx, qy, ax, ay, bx, by, cx, cy) { + any_corner_inside = true; + break; + } + } + + let ab_intersects = edge_intersects_bbox(ax, ay, bx, by, query); + let bc_intersects = edge_intersects_bbox(bx, by, cx, cy, query); + let ca_intersects = edge_intersects_bbox(cx, cy, ax, ay, query); + + if (ab && edge_crosses_bbox(ax, ay, bx, by, query)) + || (bc && edge_crosses_bbox(bx, by, cx, cy, query)) + || (ca && edge_crosses_bbox(cx, cy, ax, ay, query)) + { + return ContainsRelation::NOTWITHIN; + } + + if any_corner_inside || ab_intersects || bc_intersects || ca_intersects { + return ContainsRelation::CANDIDATE; + } + + ContainsRelation::DISJOINT +} + +/// Finds documents whose polygons contain the query bounding box. +/// +/// Traverses the tree starting at `offset` (typically `segment.root_offset`), testing each +/// triangle using three-state logic: `CANDIDATE` (query might be contained), `NOTWITHIN` (boundary +/// edge crosses query), or `DISJOINT` (no overlap). Only boundary edges are tested for crossing - +/// internal tessellation edges are ignored. +/// +/// The query is `[min_y, min_x, max_y, max_x]` in integer coordinates. Uses two `BitSet` +/// instances: `result` accumulates candidates, `excluded` marks documents with disqualifying +/// boundary crossings. The final result is documents in `result` that are NOT in `excluded`. 
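+///
+/// # Example
+///
+/// A minimal sketch mirroring the `search_within` protocol; `segment`, `query`, and `max_doc`
+/// are illustrative stand-ins:
+///
+/// ```ignore
+/// use common::BitSet;
+///
+/// let mut result = BitSet::with_max_value(max_doc);
+/// let mut excluded = BitSet::with_max_value(max_doc);
+/// search_contains(&segment, segment.root_offset, &query, &mut result, &mut excluded)?;
+/// // A document contains the query only if no boundary edge disqualified it.
+/// let contains: Vec<u32> = (0..max_doc)
+///     .filter(|&doc| result.contains(doc) && !excluded.contains(doc))
+///     .collect();
+/// ```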
+pub fn search_contains( + segment: &Segment, + offset: i32, + query: &[i32; 4], + result: &mut BitSet, + excluded: &mut BitSet, +) -> io::Result<()> { + let bbox = segment.bounding_box(offset); + if !bbox_intersects(bbox, query) { + } else if offset < 0 { + let leaf_node = segment.leaf_node(offset); + // bbox crosses query → test each triangle + let triangles = decompress_leaf(segment.leaf_page(leaf_node))?; + for triangle in &triangles { + if triangle_intersects(triangle, query) { + let doc_id = triangle.doc_id; + if excluded.contains(doc_id) { + continue; + } + match triangle_contains_relation(triangle, query) { + ContainsRelation::CANDIDATE => result.insert(doc_id), + ContainsRelation::NOTWITHIN => excluded.insert(doc_id), + ContainsRelation::DISJOINT => {} + } + } + } + } else { + let branch_node = segment.branch_node(offset); + search_contains(segment, branch_node.left, query, result, excluded)?; + search_contains(segment, branch_node.right, query, result, excluded)?; + } + Ok(()) +} diff --git a/src/spatial/mod.rs b/src/spatial/mod.rs index 1c677015e3..fa5498c646 100644 --- a/src/spatial/mod.rs +++ b/src/spatial/mod.rs @@ -1,6 +1,6 @@ //! Spatial module (implements a block kd-tree) +pub mod bkd; pub mod delta; pub mod radix_select; -pub mod surveyor; pub mod triangle; diff --git a/src/spatial/surveyor.rs b/src/spatial/surveyor.rs deleted file mode 100644 index 390d3913c1..0000000000 --- a/src/spatial/surveyor.rs +++ /dev/null @@ -1,135 +0,0 @@ -//! Dimension analysis for block kd-tree construction. -//! -//! Surveys triangle collections to identify the optimal partition dimension by tracking min/max -//! spreads and common byte prefixes across the four bounding box dimensions. The dimension with -//! maximum spread is selected for partitioning, and its prefix is used to optimize radix selection -//! by skipping shared leading bytes. -use crate::spatial::triangle::Triangle; - -#[derive(Clone, Copy)] -struct Spread { - min: i32, - max: i32, -} - -impl Spread { - fn survey(&mut self, value: i32) { - if value < self.min { - self.min = value; - } - if value > self.max { - self.max = value; - } - } - fn spread(&self) -> i32 { - self.max - self.min - } -} - -impl Default for Spread { - fn default() -> Self { - Spread { - min: i32::MAX, - max: i32::MIN, - } - } -} - -#[derive(Debug)] -struct Prefix { - words: Vec, -} - -impl Prefix { - fn prime(value: i32) -> Self { - Prefix { - words: value.to_be_bytes().to_vec(), - } - } - fn survey(&mut self, value: i32) { - let bytes = value.to_be_bytes(); - while !bytes.starts_with(&self.words) { - self.words.pop(); - } - } -} - -/// Tracks dimension statistics for block kd-tree partitioning decisions. -/// -/// Accumulates min/max spreads and common byte prefixes across the four bounding box dimensions -/// `(min_y, min_x, max_y, max_x)` of a triangle collection. Used during tree construction to -/// select the optimal partition dimension (maximum spread) and identify shared prefixes for radix -/// selection optimization. -pub struct Surveyor { - spreads: [Spread; 4], - prefixes: [Prefix; 4], -} - -impl Surveyor { - /// Creates a new surveyor using an initial triangle to establish byte prefixes. - /// - /// Prefixes require an initial value to begin comparison, so the first triangle's bounding box - /// dimensions seed each prefix. Spreads initialize to defaults and will be updated as - /// triangles are surveyed. 
- pub fn new(triangle: &Triangle) -> Self { - let mut prefixes = Vec::new(); - for &value in triangle.bbox() { - prefixes.push(Prefix::prime(value)); - } - Surveyor { - spreads: [Spread::default(); 4], - prefixes: prefixes.try_into().unwrap(), - } - } - - /// Updates dimension statistics with values from another triangle. - /// - /// Expands the min/max spread for each bounding box dimension and shrinks prefixes to only the - /// bytes shared across all triangles surveyed so far. - pub fn survey(&mut self, triangle: &Triangle) { - for (i, &value) in triangle.bbox().iter().enumerate() { - self.spreads[i].survey(value); - self.prefixes[i].survey(value); - } - } - - /// Returns the dimension with maximum spread and its common byte prefix. - /// - /// Selects the optimal bounding box dimension for partitioning by finding which has the - /// largest range (max - min). Returns both the dimension index and its prefix for use in radix - /// selection. - pub fn dimension(&self) -> (usize, &Vec) { - let mut dimension = 0; - let mut max_spread = self.spreads[0].spread(); - for (i, spread) in self.spreads.iter().enumerate().skip(1) { - let current_spread = spread.spread(); - if current_spread > max_spread { - dimension = i; - max_spread = current_spread; - } - } - (dimension, &self.prefixes[dimension].words) - } -} - -#[cfg(test)] -mod test { - use super::*; - use crate::spatial::triangle::Triangle; - - #[test] - fn survey_triangles() { - let triangles = [ - Triangle::new(1, [0, 0, 8, 1, 10, 3], [false, false, false]), - Triangle::new(1, [0, 0, 0, 11, 9, 0], [false, false, false]), - Triangle::new(1, [0, 0, 5, 4, 5, 0], [false, false, false]), - ]; - let mut surveyor = Surveyor::new(&triangles[0]); - for triangle in &triangles { - surveyor.survey(triangle); - } - let (dimension, prefix) = surveyor.dimension(); - assert_eq!(dimension, 3); - assert_eq!(prefix.len(), 3); - } -} From ccdf399cd75e67a978b5a157c01dc8a6f1ac89bd Mon Sep 17 00:00:00 2001 From: Alan Gutierrez Date: Sun, 26 Oct 2025 22:47:38 -0500 Subject: [PATCH 08/22] XOR delta compression for f64 polygon coordinates. Lossless compression for floating-point lat/lon coordinates using XOR delta encoding on IEEE 754 bit patterns with variable-length integer encoding. Designed for per-polygon random access in the document store, where each polygon compresses independently without requiring sequential decompression. --- src/spatial/mod.rs | 1 + src/spatial/xor.rs | 128 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 src/spatial/xor.rs diff --git a/src/spatial/mod.rs b/src/spatial/mod.rs index fa5498c646..0e75f3d89e 100644 --- a/src/spatial/mod.rs +++ b/src/spatial/mod.rs @@ -4,3 +4,4 @@ pub mod bkd; pub mod delta; pub mod radix_select; pub mod triangle; +pub mod xor; diff --git a/src/spatial/xor.rs b/src/spatial/xor.rs new file mode 100644 index 0000000000..af069a43a2 --- /dev/null +++ b/src/spatial/xor.rs @@ -0,0 +1,128 @@ +//! XOR delta compression for f64 polygon coordinates. +//! +//! Lossless compression for floating-point lat/lon coordinates using XOR delta encoding on IEEE +//! 754 bit patterns with variable-length integer encoding. Designed for per-polygon random access +//! in the document store, where each polygon compresses independently without requiring sequential +//! decompression. +//! +//! Spatially local coordinates share most high-order bits. A municipal boundary spanning 1km has +//! consecutive vertices typically within 100-500 meters, meaning their f64 bit patterns share +//! 
30-40 bits. XOR reveals these common bits as zeros, which varint encoding then compresses +//! efficiently. +//! +//! The format stores the first coordinate as raw 8 bytes, then XOR deltas between consecutive +//! coordinates encoded as variable-length integers. When compression produces larger output than +//! the raw input (random data, compression-hostile patterns), the function automatically falls +//! back to storing coordinates as uncompressed 8-byte values. +//! +//! Unlike delta.rs which uses arithmetic deltas for i32 spatial coordinates in the block kd-tree, +//! this module operates on f64 bit patterns directly to preserve exact floating-point values for +//! returning to users. +use std::io::{Cursor, Read}; + +use common::VInt; + +/// Compresses f64 coordinates using XOR delta encoding with automatic raw fallback. +/// +/// Stores the first coordinate as raw bits, then computes XOR between consecutive coordinate bit +/// patterns and encodes as variable-length integers. If the compressed output would be larger than +/// raw storage (8 bytes per coordinate), automatically falls back to raw encoding. +/// +/// Returns a byte vector that can be decompressed with `decompress_f64()` to recover exact +/// original values. +pub fn compress_f64(values: &[f64]) -> Vec { + if values.is_empty() { + return Vec::new(); + } + let mut output = Vec::new(); + let mut previous = values[0].to_bits(); + output.extend_from_slice(&previous.to_le_bytes()); + for &value in &values[1..] { + let bits = value.to_bits(); + let xor = bits ^ previous; + VInt(xor).serialize_into_vec(&mut output); + previous = bits + } + if output.len() >= values.len() * 8 { + let mut output = Vec::with_capacity(values.len() * 8); + for &value in values { + output.extend_from_slice(&value.to_bits().to_le_bytes()); + } + return output; + } + output +} + +/// Decompresses f64 coordinates from XOR delta or raw encoding. +/// +/// Detects compression format by byte length - if `bytes.len() == count * 8`, data is raw and +/// copied directly. Otherwise, reads first coordinate from 8 bytes, then XOR deltas as varints, +/// reconstructing the original sequence. +/// +/// Returns exact f64 values that were passed to `compress_f64()`. +pub fn decompress_f64(bytes: &[u8], count: usize) -> Vec { + let mut values = Vec::with_capacity(count); + if bytes.len() == count * 8 { + for i in 0..count { + let bits = u64::from_le_bytes(bytes[i * 8..(i + 1) * 8].try_into().unwrap()); + values.push(f64::from_bits(bits)); + } + return values; + } + let mut cursor = Cursor::new(bytes); + + // Read first value (raw 8 bytes) + let mut first_bytes = [0u8; 8]; + cursor.read_exact(&mut first_bytes).unwrap(); + let mut previous = u64::from_le_bytes(first_bytes); + values.push(f64::from_bits(previous)); + + // Read remaining values as VInt XORs + while values.len() < count { + let xor = VInt::deserialize_u64(&mut cursor).unwrap(); + let bits = previous ^ xor; + values.push(f64::from_bits(bits)); + previous = bits; + } + + values +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress_spatial_locality() { + // Small town polygon - longitude only. 
+ let longitudes = vec![ + 40.7580, 40.7581, 40.7582, 40.7583, 40.7584, 40.7585, 40.7586, 40.7587, + ]; + let bytes = compress_f64(&longitudes); + // Should compress well - XOR deltas will be small + assert_eq!(bytes.len(), 46); + // Should decompress to exact original values + let decompressed = decompress_f64(&bytes, longitudes.len()); + assert_eq!(longitudes, decompressed); + } + #[test] + fn test_fallback_to_raw() { + // Random, widely scattered values - poor compression + let values = vec![ + 12345.6789, + -98765.4321, + 0.00001, + 999999.999, + -0.0, + std::f64::consts::PI, + std::f64::consts::E, + 42.0, + ]; + let bytes = compress_f64(&values); + // Should fall back to raw storage + assert_eq!(bytes.len(), values.len() * 8); + // Should still decompress correctly + let decompressed = decompress_f64(&bytes, values.len()); + assert_eq!(values, decompressed); + } +} From d3049cb323db8c32c3b020d3ee91bb79dcaf057d Mon Sep 17 00:00:00 2001 From: Alan Gutierrez Date: Sun, 26 Oct 2025 22:58:58 -0500 Subject: [PATCH 09/22] Triangulation is not just a conversion. The triangulation function in `triangle.rs` is now called `delaunay_to_triangles` and it accepts the output of a Delaunay triangulation from `i_triangle` and not a GeoRust multi-polygon. The translation of user polygons to `i_triangle` polygons and subsequent triangulation will take place outside of `triangle.rs`. --- src/spatial/triangle.rs | 92 +++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 59 deletions(-) diff --git a/src/spatial/triangle.rs b/src/spatial/triangle.rs index e25d9734fb..562d55b85b 100644 --- a/src/spatial/triangle.rs +++ b/src/spatial/triangle.rs @@ -5,9 +5,7 @@ //! contain an additional vertex and packed reconstruction metadata, allowing exact triangle //! recovery when needed. -use geo_types::MultiPolygon; -use i_triangle::i_overlay::i_float::int::point::IntPoint; -use i_triangle::int::triangulatable::IntTriangulatable; +use i_triangle::advanced::delaunay::IntDelaunay; use robust::{orient2d, Coord}; const MINY_MINX_MAXY_MAXX_Y_X: i32 = 0; @@ -303,65 +301,37 @@ impl Triangle { } } -/// Triangulates a geographic polygon into encoded triangles for block kd-tree indexing. +/// Encodes the triangles of a Delaunay triangulation into block kd-tree triangles. /// -/// This function is used by applications to convert a `MultiPolygon` into triangles for indexing -/// in the block kd-tree. Once triangulated, the encoded triangles can be inserted into the block -/// kd-tree and queries against the block kd-tree will return the associated `doc_id`. +/// Takes the output of a Delaunay triangulation from `i_triangle` and encodes each triangle into +/// the normalized triangle used by the block kd-tree. Each triangle includes its bounding box, +/// vertex coordinates, and boundary edge flags that distinguish original polygon edges from +/// internal tessellation edges. /// -/// Takes a polygon with floating-point lat/lon coordinates, converts to integer coordinates with -/// millimeter precision (using 2^32 scaling), performs constrained Delaunay triangulation, and -/// encodes the resulting triangles with boundary edge information. -/// -/// Handles polygons with holes correctly, preserving which triangle edges lie on the original -/// polygon boundaries versus internal tessellation edges. 
-pub fn triangulate(doc_id: u32, geometry: G, triangles: &mut Vec) -where G: Into> { - let multi = geometry.into(); - for polygon in multi { - let exterior: Vec = polygon - .exterior() - .coords() - .map(|coord| { - let lat = (coord.y / (180.0 / (1i64 << 32) as f64)).floor() as i32; - let lon = (coord.x / (360.0 / (1i64 << 32) as f64)).floor() as i32; - IntPoint::new(lon, lat) - }) - .collect(); - let mut i_polygon = vec![exterior]; - for interior in polygon.interiors() { - let hole: Vec = interior - .coords() - .map(|coord| { - let lat = (coord.y / (180.0 / (1i64 << 32) as f64)).floor() as i32; - let lon = (coord.x / (360.0 / (1i64 << 32) as f64)).floor() as i32; - IntPoint::new(lon, lat) - }) - .collect(); - i_polygon.push(hole); - } - let delaunay = i_polygon.triangulate().into_delaunay(); - for (_, triangle) in delaunay.triangles.iter().enumerate() { - let bounds = [ - triangle.neighbors[0] == usize::MAX, - triangle.neighbors[1] == usize::MAX, - triangle.neighbors[2] == usize::MAX, - ]; - let v0 = &delaunay.points[triangle.vertices[0].index]; - let v1 = &delaunay.points[triangle.vertices[1].index]; - let v2 = &delaunay.points[triangle.vertices[2].index]; - triangles.push(Triangle::new( - doc_id, - [v0.y, v0.x, v1.y, v1.x, v2.y, v2.x], - bounds, - )) - } +/// The boundary edge information provided by the `i_triangle` Delaunay triangulation is essential +/// for CONTAINS and WITHIN queries to work correctly. +pub fn delaunay_to_triangles(doc_id: u32, delaunay: &IntDelaunay, triangles: &mut Vec) { + for triangle in delaunay.triangles.iter() { + let bounds = [ + triangle.neighbors[0] == usize::MAX, + triangle.neighbors[1] == usize::MAX, + triangle.neighbors[2] == usize::MAX, + ]; + let v0 = &delaunay.points[triangle.vertices[0].index]; + let v1 = &delaunay.points[triangle.vertices[1].index]; + let v2 = &delaunay.points[triangle.vertices[2].index]; + triangles.push(Triangle::new( + doc_id, + [v0.y, v0.x, v1.y, v1.x, v2.y, v2.x], + bounds, + )) } } #[cfg(test)] mod tests { - use geo_types::polygon; + use i_triangle::i_overlay::i_float::int::point::IntPoint; + use i_triangle::int::triangulatable::IntTriangulatable; use super::*; @@ -437,11 +407,15 @@ mod tests { #[test] fn triangulate_box() { - let polygon = polygon![ - (x: 0.0, y: 0.0), (x: 10.0, y: 0.0), (x: 10.0, y: 10.0), (x: 0.0, y: 10.0) - ]; + let i_polygon = vec![vec![ + IntPoint::new(0, 0), + IntPoint::new(10, 0), + IntPoint::new(10, 10), + IntPoint::new(0, 10), + ]]; let mut triangles = Vec::new(); - triangulate(1, polygon, &mut triangles); + let delaunay = i_polygon.triangulate().into_delaunay(); + delaunay_to_triangles(1, &delaunay, &mut triangles); assert_eq!(triangles.len(), 2); } } From dbbc8c3f659e97cfee97a27ee6fb4b97e681349d Mon Sep 17 00:00:00 2001 From: Alan Gutierrez Date: Sat, 25 Oct 2025 13:43:07 -0500 Subject: [PATCH 10/22] Slot block kd-tree into Tantivy. Implemented a geometry document field with a minimal `Geometry` enum. Now able to add that Geometry from GeoJSON parsed from a JSON document. Geometry is triangulated if it is a polygon, otherwise it is correctly encoded as a degenerate triangle if it is a point or a line string. Write accumulated triangles to a block kd-tree on commit. Serialize the original `f64` polygon for retrieval from search. Created a query method for intersection. Query against the memory mapped block kd-tree. Return hits and original `f64` polygon. Implemented a merge of one or more block kd-trees from one or more segments during merge. 
Updated the block kd-tree to write to a Tantivy `WritePtr` instead of more generic Rust I/O. --- Cargo.toml | 1 - examples/geo_json.rs | 55 ++ src/core/json_utils.rs | 1 + src/fastfield/writer.rs | 2 + src/index/index_meta.rs | 1 + src/index/segment_component.rs | 5 +- src/index/segment_reader.rs | 10 + src/indexer/merger.rs | 73 ++- src/indexer/segment_serializer.rs | 14 + src/indexer/segment_writer.rs | 16 + src/postings/per_field_postings_writer.rs | 1 + src/query/mod.rs | 2 + src/query/query_parser/query_parser.rs | 7 + src/query/range_query/mod.rs | 2 +- .../range_query/range_query_fastfield.rs | 17 +- src/query/spatial_query.rs | 170 +++++++ src/schema/document/de.rs | 22 + src/schema/document/default_document.rs | 12 + src/schema/document/mod.rs | 1 + src/schema/document/owned_value.rs | 12 + src/schema/document/se.rs | 4 + src/schema/document/value.rs | 28 + src/schema/field_entry.rs | 7 + src/schema/field_type.rs | 28 + src/schema/mod.rs | 1 + src/schema/schema.rs | 10 + src/schema/spatial_options.rs | 22 +- src/schema/term.rs | 3 + src/space_usage/mod.rs | 1 + src/spatial/bkd.rs | 82 ++- src/spatial/geometry.rs | 479 ++++++++++++++++++ src/spatial/mod.rs | 4 + src/spatial/reader.rs | 65 +++ src/spatial/serializer.rs | 37 ++ src/spatial/writer.rs | 132 +++++ 35 files changed, 1286 insertions(+), 41 deletions(-) create mode 100644 examples/geo_json.rs create mode 100644 src/query/spatial_query.rs create mode 100644 src/spatial/geometry.rs create mode 100644 src/spatial/reader.rs create mode 100644 src/spatial/serializer.rs create mode 100644 src/spatial/writer.rs diff --git a/Cargo.toml b/Cargo.toml index cb0c2b8bb3..2cdb25b1c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -93,7 +93,6 @@ time = { version = "0.3.10", features = ["serde-well-known", "macros"] } postcard = { version = "1.0.4", features = [ "use-std", ], default-features = false } -geojson = "0.24.2" [target.'cfg(not(windows))'.dev-dependencies] criterion = { version = "0.5", default-features = false } diff --git a/examples/geo_json.rs b/examples/geo_json.rs new file mode 100644 index 0000000000..3990d27671 --- /dev/null +++ b/examples/geo_json.rs @@ -0,0 +1,55 @@ +use tantivy::collector::TopDocs; +use tantivy::query::SpatialQuery; +use tantivy::schema::{Schema, Value, SPATIAL, STORED, TEXT}; +use tantivy::{Index, IndexWriter, TantivyDocument}; +fn main() -> tantivy::Result<()> { + let mut schema_builder = Schema::builder(); + schema_builder.add_json_field("properties", STORED | TEXT); + schema_builder.add_spatial_field("geometry", STORED | SPATIAL); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + let mut index_writer: IndexWriter = index.writer(50_000_000)?; + let doc = TantivyDocument::parse_json( + &schema, + r#"{ + "type":"Feature", + "geometry":{ + "type":"Polygon", + "coordinates":[[[-99.483911,45.577697],[-99.483869,45.571457],[-99.481739,45.571461],[-99.474881,45.571584],[-99.473167,45.571615],[-99.463394,45.57168],[-99.463391,45.57883],[-99.463368,45.586076],[-99.48177,45.585926],[-99.48384,45.585953],[-99.483885,45.57873],[-99.483911,45.577697]]] + }, + "properties":{ + "admin_level":"8", + "border_type":"city", + "boundary":"administrative", + "gnis:feature_id":"1267426", + "name":"Hosmer", + "place":"city", + "source":"TIGER/Line® 2008 Place Shapefiles (http://www.census.gov/geo/www/tiger/)", + "wikidata":"Q2442118", + "wikipedia":"en:Hosmer, South Dakota" + } + }"#, + )?; + index_writer.add_document(doc)?; + index_writer.commit()?; + + let reader = 
index.reader()?;
+    let searcher = reader.searcher();
+    let field = schema.get_field("geometry").unwrap();
+    let query = SpatialQuery::new(
+        field,
+        [(-99.49, 45.56), (-99.45, 45.59)],
+        tantivy::query::SpatialQueryType::Intersects,
+    );
+    let hits = searcher.search(&query, &TopDocs::with_limit(10))?;
+    for (_score, doc_address) in &hits {
+        let retrieved_doc: TantivyDocument = searcher.doc(*doc_address)?;
+        if let Some(field_value) = retrieved_doc.get_first(field) {
+            if let Some(geometry_box) = field_value.as_value().into_geometry() {
+                println!("Retrieved geometry: {:?}", geometry_box);
+            }
+        }
+    }
+    assert_eq!(hits.len(), 1);
+    Ok(())
+}
diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs
index ade65fa110..ee9292d667 100644
--- a/src/core/json_utils.rs
+++ b/src/core/json_utils.rs
@@ -227,6 +227,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
             ReferenceValueLeaf::IpAddr(_) => {
                 unimplemented!("IP address support in dynamic fields is not yet implemented")
             }
+            ReferenceValueLeaf::Geometry(_) => todo!(),
         },
         ReferenceValue::Array(elements) => {
             for val in elements {
diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs
index a1288e0ad1..311a3d6afb 100644
--- a/src/fastfield/writer.rs
+++ b/src/fastfield/writer.rs
@@ -189,6 +189,7 @@ impl FastFieldsWriter {
                         .record_str(doc_id, field_name, &token.text);
                 }
             }
+            ReferenceValueLeaf::Geometry(_) => todo!(),
         },
         ReferenceValue::Array(val) => {
             // TODO: Check this is the correct behaviour we want.
@@ -320,6 +321,7 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
                     "Pre-tokenized string support in dynamic fields is not yet implemented"
                 )
             }
+            ReferenceValueLeaf::Geometry(_) => todo!(),
         },
         ReferenceValue::Array(elements) => {
             for el in elements {
diff --git a/src/index/index_meta.rs b/src/index/index_meta.rs
index 86eaa35d6c..7a5cb2649d 100644
--- a/src/index/index_meta.rs
+++ b/src/index/index_meta.rs
@@ -142,6 +142,7 @@ impl SegmentMeta {
             SegmentComponent::FastFields => ".fast".to_string(),
             SegmentComponent::FieldNorms => ".fieldnorm".to_string(),
             SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
+            SegmentComponent::Spatial => ".spatial".to_string(),
         });
         PathBuf::from(path)
     }
diff --git a/src/index/segment_component.rs b/src/index/segment_component.rs
index 42ac1d178c..4e208ae1eb 100644
--- a/src/index/segment_component.rs
+++ b/src/index/segment_component.rs
@@ -28,12 +28,14 @@ pub enum SegmentComponent {
     /// Bitset describing which document of the segment is alive.
     /// (It was representing deleted docs but changed to represent alive docs from v0.17)
     Delete,
+    /// Spatial index (block kd-tree) for the segment's spatial fields.
+    Spatial,
 }
 
 impl SegmentComponent {
     /// Iterates through the components.
    pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
-        static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
+        static SEGMENT_COMPONENTS: [SegmentComponent; 9] = [
             SegmentComponent::Postings,
             SegmentComponent::Positions,
             SegmentComponent::FastFields,
@@ -42,6 +44,7 @@ impl SegmentComponent {
             SegmentComponent::Store,
             SegmentComponent::TempStore,
             SegmentComponent::Delete,
+            SegmentComponent::Spatial,
         ];
         SEGMENT_COMPONENTS.iter()
     }
diff --git a/src/index/segment_reader.rs b/src/index/segment_reader.rs
index f5589a6902..ee1a9938af 100644
--- a/src/index/segment_reader.rs
+++ b/src/index/segment_reader.rs
@@ -14,6 +14,7 @@ use crate::index::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
 use crate::json_utils::json_path_sep_to_dot;
 use crate::schema::{Field, IndexRecordOption, Schema, Type};
 use crate::space_usage::SegmentSpaceUsage;
+use crate::spatial::reader::SpatialReaders;
 use crate::store::StoreReader;
 use crate::termdict::TermDictionary;
 use crate::{DocId, Opstamp};
@@ -43,6 +44,7 @@ pub struct SegmentReader {
     positions_composite: CompositeFile,
     fast_fields_readers: FastFieldReaders,
     fieldnorm_readers: FieldNormReaders,
+    spatial_readers: SpatialReaders,
     store_file: FileSlice,
 
     alive_bitset_opt: Option<AliveBitSet>,
@@ -92,6 +94,11 @@ impl SegmentReader {
         &self.fast_fields_readers
     }
 
+    /// Accessor to the spatial field readers of the segment.
+    pub fn spatial_fields(&self) -> &SpatialReaders {
+        &self.spatial_readers
+    }
+
     /// Accessor to the `FacetReader` associated with a given `Field`.
     pub fn facet_reader(&self, field_name: &str) -> crate::Result<FacetReader> {
         let schema = self.schema();
@@ -173,6 +180,8 @@ impl SegmentReader {
         let fast_fields_readers = FastFieldReaders::open(fast_fields_data, schema.clone())?;
         let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
         let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
+        let spatial_data = segment.open_read(SegmentComponent::Spatial)?;
+        let spatial_readers = SpatialReaders::open(spatial_data)?;
 
         let original_bitset = if segment.meta().has_deletes() {
             let alive_doc_file_slice = segment.open_read(SegmentComponent::Delete)?;
@@ -198,6 +207,7 @@ impl SegmentReader {
             postings_composite,
             fast_fields_readers,
             fieldnorm_readers,
+            spatial_readers,
             segment_id: segment.id(),
             delete_opstamp: segment.meta().delete_opstamp(),
             store_file,
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index 47ac5a55b4..7630b611f7 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -1,3 +1,5 @@
+use std::collections::HashMap;
+use std::io::Write;
 use std::sync::Arc;
 
 use columnar::{
@@ -6,6 +8,7 @@ use columnar::{
 use common::ReadOnlyBitSet;
 use itertools::Itertools;
 use measure_time::debug_time;
+use tempfile::NamedTempFile;
 
 use crate::directory::WritePtr;
 use crate::docset::{DocSet, TERMINATED};
@@ -17,6 +20,8 @@ use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
 use crate::indexer::SegmentSerializer;
 use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
 use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
+use crate::spatial::bkd::LeafPageIterator;
+use crate::spatial::triangle::Triangle;
 use crate::store::StoreWriter;
 use crate::termdict::{TermMerger, TermOrdinal};
 use crate::{DocAddress, DocId, InvertedIndexReader};
@@ -520,6 +525,71 @@ impl IndexMerger {
         Ok(())
     }
 
+    fn write_spatial_fields(
+        &self,
+        serializer: &mut SegmentSerializer,
+        doc_id_mapping: &SegmentDocIdMapping,
+    ) -> crate::Result<()> {
+        use crate::spatial::bkd::Segment;
+        let mut segment_mappings: Vec<Vec<Option<DocId>>> = Vec::new();
+        for reader in &self.readers {
+            let max_doc = reader.max_doc();
+            segment_mappings.push(vec![None; max_doc as usize]);
+        }
+        for (new_doc_id, old_doc_addr) in doc_id_mapping.iter_old_doc_addrs().enumerate() {
+            segment_mappings[old_doc_addr.segment_ord as usize][old_doc_addr.doc_id as usize] =
+                Some(new_doc_id as DocId);
+        }
+        let mut temp_files: HashMap<Field, NamedTempFile> = HashMap::new();
+
+        for (field, field_entry) in self.schema.fields() {
+            if matches!(field_entry.field_type(), FieldType::Spatial(_)) {
+                temp_files.insert(field, NamedTempFile::new()?);
+            }
+        }
+        for (segment_ord, reader) in self.readers.iter().enumerate() {
+            for (field, temp_file) in &mut temp_files {
+                let spatial_readers = reader.spatial_fields();
+                let spatial_reader = match spatial_readers.get_field(*field)? {
+                    Some(reader) => reader,
+                    None => continue,
+                };
+                let segment = Segment::new(spatial_reader.get_bytes());
+                for triangle_result in LeafPageIterator::new(&segment) {
+                    let triangles = triangle_result?;
+                    for triangle in triangles {
+                        if let Some(new_doc_id) =
+                            segment_mappings[segment_ord][triangle.doc_id as usize]
+                        {
+                            for &word in &triangle.words {
+                                temp_file.write_all(&word.to_le_bytes())?;
+                            }
+                            temp_file.write_all(&new_doc_id.to_le_bytes())?;
+                        }
+                    }
+                }
+            }
+        }
+        if let Some(mut spatial_serializer) = serializer.extract_spatial_serializer() {
+            for (field, mut temp_file) in temp_files {
+                // Flush and sync triangles.
+                temp_file.flush()?;
+                temp_file.as_file_mut().sync_all()?;
+                // Memory map the triangle file.
+                use memmap2::MmapOptions;
+                let mut mmap = unsafe { MmapOptions::new().map_mut(temp_file.as_file())? };
+                // Cast the mapping to a mutable `&mut [Triangle]` slice.
+                let triangle_count = mmap.len() / std::mem::size_of::<Triangle>();
+                let triangles = unsafe {
+                    std::slice::from_raw_parts_mut(
+                        mmap.as_mut_ptr() as *mut Triangle,
+                        triangle_count,
+                    )
+                };
+                // Get spatial writer and rebuild block kd-tree.
+                spatial_serializer.serialize_field(field, triangles)?;
+            }
+        }
+        Ok(())
+    }
+
     /// Writes the merged segment by pushing information
     /// to the `SegmentSerializer`.
     ///
@@ -544,9 +614,10 @@
 
         debug!("write-storagefields");
         self.write_storable_fields(serializer.get_store_writer())?;
+        debug!("write-spatialfields");
+        self.write_spatial_fields(&mut serializer, &doc_id_mapping)?;
         debug!("write-fastfields");
         self.write_fast_fields(serializer.get_fast_field_write(), doc_id_mapping)?;
-
         debug!("close-serializer");
         serializer.close()?;
         Ok(self.max_doc)
diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs
index 057a51b10c..473dbd1e80 100644
--- a/src/indexer/segment_serializer.rs
+++ b/src/indexer/segment_serializer.rs
@@ -4,6 +4,7 @@ use crate::directory::WritePtr;
 use crate::fieldnorm::FieldNormsSerializer;
 use crate::index::{Segment, SegmentComponent};
 use crate::postings::InvertedIndexSerializer;
+use crate::spatial::serializer::SpatialSerializer;
 use crate::store::StoreWriter;
 
 /// Segment serializer is in charge of laying out on disk
@@ -12,6 +13,7 @@ pub struct SegmentSerializer {
     segment: Segment,
     pub(crate) store_writer: StoreWriter,
     fast_field_write: WritePtr,
+    spatial_serializer: Option<SpatialSerializer>,
     fieldnorms_serializer: Option<FieldNormsSerializer>,
     postings_serializer: InvertedIndexSerializer,
 }
@@ -35,11 +37,15 @@ impl SegmentSerializer {
         let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
         let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
 
+        let spatial_write = segment.open_write(SegmentComponent::Spatial)?;
+        let spatial_serializer = SpatialSerializer::from_write(spatial_write)?;
+
         let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
         Ok(SegmentSerializer {
             segment,
             store_writer,
             fast_field_write,
+            spatial_serializer: Some(spatial_serializer),
             fieldnorms_serializer: Some(fieldnorms_serializer),
             postings_serializer,
         })
@@ -64,6 +70,11 @@ impl SegmentSerializer {
         &mut self.fast_field_write
     }
 
+    /// Extracts the spatial serializer.
+    ///
+    /// Note the spatial serializer can only be extracted once.
+    pub fn extract_spatial_serializer(&mut self) -> Option<SpatialSerializer> {
+        self.spatial_serializer.take()
+    }
+
     /// Extract the field norm serializer.
     ///
     /// Note the fieldnorms serializer can only be extracted once.
@@ -81,6 +92,9 @@ impl SegmentSerializer { if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() { fieldnorms_serializer.close()?; } + if let Some(spatial_serializer) = self.extract_spatial_serializer() { + spatial_serializer.close()?; + } self.fast_field_write.terminate()?; self.postings_serializer.close()?; self.store_writer.close()?; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 30338187cd..630bd4fc51 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -16,6 +16,7 @@ use crate::postings::{ }; use crate::schema::document::{Document, Value}; use crate::schema::{FieldEntry, FieldType, Schema, DATE_TIME_PRECISION_INDEXED}; +use crate::spatial::writer::SpatialWriter; use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer}; use crate::{DocId, Opstamp, TantivyError}; @@ -52,6 +53,7 @@ pub struct SegmentWriter { pub(crate) segment_serializer: SegmentSerializer, pub(crate) fast_field_writers: FastFieldsWriter, pub(crate) fieldnorms_writer: FieldNormsWriter, + pub(crate) spatial_writer: SpatialWriter, pub(crate) json_path_writer: JsonPathWriter, pub(crate) json_positions_per_path: IndexingPositionsPerPath, pub(crate) doc_opstamps: Vec, @@ -104,6 +106,7 @@ impl SegmentWriter { ctx: IndexingContext::new(table_size), per_field_postings_writers, fieldnorms_writer: FieldNormsWriter::for_schema(&schema), + spatial_writer: SpatialWriter::default(), json_path_writer: JsonPathWriter::default(), json_positions_per_path: IndexingPositionsPerPath::default(), segment_serializer, @@ -130,6 +133,7 @@ impl SegmentWriter { self.ctx, self.fast_field_writers, &self.fieldnorms_writer, + &mut self.spatial_writer, self.segment_serializer, )?; Ok(self.doc_opstamps) @@ -142,6 +146,7 @@ impl SegmentWriter { + self.fieldnorms_writer.mem_usage() + self.fast_field_writers.mem_usage() + self.segment_serializer.mem_usage() + + self.spatial_writer.mem_usage() } fn index_document(&mut self, doc: &D) -> crate::Result<()> { @@ -338,6 +343,13 @@ impl SegmentWriter { self.fieldnorms_writer.record(doc_id, field, num_vals); } } + FieldType::Spatial(_) => { + for value in values { + if let Some(geometry) = value.as_geometry() { + self.spatial_writer.add_geometry(doc_id, field, *geometry); + } + } + } } } Ok(()) @@ -392,12 +404,16 @@ fn remap_and_write( ctx: IndexingContext, fast_field_writers: FastFieldsWriter, fieldnorms_writer: &FieldNormsWriter, + spatial_writer: &mut SpatialWriter, mut serializer: SegmentSerializer, ) -> crate::Result<()> { debug!("remap-and-write"); if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() { fieldnorms_writer.serialize(fieldnorms_serializer)?; } + if let Some(spatial_serializer) = serializer.extract_spatial_serializer() { + spatial_writer.serialize(spatial_serializer)?; + } let fieldnorm_data = serializer .segment() .open_read(SegmentComponent::FieldNorms)?; diff --git a/src/postings/per_field_postings_writer.rs b/src/postings/per_field_postings_writer.rs index f3d6d6534c..a9035ca84e 100644 --- a/src/postings/per_field_postings_writer.rs +++ b/src/postings/per_field_postings_writer.rs @@ -51,6 +51,7 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box Box::>::default(), FieldType::JsonObject(ref json_object_options) => { if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() { diff --git a/src/query/mod.rs b/src/query/mod.rs index d609a04021..8bc45ed2ed 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -24,6 
+24,7 @@ mod reqopt_scorer; mod scorer; mod set_query; mod size_hint; +mod spatial_query; mod term_query; mod union; mod weight; @@ -62,6 +63,7 @@ pub use self::reqopt_scorer::RequiredOptionalScorer; pub use self::score_combiner::{DisjunctionMaxCombiner, ScoreCombiner, SumCombiner}; pub use self::scorer::Scorer; pub use self::set_query::TermSetQuery; +pub use self::spatial_query::{SpatialQuery, SpatialQueryType}; pub use self::term_query::TermQuery; pub use self::union::BufferedUnionScorer; #[cfg(test)] diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 7ebaf3a474..483aa36387 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -524,6 +524,9 @@ impl QueryParser { let ip_v6 = IpAddr::from_str(phrase)?.into_ipv6_addr(); Ok(Term::from_field_ip_addr(field, ip_v6)) } + FieldType::Spatial(_) => Err(QueryParserError::UnsupportedQuery( + "Spatial queries are not yet supported in text query parser".to_string(), + )), } } @@ -624,6 +627,10 @@ impl QueryParser { let term = Term::from_field_ip_addr(field, ip_v6); Ok(vec![LogicalLiteral::Term(term)]) } + FieldType::Spatial(_) => Err(QueryParserError::UnsupportedQuery(format!( + "Spatial queries are not yet supported for field '{}'", + field_name + ))), } } diff --git a/src/query/range_query/mod.rs b/src/query/range_query/mod.rs index e93bb40491..3297dae072 100644 --- a/src/query/range_query/mod.rs +++ b/src/query/range_query/mod.rs @@ -20,6 +20,6 @@ pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool { | Type::Date | Type::Json | Type::IpAddr => true, - Type::Facet | Type::Bytes => false, + Type::Facet | Type::Bytes | Type::Spatial => false, } } diff --git a/src/query/range_query/range_query_fastfield.rs b/src/query/range_query/range_query_fastfield.rs index 54cf0cad54..f4d7e970e1 100644 --- a/src/query/range_query/range_query_fastfield.rs +++ b/src/query/range_query/range_query_fastfield.rs @@ -128,12 +128,15 @@ impl Weight for FastFieldRangeWeight { BoundsRange::new(bounds.lower_bound, bounds.upper_bound), ) } - Type::Bool | Type::Facet | Type::Bytes | Type::Json | Type::IpAddr => { - Err(crate::TantivyError::InvalidArgument(format!( - "unsupported value bytes type in json term value_bytes {:?}", - term_value.typ() - ))) - } + Type::Bool + | Type::Facet + | Type::Bytes + | Type::Json + | Type::IpAddr + | Type::Spatial => Err(crate::TantivyError::InvalidArgument(format!( + "unsupported value bytes type in json term value_bytes {:?}", + term_value.typ() + ))), } } else if field_type.is_ip_addr() { let parse_ip_from_bytes = |term: &Term| { @@ -435,7 +438,7 @@ pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool { match typ { Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true, Type::IpAddr => false, - Type::Str | Type::Facet | Type::Bytes | Type::Json => false, + Type::Str | Type::Facet | Type::Bytes | Type::Json | Type::Spatial => false, } } diff --git a/src/query/spatial_query.rs b/src/query/spatial_query.rs new file mode 100644 index 0000000000..a6c4d7050e --- /dev/null +++ b/src/query/spatial_query.rs @@ -0,0 +1,170 @@ +//! 
Bounding-box spatial queries evaluated against a segment's block kd-tree.
+
+use common::BitSet;
+
+use crate::query::{BitSetDocSet, Query, Scorer, Weight};
+use crate::schema::Field;
+use crate::spatial::bkd::{search_intersects, Segment};
+use crate::spatial::writer::as_point_i32;
+use crate::{DocId, DocSet, Score, TantivyError, TERMINATED};
+
+#[derive(Clone, Copy, Debug)]
+/// The spatial relation tested between indexed geometries and the query bounding box.
+pub enum SpatialQueryType {
+    /// Matches documents whose geometry intersects the query bounding box.
+    Intersects,
+    // Within,
+    // Contains,
+}
+
+#[derive(Clone, Copy, Debug)]
+/// A query matching documents whose spatial field satisfies a relation with a bounding box.
+pub struct SpatialQuery {
+    field: Field,
+    bounds: [(i32, i32); 2],
+    query_type: SpatialQueryType,
+}
+
+impl SpatialQuery {
+    /// Creates a spatial query over `field` from two `(lon, lat)` corners and a relation.
+    pub fn new(field: Field, bounds: [(f64, f64); 2], query_type: SpatialQueryType) -> Self {
+        SpatialQuery {
+            field,
+            bounds: [as_point_i32(bounds[0]), as_point_i32(bounds[1])],
+            query_type,
+        }
+    }
+}
+
+impl Query for SpatialQuery {
+    fn weight(
+        &self,
+        _enable_scoring: super::EnableScoring<'_>,
+    ) -> crate::Result<Box<dyn Weight>> {
+        Ok(Box::new(SpatialWeight::new(
+            self.field,
+            self.bounds,
+            self.query_type,
+        )))
+    }
+}
+
+pub struct SpatialWeight {
+    field: Field,
+    bounds: [(i32, i32); 2],
+    query_type: SpatialQueryType,
+}
+
+impl SpatialWeight {
+    fn new(field: Field, bounds: [(i32, i32); 2], query_type: SpatialQueryType) -> Self {
+        SpatialWeight {
+            field,
+            bounds,
+            query_type,
+        }
+    }
+}
+
+impl Weight for SpatialWeight {
+    fn scorer(
+        &self,
+        reader: &crate::SegmentReader,
+        boost: crate::Score,
+    ) -> crate::Result<Box<dyn Scorer>> {
+        let spatial_reader = reader
+            .spatial_fields()
+            .get_field(self.field)?
+            .ok_or_else(|| {
+                TantivyError::SchemaError(format!("no spatial data for field {:?}", self.field))
+            })?;
+        let block_kd_tree = Segment::new(spatial_reader.get_bytes());
+        match self.query_type {
+            SpatialQueryType::Intersects => {
+                let mut include = BitSet::with_max_value(reader.max_doc());
+                search_intersects(
+                    &block_kd_tree,
+                    block_kd_tree.root_offset,
+                    &[
+                        self.bounds[0].1,
+                        self.bounds[0].0,
+                        self.bounds[1].1,
+                        self.bounds[1].0,
+                    ],
+                    &mut include,
+                )?;
+                Ok(Box::new(SpatialScorer::new(boost, include, None)))
+            }
+        }
+    }
+    fn explain(
+        &self,
+        _reader: &crate::SegmentReader,
+        _doc: DocId,
+    ) -> crate::Result<crate::query::Explanation> {
+        todo!();
+    }
+}
+
+struct SpatialScorer {
+    include: BitSetDocSet,
+    exclude: Option<BitSet>,
+    doc_id: DocId,
+    score: Score,
+}
+
+impl SpatialScorer {
+    pub fn new(score: Score, include: BitSet, exclude: Option<BitSet>) -> Self {
+        let mut scorer = SpatialScorer {
+            include: BitSetDocSet::from(include),
+            exclude,
+            doc_id: 0,
+            score,
+        };
+        scorer.prime();
+        scorer
+    }
+    fn prime(&mut self) {
+        self.doc_id = self.include.doc();
+        while self.exclude() {
+            self.doc_id = self.include.advance();
+        }
+    }
+
+    fn exclude(&self) -> bool {
+        if self.doc_id == TERMINATED {
+            return false;
+        }
+        match &self.exclude {
+            Some(exclude) => exclude.contains(self.doc_id),
+            None => false,
+        }
+    }
+}
+
+impl Scorer for SpatialScorer {
+    fn score(&mut self) -> Score {
+        self.score
+    }
+}
+
+impl DocSet for SpatialScorer {
+    fn advance(&mut self) -> DocId {
+        if self.doc_id == TERMINATED {
+            return TERMINATED;
+        }
+        self.doc_id = self.include.advance();
+        while self.exclude() {
+            self.doc_id = self.include.advance();
+        }
+        self.doc_id
+    }
+
+    fn size_hint(&self) -> u32 {
+        match &self.exclude {
+            // A hint only: clamp at zero so the subtraction cannot underflow.
+            Some(exclude) => self.include.size_hint().saturating_sub(exclude.len() as u32),
+            None => self.include.size_hint(),
+        }
+    }
+
+    fn doc(&self) -> DocId {
+        self.doc_id
+    }
+}
diff --git a/src/schema/document/de.rs b/src/schema/document/de.rs
index 02ee7a6a0c..9e6b6ecc03 100644
--- a/src/schema/document/de.rs
+++ b/src/schema/document/de.rs
@@ -22,6 +22,7 @@ use super::se::BinaryObjectSerializer;
 use super::{OwnedValue, Value};
 use crate::schema::document::type_codes;
 use crate::schema::{Facet, Field};
+use crate::spatial::geometry::Geometry;
 use crate::store::DocStoreVersion;
 use crate::tokenizer::PreTokenizedString;
 
@@ -129,6 +130,9 @@ pub trait ValueDeserializer<'de> {
     /// Attempts to deserialize a pre-tokenized string value from the deserializer.
     fn deserialize_pre_tokenized_string(self) -> Result<PreTokenizedString, DeserializeError>;
 
+    /// Attempts to deserialize a geometry value from the deserializer.
+    fn deserialize_geometry(self) -> Result<Geometry, DeserializeError>;
+
     /// Attempts to deserialize the value using a given visitor.
     fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, DeserializeError>
     where V: ValueVisitor;
@@ -166,6 +170,8 @@ pub enum ValueType {
     /// A JSON object value. Deprecated.
     #[deprecated(note = "We keep this for backwards compatibility, use Object instead")]
     JSONObject,
+    /// A geometry value.
+    Geometry,
 }
 
 /// A value visitor for deserializing a document value.
@@ -246,6 +252,12 @@
         Err(DeserializeError::UnsupportedType(ValueType::PreTokStr))
     }
 
+    #[inline]
+    /// Called when the deserializer visits a geometry value.
+    fn visit_geometry(&self, _val: Geometry) -> Result<Self::Value, DeserializeError> {
+        Err(DeserializeError::UnsupportedType(ValueType::Geometry))
+    }
+
     #[inline]
     /// Called when the deserializer visits an array.
     fn visit_array<'de, A>(&self, _access: A) -> Result<Self::Value, DeserializeError>
@@ -380,6 +392,7 @@ where R: Read
 
         match ext_type_code {
             type_codes::TOK_STR_EXT_CODE => ValueType::PreTokStr,
+            type_codes::GEO_EXT_CODE => ValueType::Geometry,
             _ => {
                 return Err(DeserializeError::from(io::Error::new(
                     io::ErrorKind::InvalidData,
@@ -495,6 +508,11 @@ where R: Read
             .map_err(DeserializeError::from)
     }
 
+    fn deserialize_geometry(self) -> Result<Geometry, DeserializeError> {
+        self.validate_type(ValueType::Geometry)?;
+        <Geometry as BinarySerializable>::deserialize(self.reader).map_err(DeserializeError::from)
+    }
+
     fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, DeserializeError>
     where V: ValueVisitor {
         match self.value_type {
@@ -539,6 +557,10 @@ where R: Read
                 let val = self.deserialize_pre_tokenized_string()?;
                 visitor.visit_pre_tokenized_string(val)
             }
+            ValueType::Geometry => {
+                let val = self.deserialize_geometry()?;
+                visitor.visit_geometry(val)
+            }
             ValueType::Array => {
                 let access =
                     BinaryArrayDeserializer::from_reader(self.reader, self.doc_store_version)?;
diff --git a/src/schema/document/default_document.rs b/src/schema/document/default_document.rs
index 915b685aa7..c2416040ac 100644
--- a/src/schema/document/default_document.rs
+++ b/src/schema/document/default_document.rs
@@ -13,6 +13,7 @@ use crate::schema::document::{
 };
 use crate::schema::field_type::ValueParsingError;
 use crate::schema::{Facet, Field, NamedFieldDocument, OwnedValue, Schema};
+use crate::spatial::geometry::Geometry;
 use crate::tokenizer::PreTokenizedString;
 
 #[repr(C, packed)]
@@ -254,6 +255,7 @@ impl CompactDoc {
             }
             ReferenceValueLeaf::IpAddr(num) => write_into(&mut self.node_data, num.to_u128()),
             ReferenceValueLeaf::PreTokStr(pre_tok) => write_into(&mut self.node_data, *pre_tok),
+            ReferenceValueLeaf::Geometry(geometry) => write_into(&mut self.node_data, *geometry),
         };
         ValueAddr { type_id, val_addr }
     }
@@ -464,6 +466,13 @@ impl<'a> CompactDocValue<'a> {
                 .map(Into::into)
                 .map(ReferenceValueLeaf::PreTokStr)
                 .map(Into::into),
+            ValueType::Geometry => self
+                .container
+                .read_from::<Geometry>(addr)
+                .map(Into::into)
+                .map(ReferenceValueLeaf::Geometry)
+                .map(Into::into),
             ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new(
                 self.container,
                 addr,
@@ -542,6 +551,8 @@ pub enum ValueType {
     Object = 11,
     /// Pre-tokenized str type,
     Array = 12,
+    /// Geometry type,
+    Geometry = 13,
 }
 
 impl BinarySerializable for ValueType {
@@ -587,6 +598,7 @@ impl<'a> From<&ReferenceValueLeaf<'a>> for ValueType {
             ReferenceValueLeaf::PreTokStr(_) => ValueType::PreTokStr,
             ReferenceValueLeaf::Facet(_) => ValueType::Facet,
             ReferenceValueLeaf::Bytes(_) => ValueType::Bytes,
+            ReferenceValueLeaf::Geometry(_) => ValueType::Geometry,
         }
     }
 }
diff --git a/src/schema/document/mod.rs b/src/schema/document/mod.rs
index 8168ee8112..ed4a03d6f4 100644
--- a/src/schema/document/mod.rs
+++ b/src/schema/document/mod.rs
@@ -273,4 +273,5 @@ pub(crate) mod type_codes {
 
     // Extended type codes
     pub const TOK_STR_EXT_CODE: u8 = 0;
+    pub const GEO_EXT_CODE: u8 = 1;
 }
diff --git a/src/schema/document/owned_value.rs b/src/schema/document/owned_value.rs
index 9fbf1f8c26..8c72e358d6 100644
--- a/src/schema/document/owned_value.rs
+++ b/src/schema/document/owned_value.rs
@@ -15,6 +15,7 @@ use crate::schema::document::{
     ValueDeserializer, ValueVisitor,
 };
 use crate::schema::Facet;
+use crate::spatial::geometry::Geometry;
 use crate::tokenizer::PreTokenizedString;
 use crate::DateTime;
 
@@ -49,6 +50,8 @@ pub enum OwnedValue {
     Object(Vec<(String, Self)>),
     /// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
     IpAddr(Ipv6Addr),
+    /// A spatial geometry (point, line string, or polygon).
+    Geometry(Geometry),
 }
 
 impl AsRef<OwnedValue> for OwnedValue {
@@ -77,6 +80,9 @@ impl<'a> Value<'a> for &'a OwnedValue {
             OwnedValue::IpAddr(val) => ReferenceValueLeaf::IpAddr(*val).into(),
             OwnedValue::Array(array) => ReferenceValue::Array(array.iter()),
             OwnedValue::Object(object) => ReferenceValue::Object(ObjectMapIter(object.iter())),
+            OwnedValue::Geometry(geometry) => {
+                ReferenceValueLeaf::Geometry(Box::new(geometry.clone())).into()
+            }
         }
     }
 }
@@ -136,6 +142,10 @@ impl ValueDeserialize for OwnedValue {
         Ok(OwnedValue::PreTokStr(val))
     }
 
+    fn visit_geometry(&self, val: Geometry) -> Result<Self::Value, DeserializeError> {
+        Ok(OwnedValue::Geometry(val))
+    }
+
     fn visit_array<'de, A>(&self, mut access: A) -> Result<Self::Value, DeserializeError>
     where A: ArrayAccess<'de> {
         let mut elements = Vec::with_capacity(access.size_hint());
@@ -198,6 +208,7 @@ impl serde::Serialize for OwnedValue {
                 }
             }
             OwnedValue::Array(ref array) => array.serialize(serializer),
+            OwnedValue::Geometry(_) => todo!(),
         }
     }
 }
@@ -285,6 +296,7 @@ impl<'a, V: Value<'a>> From<ReferenceValue<'a, V>> for OwnedValue {
             ReferenceValueLeaf::IpAddr(val) => OwnedValue::IpAddr(val),
             ReferenceValueLeaf::Bool(val) => OwnedValue::Bool(val),
             ReferenceValueLeaf::PreTokStr(val) => OwnedValue::PreTokStr(*val.clone()),
+            ReferenceValueLeaf::Geometry(_) => todo!(),
         },
         ReferenceValue::Array(val) => {
             OwnedValue::Array(val.map(|v| v.as_value().into()).collect())
diff --git a/src/schema/document/se.rs b/src/schema/document/se.rs
index 528c8dffc3..d1b884f016 100644
--- a/src/schema/document/se.rs
+++ b/src/schema/document/se.rs
@@ -133,6 +133,10 @@ where W: Write
                 self.write_type_code(type_codes::EXT_CODE)?;
                 self.serialize_with_type_code(type_codes::TOK_STR_EXT_CODE, &*val)
             }
+            ReferenceValueLeaf::Geometry(val) => {
+                self.write_type_code(type_codes::EXT_CODE)?;
+                self.serialize_with_type_code(type_codes::GEO_EXT_CODE, &*val)
+            }
         },
         ReferenceValue::Array(elements) => {
             self.write_type_code(type_codes::ARRAY_CODE)?;
diff --git a/src/schema/document/value.rs b/src/schema/document/value.rs
index de1067ce67..87d2593d47 100644
--- a/src/schema/document/value.rs
+++ b/src/schema/document/value.rs
@@ -3,6 +3,7 @@ use std::net::Ipv6Addr;
 
 use common::DateTime;
 
+use crate::spatial::geometry::Geometry;
 use crate::tokenizer::PreTokenizedString;
 
 /// A single field value.
@@ -108,6 +109,12 @@
             None
         }
     }
+
+    #[inline]
+    /// If the value is a geometry, returns it. Otherwise returns `None`.
+    fn as_geometry(&self) -> Option<Box<Geometry>> {
+        self.as_leaf().and_then(|leaf| leaf.into_geometry())
+    }
 }
 
 /// A enum representing a leaf value for tantivy to index.
@@ -136,6 +143,8 @@ pub enum ReferenceValueLeaf<'a> {
     Bool(bool),
     /// Pre-tokenized str type,
     PreTokStr(Box<PreTokenizedString>),
+    /// A spatial geometry (point, line string, or polygon).
+    Geometry(Box<Geometry>),
 }
 
 impl From for ReferenceValueLeaf<'_> {
@@ -220,6 +229,9 @@ impl<'a, T: Value<'a> + ?Sized> From<ReferenceValueLeaf<'a>> for ReferenceValue<
             ReferenceValueLeaf::PreTokStr(val) => {
                 ReferenceValue::Leaf(ReferenceValueLeaf::PreTokStr(val))
             }
+            ReferenceValueLeaf::Geometry(val) => {
+                ReferenceValue::Leaf(ReferenceValueLeaf::Geometry(val))
+            }
         }
     }
 }
@@ -331,6 +343,16 @@ impl<'a> ReferenceValueLeaf<'a> {
             None
         }
     }
+
+    #[inline]
+    /// If the leaf is a geometry, consumes it and returns the geometry. Otherwise returns
+    /// `None`.
+    pub fn into_geometry(self) -> Option<Box<Geometry>> {
+        if let Self::Geometry(val) = self {
+            Some(val)
+        } else {
+            None
+        }
+    }
 }
 
 /// A enum representing a value for tantivy to index.
@@ -448,4 +470,10 @@
 where V: Value<'a>
     pub fn is_object(&self) -> bool {
         matches!(self, Self::Object(_))
     }
+
+    #[inline]
+    /// If the value is a leaf geometry, consumes it and returns the geometry. Otherwise
+    /// returns `None`.
+    pub fn into_geometry(self) -> Option<Box<Geometry>> {
+        self.into_leaf().and_then(|leaf| leaf.into_geometry())
+    }
 }
diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs
index 77e061bc61..d8325f7dcc 100644
--- a/src/schema/field_entry.rs
+++ b/src/schema/field_entry.rs
@@ -1,6 +1,7 @@
 use serde::{Deserialize, Serialize};
 
 use super::ip_options::IpAddrOptions;
+use super::spatial_options::SpatialOptions;
 use crate::schema::bytes_options::BytesOptions;
 use crate::schema::{
     is_valid_field_name, DateOptions, FacetOptions, FieldType, JsonObjectOptions, NumericOptions,
@@ -80,6 +81,11 @@ impl FieldEntry {
         Self::new(field_name, FieldType::JsonObject(json_object_options))
     }
 
+    /// Creates a field entry for a spatial field
+    pub fn new_spatial(field_name: String, spatial_options: SpatialOptions) -> FieldEntry {
+        Self::new(field_name, FieldType::Spatial(spatial_options))
+    }
+
     /// Returns the name of the field
     pub fn name(&self) -> &str {
         &self.name
@@ -129,6 +135,7 @@ impl FieldEntry {
             FieldType::Bytes(ref options) => options.is_stored(),
             FieldType::JsonObject(ref options) => options.is_stored(),
             FieldType::IpAddr(ref options) => options.is_stored(),
+            FieldType::Spatial(ref options) => options.is_stored(),
         }
     }
 }
diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs
index 8b203f5b37..aa1e47094e 100644
--- a/src/schema/field_type.rs
+++ b/src/schema/field_type.rs
@@ -9,6 +9,7 @@ use serde_json::Value as JsonValue;
 use thiserror::Error;
 
 use super::ip_options::IpAddrOptions;
+use super::spatial_options::SpatialOptions;
 use super::IntoIpv6Addr;
 use crate::schema::bytes_options::BytesOptions;
 use crate::schema::facet_options::FacetOptions;
@@ -16,6 +17,7 @@ use crate::schema::{
     DateOptions, Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, OwnedValue,
     TextFieldIndexing, TextOptions,
 };
+use crate::spatial::geometry::Geometry;
 use crate::time::format_description::well_known::Rfc3339;
 use crate::time::OffsetDateTime;
 use crate::tokenizer::PreTokenizedString;
@@ -71,6 +73,8 @@ pub enum Type {
     Json = b'j',
     /// IpAddr
     IpAddr = b'p',
+    /// Spatial
+    Spatial = b't',
 }
 
 impl From for Type {
@@ -139,6 +143,7 @@ impl Type {
             Type::Bytes => "Bytes",
             Type::Json => "Json",
             Type::IpAddr => "IpAddr",
+            Type::Spatial => "Spatial",
         }
     }
 
@@ -189,6 +194,8 @@ pub enum FieldType {
     JsonObject(JsonObjectOptions),
     /// IpAddr field
     IpAddr(IpAddrOptions),
+    /// Spatial field
+    Spatial(SpatialOptions),
 }
 
 impl
FieldType { @@ -205,6 +212,7 @@ impl FieldType { FieldType::Bytes(_) => Type::Bytes, FieldType::JsonObject(_) => Type::Json, FieldType::IpAddr(_) => Type::IpAddr, + FieldType::Spatial(_) => Type::Spatial, } } @@ -241,6 +249,7 @@ impl FieldType { FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(), FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(), FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.is_indexed(), + FieldType::Spatial(ref _spatial_options) => true, } } @@ -278,6 +287,7 @@ impl FieldType { FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.is_fast(), FieldType::Facet(_) => true, FieldType::JsonObject(ref json_object_options) => json_object_options.is_fast(), + FieldType::Spatial(_) => false, } } @@ -297,6 +307,7 @@ impl FieldType { FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(), FieldType::JsonObject(ref _json_object_options) => false, FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.fieldnorms(), + FieldType::Spatial(_) => false, } } @@ -348,6 +359,7 @@ impl FieldType { None } } + FieldType::Spatial(_) => todo!(), } } @@ -449,6 +461,10 @@ impl FieldType { Ok(OwnedValue::IpAddr(ip_addr.into_ipv6_addr())) } + FieldType::Spatial(_) => Err(ValueParsingError::TypeError { + expected: "spatial field parsing not implemented", + json: JsonValue::String(field_text), + }), } } JsonValue::Number(field_val_num) => match self { @@ -508,6 +524,10 @@ impl FieldType { expected: "a string with an ip addr", json: JsonValue::Number(field_val_num), }), + FieldType::Spatial(_) => Err(ValueParsingError::TypeError { + expected: "spatial field parsing not implemented", + json: JsonValue::Number(field_val_num), + }), }, JsonValue::Object(json_map) => match self { FieldType::Str(_) => { @@ -523,6 +543,14 @@ impl FieldType { } } FieldType::JsonObject(_) => Ok(OwnedValue::from(json_map)), + FieldType::Spatial(_) => Ok(OwnedValue::Geometry( + Geometry::from_geojson(&json_map).map_err(|e| { + ValueParsingError::ParseError { + error: format!("{:?}", e), + json: JsonValue::Object(json_map), + } + })?, + )), _ => Err(ValueParsingError::TypeError { expected: self.value_type().name(), json: JsonValue::Object(json_map), diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 627774e545..eabd1c5a12 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -170,6 +170,7 @@ pub(crate) fn value_type_to_column_type(typ: Type) -> Option { Type::Bytes => Some(ColumnType::Bytes), Type::IpAddr => Some(ColumnType::IpAddr), Type::Json => None, + Type::Spatial => None, } } diff --git a/src/schema/schema.rs b/src/schema/schema.rs index c1d22c0baa..ddfd533e40 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -194,6 +194,16 @@ impl SchemaBuilder { self.add_field(field_entry) } + /// Adds a spatial entry to the schema in build. + pub fn add_spatial_field>( + &mut self, + field_name: &str, + field_options: T, + ) -> Field { + let field_entry = FieldEntry::new_spatial(field_name.to_string(), field_options.into()); + self.add_field(field_entry) + } + /// Adds a field entry to the schema in build. 
pub fn add_field(&mut self, field_entry: FieldEntry) -> Field { let field = Field::from_field_id(self.fields.len() as u32); diff --git a/src/schema/spatial_options.rs b/src/schema/spatial_options.rs index 13cce1a6f0..248a5500fc 100644 --- a/src/schema/spatial_options.rs +++ b/src/schema/spatial_options.rs @@ -39,13 +39,15 @@ impl From for SpatialOptions { } } -#[cfg(test)] -mod tests { - use crate::schema::*; - - #[test] - fn test_field_options() { - let field_options = STORED | SPATIAL; - assert!(field_options.is_stored()); - } -} +// #[cfg(test)] +// mod tests { +// use crate::schema::*; +// +// #[test] +// fn test_field_options() { +// let field_options = STORED | SPATIAL; +// assert!(field_options.is_stored()); +// let mut schema_builder = Schema::builder(); +// schema_builder.add_spatial_index("where", SPATIAL | STORED); +// } +// } diff --git a/src/schema/term.rs b/src/schema/term.rs index 2dd78b82ae..4414fe0890 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -503,6 +503,9 @@ where B: AsRef<[u8]> Type::IpAddr => { write_opt(f, self.as_ip_addr())?; } + Type::Spatial => { + write!(f, "")?; + } } Ok(()) } diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs index 70291153ad..e44d6e4277 100644 --- a/src/space_usage/mod.rs +++ b/src/space_usage/mod.rs @@ -125,6 +125,7 @@ impl SegmentSpaceUsage { SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()), SegmentComponent::TempStore => ComponentSpaceUsage::Store(self.store().clone()), Delete => Basic(self.deletes()), + Spatial => todo!(), } } diff --git a/src/spatial/bkd.rs b/src/spatial/bkd.rs index 99476e1c23..4c6652f64e 100644 --- a/src/spatial/bkd.rs +++ b/src/spatial/bkd.rs @@ -10,10 +10,11 @@ //! branch nodes), enabling zero-copy access through memory-mapped segments without upfront //! decompression. use std::io; -use std::io::{Seek, Write}; +use std::io::Write; -use common::BitSet; +use common::{BitSet, CountingWriter}; +use crate::directory::WritePtr; use crate::spatial::delta::{compress, decompress, Compressible}; use crate::spatial::triangle::Triangle; @@ -88,8 +89,8 @@ struct CompressibleTriangleI32<'a> { impl<'a> CompressibleTriangleI32<'a> { fn new(triangles: &'a [Triangle], dimension: usize) -> Self { CompressibleTriangleI32 { - triangles: triangles, - dimension: dimension, + triangles, + dimension, } } } @@ -110,9 +111,7 @@ struct CompressibleTriangleDocID<'a> { impl<'a> CompressibleTriangleDocID<'a> { fn new(triangles: &'a [Triangle]) -> Self { - CompressibleTriangleDocID { - triangles: triangles, - } + CompressibleTriangleDocID { triangles } } } @@ -136,12 +135,12 @@ impl<'a> Compressible for CompressibleTriangleDocID<'a> { // We do not compress the tree nodes. We read them directly from the mapping. // -fn write_leaf_pages( +fn write_leaf_pages( triangles: &mut [Triangle], - write: &mut W, + write: &mut CountingWriter, ) -> io::Result { if triangles.len() <= 512 { - let pos = write.stream_position()?; + let pos = write.written_bytes(); let mut spreads = [SpreadSurvey::default(); 4]; let mut bounding_box = BoundingBoxSurvey::default(); for triangle in triangles.iter() { @@ -174,10 +173,10 @@ fn write_leaf_pages( for i in 0..7 { compress(&compressible[i], write)?; } - let len = write.stream_position()? 
- pos;
+    let len = write.written_bytes() - pos;
     Ok(BuildNode::Leaf {
         bbox: bounding_box.bbox(),
-        pos: pos,
+        pos,
         len: len as u16,
     })
 } else {
@@ -220,7 +219,7 @@
     }
 }
 
-fn write_leaf_nodes<W: Write + Seek>(node: &BuildNode, write: &mut W) -> io::Result<()> {
+fn write_leaf_nodes<W: Write>(node: &BuildNode, write: &mut CountingWriter<W>) -> io::Result<()> {
     match node {
         BuildNode::Branch {
             bbox: _,
@@ -242,11 +241,11 @@
     Ok(())
 }
 
-fn write_branch_nodes<W: Write + Seek>(
+fn write_branch_nodes<W: Write>(
     node: &BuildNode,
     branch_offset: &mut i32,
     leaf_offset: &mut i32,
-    write: &mut W,
+    write: &mut CountingWriter<W>,
 ) -> io::Result<i32> {
     match node {
         BuildNode::Leaf { .. } => {
@@ -284,21 +283,26 @@
 ///
 /// The `triangles` slice will be reordered during tree construction as partitioning sorts by the
 /// selected dimension at each level.
-pub fn write_block_kd_tree<W: Write + Seek>(
+pub fn write_block_kd_tree<W: Write>(
     triangles: &mut [Triangle],
-    write: &mut W,
+    write: &mut CountingWriter<W>,
 ) -> io::Result<()> {
+    assert_eq!(
+        triangles.as_ptr() as usize % std::mem::align_of::<Triangle>(),
+        0
+    );
     write.write_all(&1u16.to_le_bytes())?;
     let tree = write_leaf_pages(triangles, write)?;
-    let current = write.stream_position()?;
+    let current = write.written_bytes();
     let aligned = (current + 31) & !31;
     let padding = aligned - current;
     write.write_all(&vec![0u8; padding as usize])?;
     write_leaf_nodes(&tree, write)?;
-    let branch_position = write.stream_position()?;
+    let branch_position = write.written_bytes();
     let mut branch_offset: i32 = 0;
     let mut leaf_offset: i32 = -1;
     let root = write_branch_nodes(&tree, &mut branch_offset, &mut leaf_offset, write)?;
+    write.write_all(&[0u8; 12])?;
     write.write_all(&triangles.len().to_le_bytes())?;
     write.write_all(&root.to_le_bytes())?;
     write.write_all(&branch_position.to_le_bytes())?;
@@ -355,8 +359,9 @@ impl<'a> Segment<'a> {
     /// Reads the footer metadata from the last 12 bytes to locate the tree structure and root
     /// node.
     pub fn new(data: &'a [u8]) -> Self {
+        assert_eq!(data.as_ptr() as usize % std::mem::align_of::(), 0);
         Segment {
-            data: data,
+            data,
             branch_position: u64::from_le_bytes(data[data.len() - 8..].try_into().unwrap()),
             root_offset: i32::from_le_bytes(
                 data[data.len() - 12..data.len() - 8].try_into().unwrap(),
@@ -758,3 +763,40 @@ pub fn search_contains(
     }
     Ok(())
 }
+
+/// Iterates over the leaf pages of a segment depth first, yielding each page's decompressed
+/// triangles.
+pub struct LeafPageIterator<'a> {
+    segment: &'a Segment<'a>,
+    descent_stack: Vec<i32>,
+}
+
+impl<'a> LeafPageIterator<'a> {
+    /// Creates an iterator over the leaf pages of `segment`, starting from the root node.
+    pub fn new(segment: &'a Segment<'a>) -> Self {
+        Self {
+            segment,
+            descent_stack: vec![segment.root_offset],
+        }
+    }
+}
+
+impl<'a> Iterator for LeafPageIterator<'a> {
+    type Item = io::Result<Vec<Triangle>>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let offset = self.descent_stack.pop()?;
+        if offset < 0 {
+            let leaf_node = self.segment.leaf_node(offset);
+            let leaf_page = self.segment.leaf_page(leaf_node);
+            match decompress_leaf(leaf_page) {
+                Ok(triangles) => Some(Ok(triangles)),
+                Err(e) => Some(Err(e)),
+            }
+        } else {
+            let branch_node = self.segment.branch_node(offset);
+            self.descent_stack.push(branch_node.right);
+            self.descent_stack.push(branch_node.left);
+            self.next()
+        }
+    }
+}
diff --git a/src/spatial/geometry.rs b/src/spatial/geometry.rs
new file mode 100644
index 0000000000..5a770fc41a
--- /dev/null
+++ b/src/spatial/geometry.rs
@@ -0,0 +1,479 @@
+//! 
Geometry values in lon/lat coordinates, with GeoJSON conversion and a compact binary serialization.
+
+use std::io::{self, Read, Write};
+
+use common::{BinarySerializable, VInt};
+use serde_json::{json, Map, Value};
+
+use crate::spatial::xor::{compress_f64, decompress_f64};
+
+/// An error encountered while converting GeoJSON into a [`Geometry`].
+#[derive(Debug)]
+pub enum GeometryError {
+    /// The GeoJSON object has no `type` member.
+    MissingType,
+    /// A required member, such as `coordinates` or `geometries`, is missing.
+    MissingField(String),
+    /// The `type` member names a geometry type that is not supported.
+    UnsupportedType(String),
+    /// A coordinate is not a finite number in the valid lon/lat range.
+    InvalidCoordinate(String), // Can report the actual bad value
+    /// The coordinate arrays are malformed.
+    InvalidStructure(String), // "expected array", "wrong nesting depth", etc
+}
+
+/// A geometry in lon/lat coordinates, mirroring the GeoJSON geometry types.
+#[derive(Debug, Clone, PartialEq)]
+pub enum Geometry {
+    /// A single `(lon, lat)` point.
+    Point((f64, f64)),
+    /// A collection of points.
+    MultiPoint(Vec<(f64, f64)>),
+    /// A polyline of two or more points.
+    LineString(Vec<(f64, f64)>),
+    /// A collection of line strings.
+    MultiLineString(Vec<Vec<(f64, f64)>>),
+    /// A polygon given as a list of rings: the first is the exterior, the rest are holes.
+    Polygon(Vec<Vec<(f64, f64)>>),
+    /// A collection of polygons.
+    MultiPolygon(Vec<Vec<Vec<(f64, f64)>>>),
+    /// A heterogeneous collection of geometries.
+    GeometryCollection(Vec<Geometry>),
+}
+
+impl Geometry {
+    /// Parses a GeoJSON geometry object, validating coordinate ranges and structure.
+    pub fn from_geojson(object: &Map<String, Value>) -> Result<Geometry, GeometryError> {
+        let geometry_type = object
+            .get("type")
+            .and_then(|v| v.as_str())
+            .ok_or(GeometryError::MissingType)?;
+        match geometry_type {
+            "Point" => {
+                let coordinates = get_coordinates(object)?;
+                let point = to_point(coordinates)?;
+                Ok(Geometry::Point(point))
+            }
+            "MultiPoint" => {
+                let coordinates = get_coordinates(object)?;
+                let multi_point = to_line_string(coordinates)?;
+                Ok(Geometry::MultiPoint(multi_point))
+            }
+            "LineString" => {
+                let coordinates = get_coordinates(object)?;
+                let line_string = to_line_string(coordinates)?;
+                if line_string.len() < 2 {
+                    return Err(GeometryError::InvalidStructure(
+                        "a line string contains at least 2 points".to_string(),
+                    ));
+                }
+                Ok(Geometry::LineString(line_string))
+            }
+            "MultiLineString" => {
+                let coordinates = get_coordinates(object)?;
+                let multi_line_string = to_multi_line_string(coordinates)?;
+                for line_string in &multi_line_string {
+                    if line_string.len() < 2 {
+                        return Err(GeometryError::InvalidStructure(
+                            "a line string contains at least 2 points".to_string(),
+                        ));
+                    }
+                }
+                Ok(Geometry::MultiLineString(multi_line_string))
+            }
+            "Polygon" => {
+                let coordinates = get_coordinates(object)?;
+                let polygon = to_multi_line_string(coordinates)?;
+                for ring in &polygon {
+                    if ring.len() < 3 {
+                        return Err(GeometryError::InvalidStructure(
+                            "a polygon ring contains at least 3 points".to_string(),
+                        ));
+                    }
+                }
+                Ok(Geometry::Polygon(polygon))
+            }
+            "MultiPolygon" => {
+                let mut result = Vec::new();
+                let multi_polygons = get_coordinates(object)?;
+                let multi_polygons =
+                    multi_polygons
+                        .as_array()
+                        .ok_or(GeometryError::InvalidStructure(
+                            "expected an array of polygons".to_string(),
+                        ))?;
+                for polygon in multi_polygons {
+                    let polygon = to_multi_line_string(polygon)?;
+                    for ring in &polygon {
+                        if ring.len() < 3 {
+                            return Err(GeometryError::InvalidStructure(
+                                "a polygon ring contains at least 3 points".to_string(),
+                            ));
+                        }
+                    }
+                    result.push(polygon);
+                }
+                Ok(Geometry::MultiPolygon(result))
+            }
+            "GeometryCollection" => {
+                let geometries = object
+                    .get("geometries")
+                    .ok_or(GeometryError::MissingField("geometries".to_string()))?;
+                let geometries = geometries
+                    .as_array()
+                    .ok_or(GeometryError::InvalidStructure(
+                        "geometries is not an array".to_string(),
+                    ))?;
+                let mut result = Vec::new();
+                for geometry in geometries {
+                    let object = geometry.as_object().ok_or(GeometryError::InvalidStructure(
+                        "geometry is not an object".to_string(),
+                    ))?;
+                    result.push(Geometry::from_geojson(object)?);
+                }
+                Ok(Geometry::GeometryCollection(result))
+            }
+            _ => 
Err(GeometryError::UnsupportedType(geometry_type.to_string())),
+        }
+    }
+
+    /// Converts the geometry back into a GeoJSON geometry object.
+    pub fn to_geojson(&self) -> Map<String, Value> {
+        let mut map = Map::new();
+        match self {
+            Geometry::Point(point) => {
+                map.insert("type".to_string(), Value::String("Point".to_string()));
+                let coords = json!([point.0, point.1]);
+                map.insert("coordinates".to_string(), coords);
+            }
+            Geometry::MultiPoint(points) => {
+                map.insert("type".to_string(), Value::String("MultiPoint".to_string()));
+                let coords: Vec<Value> = points.iter().map(|p| json!([p.0, p.1])).collect();
+                map.insert("coordinates".to_string(), Value::Array(coords));
+            }
+            Geometry::LineString(line) => {
+                map.insert("type".to_string(), Value::String("LineString".to_string()));
+                let coords: Vec<Value> = line.iter().map(|p| json!([p.0, p.1])).collect();
+                map.insert("coordinates".to_string(), Value::Array(coords));
+            }
+            Geometry::MultiLineString(lines) => {
+                map.insert(
+                    "type".to_string(),
+                    Value::String("MultiLineString".to_string()),
+                );
+                let coords: Vec<Value> = lines
+                    .iter()
+                    .map(|line| Value::Array(line.iter().map(|p| json!([p.0, p.1])).collect()))
+                    .collect();
+                map.insert("coordinates".to_string(), Value::Array(coords));
+            }
+            Geometry::Polygon(rings) => {
+                map.insert("type".to_string(), Value::String("Polygon".to_string()));
+                let coords: Vec<Value> = rings
+                    .iter()
+                    .map(|ring| Value::Array(ring.iter().map(|p| json!([p.0, p.1])).collect()))
+                    .collect();
+                map.insert("coordinates".to_string(), Value::Array(coords));
+            }
+            Geometry::MultiPolygon(polygons) => {
+                map.insert(
+                    "type".to_string(),
+                    Value::String("MultiPolygon".to_string()),
+                );
+                let coords: Vec<Value> = polygons
+                    .iter()
+                    .map(|polygon| {
+                        Value::Array(
+                            polygon
+                                .iter()
+                                .map(|ring| {
+                                    Value::Array(ring.iter().map(|p| json!([p.0, p.1])).collect())
+                                })
+                                .collect(),
+                        )
+                    })
+                    .collect();
+                map.insert("coordinates".to_string(), Value::Array(coords));
+            }
+            Geometry::GeometryCollection(geometries) => {
+                map.insert(
+                    "type".to_string(),
+                    Value::String("GeometryCollection".to_string()),
+                );
+                let geoms: Vec<Value> = geometries
+                    .iter()
+                    .map(|g| Value::Object(g.to_geojson()))
+                    .collect();
+                map.insert("geometries".to_string(), Value::Array(geoms));
+            }
+        }
+        map
+    }
+}
+
+fn get_coordinates(object: &Map<String, Value>) -> Result<&Value, GeometryError> {
+    let coordinates = object
+        .get("coordinates")
+        .ok_or(GeometryError::MissingField("coordinates".to_string()))?;
+    Ok(coordinates)
+}
+
+fn to_point(value: &Value) -> Result<(f64, f64), GeometryError> {
+    let lonlat = value.as_array().ok_or(GeometryError::InvalidStructure(
+        "expected 2 element array pair of lon/lat".to_string(),
+    ))?;
+    if lonlat.len() != 2 {
+        return Err(GeometryError::InvalidStructure(
+            "expected 2 element array pair of lon/lat".to_string(),
+        ));
+    }
+    let lon = lonlat[0].as_f64().ok_or(GeometryError::InvalidCoordinate(
+        "longitude must be f64".to_string(),
+    ))?;
+    if !lon.is_finite() || !(-180.0..=180.0).contains(&lon) {
+        return Err(GeometryError::InvalidCoordinate(format!(
+            "invalid longitude: {}",
+            lon
+        )));
+    }
+    let lat = lonlat[1].as_f64().ok_or(GeometryError::InvalidCoordinate(
+        "latitude must be f64".to_string(),
+    ))?;
+    if !lat.is_finite() || !(-90.0..=90.0).contains(&lat) {
+        return Err(GeometryError::InvalidCoordinate(format!(
+            "invalid latitude: {}",
+            lat
+        )));
+    }
+    Ok((lon, lat))
+}
+
+fn to_line_string(value: &Value) -> Result<Vec<(f64, f64)>, GeometryError> {
+    let mut result = Vec::new();
+    let coordinates = value.as_array().ok_or(GeometryError::InvalidStructure(
+        "expected an array of lon/lat arrays".to_string(),
))?; + for coordinate in coordinates { + result.push(to_point(coordinate)?); + } + Ok(result) +} + +fn to_multi_line_string(value: &Value) -> Result>, GeometryError> { + let mut result = Vec::new(); + let coordinates = value.as_array().ok_or(GeometryError::InvalidStructure( + "expected an array of an array of lon/lat arrays".to_string(), + ))?; + for coordinate in coordinates { + result.push(to_line_string(coordinate)?); + } + Ok(result) +} + +impl BinarySerializable for Geometry { + fn serialize(&self, writer: &mut W) -> io::Result<()> { + match self { + Geometry::Point(point) => { + 0u8.serialize(writer)?; + point.0.serialize(writer)?; + point.1.serialize(writer)?; + Ok(()) + } + Geometry::MultiPoint(points) => { + 1u8.serialize(writer)?; + serialize_line_string(points, writer) + } + Geometry::LineString(line_string) => { + 2u8.serialize(writer)?; + serialize_line_string(line_string, writer) + } + Geometry::MultiLineString(multi_line_string) => { + 3u8.serialize(writer)?; + serialize_polygon(multi_line_string, writer) + } + Geometry::Polygon(polygon) => { + 4u8.serialize(writer)?; + serialize_polygon(polygon, writer) + } + Geometry::MultiPolygon(multi_polygon) => { + 5u8.serialize(writer)?; + BinarySerializable::serialize(&VInt(multi_polygon.len() as u64), writer)?; + for polygon in multi_polygon { + BinarySerializable::serialize(&VInt(polygon.len() as u64), writer)?; + for ring in polygon { + BinarySerializable::serialize(&VInt(ring.len() as u64), writer)?; + } + } + let mut lon = Vec::new(); + let mut lat = Vec::new(); + for polygon in multi_polygon { + for ring in polygon { + for point in ring { + lon.push(point.0); + lat.push(point.1); + } + } + } + let lon = compress_f64(&lon); + let lat = compress_f64(&lat); + VInt(lon.len() as u64).serialize(writer)?; + writer.write_all(&lon)?; + VInt(lat.len() as u64).serialize(writer)?; + writer.write_all(&lat)?; + Ok(()) + } + Geometry::GeometryCollection(geometries) => { + 6u8.serialize(writer)?; + BinarySerializable::serialize(&VInt(geometries.len() as u64), writer)?; + for geometry in geometries { + geometry.serialize(writer)?; + } + Ok(()) + } + } + } + + fn deserialize(reader: &mut R) -> io::Result { + let discriminant: u8 = BinarySerializable::deserialize(reader)?; + match discriminant { + 0 => { + let lon = BinarySerializable::deserialize(reader)?; + let lat = BinarySerializable::deserialize(reader)?; + Ok(Geometry::Point((lon, lat))) + } + 1 => Ok(Geometry::MultiPoint(deserialize_line_string(reader)?)), + 2 => Ok(Geometry::LineString(deserialize_line_string(reader)?)), + 3 => Ok(Geometry::MultiLineString(deserialize_polygon(reader)?)), + 4 => Ok(Geometry::Polygon(deserialize_polygon(reader)?)), + 5 => { + let polygon_count = VInt::deserialize(reader)?.0 as usize; + let mut polygons = Vec::new(); + let mut count = 0; + for _ in 0..polygon_count { + let ring_count = VInt::deserialize(reader)?.0 as usize; + let mut rings = Vec::new(); + for _ in 0..ring_count { + rings.push(VInt::deserialize(reader)?.0 as usize); + count += 1; + } + polygons.push(rings); + } + let lon_bytes: Vec = BinarySerializable::deserialize(reader)?; + let lat_bytes: Vec = BinarySerializable::deserialize(reader)?; + let lon = decompress_f64(&lon_bytes, count); + let lat = decompress_f64(&lat_bytes, count); + let mut multi_polygon = Vec::new(); + let mut offset = 0; + for rings in polygons { + let mut polygon = Vec::new(); + for point_count in rings { + let mut ring = Vec::new(); + for _ in 0..point_count { + ring.push((lon[offset], lat[offset])); + offset += 1; + } + 
polygon.push(ring); + } + multi_polygon.push(polygon); + } + Ok(Geometry::MultiPolygon(multi_polygon)) + } + 6 => { + let geometry_count = VInt::deserialize(reader)?.0 as usize; + let mut geometries = Vec::new(); + for _ in 0..geometry_count { + geometries.push(Geometry::deserialize(reader)?); + } + Ok(Geometry::GeometryCollection(geometries)) + } + _ => Err(io::Error::new( + io::ErrorKind::InvalidData, + "invalid geometry type", + )), + } + } +} + +fn serialize_line_string( + line: &Vec<(f64, f64)>, + writer: &mut W, +) -> io::Result<()> { + BinarySerializable::serialize(&VInt(line.len() as u64), writer)?; + let mut lon = Vec::new(); + let mut lat = Vec::new(); + for point in line { + lon.push(point.0); + lat.push(point.1); + } + let lon = compress_f64(&lon); + let lat = compress_f64(&lat); + VInt(lon.len() as u64).serialize(writer)?; + writer.write_all(&lon)?; + VInt(lat.len() as u64).serialize(writer)?; + writer.write_all(&lat)?; + Ok(()) +} + +fn serialize_polygon( + line_string: &Vec>, + writer: &mut W, +) -> io::Result<()> { + BinarySerializable::serialize(&VInt(line_string.len() as u64), writer)?; + for ring in line_string { + BinarySerializable::serialize(&VInt(ring.len() as u64), writer)?; + } + let mut lon = Vec::new(); + let mut lat = Vec::new(); + for ring in line_string { + for point in ring { + lon.push(point.0); + lat.push(point.1); + } + } + let lon = compress_f64(&lon); + let lat = compress_f64(&lat); + VInt(lon.len() as u64).serialize(writer)?; + writer.write_all(&lon)?; + VInt(lat.len() as u64).serialize(writer)?; + writer.write_all(&lat)?; + Ok(()) +} + +fn deserialize_line_string(reader: &mut R) -> io::Result> { + let point_count = VInt::deserialize(reader)?.0 as usize; + let lon_bytes: Vec = BinarySerializable::deserialize(reader)?; + let lat_bytes: Vec = BinarySerializable::deserialize(reader)?; + let lon = decompress_f64(&lon_bytes, point_count); + let lat = decompress_f64(&lat_bytes, point_count); + let mut line_string = Vec::new(); + for offset in 0..point_count { + line_string.push((lon[offset], lat[offset])); + } + Ok(line_string) +} + +fn deserialize_polygon(reader: &mut R) -> io::Result>> { + let ring_count = VInt::deserialize(reader)?.0 as usize; + let mut rings = Vec::new(); + let mut count = 0; + for _ in 0..ring_count { + let point_count = VInt::deserialize(reader)?.0 as usize; + rings.push(point_count); + count += point_count; + } + let lon_bytes: Vec = BinarySerializable::deserialize(reader)?; + let lat_bytes: Vec = BinarySerializable::deserialize(reader)?; + let lon = decompress_f64(&lon_bytes, count); + let lat = decompress_f64(&lat_bytes, count); + let mut polygon = Vec::new(); + let mut offset = 0; + for point_count in rings { + let mut ring = Vec::new(); + for _ in 0..point_count { + ring.push((lon[offset], lat[offset])); + offset += 1; + } + polygon.push(ring); + } + Ok(polygon) +} diff --git a/src/spatial/mod.rs b/src/spatial/mod.rs index 0e75f3d89e..70c59aafc4 100644 --- a/src/spatial/mod.rs +++ b/src/spatial/mod.rs @@ -2,6 +2,10 @@ pub mod bkd; pub mod delta; +pub mod geometry; pub mod radix_select; +pub mod reader; +pub mod serializer; pub mod triangle; +pub mod writer; pub mod xor; diff --git a/src/spatial/reader.rs b/src/spatial/reader.rs new file mode 100644 index 0000000000..9338111ff7 --- /dev/null +++ b/src/spatial/reader.rs @@ -0,0 +1,65 @@ +//! 
Spatial field readers: per-field access to a segment's serialized block kd-trees.
+use std::io;
+use std::sync::Arc;
+
+use common::file_slice::FileSlice;
+use common::OwnedBytes;
+
+use crate::directory::CompositeFile;
+use crate::schema::Field;
+use crate::space_usage::PerFieldSpaceUsage;
+
+#[derive(Clone)]
+/// Readers for all spatial fields of a segment, one serialized block kd-tree per field.
+pub struct SpatialReaders {
+    data: Arc<CompositeFile>,
+}
+
+impl SpatialReaders {
+    /// Opens the spatial readers from the segment's spatial composite file.
+    pub fn open(file: FileSlice) -> crate::Result<SpatialReaders> {
+        let data = CompositeFile::open(&file)?;
+        Ok(SpatialReaders {
+            data: Arc::new(data),
+        })
+    }
+
+    /// Returns the `SpatialReader` for a specific field, or `None` if the field has no spatial
+    /// data in this segment.
+    pub fn get_field(&self, field: Field) -> crate::Result<Option<SpatialReader>> {
+        if let Some(file) = self.data.open_read(field) {
+            let spatial_reader = SpatialReader::open(file)?;
+            Ok(Some(spatial_reader))
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Returns a breakdown of the space usage per field.
+    pub fn space_usage(&self) -> PerFieldSpaceUsage {
+        self.data.space_usage()
+    }
+
+    /// Returns a handle to the inner composite file.
+    pub fn get_inner_file(&self) -> Arc<CompositeFile> {
+        self.data.clone()
+    }
+}
+
+/// Reader for a single spatial field: the raw bytes of its serialized block kd-tree.
+#[derive(Clone)]
+pub struct SpatialReader {
+    data: OwnedBytes,
+}
+
+impl SpatialReader {
+    /// Opens the spatial reader from a `FileSlice` covering one field's block kd-tree.
+    pub fn open(spatial_file: FileSlice) -> io::Result<SpatialReader> {
+        let data = spatial_file.read_bytes()?;
+        Ok(SpatialReader { data })
+    }
+    /// Returns the raw bytes of the serialized block kd-tree.
+    pub fn get_bytes(&self) -> &[u8] {
+        self.data.as_ref()
+    }
+}
diff --git a/src/spatial/serializer.rs b/src/spatial/serializer.rs
new file mode 100644
index 0000000000..8336d45c62
--- /dev/null
+++ b/src/spatial/serializer.rs
@@ -0,0 +1,37 @@
+//! The spatial serializer writes a block kd-tree for each spatial field of a segment.
+use std::io;
+use std::io::Write;
+
+use crate::directory::{CompositeWrite, WritePtr};
+use crate::schema::Field;
+use crate::spatial::bkd::write_block_kd_tree;
+use crate::spatial::triangle::Triangle;
+
+/// The spatial serializer is in charge of
+/// the serialization of block kd-trees for all spatial fields.
+pub struct SpatialSerializer {
+    composite_write: CompositeWrite,
+}
+
+impl SpatialSerializer {
+    /// Create a composite file from the write pointer.
+    pub fn from_write(write: WritePtr) -> io::Result<SpatialSerializer> {
+        let composite_write = CompositeWrite::wrap(write);
+        Ok(SpatialSerializer { composite_write })
+    }
+
+    /// Serializes the triangles of the given field as a block kd-tree.
+    pub fn serialize_field(&mut self, field: Field, triangles: &mut [Triangle]) -> io::Result<()> {
+        let write = self.composite_write.for_field(field);
+        write_block_kd_tree(triangles, write)?;
+        write.flush()?;
+        Ok(())
+    }
+
+    /// Clean up, flush, and close.
+    pub fn close(self) -> io::Result<()> {
+        self.composite_write.close()?;
+        Ok(())
+    }
+}
diff --git a/src/spatial/writer.rs b/src/spatial/writer.rs
new file mode 100644
index 0000000000..b1584fcd14
--- /dev/null
+++ b/src/spatial/writer.rs
@@ -0,0 +1,132 @@
+//! 
Spatial writer: accumulates per-field triangle buffers, tessellating geometries as they are added.
+use std::collections::HashMap;
+use std::io;
+
+use i_triangle::i_overlay::i_float::int::point::IntPoint;
+use i_triangle::int::triangulatable::IntTriangulatable;
+
+use crate::schema::Field;
+use crate::spatial::geometry::Geometry;
+use crate::spatial::serializer::SpatialSerializer;
+use crate::spatial::triangle::{delaunay_to_triangles, Triangle};
+use crate::DocId;
+
+/// Collects the triangles of every spatial field while a segment is being built.
+pub struct SpatialWriter {
+    /// Map from field to its triangles buffer
+    triangles_by_field: HashMap<Field, Vec<Triangle>>,
+}
+
+impl SpatialWriter {
+    /// Tessellates a geometry into triangles and appends them to the field's buffer.
+    pub fn add_geometry(&mut self, doc_id: DocId, field: Field, geometry: Geometry) {
+        let triangles = self.triangles_by_field.entry(field).or_default();
+        match geometry {
+            Geometry::Point(point) => {
+                into_point(triangles, doc_id, point);
+            }
+            Geometry::MultiPoint(multi_point) => {
+                for point in multi_point {
+                    into_point(triangles, doc_id, point);
+                }
+            }
+            Geometry::LineString(line_string) => {
+                into_line_string(triangles, doc_id, line_string);
+            }
+            Geometry::MultiLineString(multi_line_string) => {
+                for line_string in multi_line_string {
+                    into_line_string(triangles, doc_id, line_string);
+                }
+            }
+            Geometry::Polygon(polygon) => {
+                into_polygon(triangles, doc_id, polygon);
+            }
+            Geometry::MultiPolygon(multi_polygon) => {
+                for polygon in multi_polygon {
+                    into_polygon(triangles, doc_id, polygon);
+                }
+            }
+            Geometry::GeometryCollection(geometries) => {
+                for geometry in geometries {
+                    self.add_geometry(doc_id, field, geometry);
+                }
+            }
+        }
+    }
+
+    /// Memory usage estimate
+    pub fn mem_usage(&self) -> usize {
+        self.triangles_by_field
+            .values()
+            .map(|triangles| triangles.len() * std::mem::size_of::<Triangle>())
+            .sum()
+    }
+
+    /// Serializes each field's triangles as a block kd-tree and closes the serializer.
+    pub fn serialize(&mut self, mut serializer: SpatialSerializer) -> io::Result<()> {
+        for (field, triangles) in &mut self.triangles_by_field {
+            serializer.serialize_field(*field, triangles)?;
+        }
+        serializer.close()?;
+        Ok(())
+    }
+}
+
+impl Default for SpatialWriter {
+    /// Creates a spatial writer with no buffered triangles.
+    fn default() -> Self {
+        SpatialWriter {
+            triangles_by_field: HashMap::new(),
+        }
+    }
+}
+
+/// Converts a `(lon, lat)` pair into integer `(x, y)` spatial coordinates, using the same
+/// linear scaling as `latlon_to_point`.
+pub fn as_point_i32(point: (f64, f64)) -> (i32, i32) {
+    (
+        (point.0 / (360.0 / (1i64 << 32) as f64)).floor() as i32,
+        (point.1 / (180.0 / (1i64 << 32) as f64)).floor() as i32,
+    )
+}
+
+fn into_point(triangles: &mut Vec<Triangle>, doc_id: DocId, point: (f64, f64)) {
+    let point = as_point_i32(point);
+    triangles.push(Triangle::new(
+        doc_id,
+        [point.1, point.0, point.1, point.0, point.1, point.0],
+        [true, true, true],
+    ));
+}
+
+fn into_line_string(triangles: &mut Vec<Triangle>, doc_id: DocId, line_string: Vec<(f64, f64)>) {
+    let mut previous = as_point_i32(line_string[0]);
+    for point in line_string.into_iter().skip(1) {
+        let point = as_point_i32(point);
+        triangles.push(Triangle::new(
+            doc_id,
+            [
+                previous.1, previous.0, point.1, point.0, previous.1, previous.0,
+            ],
+            [true, true, true],
+        ));
+        previous = point
+    }
+}
+
+fn into_ring(i_polygon: &mut Vec<Vec<IntPoint>>, ring: Vec<(f64, f64)>) {
+    let mut i_ring = Vec::new();
+    for point in ring {
+        let point = as_point_i32(point);
+        i_ring.push(IntPoint::new(point.0, point.1));
+    }
+    i_polygon.push(i_ring);
+}
+
+fn into_polygon(triangles: &mut Vec<Triangle>, doc_id: DocId, polygon: Vec<Vec<(f64, f64)>>) {
+    let mut i_polygon = Vec::new();
+    for ring in polygon {
+        into_ring(&mut i_polygon, ring);
+    }
+    let delaunay = i_polygon.triangulate().into_delaunay();
+    delaunay_to_triangles(doc_id, &delaunay, triangles);
+}
From 459456ca2805e54074043931e93ac33588fa902a Mon Sep 17 00:00:00 2001
From: Alan Gutierrez
Date: Mon, 3 Nov 2025 22:26:48 -0600
Subject: [PATCH 11/22] Remove `radix_select.rs`. Ended up using `select_nth_unstable_by_key` from the Rust standard library instead. --- src/spatial/mod.rs | 1 - src/spatial/radix_select.rs | 122 ------------------------------------ 2 files changed, 123 deletions(-) delete mode 100644 src/spatial/radix_select.rs diff --git a/src/spatial/mod.rs b/src/spatial/mod.rs index 70c59aafc4..180b553012 100644 --- a/src/spatial/mod.rs +++ b/src/spatial/mod.rs @@ -3,7 +3,6 @@ pub mod bkd; pub mod delta; pub mod geometry; -pub mod radix_select; pub mod reader; pub mod serializer; pub mod triangle; diff --git a/src/spatial/radix_select.rs b/src/spatial/radix_select.rs deleted file mode 100644 index 2397c9eaa9..0000000000 --- a/src/spatial/radix_select.rs +++ /dev/null @@ -1,122 +0,0 @@ -//! Radix selection for block kd-tree tree partitioning. -//! -//! Implements byte-wise histogram selection to find median values without comparisons, enabling -//! efficient partitioning of spatial data during block kd-tree construction. Processes values -//! through multiple passes, building histograms for each byte position after a common prefix, -//! avoiding the need to sort or compare elements directly. - -/// Performs radix selection to find the median value without comparisons by building byte-wise -/// histograms. -pub struct RadixSelect { - histogram: [usize; 256], - prefix: Vec, - offset: usize, - nth: usize, -} - -impl RadixSelect { - /// Creates a new radix selector for finding the nth element among values with a common prefix. - /// - /// The offset specifies how many matching elements appeared in previous buckets (from earlier - /// passes). The nth parameter is 0-indexed, so pass 31 to find the 32nd element (median of - /// 64). - pub fn new(prefix: Vec, offset: usize, nth: usize) -> Self { - RadixSelect { - histogram: [0; 256], - prefix, - offset, - nth, - } - } - /// Updates the histogram with a value if it matches the current prefix. - /// - /// Values that don't start with the prefix are ignored. For matching values, increments the - /// count for the byte at position `prefix.len()`. - pub fn update(&mut self, value: i32) { - let bytes = value.to_be_bytes(); - if !bytes.starts_with(&self.prefix) { - return; - } - let byte = bytes[self.prefix.len()]; - self.histogram[byte as usize] += 1; - } - /// Finds which bucket contains the nth element and returns the bucket value and offset. - /// - /// Returns a tuple of `(bucket_byte, count_before)` where bucket_byte is the value of the byte - /// that contains the nth element, and count_before is the number of elements in earlier - /// buckets (becomes the offset for the next pass). 
- pub fn nth(&self) -> (u8, usize) { - let mut count = self.offset; - for (bucket, &frequency) in self.histogram.iter().enumerate() { - if count + frequency > self.nth as usize { - return (bucket as u8, count); - } - count += frequency; - } - panic!("nth element {} not found in histogram", self.nth); - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn radix_selection() { - let dimensions = [ - ( - vec![ - 0x10101010, 0x10101011, 0x10101012, 0x10101013, 0x10101014, 0x10101015, - 0x10101016, 0x10101017, 0x10101018, 0x10101019, 0x1010101A, 0x1010101B, - 0x1010101C, 0x1010101D, 0x1010101E, 0x1010101F, 0x10101020, 0x10101021, - 0x10101022, 0x10101023, 0x10101024, 0x10101025, 0x10101026, 0x10101027, - 0x10101028, 0x10101029, 0x1010102A, 0x1010102B, 0x1010102C, 0x1010102D, - 0x1010102E, 0x1010102F, 0x10101030, 0x10101031, 0x10101032, 0x10101033, - 0x10101034, 0x10101035, 0x10101036, 0x10101037, 0x10101038, 0x10101039, - 0x1010103A, 0x1010103B, 0x1010103C, 0x1010103D, 0x1010103E, 0x1010103F, - 0x10101040, 0x10101041, 0x10101042, 0x10101043, 0x10101044, 0x10101045, - 0x10101046, 0x10101047, 0x10101048, 0x10101049, 0x1010104A, 0x1010104B, - 0x1010104C, 0x1010104D, 0x1010104E, 0x1010104F, - ], - [(0x10, 0), (0x10, 0), (0x10, 0), (0x2F, 31)], - ), - ( - vec![ - 0x10101010, 0x10101011, 0x10101012, 0x10101013, 0x10101014, 0x10101015, - 0x10101016, 0x10101017, 0x10101018, 0x10101019, 0x1010101A, 0x1010101B, - 0x1010101C, 0x1010101D, 0x1010101E, 0x1010101F, 0x10101020, 0x10101021, - 0x10101022, 0x10101023, 0x10101024, 0x20101025, 0x20201026, 0x20301027, - 0x20401028, 0x20501029, 0x2060102A, 0x2070102B, 0x2080102C, 0x2090102D, - 0x20A0102E, 0x20B0102F, 0x20C01030, 0x20D01031, 0x20E01032, 0x20F01033, - 0x20F11034, 0x20F21035, 0x20F31036, 0x20F41037, 0x20F51038, 0x20F61039, - 0x3010103A, 0x3010103B, 0x3010103C, 0x3010103D, 0x3010103E, 0x3010103F, - 0x30101040, 0x30101041, 0x30101042, 0x30101043, 0x30101044, 0x30101045, - 0x30101046, 0x30101047, 0x30101048, 0x30101049, 0x3010104A, 0x3010104B, - 0x3010104C, 0x3010104D, 0x3010104E, 0x3010104F, - ], - [(0x20, 21), (0xB0, 31), (0x10, 31), (0x2F, 31)], - ), - ]; - for (numbers, expected) in dimensions { - let mut offset = 0; - let mut prefix = Vec::new(); - for i in 0..4 { - let mut radix_select = RadixSelect::new(prefix.clone(), offset, 31); - for &number in &numbers { - radix_select.update(number); - } - let (byte, count) = radix_select.nth(); - if i != 3 { - assert_eq!(expected[i].0, byte); - assert_eq!(expected[i].1, count); - } - prefix.push(byte); - offset = count; - } - let mut sorted = numbers.clone(); - sorted.sort(); - let radix_result = i32::from_be_bytes(prefix.as_slice().try_into().unwrap()); - assert_eq!(radix_result, sorted[31]); - } - } -} From 68009bb25ba6b41145ae20b038d22778833fa3eb Mon Sep 17 00:00:00 2001 From: Alan Gutierrez Date: Mon, 3 Nov 2025 23:55:12 -0600 Subject: [PATCH 12/22] Read block kd-tree nodes using `from_le_bytes`. Read node structures using `from_le_bytes` instead of casting memory. After an inspection of columnar storage, it appears that this is the standard practice in Rust and in the Tantivy code base. Left the structure alignment for now in case it tends to align with cache boundaries. 
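For illustration, a minimal sketch (not part of the patch) of the two styles on a
single 16-byte bounding box of four little-endian `i32` words; the function names
are made up for the example:

    // Casting reinterprets the mapped bytes in place. It assumes correct
    // alignment and host endianness; violating either is undefined behavior.
    fn bbox_cast(bytes: &[u8]) -> &[i32; 4] {
        assert!(bytes.len() >= 16);
        // Safety: caller must guarantee 4-byte alignment and a little-endian host.
        unsafe { &*bytes.as_ptr().cast::<[i32; 4]>() }
    }

    // Decoding copies each word out through `from_le_bytes`: the byte order is
    // pinned down and no alignment is required of the mapping.
    fn bbox_decode(bytes: &[u8]) -> [i32; 4] {
        let mut bbox = [0i32; 4];
        for (i, word) in bbox.iter_mut().enumerate() {
            *word = i32::from_le_bytes(bytes[4 * i..4 * i + 4].try_into().unwrap());
        }
        bbox
    }
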
--- src/spatial/bkd.rs | 77 +++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/src/spatial/bkd.rs b/src/spatial/bkd.rs index 4c6652f64e..071556d27b 100644 --- a/src/spatial/bkd.rs +++ b/src/spatial/bkd.rs @@ -359,7 +359,6 @@ impl<'a> Segment<'a> { /// Reads the footer metadata from the last 12 bytes to locate the tree structure and root /// node. pub fn new(data: &'a [u8]) -> Self { - assert_eq!(data.as_ptr() as usize % std::mem::align_of::(), 0); Segment { data, branch_position: u64::from_le_bytes(data[data.len() - 8..].try_into().unwrap()), @@ -368,25 +367,47 @@ impl<'a> Segment<'a> { ), } } - fn bounding_box(&self, offset: i32) -> &[i32; 4] { - unsafe { - let byte_offset = (self.branch_position as i64 + offset as i64) as usize; - let ptr = self.data.as_ptr().add(byte_offset); - &*ptr.cast::<[i32; 4]>() + #[inline(always)] + fn bounding_box(&self, offset: i32) -> [i32; 4] { + let byte_offset = (self.branch_position as i64 + offset as i64) as usize; + let bytes = &self.data[byte_offset..byte_offset + 16]; + [ + i32::from_le_bytes(bytes[0..4].try_into().unwrap()), + i32::from_le_bytes(bytes[4..8].try_into().unwrap()), + i32::from_le_bytes(bytes[8..12].try_into().unwrap()), + i32::from_le_bytes(bytes[12..16].try_into().unwrap()), + ] + } + #[inline(always)] + fn branch_node(&self, offset: i32) -> BranchNode { + let byte_offset = (self.branch_position as i64 + offset as i64) as usize; + let bytes = &self.data[byte_offset..byte_offset + 32]; + BranchNode { + bbox: [ + i32::from_le_bytes(bytes[0..4].try_into().unwrap()), + i32::from_le_bytes(bytes[4..8].try_into().unwrap()), + i32::from_le_bytes(bytes[8..12].try_into().unwrap()), + i32::from_le_bytes(bytes[12..16].try_into().unwrap()), + ], + left: i32::from_le_bytes(bytes[16..20].try_into().unwrap()), + right: i32::from_le_bytes(bytes[20..24].try_into().unwrap()), + pad: [0u8; 8], } } - fn branch_node(&self, offset: i32) -> &BranchNode { - unsafe { - let byte_offset = (self.branch_position as i64 + offset as i64) as usize; - let ptr = self.data.as_ptr().add(byte_offset); - &*ptr.cast::() - } - } - fn leaf_node(&self, offset: i32) -> &LeafNode { - unsafe { - let byte_offset = (self.branch_position as i64 + offset as i64) as usize; - let ptr = self.data.as_ptr().add(byte_offset); - &*ptr.cast::() + #[inline(always)] + fn leaf_node(&self, offset: i32) -> LeafNode { + let byte_offset = (self.branch_position as i64 + offset as i64) as usize; + let bytes = &self.data[byte_offset..byte_offset + 32]; + LeafNode { + bbox: [ + i32::from_le_bytes(bytes[0..4].try_into().unwrap()), + i32::from_le_bytes(bytes[4..8].try_into().unwrap()), + i32::from_le_bytes(bytes[8..12].try_into().unwrap()), + i32::from_le_bytes(bytes[12..16].try_into().unwrap()), + ], + pos: u64::from_le_bytes(bytes[16..24].try_into().unwrap()), + len: u16::from_le_bytes(bytes[24..26].try_into().unwrap()), + pad: [0u8; 6], } } fn leaf_page(&self, leaf_node: &LeafNode) -> &[u8] { @@ -397,7 +418,7 @@ impl<'a> Segment<'a> { fn collect_all_docs(segment: &Segment, offset: i32, result: &mut BitSet) -> io::Result<()> { if offset < 0 { let leaf_node = segment.leaf_node(offset); - let data = segment.leaf_page(leaf_node); + let data = segment.leaf_page(&leaf_node); let count = u16::from_le_bytes([data[0], data[1]]) as usize; decompress::(&data[2..], count, |_, doc_id| result.insert(doc_id))?; } else { @@ -437,15 +458,15 @@ pub fn search_intersects( ) -> io::Result<()> { let bbox = segment.bounding_box(offset); // bbox doesn't intersect 
query → skip entire subtree - if !bbox_intersects(bbox, query) { + if !bbox_intersects(&bbox, query) { } // bbox entirely within query → all triangles intersect - else if bbox_within(bbox, query) { + else if bbox_within(&bbox, query) { collect_all_docs(segment, offset, result)?; } else if offset < 0 { // bbox crosses query → test each triangle let leaf_node = segment.leaf_node(offset); - let triangles = decompress_leaf(segment.leaf_page(leaf_node))?; + let triangles = decompress_leaf(segment.leaf_page(&leaf_node))?; for triangle in &triangles { if triangle_intersects(triangle, query) { result.insert(triangle.doc_id); // BitSet deduplicates @@ -643,11 +664,11 @@ pub fn search_within( excluded: &mut BitSet, ) -> io::Result<()> { let bbox = segment.bounding_box(offset); - if !bbox_intersects(bbox, query) { + if !bbox_intersects(&bbox, query) { } else if offset < 0 { let leaf_node = segment.leaf_node(offset); // bbox crosses query → test each triangle - let triangles = decompress_leaf(segment.leaf_page(leaf_node))?; + let triangles = decompress_leaf(segment.leaf_page(&leaf_node))?; for triangle in &triangles { if triangle_intersects(triangle, query) { if excluded.contains(triangle.doc_id) { @@ -738,11 +759,11 @@ pub fn search_contains( excluded: &mut BitSet, ) -> io::Result<()> { let bbox = segment.bounding_box(offset); - if !bbox_intersects(bbox, query) { + if !bbox_intersects(&bbox, query) { } else if offset < 0 { let leaf_node = segment.leaf_node(offset); // bbox crosses query → test each triangle - let triangles = decompress_leaf(segment.leaf_page(leaf_node))?; + let triangles = decompress_leaf(segment.leaf_page(&leaf_node))?; for triangle in &triangles { if triangle_intersects(triangle, query) { let doc_id = triangle.doc_id; @@ -787,8 +808,8 @@ impl<'a> Iterator for LeafPageIterator<'a> { let offset = self.descent_stack.pop()?; if offset < 0 { let leaf_node = self.segment.leaf_node(offset); - let leaf_page = self.segment.leaf_page(leaf_node); - match decompress_leaf(leaf_page) { + let leaf_page = self.segment.leaf_page(&leaf_node); + match decompress_leaf(&leaf_page) { Ok(triangles) => Some(Ok(triangles)), Err(e) => Some(Err(e)), } From 9f10279681d157721d0ef8c272636a06f6ff3204 Mon Sep 17 00:00:00 2001 From: Alan Gutierrez Date: Thu, 13 Nov 2025 01:05:17 -0600 Subject: [PATCH 13/22] Complete Spatial/Geometry type integration. Addressed all `todo!()` markers created when adding `Spatial` field type and `Geometry` value type to existing code paths: - Dynamic field handling: `Geometry` not supported in dynamic JSON fields, return `unimplemented!()` consistently with other complex types. - Fast field writer: Panic if geometry routed incorrectly (internal error.) - `OwnedValue` serialization: Implement `Geometry` to GeoJSON serialization and reference-to-owned conversion. - Field type: Return `None` for `get_index_record_option()` since spatial fields use BKD trees, not inverted index. - Space usage tracking: Add spatial field to `SegmentSpaceUsage` with proper integration through `SegmentReader`. - Spatial query explain: Implement `explain()` method following pattern of other binary/constant-score queries. Fixed `MultiPolygon` deserialization bug: count total points across all rings, not number of rings. Added clippy expects for legitimate too_many_arguments cases in geometric predicates. 
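As a sketch of the round trip this patch completes, assuming the module paths
introduced by this series; the GeoJSON literal and the unwraps are illustrative
only:

    use serde_json::{Map, Value};
    use tantivy::schema::OwnedValue;
    use tantivy::spatial::geometry::Geometry;

    let object: Map<String, Value> =
        serde_json::from_str(r#"{"type":"Point","coordinates":[-99.46,45.57]}"#).unwrap();
    let geometry = Geometry::from_geojson(&object).expect("valid GeoJSON");
    let value = OwnedValue::Geometry(geometry);
    // The new `serde::Serialize` impl emits the GeoJSON object again.
    let round_trip = serde_json::to_string(&value).unwrap();
    assert!(round_trip.contains("\"Point\""));
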
--- src/core/json_utils.rs | 4 +++- src/fastfield/writer.rs | 8 ++++++-- src/index/segment_reader.rs | 1 + src/query/spatial_query.rs | 23 +++++++++++++++++++---- src/schema/document/default_document.rs | 1 - src/schema/document/owned_value.rs | 4 ++-- src/schema/field_type.rs | 3 ++- src/space_usage/mod.rs | 11 ++++++++++- src/spatial/bkd.rs | 2 ++ src/spatial/geometry.rs | 5 +++-- 10 files changed, 48 insertions(+), 14 deletions(-) diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index ee9292d667..36c681b604 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -227,7 +227,9 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( ReferenceValueLeaf::IpAddr(_) => { unimplemented!("IP address support in dynamic fields is not yet implemented") } - ReferenceValueLeaf::Geometry(_) => todo!(), + ReferenceValueLeaf::Geometry(_) => { + unimplemented!("Geometry support in dynamic fields is not implemented") + } }, ReferenceValue::Array(elements) => { for val in elements { diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 311a3d6afb..39d81c10ee 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -189,7 +189,9 @@ impl FastFieldsWriter { .record_str(doc_id, field_name, &token.text); } } - ReferenceValueLeaf::Geometry(_) => todo!(), + ReferenceValueLeaf::Geometry(_) => { + panic!("Geometry fields should not be routed to fast field writer") + } }, ReferenceValue::Array(val) => { // TODO: Check this is the correct behaviour we want. @@ -321,7 +323,9 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>( "Pre-tokenized string support in dynamic fields is not yet implemented" ) } - ReferenceValueLeaf::Geometry(_) => todo!(), + ReferenceValueLeaf::Geometry(_) => { + unimplemented!("Geometry support in dynamic fields is not yet implemented") + } }, ReferenceValue::Array(elements) => { for el in elements { diff --git a/src/index/segment_reader.rs b/src/index/segment_reader.rs index ee1a9938af..576a5f83b1 100644 --- a/src/index/segment_reader.rs +++ b/src/index/segment_reader.rs @@ -470,6 +470,7 @@ impl SegmentReader { self.positions_composite.space_usage(), self.fast_fields_readers.space_usage(self.schema())?, self.fieldnorm_readers.space_usage(), + self.spatial_readers.space_usage(), self.get_store_reader(0)?.space_usage(), self.alive_bitset_opt .as_ref() diff --git a/src/query/spatial_query.rs b/src/query/spatial_query.rs index a6c4d7050e..22b2134bd8 100644 --- a/src/query/spatial_query.rs +++ b/src/query/spatial_query.rs @@ -2,7 +2,8 @@ use common::BitSet; -use crate::query::{BitSetDocSet, Query, Scorer, Weight}; +use crate::query::explanation::does_not_match; +use crate::query::{BitSetDocSet, Explanation, Query, Scorer, Weight}; use crate::schema::Field; use crate::spatial::bkd::{search_intersects, Segment}; use crate::spatial::writer::as_point_i32; @@ -96,10 +97,24 @@ impl Weight for SpatialWeight { } fn explain( &self, - _reader: &crate::SegmentReader, - _doc: DocId, + reader: &crate::SegmentReader, + doc: DocId, ) -> crate::Result { - todo!(); + let mut scorer = self.scorer(reader, 1.0)?; + if scorer.seek(doc) != doc { + return Err(does_not_match(doc)); + } + let query_type_desc = match self.query_type { + SpatialQueryType::Intersects => "SpatialQuery::Intersects", + }; + let score = scorer.score(); + let mut explanation = Explanation::new(query_type_desc, score); + explanation.add_context(format!( + "bounds: [({}, {}), ({}, {})]", + self.bounds[0].0, self.bounds[0].1, self.bounds[1].0, self.bounds[1].1, + )); + 
explanation.add_context(format!("field: {:?}", self.field)); + Ok(explanation) } } diff --git a/src/schema/document/default_document.rs b/src/schema/document/default_document.rs index c2416040ac..2a28482720 100644 --- a/src/schema/document/default_document.rs +++ b/src/schema/document/default_document.rs @@ -466,7 +466,6 @@ impl<'a> CompactDocValue<'a> { .map(Into::into) .map(ReferenceValueLeaf::PreTokStr) .map(Into::into), - // ValueType::Geometry => todo!(), ValueType::Geometry => self .container .read_from::(addr) diff --git a/src/schema/document/owned_value.rs b/src/schema/document/owned_value.rs index 8c72e358d6..5977a83875 100644 --- a/src/schema/document/owned_value.rs +++ b/src/schema/document/owned_value.rs @@ -208,7 +208,7 @@ impl serde::Serialize for OwnedValue { } } OwnedValue::Array(ref array) => array.serialize(serializer), - OwnedValue::Geometry(_) => todo!(), + OwnedValue::Geometry(ref geometry) => geometry.to_geojson().serialize(serializer), } } } @@ -296,7 +296,7 @@ impl<'a, V: Value<'a>> From> for OwnedValue { ReferenceValueLeaf::IpAddr(val) => OwnedValue::IpAddr(val), ReferenceValueLeaf::Bool(val) => OwnedValue::Bool(val), ReferenceValueLeaf::PreTokStr(val) => OwnedValue::PreTokStr(*val.clone()), - ReferenceValueLeaf::Geometry(_) => todo!(), + ReferenceValueLeaf::Geometry(val) => OwnedValue::Geometry(*val.clone()), }, ReferenceValue::Array(val) => { OwnedValue::Array(val.map(|v| v.as_value().into()).collect()) diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index aa1e47094e..0caef80caf 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -359,7 +359,8 @@ impl FieldType { None } } - FieldType::Spatial(_) => todo!(), + FieldType::Spatial(_) => None, /* Geometry types cannot be indexed in the inverted + * index. 
*/
         }
diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs
index e44d6e4277..118badd147 100644
--- a/src/space_usage/mod.rs
+++ b/src/space_usage/mod.rs
@@ -69,6 +69,7 @@ pub struct SegmentSpaceUsage {
     positions: PerFieldSpaceUsage,
     fast_fields: PerFieldSpaceUsage,
     fieldnorms: PerFieldSpaceUsage,
+    spatial: PerFieldSpaceUsage,
 
     store: StoreSpaceUsage,
 
@@ -86,6 +87,7 @@ impl SegmentSpaceUsage {
         positions: PerFieldSpaceUsage,
         fast_fields: PerFieldSpaceUsage,
         fieldnorms: PerFieldSpaceUsage,
+        spatial: PerFieldSpaceUsage,
         store: StoreSpaceUsage,
         deletes: ByteCount,
     ) -> SegmentSpaceUsage {
@@ -94,6 +96,7 @@
             + positions.total()
             + fast_fields.total()
             + fieldnorms.total()
+            + spatial.total()
             + store.total()
             + deletes;
         SegmentSpaceUsage {
@@ -103,6 +106,7 @@
             positions,
             fast_fields,
             fieldnorms,
+            spatial,
             store,
             deletes,
             total,
@@ -121,11 +125,11 @@
             Positions => PerField(self.positions().clone()),
             FastFields => PerField(self.fast_fields().clone()),
             FieldNorms => PerField(self.fieldnorms().clone()),
+            Spatial => PerField(self.spatial().clone()),
             Terms => PerField(self.termdict().clone()),
             SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()),
             SegmentComponent::TempStore => ComponentSpaceUsage::Store(self.store().clone()),
             Delete => Basic(self.deletes()),
-            Spatial => todo!(),
         }
     }
 
@@ -159,6 +163,11 @@
         &self.fieldnorms
     }
 
+    /// Space usage for spatial fields
+    pub fn spatial(&self) -> &PerFieldSpaceUsage {
+        &self.spatial
+    }
+
     /// Space usage for stored documents
     pub fn store(&self) -> &StoreSpaceUsage {
         &self.store
diff --git a/src/spatial/bkd.rs b/src/spatial/bkd.rs
index 071556d27b..2fbe5e267c 100644
--- a/src/spatial/bkd.rs
+++ b/src/spatial/bkd.rs
@@ -481,6 +481,7 @@
     Ok(())
 }
 
+#[expect(clippy::too_many_arguments)]
 fn line_intersects_line(
     x1: i32,
     y1: i32,
@@ -572,6 +573,7 @@
     true
 }
 
+#[expect(clippy::too_many_arguments)]
 fn point_in_triangle(
     px: i32,
     py: i32,
diff --git a/src/spatial/geometry.rs b/src/spatial/geometry.rs
index 5a770fc41a..08dd2cf163 100644
--- a/src/spatial/geometry.rs
+++ b/src/spatial/geometry.rs
@@ -353,8 +353,9 @@
                 let ring_count = VInt::deserialize(reader)?.0 as usize;
                 let mut rings = Vec::new();
                 for _ in 0..ring_count {
-                    rings.push(VInt::deserialize(reader)?.0 as usize);
-                    count += 1;
+                    let point_count = VInt::deserialize(reader)?.0 as usize;
+                    rings.push(point_count);
+                    count += point_count;
                 }
                 polygons.push(rings);
             }

From 6da54fa5daf8e9a89cd7bcb6584821a6715d6d50 Mon Sep 17 00:00:00 2001
From: Alan Gutierrez
Date: Tue, 18 Nov 2025 00:22:40 -0600
Subject: [PATCH 14/22] Revert "Remove `radix_select.rs`."

This reverts commit 19eab167b6fa9fccf928217033ccf43c8ca26fb8.

Restore radix select in order to implement a merge solution that will
not require a temporary file.
--- src/spatial/mod.rs | 1 + src/spatial/radix_select.rs | 122 ++++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 src/spatial/radix_select.rs diff --git a/src/spatial/mod.rs b/src/spatial/mod.rs index 180b553012..70c59aafc4 100644 --- a/src/spatial/mod.rs +++ b/src/spatial/mod.rs @@ -3,6 +3,7 @@ pub mod bkd; pub mod delta; pub mod geometry; +pub mod radix_select; pub mod reader; pub mod serializer; pub mod triangle; diff --git a/src/spatial/radix_select.rs b/src/spatial/radix_select.rs new file mode 100644 index 0000000000..2397c9eaa9 --- /dev/null +++ b/src/spatial/radix_select.rs @@ -0,0 +1,122 @@ +//! Radix selection for block kd-tree tree partitioning. +//! +//! Implements byte-wise histogram selection to find median values without comparisons, enabling +//! efficient partitioning of spatial data during block kd-tree construction. Processes values +//! through multiple passes, building histograms for each byte position after a common prefix, +//! avoiding the need to sort or compare elements directly. + +/// Performs radix selection to find the median value without comparisons by building byte-wise +/// histograms. +pub struct RadixSelect { + histogram: [usize; 256], + prefix: Vec, + offset: usize, + nth: usize, +} + +impl RadixSelect { + /// Creates a new radix selector for finding the nth element among values with a common prefix. + /// + /// The offset specifies how many matching elements appeared in previous buckets (from earlier + /// passes). The nth parameter is 0-indexed, so pass 31 to find the 32nd element (median of + /// 64). + pub fn new(prefix: Vec, offset: usize, nth: usize) -> Self { + RadixSelect { + histogram: [0; 256], + prefix, + offset, + nth, + } + } + /// Updates the histogram with a value if it matches the current prefix. + /// + /// Values that don't start with the prefix are ignored. For matching values, increments the + /// count for the byte at position `prefix.len()`. + pub fn update(&mut self, value: i32) { + let bytes = value.to_be_bytes(); + if !bytes.starts_with(&self.prefix) { + return; + } + let byte = bytes[self.prefix.len()]; + self.histogram[byte as usize] += 1; + } + /// Finds which bucket contains the nth element and returns the bucket value and offset. + /// + /// Returns a tuple of `(bucket_byte, count_before)` where bucket_byte is the value of the byte + /// that contains the nth element, and count_before is the number of elements in earlier + /// buckets (becomes the offset for the next pass). 
+ pub fn nth(&self) -> (u8, usize) { + let mut count = self.offset; + for (bucket, &frequency) in self.histogram.iter().enumerate() { + if count + frequency > self.nth as usize { + return (bucket as u8, count); + } + count += frequency; + } + panic!("nth element {} not found in histogram", self.nth); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn radix_selection() { + let dimensions = [ + ( + vec![ + 0x10101010, 0x10101011, 0x10101012, 0x10101013, 0x10101014, 0x10101015, + 0x10101016, 0x10101017, 0x10101018, 0x10101019, 0x1010101A, 0x1010101B, + 0x1010101C, 0x1010101D, 0x1010101E, 0x1010101F, 0x10101020, 0x10101021, + 0x10101022, 0x10101023, 0x10101024, 0x10101025, 0x10101026, 0x10101027, + 0x10101028, 0x10101029, 0x1010102A, 0x1010102B, 0x1010102C, 0x1010102D, + 0x1010102E, 0x1010102F, 0x10101030, 0x10101031, 0x10101032, 0x10101033, + 0x10101034, 0x10101035, 0x10101036, 0x10101037, 0x10101038, 0x10101039, + 0x1010103A, 0x1010103B, 0x1010103C, 0x1010103D, 0x1010103E, 0x1010103F, + 0x10101040, 0x10101041, 0x10101042, 0x10101043, 0x10101044, 0x10101045, + 0x10101046, 0x10101047, 0x10101048, 0x10101049, 0x1010104A, 0x1010104B, + 0x1010104C, 0x1010104D, 0x1010104E, 0x1010104F, + ], + [(0x10, 0), (0x10, 0), (0x10, 0), (0x2F, 31)], + ), + ( + vec![ + 0x10101010, 0x10101011, 0x10101012, 0x10101013, 0x10101014, 0x10101015, + 0x10101016, 0x10101017, 0x10101018, 0x10101019, 0x1010101A, 0x1010101B, + 0x1010101C, 0x1010101D, 0x1010101E, 0x1010101F, 0x10101020, 0x10101021, + 0x10101022, 0x10101023, 0x10101024, 0x20101025, 0x20201026, 0x20301027, + 0x20401028, 0x20501029, 0x2060102A, 0x2070102B, 0x2080102C, 0x2090102D, + 0x20A0102E, 0x20B0102F, 0x20C01030, 0x20D01031, 0x20E01032, 0x20F01033, + 0x20F11034, 0x20F21035, 0x20F31036, 0x20F41037, 0x20F51038, 0x20F61039, + 0x3010103A, 0x3010103B, 0x3010103C, 0x3010103D, 0x3010103E, 0x3010103F, + 0x30101040, 0x30101041, 0x30101042, 0x30101043, 0x30101044, 0x30101045, + 0x30101046, 0x30101047, 0x30101048, 0x30101049, 0x3010104A, 0x3010104B, + 0x3010104C, 0x3010104D, 0x3010104E, 0x3010104F, + ], + [(0x20, 21), (0xB0, 31), (0x10, 31), (0x2F, 31)], + ), + ]; + for (numbers, expected) in dimensions { + let mut offset = 0; + let mut prefix = Vec::new(); + for i in 0..4 { + let mut radix_select = RadixSelect::new(prefix.clone(), offset, 31); + for &number in &numbers { + radix_select.update(number); + } + let (byte, count) = radix_select.nth(); + if i != 3 { + assert_eq!(expected[i].0, byte); + assert_eq!(expected[i].1, count); + } + prefix.push(byte); + offset = count; + } + let mut sorted = numbers.clone(); + sorted.sort(); + let radix_result = i32::from_be_bytes(prefix.as_slice().try_into().unwrap()); + assert_eq!(radix_result, sorted[31]); + } + } +} From d26d6c34fc9e5f3c8fa5d4dc1a402a26f40391e3 Mon Sep 17 00:00:00 2001 From: Alan Gutierrez Date: Tue, 18 Nov 2025 02:11:25 -0600 Subject: [PATCH 15/22] Fix `select_nth_unstable_by_key` midpoint duplicates. Existing code behaved as if the result of `select_nth_unstable_by_key` was either a sorted array or the product of an algorithm that gathered partition values as in the Dutch national flag problem. The existing code was written knowing that the former isn't true and the latter isn't advertised. Knowing, but not remembering. Quite the oversight. 
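A minimal demonstration (standard library only, values arbitrary) of the behavior
the fix has to account for:

    let mut values = [5, 5, 5, 5, 1, 5, 5, 5];
    let mid = values.len() / 2;
    let (_, pivot, _) = values.select_nth_unstable_by_key(mid, |&v| v);
    assert_eq!(*pivot, 5);
    // The only guarantees: `values[mid]` holds the value it would hold in
    // sorted order, everything before it compares <= and everything after
    // compares >=. Copies of the pivot key may therefore sit on both sides
    // of `mid`, so a partition that needs equal keys gathered on one side
    // must sweep them over itself, as this patch now does.
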
---
 src/spatial/bkd.rs | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/src/spatial/bkd.rs b/src/spatial/bkd.rs
index 2fbe5e267c..5cda761825 100644
--- a/src/spatial/bkd.rs
+++ b/src/spatial/bkd.rs
@@ -139,6 +139,7 @@ fn write_leaf_pages(
     triangles: &mut [Triangle],
     write: &mut CountingWriter<W>,
 ) -> io::Result<BuildNode> {
+    // If there are 512 or fewer triangles we are at a leaf; otherwise we are still in the inner nodes.
     if triangles.len() <= 512 {
         let pos = write.written_bytes();
         let mut spreads = [SpreadSurvey::default(); 4];
@@ -197,6 +198,7 @@ fn write_leaf_pages(
             max_spread = current_spread;
         }
     }
+    // Partition the triangles.
     let mid = triangles.len() / 2;
     triangles.select_nth_unstable_by_key(mid, |t| t.words[dimension]);
     let partition = triangles[mid].words[dimension];
@@ -205,12 +207,37 @@ fn write_leaf_pages(
     {
         split_point += 1;
     }
+    // If we reached the end of triangles then all of the triangles share the partition value
+    // for the dimension. We handle this degeneracy by splitting at the midpoint so that we
+    // won't have a leaf with zero triangles.
     if split_point == triangles.len() {
         split_point = mid; // Force split at midpoint index
+    } else {
+        // Our partition does not sort the triangles, it only partitions. We have to scan the
+        // right partition to find all the midpoint values and move them to the left partition.
+        let mut reverse = triangles.len() - 1;
+        loop {
+            // Scan backwards looking for the partition value.
+            while triangles[reverse].words[dimension] != partition {
+                reverse -= 1;
+            }
+            // If we have reached the split point then we are done.
+            if reverse <= split_point {
+                break;
+            }
+            // Swap the midpoint value with our current split point.
+            triangles.swap(split_point, reverse);
+            // Move the split point up one.
+            split_point += 1;
+            // We know that what was at the split point was not the midpoint value.
+            reverse -= 1;
+        }
     }
+    // Split into left and right partitions and create child nodes.
     let (left, right) = triangles.split_at_mut(split_point);
     let left_node = write_leaf_pages(left, write)?;
     let right_node = write_leaf_pages(right, write)?;
+    // Return an inner node.
     Ok(BuildNode::Branch {
         bbox: bounding_box.bbox(),
         left: Box::new(left_node),

From 79622f1f0b17252d8c1269eb85f59696102c7e4f Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Mon, 1 Dec 2025 17:11:31 +0100
Subject: [PATCH 16/22] bugfix

---
 examples/geo_json.rs  | 2 +-
 src/indexer/merger.rs | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/geo_json.rs b/examples/geo_json.rs
index 3990d27671..f3efbc93d1 100644
--- a/examples/geo_json.rs
+++ b/examples/geo_json.rs
@@ -41,7 +41,7 @@ fn main() -> tantivy::Result<()> {
         [(-99.49, 45.56), (-99.45, 45.59)],
         tantivy::query::SpatialQueryType::Intersects,
     );
-    let hits = searcher.search(&query, &TopDocs::with_limit(10))?;
+    let hits = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?;
     for (_score, doc_address) in &hits {
         let retrieved_doc: TantivyDocument = searcher.doc(*doc_address)?;
         if let Some(field_value) = retrieved_doc.get_first(field) {
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index 7630b611f7..a5de676430 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -586,6 +586,7 @@ impl IndexMerger {
                     // Get spatial writer and rebuild block kd-tree.
                    spatial_serializer.serialize_field(field, triangles)?;
                }
+                spatial_serializer.close()?;
            }
        Ok(())
    }

From d8bc0e7c99fc35e044349b03944a63ac6d2eb0d6 Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Wed, 3 Dec 2025 12:41:17 +0100
Subject: [PATCH 17/22] added doc

---
 src/indexer/merger.rs | 33 +++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index a5de676430..152a31cf71 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -1,5 +1,5 @@
 use std::collections::HashMap;
-use std::io::Write;
+use std::io::{BufWriter, Write};
 use std::sync::Arc;

 use columnar::{
@@ -530,6 +530,18 @@ impl IndexMerger {
         serializer: &mut SegmentSerializer,
         doc_id_mapping: &SegmentDocIdMapping,
     ) -> crate::Result<()> {
+        /// Unfortunately, there is no special trick to merge segments.
+        /// We need to rebuild a BKD-tree based off the list of triangles.
+        ///
+        /// Because the data can be large, we do this by writing the sequence of triangles to disk,
+        /// mmapping it as a mutable slice, and calling the same code as is used for the
+        /// segment serialization.
+        ///
+        /// The OS is in charge of deciding how to handle its page cache.
+        /// This is similar to what would happen with swapping,
+        /// except that by explicitly mapping the file, the OS is more likely to
+        /// evict pages, the memory will not be accounted as anonymous memory,
+        /// no swap space needs to be reserved, etc.
         use crate::spatial::bkd::Segment;
         let mut segment_mappings: Vec<Vec<Option<DocId>>> = Vec::new();
         for reader in &self.readers {
             let max_doc = reader.max_doc();
@@ -549,10 +561,10 @@ impl IndexMerger {
         }
         for (segment_ord, reader) in self.readers.iter().enumerate() {
             for (field, temp_file) in &mut temp_files {
+                let mut buf_temp_file = BufWriter::new(temp_file);
                 let spatial_readers = reader.spatial_fields();
-                let spatial_reader = match spatial_readers.get_field(*field)? {
-                    Some(reader) => reader,
-                    None => continue,
+                let Some(spatial_reader) = spatial_readers.get_field(*field)? else {
+                    continue;
                 };
                 let segment = Segment::new(spatial_reader.get_bytes());
                 for triangle_result in LeafPageIterator::new(&segment) {
                     if let Some(new_doc_id) =
                         segment_mappings[segment_ord][triangle.doc_id as usize]
                     {
+                        // This is really just a temporary file, not meant to be portable, so we
+                        // use native endianness here.
                         for &word in &triangle.words {
-                            temp_file.write_all(&word.to_le_bytes())?;
+                            buf_temp_file.write_all(&word.to_ne_bytes())?;
                         }
-                        temp_file.write_all(&new_doc_id.to_le_bytes())?;
+                        buf_temp_file.write_all(&new_doc_id.to_ne_bytes())?;
                     }
                 }
+                buf_temp_file.flush()?;
+                // No need to fsync here. This file is not here for persistence.
             }
         }
         if let Some(mut spatial_serializer) = serializer.extract_spatial_serializer() {
-            for (field, mut temp_file) in temp_files {
-                // Flush and sync triangles.
-                temp_file.flush()?;
-                temp_file.as_file_mut().sync_all()?;
+            for (field, temp_file) in temp_files {
                 // Memory map the triangle file.
                 use memmap2::MmapOptions;
                 let mmap = unsafe { MmapOptions::new().map_mut(temp_file.as_file())? };

From 32beb06382182abb357fd2d8c678a875e79b036b Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Wed, 3 Dec 2025 13:02:10 +0100
Subject: [PATCH 18/22] plastic surgery

---
 src/indexer/merger.rs             |  6 +++---
 src/indexer/segment_serializer.rs |  2 +-
 src/spatial/bkd.rs                | 25 +++++++++++++------------
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index 152a31cf71..1e5caddead 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -533,9 +533,9 @@ impl IndexMerger {
         /// Unfortunately, there is no special trick to merge segments.
         /// We need to rebuild a BKD-tree based off the list of triangles.
         ///
-        /// Because the data can be large, we do this by writing the sequence of triangles to disk,
-        /// mmapping it as a mutable slice, and calling the same code as is used for the
-        /// segment serialization.
+        /// Because the data can be large, we do this by writing the sequence of triangles to
+        /// disk, mmapping it as a mutable slice, and calling the same code as is
+        /// used for the segment serialization.
         ///
         /// The OS is in charge of deciding how to handle its page cache.
         /// This is similar to what would happen with swapping,
diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs
index 473dbd1e80..34b832e7ea 100644
--- a/src/indexer/segment_serializer.rs
+++ b/src/indexer/segment_serializer.rs
@@ -70,7 +70,7 @@ impl SegmentSerializer {
         &mut self.fast_field_write
     }

-    /// HUSH
+    /// Accessor to the `SpatialSerializer`
     pub fn extract_spatial_serializer(&mut self) -> Option<SpatialSerializer> {
         self.spatial_serializer.take()
     }
diff --git a/src/spatial/bkd.rs b/src/spatial/bkd.rs
index 5cda761825..f4d0829cd3 100644
--- a/src/spatial/bkd.rs
+++ b/src/spatial/bkd.rs
@@ -296,6 +296,8 @@ fn write_branch_nodes(
     }
 }

+const VERSION: u16 = 1u16;
+
 /// Builds and serializes a block kd-tree for spatial indexing of triangles.
 ///
 /// Takes a collection of triangles and constructs a complete block kd-tree, writing both the
@@ -314,16 +316,14 @@ pub fn write_block_kd_tree(
     triangles: &mut [Triangle],
     write: &mut CountingWriter<W>,
 ) -> io::Result<()> {
-    assert_eq!(
-        triangles.as_ptr() as usize % std::mem::align_of::<Triangle>(),
-        0
-    );
-    write.write_all(&1u16.to_le_bytes())?;
+    write.write_all(&VERSION.to_le_bytes())?;
+
     let tree = write_leaf_pages(triangles, write)?;
     let current = write.written_bytes();
-    let aligned = (current + 31) & !31;
+    let aligned = current.next_multiple_of(32);
     let padding = aligned - current;
     write.write_all(&vec![0u8; padding as usize])?;
+
     write_leaf_nodes(&tree, write)?;
     let branch_position = write.written_bytes();
     let mut branch_offset: i32 = 0;
@@ -336,15 +336,16 @@ pub fn write_block_kd_tree(
     Ok(())
 }

-fn decompress_leaf(data: &[u8]) -> io::Result<Vec<Triangle>> {
-    let count = u16::from_le_bytes([data[0], data[1]]) as usize;
-    let mut offset = 2;
-    let mut triangles = Vec::with_capacity(count);
-    offset += decompress::(&data[offset..], count, |_, doc_id| {
+fn decompress_leaf(mut data: &[u8]) -> io::Result<Vec<Triangle>> {
+    use common::BinarySerializable;
+    let triangle_count: usize = u16::deserialize(&mut data)? as usize;
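+    // The doc ids and the seven triangle words follow the count as delta-compressed columns.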
+    let mut offset: usize = 0;
+    let mut triangles: Vec<Triangle> = Vec::with_capacity(triangle_count);
+    offset += decompress::(&data[offset..], triangle_count, |_, doc_id| {
         triangles.push(Triangle::skeleton(doc_id))
     })?;
     for i in 0..7 {
-        offset += decompress::(&data[offset..], count, |j, word| {
+        offset += decompress::(&data[offset..], triangle_count, |j, word| {
             triangles[j].words[i] = word
         })?;
     }

From 5d03c600ba5e44f595388927bec9409afb7ad59f Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Wed, 3 Dec 2025 15:21:37 +0100
Subject: [PATCH 19/22] Added bugfix and unit tests

Removed use of robust.
---
 Cargo.toml              |  1 -
 src/spatial/triangle.rs | 89 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 83 insertions(+), 7 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 2cdb25b1c4..558f14378d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -56,7 +56,6 @@ itertools = "0.14.0"
 measure_time = "0.9.0"
 arc-swap = "1.5.0"
 bon = "3.3.1"
-robust = "1.2"
 i_triangle = "0.38.0"

 columnar = { version = "0.6", path = "./columnar", package = "tantivy-columnar" }
diff --git a/src/spatial/triangle.rs b/src/spatial/triangle.rs
index 562d55b85b..4abad4b0b7 100644
--- a/src/spatial/triangle.rs
+++ b/src/spatial/triangle.rs
@@ -6,7 +6,8 @@
 //! recovery when needed.

 use i_triangle::advanced::delaunay::IntDelaunay;
-use robust::{orient2d, Coord};
+
+use crate::DocId;

 const MINY_MINX_MAXY_MAXX_Y_X: i32 = 0;
 const MINY_MINX_Y_X_MAXY_MAXX: i32 = 1;
@@ -58,7 +59,7 @@ pub struct Triangle {
     /// and a reconstruction code.
     pub words: [i32; 7],
     /// The id of the document associated with this triangle.
-    pub doc_id: u32,
+    pub doc_id: DocId,
 }

 impl Triangle {
@@ -69,6 +70,9 @@ impl Triangle {
     /// indicating which edges are polygon boundaries. Returns a triangle struct with the bounding
     /// box in the first four words as `[min_y, min_x, max_y, max_x]`. When decoded, the vertex
     /// order may differ from the original input to `new()` due to normalized rotation.
+    ///
+    /// The edge boundary flags express whether an edge lies on the boundary of the larger
+    /// polygon whose tessellation this triangle belongs to.
     pub fn new(doc_id: u32, triangle: [i32; 6], boundaries: [bool; 3]) -> Self {
         let mut ay = triangle[0];
         let mut ax = triangle[1];
@@ -135,18 +139,19 @@ impl Triangle {
             }
         }
-        // change orientation if CW
-        if orient2d(
+        // change orientation if clockwise (CW)
+        if !is_counter_clockwise(
             Coord { y: ay, x: ax },
             Coord { y: by, x: bx },
             Coord { y: cy, x: cx },
-        ) < 0.0
-        {
+        ) {
+            // To change the orientation, we simply swap B and C.
             let temp_x = bx;
             let temp_y = by;
             let temp_boundary = ab;
-            // ax and ay do not change, ab becomes bc
-            ab = bc;
+            // ax and ay do not change, ab becomes ca
+            ab = ca;
             bx = cx;
             by = cy;
             // bc does not change, ca becomes ab
@@ -328,6 +333,28 @@ pub fn delaunay_to_triangles(doc_id: u32, delaunay: &IntDelaunay, triangles: &mu
     }
 }

+struct Coord {
+    x: i32,
+    y: i32,
+}
+
+/// Returns true if the path A -> B -> C is Counter-Clockwise (CCW) or collinear.
+/// Returns false if it is Clockwise (CW).
+#[inline(always)]
+fn is_counter_clockwise(a: Coord, b: Coord, c: Coord) -> bool {
+    // We calculate the 2D cross product (determinant) of vectors AB and AC.
+    // Formula: (bx - ax)(cy - ay) - (by - ay)(cx - ax)
+
+    // We cast to i64 to prevent overflow, as multiplying two i32s can exceed i32::MAX.
+    let val = (b.x as i64 - a.x as i64) * (c.y as i64 - a.y as i64)
+        - (b.y as i64 - a.y as i64) * (c.x as i64 - a.x as i64);
+
+    // If the result is positive, the triangle is CCW.
+    // If negative, it is CW.
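+    // (Example: a=(0,0), b=(1,0), c=(0,1) gives val = 1*1 - 0*0 = 1 > 0, hence CCW.)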
+ // If zero, the points are collinear (we return true in that case). + val >= 0 +} + #[cfg(test)] mod tests { use i_triangle::i_overlay::i_float::int::point::IntPoint; @@ -355,6 +382,56 @@ mod tests { } } + #[test] + fn test_cw_triangle_boundary_and_coord_flip() { + // 1. Define distinct coordinates for a Clockwise triangle + // Visual layout: + // A(50,40): Top Center-ish + // B(10,60): Bottom Right + // C(20,10): Bottom Left (Has the Minimum X=10) + // Path A->B->C is Clockwise. + let input_coords = [ + 50, 40, // A (y, x) + 10, 60, // B + 20, 10 // C + ]; + + // 2. Define Boundaries [ab, bc, ca] + // We set BC=true and CA=false. + // The bug (ab=bc) would erroneously put 'true' into the first slot. + // The fix (ab=ca) should put 'false' into the first slot. + let input_bounds = [false, true, false]; + + // 3. Encode + let triangle = Triangle::new(1, input_coords, input_bounds); + let (decoded_coords, decoded_bounds) = triangle.decode(); + + // 4. Expected Coordinates + // The internal logic detects CW, swaps B/C to make it CCW: + // A(50,40) -> C(20,10) -> B(10,60) + // Then it rotates to put Min-X first. + // Min X is 10 (Vertex C). + // Final Sequence: C -> B -> A + let expected_coords = [ + 20, 10, // C + 10, 60, // B + 50, 40 // A + ]; + + // 5. Expected Boundaries + // After Flip (A->C->B): + // Edge AC (was CA) = false + // Edge CB (was BC) = true + // Edge BA (was AB) = false + // Unrotated: [false, true, false] + // After Rotation (shifting to start at C): + // Shift left by 1: [true, false, false] + let expected_bounds = [true, false, false]; + + assert_eq!(decoded_coords, expected_coords, "Coordinates did not decode as expected"); + assert_eq!(decoded_bounds, expected_bounds, "Boundary flags were incorrect (likely swap bug)"); + } + #[test] fn degenerate_triangle() { let test_cases = [ From 1619e05bc5e48af51a00862a91d9dc8bdeb638ed Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 3 Dec 2025 16:10:11 +0100 Subject: [PATCH 20/22] plastic surgery --- src/indexer/segment_serializer.rs | 11 +++++-- src/schema/schema.rs | 46 ++++++++++++++++++++++++++-- src/spatial/triangle.rs | 49 ++++++++++++++++++++---------- src/spatial/writer.rs | 50 ++++++++++++++----------------- 4 files changed, 107 insertions(+), 49 deletions(-) diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index 34b832e7ea..a6143f1703 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -37,15 +37,20 @@ impl SegmentSerializer { let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?; let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?; - let spatial_write = segment.open_write(SegmentComponent::Spatial)?; - let spatial_serializer = SpatialSerializer::from_write(spatial_write)?; + let spatial_serializer: Option = + if segment.schema().contains_spatial_field() { + let spatial_write = segment.open_write(SegmentComponent::Spatial)?; + Some(SpatialSerializer::from_write(spatial_write)?) 
+ } else { + None + }; let postings_serializer = InvertedIndexSerializer::open(&mut segment)?; Ok(SegmentSerializer { segment, store_writer, fast_field_write, - spatial_serializer: Some(spatial_serializer), + spatial_serializer, fieldnorms_serializer: Some(fieldnorms_serializer), postings_serializer, }) diff --git a/src/schema/schema.rs b/src/schema/schema.rs index ddfd533e40..24069870db 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -218,9 +218,14 @@ impl SchemaBuilder { /// Finalize the creation of a `Schema` /// This will consume your `SchemaBuilder` pub fn build(self) -> Schema { + let contains_spatial_field = self + .fields + .iter() + .any(|field_entry| field_entry.field_type().value_type() == Type::Spatial); Schema(Arc::new(InnerSchema { fields: self.fields, fields_map: self.fields_map, + contains_spatial_field, })) } } @@ -228,6 +233,7 @@ impl SchemaBuilder { struct InnerSchema { fields: Vec, fields_map: HashMap, // transient + contains_spatial_field: bool, } impl PartialEq for InnerSchema { @@ -378,6 +384,11 @@ impl Schema { } Some((field, json_path)) } + + /// Returns true if the schema contains a spatial field. + pub(crate) fn contains_spatial_field(&self) -> bool { + self.0.contains_spatial_field + } } impl Serialize for Schema { @@ -405,16 +416,16 @@ impl<'de> Deserialize<'de> for Schema { fn visit_seq(self, mut seq: A) -> Result where A: SeqAccess<'de> { - let mut schema = SchemaBuilder { + let mut schema_builder = SchemaBuilder { fields: Vec::with_capacity(seq.size_hint().unwrap_or(0)), fields_map: HashMap::with_capacity(seq.size_hint().unwrap_or(0)), }; while let Some(value) = seq.next_element()? { - schema.add_field(value); + schema_builder.add_field(value); } - Ok(schema.build()) + Ok(schema_builder.build()) } } @@ -1030,4 +1041,33 @@ mod tests { Some((default, "foobar")) ); } + + #[test] + fn test_contains_spatial_field() { + // No spatial field + { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("title", TEXT); + let schema = schema_builder.build(); + assert!(!schema.contains_spatial_field()); + + // Serialization check + let schema_json = serde_json::to_string(&schema).unwrap(); + let schema_deserialized: Schema = serde_json::from_str(&schema_json).unwrap(); + assert!(!schema_deserialized.contains_spatial_field()); + } + // With spatial field + { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("title", TEXT); + schema_builder.add_spatial_field("location", SPATIAL); + let schema = schema_builder.build(); + assert!(schema.contains_spatial_field()); + + // Serialization check + let schema_json = serde_json::to_string(&schema).unwrap(); + let schema_deserialized: Schema = serde_json::from_str(&schema_json).unwrap(); + assert!(schema_deserialized.contains_spatial_field()); + } + } } diff --git a/src/spatial/triangle.rs b/src/spatial/triangle.rs index 4abad4b0b7..5136a683b4 100644 --- a/src/spatial/triangle.rs +++ b/src/spatial/triangle.rs @@ -6,6 +6,7 @@ //! recovery when needed. use i_triangle::advanced::delaunay::IntDelaunay; +use i_triangle::i_overlay::i_float::int::point::IntPoint; use crate::DocId; @@ -141,11 +142,10 @@ impl Triangle { } // change orientation if clockwise (CW) if !is_counter_clockwise( - Coord { y: ay, x: ax }, - Coord { y: by, x: bx }, - Coord { y: cy, x: cx }, - ) - { + IntPoint { y: ay, x: ax }, + IntPoint { y: by, x: bx }, + IntPoint { y: cy, x: cx }, + ) { // To change the orientation, we simply swap B and C. 
             let temp_x = bx;
             let temp_y = by;
@@ -194,6 +194,22 @@ impl Triangle {
         }
     }

+    /// Builds a degenerate triangle for a single point.
+    /// All vertices are that point, and all edges are marked as boundaries.
+    pub fn from_point(doc_id: DocId, point_x: i32, point_y: i32) -> Triangle {
+        Triangle::new(
+            doc_id,
+            [point_y, point_x, point_y, point_x, point_y, point_x],
+            [true, true, true],
+        )
+    }
+
+    /// Builds a degenerate triangle for a line segment.
+    /// Line segment AB is represented as the triangle ABA.
+    pub fn from_line_segment(doc_id: DocId, a_x: i32, a_y: i32, b_x: i32, b_y: i32) -> Triangle {
+        Triangle::new(doc_id, [a_y, a_x, b_y, b_x, a_y, a_x], [true, true, true])
+    }
+
     /// Create a triangle with only the doc_id and the words initialized to zero.
     ///
     /// The doc_id and words in the field are delta-compressed as a series with the doc_id
@@ -333,21 +349,16 @@ pub fn delaunay_to_triangles(doc_id: u32, delaunay: &IntDelaunay, triangles: &mu
     }
 }

-struct Coord {
-    x: i32,
-    y: i32,
-}
-
 /// Returns true if the path A -> B -> C is Counter-Clockwise (CCW) or collinear.
 /// Returns false if it is Clockwise (CW).
 #[inline(always)]
-fn is_counter_clockwise(a: Coord, b: Coord, c: Coord) -> bool {
+fn is_counter_clockwise(a: IntPoint, b: IntPoint, c: IntPoint) -> bool {
     // We calculate the 2D cross product (determinant) of vectors AB and AC.
     // Formula: (bx - ax)(cy - ay) - (by - ay)(cx - ax)

     // We cast to i64 to prevent overflow, as multiplying two i32s can exceed i32::MAX.
     let val = (b.x as i64 - a.x as i64) * (c.y as i64 - a.y as i64)
-        - (b.y as i64 - a.y as i64) * (c.x as i64 - a.x as i64);
+        - (b.y as i64 - a.y as i64) * (c.x as i64 - a.x as i64);

     // If the result is positive, the triangle is CCW.
     // If negative, it is CW.
@@ -393,7 +404,7 @@ mod tests {
         let input_coords = [
             50, 40, // A (y, x)
             10, 60, // B
-            20, 10 // C
+            20, 10, // C
         ];

         // 2. Define Boundaries [ab, bc, ca]
@@ -415,7 +426,7 @@ mod tests {
         let expected_coords = [
             20, 10, // C
             10, 60, // B
-            50, 40 // A
+            50, 40, // A
         ];
         // 5. Expected Boundaries
         // After Flip (A->C->B):
         //   Edge AC (was CA) = false
         //   Edge CB (was BC) = true
         //   Edge BA (was AB) = false
         // Unrotated: [false, true, false]
         // After Rotation (shifting to start at C):
         // Shift left by 1: [true, false, false]
         let expected_bounds = [true, false, false];

-        assert_eq!(decoded_coords, expected_coords, "Coordinates did not decode as expected");
-        assert_eq!(decoded_bounds, expected_bounds, "Boundary flags were incorrect (likely swap bug)");
+        assert_eq!(
+            decoded_coords, expected_coords,
+            "Coordinates did not decode as expected"
+        );
+        assert_eq!(
+            decoded_bounds, expected_bounds,
+            "Boundary flags were incorrect (likely swap bug)"
+        );
     }

     #[test]
     fn degenerate_triangle() {
         let test_cases = [
diff --git a/src/spatial/writer.rs b/src/spatial/writer.rs
index b1584fcd14..a4dec7dba0 100644
--- a/src/spatial/writer.rs
+++ b/src/spatial/writer.rs
@@ -23,27 +23,27 @@ impl SpatialWriter {
         let triangles = &mut self.triangles_by_field.entry(field).or_default();
         match geometry {
             Geometry::Point(point) => {
-                into_point(triangles, doc_id, point);
+                append_point(triangles, doc_id, point);
             }
             Geometry::MultiPoint(multi_point) => {
                 for point in multi_point {
-                    into_point(triangles, doc_id, point);
+                    append_point(triangles, doc_id, point);
                 }
             }
             Geometry::LineString(line_string) => {
-                into_line_string(triangles, doc_id, line_string);
+                append_line_string(triangles, doc_id, line_string);
             }
             Geometry::MultiLineString(multi_line_string) => {
                 for line_string in multi_line_string {
-                    into_line_string(triangles, doc_id, line_string);
+                    append_line_string(triangles, doc_id, line_string);
                 }
             }
             Geometry::Polygon(polygon) => {
-                into_polygon(triangles, doc_id, polygon);
+                append_polygon(triangles, doc_id, &polygon);
             }
             Geometry::MultiPolygon(multi_polygon) => {
                 for polygon in multi_polygon {
-                    into_polygon(triangles, doc_id, polygon);
+                    append_polygon(triangles, doc_id, &polygon);
                 }
             }
             Geometry::GeometryCollection(geometries) => {
@@ -62,7 +62,7 @@ impl SpatialWriter {
             .sum()
     }

-    /// HUSH
+    /// Serialize the collected triangles for each spatial field.
     pub fn serialize(&mut self, mut serializer: SpatialSerializer) -> io::Result<()> {
         for (field, triangles) in &mut self.triangles_by_field {
             serializer.serialize_field(*field, triangles)?;
@@ -81,7 +81,7 @@ impl Default for SpatialWriter {
     }
 }

-/// HUSH
+/// Convert a point of (longitude, latitude) to an integer point.
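+///
+/// For example, `lon = 90.0` maps to `(90.0 / (360.0 / 2^32)).floor() = 2^30`,
+/// roughly half of `i32::MAX` (illustrative arithmetic).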
pub fn as_point_i32(point: (f64, f64)) -> (i32, i32) { ( (point.0 / (360.0 / (1i64 << 32) as f64)).floor() as i32, @@ -89,43 +89,39 @@ pub fn as_point_i32(point: (f64, f64)) -> (i32, i32) { ) } -fn into_point(triangles: &mut Vec, doc_id: DocId, point: (f64, f64)) { +fn append_point(triangles: &mut Vec, doc_id: DocId, point: (f64, f64)) { let point = as_point_i32(point); - triangles.push(Triangle::new( - doc_id, - [point.1, point.0, point.1, point.0, point.1, point.0], - [true, true, true], - )); + triangles.push(Triangle::from_point(doc_id, point.0, point.1)); } -fn into_line_string(triangles: &mut Vec, doc_id: DocId, line_string: Vec<(f64, f64)>) { +fn append_line_string( + triangles: &mut Vec, + doc_id: DocId, + line_string: Vec<(f64, f64)>, +) { let mut previous = as_point_i32(line_string[0]); for point in line_string.into_iter().skip(1) { let point = as_point_i32(point); - triangles.push(Triangle::new( - doc_id, - [ - previous.1, previous.0, point.1, point.0, previous.1, previous.0, - ], - [true, true, true], + triangles.push(Triangle::from_line_segment( + doc_id, previous.0, previous.1, point.0, point.1, )); previous = point } } -fn into_ring(i_polygon: &mut Vec>, ring: Vec<(f64, f64)>) { - let mut i_ring = Vec::new(); - for point in ring { +fn append_ring(i_polygon: &mut Vec>, ring: &[(f64, f64)]) { + let mut i_ring = Vec::with_capacity(ring.len() + 1); + for &point in ring { let point = as_point_i32(point); i_ring.push(IntPoint::new(point.0, point.1)); } i_polygon.push(i_ring); } -fn into_polygon(triangles: &mut Vec, doc_id: DocId, polygon: Vec>) { - let mut i_polygon = Vec::new(); +fn append_polygon(triangles: &mut Vec, doc_id: DocId, polygon: &[Vec<(f64, f64)>]) { + let mut i_polygon: Vec> = Vec::new(); for ring in polygon { - into_ring(&mut i_polygon, ring); + append_ring(&mut i_polygon, ring); } let delaunay = i_polygon.triangulate().into_delaunay(); delaunay_to_triangles(doc_id, &delaunay, triangles); From f85a27068deee345fdfad090f7a18d0b897bd997 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 3 Dec 2025 17:05:16 +0100 Subject: [PATCH 21/22] Introduced geopoint. 
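
Geometry coordinates are now carried by an explicit struct rather than bare
`(f64, f64)` tuples, so the longitude/latitude argument order is explicit at
every call site:

    let p = GeoPoint { lon: -99.49, lat: 45.56 };

(usage sketch; see the updated examples/geo_json.rs below)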
--- examples/geo_json.rs | 4 +- src/fastfield/mod.rs | 15 +++--- src/index/segment_reader.rs | 9 +++- src/indexer/merger.rs | 11 +++-- src/query/spatial_query.rs | 3 +- src/spatial/geometry.rs | 93 +++++++++++++++++++------------------ src/spatial/mod.rs | 1 + src/spatial/point.rs | 9 ++++ src/spatial/reader.rs | 7 ++- src/spatial/writer.rs | 17 +++---- src/spatial/xor.rs | 22 ++++++--- 11 files changed, 116 insertions(+), 75 deletions(-) create mode 100644 src/spatial/point.rs diff --git a/examples/geo_json.rs b/examples/geo_json.rs index f3efbc93d1..adf71e1f8f 100644 --- a/examples/geo_json.rs +++ b/examples/geo_json.rs @@ -1,6 +1,8 @@ +use geo_types::Point; use tantivy::collector::TopDocs; use tantivy::query::SpatialQuery; use tantivy::schema::{Schema, Value, SPATIAL, STORED, TEXT}; +use tantivy::spatial::point::GeoPoint; use tantivy::{Index, IndexWriter, TantivyDocument}; fn main() -> tantivy::Result<()> { let mut schema_builder = Schema::builder(); @@ -38,7 +40,7 @@ fn main() -> tantivy::Result<()> { let field = schema.get_field("geometry").unwrap(); let query = SpatialQuery::new( field, - [(-99.49, 45.56), (-99.45, 45.59)], + [GeoPoint { lon:-99.49, lat: 45.56}, GeoPoint {lon:-99.45, lat: 45.59}], tantivy::query::SpatialQueryType::Intersects, ); let hits = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?; diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 726b9b76a6..0edd118c43 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -683,7 +683,7 @@ mod tests { } #[test] - fn test_datefastfield() -> crate::Result<()> { + fn test_datefastfield() { let mut schema_builder = Schema::builder(); let date_field = schema_builder.add_date_field( "date", @@ -697,22 +697,22 @@ mod tests { ); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_for_tests()?; + let mut index_writer = index.writer_for_tests().unwrap(); index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.add_document(doc!( date_field => DateTime::from_u64(1i64.to_u64()), multi_date_field => DateTime::from_u64(2i64.to_u64()), multi_date_field => DateTime::from_u64(3i64.to_u64()) - ))?; + )).unwrap(); index_writer.add_document(doc!( date_field => DateTime::from_u64(4i64.to_u64()) - ))?; + )).unwrap(); index_writer.add_document(doc!( multi_date_field => DateTime::from_u64(5i64.to_u64()), multi_date_field => DateTime::from_u64(6i64.to_u64()) - ))?; - index_writer.commit()?; - let reader = index.reader()?; + )).unwrap(); + index_writer.commit().unwrap(); + let reader = index.reader().unwrap(); let searcher = reader.searcher(); assert_eq!(searcher.segment_readers().len(), 1); let segment_reader = searcher.segment_reader(0); @@ -746,7 +746,6 @@ mod tests { assert_eq!(dates[0].into_timestamp_nanos(), 5i64); assert_eq!(dates[1].into_timestamp_nanos(), 6i64); } - Ok(()) } #[test] diff --git a/src/index/segment_reader.rs b/src/index/segment_reader.rs index 576a5f83b1..f361e8c277 100644 --- a/src/index/segment_reader.rs +++ b/src/index/segment_reader.rs @@ -180,8 +180,13 @@ impl SegmentReader { let fast_fields_readers = FastFieldReaders::open(fast_fields_data, schema.clone())?; let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?; let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; - let spatial_data = segment.open_read(SegmentComponent::Spatial)?; - let spatial_readers = SpatialReaders::open(spatial_data)?; + let spatial_readers = if schema.contains_spatial_field() { + + let spatial_data = 
segment.open_read(SegmentComponent::Spatial)?;
+            SpatialReaders::open(spatial_data)?
+        } else {
+            SpatialReaders::empty()
+        };

         let original_bitset = if segment.meta().has_deletes() {
             let alive_doc_file_slice = segment.open_read(SegmentComponent::Delete)?;
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index 1e5caddead..60c827ec17 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -530,7 +531,6 @@ impl IndexMerger {
         serializer: &mut SegmentSerializer,
         doc_id_mapping: &SegmentDocIdMapping,
     ) -> crate::Result<()> {
-        /// Unfortunately, there is no special trick to merge segments.
         /// We need to rebuild a BKD-tree based off the list of triangles.
         ///
         /// Because the data can be large, we do this by writing the sequence of triangles to
@@ -543,6 +543,12 @@ impl IndexMerger {
         /// evict pages, the memory will not be accounted as anonymous memory,
         /// no swap space needs to be reserved, etc.
         use crate::spatial::bkd::Segment;
+
+        let Some(mut spatial_serializer) = serializer.extract_spatial_serializer() else {
+            // The schema does not contain any spatial field.
+            return Ok(())
+        };
+
         let mut segment_mappings: Vec<Vec<Option<DocId>>> = Vec::new();
         for reader in &self.readers {
             let max_doc = reader.max_doc();
@@ -586,7 +592,6 @@ impl IndexMerger {
                 // No need to fsync here. This file is not here for persistence.
             }
         }
-        if let Some(mut spatial_serializer) = serializer.extract_spatial_serializer() {
             for (field, temp_file) in temp_files {
                 // Memory map the triangle file.
                 use memmap2::MmapOptions;
                 let mmap = unsafe { MmapOptions::new().map_mut(temp_file.as_file())? };
                 // Cast to &[Triangle] slice
                 let triangle_count = mmap.len() / std::mem::size_of::<Triangle>();
                 let triangles = unsafe {
                     std::slice::from_raw_parts_mut(mmap.as_ptr() as *mut Triangle, triangle_count)
                 };
                 // Get spatial writer and rebuild block kd-tree.
                 spatial_serializer.serialize_field(field, triangles)?;
             }
             spatial_serializer.close()?;
-        }

     Ok(())
 }
diff --git a/src/query/spatial_query.rs b/src/query/spatial_query.rs
index 22b2134bd8..024b941534 100644
--- a/src/query/spatial_query.rs
+++ b/src/query/spatial_query.rs
@@ -6,6 +6,7 @@ use crate::query::explanation::does_not_match;
 use crate::query::{BitSetDocSet, Explanation, Query, Scorer, Weight};
 use crate::schema::Field;
 use crate::spatial::bkd::{search_intersects, Segment};
+use crate::spatial::point::GeoPoint;
 use crate::spatial::writer::as_point_i32;
 use crate::{DocId, DocSet, Score, TantivyError, TERMINATED};

@@ -28,7 +29,7 @@ pub struct SpatialQuery {
 impl SpatialQuery {
     /// Creates a new spatial query over the given field and bounding box.
-    pub fn new(field: Field, bounds: [(f64, f64); 2], query_type: SpatialQueryType) -> Self {
+    pub fn new(field: Field, bounds: [GeoPoint; 2], query_type: SpatialQueryType) -> Self {
         SpatialQuery {
             field,
             bounds: [as_point_i32(bounds[0]), as_point_i32(bounds[1])],
diff --git a/src/spatial/geometry.rs b/src/spatial/geometry.rs
index 08dd2cf163..4ada3360ec 100644
--- a/src/spatial/geometry.rs
+++ b/src/spatial/geometry.rs
@@ -5,6 +5,7 @@ use std::io::{self, Read, Write};
 use common::{BinarySerializable, VInt};
 use serde_json::{json, Map, Value};

+use crate::spatial::point::GeoPoint;
 use crate::spatial::xor::{compress_f64, decompress_f64};

 /// The error type returned when a geometry cannot be interpreted.
@@ -26,17 +27,17 @@ pub enum GeometryError {
 #[derive(Debug, Clone, PartialEq)]
 pub enum Geometry {
     /// A single point.
-    Point((f64, f64)),
+    Point(GeoPoint),
     /// A collection of points.
-    MultiPoint(Vec<(f64, f64)>),
+    MultiPoint(Vec<GeoPoint>),
     /// A sequence of points forming a polyline.
-    LineString(Vec<(f64, f64)>),
+    LineString(Vec<GeoPoint>),
     /// A collection of line strings.
-    MultiLineString(Vec<Vec<(f64, f64)>>),
+    MultiLineString(Vec<Vec<GeoPoint>>),
     /// A polygon as a list of rings: the exterior ring first, then any holes.
Polygon(Vec>), /// HUSH - MultiPolygon(Vec>>), + MultiPolygon(Vec>>), /// HUSH GeometryCollection(Vec), } @@ -137,23 +138,24 @@ impl Geometry { } } - /// HUSH + /// Serialize the geometry to GeoJSON format. + /// https://fr.wikipedia.org/wiki/GeoJSON pub fn to_geojson(&self) -> Map { let mut map = Map::new(); match self { Geometry::Point(point) => { map.insert("type".to_string(), Value::String("Point".to_string())); - let coords = json!([point.0, point.1]); + let coords = json!([point.lon, point.lat]); map.insert("coordinates".to_string(), coords); } Geometry::MultiPoint(points) => { map.insert("type".to_string(), Value::String("MultiPoint".to_string())); - let coords: Vec = points.iter().map(|p| json!([p.0, p.1])).collect(); + let coords: Vec = points.iter().map(|p| json!([p.lon, p.lat])).collect(); map.insert("coordinates".to_string(), Value::Array(coords)); } Geometry::LineString(line) => { map.insert("type".to_string(), Value::String("LineString".to_string())); - let coords: Vec = line.iter().map(|p| json!([p.0, p.1])).collect(); + let coords: Vec = line.iter().map(|p| json!([p.lon, p.lat])).collect(); map.insert("coordinates".to_string(), Value::Array(coords)); } Geometry::MultiLineString(lines) => { @@ -163,7 +165,7 @@ impl Geometry { ); let coords: Vec = lines .iter() - .map(|line| Value::Array(line.iter().map(|p| json!([p.0, p.1])).collect())) + .map(|line| Value::Array(line.iter().map(|p| json!([p.lon, p.lat])).collect())) .collect(); map.insert("coordinates".to_string(), Value::Array(coords)); } @@ -171,7 +173,7 @@ impl Geometry { map.insert("type".to_string(), Value::String("Polygon".to_string())); let coords: Vec = rings .iter() - .map(|ring| Value::Array(ring.iter().map(|p| json!([p.0, p.1])).collect())) + .map(|ring| Value::Array(ring.iter().map(|p| json!([p.lon, p.lat])).collect())) .collect(); map.insert("coordinates".to_string(), Value::Array(coords)); } @@ -187,7 +189,7 @@ impl Geometry { polygon .iter() .map(|ring| { - Value::Array(ring.iter().map(|p| json!([p.0, p.1])).collect()) + Value::Array(ring.iter().map(|p| json!([p.lon, p.lat])).collect()) }) .collect(), ) @@ -218,7 +220,7 @@ fn get_coordinates(object: &Map) -> Result<&Value, GeometryError> Ok(coordinates) } -fn to_point(value: &Value) -> Result<(f64, f64), GeometryError> { +fn to_point(value: &Value) -> Result { let lonlat = value.as_array().ok_or(GeometryError::InvalidStructure( "expected 2 element array pair of lon/lat".to_string(), ))?; @@ -245,10 +247,10 @@ fn to_point(value: &Value) -> Result<(f64, f64), GeometryError> { lat ))); } - Ok((lon, lat)) + Ok(GeoPoint { lon, lat }) } -fn to_line_string(value: &Value) -> Result, GeometryError> { +fn to_line_string(value: &Value) -> Result, GeometryError> { let mut result = Vec::new(); let coordinates = value.as_array().ok_or(GeometryError::InvalidStructure( "expected an array of lon/lat arrays".to_string(), @@ -259,7 +261,7 @@ fn to_line_string(value: &Value) -> Result, GeometryError> { Ok(result) } -fn to_multi_line_string(value: &Value) -> Result>, GeometryError> { +fn to_multi_line_string(value: &Value) -> Result>, GeometryError> { let mut result = Vec::new(); let coordinates = value.as_array().ok_or(GeometryError::InvalidStructure( "expected an array of an array of lon/lat arrays".to_string(), @@ -275,8 +277,8 @@ impl BinarySerializable for Geometry { match self { Geometry::Point(point) => { 0u8.serialize(writer)?; - point.0.serialize(writer)?; - point.1.serialize(writer)?; + point.lon.serialize(writer)?; + point.lat.serialize(writer)?; Ok(()) } 
Geometry::MultiPoint(points) => { @@ -289,7 +291,7 @@ impl BinarySerializable for Geometry { } Geometry::MultiLineString(multi_line_string) => { 3u8.serialize(writer)?; - serialize_polygon(multi_line_string, writer) + serialize_polygon(&multi_line_string[..], writer) } Geometry::Polygon(polygon) => { 4u8.serialize(writer)?; @@ -309,8 +311,8 @@ impl BinarySerializable for Geometry { for polygon in multi_polygon { for ring in polygon { for point in ring { - lon.push(point.0); - lat.push(point.1); + lon.push(point.lon); + lat.push(point.lat); } } } @@ -339,7 +341,7 @@ impl BinarySerializable for Geometry { 0 => { let lon = BinarySerializable::deserialize(reader)?; let lat = BinarySerializable::deserialize(reader)?; - Ok(Geometry::Point((lon, lat))) + Ok(Geometry::Point(GeoPoint { lon, lat })) } 1 => Ok(Geometry::MultiPoint(deserialize_line_string(reader)?)), 2 => Ok(Geometry::LineString(deserialize_line_string(reader)?)), @@ -370,7 +372,10 @@ impl BinarySerializable for Geometry { for point_count in rings { let mut ring = Vec::new(); for _ in 0..point_count { - ring.push((lon[offset], lat[offset])); + ring.push(GeoPoint { + lon: lon[offset], + lat: lat[offset], + }); offset += 1; } polygon.push(ring); @@ -396,15 +401,15 @@ impl BinarySerializable for Geometry { } fn serialize_line_string( - line: &Vec<(f64, f64)>, + line: &[GeoPoint], writer: &mut W, ) -> io::Result<()> { BinarySerializable::serialize(&VInt(line.len() as u64), writer)?; let mut lon = Vec::new(); let mut lat = Vec::new(); for point in line { - lon.push(point.0); - lat.push(point.1); + lon.push(point.lon); + lat.push(point.lat); } let lon = compress_f64(&lon); let lat = compress_f64(&lat); @@ -416,23 +421,23 @@ fn serialize_line_string( } fn serialize_polygon( - line_string: &Vec>, + line_string: &[Vec], writer: &mut W, ) -> io::Result<()> { BinarySerializable::serialize(&VInt(line_string.len() as u64), writer)?; for ring in line_string { BinarySerializable::serialize(&VInt(ring.len() as u64), writer)?; } - let mut lon = Vec::new(); - let mut lat = Vec::new(); + let mut lon: Vec = Vec::new(); + let mut lat: Vec = Vec::new(); for ring in line_string { for point in ring { - lon.push(point.0); - lat.push(point.1); + lon.push(point.lon); + lat.push(point.lat); } } - let lon = compress_f64(&lon); - let lat = compress_f64(&lat); + let lon: Vec = compress_f64(&lon); + let lat: Vec = compress_f64(&lat); VInt(lon.len() as u64).serialize(writer)?; writer.write_all(&lon)?; VInt(lat.len() as u64).serialize(writer)?; @@ -440,20 +445,20 @@ fn serialize_polygon( Ok(()) } -fn deserialize_line_string(reader: &mut R) -> io::Result> { +fn deserialize_line_string(reader: &mut R) -> io::Result> { let point_count = VInt::deserialize(reader)?.0 as usize; let lon_bytes: Vec = BinarySerializable::deserialize(reader)?; let lat_bytes: Vec = BinarySerializable::deserialize(reader)?; - let lon = decompress_f64(&lon_bytes, point_count); - let lat = decompress_f64(&lat_bytes, point_count); - let mut line_string = Vec::new(); + let lon: Vec = decompress_f64(&lon_bytes, point_count); + let lat: Vec = decompress_f64(&lat_bytes, point_count); + let mut line_string: Vec = Vec::new(); for offset in 0..point_count { - line_string.push((lon[offset], lat[offset])); + line_string.push(GeoPoint { lon: lon[offset], lat: lat[offset] }); } Ok(line_string) } -fn deserialize_polygon(reader: &mut R) -> io::Result>> { +fn deserialize_polygon(reader: &mut R) -> io::Result>> { let ring_count = VInt::deserialize(reader)?.0 as usize; let mut rings = Vec::new(); let mut count = 0; 
@@ -464,14 +469,14 @@ fn deserialize_polygon<R: Read>(reader: &mut R) -> io::Result<Vec<Vec<(f64,
     let lon_bytes: Vec<u8> = BinarySerializable::deserialize(reader)?;
     let lat_bytes: Vec<u8> = BinarySerializable::deserialize(reader)?;
-    let lon = decompress_f64(&lon_bytes, count);
-    let lat = decompress_f64(&lat_bytes, count);
-    let mut polygon = Vec::new();
+    let lon: Vec<f64> = decompress_f64(&lon_bytes, count);
+    let lat: Vec<f64> = decompress_f64(&lat_bytes, count);
+    let mut polygon: Vec<Vec<GeoPoint>> = Vec::new();
     let mut offset = 0;
     for point_count in rings {
         let mut ring = Vec::new();
         for _ in 0..point_count {
-            ring.push((lon[offset], lat[offset]));
+            ring.push(GeoPoint {
+                lon: lon[offset],
+                lat: lat[offset],
+            });
             offset += 1;
         }
         polygon.push(ring);
diff --git a/src/spatial/mod.rs b/src/spatial/mod.rs
index 70c59aafc4..4ac391b349 100644
--- a/src/spatial/mod.rs
+++ b/src/spatial/mod.rs
@@ -3,6 +3,7 @@
 pub mod bkd;
 pub mod delta;
 pub mod geometry;
+pub mod point;
 pub mod radix_select;
 pub mod reader;
 pub mod serializer;
diff --git a/src/spatial/point.rs b/src/spatial/point.rs
new file mode 100644
index 0000000000..790fbde90c
--- /dev/null
+++ b/src/spatial/point.rs
@@ -0,0 +1,9 @@
+
+/// A point in the geographical coordinate system.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct GeoPoint {
+    /// Longitude
+    pub lon: f64,
+    /// Latitude
+    pub lat: f64,
+}
diff --git a/src/spatial/reader.rs b/src/spatial/reader.rs
index 9338111ff7..7bf10e3295 100644
--- a/src/spatial/reader.rs
+++ b/src/spatial/reader.rs
@@ -10,12 +10,17 @@ use crate::schema::Field;
 use crate::space_usage::PerFieldSpaceUsage;

 #[derive(Clone)]
-/// HUSH
 pub struct SpatialReaders {
     data: Arc<CompositeFile>,
 }

 impl SpatialReaders {
+    pub fn empty() -> SpatialReaders {
+        SpatialReaders {
+            data: Arc::new(CompositeFile::empty()),
+        }
+    }
+
     /// Opens the spatial readers from the given file slice.
     pub fn open(file: FileSlice) -> crate::Result<SpatialReaders> {
         let data = CompositeFile::open(&file)?;
diff --git a/src/spatial/writer.rs b/src/spatial/writer.rs
index a4dec7dba0..384afb83e1 100644
--- a/src/spatial/writer.rs
+++ b/src/spatial/writer.rs
@@ -7,6 +7,7 @@ use i_triangle::int::triangulatable::IntTriangulatable;

 use crate::schema::Field;
 use crate::spatial::geometry::Geometry;
+use crate::spatial::point::GeoPoint;
 use crate::spatial::serializer::SpatialSerializer;
 use crate::spatial::triangle::{delaunay_to_triangles, Triangle};
 use crate::DocId;
@@ -81,7 +81,7 @@ impl Default for SpatialWriter {
     }
 }

-/// Convert a point of (longitude, latitude) to an integer point.
-pub fn as_point_i32(point: (f64, f64)) -> (i32, i32) {
+/// Convert a point of `(longitude, latitude)` to an integer point.
+pub fn as_point_i32(point: GeoPoint) -> (i32, i32) {
     (
-        (point.0 / (360.0 / (1i64 << 32) as f64)).floor() as i32,
-        (point.1 / (180.0 / (1i64 << 32) as f64)).floor() as i32,
+        (point.lon / (360.0 / (1i64 << 32) as f64)).floor() as i32,
+        (point.lat / (180.0 / (1i64 << 32) as f64)).floor() as i32,
     )
 }

-fn append_point(triangles: &mut Vec<Triangle>, doc_id: DocId, point: (f64, f64)) {
+fn append_point(triangles: &mut Vec<Triangle>, doc_id: DocId, point: GeoPoint) {
     let point = as_point_i32(point);
     triangles.push(Triangle::from_point(doc_id, point.0, point.1));
 }
@@ -97,7 +98,7 @@ fn append_line_string(
     triangles: &mut Vec<Triangle>,
     doc_id: DocId,
-    line_string: Vec<(f64, f64)>,
+    line_string: Vec<GeoPoint>,
 ) {
     let mut previous = as_point_i32(line_string[0]);
     for point in line_string.into_iter().skip(1) {
@@ -109,7 +110,7 @@ fn append_line_string(
     }
 }

-fn append_ring(i_polygon: &mut Vec<Vec<IntPoint>>, ring: &[(f64, f64)]) {
+fn append_ring(i_polygon: &mut Vec<Vec<IntPoint>>, ring: &[GeoPoint]) {
     let mut i_ring = Vec::with_capacity(ring.len() + 1);
     for &point in ring {
         let point = as_point_i32(point);
@@ -118,7 +119,7 @@ fn append_ring(
     i_polygon.push(i_ring);
 }

-fn append_polygon(triangles: &mut Vec<Triangle>, doc_id: DocId, polygon: &[Vec<(f64, f64)>]) {
+fn append_polygon(triangles: &mut Vec<Triangle>, doc_id: DocId, polygon: &[Vec<GeoPoint>]) {
     let mut i_polygon: Vec<Vec<IntPoint>> = Vec::new();
     for ring in polygon {
         append_ring(&mut i_polygon, ring);
diff --git a/src/spatial/xor.rs b/src/spatial/xor.rs
index af069a43a2..c9e6fb2804 100644
--- a/src/spatial/xor.rs
+++ b/src/spatial/xor.rs
@@ -18,7 +18,7 @@
 //! Unlike delta.rs which uses arithmetic deltas for i32 spatial coordinates in the block kd-tree,
 //! this module operates on f64 bit patterns directly to preserve exact floating-point values for
 //! returning to users.
-use std::io::{Cursor, Read};
+use std::io::Read;

 use common::VInt;

@@ -34,8 +34,8 @@ pub fn compress_f64(values: &[f64]) -> Vec<u8> {
     if values.is_empty() {
         return Vec::new();
     }
-    let mut output = Vec::new();
-    let mut previous = values[0].to_bits();
+    let mut output: Vec<u8> = Vec::new();
+    let mut previous: u64 = f64_to_le(values[0]);
     output.extend_from_slice(&previous.to_le_bytes());
     for &value in &values[1..] {
         let bits = value.to_bits();
@@ -46,13 +46,21 @@ pub fn compress_f64(values: &[f64]) -> Vec<u8> {
     if output.len() >= values.len() * 8 {
         let mut output = Vec::with_capacity(values.len() * 8);
         for &value in values {
-            output.extend_from_slice(&value.to_bits().to_le_bytes());
+            output.extend_from_slice(&f64_to_le(value).to_le_bytes());
         }
         return output;
     }
     output
 }

+fn f64_to_le(value: f64) -> u64 {
+    u64::from_le_bytes(value.to_le_bytes())
+}
+
+fn f64_from_le(value: u64) -> f64 {
+    f64::from_le_bytes(value.to_le_bytes())
+}
+
 /// Decompresses f64 coordinates from XOR delta or raw encoding.
 ///
 /// Detects compression format by byte length - if `bytes.len() == count * 8`, data is raw and
 /// each 8-byte chunk is a direct f64 bit pattern; otherwise data is XOR compressed and decoded by
 /// reconstructing the original sequence.
 ///
 /// Returns exact f64 values that were passed to `compress_f64()`.
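 ///
 /// A round-trip sketch with illustrative values:
 ///
 /// ```ignore
 /// let values = vec![45.56, 45.57, 45.58];
 /// let bytes = compress_f64(&values);
 /// assert_eq!(decompress_f64(&bytes, values.len()), values);
 /// ```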
-pub fn decompress_f64(bytes: &[u8], count: usize) -> Vec { +pub fn decompress_f64(mut bytes: &[u8], count: usize) -> Vec { let mut values = Vec::with_capacity(count); if bytes.len() == count * 8 { for i in 0..count { let bits = u64::from_le_bytes(bytes[i * 8..(i + 1) * 8].try_into().unwrap()); - values.push(f64::from_bits(bits)); + values.push(f64_from_le(bits)); } return values; } - let mut cursor = Cursor::new(bytes); + let mut cursor: &mut &[u8] = &mut bytes; // Read first value (raw 8 bytes) let mut first_bytes = [0u8; 8]; From 643639f14b5f6ac7dfd5053a0e1b5ad912fe106c Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 3 Dec 2025 17:05:27 +0100 Subject: [PATCH 22/22] Introduced geopoint. --- examples/geo_json.rs | 11 ++++++++++- src/fastfield/mod.rs | 30 ++++++++++++++++++------------ src/index/segment_reader.rs | 9 ++++----- src/indexer/merger.rs | 28 ++++++++++++++-------------- src/spatial/geometry.rs | 19 ++++++++++++------- src/spatial/point.rs | 1 - src/spatial/writer.rs | 6 +----- 7 files changed, 59 insertions(+), 45 deletions(-) diff --git a/examples/geo_json.rs b/examples/geo_json.rs index adf71e1f8f..5829c22d24 100644 --- a/examples/geo_json.rs +++ b/examples/geo_json.rs @@ -40,7 +40,16 @@ fn main() -> tantivy::Result<()> { let field = schema.get_field("geometry").unwrap(); let query = SpatialQuery::new( field, - [GeoPoint { lon:-99.49, lat: 45.56}, GeoPoint {lon:-99.45, lat: 45.59}], + [ + GeoPoint { + lon: -99.49, + lat: 45.56, + }, + GeoPoint { + lon: -99.45, + lat: 45.59, + }, + ], tantivy::query::SpatialQueryType::Intersects, ); let hits = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?; diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 0edd118c43..f01e891f50 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -699,18 +699,24 @@ mod tests { let index = Index::create_in_ram(schema); let mut index_writer = index.writer_for_tests().unwrap(); index_writer.set_merge_policy(Box::new(NoMergePolicy)); - index_writer.add_document(doc!( - date_field => DateTime::from_u64(1i64.to_u64()), - multi_date_field => DateTime::from_u64(2i64.to_u64()), - multi_date_field => DateTime::from_u64(3i64.to_u64()) - )).unwrap(); - index_writer.add_document(doc!( - date_field => DateTime::from_u64(4i64.to_u64()) - )).unwrap(); - index_writer.add_document(doc!( - multi_date_field => DateTime::from_u64(5i64.to_u64()), - multi_date_field => DateTime::from_u64(6i64.to_u64()) - )).unwrap(); + index_writer + .add_document(doc!( + date_field => DateTime::from_u64(1i64.to_u64()), + multi_date_field => DateTime::from_u64(2i64.to_u64()), + multi_date_field => DateTime::from_u64(3i64.to_u64()) + )) + .unwrap(); + index_writer + .add_document(doc!( + date_field => DateTime::from_u64(4i64.to_u64()) + )) + .unwrap(); + index_writer + .add_document(doc!( + multi_date_field => DateTime::from_u64(5i64.to_u64()), + multi_date_field => DateTime::from_u64(6i64.to_u64()) + )) + .unwrap(); index_writer.commit().unwrap(); let reader = index.reader().unwrap(); let searcher = reader.searcher(); diff --git a/src/index/segment_reader.rs b/src/index/segment_reader.rs index f361e8c277..1afe55b2ba 100644 --- a/src/index/segment_reader.rs +++ b/src/index/segment_reader.rs @@ -181,12 +181,11 @@ impl SegmentReader { let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?; let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; let spatial_readers = if schema.contains_spatial_field() { - let spatial_data = segment.open_read(SegmentComponent::Spatial)?; - 
SpatialReaders::open(spatial_data)? - } else { - SpatialReaders::empty() - }; + SpatialReaders::open(spatial_data)? + } else { + SpatialReaders::empty() + }; let original_bitset = if segment.meta().has_deletes() { let alive_doc_file_slice = segment.open_read(SegmentComponent::Delete)?; diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 60c827ec17..6af60d361e 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -546,7 +546,7 @@ impl IndexMerger { let Some(mut spatial_serializer) = serializer.extract_spatial_serializer() else { // The schema does not contain any spatial field. - return Ok(()) + return Ok(()); }; let mut segment_mappings: Vec>> = Vec::new(); @@ -592,19 +592,19 @@ impl IndexMerger { // No need to fsync here. This file is not here for persistency. } } - for (field, temp_file) in temp_files { - // Memory map the triangle file. - use memmap2::MmapOptions; - let mmap = unsafe { MmapOptions::new().map_mut(temp_file.as_file())? }; - // Cast to &[Triangle] slice - let triangle_count = mmap.len() / std::mem::size_of::(); - let triangles = unsafe { - std::slice::from_raw_parts_mut(mmap.as_ptr() as *mut Triangle, triangle_count) - }; - // Get spatial writer and rebuild block kd-tree. - spatial_serializer.serialize_field(field, triangles)?; - } - spatial_serializer.close()?; + for (field, temp_file) in temp_files { + // Memory map the triangle file. + use memmap2::MmapOptions; + let mmap = unsafe { MmapOptions::new().map_mut(temp_file.as_file())? }; + // Cast to &[Triangle] slice + let triangle_count = mmap.len() / std::mem::size_of::(); + let triangles = unsafe { + std::slice::from_raw_parts_mut(mmap.as_ptr() as *mut Triangle, triangle_count) + }; + // Get spatial writer and rebuild block kd-tree. + spatial_serializer.serialize_field(field, triangles)?; + } + spatial_serializer.close()?; Ok(()) } diff --git a/src/spatial/geometry.rs b/src/spatial/geometry.rs index 4ada3360ec..091f9688c9 100644 --- a/src/spatial/geometry.rs +++ b/src/spatial/geometry.rs @@ -189,7 +189,9 @@ impl Geometry { polygon .iter() .map(|ring| { - Value::Array(ring.iter().map(|p| json!([p.lon, p.lat])).collect()) + Value::Array( + ring.iter().map(|p| json!([p.lon, p.lat])).collect(), + ) }) .collect(), ) @@ -400,10 +402,7 @@ impl BinarySerializable for Geometry { } } -fn serialize_line_string( - line: &[GeoPoint], - writer: &mut W, -) -> io::Result<()> { +fn serialize_line_string(line: &[GeoPoint], writer: &mut W) -> io::Result<()> { BinarySerializable::serialize(&VInt(line.len() as u64), writer)?; let mut lon = Vec::new(); let mut lat = Vec::new(); @@ -453,7 +452,10 @@ fn deserialize_line_string(reader: &mut R) -> io::Result> let lat: Vec = decompress_f64(&lat_bytes, point_count); let mut line_string: Vec = Vec::new(); for offset in 0..point_count { - line_string.push(GeoPoint { lon: lon[offset], lat: lat[offset] }); + line_string.push(GeoPoint { + lon: lon[offset], + lat: lat[offset], + }); } Ok(line_string) } @@ -476,7 +478,10 @@ fn deserialize_polygon(reader: &mut R) -> io::Result> for point_count in rings { let mut ring = Vec::new(); for _ in 0..point_count { - ring.push(GeoPoint { lon: lon[offset], lat: lat[offset] }); + ring.push(GeoPoint { + lon: lon[offset], + lat: lat[offset], + }); offset += 1; } polygon.push(ring); diff --git a/src/spatial/point.rs b/src/spatial/point.rs index 790fbde90c..93981b2ad2 100644 --- a/src/spatial/point.rs +++ b/src/spatial/point.rs @@ -1,4 +1,3 @@ - /// A point in the geographical coordinate system. 
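 /// Longitude and latitude are expressed in degrees, in the `(lon, lat)` order
 /// used by GeoJSON coordinates.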
#[derive(Debug, Clone, Copy, PartialEq)] pub struct GeoPoint { diff --git a/src/spatial/writer.rs b/src/spatial/writer.rs index 384afb83e1..5c8ff87e23 100644 --- a/src/spatial/writer.rs +++ b/src/spatial/writer.rs @@ -95,11 +95,7 @@ fn append_point(triangles: &mut Vec, doc_id: DocId, point: GeoPoint) { triangles.push(Triangle::from_point(doc_id, point.0, point.1)); } -fn append_line_string( - triangles: &mut Vec, - doc_id: DocId, - line_string: Vec, -) { +fn append_line_string(triangles: &mut Vec, doc_id: DocId, line_string: Vec) { let mut previous = as_point_i32(line_string[0]); for point in line_string.into_iter().skip(1) { let point = as_point_i32(point);