Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store List of Fields in Segment #2279

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ aho-corasick = "1.0"
tantivy-fst = "0.5"
memmap2 = { version = "0.9.0", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false }
zstd = { version = "0.13", default-features = false }
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
Expand Down Expand Up @@ -105,7 +105,7 @@ mmap = ["fs4", "tempfile", "memmap2"]
stopwords = []

lz4-compression = ["lz4_flex"]
zstd-compression = ["zstd"]
zstd-compression = []

failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches.
Expand Down
2 changes: 1 addition & 1 deletion columnar/src/columnar/column_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ impl ColumnType {
self == &ColumnType::DateTime
}

pub(crate) fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> {
pub fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> {
COLUMN_TYPES.get(code as usize).copied().ok_or(InvalidData)
}
}
Expand Down
16 changes: 13 additions & 3 deletions columnar/src/columnar/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ impl ColumnarWriter {
num_docs: RowId,
old_to_new_row_ids: Option<&[RowId]>,
wrt: &mut dyn io::Write,
) -> io::Result<()> {
) -> io::Result<Vec<(String, ColumnType)>> {
let mut serializer = ColumnarSerializer::new(wrt);
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
.numerical_field_hash_map
Expand Down Expand Up @@ -374,7 +374,9 @@ impl ColumnarWriter {

let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
for (column_name, column_type, addr) in columns {
for (column_name, column_type, addr) in columns.iter() {
let column_type = *column_type;
let addr = *addr;
match column_type {
ColumnType::Bool => {
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
Expand Down Expand Up @@ -485,7 +487,15 @@ impl ColumnarWriter {
};
}
serializer.finalize(num_docs)?;
Ok(())
Ok(columns
.into_iter()
.map(|(column_name, column_type, _)| {
(
String::from_utf8_lossy(column_name).to_string(),
column_type,
)
})
.collect())
}
}

Expand Down
2 changes: 1 addition & 1 deletion common/src/bitset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use ownedbytes::OwnedBytes;

use crate::ByteCount;

#[derive(Clone, Copy, Eq, PartialEq)]
#[derive(Clone, Copy, Eq, PartialEq, Hash)]
pub struct TinySet(u64);

impl fmt::Debug for TinySet {
Expand Down
1 change: 1 addition & 0 deletions src/core/index_meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ impl SegmentMeta {
SegmentComponent::FastFields => ".fast".to_string(),
SegmentComponent::FieldNorms => ".fieldnorm".to_string(),
SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
SegmentComponent::FieldList => ".fieldlist".to_string(),
});
PathBuf::from(path)
}
Expand Down
2 changes: 1 addition & 1 deletion src/core/inverted_index_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ impl InvertedIndexReader {
&self.termdict
}

/// Return the fields and types encoded in the dictionary in lexicographic oder.
/// Return the fields and types encoded in the dictionary in lexicographic order.
/// Only valid on JSON fields.
///
/// Notice: This requires a full scan and is therefore **very expensive**.
Expand Down
18 changes: 10 additions & 8 deletions src/core/json_utils.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use columnar::MonotonicallyMappableToU64;
use columnar::{ColumnType, MonotonicallyMappableToU64};
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;

Expand Down Expand Up @@ -153,7 +153,7 @@ fn index_json_value<'a, V: Value<'a>>(
let mut token_stream = text_analyzer.token_stream(val);
let unordered_id = ctx
.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str());
.get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::Str);

// TODO: make sure the chain position works out.
set_path_id(term_buffer, unordered_id);
Expand All @@ -171,7 +171,7 @@ fn index_json_value<'a, V: Value<'a>>(
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
.get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::U64),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
Expand All @@ -180,7 +180,7 @@ fn index_json_value<'a, V: Value<'a>>(
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
.get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::I64),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
Expand All @@ -189,7 +189,7 @@ fn index_json_value<'a, V: Value<'a>>(
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
.get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::F64),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
Expand All @@ -198,16 +198,18 @@ fn index_json_value<'a, V: Value<'a>>(
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
.get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::Bool),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::Date(val) => {
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
ctx.path_to_unordered_id.get_or_allocate_unordered_id(
json_path_writer.as_str(),
ColumnType::DateTime,
),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
Expand Down
5 changes: 4 additions & 1 deletion src/core/segment_component.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,14 @@ pub enum SegmentComponent {
/// Bitset describing which document of the segment is alive.
/// (It was representing deleted docs but changed to represent alive docs from v0.17)
Delete,
/// Field list describing the fields in the segment.
FieldList,
}

impl SegmentComponent {
/// Iterates through the components.
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
static SEGMENT_COMPONENTS: [SegmentComponent; 9] = [
SegmentComponent::Postings,
SegmentComponent::Positions,
SegmentComponent::FastFields,
Expand All @@ -41,6 +43,7 @@ impl SegmentComponent {
SegmentComponent::Store,
SegmentComponent::TempStore,
SegmentComponent::Delete,
SegmentComponent::FieldList,
];
SEGMENT_COMPONENTS.iter()
}
Expand Down
104 changes: 22 additions & 82 deletions src/core/segment_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@ use std::ops::BitOrAssign;
use std::sync::{Arc, RwLock};
use std::{fmt, io};

use fnv::FnvHashMap;
use itertools::Itertools;

use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
use crate::field_list::read_split_fields;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::json_utils::json_path_sep_to_dot;
use crate::schema::{Field, IndexRecordOption, Schema, Type};
use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader;
Expand Down Expand Up @@ -44,6 +43,7 @@ pub struct SegmentReader {
fast_fields_readers: FastFieldReaders,
fieldnorm_readers: FieldNormReaders,

list_fields_file: Option<FileSlice>, // Optional field list file for backwards compatibility
store_file: FileSlice,
alive_bitset_opt: Option<AliveBitSet>,
schema: Schema,
Expand Down Expand Up @@ -153,6 +153,7 @@ impl SegmentReader {
let termdict_composite = CompositeFile::open(&termdict_file)?;

let store_file = segment.open_read(SegmentComponent::Store)?;
let list_fields_file = segment.open_read(SegmentComponent::FieldList).ok();

crate::fail_point!("SegmentReader::open#middle");

Expand Down Expand Up @@ -201,6 +202,7 @@ impl SegmentReader {
segment_id: segment.id(),
delete_opstamp: segment.meta().delete_opstamp(),
store_file,
list_fields_file,
alive_bitset_opt,
positions_composite,
schema,
Expand Down Expand Up @@ -299,87 +301,25 @@ impl SegmentReader {
/// field that is not indexed nor a fast field but is stored, it is possible for the field
/// to not be listed.
pub fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> {
let mut indexed_fields: Vec<FieldMetadata> = Vec::new();
let mut map_to_canonical = FnvHashMap::default();
for (field, field_entry) in self.schema().fields() {
let field_name = field_entry.name().to_string();
let is_indexed = field_entry.is_indexed();

if is_indexed {
let is_json = field_entry.field_type().value_type() == Type::Json;
if is_json {
let inv_index = self.inverted_index(field)?;
let encoded_fields_in_index = inv_index.list_encoded_fields()?;
let mut build_path = |field_name: &str, mut json_path: String| {
// In this case we need to map the potential fast field to the field name
// accepted by the query parser.
let create_canonical =
!field_entry.is_expand_dots_enabled() && json_path.contains('.');
if create_canonical {
// Without expand dots enabled dots need to be escaped.
let escaped_json_path = json_path.replace('.', "\\.");
let full_path = format!("{}.{}", field_name, escaped_json_path);
let full_path_unescaped = format!("{}.{}", field_name, &json_path);
map_to_canonical.insert(full_path_unescaped, full_path.to_string());
full_path
} else {
// With expand dots enabled, we can use '.' instead of '\u{1}'.
json_path_sep_to_dot(&mut json_path);
format!("{}.{}", field_name, json_path)
}
};
indexed_fields.extend(
encoded_fields_in_index
.into_iter()
.map(|(name, typ)| (build_path(&field_name, name), typ))
.map(|(field_name, typ)| FieldMetadata {
indexed: true,
stored: false,
field_name,
fast: false,
typ,
}),
);
} else {
indexed_fields.push(FieldMetadata {
indexed: true,
stored: false,
field_name: field_name.to_string(),
fast: false,
typ: field_entry.field_type().value_type(),
});
}
}
if let Some(list_fields_file) = self.list_fields_file.as_ref() {
let file = list_fields_file.read_bytes()?;
let fields_metadata =
read_split_fields(file)?.collect::<io::Result<Vec<FieldMetadata>>>();
fields_metadata.map_err(|e| e.into())
} else {
// Schema fallback
Ok(self
.schema()
.fields()
.map(|(_field, entry)| FieldMetadata {
field_name: entry.name().to_string(),
typ: entry.field_type().value_type(),
indexed: entry.is_indexed(),
stored: entry.is_stored(),
fast: entry.is_fast(),
})
.collect())
}
Comment on lines +304 to 322
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is admittedly code golfing, but I think it makes it a bit less noisy:

Suggested change
if let Some(list_fields_file) = self.list_fields_file.as_ref() {
let file = list_fields_file.read_bytes()?;
let fields_metadata =
read_split_fields(file)?.collect::<io::Result<Vec<FieldMetadata>>>();
fields_metadata.map_err(|e| e.into())
} else {
// Schema fallback
Ok(self
.schema()
.fields()
.map(|(_field, entry)| FieldMetadata {
field_name: entry.name().to_string(),
typ: entry.field_type().value_type(),
indexed: entry.is_indexed(),
stored: entry.is_stored(),
fast: entry.is_fast(),
})
.collect())
}
let fields_metadata = if let Some(list_fields_file) = self.list_fields_file.as_ref() {
let file = list_fields_file.read_bytes()?;
read_split_fields(file)?.collect::<io::Result<Vec<FieldMetadata>>>()?
} else {
// Schema fallback
self.schema()
.fields()
.map(|(_field, entry)| FieldMetadata {
field_name: entry.name().to_string(),
typ: entry.field_type().value_type(),
indexed: entry.is_indexed(),
stored: entry.is_stored(),
fast: entry.is_fast(),
})
.collect()
};
Ok(fields_metadata)

let mut fast_fields: Vec<FieldMetadata> = self
.fast_fields()
.columnar()
.iter_columns()?
.map(|(mut field_name, handle)| {
json_path_sep_to_dot(&mut field_name);
// map to canonical path, to avoid similar but different entries.
// Eventually we should just accept '.' separated for all cases.
let field_name = map_to_canonical
.get(&field_name)
.unwrap_or(&field_name)
.to_string();
FieldMetadata {
indexed: false,
stored: false,
field_name,
fast: true,
typ: Type::from(handle.column_type()),
}
})
.collect();
// Since the type is encoded differently in the fast field and in the inverted index,
// the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
// If we are sure that the order is the same, we can remove this sort.
indexed_fields.sort_unstable();
fast_fields.sort_unstable();
let merged = merge_field_meta_data(vec![indexed_fields, fast_fields], &self.schema);

Ok(merged)
}

/// Returns the segment id
Expand Down
10 changes: 7 additions & 3 deletions src/fastfield/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -238,13 +238,17 @@ impl FastFieldsWriter {
mut self,
wrt: &mut dyn io::Write,
doc_id_map_opt: Option<&DocIdMapping>,
) -> io::Result<()> {
) -> io::Result<Vec<(String, Type)>> {
let num_docs = self.num_docs;
let old_to_new_row_ids =
doc_id_map_opt.map(|doc_id_mapping| doc_id_mapping.old_to_new_ids());
self.columnar_writer
let columns = self
.columnar_writer
.serialize(num_docs, old_to_new_row_ids, wrt)?;
Ok(())
Ok(columns
.into_iter()
.map(|(field_name, column)| (field_name.to_string(), column.into()))
.collect())
}
}

Expand Down
Loading
Loading