Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store List of Fields in Segment #2279

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ aho-corasick = "1.0"
tantivy-fst = "0.5"
memmap2 = { version = "0.9.0", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false }
zstd = { version = "0.13", default-features = false }
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
Expand Down Expand Up @@ -105,7 +105,7 @@ mmap = ["fs4", "tempfile", "memmap2"]
stopwords = []

lz4-compression = ["lz4_flex"]
zstd-compression = ["zstd"]
zstd-compression = []

failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches.
Expand Down
2 changes: 1 addition & 1 deletion columnar/src/columnar/column_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ impl ColumnType {
self == &ColumnType::DateTime
}

pub(crate) fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> {
pub fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> {
COLUMN_TYPES.get(code as usize).copied().ok_or(InvalidData)
}
}
Expand Down
16 changes: 13 additions & 3 deletions columnar/src/columnar/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ impl ColumnarWriter {
num_docs: RowId,
old_to_new_row_ids: Option<&[RowId]>,
wrt: &mut dyn io::Write,
) -> io::Result<()> {
) -> io::Result<Vec<(String, ColumnType)>> {
let mut serializer = ColumnarSerializer::new(wrt);
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
.numerical_field_hash_map
Expand Down Expand Up @@ -374,7 +374,9 @@ impl ColumnarWriter {

let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
for (column_name, column_type, addr) in columns {
for (column_name, column_type, addr) in columns.iter() {
let column_type = *column_type;
let addr = *addr;
match column_type {
ColumnType::Bool => {
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
Expand Down Expand Up @@ -485,7 +487,15 @@ impl ColumnarWriter {
};
}
serializer.finalize(num_docs)?;
Ok(())
Ok(columns
.into_iter()
.map(|(column_name, column_type, _)| {
(
String::from_utf8_lossy(column_name).to_string(),
column_type,
)
})
.collect())
}
}

Expand Down
2 changes: 1 addition & 1 deletion common/src/bitset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use ownedbytes::OwnedBytes;

use crate::ByteCount;

#[derive(Clone, Copy, Eq, PartialEq)]
#[derive(Clone, Copy, Eq, PartialEq, Hash)]
pub struct TinySet(u64);

impl fmt::Debug for TinySet {
Expand Down
1 change: 1 addition & 0 deletions src/core/index_meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ impl SegmentMeta {
SegmentComponent::FastFields => ".fast".to_string(),
SegmentComponent::FieldNorms => ".fieldnorm".to_string(),
SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
SegmentComponent::FieldList => ".fieldlist".to_string(),
});
PathBuf::from(path)
}
Expand Down
2 changes: 1 addition & 1 deletion src/core/inverted_index_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ impl InvertedIndexReader {
&self.termdict
}

/// Return the fields and types encoded in the dictionary in lexicographic oder.
/// Return the fields and types encoded in the dictionary in lexicographic order.
/// Only valid on JSON fields.
///
/// Notice: This requires a full scan and is therefore **very expensive**.
Expand Down
18 changes: 10 additions & 8 deletions src/core/json_utils.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use columnar::MonotonicallyMappableToU64;
use columnar::{ColumnType, MonotonicallyMappableToU64};
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;

Expand Down Expand Up @@ -153,7 +153,7 @@ fn index_json_value<'a, V: Value<'a>>(
let mut token_stream = text_analyzer.token_stream(val);
let unordered_id = ctx
.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str());
.get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::Str);

// TODO: make sure the chain position works out.
set_path_id(term_buffer, unordered_id);
Expand All @@ -171,7 +171,7 @@ fn index_json_value<'a, V: Value<'a>>(
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
.get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::U64),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
Expand All @@ -180,7 +180,7 @@ fn index_json_value<'a, V: Value<'a>>(
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
.get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::I64),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
Expand All @@ -189,7 +189,7 @@ fn index_json_value<'a, V: Value<'a>>(
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
.get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::F64),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
Expand All @@ -198,16 +198,18 @@ fn index_json_value<'a, V: Value<'a>>(
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
.get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::Bool),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::Date(val) => {
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
ctx.path_to_unordered_id.get_or_allocate_unordered_id(
json_path_writer.as_str(),
ColumnType::DateTime,
),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
Expand Down
5 changes: 4 additions & 1 deletion src/core/segment_component.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,14 @@ pub enum SegmentComponent {
/// Bitset describing which document of the segment is alive.
/// (It was representing deleted docs but changed to represent alive docs from v0.17)
Delete,
/// Field list describing the fields in the segment.
FieldList,
}

impl SegmentComponent {
/// Iterates through the components.
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
static SEGMENT_COMPONENTS: [SegmentComponent; 9] = [
SegmentComponent::Postings,
SegmentComponent::Positions,
SegmentComponent::FastFields,
Expand All @@ -41,6 +43,7 @@ impl SegmentComponent {
SegmentComponent::Store,
SegmentComponent::TempStore,
SegmentComponent::Delete,
SegmentComponent::FieldList,
];
SEGMENT_COMPONENTS.iter()
}
Expand Down
104 changes: 22 additions & 82 deletions src/core/segment_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@ use std::ops::BitOrAssign;
use std::sync::{Arc, RwLock};
use std::{fmt, io};

use fnv::FnvHashMap;
use itertools::Itertools;

use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
use crate::field_list::read_split_fields;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::json_utils::json_path_sep_to_dot;
use crate::schema::{Field, IndexRecordOption, Schema, Type};
use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader;
Expand Down Expand Up @@ -44,6 +43,7 @@ pub struct SegmentReader {
fast_fields_readers: FastFieldReaders,
fieldnorm_readers: FieldNormReaders,

list_fields_file: Option<FileSlice>, // Optional field list file for backwards compatibility
store_file: FileSlice,
alive_bitset_opt: Option<AliveBitSet>,
schema: Schema,
Expand Down Expand Up @@ -153,6 +153,7 @@ impl SegmentReader {
let termdict_composite = CompositeFile::open(&termdict_file)?;

let store_file = segment.open_read(SegmentComponent::Store)?;
let list_fields_file = segment.open_read(SegmentComponent::FieldList).ok();

crate::fail_point!("SegmentReader::open#middle");

Expand Down Expand Up @@ -201,6 +202,7 @@ impl SegmentReader {
segment_id: segment.id(),
delete_opstamp: segment.meta().delete_opstamp(),
store_file,
list_fields_file,
alive_bitset_opt,
positions_composite,
schema,
Expand Down Expand Up @@ -299,87 +301,25 @@ impl SegmentReader {
/// field that is not indexed nor a fast field but is stored, it is possible for the field
/// to not be listed.
pub fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> {
let mut indexed_fields: Vec<FieldMetadata> = Vec::new();
let mut map_to_canonical = FnvHashMap::default();
for (field, field_entry) in self.schema().fields() {
let field_name = field_entry.name().to_string();
let is_indexed = field_entry.is_indexed();

if is_indexed {
let is_json = field_entry.field_type().value_type() == Type::Json;
if is_json {
let inv_index = self.inverted_index(field)?;
let encoded_fields_in_index = inv_index.list_encoded_fields()?;
let mut build_path = |field_name: &str, mut json_path: String| {
// In this case we need to map the potential fast field to the field name
// accepted by the query parser.
let create_canonical =
!field_entry.is_expand_dots_enabled() && json_path.contains('.');
if create_canonical {
// Without expand dots enabled dots need to be escaped.
let escaped_json_path = json_path.replace('.', "\\.");
let full_path = format!("{}.{}", field_name, escaped_json_path);
let full_path_unescaped = format!("{}.{}", field_name, &json_path);
map_to_canonical.insert(full_path_unescaped, full_path.to_string());
full_path
} else {
// With expand dots enabled, we can use '.' instead of '\u{1}'.
json_path_sep_to_dot(&mut json_path);
format!("{}.{}", field_name, json_path)
}
};
indexed_fields.extend(
encoded_fields_in_index
.into_iter()
.map(|(name, typ)| (build_path(&field_name, name), typ))
.map(|(field_name, typ)| FieldMetadata {
indexed: true,
stored: false,
field_name,
fast: false,
typ,
}),
);
} else {
indexed_fields.push(FieldMetadata {
indexed: true,
stored: false,
field_name: field_name.to_string(),
fast: false,
typ: field_entry.field_type().value_type(),
});
}
}
if let Some(list_fields_file) = self.list_fields_file.as_ref() {
let file = list_fields_file.read_bytes()?;
let fields_metadata =
read_split_fields(file)?.collect::<io::Result<Vec<FieldMetadata>>>();
fields_metadata.map_err(|e| e.into())
} else {
// Schema fallback
Ok(self
.schema()
.fields()
.map(|(_field, entry)| FieldMetadata {
field_name: entry.name().to_string(),
typ: entry.field_type().value_type(),
indexed: entry.is_indexed(),
stored: entry.is_stored(),
fast: entry.is_fast(),
})
.collect())
}
Comment on lines +304 to 322
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is admittedly code golfing, but I think it makes it a bit less noisy:

Suggested change
if let Some(list_fields_file) = self.list_fields_file.as_ref() {
let file = list_fields_file.read_bytes()?;
let fields_metadata =
read_split_fields(file)?.collect::<io::Result<Vec<FieldMetadata>>>();
fields_metadata.map_err(|e| e.into())
} else {
// Schema fallback
Ok(self
.schema()
.fields()
.map(|(_field, entry)| FieldMetadata {
field_name: entry.name().to_string(),
typ: entry.field_type().value_type(),
indexed: entry.is_indexed(),
stored: entry.is_stored(),
fast: entry.is_fast(),
})
.collect())
}
let fields_metadata = if let Some(list_fields_file) = self.list_fields_file.as_ref() {
let file = list_fields_file.read_bytes()?;
read_split_fields(file)?.collect::<io::Result<Vec<FieldMetadata>>>()?
} else {
// Schema fallback
self.schema()
.fields()
.map(|(_field, entry)| FieldMetadata {
field_name: entry.name().to_string(),
typ: entry.field_type().value_type(),
indexed: entry.is_indexed(),
stored: entry.is_stored(),
fast: entry.is_fast(),
})
.collect()
};
Ok(fields_metadata)

let mut fast_fields: Vec<FieldMetadata> = self
.fast_fields()
.columnar()
.iter_columns()?
.map(|(mut field_name, handle)| {
json_path_sep_to_dot(&mut field_name);
// map to canonical path, to avoid similar but different entries.
// Eventually we should just accept '.' separated for all cases.
let field_name = map_to_canonical
.get(&field_name)
.unwrap_or(&field_name)
.to_string();
FieldMetadata {
indexed: false,
stored: false,
field_name,
fast: true,
typ: Type::from(handle.column_type()),
}
})
.collect();
// Since the type is encoded differently in the fast field and in the inverted index,
// the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
// If we are sure that the order is the same, we can remove this sort.
indexed_fields.sort_unstable();
fast_fields.sort_unstable();
let merged = merge_field_meta_data(vec![indexed_fields, fast_fields], &self.schema);

Ok(merged)
}

/// Returns the segment id
Expand Down
10 changes: 7 additions & 3 deletions src/fastfield/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -238,13 +238,17 @@ impl FastFieldsWriter {
mut self,
wrt: &mut dyn io::Write,
doc_id_map_opt: Option<&DocIdMapping>,
) -> io::Result<()> {
) -> io::Result<Vec<(String, Type)>> {
let num_docs = self.num_docs;
let old_to_new_row_ids =
doc_id_map_opt.map(|doc_id_mapping| doc_id_mapping.old_to_new_ids());
self.columnar_writer
let columns = self
.columnar_writer
.serialize(num_docs, old_to_new_row_ids, wrt)?;
Ok(())
Ok(columns
.into_iter()
.map(|(field_name, column)| (field_name.to_string(), column.into()))
.collect())
}
}

Expand Down
Loading
Loading