Skip to content
This repository was archived by the owner on Apr 4, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
uses: actions-rs/cargo@v1
with:
command: check
args: --all
args: --workspace --all-targets
- name: Run cargo test
uses: actions-rs/cargo@v1
with:
Expand Down
3 changes: 3 additions & 0 deletions benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ publish = false

[dependencies]
milli = { path = "../milli" }
anyhow = "1.0"
serde_json = { version = "1.0.62", features = ["preserve_order"] }
csv = "1.1.6"

[target.'cfg(target_os = "linux")'.dependencies]
jemallocator = "0.3.2"
Expand Down
80 changes: 27 additions & 53 deletions benchmarks/benches/indexing.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
mod datasets_paths;
mod utils;

use std::fs::{create_dir_all, remove_dir_all, File};
use std::fs::{create_dir_all, remove_dir_all};
use std::path::Path;

use criterion::{criterion_group, criterion_main, Criterion};
use heed::EnvOpenOptions;
use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
use milli::update::UpdateBuilder;
use milli::Index;

#[cfg(target_os = "linux")]
Expand Down Expand Up @@ -67,15 +68,10 @@ fn indexing_songs_default(c: &mut Criterion) {
move |index| {
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);

builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
"could not find the dataset in: {}",
datasets_paths::SMOL_SONGS
));
builder.execute(reader, |_, _| ()).unwrap();
let builder = update_builder.index_documents(&mut wtxn, &index);

let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();

index.prepare_for_closing().wait();
Expand Down Expand Up @@ -118,15 +114,10 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
move |index| {
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);

builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
"could not find the dataset in: {}",
datasets_paths::SMOL_SONGS
));
builder.execute(reader, |_, _| ()).unwrap();
let builder = update_builder.index_documents(&mut wtxn, &index);

let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();

index.prepare_for_closing().wait();
Expand Down Expand Up @@ -165,15 +156,10 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
move |index| {
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);

builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
"could not find the dataset in: {}",
datasets_paths::SMOL_SONGS
));
builder.execute(reader, |_, _| ()).unwrap();
let builder = update_builder.index_documents(&mut wtxn, &index);

let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();

index.prepare_for_closing().wait();
Expand Down Expand Up @@ -211,15 +197,10 @@ fn indexing_wiki(c: &mut Criterion) {
move |index| {
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);

builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(datasets_paths::SMOL_WIKI_ARTICLES).expect(&format!(
"could not find the dataset in: {}",
datasets_paths::SMOL_SONGS
));
builder.execute(reader, |_, _| ()).unwrap();
let builder = update_builder.index_documents(&mut wtxn, &index);

let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();

index.prepare_for_closing().wait();
Expand Down Expand Up @@ -262,13 +243,10 @@ fn indexing_movies_default(c: &mut Criterion) {
move |index| {
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);
let builder = update_builder.index_documents(&mut wtxn, &index);

builder.update_format(UpdateFormat::Json);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(datasets_paths::MOVIES)
.expect(&format!("could not find the dataset in: {}", datasets_paths::MOVIES));
builder.execute(reader, |_, _| ()).unwrap();
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();

index.prepare_for_closing().wait();
Expand Down Expand Up @@ -316,15 +294,11 @@ fn indexing_geo(c: &mut Criterion) {
move |index| {
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);

builder.update_format(UpdateFormat::JsonStream);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(datasets_paths::SMOL_ALL_COUNTRIES).expect(&format!(
"could not find the dataset in: {}",
datasets_paths::SMOL_ALL_COUNTRIES
));
builder.execute(reader, |_, _| ()).unwrap();
let builder = update_builder.index_documents(&mut wtxn, &index);

let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
builder.execute(documents, |_, _| ()).unwrap();

wtxn.commit().unwrap();

index.prepare_for_closing().wait();
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/benches/search_geo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ mod datasets_paths;
mod utils;

use criterion::{criterion_group, criterion_main};
use milli::update::{Settings, UpdateFormat};
use milli::update::Settings;
use utils::Conf;

#[cfg(target_os = "linux")]
Expand Down Expand Up @@ -33,7 +33,7 @@ fn base_conf(builder: &mut Settings) {
#[rustfmt::skip]
const BASE_CONF: Conf = Conf {
dataset: datasets_paths::SMOL_ALL_COUNTRIES,
dataset_format: UpdateFormat::JsonStream,
dataset_format: "jsonl",
queries: &[
"",
],
Expand Down
73 changes: 66 additions & 7 deletions benchmarks/benches/utils.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
#![allow(dead_code)]

use std::fs::{create_dir_all, remove_dir_all, File};
use std::io::{self, Cursor, Read, Seek};
use std::path::Path;

use criterion::BenchmarkId;
use heed::EnvOpenOptions;
use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat};
use milli::documents::DocumentBatchReader;
use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder};
use milli::{FilterCondition, Index};
use serde_json::{Map, Value};

pub struct Conf<'a> {
/// where we are going to create our database.mmdb directory
Expand All @@ -13,7 +18,7 @@ pub struct Conf<'a> {
/// the dataset to be used; it must be an uncompressed file in the format named by `dataset_format`
pub dataset: &'a str,
/// The format of the dataset
pub dataset_format: UpdateFormat,
pub dataset_format: &'a str,
pub group_name: &'a str,
pub queries: &'a [&'a str],
/// here you can change which criterion are used and in which order.
Expand All @@ -33,7 +38,7 @@ pub struct Conf<'a> {
impl Conf<'_> {
pub const BASE: Self = Conf {
database_name: "benches.mmdb",
dataset_format: UpdateFormat::Csv,
dataset_format: "csv",
dataset: "",
group_name: "",
queries: &[],
Expand Down Expand Up @@ -87,11 +92,10 @@ pub fn base_setup(conf: &Conf) -> Index {
if let None = conf.primary_key {
builder.enable_autogenerate_docids();
}
builder.update_format(conf.dataset_format);
let documents = documents_from(conf.dataset, conf.dataset_format);

builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(conf.dataset)
.expect(&format!("could not find the dataset in: {}", conf.dataset));
builder.execute(reader, |_, _| ()).unwrap();
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();

index
Expand Down Expand Up @@ -128,3 +132,58 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
index.prepare_for_closing().wait();
}
}

/// Opens the dataset at `filename` and decodes it into a `DocumentBatchReader`.
///
/// `filetype` selects the decoder: `"csv"`, `"json"` (one array or object) or
/// `"jsonl"` (one JSON object per line).
///
/// # Panics
///
/// Panics if the file cannot be opened, if decoding fails, or if `filetype`
/// is not one of the supported formats.
pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<impl Read + Seek> {
    // Build the panic message lazily so the happy path does not allocate
    // (clippy::expect_fun_call).
    let reader = File::open(filename)
        .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
    let documents = match filetype {
        "csv" => documents_from_csv(reader).unwrap(),
        "json" => documents_from_json(reader).unwrap(),
        "jsonl" => documents_from_jsonl(reader).unwrap(),
        otherwise => panic!("invalid update format {:?}", otherwise),
    };
    DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap()
}

/// Decodes a newline-delimited JSON stream into the milli document-batch byte format.
fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
    let mut buffer = Cursor::new(Vec::new());
    let mut builder = milli::documents::DocumentBatchBuilder::new(&mut buffer)?;

    // Each top-level JSON value on the stream is one document.
    let stream = serde_json::Deserializer::from_reader(reader)
        .into_iter::<serde_json::Map<String, serde_json::Value>>();
    for entry in stream {
        builder.add_documents(entry?)?;
    }
    builder.finish()?;

    Ok(buffer.into_inner())
}

/// Decodes a single JSON payload into the milli document-batch byte format.
fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
    let mut buffer = Cursor::new(Vec::new());
    let mut builder = milli::documents::DocumentBatchBuilder::new(&mut buffer)?;

    let payload: serde_json::Value = serde_json::from_reader(reader)?;
    builder.add_documents(payload)?;
    builder.finish()?;

    Ok(buffer.into_inner())
}

/// Decodes CSV records (first row is the header) into the milli document-batch byte format.
fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
    let mut buffer = Cursor::new(Vec::new());
    let mut builder = milli::documents::DocumentBatchBuilder::new(&mut buffer)?;

    let mut csv_reader = csv::Reader::from_reader(reader);
    for record in csv_reader.deserialize::<Map<String, Value>>() {
        builder.add_documents(record?)?;
    }
    builder.finish()?;

    Ok(buffer.into_inner())
}