From 333c79aaaa63e797b7480d0cae21914aa8c8cf36 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Sat, 2 Mar 2024 05:11:29 +0100 Subject: [PATCH 01/23] Added the ability to configure vector-search embeddings in the settings --- Cargo.toml | 1 + src/settings.rs | 221 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 217 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9ad71697..91a3c1d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ wasm-bindgen-futures = "0.4" [features] default = ["reqwest"] reqwest = ["dep:reqwest", "pin-project-lite", "bytes"] +experimental-vector-search = [] [dev-dependencies] futures-await-test = "0.3" diff --git a/src/settings.rs b/src/settings.rs index de5aae70..442391bc 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -36,6 +36,70 @@ pub struct FacetingSettings { pub max_values_per_facet: usize, } +#[cfg(feature = "experimental-vector-search")] +#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] +#[serde(rename_all = "camelCase", tag = "source")] +pub enum Embedder { + /// Compute embeddings locally. + /// This is a resource-intensive operation and might affect indexing performance. + HuggingFace(HuggingFaceEmbedderSettings), + /// Use OpenAi's API to generate embeddings + OpenAi(OpenapiEmbedderSettings), + /// Provide custom embeddings. + /// In this case, you must manually update your embeddings when adding, updating, and removing documents to your index. 
+ UserProvided(UserProvidedEmbedderSettings), +} + +#[cfg(feature = "experimental-vector-search")] +#[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct HuggingFaceEmbedderSettings { + /// the BERT embedding model you want to use from HuggingFace + /// Defaults to `BAAI/bge-base-en-v1.5` + #[serde(skip_serializing_if = "Option::is_none")] + pub model: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub revision: Option, + /// if present, document_template must be a [Liquid template](https://shopify.github.io/liquid/). + /// Use `{{ doc.attribute }}` to access document field values. + /// Meilisearch also exposes a `{{ fields }}` array containing one object per document field, which you may access with `{{ field.name }}` and `{{ field.value }}`. + /// + /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. + #[serde(skip_serializing_if = "Option::is_none")] + pub document_template: Option, +} + +#[cfg(feature = "experimental-vector-search")] +#[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct OpenapiEmbedderSettings { + /// API key used to authorize against OpenAI. + /// [Generate an API key](https://platform.openai.com/api-keys) from your OpenAI account. + /// Use [tier 2 keys](https://platform.openai.com/docs/guides/rate-limits/usage-tiers?context=tier-two) or above for optimal performance. + pub api_key: String, + /// The openapi model name + /// Default: `text-embedding-ada-002` + #[serde(skip_serializing_if = "Option::is_none")] + pub model: Option, + /// Defaults to the default for said model name + #[serde(skip_serializing_if = "Option::is_none")] + pub dimensions: Option, + /// if present, document_template must be a [Liquid template](https://shopify.github.io/liquid/). 
+ /// Use `{{ doc.attribute }}` to access document field values. + /// Meilisearch also exposes a `{{ fields }}` array containing one object per document field, which you may access with `{{ field.name }}` and `{{ field.value }}`. + /// + /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. + #[serde(skip_serializing_if = "Option::is_none")] + pub document_template: Option, +} + +#[cfg(feature = "experimental-vector-search")] +#[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq, Copy)] +pub struct UserProvidedEmbedderSettings { + /// dimensions of your custom embedding + pub dimensions: usize, +} + /// Struct reprensenting a set of settings. /// /// You can build this struct using the builder syntax. @@ -103,6 +167,10 @@ pub struct Settings { /// Proximity precision settings. #[serde(skip_serializing_if = "Option::is_none")] pub proximity_precision: Option, + /// Settings how the embeddings for the experimental vector search feature are generated + #[cfg(feature = "experimental-vector-search")] + #[serde(skip_serializing_if = "Option::is_none")] + pub embedders: Option>, } #[allow(missing_docs)] @@ -286,6 +354,23 @@ impl Settings { ..self } } + + #[must_use] + #[cfg(feature = "experimental-vector-search")] + pub fn with_embedders(self, embedders: HashMap) -> Settings + where + S: AsRef, + { + Settings { + embedders: Some( + embedders + .into_iter() + .map(|(key, value)| (key.as_ref().to_string(), value)) + .collect(), + ), + ..self + } + } } impl Index { @@ -733,7 +818,7 @@ impl Index { /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey"); /// # /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { - /// let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); + /// # let client = Client::new(MEILISEARCH_URL, 
Some(MEILISEARCH_API_KEY)).unwrap(); /// # client.create_index("get_typo_tolerance", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); /// let index = client.index("get_typo_tolerance"); /// @@ -755,6 +840,50 @@ impl Index { .await } + /// Get [embedders](https://www.meilisearch.com/docs/learn/experimental/vector_search) of the [Index]. + /// + /// ``` + /// # use std::collections::HashMap; + /// # use std::string::String; + /// # use meilisearch_sdk::{indexes::*,features::ExperimentalFeatures,settings::Embedder,settings::UserProvidedEmbedderSettings,settings::Settings,client::*}; + /// # + /// # let MEILISEARCH_URL = option_env!("MEILISEARCH_URL").unwrap_or("http://localhost:7700"); + /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey"); + /// # + /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { + /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); + /// # client.create_index("get_embedders", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); + /// let index = client.index("get_embedders"); + /// + /// # let mut features = ExperimentalFeatures::new(&client); + /// # features.set_vector_store(true); + /// # let res = features.update().await.unwrap(); + /// # + /// # let t=index.set_settings(&Settings{ + /// # embedders:Some(HashMap::from([(String::from("default"),Embedder::UserProvided(UserProvidedEmbedderSettings{dimensions:1}))])), + /// # ..Settings::default() + /// # }).await.unwrap(); + /// # t.wait_for_completion(&client, None, None).await.unwrap(); + /// let embedders = index.get_embedders().await.unwrap(); + /// # index.delete().await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); + /// # }); + /// ``` + #[cfg(feature = "experimental-vector-search")] + pub async fn get_embedders(&self) -> Result, Error> { + self.client + .http_client + .request::<(), (), Option>>( + 
&format!( + "{}/indexes/{}/settings/embedders", + self.client.host, self.uid + ), + Method::Get { query: () }, + 200, + ) + .await + .map(|r| r.unwrap_or_default()) + } + /// Update [settings](../settings/struct.Settings) of the [Index]. /// /// Updates in the settings are partial. This means that any parameters corresponding to a `None` value will be left unchanged. @@ -1274,7 +1403,7 @@ impl Index { /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey"); /// # /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { - /// let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); + /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); /// # client.create_index("set_typo_tolerance", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); /// let mut index = client.index("set_typo_tolerance"); /// @@ -1320,7 +1449,7 @@ impl Index { /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey"); /// # /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { - /// let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); + /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); /// # client.create_index("set_proximity_precision", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); /// let mut index = client.index("set_proximity_precision"); /// @@ -1756,7 +1885,7 @@ impl Index { /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey"); /// # /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { - /// let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); + /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); /// # 
client.create_index("reset_typo_tolerance", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); /// let mut index = client.index("reset_typo_tolerance"); /// @@ -1789,7 +1918,7 @@ impl Index { /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey"); /// # /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { - /// let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); + /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); /// # client.create_index("reset_proximity_precision", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); /// let mut index = client.index("reset_proximity_precision"); /// @@ -1810,6 +1939,40 @@ impl Index { ) .await } + + /// Reset [embedders](https://www.meilisearch.com/docs/learn/experimental/vector_search) of the [Index]. + /// + /// # Example + /// + /// ``` + /// # use meilisearch_sdk::{client::*, indexes::*, settings::Settings}; + /// # + /// # let MEILISEARCH_URL = option_env!("MEILISEARCH_URL").unwrap_or("http://localhost:7700"); + /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey"); + /// # + /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { + /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); + /// # client.create_index("reset_embedders", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); + /// let mut index = client.index("reset_embedders"); + /// + /// let task = index.reset_embedders().await.unwrap(); + /// # index.delete().await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); + /// # }); + /// ``` + #[cfg(feature = "experimental-vector-search")] + pub async fn reset_embedders(&self) -> Result { + self.client + .http_client + .request::<(), (), TaskInfo>( + &format!( + 
"{}/indexes/{}/settings/embedders", + self.client.host, self.uid + ), + Method::Delete { query: () }, + 202, + ) + .await + } } #[cfg(test)] @@ -1845,6 +2008,14 @@ mod tests { assert_eq!(faceting, res); } + #[cfg(feature = "experimental-vector-search")] + #[meilisearch_test] + async fn test_get_embeddings(index: Index) { + let res = index.get_embedders().await.unwrap(); + + assert_eq!(HashMap::new(), res); + } + #[meilisearch_test] async fn test_set_faceting(client: Client, index: Index) { let faceting = FacetingSettings { @@ -1871,6 +2042,23 @@ mod tests { assert_eq!(faceting, res); } + #[cfg(feature = "experimental-vector-search")] + #[meilisearch_test] + async fn test_reset_embedders(client: Client, index: Index) { + let features = crate::features::ExperimentalFeatures::new(&client) + .set_vector_store(true) + .update() + .await + .expect("could not enable the vector store"); + assert_eq!(features.vector_store, true); + let task_info = index.reset_embedders().await.unwrap(); + client.wait_for_task(task_info, None, None).await.unwrap(); + + let res = index.get_embedders().await.unwrap(); + + assert_eq!(HashMap::new(), res); + } + #[meilisearch_test] async fn test_get_dictionary(index: Index) { let dictionary: Vec = vec![]; @@ -2047,6 +2235,29 @@ mod tests { assert_eq!(expected, res); } + #[cfg(feature = "experimental-vector-search")] + #[meilisearch_test] + async fn test_set_embedding_settings(client: Client, index: Index) { + let features = crate::features::ExperimentalFeatures::new(&client) + .set_vector_store(true) + .update() + .await + .expect("could not enable the vector store"); + assert_eq!(features.vector_store, true); + + let custom_embedder = + Embedder::UserProvided(UserProvidedEmbedderSettings { dimensions: 2 }); + let embeddings = HashMap::from([("default".into(), custom_embedder)]); + let settings = Settings::new().with_embedders(embeddings.clone()); + + let task_info = index.set_settings(&settings).await.unwrap(); + client.wait_for_task(task_info, 
None, None).await.unwrap(); + + let res = index.get_embedders().await.unwrap(); + + assert_eq!(embeddings, res); + } + #[meilisearch_test] async fn test_reset_proximity_precision(index: Index) { let expected = "byWord".to_string(); From 83f543d79002d4ea5db1cc4fb1cd49ce54841338 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Tue, 16 Apr 2024 04:32:00 +0200 Subject: [PATCH 02/23] Added the ability to configure the `hybrid` keyword in the search --- src/search.rs | 135 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 132 insertions(+), 3 deletions(-) diff --git a/src/search.rs b/src/search.rs index ec96d9d7..079ae080 100644 --- a/src/search.rs +++ b/src/search.rs @@ -138,6 +138,22 @@ pub enum Selectors { All, } +#[cfg(feature = "experimental-vector-search")] +#[derive(Debug, Serialize, Clone)] +#[serde(rename_all = "camelCase")] +pub struct HybridSearch<'a> { + /// Indicates one of the embedders configured for the queried index + /// + /// **Default: `"default"`** + embedder: &'a str, + /// number between `0` and `1`: + /// - `0.0` indicates full keyword search + /// - `1.0` indicates full semantic search + /// + /// **Default: `0.5`** + semantic_ratio: f32, +} + type AttributeToCrop<'a> = (&'a str, Option); /// A struct representing a query. 
@@ -328,6 +344,12 @@ pub struct SearchQuery<'a, Http: HttpClient> { #[serde(skip_serializing_if = "Option::is_none")] pub(crate) index_uid: Option<&'a str>, + + /// EXPERIMENTAL + /// Defines whether to utilise previously defined embedders for semantic searching + #[cfg(feature = "experimental-vector-search")] + #[serde(skip_serializing_if = "Option::is_none")] + pub hybrid: Option>, } #[allow(missing_docs)] @@ -356,6 +378,8 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { show_ranking_score: None, matching_strategy: None, index_uid: None, + #[cfg(feature = "experimental-vector-search")] + hybrid: None, } } pub fn with_query<'b>(&'b mut self, query: &'a str) -> &'b mut SearchQuery<'a, Http> { @@ -539,6 +563,20 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { self.index_uid = Some(&self.index.uid); self } + #[cfg(feature = "experimental-vector-search")] + pub fn with_hybrid<'b>( + &'b mut self, + embedder: &'a str, + semantic_ratio: f32, + ) -> &'b mut SearchQuery<'a, Http> { + self.hybrid = Some(HybridSearch { + embedder, + semantic_ratio, + }); + self + } + + #[must_use] pub fn build(&mut self) -> SearchQuery<'a, Http> { self.clone() } @@ -612,6 +650,7 @@ mod tests { use meilisearch_test_macro::meilisearch_test; use serde::{Deserialize, Serialize}; use serde_json::{json, Map, Value}; + use std::time::Duration; #[derive(Debug, Serialize, Deserialize, PartialEq)] struct Nested { @@ -654,9 +693,15 @@ mod tests { .await?; let t2 = index.set_sortable_attributes(["title"]).await?; - t2.wait_for_completion(client, None, None).await?; - t1.wait_for_completion(client, None, None).await?; - t0.wait_for_completion(client, None, None).await?; + // the vector search has longer indexing times leading to the timeout being triggered + let timeout = if cfg!(feature = "experimental-vector-search") { + Some(Duration::from_secs(120)) + } else { + None + }; + t2.wait_for_completion(client, None, timeout).await?; + t1.wait_for_completion(client, None, timeout).await?; + 
t0.wait_for_completion(client, None, timeout).await?; Ok(()) } @@ -1174,4 +1219,88 @@ mod tests { Ok(()) } + + #[cfg(feature = "experimental-vector-search")] + #[meilisearch_test] + async fn test_hybrid(client: Client, index: Index) -> Result<(), Error> { + use crate::settings::{Embedder, HuggingFaceEmbedderSettings}; + log::warn!("You are executing the vector search test. This WILL take a while and might lead to timeouts in other tests. You can disable this testcase by not enabling the `experimental-vector-search`-feature and running this "); + // enable vector searching and configure an embedder + let features = crate::features::ExperimentalFeatures::new(&client) + .set_vector_store(true) + .update() + .await + .expect("could not enable the vector store"); + assert_eq!(features.vector_store, true); + let embedder_setting = Embedder::HuggingFace(HuggingFaceEmbedderSettings { + model: Some("BAAI/bge-base-en-v1.5".into()), + revision: None, + document_template: Some("{{ doc.value }}".into()), + }); + let t3 = index + .set_settings(&crate::settings::Settings { + embedders: Some(HashMap::from([("default".to_string(), embedder_setting)])), + ..crate::settings::Settings::default() + }) + .await?; + t3.wait_for_completion(&client, None, None).await?; + + setup_test_index(&client, &index).await?; + + // "zweite" = "second" in german + // => an embedding should be able to detect that this is equivalent, but not the regular search + let results: SearchResults = index + .search() + .with_query("Facebook") + .with_hybrid("default", 1.0) // entirely rely on semantic searching + .execute() + .await?; + assert_eq!(results.hits.len(), 1); + assert_eq!( + &Document { + id: 1, + value: S("dolor sit amet, consectetur adipiscing elit"), + kind: S("text"), + number: 10, + nested: Nested { child: S("second") }, + }, + &results.hits[0].result + ); + let results: SearchResults = index + .search() + .with_query("zweite") + .with_hybrid("default", 0.0) // no semantic searching => no 
matches + .execute() + .await?; + assert_eq!(results.hits.len(), 0); + + // word that has a typo => would have been found via traditional means + // if entirely relying on semantic searching, no result is found + let results: SearchResults = index + .search() + .with_query("lohrem") + .with_hybrid("default", 1.0) + .execute() + .await?; + assert_eq!(results.hits.len(), 0); + let results: SearchResults = index + .search() + .with_query("lohrem") + .with_hybrid("default", 0.0) + .execute() + .await?; + assert_eq!(results.hits.len(), 1); + assert_eq!( + &Document { + id: 0, + value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), + kind: S("text"), + number: 0, + nested: Nested { child: S("first") } + }, + &results.hits[0].result + ); + + Ok(()) + } } From a4e50c97539d126f1d66141b405fa90f9edb3fa5 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Tue, 16 Apr 2024 22:04:00 +0200 Subject: [PATCH 03/23] Migrated the testcase to use `_vectors` instead --- src/search.rs | 112 +++++++++++++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 47 deletions(-) diff --git a/src/search.rs b/src/search.rs index 079ae080..83d6e191 100644 --- a/src/search.rs +++ b/src/search.rs @@ -350,6 +350,12 @@ pub struct SearchQuery<'a, Http: HttpClient> { #[cfg(feature = "experimental-vector-search")] #[serde(skip_serializing_if = "Option::is_none")] pub hybrid: Option>, + + /// EXPERIMENTAL + /// Defines what vectors an userprovided embedder has gotten for semantic searching + #[cfg(feature = "experimental-vector-search")] + #[serde(skip_serializing_if = "Option::is_none")] + 
pub vector: Option<&'a [f32]>, } #[allow(missing_docs)] @@ -380,6 +386,8 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { index_uid: None, #[cfg(feature = "experimental-vector-search")] hybrid: None, + #[cfg(feature = "experimental-vector-search")] + vector: None, } } pub fn with_query<'b>(&'b mut self, query: &'a str) -> &'b mut SearchQuery<'a, Http> { @@ -563,6 +571,8 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { self.index_uid = Some(&self.index.uid); self } + /// EXPERIMENTAL + /// Defines whether to utilise previously defined embedders for semantic searching #[cfg(feature = "experimental-vector-search")] pub fn with_hybrid<'b>( &'b mut self, @@ -575,6 +585,13 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { }); self } + /// EXPERIMENTAL + /// Defines what vectors an userprovided embedder has gotten for semantic searching + #[cfg(feature = "experimental-vector-search")] + pub fn with_vector<'b>(&'b mut self, vector: &'a [f32]) -> &'b mut SearchQuery<'a, Http> { + self.vector = Some(vector); + self + } #[must_use] pub fn build(&mut self) -> SearchQuery<'a, Http> { @@ -650,7 +667,6 @@ mod tests { use meilisearch_test_macro::meilisearch_test; use serde::{Deserialize, Serialize}; use serde_json::{json, Map, Value}; - use std::time::Duration; #[derive(Debug, Serialize, Deserialize, PartialEq)] struct Nested { @@ -664,6 +680,7 @@ mod tests { kind: String, number: i32, nested: Nested, + _vectors: HashMap>, } impl PartialEq> for Document { @@ -677,31 +694,25 @@ mod tests { async fn setup_test_index(client: &Client, index: &Index) -> Result<(), Error> { let t0 = index.add_documents(&[ - Document { id: 0, kind: "text".into(), number: 0, value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), nested: Nested { child: S("first") } }, - Document { id: 1, kind: "text".into(), number: 10, value: S("dolor sit amet, consectetur adipiscing elit"), nested: Nested { child: S("second") } }, - Document { id: 2, kind: "title".into(), number: 20, value: S("The Social Network"), nested: Nested { child: S("third") } }, - Document { id: 3, kind: "title".into(), number: 30, value: S("Harry Potter and the Sorcerer's Stone"), nested: Nested { child: S("fourth") } }, - Document { id: 4, kind: "title".into(), number: 40, value: S("Harry Potter and the Chamber of Secrets"), nested: Nested { child: S("fift") } }, - Document { id: 5, kind: "title".into(), number: 50, value: S("Harry Potter and the Prisoner of Azkaban"), nested: Nested { child: S("sixth") } }, - Document { id: 6, kind: "title".into(), number: 60, value: S("Harry Potter and the Goblet of Fire"), nested: Nested { child: S("seventh") } }, - Document { id: 7, kind: "title".into(), number: 70, value: S("Harry Potter and the Order of the Phoenix"), nested: Nested { child: S("eighth") } }, - Document { id: 8, kind: "title".into(), number: 80, value: S("Harry Potter and the Half-Blood Prince"), nested: Nested { child: S("ninth") } }, - Document { id: 9, kind: "title".into(), number: 90, value: S("Harry Potter and the Deathly Hallows"), nested: Nested { child: S("tenth") } }, + Document { id: 0, kind: "text".into(), number: 0, value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), nested: Nested { child: S("first") }, _vectors: HashMap::from([(S("default"), vec![1000.0])])}, + Document { id: 1, kind: "text".into(), number: 10, value: S("dolor sit amet, consectetur adipiscing elit"), nested: Nested { child: S("second") }, _vectors: HashMap::from([(S("default"), vec![2000.0])]) }, + Document { id: 2, kind: "title".into(), number: 20, value: S("The Social Network"), nested: Nested { child: S("third") }, _vectors: HashMap::from([(S("default"), vec![3000.0])]) }, + Document { id: 3, kind: "title".into(), number: 30, value: S("Harry Potter and the Sorcerer's Stone"), nested: Nested { child: S("fourth") }, _vectors: HashMap::from([(S("default"), vec![4000.0])]) }, + Document { id: 4, kind: "title".into(), number: 40, value: S("Harry Potter and the Chamber of Secrets"), nested: Nested { child: S("fift") }, _vectors: HashMap::from([(S("default"), vec![5000.0])]) }, + Document { id: 5, kind: "title".into(), number: 50, value: S("Harry Potter and the Prisoner of Azkaban"), nested: Nested { child: S("sixth") }, _vectors: HashMap::from([(S("default"), vec![6000.0])]) }, + Document { id: 6, kind: "title".into(), number: 60, value: S("Harry Potter and the Goblet of Fire"), nested: Nested { child: S("seventh") }, _vectors: HashMap::from([(S("default"), vec![7000.0])]) }, + Document { id: 7, kind: "title".into(), number: 70, value: S("Harry Potter and the Order of the Phoenix"), nested: Nested { child: S("eighth") }, _vectors: HashMap::from([(S("default"), vec![8000.0])]) }, + Document { id: 8, kind: "title".into(), number: 80, value: S("Harry Potter and the Half-Blood Prince"), nested: Nested { child: S("ninth") }, _vectors: HashMap::from([(S("default"), vec![9000.0])]) }, + Document { id: 9, kind: "title".into(), number: 90, value: S("Harry Potter and the Deathly Hallows"), nested: Nested { child: S("tenth") }, _vectors: 
HashMap::from([(S("default"), vec![10000.0])]) }, ], None).await?; let t1 = index .set_filterable_attributes(["kind", "value", "number"]) .await?; let t2 = index.set_sortable_attributes(["title"]).await?; - // the vector search has longer indexing times leading to the timeout being triggered - let timeout = if cfg!(feature = "experimental-vector-search") { - Some(Duration::from_secs(120)) - } else { - None - }; - t2.wait_for_completion(client, None, timeout).await?; - t1.wait_for_completion(client, None, timeout).await?; - t0.wait_for_completion(client, None, timeout).await?; + t2.wait_for_completion(client, None, None).await?; + t1.wait_for_completion(client, None, None).await?; + t0.wait_for_completion(client, None, None).await?; Ok(()) } @@ -780,7 +791,8 @@ mod tests { value: S("dolor sit amet, consectetur adipiscing elit"), kind: S("text"), number: 10, - nested: Nested { child: S("second") } + nested: Nested { child: S("second") }, + _vectors: HashMap::from([(S("default"), vec![2000.0])]), }, &results.hits[0].result ); @@ -952,7 +964,8 @@ mod tests { value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do…"), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: HashMap::from([(S("default"), vec![1000.0])]) }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -967,7 +980,8 @@ mod tests { value: S("Lorem ipsum dolor sit amet…"), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: HashMap::from([(S("default"), vec![1000.0])]) }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -988,7 +1002,8 @@ mod tests { value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: HashMap::from([(S("default"), vec![1000.0])]) }, results.hits[0].formatted_result.as_ref().unwrap()); @@ -1003,7 +1018,8 @@ mod tests { value: S("Lorem ipsum dolor sit amet…"), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: HashMap::from([(S("default"), vec![1000.0])]) }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -1028,7 +1044,8 @@ mod tests { value: S("(ꈍᴗꈍ)sed do eiusmod tempor incididunt ut(ꈍᴗꈍ)"), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: HashMap::from([(S("default"), vec![1000.0])]), }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -1055,7 +1072,8 @@ mod tests { value: S("The (⊃。•́‿•̀。)⊃ Social ⊂(´• ω •`⊂) Network"), kind: S("title"), number: 20, - nested: Nested { child: S("third") } + nested: Nested { child: S("third") }, + _vectors: HashMap::from([(S("default"), vec![3000.0])]) }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -1077,7 +1095,8 @@ mod tests { value: S("dolor sit amet, consectetur adipiscing elit"), kind: S("text"), number: 10, - nested: Nested { child: S("first") } + nested: Nested { child: S("second") }, + _vectors: HashMap::from([(S("default"), vec![1000.0])]), }, results.hits[0].formatted_result.as_ref().unwrap(), ); @@ -1092,7 +1111,8 @@ mod tests { value: S("dolor sit amet, consectetur adipiscing elit"), kind: S("text"), number: 10, - nested: Nested { child: S("first") } + nested: Nested { child: S("second") }, + _vectors: HashMap::from([(S("default"), vec![2000.0])]) }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ 
-1223,8 +1243,7 @@ mod tests { #[cfg(feature = "experimental-vector-search")] #[meilisearch_test] async fn test_hybrid(client: Client, index: Index) -> Result<(), Error> { - use crate::settings::{Embedder, HuggingFaceEmbedderSettings}; - log::warn!("You are executing the vector search test. This WILL take a while and might lead to timeouts in other tests. You can disable this testcase by not enabling the `experimental-vector-search`-feature and running this "); + use crate::settings::{Embedder, UserProvidedEmbedderSettings}; // enable vector searching and configure an embedder let features = crate::features::ExperimentalFeatures::new(&client) .set_vector_store(true) @@ -1232,11 +1251,8 @@ mod tests { .await .expect("could not enable the vector store"); assert_eq!(features.vector_store, true); - let embedder_setting = Embedder::HuggingFace(HuggingFaceEmbedderSettings { - model: Some("BAAI/bge-base-en-v1.5".into()), - revision: None, - document_template: Some("{{ doc.value }}".into()), - }); + let embedder_setting = + Embedder::UserProvided(UserProvidedEmbedderSettings { dimensions: 1 }); let t3 = index .set_settings(&crate::settings::Settings { embedders: Some(HashMap::from([("default".to_string(), embedder_setting)])), @@ -1247,11 +1263,16 @@ mod tests { setup_test_index(&client, &index).await?; - // "zweite" = "second" in german - // => an embedding should be able to detect that this is equivalent, but not the regular search + // "2nd" = "second" + // no semantic searching => no matches + let results: SearchResults = index.search().with_query("2nd").execute().await?; + assert_eq!(results.hits.len(), 0); + + // an embedding should be able to detect that this is equivalent, but not the regular search let results: SearchResults = index .search() - .with_query("Facebook") + .with_query("2nd") + .with_vector(&[2000.0]) .with_hybrid("default", 1.0) // entirely rely on semantic searching .execute() .await?; @@ -1263,16 +1284,10 @@ mod tests { kind: S("text"), number: 10, 
nested: Nested { child: S("second") }, + _vectors: HashMap::from([(S("default"), vec![2000.0])]) }, &results.hits[0].result ); - let results: SearchResults = index - .search() - .with_query("zweite") - .with_hybrid("default", 0.0) // no semantic searching => no matches - .execute() - .await?; - assert_eq!(results.hits.len(), 0); // word that has a typo => would have been found via traditional means // if entirely relying on semantic searching, no result is found @@ -1280,6 +1295,7 @@ mod tests { .search() .with_query("lohrem") .with_hybrid("default", 1.0) + .with_vector(&[1000.0]) .execute() .await?; assert_eq!(results.hits.len(), 0); @@ -1287,6 +1303,7 @@ mod tests { .search() .with_query("lohrem") .with_hybrid("default", 0.0) + .with_vector(&[1000.0]) .execute() .await?; assert_eq!(results.hits.len(), 1); @@ -1296,7 +1313,8 @@ mod tests { value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: HashMap::from([(S("default"), vec![1000.0])]), }, &results.hits[0].result ); From 0e044b14c2937052e64ef12dfd46a34362a068d0 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Wed, 17 Apr 2024 15:44:30 +0200 Subject: [PATCH 04/23] Removed the `experimental-vector-search` feature --- Cargo.toml | 1 - src/search.rs | 8 -------- src/settings.rs | 17 ++++++----------- 3 files changed, 6 insertions(+), 20 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 91a3c1d7..9ad71697 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,7 +40,6 @@ wasm-bindgen-futures = "0.4" [features] default = ["reqwest"] reqwest = ["dep:reqwest", "pin-project-lite", "bytes"] -experimental-vector-search = [] [dev-dependencies] futures-await-test = "0.3" diff --git a/src/search.rs b/src/search.rs index 83d6e191..85706ce9 100644 --- a/src/search.rs +++ b/src/search.rs @@ -138,7 +138,6 @@ pub enum Selectors { All, } -#[cfg(feature = "experimental-vector-search")] #[derive(Debug, Serialize, Clone)] #[serde(rename_all = "camelCase")] pub struct HybridSearch<'a> { @@ -347,13 +346,11 @@ pub struct SearchQuery<'a, Http: HttpClient> { /// EXPERIMENTAL /// Defines whether to utilise previously defined embedders for semantic searching - #[cfg(feature = "experimental-vector-search")] #[serde(skip_serializing_if = "Option::is_none")] pub hybrid: Option>, /// EXPERIMENTAL /// Defines what vectors an userprovided embedder has gotten for semantic searching - #[cfg(feature = "experimental-vector-search")] #[serde(skip_serializing_if = "Option::is_none")] pub vector: Option<&'a [f32]>, } @@ -384,9 +381,7 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { show_ranking_score: None, matching_strategy: None, index_uid: None, - #[cfg(feature = "experimental-vector-search")] hybrid: None, 
- #[cfg(feature = "experimental-vector-search")] vector: None, } } @@ -573,7 +568,6 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { } /// EXPERIMENTAL /// Defines whether to utilise previously defined embedders for semantic searching - #[cfg(feature = "experimental-vector-search")] pub fn with_hybrid<'b>( &'b mut self, embedder: &'a str, @@ -587,7 +581,6 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { } /// EXPERIMENTAL /// Defines what vectors an userprovided embedder has gotten for semantic searching - #[cfg(feature = "experimental-vector-search")] pub fn with_vector<'b>(&'b mut self, vector: &'a [f32]) -> &'b mut SearchQuery<'a, Http> { self.vector = Some(vector); self @@ -1240,7 +1233,6 @@ mod tests { Ok(()) } - #[cfg(feature = "experimental-vector-search")] #[meilisearch_test] async fn test_hybrid(client: Client, index: Index) -> Result<(), Error> { use crate::settings::{Embedder, UserProvidedEmbedderSettings}; diff --git a/src/settings.rs b/src/settings.rs index 442391bc..e399db3e 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -36,7 +36,8 @@ pub struct FacetingSettings { pub max_values_per_facet: usize, } -#[cfg(feature = "experimental-vector-search")] +/// EXPERIMENTAL +/// Allows configuring semantic seaarching #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] #[serde(rename_all = "camelCase", tag = "source")] pub enum Embedder { @@ -50,7 +51,6 @@ pub enum Embedder { UserProvided(UserProvidedEmbedderSettings), } -#[cfg(feature = "experimental-vector-search")] #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] #[serde(rename_all = "camelCase")] pub struct HuggingFaceEmbedderSettings { @@ -69,7 +69,6 @@ pub struct HuggingFaceEmbedderSettings { pub document_template: Option, } -#[cfg(feature = "experimental-vector-search")] #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] #[serde(rename_all = "camelCase")] pub struct OpenapiEmbedderSettings { @@ -93,7 +92,6 @@ pub struct 
OpenapiEmbedderSettings { pub document_template: Option, } -#[cfg(feature = "experimental-vector-search")] #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq, Copy)] pub struct UserProvidedEmbedderSettings { /// dimensions of your custom embedding @@ -168,7 +166,6 @@ pub struct Settings { #[serde(skip_serializing_if = "Option::is_none")] pub proximity_precision: Option, /// Settings how the embeddings for the experimental vector search feature are generated - #[cfg(feature = "experimental-vector-search")] #[serde(skip_serializing_if = "Option::is_none")] pub embedders: Option>, } @@ -355,8 +352,9 @@ impl Settings { } } + /// EXPERIMENTAL + /// Set the [embedders](https://www.meilisearch.com/docs/learn/experimental/vector_search) of the [Index]. #[must_use] - #[cfg(feature = "experimental-vector-search")] pub fn with_embedders(self, embedders: HashMap) -> Settings where S: AsRef, @@ -840,6 +838,7 @@ impl Index { .await } + /// EXPERIMENTAL /// Get [embedders](https://www.meilisearch.com/docs/learn/experimental/vector_search) of the [Index]. /// /// ``` @@ -868,7 +867,6 @@ impl Index { /// # index.delete().await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); /// # }); /// ``` - #[cfg(feature = "experimental-vector-search")] pub async fn get_embedders(&self) -> Result, Error> { self.client .http_client @@ -1940,6 +1938,7 @@ impl Index { .await } + /// EXPERIMENTAL /// Reset [embedders](https://www.meilisearch.com/docs/learn/experimental/vector_search) of the [Index]. 
/// /// # Example @@ -1959,7 +1958,6 @@ impl Index { /// # index.delete().await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); /// # }); /// ``` - #[cfg(feature = "experimental-vector-search")] pub async fn reset_embedders(&self) -> Result { self.client .http_client @@ -2008,7 +2006,6 @@ mod tests { assert_eq!(faceting, res); } - #[cfg(feature = "experimental-vector-search")] #[meilisearch_test] async fn test_get_embeddings(index: Index) { let res = index.get_embedders().await.unwrap(); @@ -2042,7 +2039,6 @@ mod tests { assert_eq!(faceting, res); } - #[cfg(feature = "experimental-vector-search")] #[meilisearch_test] async fn test_reset_embedders(client: Client, index: Index) { let features = crate::features::ExperimentalFeatures::new(&client) @@ -2235,7 +2231,6 @@ mod tests { assert_eq!(expected, res); } - #[cfg(feature = "experimental-vector-search")] #[meilisearch_test] async fn test_set_embedding_settings(client: Client, index: Index) { let features = crate::features::ExperimentalFeatures::new(&client) From 5e72f98ac2a481947fd95915d46b64c33dab3854 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Sun, 7 Jul 2024 23:52:37 +0200 Subject: [PATCH 05/23] feat: added support for ollama and rest --- src/settings.rs | 180 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 172 insertions(+), 8 deletions(-) diff --git a/src/settings.rs b/src/settings.rs index 4e1f0c4d..bc50072a 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -37,38 +37,54 @@ pub struct FacetingSettings { } /// EXPERIMENTAL -/// Allows configuring semantic seaarching +/// Allows configuring semantic searching #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] #[serde(rename_all = "camelCase", tag = "source")] pub enum Embedder { - /// Compute embeddings locally. - /// This is a resource-intensive operation and might affect indexing performance. + /// Compute embeddings inside meilisearch with models from [HuggingFace](https://huggingface.co/). 
+ /// This is a resource-intensive operation and might affect indexing performance negatively. HuggingFace(HuggingFaceEmbedderSettings), /// Use OpenAi's API to generate embeddings + /// Depending on hardware, this is a OpenAi(OpenapiEmbedderSettings), + /// [Ollama](https://ollama.com/) is a framework for building and running language models locally. + /// This is a resource-intensive operation and might affect indexing performance negatively. + Ollama(OllamaEmbedderSettings), + /// Supports arbitrary embedders which supply a [REST](https://en.wikipedia.org/wiki/REST) interface + REST(GenericRESTEmbedderSettings), /// Provide custom embeddings. /// In this case, you must manually update your embeddings when adding, updating, and removing documents to your index. UserProvided(UserProvidedEmbedderSettings), } +/// EXPERIMENTAL #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] #[serde(rename_all = "camelCase")] pub struct HuggingFaceEmbedderSettings { - /// the BERT embedding model you want to use from HuggingFace - /// Defaults to `BAAI/bge-base-en-v1.5` + /// the [BERT embedding model](https://en.wikipedia.org/wiki/BERT_(language_model)) you want to use from [HuggingFace](https://huggingface.co) + /// Defaults to `"BAAI/bge-base-en-v1.5"` #[serde(skip_serializing_if = "Option::is_none")] pub model: Option, + /// revisions allow you to pin a specific version of a model, using a commit hash, tag or branch + /// this allows (according to [huggingface](https://huggingface.co/transformers/v4.8.2/model_sharing.html)): + /// - built-in versioning + /// - access control + /// - scalability #[serde(skip_serializing_if = "Option::is_none")] pub revision: Option, - /// if present, document_template must be a [Liquid template](https://shopify.github.io/liquid/). + /// Use it to customize the data you send to the embedder. It is highly recommended you configure a custom template for your documents. 
+ /// + /// if present, `document_template` must be a [Liquid template](https://shopify.github.io/liquid/). /// Use `{{ doc.attribute }}` to access document field values. /// Meilisearch also exposes a `{{ fields }}` array containing one object per document field, which you may access with `{{ field.name }}` and `{{ field.value }}`. /// /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. + /// Example: "A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}" #[serde(skip_serializing_if = "Option::is_none")] pub document_template: Option, } +/// EXPERIMENTAL #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] #[serde(rename_all = "camelCase")] pub struct OpenapiEmbedderSettings { @@ -83,22 +99,169 @@ pub struct OpenapiEmbedderSettings { /// Defaults to the default for said model name #[serde(skip_serializing_if = "Option::is_none")] pub dimensions: Option, - /// if present, document_template must be a [Liquid template](https://shopify.github.io/liquid/). + /// Use it to customize the data you send to the embedder. It is highly recommended you configure a custom template for your documents. + /// + /// if present, `document_template` must be a [Liquid template](https://shopify.github.io/liquid/). + /// Use `{{ doc.attribute }}` to access document field values. + /// Meilisearch also exposes a `{{ fields }}` array containing one object per document field, which you may access with `{{ field.name }}` and `{{ field.value }}`. + /// + /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. 
+ /// Example: "A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}" + #[serde(skip_serializing_if = "Option::is_none")] + pub document_template: Option, +} + +/// EXPERIMENTAL +/// +/// # Example +/// ``` +/// # use meilisearch_sdk::settings::OllamaEmbedderSettings; +/// let embedder_setting = OllamaEmbedderSettings { +/// url: Some("http://localhost:11434/api/embeddings".to_string()), +/// api_key: Some("foobarbaz".to_string()), +/// model: "nomic-embed-text".to_string(), +/// document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()), +/// }; +/// # let expected = r#"{"url":"http://localhost:11434/api/embeddings","apiKey":"foobarbaz","model":"nomic-embed-text","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}"}"#; +/// # let expected: OllamaEmbedderSettings = serde_json::from_str(expected).unwrap(); +/// # assert_eq!(embedder_setting, expected); +/// ``` +#[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct OllamaEmbedderSettings { + /// Optional, full URL to the embedding endpoint. + /// Must be parseable as a URL. 
+ /// If not specified, [Meilisearch](https://www.meilisearch.com/) (**not the sdk you are currently using**) will try to fetch the `MEILI_OLLAMA_URL` environment variable + /// Example: `"http://localhost:11434/api/embeddings"` + #[serde(skip_serializing_if = "Option::is_none")] + pub url: Option, + /// Optional, token used to authenticate against [Ollama](https://ollama.com/) + /// Example: `"foobarbaz"` + #[serde(skip_serializing_if = "Option::is_none")] + pub api_key: Option, + /// See https://ollama.com/library?q=embed for suitable embedding models + /// + /// # Example embedding models + /// + /// | Model | Parameter | Size | + /// |--------------------------|--------------|-----------------------------------------------------------------| + /// | `mxbai-embed-large` | `334M` | [View model](https://ollama.com/library/mxbai-embed-large) | + /// | `nomic-embed-text` | `137M` | [View model](https://ollama.com/library/nomic-embed-text) | + /// | `all-minilm` | `23M`,`33M` | [View model](https://ollama.com/library/all-minilm) | + /// | `snowflake-arctic-embed` | varies | [View model](https://ollama.com/library/snowflake-arctic-embed) | + pub model: String, + /// Use it to customize the data you send to the embedder. It is highly recommended you configure a custom template for your documents. + /// + /// if present, `document_template` must be a [Liquid template](https://shopify.github.io/liquid/). + /// Use `{{ doc.attribute }}` to access document field values. + /// Meilisearch also exposes a `{{ fields }}` array containing one object per document field, which you may access with `{{ field.name }}` and `{{ field.value }}`. + /// + /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. 
+ /// Example: "A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}" + #[serde(skip_serializing_if = "Option::is_none")] + pub document_template: Option, +} + +/// EXPERIMENTAL +/// +/// # Example +/// ``` +/// # use std::collections::HashMap; +/// # use meilisearch_sdk::settings::{GenericRestEmbedderSettings,GenericRestInputType}; +/// use serde_json::Value; +/// let embedder_setting = GenericRestEmbedderSettings { +/// url: Some("http://localhost:12345/api/v1/embed".to_string()), +/// api_key: Some("SOURCE_API_KEY".to_string()), +/// dimensions: Some(512), +/// document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()), +/// input_field: vec!["data".to_string(), "text".to_string()], +/// input_type: Some(GenericRestInputType::Text), +/// query: HashMap::from([("model".to_string(), Value::from("MODEL_NAME")), ("dimensions".to_string(), Value::from(512))]), +/// path_to_embeddings: vec!["data".to_string()], +/// embedding_object: vec!["embedding".to_string()], +/// }; +/// # let expected = r#"{"url":"http://localhost:12345/api/v1/embed","apiKey":"SOURCE_API_KEY","dimensions":512,"documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}","inputField":["data","text"],"inputType":"text","query":{"dimensions":512,"model":"MODEL_NAME"},"pathToEmbeddings":["data"],"embeddingObject":["embedding"]}"#; +/// # let expected: GenericRestEmbedderSettings = serde_json::from_str(expected).unwrap(); +/// # assert_eq!(embedder_setting, expected); +/// ``` +#[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct GenericRestEmbedderSettings { + /// Mandatory, full URL to the embedding endpoint + /// Must be parseable as a URL. 
+ /// If not specified, [Meilisearch](https://www.meilisearch.com/) (**not the sdk you are currently using**) will try to fetch the `MEILI_OLLAMA_URL` environment variable + /// Example: `"http://localhost:12345/api/v1/embed"` + pub url: Option, + /// Optional, passed as Bearer in the Authorization header + /// Example: `"187HFLDH97CNHN"` + #[serde(skip_serializing_if = "Option::is_none")] + pub api_key: Option, + /// Optional + /// Inferred with a dummy request if missing + #[serde(skip_serializing_if = "Option::is_none")] + pub dimensions: Option, + /// Use it to customize the data you send to the embedder. It is highly recommended you configure a custom template for your documents. + /// + /// if present, `document_template` must be a [Liquid template](https://shopify.github.io/liquid/). /// Use `{{ doc.attribute }}` to access document field values. /// Meilisearch also exposes a `{{ fields }}` array containing one object per document field, which you may access with `{{ field.name }}` and `{{ field.value }}`. /// /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. + /// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"` #[serde(skip_serializing_if = "Option::is_none")] pub document_template: Option, + /// Optional + /// Inject texts in `data.text` in the query + /// Determines what name they use for that input. 
+ /// + /// Default: [] + /// Example: `["data", "text"]` + #[serde(skip_serializing_if = "Vec::is_empty")] + pub input_field: Vec, + /// Optional + /// Default: [`GenericRestInputType::Text`] + #[serde(skip_serializing_if = "Option::is_none")] + pub input_type: Option, + /// Optional, defaults to {} + /// + /// Example: + /// ```json + /// { + /// "model": "MODEL_NAME", + /// "dimensions": 512 + /// } + /// ``` + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub query: HashMap, + /// Optional + /// Defaults to [] + /// Example: `["data"]` + #[serde(skip_serializing_if = "Vec::is_empty")] + pub path_to_embeddings: Vec, + /// Optional + /// Defaults to [] + /// Example: `["embedding"]` + #[serde(skip_serializing_if = "Vec::is_empty")] + pub embedding_object: Vec, } +#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] +#[serde(rename_all = "camelCase")] +pub enum GenericRestInputType { + /// indicates that the model accepts a single text + Text, + /// indicates that the model accepts an array of texts + TextArray, +} + +/// EXPERIMENTAL #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq, Copy)] pub struct UserProvidedEmbedderSettings { /// dimensions of your custom embedding pub dimensions: usize, } -/// Struct reprensenting a set of settings. +/// Struct representing a set of settings. /// /// You can build this struct using the builder syntax. /// @@ -165,6 +328,7 @@ pub struct Settings { /// Proximity precision settings. 
#[serde(skip_serializing_if = "Option::is_none")] pub proximity_precision: Option, + /// EXPERIMENTAL /// Settings how the embeddings for the experimental vector search feature are generated #[serde(skip_serializing_if = "Option::is_none")] pub embedders: Option>, From 75e5585f2d5adde6ad4f08cc350809612f09726d Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Mon, 8 Jul 2024 01:20:43 +0200 Subject: [PATCH 06/23] chore: improved the documentation --- src/search.rs | 1 + src/settings.rs | 31 +++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/search.rs b/src/search.rs index d4cced54..5245cf82 100644 --- a/src/search.rs +++ b/src/search.rs @@ -142,6 +142,7 @@ pub enum Selectors { All, } +/// EXPERIMENTAL #[derive(Debug, Serialize, Clone)] #[serde(rename_all = "camelCase")] pub struct HybridSearch<'a> { diff --git a/src/settings.rs b/src/settings.rs index bc50072a..2cd7ba4d 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -42,22 +42,35 @@ pub struct FacetingSettings { #[serde(rename_all = "camelCase", tag = "source")] pub enum Embedder { /// Compute embeddings inside meilisearch with models from [HuggingFace](https://huggingface.co/). + /// You may be able to significantly improve performance by [compiling a CUDA-compatible Meilisearch binary](https://www.meilisearch.com/docs/guides/ai/computing_hugging_face_embeddings_gpu). /// This is a resource-intensive operation and might affect indexing performance negatively. HuggingFace(HuggingFaceEmbedderSettings), /// Use OpenAi's API to generate embeddings /// Depending on hardware, this is a OpenAi(OpenapiEmbedderSettings), /// [Ollama](https://ollama.com/) is a framework for building and running language models locally. - /// This is a resource-intensive operation and might affect indexing performance negatively. 
Ollama(OllamaEmbedderSettings), /// Supports arbitrary embedders which supply a [REST](https://en.wikipedia.org/wiki/REST) interface - REST(GenericRESTEmbedderSettings), + REST(GenericRestEmbedderSettings), /// Provide custom embeddings. /// In this case, you must manually update your embeddings when adding, updating, and removing documents to your index. UserProvided(UserProvidedEmbedderSettings), } /// EXPERIMENTAL +/// +/// # Example +/// ``` +/// # use meilisearch_sdk::settings::HuggingFaceEmbedderSettings; +/// let embedder_setting = HuggingFaceEmbedderSettings { +/// model: Some("BAAI/bge-base-en-v1.5".to_string()), +/// document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()), +/// ..Default::default() +/// }; +/// # let expected = r#"{"model":"BAAI/bge-base-en-v1.5","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}"}"#; +/// # let expected: HuggingFaceEmbedderSettings = serde_json::from_str(expected).unwrap(); +/// # assert_eq!(embedder_setting, expected); +/// ``` #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] #[serde(rename_all = "camelCase")] pub struct HuggingFaceEmbedderSettings { @@ -85,6 +98,20 @@ pub struct HuggingFaceEmbedderSettings { } /// EXPERIMENTAL +/// +/// # Example +/// ``` +/// # use meilisearch_sdk::settings::OpenapiEmbedderSettings; +/// let embedder_setting = OpenapiEmbedderSettings { +/// api_key: "anOpenAiApiKey".to_string(), +/// model: Some("text-embedding-3-small".to_string()), +/// document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()), +/// ..Default::default() +/// }; +/// # let expected = r#"{"apiKey":"anOpenAiApiKey","model":"text-embedding-3-small","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}","dimensions": 
1536"}"#; +/// # let expected: OpenapiEmbedderSettings = serde_json::from_str(expected).unwrap(); +/// # assert_eq!(embedder_setting, expected); +/// ``` #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] #[serde(rename_all = "camelCase")] pub struct OpenapiEmbedderSettings { From 464de4459dd6f0b91b88ad025733f73ee91cfaae Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Mon, 8 Jul 2024 03:08:08 +0200 Subject: [PATCH 07/23] feat: implemented the `retrieve_vectors` flag --- src/search.rs | 113 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 86 insertions(+), 27 deletions(-) diff --git a/src/search.rs b/src/search.rs index 5245cf82..4b9a4bc0 100644 --- a/src/search.rs +++ b/src/search.rs @@ -364,6 +364,14 @@ pub struct SearchQuery<'a, Http: HttpClient> { /// Defines what vectors an userprovided embedder has gotten for semantic searching #[serde(skip_serializing_if = "Option::is_none")] pub vector: Option<&'a [f32]>, + + /// EXPERIMENTAL + /// Defines whether vectors for semantic searching are returned in the search results + /// Can significantly increase the response size. 
+ /// + /// **Default: `false`** + #[serde(skip_serializing_if = "Option::is_none")] + retrieve_vectors: Option } #[allow(missing_docs)] @@ -395,6 +403,7 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { index_uid: None, hybrid: None, vector: None, + retrieve_vectors: None, } } pub fn with_query<'b>(&'b mut self, query: &'a str) -> &'b mut SearchQuery<'a, Http> { @@ -487,6 +496,13 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { self.filter = Some(Filter::new(Either::Right(filter))); self } + pub fn with_retrieve_vectors<'b>( + &'b mut self, + retrieve_vectors: bool, + ) -> &'b mut SearchQuery<'a, Http> { + self.retrieve_vectors = Some(retrieve_vectors); + self + } pub fn with_facets<'b>( &'b mut self, facets: Selectors<&'a [&'a str]>, @@ -694,7 +710,23 @@ mod tests { kind: String, number: i32, nested: Nested, - _vectors: HashMap>, + #[serde(default)] + _vectors: Option, + } + + + #[derive(Debug, Serialize, Deserialize, PartialEq)] + struct Vector { + embeddings: Vec>, + regenerate: bool, + } + #[derive(Debug, Serialize, Deserialize, PartialEq)] + struct Vectors(HashMap); + + impl From<&[f32; 1]> for Vectors { + fn from(value: &[f32;1]) -> Self { + Vectors(HashMap::from([(S("default"), Vector { embeddings: Vec::from([value.to_vec()]), regenerate:false })])) + } } impl PartialEq> for Document { @@ -708,16 +740,16 @@ mod tests { async fn setup_test_index(client: &Client, index: &Index) -> Result<(), Error> { let t0 = index.add_documents(&[ - Document { id: 0, kind: "text".into(), number: 0, value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), nested: Nested { child: S("first") }, _vectors: HashMap::from([(S("default"), vec![1000.0])])}, - Document { id: 1, kind: "text".into(), number: 10, value: S("dolor sit amet, consectetur adipiscing elit"), nested: Nested { child: S("second") }, _vectors: HashMap::from([(S("default"), vec![2000.0])]) }, - Document { id: 2, kind: "title".into(), number: 20, value: S("The Social Network"), nested: Nested { child: S("third") }, _vectors: HashMap::from([(S("default"), vec![3000.0])]) }, - Document { id: 3, kind: "title".into(), number: 30, value: S("Harry Potter and the Sorcerer's Stone"), nested: Nested { child: S("fourth") }, _vectors: HashMap::from([(S("default"), vec![4000.0])]) }, - Document { id: 4, kind: "title".into(), number: 40, value: S("Harry Potter and the Chamber of Secrets"), nested: Nested { child: S("fift") }, _vectors: HashMap::from([(S("default"), vec![5000.0])]) }, - Document { id: 5, kind: "title".into(), number: 50, value: S("Harry Potter and the Prisoner of Azkaban"), nested: Nested { child: S("sixth") }, _vectors: HashMap::from([(S("default"), vec![6000.0])]) }, - Document { id: 6, kind: "title".into(), number: 60, value: S("Harry Potter and the Goblet of Fire"), nested: Nested { child: S("seventh") }, _vectors: HashMap::from([(S("default"), vec![7000.0])]) }, - Document { id: 7, kind: "title".into(), number: 70, value: S("Harry Potter and the Order of the Phoenix"), nested: Nested { child: S("eighth") }, _vectors: HashMap::from([(S("default"), vec![8000.0])]) }, - Document { id: 8, kind: "title".into(), number: 80, value: S("Harry Potter and the Half-Blood Prince"), nested: Nested { child: S("ninth") }, _vectors: HashMap::from([(S("default"), vec![9000.0])]) }, - Document { id: 9, kind: "title".into(), number: 90, value: S("Harry Potter and the Deathly Hallows"), nested: Nested { child: S("tenth") }, _vectors: 
HashMap::from([(S("default"), vec![10000.0])]) }, + Document { id: 0, kind: "text".into(), number: 0, value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), nested: Nested { child: S("first") }, _vectors: Some(Vectors::from(&[1000.0]))}, + Document { id: 1, kind: "text".into(), number: 10, value: S("dolor sit amet, consectetur adipiscing elit"), nested: Nested { child: S("second") }, _vectors: Some(Vectors::from(&[2000.0])) }, + Document { id: 2, kind: "title".into(), number: 20, value: S("The Social Network"), nested: Nested { child: S("third") }, _vectors: Some(Vectors::from(&[3000.0])) }, + Document { id: 3, kind: "title".into(), number: 30, value: S("Harry Potter and the Sorcerer's Stone"), nested: Nested { child: S("fourth") }, _vectors: Some(Vectors::from(&[4000.0])) }, + Document { id: 4, kind: "title".into(), number: 40, value: S("Harry Potter and the Chamber of Secrets"), nested: Nested { child: S("fift") }, _vectors: Some(Vectors::from(&[5000.0])) }, + Document { id: 5, kind: "title".into(), number: 50, value: S("Harry Potter and the Prisoner of Azkaban"), nested: Nested { child: S("sixth") }, _vectors: Some(Vectors::from(&[6000.0])) }, + Document { id: 6, kind: "title".into(), number: 60, value: S("Harry Potter and the Goblet of Fire"), nested: Nested { child: S("seventh") }, _vectors: Some(Vectors::from(&[7000.0])) }, + Document { id: 7, kind: "title".into(), number: 70, value: S("Harry Potter and the Order of the Phoenix"), nested: Nested { child: S("eighth") }, _vectors: Some(Vectors::from(&[8000.0])) }, + Document { id: 8, kind: "title".into(), number: 
80, value: S("Harry Potter and the Half-Blood Prince"), nested: Nested { child: S("ninth") }, _vectors: Some(Vectors::from(&[9000.0])) }, + Document { id: 9, kind: "title".into(), number: 90, value: S("Harry Potter and the Deathly Hallows"), nested: Nested { child: S("tenth") }, _vectors: Some(Vectors::from(&[10000.0])) }, ], None).await?; let t1 = index .set_filterable_attributes(["kind", "value", "number"]) @@ -806,7 +838,9 @@ mod tests { kind: S("text"), number: 10, nested: Nested { child: S("second") }, - _vectors: HashMap::from([(S("default"), vec![2000.0])]), + // TODO: This is likely a bug upstream. vectors should not be present + // => correct would be `_vectors: None` + _vectors: Some(Vectors::from(&[2000.0])), }, &results.hits[0].result ); @@ -979,7 +1013,7 @@ mod tests { kind: S("text"), number: 0, nested: Nested { child: S("first") }, - _vectors: HashMap::from([(S("default"), vec![1000.0])]) + _vectors: None, }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -995,7 +1029,7 @@ mod tests { kind: S("text"), number: 0, nested: Nested { child: S("first") }, - _vectors: HashMap::from([(S("default"), vec![1000.0])]) + _vectors: None, }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -1017,7 +1051,7 @@ mod tests { kind: S("text"), number: 0, nested: Nested { child: S("first") }, - _vectors: HashMap::from([(S("default"), vec![1000.0])]) + _vectors: None, }, results.hits[0].formatted_result.as_ref().unwrap()); @@ -1033,7 +1067,7 @@ mod tests { kind: S("text"), number: 0, nested: Nested { child: S("first") }, - _vectors: HashMap::from([(S("default"), vec![1000.0])]) + _vectors: None, }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -1059,7 +1093,7 @@ mod tests { kind: S("text"), number: 0, nested: Nested { child: S("first") }, - _vectors: HashMap::from([(S("default"), vec![1000.0])]), + _vectors: None, }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -1087,7 +1121,7 @@ mod tests { kind: S("title"), number: 20, nested: 
Nested { child: S("third") }, - _vectors: HashMap::from([(S("default"), vec![3000.0])]) + _vectors: None, }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -1110,7 +1144,7 @@ mod tests { kind: S("text"), number: 10, nested: Nested { child: S("second") }, - _vectors: HashMap::from([(S("default"), vec![1000.0])]), + _vectors: None, }, results.hits[0].formatted_result.as_ref().unwrap(), ); @@ -1126,7 +1160,7 @@ mod tests { kind: S("text"), number: 10, nested: Nested { child: S("second") }, - _vectors: HashMap::from([(S("default"), vec![2000.0])]) + _vectors: None, }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -1284,18 +1318,16 @@ mod tests { Ok(()) } - #[meilisearch_test] - async fn test_hybrid(client: Client, index: Index) -> Result<(), Error> { + /// enable vector searching and configure an userProvided embedder + async fn setup_hybrid_searching(client: &Client, index: &Index) -> Result<(), Error> { use crate::settings::{Embedder, UserProvidedEmbedderSettings}; - // enable vector searching and configure an embedder let features = crate::features::ExperimentalFeatures::new(&client) .set_vector_store(true) .update() .await .expect("could not enable the vector store"); assert_eq!(features.vector_store, true); - let embedder_setting = - Embedder::UserProvided(UserProvidedEmbedderSettings { dimensions: 1 }); + let embedder_setting = Embedder::UserProvided(UserProvidedEmbedderSettings { dimensions: 1 }); let t3 = index .set_settings(&crate::settings::Settings { embedders: Some(HashMap::from([("default".to_string(), embedder_setting)])), @@ -1303,7 +1335,34 @@ mod tests { }) .await?; t3.wait_for_completion(&client, None, None).await?; + Ok(()) + } + + #[meilisearch_test] + async fn test_with_vectors(client: Client, index: Index) -> Result<(), Error> { + setup_hybrid_searching(&client, &index).await?; + setup_test_index(&client, &index).await?; + let results: SearchResults = index.search() + .with_query("lorem ipsum") + .with_retrieve_vectors(true) + 
.execute().await?; + assert_eq!(results.hits.len(), 1); + let expected = Vectors::from(&[1000.0]); + assert_eq!(results.hits[0].result._vectors, Some(expected)); + + let results: SearchResults = index.search() + .with_query("lorem ipsum") + .with_retrieve_vectors(false) + .execute().await?; + assert_eq!(results.hits.len(), 1); + assert_eq!(results.hits[0].result._vectors, None); + Ok(()) + } + + #[meilisearch_test] + async fn test_hybrid(client: Client, index: Index) -> Result<(), Error> { + setup_hybrid_searching(&client, &index).await?; setup_test_index(&client, &index).await?; // "2nd" = "second" @@ -1327,7 +1386,7 @@ mod tests { kind: S("text"), number: 10, nested: Nested { child: S("second") }, - _vectors: HashMap::from([(S("default"), vec![2000.0])]) + _vectors: None, }, &results.hits[0].result ); @@ -1357,7 +1416,7 @@ mod tests { kind: S("text"), number: 0, nested: Nested { child: S("first") }, - _vectors: HashMap::from([(S("default"), vec![1000.0])]), + _vectors: None, }, &results.hits[0].result ); From 980e714e818b7bbecb201c3131af48a00270c2fd Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Mon, 8 Jul 2024 15:04:14 +0200 Subject: [PATCH 08/23] chore: made sure that `test_hybrid` uses a mocked server --- src/search.rs | 80 ++++++++++++++++----------------------------------- 1 file changed, 24 insertions(+), 56 deletions(-) diff --git a/src/search.rs b/src/search.rs index 2d2a9bff..842df7af 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1409,66 +1409,34 @@ mod tests { Ok(()) } - #[meilisearch_test] - async fn test_hybrid(client: Client, index: Index) -> Result<(), Error> { - setup_hybrid_searching(&client, &index).await?; - setup_test_index(&client, &index).await?; - - // "2nd" = "second" - // no semantic searching => no matches - let results: SearchResults = index.search().with_query("2nd").execute().await?; - assert_eq!(results.hits.len(), 0); - - // an embedding should be able to detect that this is equivalent, but not the regular search - let 
results: SearchResults = index - .search() - .with_query("2nd") - .with_vector(&[2000.0]) - .with_hybrid("default", 1.0) // entirely rely on semantic searching - .execute() - .await?; - assert_eq!(results.hits.len(), 1); - assert_eq!( - &Document { - id: 1, - value: S("dolor sit amet, consectetur adipiscing elit"), - kind: S("text"), - number: 10, - nested: Nested { child: S("second") }, - _vectors: None, - }, - &results.hits[0].result - ); - - // word that has a typo => would have been found via traditional means - // if entirely relying on semantic searching, no result is found - let results: SearchResults = index + #[tokio::test] + async fn test_hybrid() -> Result<(), Error> { + // this is mocked as I could not get the hybrid searching to work + // See https://github.com/meilisearch/meilisearch-rust/pull/554 for further context + let mut s = mockito::Server::new_async().await; + let mock_server_url = s.url(); + let client = Client::new(mock_server_url, None::)?; + let index = client.index("mocked_index"); + + let req = r#"{"q":"hello hybrid searching","hybrid":{"embedder":"default","semanticRatio":0.0},"vector":[1000.0]}"#.to_string(); + let response = r#"{"hits":[],"offset":null,"limit":null,"estimatedTotalHits":null,"page":null,"hitsPerPage":null,"totalHits":null,"totalPages":null,"facetDistribution":null,"facetStats":null,"processingTimeMs":0,"query":"","indexUid":null}"#.to_string(); + let mock_res = s + .mock("POST", "/indexes/mocked_index/search") + .with_status(200) + .match_body(mockito::Matcher::Exact(req)) + .with_body(&response) + .expect(1) + .create_async() + .await; + let results: Result, Error> = index .search() - .with_query("lohrem") - .with_hybrid("default", 1.0) - .with_vector(&[1000.0]) - .execute() - .await?; - assert_eq!(results.hits.len(), 0); - let results: SearchResults = index - .search() - .with_query("lohrem") + .with_query("hello hybrid searching") .with_hybrid("default", 0.0) .with_vector(&[1000.0]) .execute() - .await?; - 
assert_eq!(results.hits.len(), 1); - assert_eq!( - &Document { - id: 0, - value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), - kind: S("text"), - number: 0, - nested: Nested { child: S("first") }, - _vectors: None, - }, - &results.hits[0].result - ); + .await; + mock_res.assert_async().await; + results?; // purposely not done above to have better debugging output Ok(()) } From cbac495eb7c702a164a6d157e28bb5c84f9f9867 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Mon, 8 Jul 2024 15:05:01 +0200 Subject: [PATCH 09/23] chore: formatting fixes --- src/search.rs | 29 ++++++++++++++++++++--------- src/settings.rs | 14 +++++++------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/src/search.rs b/src/search.rs index 842df7af..5ebb0669 100644 --- a/src/search.rs +++ b/src/search.rs @@ -379,7 +379,7 @@ pub struct SearchQuery<'a, Http: HttpClient> { /// /// **Default: `false`** #[serde(skip_serializing_if = "Option::is_none")] - retrieve_vectors: Option + retrieve_vectors: Option, } #[allow(missing_docs)] @@ -734,18 +734,24 @@ mod tests { _vectors: Option, } - #[derive(Debug, Serialize, Deserialize, PartialEq)] struct Vector { embeddings: Vec>, regenerate: bool, } + #[derive(Debug, Serialize, Deserialize, PartialEq)] struct Vectors(HashMap); impl From<&[f32; 1]> for Vectors { - fn from(value: &[f32;1]) -> Self { - Vectors(HashMap::from([(S("default"), Vector { embeddings: Vec::from([value.to_vec()]), regenerate:false })])) + fn from(value: &[f32; 1]) -> Self { + Vectors(HashMap::from([( + S("default"), + Vector { + embeddings: Vec::from([value.to_vec()]), 
+ regenerate: false, + }, + )])) } } @@ -1376,7 +1382,8 @@ mod tests { .await .expect("could not enable the vector store"); assert_eq!(features.vector_store, true); - let embedder_setting = Embedder::UserProvided(UserProvidedEmbedderSettings { dimensions: 1 }); + let embedder_setting = + Embedder::UserProvided(UserProvidedEmbedderSettings { dimensions: 1 }); let t3 = index .set_settings(&crate::settings::Settings { embedders: Some(HashMap::from([("default".to_string(), embedder_setting)])), @@ -1392,18 +1399,22 @@ mod tests { setup_hybrid_searching(&client, &index).await?; setup_test_index(&client, &index).await?; - let results: SearchResults = index.search() + let results: SearchResults = index + .search() .with_query("lorem ipsum") .with_retrieve_vectors(true) - .execute().await?; + .execute() + .await?; assert_eq!(results.hits.len(), 1); let expected = Vectors::from(&[1000.0]); assert_eq!(results.hits[0].result._vectors, Some(expected)); - let results: SearchResults = index.search() + let results: SearchResults = index + .search() .with_query("lorem ipsum") .with_retrieve_vectors(false) - .execute().await?; + .execute() + .await?; assert_eq!(results.hits.len(), 1); assert_eq!(results.hits[0].result._vectors, None); Ok(()) diff --git a/src/settings.rs b/src/settings.rs index 2cd7ba4d..51fc3b44 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -68,7 +68,7 @@ pub enum Embedder { /// ..Default::default() /// }; /// # let expected = r#"{"model":"BAAI/bge-base-en-v1.5","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}"}"#; -/// # let expected: HuggingFaceEmbedderSettings = serde_json::from_str(expected).unwrap(); +/// # let expected: HuggingFaceEmbedderSettings = serde_json::from_str(expected).unwrap(); /// # assert_eq!(embedder_setting, expected); /// ``` #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] @@ -109,7 +109,7 @@ pub struct HuggingFaceEmbedderSettings { /// 
..Default::default() /// }; /// # let expected = r#"{"apiKey":"anOpenAiApiKey","model":"text-embedding-3-small","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}","dimensions": 1536"}"#; -/// # let expected: OpenapiEmbedderSettings = serde_json::from_str(expected).unwrap(); +/// # let expected: OpenapiEmbedderSettings = serde_json::from_str(expected).unwrap(); /// # assert_eq!(embedder_setting, expected); /// ``` #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] @@ -150,7 +150,7 @@ pub struct OpenapiEmbedderSettings { /// document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()), /// }; /// # let expected = r#"{"url":"http://localhost:11434/api/embeddings","apiKey":"foobarbaz","model":"nomic-embed-text","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}"}"#; -/// # let expected: OllamaEmbedderSettings = serde_json::from_str(expected).unwrap(); +/// # let expected: OllamaEmbedderSettings = serde_json::from_str(expected).unwrap(); /// # assert_eq!(embedder_setting, expected); /// ``` #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] @@ -170,10 +170,10 @@ pub struct OllamaEmbedderSettings { /// /// # Example embedding models /// - /// | Model | Parameter | Size | + /// | Model | Parameter | Size | /// |--------------------------|--------------|-----------------------------------------------------------------| - /// | `mxbai-embed-large` | `334M` | [View model](https://ollama.com/library/mxbai-embed-large) | - /// | `nomic-embed-text` | `137M` | [View model](https://ollama.com/library/nomic-embed-text) | + /// | `mxbai-embed-large` | `334M` | [View model](https://ollama.com/library/mxbai-embed-large) | + /// | `nomic-embed-text` | `137M` | [View model](https://ollama.com/library/nomic-embed-text) | /// | `all-minilm` | 
`23M`,`33M` | [View model](https://ollama.com/library/all-minilm) | /// | `snowflake-arctic-embed` | varies | [View model](https://ollama.com/library/snowflake-arctic-embed) | pub model: String, @@ -208,7 +208,7 @@ pub struct OllamaEmbedderSettings { /// embedding_object: vec!["embedding".to_string()], /// }; /// # let expected = r#"{"url":"http://localhost:12345/api/v1/embed","apiKey":"SOURCE_API_KEY","dimensions":512,"documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}","inputField":["data","text"],"inputType":"text","query":{"dimensions":512,"model":"MODEL_NAME"},"pathToEmbeddings":["data"],"embeddingObject":["embedding"]}"#; -/// # let expected: GenericRestEmbedderSettings = serde_json::from_str(expected).unwrap(); +/// # let expected: GenericRestEmbedderSettings = serde_json::from_str(expected).unwrap(); /// # assert_eq!(embedder_setting, expected); /// ``` #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] From 77399a2f493ba81f06b51fe6f382b618ddc876a5 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Fri, 12 Jul 2024 22:32:40 +0200 Subject: [PATCH 10/23] chore: fixed a typo in a doc-comment --- src/settings.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/settings.rs b/src/settings.rs index 51fc3b44..5278101e 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -106,9 +106,10 @@ pub struct HuggingFaceEmbedderSettings { /// api_key: "anOpenAiApiKey".to_string(), /// model: Some("text-embedding-3-small".to_string()), /// document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()), +/// dimensions: Some(1536), /// ..Default::default() /// }; -/// # let expected = r#"{"apiKey":"anOpenAiApiKey","model":"text-embedding-3-small","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}","dimensions": 1536"}"#; +/// # let 
expected = r#"{"apiKey":"anOpenAiApiKey","model":"text-embedding-3-small","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}","dimensions":1536}"#; /// # let expected: OpenapiEmbedderSettings = serde_json::from_str(expected).unwrap(); /// # assert_eq!(embedder_setting, expected); /// ``` From 4c94ce54e59298d7ba25739ff7086183df3bf1a2 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Sat, 24 Aug 2024 20:19:55 +0200 Subject: [PATCH 11/23] fix(tests): made the requested changes --- src/features.rs | 15 ++++----------- src/search.rs | 26 ++++++++++++++++---------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/src/features.rs b/src/features.rs index 3001beb4..1f20a6d4 100644 --- a/src/features.rs +++ b/src/features.rs @@ -109,24 +109,17 @@ mod tests { use super::*; use meilisearch_test_macro::meilisearch_test; - #[meilisearch_test] - async fn test_experimental_features_get(client: Client) { - let mut features = ExperimentalFeatures::new(&client); - features.set_vector_store(false); - let _ = features.update().await.unwrap(); - - let res = features.get().await.unwrap(); - - assert!(!res.vector_store); - } - + /// there is purposely no test which disables this feature to prevent impact on other testcases + /// the setting is shared amongst all indexes #[meilisearch_test] async fn test_experimental_features_enable_vector_store(client: Client) { let mut features = ExperimentalFeatures::new(&client); features.set_vector_store(true); let res = features.update().await.unwrap(); + assert!(res.vector_store); + let res = features.get().await.unwrap(); assert!(res.vector_store); } } diff --git a/src/search.rs b/src/search.rs index 5ebb0669..43f5dcb8 100644 --- a/src/search.rs +++ b/src/search.rs @@ -736,10 +736,17 @@ mod tests { #[derive(Debug, Serialize, Deserialize, PartialEq)] struct Vector { - embeddings: Vec>, + embeddings: SingleOrMultipleVectors, regenerate: bool, } + #[derive(Serialize, 
Deserialize, Debug, PartialEq)] + #[serde(untagged)] + enum SingleOrMultipleVectors { + Single(Vec), + Multiple(Vec>), + } + #[derive(Debug, Serialize, Deserialize, PartialEq)] struct Vectors(HashMap); @@ -748,7 +755,7 @@ mod tests { Vectors(HashMap::from([( S("default"), Vector { - embeddings: Vec::from([value.to_vec()]), + embeddings: SingleOrMultipleVectors::Multiple(Vec::from([value.to_vec()])), regenerate: false, }, )])) @@ -765,6 +772,13 @@ mod tests { } async fn setup_test_index(client: &Client, index: &Index) -> Result<(), Error> { + // Vector store is enabled for all to have consistent test runs + // This setting is shared by every index + let features = crate::features::ExperimentalFeatures::new(&client) + .set_vector_store(true) + .update() + .await?; + assert_eq!(features.vector_store, true); let t0 = index.add_documents(&[ Document { id: 0, kind: "text".into(), number: 0, value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), nested: Nested { child: S("first") }, _vectors: Some(Vectors::from(&[1000.0]))}, Document { id: 1, kind: "text".into(), number: 10, value: S("dolor sit amet, consectetur adipiscing elit"), nested: Nested { child: S("second") }, _vectors: Some(Vectors::from(&[2000.0])) }, @@ -864,8 +878,6 @@ mod tests { kind: S("text"), number: 10, nested: Nested { child: S("second") }, - // TODO: This is likely a bug upstream. 
vectors should not be present - // => correct would be `_vectors: None` _vectors: Some(Vectors::from(&[2000.0])), }, &results.hits[0].result @@ -1376,12 +1388,6 @@ mod tests { /// enable vector searching and configure an userProvided embedder async fn setup_hybrid_searching(client: &Client, index: &Index) -> Result<(), Error> { use crate::settings::{Embedder, UserProvidedEmbedderSettings}; - let features = crate::features::ExperimentalFeatures::new(&client) - .set_vector_store(true) - .update() - .await - .expect("could not enable the vector store"); - assert_eq!(features.vector_store, true); let embedder_setting = Embedder::UserProvided(UserProvidedEmbedderSettings { dimensions: 1 }); let t3 = index From 73981850e687a87d7d317dbef7b5ad969e77f78b Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Mon, 26 Aug 2024 17:36:00 +0200 Subject: [PATCH 12/23] fix: changed the rest embedder to the `1.10.0` schema --- src/settings.rs | 74 +++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/src/settings.rs b/src/settings.rs index 5278101e..3801bd4f 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -195,21 +195,30 @@ pub struct OllamaEmbedderSettings { /// # Example /// ``` /// # use std::collections::HashMap; -/// # use meilisearch_sdk::settings::{GenericRestEmbedderSettings,GenericRestInputType}; +/// # use meilisearch_sdk::settings::{GenericRestEmbedderSettings}; /// use serde_json::Value; /// let embedder_setting = GenericRestEmbedderSettings { /// url: Some("http://localhost:12345/api/v1/embed".to_string()), /// api_key: Some("SOURCE_API_KEY".to_string()), /// dimensions: Some(512), /// document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()), -/// input_field: vec!["data".to_string(), "text".to_string()], -/// input_type: Some(GenericRestInputType::Text), -/// query: HashMap::from([("model".to_string(), Value::from("MODEL_NAME")), 
("dimensions".to_string(), Value::from(512))]), -/// path_to_embeddings: vec!["data".to_string()], -/// embedding_object: vec!["embedding".to_string()], +/// request: HashMap::from([ +/// ("model".to_string(), Value::from("MODEL_NAME")), +/// ("prompt".to_string(), Value::from("{{text}}")) +/// ]), +/// response: HashMap::from([ +/// ("model".to_string(), Value::from("{{embedding}}")) +/// ]), /// }; -/// # let expected = r#"{"url":"http://localhost:12345/api/v1/embed","apiKey":"SOURCE_API_KEY","dimensions":512,"documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}","inputField":["data","text"],"inputType":"text","query":{"dimensions":512,"model":"MODEL_NAME"},"pathToEmbeddings":["data"],"embeddingObject":["embedding"]}"#; -/// # let expected: GenericRestEmbedderSettings = serde_json::from_str(expected).unwrap(); +/// # let expected = serde_json::json!({ +/// # "url":"http://localhost:12345/api/v1/embed", +/// # "apiKey":"SOURCE_API_KEY", +/// # "dimensions":512, +/// # "documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}", +/// # "request":{"prompt":"{{text}}","model":"MODEL_NAME"}, +/// # "response":{"model":"{{embedding}}"} +/// # }); +/// # let expected: GenericRestEmbedderSettings = serde_json::from_value(expected).unwrap(); /// # assert_eq!(embedder_setting, expected); /// ``` #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] @@ -238,48 +247,29 @@ pub struct GenericRestEmbedderSettings { /// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"` #[serde(skip_serializing_if = "Option::is_none")] pub document_template: Option, - /// Optional - /// Inject texts in `data.text` in the query - /// Determines what name they use for that input. 
- /// - /// Default: [] - /// Example: `["data", "text"]` - #[serde(skip_serializing_if = "Vec::is_empty")] - pub input_field: Vec, - /// Optional - /// Default: [`GenericRestInputType::Text`] - #[serde(skip_serializing_if = "Option::is_none")] - pub input_type: Option, - /// Optional, defaults to {} + /// A JSON value that represents the request made by Meilisearch to the remote embedder. + /// The text to embed must be replaced by the placeholder value `“{{text}}”`. /// /// Example: /// ```json /// { /// "model": "MODEL_NAME", - /// "dimensions": 512 + /// "prompt": "{{text}}" /// } /// ``` #[serde(skip_serializing_if = "HashMap::is_empty")] - pub query: HashMap, - /// Optional - /// Defaults to [] - /// Example: `["data"]` - #[serde(skip_serializing_if = "Vec::is_empty")] - pub path_to_embeddings: Vec, - /// Optional - /// Defaults to [] - /// Example: `["embedding"]` - #[serde(skip_serializing_if = "Vec::is_empty")] - pub embedding_object: Vec, -} - -#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] -#[serde(rename_all = "camelCase")] -pub enum GenericRestInputType { - /// indicates that the model accepts a single text - Text, - /// indicates that the model accepts a single text - TextArray, + pub request: HashMap, + /// A JSON value that represents a fragment of the response made by the remote embedder to Meilisearch. 
+ /// The embedding must be replaced by the placeholder value `"{{embedding}}"` + /// + /// Example: + /// ```json + /// { + /// "embedding": "{{embedding}}" + /// } + /// ``` + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub response: HashMap, } /// EXPERIMENTAL From 93b0bca2c5a979bc422595b4e96abfd9ace00928 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Thu, 29 Aug 2024 20:30:49 +0200 Subject: [PATCH 13/23] feat: added `headers` support --- src/settings.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/settings.rs b/src/settings.rs index 3801bd4f..d852ee08 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -209,6 +209,9 @@ pub struct OllamaEmbedderSettings { /// response: HashMap::from([ /// ("model".to_string(), Value::from("{{embedding}}")) /// ]), +/// headers: HashMap::from([ +/// ("X-MAGIC".to_string(), "open sesame".to_string()) +/// ]), /// }; /// # let expected = serde_json::json!({ /// # "url":"http://localhost:12345/api/v1/embed", @@ -216,7 +219,8 @@ pub struct OllamaEmbedderSettings { /// # "dimensions":512, /// # "documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}", /// # "request":{"prompt":"{{text}}","model":"MODEL_NAME"}, -/// # "response":{"model":"{{embedding}}"} +/// # "response":{"model":"{{embedding}}"}, +/// # "headers":{"X-MAGIC":"open sesame"} /// # }); /// # let expected: GenericRestEmbedderSettings = serde_json::from_value(expected).unwrap(); /// # assert_eq!(embedder_setting, expected); @@ -270,6 +274,17 @@ pub struct GenericRestEmbedderSettings { /// ``` #[serde(skip_serializing_if = "HashMap::is_empty")] pub response: HashMap, + /// JSON object whose keys represent the name and values of additional headers to send in requests. 
+ /// + /// Embedding requests sent from Meilisearch to a remote REST embedder by default contain these headers: + /// + /// - if `api_key` was provided: `Authorization: Bearer ` + /// - always: `Content-Type: application/json` + /// + /// If `headers` is empty, only `Authorization` and `Content-Type` are sent, as described above. + /// If `headers` contains `Authorization` and `Content-Type`, the declared values will override the ones that are sent by default. + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub headers: HashMap, } /// EXPERIMENTAL From 53083ed018aca0defbbb31a8fd998a5f49686e68 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Sun, 17 Nov 2024 19:40:49 +0100 Subject: [PATCH 14/23] Apply suggestions from code review --- src/settings.rs | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/src/settings.rs b/src/settings.rs index d852ee08..a9e6b46f 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -92,7 +92,16 @@ pub struct HuggingFaceEmbedderSettings { /// Meilisearch also exposes a `{{ fields }}` array containing one object per document field, which you may access with `{{ field.name }}` and `{{ field.value }}`. /// /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. 
- /// Example: "A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}" + /// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"` + /// + /// Default: + /// ```raw + /// {% for field in fields %} + /// {% if field.is_searchable and not field.value == nil %} + /// {{ field.name }}: {{ field.value }}\n + /// {% endif %} + /// {% endfor %} + /// ``` #[serde(skip_serializing_if = "Option::is_none")] pub document_template: Option, } @@ -121,7 +130,7 @@ pub struct OpenapiEmbedderSettings { /// Use [tier 2 keys](https://platform.openai.com/docs/guides/rate-limits/usage-tiers?context=tier-two) or above for optimal performance. pub api_key: String, /// The openapi model name - /// Default: `text-embedding-ada-002` + /// Default: `text-embedding-3-small` #[serde(skip_serializing_if = "Option::is_none")] pub model: Option, /// Defaults to the default for said model name @@ -134,7 +143,16 @@ pub struct OpenapiEmbedderSettings { /// Meilisearch also exposes a `{{ fields }}` array containing one object per document field, which you may access with `{{ field.name }}` and `{{ field.value }}`. /// /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. 
- /// Example: "A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}" + /// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"` + /// + /// Default: + /// ```raw + /// {% for field in fields %} + /// {% if field.is_searchable and not field.value == nil %} + /// {{ field.name }}: {{ field.value }}\n + /// {% endif %} + /// {% endfor %} + /// ``` #[serde(skip_serializing_if = "Option::is_none")] pub document_template: Option, } @@ -185,7 +203,16 @@ pub struct OllamaEmbedderSettings { /// Meilisearch also exposes a `{{ fields }}` array containing one object per document field, which you may access with `{{ field.name }}` and `{{ field.value }}`. /// /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. - /// Example: "A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}" + /// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"` + /// + /// Default: + /// ```raw + /// {% for field in fields %} + /// {% if field.is_searchable and not field.value == nil %} + /// {{ field.name }}: {{ field.value }}\n + /// {% endif %} + /// {% endfor %} + /// ``` #[serde(skip_serializing_if = "Option::is_none")] pub document_template: Option, } @@ -249,6 +276,15 @@ pub struct GenericRestEmbedderSettings { /// /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. 
/// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"` + /// + /// Default: + /// ```raw + /// {% for field in fields %} + /// {% if field.is_searchable and not field.value == nil %} + /// {{ field.name }}: {{ field.value }}\n + /// {% endif %} + /// {% endfor %} + /// ``` #[serde(skip_serializing_if = "Option::is_none")] pub document_template: Option, /// A JSON value that represents the request made by Meilisearch to the remote embedder. From a0b13c65cd8eb1d7ed5bad24f2b4724aa2b4a51d Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Fri, 3 Jan 2025 18:08:10 +0100 Subject: [PATCH 15/23] chore: formatting fix --- src/settings.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/settings.rs b/src/settings.rs index a9e6b46f..70d086a9 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -93,7 +93,7 @@ pub struct HuggingFaceEmbedderSettings { /// /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. /// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"` - /// + /// /// Default: /// ```raw /// {% for field in fields %} @@ -144,7 +144,7 @@ pub struct OpenapiEmbedderSettings { /// /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. /// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"` - /// + /// /// Default: /// ```raw /// {% for field in fields %} @@ -204,7 +204,7 @@ pub struct OllamaEmbedderSettings { /// /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. 
/// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"` - /// + /// /// Default: /// ```raw /// {% for field in fields %} @@ -276,7 +276,7 @@ pub struct GenericRestEmbedderSettings { /// /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields. /// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"` - /// + /// /// Default: /// ```raw /// {% for field in fields %} From 9cd547dd3c4dc4451c5237a23e882fc37dad0548 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Fri, 24 Jan 2025 22:24:59 +0100 Subject: [PATCH 16/23] Remove mentions to the experimental search from the PR --- src/search.rs | 26 ++++++++----------- src/settings.rs | 67 ++++++++++++++++++------------------------------- 2 files changed, 35 insertions(+), 58 deletions(-) diff --git a/src/search.rs b/src/search.rs index a08e6290..59e43bd3 100644 --- a/src/search.rs +++ b/src/search.rs @@ -142,20 +142,28 @@ pub enum Selectors { All, } -/// EXPERIMENTAL +/// Setting whether to utilise previously defined embedders for semantic searching #[derive(Debug, Serialize, Clone)] #[serde(rename_all = "camelCase")] pub struct HybridSearch<'a> { /// Indicates one of the embedders configured for the queried index /// /// **Default: `"default"`** - embedder: &'a str, + pub embedder: &'a str, /// number between `0` and `1`: /// - `0.0` indicates full keyword search /// - `1.0` indicates full semantic search /// /// **Default: `0.5`** - semantic_ratio: f32, + pub semantic_ratio: f32, +} +impl Default for HybridSearch{ + fn default() -> Self { + HybridSearch{ + embedder: "default", + semantic_ratio: 0.5, + } + } } type AttributeToCrop<'a> = (&'a str, Option); @@ -367,17 +375,14 @@ pub struct SearchQuery<'a, Http: HttpClient> { #[serde(skip_serializing_if = "Option::is_none")] pub(crate) index_uid: Option<&'a 
str>, - /// EXPERIMENTAL /// Defines whether to utilise previously defined embedders for semantic searching #[serde(skip_serializing_if = "Option::is_none")] pub hybrid: Option>, - /// EXPERIMENTAL /// Defines what vectors an userprovided embedder has gotten for semantic searching #[serde(skip_serializing_if = "Option::is_none")] pub vector: Option<&'a [f32]>, - /// EXPERIMENTAL /// Defines whether vectors for semantic searching are returned in the search results /// Can Significantly increase the response size. /// @@ -618,7 +623,6 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { self.index_uid = Some(&self.index.uid); self } - /// EXPERIMENTAL /// Defines whether to utilise previously defined embedders for semantic searching pub fn with_hybrid<'b>( &'b mut self, @@ -631,7 +635,6 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { }); self } - /// EXPERIMENTAL /// Defines what vectors an userprovided embedder has gotten for semantic searching pub fn with_vector<'b>(&'b mut self, vector: &'a [f32]) -> &'b mut SearchQuery<'a, Http> { self.vector = Some(vector); @@ -781,13 +784,6 @@ mod tests { } async fn setup_test_index(client: &Client, index: &Index) -> Result<(), Error> { - // Vector store is enabled for all to have consistent test runs - // This setting is shared by every index - let features = crate::features::ExperimentalFeatures::new(&client) - .set_vector_store(true) - .update() - .await?; - assert_eq!(features.vector_store, true); let t0 = index.add_documents(&[ Document { id: 0, kind: "text".into(), number: 0, value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), nested: Nested { child: S("first") }, _vectors: Some(Vectors::from(&[1000.0]))}, Document { id: 1, kind: "text".into(), number: 10, value: S("dolor sit amet, consectetur adipiscing elit"), nested: Nested { child: S("second") }, _vectors: Some(Vectors::from(&[2000.0])) }, diff --git a/src/settings.rs b/src/settings.rs index 604ed071..09e27fa4 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -36,7 +36,6 @@ pub struct FacetingSettings { pub max_values_per_facet: usize, } -/// EXPERIMENTAL /// Allows configuring semantic searching #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] #[serde(rename_all = "camelCase", tag = "source")] @@ -45,19 +44,20 @@ pub enum Embedder { /// You may be able to significantly improve performance by [compiling a CUDA-compatible Meilisearch binary](https://www.meilisearch.com/docs/guides/ai/computing_hugging_face_embeddings_gpu). /// This is a resource-intensive operation and might affect indexing performance negatively. HuggingFace(HuggingFaceEmbedderSettings), - /// Use OpenAi's API to generate embeddings + /// Use OpenAI's API to generate embeddings /// Depending on hardware, this is a - OpenAi(OpenapiEmbedderSettings), + OpenAI(OpenAIEmbedderSettings), /// [Ollama](https://ollama.com/) is a framework for building and running language models locally. Ollama(OllamaEmbedderSettings), /// Supports arbitrary embedders which supply a [REST](https://en.wikipedia.org/wiki/REST) interface REST(GenericRestEmbedderSettings), - /// Provide custom embeddings. - /// In this case, you must manually update your embeddings when adding, updating, and removing documents to your index. 
+ /// Provide custom embeddings + /// + /// When using a custom embedder, you must vectorize both your documents (both for adding and updating documents) and user queries UserProvided(UserProvidedEmbedderSettings), } -/// EXPERIMENTAL +/// Settings for configuring [Ollama](https://ollama.com/) embedders /// /// # Example /// ``` @@ -106,25 +106,25 @@ pub struct HuggingFaceEmbedderSettings { pub document_template: Option, } -/// EXPERIMENTAL +/// Settings for configuring [OpenAI](https://openai.com/) embedders /// /// # Example /// ``` -/// # use meilisearch_sdk::settings::OpenapiEmbedderSettings; -/// let embedder_setting = OpenapiEmbedderSettings { -/// api_key: "anOpenAiApiKey".to_string(), +/// # use meilisearch_sdk::settings::OpenAIEmbedderSettings; +/// let embedder_setting = OpenAIEmbedderSettings { +/// api_key: "anOpenAIApiKey".to_string(), /// model: Some("text-embedding-3-small".to_string()), /// document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()), /// dimensions: Some(1536), /// ..Default::default() /// }; -/// # let expected = r#"{"apiKey":"anOpenAiApiKey","model":"text-embedding-3-small","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}","dimensions":1536}"#; -/// # let expected: OpenapiEmbedderSettings = serde_json::from_str(expected).unwrap(); +/// # let expected = r#"{"apiKey":"anOpenAIApiKey","model":"text-embedding-3-small","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}","dimensions":1536}"#; +/// # let expected: OpenAIEmbedderSettings = serde_json::from_str(expected).unwrap(); /// # assert_eq!(embedder_setting, expected); /// ``` #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] #[serde(rename_all = "camelCase")] -pub struct OpenapiEmbedderSettings { +pub struct OpenAIEmbedderSettings { /// API key used to 
authorize against OpenAI. /// [Generate an API key](https://platform.openai.com/api-keys) from your OpenAI account. /// Use [tier 2 keys](https://platform.openai.com/docs/guides/rate-limits/usage-tiers?context=tier-two) or above for optimal performance. @@ -157,7 +157,7 @@ pub struct OpenapiEmbedderSettings { pub document_template: Option, } -/// EXPERIMENTAL +/// Settings for configuring [Ollama](https://ollama.com/) embedders /// /// # Example /// ``` @@ -217,7 +217,7 @@ pub struct OllamaEmbedderSettings { pub document_template: Option, } -/// EXPERIMENTAL +/// Settings for configuring generic [REST](https://en.wikipedia.org/wiki/REST) embedders /// /// # Example /// ``` @@ -323,7 +323,9 @@ pub struct GenericRestEmbedderSettings { pub headers: HashMap, } -/// EXPERIMENTAL +/// Settings for user provided embedder +/// +/// When using a custom embedder, you must vectorize both your documents and user queries. #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq, Copy)] pub struct UserProvidedEmbedderSettings { /// dimensions of your custom embedding @@ -404,8 +406,7 @@ pub struct Settings { /// Proximity precision settings. #[serde(skip_serializing_if = "Option::is_none")] pub proximity_precision: Option, - /// EXPERIMENTAL - /// Settings how the embeddings for the experimental vector search feature are generated + /// Settings how the embeddings for the vector search feature are generated #[serde(skip_serializing_if = "Option::is_none")] pub embedders: Option>, /// SearchCutoffMs settings. @@ -606,8 +607,7 @@ impl Settings { } } - /// EXPERIMENTAL - /// Set the [embedders](https://www.meilisearch.com/docs/learn/experimental/vector_search) of the [Index]. + /// Set the [embedders](https://www.meilisearch.com/docs/learn/vector_search) of the [Index]. 
#[must_use] pub fn with_embedders(self, embedders: HashMap) -> Settings where @@ -1142,13 +1142,12 @@ impl Index { .await } - /// EXPERIMENTAL - /// Get [embedders](https://www.meilisearch.com/docs/learn/experimental/vector_search) of the [Index]. + /// Get [embedders](https://www.meilisearch.com/docs/learn/vector_search) of the [Index]. /// /// ``` /// # use std::collections::HashMap; /// # use std::string::String; - /// # use meilisearch_sdk::{indexes::*,features::ExperimentalFeatures,settings::Embedder,settings::UserProvidedEmbedderSettings,settings::Settings,client::*}; + /// # use meilisearch_sdk::{indexes::*,settings::Embedder,settings::UserProvidedEmbedderSettings,settings::Settings,client::*}; /// # /// # let MEILISEARCH_URL = option_env!("MEILISEARCH_URL").unwrap_or("http://localhost:7700"); /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey"); @@ -1157,12 +1156,8 @@ impl Index { /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); /// # client.create_index("get_embedders", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); /// let index = client.index("get_embedders"); - /// - /// # let mut features = ExperimentalFeatures::new(&client); - /// # features.set_vector_store(true); - /// # let res = features.update().await.unwrap(); /// # - /// # let t=index.set_settings(&Settings{ + /// # let t = index.set_settings(&Settings{ /// # embedders:Some(HashMap::from([(String::from("default"),Embedder::UserProvided(UserProvidedEmbedderSettings{dimensions:1}))])), /// # ..Settings::default() /// # }).await.unwrap(); @@ -2532,8 +2527,7 @@ impl Index { .await } - /// EXPERIMENTAL - /// Reset [embedders](https://www.meilisearch.com/docs/learn/experimental/vector_search) of the [Index]. + /// Reset [embedders](https://www.meilisearch.com/docs/learn/vector_search) of the [Index]. 
/// /// # Example /// @@ -2767,12 +2761,6 @@ mod tests { #[meilisearch_test] async fn test_reset_embedders(client: Client, index: Index) { - let features = crate::features::ExperimentalFeatures::new(&client) - .set_vector_store(true) - .update() - .await - .expect("could not enable the vector store"); - assert_eq!(features.vector_store, true); let task_info = index.reset_embedders().await.unwrap(); client.wait_for_task(task_info, None, None).await.unwrap(); @@ -2959,13 +2947,6 @@ mod tests { #[meilisearch_test] async fn test_set_embedding_settings(client: Client, index: Index) { - let features = crate::features::ExperimentalFeatures::new(&client) - .set_vector_store(true) - .update() - .await - .expect("could not enable the vector store"); - assert_eq!(features.vector_store, true); - let custom_embedder = Embedder::UserProvided(UserProvidedEmbedderSettings { dimensions: 2 }); let embeddings = HashMap::from([("default".into(), custom_embedder)]); From f32abe2e1324faa7b820418883dd4ef01e1e57c5 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Fri, 24 Jan 2025 22:37:45 +0100 Subject: [PATCH 17/23] Fix typos during the initial drafts --- src/search.rs | 11 ++++++----- src/settings.rs | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/search.rs b/src/search.rs index 59e43bd3..da3f7ef2 100644 --- a/src/search.rs +++ b/src/search.rs @@ -384,11 +384,10 @@ pub struct SearchQuery<'a, Http: HttpClient> { pub vector: Option<&'a [f32]>, /// Defines whether vectors for semantic searching are returned in the search results - /// Can Significantly increase the response size. /// - /// **Default: `false`** + /// Can Significantly increase the response size. 
#[serde(skip_serializing_if = "Option::is_none")] - retrieve_vectors: Option, + pub retrieve_vectors: Option, } #[allow(missing_docs)] @@ -516,6 +515,9 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { self.filter = Some(Filter::new(Either::Right(filter))); self } + /// Defines whether vectors for semantic searching are returned in the search results + /// + /// Can Significantly increase the response size. pub fn with_retrieve_vectors<'b>( &'b mut self, retrieve_vectors: bool, @@ -640,7 +642,6 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { self.vector = Some(vector); self } - #[must_use] pub fn with_distinct<'b>(&'b mut self, distinct: &'a str) -> &'b mut SearchQuery<'a, Http> { self.distinct = Some(distinct); self @@ -742,7 +743,7 @@ mod tests { kind: String, number: i32, nested: Nested, - #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none", default)] _vectors: Option, } diff --git a/src/settings.rs b/src/settings.rs index 09e27fa4..2c55daa9 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -259,6 +259,7 @@ pub struct GenericRestEmbedderSettings { /// Must be parseable as a URL. /// If not specified, [Meilisearch](https://www.meilisearch.com/) (**not the sdk you are currently using**) will try to fetch the `MEILI_OLLAMA_URL` environment variable /// Example: `"http://localhost:12345/api/v1/embed"` + #[serde(skip_serializing_if = "Option::is_none")] pub url: Option, /// Optional, passed as Bearer in the Authorization header /// Example: `"187HFLDH97CNHN"` @@ -339,7 +340,7 @@ pub struct LocalizedAttributes { pub attribute_patterns: Vec, } -/// Struct reprensenting a set of settings. +/// Struct representing a set of settings. /// /// You can build this struct using the builder syntax. 
/// From 279e9742225614521f17cea4b28eee24d1dce5e9 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Wed, 5 Feb 2025 18:50:19 +0100 Subject: [PATCH 18/23] fix formatting issues that crept in somehow in the merge --- src/search.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search.rs b/src/search.rs index a290b83c..15a8a64a 100644 --- a/src/search.rs +++ b/src/search.rs @@ -157,9 +157,9 @@ pub struct HybridSearch<'a> { /// **Default: `0.5`** pub semantic_ratio: f32, } -impl Default for HybridSearch{ +impl Default for HybridSearch<'_> { fn default() -> Self { - HybridSearch{ + HybridSearch { embedder: "default", semantic_ratio: 0.5, } From 73305acf4e7fd4946a1f56163c62ff9a364d035a Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Thu, 20 Feb 2025 18:12:11 +0100 Subject: [PATCH 19/23] Update src/search.rs --- src/search.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search.rs b/src/search.rs index 6d989534..ed9ce1a6 100644 --- a/src/search.rs +++ b/src/search.rs @@ -895,7 +895,7 @@ mod tests { kind: S("text"), number: 10, nested: Nested { child: S("second") }, - _vectors: Some(Vectors::from(&[2000.0])), + _vectors: None, }, &results.hits[0].result ); From b7735a21d23d254944814a6effd57129b77c99f9 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Thu, 20 Feb 2025 18:14:00 +0100 Subject: [PATCH 20/23] simplified code a bit --- src/search.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/search.rs b/src/search.rs index ed9ce1a6..0e84e45a 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1420,13 +1420,14 @@ mod tests { use crate::settings::{Embedder, UserProvidedEmbedderSettings}; let embedder_setting = Embedder::UserProvided(UserProvidedEmbedderSettings { dimensions: 1 }); - let t3 = index + index .set_settings(&crate::settings::Settings { embedders: Some(HashMap::from([("default".to_string(), embedder_setting)])), ..crate::settings::Settings::default() }) + .await? 
+ .wait_for_completion(&client, None, None) .await?; - t3.wait_for_completion(&client, None, None).await?; Ok(()) } From 8e265df2f898881dc4a178f446e3cf2a343317b5 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Thu, 20 Feb 2025 18:47:27 +0100 Subject: [PATCH 21/23] update docs with newer phrasing --- src/settings.rs | 52 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/src/settings.rs b/src/settings.rs index 2c55daa9..0fa0d28e 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -104,6 +104,12 @@ pub struct HuggingFaceEmbedderSettings { /// ``` #[serde(skip_serializing_if = "Option::is_none")] pub document_template: Option, + /// The maximum size of a rendered document template. + // + // Longer texts are truncated to fit the configured limit. + /// Default: `400` + #[serde(skip_serializing_if = "Option::is_none")] + pub document_template_max_bytes: Option, } /// Settings for configuring [OpenAI](https://openai.com/) embedders @@ -126,6 +132,7 @@ pub struct HuggingFaceEmbedderSettings { #[serde(rename_all = "camelCase")] pub struct OpenAIEmbedderSettings { /// API key used to authorize against OpenAI. + /// /// [Generate an API key](https://platform.openai.com/api-keys) from your OpenAI account. /// Use [tier 2 keys](https://platform.openai.com/docs/guides/rate-limits/usage-tiers?context=tier-two) or above for optimal performance. pub api_key: String, @@ -133,7 +140,10 @@ pub struct OpenAIEmbedderSettings { /// Default: `text-embedding-3-small` #[serde(skip_serializing_if = "Option::is_none")] pub model: Option, - /// Defaults to the default for said model name + /// Number of dimensions in the chosen model. + /// + /// If not supplied, Meilisearch tries to infer this value. + /// In most cases, dimensions should be the exact same value of your chosen model #[serde(skip_serializing_if = "Option::is_none")] pub dimensions: Option, /// Use it to customize the data you send to the embedder. 
It is highly recommended you configure a custom template for your documents. @@ -155,6 +165,12 @@ pub struct OpenAIEmbedderSettings { /// ``` #[serde(skip_serializing_if = "Option::is_none")] pub document_template: Option, + /// The maximum size of a rendered document template. + // + // Longer texts are truncated to fit the configured limit. + /// Default: `400` + #[serde(skip_serializing_if = "Option::is_none")] + pub document_template_max_bytes: Option, } /// Settings for configuring [Ollama](https://ollama.com/) embedders @@ -215,6 +231,12 @@ pub struct OllamaEmbedderSettings { /// ``` #[serde(skip_serializing_if = "Option::is_none")] pub document_template: Option, + /// The maximum size of a rendered document template. + // + // Longer texts are truncated to fit the configured limit. + /// Default: `400` + #[serde(skip_serializing_if = "Option::is_none")] + pub document_template_max_bytes: Option, } /// Settings for configuring generic [REST](https://en.wikipedia.org/wiki/REST) embedders @@ -256,17 +278,22 @@ pub struct OllamaEmbedderSettings { #[serde(rename_all = "camelCase")] pub struct GenericRestEmbedderSettings { /// Mandatory, full URL to the embedding endpoint + /// /// Must be parseable as a URL. /// If not specified, [Meilisearch](https://www.meilisearch.com/) (**not the sdk you are currently using**) will try to fetch the `MEILI_OLLAMA_URL` environment variable /// Example: `"http://localhost:12345/api/v1/embed"` #[serde(skip_serializing_if = "Option::is_none")] pub url: Option, - /// Optional, passed as Bearer in the Authorization header + /// Authentication token Meilisearch should send with each request to the embedder. + /// + /// Is passed as Bearer in the Authorization header /// Example: `"187HFLDH97CNHN"` #[serde(skip_serializing_if = "Option::is_none")] pub api_key: Option, - /// Optional - /// Inferred with a dummy request if missing + /// Number of dimensions in the chosen model. 
+ /// + /// If not supplied, Meilisearch tries to infer this value. + /// In most cases, dimensions should be the exact same value of your chosen model #[serde(skip_serializing_if = "Option::is_none")] pub dimensions: Option, /// Use it to customize the data you send to the embedder. It is highly recommended you configure a custom template for your documents. @@ -298,6 +325,19 @@ pub struct GenericRestEmbedderSettings { /// "prompt": "{{text}}" /// } /// ``` + /// The maximum size of a rendered document template. + // + // Longer texts are truncated to fit the configured limit. + /// Default: `400` + #[serde(skip_serializing_if = "Option::is_none")] + pub document_template_max_bytes: Option, + /// JSON object with the same structure and data of the request you must send to your rest embedder. + /// + /// The field containing the input text Meilisearch should send to the embedder must be replaced with `{{text}}`. + /// Example: + /// ```json + /// {"prompt": "{{text}}"} + /// ``` #[serde(skip_serializing_if = "HashMap::is_empty")] pub request: HashMap, /// A JSON value that represents a fragment of the response made by the remote embedder to Meilisearch. @@ -329,7 +369,9 @@ pub struct GenericRestEmbedderSettings { /// When using a custom embedder, you must vectorize both your documents and user queries. #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq, Copy)] pub struct UserProvidedEmbedderSettings { - /// dimensions of your custom embedding + /// Number of dimensions in the user-provided model. 
+ /// + /// In most cases, dimensions should be the exact same value of your chosen model pub dimensions: usize, } From 20c4d50a49541228e58052803e83c0e9dca3f15c Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Thu, 20 Feb 2025 18:51:04 +0100 Subject: [PATCH 22/23] formatting fix --- src/settings.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/settings.rs b/src/settings.rs index 0fa0d28e..a8f84453 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -105,7 +105,7 @@ pub struct HuggingFaceEmbedderSettings { #[serde(skip_serializing_if = "Option::is_none")] pub document_template: Option, /// The maximum size of a rendered document template. - // + // // Longer texts are truncated to fit the configured limit. /// Default: `400` #[serde(skip_serializing_if = "Option::is_none")] @@ -166,7 +166,7 @@ pub struct OpenAIEmbedderSettings { #[serde(skip_serializing_if = "Option::is_none")] pub document_template: Option, /// The maximum size of a rendered document template. - // + // // Longer texts are truncated to fit the configured limit. /// Default: `400` #[serde(skip_serializing_if = "Option::is_none")] @@ -232,7 +232,7 @@ pub struct OllamaEmbedderSettings { #[serde(skip_serializing_if = "Option::is_none")] pub document_template: Option, /// The maximum size of a rendered document template. - // + // // Longer texts are truncated to fit the configured limit. /// Default: `400` #[serde(skip_serializing_if = "Option::is_none")] @@ -326,13 +326,13 @@ pub struct GenericRestEmbedderSettings { /// } /// ``` /// The maximum size of a rendered document template. - // + // // Longer texts are truncated to fit the configured limit. /// Default: `400` #[serde(skip_serializing_if = "Option::is_none")] pub document_template_max_bytes: Option, /// JSON object with the same structure and data of the request you must send to your rest embedder. 
- /// + /// /// The field containing the input text Meilisearch should send to the embedder must be replaced with `{{text}}`. /// Example: /// ```json From 683893727bbce8dcb5b9e30eed2785f9adb42403 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Thu, 20 Feb 2025 18:53:26 +0100 Subject: [PATCH 23/23] fix doctests --- src/settings.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/settings.rs b/src/settings.rs index a8f84453..837a0c44 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -183,6 +183,7 @@ pub struct OpenAIEmbedderSettings { /// api_key: Some("foobarbaz".to_string()), /// model: "nomic-embed-text".to_string(), /// document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()), +/// document_template_max_bytes: None, /// }; /// # let expected = r#"{"url":"http://localhost:11434/api/embeddings","apiKey":"foobarbaz","model":"nomic-embed-text","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}"}"#; /// # let expected: OllamaEmbedderSettings = serde_json::from_str(expected).unwrap(); @@ -251,6 +252,7 @@ pub struct OllamaEmbedderSettings { /// api_key: Some("SOURCE_API_KEY".to_string()), /// dimensions: Some(512), /// document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()), +/// document_template_max_bytes: None, /// request: HashMap::from([ /// ("model".to_string(), Value::from("MODEL_NAME")), /// ("prompt".to_string(), Value::from("{{text}}"))