From 22f913c7344da7022ae19f7a15cca5156a71a2f5 Mon Sep 17 00:00:00 2001 From: Justin Castilla Date: Mon, 14 Jul 2025 18:36:50 -0700 Subject: [PATCH 01/11] Added an updated version of notebook for video script --- ...dated-ecommerce_dense_sparse_project.ipynb | 1113 +++++++++++++++++ 1 file changed, 1113 insertions(+) create mode 100644 supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb diff --git a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb new file mode 100644 index 00000000..d0f27d6b --- /dev/null +++ b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb @@ -0,0 +1,1113 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "r8OKk3QOGBXl", + "metadata": { + "id": "r8OKk3QOGBXl" + }, + "source": [ + "# **Lexical and Semantic Search with Elasticsearch**\n", + "\n", + "In this example, you will explore various approaches to retrieving information using Elasticsearch, focusing specifically on text, lexical and semantic search.\n", + "\n", + "To accomplish this, this example demonstrate various search scenarios on a dataset generated to simulate e-commerce product information.\n", + "\n", + "This dataset contains over 2,500 products, each with a description. 
These products are categorized into 76 distinct product categories, with each category containing a varying number of products.\n", + "\n", + "## **🧰 Requirements**\n", + "\n", + "For this example, you will need:\n", + "\n", + "- Python 3.6 or later\n", + "- The Elastic Python client\n", + "- Elastic 8.8 deployment or later, with 8GB memory machine learning node\n", + "- The Elastic Learned Sparse EncodeR model that comes pre-loaded into Elastic installed and started on your deployment\n", + "\n", + "We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html), a [free trial](https://cloud.elastic.co/registration?onboarding_token=vectorsearch&utm_source=github&utm_content=elasticsearch-labs-notebook) is available." + ] + }, + { + "cell_type": "markdown", + "id": "hmMWo2e-IkTB", + "metadata": { + "id": "hmMWo2e-IkTB" + }, + "source": [ + "## Setup Elasticsearch environment:\n", + "\n", + "To get started, we'll need to connect to our Elastic deployment using the Python client.\n", + "\n", + "Because we're using an Elastic Cloud deployment, we'll use the **Cloud ID** to identify our deployment.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8d24cd8-a437-4bd2-a1f0-93e535ccf8a9", + "metadata": { + "id": "e8d24cd8-a437-4bd2-a1f0-93e535ccf8a9" + }, + "outputs": [], + "source": [ + "%pip install elasticsearch" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8c36e9b5-8f2b-4734-9213-1350caa7f837", + "metadata": { + "id": "8c36e9b5-8f2b-4734-9213-1350caa7f837" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mERROR: Could not find a version that satisfies the requirement torch==1.11 (from versions: 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0, 2.7.1)\u001b[0m\u001b[31m\n", + "\u001b[0m\u001b[31mERROR: No matching distribution found for torch==1.11\u001b[0m\u001b[31m\n", + 
"\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip -q install eland elasticsearch sentence_transformers transformers torch==1.11 # Eland Python Client" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "eaf90bc8-647e-4ada-9aa9-5cb9e60762b7", + "metadata": { + "id": "eaf90bc8-647e-4ada-9aa9-5cb9e60762b7" + }, + "outputs": [], + "source": [ + "from elasticsearch import (\n", + " Elasticsearch,\n", + " helpers,\n", + ") # Import the Elasticsearch client and helpers module\n", + "from urllib.request import urlopen # library for opening URLs\n", + "import json # module for handling JSON data\n", + "from pathlib import Path # module for working with file paths\n", + "\n", + "import getpass # handling password input" + ] + }, + { + "cell_type": "markdown", + "id": "ea1VkDBXJIQR", + "metadata": { + "id": "ea1VkDBXJIQR" + }, + "source": [ + "Now we can instantiate the Python Elasticsearch client.\n", + "\n", + "First we prompt the user for their password and Cloud ID.\n", + "\n", + "🔐 NOTE: `getpass` enables us to securely prompt the user for credentials without echoing them to the terminal, or storing it in memory.\n", + "\n", + "Then we create a `client` object that instantiates an instance of the `Elasticsearch` class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6907a2bf-4927-428e-9ca8-9df3dd35a2cc", + "metadata": { + "id": "6907a2bf-4927-428e-9ca8-9df3dd35a2cc" + }, + "outputs": [], + "source": [ + "# Found in the 'Manage Deployment' page\n", + "ELASTIC_ENDPOINT = getpass.getpass(\"Enter Elastic Endpoint: \")\n", + "\n", + "# Password for the 'elastic' user generated by Elasticsearch\n", + "ELASTIC_API_KEY = getpass.getpass(\"Enter Elastic API Key: \")\n", + "\n", + "# Create the client instance\n", + "client = Elasticsearch(\n", + " hosts=[ELASTIC_ENDPOINT], api_key=ELASTIC_API_KEY, request_timeout=3600\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8980e76b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'name': 'serverless', 'cluster_name': 'f15e57523cf84631a30f3aaf16c3ecf0', 'cluster_uuid': 'Wumo0cJWRZC8YfiHnVNwnQ', 'version': {'number': '8.11.0', 'build_flavor': 'serverless', 'build_type': 'docker', 'build_hash': '00000000', 'build_date': '2023-10-31', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '8.11.0', 'minimum_index_compatibility_version': '8.11.0'}, 'tagline': 'You Know, for Search'}\n" + ] + } + ], + "source": [ + "print(client.info())" + ] + }, + { + "cell_type": "markdown", + "id": "BH-N6epTJarM", + "metadata": { + "id": "BH-N6epTJarM" + }, + "source": [ + "## Setup emebdding model\n", + "\n", + "Next we upload the all-mpnet-base-v2 embedding model into Elasticsearch and create an ingest pipeline with inference processors for text embedding and text expansion, using the description field for both. This field contains the description of each product." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7f6f3f5a-2b93-4a0c-93c8-c887ca80f687", + "metadata": { + "id": "7f6f3f5a-2b93-4a0c-93c8-c887ca80f687" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model ID: .multilingual-e5-small\n", + "Description: E5 small multilingual\n", + "Inference Config: {'text_embedding': {'vocabulary': {'index': '.ml-inference-native-000002'}, 'tokenization': {'xlm_roberta': {'do_lower_case': False, 'with_special_tokens': True, 'max_sequence_length': 512, 'truncate': 'first', 'span': -1}}, 'embedding_size': 384}}\n", + "Version: 12.0.0\n", + "Tags: []\n", + "------\n", + "{'inference_id': '.multilingual-e5-small-elasticsearch', 'task_type': 'text_embedding', 'service': 'elasticsearch', 'service_settings': {'num_threads': 1, 'model_id': '.multilingual-e5-small_linux-x86_64', 'adaptive_allocations': {'enabled': True, 'min_number_of_allocations': 0, 'max_number_of_allocations': 32}}, 'chunking_settings': {'strategy': 'sentence', 'max_chunk_size': 250, 'sentence_overlap': 1}}\n", + "Inference Endpoint ID: .multilingual-e5-small-elasticsearch\n", + "Model ID: .multilingual-e5-small_linux-x86_64\n", + "Task Type: text_embedding\n" + ] + } + ], + "source": [ + "# set the model to .multilingual-e5-small-elasticsearch\n", + "es_model_id = \".multilingual-e5-small\"\n", + "es_model_endpoint = \".multilingual-e5-small-elasticsearch\"\n", + "\n", + "# verify the model is loaded, deployed, and ready to use\n", + "models = client.ml.get_trained_models()\n", + "for model in models[\"trained_model_configs\"]:\n", + " if model[\"model_id\"] == es_model_id:\n", + " print(f\"Model ID: {model['model_id']}\")\n", + " print(f\"Description: {model.get('description', 'No description')}\")\n", + " print(\n", + " f\"Inference Config: {model.get('inference_config', 'No inference config')}\"\n", + " )\n", + " print(f\"Version: {model.get('version', 'N/A')}\")\n", + " print(f\"Tags: {model.get('tags', 
[])}\")\n", + " break\n", + "else:\n", + " print(f\"Model {es_model_id} not found.\")\n", + "\n", + "print(\"------\")\n", + "\n", + "inference_endpoint = client.inference.get(inference_id=es_model_endpoint)\n", + "inference_endpoint = inference_endpoint[\"endpoints\"][0]\n", + "print(inference_endpoint)\n", + "print(f\"Inference Endpoint ID: {es_model_endpoint}\")\n", + "print(\n", + " f\"Model ID: {inference_endpoint.get('service_settings', {}).get('model_id', 'N/A')}\"\n", + ")\n", + "print(f\"Task Type: {inference_endpoint['task_type']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "6739f55b-6983-4b48-9349-6e0111b313fe", + "metadata": { + "id": "6739f55b-6983-4b48-9349-6e0111b313fe" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "ObjectApiResponse({'acknowledged': True})" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Creating an ingest pipeline with inference processors to use ELSER (sparse) and all-mpnet-base-v2 (dense) to infer against data that will be ingested in the pipeline.\n", + "\n", + "client.ingest.put_pipeline(\n", + " id=\"ecommerce-pipeline\",\n", + " processors=[\n", + " {\n", + " \"inference\": {\n", + " \"model_id\": \".elser-2-elasticsearch\",\n", + " \"input_output\": [\n", + " {\n", + " \"input_field\": \"description\",\n", + " \"output_field\": \"elser_description_vector\",\n", + " }\n", + " ],\n", + " }\n", + " },\n", + " {\n", + " \"inference\": {\n", + " \"model_id\": \".multilingual-e5-small-elasticsearch\", # Inference endpoint ID\n", + " \"input_output\": [\n", + " {\n", + " \"input_field\": \"description\",\n", + " \"output_field\": \"e5_description_vector\",\n", + " }\n", + " ],\n", + " \"inference_config\": {\"text_embedding\": {}},\n", + " }\n", + " },\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "QUQ1nCaiKIQr", + "metadata": { + "id": "QUQ1nCaiKIQr" + }, + "source": [ + "## Index documents\n", + "\n", + 
"Then, we create a source index to load `products-ecommerce.json`, this will be the `ecommerce` index and a destination index to extract the documents from the source and index these documents into the destination `ecommerce-search`.\n", + "\n", + "For the `ecommerce-search` index we add a field to support dense vector storage and search `description_vector.predicted_value`, this is the target field for inference results. The field type in this case is `dense_vector`, the `all-mpnet-base-v2` model has embedding_size of 768, so dims is set to 768. We also add a `rank_features` field type to support the text expansion output." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "6e115bd0-e758-44db-b5b9-96217af472c1", + "metadata": { + "id": "6e115bd0-e758-44db-b5b9-96217af472c1" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'ecommerce'})" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Index to load products-ecommerce.json docs\n", + "if client.indices.exists(index=\"ecommerce\"):\n", + " client.indices.delete(index=\"ecommerce\")\n", + "\n", + "client.indices.create(\n", + " index=\"ecommerce\",\n", + " mappings={\n", + " \"properties\": {\n", + " \"product\": {\n", + " \"type\": \"text\",\n", + " },\n", + " \"description\": {\n", + " \"type\": \"text\",\n", + " },\n", + " \"category\": {\n", + " \"type\": \"text\",\n", + " },\n", + " }\n", + " },\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418", + "metadata": { + "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'ecommerce-search'})" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 
Reindex dest index\n", + "\n", + "INDEX = \"ecommerce-search\"\n", + "if client.indices.exists(index=INDEX):\n", + " client.indices.delete(index=INDEX)\n", + "client.indices.create(\n", + " index=INDEX,\n", + " mappings={\n", + " # Saving disk space by excluding the ELSER tokens and the dense_vector field from document source.\n", + " # Note: That should only be applied if you are certain that reindexing will not be required in the future.\n", + " \"properties\": {\n", + " \"product\": {\n", + " \"type\": \"text\",\n", + " },\n", + " \"description\": {\n", + " \"type\": \"text\",\n", + " },\n", + " \"category\": {\n", + " \"type\": \"text\",\n", + " },\n", + " \"elser_description_vector\": {\"type\": \"sparse_vector\"},\n", + " \"e5_description_vector\": { # Inference results field, target_field.predicted_value\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 384, # The all-mpnet-base-v2 model has embedding_size of 768, so dims is set to 768.\n", + " \"index\": \"true\",\n", + " \"similarity\": \"cosine\", # When indexing vectors for approximate kNN search, you need to specify the similarity function for comparing the vectors.\n", + " },\n", + " },\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "Vo-LKu8TOT5j", + "metadata": { + "id": "Vo-LKu8TOT5j" + }, + "source": [ + "## Load documents\n", + "\n", + "Then we load `products-ecommerce.json` into the `ecommerce` index." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba", + "metadata": { + "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done indexing documents into `ecommerce` index\n" + ] + } + ], + "source": [ + "# dataset\n", + "\n", + "import json\n", + "\n", + "with open(\"products-ecommerce.json\", \"r\") as f:\n", + " data_json = json.load(f)\n", + "\n", + "\n", + "def create_index_body(doc):\n", + " \"\"\"Generate the body for an Elasticsearch document.\"\"\"\n", + " return {\n", + " \"_index\": \"ecommerce\",\n", + " \"_source\": doc,\n", + " }\n", + "\n", + "\n", + "# Prepare the documents to be indexed\n", + "documents = [create_index_body(doc) for doc in data_json]\n", + "\n", + "# Use helpers.bulk to index\n", + "helpers.bulk(client, documents)\n", + "\n", + "print(\"Done indexing documents into `ecommerce` index\")" + ] + }, + { + "cell_type": "markdown", + "id": "3dShN9W4Opl8", + "metadata": { + "id": "3dShN9W4Opl8" + }, + "source": [ + "## Reindex\n", + "\n", + "Now we can reindex data from the `source` index `ecommerce` to the `dest` index `ecommerce-search` with the ingest pipeline `ecommerce-pipeline` we created.\n", + "\n", + "After this step our `dest` index will have the fields we need to perform Semantic Search." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "4297cb0b-ae2e-44f9-811d-27a41c43a858", + "metadata": { + "id": "4297cb0b-ae2e-44f9-811d-27a41c43a858" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "ObjectApiResponse({'took': 76252, 'timed_out': False, 'total': 2506, 'updated': 0, 'created': 2506, 'deleted': 0, 'batches': 3, 'version_conflicts': 0, 'noops': 0, 'retries': {'bulk': 0, 'search': 0}, 'throttled_millis': 0, 'requests_per_second': -1.0, 'throttled_until_millis': 0, 'failures': []})" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Reindex data from one index 'source' to another 'dest' with the 'ecommerce-pipeline' pipeline.\n", + "\n", + "client.reindex(\n", + " wait_for_completion=True,\n", + " source={\"index\": \"ecommerce\"},\n", + " dest={\"index\": \"ecommerce-search\", \"pipeline\": \"ecommerce-pipeline\"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "-qUXNuOvPDsI", + "metadata": { + "id": "-qUXNuOvPDsI" + }, + "source": [ + "## Text Analysis with Standard Analyzer" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a", + "metadata": { + "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Analyzed Tokens: ['comfortable', 'furniture', 'for', 'a', 'large', 'balcony']\n" + ] + } + ], + "source": [ + "# Performs text analysis on a string and returns the resulting tokens.\n", + "\n", + "# Define the text to be analyzed\n", + "text = \"Comfortable furniture for a large balcony\"\n", + "\n", + "# Define the analyze request\n", + "request_body = {\"analyzer\": \"standard\", \"text\": text} # Standard Analyzer\n", + "\n", + "# Perform the analyze request\n", + "response = client.indices.analyze(\n", + " analyzer=request_body[\"analyzer\"], text=request_body[\"text\"]\n", + ")\n", + "\n", + "# Extract and display the 
analyzed tokens\n", + "tokens = [token[\"token\"] for token in response[\"tokens\"]]\n", + "print(\"Analyzed Tokens:\", tokens)" + ] + }, + { + "cell_type": "markdown", + "id": "12u70NLmPyNV", + "metadata": { + "id": "12u70NLmPyNV" + }, + "source": [ + "## Text Analysis with Stop Analyzer" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039", + "metadata": { + "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Analyzed Tokens: ['comfortable', 'furniture', 'large', 'balcony']\n" + ] + } + ], + "source": [ + "# Performs text analysis on a string and returns the resulting tokens.\n", + "\n", + "# Define the text to be analyzed\n", + "text = \"Comfortable furniture for a large balcony\"\n", + "\n", + "# Define the analyze request\n", + "request_body = {\"analyzer\": \"stop\", \"text\": text} # Stop Analyzer\n", + "\n", + "# Perform the analyze request\n", + "response = client.indices.analyze(\n", + " analyzer=request_body[\"analyzer\"], text=request_body[\"text\"]\n", + ")\n", + "\n", + "# Extract and display the analyzed tokens\n", + "tokens = [token[\"token\"] for token in response[\"tokens\"]]\n", + "print(\"Analyzed Tokens:\", tokens)" + ] + }, + { + "cell_type": "markdown", + "id": "8G8MKcUvP0zs", + "metadata": { + "id": "8G8MKcUvP0zs" + }, + "source": [ + "## Lexical Search" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f", + "metadata": { + "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 13.206358\n", + "Product: Barbie Dreamhouse\n", + "Category: Toys\n", + "Description: is a classic Barbie playset with multiple rooms, furniture, a large balcony, a pool, and accessories. 
It allows kids to create their dream Barbie world.\n", + "\n", + "\n", + "Score: 7.827815\n", + "Product: Comfortable Rocking Chair\n", + "Category: Indoor Furniture\n", + "Description: enjoy relaxing moments with this comfortable rocking chair. Its smooth motion and cushioned seat make it an ideal piece of furniture for unwinding.\n", + "\n" + ] + } + ], + "source": [ + "# BM25\n", + "\n", + "response = client.search(\n", + " size=2,\n", + " index=\"ecommerce-search\",\n", + " query={\n", + " \"match\": {\n", + " \"description\": {\n", + " \"query\": \"Comfortable furniture for a large balcony\",\n", + " \"analyzer\": \"stop\",\n", + " }\n", + " }\n", + " },\n", + ")\n", + "hits = response[\"hits\"][\"hits\"]\n", + "\n", + "if not hits:\n", + " print(\"No matches found\")\n", + "else:\n", + " for hit in hits:\n", + " score = hit[\"_score\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "xiywcf_-P39a", + "metadata": { + "id": "xiywcf_-P39a" + }, + "source": [ + "## Semantic Search with Dense Vector" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "72187c9a-14c1-4084-a080-4e5c1e614f22", + "metadata": { + "id": "72187c9a-14c1-4084-a080-4e5c1e614f22" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 0.93147576\n", + "Product: Metal Garden Bench with Cushion\n", + "Category: Garden Furniture\n", + "Description: is a stylish and comfortable metal garden bench, complete with a cushion for added support.\n", + "\n", + "\n", + "Score: 0.9304026\n", + "Product: Garden Dining Set with Swivel Chairs\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table 
and chairs with swivel seats for convenience.\n", + "\n" + ] + } + ], + "source": [ + "# KNN\n", + "\n", + "response = client.search(\n", + " index=\"ecommerce-search\",\n", + " size=2,\n", + " knn={\n", + " \"field\": \"e5_description_vector\",\n", + " \"k\": 50, # Number of nearest neighbors to return as top hits.\n", + " \"num_candidates\": 500, # Number of nearest neighbor candidates to consider per shard. Increasing num_candidates tends to improve the accuracy of the final k results.\n", + " \"query_vector_builder\": { # Object indicating how to build a query_vector. kNN search enables you to perform semantic search by using a previously deployed text embedding model.\n", + " \"text_embedding\": {\n", + " \"model_id\": \".multilingual-e5-small-elasticsearch\", # Text embedding model id\n", + " \"model_text\": \"Comfortable furniture for a large balcony\", # Query\n", + " }\n", + " },\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + "\n", + " score = hit[\"_score\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "QlWFdngRQFbv", + "metadata": { + "id": "QlWFdngRQFbv" + }, + "source": [ + "## Semantic Search with Sparse Vector" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5", + "metadata": { + "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 11.354144\n", + "Product: Garden Lounge Set with Side Table\n", + "Category: Garden Furniture\n", + "Description: is a comfortable and stylish garden lounge set, including a sofa, chairs, and a side table for outdoor relaxation.\n", + "\n", + "\n", + 
"Score: 11.189863\n", + "Product: Garden Lounge Chair with Sunshade\n", + "Category: Garden Furniture\n", + "Description: is a comfortable and versatile garden lounge chair with a built-in sunshade, perfect for hot sunny days.\n", + "\n" + ] + } + ], + "source": [ + "# Elastic Learned Sparse Encoder - ELSER\n", + "\n", + "response = client.search(\n", + " index=\"ecommerce-search\",\n", + " size=2,\n", + " query={\n", + " \"sparse_vector\": {\n", + " \"field\": \"elser_description_vector\",\n", + " \"inference_id\": \".elser-2-elasticsearch\",\n", + " \"query\": \"Comfortable furniture for a large balcony\",\n", + " }\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + "\n", + " score = hit[\"_score\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "kz9deDBYQJxr", + "metadata": { + "id": "kz9deDBYQJxr" + }, + "source": [ + "## Hybrid Search - BM25+KNN linear combination" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "f84aa16b-49c5-4abf-a049-d556c225542e", + "metadata": { + "id": "f84aa16b-49c5-4abf-a049-d556c225542e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 18.161213\n", + "Product: Garden Dining Set with Swivel Chairs\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", + "\n", + "\n", + "Score: 17.770641\n", + "Product: Garden Dining Set with Swivel Rockers\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.\n", + "\n" + ] + } + ], + 
"source": [ + "# BM25 + KNN (Linear Combination)\n", + "\n", + "response = client.search(\n", + " index=\"ecommerce-search\",\n", + " size=2,\n", + " query={\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\n", + " \"match\": {\n", + " \"description\": {\n", + " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", + " \"boost\": 1, # You can adjust the boost value\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " }\n", + " },\n", + " knn={\n", + " \"field\": \"e5_description_vector\",\n", + " \"k\": 2,\n", + " \"num_candidates\": 20,\n", + " \"boost\": 1, # You can adjust the boost value\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \".multilingual-e5-small-elasticsearch\",\n", + " \"model_text\": \"A dining table and comfortable chairs for a large balcony\",\n", + " }\n", + " },\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + "\n", + " score = hit[\"_score\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "cybkWjmpQV8g", + "metadata": { + "id": "cybkWjmpQV8g" + }, + "source": [ + "## Hybrid Search - BM25+KNN RRF" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861", + "metadata": { + "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Score: 0.09307359\n", + "Product: Garden Dining Set with Swivel Rockers\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.\n", + "\n", + "Score: 0.04761905\n", + "Product: Garden Dining Set with 
Swivel Chairs\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", + "\n" + ] + } + ], + "source": [ + "# BM25 + KNN (RRF)\n", + "top_k = 2\n", + "response = client.search(\n", + " index=\"ecommerce-search\",\n", + " retriever={\n", + " \"rrf\": {\n", + " \"retrievers\": [\n", + " {\n", + " \"standard\": {\n", + " \"query\": {\n", + " \"match\": {\n", + " \"description\": \"A dining table and comfortable chairs for a large balcony\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"knn\": {\n", + " \"field\": \"e5_description_vector\",\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \".multilingual-e5-small\",\n", + " \"model_text\": \"A dining table and comfortable chairs for a large balcony\",\n", + " }\n", + " },\n", + " \"k\": 2,\n", + " \"num_candidates\": 20,\n", + " }\n", + " },\n", + " ],\n", + " \"rank_window_size\": 2,\n", + " \"rank_constant\": 20,\n", + " }\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + "\n", + " score = hit[\"_score\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"Score: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "LyKI2Z-XQbI6", + "metadata": { + "id": "LyKI2Z-XQbI6" + }, + "source": [ + "## Hybrid Search - BM25+ELSER linear combination" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "bd842732-b20a-4c7a-b735-e1f558a9b922", + "metadata": { + "id": "bd842732-b20a-4c7a-b735-e1f558a9b922" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 35.605705\n", + "Product: Garden Dining Set with Swivel Rockers\n", + "Category: Garden 
Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.\n", + "\n", + "\n", + "Score: 33.858994\n", + "Product: Garden Dining Set with Swivel Chairs\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", + "\n" + ] + } + ], + "source": [ + "# BM25 + Elastic Learned Sparse Encoder (Linear Combination)\n", + "\n", + "response = client.search(\n", + " index=\"ecommerce-search\",\n", + " size=2,\n", + " query={\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\n", + " \"match\": {\n", + " \"description\": {\n", + " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", + " \"boost\": 1, # You can adjust the boost value\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"sparse_vector\": {\n", + " \"field\": \"elser_description_vector\",\n", + " \"inference_id\": \".elser-2-elasticsearch\",\n", + " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", + " }\n", + " },\n", + " ]\n", + " }\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + "\n", + " score = hit[\"_score\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "e3d5e4e9", + "metadata": {}, + "source": [ + "## Hybrid Search - BM25+ELSER RRF" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "199c5c60", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Score: 0.0952381\n", + "Product: Garden Dining Set with Swivel Rockers\n", + "Category: Garden Furniture\n", + "Description: is a 
functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.\n", + "\n", + "Score: 0.045454547\n", + "Product: Garden Dining Set with Swivel Chairs\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", + "\n" + ] + } + ], + "source": [ + "# BM25 + ELSER (RRF)\n", + "top_k = 2\n", + "response = client.search(\n", + " index=\"ecommerce-search\",\n", + " retriever={\n", + " \"rrf\": {\n", + " \"retrievers\": [\n", + " {\n", + " \"standard\": {\n", + " \"query\": {\n", + " \"match\": {\n", + " \"description\": \"A dining table and comfortable chairs for a large balcony\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"standard\": {\n", + " \"query\": {\n", + " \"sparse_vector\": {\n", + " \"field\": \"elser_description_vector\",\n", + " \"inference_id\": \".elser-2-elasticsearch\",\n", + " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", + " }\n", + " }\n", + " }\n", + " },\n", + " ],\n", + " \"rank_window_size\": 2,\n", + " \"rank_constant\": 20,\n", + " }\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + "\n", + " score = hit[\"_score\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"Score: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 
"3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ea287039af04dad6d63f76806df53455e70d540a Mon Sep 17 00:00:00 2001 From: Justin Castilla Date: Tue, 15 Jul 2025 07:41:51 -0700 Subject: [PATCH 02/11] removes old install script --- ...dated-ecommerce_dense_sparse_project.ipynb | 37 ------------------- 1 file changed, 37 deletions(-) diff --git a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb index d0f27d6b..16758064 100644 --- a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb +++ b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb @@ -53,31 +53,6 @@ "%pip install elasticsearch" ] }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8c36e9b5-8f2b-4734-9213-1350caa7f837", - "metadata": { - "id": "8c36e9b5-8f2b-4734-9213-1350caa7f837" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[31mERROR: Could not find a version that satisfies the requirement torch==1.11 (from versions: 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0, 2.7.1)\u001b[0m\u001b[31m\n", - "\u001b[0m\u001b[31mERROR: No matching distribution found for torch==1.11\u001b[0m\u001b[31m\n", - "\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n", - "Note: you may need to restart the kernel to use updated packages.\n" 
- ] - } - ], - "source": [ - "pip -q install eland elasticsearch sentence_transformers transformers torch==1.11 # Eland Python Client" - ] - }, { "cell_type": "code", "execution_count": 3, @@ -1094,18 +1069,6 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" } }, "nbformat": 4, From b8c06b6b23cf926dba6b61cd19812b0273840a9a Mon Sep 17 00:00:00 2001 From: Justin Castilla Date: Tue, 15 Jul 2025 07:55:03 -0700 Subject: [PATCH 03/11] updating cell outputs, copy, removes unecessary scripts --- ...dated-ecommerce_dense_sparse_project.ipynb | 361 +++++------------- 1 file changed, 88 insertions(+), 273 deletions(-) diff --git a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb index 16758064..6c5fd70a 100644 --- a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb +++ b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb @@ -11,7 +11,7 @@ "\n", "In this example, you will explore various approaches to retrieving information using Elasticsearch, focusing specifically on text, lexical and semantic search.\n", "\n", - "To accomplish this, this example demonstrate various search scenarios on a dataset generated to simulate e-commerce product information.\n", + "To accomplish this, this example demonstrates various search scenarios on a dataset generated to simulate e-commerce product information.\n", "\n", "This dataset contains over 2,500 products, each with a description. 
These products are categorized into 76 distinct product categories, with each category containing a varying number of products.\n", "\n", @@ -19,10 +19,10 @@ "\n", "For this example, you will need:\n", "\n", - "- Python 3.6 or later\n", + "- Python 3.11 or later\n", "- The Elastic Python client\n", - "- Elastic 8.8 deployment or later, with 8GB memory machine learning node\n", - "- The Elastic Learned Sparse EncodeR model that comes pre-loaded into Elastic installed and started on your deployment\n", + "- Elastic 9.0 deployment or later, with 8GB memory machine learning node\n", + "\n", "\n", "We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html), a [free trial](https://cloud.elastic.co/registration?onboarding_token=vectorsearch&utm_source=github&utm_content=elasticsearch-labs-notebook) is available." ] @@ -38,24 +38,46 @@ "\n", "To get started, we'll need to connect to our Elastic deployment using the Python client.\n", "\n", - "Because we're using an Elastic Cloud deployment, we'll use the **Cloud ID** to identify our deployment.\n" + "Because we're using an Elastic Cloud deployment, we'll use the **Cloud Endpoint** and **Cloud API Key** to identify our deployment.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 147, "id": "e8d24cd8-a437-4bd2-a1f0-93e535ccf8a9", "metadata": { "id": "e8d24cd8-a437-4bd2-a1f0-93e535ccf8a9" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "70582.50s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: elasticsearch in /Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (9.0.2)\n", + "Requirement already satisfied: elastic-transport<9,>=8.15.1 in /Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from elasticsearch) 
(8.17.1)\n", + "Requirement already satisfied: python-dateutil in /Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from elasticsearch) (2.9.0.post0)\n", + "Requirement already satisfied: typing-extensions in /Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from elasticsearch) (4.12.2)\n", + "Requirement already satisfied: urllib3<3,>=1.26.2 in /Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2.2.2)\n", + "Requirement already satisfied: certifi in /Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2024.8.30)\n", + "Requirement already satisfied: six>=1.5 in /Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from python-dateutil->elasticsearch) (1.16.0)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip install elasticsearch" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 148, "id": "eaf90bc8-647e-4ada-9aa9-5cb9e60762b7", "metadata": { "id": "eaf90bc8-647e-4ada-9aa9-5cb9e60762b7" @@ -91,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 151, "id": "6907a2bf-4927-428e-9ca8-9df3dd35a2cc", "metadata": { "id": "6907a2bf-4927-428e-9ca8-9df3dd35a2cc" @@ -110,22 +132,23 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "9d05a473", + "metadata": {}, + "source": [ + "Let's verify that our client is connected." 
+ ] + }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "8980e76b", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'name': 'serverless', 'cluster_name': 'f15e57523cf84631a30f3aaf16c3ecf0', 'cluster_uuid': 'Wumo0cJWRZC8YfiHnVNwnQ', 'version': {'number': '8.11.0', 'build_flavor': 'serverless', 'build_type': 'docker', 'build_hash': '00000000', 'build_date': '2023-10-31', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '8.11.0', 'minimum_index_compatibility_version': '8.11.0'}, 'tagline': 'You Know, for Search'}\n" - ] - } - ], + "outputs": [], "source": [ - "print(client.info())" + "resp = client.info()\n", + "print(resp)" ] }, { @@ -142,29 +165,12 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "7f6f3f5a-2b93-4a0c-93c8-c887ca80f687", "metadata": { "id": "7f6f3f5a-2b93-4a0c-93c8-c887ca80f687" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model ID: .multilingual-e5-small\n", - "Description: E5 small multilingual\n", - "Inference Config: {'text_embedding': {'vocabulary': {'index': '.ml-inference-native-000002'}, 'tokenization': {'xlm_roberta': {'do_lower_case': False, 'with_special_tokens': True, 'max_sequence_length': 512, 'truncate': 'first', 'span': -1}}, 'embedding_size': 384}}\n", - "Version: 12.0.0\n", - "Tags: []\n", - "------\n", - "{'inference_id': '.multilingual-e5-small-elasticsearch', 'task_type': 'text_embedding', 'service': 'elasticsearch', 'service_settings': {'num_threads': 1, 'model_id': '.multilingual-e5-small_linux-x86_64', 'adaptive_allocations': {'enabled': True, 'min_number_of_allocations': 0, 'max_number_of_allocations': 32}}, 'chunking_settings': {'strategy': 'sentence', 'max_chunk_size': 250, 'sentence_overlap': 1}}\n", - "Inference Endpoint ID: .multilingual-e5-small-elasticsearch\n", - "Model ID: .multilingual-e5-small_linux-x86_64\n", - "Task 
Type: text_embedding\n" - ] - } - ], + "outputs": [], "source": [ "# set the model to .multilingual-e5-small-elasticsearch\n", "es_model_id = \".multilingual-e5-small\"\n", @@ -176,11 +182,7 @@ " if model[\"model_id\"] == es_model_id:\n", " print(f\"Model ID: {model['model_id']}\")\n", " print(f\"Description: {model.get('description', 'No description')}\")\n", - " print(\n", - " f\"Inference Config: {model.get('inference_config', 'No inference config')}\"\n", - " )\n", " print(f\"Version: {model.get('version', 'N/A')}\")\n", - " print(f\"Tags: {model.get('tags', [])}\")\n", " break\n", "else:\n", " print(f\"Model {es_model_id} not found.\")\n", @@ -189,7 +191,6 @@ "\n", "inference_endpoint = client.inference.get(inference_id=es_model_endpoint)\n", "inference_endpoint = inference_endpoint[\"endpoints\"][0]\n", - "print(inference_endpoint)\n", "print(f\"Inference Endpoint ID: {es_model_endpoint}\")\n", "print(\n", " f\"Model ID: {inference_endpoint.get('service_settings', {}).get('model_id', 'N/A')}\"\n", @@ -199,26 +200,14 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": null, "id": "6739f55b-6983-4b48-9349-6e0111b313fe", "metadata": { "id": "6739f55b-6983-4b48-9349-6e0111b313fe" }, - "outputs": [ - { - "data": { - "text/plain": [ - "ObjectApiResponse({'acknowledged': True})" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "# Creating an ingest pipeline with inference processors to use ELSER (sparse) and all-mpnet-base-v2 (dense) to infer against data that will be ingested in the pipeline.\n", - "\n", + "# Creating an ingest pipeline with inference processors to use ELSER (sparse) and e5_multilingual_small (dense) to infer against data that will be ingested in the pipeline.\n", "client.ingest.put_pipeline(\n", " id=\"ecommerce-pipeline\",\n", " processors=[\n", @@ -258,30 +247,19 @@ "source": [ "## Index documents\n", "\n", - "Then, we create a source index to 
load `products-ecommerce.json`, this will be the `ecommerce` index and a destination index to extract the documents from the source and index these documents into the destination `ecommerce-search`.\n", + "Then, we create a source index to load `products-ecommerce.json` or our remote source. This will be the `ecommerce` index and a destination index to extract the documents from the source and index these documents into the destination `ecommerce-search`.\n", "\n", - "For the `ecommerce-search` index we add a field to support dense vector storage and search `description_vector.predicted_value`, this is the target field for inference results. The field type in this case is `dense_vector`, the `all-mpnet-base-v2` model has embedding_size of 768, so dims is set to 768. We also add a `rank_features` field type to support the text expansion output." + "For the `ecommerce-search` index we add a field to support dense vector storage and search `e5_description_vector`, this is the target field for inference results. The field type in this case is `dense_vector`, the `e5_multilingual_small` model has embedding_size of 384, so dims is set to 384. We also add a `elser_description_vector` field type to support the text expansion output." 
] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "id": "6e115bd0-e758-44db-b5b9-96217af472c1", "metadata": { "id": "6e115bd0-e758-44db-b5b9-96217af472c1" }, - "outputs": [ - { - "data": { - "text/plain": [ - "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'ecommerce'})" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Index to load products-ecommerce.json docs\n", "if client.indices.exists(index=\"ecommerce\"):\n", @@ -307,23 +285,12 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": null, "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418", "metadata": { "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418" }, - "outputs": [ - { - "data": { - "text/plain": [ - "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'ecommerce-search'})" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Reindex dest index\n", "\n", @@ -371,20 +338,12 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": null, "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba", "metadata": { "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done indexing documents into `ecommerce` index\n" - ] - } - ], + "outputs": [], "source": [ "# dataset\n", "\n", @@ -427,23 +386,12 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": null, "id": "4297cb0b-ae2e-44f9-811d-27a41c43a858", "metadata": { "id": "4297cb0b-ae2e-44f9-811d-27a41c43a858" }, - "outputs": [ - { - "data": { - "text/plain": [ - "ObjectApiResponse({'took': 76252, 'timed_out': False, 'total': 2506, 'updated': 0, 'created': 2506, 'deleted': 0, 'batches': 3, 'version_conflicts': 0, 'noops': 0, 'retries': {'bulk': 0, 'search': 0}, 'throttled_millis': 0, 'requests_per_second': -1.0, 
'throttled_until_millis': 0, 'failures': []})" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Reindex data from one index 'source' to another 'dest' with the 'ecommerce-pipeline' pipeline.\n", "\n", @@ -466,20 +414,12 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": null, "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a", "metadata": { "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analyzed Tokens: ['comfortable', 'furniture', 'for', 'a', 'large', 'balcony']\n" - ] - } - ], + "outputs": [], "source": [ "# Performs text analysis on a string and returns the resulting tokens.\n", "\n", @@ -511,20 +451,12 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": null, "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039", "metadata": { "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analyzed Tokens: ['comfortable', 'furniture', 'large', 'balcony']\n" - ] - } - ], + "outputs": [], "source": [ "# Performs text analysis on a string and returns the resulting tokens.\n", "\n", @@ -556,31 +488,12 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": null, "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f", "metadata": { "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 13.206358\n", - "Product: Barbie Dreamhouse\n", - "Category: Toys\n", - "Description: is a classic Barbie playset with multiple rooms, furniture, a large balcony, a pool, and accessories. It allows kids to create their dream Barbie world.\n", - "\n", - "\n", - "Score: 7.827815\n", - "Product: Comfortable Rocking Chair\n", - "Category: Indoor Furniture\n", - "Description: enjoy relaxing moments with this comfortable rocking chair. 
Its smooth motion and cushioned seat make it an ideal piece of furniture for unwinding.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# BM25\n", "\n", @@ -623,31 +536,12 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": null, "id": "72187c9a-14c1-4084-a080-4e5c1e614f22", "metadata": { "id": "72187c9a-14c1-4084-a080-4e5c1e614f22" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 0.93147576\n", - "Product: Metal Garden Bench with Cushion\n", - "Category: Garden Furniture\n", - "Description: is a stylish and comfortable metal garden bench, complete with a cushion for added support.\n", - "\n", - "\n", - "Score: 0.9304026\n", - "Product: Garden Dining Set with Swivel Chairs\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# KNN\n", "\n", @@ -690,31 +584,12 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": null, "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5", "metadata": { "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 11.354144\n", - "Product: Garden Lounge Set with Side Table\n", - "Category: Garden Furniture\n", - "Description: is a comfortable and stylish garden lounge set, including a sofa, chairs, and a side table for outdoor relaxation.\n", - "\n", - "\n", - "Score: 11.189863\n", - "Product: Garden Lounge Chair with Sunshade\n", - "Category: Garden Furniture\n", - "Description: is a comfortable and versatile garden lounge chair with a built-in sunshade, perfect for hot sunny days.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# Elastic Learned Sparse Encoder - ELSER\n", "\n", @@ -753,31 +628,12 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": null, "id": 
"f84aa16b-49c5-4abf-a049-d556c225542e", "metadata": { "id": "f84aa16b-49c5-4abf-a049-d556c225542e" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 18.161213\n", - "Product: Garden Dining Set with Swivel Chairs\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", - "\n", - "\n", - "Score: 17.770641\n", - "Product: Garden Dining Set with Swivel Rockers\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# BM25 + KNN (Linear Combination)\n", "\n", @@ -835,29 +691,12 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": null, "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861", "metadata": { "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Score: 0.09307359\n", - "Product: Garden Dining Set with Swivel Rockers\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.\n", - "\n", - "Score: 0.04761905\n", - "Product: Garden Dining Set with Swivel Chairs\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# BM25 + KNN (RRF)\n", "top_k = 2\n", @@ -918,31 +757,12 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": null, "id": "bd842732-b20a-4c7a-b735-e1f558a9b922", "metadata": { "id": "bd842732-b20a-4c7a-b735-e1f558a9b922" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 35.605705\n", - 
"Product: Garden Dining Set with Swivel Rockers\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.\n", - "\n", - "\n", - "Score: 33.858994\n", - "Product: Garden Dining Set with Swivel Chairs\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# BM25 + Elastic Learned Sparse Encoder (Linear Combination)\n", "\n", @@ -993,27 +813,10 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": null, "id": "199c5c60", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Score: 0.0952381\n", - "Product: Garden Dining Set with Swivel Rockers\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.\n", - "\n", - "Score: 0.045454547\n", - "Product: Garden Dining Set with Swivel Chairs\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# BM25 + ELSER (RRF)\n", "top_k = 2\n", @@ -1069,6 +872,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" } }, "nbformat": 4, From b76d5da3707450b763fa7fca21590c9b1db29856 Mon Sep 17 00:00:00 2001 From: Justin Castilla Date: Tue, 15 Jul 2025 22:17:40 -0700 Subject: [PATCH 04/11] removes older code patterns, adds table of 
results --- ...dated-ecommerce_dense_sparse_project.ipynb | 1030 +++++++++++++---- 1 file changed, 830 insertions(+), 200 deletions(-) diff --git a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb index 6c5fd70a..d0479e95 100644 --- a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb +++ b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb @@ -43,56 +43,37 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": null, "id": "e8d24cd8-a437-4bd2-a1f0-93e535ccf8a9", "metadata": { "id": "e8d24cd8-a437-4bd2-a1f0-93e535ccf8a9" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "70582.50s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: elasticsearch in /Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (9.0.2)\n", - "Requirement already satisfied: elastic-transport<9,>=8.15.1 in /Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from elasticsearch) (8.17.1)\n", - "Requirement already satisfied: python-dateutil in /Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from elasticsearch) (2.9.0.post0)\n", - "Requirement already satisfied: typing-extensions in /Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from elasticsearch) (4.12.2)\n", - "Requirement already satisfied: urllib3<3,>=1.26.2 in /Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2.2.2)\n", - "Requirement already satisfied: certifi in 
/Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2024.8.30)\n", - "Requirement already satisfied: six>=1.5 in /Users/justin.castilla/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from python-dateutil->elasticsearch) (1.16.0)\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], + "outputs": [], "source": [ - "%pip install elasticsearch" + "!pip install elasticsearch" ] }, { "cell_type": "code", - "execution_count": 148, + "execution_count": null, "id": "eaf90bc8-647e-4ada-9aa9-5cb9e60762b7", "metadata": { "id": "eaf90bc8-647e-4ada-9aa9-5cb9e60762b7" }, "outputs": [], "source": [ - "from elasticsearch import (\n", - " Elasticsearch,\n", - " helpers,\n", - ") # Import the Elasticsearch client and helpers module\n", - "from urllib.request import urlopen # library for opening URLs\n", + "# import the Elasticsearch client and bulk function\n", + "from elasticsearch import Elasticsearch\n", + "from elasticsearch.helpers import bulk\n", + "\n", + "# import json module to read JSON file of products\n", "import json # module for handling JSON data\n", - "from pathlib import Path # module for working with file paths\n", "\n", - "import getpass # handling password input" + "import getpass # handling password input\n", + "\n", + "# display search results in a table\n", + "import pandas as pd\n", + "from IPython.display import display, Markdown" ] }, { @@ -113,20 +94,20 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": null, "id": "6907a2bf-4927-428e-9ca8-9df3dd35a2cc", "metadata": { "id": "6907a2bf-4927-428e-9ca8-9df3dd35a2cc" }, "outputs": [], "source": [ - "# Found in the 'Manage Deployment' page\n", + "# your endpoint for your Elasticsearch instance\n", "ELASTIC_ENDPOINT = getpass.getpass(\"Enter Elastic Endpoint: \")\n", "\n", - "# Password for the 'elastic' user generated by Elasticsearch\n", + "# your Elastic API Key for 
Elasticsearch\n", "ELASTIC_API_KEY = getpass.getpass(\"Enter Elastic API Key: \")\n", "\n", - "# Create the client instance\n", + "# create the Elasticsearch client instance\n", "client = Elasticsearch(\n", " hosts=[ELASTIC_ENDPOINT], api_key=ELASTIC_API_KEY, request_timeout=3600\n", ")" @@ -142,13 +123,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 183, "id": "8980e76b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected: True\n" + ] + } + ], "source": [ - "resp = client.info()\n", - "print(resp)" + "resp = client.ping()\n", + "print(f\"Connected: {resp}\")" ] }, { @@ -158,7 +147,7 @@ "id": "BH-N6epTJarM" }, "source": [ - "## Setup emebdding model\n", + "## Define our embedding model\n", "\n", "Next we upload the all-mpnet-base-v2 embedding model into Elasticsearch and create an ingest pipeline with inference processors for text embedding and text expansion, using the description field for both. This field contains the description of each product." 
] @@ -170,7 +159,21 @@ "metadata": { "id": "7f6f3f5a-2b93-4a0c-93c8-c887ca80f687" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model ID: .multilingual-e5-small\n", + "Description: E5 small multilingual\n", + "Version: 12.0.0\n", + "------\n", + "Inference Endpoint ID: .multilingual-e5-small-elasticsearch\n", + "Model ID: .multilingual-e5-small_linux-x86_64\n", + "Task Type: text_embedding\n" + ] + } + ], "source": [ "# set the model to .multilingual-e5-small-elasticsearch\n", "es_model_id = \".multilingual-e5-small\"\n", @@ -189,6 +192,7 @@ "\n", "print(\"------\")\n", "\n", + "# verify the inference endpoint is ready to use\n", "inference_endpoint = client.inference.get(inference_id=es_model_endpoint)\n", "inference_endpoint = inference_endpoint[\"endpoints\"][0]\n", "print(f\"Inference Endpoint ID: {es_model_endpoint}\")\n", @@ -198,37 +202,56 @@ "print(f\"Task Type: {inference_endpoint['task_type']}\")" ] }, + { + "cell_type": "markdown", + "id": "80506477", + "metadata": {}, + "source": [ + "## Create an inference pipeline\n", + "This function will create an ingest pipeline with inference processors to use `ELSER` (sparse_vector) and `e5_multilingual_small` (dense_vector) to infer against data that will be ingested in the pipeline." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 200, "id": "6739f55b-6983-4b48-9349-6e0111b313fe", "metadata": { "id": "6739f55b-6983-4b48-9349-6e0111b313fe" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "ObjectApiResponse({'acknowledged': True})" + ] + }, + "execution_count": 200, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Creating an ingest pipeline with inference processors to use ELSER (sparse) and e5_multilingual_small (dense) to infer against data that will be ingested in the pipeline.\n", "client.ingest.put_pipeline(\n", " id=\"ecommerce-pipeline\",\n", " processors=[\n", " {\n", " \"inference\": {\n", - " \"model_id\": \".elser-2-elasticsearch\",\n", + " \"model_id\": \".elser-2-elasticsearch\", # inference endpoint ID\n", " \"input_output\": [\n", " {\n", - " \"input_field\": \"description\",\n", - " \"output_field\": \"elser_description_vector\",\n", + " \"input_field\": \"description\", # source field\n", + " \"output_field\": \"elser_description_vector\", # destination vector field\n", " }\n", " ],\n", " }\n", " },\n", " {\n", " \"inference\": {\n", - " \"model_id\": \".multilingual-e5-small-elasticsearch\", # Inference endpoint ID\n", + " \"model_id\": \".multilingual-e5-small-elasticsearch\", # inference endpoint ID\n", " \"input_output\": [\n", " {\n", - " \"input_field\": \"description\",\n", - " \"output_field\": \"e5_description_vector\",\n", + " \"input_field\": \"description\", # source field\n", + " \"output_field\": \"e5_description_vector\", # destination vector field\n", " }\n", " ],\n", " \"inference_config\": {\"text_embedding\": {}},\n", @@ -246,81 +269,68 @@ }, "source": [ "## Index documents\n", + "The `ecommerce-search` index we are creating will include fields to support dense and sparse vector storage and search. \n", "\n", - "Then, we create a source index to load `products-ecommerce.json` or our remote source. 
This will be the `ecommerce` index and a destination index to extract the documents from the source and index these documents into the destination `ecommerce-search`.\n", "\n", - "For the `ecommerce-search` index we add a field to support dense vector storage and search `e5_description_vector`, this is the target field for inference results. The field type in this case is `dense_vector`, the `e5_multilingual_small` model has embedding_size of 384, so dims is set to 384. We also add a `elser_description_vector` field type to support the text expansion output." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e115bd0-e758-44db-b5b9-96217af472c1", - "metadata": { - "id": "6e115bd0-e758-44db-b5b9-96217af472c1" - }, - "outputs": [], - "source": [ - "# Index to load products-ecommerce.json docs\n", - "if client.indices.exists(index=\"ecommerce\"):\n", - " client.indices.delete(index=\"ecommerce\")\n", "\n", - "client.indices.create(\n", - " index=\"ecommerce\",\n", - " mappings={\n", - " \"properties\": {\n", - " \"product\": {\n", - " \"type\": \"text\",\n", - " },\n", - " \"description\": {\n", - " \"type\": \"text\",\n", - " },\n", - " \"category\": {\n", - " \"type\": \"text\",\n", - " },\n", - " }\n", - " },\n", - ")" + "We define the `e5_description_vector` and the `elser_description_vector` fields to store the inference pipeline results. The field type in `e5_description_vector` is a `dense_vector`. The `.multilingual-e5-small` model has embedding_size of 384, so the dimension of the vector (dims) is set to 384. \n", "\n", + "We also add an `elser_description_vector` field type to support the `sparse_vector` output from our `.elser_model_2_linux-x86_64` model. No further configuration is needed for this field for our use case."
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 221, "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418", "metadata": { "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "ObjectApiResponse({'acknowledged': True})" + ] + }, + "execution_count": 221, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Reindex dest index\n", - "\n", - "INDEX = \"ecommerce-search\"\n", - "if client.indices.exists(index=INDEX):\n", - " client.indices.delete(index=INDEX)\n", - "client.indices.create(\n", - " index=INDEX,\n", - " mappings={\n", - " # Saving disk space by excluding the ELSER tokens and the dense_vector field from document source.\n", - " # Note: That should only be applied if you are certain that reindexing will not be required in the future.\n", - " \"properties\": {\n", - " \"product\": {\n", - " \"type\": \"text\",\n", - " },\n", - " \"description\": {\n", - " \"type\": \"text\",\n", - " },\n", - " \"category\": {\n", - " \"type\": \"text\",\n", - " },\n", - " \"elser_description_vector\": {\"type\": \"sparse_vector\"},\n", - " \"e5_description_vector\": { # Inference results field, target_field.predicted_value\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 384, # The all-mpnet-base-v2 model has embedding_size of 768, so dims is set to 768.\n", - " \"index\": \"true\",\n", - " \"similarity\": \"cosine\", # When indexing vectors for approximate kNN search, you need to specify the similarity function for comparing the vectors.\n", - " },\n", + "# define the index name and mapping\n", + "commerce_index = \"ecommerce-search\"\n", + "mappings = {\n", + " \"properties\": {\n", + " \"product\": {\n", + " \"type\": \"text\",\n", + " },\n", + " \"description\": {\n", + " \"type\": \"text\",\n", + " },\n", + " \"category\": {\n", + " \"type\": \"text\",\n", + " },\n", + " \"elser_description_vector\": {\"type\": \"sparse_vector\"},\n", + " 
\"e5_description_vector\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 384,\n", + " \"index\": \"true\",\n", + " \"similarity\": \"cosine\",\n", " },\n", " },\n", + "}\n", + "\n", + "\n", + "if client.indices.exists(index=commerce_index):\n", + " client.indices.delete(index=commerce_index)\n", + "client.indices.create(\n", + " index=commerce_index,\n", + " mappings=mappings,\n", + ")\n", + "\n", + "# set the ecommerce-pipeline as a the default pipeline for the ecommerce-search index\n", + "client.indices.put_settings(\n", + " index=commerce_index,\n", + " body={\"default_pipeline\": \"ecommerce-pipeline\"},\n", ")" ] }, @@ -333,73 +343,50 @@ "source": [ "## Load documents\n", "\n", - "Then we load `products-ecommerce.json` into the `ecommerce` index." + "Then we load `products-ecommerce.json` into the `ecommerce-search` index. We will use the `bulk` helper function to index our documents en masse. " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 222, "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba", "metadata": { "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Indexing documents...\n", + "Documents indexed successfully.\n" + ] + } + ], "source": [ - "# dataset\n", - "\n", - "import json\n", - "\n", + "# Load the dataset\n", "with open(\"products-ecommerce.json\", \"r\") as f:\n", " data_json = json.load(f)\n", "\n", "\n", + "# helper function to create bulk indexing body\n", "def create_index_body(doc):\n", - " \"\"\"Generate the body for an Elasticsearch document.\"\"\"\n", " return {\n", - " \"_index\": \"ecommerce\",\n", + " \"_index\": \"ecommerce-search\",\n", " \"_source\": doc,\n", " }\n", "\n", "\n", - "# Prepare the documents to be indexed\n", + "# prepare the documents to be indexed\n", "documents = [create_index_body(doc) for doc in data_json]\n", "\n", - "# Use helpers.bulk to index\n", - "helpers.bulk(client, 
documents)\n", - "\n", - "print(\"Done indexing documents into `ecommerce` index\")" - ] - }, - { - "cell_type": "markdown", - "id": "3dShN9W4Opl8", - "metadata": { - "id": "3dShN9W4Opl8" - }, - "source": [ - "## Reindex\n", - "\n", - "Now we can reindex data from the `source` index `ecommerce` to the `dest` index `ecommerce-search` with the ingest pipeline `ecommerce-pipeline` we created.\n", - "\n", - "After this step our `dest` index will have the fields we need to perform Semantic Search." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4297cb0b-ae2e-44f9-811d-27a41c43a858", - "metadata": { - "id": "4297cb0b-ae2e-44f9-811d-27a41c43a858" - }, - "outputs": [], - "source": [ - "# Reindex data from one index 'source' to another 'dest' with the 'ecommerce-pipeline' pipeline.\n", - "\n", - "client.reindex(\n", - " wait_for_completion=True,\n", - " source={\"index\": \"ecommerce\"},\n", - " dest={\"index\": \"ecommerce-search\", \"pipeline\": \"ecommerce-pipeline\"},\n", - ")" + "# use bulk function to index\n", + "try:\n", + " print(\"Indexing documents...\")\n", + " bulk(client, documents)\n", + " print(\"Documents indexed successfully.\")\n", + "except Exception as e:\n", + " print(f\"Error indexing documents: {e}\")" ] }, { @@ -414,12 +401,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 169, "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a", "metadata": { "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Analyzed Tokens: ['comfortable', 'furniture', 'for', 'a', 'large', 'balcony']\n" + ] + } + ], "source": [ "# Performs text analysis on a string and returns the resulting tokens.\n", "\n", @@ -456,10 +451,18 @@ "metadata": { "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Analyzed Tokens: ['comfortable', 'furniture', 
'large', 'balcony']\n" + ] + } + ], "source": [ "# Performs text analysis on a string and returns the resulting tokens.\n", - "\n", + "# TODO: Partial Smoosh together\n", "# Define the text to be analyzed\n", "text = \"Comfortable furniture for a large balcony\"\n", "\n", @@ -488,16 +491,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 225, "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f", "metadata": { "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 13.408413\n", + "Product: Barbie Dreamhouse\n", + "Category: Toys\n", + "Description: is a classic Barbie playset with multiple rooms, furniture, a large balcony, a pool, and accessories. It allows kids to create their dream Barbie world.\n", + "\n", + "\n", + "Score: 7.5048585\n", + "Product: Rattan Patio Conversation Set\n", + "Category: Outdoor Furniture\n", + "Description: is a stylish and comfortable outdoor furniture set, including a sofa, two chairs, and a coffee table, all made of durable rattan material.\n", + "\n" + ] + } + ], "source": [ - "# BM25\n", + "results_list = []\n", "\n", - "response = client.search(\n", + "# Regular BM25 (Lexical) Search\n", + "resp = client.search(\n", " size=2,\n", " index=\"ecommerce-search\",\n", " query={\n", @@ -508,13 +531,16 @@ " }\n", " }\n", " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", - "hits = response[\"hits\"][\"hits\"]\n", "\n", - "if not hits:\n", + "lexical_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"lexical_search\": lexical_search_results})\n", + "\n", + "if not lexical_search_results:\n", " print(\"No matches found\")\n", "else:\n", - " for hit in hits:\n", + " for hit in lexical_search_results:\n", " score = hit[\"_score\"]\n", " product = hit[\"_source\"][\"product\"]\n", " category = hit[\"_source\"][\"category\"]\n", @@ -536,15 
+562,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 226, "id": "72187c9a-14c1-4084-a080-4e5c1e614f22", "metadata": { "id": "72187c9a-14c1-4084-a080-4e5c1e614f22" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 0.93147576\n", + "Product: Metal Garden Bench with Cushion\n", + "Category: Garden Furniture\n", + "Description: is a stylish and comfortable metal garden bench, complete with a cushion for added support.\n", + "\n", + "\n", + "Score: 0.9304026\n", + "Product: Garden Dining Set with Swivel Chairs\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", + "\n" + ] + } + ], "source": [ "# KNN\n", - "\n", + "# TODO: Add Semantic_Text type?\n", "response = client.search(\n", " index=\"ecommerce-search\",\n", " size=2,\n", @@ -559,9 +604,13 @@ " }\n", " },\n", " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", "\n", - "for hit in response[\"hits\"][\"hits\"]:\n", + "dense_semantic_search_results = response[\"hits\"][\"hits\"]\n", + "results_list.append({\"dense_semantic_search\": dense_semantic_search_results})\n", + "\n", + "for hit in dense_semantic_search_results:\n", "\n", " score = hit[\"_score\"]\n", " product = hit[\"_source\"][\"product\"]\n", @@ -584,16 +633,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 227, "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5", "metadata": { "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 11.1893\n", + "Product: Garden Lounge Chair with Sunshade\n", + "Category: Garden Furniture\n", + "Description: is a comfortable and versatile garden lounge chair with a built-in sunshade, perfect for hot sunny days.\n", + "\n", + 
"\n", + "Score: 11.187605\n", + "Product: Rattan Patio Conversation Set\n", + "Category: Outdoor Furniture\n", + "Description: is a stylish and comfortable outdoor furniture set, including a sofa, two chairs, and a coffee table, all made of durable rattan material.\n", + "\n" + ] + } + ], "source": [ "# Elastic Learned Sparse Encoder - ELSER\n", "\n", - "response = client.search(\n", + "resp = client.search(\n", " index=\"ecommerce-search\",\n", " size=2,\n", " query={\n", @@ -603,9 +671,14 @@ " \"query\": \"Comfortable furniture for a large balcony\",\n", " }\n", " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", "\n", - "for hit in response[\"hits\"][\"hits\"]:\n", + "\n", + "sparse_semantic_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"sparse_semantic_search\": sparse_semantic_search_results})\n", + "\n", + "for hit in sparse_semantic_search_results:\n", "\n", " score = hit[\"_score\"]\n", " product = hit[\"_source\"][\"product\"]\n", @@ -628,16 +701,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 229, "id": "f84aa16b-49c5-4abf-a049-d556c225542e", "metadata": { "id": "f84aa16b-49c5-4abf-a049-d556c225542e" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 18.161213\n", + "Product: Garden Dining Set with Swivel Chairs\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", + "\n", + "\n", + "Score: 17.770641\n", + "Product: Garden Dining Set with Swivel Rockers\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.\n", + "\n" + ] + } + ], "source": [ "# BM25 + KNN (Linear Combination)\n", - "\n", - "response = client.search(\n", + "query = \"A dining table 
and comfortable chairs for a large balcony\"\n", + "resp = client.search(\n", " index=\"ecommerce-search\",\n", " size=2,\n", " query={\n", @@ -646,8 +738,8 @@ " {\n", " \"match\": {\n", " \"description\": {\n", - " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", - " \"boost\": 1, # You can adjust the boost value\n", + " \"query\": query,\n", + " \"boost\": 1,\n", " }\n", " }\n", " }\n", @@ -658,17 +750,21 @@ " \"field\": \"e5_description_vector\",\n", " \"k\": 2,\n", " \"num_candidates\": 20,\n", - " \"boost\": 1, # You can adjust the boost value\n", + " \"boost\": 1,\n", " \"query_vector_builder\": {\n", " \"text_embedding\": {\n", " \"model_id\": \".multilingual-e5-small-elasticsearch\",\n", - " \"model_text\": \"A dining table and comfortable chairs for a large balcony\",\n", + " \"model_text\": query,\n", " }\n", " },\n", " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", "\n", - "for hit in response[\"hits\"][\"hits\"]:\n", + "dense_linear_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"dense_linear_search\": dense_linear_search_results})\n", + "\n", + "for hit in dense_linear_search_results:\n", "\n", " score = hit[\"_score\"]\n", " product = hit[\"_source\"][\"product\"]\n", @@ -691,16 +787,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 230, "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861", "metadata": { "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Score: 0.0952381\n", + "Product: Garden Dining Set with Swivel Chairs\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", + "\n", + "Score: 0.045454547\n", + "Product: Patio Dining Set with Bench\n", + "Category: Outdoor Furniture\n", + "Description: is a 
spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.\n", + "\n" + ] + } + ], "source": [ "# BM25 + KNN (RRF)\n", "top_k = 2\n", - "response = client.search(\n", + "resp = client.search(\n", " index=\"ecommerce-search\",\n", " retriever={\n", " \"rrf\": {\n", @@ -732,9 +845,13 @@ " \"rank_constant\": 20,\n", " }\n", " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", "\n", - "for hit in response[\"hits\"][\"hits\"]:\n", + "dense_rrf_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"dense_rrf_search\": dense_rrf_search_results})\n", + "\n", + "for hit in dense_rrf_search_results:\n", "\n", " score = hit[\"_score\"]\n", " category = hit[\"_source\"][\"category\"]\n", @@ -757,16 +874,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 232, "id": "bd842732-b20a-4c7a-b735-e1f558a9b922", "metadata": { "id": "bd842732-b20a-4c7a-b735-e1f558a9b922" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 33.896286\n", + "Product: Garden Dining Set with Swivel Chairs\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", + "\n", + "\n", + "Score: 32.462887\n", + "Product: Patio Dining Set with Bench\n", + "Category: Outdoor Furniture\n", + "Description: is a spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.\n", + "\n" + ] + } + ], "source": [ "# BM25 + Elastic Learned Sparse Encoder (Linear Combination)\n", "\n", - "response = client.search(\n", + "resp = client.search(\n", " index=\"ecommerce-search\",\n", " size=2,\n", " query={\n", @@ -790,10 +926,13 @@ " ]\n", " }\n", " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", "\n", 
- "for hit in response[\"hits\"][\"hits\"]:\n", + "sparse_linear_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"sparse_linear_search\": sparse_linear_search_results})\n", "\n", + "for hit in sparse_linear_search_results:\n", " score = hit[\"_score\"]\n", " product = hit[\"_source\"][\"product\"]\n", " category = hit[\"_source\"][\"category\"]\n", @@ -813,14 +952,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 233, "id": "199c5c60", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Score: 0.0952381\n", + "Product: Garden Dining Set with Swivel Chairs\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", + "\n", + "Score: 0.045454547\n", + "Product: Patio Dining Set with Bench\n", + "Category: Outdoor Furniture\n", + "Description: is a spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.\n", + "\n" + ] + } + ], "source": [ "# BM25 + ELSER (RRF)\n", "top_k = 2\n", - "response = client.search(\n", + "resp = client.search(\n", " index=\"ecommerce-search\",\n", " retriever={\n", " \"rrf\": {\n", @@ -850,9 +1006,13 @@ " \"rank_constant\": 20,\n", " }\n", " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", "\n", - "for hit in response[\"hits\"][\"hits\"]:\n", + "sparse_rrf_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"sparse_rrf_search_results\": sparse_rrf_search_results})\n", + "\n", + "for hit in sparse_rrf_search_results:\n", "\n", " score = hit[\"_score\"]\n", " category = hit[\"_source\"][\"category\"]\n", @@ -862,6 +1022,476 @@ " f\"Score: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", " )" ] + }, + { + "cell_type": "markdown", + "id": "7b95f9b8", + 
"metadata": {}, + "source": [ + "TODO: \n", + "- Semantic Text / Query BUilder (ask Serena)\n", + "- Table of Results\n", + "- Conclusion\n", + "- Next steps\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1162a857", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Dense Linear Search" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 productcategorydescriptionscore
0Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.18.161213
1Garden Dining Set with Swivel RockersGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.17.770641
2Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.18.161213
3Garden Dining Set with Swivel RockersGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.17.770641
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Dense Rrf Search" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 productcategorydescriptionscore
0Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.0.095238
1Patio Dining Set with BenchOutdoor Furnitureis a spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.0.045455
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Dense Semantic Search" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 productcategorydescriptionscore
0Metal Garden Bench with CushionGarden Furnitureis a stylish and comfortable metal garden bench, complete with a cushion for added support.0.931476
1Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.0.930403
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Lexical Search" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 productcategorydescriptionscore
0Barbie DreamhouseToysis a classic Barbie playset with multiple rooms, furniture, a large balcony, a pool, and accessories. It allows kids to create their dream Barbie world.13.408413
1Rattan Patio Conversation SetOutdoor Furnitureis a stylish and comfortable outdoor furniture set, including a sofa, two chairs, and a coffee table, all made of durable rattan material.7.504859
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Sparse Linear Search" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 productcategorydescriptionscore
0Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.33.896286
1Patio Dining Set with BenchOutdoor Furnitureis a spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.32.462887
2Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.33.896286
3Patio Dining Set with BenchOutdoor Furnitureis a spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.32.462887
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Sparse Rrf Search Results" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 productcategorydescriptionscore
0Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.0.095238
1Patio Dining Set with BenchOutdoor Furnitureis a spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.0.045455
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Sparse Semantic Search" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 productcategorydescriptionscore
0Garden Lounge Chair with SunshadeGarden Furnitureis a comfortable and versatile garden lounge chair with a built-in sunshade, perfect for hot sunny days.11.189300
1Rattan Patio Conversation SetOutdoor Furnitureis a stylish and comfortable outdoor furniture set, including a sofa, two chairs, and a coffee table, all made of durable rattan material.11.187605
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Flatten results for each search type\n", + "rows = []\n", + "for result in results_list:\n", + " search_type = list(result.keys())[0]\n", + "\n", + " for doc in result[search_type]:\n", + " row = {\n", + " \"search_type\": search_type,\n", + " \"product\": doc[\"_source\"].get(\"product\"),\n", + " \"category\": doc[\"_source\"].get(\"category\"),\n", + " \"description\": doc[\"_source\"].get(\"description\"),\n", + " \"score\": doc.get(\"_score\"),\n", + " }\n", + " rows.append(row)\n", + "\n", + "df = pd.DataFrame(rows)\n", + "\n", + "for search_type, group in df.groupby(\"search_type\"):\n", + " display(Markdown(f\"### {search_type.replace('_', ' ').title()}\"))\n", + " styled = (\n", + " group.drop(columns=\"search_type\")\n", + " .reset_index(drop=True)\n", + " .style.set_properties(\n", + " subset=[\"description\"],\n", + " **{\"white-space\": \"pre-wrap\", \"word-break\": \"break-word\"},\n", + " )\n", + " )\n", + " display(styled)" + ] } ], "metadata": { From 3cd8fdb5c1f0ef202304e65f60f947b24293b16a Mon Sep 17 00:00:00 2001 From: Justin Castilla Date: Thu, 17 Jul 2025 13:33:53 -0700 Subject: [PATCH 05/11] adds copy and semantic_text examples --- ...dated-ecommerce_dense_sparse_project.ipynb | 1221 ++++++----------- 1 file changed, 407 insertions(+), 814 deletions(-) diff --git a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb index d0479e95..7d0ba073 100644 --- a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb +++ b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb @@ -9,22 +9,42 @@ "source": [ "# **Lexical and Semantic Search 
with Elasticsearch**\n", "\n", - "In this example, you will explore various approaches to retrieving information using Elasticsearch, focusing specifically on text, lexical and semantic search.\n", + "In the following examples, we will explore various approaches to retrieving information using Elasticsearch - focusing specifically on full text search, semantic search, and a hybrid combination of both.\n", "\n", "To accomplish this, this example demonstrates various search scenarios on a dataset generated to simulate e-commerce product information.\n", "\n", - "This dataset contains over 2,500 products, each with a description. These products are categorized into 76 distinct product categories, with each category containing a varying number of products.\n", + "This dataset contains over 2,500 products, each with a description. These products are categorized into 76 distinct product categories, with each category containing a varying number of products. \n", "\n", + "Here is a sample of an object from the dataset:\n", + "\n", + "```json\n", + " {\n", + " \"product\": \"Samsung 49-inch Curved Gaming Monitor\",\n", + " \"description\": \"is a curved gaming monitor with a high refresh rate and AMD FreeSync technology. It offers an immersive gaming experience with smooth visuals.\",\n", + " \"category\": \"Monitors\"\n", + "}\n", + "\n", + "```\n", + "\n", + "We will consume the dataset from a JSON file into Elasticsearch using modern consumption patterns. 
We will then perform a series of search operations to demonstrate the different search strategies.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "6370f2e4", + "metadata": {}, + "source": [ "## **🧰 Requirements**\n", "\n", "For this example, you will need:\n", "\n", "- Python 3.11 or later\n", "- The Elastic Python client\n", - "- Elastic 9.0 deployment or later, with 8GB memory machine learning node\n", + "- Elastic 9.0 deployment or later on either a local, cloud, or serverless environment\n", "\n", "\n", - "We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html), a [free trial](https://cloud.elastic.co/registration?onboarding_token=vectorsearch&utm_source=github&utm_content=elasticsearch-labs-notebook) is available." + "We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html). You can use a [free trial here](https://cloud.elastic.co/registration?onboarding_token=vectorsearch&utm_source=github&utm_content=elasticsearch-labs-notebook) to get started." ] }, { @@ -38,7 +58,7 @@ "\n", "To get started, we'll need to connect to our Elastic deployment using the Python client.\n", "\n", - "Because we're using an Elastic Cloud deployment, we'll use the **Cloud Endpoint** and **Cloud API Key** to identify our deployment.\n" + "Because we're using an Elastic Cloud deployment, we'll use the **Cloud Endpoint** and **Cloud API Key** to identify our deployment. 
These may be found within Kibana by following the instructions [here](https://www.elastic.co/docs/deploy-manage/api-keys/elastic-cloud-api-keys).\n" ] }, { @@ -50,7 +70,22 @@ }, "outputs": [], "source": [ - "!pip install elasticsearch" + "%pip install elasticsearch pandas IPython -q" + ] + }, + { + "cell_type": "markdown", + "id": "38b734aa", + "metadata": {}, + "source": [ + "### Import the required packages\n", + "We will import the following packages:\n", + "- `Elasticsearch`: a client library for Elasticsearch actions\n", + "- `bulk`: a function to perform Elasticsearch actions in bulk\n", + "- `getpass`: a module for receiving Elasticsearch credentials via text prompt\n", + "- `json`: a module for reading and writing JSON data\n", + "- `pandas`, `display`, `Markdown`: for data visualization and markdown formatting\n", + "\n" ] }, { @@ -66,11 +101,12 @@ "from elasticsearch import Elasticsearch\n", "from elasticsearch.helpers import bulk\n", "\n", + "# import getpass module to handle Auth input\n", + "import getpass\n", + "\n", "# import json module to read JSON file of products\n", "import json # module for handling JSON data\n", "\n", - "import getpass # handling password input\n", - "\n", "# display search results in a table\n", "import pandas as pd\n", "from IPython.display import display, Markdown" @@ -83,13 +119,12 @@ "id": "ea1VkDBXJIQR" }, "source": [ - "Now we can instantiate the Python Elasticsearch client.\n", - "\n", - "First we prompt the user for their password and Cloud ID.\n", + "### 📚 Instantiating the Elasticsearch Client\n", "\n", - "🔐 NOTE: `getpass` enables us to securely prompt the user for credentials without echoing them to the terminal, or storing it in memory.\n", - "\n", - "Then we create a `client` object that instantiates an instance of the `Elasticsearch` class." 
+ "First we prompt the user for their Elastic Endpoint URL and Elastic API Key.\n", + "Then we create a `client` object that instantiates an instance of the `Elasticsearch` class.\n", + "Lastly, we verify that our client is connected to our Elasticsearch instance by calling `client.ping()`.\n", + "> 🔐 *NOTE: `getpass` enables us to securely prompt the user for credentials without echoing them to the terminal, or storing it in memory.*" ] }, { @@ -101,43 +136,19 @@ }, "outputs": [], "source": [ - "# your endpoint for your Elasticsearch instance\n", + "# endpoint for Elasticsearch instance\n", "ELASTIC_ENDPOINT = getpass.getpass(\"Enter Elastic Endpoint: \")\n", "\n", - "# your Elastic API Key for Elasticsearch\n", + "# Elastic API key for Elasticsearch\n", "ELASTIC_API_KEY = getpass.getpass(\"Enter Elastic API Key: \")\n", "\n", "# create the Elasticsearch client instance\n", "client = Elasticsearch(\n", " hosts=[ELASTIC_ENDPOINT], api_key=ELASTIC_API_KEY, request_timeout=3600\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "9d05a473", - "metadata": {}, - "source": [ - "Let's verify that our client is connected." - ] - }, - { - "cell_type": "code", - "execution_count": 183, - "id": "8980e76b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Connected: True\n" - ] - } - ], - "source": [ + ")\n", + "\n", "resp = client.ping()\n", - "print(f\"Connected: {resp}\")" + "print(f\"Connected to Elastic instance: {resp}\")" ] }, { @@ -147,9 +158,11 @@ "id": "BH-N6epTJarM" }, "source": [ - "## Define our embedding model\n", + "## Prepare our embedding model workflow\n", + "\n", + "Next we ensure our embedding models are available in Elasticsearch. We will use Elastic's provided `e5_multilingual_small` and `elser_V2` models to provide dense and sparse vectoring, respectively. 
Using these models out of the box will ensure they are up-to-date and ready for integration with Elasticsearch.\n", "\n", - "Next we upload the all-mpnet-base-v2 embedding model into Elasticsearch and create an ingest pipeline with inference processors for text embedding and text expansion, using the description field for both. This field contains the description of each product." + "Other models may be uploaded and deployed using [Eland](https://www.elastic.co/docs/reference/elasticsearch/clients/eland) or integrated using the [inference endpoint API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-azureopenai) to connect to third-party models." ] }, { @@ -159,47 +172,44 @@ "metadata": { "id": "7f6f3f5a-2b93-4a0c-93c8-c887ca80f687" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model ID: .multilingual-e5-small\n", - "Description: E5 small multilingual\n", - "Version: 12.0.0\n", - "------\n", - "Inference Endpoint ID: .multilingual-e5-small-elasticsearch\n", - "Model ID: .multilingual-e5-small_linux-x86_64\n", - "Task Type: text_embedding\n" - ] - } - ], + "outputs": [], "source": [ - "# set the model to .multilingual-e5-small-elasticsearch\n", - "es_model_id = \".multilingual-e5-small\"\n", - "es_model_endpoint = \".multilingual-e5-small-elasticsearch\"\n", + "# Declare models and endpoint names predeployed by Elastic\n", + "elser_model = \".elser_model_2_linux-x86_64\"\n", + "elser_endpoint = \".elser-2-elasticsearch\"\n", + "\n", + "e5_model = \".multilingual-e5-small_linux-x86_64\"\n", + "e5_endpoint = \".multilingual-e5-small-elasticsearch\"\n", + "\n", + "# Define (model, endpoint) tuples to check\n", + "model_endpoint_pairs = [(elser_model, elser_endpoint), (e5_model, e5_endpoint)]\n", "\n", - "# verify the model is loaded, deployed, and ready to use\n", + "# Fetch all loaded models and endpoints once\n", "models = client.ml.get_trained_models()\n", - "for model in 
models[\"trained_model_configs\"]:\n", - " if model[\"model_id\"] == es_model_id:\n", - " print(f\"Model ID: {model['model_id']}\")\n", - " print(f\"Description: {model.get('description', 'No description')}\")\n", - " print(f\"Version: {model.get('version', 'N/A')}\")\n", - " break\n", - "else:\n", - " print(f\"Model {es_model_id} not found.\")\n", - "\n", - "print(\"------\")\n", - "\n", - "# verify the inference endpoint is ready to use\n", - "inference_endpoint = client.inference.get(inference_id=es_model_endpoint)\n", - "inference_endpoint = inference_endpoint[\"endpoints\"][0]\n", - "print(f\"Inference Endpoint ID: {es_model_endpoint}\")\n", - "print(\n", - " f\"Model ID: {inference_endpoint.get('service_settings', {}).get('model_id', 'N/A')}\"\n", - ")\n", - "print(f\"Task Type: {inference_endpoint['task_type']}\")" + "model_ids = {model[\"model_id\"]: model for model in models[\"trained_model_configs\"]}\n", + "endpoints = client.inference.get()\n", + "endpoint_ids = {\n", + " endpoint[\"inference_id\"]: endpoint for endpoint in endpoints[\"endpoints\"]\n", + "}\n", + "\n", + "# Check each (model, endpoint) pair\n", + "for model_id, endpoint_id in model_endpoint_pairs:\n", + " print(f\"Checking Model: {model_id}\")\n", + " model = model_ids.get(model_id)\n", + " if model:\n", + " print(f\" Model ID: {model['model_id']}\")\n", + " print(f\" Description: {model.get('description', 'No description')}\")\n", + " print(f\" Version: {model.get('version', 'N/A')}\")\n", + " else:\n", + " print(\" Model not found or not loaded.\")\n", + " print(f\"Checking Endpoint: {endpoint_id}\")\n", + " endpoint = endpoint_ids.get(endpoint_id)\n", + " if endpoint:\n", + " print(f\" Inference Endpoint ID: {endpoint['inference_id']}\")\n", + " print(f\" Task Type: {endpoint['task_type']}\")\n", + " else:\n", + " print(\" Endpoint not found or not ready.\")\n", + " print(\"------\")" ] }, { @@ -207,36 +217,34 @@ "id": "80506477", "metadata": {}, "source": [ - "## Create an inference 
pipeline\n", - "This function will create an ingest pipeline with inference processors to use `ELSER` (sparse_vector) and `e5_multilingual_small` (dense_vector) to infer against data that will be ingested in the pipeline." + "### Create an inference pipeline\n", + "This function will create an ingest pipeline with inference processors to use `ELSER` (sparse_vector) and `e5_multilingual_small` (dense_vector) to infer against data that will be ingested in the pipeline. This allows us to automatically generate embeddings for the product descriptions when they are indexed into Elasticsearch." ] }, { "cell_type": "code", - "execution_count": 200, + "execution_count": 56, "id": "6739f55b-6983-4b48-9349-6e0111b313fe", "metadata": { "id": "6739f55b-6983-4b48-9349-6e0111b313fe" }, "outputs": [ { - "data": { - "text/plain": [ - "ObjectApiResponse({'acknowledged': True})" - ] - }, - "execution_count": 200, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "ecommerce-pipeline created: True\n" + ] } ], "source": [ - "client.ingest.put_pipeline(\n", - " id=\"ecommerce-pipeline\",\n", + "index_pipeline = \"ecommerce-pipeline\"\n", + "resp = client.ingest.put_pipeline(\n", + " id=index_pipeline,\n", " processors=[\n", " {\n", " \"inference\": {\n", - " \"model_id\": \".elser-2-elasticsearch\", # inference endpoint ID\n", + " \"model_id\": elser_endpoint, # inference endpoint ID\n", " \"input_output\": [\n", " {\n", " \"input_field\": \"description\", # source field\n", @@ -247,7 +255,7 @@ " },\n", " {\n", " \"inference\": {\n", - " \"model_id\": \".multilingual-e5-small-elasticsearch\", # inference endpoint ID\n", + " \"model_id\": e5_endpoint, # inference endpoint ID\n", " \"input_output\": [\n", " {\n", " \"input_field\": \"description\", # source field\n", @@ -258,7 +266,9 @@ " }\n", " },\n", " ],\n", - ")" + ")\n", + "\n", + "print(f\"ecommerce-pipeline created: {resp['acknowledged']}\")" ] }, { @@ -271,28 +281,27 
@@ "## Index documents\n", "The `ecommerce-search` index we are creating will include fields to support dense and sparse vector storage and search. \n", "\n", - "We define the `e5_description_vector` and the `elser_description_vector` fields to store the inference pipeline results. The field type in `e5_description_vector` is a `dense_vector`. The `.e5_multilingual_small` model has embedding_size of 384, so the dimension of the fector (dims) is set to 384. \n", + "We define the `e5_description_vector` and the `elser_description_vector` fields to store the inference pipeline results. \n", + "\n", + "The field type in `e5_description_vector` is a `dense_vector`. The `.e5_multilingual_small` model has an embedding size of 384, so the dimension of the vector (dims) is set to 384. \n", "\n", - "We also add a `elser_description_vector` field type to support the `sparse_vector` output from our `.elser_model_2_linux-x86_64` model. No further configuration is needed for this field for our use case." + "We also add an `elser_description_vector` field type to support the `sparse_vector` output from our `.elser_model_2_linux-x86_64` model. No further configuration is needed for this field for our use case." 
] }, { "cell_type": "code", - "execution_count": 221, + "execution_count": null, "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418", "metadata": { "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418" }, "outputs": [ { - "data": { - "text/plain": [ - "ObjectApiResponse({'acknowledged': True})" - ] - }, - "execution_count": 221, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Index ecommerce-search created: True\n" + ] } ], "source": [ @@ -316,22 +325,57 @@ " \"index\": \"true\",\n", " \"similarity\": \"cosine\",\n", " },\n", - " },\n", + " \"e5_semantic_description_vector\": {\n", + " \"type\": \"semantic_text\",\n", + " \"inference_id\": e5_endpoint,\n", + " },\n", + " \"elser_semantic_description_vector\": {\n", + " \"type\": \"semantic_text\",\n", + " \"inference_id\": elser_endpoint,\n", + " },\n", + " }\n", "}\n", "\n", "\n", "if client.indices.exists(index=commerce_index):\n", " client.indices.delete(index=commerce_index)\n", - "client.indices.create(\n", + "resp = client.indices.create(\n", " index=commerce_index,\n", " mappings=mappings,\n", ")\n", "\n", - "# set the ecommerce-pipeline as a the default pipeline for the ecommerce-search index\n", - "client.indices.put_settings(\n", + "print(f\"Index {commerce_index} created: {resp['acknowledged']}\")" + ] + }, + { + "cell_type": "markdown", + "id": "88db9926", + "metadata": {}, + "source": [ + "### Attach Pipeline to Index\n", + "Lets connect our pipeline to the index. 
This updates the settings of our index to use the pipeline we previously defined as the default.\n" ] }, { "cell_type": "code", "execution_count": 57, "id": "c4830b74", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline set for ecommerce-search: True\n" ] } ], "source": [ "resp = client.indices.put_settings(\n", " index=commerce_index,\n", - " body={\"default_pipeline\": \"ecommerce-pipeline\"},\n", - ")" + " body={\"default_pipeline\": index_pipeline},\n", + ")\n", + "print(f\"Pipeline set for {commerce_index}: {resp['acknowledged']}\")" ] }, { @@ -341,14 +385,14 @@ "id": "Vo-LKu8TOT5j" }, "source": [ - "## Load documents\n", + "### Load documents\n", "\n", - "Then we load `products-ecommerce.json` into the `ecommerce-search` index. We will use the `bulk` helper function to index our documents en masse. " + "We load the contents of `products-ecommerce.json` into the `ecommerce-search` index. We will use the `bulk` helper function to efficiently index our documents en masse. 
" ] }, { "cell_type": "code", - "execution_count": 222, + "execution_count": 58, "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba", "metadata": { "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba" @@ -359,7 +403,7 @@ "output_type": "stream", "text": [ "Indexing documents...\n", - "Documents indexed successfully.\n" + "Documents indexed successfully: 2506\n" ] } ], @@ -377,14 +421,14 @@ " }\n", "\n", "\n", - "# prepare the documents to be indexed\n", + "# prepare the documents array payload\n", "documents = [create_index_body(doc) for doc in data_json]\n", "\n", "# use bulk function to index\n", "try:\n", " print(\"Indexing documents...\")\n", - " bulk(client, documents)\n", - " print(\"Documents indexed successfully.\")\n", + " resp = bulk(client, documents)\n", + " print(f\"Documents indexed successfully: {resp[0]}\")\n", "except Exception as e:\n", " print(f\"Error indexing documents: {e}\")" ] @@ -396,78 +440,67 @@ "id": "-qUXNuOvPDsI" }, "source": [ - "## Text Analysis with Standard Analyzer" + "## Text Analysis\n", + "The classic way documents are ranked for relevance by Elasticsearch based on a text query uses the Lucene implementation of the [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) model, a **sparse model for lexical search**. This method follows the traditional approach for text search, looking for exact term matches.\n", + "\n", + "To make this search possible, Elasticsearch converts **text field** data into a searchable format by performing text analysis.\n", + "\n", + "**Text analysis** is performed by an [analyzer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analyzer-anatomy.html), a set of rules to govern the process of extracting relevant tokens for searching. An analyzer must have exactly one [tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenizers.html). The tokenizer receives a stream of characters and breaks it up into individual tokens (usually individual words.) 
\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "5f51e460", + "metadata": {}, + "source": [ + "### Standard Analyzer\n", + "In the example below we are using the default analyzer, the standard analyzer, which works well for most use cases as it provides English grammar based tokenization. Tokenization enables matching on individual terms, but each token is still matched literally." ] }, { "cell_type": "code", - "execution_count": 169, - "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a", + "execution_count": null, + "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039", "metadata": { - "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a" + "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analyzed Tokens: ['comfortable', 'furniture', 'for', 'a', 'large', 'balcony']\n" - ] - } - ], + "outputs": [], "source": [ - "# Performs text analysis on a string and returns the resulting tokens.\n", - "\n", "# Define the text to be analyzed\n", "text = \"Comfortable furniture for a large balcony\"\n", "\n", "# Define the analyze request\n", - "request_body = {\"analyzer\": \"standard\", \"text\": text} # Standard Analyzer\n", + "request_body = {\"analyzer\": \"standard\", \"text\": text} # Stop Analyzer\n", "\n", "# Perform the analyze request\n", - "response = client.indices.analyze(\n", + "resp = client.indices.analyze(\n", " analyzer=request_body[\"analyzer\"], text=request_body[\"text\"]\n", ")\n", "\n", "# Extract and display the analyzed tokens\n", - "tokens = [token[\"token\"] for token in response[\"tokens\"]]\n", - "print(\"Analyzed Tokens:\", tokens)" + "standard_tokens = [token[\"token\"] for token in resp[\"tokens\"]]\n", + "print(\"Standard-analyzed Tokens:\", standard_tokens)" ] }, { "cell_type": "markdown", - "id": "12u70NLmPyNV", - "metadata": { - "id": "12u70NLmPyNV" - }, + "id": "fb75f526", + "metadata": {}, "source": [ - "## Text Analysis with Stop Analyzer" + "### Stop Analyzer\n", + "If you 
want to personalize your search experience you can choose a different built-in analyzer. For example, by updating the code to use the stop analyzer it will break the text into tokens at any non-letter character with support for removing stop words." ] }, { "cell_type": "code", "execution_count": null, - "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039", - "metadata": { - "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analyzed Tokens: ['comfortable', 'furniture', 'large', 'balcony']\n" - ] - } - ], + "id": "3e3fdcff", + "metadata": {}, + "outputs": [], "source": [ - "# Performs text analysis on a string and returns the resulting tokens.\n", - "# TODO: Partial Smoosh together\n", - "# Define the text to be analyzed\n", - "text = \"Comfortable furniture for a large balcony\"\n", - "\n", "# Define the analyze request\n", - "request_body = {\"analyzer\": \"stop\", \"text\": text} # Stop Analyzer\n", + "request_body = {\"analyzer\": \"stop\", \"text\": text}\n", "\n", "# Perform the analyze request\n", "response = client.indices.analyze(\n", @@ -475,8 +508,107 @@ ")\n", "\n", "# Extract and display the analyzed tokens\n", - "tokens = [token[\"token\"] for token in response[\"tokens\"]]\n", - "print(\"Analyzed Tokens:\", tokens)" + "stop_tokens = [token[\"token\"] for token in response[\"tokens\"]]\n", + "print(\"Stop-analyzed Tokens:\", stop_tokens)" ] }, { "cell_type": "markdown", "id": "aba7fad6", "metadata": {}, "source": [ + "### Custom Analyzer\n", + "When the built-in analyzers do not fulfill your needs, you can create a [custom analyzer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-custom-analyzer.html),\n", + "which uses the appropriate combination of zero or more [character filters](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-charfilters.html), a 
[tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenizers.html) and zero or more [token filters](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenfilters.html).\n", + "\n", + "In the below example that combines a tokenizer and token filters, the text will be lowercased by the [lowercase filter](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lowercase-tokenfilter.html) before being processed by the [synonyms token filter](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html).\n", + "\n", + "> Note: you cannot pass a custom analyzer definition inline to analyze. Define the analyzer in your index settings, then reference it by name in the analyze call. For this reason we will create a temporary index to store the analyzer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d44f3e2b", + "metadata": {}, + "outputs": [], + "source": [ + "index_settings = {\n", + " \"settings\": {\n", + " \"analysis\": {\n", + " \"analyzer\": {\n", + " \"my_custom_analyzer\": {\n", + " \"type\": \"custom\",\n", + " \"tokenizer\": \"standard\",\n", + " \"char_filter\": [\"html_strip\"],\n", + " \"filter\": [\"lowercase\", \"asciifolding\"],\n", + " }\n", + " }\n", + " }\n", + " }\n", + "}\n", + "\n", + "custom_text = \"Čōmføřțǎble Fůrñíturę Fòr â ľarğe Bałcony\"\n", + "\n", + "# Create a temporary index with the custom analyzer\n", + "client.indices.create(index=\"temporary_index\", body=index_settings)\n", + "\n", + "# Perform the analyze request\n", + "resp = client.indices.analyze(\n", + " index=\"temporary_index\", analyzer=\"my_custom_analyzer\", text=custom_text\n", + ")\n", + "\n", + "# Extract and display the analyzed tokens\n", + "custom_tokens = [token[\"token\"] for token in resp[\"tokens\"]]\n", + "print(\"Custom Tokens:\", custom_tokens)\n", + "\n", + "# Delete the temporary index\n", + 
"client.indices.delete(index=\"temporary_index\")" + ] + }, + { + "cell_type": "markdown", + "id": "432620b6", + "metadata": {}, + "source": [ + "### Text Analysis Results\n", + "In the table below, we can observe that analyzers both included with Elasticsearch and custom made may be included with your search requests to improve the quality of your search results by reducing or refining the content being searched. Attention should be paid to your particular use case and the needs of your users." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c5d11cb", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Standard Token Analyzer\")\n", + "print(f\"Before: \\n{text}\")\n", + "print(f\"After: \\n{standard_tokens}\")\n", + "print(\"===================\")\n", + "print(\"Stop Token Analyzer\")\n", + "print(f\"Before: \\n{text}\")\n", + "print(f\"After: \\n{stop_tokens}\")\n", + "print(\"===================\")\n", + "print(\"Custom Token Analyzer\")\n", + "print(f\"Before: \\n{custom_text}\")\n", + "print(f\"After: \\n{custom_tokens}\")" + ] + }, + { + "cell_type": "markdown", + "id": "db4f86e3", + "metadata": {}, + "source": [ + "## Search \n", + "The remainder of this notebook will cover the following search types:\n", + "\n", + "\n", + "- Lexical Search\n", + "- Semantic Search \n", + " - ELSER Semantic Search (Sparse Vector)\n", + " - E5 Semantic Search (Dense Vector)\n", + "- Hybrid Search\n" ] }, { @@ -486,39 +618,36 @@ "id": "8G8MKcUvP0zs" }, "source": [ - "## Lexical Search" + "## Lexical Search\n", + "Our first search will be a straightforward BM25 text search within the description field. We are storing all of our results in a results_list for a final comparison at the end of the notebook. A convenience function to display the results is also defined." 
] }, { "cell_type": "code", - "execution_count": 225, + "execution_count": null, "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f", "metadata": { "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 13.408413\n", - "Product: Barbie Dreamhouse\n", - "Category: Toys\n", - "Description: is a classic Barbie playset with multiple rooms, furniture, a large balcony, a pool, and accessories. It allows kids to create their dream Barbie world.\n", - "\n", - "\n", - "Score: 7.5048585\n", - "Product: Rattan Patio Conversation Set\n", - "Category: Outdoor Furniture\n", - "Description: is a stylish and comfortable outdoor furniture set, including a sofa, two chairs, and a coffee table, all made of durable rattan material.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "results_list = []\n", "\n", + "\n", + "def print_search_results(search_results):\n", + " if not search_results:\n", + " print(\"No matches found\")\n", + " else:\n", + " for hit in search_results:\n", + " score = hit[\"_score\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )\n", + "\n", + "\n", "# Regular BM25 (Lexical) Search\n", "resp = client.search(\n", " size=2,\n", @@ -536,18 +665,7 @@ "\n", "lexical_search_results = resp[\"hits\"][\"hits\"]\n", "results_list.append({\"lexical_search\": lexical_search_results})\n", - "\n", - "if not lexical_search_results:\n", - " print(\"No matches found\")\n", - "else:\n", - " for hit in lexical_search_results:\n", - " score = hit[\"_score\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"\\nScore: {score}\\nProduct: 
{product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )" + "print_search_results(lexical_search_results)" ] }, { @@ -562,31 +680,12 @@ }, { "cell_type": "code", - "execution_count": 226, + "execution_count": null, "id": "72187c9a-14c1-4084-a080-4e5c1e614f22", "metadata": { "id": "72187c9a-14c1-4084-a080-4e5c1e614f22" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 0.93147576\n", - "Product: Metal Garden Bench with Cushion\n", - "Category: Garden Furniture\n", - "Description: is a stylish and comfortable metal garden bench, complete with a cushion for added support.\n", - "\n", - "\n", - "Score: 0.9304026\n", - "Product: Garden Dining Set with Swivel Chairs\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# KNN\n", "# TODO: Add Semantic_Text type?\n", @@ -609,16 +708,7 @@ "\n", "dense_semantic_search_results = response[\"hits\"][\"hits\"]\n", "results_list.append({\"dense_semantic_search\": dense_semantic_search_results})\n", - "\n", - "for hit in dense_semantic_search_results:\n", - "\n", - " score = hit[\"_score\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )" + "print_search_results(dense_semantic_search_results)" ] }, { @@ -633,7 +723,43 @@ }, { "cell_type": "code", - "execution_count": 227, + "execution_count": null, + "id": "c5475e21", + "metadata": {}, + "outputs": [], + "source": [ + "# Elastic Learned Sparse Encoder - ELSER\n", + "\n", + "resp = client.search(\n", + " index=\"ecommerce-search\",\n", + " size=2,\n", + " query={\n", + " \"sparse_vector\": {\n", + " \"field\": 
\"elser_description_vector\",\n", + " \"inference_id\": \".elser-2-elasticsearch\",\n", + " \"query\": \"Comfortable furniture for a large balcony\",\n", + " }\n", + " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", + ")\n", + "\n", + "\n", + "sparse_semantic_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"sparse_semantic_search\": sparse_semantic_search_results})\n", + "print_search_results(sparse_semantic_search_results)" + ] + }, + { + "cell_type": "markdown", + "id": "3a2a5267", + "metadata": {}, + "source": [ + "## Semantic Search with `semantic_text` Type (ELSER)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5", "metadata": { "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5" @@ -643,18 +769,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Score: 11.1893\n", - "Product: Garden Lounge Chair with Sunshade\n", - "Category: Garden Furniture\n", - "Description: is a comfortable and versatile garden lounge chair with a built-in sunshade, perfect for hot sunny days.\n", - "\n", - "\n", - "Score: 11.187605\n", - "Product: Rattan Patio Conversation Set\n", - "Category: Outdoor Furniture\n", - "Description: is a stylish and comfortable outdoor furniture set, including a sofa, two chairs, and a coffee table, all made of durable rattan material.\n", - "\n" + "No matches found\n" ] } ], @@ -665,9 +780,8 @@ " index=\"ecommerce-search\",\n", " size=2,\n", " query={\n", - " \"sparse_vector\": {\n", - " \"field\": \"elser_description_vector\",\n", - " \"inference_id\": \".elser-2-elasticsearch\",\n", + " \"semantic\": {\n", + " \"field\": \"elser_semantic_description_vector\",\n", " \"query\": \"Comfortable furniture for a large balcony\",\n", " }\n", " },\n", @@ -675,18 +789,9 @@ ")\n", "\n", "\n", - "sparse_semantic_search_results = resp[\"hits\"][\"hits\"]\n", - "results_list.append({\"sparse_semantic_search\": 
sparse_semantic_search_results})\n", - "\n", - "for hit in sparse_semantic_search_results:\n", - "\n", - " score = hit[\"_score\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )" + "elser_semantic_text_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"elser_semantic_text_search\": sparse_semantic_search_results})\n", + "print_search_results(elser_semantic_text_search_results)" ] }, { @@ -696,36 +801,17 @@ "id": "kz9deDBYQJxr" }, "source": [ - "## Hybrid Search - BM25+KNN linear combination" + "## Hybrid Search - BM25 + Dense Vector linear combination" ] }, { "cell_type": "code", - "execution_count": 229, + "execution_count": null, "id": "f84aa16b-49c5-4abf-a049-d556c225542e", "metadata": { "id": "f84aa16b-49c5-4abf-a049-d556c225542e" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 18.161213\n", - "Product: Garden Dining Set with Swivel Chairs\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", - "\n", - "\n", - "Score: 17.770641\n", - "Product: Garden Dining Set with Swivel Rockers\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# BM25 + KNN (Linear Combination)\n", "query = \"A dining table and comfortable chairs for a large balcony\"\n", @@ -763,16 +849,7 @@ "\n", "dense_linear_search_results = resp[\"hits\"][\"hits\"]\n", "results_list.append({\"dense_linear_search\": dense_linear_search_results})\n", - "\n", - "for hit in dense_linear_search_results:\n", - 
"\n", - " score = hit[\"_score\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )" + "print_search_results(dense_linear_search_results)" ] }, { @@ -782,34 +859,19 @@ "id": "cybkWjmpQV8g" }, "source": [ - "## Hybrid Search - BM25+KNN RRF" + "## Hybrid Search - BM25 + Dense Vector Reverse Reciprocal Fusion (RRF)\n", + "\n", + "[Reciprocal rank fusion](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/reciprocal-rank-fusion) (RRF) is a method for combining multiple result sets with different relevance indicators into a single result set. RRF requires no tuning, and the different relevance indicators do not have to be related to each other to achieve high-quality results." ] }, { "cell_type": "code", - "execution_count": 230, + "execution_count": null, "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861", "metadata": { "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Score: 0.0952381\n", - "Product: Garden Dining Set with Swivel Chairs\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", - "\n", - "Score: 0.045454547\n", - "Product: Patio Dining Set with Bench\n", - "Category: Outdoor Furniture\n", - "Description: is a spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# BM25 + KNN (RRF)\n", "top_k = 2\n", @@ -832,7 +894,7 @@ " \"field\": \"e5_description_vector\",\n", " \"query_vector_builder\": {\n", " \"text_embedding\": {\n", - " \"model_id\": \".multilingual-e5-small\",\n", + " \"model_id\": e5_endpoint,\n", " 
\"model_text\": \"A dining table and comfortable chairs for a large balcony\",\n", " }\n", " },\n", @@ -850,16 +912,7 @@ "\n", "dense_rrf_search_results = resp[\"hits\"][\"hits\"]\n", "results_list.append({\"dense_rrf_search\": dense_rrf_search_results})\n", - "\n", - "for hit in dense_rrf_search_results:\n", - "\n", - " score = hit[\"_score\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"Score: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )" + "print_search_results(dense_rrf_search_results)" ] }, { @@ -869,36 +922,17 @@ "id": "LyKI2Z-XQbI6" }, "source": [ - "## Hybrid Search - BM25+ELSER linear combination" + "## Hybrid Search - BM25 + Sparse Vector linear combination" ] }, { "cell_type": "code", - "execution_count": 232, + "execution_count": null, "id": "bd842732-b20a-4c7a-b735-e1f558a9b922", "metadata": { "id": "bd842732-b20a-4c7a-b735-e1f558a9b922" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 33.896286\n", - "Product: Garden Dining Set with Swivel Chairs\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", - "\n", - "\n", - "Score: 32.462887\n", - "Product: Patio Dining Set with Bench\n", - "Category: Outdoor Furniture\n", - "Description: is a spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# BM25 + Elastic Learned Sparse Encoder (Linear Combination)\n", "\n", @@ -919,7 +953,7 @@ " {\n", " \"sparse_vector\": {\n", " \"field\": \"elser_description_vector\",\n", - " \"inference_id\": \".elser-2-elasticsearch\",\n", + " \"inference_id\": elser_endpoint,\n", " \"query\": \"A dining table and 
comfortable chairs for a large balcony\",\n", " }\n", " },\n", @@ -931,15 +965,7 @@ "\n", "sparse_linear_search_results = resp[\"hits\"][\"hits\"]\n", "results_list.append({\"sparse_linear_search\": sparse_linear_search_results})\n", - "\n", - "for hit in sparse_linear_search_results:\n", - " score = hit[\"_score\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )" + "print_search_results(sparse_linear_search_results)" ] }, { @@ -947,32 +973,15 @@ "id": "e3d5e4e9", "metadata": {}, "source": [ - "## Hybrid Search - BM25+ELSER RRF" + "## Hybrid Search - BM25 + Sparse Vector Reciprocal Rank Fusion (RRF)" ] }, { "cell_type": "code", - "execution_count": 233, + "execution_count": null, "id": "199c5c60", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Score: 0.0952381\n", - "Product: Garden Dining Set with Swivel Chairs\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", - "\n", - "Score: 0.045454547\n", - "Product: Patio Dining Set with Bench\n", - "Category: Outdoor Furniture\n", - "Description: is a spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# BM25 + ELSER (RRF)\n", "top_k = 2\n", @@ -995,7 +1004,7 @@ " \"query\": {\n", " \"sparse_vector\": {\n", " \"field\": \"elser_description_vector\",\n", - " \"inference_id\": \".elser-2-elasticsearch\",\n", + " \"inference_id\": elser_endpoint,\n", " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", " }\n", " }\n", @@ -1010,17 +1019,8 @@ ")\n", "\n", "sparse_rrf_search_results = 
resp[\"hits\"][\"hits\"]\n", - "results_list.append({\"sparse_rrf_search_results\": sparse_rrf_search_results})\n", - "\n", - "for hit in sparse_rrf_search_results:\n", - "\n", - " score = hit[\"_score\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"Score: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )" + "results_list.append({\"sparse_rrf_search\": sparse_rrf_search_results})\n", + "print_search_results(sparse_rrf_search_results)" ] }, { @@ -1032,7 +1032,12 @@ "- Semantic Text / Query BUilder (ask Serena)\n", "- Table of Results\n", "- Conclusion\n", - "- Next steps\n" + "- Next steps\n", + "\n", + "\n", + "\n", + "## Compiled Results\n", + "Here are the results of the previous searches. We can see that all of the results return approximately the same the products." ] }, { @@ -1040,434 +1045,12 @@ "execution_count": null, "id": "1162a857", "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "### Dense Linear Search" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 productcategorydescriptionscore
0Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.18.161213
1Garden Dining Set with Swivel RockersGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.17.770641
2Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.18.161213
3Garden Dining Set with Swivel RockersGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel rockers for easy movement.17.770641
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Dense Rrf Search" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 productcategorydescriptionscore
0Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.0.095238
1Patio Dining Set with BenchOutdoor Furnitureis a spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.0.045455
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Dense Semantic Search" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 productcategorydescriptionscore
0Metal Garden Bench with CushionGarden Furnitureis a stylish and comfortable metal garden bench, complete with a cushion for added support.0.931476
1Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.0.930403
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Lexical Search" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 productcategorydescriptionscore
0Barbie DreamhouseToysis a classic Barbie playset with multiple rooms, furniture, a large balcony, a pool, and accessories. It allows kids to create their dream Barbie world.13.408413
1Rattan Patio Conversation SetOutdoor Furnitureis a stylish and comfortable outdoor furniture set, including a sofa, two chairs, and a coffee table, all made of durable rattan material.7.504859
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Sparse Linear Search" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 productcategorydescriptionscore
0Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.33.896286
1Patio Dining Set with BenchOutdoor Furnitureis a spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.32.462887
2Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.33.896286
3Patio Dining Set with BenchOutdoor Furnitureis a spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.32.462887
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Sparse Rrf Search Results" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 productcategorydescriptionscore
0Garden Dining Set with Swivel ChairsGarden Furnitureis a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.0.095238
1Patio Dining Set with BenchOutdoor Furnitureis a spacious and functional patio dining set, including a dining table, chairs, and a bench for additional seating.0.045455
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Sparse Semantic Search" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 productcategorydescriptionscore
0Garden Lounge Chair with SunshadeGarden Furnitureis a comfortable and versatile garden lounge chair with a built-in sunshade, perfect for hot sunny days.11.189300
1Rattan Patio Conversation SetOutdoor Furnitureis a stylish and comfortable outdoor furniture set, including a sofa, two chairs, and a coffee table, all made of durable rattan material.11.187605
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "# Flatten results for each search type\n", + "# Flatten results for each search type, preserving insertion order\n", "rows = []\n", "for result in results_list:\n", " search_type = list(result.keys())[0]\n", - "\n", " for doc in result[search_type]:\n", " row = {\n", " \"search_type\": search_type,\n", @@ -1478,9 +1061,18 @@ " }\n", " rows.append(row)\n", "\n", + "# Create DataFrame without altering row order\n", "df = pd.DataFrame(rows)\n", "\n", - "for search_type, group in df.groupby(\"search_type\"):\n", + "# Get the unique search_types in order of appearance\n", + "ordered_search_types = []\n", + "for row in rows:\n", + " st = row[\"search_type\"]\n", + " if st not in ordered_search_types:\n", + " ordered_search_types.append(st)\n", + "\n", + "for search_type in ordered_search_types:\n", + " group = df[df[\"search_type\"] == search_type]\n", " display(Markdown(f\"### {search_type.replace('_', ' ').title()}\"))\n", " styled = (\n", " group.drop(columns=\"search_type\")\n", @@ -1489,6 +1081,7 @@ " subset=[\"description\"],\n", " **{\"white-space\": \"pre-wrap\", \"word-break\": \"break-word\"},\n", " )\n", + " .hide(axis=\"index\") # For pandas >=1.4.0\n", " )\n", " display(styled)" ] From ca99d4fb6892a1bc76df9d598636108d5ec8ae6f Mon Sep 17 00:00:00 2001 From: Justin Castilla Date: Thu, 17 Jul 2025 13:35:57 -0700 Subject: [PATCH 06/11] sample object edit --- ...dated-ecommerce_dense_sparse_project.ipynb | 179 ++++++++++++++++-- 1 file changed, 160 insertions(+), 19 deletions(-) diff --git a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb index 7d0ba073..5cb31638 100644 --- 
a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb +++ b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb @@ -20,7 +20,7 @@ "```json\n", " {\n", " \"product\": \"Samsung 49-inch Curved Gaming Monitor\",\n", - " \"description\": \"is a curved gaming monitor with a high refresh rate and AMD FreeSync technology. It offers an immersive gaming experience with smooth visuals.\",\n", + " \"description\": \"is a curved gaming monitor with a high refresh rate and AMD FreeSync technology.\",\n", " \"category\": \"Monitors\"\n", "}\n", "\n", @@ -223,7 +223,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 78, "id": "6739f55b-6983-4b48-9349-6e0111b313fe", "metadata": { "id": "6739f55b-6983-4b48-9349-6e0111b313fe" @@ -290,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 82, "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418", "metadata": { "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418" @@ -330,8 +330,8 @@ " \"inference_id\": e5_endpoint,\n", " },\n", " \"elser_semantic_description_vector\": {\n", - " \"type\": \"semantic_text\",\n", - " \"inference_id\": elser_endpoint,\n", + " \"type\": \"semantic_text\"\n", + " # \"inference_id\": elser_endpoint\n", " },\n", " }\n", "}\n", @@ -358,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 83, "id": "c4830b74", "metadata": {}, "outputs": [ @@ -392,7 +392,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": null, "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba", "metadata": { "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba" @@ -415,6 +415,9 @@ "\n", "# helper function to create bulk indexing body\n", "def create_index_body(doc):\n", + " doc[\"elser_semantic_description_vector\"] = doc[\"description\"]\n", + " doc[\"e5_semantic_description_vector\"] = doc[\"description\"]\n", + "\n", " return {\n", " 
\"_index\": \"ecommerce-search\",\n", " \"_source\": doc,\n", @@ -528,10 +531,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "id": "d44f3e2b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Custom Tokens: ['comfortable', 'furniture', 'for', 'a', 'large', 'balcony']\n" + ] + }, + { + "data": { + "text/plain": [ + "ObjectApiResponse({'acknowledged': True})" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "index_settings = {\n", " \"settings\": {\n", @@ -624,12 +645,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 65, "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f", "metadata": { "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 12.93728\n", + "Product: Barbie Dreamhouse\n", + "Category: Toys\n", + "Description: is a classic Barbie playset with multiple rooms, furniture, a large balcony, a pool, and accessories. 
It allows kids to create their dream Barbie world.\n", + "\n", + "\n", + "Score: 7.9097595\n", + "Product: Rattan Patio Conversation Set\n", + "Category: Outdoor Furniture\n", + "Description: is a stylish and comfortable outdoor furniture set, including a sofa, two chairs, and a coffee table, all made of durable rattan material.\n", + "\n" + ] + } + ], "source": [ "results_list = []\n", "\n", @@ -680,12 +720,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "id": "72187c9a-14c1-4084-a080-4e5c1e614f22", "metadata": { "id": "72187c9a-14c1-4084-a080-4e5c1e614f22" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 0.93147576\n", + "Product: Metal Garden Bench with Cushion\n", + "Category: Garden Furniture\n", + "Description: is a stylish and comfortable metal garden bench, complete with a cushion for added support.\n", + "\n", + "\n", + "Score: 0.9304026\n", + "Product: Garden Dining Set with Swivel Chairs\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", + "\n" + ] + } + ], "source": [ "# KNN\n", "# TODO: Add Semantic_Text type?\n", @@ -723,10 +782,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 85, "id": "c5475e21", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 11.354144\n", + "Product: Garden Lounge Set with Side Table\n", + "Category: Garden Furniture\n", + "Description: is a comfortable and stylish garden lounge set, including a sofa, chairs, and a side table for outdoor relaxation.\n", + "\n", + "\n", + "Score: 11.200024\n", + "Product: Garden Lounge Set with Ottoman\n", + "Category: Garden Furniture\n", + "Description: is a versatile and comfortable garden lounge set, including a sofa, chairs, and ottoman for outdoor 
relaxation.\n", + "\n" + ] + } + ], "source": [ "# Elastic Learned Sparse Encoder - ELSER\n", "\n", @@ -759,7 +837,60 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 93, + "id": "4d2fb926", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 11.354144\n", + "Product: Garden Lounge Set with Side Table\n", + "Category: Garden Furniture\n", + "Description: is a comfortable and stylish garden lounge set, including a sofa, chairs, and a side table for outdoor relaxation.\n", + "\n", + "\n", + "Score: 11.200024\n", + "Product: Garden Lounge Set with Ottoman\n", + "Category: Garden Furniture\n", + "Description: is a versatile and comfortable garden lounge set, including a sofa, chairs, and ottoman for outdoor relaxation.\n", + "\n" + ] + } + ], + "source": [ + "# Elastic Learned Sparse Encoder - ELSER\n", + "\n", + "resp = client.search(\n", + " index=\"ecommerce-search\",\n", + " size=2,\n", + " query={\n", + " \"semantic\": {\n", + " \"field\": \"elser_semantic_description_vector\",\n", + " \"query\": \"Comfortable furniture for a large balcony\",\n", + " }\n", + " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", + ")\n", + "\n", + "elser_semantic_text_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"elser_semantic_text_search\": sparse_semantic_search_results})\n", + "print_search_results(elser_semantic_text_search_results)" + ] + }, + { + "cell_type": "markdown", + "id": "1df079f3", + "metadata": {}, + "source": [ + "## Semantic Search with `semantic_text` Type (e5)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5", "metadata": { "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5" @@ -769,7 +900,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "No matches found\n" + "\n", + "Score: 0.93147576\n", + "Product: Metal Garden Bench with Cushion\n", + 
"Category: Garden Furniture\n", + "Description: is a stylish and comfortable metal garden bench, complete with a cushion for added support.\n", + "\n", + "\n", + "Score: 0.9304026\n", + "Product: Garden Dining Set with Swivel Chairs\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", + "\n" ] } ], @@ -781,14 +923,13 @@ " size=2,\n", " query={\n", " \"semantic\": {\n", - " \"field\": \"elser_semantic_description_vector\",\n", + " \"field\": \"e5_semantic_description_vector\",\n", " \"query\": \"Comfortable furniture for a large balcony\",\n", " }\n", " },\n", " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", "\n", - "\n", "elser_semantic_text_search_results = resp[\"hits\"][\"hits\"]\n", "results_list.append({\"elser_semantic_text_search\": sparse_semantic_search_results})\n", "print_search_results(elser_semantic_text_search_results)" From 81af33fecc93b438a4e15d644d9198f618a5932d Mon Sep 17 00:00:00 2001 From: Justin Castilla Date: Thu, 17 Jul 2025 13:37:24 -0700 Subject: [PATCH 07/11] removes commented code --- ...dated-ecommerce_dense_sparse_project.ipynb | 189 ++---------------- 1 file changed, 20 insertions(+), 169 deletions(-) diff --git a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb index 5cb31638..4221e0d3 100644 --- a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb +++ b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb @@ -223,20 +223,12 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": null, "id": "6739f55b-6983-4b48-9349-6e0111b313fe", "metadata": { 
"id": "6739f55b-6983-4b48-9349-6e0111b313fe" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ecommerce-pipeline created: True\n" - ] - } - ], + "outputs": [], "source": [ "index_pipeline = \"ecommerce-pipeline\"\n", "resp = client.ingest.put_pipeline(\n", @@ -290,20 +282,12 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": null, "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418", "metadata": { "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index ecommerce-search created: True\n" - ] - } - ], + "outputs": [], "source": [ "# define the index name and mapping\n", "commerce_index = \"ecommerce-search\"\n", @@ -329,10 +313,7 @@ " \"type\": \"semantic_text\",\n", " \"inference_id\": e5_endpoint,\n", " },\n", - " \"elser_semantic_description_vector\": {\n", - " \"type\": \"semantic_text\"\n", - " # \"inference_id\": elser_endpoint\n", - " },\n", + " \"elser_semantic_description_vector\": {\"type\": \"semantic_text\"},\n", " }\n", "}\n", "\n", @@ -358,18 +339,10 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": null, "id": "c4830b74", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pipeline set for ecommerce-search: True\n" - ] - } - ], + "outputs": [], "source": [ "resp = client.indices.put_settings(\n", " index=commerce_index,\n", @@ -397,16 +370,7 @@ "metadata": { "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Indexing documents...\n", - "Documents indexed successfully: 2506\n" - ] - } - ], + "outputs": [], "source": [ "# Load the dataset\n", "with open(\"products-ecommerce.json\", \"r\") as f:\n", @@ -531,28 +495,10 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": null, "id": "d44f3e2b", "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "Custom Tokens: ['comfortable', 'furniture', 'for', 'a', 'large', 'balcony']\n" - ] - }, - { - "data": { - "text/plain": [ - "ObjectApiResponse({'acknowledged': True})" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "index_settings = {\n", " \"settings\": {\n", @@ -645,31 +591,12 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": null, "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f", "metadata": { "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 12.93728\n", - "Product: Barbie Dreamhouse\n", - "Category: Toys\n", - "Description: is a classic Barbie playset with multiple rooms, furniture, a large balcony, a pool, and accessories. It allows kids to create their dream Barbie world.\n", - "\n", - "\n", - "Score: 7.9097595\n", - "Product: Rattan Patio Conversation Set\n", - "Category: Outdoor Furniture\n", - "Description: is a stylish and comfortable outdoor furniture set, including a sofa, two chairs, and a coffee table, all made of durable rattan material.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "results_list = []\n", "\n", @@ -720,31 +647,12 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": null, "id": "72187c9a-14c1-4084-a080-4e5c1e614f22", "metadata": { "id": "72187c9a-14c1-4084-a080-4e5c1e614f22" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 0.93147576\n", - "Product: Metal Garden Bench with Cushion\n", - "Category: Garden Furniture\n", - "Description: is a stylish and comfortable metal garden bench, complete with a cushion for added support.\n", - "\n", - "\n", - "Score: 0.9304026\n", - "Product: Garden Dining Set with Swivel Chairs\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including 
a table and chairs with swivel seats for convenience.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# KNN\n", "# TODO: Add Semantic_Text type?\n", @@ -782,29 +690,10 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": null, "id": "c5475e21", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 11.354144\n", - "Product: Garden Lounge Set with Side Table\n", - "Category: Garden Furniture\n", - "Description: is a comfortable and stylish garden lounge set, including a sofa, chairs, and a side table for outdoor relaxation.\n", - "\n", - "\n", - "Score: 11.200024\n", - "Product: Garden Lounge Set with Ottoman\n", - "Category: Garden Furniture\n", - "Description: is a versatile and comfortable garden lounge set, including a sofa, chairs, and ottoman for outdoor relaxation.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# Elastic Learned Sparse Encoder - ELSER\n", "\n", @@ -837,29 +726,10 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": null, "id": "4d2fb926", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 11.354144\n", - "Product: Garden Lounge Set with Side Table\n", - "Category: Garden Furniture\n", - "Description: is a comfortable and stylish garden lounge set, including a sofa, chairs, and a side table for outdoor relaxation.\n", - "\n", - "\n", - "Score: 11.200024\n", - "Product: Garden Lounge Set with Ottoman\n", - "Category: Garden Furniture\n", - "Description: is a versatile and comfortable garden lounge set, including a sofa, chairs, and ottoman for outdoor relaxation.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# Elastic Learned Sparse Encoder - ELSER\n", "\n", @@ -890,31 +760,12 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": null, "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5", "metadata": { "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5" }, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 0.93147576\n", - "Product: Metal Garden Bench with Cushion\n", - "Category: Garden Furniture\n", - "Description: is a stylish and comfortable metal garden bench, complete with a cushion for added support.\n", - "\n", - "\n", - "Score: 0.9304026\n", - "Product: Garden Dining Set with Swivel Chairs\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# Elastic Learned Sparse Encoder - ELSER\n", "\n", From ee98aa5d752cd0a52da520292f0dce7e4da96ddf Mon Sep 17 00:00:00 2001 From: Justin Castilla Date: Thu, 17 Jul 2025 13:44:38 -0700 Subject: [PATCH 08/11] adds e5_semantic_text_search_results --- .../updated-ecommerce_dense_sparse_project.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb index 4221e0d3..6c2d9b55 100644 --- a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb +++ b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb @@ -781,9 +781,9 @@ " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", "\n", - "elser_semantic_text_search_results = resp[\"hits\"][\"hits\"]\n", - "results_list.append({\"elser_semantic_text_search\": sparse_semantic_search_results})\n", - "print_search_results(elser_semantic_text_search_results)" + "e5_semantic_text_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"e5_semantic_text_search\": e5_semantic_text_search_results})\n", + 
"print_search_results(e5_semantic_text_search_results)" ] }, { From 5a266ba307d392a4cf286a6dabf23ae0d6fc4fda Mon Sep 17 00:00:00 2001 From: Justin Castilla Date: Tue, 12 Aug 2025 08:40:42 -0700 Subject: [PATCH 09/11] black jupyter magic --- .../ecommerce_dense_sparse_project.ipynb | 1149 ++++++++++++----- 1 file changed, 822 insertions(+), 327 deletions(-) diff --git a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/ecommerce_dense_sparse_project.ipynb b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/ecommerce_dense_sparse_project.ipynb index 42386dae..588e9c02 100644 --- a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/ecommerce_dense_sparse_project.ipynb +++ b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/ecommerce_dense_sparse_project.ipynb @@ -9,22 +9,42 @@ "source": [ "# **Lexical and Semantic Search with Elasticsearch**\n", "\n", - "In this example, you will explore various approaches to retrieving information using Elasticsearch, focusing specifically on text, lexical and semantic search.\n", + "In the following examples, we will explore various approaches to retrieving information using Elasticsearch - focusing specifically on full text search, semantic search, and a hybrid combination of both.\n", "\n", - "To accomplish this, this example demonstrate various search scenarios on a dataset generated to simulate e-commerce product information.\n", + "To accomplish this, this example demonstrates various search scenarios on a dataset generated to simulate e-commerce product information.\n", "\n", - "This dataset contains over 2,500 products, each with a description. These products are categorized into 76 distinct product categories, with each category containing a varying number of products.\n", + "This dataset contains over 2,500 products, each with a description. 
These products are categorized into 76 distinct product categories, with each category containing a varying number of products. \n", "\n", + "Here is a sample of an object from the dataset:\n", + "\n", + "```json\n", + " {\n", + " \"product\": \"Samsung 49-inch Curved Gaming Monitor\",\n", + " \"description\": \"is a curved gaming monitor with a high refresh rate and AMD FreeSync technology.\",\n", + " \"category\": \"Monitors\"\n", + "}\n", + "\n", + "```\n", + "\n", + "We will consume the dataset from a JSON file into Elasticsearch using modern consumption patterns. We will then perform a series of search operations to demonstrate the different search strategies.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "6370f2e4", + "metadata": {}, + "source": [ "## **🧰 Requirements**\n", "\n", "For this example, you will need:\n", "\n", - "- Python 3.6 or later\n", + "- Python 3.11 or later\n", "- The Elastic Python client\n", - "- Elastic 8.8 deployment or later, with 8GB memory machine learning node\n", - "- The Elastic Learned Sparse EncodeR model that comes pre-loaded into Elastic installed and started on your deployment\n", + "- Elastic 9.0 deployment or later on either a local, cloud, or serverless environment\n", + "\n", "\n", - "We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html), a [free trial](https://cloud.elastic.co/registration?onboarding_token=vectorsearch&utm_source=github&utm_content=elasticsearch-labs-notebook) is available." + "We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html). You can use a [free trial here](https://cloud.elastic.co/registration?onboarding_token=vectorsearch&utm_source=github&utm_content=elasticsearch-labs-notebook) to get started." 
] }, { @@ -38,7 +58,7 @@ "\n", "To get started, we'll need to connect to our Elastic deployment using the Python client.\n", "\n", - "Because we're using an Elastic Cloud deployment, we'll use the **Cloud ID** to identify our deployment.\n" + "Because we're using an Elastic Cloud deployment, we'll use the **Cloud Endpoint** and **Cloud API Key** to identify our deployment. These may be found within Kibana by following the instructions [here](https://www.elastic.co/docs/deploy-manage/api-keys/elastic-cloud-api-keys).\n" ] }, { @@ -50,19 +70,22 @@ }, "outputs": [], "source": [ - "!pip install elasticsearch==8.8 #Elasticsearch" + "%pip install elasticsearch pandas IPython -q" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "8c36e9b5-8f2b-4734-9213-1350caa7f837", - "metadata": { - "id": "8c36e9b5-8f2b-4734-9213-1350caa7f837" - }, - "outputs": [], + "cell_type": "markdown", + "id": "38b734aa", + "metadata": {}, "source": [ - "pip -q install eland elasticsearch sentence_transformers transformers torch==1.11 # Eland Python Client" + "### Import the required packages\n", + "We will import the following packages:\n", + "- `Elasticsearch`: a client library for Elasticsearch actions\n", + "- `bulk`: a function to perform Elasticsearch actions in bulk\n", + "- `getpass`: a module for receiving Elasticsearch credentials via text prompt\n", + "- `json`: a module for reading and writing JSON data\n", + "- `pandas`, `display`, `Markdown`: for data visualization and markdown formatting\n", + "\n" ] }, { @@ -74,19 +97,19 @@ }, "outputs": [], "source": [ - "from elasticsearch import (\n", - " Elasticsearch,\n", - " helpers,\n", - ") # Import the Elasticsearch client and helpers module\n", - "from urllib.request import urlopen # library for opening URLs\n", + "# import the Elasticsearch client and bulk function\n", + "from elasticsearch import Elasticsearch\n", + "from elasticsearch.helpers import bulk\n", + "\n", + "# import getpass module to handle Auth input\n", + 
"import getpass\n", + "\n", + "# import json module to read JSON file of products\n", "import json # module for handling JSON data\n", - "from pathlib import Path # module for working with file paths\n", "\n", - "# Python client and toolkit for machine learning in Elasticsearch\n", - "from eland.ml.pytorch import PyTorchModel\n", - "from eland.ml.pytorch.transformers import TransformerModel\n", - "from elasticsearch.client import MlClient # Elastic module for ml\n", - "import getpass # handling password input" + "# display search results in a table\n", + "import pandas as pd\n", + "from IPython.display import display, Markdown" ] }, { @@ -96,13 +119,12 @@ "id": "ea1VkDBXJIQR" }, "source": [ - "Now we can instantiate the Python Elasticsearch client.\n", - "\n", - "First we prompt the user for their password and Cloud ID.\n", - "\n", - "🔐 NOTE: `getpass` enables us to securely prompt the user for credentials without echoing them to the terminal, or storing it in memory.\n", + "### 📚 Instantiating the Elasticsearch Client\n", "\n", - "Then we create a `client` object that instantiates an instance of the `Elasticsearch` class." 
+ "First we prompt the user for their Elastic Endpoint URL and Elastic API Key.\n", + "Then we create a `client` object that instantiates an instance of the `Elasticsearch` class.\n", + "Lastly, we verify that our client is connected to our Elasticsearch instance by calling `client.ping()`.\n", + "> 🔐 *NOTE: `getpass` enables us to securely prompt the user for credentials without echoing them to the terminal, or storing it in memory.*" ] }, { @@ -114,16 +136,19 @@ }, "outputs": [], "source": [ - "# Found in the 'Manage Deployment' page\n", - "CLOUD_ID = getpass.getpass(\"Enter Elastic Cloud ID: \")\n", + "# endpoint for Elasticsearch instance\n", + "ELASTIC_ENDPOINT = getpass.getpass(\"Enter Elastic Endpoint: \")\n", "\n", - "# Password for the 'elastic' user generated by Elasticsearch\n", - "ELASTIC_PASSWORD = getpass.getpass(\"Enter Elastic password: \")\n", + "# Elastic API key for Elasticsearch\n", + "ELASTIC_API_KEY = getpass.getpass(\"Enter Elastic API Key: \")\n", "\n", - "# Create the client instance\n", + "# create the Elasticsearch client instance\n", "client = Elasticsearch(\n", - " cloud_id=CLOUD_ID, basic_auth=(\"elastic\", ELASTIC_PASSWORD), request_timeout=3600\n", - ")" + " hosts=[ELASTIC_ENDPOINT], api_key=ELASTIC_API_KEY, request_timeout=3600\n", + ")\n", + "\n", + "resp = client.ping()\n", + "print(f\"Connected to Elastic instance: {resp}\")" ] }, { @@ -133,9 +158,11 @@ "id": "BH-N6epTJarM" }, "source": [ - "## Setup emebdding model\n", + "## Prepare our embedding model workflow\n", "\n", - "Next we upload the all-mpnet-base-v2 embedding model into Elasticsearch and create an ingest pipeline with inference processors for text embedding and text expansion, using the description field for both. This field contains the description of each product." + "Next we ensure our embedding models are available in Elasticsearch. We will use Elastic's provided `e5_multilingual_small` and `elser_V2` models to provide dense and sparse vectoring, respectively. 
Using these models out of the box will ensure they are up-to-date and ready for integration with Elasticsearch.\n", + "\n", + "Other models may be uploaded and deployed using [Eland](https://www.elastic.co/docs/reference/elasticsearch/clients/eland) or integrated using the [inference endpoint API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-azureopenai) to connect to third-party models." ] }, { @@ -147,28 +174,51 @@ }, "outputs": [], "source": [ - "# Set the model name from Hugging Face and task type\n", - "# sentence-transformers model\n", - "hf_model_id = \"sentence-transformers/all-mpnet-base-v2\"\n", - "tm = TransformerModel(hf_model_id, \"text_embedding\")\n", + "# Declare models and endpoint names predeployed by Elastic\n", + "elser_model = \".elser_model_2_linux-x86_64\"\n", + "elser_endpoint = \".elser-2-elasticsearch\"\n", "\n", - "# set the modelID as it is named in Elasticsearch\n", - "es_model_id = tm.elasticsearch_model_id()\n", + "e5_model = \".multilingual-e5-small_linux-x86_64\"\n", + "e5_endpoint = \".multilingual-e5-small-elasticsearch\"\n", "\n", - "# Download the model from Hugging Face\n", - "tmp_path = \"models\"\n", - "Path(tmp_path).mkdir(parents=True, exist_ok=True)\n", - "model_path, config, vocab_path = tm.save(tmp_path)\n", + "# Define (model, endpoint) tuples to check\n", + "model_endpoint_pairs = [(elser_model, elser_endpoint), (e5_model, e5_endpoint)]\n", "\n", - "# Load the model into Elasticsearch\n", - "ptm = PyTorchModel(client, es_model_id)\n", - "ptm.import_model(\n", - " model_path=model_path, config_path=None, vocab_path=vocab_path, config=config\n", - ")\n", + "# Fetch all loaded models and endpoints once\n", + "models = client.ml.get_trained_models()\n", + "model_ids = {model[\"model_id\"]: model for model in models[\"trained_model_configs\"]}\n", + "endpoints = client.inference.get()\n", + "endpoint_ids = {\n", + " endpoint[\"inference_id\"]: endpoint for endpoint in 
endpoints[\"endpoints\"]\n", + "}\n", "\n", - "# Start the model\n", - "s = MlClient.start_trained_model_deployment(client, model_id=es_model_id)\n", - "s.body" + "# Check each (model, endpoint) pair\n", + "for model_id, endpoint_id in model_endpoint_pairs:\n", + " print(f\"Checking Model: {model_id}\")\n", + " model = model_ids.get(model_id)\n", + " if model:\n", + " print(f\" Model ID: {model['model_id']}\")\n", + " print(f\" Description: {model.get('description', 'No description')}\")\n", + " print(f\" Version: {model.get('version', 'N/A')}\")\n", + " else:\n", + " print(\" Model not found or not loaded.\")\n", + " print(f\"Checking Endpoint: {endpoint_id}\")\n", + " endpoint = endpoint_ids.get(endpoint_id)\n", + " if endpoint:\n", + " print(f\" Inference Endpoint ID: {endpoint['inference_id']}\")\n", + " print(f\" Task Type: {endpoint['task_type']}\")\n", + " else:\n", + " print(\" Endpoint not found or not ready.\")\n", + " print(\"------\")" + ] + }, + { + "cell_type": "markdown", + "id": "80506477", + "metadata": {}, + "source": [ + "### Create an inference pipeline\n", + "This function will create an ingest pipeline with inference processors to use `ELSER` (sparse_vector) and `e5_multilingual_small` (dense_vector) to infer against data that will be ingested in the pipeline. This allows us to automatically generate embeddings for the product descriptions when they are indexed into Elasticsearch." 
] }, { @@ -180,34 +230,37 @@ }, "outputs": [], "source": [ - "# Creating an ingest pipeline with inference processors to use ELSER (sparse) and all-mpnet-base-v2 (dense) to infer against data that will be ingested in the pipeline.\n", - "\n", - "client.ingest.put_pipeline(\n", - " id=\"ecommerce-pipeline\",\n", + "index_pipeline = \"ecommerce-pipeline\"\n", + "resp = client.ingest.put_pipeline(\n", + " id=index_pipeline,\n", " processors=[\n", " {\n", " \"inference\": {\n", - " \"model_id\": \"elser_model\",\n", - " \"target_field\": \"ml\",\n", - " \"field_map\": {\"description\": \"text_field\"},\n", - " \"inference_config\": {\n", - " \"text_expansion\": { # text_expansion inference type (ELSER)\n", - " \"results_field\": \"tokens\"\n", + " \"model_id\": elser_endpoint, # inference endpoint ID\n", + " \"input_output\": [\n", + " {\n", + " \"input_field\": \"description\", # source field\n", + " \"output_field\": \"elser_description_vector\", # destination vector field\n", " }\n", - " },\n", + " ],\n", " }\n", " },\n", " {\n", " \"inference\": {\n", - " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n", - " \"target_field\": \"description_vector\", # Target field for the inference results\n", - " \"field_map\": {\n", - " \"description\": \"text_field\" # Field matching our configured trained model input. 
Typically for NLP models, the field name is text_field.\n", - " },\n", + " \"model_id\": e5_endpoint, # inference endpoint ID\n", + " \"input_output\": [\n", + " {\n", + " \"input_field\": \"description\", # source field\n", + " \"output_field\": \"e5_description_vector\", # destination vector field\n", + " }\n", + " ],\n", + " \"inference_config\": {\"text_embedding\": {}},\n", " }\n", " },\n", " ],\n", - ")" + ")\n", + "\n", + "print(f\"ecommerce-pipeline created: {resp['acknowledged']}\")" ] }, { @@ -218,88 +271,84 @@ }, "source": [ "## Index documents\n", + "The `ecommerce-search` index we are creating will include fields to support dense and sparse vector storage and search. \n", "\n", - "Then, we create a source index to load `products-ecommerce.json`, this will be the `ecommerce` index and a destination index to extract the documents from the source and index these documents into the destination `ecommerce-search`.\n", + "We define the `e5_description_vector` and the `elser_description_vector` fields to store the inference pipeline results. \n", "\n", - "For the `ecommerce-search` index we add a field to support dense vector storage and search `description_vector.predicted_value`, this is the target field for inference results. The field type in this case is `dense_vector`, the `all-mpnet-base-v2` model has embedding_size of 768, so dims is set to 768. We also add a `rank_features` field type to support the text expansion output." + "The field type in `e5_description_vector` is a `dense_vector`. The `.e5_multilingual_small` model has an embedding size of 384, so the dimension of the vector (dims) is set to 384. \n", + "\n", + "We also add an `elser_description_vector` field type to support the `sparse_vector` output from our `.elser_model_2_linux-x86_64` model. No further configuration is needed for this field for our use case." 
] }, { "cell_type": "code", "execution_count": null, - "id": "6e115bd0-e758-44db-b5b9-96217af472c1", + "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418", "metadata": { - "id": "6e115bd0-e758-44db-b5b9-96217af472c1" + "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418" }, "outputs": [], "source": [ - "# Index to load products-ecommerce.json docs\n", + "# define the index name and mapping\n", + "commerce_index = \"ecommerce-search\"\n", + "mappings = {\n", + " \"properties\": {\n", + " \"product\": {\n", + " \"type\": \"text\",\n", + " },\n", + " \"description\": {\n", + " \"type\": \"text\",\n", + " },\n", + " \"category\": {\n", + " \"type\": \"text\",\n", + " },\n", + " \"elser_description_vector\": {\"type\": \"sparse_vector\"},\n", + " \"e5_description_vector\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 384,\n", + " \"index\": \"true\",\n", + " \"similarity\": \"cosine\",\n", + " },\n", + " \"e5_semantic_description_vector\": {\n", + " \"type\": \"semantic_text\",\n", + " \"inference_id\": e5_endpoint,\n", + " },\n", + " \"elser_semantic_description_vector\": {\"type\": \"semantic_text\"},\n", + " }\n", + "}\n", "\n", - "client.indices.create(\n", - " index=\"ecommerce\",\n", - " mappings={\n", - " \"properties\": {\n", - " \"product\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", - " },\n", - " \"description\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", - " },\n", - " \"category\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", - " },\n", - " }\n", - " },\n", - ")" + "\n", + "if client.indices.exists(index=commerce_index):\n", + " client.indices.delete(index=commerce_index)\n", + "resp = client.indices.create(\n", + " index=commerce_index,\n", + " mappings=mappings,\n", + ")\n", + "\n", + "print(f\"Index {commerce_index} created: 
{resp['acknowledged']}\")" + ] + }, + { + "cell_type": "markdown", + "id": "88db9926", + "metadata": {}, + "source": [ + "### Attach Pipeline to Index\n", + "Lets connect our pipeline to the index. This updates the settings of our index to use the pipeline we previously defined as the default.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418", - "metadata": { - "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418" - }, + "id": "c4830b74", + "metadata": {}, "outputs": [], "source": [ - "# Reindex dest index\n", - "\n", - "INDEX = \"ecommerce-search\"\n", - "client.indices.create(\n", - " index=INDEX,\n", - " settings={\"index\": {\"number_of_shards\": 1, \"number_of_replicas\": 1}},\n", - " mappings={\n", - " # Saving disk space by excluding the ELSER tokens and the dense_vector field from document source.\n", - " # Note: That should only be applied if you are certain that reindexing will not be required in the future.\n", - " \"_source\": {\"excludes\": [\"ml.tokens\", \"description_vector.predicted_value\"]},\n", - " \"properties\": {\n", - " \"product\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", - " },\n", - " \"description\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", - " },\n", - " \"category\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", - " },\n", - " \"ml.tokens\": { # The name of the field to contain the generated tokens.\n", - " \"type\": \"rank_features\" # ELSER output must be ingested into a field with the rank_features field type.\n", - " },\n", - " \"description_vector.predicted_value\": { # Inference results field, target_field.predicted_value\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 768, # The all-mpnet-base-v2 model has embedding_size of 768, so dims is set to 768.\n", - " 
\"index\": \"true\",\n", - " \"similarity\": \"dot_product\", # When indexing vectors for approximate kNN search, you need to specify the similarity function for comparing the vectors.\n", - " },\n", - " },\n", - " },\n", - ")" + "resp = client.indices.put_settings(\n", + " index=commerce_index,\n", + " body={\"default_pipeline\": index_pipeline},\n", + ")\n", + "print(f\"Pipeline set for {commerce_index}: {resp['acknowledged']}\")" ] }, { @@ -309,9 +358,9 @@ "id": "Vo-LKu8TOT5j" }, "source": [ - "## Load documents\n", + "### Load documents\n", "\n", - "Then we load `products-ecommerce.json` into the `ecommerce` index." + "We load the contents of`products-ecommerce.json` into the `ecommerce-search` index. We will use the `bulk` helper function to efficiently index our documents en masse. " ] }, { @@ -323,91 +372,102 @@ }, "outputs": [], "source": [ - "# dataset\n", - "\n", - "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/02c01b3450e8ddc72ccec85d559eee5280c185ac/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/products-ecommerce.json\" # json raw file - update the link here\n", - "\n", - "response = urlopen(url)\n", - "\n", - "# Load the response data into a JSON object\n", - "data_json = json.loads(response.read())\n", + "# Load the dataset\n", + "with open(\"products-ecommerce.json\", \"r\") as f:\n", + " data_json = json.load(f)\n", "\n", "\n", + "# helper function to create bulk indexing body\n", "def create_index_body(doc):\n", - " \"\"\"Generate the body for an Elasticsearch document.\"\"\"\n", + " doc[\"elser_semantic_description_vector\"] = doc[\"description\"]\n", + " doc[\"e5_semantic_description_vector\"] = doc[\"description\"]\n", + "\n", " return {\n", - " \"_index\": \"ecommerce\",\n", + " \"_index\": \"ecommerce-search\",\n", " \"_source\": doc,\n", " }\n", "\n", "\n", - "# Prepare the documents to be indexed\n", + "# prepare the documents array payload\n", "documents = [create_index_body(doc) for doc in 
data_json]\n", "\n", - "# Use helpers.bulk to index\n", - "helpers.bulk(client, documents)\n", - "\n", - "print(\"Done indexing documents into `ecommerce` index\")" + "# use bulk function to index\n", + "try:\n", + " print(\"Indexing documents...\")\n", + " resp = bulk(client, documents)\n", + " print(f\"Documents indexed successfully: {resp[0]}\")\n", + "except Exception as e:\n", + " print(f\"Error indexing documents: {e}\")" ] }, { "cell_type": "markdown", - "id": "3dShN9W4Opl8", + "id": "-qUXNuOvPDsI", "metadata": { - "id": "3dShN9W4Opl8" + "id": "-qUXNuOvPDsI" }, "source": [ - "## Reindex\n", + "## Text Analysis\n", + "The classic way documents are ranked for relevance by Elasticsearch based on a text query uses the Lucene implementation of the [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) model, a **sparse model for lexical search**. This method follows the traditional approach for text search, looking for exact term matches.\n", "\n", - "Now we can reindex data from the `source` index `ecommerce` to the `dest` index `ecommerce-search` with the ingest pipeline `ecommerce-pipeline` we created.\n", + "To make this search possible, Elasticsearch converts **text field** data into a searchable format by performing text analysis.\n", "\n", - "After this step our `dest` index will have the fields we need to perform Semantic Search." + "**Text analysis** is performed by an [analyzer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analyzer-anatomy.html), a set of rules to govern the process of extracting relevant tokens for searching. An analyzer must have exactly one [tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenizers.html). The tokenizer receives a stream of characters and breaks it up into individual tokens (usually individual words.) 
\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "5f51e460", + "metadata": {}, + "source": [ + "### Standard Analyzer\n", + "In the example below we are using the default analyzer, the standard analyzer, which works well for most use cases as it provides English grammar based tokenization. Tokenization enables matching on individual terms, but each token is still matched literally." ] }, { "cell_type": "code", "execution_count": null, - "id": "4297cb0b-ae2e-44f9-811d-27a41c43a858", + "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039", "metadata": { - "id": "4297cb0b-ae2e-44f9-811d-27a41c43a858" + "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039" }, "outputs": [], "source": [ - "# Reindex data from one index 'source' to another 'dest' with the 'ecommerce-pipeline' pipeline.\n", + "# Define the text to be analyzed\n", + "text = \"Comfortable furniture for a large balcony\"\n", + "\n", + "# Define the analyze request\n", + "request_body = {\"analyzer\": \"standard\", \"text\": text} # Stop Analyzer\n", + "\n", + "# Perform the analyze request\n", + "resp = client.indices.analyze(\n", + " analyzer=request_body[\"analyzer\"], text=request_body[\"text\"]\n", + ")\n", "\n", - "client.reindex(\n", - " wait_for_completion=True,\n", - " source={\"index\": \"ecommerce\"},\n", - " dest={\"index\": \"ecommerce-search\", \"pipeline\": \"ecommerce-pipeline\"},\n", - ")" + "# Extract and display the analyzed tokens\n", + "standard_tokens = [token[\"token\"] for token in resp[\"tokens\"]]\n", + "print(\"Standard-analyzed Tokens:\", standard_tokens)" ] }, { "cell_type": "markdown", - "id": "-qUXNuOvPDsI", - "metadata": { - "id": "-qUXNuOvPDsI" - }, + "id": "fb75f526", + "metadata": {}, "source": [ - "## Text Analysis with Standard Analyzer" + "### Stop Analyzer\n", + "If you want to personalize your search experience you can choose a different built-in analyzer. 
For example, by updating the code to use the stop analyzer it will break the text into tokens at any non-letter character with support for removing stop words." ] }, { "cell_type": "code", "execution_count": null, - "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a", - "metadata": { - "id": "829ae6e8-807d-4f0d-ada6-fee86748b91a" - }, + "id": "3e3fdcff", + "metadata": {}, "outputs": [], "source": [ - "# Performs text analysis on a string and returns the resulting tokens.\n", - "\n", - "# Define the text to be analyzed\n", - "text = \"Comfortable furniture for a large balcony\"\n", - "\n", "# Define the analyze request\n", - "request_body = {\"analyzer\": \"standard\", \"text\": text} # Standard Analyzer\n", + "request_body = {\"analyzer\": \"stop\", \"text\": text}\n", "\n", "# Perform the analyze request\n", "response = client.indices.analyze(\n", @@ -415,45 +475,120 @@ ")\n", "\n", "# Extract and display the analyzed tokens\n", - "tokens = [token[\"token\"] for token in response[\"tokens\"]]\n", - "print(\"Analyzed Tokens:\", tokens)" + "stop_tokens = [token[\"token\"] for token in response[\"tokens\"]]\n", + "print(\"Stop-analyzed Tokens:\", stop_tokens)" ] }, { "cell_type": "markdown", - "id": "12u70NLmPyNV", - "metadata": { - "id": "12u70NLmPyNV" - }, + "id": "aba7fad6", + "metadata": {}, "source": [ - "## Text Analysis with Stop Analyzer" + "### Custom Analyzer\n", + "When the built-in analyzers do not fulfill your needs, you can create a [custom analyzer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-custom-analyzer.html),\n", + "which uses the appropriate combination of zero or more [character filters](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-charfilters.html), a [tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenizers.html) and zero or more [token filters](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenfilters.html).\n", + "\n", + "In
the below example that combines a tokenizer and token filters, the text will be lowercased by the [lowercase filter](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lowercase-tokenfilter.html) before being processed by the [synonyms token filter](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html).\n", + "\n", + "> Note: you cannot pass a custom analyzer definition inline to analyze. Define the analyzer in your index settings, then reference it by name in the analyze call. For this reason we will create a temporary index to store the analyzer." ] }, { "cell_type": "code", "execution_count": null, - "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039", - "metadata": { - "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039" - }, + "id": "d44f3e2b", + "metadata": {}, "outputs": [], "source": [ - "# Performs text analysis on a string and returns the resulting tokens.\n", + "index_settings = {\n", + " \"settings\": {\n", + " \"analysis\": {\n", + " \"analyzer\": {\n", + " \"my_custom_analyzer\": {\n", + " \"type\": \"custom\",\n", + " \"tokenizer\": \"standard\",\n", + " \"char_filter\": [\"html_strip\"],\n", + " \"filter\": [\"lowercase\", \"asciifolding\"],\n", + " }\n", + " }\n", + " }\n", + " }\n", + "}\n", "\n", - "# Define the text to be analyzed\n", - "text = \"Comfortable furniture for a large balcony\"\n", + "custom_text = \"Čōmføřțǎble Fůrñíturę Fòr â ľarğe Bałcony\"\n", "\n", - "# Define the analyze request\n", - "request_body = {\"analyzer\": \"stop\", \"text\": text} # Stop Analyzer\n", + "# Create a temporary index with the custom analyzer\n", + "client.indices.create(index=\"temporary_index\", body=index_settings)\n", "\n", "# Perform the analyze request\n", - "response = client.indices.analyze(\n", - " analyzer=request_body[\"analyzer\"], text=request_body[\"text\"]\n", + "resp = client.indices.analyze(\n", + " index=\"temporary_index\", analyzer=\"my_custom_analyzer\", text=custom_text\n", ")\n", 
"\n", "# Extract and display the analyzed tokens\n", - "tokens = [token[\"token\"] for token in response[\"tokens\"]]\n", - "print(\"Analyzed Tokens:\", tokens)" + "custom_tokens = [token[\"token\"] for token in resp[\"tokens\"]]\n", + "print(\"Custom Tokens:\", custom_tokens)\n", + "\n", + "# Delete the temporary index\n", + "client.indices.delete(index=\"temporary_index\")" + ] + }, + { + "cell_type": "markdown", + "id": "432620b6", + "metadata": {}, + "source": [ + "### Text Analysis Results\n", + "In the table below, we can observe that analyzers both included with Elasticsearch and custom made may be included with your search requests to improve the quality of your search results by reducing or refining the content being searched. Attention should be paid to your particular use case and the needs of your users." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c5d11cb", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Standard Token Analyzer\")\n", + "print(f\"Before: \\n{text}\")\n", + "print(f\"After: \\n{standard_tokens}\")\n", + "print(\"===================\")\n", + "print(\"Stop Token Analyzer\")\n", + "print(f\"Before: \\n{text}\")\n", + "print(f\"After: \\n{stop_tokens}\")\n", + "print(\"===================\")\n", + "print(\"Custom Token Analyzer\")\n", + "print(f\"Before: \\n{custom_text}\")\n", + "print(f\"After: \\n{custom_tokens}\")" + ] + }, + { + "cell_type": "markdown", + "id": "db4f86e3", + "metadata": {}, + "source": [ + "## Search \n", + "The remainder of this notebook will cover the following search types:\n", + "\n", + "\n", + "- Lexical Search\n", + "- Semantic Search \n", + " - ELSER Semantic Search (Sparse Vector)\n", + " - E5 Semantic Search (Dense Vector)\n", + " - ELSER Semantic Search with `semantic_text`\n", + " - E5 Semantic Search with `semantic_text`\n", + "- Hybrid Search\n", + " - E5 + Lexical (linear combination)\n", + " - E5 + Lexical (RRF)\n", + " - ELSER + Lexical (linear combination)\n", + 
" - ELSER + Lexical (RRF)\n", + "- ES|QL Search\n", + " - Semantic Search ES|QL\n", + " - ELSER ES|QL\n", + " - E5 ES|QL\n", + " - ELSER ES|QL with `semantic_text`\n", + " - E5 ES|QL with `semantic_text`\n", + " " ] }, { @@ -463,7 +598,8 @@ "id": "8G8MKcUvP0zs" }, "source": [ - "## Lexical Search" + "### Lexical Search\n", + "Our first search will be a straightforward BM25 text search within the description field. We are storing all of our results in a results_list for a final comparison at the end of the notebook. A convenience function to display the results is also defined." ] }, { @@ -475,9 +611,25 @@ }, "outputs": [], "source": [ - "# BM25\n", + "results_list = []\n", "\n", - "response = client.search(\n", + "\n", + "def print_search_results(search_results):\n", + " if not search_results:\n", + " print(\"No matches found\")\n", + " else:\n", + " for hit in search_results:\n", + " score = hit[\"_score\"]\n", + " product = hit[\"_source\"][\"product\"]\n", + " category = hit[\"_source\"][\"category\"]\n", + " description = hit[\"_source\"][\"description\"]\n", + " print(\n", + " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", + " )\n", + "\n", + "\n", + "# Regular BM25 (Lexical) Search\n", + "resp = client.search(\n", " size=2,\n", " index=\"ecommerce-search\",\n", " query={\n", @@ -488,20 +640,12 @@ " }\n", " }\n", " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", - "hits = response[\"hits\"][\"hits\"]\n", "\n", - "if not hits:\n", - " print(\"No matches found\")\n", - "else:\n", - " for hit in hits:\n", - " score = hit[\"_score\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )" + "lexical_search_results = 
resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"lexical_search\": lexical_search_results})\n", + "print_search_results(lexical_search_results)" ] }, { @@ -511,7 +655,7 @@ "id": "xiywcf_-P39a" }, "source": [ - "## Semantic Search with Dense Vector" + "### Semantic Search with Dense Vector" ] }, { @@ -523,33 +667,26 @@ }, "outputs": [], "source": [ - "# KNN\n", - "\n", "response = client.search(\n", " index=\"ecommerce-search\",\n", " size=2,\n", " knn={\n", - " \"field\": \"description_vector.predicted_value\",\n", + " \"field\": \"e5_description_vector\",\n", " \"k\": 50, # Number of nearest neighbors to return as top hits.\n", " \"num_candidates\": 500, # Number of nearest neighbor candidates to consider per shard. Increasing num_candidates tends to improve the accuracy of the final k results.\n", " \"query_vector_builder\": { # Object indicating how to build a query_vector. kNN search enables you to perform semantic search by using a previously deployed text embedding model.\n", " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\", # Text embedding model id\n", + " \"model_id\": \".multilingual-e5-small-elasticsearch\", # Text embedding model id\n", " \"model_text\": \"Comfortable furniture for a large balcony\", # Query\n", " }\n", " },\n", " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", "\n", - "for hit in response[\"hits\"][\"hits\"]:\n", - "\n", - " score = hit[\"_score\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )" + "dense_semantic_search_results = response[\"hits\"][\"hits\"]\n", + "results_list.append({\"dense_semantic_search\": dense_semantic_search_results})\n", + 
"print_search_results(dense_semantic_search_results)" ] }, { @@ -559,42 +696,194 @@ "id": "QlWFdngRQFbv" }, "source": [ - "## Semantic Search with Sparse Vector" + "### Semantic Search with Sparse Vector" ] }, { "cell_type": "code", "execution_count": null, + "id": "c5475e21", + "metadata": {}, + "outputs": [], + "source": [ + "# Elastic Learned Sparse Encoder - ELSER\n", + "\n", + "resp = client.search(\n", + " index=\"ecommerce-search\",\n", + " size=2,\n", + " query={\n", + " \"sparse_vector\": {\n", + " \"field\": \"elser_description_vector\",\n", + " \"inference_id\": \".elser-2-elasticsearch\",\n", + " \"query\": \"Comfortable furniture for a large balcony\",\n", + " }\n", + " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", + ")\n", + "\n", + "\n", + "sparse_semantic_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"sparse_semantic_search\": sparse_semantic_search_results})\n", + "print_search_results(sparse_semantic_search_results)" + ] + }, + { + "cell_type": "markdown", + "id": "3a2a5267", + "metadata": {}, + "source": [ + "### Semantic Search with `semantic_text` Type (ELSER)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d2fb926", + "metadata": {}, + "outputs": [], + "source": [ + "# Elastic Learned Sparse Encoder - ELSER\n", + "\n", + "resp = client.search(\n", + " index=\"ecommerce-search\",\n", + " size=2,\n", + " query={\n", + " \"semantic\": {\n", + " \"field\": \"elser_semantic_description_vector\",\n", + " \"query\": \"Comfortable furniture for a large balcony\",\n", + " }\n", + " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", + ")\n", + "\n", + "elser_semantic_text_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"elser_semantic_text_search\": sparse_semantic_search_results})\n", + "print_search_results(elser_semantic_text_search_results)" + ] + }, + { + "cell_type": "markdown", 
+ "id": "1df079f3", + "metadata": {}, + "source": [ + "### Semantic Search with `semantic_text` Type (e5)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5", "metadata": { "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Score: 0.93147576\n", + "Product: Metal Garden Bench with Cushion\n", + "Category: Garden Furniture\n", + "Description: is a stylish and comfortable metal garden bench, complete with a cushion for added support.\n", + "\n", + "\n", + "Score: 0.9304026\n", + "Product: Garden Dining Set with Swivel Chairs\n", + "Category: Garden Furniture\n", + "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", + "\n" + ] + } + ], "source": [ "# Elastic Learned Sparse Encoder - ELSER\n", "\n", - "response = client.search(\n", + "resp = client.search(\n", " index=\"ecommerce-search\",\n", " size=2,\n", " query={\n", - " \"text_expansion\": {\n", - " \"ml.tokens\": {\n", - " \"model_id\": \"elser_model\",\n", - " \"model_text\": \"Comfortable furniture for a large balcony\",\n", - " }\n", + " \"semantic\": {\n", + " \"field\": \"e5_semantic_description_vector\",\n", + " \"query\": \"Comfortable furniture for a large balcony\",\n", " }\n", " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", "\n", - "for hit in response[\"hits\"][\"hits\"]:\n", + "e5_semantic_text_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"e5_semantic_text_search\": e5_semantic_text_search_results})\n", + "print_search_results(e5_semantic_text_search_results)" + ] + }, + { + "cell_type": "markdown", + "id": "6b5016f3", + "metadata": {}, + "source": [ + "### Hybrid Search - BM25 + `semantic_text` Type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c174fc71", + 
"metadata": {}, + "outputs": [ + { + "ename": "BadRequestError", + "evalue": "BadRequestError(400, 'x_content_parse_exception', '[1:194] [knn] unknown field [query]', [1:194] [knn] unknown field [query])", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mBadRequestError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[80], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# BM25 + semantic_text (RRF)\u001b[39;00m\n\u001b[1;32m 2\u001b[0m top_k \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2\u001b[39m\n\u001b[0;32m----> 3\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[43mclient\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mecommerce-search\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mretriever\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrrf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mretrievers\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstandard\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mquery\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmatch\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdescription\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mA dining table and comfortable chairs for a large balcony\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 13\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mknn\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 19\u001b[0m \n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfield\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43me5_semantic_description_vector\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mquery\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mComfortable furniture for a large 
balcony\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 22\u001b[0m \n\u001b[1;32m 23\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 26\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrank_window_size\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 27\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrank_constant\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m20\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 28\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 29\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 30\u001b[0m \u001b[43m \u001b[49m\u001b[43msource_excludes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m*_description_vector\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Exclude vector fields from response\u001b[39;49;00m\n\u001b[1;32m 31\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 33\u001b[0m dense_rrf_search_results \u001b[38;5;241m=\u001b[39m resp[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhits\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhits\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 34\u001b[0m results_list\u001b[38;5;241m.\u001b[39mappend({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdense_rrf_search\u001b[39m\u001b[38;5;124m\"\u001b[39m: dense_rrf_search_results})\n", + "File 
\u001b[0;32m~/.pyenv/versions/3.12.3/lib/python3.12/site-packages/elasticsearch/_sync/client/utils.py:415\u001b[0m, in \u001b[0;36m_rewrite_parameters..wrapper..wrapped\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 412\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n\u001b[1;32m 413\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[0;32m--> 415\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mapi\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.pyenv/versions/3.12.3/lib/python3.12/site-packages/elasticsearch/_sync/client/__init__.py:4916\u001b[0m, in \u001b[0;36mElasticsearch.search\u001b[0;34m(self, index, aggregations, aggs, allow_no_indices, allow_partial_search_results, analyze_wildcard, analyzer, batched_reduce_size, ccs_minimize_roundtrips, collapse, default_operator, df, docvalue_fields, error_trace, expand_wildcards, explain, ext, fields, filter_path, force_synthetic_source, from_, highlight, human, ignore_throttled, ignore_unavailable, include_named_queries_score, indices_boost, knn, lenient, max_concurrent_shard_requests, min_score, pit, post_filter, pre_filter_shard_size, preference, pretty, profile, q, query, rank, request_cache, rescore, rest_total_hits_as_int, retriever, routing, runtime_mappings, script_fields, scroll, search_after, search_type, seq_no_primary_term, size, slice, sort, source, source_excludes, source_includes, stats, stored_fields, suggest, suggest_field, suggest_mode, suggest_size, suggest_text, terminate_after, timeout, track_scores, track_total_hits, typed_keys, version, body)\u001b[0m\n\u001b[1;32m 4914\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m __body \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 4915\u001b[0m __headers[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcontent-type\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mapplication/json\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 4916\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mperform_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[return-value]\u001b[39;49;00m\n\u001b[1;32m 4917\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPOST\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4918\u001b[0m \u001b[43m \u001b[49m\u001b[43m__path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4919\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__query\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4920\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__headers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4921\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__body\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4922\u001b[0m \u001b[43m \u001b[49m\u001b[43mendpoint_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msearch\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4923\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_parts\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__path_parts\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4924\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.pyenv/versions/3.12.3/lib/python3.12/site-packages/elasticsearch/_sync/client/_base.py:271\u001b[0m, in \u001b[0;36mBaseClient.perform_request\u001b[0;34m(self, method, path, 
params, headers, body, endpoint_id, path_parts)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mperform_request\u001b[39m(\n\u001b[1;32m 256\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 257\u001b[0m method: \u001b[38;5;28mstr\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 264\u001b[0m path_parts: Optional[Mapping[\u001b[38;5;28mstr\u001b[39m, Any]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 265\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m ApiResponse[Any]:\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_otel\u001b[38;5;241m.\u001b[39mspan(\n\u001b[1;32m 267\u001b[0m method,\n\u001b[1;32m 268\u001b[0m endpoint_id\u001b[38;5;241m=\u001b[39mendpoint_id,\n\u001b[1;32m 269\u001b[0m path_parts\u001b[38;5;241m=\u001b[39mpath_parts \u001b[38;5;129;01mor\u001b[39;00m {},\n\u001b[1;32m 270\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m otel_span:\n\u001b[0;32m--> 271\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_perform_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 274\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 275\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 276\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 277\u001b[0m \u001b[43m 
\u001b[49m\u001b[43motel_span\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43motel_span\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 278\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 279\u001b[0m otel_span\u001b[38;5;241m.\u001b[39mset_elastic_cloud_metadata(response\u001b[38;5;241m.\u001b[39mmeta\u001b[38;5;241m.\u001b[39mheaders)\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\n", + "File \u001b[0;32m~/.pyenv/versions/3.12.3/lib/python3.12/site-packages/elasticsearch/_sync/client/_base.py:351\u001b[0m, in \u001b[0;36mBaseClient._perform_request\u001b[0;34m(self, method, path, params, headers, body, otel_span)\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mValueError\u001b[39;00m, \u001b[38;5;167;01mKeyError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m):\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[0;32m--> 351\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HTTP_EXCEPTIONS\u001b[38;5;241m.\u001b[39mget(meta\u001b[38;5;241m.\u001b[39mstatus, ApiError)(\n\u001b[1;32m 352\u001b[0m message\u001b[38;5;241m=\u001b[39mmessage, meta\u001b[38;5;241m=\u001b[39mmeta, body\u001b[38;5;241m=\u001b[39mresp_body\n\u001b[1;32m 353\u001b[0m )\n\u001b[1;32m 355\u001b[0m \u001b[38;5;66;03m# 'X-Elastic-Product: Elasticsearch' should be on every 2XX response.\u001b[39;00m\n\u001b[1;32m 356\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_verified_elasticsearch:\n\u001b[1;32m 357\u001b[0m \u001b[38;5;66;03m# If the header is set we mark the server as verified.\u001b[39;00m\n", + "\u001b[0;31mBadRequestError\u001b[0m: BadRequestError(400, 'x_content_parse_exception', '[1:194] [knn] unknown field [query]', [1:194] [knn] unknown field [query])" + ] + } + ], + "source": [ + "# BM25 + semantic_text (RRF)\n", + "top_k = 2\n", + "resp = client.search(\n", + " 
index=\"ecommerce-search\",\n", + " retriever={\n", + " \"rrf\": {\n", + " \"retrievers\": [\n", + " {\n", + " \"standard\": {\n", + " \"query\": {\n", + " \"match\": {\n", + " \"description\": \"A dining table and comfortable chairs for a large balcony\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"standard\": {\n", + " \"query\": {\n", + " \"semantic\": {\n", + " \"field\": \"e5_semantic_description_vector\",\n", + " \"query\": \"Comfortable furniture for a large balcony\",\n", + " }\n", + " }\n", + " }\n", + " },\n", + " ],\n", + " \"rank_window_size\": 2,\n", + " \"rank_constant\": 20,\n", + " }\n", + " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", + ")\n", "\n", - " score = hit[\"_score\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )" + "dense_rrf_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"dense_rrf_search\": dense_rrf_search_results})\n", + "print_search_results(dense_rrf_search_results)" ] }, { @@ -604,7 +893,7 @@ "id": "kz9deDBYQJxr" }, "source": [ - "## Hybrid Search - BM25+KNN linear combination" + "### Hybrid Search - BM25 + Dense Vector linear combination" ] }, { @@ -617,8 +906,8 @@ "outputs": [], "source": [ "# BM25 + KNN (Linear Combination)\n", - "\n", - "response = client.search(\n", + "query = \"A dining table and comfortable chairs for a large balcony\"\n", + "resp = client.search(\n", " index=\"ecommerce-search\",\n", " size=2,\n", " query={\n", @@ -627,8 +916,8 @@ " {\n", " \"match\": {\n", " \"description\": {\n", - " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", - " \"boost\": 1, # You can adjust the boost value\n", + " \"query\": query,\n", + " \"boost\": 1,\n", " }\n", " }\n", " 
" }\n", @@ -636,28 +925,23 @@ " }\n", " },\n", " knn={\n", - " \"field\": \"description_vector.predicted_value\",\n", - " \"k\": 50,\n", - " \"num_candidates\": 500,\n", - " \"boost\": 1, # You can adjust the boost value\n", + " \"field\": \"e5_description_vector\",\n", + " \"k\": 2,\n", + " \"num_candidates\": 20,\n", + " \"boost\": 1,\n", " \"query_vector_builder\": {\n", " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n", - " \"model_text\": \"A dining table and comfortable chairs for a large balcony\",\n", + " \"model_id\": \".multilingual-e5-small-elasticsearch\",\n", + " \"model_text\": query,\n", " }\n", " },\n", " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", "\n", - "for hit in response[\"hits\"][\"hits\"]:\n", - "\n", - " score = hit[\"_score\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )" + "dense_linear_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"dense_linear_search\": dense_linear_search_results})\n", + "print_search_results(dense_linear_search_results)" ] }, { @@ -667,7 +951,9 @@ "id": "cybkWjmpQV8g" }, "source": [ - "## Hybrid Search - BM25+KNN RRF" + "### Hybrid Search - BM25 + Dense Vector Reciprocal Rank Fusion (RRF)\n", + "\n", + "[Reciprocal rank fusion](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/reciprocal-rank-fusion) (RRF) is a method for combining multiple result sets with different relevance indicators into a single result set. RRF requires no tuning, and the different relevance indicators do not have to be related to each other to achieve high-quality results."
] }, { @@ -680,52 +966,45 @@ "outputs": [], "source": [ "# BM25 + KNN (RRF)\n", - "# RRF functionality is in technical preview and may be changed or removed in a future release. The syntax will likely change before GA.\n", - "\n", - "response = client.search(\n", + "top_k = 2\n", + "resp = client.search(\n", " index=\"ecommerce-search\",\n", - " size=2,\n", - " query={\n", - " \"bool\": {\n", - " \"should\": [\n", + " retriever={\n", + " \"rrf\": {\n", + " \"retrievers\": [\n", " {\n", - " \"match\": {\n", - " \"description\": {\n", - " \"query\": \"A dining table and comfortable chairs for a large balcony\"\n", + " \"standard\": {\n", + " \"query\": {\n", + " \"match\": {\n", + " \"description\": \"A dining table and comfortable chairs for a large balcony\"\n", + " }\n", " }\n", " }\n", - " }\n", - " ]\n", - " }\n", - " },\n", - " knn={\n", - " \"field\": \"description_vector.predicted_value\",\n", - " \"k\": 50,\n", - " \"num_candidates\": 500,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-mpnet-base-v2\",\n", - " \"model_text\": \"A dining table and comfortable chairs for a large balcony\",\n", - " }\n", - " },\n", - " },\n", - " rank={\n", - " \"rrf\": { # Reciprocal rank fusion\n", - " \"window_size\": 50, # This value determines the size of the individual result sets per query.\n", - " \"rank_constant\": 20, # This value determines how much influence documents in individual result sets per query have over the final ranked result set.\n", + " },\n", + " {\n", + " \"knn\": {\n", + " \"field\": \"e5_description_vector\",\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": e5_endpoint,\n", + " \"model_text\": \"A dining table and comfortable chairs for a large balcony\",\n", + " }\n", + " },\n", + " \"k\": 2,\n", + " \"num_candidates\": 20,\n", + " }\n", + " },\n", + " ],\n", + " \"rank_window_size\": 2,\n", + " \"rank_constant\": 20,\n", " }\n", " },\n", + 
" source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", "\n", - "for hit in response[\"hits\"][\"hits\"]:\n", - "\n", - " rank = hit[\"_rank\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"\\nRank: {rank}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )" + "dense_rrf_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"dense_rrf_search\": dense_rrf_search_results})\n", + "print_search_results(dense_rrf_search_results)" ] }, { @@ -735,7 +1014,7 @@ "id": "LyKI2Z-XQbI6" }, "source": [ - "## Hybrid Search - BM25+ELSER linear combination" + "### Hybrid Search - BM25 + Sparse Vector linear combination" ] }, { @@ -749,7 +1028,7 @@ "source": [ "# BM25 + Elastic Learned Sparse Encoder (Linear Combination)\n", "\n", - "response = client.search(\n", + "resp = client.search(\n", " index=\"ecommerce-search\",\n", " size=2,\n", " query={\n", @@ -764,28 +1043,244 @@ " }\n", " },\n", " {\n", - " \"text_expansion\": {\n", - " \"ml.tokens\": {\n", - " \"model_id\": \"elser_model\",\n", - " \"model_text\": \"A dining table and comfortable chairs for a large balcony\",\n", - " \"boost\": 1, # You can adjust the boost value\n", - " }\n", + " \"sparse_vector\": {\n", + " \"field\": \"elser_description_vector\",\n", + " \"inference_id\": elser_endpoint,\n", + " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", " }\n", " },\n", " ]\n", " }\n", " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", + ")\n", + "\n", + "sparse_linear_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"sparse_linear_search\": sparse_linear_search_results})\n", + "print_search_results(sparse_linear_search_results)" + ] + }, + { + "cell_type": "markdown", + "id": "e3d5e4e9", + "metadata": {}, 
+ "source": [ + "### Hybrid Search - BM25 + Sparse Vector Reciprocal Rank Fusion (RRF)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "199c5c60", + "metadata": {}, + "outputs": [], + "source": [ + "# BM25 + ELSER (RRF)\n", + "top_k = 2\n", + "resp = client.search(\n", + " index=\"ecommerce-search\",\n", + " retriever={\n", + " \"rrf\": {\n", + " \"retrievers\": [\n", + " {\n", + " \"standard\": {\n", + " \"query\": {\n", + " \"match\": {\n", + " \"description\": \"A dining table and comfortable chairs for a large balcony\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"standard\": {\n", + " \"query\": {\n", + " \"sparse_vector\": {\n", + " \"field\": \"elser_description_vector\",\n", + " \"inference_id\": elser_endpoint,\n", + " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", + " }\n", + " }\n", + " }\n", + " },\n", + " ],\n", + " \"rank_window_size\": 2,\n", + " \"rank_constant\": 20,\n", + " }\n", + " },\n", + " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", ")\n", "\n", - "for hit in response[\"hits\"][\"hits\"]:\n", + "print(resp[\"hits\"][\"hits\"])\n", + "sparse_rrf_search_results = resp[\"hits\"][\"hits\"]\n", + "results_list.append({\"sparse_rrf_search\": sparse_rrf_search_results})\n", + "print_search_results(sparse_rrf_search_results)" + ] + }, + { + "cell_type": "markdown", + "id": "f11de3ac", + "metadata": {}, + "source": [ + "### ES|QL Search\n", + "Elastic offers its own query language called ES|QL. ES|QL is a SQL-like query language that allows you to search and analyze data in Elasticsearch. Further information can be found in the [official documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/esql.html)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "9d1343a4", + "metadata": {}, + "source": [ + "#### Lexical Search with ES|QL\n", + "This demonstrates the lexical search capabilities of ES|QL using the match function. The function `MATCH` specifically searches for matches in a query string within a specified field. In the example below, we search for documents containing the phrase \"Comfortable furniture for a large balcony\" in the description field.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91c3d193", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\" Convert search_results from es|ql to a dict with _source\n", + " and subproperties of score, description, category, and product \"\"\"\n", + "\n", + "\n", + "def normalize_results(search_results):\n", + " normalized_results = []\n", + " results = search_results.body[\"values\"]\n", + " for result in results:\n", + " new_result = {\"_source\": {}}\n", + " new_result[\"_score\"] = result[-1]\n", + " new_result[\"_source\"][\"product\"] = result[-2]\n", + " new_result[\"_source\"][\"category\"] = result[0]\n", + " new_result[\"_source\"][\"description\"] = result[1]\n", + " normalized_results.append(new_result)\n", + "\n", + " return normalized_results\n", + "\n", + "\n", + "esql_query = \"\"\"\n", + "FROM ecommerce-search METADATA _score\n", + "| WHERE match(description, \"Comfortable furniture for a large balcony\")\n", + "| SORT _score DESC\n", + "| LIMIT 2\n", + "\"\"\"\n", + "\n", + "resp = client.esql.query(query=esql_query)\n", + "esql_lexical_search_results = normalize_results(resp)\n", + "results_list.append({\"esql_lexical_search\": esql_lexical_search_results})\n", + "print_search_results(esql_lexical_search_results)" + ] + }, + { + "cell_type": "markdown", + "id": "dbbdb5f3", + "metadata": {}, + "source": [ + "#### Semantic Search with ES|QL\n", + "To perform a semantic search using ES|QL, use the `semantic_text` type for your query. 
This will run a similarity search based on the semantic meaning of the text, rather than the lexical (word-level) matching of the `text` type. Similar to the ease of performing a search with semantic search using the `semantic_text` type with the Python client, the ES|QL query is simple to write and understand.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ba37cd7", + "metadata": {}, + "outputs": [], + "source": [ + "esql_query = \"\"\"\n", + "FROM ecommerce-search METADATA _score\n", + "| WHERE elser_semantic_description_vector:\"Comfortable furniture for a large balcony\"\n", + "| SORT _score DESC\n", + "| LIMIT 2\n", + "\"\"\"\n", + "\n", + "resp = client.esql.query(query=esql_query)\n", + "esql_semantic_search_results = normalize_results(resp)\n", + "results_list.append({\"esql_semantic_search\": esql_semantic_search_results})\n", + "print_search_results(esql_semantic_search_results)" + ] + }, + { + "cell_type": "markdown", + "id": "7b95f9b8", + "metadata": {}, + "source": [ + "## Compiled Results\n", + "Here are the results of the previous searches. We can see that all of the results return approximately the same products."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1162a857", + "metadata": {}, + "outputs": [], + "source": [ + "# Flatten results for each search type, preserving insertion order\n", + "rows = []\n", + "for result in results_list:\n", + " search_type = list(result.keys())[0]\n", + " for doc in result[search_type]:\n", + " row = {\n", + " \"search_type\": search_type,\n", + " \"product\": doc[\"_source\"].get(\"product\"),\n", + " \"category\": doc[\"_source\"].get(\"category\"),\n", + " \"description\": doc[\"_source\"].get(\"description\"),\n", + " \"score\": doc.get(\"_score\"),\n", + " }\n", + " rows.append(row)\n", + "\n", + "# Create DataFrame without altering row order\n", + "df = pd.DataFrame(rows)\n", + "\n", + "# Get the unique search_types in order of appearance\n", + "ordered_search_types = []\n", + "for row in rows:\n", + " st = row[\"search_type\"]\n", + " if st not in ordered_search_types:\n", + " ordered_search_types.append(st)\n", + "\n", + "for search_type in ordered_search_types:\n", + " group = df[df[\"search_type\"] == search_type]\n", + " display(Markdown(f\"### {search_type.replace('_', ' ').title()}\"))\n", + " styled = (\n", + " group.drop(columns=\"search_type\")\n", + " .reset_index(drop=True)\n", + " .style.set_properties(\n", + " subset=[\"description\"],\n", + " **{\"white-space\": \"pre-wrap\", \"word-break\": \"break-word\"},\n", + " )\n", + " .hide(axis=\"index\") # For pandas >=1.4.0\n", + " )\n", + " display(styled)" + ] + }, + { + "cell_type": "markdown", + "id": "b08c83b6", + "metadata": {}, + "source": [ + "As can be seen in the results, the semantic search query provides more relevant results than the lexical search query. This is due to the semantic search query using the semantic_text field, which is based on the dense vector representation of the text, while the lexical search query uses the description field, which is based on the lexical representation of the text. 
Nuances and context are better captured by the semantic search query, making it more effective for finding relevant results." + ] + }, + { + "cell_type": "markdown", + "id": "2b83cbe6", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "It should be noted that while the semantic search query provides more relevant results, it is also more computationally expensive than the lexical search query. This is because the semantic search query requires the calculation of vector representations, which can be computationally intensive. \n", + "\n", + "Ultimately, it is recommended to use the semantic_text type when implementing semantic search for a few key reasons:\n", + "- Query structure is simple and easy to understand.\n", + "- Implementing the semantic_text type requires minimal changes to the index mapping and query.\n", + "- Setting up an ingest pipeline and inference endpoint is unnecessary.\n", + "\n", + "Using `sparse_vector` and `dense_vector` types is more complex and requires additional setup, but can be useful in certain scenarios where semantic search needs to be customized beyond standard semantic text search. This could be a change in the similarity algorithm, use of different vectorization models, or any necessary preprocessing steps. \n", "\n", - " score = hit[\"_score\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )" + "Hybrid search retains the power of both lexical and semantic search, allowing for a more flexible and effective search experience. With hybrid search, you can balance the trade-off between relevance and performance, making it a more practical choice for production environments. This should be considered the default approach for search."
] } ], @@ -808,7 +1303,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.12.3" } }, "nbformat": 4, From b6fdcd9d3ae8fb8e4cd63601e0e0b10664fa3989 Mon Sep 17 00:00:00 2001 From: Justin Castilla Date: Tue, 12 Aug 2025 08:48:24 -0700 Subject: [PATCH 10/11] md formatting --- .../ecommerce_dense_sparse_project.ipynb | 47 +++---------------- 1 file changed, 6 insertions(+), 41 deletions(-) diff --git a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/ecommerce_dense_sparse_project.ipynb b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/ecommerce_dense_sparse_project.ipynb index 588e9c02..b8d8a7b5 100644 --- a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/ecommerce_dense_sparse_project.ipynb +++ b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/ecommerce_dense_sparse_project.ipynb @@ -771,31 +771,12 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": null, "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5", "metadata": { "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Score: 0.93147576\n", - "Product: Metal Garden Bench with Cushion\n", - "Category: Garden Furniture\n", - "Description: is a stylish and comfortable metal garden bench, complete with a cushion for added support.\n", - "\n", - "\n", - "Score: 0.9304026\n", - "Product: Garden Dining Set with Swivel Chairs\n", - "Category: Garden Furniture\n", - "Description: is a functional and comfortable garden dining set, including a table and chairs with swivel seats for convenience.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# Elastic Learned Sparse Encoder - ELSER\n", "\n", @@ -829,23 +810,7 @@ "execution_count": null, "id": "c174fc71", "metadata": {}, - "outputs": [ - { - "ename": "BadRequestError", - "evalue": "BadRequestError(400, 
'x_content_parse_exception', '[1:194] [knn] unknown field [query]', [1:194] [knn] unknown field [query])", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mBadRequestError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[80], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# BM25 + semantic_text (RRF)\u001b[39;00m\n\u001b[1;32m 2\u001b[0m top_k \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2\u001b[39m\n\u001b[0;32m----> 3\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[43mclient\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mecommerce-search\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mretriever\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrrf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mretrievers\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstandard\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mquery\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m 
\u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmatch\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdescription\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mA dining table and comfortable chairs for a large balcony\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 13\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mknn\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 19\u001b[0m \n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfield\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43me5_semantic_description_vector\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mquery\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mComfortable furniture for a large balcony\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 22\u001b[0m \n\u001b[1;32m 23\u001b[0m \u001b[43m 
\u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 26\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrank_window_size\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 27\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrank_constant\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m20\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 28\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 29\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 30\u001b[0m \u001b[43m \u001b[49m\u001b[43msource_excludes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m*_description_vector\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Exclude vector fields from response\u001b[39;49;00m\n\u001b[1;32m 31\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 33\u001b[0m dense_rrf_search_results \u001b[38;5;241m=\u001b[39m resp[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhits\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhits\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 34\u001b[0m results_list\u001b[38;5;241m.\u001b[39mappend({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdense_rrf_search\u001b[39m\u001b[38;5;124m\"\u001b[39m: dense_rrf_search_results})\n", - "File \u001b[0;32m~/.pyenv/versions/3.12.3/lib/python3.12/site-packages/elasticsearch/_sync/client/utils.py:415\u001b[0m, in \u001b[0;36m_rewrite_parameters..wrapper..wrapped\u001b[0;34m(*args, 
**kwargs)\u001b[0m\n\u001b[1;32m 412\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n\u001b[1;32m 413\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[0;32m--> 415\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mapi\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.pyenv/versions/3.12.3/lib/python3.12/site-packages/elasticsearch/_sync/client/__init__.py:4916\u001b[0m, in \u001b[0;36mElasticsearch.search\u001b[0;34m(self, index, aggregations, aggs, allow_no_indices, allow_partial_search_results, analyze_wildcard, analyzer, batched_reduce_size, ccs_minimize_roundtrips, collapse, default_operator, df, docvalue_fields, error_trace, expand_wildcards, explain, ext, fields, filter_path, force_synthetic_source, from_, highlight, human, ignore_throttled, ignore_unavailable, include_named_queries_score, indices_boost, knn, lenient, max_concurrent_shard_requests, min_score, pit, post_filter, pre_filter_shard_size, preference, pretty, profile, q, query, rank, request_cache, rescore, rest_total_hits_as_int, retriever, routing, runtime_mappings, script_fields, scroll, search_after, search_type, seq_no_primary_term, size, slice, sort, source, source_excludes, source_includes, stats, stored_fields, suggest, suggest_field, suggest_mode, suggest_size, suggest_text, terminate_after, timeout, track_scores, track_total_hits, typed_keys, version, body)\u001b[0m\n\u001b[1;32m 4914\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m __body \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 4915\u001b[0m __headers[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcontent-type\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mapplication/json\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 4916\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mperform_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[return-value]\u001b[39;49;00m\n\u001b[1;32m 4917\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPOST\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4918\u001b[0m \u001b[43m \u001b[49m\u001b[43m__path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4919\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__query\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4920\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__headers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4921\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__body\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4922\u001b[0m \u001b[43m \u001b[49m\u001b[43mendpoint_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msearch\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4923\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_parts\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__path_parts\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4924\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.pyenv/versions/3.12.3/lib/python3.12/site-packages/elasticsearch/_sync/client/_base.py:271\u001b[0m, in \u001b[0;36mBaseClient.perform_request\u001b[0;34m(self, method, path, params, headers, body, endpoint_id, path_parts)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mperform_request\u001b[39m(\n\u001b[1;32m 256\u001b[0m 
\u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 257\u001b[0m method: \u001b[38;5;28mstr\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 264\u001b[0m path_parts: Optional[Mapping[\u001b[38;5;28mstr\u001b[39m, Any]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 265\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m ApiResponse[Any]:\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_otel\u001b[38;5;241m.\u001b[39mspan(\n\u001b[1;32m 267\u001b[0m method,\n\u001b[1;32m 268\u001b[0m endpoint_id\u001b[38;5;241m=\u001b[39mendpoint_id,\n\u001b[1;32m 269\u001b[0m path_parts\u001b[38;5;241m=\u001b[39mpath_parts \u001b[38;5;129;01mor\u001b[39;00m {},\n\u001b[1;32m 270\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m otel_span:\n\u001b[0;32m--> 271\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_perform_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 274\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 275\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 276\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 277\u001b[0m \u001b[43m \u001b[49m\u001b[43motel_span\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43motel_span\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 278\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 279\u001b[0m 
otel_span\u001b[38;5;241m.\u001b[39mset_elastic_cloud_metadata(response\u001b[38;5;241m.\u001b[39mmeta\u001b[38;5;241m.\u001b[39mheaders)\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\n", - "File \u001b[0;32m~/.pyenv/versions/3.12.3/lib/python3.12/site-packages/elasticsearch/_sync/client/_base.py:351\u001b[0m, in \u001b[0;36mBaseClient._perform_request\u001b[0;34m(self, method, path, params, headers, body, otel_span)\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mValueError\u001b[39;00m, \u001b[38;5;167;01mKeyError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m):\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[0;32m--> 351\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HTTP_EXCEPTIONS\u001b[38;5;241m.\u001b[39mget(meta\u001b[38;5;241m.\u001b[39mstatus, ApiError)(\n\u001b[1;32m 352\u001b[0m message\u001b[38;5;241m=\u001b[39mmessage, meta\u001b[38;5;241m=\u001b[39mmeta, body\u001b[38;5;241m=\u001b[39mresp_body\n\u001b[1;32m 353\u001b[0m )\n\u001b[1;32m 355\u001b[0m \u001b[38;5;66;03m# 'X-Elastic-Product: Elasticsearch' should be on every 2XX response.\u001b[39;00m\n\u001b[1;32m 356\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_verified_elasticsearch:\n\u001b[1;32m 357\u001b[0m \u001b[38;5;66;03m# If the header is set we mark the server as verified.\u001b[39;00m\n", - "\u001b[0;31mBadRequestError\u001b[0m: BadRequestError(400, 'x_content_parse_exception', '[1:194] [knn] unknown field [query]', [1:194] [knn] unknown field [query])" - ] - } - ], + "outputs": [], "source": [ "# BM25 + semantic_text (RRF)\n", "top_k = 2\n", @@ -1261,7 +1226,7 @@ "id": "b08c83b6", "metadata": {}, "source": [ - "As can be seen in the results, the semantic search query provides more relevant results than the lexical search query. 
This is due to the semantic search query using the semantic_text field, which is based on the dense vector representation of the text, while the lexical search query uses the description field, which is based on the lexical representation of the text. Nuances and context are better captured by the semantic search query, making it more effective for finding relevant results." + "As can be seen in the results, the semantic search query provides more relevant results than the lexical search query. This is due to the semantic search query using the `semantic_text` field, which is based on the dense vector representation of the text, while the lexical search query uses the description field, which is based on the lexical representation of the text. Nuances and context are better captured by the semantic search query, making it more effective for finding relevant results." ] }, { @@ -1289,7 +1254,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -1303,7 +1268,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.11.6" } }, "nbformat": 4, From 598255aaae39510de89837c13c470895638a30df Mon Sep 17 00:00:00 2001 From: Justin Castilla Date: Tue, 12 Aug 2025 08:52:51 -0700 Subject: [PATCH 11/11] removing draft notebook, moved everything to current file --- ...dated-ecommerce_dense_sparse_project.ipynb | 1106 ----------------- 1 file changed, 1106 deletions(-) delete mode 100644 supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb diff --git a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb b/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb deleted file mode 100644 index 6c2d9b55..00000000 --- 
a/supporting-blog-content/lexical-and-semantic-search-with-elasticsearch/updated-ecommerce_dense_sparse_project.ipynb +++ /dev/null @@ -1,1106 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "r8OKk3QOGBXl", - "metadata": { - "id": "r8OKk3QOGBXl" - }, - "source": [ - "# **Lexical and Semantic Search with Elasticsearch**\n", - "\n", - "In the following examples, we will explore various approaches to retrieving information using Elasticsearch - focusing specifically on full text search, semantic search, and a hybrid combination of both.\n", - "\n", - "To accomplish this, this example demonstrates various search scenarios on a dataset generated to simulate e-commerce product information.\n", - "\n", - "This dataset contains over 2,500 products, each with a description. These products are categorized into 76 distinct product categories, with each category containing a varying number of products. \n", - "\n", - "Here is a sample of an object from the dataset:\n", - "\n", - "```json\n", - " {\n", - " \"product\": \"Samsung 49-inch Curved Gaming Monitor\",\n", - " \"description\": \"is a curved gaming monitor with a high refresh rate and AMD FreeSync technology.\",\n", - " \"category\": \"Monitors\"\n", - "}\n", - "\n", - "```\n", - "\n", - "We will consume the dataset from a JSON file into Elasticsearch using modern consumption patterns. We will then perform a series of search operations to demonstrate the different search strategies.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "6370f2e4", - "metadata": {}, - "source": [ - "## **🧰 Requirements**\n", - "\n", - "For this example, you will need:\n", - "\n", - "- Python 3.11 or later\n", - "- The Elastic Python client\n", - "- Elastic 9.0 deployment or later on either a local, cloud, or serverless environment\n", - "\n", - "\n", - "We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html). 
You can use a [free trial here](https://cloud.elastic.co/registration?onboarding_token=vectorsearch&utm_source=github&utm_content=elasticsearch-labs-notebook) to get started." - ] - }, - { - "cell_type": "markdown", - "id": "hmMWo2e-IkTB", - "metadata": { - "id": "hmMWo2e-IkTB" - }, - "source": [ - "## Setup Elasticsearch environment:\n", - "\n", - "To get started, we'll need to connect to our Elastic deployment using the Python client.\n", - "\n", - "Because we're using an Elastic Cloud deployment, we'll use the **Cloud Endpoint** and **Cloud API Key** to identify our deployment. These may be found within Kibana by following the instructions [here](https://www.elastic.co/docs/deploy-manage/api-keys/elastic-cloud-api-keys).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e8d24cd8-a437-4bd2-a1f0-93e535ccf8a9", - "metadata": { - "id": "e8d24cd8-a437-4bd2-a1f0-93e535ccf8a9" - }, - "outputs": [], - "source": [ - "%pip install elasticsearch pandas IPython -q" - ] - }, - { - "cell_type": "markdown", - "id": "38b734aa", - "metadata": {}, - "source": [ - "### Import the required packages\n", - "We will import the following packages:\n", - "- `Elasticsearch`: a client library for Elasticsearch actions\n", - "- `bulk`: a function to perform Elasticsearch actions in bulk\n", - "- `getpass`: a module for receiving Elasticsearch credentials via text prompt\n", - "- `json`: a module for reading and writing JSON data\n", - "- `pandas`, `display`, `Markdown`: for data visualization and markdown formatting\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eaf90bc8-647e-4ada-9aa9-5cb9e60762b7", - "metadata": { - "id": "eaf90bc8-647e-4ada-9aa9-5cb9e60762b7" - }, - "outputs": [], - "source": [ - "# import the Elasticsearch client and bulk function\n", - "from elasticsearch import Elasticsearch\n", - "from elasticsearch.helpers import bulk\n", - "\n", - "# import getpass module to handle Auth input\n", - "import getpass\n", - 
"\n", - "# import json module to read JSON file of products\n", - "import json # module for handling JSON data\n", - "\n", - "# display search results in a table\n", - "import pandas as pd\n", - "from IPython.display import display, Markdown" - ] - }, - { - "cell_type": "markdown", - "id": "ea1VkDBXJIQR", - "metadata": { - "id": "ea1VkDBXJIQR" - }, - "source": [ - "### 📚 Instantiating the Elasticsearch Client\n", - "\n", - "First we prompt the user for their Elastic Endpoint URL and Elastic API Key.\n", - "Then we create a `client` object that instantiates an instance of the `Elasticsearch` class.\n", - "Lastly, we verify that our client is connected to our Elasticsearch instance by calling `client.ping()`.\n", - "> 🔐 *NOTE: `getpass` enables us to securely prompt the user for credentials without echoing them to the terminal, or storing it in memory.*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6907a2bf-4927-428e-9ca8-9df3dd35a2cc", - "metadata": { - "id": "6907a2bf-4927-428e-9ca8-9df3dd35a2cc" - }, - "outputs": [], - "source": [ - "# endpoint for Elasticsearch instance\n", - "ELASTIC_ENDPOINT = getpass.getpass(\"Enter Elastic Endpoint: \")\n", - "\n", - "# Elastic API key for Elasticsearch\n", - "ELASTIC_API_KEY = getpass.getpass(\"Enter Elastic API Key: \")\n", - "\n", - "# create the Elasticsearch client instance\n", - "client = Elasticsearch(\n", - " hosts=[ELASTIC_ENDPOINT], api_key=ELASTIC_API_KEY, request_timeout=3600\n", - ")\n", - "\n", - "resp = client.ping()\n", - "print(f\"Connected to Elastic instance: {resp}\")" - ] - }, - { - "cell_type": "markdown", - "id": "BH-N6epTJarM", - "metadata": { - "id": "BH-N6epTJarM" - }, - "source": [ - "## Prepare our embedding model workflow\n", - "\n", - "Next we ensure our embedding models are available in Elasticsearch. We will use Elastic's provided `e5_multilingual_small` and `elser_V2` models to provide dense and sparse vectoring, respectively. 
Using these models out of the box will ensure they are up-to-date and ready for integration with Elasticsearch.\n", - "\n", - "Other models may be uploaded and deployed using [Eland](https://www.elastic.co/docs/reference/elasticsearch/clients/eland) or integrated using the [inference endpoint API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-azureopenai) to connect to third-party models." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f6f3f5a-2b93-4a0c-93c8-c887ca80f687", - "metadata": { - "id": "7f6f3f5a-2b93-4a0c-93c8-c887ca80f687" - }, - "outputs": [], - "source": [ - "# Declare models and endpoint names predeployed by Elastic\n", - "elser_model = \".elser_model_2_linux-x86_64\"\n", - "elser_endpoint = \".elser-2-elasticsearch\"\n", - "\n", - "e5_model = \".multilingual-e5-small_linux-x86_64\"\n", - "e5_endpoint = \".multilingual-e5-small-elasticsearch\"\n", - "\n", - "# Define (model, endpoint) tuples to check\n", - "model_endpoint_pairs = [(elser_model, elser_endpoint), (e5_model, e5_endpoint)]\n", - "\n", - "# Fetch all loaded models and endpoints once\n", - "models = client.ml.get_trained_models()\n", - "model_ids = {model[\"model_id\"]: model for model in models[\"trained_model_configs\"]}\n", - "endpoints = client.inference.get()\n", - "endpoint_ids = {\n", - " endpoint[\"inference_id\"]: endpoint for endpoint in endpoints[\"endpoints\"]\n", - "}\n", - "\n", - "# Check each (model, endpoint) pair\n", - "for model_id, endpoint_id in model_endpoint_pairs:\n", - " print(f\"Checking Model: {model_id}\")\n", - " model = model_ids.get(model_id)\n", - " if model:\n", - " print(f\" Model ID: {model['model_id']}\")\n", - " print(f\" Description: {model.get('description', 'No description')}\")\n", - " print(f\" Version: {model.get('version', 'N/A')}\")\n", - " else:\n", - " print(\" Model not found or not loaded.\")\n", - " print(f\"Checking Endpoint: {endpoint_id}\")\n", - " endpoint = 
endpoint_ids.get(endpoint_id)\n", - " if endpoint:\n", - " print(f\" Inference Endpoint ID: {endpoint['inference_id']}\")\n", - " print(f\" Task Type: {endpoint['task_type']}\")\n", - " else:\n", - " print(\" Endpoint not found or not ready.\")\n", - " print(\"------\")" - ] - }, - { - "cell_type": "markdown", - "id": "80506477", - "metadata": {}, - "source": [ - "### Create an inference pipeline\n", - "This function will create an ingest pipeline with inference processors to use `ELSER` (sparse_vector) and `e5_multilingual_small` (dense_vector) to infer against data that will be ingested in the pipeline. This allows us to automatically generate embeddings for the product descriptions when they are indexed into Elasticsearch." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6739f55b-6983-4b48-9349-6e0111b313fe", - "metadata": { - "id": "6739f55b-6983-4b48-9349-6e0111b313fe" - }, - "outputs": [], - "source": [ - "index_pipeline = \"ecommerce-pipeline\"\n", - "resp = client.ingest.put_pipeline(\n", - " id=index_pipeline,\n", - " processors=[\n", - " {\n", - " \"inference\": {\n", - " \"model_id\": elser_endpoint, # inference endpoint ID\n", - " \"input_output\": [\n", - " {\n", - " \"input_field\": \"description\", # source field\n", - " \"output_field\": \"elser_description_vector\", # destination vector field\n", - " }\n", - " ],\n", - " }\n", - " },\n", - " {\n", - " \"inference\": {\n", - " \"model_id\": e5_endpoint, # inference endpoint ID\n", - " \"input_output\": [\n", - " {\n", - " \"input_field\": \"description\", # source field\n", - " \"output_field\": \"e5_description_vector\", # destination vector field\n", - " }\n", - " ],\n", - " \"inference_config\": {\"text_embedding\": {}},\n", - " }\n", - " },\n", - " ],\n", - ")\n", - "\n", - "print(f\"ecommerce-pipeline created: {resp['acknowledged']}\")" - ] - }, - { - "cell_type": "markdown", - "id": "QUQ1nCaiKIQr", - "metadata": { - "id": "QUQ1nCaiKIQr" - }, - "source": [ - "## Index 
documents\n", - "The `ecommerce-search` index we are creating will include fields to support dense and sparse vector storage and search. \n", - "\n", - "We define the `e5_description_vector` and the `elser_description_vector` fields to store the inference pipeline results. \n", - "\n", - "The field type in `e5_description_vector` is a `dense_vector`. The `.e5_multilingual_small` model has an embedding size of 384, so the dimension of the vector (dims) is set to 384. \n", - "\n", - "We also add an `elser_description_vector` field type to support the `sparse_vector` output from our `.elser_model_2_linux-x86_64` model. No further configuration is needed for this field for our use case." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418", - "metadata": { - "id": "9b53b39e-d74e-4fa8-a364-e2c3caf37418" - }, - "outputs": [], - "source": [ - "# define the index name and mapping\n", - "commerce_index = \"ecommerce-search\"\n", - "mappings = {\n", - " \"properties\": {\n", - " \"product\": {\n", - " \"type\": \"text\",\n", - " },\n", - " \"description\": {\n", - " \"type\": \"text\",\n", - " },\n", - " \"category\": {\n", - " \"type\": \"text\",\n", - " },\n", - " \"elser_description_vector\": {\"type\": \"sparse_vector\"},\n", - " \"e5_description_vector\": {\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 384,\n", - " \"index\": \"true\",\n", - " \"similarity\": \"cosine\",\n", - " },\n", - " \"e5_semantic_description_vector\": {\n", - " \"type\": \"semantic_text\",\n", - " \"inference_id\": e5_endpoint,\n", - " },\n", - " \"elser_semantic_description_vector\": {\"type\": \"semantic_text\"},\n", - " }\n", - "}\n", - "\n", - "\n", - "if client.indices.exists(index=commerce_index):\n", - " client.indices.delete(index=commerce_index)\n", - "resp = client.indices.create(\n", - " index=commerce_index,\n", - " mappings=mappings,\n", - ")\n", - "\n", - "print(f\"Index {commerce_index} created: 
{resp['acknowledged']}\")" - ] - }, - { - "cell_type": "markdown", - "id": "88db9926", - "metadata": {}, - "source": [ - "### Attach Pipeline to Index\n", - "Lets connect our pipeline to the index. This updates the settings of our index to use the pipeline we previously defined as the default.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c4830b74", - "metadata": {}, - "outputs": [], - "source": [ - "resp = client.indices.put_settings(\n", - " index=commerce_index,\n", - " body={\"default_pipeline\": index_pipeline},\n", - ")\n", - "print(f\"Pipeline set for {commerce_index}: {resp['acknowledged']}\")" - ] - }, - { - "cell_type": "markdown", - "id": "Vo-LKu8TOT5j", - "metadata": { - "id": "Vo-LKu8TOT5j" - }, - "source": [ - "### Load documents\n", - "\n", - "We load the contents of`products-ecommerce.json` into the `ecommerce-search` index. We will use the `bulk` helper function to efficiently index our documents en masse. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba", - "metadata": { - "id": "3cfdc3b7-7e4f-4111-997b-c333ac8938ba" - }, - "outputs": [], - "source": [ - "# Load the dataset\n", - "with open(\"products-ecommerce.json\", \"r\") as f:\n", - " data_json = json.load(f)\n", - "\n", - "\n", - "# helper function to create bulk indexing body\n", - "def create_index_body(doc):\n", - " doc[\"elser_semantic_description_vector\"] = doc[\"description\"]\n", - " doc[\"e5_semantic_description_vector\"] = doc[\"description\"]\n", - "\n", - " return {\n", - " \"_index\": \"ecommerce-search\",\n", - " \"_source\": doc,\n", - " }\n", - "\n", - "\n", - "# prepare the documents array payload\n", - "documents = [create_index_body(doc) for doc in data_json]\n", - "\n", - "# use bulk function to index\n", - "try:\n", - " print(\"Indexing documents...\")\n", - " resp = bulk(client, documents)\n", - " print(f\"Documents indexed successfully: {resp[0]}\")\n", - "except Exception as 
e:\n", - " print(f\"Error indexing documents: {e}\")" - ] - }, - { - "cell_type": "markdown", - "id": "-qUXNuOvPDsI", - "metadata": { - "id": "-qUXNuOvPDsI" - }, - "source": [ - "## Text Analysis\n", - "The classic way documents are ranked for relevance by Elasticsearch based on a text query uses the Lucene implementation of the [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) model, a **sparse model for lexical search**. This method follows the traditional approach for text search, looking for exact term matches.\n", - "\n", - "To make this search possible, Elasticsearch converts **text field** data into a searchable format by performing text analysis.\n", - "\n", - "**Text analysis** is performed by an [analyzer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analyzer-anatomy.html), a set of rules to govern the process of extracting relevant tokens for searching. An analyzer must have exactly one [tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenizers.html). The tokenizer receives a stream of characters and breaks it up into individual tokens (usually individual words.) \n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "5f51e460", - "metadata": {}, - "source": [ - "### Standard Analyzer\n", - "In the example below we are using the default analyzer, the standard analyzer, which works well for most use cases as it provides English grammar based tokenization. Tokenization enables matching on individual terms, but each token is still matched literally." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039", - "metadata": { - "id": "55b602d1-f1e4-4b70-9273-5fc701ac9039" - }, - "outputs": [], - "source": [ - "# Define the text to be analyzed\n", - "text = \"Comfortable furniture for a large balcony\"\n", - "\n", - "# Define the analyze request\n", - "request_body = {\"analyzer\": \"standard\", \"text\": text} # Stop Analyzer\n", - "\n", - "# Perform the analyze request\n", - "resp = client.indices.analyze(\n", - " analyzer=request_body[\"analyzer\"], text=request_body[\"text\"]\n", - ")\n", - "\n", - "# Extract and display the analyzed tokens\n", - "standard_tokens = [token[\"token\"] for token in resp[\"tokens\"]]\n", - "print(\"Standard-analyzed Tokens:\", standard_tokens)" - ] - }, - { - "cell_type": "markdown", - "id": "fb75f526", - "metadata": {}, - "source": [ - "### Stop Analyzer\n", - "If you want to personalize your search experience you can choose a different built-in analyzer. For example, by updating the code to use the stop analyzer it will break the text into tokens at any non-letter character with support for removing stop words." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e3fdcff", - "metadata": {}, - "outputs": [], - "source": [ - "# Define the analyze request\n", - "request_body = {\"analyzer\": \"stop\", \"text\": text}\n", - "\n", - "# Perform the analyze request\n", - "response = client.indices.analyze(\n", - " analyzer=request_body[\"analyzer\"], text=request_body[\"text\"]\n", - ")\n", - "\n", - "# Extract and display the analyzed tokens\n", - "stop_tokens = [token[\"token\"] for token in response[\"tokens\"]]\n", - "print(\"Stop-analyzed Tokens:\", stop_tokens)" - ] - }, - { - "cell_type": "markdown", - "id": "aba7fad6", - "metadata": {}, - "source": [ - "### Custom Analyzer\n", - "When the built-in analyzers do not fulfill your needs, you can create a [custom analyzer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-custom-analyzer.html)\n", - "], which uses the appropriate combination of zero or more [character filters](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-charfilters.html), a [tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenizers.html) and zero or more [token filters](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenfilters.html).\n", - "\n", - "In the below example that combines a tokenizer and token filters, the text will be lowercased by the [lowercase filter](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lowercase-tokenfilter.html) before being processed by the [synonyms token filter](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html).\n", - "\n", - "> Note: you cannot pass a custom analyzer definition inline to analyze. Define the analyzer in your index settings, then reference it by name in the analyze call. For this reason we will create a temporary index to store the analyzer." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d44f3e2b", - "metadata": {}, - "outputs": [], - "source": [ - "index_settings = {\n", - " \"settings\": {\n", - " \"analysis\": {\n", - " \"analyzer\": {\n", - " \"my_custom_analyzer\": {\n", - " \"type\": \"custom\",\n", - " \"tokenizer\": \"standard\",\n", - " \"char_filter\": [\"html_strip\"],\n", - " \"filter\": [\"lowercase\", \"asciifolding\"],\n", - " }\n", - " }\n", - " }\n", - " }\n", - "}\n", - "\n", - "custom_text = \"Čōmføřțǎble Fůrñíturę Fòr â ľarğe Bałcony\"\n", - "\n", - "# Create a temporary index with the custom analyzer\n", - "client.indices.create(index=\"temporary_index\", body=index_settings)\n", - "\n", - "# Perform the analyze request\n", - "resp = client.indices.analyze(\n", - " index=\"temporary_index\", analyzer=\"my_custom_analyzer\", text=custom_text\n", - ")\n", - "\n", - "# Extract and display the analyzed tokens\n", - "custom_tokens = [token[\"token\"] for token in resp[\"tokens\"]]\n", - "print(\"Custom Tokens:\", custom_tokens)\n", - "\n", - "# Delete the temporary index\n", - "client.indices.delete(index=\"temporary_index\")" - ] - }, - { - "cell_type": "markdown", - "id": "432620b6", - "metadata": {}, - "source": [ - "### Text Analysis Results\n", - "In the table below, we can observe that analyzers both included with Elasticsearch and custom made may be included with your search requests to improve the quality of your search results by reducing or refining the content being searched. Attention should be paid to your particular use case and the needs of your users." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9c5d11cb", - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Standard Token Analyzer\")\n", - "print(f\"Before: \\n{text}\")\n", - "print(f\"After: \\n{standard_tokens}\")\n", - "print(\"===================\")\n", - "print(\"Stop Token Analyzer\")\n", - "print(f\"Before: \\n{text}\")\n", - "print(f\"After: \\n{stop_tokens}\")\n", - "print(\"===================\")\n", - "print(\"Custom Token Analyzer\")\n", - "print(f\"Before: \\n{custom_text}\")\n", - "print(f\"After: \\n{custom_tokens}\")" - ] - }, - { - "cell_type": "markdown", - "id": "db4f86e3", - "metadata": {}, - "source": [ - "## Search \n", - "The remainder of this notebook will cover the following search types:\n", - "\n", - "\n", - "- Lexical Search\n", - "- Semantic Search \n", - " - ELSER Semantic Search (Sparse Vector)\n", - " - E5 Semantic Search (Dense Vector)\n", - "- Hybrid Search\n" - ] - }, - { - "cell_type": "markdown", - "id": "8G8MKcUvP0zs", - "metadata": { - "id": "8G8MKcUvP0zs" - }, - "source": [ - "## Lexical Search\n", - "Our first search will be a straightforward BM25 text search within the description field. We are storing all of our results in a results_list for a final comparison at the end of the notebook. A convenience function to display the results is also defined." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f", - "metadata": { - "id": "f4984f6c-ceec-46a4-b64c-f749e6b1b04f" - }, - "outputs": [], - "source": [ - "results_list = []\n", - "\n", - "\n", - "def print_search_results(search_results):\n", - " if not search_results:\n", - " print(\"No matches found\")\n", - " else:\n", - " for hit in search_results:\n", - " score = hit[\"_score\"]\n", - " product = hit[\"_source\"][\"product\"]\n", - " category = hit[\"_source\"][\"category\"]\n", - " description = hit[\"_source\"][\"description\"]\n", - " print(\n", - " f\"\\nScore: {score}\\nProduct: {product}\\nCategory: {category}\\nDescription: {description}\\n\"\n", - " )\n", - "\n", - "\n", - "# Regular BM25 (Lexical) Search\n", - "resp = client.search(\n", - " size=2,\n", - " index=\"ecommerce-search\",\n", - " query={\n", - " \"match\": {\n", - " \"description\": {\n", - " \"query\": \"Comfortable furniture for a large balcony\",\n", - " \"analyzer\": \"stop\",\n", - " }\n", - " }\n", - " },\n", - " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", - ")\n", - "\n", - "lexical_search_results = resp[\"hits\"][\"hits\"]\n", - "results_list.append({\"lexical_search\": lexical_search_results})\n", - "print_search_results(lexical_search_results)" - ] - }, - { - "cell_type": "markdown", - "id": "xiywcf_-P39a", - "metadata": { - "id": "xiywcf_-P39a" - }, - "source": [ - "## Semantic Search with Dense Vector" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "72187c9a-14c1-4084-a080-4e5c1e614f22", - "metadata": { - "id": "72187c9a-14c1-4084-a080-4e5c1e614f22" - }, - "outputs": [], - "source": [ - "# KNN\n", - "# TODO: Add Semantic_Text type?\n", - "response = client.search(\n", - " index=\"ecommerce-search\",\n", - " size=2,\n", - " knn={\n", - " \"field\": \"e5_description_vector\",\n", - " \"k\": 50, # Number of nearest neighbors to return as top hits.\n", - " 
\"num_candidates\": 500, # Number of nearest neighbor candidates to consider per shard. Increasing num_candidates tends to improve the accuracy of the final k results.\n", - " \"query_vector_builder\": { # Object indicating how to build a query_vector. kNN search enables you to perform semantic search by using a previously deployed text embedding model.\n", - " \"text_embedding\": {\n", - " \"model_id\": \".multilingual-e5-small-elasticsearch\", # Text embedding model id\n", - " \"model_text\": \"Comfortable furniture for a large balcony\", # Query\n", - " }\n", - " },\n", - " },\n", - " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", - ")\n", - "\n", - "dense_semantic_search_results = response[\"hits\"][\"hits\"]\n", - "results_list.append({\"dense_semantic_search\": dense_semantic_search_results})\n", - "print_search_results(dense_semantic_search_results)" - ] - }, - { - "cell_type": "markdown", - "id": "QlWFdngRQFbv", - "metadata": { - "id": "QlWFdngRQFbv" - }, - "source": [ - "## Semantic Search with Sparse Vector" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5475e21", - "metadata": {}, - "outputs": [], - "source": [ - "# Elastic Learned Sparse Encoder - ELSER\n", - "\n", - "resp = client.search(\n", - " index=\"ecommerce-search\",\n", - " size=2,\n", - " query={\n", - " \"sparse_vector\": {\n", - " \"field\": \"elser_description_vector\",\n", - " \"inference_id\": \".elser-2-elasticsearch\",\n", - " \"query\": \"Comfortable furniture for a large balcony\",\n", - " }\n", - " },\n", - " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", - ")\n", - "\n", - "\n", - "sparse_semantic_search_results = resp[\"hits\"][\"hits\"]\n", - "results_list.append({\"sparse_semantic_search\": sparse_semantic_search_results})\n", - "print_search_results(sparse_semantic_search_results)" - ] - }, - { - "cell_type": "markdown", - "id": "3a2a5267", - "metadata": {}, - "source": [ - 
"## Semantic Search with `semantic_text` Type (ELSER)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d2fb926", - "metadata": {}, - "outputs": [], - "source": [ - "# Elastic Learned Sparse Encoder - ELSER\n", - "\n", - "resp = client.search(\n", - " index=\"ecommerce-search\",\n", - " size=2,\n", - " query={\n", - " \"semantic\": {\n", - " \"field\": \"elser_semantic_description_vector\",\n", - " \"query\": \"Comfortable furniture for a large balcony\",\n", - " }\n", - " },\n", - " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", - ")\n", - "\n", - "elser_semantic_text_search_results = resp[\"hits\"][\"hits\"]\n", - "results_list.append({\"elser_semantic_text_search\": sparse_semantic_search_results})\n", - "print_search_results(elser_semantic_text_search_results)" - ] - }, - { - "cell_type": "markdown", - "id": "1df079f3", - "metadata": {}, - "source": [ - "## Semantic Search with `semantic_text` Type (e5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5", - "metadata": { - "id": "2c0bf5fc-ab32-4f33-8f26-904ff10635a5" - }, - "outputs": [], - "source": [ - "# Elastic Learned Sparse Encoder - ELSER\n", - "\n", - "resp = client.search(\n", - " index=\"ecommerce-search\",\n", - " size=2,\n", - " query={\n", - " \"semantic\": {\n", - " \"field\": \"e5_semantic_description_vector\",\n", - " \"query\": \"Comfortable furniture for a large balcony\",\n", - " }\n", - " },\n", - " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", - ")\n", - "\n", - "e5_semantic_text_search_results = resp[\"hits\"][\"hits\"]\n", - "results_list.append({\"e5_semantic_text_search\": e5_semantic_text_search_results})\n", - "print_search_results(e5_semantic_text_search_results)" - ] - }, - { - "cell_type": "markdown", - "id": "kz9deDBYQJxr", - "metadata": { - "id": "kz9deDBYQJxr" - }, - "source": [ - "## Hybrid Search - BM25 + Dense Vector 
linear combination" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f84aa16b-49c5-4abf-a049-d556c225542e", - "metadata": { - "id": "f84aa16b-49c5-4abf-a049-d556c225542e" - }, - "outputs": [], - "source": [ - "# BM25 + KNN (Linear Combination)\n", - "query = \"A dining table and comfortable chairs for a large balcony\"\n", - "resp = client.search(\n", - " index=\"ecommerce-search\",\n", - " size=2,\n", - " query={\n", - " \"bool\": {\n", - " \"should\": [\n", - " {\n", - " \"match\": {\n", - " \"description\": {\n", - " \"query\": query,\n", - " \"boost\": 1,\n", - " }\n", - " }\n", - " }\n", - " ]\n", - " }\n", - " },\n", - " knn={\n", - " \"field\": \"e5_description_vector\",\n", - " \"k\": 2,\n", - " \"num_candidates\": 20,\n", - " \"boost\": 1,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \".multilingual-e5-small-elasticsearch\",\n", - " \"model_text\": query,\n", - " }\n", - " },\n", - " },\n", - " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", - ")\n", - "\n", - "dense_linear_search_results = resp[\"hits\"][\"hits\"]\n", - "results_list.append({\"dense_linear_search\": dense_linear_search_results})\n", - "print_search_results(dense_linear_search_results)" - ] - }, - { - "cell_type": "markdown", - "id": "cybkWjmpQV8g", - "metadata": { - "id": "cybkWjmpQV8g" - }, - "source": [ - "## Hybrid Search - BM25 + Dense Vector Reverse Reciprocal Fusion (RRF)\n", - "\n", - "[Reciprocal rank fusion](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/reciprocal-rank-fusion) (RRF) is a method for combining multiple result sets with different relevance indicators into a single result set. RRF requires no tuning, and the different relevance indicators do not have to be related to each other to achieve high-quality results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861", - "metadata": { - "id": "aa2e072d-37bb-43fd-a83f-e1cb55a24861" - }, - "outputs": [], - "source": [ - "# BM25 + KNN (RRF)\n", - "top_k = 2\n", - "resp = client.search(\n", - " index=\"ecommerce-search\",\n", - " retriever={\n", - " \"rrf\": {\n", - " \"retrievers\": [\n", - " {\n", - " \"standard\": {\n", - " \"query\": {\n", - " \"match\": {\n", - " \"description\": \"A dining table and comfortable chairs for a large balcony\"\n", - " }\n", - " }\n", - " }\n", - " },\n", - " {\n", - " \"knn\": {\n", - " \"field\": \"e5_description_vector\",\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": e5_endpoint,\n", - " \"model_text\": \"A dining table and comfortable chairs for a large balcony\",\n", - " }\n", - " },\n", - " \"k\": 2,\n", - " \"num_candidates\": 20,\n", - " }\n", - " },\n", - " ],\n", - " \"rank_window_size\": 2,\n", - " \"rank_constant\": 20,\n", - " }\n", - " },\n", - " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", - ")\n", - "\n", - "dense_rrf_search_results = resp[\"hits\"][\"hits\"]\n", - "results_list.append({\"dense_rrf_search\": dense_rrf_search_results})\n", - "print_search_results(dense_rrf_search_results)" - ] - }, - { - "cell_type": "markdown", - "id": "LyKI2Z-XQbI6", - "metadata": { - "id": "LyKI2Z-XQbI6" - }, - "source": [ - "## Hybrid Search - BM25 + Sparse Vector linear combination" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd842732-b20a-4c7a-b735-e1f558a9b922", - "metadata": { - "id": "bd842732-b20a-4c7a-b735-e1f558a9b922" - }, - "outputs": [], - "source": [ - "# BM25 + Elastic Learned Sparse Encoder (Linear Combination)\n", - "\n", - "resp = client.search(\n", - " index=\"ecommerce-search\",\n", - " size=2,\n", - " query={\n", - " \"bool\": {\n", - " \"should\": [\n", - " {\n", - " \"match\": {\n", - " \"description\": {\n", 
- " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", - " \"boost\": 1, # You can adjust the boost value\n", - " }\n", - " }\n", - " },\n", - " {\n", - " \"sparse_vector\": {\n", - " \"field\": \"elser_description_vector\",\n", - " \"inference_id\": elser_endpoint,\n", - " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", - " }\n", - " },\n", - " ]\n", - " }\n", - " },\n", - " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", - ")\n", - "\n", - "sparse_linear_search_results = resp[\"hits\"][\"hits\"]\n", - "results_list.append({\"sparse_linear_search\": sparse_linear_search_results})\n", - "print_search_results(sparse_linear_search_results)" - ] - }, - { - "cell_type": "markdown", - "id": "e3d5e4e9", - "metadata": {}, - "source": [ - "## Hybrid Search - BM25 + Sparse Vector Reciprocal Rank Fusion (RRF)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "199c5c60", - "metadata": {}, - "outputs": [], - "source": [ - "# BM25 + ELSER (RRF)\n", - "top_k = 2\n", - "resp = client.search(\n", - " index=\"ecommerce-search\",\n", - " retriever={\n", - " \"rrf\": {\n", - " \"retrievers\": [\n", - " {\n", - " \"standard\": {\n", - " \"query\": {\n", - " \"match\": {\n", - " \"description\": \"A dining table and comfortable chairs for a large balcony\"\n", - " }\n", - " }\n", - " }\n", - " },\n", - " {\n", - " \"standard\": {\n", - " \"query\": {\n", - " \"sparse_vector\": {\n", - " \"field\": \"elser_description_vector\",\n", - " \"inference_id\": elser_endpoint,\n", - " \"query\": \"A dining table and comfortable chairs for a large balcony\",\n", - " }\n", - " }\n", - " }\n", - " },\n", - " ],\n", - " \"rank_window_size\": 2,\n", - " \"rank_constant\": 20,\n", - " }\n", - " },\n", - " source_excludes=[\"*_description_vector\"], # Exclude vector fields from response\n", - ")\n", - "\n", - "sparse_rrf_search_results = resp[\"hits\"][\"hits\"]\n", - 
"results_list.append({\"sparse_rrf_search\": sparse_rrf_search_results})\n", - "print_search_results(sparse_rrf_search_results)" - ] - }, - { - "cell_type": "markdown", - "id": "7b95f9b8", - "metadata": {}, - "source": [ - "TODO: \n", - "- Semantic Text / Query BUilder (ask Serena)\n", - "- Table of Results\n", - "- Conclusion\n", - "- Next steps\n", - "\n", - "\n", - "\n", - "## Compiled Results\n", - "Here are the results of the previous searches. We can see that all of the results return approximately the same the products." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1162a857", - "metadata": {}, - "outputs": [], - "source": [ - "# Flatten results for each search type, preserving insertion order\n", - "rows = []\n", - "for result in results_list:\n", - " search_type = list(result.keys())[0]\n", - " for doc in result[search_type]:\n", - " row = {\n", - " \"search_type\": search_type,\n", - " \"product\": doc[\"_source\"].get(\"product\"),\n", - " \"category\": doc[\"_source\"].get(\"category\"),\n", - " \"description\": doc[\"_source\"].get(\"description\"),\n", - " \"score\": doc.get(\"_score\"),\n", - " }\n", - " rows.append(row)\n", - "\n", - "# Create DataFrame without altering row order\n", - "df = pd.DataFrame(rows)\n", - "\n", - "# Get the unique search_types in order of appearance\n", - "ordered_search_types = []\n", - "for row in rows:\n", - " st = row[\"search_type\"]\n", - " if st not in ordered_search_types:\n", - " ordered_search_types.append(st)\n", - "\n", - "for search_type in ordered_search_types:\n", - " group = df[df[\"search_type\"] == search_type]\n", - " display(Markdown(f\"### {search_type.replace('_', ' ').title()}\"))\n", - " styled = (\n", - " group.drop(columns=\"search_type\")\n", - " .reset_index(drop=True)\n", - " .style.set_properties(\n", - " subset=[\"description\"],\n", - " **{\"white-space\": \"pre-wrap\", \"word-break\": \"break-word\"},\n", - " )\n", - " .hide(axis=\"index\") # For pandas 
>=1.4.0\n", - " )\n", - " display(styled)" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}