Update with new ES queries for completeness

qn895 · qn895 · commit 4ef9d83c3964 · 2025-10-08T10:46:52.000-05:00
diff --git a/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb
@@ -189,6 +189,233 @@
     "\n",
     "print(\"Indexing complete!\")"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we are going to create a pipeline that vectorizes the `description` field by running it through our text embedding inference model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Processor 1: stage the text to embed, prefixed with \"passage: \", in a scratch field.\n",
+    "prefix_description = {\n",
+    "    \"set\": {\"field\": \"temp_desc\", \"value\": \"passage: {{description}}\"}\n",
+    "}\n",
+    "\n",
+    "# Processor 2: feed the scratch field to the embedding model as `text_field`\n",
+    "# and store the inference output under `vector_description`.\n",
+    "embed_description = {\n",
+    "    \"inference\": {\n",
+    "        \"field_map\": {\"temp_desc\": \"text_field\"},\n",
+    "        \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n",
+    "        \"target_field\": \"vector_description\",\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "# Processor 3: drop the scratch field so it is not stored with the document.\n",
+    "drop_scratch_field = {\"remove\": {\"field\": \"temp_desc\"}}\n",
+    "\n",
+    "pipeline_body = {\n",
+    "    \"description\": \"Pipeline to run the descriptions text_field through our inference text embedding model\",\n",
+    "    \"processors\": [prefix_description, embed_description, drop_scratch_field],\n",
+    "}\n",
+    "\n",
+    "try:\n",
+    "    es.ingest.put_pipeline(id=\"vectorize_descriptions\", body=pipeline_body)\n",
+    "    print(\"Pipeline 'vectorize_descriptions' created successfully.\")\n",
+    "except Exception as e:\n",
+    "    print(f\"Error creating pipeline: {str(e)}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We also need to create a new Elasticsearch index with the specified vector mapping."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Field definitions for the destination index.\n",
+    "field_mappings = {\n",
+    "    \"description\": {\"type\": \"text\"},\n",
+    "    \"en\": {\"type\": \"text\"},\n",
+    "    \"image_url\": {\"type\": \"keyword\"},\n",
+    "    \"language\": {\"type\": \"keyword\"},\n",
+    "    # Vector field targeted by the kNN queries later in this notebook.\n",
+    "    \"vector_description.predicted_value\": {\n",
+    "        \"type\": \"dense_vector\",\n",
+    "        \"dims\": 384,\n",
+    "        \"index\": True,\n",
+    "        \"similarity\": \"cosine\",\n",
+    "        \"index_options\": {\"type\": \"bbq_hnsw\"},\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "index_body = {\"mappings\": {\"properties\": field_mappings}}\n",
+    "\n",
+    "try:\n",
+    "    es.indices.create(index=\"coco_multi\", body=index_body)\n",
+    "    print(\"Index 'coco_multi' created successfully.\")\n",
+    "except Exception as e:\n",
+    "    print(f\"Error creating index: {str(e)}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we just need to reindex the data through the pipeline, which copies the documents into the new Elasticsearch index and vectorizes them along the way."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reuse the `es` client that the earlier cells already use; creating a new\n",
+    "# argument-less Elasticsearch() here would discard its connection settings.\n",
+    "reindex_body = {\n",
+    "    \"source\": {\n",
+    "        \"index\": \"coco\"\n",
+    "    },\n",
+    "    \"dest\": {\n",
+    "        # Must be the index created with the dense_vector mapping above,\n",
+    "        # which is also the index the kNN queries below search.\n",
+    "        \"index\": \"coco_multi\",\n",
+    "        \"pipeline\": \"vectorize_descriptions\"\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "response = es.reindex(\n",
+    "    body=reindex_body,\n",
+    "    # Not waiting for completion here because this process might take a while;\n",
+    "    # the call returns task info instead of the final reindex result.\n",
+    "    wait_for_completion=False\n",
+    ")\n",
+    "\n",
+    "print(\"Reindex task started. Task info:\")\n",
+    "print(response)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Voilà, now let's try some queries and have some fun!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# kNN clause: the query text is embedded by the named model at search time.\n",
+    "knn_clause = {\n",
+    "    \"field\": \"vector_description.predicted_value\",\n",
+    "    \"k\": 10,\n",
+    "    \"num_candidates\": 100,\n",
+    "    \"query_vector_builder\": {\n",
+    "        \"text_embedding\": {\n",
+    "            \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n",
+    "            \"model_text\": \"query: kitty\",\n",
+    "        }\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "query_body = {\n",
+    "    \"size\": 10,\n",
+    "    # Only return the fields needed to inspect the matches.\n",
+    "    \"_source\": [\"description\", \"language\", \"en\"],\n",
+    "    \"knn\": knn_clause,\n",
+    "}\n",
+    "\n",
+    "response = es.search(index=\"coco_multi\", body=query_body)\n",
+    "print(response)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
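+   "source": [
+    "Next, let's try a more descriptive query, followed by the same idea in Korean to see cross-lingual search at work."
+   ]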
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# kNN clause for a longer, more descriptive query text.\n",
+    "knn_clause = {\n",
+    "    \"field\": \"vector_description.predicted_value\",\n",
+    "    \"k\": 50,\n",
+    "    \"num_candidates\": 1000,\n",
+    "    \"query_vector_builder\": {\n",
+    "        \"text_embedding\": {\n",
+    "            \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n",
+    "            \"model_text\": \"query: kitty lying on something\",\n",
+    "        }\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "query_body = {\n",
+    "    \"size\": 100,\n",
+    "    # Only return the fields needed to inspect the matches.\n",
+    "    \"_source\": [\"description\", \"language\", \"en\"],\n",
+    "    \"knn\": knn_clause,\n",
+    "}\n",
+    "\n",
+    "response = es.search(index=\"coco_multi\", body=query_body)\n",
+    "print(response)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# kNN clause with a Korean query text (\"고양이\" is Korean for \"cat\").\n",
+    "knn_clause = {\n",
+    "    \"field\": \"vector_description.predicted_value\",\n",
+    "    \"k\": 50,\n",
+    "    \"num_candidates\": 1000,\n",
+    "    \"query_vector_builder\": {\n",
+    "        \"text_embedding\": {\n",
+    "            \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n",
+    "            \"model_text\": \"query: 고양이\",\n",
+    "        }\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "query_body = {\n",
+    "    \"size\": 100,\n",
+    "    # Only return the fields needed to inspect the matches.\n",
+    "    \"_source\": [\"description\", \"language\", \"en\"],\n",
+    "    \"knn\": knn_clause,\n",
+    "}\n",
+    "\n",
+    "response = es.search(index=\"coco_multi\", body=query_body)\n",
+    "print(response)\n"
+   ]
   }
  ],
  "metadata": {