Skip to content

Commit 4ef9d83

Browse files
committed
Update with new ES queries for completeness
1 parent 3b8e5d2 commit 4ef9d83

File tree

1 file changed

+227
-0
lines changed

1 file changed

+227
-0
lines changed

supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,233 @@
189189
"\n",
190190
"print(\"Indexing complete!\")"
191191
]
192+
},
193+
{
194+
"cell_type": "markdown",
195+
"metadata": {},
196+
"source": [
197+
"Now we are going to create an ingest pipeline that runs each description through our text embedding inference model to vectorize it."
198+
]
199+
},
200+
{
201+
"cell_type": "code",
202+
"execution_count": null,
203+
"metadata": {},
204+
"outputs": [],
205+
"source": [
206+
"pipeline_body = {\n",
207+
" \"description\": \"Pipeline to run the descriptions text_field through our inference text embedding model\",\n",
208+
" \"processors\": [\n",
209+
" {\n",
210+
" \"set\": {\n",
211+
" \"field\": \"temp_desc\",\n",
212+
" \"value\": \"passage: {{description}}\"\n",
213+
" }\n",
214+
" },\n",
215+
" {\n",
216+
" \"inference\": {\n",
217+
" \"field_map\": {\n",
218+
" \"temp_desc\": \"text_field\"\n",
219+
" },\n",
220+
" \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n",
221+
" \"target_field\": \"vector_description\"\n",
222+
" }\n",
223+
" },\n",
224+
" {\n",
225+
" \"remove\": {\n",
226+
" \"field\": \"temp_desc\"\n",
227+
" }\n",
228+
" }\n",
229+
" ]\n",
230+
"}\n",
231+
"\n",
232+
"try:\n",
233+
" es.ingest.put_pipeline(id=\"vectorize_descriptions\", body=pipeline_body)\n",
234+
" print(\"Pipeline 'vectorize_descriptions' created successfully.\")\n",
235+
"except Exception as e:\n",
236+
" print(f\"Error creating pipeline: {str(e)}\")\n"
237+
]
238+
},
239+
{
240+
"cell_type": "markdown",
241+
"metadata": {},
242+
"source": [
243+
"We also need to create a new Elasticsearch index with the specified vector mapping."
244+
]
245+
},
246+
{
247+
"cell_type": "code",
248+
"execution_count": null,
249+
"metadata": {},
250+
"outputs": [],
251+
"source": [
252+
"index_body = {\n",
253+
" \"mappings\": {\n",
254+
" \"properties\": {\n",
255+
" \"description\": {\n",
256+
" \"type\": \"text\"\n",
257+
" },\n",
258+
" \"en\": {\n",
259+
" \"type\": \"text\"\n",
260+
" },\n",
261+
" \"image_url\": {\n",
262+
" \"type\": \"keyword\"\n",
263+
" },\n",
264+
" \"language\": {\n",
265+
" \"type\": \"keyword\"\n",
266+
" },\n",
267+
" \"vector_description.predicted_value\": {\n",
268+
" \"type\": \"dense_vector\",\n",
269+
" \"dims\": 384,\n",
270+
" \"index\": True,\n",
271+
" \"similarity\": \"cosine\",\n",
272+
" \"index_options\": {\n",
273+
" \"type\": \"bbq_hnsw\"\n",
274+
" }\n",
275+
" }\n",
276+
" }\n",
277+
" }\n",
278+
"}\n",
279+
"\n",
280+
"try:\n",
281+
" es.indices.create(index=\"coco_multi\", body=index_body)\n",
282+
" print(\"Index 'coco_multi' created successfully.\")\n",
283+
"except Exception as e:\n",
284+
" print(f\"Error creating index: {str(e)}\")\n"
285+
]
286+
},
287+
{
288+
"cell_type": "markdown",
289+
"metadata": {},
290+
"source": [
291+
"Now we just need to reindex the data through the pipeline, which vectorizes each description as it is copied into the new Elasticsearch index."
292+
]
293+
},
294+
{
295+
"cell_type": "code",
296+
"execution_count": null,
297+
"metadata": {},
298+
"outputs": [],
299+
"source": [
300+
"# Reuse the `es` client created earlier in the notebook: rebinding it to a bare\n",
301+
"# Elasticsearch() would drop that client and pass no connection settings.\n",
302+
"# The destination must be `coco_multi`, the index created above with the dense_vector mapping.\n",
303+
"\n",
304+
"reindex_body = {\n",
305+
"    \"source\": {\n",
306+
"        \"index\": \"coco\"\n",
307+
"    },\n",
308+
"    \"dest\": {\n",
309+
"        \"index\": \"coco_multi\",\n",
310+
"        \"pipeline\": \"vectorize_descriptions\"\n",
311+
"    }\n",
312+
"}\n",
313+
"\n",
314+
"response = es.reindex(\n",
315+
"    body=reindex_body,\n",
316+
"    # Do not wait for completion: embedding every document can take a while,\n",
317+
"    # so the call returns a task reference instead of the final result.\n",
318+
"    wait_for_completion=False\n",
319+
")\n",
320+
"\n",
321+
"print(\"Reindex task started. Task info:\")\n",
322+
"print(response)\n"
322+
]
323+
},
324+
{
325+
"cell_type": "markdown",
326+
"metadata": {},
327+
"source": [
328+
"Voilà, now let's try some queries and have some fun!"
329+
]
330+
},
331+
{
332+
"cell_type": "code",
333+
"execution_count": null,
334+
"metadata": {},
335+
"outputs": [],
336+
"source": [
337+
"query_body = {\n",
338+
" \"size\": 10,\n",
339+
" \"_source\": [\n",
340+
" \"description\", \"language\", \"en\"\n",
341+
" ],\n",
342+
" \"knn\": {\n",
343+
" \"field\": \"vector_description.predicted_value\",\n",
344+
" \"k\": 10,\n",
345+
" \"num_candidates\": 100,\n",
346+
" \"query_vector_builder\": {\n",
347+
" \"text_embedding\": {\n",
348+
" \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n",
349+
" \"model_text\": \"query: kitty\"\n",
350+
" }\n",
351+
" }\n",
352+
" }\n",
353+
"}\n",
354+
"\n",
355+
"response = es.search(index=\"coco_multi\", body=query_body)\n",
356+
"print(response)\n"
357+
]
358+
},
359+
{
360+
"cell_type": "markdown",
361+
"metadata": {},
362+
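"source": ["Let's make the query more descriptive, and then search for the same concept in Korean to see the multilingual embeddings at work."]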
363+
},
364+
{
365+
"cell_type": "code",
366+
"execution_count": null,
367+
"metadata": {},
368+
"outputs": [],
369+
"source": [
370+
"query_body = {\n",
371+
" \"size\": 100,\n",
372+
" \"_source\": [\n",
373+
" \"description\", \"language\", \"en\"\n",
374+
" ],\n",
375+
" \"knn\": {\n",
376+
" \"field\": \"vector_description.predicted_value\",\n",
377+
" \"k\": 50,\n",
378+
" \"num_candidates\": 1000,\n",
379+
" \"query_vector_builder\": {\n",
380+
" \"text_embedding\": {\n",
381+
" \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n",
382+
" \"model_text\": \"query: kitty lying on something\"\n",
383+
" }\n",
384+
" }\n",
385+
" }\n",
386+
"}\n",
387+
"\n",
388+
"response = es.search(index=\"coco_multi\", body=query_body)\n",
389+
"print(response)\n"
390+
]
391+
},
392+
{
393+
"cell_type": "code",
394+
"execution_count": null,
395+
"metadata": {},
396+
"outputs": [],
397+
"source": [
398+
"query_body = {\n",
399+
" \"size\": 100,\n",
400+
" \"_source\": [\n",
401+
" \"description\", \"language\", \"en\"\n",
402+
" ],\n",
403+
" \"knn\": {\n",
404+
" \"field\": \"vector_description.predicted_value\",\n",
405+
" \"k\": 50,\n",
406+
" \"num_candidates\": 1000,\n",
407+
" \"query_vector_builder\": {\n",
408+
" \"text_embedding\": {\n",
409+
" \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n",
410+
" \"model_text\": \"query: 고양이\"\n",
411+
" }\n",
412+
" }\n",
413+
" }\n",
414+
"}\n",
415+
"\n",
416+
"response = es.search(index=\"coco_multi\", body=query_body)\n",
417+
"print(response)\n"
418+
]
192419
}
193420
],
194421
"metadata": {

0 commit comments

Comments
 (0)