Skip to content

Commit 5d74fcd

Browse files
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 7acee94 commit 5d74fcd

File tree

4 files changed

+102
-93
lines changed

4 files changed

+102
-93
lines changed

engine/clients/opensearch/configure.py

+14-11
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
from opensearchpy import NotFoundError, OpenSearch
1+
from opensearchpy import OpenSearch
22

33
from benchmark.dataset import Dataset
44
from engine.base_client.configure import BaseConfigurator
@@ -40,26 +40,29 @@ def __init__(self, host, collection_params: dict, connection_params: dict):
4040
)
4141

4242
def clean(self):
43-
is_index_available = self.client.indices.exists(index=OPENSEARCH_INDEX,
43+
is_index_available = self.client.indices.exists(
44+
index=OPENSEARCH_INDEX,
4445
params={
4546
"timeout": 300,
46-
})
47-
if(is_index_available):
47+
},
48+
)
49+
if is_index_available:
4850
print(f"Deleting index: {OPENSEARCH_INDEX}, as it is already present")
4951
self.client.indices.delete(
5052
index=OPENSEARCH_INDEX,
5153
params={
5254
"timeout": 300,
5355
},
5456
)
55-
5657

5758
def recreate(self, dataset: Dataset, collection_params):
5859
self._update_cluster_settings()
5960
distance = self.DISTANCE_MAPPING[dataset.config.distance]
6061
if dataset.config.distance == Distance.COSINE:
6162
distance = self.DISTANCE_MAPPING[Distance.DOT]
62-
print(f"Using distance type: {distance} as dataset distance is : {dataset.config.distance}")
63+
print(
64+
f"Using distance type: {distance} as dataset distance is : {dataset.config.distance}"
65+
)
6366

6467
self.client.indices.create(
6568
index=OPENSEARCH_INDEX,
@@ -70,7 +73,7 @@ def recreate(self, dataset: Dataset, collection_params):
7073
"refresh_interval": -1,
7174
"number_of_replicas": 0,
7275
"number_of_shards": 1,
73-
"knn.advanced.approximate_threshold": "-1"
76+
"knn.advanced.approximate_threshold": "-1",
7477
}
7578
},
7679
"mappings": {
@@ -83,7 +86,7 @@ def recreate(self, dataset: Dataset, collection_params):
8386
"name": "hnsw",
8487
"engine": "faiss",
8588
"space_type": distance,
86-
**collection_params.get("method")
89+
**collection_params.get("method"),
8790
},
8891
},
8992
},
@@ -102,8 +105,8 @@ def _update_cluster_settings(self):
102105
index_thread_qty = get_index_thread_qty(self.client)
103106
cluster_settings_body = {
104107
"persistent": {
105-
"knn.memory.circuit_breaker.limit": "75%", # putting a higher value to ensure that even with small cluster the latencies for vector search are good
106-
"knn.algo_param.index_thread_qty": index_thread_qty
108+
"knn.memory.circuit_breaker.limit": "75%", # putting a higher value to ensure that even with small cluster the latencies for vector search are good
109+
"knn.algo_param.index_thread_qty": index_thread_qty,
107110
}
108111
}
109112
self.client.cluster.put_settings(cluster_settings_body)
@@ -118,7 +121,7 @@ def _prepare_fields_config(self, dataset: Dataset):
118121
}
119122
for field_name, field_type in dataset.config.schema.items()
120123
}
121-
124+
122125
def execution_params(self, distance, vector_size) -> dict:
123126
# normalize the vectors if cosine similarity is there.
124127
if distance == Distance.COSINE:

engine/clients/opensearch/search.py

+8-8
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
from opensearchpy import OpenSearch
66

77
from dataset_reader.base_reader import Query
8-
from engine.base_client.distances import Distance
98
from engine.base_client.search import BaseSearcher
109
from engine.clients.opensearch.config import (
1110
OPENSEARCH_INDEX,
@@ -14,7 +13,6 @@
1413
OPENSEARCH_USER,
1514
)
1615
from engine.clients.opensearch.parser import OpenSearchConditionParser
17-
import numpy as np
1816

1917

2018
class ClosableOpenSearch(OpenSearch):
@@ -56,9 +54,11 @@ def search_one(cls, query: Query, top: int) -> List[Tuple[int, float]]:
5654
"vector": {
5755
"vector": query.vector,
5856
"k": top,
59-
"method_parameters" : {
60-
"ef_search": cls.search_params["config"]["ef_search"] # ef_search parameter is added in the query time
61-
}
57+
"method_parameters": {
58+
"ef_search": cls.search_params["config"][
59+
"ef_search"
60+
] # ef_search parameter is added in the query time
61+
},
6262
}
6363
}
6464
}
@@ -80,7 +80,7 @@ def search_one(cls, query: Query, top: int) -> List[Tuple[int, float]]:
8080
docvalue_fields=["_id"],
8181
stored_fields="_none_",
8282
)
83-
83+
8484
return [
8585
(uuid.UUID(hex=hit["fields"]["_id"][0]).int, hit["_score"])
8686
for hit in res["hits"]["hits"]
@@ -89,5 +89,5 @@ def search_one(cls, query: Query, top: int) -> List[Tuple[int, float]]:
8989
@classmethod
9090
def setup_search(cls):
9191
# Load the graphs in memory
92-
warmup_endpoint = f'/_plugins/_knn/warmup/{OPENSEARCH_INDEX}'
93-
cls.client.transport.perform_request('GET', warmup_endpoint)
92+
warmup_endpoint = f"/_plugins/_knn/warmup/{OPENSEARCH_INDEX}"
93+
cls.client.transport.perform_request("GET", warmup_endpoint)

engine/clients/opensearch/upload.py

+17-11
Original file line number | Diff line number | Diff line change
@@ -1,20 +1,22 @@
11
import multiprocessing as mp
2-
import uuid
32
import time
3+
import uuid
44
from typing import List
55

66
from opensearchpy import OpenSearch
77

88
from dataset_reader.base_reader import Record
9-
from engine.base_client.distances import Distance
109
from engine.base_client.upload import BaseUploader
1110
from engine.clients.opensearch.config import (
1211
OPENSEARCH_INDEX,
1312
OPENSEARCH_PASSWORD,
1413
OPENSEARCH_PORT,
1514
OPENSEARCH_USER,
1615
)
17-
from engine.clients.opensearch.utils import get_index_thread_qty_for_force_merge, update_force_merge_threads
16+
from engine.clients.opensearch.utils import (
17+
get_index_thread_qty_for_force_merge,
18+
update_force_merge_threads,
19+
)
1820

1921

2022
class ClosableOpenSearch(OpenSearch):
@@ -76,9 +78,9 @@ def post_upload(cls, _distance):
7678
@classmethod
7779
def _refresh_index(cls):
7880
print(f"Refreshing index: {OPENSEARCH_INDEX}")
79-
params={"timeout": 300}
81+
params = {"timeout": 300}
8082
cls.client.indices.refresh(index=OPENSEARCH_INDEX, params=params)
81-
83+
8284
@classmethod
8385
def _update_vector_threshold_setting(cls):
8486
body = {
@@ -90,13 +92,17 @@ def _update_vector_threshold_setting(cls):
9092
@classmethod
9193
def _force_merge_index(cls):
9294
index_thread_qty = get_index_thread_qty_for_force_merge(cls.client)
93-
update_force_merge_threads(client = cls.client, index_thread_qty = index_thread_qty)
94-
force_merge_endpoint = f'/{OPENSEARCH_INDEX}/_forcemerge?max_num_segments=1&wait_for_completion=false'
95-
force_merge_task_id = cls.client.transport.perform_request('POST', force_merge_endpoint)['task']
95+
update_force_merge_threads(client=cls.client, index_thread_qty=index_thread_qty)
96+
force_merge_endpoint = f"/{OPENSEARCH_INDEX}/_forcemerge?max_num_segments=1&wait_for_completion=false"
97+
force_merge_task_id = cls.client.transport.perform_request(
98+
"POST", force_merge_endpoint
99+
)["task"]
96100
SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC = 30
97-
print(f"Starting force merge on index: {OPENSEARCH_INDEX}, task_id: {force_merge_task_id}")
101+
print(
102+
f"Starting force merge on index: {OPENSEARCH_INDEX}, task_id: {force_merge_task_id}"
103+
)
98104
while True:
99105
time.sleep(SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC)
100106
task_status = cls.client.tasks.get(task_id=force_merge_task_id)
101-
if task_status['completed']:
102-
break
107+
if task_status["completed"]:
108+
break

engine/clients/opensearch/utils.py

+63-63
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,14 @@
11
from opensearchpy import OpenSearch
22

3+
34
def get_index_thread_qty_for_force_merge(client: OpenSearch):
45
processors_per_node = get_cores_for_data_nodes(client=client)
56
# since during force merge only 1 shard will be doing the merge we can be aggressive in parallelization factor
67
index_thread_qty = max(1, processors_per_node // 2)
78
print(f"Index thread qty for force merge: {index_thread_qty}")
89
return index_thread_qty
910

11+
1012
def get_index_thread_qty(client: OpenSearch):
1113
processors_per_node = get_cores_for_data_nodes(client=client)
1214
# since during index more than 1 shard will be doing indexing, we are becoming conservative in parallelization factor
@@ -16,70 +18,68 @@ def get_index_thread_qty(client: OpenSearch):
1618

1719

1820
def get_cores_for_data_nodes(client: OpenSearch):
19-
# Sample nodes info response which is getting parsed.
20-
# {
21-
# "nodes": {
22-
# "Or9Nm4UJR3-gcMOGwJhHHQ": {
23-
# "roles": [
24-
# "data",
25-
# "ingest",
26-
# "master",
27-
# "remote_cluster_client"
28-
# ],
29-
# "os": {
30-
# "refresh_interval_in_millis": 1000,
31-
# "available_processors": 8,
32-
# "allocated_processors": 8
33-
# }
34-
# },
35-
# "A-cqbeekROeR3kzKhOXpRw": {
36-
# "roles": [
37-
# "data",
38-
# "ingest",
39-
# "master",
40-
# "remote_cluster_client"
41-
# ],
42-
# "os": {
43-
# "refresh_interval_in_millis": 1000,
44-
# "available_processors": 8,
45-
# "allocated_processors": 8
46-
# }
47-
# },
48-
# "FrDs-vOMQ8yDZ0HEkDwRHA": {
49-
# "roles": [
50-
# "data",
51-
# "ingest",
52-
# "master",
53-
# "remote_cluster_client"
54-
# ],
55-
# "os": {
56-
# "refresh_interval_in_millis": 1000,
57-
# "available_processors": 8,
58-
# "allocated_processors": 8
59-
# }
60-
# }
61-
# }
62-
# }
21+
# Sample nodes info response which is getting parsed.
22+
# {
23+
# "nodes": {
24+
# "Or9Nm4UJR3-gcMOGwJhHHQ": {
25+
# "roles": [
26+
# "data",
27+
# "ingest",
28+
# "master",
29+
# "remote_cluster_client"
30+
# ],
31+
# "os": {
32+
# "refresh_interval_in_millis": 1000,
33+
# "available_processors": 8,
34+
# "allocated_processors": 8
35+
# }
36+
# },
37+
# "A-cqbeekROeR3kzKhOXpRw": {
38+
# "roles": [
39+
# "data",
40+
# "ingest",
41+
# "master",
42+
# "remote_cluster_client"
43+
# ],
44+
# "os": {
45+
# "refresh_interval_in_millis": 1000,
46+
# "available_processors": 8,
47+
# "allocated_processors": 8
48+
# }
49+
# },
50+
# "FrDs-vOMQ8yDZ0HEkDwRHA": {
51+
# "roles": [
52+
# "data",
53+
# "ingest",
54+
# "master",
55+
# "remote_cluster_client"
56+
# ],
57+
# "os": {
58+
# "refresh_interval_in_millis": 1000,
59+
# "available_processors": 8,
60+
# "allocated_processors": 8
61+
# }
62+
# }
63+
# }
64+
# }
6365

64-
nodes_stats_res = client.nodes.info(filter_path="nodes.*.roles,nodes.*.os")
65-
nodes_data = nodes_stats_res.get("nodes")
66-
data_node_count = 0
67-
total_processors = 0
68-
for node_id in nodes_data:
69-
node_info = nodes_data.get(node_id)
70-
roles = node_info["roles"]
71-
os_info = node_info["os"]
72-
if 'data' in roles:
73-
data_node_count += 1
74-
total_processors += int(os_info['allocated_processors'])
75-
processors_per_node = total_processors // data_node_count
76-
return processors_per_node
66+
nodes_stats_res = client.nodes.info(filter_path="nodes.*.roles,nodes.*.os")
67+
nodes_data = nodes_stats_res.get("nodes")
68+
data_node_count = 0
69+
total_processors = 0
70+
for node_id in nodes_data:
71+
node_info = nodes_data.get(node_id)
72+
roles = node_info["roles"]
73+
os_info = node_info["os"]
74+
if "data" in roles:
75+
data_node_count += 1
76+
total_processors += int(os_info["allocated_processors"])
77+
processors_per_node = total_processors // data_node_count
78+
return processors_per_node
7779

7880

7981
def update_force_merge_threads(client: OpenSearch, index_thread_qty=1):
80-
cluster_settings_body = {
81-
"persistent": {
82-
"knn.algo_param.index_thread_qty": index_thread_qty
83-
}
84-
}
85-
client.cluster.put_settings(cluster_settings_body)
82+
cluster_settings_body = {
83+
"persistent": {"knn.algo_param.index_thread_qty": index_thread_qty}
84+
}
85+
client.cluster.put_settings(cluster_settings_body)

0 commit comments

Comments
 (0)