Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmark/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def main():
data = json.load(open(search_file))
experiment_name = data["params"]["experiment"]
dataset_name = data["params"]["dataset"]
engine_params = data["params"]["config"]
engine_params = data["params"].get("config", {})
parallel = data["params"]["parallel"]
engine_name = data["params"]["engine"]

Expand Down
9 changes: 9 additions & 0 deletions engine/clients/chroma/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from engine.clients.chroma.configure import ChromaConfigurator
from engine.clients.chroma.search import ChromaSearcher
from engine.clients.chroma.upload import ChromaUploader

# Names re-exported by this package: the three Chroma client classes imported above.
__all__ = [
"ChromaConfigurator",
"ChromaSearcher",
"ChromaUploader",
]
7 changes: 7 additions & 0 deletions engine/clients/chroma/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import os

# Collection name, read from the CHROMA_COLLECTION_NAME environment variable;
# falls back to "benchmark" when the variable is not set.
CHROMA_COLLECTION_NAME = os.getenv("CHROMA_COLLECTION_NAME", "benchmark")


def chroma_fix_host(host: str):
    """Return ``host``, replacing the exact name ``"localhost"`` with ``"127.0.0.1"``.

    Any other value (including differently-cased spellings) is returned as is.
    NOTE(review): presumably this sidesteps how "localhost" gets resolved by
    the HTTP client -- confirm the motivation with the author.
    """
    if host == "localhost":
        return "127.0.0.1"
    return host
45 changes: 45 additions & 0 deletions engine/clients/chroma/configure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from chromadb import HttpClient, Settings

from benchmark.dataset import Dataset
from engine.base_client.configure import BaseConfigurator
from engine.base_client.distances import Distance
from engine.clients.chroma.config import CHROMA_COLLECTION_NAME, chroma_fix_host


class ChromaConfigurator(BaseConfigurator):
    """Drop and (re)create the benchmark collection on a Chroma server over HTTP."""

    # Benchmark distance kinds -> value stored under the "hnsw:space" metadata key.
    DISTANCE_MAPPING = {
        Distance.L2: "l2",
        Distance.COSINE: "cosine",
        Distance.DOT: "ip",
    }

    def __init__(self, host, collection_params: dict, connection_params: dict):
        """Open an HTTP client to ``host``.

        ``connection_params`` is forwarded verbatim to ``HttpClient`` as
        keyword arguments; ``collection_params`` is handed to the base class.
        """
        super().__init__(host, collection_params, connection_params)
        self.client = HttpClient(
            host=chroma_fix_host(host),
            settings=Settings(allow_reset=True, anonymized_telemetry=False),
            **connection_params,
        )

    def clean(self):
        """
        Delete a collection and all associated embeddings, documents, and metadata.

        This is destructive and not reversible.
        """
        try:
            self.client.delete_collection(name=CHROMA_COLLECTION_NAME)
        except Exception:
            # Deliberately best-effort: presumably the call fails when the
            # collection does not exist yet, which is fine before a fresh run.
            # (ValueError is already covered by Exception.)
            pass

    def recreate(self, dataset: Dataset, collection_params):
        """Create the collection with the distance function required by ``dataset``.

        The ``collection_params`` argument is not used; the parameters given
        to the constructor (``self.collection_params``) are applied instead.
        User-supplied metadata wins over the derived ``"hnsw:space"`` entry.
        """
        # Work on a copy: writing "metadata" back into self.collection_params
        # would leave the previous dataset's "hnsw:space" in the shared dict,
        # where it would override the distance of every later recreate() call.
        params = dict(self.collection_params)
        params["metadata"] = {
            "hnsw:space": self.DISTANCE_MAPPING.get(dataset.config.distance),
            **params.get("metadata", {}),
        }
        self.client.create_collection(
            name=CHROMA_COLLECTION_NAME,
            **params,
        )
57 changes: 57 additions & 0 deletions engine/clients/chroma/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from typing import List, Optional

from chromadb import Where
from chromadb.types import OperatorExpression

from engine.base_client import IncompatibilityError
from engine.base_client.parser import BaseConditionParser, FieldValue


class ChromaConditionParser(BaseConditionParser):
    """Build Chroma ``where`` filter documents from the benchmark's conditions.

    NOTE(review): the shapes produced here follow Chroma's filter validation
    as I understand it -- one key per ``where`` document, one operator per
    operator expression, and at least two operands for ``$and`` / ``$or``.
    Confirm against the pinned chromadb version.
    """

    def build_condition(
        self,
        and_subfilters: Optional[List[Where]],
        or_subfilters: Optional[List[Where]],
    ) -> Where:
        """Combine sub-filters into a single ``where`` document.

        The result means "every AND sub-filter holds, and at least one OR
        sub-filter holds". Returns ``{}`` when there is nothing to filter on.
        """
        clauses: List[Where] = list(and_subfilters or [])

        if or_subfilters:
            if len(or_subfilters) == 1:
                # A one-element "$or" is just that element.
                clauses.append(or_subfilters[0])
            else:
                clauses.append({"$or": list(or_subfilters)})

        if not clauses:
            return {}
        if len(clauses) == 1:
            # Shallow copy so the caller's sub-filter is not aliased.
            return dict(clauses[0])
        # Always a single top-level key: merging the sub-filter dicts instead
        # would either yield several top-level keys or silently overwrite
        # conditions that target the same field.
        return {"$and": clauses}

    def build_exact_match_filter(self, field_name: str, value: FieldValue) -> Where:
        """Return a filter matching records whose ``field_name`` equals ``value``."""
        return {field_name: value}

    def build_range_filter(
        self,
        field_name: str,
        lt: Optional[FieldValue],
        gt: Optional[FieldValue],
        lte: Optional[FieldValue],
        gte: Optional[FieldValue],
    ) -> Where:
        """Return a filter restricting ``field_name`` to the given bounds.

        Bounds that are ``None`` are ignored. A single bound yields
        ``{field: {op: value}}``; several bounds are joined under ``$and``
        with one operator per expression.
        """
        bounds = {
            "$lt": lt,
            "$gt": gt,
            "$lte": lte,
            "$gte": gte,
        }
        clauses: List[Where] = [
            {field_name: {operator: bound}}
            for operator, bound in bounds.items()
            if bound is not None
        ]
        if len(clauses) == 1:
            return clauses[0]
        if clauses:
            return {"$and": clauses}
        # No bound supplied: keep the historical (empty expression) result.
        return {field_name: {}}

    def build_geo_filter(
        self, field_name: str, lat: float, lon: float, radius: float
    ) -> Where:
        """Geo filters are not supported by this client.

        Raises:
            IncompatibilityError: always.
        """
        raise IncompatibilityError
44 changes: 44 additions & 0 deletions engine/clients/chroma/search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from typing import List, Tuple

from chromadb import ClientAPI, HttpClient, Settings
from chromadb.api.types import IncludeEnum

from dataset_reader.base_reader import Query
from engine.base_client.search import BaseSearcher
from engine.clients.chroma.config import CHROMA_COLLECTION_NAME, chroma_fix_host
from engine.clients.chroma.parser import ChromaConditionParser


class ChromaSearcher(BaseSearcher):
    """Run nearest-neighbour queries against the benchmark collection in Chroma."""

    client: ClientAPI = None
    parser = ChromaConditionParser()

    @classmethod
    def init_client(cls, host, distance, connection_params: dict, search_params: dict):
        """Connect to Chroma, fetch the collection and remember ``search_params``.

        ``distance`` is accepted but not used here.
        """
        client_settings = Settings(allow_reset=True, anonymized_telemetry=False)
        cls.client = HttpClient(
            host=chroma_fix_host(host),
            settings=client_settings,
            **connection_params,
        )
        cls.collection = cls.client.get_collection(name=CHROMA_COLLECTION_NAME)
        cls.search_params = search_params

    @classmethod
    def search_one(cls, query: Query, top: int) -> List[Tuple[int, float]]:
        """Return up to ``top`` ``(id, distance)`` pairs for a single query."""
        response = cls.collection.query(
            query_embeddings=[query.vector],
            n_results=top,
            where=cls.parser.parse(query.meta_conditions),
            include=[IncludeEnum.distances],
        )

        # One query embedding was sent, so only the first result row is read.
        hit_ids = response["ids"][0]
        hit_distances = response["distances"][0]
        return [
            (int(hit_id), float(hit_distance))
            for hit_id, hit_distance in zip(hit_ids, hit_distances)
        ]

    def setup_search(self):
        """Apply the ``"metadata"`` overrides from ``search_params`` to the collection."""
        # "hnsw:space" is left out: it is not allowed in the collection.modify method.
        new_metadata = {
            key: value
            for key, value in self.collection.metadata.items()
            if key != "hnsw:space"
        }
        new_metadata.update(self.search_params.get("metadata", {}))
        self.collection.modify(metadata=new_metadata)
35 changes: 35 additions & 0 deletions engine/clients/chroma/upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from typing import List

from chromadb import ClientAPI, HttpClient, Settings

from dataset_reader.base_reader import Record
from engine.base_client.upload import BaseUploader
from engine.clients.chroma.config import CHROMA_COLLECTION_NAME, chroma_fix_host


class ChromaUploader(BaseUploader):
    """Upload dataset records into the benchmark collection in Chroma."""

    client: ClientAPI = None
    upload_params = {}

    @classmethod
    def init_client(cls, host, distance, connection_params, upload_params):
        """Connect to Chroma, fetch the collection and remember ``upload_params``.

        ``distance`` is accepted but not used here.
        """
        cls.client = HttpClient(
            host=chroma_fix_host(host),
            settings=Settings(allow_reset=True, anonymized_telemetry=False),
            **connection_params,
        )
        cls.collection = cls.client.get_collection(name=CHROMA_COLLECTION_NAME)
        # Keep the parameters on the class, as ChromaSearcher does with its
        # search_params; previously the declared attribute was never assigned.
        cls.upload_params = upload_params

    @classmethod
    def upload_batch(cls, batch: List[Record]):
        """Add one batch of records (ids, vectors, optional metadata) to the collection.

        Ids are sent as strings; empty metadata is sent as ``None``.
        An empty batch is a no-op.
        """
        ids, vectors, payloads = [], [], []
        for point in batch:
            ids.append(str(point.id))
            vectors.append(point.vector)
            payloads.append(point.metadata or None)

        if not ids:
            # Nothing to send. NOTE(review): Chroma appears to reject an empty
            # id list, so skipping the call avoids a pointless failure.
            return

        cls.collection.add(
            embeddings=vectors,
            metadatas=payloads,
            ids=ids,
        )
4 changes: 4 additions & 0 deletions engine/clients/client_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
BaseSearcher,
BaseUploader,
)
from engine.clients.chroma import ChromaConfigurator, ChromaSearcher, ChromaUploader
from engine.clients.elasticsearch import (
ElasticConfigurator,
ElasticSearcher,
Expand Down Expand Up @@ -39,6 +40,7 @@
"opensearch": OpenSearchConfigurator,
"redis": RedisConfigurator,
"pgvector": PgVectorConfigurator,
"chroma": ChromaConfigurator,
}

ENGINE_UPLOADERS = {
Expand All @@ -49,6 +51,7 @@
"opensearch": OpenSearchUploader,
"redis": RedisUploader,
"pgvector": PgVectorUploader,
"chroma": ChromaUploader,
}

ENGINE_SEARCHERS = {
Expand All @@ -59,6 +62,7 @@
"opensearch": OpenSearchSearcher,
"redis": RedisSearcher,
"pgvector": PgVectorSearcher,
"chroma": ChromaSearcher,
}


Expand Down
20 changes: 20 additions & 0 deletions engine/servers/chroma-single-node/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Single-node Chroma server used by the benchmark.
services:
  chromadb_bench:
    image: ${CONTAINER_REGISTRY:-docker.io}/chromadb/chroma:0.5.7
    #volumes:
    #  - ./chromadb:/chroma/chroma
    ports:
      - "8000:8000"
    logging:
      driver: "json-file"
      options:
        max-file: 1
        max-size: 10m
    environment:
      # Boolean-looking values are quoted so YAML keeps them as the literal
      # strings the container should see instead of parsing them as booleans.
      IS_PERSISTENT: "TRUE"
      ANONYMIZED_TELEMETRY: "False"
      CHROMA_WORKERS: 2
    deploy:
      resources:
        limits:
          memory: 25Gb
113 changes: 113 additions & 0 deletions experiments/configurations/chroma-single-node.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
[
{
"name": "chroma-default",
"engine": "chroma",
"connection_params": {},
"collection_params": {},
"search_params": [
{
"parallel": 8
}
],
"upload_params": {
"parallel": 16,
"batch_size": 1024
}
},
{
"name": "chroma-m-16-ef-128",
"engine": "chroma",
"connection_params": {},
"collection_params": {
"metadata": {
"hnsw:M": 16,
"hnsw:construction_ef": 128
}
},
"search_params": [
{ "parallel": 1, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 512 } },
{ "parallel": 100, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 512 } }
],
"upload_params": { "parallel": 16 }
},
{
"name": "chroma-m-32-ef-128",
"engine": "chroma",
"connection_params": {},
"collection_params": {
"metadata": {
"hnsw:M": 32,
"hnsw:construction_ef": 128
}
},
"search_params": [
{ "parallel": 1, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 512 } },
{ "parallel": 100, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 512 } }
],
"upload_params": { "parallel": 16 }
},
{
"name": "chroma-m-32-ef-256",
"engine": "chroma",
"connection_params": {},
"collection_params": {
"metadata": {
"hnsw:M": 32,
"hnsw:construction_ef": 256
}
},
"search_params": [
{ "parallel": 1, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 512 } },
{ "parallel": 100, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 512 } }
],
"upload_params": { "parallel": 16 }
},
{
"name": "chroma-m-32-ef-512",
"engine": "chroma",
"connection_params": {},
"collection_params": {
"metadata": {
"hnsw:M": 32,
"hnsw:construction_ef": 512
}
},
"search_params": [
{ "parallel": 1, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 512 } },
{ "parallel": 100, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 512 } }
],
"upload_params": { "parallel": 16 }
},
{
"name": "chroma-m-64-ef-256",
"engine": "chroma",
"connection_params": {},
"collection_params": {
"metadata": {
"hnsw:M": 64,
"hnsw:construction_ef": 256
}
},
"search_params": [
{ "parallel": 1, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 512 } },
{ "parallel": 100, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 512 } }
],
"upload_params": { "parallel": 16 }
},
{
"name": "chroma-m-64-ef-512",
"engine": "chroma",
"connection_params": {},
"collection_params": {
"metadata": {
"hnsw:M": 64,
"hnsw:construction_ef": 512
}
},
"search_params": [
{ "parallel": 1, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 512 } },
{ "parallel": 100, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 512 } }
],
"upload_params": { "parallel": 16 }
}
]
Loading