Skip to content

Commit 440778c

Browse files
mihaic, slice4e, and mpozniak95
authored
Add Cohere dataset and configuration (#30)
* Add Cohere dataset and configuration

  The Cohere HDF5 does not have a "distances" dataset. Since this is not used, we can make it optional.
* Add SQ8
* Fix distance
* Remove NUM_THREADS and SQ8 from cohere-cal.json file.
* Add vector_count and description to cohere dataset

---------

Co-authored-by: Martin Dimitrov <[email protected]>
Co-authored-by: Marcin Poźniak <[email protected]>
1 parent f36f6fd commit 440778c

File tree

3 files changed: 405 additions (+405) and 3 deletions (-3).

dataset_reader/ann_h5_reader.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import itertools
12
from typing import Iterator
23

34
import h5py
@@ -14,17 +15,18 @@ def __init__(self, path, normalize=False):
1415

1516
def read_queries(self) -> Iterator[Query]:
1617
data = h5py.File(self.path)
18+
distances = data["distances"] if "distances" in data else itertools.repeat(None)
1719

1820
for vector, expected_result, expected_scores in zip(
19-
data["test"], data["neighbors"], data["distances"]
21+
data["test"], data["neighbors"], distances
2022
):
2123
if self.normalize:
2224
vector /= np.linalg.norm(vector)
2325
yield Query(
2426
vector=vector.tolist(),
2527
meta_conditions=None,
2628
expected_result=expected_result.tolist(),
27-
expected_scores=expected_scores.tolist(),
29+
expected_scores=expected_scores.tolist() if expected_scores is not None else None,
2830
)
2931

3032
def read_data(self, *args, **kwargs) -> Iterator[Record]:

datasets/datasets.json

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1296,5 +1296,25 @@
12961296
"path": "random-100-match-kw-small-vocab/random_keywords_1m_vocab_10_no_filters",
12971297
"vector_count": 100,
12981298
"description": "Synthetic data"
1299+
},
1300+
{
1301+
"name": "cohere-768-1M",
1302+
"vector_size": 768,
1303+
"distance": "dot",
1304+
"type": "h5",
1305+
"path": "cohere-768-1M/cohere-768-1M.hdf5",
1306+
"link": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings/documents-1m.hdf5.bz2",
1307+
"vector_count": 1000000,
1308+
"description": "Wikipedia embeddings"
1309+
},
1310+
{
1311+
"name": "cohere-768-10M",
1312+
"vector_size": 768,
1313+
"distance": "dot",
1314+
"type": "h5",
1315+
"path": "cohere-768-10M/cohere-768-10M.hdf5",
1316+
"link": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings/documents-10m.hdf5.bz2",
1317+
"vector_count": 10000000,
1318+
"description": "Wikipedia embeddings"
12991319
}
1300-
]
1320+
]

0 commit comments

Comments
 (0)