1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -72,6 +72,7 @@ and this project adheres to [Semantic Versioning].
- Add tests for `BiQwen2Retriever`
- Add tests for `ColIdefics3Retriever`
- Add tests and E2E tests for cli command `evaluate-retriever`
- Add tests for `ViDoReEvaluatorBEIR`

## [4.0.2] - 2024-10-17

@@ -18,7 +18,10 @@ class BEIRDataset(TypedDict):
queries: The dataset containing the queries.
qrels: The dataset containing the query relevance scores.

Each subset is associated with a key of the same name.
`qrels` follows the TREC format, i.e. a `{query_id: {doc_id: relevance_score}}` mapping.
For each query `i`, `relevance_score` is an integer in the range [0, N_i]; the higher the
score, the more relevant the document is to the given query.
"""

corpus: Dataset
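To make the docstring's TREC format concrete, here is a minimal sketch of a nested qrels mapping (the query IDs, document IDs, and scores below are invented for illustration):

# Hypothetical qrels in the `{query_id: {doc_id: relevance_score}}` form described above.
# For "q1", scores range over [0, 2]: "d1" (score 2) is more relevant than "d2" (score 1),
# and "d3" (score 0) is not relevant at all.
qrels = {
    "q1": {"d1": 2, "d2": 1, "d3": 0},
    "q2": {"d4": 1},
}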
23 changes: 23 additions & 0 deletions tests/evaluation/vidore_evaluators/conftest.py
@@ -1,6 +1,7 @@
import pytest
import torch

from vidore_benchmark.retrievers.base_vision_retriever import BaseVisionRetriever
from vidore_benchmark.retrievers.bm25_retriever import BM25Retriever


@@ -15,3 +16,25 @@ def get_scores_bm25(self, queries, passages, **kwargs):
@pytest.fixture
def mock_bm25_retriever():
return MockBM25Retriever()


# Minimal stand-in for a vision retriever: returns fixed embeddings and constant
# scores, enough to exercise the evaluators without loading a real model.
class MockVisionRetriever(BaseVisionRetriever):
def __init__(self, use_visual_embedding=True):
self.use_visual_embedding = use_visual_embedding

def forward_queries(self, queries, batch_size=None, **kwargs):
return torch.tensor([[1.0, 0.0] for _ in queries])

def forward_passages(self, passages, batch_size=None, **kwargs):
return torch.tensor([[0.0, 1.0] for _ in passages])

def get_scores(self, query_embeddings, passage_embeddings, batch_size=None):
return torch.tensor([[0.5 for _ in range(len(passage_embeddings))] for _ in range(len(query_embeddings))])

def get_scores_bm25(self, queries, passages):
return torch.tensor([[0.5 for _ in range(len(passages))] for _ in range(len(queries))])


@pytest.fixture
def mock_vision_retriever():
return MockVisionRetriever()
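For context, a sketch of how the shared fixtures above are consumed (assuming pytest's standard conftest.py discovery, which injects fixtures by argument name; `test_example` is a hypothetical test, not part of this PR):

def test_example(mock_vision_retriever):
    # The fixture defined in conftest.py is injected by matching the parameter name.
    scores = mock_vision_retriever.get_scores(
        query_embeddings=[0, 1],
        passage_embeddings=[0, 1, 2],
    )
    assert scores.shape == (2, 3)  # one constant 0.5 score per (query, passage) pair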
74 changes: 74 additions & 0 deletions tests/evaluation/vidore_evaluators/test_vidore_evaluator_beir.py
@@ -0,0 +1,74 @@
import pytest
from datasets import Dataset
from PIL import Image

from vidore_benchmark.evaluation.vidore_evaluators.vidore_evaluator_beir import BEIRDataset, ViDoReEvaluatorBEIR
from vidore_benchmark.retrievers.base_vision_retriever import BaseVisionRetriever


@pytest.fixture
def mock_beir_dataset() -> BEIRDataset:
corpus = Dataset.from_dict(
{
"corpus-id": [1, 2, 3, 4],
"image": [Image.new("RGB", (10, 10)) for _ in range(4)],
"text_description": ["desc1", "desc2", "desc3", "desc4"],
}
)

queries = Dataset.from_dict(
{
"query-id": [1, 2],
"query": ["query1", "query2"],
}
)

qrels = Dataset.from_dict(
{
"query-id": [1, 1, 2],
"corpus-id": [1, 2, 3],
"score": [1, 1, 1],
}
)

return BEIRDataset(corpus=corpus, queries=queries, qrels=qrels)


@pytest.fixture
def evaluator(mock_vision_retriever):
return ViDoReEvaluatorBEIR(vision_retriever=mock_vision_retriever)


def test_init(evaluator):
assert isinstance(evaluator.vision_retriever, BaseVisionRetriever)


def test_evaluate_dataset(evaluator, mock_beir_dataset):
metrics = evaluator.evaluate_dataset(
ds=mock_beir_dataset,
batch_query=2,
batch_passage=2,
)

assert isinstance(metrics, dict)
assert "ndcg_at_1" in metrics
assert "map_at_1" in metrics
assert "recall_at_1" in metrics
assert "precision_at_1" in metrics
assert "mrr_at_1" in metrics


def test_evaluate_dataset_with_bm25(mock_bm25_retriever, mock_beir_dataset):
evaluator = ViDoReEvaluatorBEIR(vision_retriever=mock_bm25_retriever)
metrics = evaluator.evaluate_dataset(
ds=mock_beir_dataset,
batch_query=2,
batch_passage=2,
)

assert isinstance(metrics, dict)
assert "ndcg_at_1" in metrics
assert "map_at_1" in metrics
assert "recall_at_1" in metrics
assert "precision_at_1" in metrics
assert "mrr_at_1" in metrics
32 changes: 7 additions & 25 deletions tests/evaluation/vidore_evaluators/test_vidore_evaluator_qa.py
@@ -1,31 +1,13 @@
import pytest
import torch
from datasets import Dataset
from PIL import Image

from vidore_benchmark.evaluation.vidore_evaluators.vidore_evaluator_qa import ViDoReEvaluatorQA
from vidore_benchmark.retrievers.base_vision_retriever import BaseVisionRetriever


class MockVisionRetriever(BaseVisionRetriever):
def __init__(self, use_visual_embedding=True):
self.use_visual_embedding = use_visual_embedding

def forward_queries(self, queries, batch_size=None, **kwargs):
return torch.tensor([[1.0, 0.0] for _ in queries])

def forward_passages(self, passages, batch_size=None, **kwargs):
return torch.tensor([[0.0, 1.0] for _ in passages])

def get_scores(self, query_embeddings, passage_embeddings, batch_size=None):
return torch.tensor([[0.5 for _ in range(len(passage_embeddings))] for _ in range(len(query_embeddings))])

def get_scores_bm25(self, queries, passages):
return torch.tensor([[0.5 for _ in range(len(passages))] for _ in range(len(queries))])


@pytest.fixture
def mock_dataset():
def mock_qa_dataset():
return Dataset.from_dict(
{
"query": ["query1", "query2", "query1", None],
@@ -37,17 +19,17 @@ def mock_dataset():


@pytest.fixture
def evaluator():
return ViDoReEvaluatorQA(vision_retriever=MockVisionRetriever())
def evaluator(mock_vision_retriever):
return ViDoReEvaluatorQA(vision_retriever=mock_vision_retriever)


def test_init(evaluator):
assert isinstance(evaluator.vision_retriever, BaseVisionRetriever)


def test_evaluate_dataset(evaluator, mock_dataset):
def test_evaluate_dataset(evaluator, mock_qa_dataset):
metrics = evaluator.evaluate_dataset(
ds=mock_dataset,
ds=mock_qa_dataset,
batch_query=2,
batch_passage=2,
)
@@ -60,10 +42,10 @@ def test_evaluate_dataset(evaluator, mock_dataset):
assert "mrr_at_1" in metrics


def test_evaluate_dataset_with_bm25(mock_bm25_retriever, mock_dataset):
def test_evaluate_dataset_with_bm25(mock_bm25_retriever, mock_qa_dataset):
evaluator = ViDoReEvaluatorQA(vision_retriever=mock_bm25_retriever)
metrics = evaluator.evaluate_dataset(
ds=mock_dataset,
ds=mock_qa_dataset,
batch_query=2,
batch_passage=2,
)