diff --git a/CHANGELOG.md b/CHANGELOG.md
index c24ce42d..41881998 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -72,6 +72,7 @@ and this project adheres to [Semantic Versioning].
 - Add tests for `BiQwen2Retriever`
 - Add tests for `ColIdefics3Retriever`
 - Add tests and E2E tests for cli command `evaluate-retriever`
+- Add tests for `ViDoReEvaluatorBEIR`
 
 ## [4.0.2] - 2024-10-17
 
diff --git a/src/vidore_benchmark/evaluation/vidore_evaluators/vidore_evaluator_beir.py b/src/vidore_benchmark/evaluation/vidore_evaluators/vidore_evaluator_beir.py
index 3f90d359..254bc904 100644
--- a/src/vidore_benchmark/evaluation/vidore_evaluators/vidore_evaluator_beir.py
+++ b/src/vidore_benchmark/evaluation/vidore_evaluators/vidore_evaluator_beir.py
@@ -18,7 +18,10 @@ class BEIRDataset(TypedDict):
         queries: The dataset containing the queries.
         qrels: The dataset containing the query relevance scores.
 
-    Each subset is associated to a key with the same name.
+    `qrels` follows the TREC format, where the structure is `{query_id: {doc_id: relevance_score}}`.
+    `relevance_score` is an integer indicating the relevance of the document to the query. For each query i,
+    the relevance scores are integers in the range [0, N_i], where the higher the score, the more relevant
+    the document is to the given query.
     """
 
     corpus: Dataset
diff --git a/tests/evaluation/vidore_evaluators/conftest.py b/tests/evaluation/vidore_evaluators/conftest.py
index d8d81f0c..d8f2ba01 100644
--- a/tests/evaluation/vidore_evaluators/conftest.py
+++ b/tests/evaluation/vidore_evaluators/conftest.py
@@ -1,6 +1,7 @@
 import pytest
 import torch
 
+from vidore_benchmark.retrievers.base_vision_retriever import BaseVisionRetriever
 from vidore_benchmark.retrievers.bm25_retriever import BM25Retriever
 
 
@@ -15,3 +16,25 @@ def get_scores_bm25(self, queries, passages, **kwargs):
 @pytest.fixture
 def mock_bm25_retriever():
     return MockBM25Retriever()
+
+
+class MockVisionRetriever(BaseVisionRetriever):
+    def __init__(self, use_visual_embedding=True):
+        self.use_visual_embedding = use_visual_embedding
+
+    def forward_queries(self, queries, batch_size=None, **kwargs):
+        return torch.tensor([[1.0, 0.0] for _ in queries])
+
+    def forward_passages(self, passages, batch_size=None, **kwargs):
+        return torch.tensor([[0.0, 1.0] for _ in passages])
+
+    def get_scores(self, query_embeddings, passage_embeddings, batch_size=None):
+        return torch.tensor([[0.5 for _ in range(len(passage_embeddings))] for _ in range(len(query_embeddings))])
+
+    def get_scores_bm25(self, queries, passages):
+        return torch.tensor([[0.5 for _ in range(len(passages))] for _ in range(len(queries))])
+
+
+@pytest.fixture
+def mock_vision_retriever():
+    return MockVisionRetriever()
diff --git a/tests/evaluation/vidore_evaluators/test_vidore_evaluator_beir.py b/tests/evaluation/vidore_evaluators/test_vidore_evaluator_beir.py
new file mode 100644
index 00000000..441b411a
--- /dev/null
+++ b/tests/evaluation/vidore_evaluators/test_vidore_evaluator_beir.py
@@ -0,0 +1,74 @@
+import pytest
+from datasets import Dataset
+from PIL import Image
+
+from vidore_benchmark.evaluation.vidore_evaluators.vidore_evaluator_beir import BEIRDataset, ViDoReEvaluatorBEIR
+from vidore_benchmark.retrievers.base_vision_retriever import BaseVisionRetriever
+
+
+@pytest.fixture
+def mock_beir_dataset() -> BEIRDataset:
+    corpus = Dataset.from_dict(
+        {
+            "corpus-id": [1, 2, 3, 4],
+            "image": [Image.new("RGB", (10, 10)) for _ in range(4)],
+            "text_description": ["desc1", "desc2", "desc3", "desc4"],
+        }
+    )
+
+    queries = Dataset.from_dict(
+        {
+            "query-id": [1, 2],
+            "query": ["query1", "query2"],
+        }
+    )
+
+    qrels = Dataset.from_dict(
+        {
+            "query-id": [1, 1, 2],
+            "corpus-id": [1, 2, 3],
+            "score": [1, 1, 1],
+        }
+    )
+
+    return BEIRDataset(corpus=corpus, queries=queries, qrels=qrels)
+
+
+@pytest.fixture
+def evaluator(mock_vision_retriever):
+    return ViDoReEvaluatorBEIR(vision_retriever=mock_vision_retriever)
+
+
+def test_init(evaluator):
+    assert isinstance(evaluator.vision_retriever, BaseVisionRetriever)
+
+
+def test_evaluate_dataset(evaluator, mock_beir_dataset):
+    metrics = evaluator.evaluate_dataset(
+        ds=mock_beir_dataset,
+        batch_query=2,
+        batch_passage=2,
+    )
+
+    assert isinstance(metrics, dict)
+    assert "ndcg_at_1" in metrics
+    assert "map_at_1" in metrics
+    assert "recall_at_1" in metrics
+    assert "precision_at_1" in metrics
+    assert "mrr_at_1" in metrics
+
+
+def test_evaluate_dataset_with_bm25(mock_bm25_retriever, mock_beir_dataset):
+    evaluator = ViDoReEvaluatorBEIR(vision_retriever=mock_bm25_retriever)
+    metrics = evaluator.evaluate_dataset(
+        ds=mock_beir_dataset,
+        batch_query=2,
+        batch_passage=2,
+    )
+
+    assert isinstance(metrics, dict)
+    assert "ndcg_at_1" in metrics
+    assert "map_at_1" in metrics
+    assert "recall_at_1" in metrics
+    assert "precision_at_1" in metrics
+    assert "mrr_at_1" in metrics
diff --git a/tests/evaluation/vidore_evaluators/test_vidore_evaluator_qa.py b/tests/evaluation/vidore_evaluators/test_vidore_evaluator_qa.py
index e2fa9d17..06a8d37c 100644
--- a/tests/evaluation/vidore_evaluators/test_vidore_evaluator_qa.py
+++ b/tests/evaluation/vidore_evaluators/test_vidore_evaluator_qa.py
@@ -1,5 +1,4 @@
 import pytest
-import torch
 from datasets import Dataset
 from PIL import Image
 
@@ -7,25 +6,8 @@
 from vidore_benchmark.retrievers.base_vision_retriever import BaseVisionRetriever
 
 
-class MockVisionRetriever(BaseVisionRetriever):
-    def __init__(self, use_visual_embedding=True):
-        self.use_visual_embedding = use_visual_embedding
-
-    def forward_queries(self, queries, batch_size=None, **kwargs):
-        return torch.tensor([[1.0, 0.0] for _ in queries])
-
-    def forward_passages(self, passages, batch_size=None, **kwargs):
-        return torch.tensor([[0.0, 1.0] for _ in passages])
-
-    def get_scores(self, query_embeddings, passage_embeddings, batch_size=None):
-        return torch.tensor([[0.5 for _ in range(len(passage_embeddings))] for _ in range(len(query_embeddings))])
-
-    def get_scores_bm25(self, queries, passages):
-        return torch.tensor([[0.5 for _ in range(len(passages))] for _ in range(len(queries))])
-
-
 @pytest.fixture
-def mock_dataset():
+def mock_qa_dataset():
     return Dataset.from_dict(
         {
             "query": ["query1", "query2", "query1", None],
@@ -37,17 +19,17 @@ def mock_dataset():
 
 
 @pytest.fixture
-def evaluator():
-    return ViDoReEvaluatorQA(vision_retriever=MockVisionRetriever())
+def evaluator(mock_vision_retriever):
+    return ViDoReEvaluatorQA(vision_retriever=mock_vision_retriever)
 
 
 def test_init(evaluator):
     assert isinstance(evaluator.vision_retriever, BaseVisionRetriever)
 
 
-def test_evaluate_dataset(evaluator, mock_dataset):
+def test_evaluate_dataset(evaluator, mock_qa_dataset):
     metrics = evaluator.evaluate_dataset(
-        ds=mock_dataset,
+        ds=mock_qa_dataset,
         batch_query=2,
         batch_passage=2,
    )
@@ -60,10 +42,10 @@ def test_evaluate_dataset(evaluator, mock_dataset):
     assert "mrr_at_1" in metrics
 
 
-def test_evaluate_dataset_with_bm25(mock_bm25_retriever, mock_dataset):
+def test_evaluate_dataset_with_bm25(mock_bm25_retriever, mock_qa_dataset):
     evaluator = ViDoReEvaluatorQA(vision_retriever=mock_bm25_retriever)
     metrics = evaluator.evaluate_dataset(
-        ds=mock_dataset,
+        ds=mock_qa_dataset,
         batch_query=2,
         batch_passage=2,
     )
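
Note on the updated `BEIRDataset` docstring: the flat `qrels` split used in the new test fixture maps onto the TREC-style nested structure `{query_id: {doc_id: relevance_score}}`. The snippet below is only an illustrative sketch of that mapping; `qrels_to_trec` is a hypothetical helper and is not part of this change or of the `vidore_benchmark` API.

# Illustrative sketch: build the TREC-style nested qrels mapping described in the
# BEIRDataset docstring from a flat `qrels` Dataset like the one in the test fixture.
# `qrels_to_trec` is a hypothetical helper, not part of the vidore_benchmark API.
from collections import defaultdict
from typing import Dict

from datasets import Dataset


def qrels_to_trec(qrels: Dataset) -> Dict[str, Dict[str, int]]:
    # One row per (query, document) pair -> nested {query_id: {doc_id: relevance_score}}.
    trec: Dict[str, Dict[str, int]] = defaultdict(dict)
    for row in qrels:
        trec[str(row["query-id"])][str(row["corpus-id"])] = int(row["score"])
    return dict(trec)


qrels = Dataset.from_dict(
    {
        "query-id": [1, 1, 2],
        "corpus-id": [1, 2, 3],
        "score": [1, 1, 1],
    }
)

# Prints: {'1': {'1': 1, '2': 1}, '2': {'3': 1}}
print(qrels_to_trec(qrels))

This nested mapping is the shape that TREC/BEIR-style evaluation tooling generally expects when computing metrics such as ndcg, map, recall, precision, and mrr.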