
Unexpected Test Results Variance with Different Batch Sizes #111

@VickiCui

Description


In theory, retrieval performance should not depend on the batch size. However, when evaluating on ViDoRe Benchmark 2 with different batch_query and batch_passage values, I observed significant discrepancies in the results.

With batch_query=4 and batch_passage=4, the test results of colqwen2.5-v0.2 (max_num_visual_tokens=768) on esg_reports_human_labeled_v2 are:

{'ndcg_at_1': 0.62179, 'ndcg_at_3': 0.67745, 'ndcg_at_5': 0.67287, 'ndcg_at_10': 0.70728, 'ndcg_at_20': 0.72364, 'ndcg_at_50': 0.74802, 'ndcg_at_100': 0.75417, 'map_at_1': 0.41233, 'map_at_3': 0.57055, 'map_at_5': 0.59417, 'map_at_10': 0.62827, 'map_at_20': 0.63596, 'map_at_50': 0.64363, 'map_at_100': 0.64546, 'recall_at_1': 0.41233, 'recall_at_3': 0.6822, 'recall_at_5': 0.72194, 'recall_at_10': 0.82212, 'recall_at_20': 0.87167, 'recall_at_50': 0.95485, 'recall_at_100': 0.981, 'precision_at_1': 0.63462, 'precision_at_3': 0.40385, 'precision_at_5': 0.27308, 'precision_at_10': 0.16538, 'precision_at_20': 0.09327, 'precision_at_50': 0.04346, 'precision_at_100': 0.02308, 'mrr_at_1': 0.6730769230769231, 'mrr_at_3': 0.7596153846153846, 'mrr_at_5': 0.7692307692307693, 'mrr_at_10': 0.7743589743589744, 'mrr_at_20': 0.7743589743589744, 'mrr_at_50': 0.7765232345174206, 'mrr_at_100': 0.7765232345174206, 'naucs_at_1_max': np.float64(0.15808842304687007), 'naucs_at_1_std': np.float64(0.09610820916153678), 'naucs_at_1_diff1': np.float64(0.6580852599392208), 'naucs_at_3_max': np.float64(-0.05157372399638369), 'naucs_at_3_std': np.float64(0.06351447012509337), 'naucs_at_3_diff1': np.float64(-0.14346690686426106), 'naucs_at_5_max': np.float64(-0.12159532915431565), 'naucs_at_5_std': np.float64(0.03573545110145072), 'naucs_at_5_diff1': np.float64(-0.25402425764340436), 'naucs_at_10_max': np.float64(-0.19579840780747396), 'naucs_at_10_std': np.float64(-0.0588230883206641), 'naucs_at_10_diff1': np.float64(-0.34384957425129475), 'naucs_at_20_max': np.float64(-0.1746776235137422), 'naucs_at_20_std': np.float64(-0.04811434229056555), 'naucs_at_20_diff1': np.float64(-0.41708692070003905), 'naucs_at_50_max': np.float64(-0.167966993303741), 'naucs_at_50_std': np.float64(-0.042923187520787366), 'naucs_at_50_diff1': np.float64(-0.4540246380452276), 'naucs_at_100_max': np.float64(-0.15290268208724309), 'naucs_at_100_std': np.float64(-0.03148299510892002), 'naucs_at_100_diff1': np.float64(-0.4484928211339262)}

However, with batch_query=32 and batch_passage=32, the test results are:

{'ndcg_at_1': 0.44872, 'ndcg_at_3': 0.42337, 'ndcg_at_5': 0.42367, 'ndcg_at_10': 0.42917, 'ndcg_at_20': 0.44455, 'ndcg_at_50': 0.45184, 'ndcg_at_100': 0.45877, 'map_at_1': 0.30481, 'map_at_3': 0.35801, 'map_at_5': 0.37332, 'map_at_10': 0.37951, 'map_at_20': 0.38592, 'map_at_50': 0.38873, 'map_at_100': 0.38945, 'recall_at_1': 0.30481, 'recall_at_3': 0.39487, 'recall_at_5': 0.43494, 'recall_at_10': 0.4603, 'recall_at_20': 0.49607, 'recall_at_50': 0.52685, 'recall_at_100': 0.56531, 'precision_at_1': 0.46154, 'precision_at_3': 0.22436, 'precision_at_5': 0.15385, 'precision_at_10': 0.08846, 'precision_at_20': 0.05385, 'precision_at_50': 0.02385, 'precision_at_100': 0.0125, 'mrr_at_1': 0.5, 'mrr_at_3': 0.5288461538461539, 'mrr_at_5': 0.5375, 'mrr_at_10': 0.5399038461538461, 'mrr_at_20': 0.5429857001972387, 'mrr_at_50': 0.5438218205985764, 'mrr_at_100': 0.544285576306339, 'naucs_at_1_max': np.float64(0.25587549371639967), 'naucs_at_1_std': np.float64(0.014463906636920631), 'naucs_at_1_diff1': np.float64(0.4968858770946776), 'naucs_at_3_max': np.float64(-0.002553332813834949), 'naucs_at_3_std': np.float64(-0.10030853800895284), 'naucs_at_3_diff1': np.float64(0.11776417938443044), 'naucs_at_5_max': np.float64(-0.05793992323759826), 'naucs_at_5_std': np.float64(-0.16493527873739347), 'naucs_at_5_diff1': np.float64(0.0027769146142092027), 'naucs_at_10_max': np.float64(-0.022817220388127064), 'naucs_at_10_std': np.float64(-0.08731661589664137), 'naucs_at_10_diff1': np.float64(-0.10630531273010901), 'naucs_at_20_max': np.float64(-0.013609007863728513), 'naucs_at_20_std': np.float64(-0.05326622089219273), 'naucs_at_20_diff1': np.float64(-0.14949256053059332), 'naucs_at_50_max': np.float64(0.011573411943572498), 'naucs_at_50_std': np.float64(0.002126999025563241), 'naucs_at_50_diff1': np.float64(-0.19579215392936578), 'naucs_at_100_max': np.float64(-0.015311183218817678), 'naucs_at_100_std': np.float64(-0.012216797767884598), 'naucs_at_100_diff1': np.float64(-0.18778334725455362)}
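
For the second run, the only change relative to the reproduction code below is the batch sizes passed to evaluate_dataset:

# Identical setup to the reproduction code below; only the batch sizes differ.
metrics_dataset_beir = vidore_evaluator_beir.evaluate_dataset(
    ds=ds,
    batch_query=32,
    batch_passage=32,
    batch_score=128,
)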

Reproduction Code:

import torch
from colpali_engine.models import ColQwen2_5, ColQwen2_5_Processor
from colpali_engine.utils.torch_utils import get_torch_device
from datasets import load_dataset
from tqdm import tqdm

from vidore_benchmark.evaluation.vidore_evaluators import ViDoReEvaluatorBEIR
from vidore_benchmark.retrievers import VisionRetriever

model_name = "local_path_to/colqwen2.5-v0.2"
device = get_torch_device("auto")

# Load the model
model = ColQwen2_5.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=device,
).eval()
processor = ColQwen2_5_Processor.from_pretrained(
    model_name,
    max_num_visual_tokens=768
)

# Get retriever instance
vision_retriever = VisionRetriever(model=model, processor=processor)

# Evaluate on a single BEIR-format dataset (e.g. one of the ViDoRe Benchmark 2 datasets)
vidore_evaluator_beir = ViDoReEvaluatorBEIR(vision_retriever)
ds = {
    "corpus": load_dataset("vidore/esg_reports_human_labeled_v2", name="corpus", split="test"),
    "queries": load_dataset("vidore/esg_reports_human_labeled_v2", name="queries", split="test"),
    "qrels": load_dataset("vidore/esg_reports_human_labeled_v2", name="qrels", split="test"),
}

metrics_dataset_beir = vidore_evaluator_beir.evaluate_dataset(
    ds=ds,
    batch_query=4,
    batch_passage=4,
    batch_score=128,
)
print(metrics_dataset_beir)
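
To check whether the embeddings themselves change with batch size (rather than something in the scoring step), a rough diagnostic sketch along these lines could help isolate the problem. It reuses the model and processor loaded above; the "image" column name for the corpus is an assumption about the dataset schema, and I have not verified this exact snippet:

# Diagnostic sketch (untested): embed the same images with two different
# batch sizes and compare the per-image embeddings directly.
corpus_sample = ds["corpus"].select(range(8))
images = list(corpus_sample["image"])  # assumed column name

def embed_images(images, batch_size):
    embeddings = []
    with torch.no_grad():
        for i in tqdm(range(0, len(images), batch_size)):
            batch = processor.process_images(images[i:i + batch_size]).to(model.device)
            embeddings.extend(torch.unbind(model(**batch).cpu()))
    return embeddings

embs_bs4 = embed_images(images, batch_size=4)
embs_bs8 = embed_images(images, batch_size=8)

# If batching affects padding/resizing, shapes or values will differ per image.
for e4, e8 in zip(embs_bs4, embs_bs8):
    if e4.shape != e8.shape:
        print("shape mismatch:", e4.shape, e8.shape)
    else:
        print("max abs diff:", (e4 - e8).abs().max().item())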
