
Unify llm judges into a single prepare file #1696

Open · wants to merge 2 commits into base: main

This file was deleted.

108 changes: 79 additions & 29 deletions prepare/metrics/llm_as_judge/llm_as_judge.py
@@ -1,4 +1,4 @@
from typing import Union
from typing import Optional, Union

from unitxt import add_to_catalog, get_logger
from unitxt.inference import CrossProviderInferenceEngine
@@ -8,6 +8,7 @@
EVALUATOR_TO_MODEL_ID,
EVALUATORS_METADATA,
PAIRWISE_CRITERIA,
EvaluatorMetadata,
EvaluatorNameEnum,
EvaluatorTypeEnum,
ModelProviderEnum,
@@ -16,17 +17,24 @@

logger = get_logger()


def get_evaluator(
name: EvaluatorNameEnum,
evaluator_type: EvaluatorTypeEnum,
provider: ModelProviderEnum,
provider: Optional[ModelProviderEnum] = None,
evaluator_params: Optional[dict] = None,
) -> Union[LLMJudgeDirect, LLMJudgePairwise]:
evaluator_metadata = get_evaluator_metadata(name)
inference_params = {"max_tokens": 1024, "seed": 42, "temperature": 0, "provider": provider.value}
inference_params = {
"max_tokens": 1024,
"seed": 42,
"temperature": 0,
}
if provider is not None:
inference_params["provider"] = provider.value

model_name = EVALUATOR_TO_MODEL_ID[name]

if provider == ModelProviderEnum.AZURE_OPENAI:
if provider is not None and provider == ModelProviderEnum.AZURE_OPENAI:
inference_params["credentials"] = {}
inference_params["credentials"]["api_base"] = (
f"https://eteopenai.azure-api.net/openai/deployments/{model_name}/chat/completions?api-version=2024-08-01-preview"
@@ -42,6 +50,9 @@ def get_evaluator(
"generate_summaries": False,
}

if evaluator_params is not None:
params.update(evaluator_params)

evaluator_klass = (
LLMJudgeDirect
if evaluator_type == EvaluatorTypeEnum.DIRECT
@@ -51,6 +62,28 @@ def get_evaluator(
return evaluator_klass(**params)


def get_evaluator_catalog_name(
evaluator_metadata: EvaluatorMetadata,
provider: ModelProviderEnum,
prefix: str = "",
):
metric_name = (
evaluator_metadata.name.value.lower()
.replace("-", "_")
.replace(".", "_")
.replace(" ", "_")
)
provider_name = ""
# for backward compatibility; ideally we would use CrossProviderInferenceEngine's provider ids
if provider == ModelProviderEnum.AZURE_OPENAI:
provider_name = "azure_openai"
elif provider == ModelProviderEnum.OPENAI:
provider_name = "openai"
else:
provider_name = provider.value.lower()
return f"metrics.{prefix}.{provider_name}.{metric_name}"


logger.debug("Registering criteria...")
# Register all the predefined criteria
for criteria in DIRECT_CRITERIA:
@@ -67,36 +100,53 @@ def get_evaluator(
overwrite=True,
)

logger.debug("Registering evaluators...")

logger.debug("Registering generic judges (no criterion is set)...")
for evaluator_metadata in EVALUATORS_METADATA:
for provider in evaluator_metadata.providers:
for evaluator_type in [
EvaluatorTypeEnum.DIRECT,
EvaluatorTypeEnum.PAIRWISE,
]:
evaluator = get_evaluator(
name=evaluator_metadata.name,
evaluator_type=evaluator_type,
provider=provider,
)

metric_name = (
evaluator_metadata.name.value.lower()
.replace("-", "_")
.replace(".", "_")
.replace(" ", "_")
)
provider_name = ""
# for backward compatibility, ideally we would use cross inference engines provider ids
if provider == ModelProviderEnum.AZURE_OPENAI:
provider_name = "azure_openai"
elif provider == ModelProviderEnum.OPENAI:
provider_name = "openai"
else:
provider_name = provider.value.lower()

add_to_catalog(
evaluator,
f"metrics.llm_as_judge.{evaluator_type.value}.{provider_name}.{metric_name}",
get_evaluator(
name=evaluator_metadata.name,
evaluator_type=evaluator_type,
provider=provider,
),
get_evaluator_catalog_name(evaluator_metadata, provider, f"llm_as_judge.{evaluator_type.value}"),
overwrite=True,
)

logger.debug("Registering judges with a specific criterion...")
add_to_catalog(
get_evaluator(
name=EvaluatorNameEnum.LLAMA3_3_70B,
evaluator_type=EvaluatorTypeEnum.DIRECT,
# provider=ModelProviderEnum.WATSONX,
evaluator_params={
"criteria": "metrics.llm_as_judge.direct.criteria.adherence_with_format",
"context_fields": {
"question": "question",
"instructions": "metadata/template/instruction",
},
},
),
"metrics.rag.response_generation.adherence_with_format.llama_3_3_70b_instruct_judge",
overwrite=True,
)


add_to_catalog(
get_evaluator(
name=EvaluatorNameEnum.LLAMA3_3_70B,
evaluator_type=EvaluatorTypeEnum.DIRECT,
# provider=ModelProviderEnum.WATSONX,
evaluator_params={
"criteria": "metrics.llm_as_judge.direct.criteria.answer_completeness",
"context_fields": {"question": "question", "reference_answers": "reference_answers"},
},
),
"metrics.rag.response_generation.answer_completeness.llama_3_3_70b_instruct_judge",
overwrite=True,
)
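
For orientation, here is a minimal usage sketch of the two helpers unified above. It is not part of this PR: it assumes the snippet sits at the bottom of this prepare file, so get_evaluator, get_evaluator_catalog_name, get_evaluator_metadata, the logger, and the imported enums are already in scope, and the exact catalog names it logs depend on the enum values defined in unitxt.

# Hedged sketch (not part of this PR): build one more criterion-specific judge
# with the unified helper and log the catalog names it would be registered under.
# Assumes it runs at the bottom of llm_as_judge.py, so the imports and helpers
# defined above are available.
judge = get_evaluator(
    name=EvaluatorNameEnum.LLAMA3_3_70B,
    evaluator_type=EvaluatorTypeEnum.DIRECT,
    # provider is left unset, as in the criterion-specific registrations above
    evaluator_params={
        # criterion registered earlier by the loop over DIRECT_CRITERIA
        "criteria": "metrics.llm_as_judge.direct.criteria.adherence_with_format",
    },
)

evaluator_metadata = get_evaluator_metadata(EvaluatorNameEnum.LLAMA3_3_70B)
for provider in evaluator_metadata.providers:
    # e.g. metrics.llm_as_judge.direct.<provider>.<model>; the exact segments
    # come from get_evaluator_catalog_name above
    logger.info(
        get_evaluator_catalog_name(
            evaluator_metadata, provider, "llm_as_judge.direct"
        )
    )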
@@ -2,55 +2,16 @@
"__type__": "llm_judge_direct",
"inference_engine": {
"__type__": "cross_provider_inference_engine",
"model": "llama-3-3-70b-instruct",
"max_tokens": 1024,
"seed": 42,
"temperature": 0,
"provider": "watsonx"
},
"criteria": {
"__type__": "criteria_with_options",
"name": "adherence_with_format",
"description": "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
"options": [
{
"__type__": "criteria_option",
"name": "Excellent",
"description": "The response perfectly aligns with the requested structure, style, or format, with no deviations."
},
{
"__type__": "criteria_option",
"name": "Good",
"description": "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability."
},
{
"__type__": "criteria_option",
"name": "mediocre",
"description": "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present."
},
{
"__type__": "criteria_option",
"name": "Bad",
"description": "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence."
},
{
"__type__": "criteria_option",
"name": "Very Bad",
"description": "The response fails to align with the requested structure, style, or format."
}
],
"option_map": {
"Excellent": 1.0,
"Good": 0.75,
"mediocre": 0.5,
"Bad": 0.25,
"Very Bad": 0
}
"model": "llama-3-3-70b-instruct"
},
"evaluator_name": "LLAMA3_3_70B",
"generate_summaries": false,
"criteria": "metrics.llm_as_judge.direct.criteria.adherence_with_format",
"context_fields": {
"question": "question",
"instructions": "metadata/template/instruction"
},
"criteria_field": "criteria",
"generate_summaries": false,
"check_positional_bias": false
}
}
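
In the regenerated catalog entry above, the inlined criteria definition is replaced by a catalog reference and the extra inference-engine parameters are no longer serialized. Below is a small hedged sketch of how the entry could be inspected after registration; fetch_artifact, its import path, and its (artifact, catalog) return value are assumptions based on unitxt's usual API, not something this PR adds.

# Hedged sketch (not part of this PR): load the catalog entry shown above and
# check that it deserializes into an LLMJudgeDirect whose criteria comes from
# the referenced catalog item rather than an inlined definition.
from unitxt.artifact import fetch_artifact  # assumed import path

judge, _ = fetch_artifact(
    "metrics.rag.response_generation.adherence_with_format.llama_3_3_70b_instruct_judge"
)
print(type(judge).__name__)  # expected: LLMJudgeDirect, per the "__type__" field above
print(judge.criteria)        # resolved from the criteria catalog reference in the JSON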