chore: add single metric for llm predefined (#164)
MaksymAI authored Feb 27, 2025
1 parent e9b4e54 commit 782a176
Showing 6 changed files with 467 additions and 376 deletions.
80 changes: 60 additions & 20 deletions dynamiq/evaluations/metrics/answer_correctness.py
@@ -1,5 +1,4 @@
from pydantic import BaseModel, Field, PrivateAttr, model_validator

from dynamiq.evaluations import BaseEvaluator
from dynamiq.evaluations.llm_evaluator import LLMEvaluator
from dynamiq.nodes.llms import BaseLLM
@@ -10,7 +9,6 @@ class ExtractStatementsInput(BaseModel):
"""
Input model for extracting candidate statements.
"""

question: str = Field(description="The question to answer")
answer: str = Field(description="The answer to the question")

@@ -19,15 +17,13 @@ class ExtractStatementsOutput(BaseModel):
"""
Output model for extracted candidate statements.
"""

statements: list[str] = Field(description="The generated statements")


class ClassifyStatementInput(BaseModel):
"""
Input model for classifying a candidate pair.
"""

question: str = Field(description="The question for context")
answer_statement: str = Field(description="A candidate statement from the answer")
ground_truth_statement: str = Field(
@@ -39,7 +35,6 @@ class ClassifyStatementOutput(BaseModel):
"""
Output model for classifying a candidate pair.
"""

match: bool = Field(
description=("Verdict: true if the core fact of the statement is supported by the ground truth")
)
@@ -62,11 +57,21 @@ def check_equal_length(self):
return self


class AnswerCorrectnessRunSingleInput(BaseModel):
"""
Single-run input model for answer correctness evaluation.
"""

question: str = Field(description="The question to answer")
answer: str = Field(description="The answer to the question")
ground_truth_answer: str = Field(description="The ground truth answer")
verbose: bool = False


class F1Result(BaseModel):
"""
Model for F1 score calculation.
"""

precision: float
recall: float
f1: float
@@ -79,7 +84,6 @@ class RunResult(BaseModel):
"""
Result containing final score and detailed, user-friendly reasoning.
"""

score: float
reasoning: str

@@ -88,7 +92,6 @@ class RunOutput(BaseModel):
"""
Output model for final scores and detailed reasoning.
"""

results: list[RunResult]


@@ -317,7 +320,7 @@ def _build_reasoning(
"Overview:",
" The evaluator splits the answer and the ground truth answer into core fact statements.",
" Each statement from the answer is compared to the ground truth answer to determine if",
" the core fact is supported. Similarly, the ground truth statements are checked for their",
" the core fact is supported. Similarly, ground truth statements are checked for their",
" presence in the answer. '✅' indicates support/presence, while '❌' indicates lack thereof.",
"",
"1. Answer Statements Analysis:",
@@ -404,6 +407,36 @@ def _evaluate_question(self, question: str, answer_stmts: list[str], gt_stmts: l
reasoning = self._build_reasoning(ans_class, gt_class, tp, fp, fn, precision, recall, f1)
return RunResult(score=round(f1, 2), reasoning=reasoning)

def run_single(self, question: str, answer: str, ground_truth_answer: str, verbose: bool = False) -> RunResult:
"""
Evaluate answer correctness for a single sample.
Steps:
1) Extract candidate statements from both the answer and the ground truth answer.
2) Compare the candidate statements.
3) Compute Precision, Recall, and F1 Score.
4) Generate detailed reasoning.
Args:
question (str): The question.
answer (str): The answer.
ground_truth_answer (str): The ground truth answer.
verbose (bool): Flag to output verbose logs.
Returns:
RunResult: The evaluation result with score and reasoning.
"""
# Extract candidate statements for answer and ground truth
ans_candidates = self.extract_statements([question], [answer])[0]
gt_candidates = self.extract_statements([question], [ground_truth_answer])[0]
result = self._evaluate_question(question, ans_candidates, gt_candidates)
if verbose:
logger.debug(f"Question: {question}")
logger.debug(f"Answer: {self._join_candidates(ans_candidates)}")
logger.debug(f"Ground Truth Answer: {self._join_candidates(gt_candidates)}")
logger.debug(result.reasoning)
return result

def run(
self, questions: list[str], answers: list[str], ground_truth_answers: list[str], verbose: bool = False
) -> RunOutput:
@@ -414,19 +447,26 @@ def run(
and vice versa.
3) Compute Precision, Recall, and F1 Score.
4) Generate detailed and easy-to-understand reasoning that explains the metrics.
Args:
questions (list[str]): List of questions.
answers (list[str]): List of answers.
ground_truth_answers (list[str]): List of ground truth answers.
verbose (bool): Flag for verbose logging.
Returns:
RunOutput: The overall evaluation results.
"""
run_in = RunInput(
run_input = RunInput(
questions=questions, answers=answers, ground_truth_answers=ground_truth_answers, verbose=verbose
)
ans_candidates = self.extract_statements(run_in.questions, run_in.answers)
gt_candidates = self.extract_statements(run_in.questions, run_in.ground_truth_answers)
out_results = []
for i, question in enumerate(run_in.questions):
res = self._evaluate_question(question, ans_candidates[i], gt_candidates[i])
if verbose:
logger.debug(f"Question: {question}")
logger.debug(f"Ground Truth Answer: {self._join_candidates(gt_candidates[i])}")
logger.debug(f"Answer: {self._join_candidates(ans_candidates[i])}")
logger.debug(res.reasoning)
out_results.append(res)
for i in range(len(run_input.questions)):
result = self.run_single(
question=run_input.questions[i],
answer=run_input.answers[i],
ground_truth_answer=run_input.ground_truth_answers[i],
verbose=run_input.verbose,
)
out_results.append(result)
return RunOutput(results=out_results)
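
The net effect of this file's change is that single-sample evaluation becomes a first-class call and the batch run() delegates to it. A minimal usage sketch follows; the evaluator class name and its llm constructor argument are inferred from the imports above rather than shown in this diff, so treat them as assumptions:

# Hypothetical usage of the single-sample API added in this commit.
# AnswerCorrectnessEvaluator and the llm= constructor argument are assumptions;
# only the run_single/run signatures come from the diff above.
from dynamiq.evaluations.metrics.answer_correctness import AnswerCorrectnessEvaluator

evaluator = AnswerCorrectnessEvaluator(llm=my_llm)  # my_llm: a configured BaseLLM node (placeholder)

# Single sample: extracts statements from the answer and the ground truth, classifies them,
# and returns a RunResult with .score (F1 rounded to 2 decimals) and .reasoning.
single = evaluator.run_single(
    question="What is the capital of France?",
    answer="Paris is the capital of France.",
    ground_truth_answer="The capital of France is Paris.",
    verbose=True,
)
print(single.score, single.reasoning)

# Batch: run() now loops over run_single() and wraps the results in RunOutput.
batch = evaluator.run(
    questions=["What is the capital of France?"],
    answers=["Paris is the capital of France."],
    ground_truth_answers=["The capital of France is Paris."],
)
print(batch.results[0].score)
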
170 changes: 86 additions & 84 deletions dynamiq/evaluations/metrics/context_precision.py
@@ -1,5 +1,4 @@
from pydantic import BaseModel, PrivateAttr, field_validator, model_validator

from dynamiq.evaluations import BaseEvaluator
from dynamiq.evaluations.llm_evaluator import LLMEvaluator
from dynamiq.nodes.llms import BaseLLM
@@ -23,15 +22,15 @@ class ContextPrecisionInput(BaseModel):
verbose: bool = False

@field_validator("contexts_list", mode="before")
def normalize_contexts_list(cls, v):
def normalize_contexts_list(cls, value):
# If the user provides a list[str], wrap it into [list[str]].
# If the user provides a list[list[str]], leave as-is.
# Otherwise, raise an error.
if isinstance(v, list):
if all(isinstance(item, str) for item in v):
return [v] # e.g. ["foo", "bar"] becomes [["foo", "bar"]]
if all(isinstance(item, list) and all(isinstance(x, str) for x in item) for item in v):
return v
if isinstance(value, list):
if all(isinstance(item, str) for item in value):
return [value] # e.g. ["foo", "bar"] becomes [["foo", "bar"]]
if all(isinstance(item, list) and all(isinstance(x, str) for x in item) for item in value):
return value
raise ValueError("contexts_list must be either a list of strings or a list of list of strings.")

@model_validator(mode="after")
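
As an aside, the normalization rule in the validator above is easiest to see on concrete values. Here is a standalone sketch of the same logic (illustration only, not the Pydantic model itself):

# Standalone restatement of normalize_contexts_list for illustration.
def normalize_contexts_list(value):
    if isinstance(value, list):
        # A flat list of strings is treated as the contexts of a single question.
        if all(isinstance(item, str) for item in value):
            return [value]
        # A list of per-question context lists is passed through unchanged.
        if all(isinstance(item, list) and all(isinstance(x, str) for x in item) for item in value):
            return value
    raise ValueError("contexts_list must be either a list of strings or a list of list of strings.")

assert normalize_contexts_list(["foo", "bar"]) == [["foo", "bar"]]
assert normalize_contexts_list([["foo", "bar"], ["baz"]]) == [["foo", "bar"], ["baz"]]
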
@@ -77,10 +76,10 @@ class VerdictResult(BaseModel):

@field_validator("verdict")
@classmethod
def validate_verdict(cls, v):
if v not in (0, 1):
def validate_verdict(cls, value):
if value not in (0, 1):
raise ValueError("Verdict must be either 0 or 1.")
return v
return value


class ContextPrecisionEvaluator(BaseEvaluator):
@@ -208,11 +207,11 @@ def calculate_average_precision(verdicts: list[int]) -> float:
total_relevant = sum(verdicts)
if total_relevant == 0:
return 0.0
for i, verdict in enumerate(verdicts):
for index, verdict in enumerate(verdicts):
if verdict == 1:
cumulative_hits += 1
precision_at_i = cumulative_hits / (i + 1)
numerator += precision_at_i
precision_at_index = cumulative_hits / (index + 1)
numerator += precision_at_index
average_precision = numerator / total_relevant
return round(float(average_precision), 2)
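
For reference, the loop above computes ordinary average precision over the ordered verdicts. A hand-traced example with made-up data:

# Illustrative trace of calculate_average_precision (example data, not from the diff).
verdicts = [1, 0, 1]                                # contexts ranked 1st and 3rd judged useful
cumulative_hits, numerator = 0, 0.0
for index, verdict in enumerate(verdicts):
    if verdict == 1:
        cumulative_hits += 1
        numerator += cumulative_hits / (index + 1)  # precision@1 = 1/1, precision@3 = 2/3
average_precision = numerator / sum(verdicts)       # (1.0 + 0.6667) / 2
print(round(average_precision, 2))                  # 0.83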

@@ -244,21 +243,72 @@ def _build_reasoning(
Returns:
str: Detailed reasoning.
"""
reasoning_strs = ["Reasoning:", "", f"Question: {question}", f"Answer: {answer}", "", "Context Evaluations:"]

for ctx, v, detail in zip(contexts, verdicts, verdict_details):
mark = "✅" if v == 1 else "❌"
reasoning_strs.extend(
[f" - Context: {ctx}", f" Verdict: {mark} (value: {v})", f" Explanation: {detail}", ""]
reasoning_strings = ["Reasoning:", "", f"Question: {question}", f"Answer: {answer}", "", "Context Evaluations:"]
for context_text, verdict_value, detail in zip(contexts, verdicts, verdict_details):
verdict_mark = "✅" if verdict_value == 1 else "❌"
reasoning_strings.extend(
[
f" - Context: {context_text}",
f" Verdict: {verdict_mark} (value: {verdict_value})",
f" Explanation: {detail}",
"",
]
)
reasoning_strings.append(f"Average Precision Score = {average_precision:.2f}")
return "\n".join(reasoning_strings)

reasoning_strs.extend(
[
f"Average Precision Score = {average_precision:.2f}",
]
)
return "\n".join(reasoning_strs)

def run_single(
self, question: str, answer: str, contexts: list[str], verbose: bool = False
) -> ContextPrecisionRunResult:
"""
Evaluate the context precision for a single sample.
Args:
question (str): The question.
answer (str): The corresponding answer.
contexts (list[str]): A list of contexts for this question.
verbose (bool): Flag to enable verbose logging.
Returns:
ContextPrecisionRunResult: Contains the computed average precision score and detailed reasoning.
"""
verdicts = []
verdict_details = []
for context in contexts:
evaluation_result = self._context_precision_evaluator.run(
question=[question], answer=[answer], context=[context]
)
if ("results" not in evaluation_result) or (not evaluation_result["results"]):
default_verdict = 0
verdicts.append(default_verdict)
verdict_details.append("No results returned from evaluator.")
if verbose:
logger.debug(f"Missing results for context: {context}. Defaulting verdict to {default_verdict}.")
continue

result_item = evaluation_result["results"][0]
verdict_raw = result_item.get("verdict", "0")
try:
verdict = int(verdict_raw) if not isinstance(verdict_raw, str) else int(verdict_raw.strip())
except (ValueError, AttributeError):
verdict = 0
verdicts.append(verdict)
verdict_details.append(result_item.get("reason", "No reasoning provided"))

if verbose:
logger.debug(f"Question: {question}")
logger.debug(f"Answer: {answer}")
logger.debug(f"Context: {context}")
logger.debug(f"Verdict: {verdict}")
logger.debug(f"Reason: {result_item.get('reason', 'No reasoning provided')}")
logger.debug("-" * 50)

average_precision = self.calculate_average_precision(verdicts)
reasoning_text = self._build_reasoning(question, answer, contexts, verdicts, verdict_details, average_precision)
if verbose:
logger.debug(f"Average Precision Score: {average_precision}")
logger.debug("=" * 50)
return ContextPrecisionRunResult(score=average_precision, reasoning=reasoning_text)

def run(
self,
@@ -280,67 +330,19 @@ def run(
Returns:
ContextPrecisionOutput: Contains a list of context precision scores and reasoning.
"""
# Pass everything to the Pydantic model.
input_data = ContextPrecisionInput(
run_input = ContextPrecisionInput(
questions=questions,
answers=answers,
contexts_list=contexts_list,
verbose=verbose,
)

results_out = []
for idx in range(len(input_data.questions)):
question = input_data.questions[idx]
answer = input_data.answers[idx]
contexts = input_data.contexts_list[idx]

verdicts = []
verdict_details = []
for context in contexts:
# Prepare inputs for the evaluator.
result = self._context_precision_evaluator.run(
question=[question],
answer=[answer],
context=[context],
)
# Check if results are present.
if ("results" not in result) or (not result["results"]):
# if no results are returned, assign a default verdict and note in details.
default_verdict = 0
verdicts.append(default_verdict)
verdict_details.append("No results returned from evaluator.")
if input_data.verbose:
logger.debug(
f"Missing results for context: {context}. Defaulting verdict to {default_verdict}."
)
continue

res = result["results"][0]
verdict_raw = res.get("verdict", "0")
try:
# Convert verdict to int.
verdict = int(verdict_raw) if not isinstance(verdict_raw, str) else int(verdict_raw.strip())
except (ValueError, AttributeError):
# In case conversion fails, default the verdict.
verdict = 0
verdicts.append(verdict)
verdict_details.append(res.get("reason", "No reasoning provided"))

if input_data.verbose:
logger.debug(f"Question: {question}")
logger.debug(f"Answer: {answer}")
logger.debug(f"Context: {context}")
logger.debug(f"Verdict: {verdict}")
logger.debug(f"Reason: {res.get('reason', 'No reasoning provided')}")
logger.debug("-" * 50)

avg_precision = self.calculate_average_precision(verdicts)

reasoning_str = self._build_reasoning(question, answer, contexts, verdicts, verdict_details, avg_precision)

results_out.append(ContextPrecisionRunResult(score=avg_precision, reasoning=reasoning_str))
if input_data.verbose:
logger.debug(f"Average Precision Score: {avg_precision}")
logger.debug("=" * 50)

return ContextPrecisionOutput(results=results_out)
results_output = []
for index in range(len(run_input.questions)):
question = run_input.questions[index]
answer = run_input.answers[index]
contexts = run_input.contexts_list[index]
result_single = self.run_single(
question=question, answer=answer, contexts=contexts, verbose=run_input.verbose
)
results_output.append(result_single)
return ContextPrecisionOutput(results=results_output)
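
A short usage sketch of the new per-sample entry point in this file; as before, the constructor argument and the sample data are illustrative assumptions, while the run_single/run signatures come from the diff:

# Hypothetical usage of ContextPrecisionEvaluator.run_single added in this commit.
from dynamiq.evaluations.metrics.context_precision import ContextPrecisionEvaluator

evaluator = ContextPrecisionEvaluator(llm=my_llm)  # my_llm: a configured BaseLLM node (placeholder)

result = evaluator.run_single(
    question="Who wrote Hamlet?",
    answer="Hamlet was written by William Shakespeare.",
    contexts=[
        "William Shakespeare wrote the tragedy Hamlet around 1600.",
        "The Globe Theatre in London was rebuilt in 1997.",
    ],
    verbose=False,
)
print(result.score)      # average precision over the per-context verdicts
print(result.reasoning)  # per-context verdict breakdown plus the final score

# run() keeps its batch shape and now delegates to run_single per question.
batch = evaluator.run(
    questions=["Who wrote Hamlet?"],
    answers=["Hamlet was written by William Shakespeare."],
    contexts_list=[["William Shakespeare wrote the tragedy Hamlet around 1600."]],
)
print(batch.results[0].score)
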

1 comment on commit 782a176

@github-actions

Coverage Report

File                          Stmts   Miss   Cover   Missing
dynamiq/evaluations/metrics
   answer_correctness.py        144      9     93%   56, 260, 282, 348, 375, 434–437
   context_precision.py         111     21     81%   34, 40, 80–82, 282–287, 293–294, 299–304, 309–310
   context_recall.py             98     15     84%   33, 38, 43, 64, 239–240, 244–245, 265–271
   factual_correctness.py       175     34     80%   77, 82, 90, 258, 305, 375–376, 378–379, 414–416, 422–424, 431–433, 443–444, 447–448, 453–454, 457, 459, 477–484
   faithfulness.py              153     22     85%   24, 52, 73, 110, 115, 120, 314, 423–425, 432–434, 451–459
TOTAL                          11938   3630    69%

Tests   Skipped   Failures   Errors   Time
  416      0 💤       0 ❌      0 🔥   46.948s ⏱️
