chore: add single metric for llm predefined (#164)
MaksymAI authored Feb 27, 2025
1 parent e9b4e54 commit 782a176
Showing 6 changed files with 467 additions and 376 deletions.
80 changes: 60 additions & 20 deletions dynamiq/evaluations/metrics/answer_correctness.py
@@ -1,5 +1,4 @@
from pydantic import BaseModel, Field, PrivateAttr, model_validator

from dynamiq.evaluations import BaseEvaluator
from dynamiq.evaluations.llm_evaluator import LLMEvaluator
from dynamiq.nodes.llms import BaseLLM
@@ -10,7 +9,6 @@ class ExtractStatementsInput(BaseModel):
"""
Input model for extracting candidate statements.
"""

question: str = Field(description="The question to answer")
answer: str = Field(description="The answer to the question")

@@ -19,15 +17,13 @@ class ExtractStatementsOutput(BaseModel):
"""
Output model for extracted candidate statements.
"""

statements: list[str] = Field(description="The generated statements")


class ClassifyStatementInput(BaseModel):
"""
Input model for classifying a candidate pair.
"""

question: str = Field(description="The question for context")
answer_statement: str = Field(description="A candidate statement from the answer")
ground_truth_statement: str = Field(
@@ -39,7 +35,6 @@ class ClassifyStatementOutput(BaseModel):
"""
Output model for classifying a candidate pair.
"""

match: bool = Field(
description=("Verdict: true if the core fact of the statement is supported by the ground truth")
)
@@ -62,11 +57,21 @@ def check_equal_length(self):
return self


class AnswerCorrectnessRunSingleInput(BaseModel):
"""
Single-run input model for answer correctness evaluation.
"""

question: str = Field(description="The question to answer")
answer: str = Field(description="The answer to the question")
ground_truth_answer: str = Field(description="The ground truth answer")
verbose: bool = False


class F1Result(BaseModel):
"""
Model for F1 score calculation.
"""

precision: float
recall: float
f1: float
@@ -79,7 +84,6 @@ class RunResult(BaseModel):
"""
Result containing final score and detailed, user-friendly reasoning.
"""

score: float
reasoning: str

@@ -88,7 +92,6 @@ class RunOutput(BaseModel):
"""
Output model for final scores and detailed reasoning.
"""

results: list[RunResult]


@@ -317,7 +320,7 @@ def _build_reasoning(
"Overview:",
" The evaluator splits the answer and the ground truth answer into core fact statements.",
" Each statement from the answer is compared to the ground truth answer to determine if",
" the core fact is supported. Similarly, the ground truth statements are checked for their",
" the core fact is supported. Similarly, ground truth statements are checked for their",
" presence in the answer. '✅' indicates support/presence, while '❌' indicates lack thereof.",
"",
"1. Answer Statements Analysis:",
@@ -404,6 +407,36 @@ def _evaluate_question(self, question: str, answer_stmts: list[str], gt_stmts: l
reasoning = self._build_reasoning(ans_class, gt_class, tp, fp, fn, precision, recall, f1)
return RunResult(score=round(f1, 2), reasoning=reasoning)

def run_single(self, question: str, answer: str, ground_truth_answer: str, verbose: bool = False) -> RunResult:
"""
Evaluate answer correctness for a single sample.
Steps:
1) Extract candidate statements from both the answer and the ground truth answer.
2) Compare the candidate statements.
3) Compute Precision, Recall, and F1 Score.
4) Generate detailed reasoning.
Args:
question (str): The question.
answer (str): The answer.
ground_truth_answer (str): The ground truth answer.
verbose (bool): Flag to output verbose logs.
Returns:
RunResult: The evaluation result with score and reasoning.
"""
# Extract candidate statements for answer and ground truth
ans_candidates = self.extract_statements([question], [answer])[0]
gt_candidates = self.extract_statements([question], [ground_truth_answer])[0]
result = self._evaluate_question(question, ans_candidates, gt_candidates)
if verbose:
logger.debug(f"Question: {question}")
logger.debug(f"Answer: {self._join_candidates(ans_candidates)}")
logger.debug(f"Ground Truth Answer: {self._join_candidates(gt_candidates)}")
logger.debug(result.reasoning)
return result

def run(
self, questions: list[str], answers: list[str], ground_truth_answers: list[str], verbose: bool = False
) -> RunOutput:
@@ -414,19 +447,26 @@ def run(
and vice versa.
3) Compute Precision, Recall, and F1 Score.
4) Generate detailed and easy-to-understand reasoning that explains the metrics.
Args:
questions (list[str]): List of questions.
answers (list[str]): List of answers.
ground_truth_answers (list[str]): List of ground truth answers.
verbose (bool): Flag for verbose logging.
Returns:
RunOutput: The overall evaluation results.
"""
run_in = RunInput(
run_input = RunInput(
questions=questions, answers=answers, ground_truth_answers=ground_truth_answers, verbose=verbose
)
ans_candidates = self.extract_statements(run_in.questions, run_in.answers)
gt_candidates = self.extract_statements(run_in.questions, run_in.ground_truth_answers)
out_results = []
for i, question in enumerate(run_in.questions):
res = self._evaluate_question(question, ans_candidates[i], gt_candidates[i])
if verbose:
logger.debug(f"Question: {question}")
logger.debug(f"Ground Truth Answer: {self._join_candidates(gt_candidates[i])}")
logger.debug(f"Answer: {self._join_candidates(ans_candidates[i])}")
logger.debug(res.reasoning)
out_results.append(res)
for i in range(len(run_input.questions)):
result = self.run_single(
question=run_input.questions[i],
answer=run_input.answers[i],
ground_truth_answer=run_input.ground_truth_answers[i],
verbose=run_input.verbose,
)
out_results.append(result)
return RunOutput(results=out_results)
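
The net effect of this file's change is that single-sample evaluation becomes a first-class call and the batch run() delegates to it. A minimal usage sketch follows; the evaluator class name and its llm constructor argument are inferred from the imports above rather than shown in this diff, so treat them as assumptions:

# Hypothetical usage of the single-sample API added in this commit.
# AnswerCorrectnessEvaluator and the llm= constructor argument are assumptions;
# only the run_single/run signatures come from the diff above.
from dynamiq.evaluations.metrics.answer_correctness import AnswerCorrectnessEvaluator

evaluator = AnswerCorrectnessEvaluator(llm=my_llm)  # my_llm: a configured BaseLLM node (placeholder)

# Single sample: extracts statements from the answer and the ground truth, classifies them,
# and returns a RunResult with .score (F1 rounded to 2 decimals) and .reasoning.
single = evaluator.run_single(
    question="What is the capital of France?",
    answer="Paris is the capital of France.",
    ground_truth_answer="The capital of France is Paris.",
    verbose=True,
)
print(single.score, single.reasoning)

# Batch: run() now loops over run_single() and wraps the results in RunOutput.
batch = evaluator.run(
    questions=["What is the capital of France?"],
    answers=["Paris is the capital of France."],
    ground_truth_answers=["The capital of France is Paris."],
)
print(batch.results[0].score)
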
170 changes: 86 additions & 84 deletions dynamiq/evaluations/metrics/context_precision.py
@@ -1,5 +1,4 @@
from pydantic import BaseModel, PrivateAttr, field_validator, model_validator

from dynamiq.evaluations import BaseEvaluator
from dynamiq.evaluations.llm_evaluator import LLMEvaluator
from dynamiq.nodes.llms import BaseLLM
@@ -23,15 +22,15 @@ class ContextPrecisionInput(BaseModel):
verbose: bool = False

@field_validator("contexts_list", mode="before")
def normalize_contexts_list(cls, v):
def normalize_contexts_list(cls, value):
# If the user provides a list[str], wrap it into [list[str]].
# If the user provides a list[list[str]], leave as-is.
# Otherwise, raise an error.
if isinstance(v, list):
if all(isinstance(item, str) for item in v):
return [v] # e.g. ["foo", "bar"] becomes [["foo", "bar"]]
if all(isinstance(item, list) and all(isinstance(x, str) for x in item) for item in v):
return v
if isinstance(value, list):
if all(isinstance(item, str) for item in value):
return [value] # e.g. ["foo", "bar"] becomes [["foo", "bar"]]
if all(isinstance(item, list) and all(isinstance(x, str) for x in item) for item in value):
return value
raise ValueError("contexts_list must be either a list of strings or a list of list of strings.")

@model_validator(mode="after")
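
As an aside, the normalization rule in the validator above is easiest to see on concrete values. Here is a standalone sketch of the same logic (illustration only, not the Pydantic model itself):

# Standalone restatement of normalize_contexts_list for illustration.
def normalize_contexts_list(value):
    if isinstance(value, list):
        # A flat list of strings is treated as the contexts of a single question.
        if all(isinstance(item, str) for item in value):
            return [value]
        # A list of per-question context lists is passed through unchanged.
        if all(isinstance(item, list) and all(isinstance(x, str) for x in item) for item in value):
            return value
    raise ValueError("contexts_list must be either a list of strings or a list of list of strings.")

assert normalize_contexts_list(["foo", "bar"]) == [["foo", "bar"]]
assert normalize_contexts_list([["foo", "bar"], ["baz"]]) == [["foo", "bar"], ["baz"]]
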
@@ -77,10 +76,10 @@ class VerdictResult(BaseModel):

@field_validator("verdict")
@classmethod
def validate_verdict(cls, v):
if v not in (0, 1):
def validate_verdict(cls, value):
if value not in (0, 1):
raise ValueError("Verdict must be either 0 or 1.")
return v
return value


class ContextPrecisionEvaluator(BaseEvaluator):
@@ -208,11 +207,11 @@ def calculate_average_precision(verdicts: list[int]) -> float:
total_relevant = sum(verdicts)
if total_relevant == 0:
return 0.0
for i, verdict in enumerate(verdicts):
for index, verdict in enumerate(verdicts):
if verdict == 1:
cumulative_hits += 1
precision_at_i = cumulative_hits / (i + 1)
numerator += precision_at_i
precision_at_index = cumulative_hits / (index + 1)
numerator += precision_at_index
average_precision = numerator / total_relevant
return round(float(average_precision), 2)
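
For reference, the loop above computes ordinary average precision over the ordered verdicts. A hand-traced example with made-up data:

# Illustrative trace of calculate_average_precision (example data, not from the diff).
verdicts = [1, 0, 1]                                # contexts ranked 1st and 3rd judged useful
cumulative_hits, numerator = 0, 0.0
for index, verdict in enumerate(verdicts):
    if verdict == 1:
        cumulative_hits += 1
        numerator += cumulative_hits / (index + 1)  # precision@1 = 1/1, precision@3 = 2/3
average_precision = numerator / sum(verdicts)       # (1.0 + 0.6667) / 2
print(round(average_precision, 2))                  # 0.83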

@@ -244,21 +243,72 @@ def _build_reasoning(
Returns:
str: Detailed reasoning.
"""
reasoning_strs = ["Reasoning:", "", f"Question: {question}", f"Answer: {answer}", "", "Context Evaluations:"]

for ctx, v, detail in zip(contexts, verdicts, verdict_details):
mark = "✅" if v == 1 else "❌"
reasoning_strs.extend(
[f" - Context: {ctx}", f" Verdict: {mark} (value: {v})", f" Explanation: {detail}", ""]
reasoning_strings = ["Reasoning:", "", f"Question: {question}", f"Answer: {answer}", "", "Context Evaluations:"]
for context_text, verdict_value, detail in zip(contexts, verdicts, verdict_details):
verdict_mark = "✅" if verdict_value == 1 else "❌"
reasoning_strings.extend(
[
f" - Context: {context_text}",
f" Verdict: {verdict_mark} (value: {verdict_value})",
f" Explanation: {detail}",
"",
]
)
reasoning_strings.append(f"Average Precision Score = {average_precision:.2f}")
return "\n".join(reasoning_strings)

reasoning_strs.extend(
[
f"Average Precision Score = {average_precision:.2f}",
]
)
return "\n".join(reasoning_strs)

def run_single(
self, question: str, answer: str, contexts: list[str], verbose: bool = False
) -> ContextPrecisionRunResult:
"""
Evaluate the context precision for a single sample.
Args:
question (str): The question.
answer (str): The corresponding answer.
contexts (list[str]): A list of contexts for this question.
verbose (bool): Flag to enable verbose logging.
Returns:
ContextPrecisionRunResult: Contains the computed average precision score and detailed reasoning.
"""
verdicts = []
verdict_details = []
for context in contexts:
evaluation_result = self._context_precision_evaluator.run(
question=[question], answer=[answer], context=[context]
)
if ("results" not in evaluation_result) or (not evaluation_result["results"]):
default_verdict = 0
verdicts.append(default_verdict)
verdict_details.append("No results returned from evaluator.")
if verbose:
logger.debug(f"Missing results for context: {context}. Defaulting verdict to {default_verdict}.")
continue

result_item = evaluation_result["results"][0]
verdict_raw = result_item.get("verdict", "0")
try:
verdict = int(verdict_raw) if not isinstance(verdict_raw, str) else int(verdict_raw.strip())
except (ValueError, AttributeError):
verdict = 0
verdicts.append(verdict)
verdict_details.append(result_item.get("reason", "No reasoning provided"))

if verbose:
logger.debug(f"Question: {question}")
logger.debug(f"Answer: {answer}")
logger.debug(f"Context: {context}")
logger.debug(f"Verdict: {verdict}")
logger.debug(f"Reason: {result_item.get('reason', 'No reasoning provided')}")
logger.debug("-" * 50)

average_precision = self.calculate_average_precision(verdicts)
reasoning_text = self._build_reasoning(question, answer, contexts, verdicts, verdict_details, average_precision)
if verbose:
logger.debug(f"Average Precision Score: {average_precision}")
logger.debug("=" * 50)
return ContextPrecisionRunResult(score=average_precision, reasoning=reasoning_text)

def run(
self,
@@ -280,67 +330,19 @@ def run(
Returns:
ContextPrecisionOutput: Contains a list of context precision scores and reasoning.
"""
# Pass everything to the Pydantic model.
input_data = ContextPrecisionInput(
run_input = ContextPrecisionInput(
questions=questions,
answers=answers,
contexts_list=contexts_list,
verbose=verbose,
)

results_out = []
for idx in range(len(input_data.questions)):
question = input_data.questions[idx]
answer = input_data.answers[idx]
contexts = input_data.contexts_list[idx]

verdicts = []
verdict_details = []
for context in contexts:
# Prepare inputs for the evaluator.
result = self._context_precision_evaluator.run(
question=[question],
answer=[answer],
context=[context],
)
# Check if results are present.
if ("results" not in result) or (not result["results"]):
# if no results are returned, assign a default verdict and note in details.
default_verdict = 0
verdicts.append(default_verdict)
verdict_details.append("No results returned from evaluator.")
if input_data.verbose:
logger.debug(
f"Missing results for context: {context}. Defaulting verdict to {default_verdict}."
)
continue

res = result["results"][0]
verdict_raw = res.get("verdict", "0")
try:
# Convert verdict to int.
verdict = int(verdict_raw) if not isinstance(verdict_raw, str) else int(verdict_raw.strip())
except (ValueError, AttributeError):
# In case conversion fails, default the verdict.
verdict = 0
verdicts.append(verdict)
verdict_details.append(res.get("reason", "No reasoning provided"))

if input_data.verbose:
logger.debug(f"Question: {question}")
logger.debug(f"Answer: {answer}")
logger.debug(f"Context: {context}")
logger.debug(f"Verdict: {verdict}")
logger.debug(f"Reason: {res.get('reason', 'No reasoning provided')}")
logger.debug("-" * 50)

avg_precision = self.calculate_average_precision(verdicts)

reasoning_str = self._build_reasoning(question, answer, contexts, verdicts, verdict_details, avg_precision)

results_out.append(ContextPrecisionRunResult(score=avg_precision, reasoning=reasoning_str))
if input_data.verbose:
logger.debug(f"Average Precision Score: {avg_precision}")
logger.debug("=" * 50)

return ContextPrecisionOutput(results=results_out)
results_output = []
for index in range(len(run_input.questions)):
question = run_input.questions[index]
answer = run_input.answers[index]
contexts = run_input.contexts_list[index]
result_single = self.run_single(
question=question, answer=answer, contexts=contexts, verbose=run_input.verbose
)
results_output.append(result_single)
return ContextPrecisionOutput(results=results_output)
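
A short usage sketch of the new per-sample entry point in this file; as before, the constructor argument and the sample data are illustrative assumptions, while the run_single/run signatures come from the diff:

# Hypothetical usage of ContextPrecisionEvaluator.run_single added in this commit.
from dynamiq.evaluations.metrics.context_precision import ContextPrecisionEvaluator

evaluator = ContextPrecisionEvaluator(llm=my_llm)  # my_llm: a configured BaseLLM node (placeholder)

result = evaluator.run_single(
    question="Who wrote Hamlet?",
    answer="Hamlet was written by William Shakespeare.",
    contexts=[
        "William Shakespeare wrote the tragedy Hamlet around 1600.",
        "The Globe Theatre in London was rebuilt in 1997.",
    ],
    verbose=False,
)
print(result.score)      # average precision over the per-context verdicts
print(result.reasoning)  # per-context verdict breakdown plus the final score

# run() keeps its batch shape and now delegates to run_single per question.
batch = evaluator.run(
    questions=["Who wrote Hamlet?"],
    answers=["Hamlet was written by William Shakespeare."],
    contexts_list=[["William Shakespeare wrote the tragedy Hamlet around 1600."]],
)
print(batch.results[0].score)
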

1 comment on commit 782a176

@github-actions

Coverage Report

File                          Stmts   Miss   Cover   Missing
dynamiq/evaluations/metrics
   answer_correctness.py        144      9     93%   56, 260, 282, 348, 375, 434–437
   context_precision.py         111     21     81%   34, 40, 80–82, 282–287, 293–294, 299–304, 309–310
   context_recall.py             98     15     84%   33, 38, 43, 64, 239–240, 244–245, 265–271
   factual_correctness.py       175     34     80%   77, 82, 90, 258, 305, 375–376, 378–379, 414–416, 422–424, 431–433, 443–444, 447–448, 453–454, 457, 459, 477–484
   faithfulness.py              153     22     85%   24, 52, 73, 110, 115, 120, 314, 423–425, 432–434, 451–459
TOTAL                          11938   3630    69%

Tests   Skipped   Failures   Errors   Time
  416      0 💤       0 ❌      0 🔥   46.948s ⏱️
