fix(rlm/evaluator): log/verdict consistency + INDETERMINATE for evaluator failures (#6697) (#7116)

mrveiss · t · claude · web-flow · commit 075695b36245 · 2026-05-07T09:04:02.000+03:00
Two bugs in one log line on every evaluator exception path:

  1. Log said "accepting response" but the return was verdict=FAIL — caller
     could not distinguish a real FAIL (response was bad) from an evaluator
     infrastructure error (LLM down, parse error).

  2. ``%s`` on the exception was empty whenever ``__str__`` returned ''
     (e.g. ConnectionError() with no args, common in aiohttp). The log was
     literally "RLM evaluator failed:  — accepting response" with no
     diagnostic value.

Fix:

  * Add ReflectionVerdict.INDETERMINATE so callers can tell evaluator-broke
    apart from response-bad. Routing semantics unchanged: graph.py:1080
    only branches on verdict == "REFINE"; INDETERMINATE falls through to
    persist_conversation, same behavior as the previous FAIL path. FAIL
    enum value retained for backwards-compat (currently unused but cheap).

  * Log includes ``type(exc).__name__`` + ``repr(exc)`` so empty __str__
    no longer produces blank diagnostic lines.

  * ``exc_info=True`` captures full traceback for post-mortem.

  * Critique field uses ``{exc!r}`` so even empty messages have a useful
    representation.

Caller audit: searched for ReflectionVerdict.* and ReflectionResult.verdict
across the codebase. Only chat_workflow/graph.py:1080 branches on the
verdict (string == "REFINE"); knowledge/pipeline/cognifiers/recursive_summarizer
and rlm/__init__ only consume quality_score/critique. No caller needs an
update — INDETERMINATE will route the same as ACCEPT/FAIL did.

Tests: 6 tests, all pass:
  * INDETERMINATE returned on infrastructure error (was FAIL)
  * Log includes exception type even when str(exc) is empty
  * Log claim ("INDETERMINATE" / "passing through") matches the returned verdict
  * exc_info=True attaches traceback to LogRecord
  * INDETERMINATE present on enum
  * Existing ACCEPT / REFINE / FAIL preserved

Acceptance criteria met:
- ✅ Log + verdict agree (no more "accepting" while returning FAIL)
- ✅ Log always includes exception type and repr (never empty diagnostic)
- ✅ exc_info=True captures traceback
- ✅ INDETERMINATE added to ReflectionVerdict enum
- ✅ Caller code (graph.py:1080) handles new value (no change needed —
     routing is "REFINE or fall-through")
- ✅ Test induces evaluator failure, asserts log content + verdict

Co-authored-by: t <t@t>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/autobot-backend/rlm/evaluator.py b/autobot-backend/rlm/evaluator.py
@@ -93,11 +93,22 @@ async def evaluate(
             raw = await self._call_llm(prompt)
             return self._parse(raw, iteration)
         except Exception as exc:
-            logger.warning("RLM evaluator failed: %s — accepting response", exc)
+            # #6697: the previous log claimed "accepting response" while the
+            # return was verdict=FAIL, and its exception text was blank when
+            # str(exc) was empty (e.g. ConnectionError()). Now log type + repr +
+            # traceback and return INDETERMINATE so callers can tell an
+            # evaluator failure from a genuine FAIL. Routing is unchanged: the
+            # graph only branches on REFINE, so INDETERMINATE falls through.
+            logger.warning(
+                "RLM evaluator failed (%s: %r) — passing through with INDETERMINATE verdict",
+                type(exc).__name__,
+                exc,
+                exc_info=True,
+            )
             return ReflectionResult(
-                verdict=ReflectionVerdict.FAIL,
+                verdict=ReflectionVerdict.INDETERMINATE,
                 quality_score=0.7,
-                critique=f"Evaluation error: {exc}",
+                critique=f"Evaluation error ({type(exc).__name__}): {exc!r}",
                 iteration=iteration,
             )
 
diff --git a/autobot-backend/rlm/evaluator_test.py b/autobot-backend/rlm/evaluator_test.py
@@ -0,0 +1,129 @@
+# AutoBot - AI-Powered Automation Platform
+# Copyright (c) 2025 mrveiss
+# Author: mrveiss
+"""
+Unit tests for ResponseQualityEvaluator error path (Issue #6697).
+
+Pre-#6697 the evaluator's exception handler emitted::
+
+    RLM evaluator failed:  — accepting response
+
+with two bugs:
+  1. Log said "accepting response" but the return verdict was FAIL.
+  2. ``%s`` on the exception was empty when ``__str__`` returned ''
+     (e.g. ``ConnectionError()`` with no args).
+
+Fix: log ``type(exc).__name__`` + ``repr(exc)`` + ``exc_info=True``, return
+``ReflectionVerdict.INDETERMINATE`` so callers can distinguish evaluator
+failures from genuine FAIL verdicts.
+"""
+
+import logging
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from rlm.evaluator import ResponseQualityEvaluator
+from rlm.types import ReflectionVerdict, RLMConfig
+
+
+class TestEvaluatorErrorPath:
+    """Issue #6697: log/verdict consistency on evaluator failure."""
+
+    @pytest.mark.asyncio
+    async def test_returns_indeterminate_when_llm_call_raises(self, caplog):
+        """Evaluator infrastructure error → INDETERMINATE (not FAIL)."""
+        evaluator = ResponseQualityEvaluator(config=RLMConfig())
+
+        with patch.object(
+            evaluator,
+            "_call_llm",
+            new=AsyncMock(side_effect=ConnectionError("ollama timeout")),
+        ):
+            with caplog.at_level(logging.WARNING):
+                result = await evaluator.evaluate(
+                    query="What is 2+2?", response="Probably 5", iteration=1
+                )
+
+        assert result.verdict == ReflectionVerdict.INDETERMINATE
+        assert "ConnectionError" in result.critique
+        assert "ollama timeout" in result.critique
+
+    @pytest.mark.asyncio
+    async def test_log_includes_exception_type_when_str_is_empty(self, caplog):
+        """Bug 2: empty ``__str__`` no longer produces blank log lines."""
+        evaluator = ResponseQualityEvaluator(config=RLMConfig())
+
+        # ConnectionError() with no args → __str__ returns ''. Pre-fix the
+        # log was 'RLM evaluator failed:  — accepting response' (note the
+        # double space).
+        with patch.object(
+            evaluator,
+            "_call_llm",
+            new=AsyncMock(side_effect=ConnectionError()),
+        ):
+            with caplog.at_level(logging.WARNING):
+                result = await evaluator.evaluate(
+                    query="x", response="y", iteration=1
+                )
+
+        assert result.verdict == ReflectionVerdict.INDETERMINATE
+        # Log must mention the exception type even when message is empty
+        warning_records = [r for r in caplog.records if r.levelno == logging.WARNING]
+        assert any("ConnectionError" in r.getMessage() for r in warning_records), (
+            "Warning log must include exception type even when str(exc) is empty; "
+            f"got messages: {[r.getMessage() for r in warning_records]}"
+        )
+
+    @pytest.mark.asyncio
+    async def test_log_message_matches_returned_verdict(self, caplog):
+        """Bug 1: log says what the return value actually means."""
+        evaluator = ResponseQualityEvaluator(config=RLMConfig())
+
+        with patch.object(
+            evaluator,
+            "_call_llm",
+            new=AsyncMock(side_effect=ValueError("parse error")),
+        ):
+            with caplog.at_level(logging.WARNING):
+                result = await evaluator.evaluate(query="q", response="r", iteration=1)
+
+        log_text = " ".join(r.getMessage() for r in caplog.records)
+        # Either the log uses INDETERMINATE wording OR doesn't claim acceptance —
+        # not the previous "accepting response" while returning FAIL.
+        assert (
+            "INDETERMINATE" in log_text
+            or "passing through" in log_text
+        )
+        assert result.verdict == ReflectionVerdict.INDETERMINATE
+
+    @pytest.mark.asyncio
+    async def test_log_captures_traceback(self, caplog):
+        """exc_info=True so debugging has a traceback to follow."""
+        evaluator = ResponseQualityEvaluator(config=RLMConfig())
+
+        def _raise():
+            raise RuntimeError("boom")
+
+        async def _broken_call(_prompt):
+            _raise()
+
+        with patch.object(evaluator, "_call_llm", new=AsyncMock(side_effect=_raise)):
+            with caplog.at_level(logging.WARNING):
+                await evaluator.evaluate(query="q", response="r", iteration=1)
+
+        warning_records = [r for r in caplog.records if r.levelno == logging.WARNING]
+        # exc_info=True attaches an exc_info tuple to the LogRecord
+        assert any(r.exc_info is not None for r in warning_records), (
+            "exc_info=True must be set so traceback is captured"
+        )
+
+    def test_indeterminate_value_exists_on_enum(self):
+        """The new INDETERMINATE verdict must be on the enum."""
+        assert hasattr(ReflectionVerdict, "INDETERMINATE")
+
+    def test_existing_verdicts_preserved(self):
+        """ACCEPT / REFINE / FAIL still exist (no breaking enum changes)."""
+        assert hasattr(ReflectionVerdict, "ACCEPT")
+        assert hasattr(ReflectionVerdict, "REFINE")
+        assert hasattr(ReflectionVerdict, "FAIL")
diff --git a/autobot-backend/rlm/types.py b/autobot-backend/rlm/types.py
@@ -25,7 +25,11 @@ class ReflectionVerdict(Enum):
 
     ACCEPT = auto()  # Response is good enough — proceed
     REFINE = auto()  # Response needs improvement — recurse
-    FAIL = auto()  # Unable to evaluate — fall through
+    FAIL = auto()  # Hard failure (unused; retained for backwards-compat).
+    # #6697: evaluator infrastructure error (LLM down, parse error, etc.).
+    # Routing treats this like ACCEPT (graph branches only on REFINE), but
+    # callers can distinguish "evaluator broke" from "response was bad."
+    INDETERMINATE = auto()
 
 
 @dataclass