Skip to content

Commit 075695b

Browse files
mrveisstclaude
authored
fix(rlm/evaluator): log/verdict consistency + INDETERMINATE for evaluator failures (#6697) (#7116)
Two bugs in one log line on every evaluator exception path: 1. Log said "accepting response" but the return was verdict=FAIL — caller could not distinguish a real FAIL (response was bad) from an evaluator infrastructure error (LLM down, parse error). 2. ``%s`` on the exception was empty whenever ``__str__`` returned '' (e.g. ConnectionError() with no args, common in aiohttp). The log was literally "RLM evaluator failed: — accepting response" with no diagnostic value. Fix: * Add ReflectionVerdict.INDETERMINATE so callers can tell evaluator-broke apart from response-bad. Routing semantics unchanged: graph.py:1080 only branches on verdict == "REFINE"; INDETERMINATE falls through to persist_conversation, same behavior as the previous FAIL path. FAIL enum value retained for backwards-compat (currently unused but cheap). * Log includes ``type(exc).__name__`` + ``repr(exc)`` so empty __str__ no longer produces blank diagnostic lines. * ``exc_info=True`` captures full traceback for post-mortem. * Critique field uses ``{exc!r}`` so even empty messages have a useful representation. Caller audit: searched for ReflectionVerdict.* and ReflectionResult.verdict across the codebase. Only chat_workflow/graph.py:1080 branches on the verdict (string == "REFINE"); knowledge/pipeline/cognifiers/recursive_summarizer and rlm/__init__ only consume quality_score/critique. No caller needs an update — INDETERMINATE will route the same as ACCEPT/FAIL did. Tests: 6 tests, all pass: * INDETERMINATE returned on infrastructure error (was FAIL) * Log includes exception type even when str(exc) is empty * Log claim ("INDETERMINATE" / "passing through") matches the returned verdict * exc_info=True attaches traceback to LogRecord * INDETERMINATE present on enum * Existing ACCEPT / REFINE / FAIL preserved Acceptance criteria met: - ✅ Log + verdict agree (no more "accepting" while returning FAIL) - ✅ Log always includes exception type and repr (never empty diagnostic) - ✅ exc_info=True captures traceback - ✅ INDETERMINATE added to ReflectionVerdict enum - ✅ Caller code (graph.py:1080) handles new value (no change needed — routing is "REFINE or fall-through") - ✅ Test induces evaluator failure, asserts log content + verdict Co-authored-by: t <t@t> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6e4752b commit 075695b

3 files changed

Lines changed: 148 additions & 4 deletions

File tree

autobot-backend/rlm/evaluator.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,22 @@ async def evaluate(
9393
raw = await self._call_llm(prompt)
9494
return self._parse(raw, iteration)
9595
except Exception as exc:
96-
logger.warning("RLM evaluator failed: %s — accepting response", exc)
96+
# #6697: previous log claimed "accepting response" while returning
97+
# verdict=FAIL with empty exception text when exc.__str__ was
98+
# empty (e.g. ConnectionError()). Now log type+repr+traceback and
99+
# use INDETERMINATE so callers can tell evaluator-broke from a
100+
# genuine FAIL. Routing semantics unchanged (graph only branches
101+
# on REFINE; INDETERMINATE falls through to accept like ACCEPT).
102+
logger.warning(
103+
"RLM evaluator failed (%s: %r) — passing through with INDETERMINATE verdict",
104+
type(exc).__name__,
105+
exc,
106+
exc_info=True,
107+
)
97108
return ReflectionResult(
98-
verdict=ReflectionVerdict.FAIL,
109+
verdict=ReflectionVerdict.INDETERMINATE,
99110
quality_score=0.7,
100-
critique=f"Evaluation error: {exc}",
111+
critique=f"Evaluation error ({type(exc).__name__}): {exc!r}",
101112
iteration=iteration,
102113
)
103114

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
# AutoBot - AI-Powered Automation Platform
2+
# Copyright (c) 2025 mrveiss
3+
# Author: mrveiss
4+
"""
5+
Unit tests for ResponseQualityEvaluator error path (Issue #6697).
6+
7+
Pre-#6697 the evaluator's exception handler emitted::
8+
9+
RLM evaluator failed: — accepting response
10+
11+
with two bugs:
12+
1. Log said "accepting response" but the return verdict was FAIL.
13+
2. ``%s`` on the exception was empty when ``__str__`` returned ''
14+
(e.g. ``ConnectionError()`` with no args).
15+
16+
Fix: log ``type(exc).__name__`` + ``repr(exc)`` + ``exc_info=True``, return
17+
``ReflectionVerdict.INDETERMINATE`` so callers can distinguish evaluator
18+
failures from genuine FAIL verdicts.
19+
"""
20+
21+
import logging
22+
from unittest.mock import AsyncMock, patch
23+
24+
import pytest
25+
26+
from rlm.evaluator import ResponseQualityEvaluator
27+
from rlm.types import ReflectionVerdict, RLMConfig
28+
29+
30+
class TestEvaluatorErrorPath:
31+
"""Issue #6697: log/verdict consistency on evaluator failure."""
32+
33+
@pytest.mark.asyncio
34+
async def test_returns_indeterminate_when_llm_call_raises(self, caplog):
35+
"""Evaluator infrastructure error → INDETERMINATE (not FAIL)."""
36+
evaluator = ResponseQualityEvaluator(config=RLMConfig())
37+
38+
with patch.object(
39+
evaluator,
40+
"_call_llm",
41+
new=AsyncMock(side_effect=ConnectionError("ollama timeout")),
42+
):
43+
with caplog.at_level(logging.WARNING):
44+
result = await evaluator.evaluate(
45+
query="What is 2+2?", response="Probably 5", iteration=1
46+
)
47+
48+
assert result.verdict == ReflectionVerdict.INDETERMINATE
49+
assert "ConnectionError" in result.critique
50+
assert "ollama timeout" in result.critique
51+
52+
@pytest.mark.asyncio
53+
async def test_log_includes_exception_type_when_str_is_empty(self, caplog):
54+
"""Bug 2: empty ``__str__`` no longer produces blank log lines."""
55+
evaluator = ResponseQualityEvaluator(config=RLMConfig())
56+
57+
# ConnectionError() with no args → __str__ returns ''. Pre-fix the
58+
# log was 'RLM evaluator failed: — accepting response' (note the
59+
# double space).
60+
with patch.object(
61+
evaluator,
62+
"_call_llm",
63+
new=AsyncMock(side_effect=ConnectionError()),
64+
):
65+
with caplog.at_level(logging.WARNING):
66+
result = await evaluator.evaluate(
67+
query="x", response="y", iteration=1
68+
)
69+
70+
assert result.verdict == ReflectionVerdict.INDETERMINATE
71+
# Log must mention the exception type even when message is empty
72+
warning_records = [r for r in caplog.records if r.levelno == logging.WARNING]
73+
assert any("ConnectionError" in r.getMessage() for r in warning_records), (
74+
"Warning log must include exception type even when str(exc) is empty; "
75+
f"got messages: {[r.getMessage() for r in warning_records]}"
76+
)
77+
78+
@pytest.mark.asyncio
79+
async def test_log_message_matches_returned_verdict(self, caplog):
80+
"""Bug 1: log says what the return value actually means."""
81+
evaluator = ResponseQualityEvaluator(config=RLMConfig())
82+
83+
with patch.object(
84+
evaluator,
85+
"_call_llm",
86+
new=AsyncMock(side_effect=ValueError("parse error")),
87+
):
88+
with caplog.at_level(logging.WARNING):
89+
result = await evaluator.evaluate(query="q", response="r", iteration=1)
90+
91+
log_text = " ".join(r.getMessage() for r in caplog.records)
92+
# Either the log uses INDETERMINATE wording OR doesn't claim acceptance —
93+
# not the previous "accepting response" while returning FAIL.
94+
assert (
95+
"INDETERMINATE" in log_text
96+
or "passing through" in log_text
97+
)
98+
assert result.verdict == ReflectionVerdict.INDETERMINATE
99+
100+
@pytest.mark.asyncio
101+
async def test_log_captures_traceback(self, caplog):
102+
"""exc_info=True so debugging has a traceback to follow."""
103+
evaluator = ResponseQualityEvaluator(config=RLMConfig())
104+
105+
def _raise():
106+
raise RuntimeError("boom")
107+
108+
async def _broken_call(_prompt):
109+
_raise()
110+
111+
with patch.object(evaluator, "_call_llm", new=AsyncMock(side_effect=_raise)):
112+
with caplog.at_level(logging.WARNING):
113+
await evaluator.evaluate(query="q", response="r", iteration=1)
114+
115+
warning_records = [r for r in caplog.records if r.levelno == logging.WARNING]
116+
# exc_info=True attaches an exc_info tuple to the LogRecord
117+
assert any(r.exc_info is not None for r in warning_records), (
118+
"exc_info=True must be set so traceback is captured"
119+
)
120+
121+
def test_indeterminate_value_exists_on_enum(self):
122+
"""The new INDETERMINATE verdict must be on the enum."""
123+
assert hasattr(ReflectionVerdict, "INDETERMINATE")
124+
125+
def test_existing_verdicts_preserved(self):
126+
"""ACCEPT / REFINE / FAIL still exist (no breaking enum changes)."""
127+
assert hasattr(ReflectionVerdict, "ACCEPT")
128+
assert hasattr(ReflectionVerdict, "REFINE")
129+
assert hasattr(ReflectionVerdict, "FAIL")

autobot-backend/rlm/types.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,11 @@ class ReflectionVerdict(Enum):
2525

2626
ACCEPT = auto() # Response is good enough — proceed
2727
REFINE = auto() # Response needs improvement — recurse
28-
FAIL = auto() # Unable to evaluate — fall through
28+
FAIL = auto() # Hard failure (unused; retained for backwards-compat).
29+
# #6697: evaluator infrastructure error (LLM down, parse error, etc.).
30+
# Routing treats this like ACCEPT (graph branches only on REFINE), but
31+
# callers can distinguish "evaluator broke" from "response was bad."
32+
INDETERMINATE = auto()
2933

3034

3135
@dataclass

0 commit comments

Comments
 (0)