41 changes: 32 additions & 9 deletions iris/src/iris/pipeline/autonomous_tutor_pipeline.py
@@ -12,6 +12,10 @@
AbstractAgentPipeline,
AgentPipelineExecutionState,
)
from iris.pipeline.shared.confidence_scoring import (
is_large_model,
parse_confidence_response,
)
from iris.pipeline.shared.utils import (
REDACTED_ANSWER_PLACEHOLDER,
format_post_discussion,
@@ -74,6 +78,12 @@ def __init__(self):
self.system_prompt_template = self.jinja_env.get_template(
"autonomous_tutor_system_prompt.j2"
)
self.confidence_combo_template = self.jinja_env.get_template(
"autonomous_tutor_confidence_combo.j2"
)
self.confidence_basic_template = self.jinja_env.get_template(
"autonomous_tutor_confidence_basic.j2"
)

self.tokens = []

@@ -189,7 +199,15 @@ def build_system_message(
else "the course"
),
}
return self.system_prompt_template.render(template_context)
base_prompt = self.system_prompt_template.render(template_context)
model_id = state.llm.model_name if state.llm else ""
if is_large_model(model_id):
logger.info("Using combo confidence prompt | model=%s", model_id)
confidence_section = self.confidence_combo_template.render()
else:
logger.info("Using basic confidence prompt | model=%s", model_id)
confidence_section = self.confidence_basic_template.render()
return base_prompt + "\n\n" + confidence_section

def get_memiris_tenant(self, dto: AutonomousTutorPipelineExecutionDTO) -> str:
"""
@@ -238,12 +256,15 @@ def post_agent_hook(
)
return ""

# TODO(IRIS-22): Implement Confidence Evaluation
# For now, use a placeholder confidence value
confidence = self._estimate_confidence(state)
should_post_directly = confidence >= self.DIRECT_POST_CONFIDENCE_THRESHOLD

logger.info("Generated response: %s", state.result)
logger.info(
"Confidence score | score=%.4f should_post_directly=%s",
confidence,
should_post_directly,
)

state.callback.done(
"Response generated",
@@ -256,24 +277,26 @@

def _estimate_confidence(
self,
state: AgentPipelineExecutionState[ # pylint: disable=unused-argument
state: AgentPipelineExecutionState[
AutonomousTutorPipelineExecutionDTO, Variant
],
) -> float:
"""
Estimate confidence score for the generated response.
"""Parse the verbalized confidence score from the agent's response.

Mutates state.result to contain only the clean answer text (without the
trailing Probability line), and returns the extracted probability.

Confidence thresholds:
- >= 0.95: Post immediately
- 0.80 - 0.95: Forward to verification queue
- < 0.80: Do not post, forward to verification queue

TODO: Implement actual confidence estimation

Returns:
float: Confidence score between 0.0 and 1.0
"""
return 0.99
answer_text, confidence = parse_confidence_response(state.result)
state.result = answer_text
return confidence

def _generate_retrieval_query_text(self, discussion: str) -> str:
"""Generate query text for retrieval tools."""
14 changes: 14 additions & 0 deletions autonomous_tutor_confidence_basic.j2
@@ -0,0 +1,14 @@
{# Confidence scoring addon — basic_probscore method (Yang et al. 2024) for small models.
This section is appended to the main system prompt. #}
---

## Confidence Scoring

After your answer, state the probability between 0.0 and 1.0 that your answer is correct.

**Output format — you MUST follow this exactly:**

Answer: <your response to the student>
Probability: <a single decimal between 0.0 and 1.0>

Do not include any text after the Probability line.
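For reference, a small-model response following this format would look like (invented example):

Answer: Arrays in Java are zero-indexed, so the last element is at index length - 1.
Probability: 0.82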
50 changes: 50 additions & 0 deletions autonomous_tutor_confidence_combo.j2
@@ -0,0 +1,50 @@
{# Confidence scoring addon — combo method (Yang et al. 2024) for large models.
This section is appended to the main system prompt. #}
---

## Confidence Scoring

Your response must include a probability that your answer is correct, expressed as a decimal between 0.0 and 1.0.

When assigning the probability, consider:
- **Task difficulty**: Is the question straightforward or does it require deep reasoning?
- **Knowledge availability**: Did you have access to sufficient, reliable information (via tools or training) to answer confidently?
- **Uncertainty in the question**: Is the question ambiguous or could it be interpreted in multiple ways?

Do not anchor on a comfortable middle value. Calibrate honestly: if you are nearly certain, use a high value; if you are mostly guessing, use a low value.

Here are examples of how to format your response:

---
**Example 1** (very low confidence — topic outside course scope with no tool access):
Guess: I'm not sure this is covered in the course materials, but binary search trees store elements such that each node's left subtree contains only smaller values and the right subtree only larger values, enabling O(log n) average-case search.
Probability: 0.08

---
**Example 2** (low confidence — question is vague and tools returned limited information):
Guess: The submission deadline is likely the end of the semester, but I couldn't find a specific date in the course FAQ or exercise details. I recommend checking the course announcements.
Probability: 0.24

---
**Example 3** (moderate confidence — general knowledge, no direct course evidence):
Guess: The gradient descent algorithm updates model parameters by moving in the direction of the negative gradient of the loss function with respect to those parameters, scaled by a learning rate.
Probability: 0.47

---
**Example 4** (high confidence — answered from retrieved lecture content):
Guess: According to the lecture slides, a mutex (mutual exclusion lock) ensures that only one thread can access a critical section at a time, preventing race conditions.
Probability: 0.77

---
**Example 5** (very high confidence — directly found in course FAQ):
Guess: Yes, you can submit up to 3 days late with a 10% penalty per day, as stated in the course FAQ.
Probability: 0.89

---

**Output format — you MUST follow this exactly:**

Guess: <your best response to the student>
Probability: <a single decimal between 0.0 and 1.0>

Do not include any text after the Probability line.
87 changes: 87 additions & 0 deletions iris/src/iris/pipeline/shared/confidence_scoring.py
@@ -0,0 +1,87 @@
import re

_LARGE_MODEL_PATTERNS = [
"70b",
"72b",
"110b",
"32b",
"gpt-4",
"gpt-5",
"gpt-oss",
]

_ANSWER_PREFIX_RE = re.compile(
r"^(?:answer|guess)\s*:\s*",
re.IGNORECASE,
)

_PROBABILITY_LINE_RE = re.compile(
r"(?:probability|confidence|p)\s*:\s*(-?\d+(?:\.\d+)?)(\s*%)?",
re.IGNORECASE,
)
Comment on lines +12 to +15 (Contributor):

⚠️ Potential issue | 🟠 Major

Probability parsing is too permissive and can misread normal answer text as confidence.

Using an unanchored regex with search() means any trailing p: <number> inside ordinary text can be parsed as confidence, which may inflate should_post_directly decisions.

Proposed fix
 _PROBABILITY_LINE_RE = re.compile(
-    r"(?:probability|confidence|p)\s*:\s*(-?\d+(?:\.\d+)?)(\s*%)?",
+    r"^\s*(?:probability|confidence|p)\s*:\s*(-?(?:\d+(?:\.\d+)?|\.\d+))\s*(%)?\s*$",
     re.IGNORECASE,
 )
@@
-            m = _PROBABILITY_LINE_RE.search(lines[i])
+            m = _PROBABILITY_LINE_RE.match(lines[i])

Also applies to: 55-56
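
A minimal repro of the misread the comment describes (input string invented for illustration; the regex is copied from the committed code):

import re

_PROBABILITY_LINE_RE = re.compile(
    r"(?:probability|confidence|p)\s*:\s*(-?\d+(?:\.\d+)?)(\s*%)?",
    re.IGNORECASE,
)

# "step: 5" ends with "p: 5", so the unanchored search() finds a
# "probability" in ordinary answer text; the parser would then clamp
# the raw value 5.0 to 1.0, maximizing confidence.
m = _PROBABILITY_LINE_RE.search("Please repeat step: 5 before resubmitting.")
print(m.group(1))  # -> "5"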




def is_large_model(model_id: str) -> bool:
"""Return True if the model should use the combo confidence prompt.

Large models include any GPT-4/GPT-5 generation model (including mini
variants and gpt-oss) and open-source models with ≥32B parameters.
Everything else is treated as small.
"""
lower = model_id.lower()
return any(pattern in lower for pattern in _LARGE_MODEL_PATTERNS)
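
A quick illustration of the substring check (model IDs invented for the example):

is_large_model("gpt-4.1-mini")          # True (contains "gpt-4")
is_large_model("qwen2.5-72b-instruct")  # True (contains "72b")
is_large_model("llama-3.1-8b")          # False, falls back to the basic prompt
is_large_model("")                      # False (empty ID when state.llm is None)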


def parse_confidence_response(raw_response: str) -> tuple[str, float]:
"""Extract (answer_text, probability) from a verbalized confidence response.

Handles both large-model format (Guess: ... / Probability: ...) and
small-model format (Answer: ... / Probability: ...). Also accepts
"Confidence:" and "P:" as alternatives to "Probability:", and values
expressed as percentages (e.g. "85%" → 0.85). The probability is
clamped to [0.0, 1.0].

If parsing fails for any reason this function returns (raw_response, 0.0)
so that callers never receive an exception. A score of 0.0 will be
treated as below threshold and discarded by Artemis.
"""
try:
lines = raw_response.strip().splitlines()

# Find the last line that matches a probability pattern.
prob_line_index = None
probability = 0.0
for i in range(len(lines) - 1, -1, -1):
m = _PROBABILITY_LINE_RE.search(lines[i])
if m:
prob_line_index = i
raw_value = float(m.group(1))
is_percent = bool(m.group(2) and m.group(2).strip() == "%")
if is_percent:
probability = raw_value / 100.0
else:
probability = raw_value
probability = max(0.0, min(1.0, probability))
break

if prob_line_index is None:
# No probability line found — safe fallback.
return raw_response, 0.0

# Everything before the probability line is the answer block.
answer_lines = lines[:prob_line_index]

# Strip the "Answer:" / "Guess:" prefix from the first line if present.
if answer_lines:
answer_lines[0] = _ANSWER_PREFIX_RE.sub("", answer_lines[0])

answer_text = "\n".join(answer_lines).strip()

# If nothing is left after stripping, fall back to the raw response.
if not answer_text:
answer_text = raw_response

return answer_text, probability

except Exception: # pylint: disable=broad-except
return raw_response, 0.0
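
A usage sketch of the happy path (sample responses invented for illustration):

raw = "Guess: A mutex ensures only one thread enters the critical section.\nProbability: 0.77"
answer, confidence = parse_confidence_response(raw)
# answer == "A mutex ensures only one thread enters the critical section."
# confidence == 0.77

# Percentages are normalized and the "Answer:" prefix is stripped as well:
_, c = parse_confidence_response("Answer: See the course FAQ.\nConfidence: 85%")
# c == 0.85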