Merge pull request #370 from asynkron/codex/fix-virtual-agent-summary-output

rogeralsing · web-flow · commit 7b1ead3d95cd · 2025-10-24T19:24:01.000+02:00
Improve virtual agent summaries
diff --git a/packages/core/src/agent/__tests__/virtualCommandExecutor.test.ts b/packages/core/src/agent/__tests__/virtualCommandExecutor.test.ts
@@ -3,6 +3,7 @@ import { describe, expect, test, jest } from '@jest/globals';
 
 import type { ResponsesClient } from '../../openai/responses.js';
 import { createChatMessageEntry } from '../historyEntry.js';
+import { createObservationHistoryEntry } from '../historyMessageBuilder.js';
 import type { PassExecutionBaseOptions } from '../loopSupport.js';
 import type { ExecuteAgentPassOptions } from '../passExecutor/types.js';
 import { createVirtualCommandExecutor } from '../virtualCommandExecutor.js';
@@ -82,6 +83,7 @@ describe('createVirtualCommandExecutor', () => {
 
     expect(passExecutor).toHaveBeenCalledTimes(1);
     expect(outcome.result.exit_code).toBe(0);
+    expect(outcome.result.stdout).toContain('Summary for "Virtual agent: research"');
     expect(outcome.result.stdout).toContain('virtual result summary');
     expect(outcome.executionDetails.type).toBe('VIRTUAL');
     for (const call of emitEvent.mock.calls) {
@@ -100,6 +102,51 @@ describe('createVirtualCommandExecutor', () => {
     }
   });
 
+  // Covers the case where the sub-agent history ends up holding a command
+  // observation but no assistant message: the executor is still expected to
+  // report exit code 0 and surface the observation's stdout in its summary.
+  test('includes command observations when the assistant does not respond', async () => {
+    const baseOptions = createBaseOptions();
+    const emitEvent = jest.fn();
+    const emitDebug = jest.fn();
+
+    // Stub pass executor: records a single observation entry on the history it
+    // is handed and returns false, never adding an assistant message.
+    const passExecutor = jest.fn(async (options: ExecuteAgentPassOptions) => {
+      options.history.push(
+        createObservationHistoryEntry({
+          observation: {
+            observation_for_llm: {
+              stdout: 'README.md explains the CLI usage.',
+              stderr: '',
+              truncated: false,
+              exit_code: 0,
+            },
+            observation_metadata: { timestamp: '2024-01-01T00:00:00.000Z' },
+          },
+          pass: options.passIndex,
+        }),
+      );
+      return false;
+    });
+
+    const executor = createVirtualCommandExecutor({
+      systemPrompt: 'system prompt',
+      baseOptions,
+      passExecutor,
+      createChatMessageEntryFn: createChatMessageEntry,
+      emitEvent,
+      emitDebug,
+      createSubAgentLabel: () => 'SubAgent-observation',
+    });
+
+    const outcome = await executor({
+      command: { shell: 'openagent', run: 'virtual-agent explore {}' },
+      descriptor: { action: 'explore', argument: '{}' },
+    });
+
+    // The summary header, the fallback summary text, and the observation's
+    // stdout must all appear in the reported stdout.
+    expect(outcome.result.exit_code).toBe(0);
+    expect(outcome.result.stdout).toContain('Summary for "Virtual agent: explore"');
+    expect(outcome.result.stdout).toContain('No assistant summary was produced. Review command results below.');
+    expect(outcome.result.stdout).toContain('Command Results:');
+    expect(outcome.result.stdout).toContain('README.md explains the CLI usage.');
+  });
+
   test('limits the number of passes when configured via JSON argument', async () => {
     const baseOptions = createBaseOptions();
     const emitEvent = jest.fn();
diff --git a/packages/core/src/agent/context.md b/packages/core/src/agent/context.md
@@ -67,7 +67,7 @@
 - `passExecutor.ts` now consolidates approval, execution safety, and plan snapshot helpers so the main loop reads linearly while emitting consistent status updates.
 - Pass executor unit tests now rely on `passExecutor/__testUtils__/passExecutor.ts` helpers (stored outside `__tests__` so Jest does not collect them as suites), keeping the primary spec focused on behavior assertions instead of repeated mock wiring.
 - `passExecutor/commandRuntime.ts` emits the active plan step snapshot alongside each `command-result` event so downstream UIs can display the parent step metadata with command output, races command approval/execution against ESC triggers so human cancellations surface a `'stop'` outcome immediately instead of marching through the remaining plan steps, and short-circuits ESC-triggered waits while finishing command result processing in the background.
-- `commandExecution.ts` now understands `openagent` shell commands with the `virtual-agent` prefix and routes them to an injected virtual command executor, enabling recursive/knowledge tasks without leaving the plan runtime. When no executor is configured the runtime emits a structured virtual-command error so plans can recover gracefully, truncating oversized arguments and pointing hosts at the `virtualCommandExecutor` hook. The agent loop now wires a default in-process executor that spins a scoped pass sequence and reports the collected assistant messages as the command result, so virtual commands behave like sub-agents by default. The executor defaults to 10 passes when callers omit a limit and now honors higher requested limits without imposing an artificial ceiling.
+- `commandExecution.ts` now understands `openagent` shell commands with the `virtual-agent` prefix and routes them to an injected virtual command executor, enabling recursive/knowledge tasks without leaving the plan runtime. When no executor is configured the runtime emits a structured virtual-command error so plans can recover gracefully, truncating oversized arguments and pointing hosts at the `virtualCommandExecutor` hook. The agent loop now wires a default in-process executor that spins a scoped pass sequence and reports the collected assistant messages as the command result, so virtual commands behave like sub-agents by default. The executor defaults to 10 passes when callers omit a limit and now honors higher requested limits without imposing an artificial ceiling. Virtual command results now consolidate the final assistant message with the recorded command observations so hosts receive a readable summary and the underlying stdout/stderr payloads instead of a bare command log.
 - `passExecutor.ts` explicitly treats human command rejections as a successful pass result, and the pass executor suite now includes a regression test to ensure the loop continues after vetoes.
 - `passExecutor/planRuntime/` now hosts dedicated helpers (`stateMachine/`, `initialization.ts`, `finalization.ts`, `idleHandlers.ts`, `effects.ts`, `persistence.ts`, `persistenceCoordinator.ts`, `runtimeController.ts`, `observationRecorder.ts`, `reminderController.ts`) so `planRuntime.ts` delegates mutations, persistence, and reminder tracking to focused modules. The runtime methods now return discriminated-union results with explicit side-effect descriptors that callers commit via `applyPlanRuntimeEffects`, shrinking the core class dramatically.
 - Persistence/plan state bridging helpers now live in `passExecutor/planRuntime/persistenceEffects.ts`, so initialization, idle-handling, and finalization modules compose persistence warnings/snapshots without hand-rolled duplication.
diff --git a/packages/core/src/agent/virtualCommandExecutor.ts b/packages/core/src/agent/virtualCommandExecutor.ts
@@ -12,6 +12,21 @@ import type { PlanHistory } from './passExecutor/types.js';
 import type { EmitEvent } from './passExecutor/types.js';
 import type { DebugRuntimeEventPayload, EmitRuntimeEventOptions } from './runtimeTypes.js';
 
+/**
+ * Normalized view of one command observation recovered from the history.
+ * Produced by `parseObservationContent` and rendered by `formatObservation`.
+ */
+interface ObservationSummary {
+  /** Trimmed summary text, or null when none was present or it was blank. */
+  readonly summary: string | null;
+  /** Trimmed details text, or null when none was present or it was blank. */
+  readonly details: string | null;
+  /** Raw stdout string from the payload; empty string when it was not a string. */
+  readonly stdout: string;
+  /** Raw stderr string from the payload; empty string when it was not a string. */
+  readonly stderr: string;
+  /** Finite numeric exit code, or null when the payload did not carry one. */
+  readonly exitCode: number | null;
+  /** True only when the payload's `truncated` field was exactly `true`. */
+  readonly truncated: boolean;
+  /** Trimmed truncation notice, or null when none was present or it was blank. */
+  readonly truncationNotice: string | null;
+}
+
+/**
+ * Everything `collectFindings` extracts from a history, in history order.
+ */
+interface VirtualAgentFindings {
+  /** Trimmed, non-empty contents of entries whose role is 'assistant'. */
+  readonly assistantMessages: string[];
+  /** Observations parsed from the content of the remaining entries. */
+  readonly observations: ObservationSummary[];
+}
+
 interface VirtualAgentExecutorConfig {
   readonly systemPrompt: string;
   readonly baseOptions: PassExecutionBaseOptions;
@@ -104,6 +119,205 @@ const parseDescriptor = (descriptor: VirtualCommandDescriptor): ParsedVirtualDes
   } satisfies ParsedVirtualDescriptor;
 };
 
+/**
+ * Returns `value` with surrounding whitespace removed when it is a string that
+ * still has content after trimming; returns null for every other input.
+ */
+const toTrimmedString = (value: unknown): string | null => {
+  // Non-strings collapse to '' so a single emptiness check covers both cases.
+  const stripped = typeof value === 'string' ? value.trim() : '';
+  return stripped === '' ? null : stripped;
+};
+
+/**
+ * Passes `value` through when it is a finite number; NaN, the infinities and
+ * every non-number input yield null.
+ */
+const toFiniteNumber = (value: unknown): number | null =>
+  typeof value === 'number' && Number.isFinite(value) ? value : null;
+
+/**
+ * Decodes a serialized history payload into an `ObservationSummary`.
+ *
+ * @param raw - Text expected to hold a JSON object.
+ * @returns The normalized observation, or null when `raw` is not valid JSON,
+ *   is not an object whose trimmed `type` equals 'observation', or does not
+ *   carry an object `payload`.
+ */
+const parseObservationContent = (raw: string): ObservationSummary | null => {
+  let decoded: unknown;
+  try {
+    decoded = JSON.parse(raw);
+  } catch {
+    // Content that is not JSON is simply not an observation.
+    return null;
+  }
+
+  if (typeof decoded !== 'object' || decoded === null) {
+    return null;
+  }
+  const envelope = decoded as {
+    type?: unknown;
+    payload?: unknown;
+    summary?: unknown;
+    details?: unknown;
+  };
+  if (toTrimmedString(envelope.type) !== 'observation') {
+    return null;
+  }
+
+  const body = envelope.payload;
+  if (typeof body !== 'object' || body === null) {
+    return null;
+  }
+  const fields = body as {
+    stdout?: unknown;
+    stderr?: unknown;
+    exit_code?: unknown;
+    truncated?: unknown;
+    truncation_notice?: unknown;
+    summary?: unknown;
+    details?: unknown;
+  };
+
+  // Top-level summary/details take precedence over the copies inside payload.
+  const pickText = (key: 'summary' | 'details'): string | null =>
+    toTrimmedString(envelope[key]) ?? toTrimmedString(fields[key]);
+  // stdout/stderr are kept verbatim (untrimmed); anything non-string becomes ''.
+  const textOrEmpty = (value: unknown): string => (typeof value === 'string' ? value : '');
+
+  return {
+    summary: pickText('summary'),
+    details: pickText('details'),
+    stdout: textOrEmpty(fields.stdout),
+    stderr: textOrEmpty(fields.stderr),
+    exitCode: toFiniteNumber(fields.exit_code),
+    truncated: fields.truncated === true,
+    truncationNotice: toTrimmedString(fields.truncation_notice),
+  } satisfies ObservationSummary;
+};
+
+/**
+ * Walks the history once and splits it into assistant messages and parsed
+ * command observations, both kept in history order.
+ *
+ * Entries without an object payload are ignored. Entries whose trimmed role is
+ * 'assistant' contribute their trimmed, non-empty string content. Every other
+ * entry is treated as a potential observation: string content is parsed
+ * directly, object content is serialized first, and anything that does not
+ * parse as an observation is skipped.
+ *
+ * @param history - The history accumulated while running the virtual agent.
+ * @returns The collected assistant messages and observations.
+ */
+const collectFindings = (history: PlanHistory): VirtualAgentFindings => {
+  const assistantMessages: string[] = [];
+  const observations: ObservationSummary[] = [];
+
+  for (const entry of history) {
+    if (!entry || typeof entry !== 'object') {
+      continue;
+    }
+
+    const payload = entry.payload as {
+      role?: unknown;
+      content?: unknown;
+    } | undefined;
+
+    if (!payload || typeof payload !== 'object') {
+      continue;
+    }
+
+    const contentValue = payload.content;
+
+    if (toTrimmedString(payload.role) === 'assistant') {
+      const content = toTrimmedString(contentValue);
+      if (content) {
+        assistantMessages.push(content);
+      }
+      continue;
+    }
+
+    let serialized: unknown = null;
+    if (typeof contentValue === 'string') {
+      serialized = contentValue;
+    } else if (contentValue && typeof contentValue === 'object') {
+      try {
+        serialized = JSON.stringify(contentValue);
+      } catch {
+        // JSON.stringify throws on circular references and BigInt values.
+        // Such content cannot be an observation, so skip it rather than let
+        // the exception abort the whole summary.
+        serialized = null;
+      }
+    }
+
+    // JSON.stringify may also yield undefined (e.g. toJSON returning
+    // undefined); only real strings are worth parsing.
+    if (typeof serialized !== 'string') {
+      continue;
+    }
+
+    const observation = parseObservationContent(serialized);
+    if (observation) {
+      observations.push(observation);
+    }
+  }
+
+  return { assistantMessages, observations } satisfies VirtualAgentFindings;
+};
+
+/**
+ * Prefixes every line of `text` with `prefix`.
+ *
+ * @param text - Possibly multi-line text, split on '\n'.
+ * @param prefix - Leading string for each line; defaults to two spaces.
+ * @returns The text with each line prefixed, joined again with '\n'.
+ */
+const indentBlock = (text: string, prefix = '  '): string =>
+  text
+    .split('\n')
+    .map((line) => `${prefix}${line}`)
+    .join('\n');
+
+/**
+ * Renders one observation as a small text block: a heading, then optional
+ * exit code, stdout, stderr, truncation notice and details lines.
+ *
+ * @param observation - The normalized observation to render.
+ * @param index - Zero-based position of the observation in the list.
+ * @param total - Number of observations being rendered; headings are numbered
+ *   only when there is more than one.
+ * @returns The rendered lines joined with '\n'.
+ */
+const formatObservation = (
+  observation: ObservationSummary,
+  index: number,
+  total: number,
+): string => {
+  // Field labels are indented three columns; captured output is indented two
+  // columns further so it nests beneath its "Stdout:"/"Stderr:" label instead
+  // of sitting to the left of it.
+  const outputIndent = '     ';
+
+  const lines: string[] = [];
+  const headingPrefix = total > 1 ? `${index + 1}. ` : '';
+  const headingBody = observation.summary ?? 'Command result';
+  lines.push(`${headingPrefix}${headingBody}`);
+
+  if (typeof observation.exitCode === 'number') {
+    lines.push(`   Exit code: ${observation.exitCode}`);
+  }
+
+  const trimmedStdout = observation.stdout.trim();
+  if (trimmedStdout.length > 0) {
+    lines.push('   Stdout:');
+    lines.push(indentBlock(trimmedStdout, outputIndent));
+  }
+
+  const trimmedStderr = observation.stderr.trim();
+  if (trimmedStderr.length > 0) {
+    lines.push('   Stderr:');
+    lines.push(indentBlock(trimmedStderr, outputIndent));
+  }
+
+  // A truncated observation always gets a notice (with a generic fallback);
+  // a notice without the truncated flag is still passed along.
+  if (observation.truncated) {
+    const notice = observation.truncationNotice ?? 'Output truncated.';
+    lines.push(`   Notice: ${notice}`);
+  } else if (observation.truncationNotice) {
+    lines.push(`   Notice: ${observation.truncationNotice}`);
+  }
+
+  if (observation.details) {
+    lines.push(`   Details: ${observation.details}`);
+  }
+
+  return lines.join('\n');
+};
+
+/**
+ * Assembles the stdout text reported for a virtual command.
+ *
+ * The first section is a summary headed by the task label: the last assistant
+ * message when there is one, otherwise a fixed fallback sentence. When there
+ * are observations, a "Command Results:" section listing each formatted
+ * observation follows after a blank line.
+ *
+ * @param taskLabel - Label quoted in the summary heading.
+ * @param findings - Assistant messages and observations to report.
+ * @returns The sections joined by blank lines, trimmed.
+ */
+const buildStdoutFromFindings = (
+  taskLabel: string,
+  findings: VirtualAgentFindings,
+): string => {
+  const { assistantMessages, observations } = findings;
+
+  // Only the final assistant message is reported as the summary.
+  const closingMessage =
+    assistantMessages.length === 0
+      ? 'No assistant summary was produced. Review command results below.'
+      : assistantMessages[assistantMessages.length - 1];
+
+  const sections: string[] = [[`Summary for "${taskLabel}":`, closingMessage].join('\n')];
+
+  if (observations.length > 0) {
+    const rendered = observations.map((observation, position) =>
+      formatObservation(observation, position, observations.length),
+    );
+    sections.push(['Command Results:', ...rendered].join('\n'));
+  }
+
+  return sections.join('\n\n').trim();
+};
+
 const buildInitialHistory = (
   config: VirtualAgentExecutorConfig,
   parsed: ParsedVirtualDescriptor,
@@ -206,42 +420,23 @@ const cloneBaseOptions = (
   return cloned;
 };
 
-const collectAssistantMessages = (history: PlanHistory): string[] => {
-  const outputs: string[] = [];
-  for (const entry of history) {
-    if (!entry || typeof entry !== 'object') {
-      continue;
-    }
-    const payload = (entry as { payload?: unknown }).payload;
-    if (!payload || typeof payload !== 'object') {
-      continue;
-    }
-    const role = (payload as { role?: unknown }).role;
-    if (role !== 'assistant') {
-      continue;
-    }
-    const content = (payload as { content?: unknown }).content;
-    if (typeof content === 'string' && content.trim()) {
-      outputs.push(content.trim());
-    }
-  }
-  return outputs;
-};
-
 const buildResult = (
   command: VirtualCommandExecutionContext['command'],
   descriptor: VirtualCommandDescriptor,
   history: PlanHistory,
   passesExecuted: number,
   maxPasses: number,
+  taskLabel: string,
   failure: string | null,
   runtimeMs: number,
 ): CommandExecutionResult => {
-  const assistantOutputs = collectAssistantMessages(history);
-  const stdout = assistantOutputs.length > 0 ? assistantOutputs.join('\n\n---\n\n') : '';
-  const success = !failure && stdout.length > 0;
-
-  const normalizedFailure = success ? null : failure ?? 'Virtual agent did not produce a response.';
+  const findings = collectFindings(history);
+  const hasResults = findings.assistantMessages.length > 0 || findings.observations.length > 0;
+  const normalizedFailure = !failure && hasResults
+    ? null
+    : failure ?? 'Virtual agent did not produce a response.';
+  const stdout = normalizedFailure ? '' : buildStdoutFromFindings(taskLabel, findings);
+  const success = normalizedFailure === null && stdout.length > 0;
 
   const result = {
     stdout: success ? stdout : '',
@@ -338,6 +533,7 @@ export const createVirtualCommandExecutor = (
       history,
       passesExecuted,
       parsed.maxPasses,
+      parsed.summary,
       failure,
       runtimeMs,
     );