fix: only mark evaluation as finished when eval is finalized

devversion · devversion · commit d66c8bc9abf4 · 2025-10-23T18:43:46.000+02:00
* Only marks an evaluation as finished after the eval has been finalized
* Gracefully handles errors thrown by the executor and makes web app
  testing optional for executors whose `serveWebApplication` is `null`.
diff --git a/runner/orchestration/build-serve-test-loop.ts b/runner/orchestration/build-serve-test-loop.ts
@@ -249,7 +249,7 @@ export async function attemptBuildAndTest(
         progress,
       )) ?? undefined;
 
-    if (hasAxeFailure && lastAttempt.serveTestingResult.axeViolations?.length === 0) {
+    if (hasAxeFailure && lastAttempt.serveTestingResult?.axeViolations?.length === 0) {
       progress.log(rootPromptDef, 'success', `Successfully fixed all Axe accessibility violations`);
     }
     if (hasTestFailure && lastAttempt.testResult?.passed) {
diff --git a/runner/orchestration/executors/executor.ts b/runner/orchestration/executors/executor.ts
@@ -58,21 +58,23 @@ export const executorSchema = z.object({
     ]),
     z.promise(z.custom<BuildResult>()),
   ),
-  serveWebApplication: z.function(
-    z.tuple([
-      z.custom<EvalID>().describe('ID of the eval'),
-      z.string().describe('Path to the application directory'),
-      z.custom<RootPromptDefinition>().describe('Root prompt definition'),
-      z.custom<ProgressLogger>().describe('Progress logger'),
-      z
-        .function(
-          z.tuple([z.string().describe('URL of the running server')]),
-          z.promise(z.custom<ServeTestingResult>()),
-        )
-        .describe('Call this function while the server is running'),
-    ]),
-    z.promise(z.custom<ServeTestingResult>()),
-  ),
+  serveWebApplication: z
+    .function(
+      z.tuple([
+        z.custom<EvalID>().describe('ID of the eval'),
+        z.string().describe('Path to the application directory'),
+        z.custom<RootPromptDefinition>().describe('Root prompt definition'),
+        z.custom<ProgressLogger>().describe('Progress logger'),
+        z
+          .function(
+            z.tuple([z.string().describe('URL of the running server')]),
+            z.promise(z.custom<ServeTestingResult>()),
+          )
+          .describe('Call this function while the server is running'),
+      ]),
+      z.promise(z.custom<ServeTestingResult>()),
+    )
+    .nullable(),
   executeProjectTests: z.function(
     z.tuple([
       z.custom<EvalID>().describe('ID of the eval'),
diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts
@@ -44,7 +44,15 @@ import {combineAbortSignals} from '../utils/abort-signal.js';
 export async function generateCodeAndAssess(options: AssessmentConfig): Promise<RunInfo> {
   const env = await getEnvironmentByPath(options.environmentConfigPath, options.runner);
   const cleanup = async () => {
-    await env.executor.destroy();
+    // Clean-up should never interrupt a potentially passing completion.
+    try {
+      await env.executor.destroy();
+    } catch (e) {
+      console.error(`Failed to destroy executor: ${e}`);
+      if (e instanceof Error) {
+        console.error(e.stack);
+      }
+    }
   };
 
   // Ensure cleanup logic runs when the evaluation is aborted.
@@ -147,8 +155,13 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
             progress.log(rootPromptDef, 'error', 'Failed to evaluate code', details);
             return [] satisfies AssessmentResult[];
           } finally {
+            // Gracefully finalize the eval. Errors in finalization should not propagate.
+            try {
+              await env.executor.finalizeEval(evalID);
+            } catch (e) {
+              progress.log(rootPromptDef, 'error', 'Failed to finalize eval', `${e}`);
+            }
             progress.evalFinished(rootPromptDef, results || []);
-            await env.executor.finalizeEval(evalID);
           }
         }),
       );
diff --git a/runner/orchestration/serve-testing-worker.ts b/runner/orchestration/serve-testing-worker.ts
@@ -9,7 +9,7 @@ import {
   ServeTestingWorkerMessage,
   ServeTestingWorkerResponseMessage,
 } from '../workers/serve-testing/worker-types.js';
-import {EvalID, Executor} from './executors/executor.js';
+import {EvalID} from './executors/executor.js';
 import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
 import PQueue from 'p-queue';
 
@@ -24,61 +24,71 @@ export async function serveAndTestApp(
   abortSignal: AbortSignal,
   progress: ProgressLogger,
   userJourneyAgentTaskInput?: BrowserAgentTaskInput,
-): Promise<ServeTestingResult> {
+): Promise<ServeTestingResult | null> {
+  if (env.executor.serveWebApplication === null) {
+    return null;
+  }
+
   progress.log(rootPromptDef, 'serve-testing', `Validating the running app`);
 
-  const result = await env.executor.serveWebApplication(
-    evalID,
-    appDirectoryPath,
-    rootPromptDef,
-    progress,
-    async serveUrl => {
-      const serveParams: ServeTestingWorkerMessage = {
-        serveUrl,
-        appName: rootPromptDef.name,
-        enableAutoCsp: !!config.enableAutoCsp,
-        includeAxeTesting: config.skipAxeTesting === false,
-        takeScreenshots: config.skipScreenshots === false,
-        includeLighthouseData: config.skipLighthouse !== true,
-        userJourneyAgentTaskInput,
-      };
+  try {
+    const result = await env.executor.serveWebApplication(
+      evalID,
+      appDirectoryPath,
+      rootPromptDef,
+      progress,
+      async serveUrl => {
+        const serveParams: ServeTestingWorkerMessage = {
+          serveUrl,
+          appName: rootPromptDef.name,
+          enableAutoCsp: !!config.enableAutoCsp,
+          includeAxeTesting: config.skipAxeTesting === false,
+          takeScreenshots: config.skipScreenshots === false,
+          includeLighthouseData: config.skipLighthouse !== true,
+          userJourneyAgentTaskInput,
+        };
 
-      return await workerConcurrencyQueue.add(
-        () =>
-          new Promise<ServeTestingResult>((resolve, reject) => {
-            const child: ChildProcess = fork(
-              path.resolve(import.meta.dirname, '../workers/serve-testing/worker.js'),
-              {signal: abortSignal},
-            );
-            child.send(serveParams);
+        return await workerConcurrencyQueue.add(
+          () =>
+            new Promise<ServeTestingResult>((resolve, reject) => {
+              const child: ChildProcess = fork(
+                path.resolve(import.meta.dirname, '../workers/serve-testing/worker.js'),
+                {signal: abortSignal},
+              );
+              child.send(serveParams);
 
-            child.on('message', async (result: ServeTestingWorkerResponseMessage) => {
-              if (result.type === 'result') {
+              child.on('message', async (result: ServeTestingWorkerResponseMessage) => {
+                if (result.type === 'result') {
+                  await killChildProcessGracefully(child);
+                  resolve(result.payload);
+                } else {
+                  progress.log(
+                    rootPromptDef,
+                    result.payload.state,
+                    result.payload.message,
+                    result.payload.details,
+                  );
+                }
+              });
+              child.on('error', async err => {
                 await killChildProcessGracefully(child);
-                resolve(result.payload);
-              } else {
-                progress.log(
-                  rootPromptDef,
-                  result.payload.state,
-                  result.payload.message,
-                  result.payload.details,
-                );
-              }
-            });
-            child.on('error', async err => {
-              await killChildProcessGracefully(child);
-              reject(err);
-            });
-          }),
-      );
-    },
-  );
+                reject(err);
+              });
+            }),
+        );
+      },
+    );
+
+    if (result.errorMessage === undefined) {
+      progress.log(rootPromptDef, 'success', 'Validation of running app is successful');
+    } else {
+      progress.log(rootPromptDef, 'error', 'Validation of running app failed', result.errorMessage);
+    }
 
-  if (result.errorMessage === undefined) {
-    progress.log(rootPromptDef, 'success', 'Testing is successful');
-  } else {
-    progress.log(rootPromptDef, 'error', 'Testing has failed', result.errorMessage);
+    return result;
+  } catch (e) {
+    progress.log(rootPromptDef, 'error', 'Error while trying to validate running app', `${e}`);
   }
 
-  return result;
+  return null;
 }

Original file line number	Diff line number	Diff line change
`@@ -249,7 +249,7 @@ export async function attemptBuildAndTest(`
`249`	`249`	`progress,`
`250`	`250`	`)) ?? undefined;`
`251`	`251`
`252`		`- if (hasAxeFailure && lastAttempt.serveTestingResult.axeViolations?.length === 0) {`
	`252`	`+ if (hasAxeFailure && lastAttempt.serveTestingResult?.axeViolations?.length === 0) {`
`253`	`253`	``progress.log(rootPromptDef, 'success', `Successfully fixed all Axe accessibility violations`);``
`254`	`254`	`}`
`255`	`255`	`if (hasTestFailure && lastAttempt.testResult?.passed) {`