Skip to content

Commit d66c8bc

Browse files
committed
fix: only mark evaluation as finished when eval is finalized
* Only marks evaluation as finished when the eval is finished * Gracefully handles errors in the executor + makes web app testing optional for some executors.
1 parent 5aa06a5 commit d66c8bc

File tree

4 files changed

+93
-68
lines changed

4 files changed

+93
-68
lines changed

runner/orchestration/build-serve-test-loop.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ export async function attemptBuildAndTest(
249249
progress,
250250
)) ?? undefined;
251251

252-
if (hasAxeFailure && lastAttempt.serveTestingResult.axeViolations?.length === 0) {
252+
if (hasAxeFailure && lastAttempt.serveTestingResult?.axeViolations?.length === 0) {
253253
progress.log(rootPromptDef, 'success', `Successfully fixed all Axe accessibility violations`);
254254
}
255255
if (hasTestFailure && lastAttempt.testResult?.passed) {

runner/orchestration/executors/executor.ts

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -58,21 +58,23 @@ export const executorSchema = z.object({
5858
]),
5959
z.promise(z.custom<BuildResult>()),
6060
),
61-
serveWebApplication: z.function(
62-
z.tuple([
63-
z.custom<EvalID>().describe('ID of the eval'),
64-
z.string().describe('Path to the application directory'),
65-
z.custom<RootPromptDefinition>().describe('Root prompt definition'),
66-
z.custom<ProgressLogger>().describe('Progress logger'),
67-
z
68-
.function(
69-
z.tuple([z.string().describe('URL of the running server')]),
70-
z.promise(z.custom<ServeTestingResult>()),
71-
)
72-
.describe('Call this function while the server is running'),
73-
]),
74-
z.promise(z.custom<ServeTestingResult>()),
75-
),
61+
serveWebApplication: z
62+
.function(
63+
z.tuple([
64+
z.custom<EvalID>().describe('ID of the eval'),
65+
z.string().describe('Path to the application directory'),
66+
z.custom<RootPromptDefinition>().describe('Root prompt definition'),
67+
z.custom<ProgressLogger>().describe('Progress logger'),
68+
z
69+
.function(
70+
z.tuple([z.string().describe('URL of the running server')]),
71+
z.promise(z.custom<ServeTestingResult>()),
72+
)
73+
.describe('Call this function while the server is running'),
74+
]),
75+
z.promise(z.custom<ServeTestingResult>()),
76+
)
77+
.nullable(),
7678
executeProjectTests: z.function(
7779
z.tuple([
7880
z.custom<EvalID>().describe('ID of the eval'),

runner/orchestration/generate.ts

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,15 @@ import {combineAbortSignals} from '../utils/abort-signal.js';
4444
export async function generateCodeAndAssess(options: AssessmentConfig): Promise<RunInfo> {
4545
const env = await getEnvironmentByPath(options.environmentConfigPath, options.runner);
4646
const cleanup = async () => {
47-
await env.executor.destroy();
47+
// Clean-up should never interrupt a potentially passing completion.
48+
try {
49+
await env.executor.destroy();
50+
} catch (e) {
51+
console.error(`Failed to destroy executor: ${e}`);
52+
if (e instanceof Error) {
53+
console.error(e.stack);
54+
}
55+
}
4856
};
4957

5058
// Ensure cleanup logic runs when the evaluation is aborted.
@@ -147,8 +155,13 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
147155
progress.log(rootPromptDef, 'error', 'Failed to evaluate code', details);
148156
return [] satisfies AssessmentResult[];
149157
} finally {
158+
// Gracefully finalize the eval. Errors in finalization should not propagate.
159+
try {
160+
await env.executor.finalizeEval(evalID);
161+
} catch (e) {
162+
progress.log(rootPromptDef, 'error', 'Failed to finalize eval', `${e}`);
163+
}
150164
progress.evalFinished(rootPromptDef, results || []);
151-
await env.executor.finalizeEval(evalID);
152165
}
153166
}),
154167
);

runner/orchestration/serve-testing-worker.ts

Lines changed: 60 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import {
99
ServeTestingWorkerMessage,
1010
ServeTestingWorkerResponseMessage,
1111
} from '../workers/serve-testing/worker-types.js';
12-
import {EvalID, Executor} from './executors/executor.js';
12+
import {EvalID} from './executors/executor.js';
1313
import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
1414
import PQueue from 'p-queue';
1515

@@ -24,61 +24,71 @@ export async function serveAndTestApp(
2424
abortSignal: AbortSignal,
2525
progress: ProgressLogger,
2626
userJourneyAgentTaskInput?: BrowserAgentTaskInput,
27-
): Promise<ServeTestingResult> {
27+
): Promise<ServeTestingResult | null> {
28+
if (env.executor.serveWebApplication === null) {
29+
return null;
30+
}
31+
2832
progress.log(rootPromptDef, 'serve-testing', `Validating the running app`);
2933

30-
const result = await env.executor.serveWebApplication(
31-
evalID,
32-
appDirectoryPath,
33-
rootPromptDef,
34-
progress,
35-
async serveUrl => {
36-
const serveParams: ServeTestingWorkerMessage = {
37-
serveUrl,
38-
appName: rootPromptDef.name,
39-
enableAutoCsp: !!config.enableAutoCsp,
40-
includeAxeTesting: config.skipAxeTesting === false,
41-
takeScreenshots: config.skipScreenshots === false,
42-
includeLighthouseData: config.skipLighthouse !== true,
43-
userJourneyAgentTaskInput,
44-
};
34+
try {
35+
const result = await env.executor.serveWebApplication(
36+
evalID,
37+
appDirectoryPath,
38+
rootPromptDef,
39+
progress,
40+
async serveUrl => {
41+
const serveParams: ServeTestingWorkerMessage = {
42+
serveUrl,
43+
appName: rootPromptDef.name,
44+
enableAutoCsp: !!config.enableAutoCsp,
45+
includeAxeTesting: config.skipAxeTesting === false,
46+
takeScreenshots: config.skipScreenshots === false,
47+
includeLighthouseData: config.skipLighthouse !== true,
48+
userJourneyAgentTaskInput,
49+
};
4550

46-
return await workerConcurrencyQueue.add(
47-
() =>
48-
new Promise<ServeTestingResult>((resolve, reject) => {
49-
const child: ChildProcess = fork(
50-
path.resolve(import.meta.dirname, '../workers/serve-testing/worker.js'),
51-
{signal: abortSignal},
52-
);
53-
child.send(serveParams);
51+
return await workerConcurrencyQueue.add(
52+
() =>
53+
new Promise<ServeTestingResult>((resolve, reject) => {
54+
const child: ChildProcess = fork(
55+
path.resolve(import.meta.dirname, '../workers/serve-testing/worker.js'),
56+
{signal: abortSignal},
57+
);
58+
child.send(serveParams);
5459

55-
child.on('message', async (result: ServeTestingWorkerResponseMessage) => {
56-
if (result.type === 'result') {
60+
child.on('message', async (result: ServeTestingWorkerResponseMessage) => {
61+
if (result.type === 'result') {
62+
await killChildProcessGracefully(child);
63+
resolve(result.payload);
64+
} else {
65+
progress.log(
66+
rootPromptDef,
67+
result.payload.state,
68+
result.payload.message,
69+
result.payload.details,
70+
);
71+
}
72+
});
73+
child.on('error', async err => {
5774
await killChildProcessGracefully(child);
58-
resolve(result.payload);
59-
} else {
60-
progress.log(
61-
rootPromptDef,
62-
result.payload.state,
63-
result.payload.message,
64-
result.payload.details,
65-
);
66-
}
67-
});
68-
child.on('error', async err => {
69-
await killChildProcessGracefully(child);
70-
reject(err);
71-
});
72-
}),
73-
);
74-
},
75-
);
75+
reject(err);
76+
});
77+
}),
78+
);
79+
},
80+
);
81+
82+
if (result.errorMessage === undefined) {
83+
progress.log(rootPromptDef, 'success', 'Validation of running app is successful');
84+
} else {
85+
progress.log(rootPromptDef, 'error', 'Validation of running app failed', result.errorMessage);
86+
}
7687

77-
if (result.errorMessage === undefined) {
78-
progress.log(rootPromptDef, 'success', 'Testing is successful');
79-
} else {
80-
progress.log(rootPromptDef, 'error', 'Testing has failed', result.errorMessage);
88+
return result;
89+
} catch (e) {
90+
progress.log(rootPromptDef, 'error', 'Error while trying to validate running app', `${e}`);
8191
}
8292

83-
return result;
93+
return null;
8494
}

0 commit comments

Comments
 (0)