Skip to content

feat(cli/evals): Generate metricSummaries #2768

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions genkit-tools/common/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
"@asteasolutions/zod-to-openapi": "^7.0.0",
"@trpc/server": "^10.45.2",
"adm-zip": "^0.5.12",
"ajv": "^8.12.0",
"ajv-formats": "^3.0.1",
"axios": "^1.7.7",
"body-parser": "^1.20.2",
"chokidar": "^3.5.3",
Expand All @@ -26,8 +28,6 @@
"json-2-csv": "^5.5.1",
"json-schema": "^0.4.0",
"terminate": "^2.6.1",
"ajv": "^8.12.0",
"ajv-formats": "^3.0.1",
"tsx": "^4.19.2",
"uuid": "^9.0.1",
"winston": "^3.11.0",
Expand Down
8 changes: 7 additions & 1 deletion genkit-tools/common/src/eval/evaluate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,11 @@ import {
logger,
stackTraceSpans,
} from '../utils';
import { enrichResultsWithScoring, extractMetricsMetadata } from './parser';
import {
enrichResultsWithScoring,
extractMetricSummaries,
extractMetricsMetadata,
} from './parser';

interface InferenceRunState {
testCaseId: string;
Expand Down Expand Up @@ -181,11 +185,13 @@ export async function runEvaluation(params: {

const scoredResults = enrichResultsWithScoring(scores, evalDataset);
const metadata = extractMetricsMetadata(evaluatorActions);
const metricSummaries = extractMetricSummaries(scores);

const evalRun = {
key: {
evalRunId,
createdAt: new Date().toISOString(),
metricSummaries,
...augments,
},
results: scoredResults,
Expand Down
130 changes: 130 additions & 0 deletions genkit-tools/common/src/eval/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,14 @@ import { EvalFnResponse, EvalResponse } from '../types/evaluator';
import {
EVALUATOR_METADATA_KEY_DEFINITION,
EVALUATOR_METADATA_KEY_DISPLAY_NAME,
countBy,
groupBy,
meanBy,
} from '../utils/eval';

/**
 * Maximum allowed unique strings / enums for generating summaries.
 *
 * extractMetricSummaries only attaches a `scoreDistribution` to a
 * string-scored metric when the number of distinct score values is at most
 * this limit.
 */
export const MAX_UNIQUE_STRING_DIST = 5;

/**
* Combines EvalInput with the generated scores to create a storable EvalResult.
*/
Expand Down Expand Up @@ -78,3 +84,127 @@ export function extractMetricsMetadata(evaluatorActions: Action[]) {
}
return metadata;
}

/**
 * Builds one summary record per metric from raw evaluator output.
 *
 * A metric is keyed by the evaluatorRef, or by `evaluatorRef/scoreId` when the
 * evaluator tags its scores with an `id`. Every summary carries `evaluator`,
 * `testCaseCount`, `errorCount`, `scoreUndefinedCount` and
 * `statusDistribution`. Depending on the type of the first defined score of
 * the metric it additionally carries:
 *  - number: `averageScore`
 *  - boolean: `scoreDistribution`
 *  - string: `scoreDistribution`, but only when there are at most
 *    MAX_UNIQUE_STRING_DIST distinct values
 *
 * @param scores evaluator responses, keyed by evaluatorRef
 * @returns one summary object per metric
 */
export function extractMetricSummaries(
  /** key: evaluatorRef */
  scores: Record<string, EvalResponse>
) {
  /** Score-type specific fields that may be added to a summary. */
  type ScoreDetail = {
    averageScore?: number | undefined;
    scoreDistribution?: Record<string, number>;
  };

  // key: evaluatorRef or evaluatorRef + scoreId (if available)
  const testCaseCountMap: Record<string, number> = {};

  // Pass 1: record how many responses each evaluator produced and flatten
  // single / multi-score responses into one list of scores per evaluator.
  const perEvaluator = Object.entries(scores).map(
    ([evaluator, responseArray]) => {
      testCaseCountMap[evaluator] = responseArray.length;
      return {
        evaluator,
        score: responseArray.flatMap((response) =>
          Array.isArray(response.evaluation)
            ? response.evaluation
            : [response.evaluation]
        ),
      };
    }
  );

  // Pass 2: emit one row per score. Scores without an `id` land in the
  // 'undefined' group and stay under the plain evaluatorRef; scores with an
  // `id` get a synthetic `evaluatorRef/scoreId` key so they are summarized
  // separately.
  const entries = perEvaluator.flatMap((entry) => {
    const testCaseCount = testCaseCountMap[entry.evaluator] ?? 0;

    return Object.entries(groupBy(entry.score, 'id')).flatMap(
      ([scoreId, groupScores]) => {
        let metricKey = entry.evaluator;
        if (scoreId !== 'undefined') {
          metricKey = entry.evaluator + '/' + scoreId;
          // Duplicate tracking to simplify lookup.
          testCaseCountMap[metricKey] = testCaseCount;
        }
        return groupScores.map((score) => ({
          evaluator: metricKey,
          testCaseCount,
          status: score.status,
          score: score.score,
          error: score.error,
        }));
      }
    );
  });

  // Picks the type-specific fields; `undefined` means "no typed summary".
  const describeScores = (definedItems: any[]): ScoreDetail | undefined => {
    if (definedItems.length === 0) {
      return undefined;
    }
    // The first defined score decides how the whole metric is summarized.
    switch (typeof definedItems[0].score) {
      case 'number':
        return { averageScore: meanBy(definedItems, 'score') };
      case 'boolean':
        return { scoreDistribution: countBy(definedItems, 'score') };
      case 'string': {
        // Treat as enum, but limit the number of distinct values.
        const scoreDistribution = countBy(definedItems, 'score');
        return Object.keys(scoreDistribution).length <= MAX_UNIQUE_STRING_DIST
          ? { scoreDistribution }
          : undefined;
      }
      default:
        return undefined;
    }
  };

  return Object.entries(groupBy(entries, 'evaluator')).map(
    ([evaluator, items]) => {
      const definedItems = items.filter(
        (item) => typeof item.score !== 'undefined'
      );
      const detail = describeScores(definedItems);

      return {
        evaluator,
        // With a typed summary the count comes from the sampled row,
        // otherwise from the lookup map.
        testCaseCount:
          detail !== undefined
            ? definedItems[0].testCaseCount
            : (testCaseCountMap[evaluator] ?? 0),
        errorCount: items.filter((item) => item.error !== undefined).length,
        scoreUndefinedCount: items.length - definedItems.length,
        statusDistribution: countBy(items, 'status'),
        ...detail,
      };
    }
  );
}
7 changes: 7 additions & 0 deletions genkit-tools/common/src/types/eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,12 @@ export const EvalInputDatasetSchema = z.array(EvalInputSchema);
export type EvalInputDataset = z.infer<typeof EvalInputDatasetSchema>;

const EvalStatusEnumSchema = z.enum(['UNKNOWN', 'PASS', 'FAIL']);
/**
 * Enum that indicates if an evaluation has passed or failed.
 *
 * String-valued: each member's value is the identically spelled literal, the
 * same three literals that EvalStatusEnumSchema accepts.
 */
export enum EvalStatusEnum {
UNKNOWN = 'UNKNOWN',
PASS = 'PASS',
FAIL = 'FAIL',
}

export const EvalMetricSchema = z.object({
evaluator: z.string(),
Expand Down Expand Up @@ -164,6 +170,7 @@ export const EvalRunKeySchema = z.object({
evalRunId: z.string(),
createdAt: z.string(),
actionConfig: z.any().optional(),
metricSummaries: z.array(z.record(z.string(), z.any())).optional(),
});
export type EvalRunKey = z.infer<typeof EvalRunKeySchema>;
export const EvalKeyAugmentsSchema = EvalRunKeySchema.pick({
Expand Down
57 changes: 57 additions & 0 deletions genkit-tools/common/src/utils/eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -363,3 +363,60 @@ export function getModelInput(data: any, modelConfig: any): GenerateRequest {
}
}
}

/**
 * Helper method to groupBy an array of objects, replaces lodash equivalent.
 *
 * @param arr items to group
 * @param criteria property name to read from each item, or a function that
 *   computes the group key from an item
 * @returns an object mapping each (stringified) key to the items that
 *   produced it, in their original order
 */
export function groupBy(
  arr: any[],
  criteria: ((i: any) => any) | string
): Record<string, any[]> {
  return arr.reduce(function (obj, item) {
    const key =
      typeof criteria === 'function' ? criteria(item) : item[criteria];

    // Go through Object.prototype instead of `obj.hasOwnProperty`: the
    // accumulator's keys are data, so a group named 'hasOwnProperty' would
    // otherwise shadow the method and make the next lookup throw.
    if (!Object.prototype.hasOwnProperty.call(obj, key)) {
      obj[key] = [];
    }
    obj[key].push(item);

    return obj;
  }, {});
}

/**
 * Helper method to countBy an array of objects, replaces lodash equivalent.
 *
 * @param arr items to count
 * @param criteria property name to read from each item, or a function that
 *   computes the key from an item
 * @returns an object mapping each (stringified) key to the number of items
 *   that produced it
 */
export function countBy(
  arr: any[],
  criteria: ((i: any) => any) | string
): Record<string, number> {
  return arr.reduce((acc, item) => {
    const key =
      typeof criteria === 'function' ? criteria(item) : item[criteria];
    // Only trust own properties: keys such as 'toString' or 'constructor'
    // also resolve to inherited members, which must not be mistaken for a
    // running count.
    const seen = Object.prototype.hasOwnProperty.call(acc, key) ? acc[key] : 0;
    acc[key] = seen + 1;

    return acc;
  }, {});
}

/**
 * Arithmetic mean of a value derived from every item; stands in for the
 * lodash helper of the same name.
 *
 * @param arr items to average over
 * @param criteria property name to read from each item, or a function that
 *   produces the value for an item
 * @returns the sum of the derived values divided by the item count, or
 *   `undefined` when `arr` is missing or empty
 */
export function meanBy(
  arr: any[],
  criteria: ((i: any) => any) | string
): number | undefined {
  if (!arr) {
    return undefined;
  }
  if (arr.length === 0) {
    return undefined;
  }

  // Resolve the accessor once instead of re-checking `criteria` per item.
  let readValue: (entry: any) => any;
  if (typeof criteria === 'function') {
    readValue = criteria;
  } else {
    const field = criteria;
    readValue = (entry) => entry[field];
  }

  let total = 0;
  for (let idx = 0; idx < arr.length; idx++) {
    total += readValue(arr[idx]);
  }

  return total / arr.length;
}
Loading
Loading