diff --git a/genkit-tools/common/package.json b/genkit-tools/common/package.json index 95a6104fa0..0c68d23b7f 100644 --- a/genkit-tools/common/package.json +++ b/genkit-tools/common/package.json @@ -12,6 +12,8 @@ "@asteasolutions/zod-to-openapi": "^7.0.0", "@trpc/server": "^10.45.2", "adm-zip": "^0.5.12", + "ajv": "^8.12.0", + "ajv-formats": "^3.0.1", "axios": "^1.7.7", "body-parser": "^1.20.2", "chokidar": "^3.5.3", @@ -26,8 +28,6 @@ "json-2-csv": "^5.5.1", "json-schema": "^0.4.0", "terminate": "^2.6.1", - "ajv": "^8.12.0", - "ajv-formats": "^3.0.1", "tsx": "^4.19.2", "uuid": "^9.0.1", "winston": "^3.11.0", diff --git a/genkit-tools/common/src/eval/evaluate.ts b/genkit-tools/common/src/eval/evaluate.ts index 6415d582b4..b2df6abccf 100644 --- a/genkit-tools/common/src/eval/evaluate.ts +++ b/genkit-tools/common/src/eval/evaluate.ts @@ -40,7 +40,11 @@ import { logger, stackTraceSpans, } from '../utils'; -import { enrichResultsWithScoring, extractMetricsMetadata } from './parser'; +import { + enrichResultsWithScoring, + extractMetricSummaries, + extractMetricsMetadata, +} from './parser'; interface InferenceRunState { testCaseId: string; @@ -181,11 +185,13 @@ export async function runEvaluation(params: { const scoredResults = enrichResultsWithScoring(scores, evalDataset); const metadata = extractMetricsMetadata(evaluatorActions); + const metricSummaries = extractMetricSummaries(scores); const evalRun = { key: { evalRunId, createdAt: new Date().toISOString(), + metricSummaries, ...augments, }, results: scoredResults, diff --git a/genkit-tools/common/src/eval/parser.ts b/genkit-tools/common/src/eval/parser.ts index 1a4ba976c0..42a28cf358 100644 --- a/genkit-tools/common/src/eval/parser.ts +++ b/genkit-tools/common/src/eval/parser.ts @@ -20,8 +20,14 @@ import { EvalFnResponse, EvalResponse } from '../types/evaluator'; import { EVALUATOR_METADATA_KEY_DEFINITION, EVALUATOR_METADATA_KEY_DISPLAY_NAME, + countBy, + groupBy, + meanBy, } from '../utils/eval'; +/** Maximum 
/** Maximum allowed unique strings / enums for generating summaries */
export const MAX_UNIQUE_STRING_DIST = 5;

/**
 * Aggregates raw evaluator responses into one summary object per metric.
 *
 * A metric is identified by the evaluator ref, or by `evaluatorRef/scoreId`
 * when an individual score carries an `id`. Every summary contains
 * `evaluator`, `testCaseCount` (number of responses returned by the
 * evaluator), `errorCount`, `scoreUndefinedCount` and `statusDistribution`.
 * Depending on the type of the defined scores it additionally contains:
 *  - `averageScore` when the scores are numbers,
 *  - `scoreDistribution` when the scores are booleans, or strings with at
 *    most `MAX_UNIQUE_STRING_DIST` distinct values.
 *
 * When the defined scores of a metric do not all share one type, no
 * score-derived field is emitted for that metric.
 *
 * @param scores evaluator responses, keyed by evaluatorRef
 * @returns one summary per metric, in order of first appearance
 */
export function extractMetricSummaries(
  /** key: evaluatorRef */
  scores: Record<string, EvalResponse>
) {
  // One row per individual score. A response may hold a single score or an
  // array of scores; both shapes are flattened here.
  const rows = Object.entries(scores).flatMap(([evaluator, responses]) =>
    responses.flatMap((response) => {
      const evaluations = Array.isArray(response.evaluation)
        ? response.evaluation
        : [response.evaluation];
      return evaluations.map((score) => ({
        // Synthetic ID to separate different scores of the same evaluator
        evaluator:
          score.id === undefined ? evaluator : `${evaluator}/${score.id}`,
        testCaseCount: responses.length,
        status: score.status,
        score: score.score,
        error: score.error,
      }));
    })
  );

  return Object.entries(groupBy(rows, 'evaluator')).map(
    ([evaluator, items]) => {
      const definedItems = items.filter((item) => item.score !== undefined);
      const summary = {
        evaluator,
        // Groups are never empty, and every row of a group carries the
        // response count of its evaluator.
        testCaseCount: items[0].testCaseCount as number,
        errorCount: items.filter((item) => item.error !== undefined).length,
        scoreUndefinedCount: items.length - definedItems.length,
        statusDistribution: countBy(items, 'status'),
      };

      // Aggregating only makes sense when all defined scores share a type;
      // otherwise averaging would silently concatenate strings.
      const scoreType =
        definedItems.length > 0 ? typeof definedItems[0].score : undefined;
      const isUniform = definedItems.every(
        (item) => typeof item.score === scoreType
      );
      if (!isUniform) {
        return summary;
      }

      switch (scoreType) {
        case 'number':
          return { ...summary, averageScore: meanBy(definedItems, 'score') };
        case 'boolean':
          return {
            ...summary,
            scoreDistribution: countBy(definedItems, 'score'),
          };
        case 'string': {
          // Treat as enum, but only when the number of distinct values is
          // small enough to be a meaningful distribution.
          const scoreDistribution = countBy(definedItems, 'score');
          return Object.keys(scoreDistribution).length <=
            MAX_UNIQUE_STRING_DIST
            ? { ...summary, scoreDistribution }
            : summary;
        }
        default:
          return summary;
      }
    }
  );
}

/** Enum that indicates if an evaluation has passed or failed */
export enum EvalStatusEnum {
  UNKNOWN = 'UNKNOWN',
  PASS = 'PASS',
  FAIL = 'FAIL',
}

/**
 * Helper method to groupBy an array of objects, replaces lodash equivalent.
 *
 * @param arr items to group
 * @param criteria property name to read from each item, or a function that
 *     computes the grouping key; the key is converted to a string
 * @returns a plain object mapping each key to the items that produced it,
 *     in their original order
 */
export function groupBy(
  arr: any[],
  criteria: ((i: any) => any) | string
): Record<string, any[]> {
  // Accumulate in a Map so that keys which collide with Object.prototype
  // members (e.g. 'hasOwnProperty', 'constructor') are handled like any other.
  const groups = new Map<string, any[]>();
  for (const item of arr) {
    const key = String(
      typeof criteria === 'function' ? criteria(item) : item[criteria]
    );
    const bucket = groups.get(key);
    if (bucket) {
      bucket.push(item);
    } else {
      groups.set(key, [item]);
    }
  }
  return Object.fromEntries(groups);
}

/**
 * Helper method to countBy an array of objects, replaces lodash equivalent.
 *
 * @param arr items to count
 * @param criteria property name to read from each item, or a function that
 *     computes the counting key; the key is converted to a string
 * @returns a plain object mapping each key to its number of occurrences
 */
export function countBy(
  arr: any[],
  criteria: ((i: any) => any) | string
): Record<string, number> {
  // A Map avoids reading inherited members of a plain object as an initial
  // count (e.g. for the keys 'constructor' or 'toString').
  const counts = new Map<string, number>();
  for (const item of arr) {
    const key = String(
      typeof criteria === 'function' ? criteria(item) : item[criteria]
    );
    counts.set(key, (counts.get(key) ?? 0) + 1);
  }
  return Object.fromEntries(counts);
}

/**
 * Helper method to meanBy an array of objects, replaces lodash equivalent.
 *
 * @param arr items to average
 * @param criteria property name to read from each item, or a function that
 *     computes the value; values are expected to be numbers
 * @returns the arithmetic mean of the values, or `undefined` when `arr` is
 *     missing or empty
 */
export function meanBy(
  arr: any[],
  criteria: ((i: any) => any) | string
): number | undefined {
  if (!arr || arr.length === 0) {
    return undefined;
  }

  let sum = 0;
  for (const item of arr) {
    sum += typeof criteria === 'function' ? criteria(item) : item[criteria];
  }

  return sum / arr.length;
}
simple numeric scores', () => { + const results = extractMetricSummaries(simpleEvalOutput); + + expect(results).toHaveLength(1); + + const result = results[0]; + expect(result).toEqual({ + evaluator: '/evaluator/genkit/context_relevancy', + testCaseCount: 3, + errorCount: 0, + scoreUndefinedCount: 0, + statusDistribution: { undefined: 3 }, + // 7 + 10 + 5 + averageScore: 22.0 / 3, + }); + }); + + it('scoreDistribution for simple boolean scores', () => { + const booleanScores = reMapScores(simpleEvalOutput, (response, i) => ({ + testCaseId: response.testCaseId, + evaluation: { + // True, False, True + score: i % 2 === 0, + }, + })); + const results = extractMetricSummaries(booleanScores); + + expect(results).toHaveLength(1); + + const result = results[0]; + expect(result).toEqual({ + evaluator: '/evaluator/genkit/context_relevancy', + testCaseCount: 3, + errorCount: 0, + scoreUndefinedCount: 0, + statusDistribution: { undefined: 3 }, + // True, False, True + scoreDistribution: { true: 2, false: 1 }, + }); + }); + + it('scoreDistribution for simple string scores (under 5)', () => { + const stringScores = reMapScores(simpleEvalOutput, (response, i) => ({ + testCaseId: response.testCaseId, + evaluation: { + // TYPE_0, TYPE_1, TYPE_0 + score: `TYPE_${i % 2}`, + }, + })); + const results = extractMetricSummaries(stringScores); + + expect(results).toHaveLength(1); + + const result = results[0]; + expect(result).toEqual({ + evaluator: '/evaluator/genkit/context_relevancy', + testCaseCount: 3, + errorCount: 0, + scoreUndefinedCount: 0, + statusDistribution: { undefined: 3 }, + // TYPE_0, TYPE_1, TYPE_0 + scoreDistribution: { TYPE_0: 2, TYPE_1: 1 }, + }); + }); + + it('scoreDistribution for simple string scores (over 5)', () => { + const extendedSimpleEvalOutput: Record = {}; + // 2x the simpleEvalOutput to get 6 samples. 
+ extendedSimpleEvalOutput['/evaluator/genkit/context_relevancy'] = Array( + 2 + ) + .fill(simpleEvalOutput['/evaluator/genkit/context_relevancy']) + .flat(); + + const stringScores = reMapScores( + extendedSimpleEvalOutput, + (response, i) => ({ + testCaseId: response.testCaseId, + evaluation: { + // TYPE_0, TYPE_1, TYPE_2, TYPE_3, TYPE_4, TYPE_5 + score: `TYPE_${i}`, + }, + }) + ); + const results = extractMetricSummaries(stringScores); + + expect(results).toHaveLength(1); + + const result = results[0]; + expect(result).toEqual({ + evaluator: '/evaluator/genkit/context_relevancy', + testCaseCount: 6, + errorCount: 0, + scoreUndefinedCount: 0, + statusDistribution: { undefined: 6 }, + }); + }); + + it('status distribution for simple numeric scores', () => { + const mockStatuses = [ + EvalStatusEnum.PASS, + EvalStatusEnum.FAIL, + undefined, + ]; + const withStatus = reMapScores(simpleEvalOutput, (response, i) => ({ + testCaseId: response.testCaseId, + evaluation: { + // 0, 1, 2 + score: i, + // PASS, FAIL, undefined + status: mockStatuses[i], + }, + })); + const results = extractMetricSummaries(withStatus); + + expect(results).toHaveLength(1); + + const result = results[0]; + expect(result).toEqual({ + evaluator: '/evaluator/genkit/context_relevancy', + testCaseCount: 3, + errorCount: 0, + scoreUndefinedCount: 0, + statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 }, + // avg(0, 1, 2) + averageScore: 3.0 / 3, + }); + }); + }); + + describe('edge cases', () => { + it('metrics if scores are undefined but status available', () => { + const mockStatuses = [ + EvalStatusEnum.PASS, + EvalStatusEnum.FAIL, + undefined, + ]; + const undefinedScores = reMapScores( + simpleEvalOutput, + (response, i) => ({ + testCaseId: response.testCaseId, + evaluation: { + score: undefined, + // PASS, FAIL, undefined + status: mockStatuses[i], + }, + }) + ); + const results = extractMetricSummaries(undefinedScores); + + expect(results).toHaveLength(1); + + const result = results[0]; + 
expect(result).toEqual({ + evaluator: '/evaluator/genkit/context_relevancy', + testCaseCount: 3, + errorCount: 0, + scoreUndefinedCount: 3, + // PASS, FAIL, undefined + statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 }, + }); + }); + + it('metrics if some scores are undefined with status available', () => { + const mockStatuses = [ + EvalStatusEnum.PASS, + EvalStatusEnum.FAIL, + undefined, + ]; + const someDefinedScores = reMapScores( + simpleEvalOutput, + (response, i) => ({ + testCaseId: response.testCaseId, + evaluation: { + // 0, 1, undefined + score: i === 2 ? undefined : i, + // PASS, FAIL, undefined + status: mockStatuses[i], + }, + }) + ); + const results = extractMetricSummaries(someDefinedScores); + + expect(results).toHaveLength(1); + + const result = results[0]; + expect(result).toEqual({ + evaluator: '/evaluator/genkit/context_relevancy', + testCaseCount: 3, + errorCount: 0, + scoreUndefinedCount: 1, + // avg(0, 1) + averageScore: 1 / 2.0, + // PASS, FAIL, undefined + statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 }, + }); + }); + + it('metrics if some scores are undefined, some errors and with status available', () => { + const mockStatuses = [ + EvalStatusEnum.PASS, + EvalStatusEnum.FAIL, + undefined, + ]; + const someDefinedScores = reMapScores( + simpleEvalOutput, + (response, i) => ({ + testCaseId: response.testCaseId, + evaluation: { + // undefined, 1, undefined + score: i % 2 === 0 ? undefined : i, + status: mockStatuses[i], + // error, undefined, error + error: i % 2 === 0 ? 
'some error' : undefined, + }, + }) + ); + const results = extractMetricSummaries(someDefinedScores); + + expect(results).toHaveLength(1); + + const result = results[0]; + expect(result).toEqual({ + evaluator: '/evaluator/genkit/context_relevancy', + testCaseCount: 3, + errorCount: 2, + scoreUndefinedCount: 2, + // avg(1) + averageScore: 1.0, + // PASS, FAIL, undefined + statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 }, + }); + }); + }); + + describe('multiple evaluators grouped', () => { + const multiOutput: Record = { + '/evaluator/genkit/faithfulness': [ + { + testCaseId: 'case1', + evaluation: { + score: 7, + }, + }, + { + testCaseId: 'case2', + evaluation: { + score: 10, + }, + }, + { + testCaseId: 'case3', + evaluation: { + score: 5, + }, + }, + ], + '/evaluator/genkit/context_relevancy': [ + { + testCaseId: 'case1', + evaluation: { + score: true, + }, + }, + { + testCaseId: 'case2', + evaluation: { + score: false, + }, + }, + { + testCaseId: 'case3', + evaluation: { + score: true, + }, + }, + ], + }; + + it('treats each evaluator separately', () => { + const results = extractMetricSummaries(multiOutput); + + expect(results).toHaveLength(2); + expect(results).toContainEqual({ + evaluator: '/evaluator/genkit/context_relevancy', + testCaseCount: 3, + errorCount: 0, + scoreUndefinedCount: 0, + statusDistribution: { undefined: 3 }, + // true, false, true + scoreDistribution: { true: 2, false: 1 }, + }); + expect(results).toContainEqual({ + evaluator: '/evaluator/genkit/faithfulness', + testCaseCount: 3, + errorCount: 0, + scoreUndefinedCount: 0, + statusDistribution: { undefined: 3 }, + // avg(7, 10, 5) + averageScore: 22.0 / 3, + }); + }); + + it('treats each evaluator separately, with errors, status, undefined scores', () => { + const mockStatuses = [ + EvalStatusEnum.PASS, + EvalStatusEnum.FAIL, + undefined, + ]; + const numericScores = [7, 10, 5]; + const stringScores = ['alpha', 'beta', 'gamma']; + const someDefinedScores = reMapScores( + multiOutput, 
+ (response, i, evaluator) => { + if (evaluator === '/evaluator/genkit/faithfulness') { + return { + testCaseId: response.testCaseId, + evaluation: { + // undefined, 10, undefined + score: i % 2 === 0 ? undefined : numericScores[i], + // PASS, FAIL, undefined + status: mockStatuses[i], + // error, undefined, error + error: i % 2 === 0 ? 'some error' : undefined, + }, + }; + } else { + return { + testCaseId: response.testCaseId, + evaluation: { + // alpha, undefined, gamma + score: i % 2 !== 0 ? undefined : stringScores[i], + // PASS, FAIL, undefined + status: mockStatuses[i], + // undefined, error, undefined + error: i % 2 !== 0 ? 'some error' : undefined, + }, + }; + } + } + ); + const results = extractMetricSummaries(someDefinedScores); + + expect(results).toHaveLength(2); + expect(results).toContainEqual({ + evaluator: '/evaluator/genkit/faithfulness', + testCaseCount: 3, + errorCount: 2, + scoreUndefinedCount: 2, + statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 }, + // avg(10) + averageScore: 10.0, + }); + expect(results).toContainEqual({ + evaluator: '/evaluator/genkit/context_relevancy', + testCaseCount: 3, + errorCount: 1, + scoreUndefinedCount: 1, + statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 }, + // alpha, gamma + scoreDistribution: { alpha: 1, gamma: 1 }, + }); + }); + + describe('multi-scores', () => { + it('mix of scores', () => { + const mockEvaluations = [ + { + score: 1, + status: EvalStatusEnum.PASS, + }, + [ + { + score: 1, + status: EvalStatusEnum.FAIL, + }, + { + score: 2, + status: EvalStatusEnum.PASS, + }, + ], + { + score: undefined, + }, + ]; + const mixedScores = reMapScores(simpleEvalOutput, (response, i) => ({ + testCaseId: response.testCaseId, + evaluation: mockEvaluations[i], + })); + const results = extractMetricSummaries(mixedScores); + + expect(results).toHaveLength(1); + + const result = results[0]; + expect(result).toEqual({ + evaluator: '/evaluator/genkit/context_relevancy', + testCaseCount: 3, + errorCount: 0, + 
scoreUndefinedCount: 1, + // avg(1, 1, 2) + averageScore: 4.0 / 3, + // PASS, FAIL, PASS, undefined + statusDistribution: { undefined: 1, PASS: 2, FAIL: 1 }, + }); + }); + + it('scores with IDs', () => { + const mockEvaluations = [ + [ + { + score: 5, + id: 'numeric', + status: EvalStatusEnum.PASS, + }, + { + score: 'YES', + id: 'enum', + status: EvalStatusEnum.FAIL, + }, + ], + [ + { + score: 7, + id: 'numeric', + status: EvalStatusEnum.FAIL, + }, + { + score: 'NO', + id: 'enum', + status: EvalStatusEnum.PASS, + }, + ], + [ + { + score: undefined, + id: 'numeric', + error: 'somer error', + }, + { + score: undefined, + id: 'enum', + }, + ], + ]; + const mixedScores = reMapScores(simpleEvalOutput, (response, i) => ({ + testCaseId: response.testCaseId, + evaluation: mockEvaluations[i], + })); + const results = extractMetricSummaries(mixedScores); + + expect(results).toHaveLength(2); + expect(results).toContainEqual({ + evaluator: '/evaluator/genkit/context_relevancy/numeric', + testCaseCount: 3, + errorCount: 1, + scoreUndefinedCount: 1, + // avg(5, 7) + averageScore: 12.0 / 2, + // PASS, FAIL, undefined + statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 }, + }); + expect(results).toContainEqual({ + evaluator: '/evaluator/genkit/context_relevancy/enum', + testCaseCount: 3, + errorCount: 0, + scoreUndefinedCount: 1, + // YES, NO + scoreDistribution: { YES: 1, NO: 1 }, + // FAIL, PASS, undefined + statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 }, + }); + }); + + it('multi-scores with IDs', () => { + const mockEvaluations = [ + [ + { + score: 5, + id: 'numeric', + status: EvalStatusEnum.PASS, + }, + { + score: 'YES', + id: 'enum', + status: EvalStatusEnum.FAIL, + }, + ], + [ + { + score: 7, + id: 'numeric', + status: EvalStatusEnum.FAIL, + }, + { + score: 'NO', + id: 'enum', + status: EvalStatusEnum.PASS, + }, + ], + [ + { + score: undefined, + id: 'numeric', + error: 'somer error', + }, + { + score: undefined, + id: 'enum', + }, + ], + ]; + const mixedScores 
/**
 * Test helper that builds a new score map by applying `fn` to every response
 * of every evaluator. The input map is not modified.
 *
 * @param scoresMap responses keyed by evaluator ref
 * @param fn mapper invoked with the response, its index within the
 *     evaluator's response list, and the evaluator ref
 * @returns a map with the same keys whose lists hold the mapped responses
 */
function reMapScores<T>(
  scoresMap: Record<string, T[]>,
  fn: (score: T, index: number, evaluator: string) => T
): Record<string, T[]> {
  return Object.fromEntries(
    Object.entries(scoresMap).map(([evaluator, scores]): [string, T[]] => [
      evaluator,
      scores.map((score, index) => fn(score, index, evaluator)),
    ])
  );
}
VertexAIEvaluationMetricType, } from '@genkit-ai/vertexai/evaluation'; import { genkit } from 'genkit'; -import { EvalStatusEnum } from 'genkit/evaluator'; import { langchain } from 'genkitx-langchain'; // Turn off safety checks for evaluation so that the LLM as an evaluator can @@ -63,10 +62,6 @@ export const ai = genkit({ type: GenkitMetric.MALICIOUSNESS, judge: gemini15Pro, judgeConfig: PERMISSIVE_SAFETY_SETTINGS, - statusOverrideFn: ({ score: Score }) => { - // Always set to fail to test override - return EvalStatusEnum.FAIL; - }, }, ], }),