diff --git a/commits.txt b/commits.txt
new file mode 100644
index 000000000..73fd43c52
--- /dev/null
+++ b/commits.txt
@@ -0,0 +1,6 @@
+e8443df2
+9503a0a6
+293fe825
+a88ecd67
+66d74dd2
+53f34370
\ No newline at end of file
diff --git a/diff.txt b/diff.txt
new file mode 100644
index 000000000..b393df88f
--- /dev/null
+++ b/diff.txt
@@ -0,0 +1,9297 @@
+diff --git a/commits.txt b/commits.txt
+new file mode 100644
+index 00000000..73fd43c5
+--- /dev/null
++++ b/commits.txt
+@@ -0,0 +1,6 @@
++e8443df2
++9503a0a6
++293fe825
++a88ecd67
++66d74dd2
++53f34370
+\ No newline at end of file
+diff --git a/examples/with-client-side-tools/next-env.d.ts b/examples/with-client-side-tools/next-env.d.ts
+index 1b3be084..9edff1c7 100644
+--- a/examples/with-client-side-tools/next-env.d.ts
++++ b/examples/with-client-side-tools/next-env.d.ts
+@@ -1,5 +1,6 @@
+ /// <reference types="next" />
+ /// <reference types="next/image-types/global" />
++import "./.next/types/routes.d.ts";
+ 
+ // NOTE: This file should not be edited
+ // see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
+diff --git a/examples/with-client-side-tools/tsconfig.json b/examples/with-client-side-tools/tsconfig.json
+index 3697fcb9..0fca67d3 100644
+--- a/examples/with-client-side-tools/tsconfig.json
++++ b/examples/with-client-side-tools/tsconfig.json
+@@ -1,6 +1,10 @@
+ {
+   "compilerOptions": {
+-    "lib": ["dom", "dom.iterable", "esnext"],
++    "lib": [
++      "dom",
++      "dom.iterable",
++      "esnext"
++    ],
+     "allowJs": true,
+     "skipLibCheck": true,
+     "strict": true,
+@@ -11,7 +15,7 @@
+     "resolveJsonModule": true,
+     "isolatedModules": true,
+     "sourceMap": true,
+-    "jsx": "preserve",
++    "jsx": "react-jsx",
+     "incremental": true,
+     "plugins": [
+       {
+@@ -19,10 +23,20 @@
+       }
+     ],
+     "paths": {
+-      "@/*": ["./*"]
++      "@/*": [
++        "./*"
++      ]
+     },
+     "target": "ES2017"
+   },
+-  "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
+-  "exclude": ["node_modules"]
++  "include": [
++    "next-env.d.ts",
++    "**/*.ts",
++    "**/*.tsx",
++    ".next/types/**/*.ts",
++    ".next/dev/types/**/*.ts"
++  ],
++  "exclude": [
++    "node_modules"
++  ]
+ }
+diff --git a/examples/with-netlify-functions/netlify/functions/voltagent.js b/examples/with-netlify-functions/netlify/functions/voltagent.js
+new file mode 100644
+index 00000000..0ec386b8
+--- /dev/null
++++ b/examples/with-netlify-functions/netlify/functions/voltagent.js
+@@ -0,0 +1,4 @@
++import { createNetlifyFunctionHandler } from "@voltagent/serverless-hono";
++import { getVoltAgent } from "../../src/index";
++const voltAgent = getVoltAgent();
++export const handler = createNetlifyFunctionHandler(voltAgent);
+diff --git a/examples/with-netlify-functions/src/index.js b/examples/with-netlify-functions/src/index.js
+new file mode 100644
+index 00000000..af385b50
+--- /dev/null
++++ b/examples/with-netlify-functions/src/index.js
+@@ -0,0 +1,17 @@
++import { openai } from "@ai-sdk/openai";
++import { Agent, VoltAgent } from "@voltagent/core";
++import { serverlessHono } from "@voltagent/serverless-hono";
++import { weatherTool } from "./tools";
++const agent = new Agent({
++  name: "netlify-function-agent",
++  instructions: "Help the user quickly and call tools when needed.",
++  model: openai("gpt-4o-mini"),
++  tools: [weatherTool],
++});
++const voltAgent = new VoltAgent({
++  agents: { agent },
++  serverless: serverlessHono(),
++});
++export function getVoltAgent() {
++  return voltAgent;
++}
+diff --git a/examples/with-netlify-functions/src/tools/index.js b/examples/with-netlify-functions/src/tools/index.js
+new file mode 100644
+index 00000000..d1c5bf43
+--- /dev/null ++++ b/examples/with-netlify-functions/src/tools/index.js +@@ -0,0 +1,26 @@ ++import { createTool } from "@voltagent/core"; ++import z from "zod"; ++export const weatherTool = createTool({ ++ id: "get-weather", ++ name: "getWeather", ++ description: "Return a mock weather report for the requested location", ++ parameters: z.object({ ++ location: z.string().describe("City or location to look up"), ++ }), ++ execute: async ({ location }, context) => { ++ context?.logger.info(`Fetching weather for ${location}`); ++ const mockWeatherData = { ++ location, ++ temperature: Math.floor(Math.random() * 30) + 5, ++ condition: ["Sunny", "Cloudy", "Rainy", "Snowy", "Partly Cloudy"][ ++ Math.floor(Math.random() * 5) ++ ], ++ humidity: Math.floor(Math.random() * 60) + 30, ++ windSpeed: Math.floor(Math.random() * 30), ++ }; ++ return { ++ weather: mockWeatherData, ++ message: `Current weather in ${location}: ${mockWeatherData.temperature}°C and ${mockWeatherData.condition.toLowerCase()} with ${mockWeatherData.humidity}% humidity and wind speed of ${mockWeatherData.windSpeed} km/h.`, ++ }; ++ }, ++}); +diff --git a/package.json b/package.json +index 7c80f7c5..7e3ef8ba 100644 +--- a/package.json ++++ b/package.json +@@ -32,9 +32,10 @@ + "publint": "^0.3.8", + "rimraf": "^5.0.5", + "syncpack": "^13.0.2", ++ "ts-node": "^10.9.2", + "tslib": "^2.3.0", + "tsup": "^8.5.0", +- "typescript": "^5.8.2", ++ "typescript": "^5.9.2", + "vite": "^7.2.7", + "vitest": "^3.2.4" + }, +diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts +index 291bdf7f..84343c04 100644 +--- a/packages/core/src/agent/agent.ts ++++ b/packages/core/src/agent/agent.ts +@@ -48,6 +48,14 @@ import type { BaseRetriever } from "../retriever/retriever"; + import type { Tool, Toolkit } from "../tool"; + import { createTool } from "../tool"; + import { ToolManager } from "../tool/manager"; ++import { ++ type FallbackChainEntry, ++ type TrafficPriority, ++ type TrafficRequest, ++ type TrafficRequestMetadata, ++ getTrafficController, ++} from "../traffic/traffic-controller"; ++import { findHeaders } from "../traffic/traffic-error-utils"; + import { randomUUID } from "../utils/id"; + import { convertModelMessagesToUIMessages } from "../utils/message-converter"; + import { NodeType, createNodeId } from "../utils/node-utils"; +@@ -262,8 +270,42 @@ export interface BaseGenerationOptions extends Partial { + // Context + userId?: string; + conversationId?: string; ++ tenantId?: string; ++ /** ++ * Optional key metadata for per-key rate limits. ++ */ ++ apiKeyId?: string; ++ /** ++ * Optional region metadata for per-region rate limits. ++ */ ++ region?: string; ++ /** ++ * Optional endpoint metadata for per-endpoint rate limits. ++ */ ++ endpoint?: string; ++ /** ++ * Optional tenant tier metadata for per-tier rate limits. ++ */ ++ tenantTier?: string; + context?: ContextInput; + elicitation?: (request: unknown) => Promise; ++ /** ++ * Optional priority override for scheduling. ++ * Defaults to agent-level priority when omitted. ++ */ ++ trafficPriority?: TrafficPriority; ++ /** ++ * Optional maximum time to wait in the queue before timing out. ++ */ ++ maxQueueWaitMs?: number; ++ /** ++ * Optional task classification for circuit-breaker fallback policies. ++ */ ++ taskType?: string; ++ /** ++ * Optional explicit fallback policy id. 
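++   * Presumably selects the fallback chain directly, taking precedence over
++   * `taskType`-based policy matching. Usage sketch (the ids here are
++   * hypothetical): `agent.generateText("hi", { taskType: "chat",
++   * fallbackPolicyId: "cheap-models" })`.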
++ */ ++ fallbackPolicyId?: string; + + // Parent tracking + parentAgentId?: string; +@@ -303,6 +345,8 @@ export interface BaseGenerationOptions extends Partial { + + // Provider-specific options + providerOptions?: ProviderOptions; ++ // Optional per-call model override (used for fallbacks) ++ model?: LanguageModel; + + // Experimental output (for structured generation) + experimental_output?: ReturnType | ReturnType; +@@ -347,6 +391,7 @@ export class Agent { + readonly voice?: Voice; + readonly retriever?: BaseRetriever; + readonly supervisorConfig?: SupervisorConfig; ++ private readonly trafficPriority: TrafficPriority; + private readonly context?: Map; + + private readonly logger: Logger; +@@ -372,6 +417,7 @@ export class Agent { + this.temperature = options.temperature; + this.maxOutputTokens = options.maxOutputTokens; + this.maxSteps = options.maxSteps || 5; ++ this.trafficPriority = options.trafficPriority ?? "P1"; + this.stopWhen = options.stopWhen; + this.markdown = options.markdown ?? false; + this.voice = options.voice; +@@ -444,6 +490,47 @@ export class Agent { + async generateText( + input: string | UIMessage[] | BaseMessage[], + options?: GenerateTextOptions, ++ ): Promise { ++ const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics ++ const tenantId = this.resolveTenantId(options); ++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { ++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); ++ const metadata = this.buildTrafficMetadata( ++ mergedOptions?.model, ++ mergedOptions, ++ providerOverride, ++ ); // Compute once per queued request (including per-call model overrides) ++ return { ++ tenantId, ++ metadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ estimatedTokens: this.estimateTokens(input, mergedOptions), ++ execute: () => this.executeGenerateText(input, mergedOptions, metadata), // Defer actual execution so controller can schedule it ++ extractUsage: (result: GenerateTextResultWithContext) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackTarget) => { ++ if (this.isShortResponseFallback(fallbackTarget)) { ++ return this.buildShortTextFallbackRequest( ++ tenantId, ++ metadata, ++ mergedOptions, ++ fallbackTarget.text, ++ ); ++ } ++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = ++ this.resolveFallbackTarget(fallbackTarget); ++ return buildRequest(fallbackModel, fallbackProvider); ++ }, ++ }; ++ }; ++ ++ return controller.handleText(buildRequest(options?.model)); ++ } ++ ++ private async executeGenerateText( ++ input: string | UIMessage[] | BaseMessage[], ++ options?: GenerateTextOptions, ++ trafficMetadata?: TrafficRequestMetadata, + ): Promise { + const startTime = Date.now(); + const oc = this.createOperationContext(input, options); +@@ -471,7 +558,7 @@ export class Agent { + options, + ); + +- const modelName = this.getModelName(); ++ const modelName = this.getModelName(model); + const contextLimit = options?.contextLimit; + + // Add model attributes and all options +@@ -544,10 +631,20 @@ export class Agent { + hooks, + maxSteps: userMaxSteps, + tools: userTools, ++ maxQueueWaitMs, ++ taskType, ++ fallbackPolicyId, + experimental_output, + providerOptions, ++ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries) ++ model: _model, // Exclude model so aiSDKOptions doesn't override resolved model + ...aiSDKOptions + } = 
options || {}; ++ void _model; ++ void _maxRetries; ++ void maxQueueWaitMs; ++ void taskType; ++ void fallbackPolicyId; + + const llmSpan = this.createLLMSpan(oc, { + operation: "generateText", +@@ -567,6 +664,11 @@ export class Agent { + + let result!: GenerateTextResult; + try { ++ methodLogger.info("[AI SDK] Calling generateText", { ++ messageCount: messages.length, ++ modelName, ++ tools: tools ? Object.keys(tools) : [], ++ }); + result = await oc.traceContext.withSpan(llmSpan, () => + generateText({ + model, +@@ -575,7 +677,7 @@ export class Agent { + // Default values + temperature: this.temperature, + maxOutputTokens: this.maxOutputTokens, +- maxRetries: 3, ++ maxRetries: 0, + stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps), + // User overrides from AI SDK options + ...aiSDKOptions, +@@ -588,7 +690,15 @@ export class Agent { + onStepFinish: this.createStepHandler(oc, options), + }), + ); ++ methodLogger.info("[AI SDK] Received generateText result", { ++ finishReason: result.finishReason, ++ usage: result.usage ? safeStringify(result.usage) : undefined, ++ stepCount: result.steps?.length ?? 0, ++ rawResult: safeStringify(result), ++ }); ++ this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger); + } catch (error) { ++ this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger); + finalizeLLMSpan(SpanStatusCode.ERROR, { message: (error as Error).message }); + throw error; + } +@@ -771,6 +881,47 @@ export class Agent { + async streamText( + input: string | UIMessage[] | BaseMessage[], + options?: StreamTextOptions, ++ ): Promise { ++ const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent ++ const tenantId = this.resolveTenantId(options); ++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { ++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); ++ const metadata = this.buildTrafficMetadata( ++ mergedOptions?.model, ++ mergedOptions, ++ providerOverride, ++ ); // Compute once per queued request (including per-call model overrides) ++ return { ++ tenantId, ++ metadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ estimatedTokens: this.estimateTokens(input, mergedOptions), ++ execute: () => this.executeStreamText(input, mergedOptions, metadata), // Actual streaming work happens after the controller dequeues us ++ extractUsage: (result: StreamTextResultWithContext) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackTarget) => { ++ if (this.isShortResponseFallback(fallbackTarget)) { ++ return this.buildShortStreamTextFallbackRequest( ++ tenantId, ++ metadata, ++ mergedOptions, ++ fallbackTarget.text, ++ ); ++ } ++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = ++ this.resolveFallbackTarget(fallbackTarget); ++ return buildRequest(fallbackModel, fallbackProvider); ++ }, ++ }; ++ }; ++ ++ return controller.handleStream(buildRequest(options?.model)); ++ } ++ ++ private async executeStreamText( ++ input: string | UIMessage[] | BaseMessage[], ++ options?: StreamTextOptions, ++ trafficMetadata?: TrafficRequestMetadata, + ): Promise { + const startTime = Date.now(); + const oc = this.createOperationContext(input, options); +@@ -800,7 +951,7 @@ export class Agent { + options, + ); + +- const modelName = this.getModelName(); ++ const modelName = this.getModelName(model); + const contextLimit = options?.contextLimit; + + // Add 
model attributes to root span if TraceContext exists +@@ -868,10 +1019,20 @@ export class Agent { + maxSteps: userMaxSteps, + tools: userTools, + onFinish: userOnFinish, ++ maxQueueWaitMs, ++ taskType, ++ fallbackPolicyId, + experimental_output, + providerOptions, ++ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries) ++ model: _model, // Exclude model from aiSDKOptions to avoid overriding resolved model + ...aiSDKOptions + } = options || {}; ++ void _model; ++ void _maxRetries; ++ void maxQueueWaitMs; ++ void taskType; ++ void fallbackPolicyId; + + const guardrailStreamingEnabled = guardrailSet.output.length > 0; + +@@ -893,7 +1054,13 @@ export class Agent { + }, + }); + const finalizeLLMSpan = this.createLLMSpanFinalizer(llmSpan); ++ const trafficController = getTrafficController({ logger: this.logger }); + ++ methodLogger.info("[AI SDK] Calling streamText", { ++ messageCount: messages.length, ++ modelName, ++ tools: tools ? Object.keys(tools) : [], ++ }); + const result = streamText({ + model, + messages, +@@ -901,7 +1068,7 @@ export class Agent { + // Default values + temperature: this.temperature, + maxOutputTokens: this.maxOutputTokens, +- maxRetries: 3, ++ maxRetries: 0, // Retry via traffic controller to avoid provider-level storms + stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps), + // User overrides from AI SDK options + ...aiSDKOptions, +@@ -937,6 +1104,8 @@ export class Agent { + modelName: this.getModelName(), + }); + ++ this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger); ++ trafficController.reportStreamFailure(trafficMetadata, actualError); + finalizeLLMSpan(SpanStatusCode.ERROR, { message: (actualError as Error)?.message }); + + // History update removed - using OpenTelemetry only +@@ -962,6 +1131,18 @@ export class Agent { + .catch(() => {}); + }, + onFinish: async (finalResult) => { ++ methodLogger.info("[AI SDK] streamText finished", { ++ finishReason: finalResult.finishReason, ++ usage: finalResult.totalUsage ? safeStringify(finalResult.totalUsage) : undefined, ++ stepCount: finalResult.steps?.length ?? 0, ++ rawResult: safeStringify(finalResult), ++ }); ++ this.updateTrafficControllerRateLimits( ++ finalResult.response, ++ trafficMetadata, ++ methodLogger, ++ ); ++ trafficController.reportStreamSuccess(trafficMetadata); + const providerUsage = finalResult.usage + ? 
await Promise.resolve(finalResult.usage) + : undefined; +@@ -1428,6 +1609,49 @@ export class Agent { + input: string | UIMessage[] | BaseMessage[], + schema: T, + options?: GenerateObjectOptions, ++ ): Promise>> { ++ const controller = getTrafficController({ logger: this.logger }); ++ const tenantId = this.resolveTenantId(options); ++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { ++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); ++ const metadata = this.buildTrafficMetadata( ++ mergedOptions?.model, ++ mergedOptions, ++ providerOverride, ++ ); // Compute once per queued request (including per-call model overrides) ++ return { ++ tenantId, ++ metadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ estimatedTokens: this.estimateTokens(input, mergedOptions), ++ execute: () => this.executeGenerateObject(input, schema, mergedOptions, metadata), ++ extractUsage: (result: GenerateObjectResultWithContext>) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackTarget) => { ++ if (this.isShortResponseFallback(fallbackTarget)) { ++ return this.buildShortObjectFallbackRequest( ++ tenantId, ++ metadata, ++ schema, ++ mergedOptions, ++ fallbackTarget.text, ++ ); ++ } ++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = ++ this.resolveFallbackTarget(fallbackTarget); ++ return buildRequest(fallbackModel, fallbackProvider); ++ }, ++ }; ++ }; ++ ++ return controller.handleText(buildRequest(options?.model)); ++ } ++ ++ private async executeGenerateObject( ++ input: string | UIMessage[] | BaseMessage[], ++ schema: T, ++ options?: GenerateObjectOptions, ++ trafficMetadata?: TrafficRequestMetadata, + ): Promise>> { + const startTime = Date.now(); + const oc = this.createOperationContext(input, options); +@@ -1452,7 +1676,7 @@ export class Agent { + options, + ); + +- const modelName = this.getModelName(); ++ const modelName = this.getModelName(model); + const schemaName = schema.description || "unknown"; + + // Add model attributes and all options +@@ -1510,10 +1734,25 @@ export class Agent { + hooks, + maxSteps: userMaxSteps, + tools: userTools, ++ taskType, ++ fallbackPolicyId, ++ maxQueueWaitMs, + providerOptions, ++ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries) ++ model: _model, // Exclude model so spread does not override resolved model + ...aiSDKOptions + } = options || {}; +- ++ void _model; ++ void _maxRetries; ++ void taskType; ++ void fallbackPolicyId; ++ void maxQueueWaitMs; ++ ++ methodLogger.info("[AI SDK] Calling generateObject", { ++ messageCount: messages.length, ++ modelName, ++ schemaName, ++ }); + const result = await generateObject({ + model, + messages, +@@ -1522,7 +1761,7 @@ export class Agent { + // Default values + maxOutputTokens: this.maxOutputTokens, + temperature: this.temperature, +- maxRetries: 3, ++ maxRetries: 0, + // User overrides from AI SDK options + ...aiSDKOptions, + // Provider-specific options +@@ -1530,6 +1769,13 @@ export class Agent { + // VoltAgent controlled + abortSignal: oc.abortController.signal, + }); ++ methodLogger.info("[AI SDK] Received generateObject result", { ++ finishReason: result.finishReason, ++ usage: result.usage ? 
safeStringify(result.usage) : undefined, ++ warnings: result.warnings, ++ rawResult: safeStringify(result), ++ }); ++ this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger); + + const usageInfo = convertUsage(result.usage); + const finalObject = await executeOutputGuardrails({ +@@ -1638,6 +1884,7 @@ export class Agent { + context: oc.context, + }; + } catch (error) { ++ this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger); + await this.flushPendingMessagesOnError(oc).catch(() => {}); + return this.handleError(error as Error, oc, options, startTime); + } finally { +@@ -1655,6 +1902,49 @@ export class Agent { + input: string | UIMessage[] | BaseMessage[], + schema: T, + options?: StreamObjectOptions, ++ ): Promise>> { ++ const controller = getTrafficController({ logger: this.logger }); ++ const tenantId = this.resolveTenantId(options); ++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { ++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); ++ const metadata = this.buildTrafficMetadata( ++ mergedOptions?.model, ++ mergedOptions, ++ providerOverride, ++ ); // Compute once per queued request (including per-call model overrides) ++ return { ++ tenantId, ++ metadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ estimatedTokens: this.estimateTokens(input, mergedOptions), ++ execute: () => this.executeStreamObject(input, schema, mergedOptions, metadata), ++ extractUsage: (result: StreamObjectResultWithContext>) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackTarget) => { ++ if (this.isShortResponseFallback(fallbackTarget)) { ++ return this.buildShortStreamObjectFallbackRequest( ++ tenantId, ++ metadata, ++ schema, ++ mergedOptions, ++ fallbackTarget.text, ++ ); ++ } ++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = ++ this.resolveFallbackTarget(fallbackTarget); ++ return buildRequest(fallbackModel, fallbackProvider); ++ }, ++ }; ++ }; ++ ++ return controller.handleStream(buildRequest(options?.model)); ++ } ++ ++ private async executeStreamObject( ++ input: string | UIMessage[] | BaseMessage[], ++ schema: T, ++ options?: StreamObjectOptions, ++ trafficMetadata?: TrafficRequestMetadata, + ): Promise>> { + const startTime = Date.now(); + const oc = this.createOperationContext(input, options); +@@ -1680,7 +1970,7 @@ export class Agent { + options, + ); + +- const modelName = this.getModelName(); ++ const modelName = this.getModelName(model); + const schemaName = schema.description || "unknown"; + + // Add model attributes and all options +@@ -1739,14 +2029,30 @@ export class Agent { + maxSteps: userMaxSteps, + tools: userTools, + onFinish: userOnFinish, ++ taskType, ++ fallbackPolicyId, ++ maxQueueWaitMs, + providerOptions, ++ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries) ++ model: _model, // Exclude model so aiSDKOptions cannot override resolved model + ...aiSDKOptions + } = options || {}; ++ void _model; ++ void _maxRetries; ++ void taskType; ++ void fallbackPolicyId; ++ void maxQueueWaitMs; + + let guardrailObjectPromise!: Promise>; + let resolveGuardrailObject: ((value: z.infer) => void) | undefined; + let rejectGuardrailObject: ((reason: unknown) => void) | undefined; ++ const trafficController = getTrafficController({ logger: this.logger }); + ++ methodLogger.info("[AI SDK] Calling streamObject", { ++ messageCount: messages.length, ++ modelName, ++ schemaName, ++ }); + 
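++    // maxRetries is pinned to 0 below so the shared TrafficController owns
++    // retries and backoff; the AI SDK's default of 2 provider-level retries
++    // would otherwise multiply upstream traffic during rate-limit storms.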
const result = streamObject({ + model, + messages, +@@ -1755,7 +2061,7 @@ export class Agent { + // Default values + maxOutputTokens: this.maxOutputTokens, + temperature: this.temperature, +- maxRetries: 3, ++ maxRetries: 0, + // User overrides from AI SDK options + ...aiSDKOptions, + // Provider-specific options +@@ -1771,9 +2077,11 @@ export class Agent { + methodLogger.error("Stream object error occurred", { + error: actualError, + agentName: this.name, +- modelName: this.getModelName(), ++ modelName: this.getModelName(model), + schemaName: schemaName, + }); ++ this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger); ++ trafficController.reportStreamFailure(trafficMetadata, actualError); + + // History update removed - using OpenTelemetry only + +@@ -1800,6 +2108,17 @@ export class Agent { + }, + onFinish: async (finalResult: any) => { + try { ++ methodLogger.info("[AI SDK] streamObject finished", { ++ finishReason: finalResult.finishReason, ++ usage: finalResult.usage ? safeStringify(finalResult.usage) : undefined, ++ rawResult: safeStringify(finalResult), ++ }); ++ this.updateTrafficControllerRateLimits( ++ finalResult.response, ++ trafficMetadata, ++ methodLogger, ++ ); ++ trafficController.reportStreamSuccess(trafficMetadata); + const usageInfo = convertUsage(finalResult.usage as any); + let finalObject = finalResult.object as z.infer; + if (guardrailSet.output.length > 0) { +@@ -2021,8 +2340,9 @@ export class Agent { + // Calculate maxSteps (use provided option or calculate based on subagents) + const maxSteps = options?.maxSteps ?? this.calculateMaxSteps(); + +- // Resolve dynamic values +- const model = await this.resolveValue(this.model, oc); ++ // Resolve dynamic values (allow per-call model override for fallbacks) ++ const selectedModel = options?.model ?? this.model; ++ const model = await this.resolveValue(selectedModel, oc); + const dynamicToolList = (await this.resolveValue(this.dynamicTools, oc)) || []; + + // Merge agent tools with option tools +@@ -2073,6 +2393,12 @@ export class Agent { + ): OperationContext { + const operationId = randomUUID(); + const startTimeDate = new Date(); ++ const priority = this.resolveTrafficPriority(options); ++ const tenantId = this.resolveTenantId(options); ++ const apiKeyId = options?.apiKeyId ?? options?.parentOperationContext?.apiKeyId; ++ const region = options?.region ?? options?.parentOperationContext?.region; ++ const endpoint = options?.endpoint ?? options?.parentOperationContext?.endpoint; ++ const tenantTier = options?.tenantTier ?? 
options?.parentOperationContext?.tenantTier; + + // Prefer reusing an existing context instance to preserve reference across calls/subagents + const runtimeContext = toContextMap(options?.context); +@@ -2123,6 +2449,7 @@ export class Agent { + operationId, + userId: options?.userId, + conversationId: options?.conversationId, ++ tenantId, + executionId: operationId, + }); + +@@ -2137,6 +2464,9 @@ export class Agent { + parentAgentId: options?.parentAgentId, + input, + }); ++ if (tenantId) { ++ traceContext.getRootSpan().setAttribute("tenant.id", tenantId); ++ } + traceContext.getRootSpan().setAttribute("voltagent.operation_id", operationId); + + // Use parent's AbortController if available, otherwise create new one +@@ -2174,8 +2504,14 @@ export class Agent { + logger, + conversationSteps: options?.parentOperationContext?.conversationSteps || [], + abortController, ++ priority, + userId: options?.userId, + conversationId: options?.conversationId, ++ tenantId, ++ apiKeyId, ++ region, ++ endpoint, ++ tenantTier, + parentAgentId: options?.parentAgentId, + traceContext, + startTime: startTimeDate, +@@ -3170,6 +3506,20 @@ export class Agent { + return value; + } + ++ private mergeOptionsWithModel( ++ options: BaseGenerationOptions | undefined, ++ modelOverride?: LanguageModel, ++ ): BaseGenerationOptions | undefined { ++ if (!options && modelOverride === undefined) { ++ return undefined; ++ } ++ ++ return { ++ ...(options ?? {}), ++ ...(modelOverride !== undefined ? { model: modelOverride } : {}), ++ }; ++ } ++ + /** + * Prepare tools with execution context + */ +@@ -3822,17 +4172,622 @@ export class Agent { + return this.subAgentManager.calculateMaxSteps(this.maxSteps); + } + ++ private resolveTrafficPriority(options?: BaseGenerationOptions): TrafficPriority { ++ const normalize = (value?: TrafficPriority): TrafficPriority | undefined => { ++ if (value === "P0" || value === "P1" || value === "P2") { ++ return value; ++ } ++ return undefined; ++ }; ++ ++ const parentPriority = normalize(options?.parentOperationContext?.priority); ++ const localPriority = normalize(options?.trafficPriority) ?? this.trafficPriority ?? "P1"; ++ ++ if (parentPriority) { ++ return this.pickHigherPriority(parentPriority, localPriority); ++ } ++ ++ return localPriority; ++ } ++ ++ private resolveTenantId(options?: BaseGenerationOptions): string { ++ const parentTenant = options?.parentOperationContext?.tenantId; ++ if (parentTenant) { ++ return parentTenant; ++ } ++ ++ if (options?.tenantId) { ++ return options.tenantId; ++ } ++ ++ return "default"; ++ } ++ ++ private pickHigherPriority(a: TrafficPriority, b: TrafficPriority): TrafficPriority { ++ const rank: Record = { P0: 0, P1: 1, P2: 2 }; ++ return rank[a] <= rank[b] ? a : b; ++ } ++ ++ private buildTrafficMetadata( ++ modelOverride?: LanguageModel | DynamicValue, ++ options?: BaseGenerationOptions, ++ providerOverride?: string, ++ ): TrafficRequestMetadata { ++ const provider = ++ providerOverride ?? ++ this.resolveProvider(modelOverride) ?? ++ this.resolveProvider(this.model) ?? ++ undefined; ++ const priority = this.resolveTrafficPriority(options); ++ const apiKeyId = options?.apiKeyId ?? options?.parentOperationContext?.apiKeyId; ++ const region = options?.region ?? options?.parentOperationContext?.region; ++ const endpoint = options?.endpoint ?? options?.parentOperationContext?.endpoint; ++ const tenantTier = options?.tenantTier ?? 
options?.parentOperationContext?.tenantTier; ++ ++ return { ++ agentId: this.id, // Identify which agent issued the request ++ agentName: this.name, // Human-readable label for logs/metrics ++ model: this.getModelName(modelOverride), // Used for future capacity policies ++ provider, // Allows per-provider throttling later ++ priority, ++ tenantId: this.resolveTenantId(options), ++ apiKeyId, ++ region, ++ endpoint, ++ tenantTier, ++ taskType: options?.taskType, ++ fallbackPolicyId: options?.fallbackPolicyId, ++ }; ++ } ++ ++ private estimateTokens( ++ input: string | UIMessage[] | BaseMessage[], ++ options?: BaseGenerationOptions, ++ ): number | undefined { ++ let text = ""; ++ if (typeof input === "string") { ++ text = input; ++ } else if (Array.isArray(input)) { ++ text = input ++ .map((message) => { ++ if (typeof message === "string") return message; ++ if (message && typeof message === "object") { ++ const content = (message as { content?: unknown }).content; ++ if (typeof content === "string") return content; ++ if (content !== undefined) return safeStringify(content); ++ return safeStringify(message); ++ } ++ return String(message ?? ""); ++ }) ++ .join(" "); ++ } else if (input) { ++ text = safeStringify(input); ++ } ++ ++ const inputTokens = text ? Math.ceil(text.length / 4) : 0; ++ const outputTokensRaw = ++ typeof options?.maxOutputTokens === "number" ? options.maxOutputTokens : this.maxOutputTokens; ++ const outputTokens = ++ typeof outputTokensRaw === "number" && Number.isFinite(outputTokensRaw) ++ ? Math.max(0, Math.floor(outputTokensRaw)) ++ : 0; ++ const total = inputTokens + outputTokens; ++ return total > 0 ? total : undefined; ++ } ++ ++ private resolveFallbackTarget(target: FallbackChainEntry): { ++ modelOverride?: LanguageModel; ++ providerOverride?: string; ++ } { ++ if (typeof target === "string") { ++ return { modelOverride: target }; ++ } ++ return { ++ modelOverride: target.model, ++ providerOverride: target.provider, ++ }; ++ } ++ ++ private isShortResponseFallback( ++ target: FallbackChainEntry, ++ ): target is { kind: "short-response"; text: string } { ++ return ( ++ typeof target === "object" && ++ target !== null && ++ "kind" in target && ++ (target as { kind?: string }).kind === "short-response" ++ ); ++ } ++ ++ private buildShortResponseMetadata( ++ baseMetadata: TrafficRequestMetadata | undefined, ++ ): TrafficRequestMetadata { ++ const metadata = baseMetadata ?? 
this.buildTrafficMetadata(); ++ return { ++ ...metadata, ++ provider: "short-response", ++ model: "short-response", ++ }; ++ } ++ ++ private createZeroUsage(): LanguageModelUsage { ++ return { inputTokens: 0, outputTokens: 0, totalTokens: 0 }; ++ } ++ ++ private createShortTextStream(text: string): AsyncIterableStream { ++ return createAsyncIterableReadable((controller) => { ++ controller.enqueue(text); ++ controller.close(); ++ }); ++ } ++ ++ private createShortFullStream(text: string): AsyncIterableStream { ++ const usage = this.createZeroUsage(); ++ const id = `short-response-${randomUUID()}`; ++ return createAsyncIterableReadable((controller) => { ++ controller.enqueue({ ++ type: "text-delta", ++ id, ++ delta: text, ++ text, ++ } as VoltAgentTextStreamPart); ++ controller.enqueue({ ++ type: "finish", ++ finishReason: "stop", ++ usage, ++ totalUsage: usage, ++ } as VoltAgentTextStreamPart); ++ controller.close(); ++ }); ++ } ++ ++ private createShortTextResult( ++ text: string, ++ options?: GenerateTextOptions, ++ ): GenerateTextResultWithContext { ++ const usage = this.createZeroUsage(); ++ const context = toContextMap(options?.context) ?? new Map(); ++ const createTextStream = (): AsyncIterableStream => this.createShortTextStream(text); ++ ++ return { ++ text, ++ content: [], ++ reasoning: [], ++ reasoningText: "", ++ files: [], ++ sources: [], ++ toolCalls: [], ++ staticToolCalls: [], ++ dynamicToolCalls: [], ++ toolResults: [], ++ staticToolResults: [], ++ dynamicToolResults: [], ++ usage, ++ totalUsage: usage, ++ warnings: [], ++ finishReason: "stop", ++ steps: [], ++ experimental_output: undefined, ++ response: { ++ id: "short-response", ++ modelId: "short-response", ++ timestamp: new Date(), ++ messages: [], ++ }, ++ context, ++ request: { ++ body: {}, ++ }, ++ providerMetadata: undefined, ++ experimental_providerMetadata: undefined, ++ pipeTextStreamToResponse: (response, init) => { ++ pipeTextStreamToResponse({ ++ response, ++ textStream: createTextStream(), ++ ...(init ?? {}), ++ }); ++ }, ++ toTextStreamResponse: (init) => { ++ return createTextStreamResponse({ ++ textStream: createTextStream(), ++ ...(init ?? {}), ++ }); ++ }, ++ toDataStream: () => createTextStream(), ++ toDataStreamResponse: (init) => { ++ return createTextStreamResponse({ ++ textStream: createTextStream(), ++ ...(init ?? {}), ++ }); ++ }, ++ pipeDataStreamToResponse: (response, init) => { ++ pipeTextStreamToResponse({ ++ response, ++ textStream: createTextStream(), ++ ...(init ?? {}), ++ }); ++ }, ++ } as GenerateTextResultWithContext; ++ } ++ ++ private createShortStreamTextResult( ++ text: string, ++ options?: StreamTextOptions, ++ ): StreamTextResultWithContext { ++ const usage = this.createZeroUsage(); ++ const context = toContextMap(options?.context) ?? new Map(); ++ const createTextStream = (): AsyncIterableStream => this.createShortTextStream(text); ++ const createFullStream = (): AsyncIterableStream => ++ this.createShortFullStream(text); ++ ++ const toUIMessageStream = (_options?: unknown) => ++ createUIMessageStream({ ++ execute: async ({ writer }) => { ++ writer.write({ type: "text", text } as any); ++ }, ++ onError: (error) => String(error), ++ }); ++ ++ const toUIMessageStreamResponse = (options?: ResponseInit) => { ++ const stream = toUIMessageStream(options); ++ const responseInit = options ? 
{ ...options } : {};
++      return createUIMessageStreamResponse({
++        stream,
++        ...responseInit,
++      });
++    };
++
++    // Renamed from pipeUIMessageStreamToResponse: a `const` with the same name
++    // as the imported helper would shadow it here, so the inner call would
++    // recurse into itself instead of piping the stream through the helper.
++    const pipeShortUIMessageStream = (response: any, init?: ResponseInit) => {
++      const stream = toUIMessageStream(init);
++      const initOptions = init ? { ...init } : {};
++      pipeUIMessageStreamToResponse({
++        response,
++        stream,
++        ...initOptions,
++      });
++    };
++
++    return {
++      text: Promise.resolve(text),
++      get textStream() {
++        return createTextStream();
++      },
++      get fullStream() {
++        return createFullStream();
++      },
++      usage: Promise.resolve(usage),
++      finishReason: Promise.resolve("stop"),
++      experimental_partialOutputStream: undefined,
++      toUIMessageStream: toUIMessageStream as StreamTextResultWithContext["toUIMessageStream"],
++      toUIMessageStreamResponse:
++        toUIMessageStreamResponse as StreamTextResultWithContext["toUIMessageStreamResponse"],
++      pipeUIMessageStreamToResponse:
++        pipeShortUIMessageStream as StreamTextResultWithContext["pipeUIMessageStreamToResponse"],
++      pipeTextStreamToResponse: (response, init) => {
++        pipeTextStreamToResponse({
++          response,
++          textStream: createTextStream(),
++          ...(init ?? {}),
++        });
++      },
++      toTextStreamResponse: (init) => {
++        return createTextStreamResponse({
++          textStream: createTextStream(),
++          ...(init ?? {}),
++        });
++      },
++      context,
++    };
++  }
++
++  // Coerce the fallback text into the caller's schema: parsed JSON first,
++  // then the raw string, then a { text } wrapper.
++  private resolveShortResponseObject<T extends z.ZodType>(schema: T, text: string): z.infer<T> {
++    const candidates: unknown[] = [];
++    if (text.length > 0) {
++      try {
++        candidates.push(JSON.parse(text));
++      } catch {}
++    }
++    candidates.push(text);
++    candidates.push({ text });
++    for (const candidate of candidates) {
++      const parsed = schema.safeParse(candidate);
++      if (parsed.success) {
++        return parsed.data;
++      }
++    }
++    return (candidates[0] ?? text) as z.infer<T>;
++  }
++
++  private createShortObjectResult<T extends z.ZodType>(
++    schema: T,
++    text: string,
++    options?: GenerateObjectOptions,
++  ): GenerateObjectResultWithContext<z.infer<T>> {
++    const object = this.resolveShortResponseObject(schema, text);
++    const usage = this.createZeroUsage();
++    const context = toContextMap(options?.context) ?? new Map();
++
++    return {
++      object,
++      usage,
++      warnings: [],
++      finishReason: "stop",
++      response: {
++        id: "short-response",
++        modelId: "short-response",
++        timestamp: new Date(),
++        messages: [],
++      },
++      context,
++      request: {
++        body: {},
++      },
++      reasoning: "",
++      providerMetadata: undefined,
++      toJsonResponse: (init?: ResponseInit) => {
++        const responseInit = init ? { ...init } : {};
++        const headers = {
++          "content-type": "application/json",
++          ...(responseInit.headers ?? {}),
++        };
++        return new Response(safeStringify(object), {
++          ...responseInit,
++          headers,
++        });
++      },
++    } as GenerateObjectResultWithContext<z.infer<T>>;
++  }
++
++  private createShortStreamObjectResult<T extends z.ZodType>(
++    schema: T,
++    text: string,
++    options?: StreamObjectOptions,
++  ): StreamObjectResultWithContext<z.infer<T>> {
++    const object = this.resolveShortResponseObject(schema, text);
++    const usage = this.createZeroUsage();
++    const context = toContextMap(options?.context) ?? new Map();
++    const textPayload = safeStringify(object);
++    const createTextStream = (): AsyncIterableStream<string> =>
++      this.createShortTextStream(textPayload);
++
++    const partialObjectStream = new ReadableStream<Partial<z.infer<T>>>({
++      start(controller) {
++        controller.enqueue(object);
++        controller.close();
++      },
++    });
++
++    return {
++      object: Promise.resolve(object),
++      partialObjectStream,
++      textStream: createTextStream(),
++      warnings: Promise.resolve(undefined),
++      usage: Promise.resolve(usage),
++      finishReason: Promise.resolve("stop"),
++      pipeTextStreamToResponse: (response, init) => {
++        pipeTextStreamToResponse({
++          response,
++          textStream: createTextStream(),
++          ...(init ?? {}),
++        });
++      },
++      toTextStreamResponse: (init) => {
++        return createTextStreamResponse({
++          textStream: createTextStream(),
++          ...(init ?? {}),
++        });
++      },
++      context,
++    };
++  }
++
++  private buildShortTextFallbackRequest(
++    tenantId: string,
++    metadata: TrafficRequestMetadata | undefined,
++    options: GenerateTextOptions | undefined,
++    text: string,
++  ): TrafficRequest<GenerateTextResultWithContext> {
++    const shortMetadata = this.buildShortResponseMetadata(metadata);
++    return {
++      tenantId,
++      metadata: shortMetadata,
++      maxQueueWaitMs: options?.maxQueueWaitMs,
++      estimatedTokens: 0,
++      execute: async () => this.createShortTextResult(text, options),
++      extractUsage: (result: GenerateTextResultWithContext) =>
++        this.extractUsageFromResponse(result),
++    };
++  }
++
++  private buildShortStreamTextFallbackRequest(
++    tenantId: string,
++    metadata: TrafficRequestMetadata | undefined,
++    options: StreamTextOptions | undefined,
++    text: string,
++  ): TrafficRequest<StreamTextResultWithContext> {
++    const shortMetadata = this.buildShortResponseMetadata(metadata);
++    return {
++      tenantId,
++      metadata: shortMetadata,
++      maxQueueWaitMs: options?.maxQueueWaitMs,
++      estimatedTokens: 0,
++      execute: async () => this.createShortStreamTextResult(text, options),
++      extractUsage: (result: StreamTextResultWithContext) => this.extractUsageFromResponse(result),
++    };
++  }
++
++  private buildShortObjectFallbackRequest<T extends z.ZodType>(
++    tenantId: string,
++    metadata: TrafficRequestMetadata | undefined,
++    schema: T,
++    options: GenerateObjectOptions | undefined,
++    text: string,
++  ): TrafficRequest<GenerateObjectResultWithContext<z.infer<T>>> {
++    const shortMetadata = this.buildShortResponseMetadata(metadata);
++    return {
++      tenantId,
++      metadata: shortMetadata,
++      maxQueueWaitMs: options?.maxQueueWaitMs,
++      estimatedTokens: 0,
++      execute: async () => this.createShortObjectResult(schema, text, options),
++      extractUsage: (result: GenerateObjectResultWithContext<z.infer<T>>) =>
++        this.extractUsageFromResponse(result),
++    };
++  }
++
++  private buildShortStreamObjectFallbackRequest<T extends z.ZodType>(
++    tenantId: string,
++    metadata: TrafficRequestMetadata | undefined,
++    schema: T,
++    options: StreamObjectOptions | undefined,
++    text: string,
++  ): TrafficRequest<StreamObjectResultWithContext<z.infer<T>>> {
++    const shortMetadata = this.buildShortResponseMetadata(metadata);
++    return {
++      tenantId,
++      metadata: shortMetadata,
++      maxQueueWaitMs: options?.maxQueueWaitMs,
++      estimatedTokens: 0,
++      execute: async () => this.createShortStreamObjectResult(schema, text, options),
++      extractUsage: (result: StreamObjectResultWithContext<z.infer<T>>) =>
++        this.extractUsageFromResponse(result),
++    };
++  }
++
++  private updateTrafficControllerRateLimits(
++    response: unknown,
++    metadata: TrafficRequestMetadata | undefined,
++    logger?: Logger,
++  ): void {
++    const headerCandidates = findHeaders(response);
++    if (headerCandidates.length === 0) {
++      logger?.debug?.("[Traffic] No headers found for rate limit update");
++      return;
++    }
++
++    const controller = getTrafficController();
++    const effectiveMetadata = metadata ?? this.buildTrafficMetadata();
++    let updateResult: ReturnType<typeof controller.updateRateLimitFromHeaders> | undefined;
++    for (const headers of headerCandidates) {
++      updateResult = controller.updateRateLimitFromHeaders(effectiveMetadata, headers);
++      if (updateResult) break;
++    }
++
++    if (!updateResult) {
++      logger?.debug?.("[Traffic] No rate limit headers applied from response");
++      return;
++    }
++
++    const now = Date.now();
++    const effectiveRemaining = Math.max(
++      0,
++      updateResult.state.remaining - updateResult.state.reserved,
++    );
++    const resetInMs = Math.max(0, updateResult.state.resetAt - now);
++    const nextAllowedInMs = Math.max(0, updateResult.state.nextAllowedAt - now);
++    logger?.info?.("[Traffic] Applied rate limit from response headers", {
++      rateLimitKey: updateResult.key,
++      limit: updateResult.state.limit,
++      remaining: updateResult.state.remaining,
++      reserved: updateResult.state.reserved,
++      effectiveRemaining,
++      resetAt: updateResult.state.resetAt,
++      resetInMs,
++      nextAllowedAt: updateResult.state.nextAllowedAt,
++      nextAllowedInMs,
++      headers: {
++        limitRequests: updateResult.headerSnapshot.limitRequests,
++        remainingRequests: updateResult.headerSnapshot.remainingRequests,
++        resetRequestsMs: updateResult.headerSnapshot.resetRequestsMs,
++      },
++    });
++  }
++
++  private extractUsageFromResponse(
++    result:
++      | {
++          usage?: LanguageModelUsage | Promise<LanguageModelUsage>;
++          totalUsage?: LanguageModelUsage | Promise<LanguageModelUsage>;
++        }
++      | undefined,
++  ): Promise<LanguageModelUsage | undefined> | LanguageModelUsage | undefined {
++    if (!result) {
++      return undefined;
++    }
++
++    const usageCandidate =
++      (result as { totalUsage?: LanguageModelUsage | Promise<LanguageModelUsage> })
++        ?.totalUsage ??
++      (result as { usage?: LanguageModelUsage | Promise<LanguageModelUsage> })?.usage;
++
++    if (!usageCandidate) {
++      return undefined;
++    }
++
++    const normalizeUsage = (
++      usage: LanguageModelUsage | undefined,
++    ): LanguageModelUsage | undefined => {
++      if (!usage) return undefined;
++      const input = Number.isFinite(usage.inputTokens) ? (usage.inputTokens as number) : undefined;
++      const output = Number.isFinite(usage.outputTokens)
++        ? (usage.outputTokens as number)
++        : undefined;
++      const total = Number.isFinite(usage.totalTokens) ? (usage.totalTokens as number) : undefined;
++
++      if (total === undefined && input === undefined && output === undefined) {
++        return undefined;
++      }
++
++      const safeInput = input ?? 0;
++      const safeOutput = output ?? 0;
++      const safeTotal = total ?? safeInput + safeOutput;
++
++      return {
++        ...usage,
++        inputTokens: safeInput,
++        outputTokens: safeOutput,
++        totalTokens: safeTotal,
++      };
++    };
++
++    if (
++      typeof (usageCandidate as PromiseLike<LanguageModelUsage>).then === "function"
++    ) {
++      return (usageCandidate as Promise<LanguageModelUsage>)
++        .then((usage) => normalizeUsage(usage))
++        .catch(() => undefined);
++    }
++
++    return normalizeUsage(usageCandidate as LanguageModelUsage);
++  }
++
++  private resolveProvider(
++    model: LanguageModel | DynamicValue<LanguageModel> | undefined,
++  ): string | undefined {
++    if (
++      model &&
++      typeof model === "object" &&
++      "provider" in model &&
++      typeof (model as any).provider === "string"
++    ) {
++      return (model as any).provider;
++    }
++
++    return undefined;
++  }
++
+   /**
+    * Get the model name
+    */
+-  public getModelName(): string {
+-    if (typeof this.model === "function") {
++  public getModelName(modelOverride?: LanguageModel | DynamicValue<LanguageModel>): string {
++    const selectedModel = modelOverride ??
this.model; ++ if (typeof selectedModel === "function") { + return "dynamic"; + } +- if (typeof this.model === "string") { +- return this.model; ++ if (typeof selectedModel === "string") { ++ return selectedModel; + } +- return this.model.modelId || "unknown"; ++ return selectedModel.modelId || "unknown"; + } + + /** +diff --git a/packages/core/src/agent/eval.ts b/packages/core/src/agent/eval.ts +index 9e4fe9f2..de712505 100644 +--- a/packages/core/src/agent/eval.ts ++++ b/packages/core/src/agent/eval.ts +@@ -711,6 +711,7 @@ function buildEvalPayload( + rawOutput: output, + userId: oc.userId, + conversationId: oc.conversationId, ++ tenantId: oc.tenantId, + traceId: spanContext.traceId, + spanId: spanContext.spanId, + metadata, +diff --git a/packages/core/src/agent/types.ts b/packages/core/src/agent/types.ts +index dd5fb29d..c70bd478 100644 +--- a/packages/core/src/agent/types.ts ++++ b/packages/core/src/agent/types.ts +@@ -29,6 +29,7 @@ import type { Logger } from "@voltagent/internal"; + import type { LocalScorerDefinition, SamplingPolicy } from "../eval/runtime"; + import type { MemoryOptions, MemoryStorageMetadata, WorkingMemorySummary } from "../memory/types"; + import type { VoltAgentObservability } from "../observability"; ++import type { TrafficPriority } from "../traffic/traffic-controller"; + import type { + DynamicValue, + DynamicValueOptions, +@@ -456,6 +457,11 @@ export type AgentOptions = { + temperature?: number; + maxOutputTokens?: number; + maxSteps?: number; ++ /** ++ * Default scheduling priority for this agent's LLM calls. ++ * Defaults to P1 when unspecified. ++ */ ++ trafficPriority?: TrafficPriority; + /** + * Default stop condition for step execution (ai-sdk `stopWhen`). + * Per-call `stopWhen` in method options overrides this. 
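+   * A minimal sketch: `stopWhen: stepCountIs(3)` (with `stepCountIs` imported
+   * from the "ai" package) stops after three steps.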
+@@ -493,6 +499,7 @@ export interface AgentEvalPayload { + rawOutput?: unknown; + userId?: string; + conversationId?: string; ++ tenantId?: string; + traceId: string; + spanId: string; + metadata?: Record; +@@ -890,6 +897,21 @@ export type OperationContext = { + /** Optional conversation identifier associated with this operation */ + conversationId?: string; + ++ /** Optional tenant identifier propagated across nested operations */ ++ tenantId?: string; ++ ++ /** Optional key identifier for per-key traffic limits */ ++ apiKeyId?: string; ++ ++ /** Optional region identifier for per-region traffic limits */ ++ region?: string; ++ ++ /** Optional endpoint identifier for per-endpoint traffic limits */ ++ endpoint?: string; ++ ++ /** Optional tenant tier identifier for per-tier traffic limits */ ++ tenantTier?: string; ++ + /** User-managed context map for this operation */ + readonly context: Map; + +@@ -914,6 +936,9 @@ export type OperationContext = { + /** Conversation steps for building full message history including tool calls/results */ + conversationSteps?: StepWithContent[]; + ++ /** Scheduling priority propagated from parent calls */ ++ priority?: TrafficPriority; ++ + /** AbortController for cancelling the operation and accessing the signal */ + abortController: AbortController; + +diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts +index 8753f039..9dee4333 100644 +--- a/packages/core/src/index.ts ++++ b/packages/core/src/index.ts +@@ -21,6 +21,30 @@ export type { + WorkflowTimelineEvent, + RegisteredWorkflow, + } from "./workflow"; ++export { ++ // Surface traffic controller so downstream consumers can route agent calls through the shared scheduler ++ TrafficController, ++ CircuitBreakerOpenError, ++ QueueWaitTimeoutError, ++ RateLimitedUpstreamError, ++ getTrafficController, ++ type FallbackChainEntry, ++ type FallbackPolicy, ++ type FallbackPolicyConfig, ++ type FallbackPolicyMode, ++ type FallbackTarget, ++ type RateLimitConfig, ++ type RateLimitKey, ++ type RateLimitOptions, ++ type AdaptiveLimiterConfig, ++ type PriorityWeights, ++ type PriorityBurstLimits, ++ type TrafficRequest, ++ type TrafficRequestMetadata, ++ type TrafficResponseMetadata, ++ type TrafficPriority, ++ type TrafficRequestType, ++} from "./traffic/traffic-controller"; + // Export new Agent from agent.ts + export { + Agent, +diff --git a/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts +new file mode 100644 +index 00000000..652b7e59 +--- /dev/null ++++ b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts +@@ -0,0 +1,243 @@ ++import type { Logger } from "../../logger"; ++import { ++ RATE_LIMIT_EXHAUSTION_BUFFER, ++ RATE_LIMIT_MIN_PACE_INTERVAL_MS, ++ RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS, ++ RATE_LIMIT_PROBE_DELAY_MS, ++} from "../traffic-constants"; ++import type { ++ DispatchDecision, ++ QueuedRequest, ++ RateLimitWindowState, ++} from "../traffic-controller-internal"; ++import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils"; ++import type { TrafficRequestMetadata } from "../traffic-types"; ++import type { ++ RateLimitHeaderSnapshot, ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++} from "./rate-limit-strategy"; ++import { parseResetDurationToMs } from "./rate-limit-utils"; ++ ++export class DefaultRateLimitStrategy implements RateLimitStrategy { ++ private state?: RateLimitWindowState; ++ private readonly key: string; ++ ++ 
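++  // Pacing sketch for resolve() below: each dispatch reserves a token and
++  // paces the rest of the window evenly, intervalMs = ceil(remainingWindowMs /
++  // effectiveRemaining) with a floor of RATE_LIMIT_MIN_PACE_INTERVAL_MS; e.g.
++  // 30s left and 10 effectively remaining gives one dispatch every 3s. An
++  // exhausted window waits until resetAt + RATE_LIMIT_PROBE_DELAY_MS and then
++  // lets a single probe request through.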
constructor(key: string) { ++ this.key = key; ++ } ++ ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const state = this.state; ++ if (!state) { ++ rateLimitLogger?.trace?.("Rate limit state missing; allow request", { ++ rateLimitKey: this.key, ++ }); ++ return null; ++ } ++ ++ const now = Date.now(); ++ const effectiveRemaining = Math.max(0, state.remaining - state.reserved); ++ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; ++ ++ if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { ++ if (now < probeAt) { ++ rateLimitLogger?.debug?.("Rate limit exhausted; waiting for probe", { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ reserved: state.reserved, ++ effectiveRemaining, ++ resetAt: state.resetAt, ++ probeAt, ++ }); ++ return { kind: "wait", wakeUpAt: probeAt }; ++ } ++ if (state.reserved > 0) { ++ rateLimitLogger?.debug?.("Rate limit exhausted but in-flight reservations exist; waiting", { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ reserved: state.reserved, ++ effectiveRemaining, ++ resetAt: state.resetAt, ++ }); ++ return { kind: "wait" }; ++ } ++ } ++ ++ if (now < state.nextAllowedAt) { ++ rateLimitLogger?.debug?.("Rate limit pacing; waiting until nextAllowedAt", { ++ rateLimitKey: this.key, ++ nextAllowedAt: state.nextAllowedAt, ++ resetAt: state.resetAt, ++ waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now, ++ }); ++ return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; ++ } ++ ++ state.reserved += 1; ++ next.rateLimitKey = this.key; ++ rateLimitLogger?.trace?.("Reserved rate limit token", { ++ rateLimitKey: this.key, ++ reserved: state.reserved, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ nextAllowedAt: state.nextAllowedAt, ++ }); ++ ++ const remainingWindowMs = Math.max(0, state.resetAt - now); ++ const intervalMs = Math.max( ++ RATE_LIMIT_MIN_PACE_INTERVAL_MS, ++ Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), ++ ); ++ ++ const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); ++ if ( ++ state.nextAllowedAt <= now || ++ candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS ++ ) { ++ state.nextAllowedAt = candidateNext; ++ rateLimitLogger?.trace?.("Updated pacing nextAllowedAt", { ++ rateLimitKey: this.key, ++ nextAllowedAt: state.nextAllowedAt, ++ intervalMs, ++ remainingWindowMs, ++ effectiveRemaining, ++ }); ++ } ++ ++ return null; ++ } ++ ++ onDispatch(_logger?: Logger): void {} ++ ++ onComplete(logger?: Logger): void { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const state = this.state; ++ if (!state || state.reserved <= 0) return; ++ state.reserved -= 1; ++ rateLimitLogger?.trace?.("Released rate limit reservation", { ++ rateLimitKey: this.key, ++ reserved: state.reserved, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ nextAllowedAt: state.nextAllowedAt, ++ }); ++ } ++ ++ updateFromHeaders( ++ _metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const limitRequests = readHeaderValue(headers, "x-ratelimit-limit-requests"); ++ const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests"); ++ const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests"); ++ const retryAfter = 
readHeaderValue(headers, "retry-after"); ++ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined; ++ ++ const now = Date.now(); ++ const existing = this.state; ++ let state: RateLimitWindowState | undefined; ++ let headerSnapshot: RateLimitHeaderSnapshot | undefined; ++ ++ if (limitRequests && remainingRequests && resetRequests) { ++ const limit = Number(limitRequests); ++ const remaining = Number(remainingRequests); ++ if (!Number.isFinite(limit) || !Number.isFinite(remaining)) { ++ rateLimitLogger?.debug?.("Invalid rate limit numeric headers; skipping", { ++ rateLimitKey: this.key, ++ limitRequests, ++ remainingRequests, ++ }); ++ return undefined; ++ } ++ ++ const resetRequestsMs = parseResetDurationToMs(resetRequests); ++ if (resetRequestsMs === undefined) { ++ rateLimitLogger?.debug?.("Unable to parse reset duration; skipping", { ++ rateLimitKey: this.key, ++ resetRequests, ++ }); ++ return undefined; ++ } ++ ++ const parsedResetAt = now + resetRequestsMs; ++ const isSameWindow = !!existing && now < existing.resetAt; ++ const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; ++ const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; ++ const reserved = Math.max(0, existing?.reserved ?? 0); ++ ++ state = { ++ limit, ++ remaining: isSameWindow ? Math.min(existing.remaining, remaining) : remaining, ++ resetAt, ++ reserved, ++ nextAllowedAt, ++ }; ++ headerSnapshot = { ++ limitRequests, ++ remainingRequests, ++ resetRequests, ++ resetRequestsMs, ++ }; ++ } else if (retryAfterMs === undefined) { ++ rateLimitLogger?.trace?.("Missing rate limit headers; skipping", { ++ rateLimitKey: this.key, ++ hasLimit: !!limitRequests, ++ hasRemaining: !!remainingRequests, ++ hasReset: !!resetRequests, ++ hasRetryAfter: !!retryAfter, ++ }); ++ return undefined; ++ } ++ ++ if (!state) { ++ if (retryAfterMs === undefined) { ++ rateLimitLogger?.trace?.("Retry-After missing or unparsable; skipping", { ++ rateLimitKey: this.key, ++ retryAfter, ++ }); ++ return undefined; ++ } ++ const targetAt = now + retryAfterMs; ++ const isSameWindow = !!existing && now < existing.resetAt; ++ state = { ++ limit: existing?.limit ?? 1, ++ remaining: 0, ++ resetAt: isSameWindow ? Math.max(existing.resetAt, targetAt) : targetAt, ++ reserved: Math.max(0, existing?.reserved ?? 0), ++ nextAllowedAt: Math.max(existing?.nextAllowedAt ?? now, targetAt), ++ }; ++ headerSnapshot = { retryAfter, retryAfterMs }; ++ } else if (retryAfterMs !== undefined) { ++ const targetAt = now + retryAfterMs; ++ state = { ++ ...state, ++ remaining: 0, ++ resetAt: Math.max(state.resetAt, targetAt), ++ nextAllowedAt: Math.max(state.nextAllowedAt, targetAt), ++ }; ++ headerSnapshot = { ...headerSnapshot, retryAfter, retryAfterMs }; ++ } ++ ++ this.state = state; ++ rateLimitLogger?.debug?.("Applied rate limit headers to state", { ++ rateLimitKey: this.key, ++ limit: state.limit, ++ remaining: state.remaining, ++ effectiveRemaining: Math.max(0, state.remaining - state.reserved), ++ resetAt: state.resetAt, ++ nextAllowedAt: state.nextAllowedAt, ++ resetRequestsMs: headerSnapshot?.resetRequestsMs, ++ retryAfterMs: headerSnapshot?.retryAfterMs, ++ }); ++ ++ return { ++ key: this.key, ++ headerSnapshot: headerSnapshot ?? 
{}, ++ state, ++ }; ++ } ++} +diff --git a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts +new file mode 100644 +index 00000000..fdb1c7a8 +--- /dev/null ++++ b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts +@@ -0,0 +1,353 @@ ++import type { Logger } from "../../logger"; ++import { ++ RATE_LIMIT_EXHAUSTION_BUFFER, ++ RATE_LIMIT_MIN_PACE_INTERVAL_MS, ++ RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS, ++ RATE_LIMIT_PROBE_DELAY_MS, ++} from "../traffic-constants"; ++import type { ++ DispatchDecision, ++ QueuedRequest, ++ RateLimitWindowState, ++} from "../traffic-controller-internal"; ++import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils"; ++import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types"; ++import { DefaultRateLimitStrategy } from "./default-rate-limit-strategy"; ++import type { ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++ RateLimitUsage, ++} from "./rate-limit-strategy"; ++import { parseResetDurationToMs } from "./rate-limit-utils"; ++ ++export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { ++ readonly handlesTokenLimits = true; ++ private readonly window: DefaultRateLimitStrategy; ++ private readonly key: string; ++ private readonly requestsPerMinute?: number; ++ private readonly tokensPerMinute?: number; ++ private requestState?: RateLimitWindowState; ++ private tokenState?: RateLimitWindowState; ++ private bootstrapReserved = 0; ++ private readonly windowMs = 60_000; ++ ++ constructor(key: string, options?: RateLimitOptions) { ++ this.key = key; ++ this.window = new DefaultRateLimitStrategy(key); ++ // Window strategy enforces fixed 60s windows; burstSize is intentionally ignored here. 
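++ // When requestsPerMinute is configured, resolve() below runs a local fixed ++ // window; otherwise header-derived state is delegated to the wrapped ++ // DefaultRateLimitStrategy, and a single bootstrap reservation lets the first ++ // request calibrate from response headers before others dispatch.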
++ this.requestsPerMinute = this.normalizeLimit(options?.requestsPerMinute); ++ this.tokensPerMinute = this.normalizeLimit(options?.tokensPerMinute); ++ } ++ ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ if (this.requestsPerMinute !== undefined) { ++ const requestDecision = this.resolveRequestWindow(next, logger); ++ if (requestDecision) return requestDecision; ++ } else { ++ const decision = this.window.resolve(next, logger); ++ if (decision) return decision; ++ ++ if (!next.rateLimitKey && this.tokensPerMinute === undefined) { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ if (this.bootstrapReserved >= 1) { ++ rateLimitLogger?.debug?.("OpenAI rate limit bootstrap active; waiting", { ++ rateLimitKey: this.key, ++ bootstrapReserved: this.bootstrapReserved, ++ }); ++ return { kind: "wait" }; ++ } ++ ++ this.bootstrapReserved += 1; ++ next.rateLimitKey = this.key; ++ rateLimitLogger?.debug?.("OpenAI rate limit bootstrap reserved", { ++ rateLimitKey: this.key, ++ bootstrapReserved: this.bootstrapReserved, ++ }); ++ } ++ } ++ ++ const tokenDecision = this.resolveTokenWindow(next, logger); ++ if (tokenDecision) return tokenDecision; ++ return null; ++ } ++ ++ onDispatch(logger?: Logger): void { ++ if (this.requestsPerMinute === undefined) { ++ this.window.onDispatch(logger); ++ } ++ } ++ ++ onComplete(logger?: Logger): void { ++ if (this.requestsPerMinute !== undefined) { ++ const now = Date.now(); ++ const state = this.ensureRequestState(now); ++ if (state.reserved > 0) { ++ state.reserved -= 1; ++ } ++ state.remaining = Math.max(0, state.remaining - 1); ++ return; ++ } ++ ++ if (this.bootstrapReserved > 0) { ++ this.bootstrapReserved -= 1; ++ } ++ this.window.onComplete(logger); ++ } ++ ++ recordUsage(usage: RateLimitUsage, logger?: Logger, reservedTokens?: number): void { ++ const tokens = this.resolveTokenCount(usage); ++ if (tokens <= 0) return; ++ ++ const now = Date.now(); ++ const state = this.ensureTokenState(now); ++ if (!state) return; ++ const reserved = typeof reservedTokens === "number" ? reservedTokens : 0; ++ const delta = tokens - reserved; ++ if (delta > 0) { ++ state.remaining = Math.max(0, state.remaining - delta); ++ } else if (delta < 0) { ++ state.remaining = Math.min(state.limit, state.remaining + Math.abs(delta)); ++ } ++ logger?.child({ module: "rate-limiter" })?.trace?.("OpenAI token usage recorded", { ++ rateLimitKey: this.key, ++ tokens, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ }); ++ } ++ ++ updateFromHeaders( ++ metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined { ++ const update = ++ this.requestsPerMinute !== undefined ++ ? 
undefined ++ : this.window.updateFromHeaders(metadata, headers, logger); ++ this.applyTokenHeaderUpdates(headers, logger); ++ return update; ++ } ++ ++ private resolveRequestWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); ++ const state = this.ensureRequestState(now); ++ const effectiveRemaining = Math.max(0, state.remaining - state.reserved); ++ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; ++ ++ if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { ++ if (now < probeAt) { ++ rateLimitLogger?.debug?.("OpenAI request window exhausted; waiting for probe", { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ reserved: state.reserved, ++ effectiveRemaining, ++ resetAt: state.resetAt, ++ probeAt, ++ }); ++ return { kind: "wait", wakeUpAt: probeAt }; ++ } ++ if (state.reserved > 0) { ++ rateLimitLogger?.debug?.( ++ "OpenAI request window exhausted but in-flight reservations exist; waiting", ++ { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ reserved: state.reserved, ++ effectiveRemaining, ++ resetAt: state.resetAt, ++ }, ++ ); ++ return { kind: "wait" }; ++ } ++ } ++ ++ if (now < state.nextAllowedAt) { ++ rateLimitLogger?.debug?.("OpenAI request window pacing; waiting until nextAllowedAt", { ++ rateLimitKey: this.key, ++ nextAllowedAt: state.nextAllowedAt, ++ resetAt: state.resetAt, ++ waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now, ++ }); ++ return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; ++ } ++ ++ state.reserved += 1; ++ next.rateLimitKey = this.key; ++ rateLimitLogger?.trace?.("Reserved OpenAI request window slot", { ++ rateLimitKey: this.key, ++ reserved: state.reserved, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ nextAllowedAt: state.nextAllowedAt, ++ }); ++ ++ const remainingWindowMs = Math.max(0, state.resetAt - now); ++ const intervalMs = Math.max( ++ RATE_LIMIT_MIN_PACE_INTERVAL_MS, ++ Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), ++ ); ++ ++ const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); ++ if ( ++ state.nextAllowedAt <= now || ++ candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS ++ ) { ++ state.nextAllowedAt = candidateNext; ++ rateLimitLogger?.trace?.("Updated OpenAI request pacing nextAllowedAt", { ++ rateLimitKey: this.key, ++ nextAllowedAt: state.nextAllowedAt, ++ intervalMs, ++ remainingWindowMs, ++ effectiveRemaining, ++ }); ++ } ++ ++ return null; ++ } ++ ++ private resolveTokenWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); ++ const state = this.ensureTokenState(now); ++ if (!state) return null; ++ const estimatedTokens = next.estimatedTokens; ++ ++ if (typeof estimatedTokens === "number" && estimatedTokens > 0) { ++ if (state.remaining >= estimatedTokens) { ++ state.remaining = Math.max(0, state.remaining - estimatedTokens); ++ next.reservedTokens = estimatedTokens; ++ return null; ++ } ++ } else if (state.remaining > 0) { ++ return null; ++ } ++ ++ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; ++ rateLimitLogger?.debug?.("OpenAI token window exhausted; waiting", { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ probeAt, ++ }); ++ return { kind: "wait", wakeUpAt: probeAt }; ++ } ++ ++ private 
ensureRequestState(now: number): RateLimitWindowState { ++ const limit = this.requestsPerMinute ?? 0; ++ const state = this.requestState; ++ if (!state || now >= state.resetAt) { ++ this.requestState = { ++ limit, ++ remaining: limit, ++ resetAt: now + this.windowMs, ++ reserved: 0, ++ nextAllowedAt: now, ++ }; ++ return this.requestState; ++ } ++ return state; ++ } ++ ++ private ensureTokenState(now: number): RateLimitWindowState | undefined { ++ const configuredLimit = this.tokensPerMinute; ++ const state = this.tokenState; ++ if (!state) { ++ if (configuredLimit === undefined) return undefined; ++ this.tokenState = { ++ limit: configuredLimit, ++ remaining: configuredLimit, ++ resetAt: now + this.windowMs, ++ reserved: 0, ++ nextAllowedAt: now, ++ }; ++ return this.tokenState; ++ } ++ ++ if (now >= state.resetAt) { ++ const limit = configuredLimit ?? state.limit; ++ this.tokenState = { ++ limit, ++ remaining: limit, ++ resetAt: now + this.windowMs, ++ reserved: 0, ++ nextAllowedAt: now, ++ }; ++ return this.tokenState; ++ } ++ ++ if (configuredLimit !== undefined && configuredLimit !== state.limit) { ++ state.limit = configuredLimit; ++ state.remaining = Math.min(state.remaining, configuredLimit); ++ } ++ ++ return state; ++ } ++ ++ private normalizeLimit(value: number | undefined): number | undefined { ++ const numeric = typeof value === "number" ? value : Number(value); ++ return Number.isFinite(numeric) && numeric > 0 ? numeric : undefined; ++ } ++ ++ private applyTokenHeaderUpdates(headers: unknown, logger?: Logger): void { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const limitTokens = readHeaderValue(headers, "x-ratelimit-limit-tokens"); ++ const remainingTokens = readHeaderValue(headers, "x-ratelimit-remaining-tokens"); ++ const resetTokens = readHeaderValue(headers, "x-ratelimit-reset-tokens"); ++ const retryAfter = readHeaderValue(headers, "retry-after"); ++ ++ const limit = Number(limitTokens); ++ const remaining = Number(remainingTokens); ++ const resetTokensMs = resetTokens ? parseResetDurationToMs(resetTokens) : undefined; ++ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined; ++ ++ if (!Number.isFinite(limit) || !Number.isFinite(remaining) || resetTokensMs === undefined) { ++ rateLimitLogger?.trace?.("OpenAI token headers missing or invalid; skipping", { ++ rateLimitKey: this.key, ++ hasLimit: !!limitTokens, ++ hasRemaining: !!remainingTokens, ++ hasReset: !!resetTokens, ++ }); ++ return; ++ } ++ ++ const now = Date.now(); ++ const configuredLimit = this.tokensPerMinute; ++ const effectiveLimit = configuredLimit === undefined ? limit : Math.min(configuredLimit, limit); ++ const clampedRemaining = Math.max(0, Math.min(remaining, effectiveLimit)); ++ const parsedResetAt = now + resetTokensMs; ++ const existing = this.tokenState; ++ const isSameWindow = !!existing && now < existing.resetAt; ++ const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; ++ const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; ++ const reserved = Math.max(0, existing?.reserved ?? 0); ++ const effectiveRemaining = isSameWindow ++ ? 
Math.min(existing.remaining, clampedRemaining) ++ : clampedRemaining; ++ ++ this.tokenState = { ++ limit: effectiveLimit, ++ remaining: effectiveRemaining, ++ resetAt, ++ reserved, ++ nextAllowedAt, ++ }; ++ ++ rateLimitLogger?.debug?.("OpenAI token headers applied", { ++ rateLimitKey: this.key, ++ limit: effectiveLimit, ++ remaining: effectiveRemaining, ++ resetAt, ++ retryAfterMs, ++ }); ++ } ++ ++ private resolveTokenCount(usage: RateLimitUsage): number { ++ const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined; ++ if (total !== undefined) return total; ++ const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0; ++ const output = Number.isFinite(usage.outputTokens) ? usage.outputTokens : 0; ++ return input + output; ++ } ++} +diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts +new file mode 100644 +index 00000000..653fdaf2 +--- /dev/null ++++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts +@@ -0,0 +1,41 @@ ++import type { Logger } from "../../logger"; ++import type { ++ DispatchDecision, ++ QueuedRequest, ++ RateLimitWindowState, ++} from "../traffic-controller-internal"; ++import type { TrafficRequestMetadata } from "../traffic-types"; ++ ++export type RateLimitHeaderSnapshot = { ++ limitRequests?: string; ++ remainingRequests?: string; ++ resetRequests?: string; ++ resetRequestsMs?: number; ++ retryAfter?: string; ++ retryAfterMs?: number; ++}; ++ ++export type RateLimitUpdateResult = { ++ key: string; ++ headerSnapshot: RateLimitHeaderSnapshot; ++ state: RateLimitWindowState; ++}; ++ ++export type RateLimitUsage = { ++ inputTokens?: number; ++ outputTokens?: number; ++ totalTokens?: number; ++}; ++ ++export interface RateLimitStrategy { ++ readonly handlesTokenLimits?: boolean; ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null; ++ onDispatch(logger?: Logger): void; ++ onComplete(logger?: Logger): void; ++ recordUsage?(usage: RateLimitUsage, logger?: Logger, reservedTokens?: number): void; ++ updateFromHeaders( ++ metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined; ++} +diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts +new file mode 100644 +index 00000000..310c9a7e +--- /dev/null ++++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts +@@ -0,0 +1,26 @@ ++export function parseResetDurationToMs(raw: string): number | undefined { ++ const value = raw.trim(); ++ if (!value) return undefined; ++ ++ let totalMs = 0; ++ const regex = /(\d+(?:\.\d+)?)(ms|s|m|h|d)/g; ++ let matched = false; ++ for (const match of value.matchAll(regex)) { ++ matched = true; ++ const amount = Number.parseFloat(match[1] ?? ""); ++ if (!Number.isFinite(amount)) continue; ++ const unit = match[2]; ++ if (unit === "ms") totalMs += amount; ++ else if (unit === "s") totalMs += amount * 1000; ++ else if (unit === "m") totalMs += amount * 60_000; ++ else if (unit === "h") totalMs += amount * 3_600_000; ++ else if (unit === "d") totalMs += amount * 86_400_000; ++ } ++ ++ if (matched) { ++ return Math.round(totalMs); ++ } ++ ++ const n = Number(value); ++ return Number.isFinite(n) ? 
Math.round(n) : undefined; ++} +diff --git a/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts +new file mode 100644 +index 00000000..ee269ecd +--- /dev/null ++++ b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts +@@ -0,0 +1,218 @@ ++import type { Logger } from "../../logger"; ++import type { ++ DispatchDecision, ++ QueuedRequest, ++ RateLimitWindowState, ++} from "../traffic-controller-internal"; ++import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils"; ++import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types"; ++import type { ++ RateLimitHeaderSnapshot, ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++} from "./rate-limit-strategy"; ++import { parseResetDurationToMs } from "./rate-limit-utils"; ++ ++type TokenBucketState = { ++ capacity: number; ++ refillPerSecond: number; ++ tokens: number; ++ updatedAt: number; ++}; ++ ++function normalizeTokenBucketOptions( ++ raw: RateLimitOptions | undefined, ++): Omit<TokenBucketState, "tokens" | "updatedAt"> | undefined { ++ const requestsPerMinuteRaw = raw?.requestsPerMinute; ++ const tokensPerMinuteRaw = raw?.tokensPerMinute; ++ const burstSizeRaw = raw?.burstSize; ++ ++ const requestsPerMinute = ++ typeof requestsPerMinuteRaw === "number" ? requestsPerMinuteRaw : Number(requestsPerMinuteRaw); ++ const tokensPerMinute = ++ typeof tokensPerMinuteRaw === "number" ? tokensPerMinuteRaw : Number(tokensPerMinuteRaw); ++ const burstSize = typeof burstSizeRaw === "number" ? burstSizeRaw : Number(burstSizeRaw); ++ ++ const safeRequestsPerMinute = Number.isFinite(requestsPerMinute) ? requestsPerMinute : 0; ++ const hasTokenLimit = Number.isFinite(tokensPerMinute) && tokensPerMinute > 0; ++ if (safeRequestsPerMinute <= 0 && hasTokenLimit) { ++ return undefined; ++ } ++ const safeBurst = Number.isFinite(burstSize) ? burstSize : safeRequestsPerMinute; ++ const refillPerSecond = safeRequestsPerMinute > 0 ? safeRequestsPerMinute / 60 : 0; ++ ++ return { ++ capacity: safeBurst > 0 ?
Math.max(1, safeBurst) : 0, ++ refillPerSecond, ++ }; ++} ++function refillTokenBucket(bucket: TokenBucketState, now: number): void { ++ const elapsedMs = now - bucket.updatedAt; ++ if (elapsedMs <= 0) return; ++ bucket.updatedAt = now; ++ if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return; ++ ++ const refill = (elapsedMs / 1000) * bucket.refillPerSecond; ++ if (refill <= 0) return; ++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill); ++} ++ ++export class TokenBucketRateLimitStrategy implements RateLimitStrategy { ++ private readonly key: string; ++ private bucket?: TokenBucketState; ++ private cooldownUntil?: number; ++ ++ constructor(key: string, options?: RateLimitOptions) { ++ this.key = key; ++ if (!options) return; ++ const normalized = normalizeTokenBucketOptions(options); ++ if (!normalized) return; ++ const now = Date.now(); ++ this.bucket = { ++ ...normalized, ++ tokens: normalized.capacity, ++ updatedAt: now, ++ }; ++ } ++ ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); ++ ++ if (this.cooldownUntil !== undefined && now < this.cooldownUntil) { ++ rateLimitLogger?.debug?.("Token bucket cooldown active; waiting", { ++ rateLimitKey: this.key, ++ cooldownUntil: this.cooldownUntil, ++ waitMs: this.cooldownUntil - now, ++ }); ++ return { kind: "wait", wakeUpAt: this.cooldownUntil }; ++ } ++ ++ const bucket = this.bucket; ++ if (!bucket) return null; ++ ++ refillTokenBucket(bucket, now); ++ ++ if (bucket.capacity <= 0) { ++ rateLimitLogger?.debug?.("Token bucket misconfigured; blocking", { ++ rateLimitKey: this.key, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ }); ++ return { kind: "wait" }; ++ } ++ ++ if (bucket.tokens >= 1) { ++ bucket.tokens -= 1; ++ next.rateLimitKey = this.key; ++ rateLimitLogger?.trace?.("Consumed token bucket token", { ++ rateLimitKey: this.key, ++ tokens: bucket.tokens, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ }); ++ return null; ++ } ++ ++ if (bucket.refillPerSecond <= 0) { ++ rateLimitLogger?.debug?.("Token bucket has no refill; blocking", { ++ rateLimitKey: this.key, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ }); ++ return { kind: "wait" }; ++ } ++ ++ const requiredTokens = 1 - bucket.tokens; ++ const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000)); ++ const wakeUpAt = now + waitMs; ++ rateLimitLogger?.debug?.("Token bucket empty; waiting", { ++ rateLimitKey: this.key, ++ tokens: bucket.tokens, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ wakeUpAt, ++ waitMs, ++ }); ++ return { kind: "wait", wakeUpAt }; ++ } ++ ++ onDispatch(_logger?: Logger): void {} ++ ++ onComplete(_logger?: Logger): void {} ++ ++ updateFromHeaders( ++ _metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); ++ ++ const retryAfter = readHeaderValue(headers, "retry-after"); ++ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter, now) : undefined; ++ ++ const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests"); ++ const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests"); ++ const resetRequestsMs = resetRequests ? 
parseResetDurationToMs(resetRequests) : undefined; ++ ++ let appliedUntil: number | undefined; ++ ++ if (retryAfterMs !== undefined) { ++ const targetAt = now + retryAfterMs; ++ this.cooldownUntil = ++ this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt); ++ appliedUntil = this.cooldownUntil; ++ } ++ ++ if (remainingRequests && resetRequestsMs !== undefined) { ++ const remaining = Number(remainingRequests); ++ if (Number.isFinite(remaining) && remaining <= 0) { ++ const targetAt = now + resetRequestsMs; ++ this.cooldownUntil = ++ this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt); ++ appliedUntil = this.cooldownUntil; ++ } ++ } ++ ++ if (appliedUntil === undefined) { ++ rateLimitLogger?.trace?.("No applicable cooldown headers; skipping", { ++ rateLimitKey: this.key, ++ hasRetryAfter: !!retryAfter, ++ hasRemainingRequests: !!remainingRequests, ++ hasResetRequests: !!resetRequests, ++ }); ++ return undefined; ++ } ++ ++ rateLimitLogger?.debug?.("Applied token bucket cooldown from headers", { ++ rateLimitKey: this.key, ++ cooldownUntil: appliedUntil, ++ inMs: Math.max(0, appliedUntil - now), ++ retryAfterMs, ++ resetRequestsMs, ++ }); ++ ++ const headerSnapshot: RateLimitHeaderSnapshot = { ++ remainingRequests, ++ resetRequests, ++ resetRequestsMs, ++ retryAfter, ++ retryAfterMs, ++ }; ++ ++ const state: RateLimitWindowState = { ++ limit: 1, ++ remaining: 0, ++ resetAt: appliedUntil, ++ reserved: 0, ++ nextAllowedAt: appliedUntil, ++ }; ++ ++ return { ++ key: this.key, ++ headerSnapshot, ++ state, ++ }; ++ } ++} +diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts +new file mode 100644 +index 00000000..20d166ca +--- /dev/null ++++ b/packages/core/src/traffic/traffic-circuit-breaker.ts +@@ -0,0 +1,478 @@ ++import type { Logger } from "../logger"; ++import { ++ CIRCUIT_COOLDOWN_MS, ++ CIRCUIT_FAILURE_THRESHOLD, ++ CIRCUIT_FAILURE_WINDOW_MS, ++ CIRCUIT_PROBE_INTERVAL_MS, ++ CIRCUIT_TIMEOUT_THRESHOLD, ++ CIRCUIT_TIMEOUT_WINDOW_MS, ++ DEFAULT_FALLBACK_CHAINS, ++} from "./traffic-constants"; ++import type { ++ CircuitState, ++ CircuitStateStatus, ++ DispatchDecision, ++ QueuedRequest, ++} from "./traffic-controller-internal"; ++import { extractStatusCode, isTimeoutError } from "./traffic-error-utils"; ++import { CircuitBreakerOpenError } from "./traffic-errors"; ++import type { ++ FallbackChainEntry, ++ FallbackPolicy, ++ FallbackPolicyConfig, ++ FallbackTarget, ++ TrafficRequestMetadata, ++ TrafficResponseMetadata, ++} from "./traffic-types"; ++ ++export class TrafficCircuitBreaker { ++ private readonly circuitBreakers = new Map<string, CircuitState>(); ++ private readonly fallbackChains: Map<string, FallbackChainEntry[]>; ++ private readonly fallbackPolicy?: FallbackPolicyConfig; ++ private readonly buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string; ++ ++ constructor(options: { ++ fallbackChains?: Record<string, FallbackChainEntry[]>; ++ fallbackPolicy?: FallbackPolicyConfig; ++ buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string; ++ }) { ++ this.buildRateLimitKey = options.buildRateLimitKey; ++ const chains = options.fallbackChains ?? DEFAULT_FALLBACK_CHAINS;
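++ // Chains are keyed by bare model id or by "provider::model"; ++ // resolveFallbackChain below prefers the provider-scoped key when both exist.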
++ this.fallbackChains = new Map(Object.entries(chains)); ++ this.fallbackPolicy = options.fallbackPolicy; ++ } ++ ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const visitedKeys = new Set<string>(); ++ ++ while (true) { ++ const key = this.buildRateLimitKey(next.request.metadata); ++ next.circuitKey = key; ++ visitedKeys.add(key); ++ circuitLogger?.trace?.("Circuit resolve step", { ++ circuitKey: key, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ }); ++ ++ const evaluation = this.evaluateCircuitState(key, circuitLogger); ++ next.circuitStatus = evaluation.state; ++ circuitLogger?.debug?.("Circuit evaluated", { ++ circuitKey: key, ++ state: evaluation.state, ++ allowRequest: evaluation.allowRequest, ++ retryAfterMs: evaluation.retryAfterMs, ++ }); ++ ++ if (evaluation.allowRequest) return null; ++ ++ const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata); ++ if (policy.mode === "wait") { ++ const wakeUpAt = ++ evaluation.retryAfterMs !== undefined ? Date.now() + evaluation.retryAfterMs : undefined; ++ circuitLogger?.debug?.("Circuit open; waiting per fallback policy", { ++ circuitKey: key, ++ policyId, ++ retryAfterMs: evaluation.retryAfterMs, ++ wakeUpAt, ++ }); ++ return { kind: "wait", wakeUpAt }; ++ } ++ ++ const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger); ++ circuitLogger?.debug?.("Circuit open; attempting fallback", { ++ circuitKey: key, ++ currentModel: next.request.metadata?.model, ++ fallback, ++ visitedKeys: Array.from(visitedKeys), ++ }); ++ if (!fallback || !next.request.createFallbackRequest) { ++ const error = new CircuitBreakerOpenError( ++ `Circuit open for ${key}`, ++ next.request.metadata, ++ evaluation.retryAfterMs, ++ );
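++ // Attach traffic metadata to the rejection so callers can read the rate ++ // limit key and retryAfterMs directly from the thrown error.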
++ const traffic: TrafficResponseMetadata = { ++ rateLimitKey: key, ++ retryAfterMs: evaluation.retryAfterMs, ++ tenantId: next.request.metadata?.tenantId ?? next.tenantId, ++ priority: next.request.metadata?.priority, ++ taskType: next.request.metadata?.taskType, ++ }; ++ (error as Record<string, unknown>).traffic = traffic; ++ next.reject(error); ++ circuitLogger?.warn?.("No fallback available; rejecting request", { ++ circuitKey: key, ++ retryAfterMs: evaluation.retryAfterMs, ++ }); ++ return { kind: "skip" }; ++ } ++ ++ const fallbackRequest = next.request.createFallbackRequest(fallback); ++ if (!fallbackRequest) { ++ circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", { ++ circuitKey: key, ++ fallback, ++ }); ++ return { kind: "skip" }; ++ } ++ ++ this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, { ++ previousCircuitKey: key, ++ reason: "circuit-open", ++ }); ++ } ++ } ++ ++ tryFallback(next: QueuedRequest, reason: "queue-timeout", logger?: Logger): boolean { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata); ++ if (policy.mode === "wait") { ++ circuitLogger?.debug?.("Fallback skipped by policy", { ++ policyId, ++ reason, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ }); ++ return false; ++ } ++ ++ const visitedKeys = new Set<string>(); ++ const key = this.buildRateLimitKey(next.request.metadata); ++ visitedKeys.add(key); ++ ++ const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger); ++ if (!fallback || !next.request.createFallbackRequest) { ++ circuitLogger?.debug?.("Fallback unavailable for request", { ++ reason, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ fallback, ++ }); ++ return false; ++ } ++ ++ const fallbackRequest = next.request.createFallbackRequest(fallback); ++ if (!fallbackRequest) { ++ circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", { ++ reason, ++ fallback, ++ }); ++ return false; ++ } ++ ++ this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, { ++ previousCircuitKey: key, ++ reason, ++ policyId, ++ }); ++ return true; ++ } ++ ++ markTrial(item: QueuedRequest, logger?: Logger): void { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const key = item.circuitKey; ++ if (!key) return; ++ const state = this.circuitBreakers.get(key); ++ if (state && state.status === "half-open" && !state.trialInFlight) { ++ state.trialInFlight = true; ++ circuitLogger?.debug?.("Marked half-open trial in flight", { circuitKey: key }); ++ } ++ } ++ ++ recordSuccess(metadata?: TrafficRequestMetadata, logger?: Logger): void { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const key = this.buildRateLimitKey(metadata); ++ this.circuitBreakers.delete(key); ++ circuitLogger?.debug?.("Circuit success; cleared circuit state", { ++ circuitKey: key, ++ provider: metadata?.provider, ++ model: metadata?.model, ++ }); ++ } ++ ++ recordFailure( ++ metadata: TrafficRequestMetadata | undefined, ++ error: unknown, ++ logger?: Logger, ++ ): void { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const key = this.buildRateLimitKey(metadata); ++ const status = extractStatusCode(error, logger); ++ const isTimeout = status === 408 || isTimeoutError(error, logger); ++ const isStatusEligible = this.isCircuitBreakerStatus(status); ++ const isTimeoutEligible = !isStatusEligible && isTimeout; ++ const isEligible = isStatusEligible || isTimeoutEligible; ++ ++ circuitLogger?.debug?.("Circuit failure
observed", { ++ circuitKey: key, ++ status, ++ isTimeout, ++ eligible: isEligible, ++ provider: metadata?.provider, ++ model: metadata?.model, ++ }); ++ ++ if (!isEligible) { ++ this.circuitBreakers.delete(key); ++ circuitLogger?.debug?.("Failure not eligible for circuit breaker; cleared circuit state", { ++ circuitKey: key, ++ status, ++ isTimeout, ++ }); ++ return; ++ } ++ ++ const now = Date.now(); ++ const state = ++ this.circuitBreakers.get(key) ?? ++ ({ status: "closed", failureTimestamps: [], timeoutTimestamps: [] } as CircuitState); ++ ++ state.failureTimestamps = state.failureTimestamps.filter( ++ (t) => now - t <= CIRCUIT_FAILURE_WINDOW_MS, ++ ); ++ state.timeoutTimestamps = state.timeoutTimestamps.filter( ++ (t) => now - t <= CIRCUIT_TIMEOUT_WINDOW_MS, ++ ); ++ ++ state.failureTimestamps.push(now); ++ if (isTimeoutEligible) { ++ state.timeoutTimestamps.push(now); ++ } ++ ++ if ( ++ state.status === "half-open" || ++ state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD || ++ state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD ++ ) { ++ const openReasons: string[] = []; ++ if (state.status === "half-open") openReasons.push("half-open-failure"); ++ if (state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD) { ++ openReasons.push("failure-threshold"); ++ } ++ if (state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD) { ++ openReasons.push("timeout-threshold"); ++ } ++ ++ state.status = "open"; ++ state.openedAt = now; ++ state.trialInFlight = false; ++ state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS; ++ circuitLogger?.warn?.("Circuit opened", { ++ circuitKey: key, ++ openReasons, ++ status, ++ isTimeout, ++ failureCount: state.failureTimestamps.length, ++ failureThreshold: CIRCUIT_FAILURE_THRESHOLD, ++ timeoutCount: state.timeoutTimestamps.length, ++ timeoutThreshold: CIRCUIT_TIMEOUT_THRESHOLD, ++ openedAt: state.openedAt, ++ }); ++ } ++ ++ this.circuitBreakers.set(key, state); ++ circuitLogger?.trace?.("Circuit state updated", { ++ circuitKey: key, ++ status: state.status, ++ failureCount: state.failureTimestamps.length, ++ failureWindowMs: CIRCUIT_FAILURE_WINDOW_MS, ++ timeoutCount: state.timeoutTimestamps.length, ++ timeoutWindowMs: CIRCUIT_TIMEOUT_WINDOW_MS, ++ }); ++ } ++ ++ private evaluateCircuitState( ++ key: string, ++ logger?: Logger, ++ ): { ++ allowRequest: boolean; ++ state: CircuitStateStatus; ++ retryAfterMs?: number; ++ } { ++ const state = this.circuitBreakers.get(key); ++ if (!state) { ++ logger?.trace?.("Circuit state missing; allow request", { circuitKey: key }); ++ return { allowRequest: true, state: "closed" }; ++ } ++ ++ const now = Date.now(); ++ ++ if (state.status === "open") { ++ const elapsed = state.openedAt ? now - state.openedAt : 0; ++ if (state.nextProbeAt === undefined) { ++ state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS; ++ } ++ const cooldownRemaining = Math.max(0, CIRCUIT_COOLDOWN_MS - elapsed); ++ const probeRemaining = Math.max(0, state.nextProbeAt - now); ++ if (probeRemaining === 0 || cooldownRemaining === 0) { ++ state.status = "half-open"; ++ state.trialInFlight = false; ++ state.failureTimestamps = []; ++ state.timeoutTimestamps = []; ++ state.nextProbeAt = undefined; ++ logger?.debug?.("Circuit transitioned to half-open", { ++ circuitKey: key, ++ reason: cooldownRemaining === 0 ? 
"cooldown" : "probe", ++ }); ++ return { allowRequest: true, state: "half-open" }; ++ } ++ return { ++ allowRequest: false, ++ state: "open", ++ retryAfterMs: Math.min(cooldownRemaining, probeRemaining), ++ }; ++ } ++ ++ if (state.status === "half-open" && state.trialInFlight) { ++ return { allowRequest: false, state: "half-open" }; ++ } ++ ++ return { allowRequest: true, state: state.status }; ++ } ++ ++ private resolveFallbackPolicy(metadata: TrafficRequestMetadata | undefined): { ++ policy: FallbackPolicy; ++ policyId?: string; ++ } { ++ const policyId = ++ metadata?.fallbackPolicyId ?? ++ (metadata?.taskType ++ ? this.fallbackPolicy?.taskTypePolicyIds?.[metadata.taskType] ++ : undefined) ?? ++ this.fallbackPolicy?.defaultPolicyId; ++ ++ const policy = policyId ? this.fallbackPolicy?.policies?.[policyId] : undefined; ++ return { ++ policy: policy ?? { mode: "fallback" }, ++ policyId, ++ }; ++ } ++ ++ private applyFallbackRequest( ++ next: QueuedRequest, ++ fallbackRequest: QueuedRequest["request"], ++ fallback: FallbackChainEntry, ++ logger?: Logger, ++ context?: { previousCircuitKey?: string; reason?: string; policyId?: string }, ++ ): void { ++ next.request = fallbackRequest; ++ next.attempt = 1; ++ next.estimatedTokens = fallbackRequest.estimatedTokens; ++ next.reservedTokens = undefined; ++ next.tenantConcurrencyKey = undefined; ++ next.providerModelConcurrencyKey = undefined; ++ next.rateLimitKey = undefined; ++ next.etaMs = undefined; ++ next.circuitKey = undefined; ++ next.circuitStatus = undefined; ++ next.extractUsage = fallbackRequest.extractUsage; ++ if (context?.reason === "queue-timeout") { ++ next.queueTimeoutDisabled = true; ++ } ++ logger?.debug?.("Switched to fallback request", { ++ previousCircuitKey: context?.previousCircuitKey, ++ fallbackModel: fallback, ++ reason: context?.reason, ++ policyId: context?.policyId, ++ }); ++ } ++ ++ private isShortResponseFallback( ++ candidate: FallbackChainEntry, ++ ): candidate is { kind: "short-response"; text: string } { ++ return ( ++ typeof candidate === "object" && ++ candidate !== null && ++ "kind" in candidate && ++ (candidate as { kind?: string }).kind === "short-response" ++ ); ++ } ++ ++ private findFallbackTarget( ++ metadata: TrafficRequestMetadata | undefined, ++ visitedKeys: Set, ++ logger?: Logger, ++ ): FallbackChainEntry | undefined { ++ const currentModel = metadata?.model; ++ if (!currentModel) { ++ logger?.trace?.("No current model; no fallback", {}); ++ return undefined; ++ } ++ ++ const provider = metadata?.provider; ++ const chain = this.resolveFallbackChain(provider, currentModel); ++ if (!chain) { ++ logger?.trace?.("No fallback chain for model", { ++ currentModel, ++ provider, ++ }); ++ return undefined; ++ } ++ ++ for (const candidate of chain) { ++ if (this.isShortResponseFallback(candidate)) { ++ logger?.debug?.("Selected short-response fallback", { ++ currentModel, ++ currentProvider: provider, ++ }); ++ return candidate; ++ } ++ const target = this.normalizeFallbackTarget(candidate, provider); ++ const candidateMetadata: TrafficRequestMetadata = { ++ ...(metadata ?? {}), ++ provider: target.provider ?? 
++ const candidateMetadata: TrafficRequestMetadata = { ++ ...(metadata ?? {}), ++ provider: target.provider ?? provider, ++ model: target.model, ++ }; ++ const candidateKey = this.buildRateLimitKey(candidateMetadata); ++ if (visitedKeys.has(candidateKey)) { ++ continue; ++ } ++ ++ const evaluation = this.evaluateCircuitState(candidateKey, logger); ++ if (evaluation.allowRequest) { ++ visitedKeys.add(candidateKey); ++ logger?.debug?.("Selected fallback target", { ++ currentModel, ++ currentProvider: provider, ++ fallbackModel: target.model, ++ fallbackProvider: target.provider ?? provider, ++ fallbackCircuitKey: candidateKey, ++ }); ++ return candidate; ++ } ++ } ++ ++ return undefined; ++ } ++ ++ private resolveFallbackChain( ++ provider: string | undefined, ++ model: string, ++ ): FallbackChainEntry[] | undefined { ++ const providerKey = provider ? `${provider}::${model}` : undefined; ++ if (providerKey) { ++ const providerChain = this.fallbackChains.get(providerKey); ++ if (providerChain) return providerChain; ++ } ++ return this.fallbackChains.get(model); ++ } ++ ++ private normalizeFallbackTarget( ++ candidate: FallbackChainEntry, ++ provider: string | undefined, ++ ): FallbackTarget { ++ if (typeof candidate === "string") { ++ return { provider, model: candidate }; ++ } ++ return { ++ provider: candidate.provider ?? provider, ++ model: candidate.model, ++ }; ++ } ++ ++ private isCircuitBreakerStatus(status?: number): boolean { ++ return status === 429 || (status !== undefined && status >= 500); ++ } ++} +diff --git a/packages/core/src/traffic/traffic-concurrency-limiter.ts b/packages/core/src/traffic/traffic-concurrency-limiter.ts +new file mode 100644 +index 00000000..e1525612 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-concurrency-limiter.ts +@@ -0,0 +1,235 @@ ++import type { Logger } from "../logger"; ++import type { QueuedRequest } from "./traffic-controller-internal"; ++import type { ++ ProviderModelConcurrencyLimit, ++ TenantConcurrencyLimit, ++ TrafficRequestMetadata, ++} from "./traffic-types"; ++ ++export type ConcurrencyBlockReason = ++ | { ++ gate: "providerModel"; ++ key: string; ++ inFlight: number; ++ limit: number; ++ } ++ | { ++ gate: "tenant"; ++ key: string; ++ inFlight: number; ++ limit: number; ++ }; ++ ++export type ConcurrencyDecision = ++ | { kind: "allow" } ++ | { kind: "wait"; reasons: ConcurrencyBlockReason[] }; ++ ++function toNonNegativeIntegerLimit(raw: unknown): number | undefined { ++ if (raw === undefined || raw === null) return undefined; ++ const n = typeof raw === "number" ? raw : Number(raw); ++ if (!Number.isFinite(n)) return undefined; ++ if (n <= 0) return 0; ++ return Math.floor(n); ++} ++ ++function getInFlight(map: Map<string, number>, key: string): number { ++ return map.get(key) ?? 0; ++}
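++// Note: decrementInFlight removes a counter entry once it reaches zero so idle ++// keys do not accumulate in the in-flight maps.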
++ ++function incrementInFlight(map: Map<string, number>, key: string): void { ++ map.set(key, getInFlight(map, key) + 1); ++} ++ ++function decrementInFlight(map: Map<string, number>, key: string): void { ++ const current = getInFlight(map, key); ++ if (current <= 1) { ++ map.delete(key); ++ return; ++ } ++ map.set(key, current - 1); ++} ++ ++export class TrafficConcurrencyLimiter { ++ private readonly inFlightByProviderModel = new Map<string, number>(); ++ private readonly inFlightByTenant = new Map<string, number>(); ++ ++ private readonly buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string; ++ private readonly providerModelLimit?: ProviderModelConcurrencyLimit; ++ private readonly tenantLimit?: TenantConcurrencyLimit; ++ private readonly providerModelEnabled: boolean; ++ private readonly tenantEnabled: boolean; ++ ++ constructor(options: { ++ buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string; ++ maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit; ++ maxConcurrentPerTenant?: TenantConcurrencyLimit; ++ }) { ++ this.buildProviderModelKey = options.buildProviderModelKey; ++ this.providerModelLimit = options.maxConcurrentPerProviderModel; ++ this.tenantLimit = options.maxConcurrentPerTenant; ++ this.providerModelEnabled = options.maxConcurrentPerProviderModel !== undefined; ++ this.tenantEnabled = options.maxConcurrentPerTenant !== undefined; ++ } ++ ++ resolve(next: QueuedRequest, logger?: Logger): ConcurrencyDecision { ++ if (!this.providerModelEnabled && !this.tenantEnabled) return { kind: "allow" }; ++ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); ++ const reasons: ConcurrencyBlockReason[] = []; ++ ++ if (this.providerModelEnabled) { ++ const providerModelKey = this.buildProviderModelKey(next.request.metadata); ++ const providerModelLimit = this.resolveProviderModelLimit( ++ providerModelKey, ++ next.request.metadata, ++ concurrencyLogger, ++ ); ++ if (providerModelLimit !== undefined) { ++ const inFlight = getInFlight(this.inFlightByProviderModel, providerModelKey); ++ if (inFlight >= providerModelLimit) { ++ reasons.push({ ++ gate: "providerModel", ++ key: providerModelKey, ++ inFlight, ++ limit: providerModelLimit, ++ }); ++ } ++ } ++ } ++ ++ if (this.tenantEnabled) { ++ const tenantKey = next.tenantId; ++ const tenantLimit = this.resolveTenantLimit( ++ tenantKey, ++ next.request.metadata, ++ concurrencyLogger, ++ ); ++ if (tenantLimit !== undefined) { ++ const inFlight = getInFlight(this.inFlightByTenant, tenantKey); ++ if (inFlight >= tenantLimit) { ++ reasons.push({ ++ gate: "tenant", ++ key: tenantKey, ++ inFlight, ++ limit: tenantLimit, ++ }); ++ } ++ } ++ } ++ ++ if (reasons.length === 0) return { kind: "allow" }; ++ ++ concurrencyLogger?.trace?.("Concurrency gate blocked request", { ++ tenantId: next.tenantId, ++ reasons, ++ }); ++ return { kind: "wait", reasons }; ++ } ++ ++ acquire(next: QueuedRequest, logger?: Logger): void { ++ if (!this.providerModelEnabled && !this.tenantEnabled) return; ++ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); ++ ++ let tenantKey: string | undefined; ++ if (this.tenantEnabled) { ++ tenantKey = next.tenantId; ++ next.tenantConcurrencyKey = tenantKey; ++ incrementInFlight(this.inFlightByTenant, tenantKey); ++ } ++ ++ let providerModelKey: string | undefined; ++ if (this.providerModelEnabled) { ++ providerModelKey = this.buildProviderModelKey(next.request.metadata); ++ next.providerModelConcurrencyKey = providerModelKey; ++ incrementInFlight(this.inFlightByProviderModel, providerModelKey); ++ } ++
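++ // The acquired keys are recorded on the queued request itself so release() ++ // decrements the same counters even if a fallback later swaps the metadata.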
++ concurrencyLogger?.trace?.("Concurrency slots acquired", { ++ tenantId: tenantKey, ++ tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined, ++ providerModelKey, ++ providerModelInFlight: providerModelKey ++ ? getInFlight(this.inFlightByProviderModel, providerModelKey) ++ : undefined, ++ }); ++ } ++ ++ release(next: QueuedRequest, logger?: Logger): void { ++ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); ++ const tenantKey = next.tenantConcurrencyKey; ++ const providerModelKey = next.providerModelConcurrencyKey; ++ ++ if (tenantKey) { ++ decrementInFlight(this.inFlightByTenant, tenantKey); ++ } ++ ++ if (providerModelKey) { ++ decrementInFlight(this.inFlightByProviderModel, providerModelKey); ++ } ++ ++ if (tenantKey || providerModelKey) { ++ concurrencyLogger?.trace?.("Concurrency slots released", { ++ tenantId: tenantKey, ++ tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined, ++ providerModelKey, ++ providerModelInFlight: providerModelKey ++ ? getInFlight(this.inFlightByProviderModel, providerModelKey) ++ : undefined, ++ }); ++ } ++ ++ next.tenantConcurrencyKey = undefined; ++ next.providerModelConcurrencyKey = undefined; ++ } ++ ++ private resolveTenantLimit( ++ tenantId: string, ++ metadata: TrafficRequestMetadata | undefined, ++ logger?: Logger, ++ ): number | undefined { ++ const policy = this.tenantLimit; ++ if (policy === undefined) return undefined; ++ ++ if (typeof policy === "number") return toNonNegativeIntegerLimit(policy); ++ if (typeof policy === "function") { ++ try { ++ return toNonNegativeIntegerLimit(policy(tenantId, metadata)); ++ } catch (error) { ++ logger?.warn?.("Tenant concurrency resolver threw; ignoring", { ++ tenantId, ++ errorName: (error as { name?: unknown } | null)?.name, ++ errorMessage: (error as { message?: unknown } | null)?.message, ++ }); ++ return undefined; ++ } ++ } ++ ++ return toNonNegativeIntegerLimit(policy[tenantId]); ++ } ++ ++ private resolveProviderModelLimit( ++ key: string, ++ metadata: TrafficRequestMetadata | undefined, ++ logger?: Logger, ++ ): number | undefined { ++ const policy = this.providerModelLimit; ++ if (policy === undefined) return undefined; ++ ++ if (typeof policy === "number") return toNonNegativeIntegerLimit(policy); ++ if (typeof policy === "function") { ++ try { ++ return toNonNegativeIntegerLimit(policy(metadata, key)); ++ } catch (error) { ++ logger?.warn?.("Provider/model concurrency resolver threw; ignoring", { ++ key, ++ provider: metadata?.provider, ++ model: metadata?.model, ++ errorName: (error as { name?: unknown } | null)?.name, ++ errorMessage: (error as { message?: unknown } | null)?.message, ++ }); ++ return undefined; ++ } ++ } ++ ++ return toNonNegativeIntegerLimit(policy[key]); ++ } ++} +diff --git a/packages/core/src/traffic/traffic-constants.ts b/packages/core/src/traffic/traffic-constants.ts +new file mode 100644 +index 00000000..68d99df7 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-constants.ts +@@ -0,0 +1,26 @@ ++export const MAX_RETRY_ATTEMPTS = 3; ++export const TIMEOUT_RETRY_ATTEMPTS = 2; ++ ++export const RATE_LIMIT_BASE_BACKOFF_MS = 500; ++export const SERVER_ERROR_BASE_BACKOFF_MS = 1000; ++export const TIMEOUT_BASE_BACKOFF_MS = 750; ++ ++export const RATE_LIMIT_JITTER_FACTOR = 0.35; ++export const SERVER_ERROR_JITTER_FACTOR = 0.8; ++export const TIMEOUT_JITTER_FACTOR = 0.5; ++ ++export const CIRCUIT_FAILURE_THRESHOLD = 5; ++export const CIRCUIT_FAILURE_WINDOW_MS = 10_000; ++export 
const CIRCUIT_TIMEOUT_THRESHOLD = CIRCUIT_FAILURE_THRESHOLD; ++export const CIRCUIT_TIMEOUT_WINDOW_MS = CIRCUIT_FAILURE_WINDOW_MS; ++export const CIRCUIT_COOLDOWN_MS = 30_000; ++export const CIRCUIT_PROBE_INTERVAL_MS = 5_000; ++ ++export const RATE_LIMIT_EXHAUSTION_BUFFER = 1; ++export const RATE_LIMIT_PROBE_DELAY_MS = 50; ++export const RATE_LIMIT_MIN_PACE_INTERVAL_MS = 10; ++export const RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS = 10; ++ ++export const DEFAULT_FALLBACK_CHAINS: Record<string, string[]> = { ++ "gpt-4o": ["gpt-4o-mini", "gpt-3.5"], ++}; +diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts +new file mode 100644 +index 00000000..fd2012cf +--- /dev/null ++++ b/packages/core/src/traffic/traffic-controller-internal.ts +@@ -0,0 +1,57 @@ ++import type { TrafficPriority, TrafficRequest, TrafficRequestType } from "./traffic-types"; ++ ++export type Scheduler = (callback: () => void) => void; ++ ++export type DispatchDecision = ++ | { kind: "dispatch" } ++ | { kind: "skip" } ++ | { kind: "wait"; wakeUpAt?: number }; ++ ++export type CircuitStateStatus = "closed" | "open" | "half-open"; ++ ++export interface CircuitState { ++ status: CircuitStateStatus; ++ failureTimestamps: number[]; ++ timeoutTimestamps: number[]; ++ openedAt?: number; ++ trialInFlight?: boolean; ++ nextProbeAt?: number; ++} ++ ++export interface RateLimitWindowState { ++ limit: number; ++ remaining: number; ++ resetAt: number; ++ reserved: number; ++ nextAllowedAt: number; ++} ++ ++type BivariantHandler<TArgs extends unknown[]> = { ++ bivarianceHack(...args: TArgs): void; ++}["bivarianceHack"]; ++ ++export interface QueuedRequest<TResponse = unknown> { ++ type: TrafficRequestType; ++ request: TrafficRequest; ++ resolve: BivariantHandler<[TResponse | PromiseLike<TResponse>]>; ++ reject: BivariantHandler<[reason?: unknown]>; ++ attempt: number; ++ priority: TrafficPriority; ++ tenantId: string; ++ enqueuedAt: number; ++ dispatchedAt?: number; ++ estimatedTokens?: number; ++ reservedTokens?: number; ++ queueTimeoutDisabled?: boolean; ++ ++ tenantConcurrencyKey?: string; ++ providerModelConcurrencyKey?: string; ++ ++ rateLimitKey?: string; ++ etaMs?: number; ++ ++ circuitKey?: string; ++ circuitStatus?: CircuitStateStatus; ++ ++ extractUsage?: TrafficRequest["extractUsage"]; ++} +diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts +new file mode 100644 +index 00000000..8f0a2c47 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-controller.spec.ts +@@ -0,0 +1,706 @@ ++import { describe, expect, it, vi } from "vitest"; ++import { CIRCUIT_FAILURE_THRESHOLD, RATE_LIMIT_PROBE_DELAY_MS } from "./traffic-constants"; ++import { TrafficController } from "./traffic-controller"; ++ ++describe("TrafficController priority scheduling", () => { ++ it("prioritizes P0 over lower priorities when runnable", async () => { ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ const order: string[] = []; ++ ++ const p1 = controller.handleText({ ++ metadata: { provider: "p", model: "m1", priority: "P1" }, ++ execute: async () => { ++ order.push("P1"); ++ return "P1"; ++ }, ++ }); ++ ++ const p2 = controller.handleText({ ++ metadata: { provider: "p", model: "m2", priority: "P2" }, ++ execute: async () => { ++ order.push("P2"); ++ return "P2"; ++ }, ++ }); ++ ++ const p0 = controller.handleText({ ++ metadata: { provider: "p", model: "m0", priority: "P0" }, ++ execute: async () => { ++ order.push("P0"); ++ return "P0"; ++ }, ++ });
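++ // All three promises are enqueued before any executes; with maxConcurrent: 1 ++ // the queue drains strictly by priority, so P0 (enqueued last) runs first.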
++ ++ await Promise.all([p0, p1, p2]); ++ ++ expect(order[0]).toBe("P0"); ++ expect(order).toEqual(["P0", "P1", "P2"]); ++ }); ++ ++ it("allows lower priorities to proceed when a higher priority request is rate limited", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ controller.updateRateLimitFromHeaders( ++ { provider: "p0", model: "m0" }, ++ { ++ "x-ratelimit-limit-requests": "1", ++ "x-ratelimit-remaining-requests": "0", ++ "x-ratelimit-reset-requests": "1s", ++ }, ++ ); ++ ++ const order: string[] = []; ++ ++ const p0 = controller.handleText({ ++ metadata: { provider: "p0", model: "m0", priority: "P0" }, ++ execute: async () => { ++ order.push("P0"); ++ return "P0"; ++ }, ++ }); ++ ++ const p1 = controller.handleText({ ++ metadata: { provider: "p1", model: "m1", priority: "P1" }, ++ execute: async () => { ++ order.push("P1"); ++ return "P1"; ++ }, ++ }); ++ ++ await vi.runAllTimersAsync(); ++ await Promise.all([p0, p1]); ++ ++ expect(order[0]).toBe("P1"); ++ expect(order[1]).toBe("P0"); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++}); ++ ++describe("TrafficController concurrency limits", () => { ++ it("shares provider/model limits across tenants", async () => { ++ const controller = new TrafficController({ ++ maxConcurrent: 2, ++ maxConcurrentPerProviderModel: 1, ++ }); ++ const started: string[] = []; ++ let releaseFirst!: () => void; ++ const firstGate = new Promise<void>((resolve) => { ++ releaseFirst = resolve; ++ }); ++ ++ const first = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ execute: async () => { ++ started.push("tenant-a"); ++ await firstGate; ++ return "a"; ++ }, ++ }); ++ ++ const second = controller.handleText({ ++ tenantId: "tenant-b", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ execute: async () => { ++ started.push("tenant-b"); ++ return "b"; ++ }, ++ }); ++ ++ await new Promise<void>((resolve) => setTimeout(resolve, 0)); ++ expect(started).toEqual(["tenant-a"]); ++ ++ releaseFirst(); ++ await Promise.all([first, second]); ++ expect(started).toEqual(["tenant-a", "tenant-b"]); ++ }); ++}); ++ ++describe("TrafficController rate limit headers", () => { ++ it("parses OpenAI-style compound reset durations (e.g.
1m30.951s)", () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(1_000_000)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ const now = Date.now(); ++ ++ const result = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10000", ++ "x-ratelimit-remaining-requests": "9989", ++ "x-ratelimit-reset-requests": "1m30.951s", ++ }, ++ ); ++ ++ expect(result).toBeTruthy(); ++ expect(result?.headerSnapshot.resetRequestsMs).toBeCloseTo(90_951, 6); ++ expect(result?.state.limit).toBe(10000); ++ expect(result?.state.remaining).toBe(9989); ++ expect(result?.state.resetAt).toBe(now + 90_951); ++ expect(result?.state.reserved).toBe(0); ++ expect(result?.state.nextAllowedAt).toBe(now); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("keeps resetAt monotonic when headers shorten the reset duration", () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ ++ const first = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10000", ++ "x-ratelimit-remaining-requests": "9999", ++ "x-ratelimit-reset-requests": "60s", ++ }, ++ ); ++ ++ expect(first).toBeTruthy(); ++ expect(first?.state.resetAt).toBe(60_000); ++ ++ vi.setSystemTime(new Date(10_000)); ++ const second = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10000", ++ "x-ratelimit-remaining-requests": "9998", ++ "x-ratelimit-reset-requests": "5s", ++ }, ++ ); ++ ++ expect(second).toBeTruthy(); ++ expect(second?.state.resetAt).toBe(60_000); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("never increases remaining within the same window", () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ ++ const first = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10", ++ "x-ratelimit-remaining-requests": "9", ++ "x-ratelimit-reset-requests": "60s", ++ }, ++ ); ++ ++ expect(first?.state.remaining).toBe(9); ++ expect(first?.state.resetAt).toBe(60_000); ++ ++ vi.setSystemTime(new Date(10_000)); ++ const second = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10", ++ "x-ratelimit-remaining-requests": "8", ++ "x-ratelimit-reset-requests": "50s", ++ }, ++ ); ++ ++ expect(second?.state.remaining).toBe(8); ++ expect(second?.state.resetAt).toBe(60_000); ++ ++ vi.setSystemTime(new Date(20_000)); ++ const third = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10", ++ "x-ratelimit-remaining-requests": "9", ++ "x-ratelimit-reset-requests": "40s", ++ }, ++ ); ++ ++ expect(third?.state.remaining).toBe(8); ++ expect(third?.state.resetAt).toBe(60_000); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("applies Retry-After even when x-ratelimit headers are missing", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ const order: string[] = []; ++ ++ controller.updateRateLimitFromHeaders( ++ { provider: "p", 
model: "m" }, ++ { ++ "retry-after": "2", ++ }, ++ ); ++ ++ const p0 = controller.handleText({ ++ metadata: { provider: "p", model: "m", priority: "P0" }, ++ execute: async () => { ++ order.push("P0"); ++ return "P0"; ++ }, ++ }); ++ ++ await vi.advanceTimersByTimeAsync(1_999); ++ expect(order).toEqual([]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await p0; ++ expect(order).toEqual(["P0"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("shares rate limits across tenants for the same provider/model", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ controller.updateRateLimitFromHeaders( ++ { provider: "openai", model: "gpt-4o", tenantId: "tenant-a" }, ++ { ++ "x-ratelimit-limit-requests": "1", ++ "x-ratelimit-remaining-requests": "0", ++ "x-ratelimit-reset-requests": "1s", ++ }, ++ ); ++ ++ const order: string[] = []; ++ const request = controller.handleText({ ++ tenantId: "tenant-b", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ execute: async () => { ++ order.push("tenant-b"); ++ return "ok"; ++ }, ++ }); ++ ++ await vi.advanceTimersByTimeAsync(999); ++ await Promise.resolve(); ++ expect(order).toEqual([]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await request; ++ ++ expect(order).toEqual(["tenant-b"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++}); ++ ++describe("TrafficController token limits", () => { ++ it("blocks OpenAI when the token window is exhausted even without RPM config", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ ++ maxConcurrent: 1, ++ rateLimits: { ++ "openai::gpt-4o": { ++ requestsPerMinute: 0, ++ tokensPerMinute: 2, ++ }, ++ }, ++ }); ++ const order: string[] = []; ++ ++ const first = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ execute: async () => { ++ order.push("first"); ++ return "first"; ++ }, ++ extractUsage: () => ({ totalTokens: 2 }), ++ }); ++ ++ const second = controller.handleText({ ++ tenantId: "tenant-b", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ execute: async () => { ++ order.push("second"); ++ return "second"; ++ }, ++ extractUsage: () => ({ totalTokens: 1 }), ++ }); ++ ++ await first; ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(60_000 + RATE_LIMIT_PROBE_DELAY_MS - 1); ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await second; ++ expect(order).toEqual(["first", "second"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("reserves estimated tokens before dispatch", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ ++ maxConcurrent: 2, ++ rateLimits: { ++ "openai::gpt-4o": { ++ requestsPerMinute: 0, ++ tokensPerMinute: 2, ++ }, ++ }, ++ }); ++ const order: string[] = []; ++ let releaseFirst!: () => void; ++ const firstGate = new Promise((resolve) => { ++ releaseFirst = resolve; ++ }); ++ ++ const first = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ estimatedTokens: 2, ++ execute: async () => { ++ order.push("first"); ++ await firstGate; ++ return "first"; ++ }, ++ 
extractUsage: () => ({ totalTokens: 2 }), ++ }); ++ ++ const second = controller.handleText({ ++ tenantId: "tenant-b", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ estimatedTokens: 1, ++ execute: async () => { ++ order.push("second"); ++ return "second"; ++ }, ++ extractUsage: () => ({ totalTokens: 1 }), ++ }); ++ ++ await Promise.resolve(); ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(60_000 + RATE_LIMIT_PROBE_DELAY_MS - 1); ++ await Promise.resolve(); ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await Promise.resolve(); ++ expect(order).toEqual(["first", "second"]); ++ ++ releaseFirst(); ++ await Promise.all([first, second]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("allows token-only configs on non-OpenAI providers", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ ++ maxConcurrent: 2, ++ rateLimits: { ++ "p::m": { ++ requestsPerMinute: 0, ++ tokensPerMinute: 2, ++ }, ++ }, ++ }); ++ const order: string[] = []; ++ ++ const first = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "p", model: "m", priority: "P1" }, ++ estimatedTokens: 2, ++ execute: async () => { ++ order.push("first"); ++ return "first"; ++ }, ++ extractUsage: () => ({ totalTokens: 2 }), ++ }); ++ ++ const second = controller.handleText({ ++ tenantId: "tenant-b", ++ metadata: { provider: "p", model: "m", priority: "P1" }, ++ estimatedTokens: 1, ++ execute: async () => { ++ order.push("second"); ++ return "second"; ++ }, ++ extractUsage: () => ({ totalTokens: 1 }), ++ }); ++ ++ await first; ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(29_999); ++ await Promise.resolve(); ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await second; ++ expect(order).toEqual(["first", "second"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("honors OpenAI token headers even without token config", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ controller.updateRateLimitFromHeaders( ++ { provider: "openai", model: "gpt-4o" }, ++ { ++ "x-ratelimit-limit-tokens": "2", ++ "x-ratelimit-remaining-tokens": "0", ++ "x-ratelimit-reset-tokens": "1s", ++ }, ++ ); ++ ++ const order: string[] = []; ++ const request = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ estimatedTokens: 1, ++ execute: async () => { ++ order.push("run"); ++ return "ok"; ++ }, ++ }); ++ ++ await Promise.resolve(); ++ expect(order).toEqual([]); ++ ++ await vi.advanceTimersByTimeAsync(1_000 + RATE_LIMIT_PROBE_DELAY_MS - 1); ++ await Promise.resolve(); ++ expect(order).toEqual([]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await request; ++ expect(order).toEqual(["run"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++}); ++ ++describe("TrafficController stream reporting", () => { ++ it("slows down after stream 429 errors", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ ++ maxConcurrent: 1, ++ adaptiveLimiter: { ++ windowMs: 1_000, ++ threshold: 1, ++ minPenaltyMs: 10, ++ maxPenaltyMs: 10, ++ penaltyMultiplier: 1, ++ 
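++          // threshold: 1 trips the limiter on a single 429 report; pinning
++          // min/max penalty to 10ms keeps the cooldown deterministic under
++          // fake timers.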
decayMs: 1_000, ++ }, ++ }); ++ const metadata = { ++ provider: "p", ++ model: "m", ++ priority: "P1" as const, ++ tenantId: "tenant-a", ++ }; ++ ++ controller.reportStreamFailure( ++ metadata, ++ Object.assign(new Error("rate limit"), { status: 429 }), ++ ); ++ ++ const order: string[] = []; ++ const request = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata, ++ execute: async () => { ++ order.push("run"); ++ return "ok"; ++ }, ++ }); ++ ++ await Promise.resolve(); ++ expect(order).toEqual([]); ++ ++ await vi.advanceTimersByTimeAsync(9); ++ await Promise.resolve(); ++ expect(order).toEqual([]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await request; ++ expect(order).toEqual(["run"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("treats post-start stream failures as circuit breaker failures", async () => { ++ const controller = new TrafficController({ ++ maxConcurrent: 1, ++ fallbackChains: { ++ primary: ["fallback"], ++ }, ++ }); ++ const tenantId = "tenant-1"; ++ const metadata = { provider: "p", model: "primary", priority: "P1" as const }; ++ ++ await controller.handleStream({ ++ tenantId, ++ metadata, ++ execute: async () => ({ ok: true }), ++ }); ++ ++ for (let i = 0; i < CIRCUIT_FAILURE_THRESHOLD; i += 1) { ++ controller.reportStreamFailure(metadata, new Error("stream-failure")); ++ } ++ ++ const order: string[] = []; ++ await controller.handleStream({ ++ tenantId, ++ metadata, ++ execute: async () => { ++ order.push("primary"); ++ return "primary"; ++ }, ++ createFallbackRequest: (target) => ({ ++ tenantId, ++ metadata: { ++ provider: "p", ++ model: typeof target === "string" ? target : target.model, ++ priority: "P1", ++ }, ++ execute: async () => { ++ const modelId = typeof target === "string" ? target : target.model; ++ order.push(modelId); ++ return modelId; ++ }, ++ }), ++ }); ++ ++ expect(order).toEqual(["fallback"]); ++ }); ++}); ++ ++describe("TrafficController queue timeouts", () => { ++ it("lets fallback requests wait after queue timeout without rejecting", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ ++ maxConcurrent: 1, ++ fallbackChains: { ++ "p::m": ["m-fallback"], ++ }, ++ }); ++ const order: string[] = []; ++ let releaseFirst!: () => void; ++ const firstGate = new Promise((resolve) => { ++ releaseFirst = resolve; ++ }); ++ ++ const first = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "p", model: "m", priority: "P1" }, ++ execute: async () => { ++ order.push("first"); ++ await firstGate; ++ return "first"; ++ }, ++ }); ++ ++ const second = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "p", model: "m", priority: "P1" }, ++ maxQueueWaitMs: 1, ++ execute: async () => { ++ order.push("primary"); ++ return "primary"; ++ }, ++ createFallbackRequest: (target) => ({ ++ tenantId: "tenant-a", ++ metadata: { ++ provider: "p", ++ model: typeof target === "string" ? 
target : target.model, ++ priority: "P1", ++ }, ++ maxQueueWaitMs: 1, ++ execute: async () => { ++ order.push("fallback"); ++ return "fallback"; ++ }, ++ }), ++ }); ++ ++ await Promise.resolve(); ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(2); ++ ++ const third = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "p", model: "other", priority: "P1" }, ++ execute: async () => { ++ order.push("third"); ++ return "third"; ++ }, ++ }); ++ ++ await Promise.resolve(); ++ expect(order).toEqual(["first"]); ++ ++ releaseFirst(); ++ await vi.runAllTimersAsync(); ++ ++ await expect(second).resolves.toBe("fallback"); ++ await Promise.all([first, third]); ++ ++ expect(order).toEqual(["first", "fallback", "third"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++}); +diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts +new file mode 100644 +index 00000000..269304d9 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-controller.ts +@@ -0,0 +1,1268 @@ ++import type { Logger } from "../logger"; ++import { LoggerProxy } from "../logger"; ++import { TrafficCircuitBreaker } from "./traffic-circuit-breaker"; ++import { TrafficConcurrencyLimiter } from "./traffic-concurrency-limiter"; ++import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal"; ++import { ++ CircuitBreakerOpenError, ++ QueueWaitTimeoutError, ++ RateLimitedUpstreamError, ++ normalizeRateLimitError, ++} from "./traffic-errors"; ++import { ++ OpenAIWindowRateLimitStrategy, ++ type RateLimitUpdateResult, ++ TokenBucketRateLimitStrategy, ++ TrafficRateLimiter, ++} from "./traffic-rate-limiter"; ++import { buildRetryPlanWithPolicy } from "./traffic-retry"; ++import type { ++ AdaptiveLimiterConfig, ++ FallbackChainEntry, ++ FallbackPolicy, ++ FallbackPolicyConfig, ++ FallbackPolicyMode, ++ FallbackTarget, ++ PriorityBurstLimits, ++ PriorityWeights, ++ ProviderModelConcurrencyLimit, ++ RateLimitConfig, ++ RateLimitKey, ++ RateLimitStrategyConfig, ++ RateLimitStrategyKind, ++ RetryPlan, ++ RetryPolicyConfig, ++ TenantConcurrencyLimit, ++ TenantUsage, ++ TrafficControllerOptions, ++ TrafficPriority, ++ TrafficRequest, ++ TrafficRequestMetadata, ++ TrafficRequestType, ++ TrafficResponseMetadata, ++} from "./traffic-types"; ++import { TrafficUsageTracker } from "./traffic-usage-tracker"; ++ ++/* ============================================================ ++ * Traffic Controller ++ * ============================================================ ++ */ ++ ++export type { ++ AdaptiveLimiterConfig, ++ FallbackChainEntry, ++ FallbackPolicy, ++ FallbackPolicyConfig, ++ FallbackPolicyMode, ++ FallbackTarget, ++ PriorityBurstLimits, ++ PriorityWeights, ++ ProviderModelConcurrencyLimit, ++ RateLimitConfig, ++ RateLimitKey, ++ RateLimitStrategyConfig, ++ RateLimitStrategyKind, ++ TenantConcurrencyLimit, ++ TenantUsage, ++ TrafficControllerOptions, ++ TrafficPriority, ++ TrafficRequest, ++ TrafficRequestMetadata, ++ TrafficResponseMetadata, ++ TrafficRequestType, ++}; ++ ++export { CircuitBreakerOpenError }; ++export { QueueWaitTimeoutError }; ++export { RateLimitedUpstreamError }; ++ ++type TenantQueueState = { ++ order: string[]; ++ index: number; ++ queues: Map; ++}; ++ ++type RateLimitSnapshot = { ++ limit?: number; ++ remaining?: number; ++ resetAt?: number; ++ nextAllowedAt?: number; ++ retryAfterMs?: number; ++}; ++ ++type AdaptiveLimiterState = { ++ recent429s: number[]; ++ penaltyMs: number; ++ 
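++  // Dispatch for the key is paused until cooldownUntil (epoch ms) passes;
++  // last429At feeds the decay applied in applyAdaptiveDecay.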
cooldownUntil?: number; ++ last429At?: number; ++}; ++ ++const DEFAULT_PRIORITY_WEIGHTS: Record = { ++ P0: 5, ++ P1: 3, ++ P2: 2, ++}; ++ ++const DEFAULT_ADAPTIVE_LIMITER: Required = { ++ windowMs: 30_000, ++ threshold: 3, ++ minPenaltyMs: 500, ++ maxPenaltyMs: 10_000, ++ penaltyMultiplier: 2, ++ decayMs: 10_000, ++}; ++ ++export class TrafficController { ++ /* ---------- Core ---------- */ ++ ++ private readonly scheduler: Scheduler; ++ private readonly maxConcurrent: number; ++ private readonly rateLimitKeyBuilder: (metadata?: TrafficRequestMetadata) => string; ++ private readonly retryPolicy?: RetryPolicyConfig; ++ private readonly logger: Logger; ++ private readonly trafficLogger: Logger; ++ private readonly controllerLogger: Logger; ++ private readonly concurrencyLimiter: TrafficConcurrencyLimiter; ++ ++ private readonly queues: Record = { ++ P0: { order: [], index: 0, queues: new Map() }, ++ P1: { order: [], index: 0, queues: new Map() }, ++ P2: { order: [], index: 0, queues: new Map() }, ++ }; ++ private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"]; ++ private readonly priorityWeights: Record; ++ private readonly priorityCredits: Record; ++ ++ private activeCount = 0; ++ private drainScheduled = false; ++ ++ /* ---------- Rate limits ---------- */ ++ private readonly rateLimiter: TrafficRateLimiter; ++ ++ /* ---------- Circuit breakers ---------- */ ++ private readonly circuitBreaker: TrafficCircuitBreaker; ++ ++ /* ---------- Usage ---------- */ ++ private readonly usageTracker = new TrafficUsageTracker(); ++ ++ /* ---------- Traffic metadata ---------- */ ++ private readonly rateLimitSnapshots = new Map(); ++ ++ /* ---------- Adaptive limiter ---------- */ ++ private readonly adaptiveLimiterConfig: Required; ++ private readonly adaptiveLimiterState = new Map(); ++ ++ constructor(options: TrafficControllerOptions = {}) { ++ this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; ++ this.scheduler = this.createScheduler(); ++ this.rateLimitKeyBuilder = options.rateLimitKeyBuilder ?? buildRateLimitKeyFromMetadata; ++ this.retryPolicy = options.retryPolicy; ++ const priorityOverrides = options.priorityWeights ?? options.priorityBurstLimits; ++ const priorityWeights = { ++ ...DEFAULT_PRIORITY_WEIGHTS, ++ ...(priorityOverrides ?? {}), ++ }; ++ this.priorityWeights = { ++ P0: Math.max(0, Math.floor(priorityWeights.P0)), ++ P1: Math.max(0, Math.floor(priorityWeights.P1)), ++ P2: Math.max(0, Math.floor(priorityWeights.P2)), ++ }; ++ this.priorityCredits = { ...this.priorityWeights }; ++ this.adaptiveLimiterConfig = { ++ ...DEFAULT_ADAPTIVE_LIMITER, ++ ...(options.adaptiveLimiter ?? 
{}), ++ }; ++ this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); ++ this.trafficLogger = this.logger.child({ subsystem: "traffic" }); ++ this.controllerLogger = this.trafficLogger.child({ module: "controller" }); ++ const rateLimits = options.rateLimits; ++ const rateLimitStrategy = options.rateLimitStrategy; ++ this.rateLimiter = new TrafficRateLimiter(() => this.scheduleDrain(), { ++ rateLimits, ++ strategyFactory: (key) => { ++ const strategyKind = this.resolveRateLimitStrategy(key, rateLimitStrategy); ++ if (strategyKind === "window") { ++ return new OpenAIWindowRateLimitStrategy(key, rateLimits?.[key]); ++ } ++ return new TokenBucketRateLimitStrategy(key, rateLimits?.[key]); ++ }, ++ }); ++ this.circuitBreaker = new TrafficCircuitBreaker({ ++ fallbackChains: options.fallbackChains, ++ fallbackPolicy: options.fallbackPolicy, ++ buildRateLimitKey: (metadata) => this.buildRateLimitKey(metadata), ++ }); ++ this.concurrencyLimiter = new TrafficConcurrencyLimiter({ ++ buildProviderModelKey: (metadata) => buildProviderModelKeyFromMetadata(metadata), ++ maxConcurrentPerProviderModel: options.maxConcurrentPerProviderModel, ++ maxConcurrentPerTenant: options.maxConcurrentPerTenant, ++ }); ++ ++ this.controllerLogger.debug("Initialized TrafficController", { ++ maxConcurrent: this.maxConcurrent, ++ hasFallbackChains: !!options.fallbackChains, ++ hasFallbackPolicy: options.fallbackPolicy !== undefined, ++ hasProviderModelConcurrency: options.maxConcurrentPerProviderModel !== undefined, ++ hasTenantConcurrency: options.maxConcurrentPerTenant !== undefined, ++ hasConfigRateLimits: options.rateLimits !== undefined, ++ hasStrategyOverrides: options.rateLimitStrategy !== undefined, ++ hasRetryPolicy: options.retryPolicy !== undefined, ++ hasPriorityBurstLimits: options.priorityBurstLimits !== undefined, ++ hasPriorityWeights: options.priorityWeights !== undefined, ++ hasAdaptiveLimiter: options.adaptiveLimiter !== undefined, ++ }); ++ } ++ ++ /* ============================================================ ++ * Public API ++ * ============================================================ ++ */ ++ ++ handleText(request: TrafficRequest): Promise { ++ this.controllerLogger.trace("handleText called", { ++ tenantId: request.tenantId, ++ provider: request.metadata?.provider, ++ model: request.metadata?.model, ++ priority: request.metadata?.priority, ++ }); ++ return this.enqueue("text", request); ++ } ++ ++ handleStream(request: TrafficRequest): Promise { ++ this.controllerLogger.trace("handleStream called", { ++ tenantId: request.tenantId, ++ provider: request.metadata?.provider, ++ model: request.metadata?.model, ++ priority: request.metadata?.priority, ++ }); ++ return this.enqueue("stream", request); ++ } ++ ++ reportStreamSuccess(metadata?: TrafficRequestMetadata): void { ++ this.controllerLogger.debug("Stream reported success", { ++ provider: metadata?.provider, ++ model: metadata?.model, ++ tenantId: metadata?.tenantId, ++ priority: metadata?.priority, ++ }); ++ this.circuitBreaker.recordSuccess(metadata, this.trafficLogger); ++ const rateLimitKey = this.buildRateLimitKey(metadata); ++ const adaptiveKey = this.buildAdaptiveKey( ++ metadata, ++ metadata?.tenantId ?? 
"default", ++ rateLimitKey, ++ ); ++ this.recordAdaptiveSuccess(adaptiveKey); ++ } ++ ++ reportStreamFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void { ++ const rateLimitKey = this.buildRateLimitKey(metadata); ++ const normalizedRateLimitError = normalizeRateLimitError({ ++ error, ++ metadata, ++ tenantId: metadata?.tenantId, ++ key: rateLimitKey, ++ logger: this.trafficLogger, ++ }); ++ const errorForHandling = normalizedRateLimitError ?? error; ++ ++ this.controllerLogger.warn("Stream reported failure", { ++ provider: metadata?.provider, ++ model: metadata?.model, ++ tenantId: metadata?.tenantId, ++ priority: metadata?.priority, ++ errorName: (error as { name?: unknown } | null)?.name, ++ errorMessage: (error as { message?: unknown } | null)?.message, ++ status: (error as { status?: unknown } | null)?.status, ++ statusCode: (error as { statusCode?: unknown } | null)?.statusCode, ++ }); ++ this.circuitBreaker.recordFailure(metadata, errorForHandling, this.trafficLogger); ++ const adaptiveKey = this.buildAdaptiveKey( ++ metadata, ++ metadata?.tenantId ?? "default", ++ rateLimitKey, ++ ); ++ if (errorForHandling instanceof RateLimitedUpstreamError) { ++ this.recordAdaptiveRateLimitHit(adaptiveKey, errorForHandling.retryAfterMs); ++ } ++ const traffic = this.buildTrafficResponseMetadataFromMetadata( ++ metadata, ++ rateLimitKey, ++ Date.now(), ++ errorForHandling, ++ ); ++ this.attachTrafficMetadata(errorForHandling, traffic); ++ if (errorForHandling !== error) { ++ this.attachTrafficMetadata(error, traffic); ++ } ++ } ++ ++ updateRateLimitFromHeaders( ++ metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ ): RateLimitUpdateResult | undefined { ++ const key = this.buildRateLimitKey(metadata); ++ this.controllerLogger.debug("updateRateLimitFromHeaders called", { ++ rateLimitKey: key, ++ provider: metadata?.provider, ++ model: metadata?.model, ++ }); ++ ++ const update = this.rateLimiter.updateFromHeaders(metadata, headers, key, this.trafficLogger); ++ if (!update) { ++ this.controllerLogger.debug("updateRateLimitFromHeaders skipped (no headers applied)", { ++ rateLimitKey: key, ++ }); ++ return undefined; ++ } ++ ++ this.controllerLogger.debug("Rate limit headers applied", { ++ rateLimitKey: update.key, ++ limit: update.state.limit, ++ remaining: update.state.remaining, ++ reserved: update.state.reserved, ++ resetAt: update.state.resetAt, ++ nextAllowedAt: update.state.nextAllowedAt, ++ resetRequestsMs: update.headerSnapshot.resetRequestsMs, ++ }); ++ ++ this.rateLimitSnapshots.set(update.key, { ++ limit: update.state.limit, ++ remaining: update.state.remaining, ++ resetAt: update.state.resetAt, ++ nextAllowedAt: update.state.nextAllowedAt, ++ retryAfterMs: update.headerSnapshot.retryAfterMs, ++ }); ++ ++ return update; ++ } ++ ++ getTenantUsage(tenantId: string): TenantUsage | undefined { ++ this.controllerLogger.trace("getTenantUsage called", { tenantId }); ++ return this.usageTracker.getTenantUsage(tenantId); ++ } ++ ++ /* ============================================================ ++ * Scheduler & Queue ++ * ============================================================ ++ */ ++ ++ private createScheduler(): Scheduler { ++ return typeof queueMicrotask === "function" ? 
queueMicrotask : (cb) => setTimeout(cb, 0); ++ } ++ ++ private enqueue( ++ type: TrafficRequestType, ++ request: TrafficRequest, ++ ): Promise { ++ return new Promise((resolve, reject) => { ++ const priority = this.resolvePriority(request.metadata); ++ const tenantId = this.resolveTenantId(request); ++ this.controllerLogger.debug("Enqueue request", { ++ type, ++ tenantId, ++ priority, ++ provider: request.metadata?.provider, ++ model: request.metadata?.model, ++ }); ++ this.enqueueItem({ ++ type, ++ request, ++ resolve, ++ reject, ++ attempt: 1, ++ priority, ++ tenantId, ++ enqueuedAt: Date.now(), ++ estimatedTokens: request.estimatedTokens, ++ extractUsage: request.extractUsage, ++ }); ++ this.scheduleDrain(); ++ }); ++ } ++ ++ private scheduleDrain(): void { ++ if (this.drainScheduled) return; ++ this.drainScheduled = true; ++ ++ this.controllerLogger.trace("Drain scheduled"); ++ this.scheduler(() => { ++ this.drainScheduled = false; ++ this.controllerLogger.trace("Drain tick"); ++ this.drainQueue(); ++ }); ++ } ++ ++ private drainQueue(): void { ++ this.controllerLogger.trace("Drain start", { ++ activeCount: this.activeCount, ++ maxConcurrent: this.maxConcurrent, ++ queuedP0: this.getQueuedCount("P0"), ++ queuedP1: this.getQueuedCount("P1"), ++ queuedP2: this.getQueuedCount("P2"), ++ }); ++ while (true) { ++ const decision = this.tryDispatchNext(); ++ this.controllerLogger.trace("Dispatch decision", decision); ++ if (decision.kind === "dispatch" || decision.kind === "skip") continue; ++ if (decision.kind === "wait") { ++ if (decision.wakeUpAt) { ++ this.controllerLogger.debug("Rate limit wait; scheduling wakeup", { ++ wakeUpAt: decision.wakeUpAt, ++ inMs: Math.max(0, decision.wakeUpAt - Date.now()), ++ }); ++ this.scheduleRateLimitWakeUpAt(decision.wakeUpAt); ++ } ++ return; ++ } ++ return; ++ } ++ } ++ ++ /* ============================================================ ++ * Dispatch ++ * ============================================================ ++ */ ++ ++ private tryDispatchNext(): DispatchDecision { ++ if (this.activeCount >= this.maxConcurrent) return { kind: "wait" }; ++ ++ let earliestWakeUpAt: number | undefined; ++ ++ const observeWakeUpAt = (candidate?: number): void => { ++ if (candidate === undefined) return; ++ earliestWakeUpAt = ++ earliestWakeUpAt === undefined ? 
candidate : Math.min(earliestWakeUpAt, candidate); ++ }; ++ ++ const priorities = this.getPriorityDispatchOrder(); ++ for (const priority of priorities) { ++ const state = this.queues[priority]; ++ if (state.order.length === 0) continue; ++ ++ let attempts = 0; ++ const maxAttempts = state.order.length; ++ ++ while (attempts < maxAttempts) { ++ const candidate = this.getNextTenantCandidate(priority); ++ if (!candidate) break; ++ attempts += 1; ++ ++ const { item: next, queue, tenantId } = candidate; ++ const now = Date.now(); ++ const queueTimeoutAt = this.resolveQueueTimeoutAt(next); ++ const queueTimeoutTriggered = this.handleQueueTimeout(next, queue, 0, now, queueTimeoutAt); ++ if (queueTimeoutTriggered === "rejected") { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ if (queueTimeoutAt !== undefined && now < queueTimeoutAt) { ++ observeWakeUpAt(queueTimeoutAt); ++ } ++ const queueTimeoutExpired = queueTimeoutTriggered === "expired"; ++ ++ this.controllerLogger.trace("Evaluate next queued request", { ++ priority, ++ tenantId: next.tenantId, ++ type: next.type, ++ attempt: next.attempt, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ queueLength: queue.length, ++ }); ++ ++ const circuit = this.resolveCircuit(next); ++ if (circuit) { ++ this.controllerLogger.trace("Circuit resolution returned decision", { ++ priority, ++ decision: circuit, ++ circuitKey: next.circuitKey, ++ circuitStatus: next.circuitStatus, ++ }); ++ if (circuit.kind === "skip") { ++ queue.shift(); ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ if (circuit.kind === "wait") { ++ if ( ++ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "circuit wait") ++ ) { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ next.etaMs = ++ circuit.wakeUpAt !== undefined ? Math.max(0, circuit.wakeUpAt - now) : undefined; ++ observeWakeUpAt(circuit.wakeUpAt); ++ continue; ++ } ++ } ++ ++ const concurrency = this.concurrencyLimiter.resolve(next, this.trafficLogger); ++ if (concurrency.kind === "wait") { ++ this.controllerLogger.trace("Concurrency gate blocked request", { ++ priority, ++ tenantId: next.tenantId, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ reasons: concurrency.reasons, ++ }); ++ if ( ++ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "concurrency wait") ++ ) { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ next.etaMs = undefined; ++ continue; ++ } ++ ++ const adaptive = this.resolveAdaptiveLimit(next, now); ++ if (adaptive?.kind === "wait") { ++ if ( ++ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "adaptive wait") ++ ) { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ next.etaMs = ++ adaptive.wakeUpAt !== undefined ? Math.max(0, adaptive.wakeUpAt - now) : undefined; ++ observeWakeUpAt(adaptive.wakeUpAt); ++ continue; ++ } ++ ++ const rateLimit = this.resolveRateLimit(next); ++ if (rateLimit) { ++ this.controllerLogger.trace("Rate limit resolution returned decision", { ++ priority, ++ decision: rateLimit, ++ rateLimitKey: next.rateLimitKey ?? 
this.buildRateLimitKey(next.request.metadata), ++ }); ++ if (rateLimit.kind === "wait") { ++ if ( ++ this.rejectIfQueueTimedOut( ++ queueTimeoutExpired, ++ next, ++ queue, ++ 0, ++ now, ++ "rate limit wait", ++ ) ++ ) { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ next.etaMs = ++ rateLimit.wakeUpAt !== undefined ? Math.max(0, rateLimit.wakeUpAt - now) : undefined; ++ observeWakeUpAt(rateLimit.wakeUpAt); ++ } ++ continue; ++ } ++ ++ if (queueTimeoutExpired) { ++ const timeoutError = this.createQueueTimeoutError(next, now); ++ this.attachTrafficMetadata( ++ timeoutError, ++ this.buildTrafficResponseMetadata( ++ next, ++ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), ++ now, ++ timeoutError, ++ ), ++ ); ++ this.controllerLogger.warn("Queue wait timed out before dispatch", { ++ tenantId: next.tenantId, ++ waitedMs: timeoutError.waitedMs, ++ maxQueueWaitMs: timeoutError.maxQueueWaitMs, ++ deadlineAt: timeoutError.deadlineAt, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ rateLimitKey: timeoutError.rateLimitKey, ++ }); ++ queue.shift(); ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ next.reject(timeoutError); ++ return { kind: "skip" }; ++ } ++ ++ this.startRequest(next, queue, tenantId); ++ return { kind: "dispatch" }; ++ } ++ } ++ ++ return earliestWakeUpAt !== undefined ++ ? { kind: "wait", wakeUpAt: earliestWakeUpAt } ++ : { kind: "wait" }; ++ } ++ ++ private startRequest(item: QueuedRequest, queue: QueuedRequest[], tenantId: string): void { ++ this.controllerLogger.debug("Start request", { ++ priority: item.priority, ++ type: item.type, ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ }); ++ item.dispatchedAt = Date.now(); ++ queue.shift(); ++ this.cleanupTenantQueue(item.priority, tenantId, queue); ++ this.recordPriorityDispatch(item.priority); ++ this.activeCount++; ++ this.concurrencyLimiter.acquire(item, this.trafficLogger); ++ this.rateLimiter.notifyDispatch(item.rateLimitKey, this.trafficLogger); ++ this.circuitBreaker.markTrial(item, this.trafficLogger); ++ void this.executeRequest(item); ++ } ++ ++ /* ============================================================ ++ * Execution ++ * ============================================================ ++ */ ++ ++ private async executeRequest(item: QueuedRequest): Promise { ++ const startedAt = Date.now(); ++ try { ++ this.controllerLogger.debug("Execute request", { ++ priority: item.priority, ++ type: item.type, ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ rateLimitKey: item.rateLimitKey, ++ circuitKey: item.circuitKey, ++ circuitStatus: item.circuitStatus, ++ activeCount: this.activeCount, ++ }); ++ const result = await item.request.execute(); ++ const rateLimitKey = item.rateLimitKey ?? 
this.buildRateLimitKey(item.request.metadata); ++ const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey); ++ this.controllerLogger.debug("Request succeeded", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ elapsedMs: Date.now() - startedAt, ++ }); ++ if (item.type === "stream") { ++ this.controllerLogger.trace("Stream started successfully", { ++ tenantId: item.tenantId, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ }); ++ } else { ++ this.circuitBreaker.recordSuccess(item.request.metadata, this.trafficLogger); ++ } ++ const usage = this.usageTracker.recordUsage(item, result, this.trafficLogger); ++ this.rateLimiter.recordUsage(rateLimitKey, usage, this.trafficLogger, item.reservedTokens); ++ this.recordAdaptiveSuccess(adaptiveKey); ++ this.attachTrafficMetadata( ++ result, ++ this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now()), ++ ); ++ item.resolve(result); ++ } catch (error) { ++ const rateLimitKey = item.rateLimitKey ?? this.buildRateLimitKey(item.request.metadata); ++ const normalizedRateLimitError = normalizeRateLimitError({ ++ error, ++ metadata: item.request.metadata, ++ tenantId: item.tenantId, ++ key: rateLimitKey, ++ logger: this.trafficLogger, ++ }); ++ const errorForHandling = normalizedRateLimitError ?? error; ++ const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey); ++ if (typeof item.reservedTokens === "number" && item.reservedTokens > 0) { ++ this.rateLimiter.recordUsage( ++ rateLimitKey, ++ { totalTokens: 0 }, ++ this.trafficLogger, ++ item.reservedTokens, ++ ); ++ } ++ if (errorForHandling instanceof RateLimitedUpstreamError) { ++ this.recordAdaptiveRateLimitHit(adaptiveKey, errorForHandling.retryAfterMs); ++ } ++ ++ this.controllerLogger.warn("Request failed", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ elapsedMs: Date.now() - startedAt, ++ errorName: (error as { name?: unknown } | null)?.name, ++ errorMessage: (error as { message?: unknown } | null)?.message, ++ status: (error as { status?: unknown } | null)?.status, ++ statusCode: (error as { statusCode?: unknown } | null)?.statusCode, ++ }); ++ this.circuitBreaker.recordFailure( ++ item.request.metadata, ++ errorForHandling, ++ this.trafficLogger, ++ ); ++ this.attachTrafficMetadata( ++ errorForHandling, ++ this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now(), errorForHandling), ++ ); ++ ++ const retry = buildRetryPlanWithPolicy( ++ { ++ error: errorForHandling, ++ attempt: item.attempt, ++ metadata: item.request.metadata, ++ key: rateLimitKey, ++ logger: this.trafficLogger, ++ }, ++ this.retryPolicy, ++ ); ++ if (retry) { ++ if (!this.canRetryWithinDeadline(item, retry.delayMs)) { ++ this.controllerLogger.debug("Retry skipped; deadline exceeded", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ deadlineAt: item.request.deadlineAt, ++ delayMs: retry.delayMs, ++ }); ++ item.reject(errorForHandling); ++ } else { ++ this.controllerLogger.debug("Retrying request", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ nextAttempt: item.attempt + 1, ++ reason: retry.reason, ++ delayMs: retry.delayMs, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, 
++ }); ++ this.scheduleRetry(item, retry); ++ } ++ } else { ++ this.controllerLogger.debug("No retry plan; rejecting request", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ }); ++ item.reject(errorForHandling); ++ } ++ } finally { ++ this.rateLimiter.releaseReservation(item.rateLimitKey, this.trafficLogger); ++ this.concurrencyLimiter.release(item, this.trafficLogger); ++ this.activeCount = Math.max(0, this.activeCount - 1); ++ this.controllerLogger.trace("Request finished; slot released", { ++ tenantId: item.tenantId, ++ activeCount: this.activeCount, ++ maxConcurrent: this.maxConcurrent, ++ }); ++ this.scheduleDrain(); ++ } ++ } ++ ++ /* ============================================================ ++ * Retry logic ++ * ============================================================ ++ */ ++ ++ private scheduleRetry(item: QueuedRequest, plan: RetryPlan): void { ++ this.controllerLogger.debug("Schedule retry", { ++ tenantId: item.tenantId, ++ priority: item.priority, ++ currentAttempt: item.attempt, ++ nextAttempt: item.attempt + 1, ++ reason: plan.reason, ++ delayMs: plan.delayMs, ++ }); ++ setTimeout(() => { ++ this.controllerLogger.debug("Retry timer fired", { ++ tenantId: item.tenantId, ++ priority: item.priority, ++ nextAttempt: item.attempt + 1, ++ }); ++ this.enqueueItem({ ++ ...item, ++ attempt: item.attempt + 1, ++ enqueuedAt: Date.now(), ++ dispatchedAt: undefined, ++ reservedTokens: undefined, ++ tenantConcurrencyKey: undefined, ++ providerModelConcurrencyKey: undefined, ++ rateLimitKey: undefined, ++ etaMs: undefined, ++ circuitKey: undefined, ++ circuitStatus: undefined, ++ }); ++ this.scheduleDrain(); ++ }, plan.delayMs); ++ } ++ ++ private canRetryWithinDeadline(item: QueuedRequest, delayMs: number): boolean { ++ const deadlineAt = item.request.deadlineAt; ++ if (!deadlineAt) return true; ++ const nextAttemptAt = Date.now() + delayMs; ++ return nextAttemptAt <= deadlineAt; ++ } ++ ++ /* ============================================================ ++ * Rate limiting (verbatim logic) ++ * ============================================================ ++ */ ++ ++ private resolveRateLimit(next: QueuedRequest): DispatchDecision | null { ++ const key = this.buildRateLimitKey(next.request.metadata); ++ return this.rateLimiter.resolve(next, key, this.trafficLogger); ++ } ++ ++ private scheduleRateLimitWakeUpAt(wakeUpAt: number): void { ++ this.rateLimiter.scheduleWakeUpAt(wakeUpAt, this.trafficLogger); ++ } ++ ++ /* ============================================================ ++ * Circuit breakers (verbatim logic, linearized) ++ * ============================================================ ++ */ ++ ++ private resolveCircuit(next: QueuedRequest): DispatchDecision | null { ++ return this.circuitBreaker.resolve(next, this.trafficLogger); ++ } ++ ++ /* ============================================================ ++ * Utilities ++ * ============================================================ ++ */ ++ ++ private resolveQueueTimeoutAt(next: QueuedRequest): number | undefined { ++ if (next.queueTimeoutDisabled) { ++ return next.request.deadlineAt; ++ } ++ const maxQueueWaitMs = next.request.maxQueueWaitMs; ++ const normalizedMaxWait = ++ typeof maxQueueWaitMs === "number" && Number.isFinite(maxQueueWaitMs) ++ ? Math.max(0, maxQueueWaitMs) ++ : undefined; ++ const timeoutAt = ++ normalizedMaxWait !== undefined ? 
next.enqueuedAt + normalizedMaxWait : undefined; ++ const deadlineAt = next.request.deadlineAt; ++ if (timeoutAt === undefined) return deadlineAt; ++ if (deadlineAt === undefined) return timeoutAt; ++ return Math.min(timeoutAt, deadlineAt); ++ } ++ ++ private handleQueueTimeout( ++ next: QueuedRequest, ++ queue: QueuedRequest[], ++ index: number, ++ now: number, ++ queueTimeoutAt?: number, ++ ): "none" | "expired" | "rejected" { ++ if (queueTimeoutAt === undefined) return "none"; ++ if (now < queueTimeoutAt) return "none"; ++ ++ const fallbackApplied = this.circuitBreaker.tryFallback( ++ next, ++ "queue-timeout", ++ this.trafficLogger, ++ ); ++ if (fallbackApplied) { ++ return "none"; ++ } ++ ++ const timeoutError = this.createQueueTimeoutError(next, now); ++ this.attachTrafficMetadata( ++ timeoutError, ++ this.buildTrafficResponseMetadata( ++ next, ++ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), ++ now, ++ timeoutError, ++ ), ++ ); ++ this.controllerLogger.warn("Queue wait timed out; rejecting request", { ++ tenantId: next.tenantId, ++ waitedMs: timeoutError.waitedMs, ++ maxQueueWaitMs: timeoutError.maxQueueWaitMs, ++ deadlineAt: timeoutError.deadlineAt, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ rateLimitKey: timeoutError.rateLimitKey, ++ }); ++ queue.splice(index, 1); ++ next.reject(timeoutError); ++ return "rejected"; ++ } ++ ++ private rejectIfQueueTimedOut( ++ queueTimeoutExpired: boolean, ++ next: QueuedRequest, ++ queue: QueuedRequest[], ++ index: number, ++ now: number, ++ reason: string, ++ ): boolean { ++ if (!queueTimeoutExpired) return false; ++ const timeoutError = this.createQueueTimeoutError(next, now); ++ this.attachTrafficMetadata( ++ timeoutError, ++ this.buildTrafficResponseMetadata( ++ next, ++ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), ++ now, ++ timeoutError, ++ ), ++ ); ++ this.controllerLogger.warn("Queue wait timed out during gate wait", { ++ tenantId: next.tenantId, ++ waitedMs: timeoutError.waitedMs, ++ maxQueueWaitMs: timeoutError.maxQueueWaitMs, ++ deadlineAt: timeoutError.deadlineAt, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ rateLimitKey: timeoutError.rateLimitKey, ++ reason, ++ }); ++ queue.splice(index, 1); ++ next.reject(timeoutError); ++ return true; ++ } ++ ++ private createQueueTimeoutError(next: QueuedRequest, now: number): QueueWaitTimeoutError { ++ const waitedMs = Math.max(0, now - next.enqueuedAt); ++ return new QueueWaitTimeoutError({ ++ waitedMs, ++ maxQueueWaitMs: next.request.maxQueueWaitMs, ++ deadlineAt: next.request.deadlineAt, ++ metadata: next.request.metadata, ++ rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), ++ }); ++ } ++ ++ private resolveTenantId(request: TrafficRequest): string { ++ return request.tenantId ?? request.metadata?.tenantId ?? 
"default"; ++ } ++ ++ private enqueueItem(item: QueuedRequest): void { ++ const state = this.queues[item.priority]; ++ const tenantId = item.tenantId; ++ let queue = state.queues.get(tenantId); ++ if (!queue) { ++ queue = []; ++ state.queues.set(tenantId, queue); ++ state.order.push(tenantId); ++ } ++ queue.push(item); ++ } ++ ++ private getQueuedCount(priority: TrafficPriority): number { ++ const state = this.queues[priority]; ++ let total = 0; ++ for (const queue of state.queues.values()) { ++ total += queue.length; ++ } ++ return total; ++ } ++ ++ private refillPriorityCredits(): void { ++ this.priorityCredits.P0 = this.priorityWeights.P0; ++ this.priorityCredits.P1 = this.priorityWeights.P1; ++ this.priorityCredits.P2 = this.priorityWeights.P2; ++ } ++ ++ private recordPriorityDispatch(priority: TrafficPriority): void { ++ if (this.priorityCredits[priority] > 0) { ++ this.priorityCredits[priority] -= 1; ++ } ++ } ++ ++ private getPriorityDispatchOrder(): TrafficPriority[] { ++ const prioritiesWithWork = this.priorityOrder.filter( ++ (priority) => this.getQueuedCount(priority) > 0, ++ ); ++ if (prioritiesWithWork.length === 0) return []; ++ ++ let available = prioritiesWithWork.filter((priority) => this.priorityCredits[priority] > 0); ++ if (available.length === 0) { ++ this.refillPriorityCredits(); ++ available = prioritiesWithWork.filter((priority) => this.priorityCredits[priority] > 0); ++ } ++ ++ return available.length === 0 ? prioritiesWithWork : available; ++ } ++ ++ private getNextTenantCandidate( ++ priority: TrafficPriority, ++ ): { item: QueuedRequest; queue: QueuedRequest[]; tenantId: string } | undefined { ++ const state = this.queues[priority]; ++ if (state.order.length === 0) return undefined; ++ const maxAttempts = state.order.length; ++ let attempts = 0; ++ ++ while (attempts < maxAttempts && state.order.length > 0) { ++ const index = state.index % state.order.length; ++ const tenantId = state.order[index]; ++ const queue = state.queues.get(tenantId); ++ attempts += 1; ++ ++ if (!queue || queue.length === 0) { ++ this.removeTenantQueue(priority, tenantId); ++ continue; ++ } ++ ++ state.index = (index + 1) % state.order.length; ++ return { item: queue[0], queue, tenantId }; ++ } ++ ++ return undefined; ++ } ++ ++ private cleanupTenantQueue( ++ priority: TrafficPriority, ++ tenantId: string, ++ queue: QueuedRequest[], ++ ): void { ++ if (queue.length > 0) return; ++ this.removeTenantQueue(priority, tenantId); ++ } ++ ++ private removeTenantQueue(priority: TrafficPriority, tenantId: string): void { ++ const state = this.queues[priority]; ++ state.queues.delete(tenantId); ++ const index = state.order.indexOf(tenantId); ++ if (index === -1) return; ++ state.order.splice(index, 1); ++ if (state.order.length === 0) { ++ state.index = 0; ++ return; ++ } ++ if (state.index > index) { ++ state.index -= 1; ++ } ++ if (state.index >= state.order.length) { ++ state.index = 0; ++ } ++ } ++ ++ private resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority { ++ return metadata?.priority ?? "P1"; ++ } ++ ++ private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { ++ return this.rateLimitKeyBuilder(metadata); ++ } ++ ++ private resolveAdaptiveLimit(next: QueuedRequest, now: number): DispatchDecision | null { ++ const rateLimitKey = next.rateLimitKey ?? 
this.buildRateLimitKey(next.request.metadata); ++ const adaptiveKey = this.buildAdaptiveKey(next.request.metadata, next.tenantId, rateLimitKey); ++ const state = this.adaptiveLimiterState.get(adaptiveKey); ++ if (!state) return null; ++ ++ this.applyAdaptiveDecay(state, now); ++ if (state.cooldownUntil !== undefined && now < state.cooldownUntil) { ++ return { kind: "wait", wakeUpAt: state.cooldownUntil }; ++ } ++ ++ return null; ++ } ++ ++ private recordAdaptiveRateLimitHit(key: string, retryAfterMs?: number): void { ++ const state = this.getAdaptiveState(key); ++ const now = Date.now(); ++ const { windowMs, threshold, minPenaltyMs, maxPenaltyMs, penaltyMultiplier } = ++ this.adaptiveLimiterConfig; ++ ++ state.last429At = now; ++ state.recent429s = state.recent429s.filter((timestamp) => now - timestamp <= windowMs); ++ state.recent429s.push(now); ++ ++ if (state.recent429s.length < threshold) { ++ return; ++ } ++ ++ const basePenalty = state.penaltyMs > 0 ? state.penaltyMs : minPenaltyMs; ++ const nextPenalty = Math.min( ++ maxPenaltyMs, ++ Math.max(minPenaltyMs, Math.round(basePenalty * penaltyMultiplier)), ++ ); ++ state.penaltyMs = nextPenalty; ++ const retryPenalty = typeof retryAfterMs === "number" ? retryAfterMs : 0; ++ const cooldownMs = Math.max(nextPenalty, retryPenalty); ++ state.cooldownUntil = now + cooldownMs; ++ } ++ ++ private recordAdaptiveSuccess(key: string): void { ++ const state = this.adaptiveLimiterState.get(key); ++ if (!state) return; ++ ++ const now = Date.now(); ++ this.applyAdaptiveDecay(state, now); ++ if (state.penaltyMs === 0) { ++ state.cooldownUntil = undefined; ++ state.recent429s = []; ++ state.last429At = undefined; ++ } ++ } ++ ++ private applyAdaptiveDecay(state: AdaptiveLimiterState, now: number): void { ++ const { decayMs, penaltyMultiplier } = this.adaptiveLimiterConfig; ++ if (state.last429At && now - state.last429At < decayMs) { ++ return; ++ } ++ ++ if (state.penaltyMs > 0) { ++ state.penaltyMs = Math.max(0, Math.floor(state.penaltyMs / penaltyMultiplier)); ++ } ++ } ++ ++ private getAdaptiveState(key: string): AdaptiveLimiterState { ++ const existing = this.adaptiveLimiterState.get(key); ++ if (existing) return existing; ++ const created: AdaptiveLimiterState = { ++ recent429s: [], ++ penaltyMs: 0, ++ }; ++ this.adaptiveLimiterState.set(key, created); ++ return created; ++ } ++ ++ private buildAdaptiveKey( ++ metadata: TrafficRequestMetadata | undefined, ++ tenantId: string, ++ rateLimitKey: string, ++ ): string { ++ if (rateLimitKey.includes("tenant=")) { ++ return rateLimitKey; ++ } ++ const tenant = metadata?.tenantId ?? tenantId ?? "default"; ++ return `${rateLimitKey}::tenant=${encodeURIComponent(tenant)}`; ++ } ++ ++ private buildTrafficResponseMetadata( ++ item: QueuedRequest, ++ rateLimitKey: string, ++ now: number, ++ error?: unknown, ++ ): TrafficResponseMetadata { ++ const snapshot = this.rateLimitSnapshots.get(rateLimitKey); ++ const retryAfterMs = this.resolveRetryAfterMs(error, snapshot); ++ const queuedForMs = ++ item.dispatchedAt !== undefined ? item.dispatchedAt - item.enqueuedAt : now - item.enqueuedAt; ++ const queueEtaMs = item.etaMs ?? Math.max(0, queuedForMs); ++ ++ return { ++ rateLimitKey, ++ retryAfterMs, ++ rateLimitRemaining: snapshot?.remaining, ++ rateLimitResetAt: snapshot?.resetAt, ++ rateLimitResetInMs: ++ snapshot?.resetAt !== undefined ? 
Math.max(0, snapshot.resetAt - now) : undefined, ++ queueEtaMs, ++ tenantId: item.tenantId, ++ priority: item.request.metadata?.priority, ++ taskType: item.request.metadata?.taskType, ++ }; ++ } ++ ++ private buildTrafficResponseMetadataFromMetadata( ++ metadata: TrafficRequestMetadata | undefined, ++ rateLimitKey: string, ++ now: number, ++ error?: unknown, ++ ): TrafficResponseMetadata { ++ const snapshot = this.rateLimitSnapshots.get(rateLimitKey); ++ const retryAfterMs = this.resolveRetryAfterMs(error, snapshot); ++ ++ return { ++ rateLimitKey, ++ retryAfterMs, ++ rateLimitRemaining: snapshot?.remaining, ++ rateLimitResetAt: snapshot?.resetAt, ++ rateLimitResetInMs: ++ snapshot?.resetAt !== undefined ? Math.max(0, snapshot.resetAt - now) : undefined, ++ tenantId: metadata?.tenantId, ++ priority: metadata?.priority, ++ taskType: metadata?.taskType, ++ }; ++ } ++ ++ private attachTrafficMetadata(target: unknown, info: TrafficResponseMetadata): void { ++ if (!target || typeof target !== "object") return; ++ (target as Record).traffic = info; ++ } ++ ++ private resolveRetryAfterMs( ++ error: unknown | undefined, ++ snapshot?: RateLimitSnapshot, ++ ): number | undefined { ++ if (error && typeof error === "object" && "retryAfterMs" in error) { ++ const candidate = (error as { retryAfterMs?: unknown }).retryAfterMs; ++ if (typeof candidate === "number" && Number.isFinite(candidate)) { ++ return candidate; ++ } ++ } ++ if (snapshot?.retryAfterMs !== undefined) { ++ return snapshot.retryAfterMs; ++ } ++ return undefined; ++ } ++ ++ private resolveRateLimitStrategy( ++ key: string, ++ config?: RateLimitStrategyConfig, ++ ): RateLimitStrategyKind { ++ const modelOverride = config?.models?.[key]; ++ if (modelOverride) return modelOverride; ++ const provider = key.split("::")[0] ?? ""; ++ const providerOverride = config?.providers?.[provider]; ++ if (providerOverride) return providerOverride; ++ if (provider.startsWith("openai")) return "window"; ++ return "token-bucket"; ++ } ++} ++ ++/* ============================================================ ++ * Error + Singleton ++ * ============================================================ ++ */ ++ ++let singletonController: TrafficController | undefined; ++ ++export function getTrafficController(options?: TrafficControllerOptions): TrafficController { ++ if (!singletonController) { ++ singletonController = new TrafficController(options); ++ } ++ return singletonController; ++} ++ ++function buildRateLimitKeyFromMetadata(metadata?: TrafficRequestMetadata): string { ++ const provider = metadata?.provider ?? "default-provider"; ++ const model = metadata?.model ?? "default-model"; ++ const parts = [provider, model]; ++ ++ // SOP: Add new metadata fields in one place with a stable label and ordering. ++ // 1) Add the optional field to TrafficRequestMetadata. ++ // 2) Add it here with a stable label so keys stay predictable. ++ // Example: { label: "org", value: metadata?.orgId } ++ const optionalFields: Array<{ label: string; value?: string }> = [ ++ { label: "apiKey", value: metadata?.apiKeyId }, ++ { label: "region", value: metadata?.region }, ++ { label: "endpoint", value: metadata?.endpoint }, ++ // Intentionally exclude tenantId to enforce provider/model limits across tenants. ++ // Use rateLimitKeyBuilder to include tenant for per-tenant rate limits. 
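++    // Sketch: a tenant-scoped builder supplied via TrafficControllerOptions
++    // (only if per-tenant limits are desired; this is not the default):
++    //   const controller = new TrafficController({
++    //     rateLimitKeyBuilder: (m) =>
++    //       [m?.provider ?? "default-provider", m?.model ?? "default-model",
++    //        `tenant=${encodeURIComponent(m?.tenantId ?? "default")}`].join("::"),
++    //   });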
++ { label: "tenantTier", value: metadata?.tenantTier }, ++ { label: "taskType", value: metadata?.taskType }, ++ ]; ++ ++ for (const field of optionalFields) { ++ if (!field.value) continue; ++ parts.push(`${field.label}=${encodeURIComponent(field.value)}`); ++ } ++ ++ return parts.join("::"); ++} ++ ++function buildProviderModelKeyFromMetadata(metadata?: TrafficRequestMetadata): string { ++ const provider = metadata?.provider ?? "default-provider"; ++ const model = metadata?.model ?? "default-model"; ++ return `${provider}::${model}`; ++} +diff --git a/packages/core/src/traffic/traffic-error-utils.ts b/packages/core/src/traffic/traffic-error-utils.ts +new file mode 100644 +index 00000000..4cbb98b5 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-error-utils.ts +@@ -0,0 +1,148 @@ ++import type { Logger } from "../logger"; ++ ++function readObjectProperty(value: unknown, key: string): unknown { ++ if (!value || typeof value !== "object") return undefined; ++ return (value as Record)[key]; ++} ++ ++export function findHeaders(value: unknown): unknown[] { ++ const candidates: unknown[] = [ ++ readObjectProperty(value, "headers"), ++ readObjectProperty(readObjectProperty(value, "response"), "headers"), ++ readObjectProperty(readObjectProperty(value, "cause"), "headers"), ++ readObjectProperty( ++ readObjectProperty(readObjectProperty(value, "cause"), "response"), ++ "headers", ++ ), ++ ]; ++ ++ return candidates.filter((candidate) => candidate !== undefined && candidate !== null); ++} ++ ++export function readHeaderValue(headers: unknown, name: string): string | undefined { ++ if (!headers) return undefined; ++ ++ if (typeof (headers as { get?: unknown }).get === "function") { ++ const v = (headers as { get: (name: string) => unknown }).get(name); ++ return v === null || v === undefined ? undefined : String(v); ++ } ++ ++ if (typeof headers !== "object") return undefined; ++ ++ const entries = Object.entries(headers as Record); ++ const target = name.toLowerCase(); ++ const match = entries.find(([k]) => String(k).toLowerCase() === target); ++ if (!match) return undefined; ++ ++ const value = match[1]; ++ if (Array.isArray(value)) { ++ const first = value[0]; ++ return first === null || first === undefined ? undefined : String(first); ++ } ++ return value === null || value === undefined ? undefined : String(value); ++} ++ ++export function parseRetryAfterMs(value: string, nowMs: number = Date.now()): number | undefined { ++ const raw = value.trim(); ++ if (!raw) return undefined; ++ ++ const seconds = Number(raw); ++ if (Number.isFinite(seconds)) { ++ return Math.max(0, Math.round(seconds * 1000)); ++ } ++ ++ const parsedAt = Date.parse(raw); ++ if (Number.isFinite(parsedAt)) { ++ return Math.max(0, parsedAt - nowMs); ++ } ++ ++ return undefined; ++} ++ ++export function coerceStatus(value: unknown): number | undefined { ++ const n = Number(value); ++ return Number.isFinite(n) ? n : undefined; ++} ++ ++export function extractStatusCode(error: unknown, logger?: Logger): number | undefined { ++ const status = ++ coerceStatus(readObjectProperty(error, "status")) ?? ++ coerceStatus(readObjectProperty(error, "statusCode")) ?? ++ coerceStatus(readObjectProperty(error, "httpStatus")) ?? ++ coerceStatus(readObjectProperty(readObjectProperty(error, "response"), "status")) ?? 
++ coerceStatus(readObjectProperty(readObjectProperty(error, "cause"), "status")); ++ ++ logger?.trace?.("Extracted status code", { ++ status, ++ hasStatus: readObjectProperty(error, "status") !== undefined, ++ hasStatusCode: readObjectProperty(error, "statusCode") !== undefined, ++ hasHttpStatus: readObjectProperty(error, "httpStatus") !== undefined, ++ hasResponseStatus: ++ readObjectProperty(readObjectProperty(error, "response"), "status") !== undefined, ++ hasCauseStatus: readObjectProperty(readObjectProperty(error, "cause"), "status") !== undefined, ++ }); ++ ++ return status; ++} ++ ++export function extractRetryAfterMs(error: unknown, logger?: Logger): number | undefined { ++ const retryAfterLogger = logger?.child({ module: "retry-after" }); ++ const candidates = findHeaders(error); ++ ++ for (const headers of candidates) { ++ const raw = readHeaderValue(headers, "retry-after"); ++ if (!raw) continue; ++ const parsed = parseRetryAfterMs(raw); ++ retryAfterLogger?.trace?.("Parsed Retry-After header", { raw, parsedMs: parsed }); ++ if (parsed !== undefined) return parsed; ++ } ++ ++ retryAfterLogger?.trace?.("Retry-After header missing or unparsable"); ++ return undefined; ++} ++ ++export function isTimeoutError(error: unknown, logger?: Logger): boolean { ++ const candidates: unknown[] = [error]; ++ ++ const cause = readObjectProperty(error, "cause"); ++ if (cause) { ++ candidates.push(cause); ++ const nestedCause = readObjectProperty(cause, "cause"); ++ if (nestedCause) candidates.push(nestedCause); ++ } ++ ++ for (const candidate of candidates) { ++ const code = readObjectProperty(candidate, "code"); ++ const name = readObjectProperty(candidate, "name"); ++ const message = readObjectProperty(candidate, "message"); ++ ++ const codeText = String(code ?? "").toLowerCase(); ++ const nameText = String(name ?? "").toLowerCase(); ++ const messageText = String(message ?? "").toLowerCase(); ++ ++ const isTimeout = ++ codeText.includes("timeout") || ++ codeText.includes("timedout") || ++ nameText.includes("timeout") || ++ nameText.includes("timedout") || ++ messageText.includes("timeout") || ++ messageText.includes("timedout") || ++ messageText.includes("timed out"); ++ ++ logger?.trace?.("Checked timeout error", { ++ isTimeout, ++ code, ++ name, ++ messagePreview: typeof message === "string" ? 
message.slice(0, 160) : message, ++ hasCause: candidate !== error, ++ }); ++ ++ if (isTimeout) return true; ++ } ++ ++ return false; ++} ++ ++export function isPromiseLike(value: unknown): value is PromiseLike { ++ return !!value && typeof (value as { then?: unknown }).then === "function"; ++} +diff --git a/packages/core/src/traffic/traffic-errors.ts b/packages/core/src/traffic/traffic-errors.ts +new file mode 100644 +index 00000000..4943c89f +--- /dev/null ++++ b/packages/core/src/traffic/traffic-errors.ts +@@ -0,0 +1,141 @@ ++import type { Logger } from "../logger"; ++import { extractRetryAfterMs, extractStatusCode } from "./traffic-error-utils"; ++import type { TrafficRequestMetadata } from "./traffic-types"; ++ ++export type RateLimitErrorOptions = { ++ metadata?: TrafficRequestMetadata; ++ retryAfterMs?: number; ++ tenantId?: string; ++ key?: string; ++}; ++ ++export class CircuitBreakerOpenError extends Error { ++ readonly retryAfterMs?: number; ++ readonly metadata?: TrafficRequestMetadata; ++ ++ constructor(message: string, metadata?: TrafficRequestMetadata, retryAfterMs?: number) { ++ super(message); ++ this.name = "CircuitBreakerOpenError"; ++ this.metadata = metadata; ++ this.retryAfterMs = retryAfterMs; ++ } ++} ++ ++export class QueueWaitTimeoutError extends Error { ++ readonly waitedMs: number; ++ readonly maxQueueWaitMs?: number; ++ readonly deadlineAt?: number; ++ readonly metadata?: TrafficRequestMetadata; ++ readonly rateLimitKey?: string; ++ ++ constructor(options: { ++ waitedMs: number; ++ maxQueueWaitMs?: number; ++ deadlineAt?: number; ++ metadata?: TrafficRequestMetadata; ++ rateLimitKey?: string; ++ }) { ++ super("Queue wait time exceeded"); ++ this.name = "QueueWaitTimeoutError"; ++ this.waitedMs = options.waitedMs; ++ this.maxQueueWaitMs = options.maxQueueWaitMs; ++ this.deadlineAt = options.deadlineAt; ++ this.metadata = options.metadata; ++ this.rateLimitKey = options.rateLimitKey; ++ } ++} ++ ++export class RateLimitedUpstreamError extends Error { ++ readonly status = 429; ++ readonly retryAfterMs?: number; ++ readonly metadata?: TrafficRequestMetadata; ++ readonly provider?: string; ++ readonly model?: string; ++ readonly tenantId?: string; ++ readonly key?: string; ++ ++ constructor( ++ message: string, ++ metadata?: TrafficRequestMetadata, ++ retryAfterMs?: number, ++ options?: { tenantId?: string; key?: string }, ++ ); ++ constructor(message: string, options?: RateLimitErrorOptions); ++ constructor( ++ message: string, ++ metadataOrOptions?: TrafficRequestMetadata | RateLimitErrorOptions, ++ retryAfterMs?: number, ++ legacyOptions?: { tenantId?: string; key?: string }, ++ ) { ++ super(message); ++ this.name = "RateLimitedUpstreamError"; ++ const isOptions = ++ metadataOrOptions && ++ (Object.prototype.hasOwnProperty.call(metadataOrOptions, "metadata") || ++ Object.prototype.hasOwnProperty.call(metadataOrOptions, "retryAfterMs") || ++ Object.prototype.hasOwnProperty.call(metadataOrOptions, "key")); ++ ++ const metadata = isOptions ++ ? (metadataOrOptions as RateLimitErrorOptions).metadata ++ : (metadataOrOptions as TrafficRequestMetadata | undefined); ++ const retryAfter = isOptions ++ ? (metadataOrOptions as RateLimitErrorOptions).retryAfterMs ++ : retryAfterMs; ++ const tenantId = isOptions ++ ? (metadataOrOptions as RateLimitErrorOptions).tenantId ++ : legacyOptions?.tenantId; ++ const key = isOptions ? 
(metadataOrOptions as RateLimitErrorOptions).key : legacyOptions?.key; ++ ++ this.metadata = metadata; ++ this.retryAfterMs = retryAfter; ++ this.provider = metadata?.provider; ++ this.model = metadata?.model; ++ this.tenantId = tenantId ?? metadata?.tenantId; ++ this.key = key; ++ } ++} ++ ++export function normalizeRateLimitError(options: { ++ error: unknown; ++ metadata?: TrafficRequestMetadata; ++ tenantId?: string; ++ key?: string; ++ logger?: Logger; ++}): RateLimitedUpstreamError | undefined { ++ const { error, metadata, tenantId, key, logger } = options; ++ const retryAfterMs = ++ error instanceof RateLimitedUpstreamError ++ ? (error.retryAfterMs ?? extractRetryAfterMs(error, logger)) ++ : extractRetryAfterMs(error, logger); ++ ++ if (error instanceof RateLimitedUpstreamError) { ++ const baseMetadata = metadata ?? error.metadata; ++ const baseTenant = tenantId ?? error.tenantId; ++ const baseKey = key ?? error.key; ++ if ( ++ error.metadata === baseMetadata && ++ error.retryAfterMs === retryAfterMs && ++ error.tenantId === baseTenant && ++ error.key === baseKey ++ ) { ++ return error; ++ } ++ return new RateLimitedUpstreamError(error.message, { ++ metadata: baseMetadata, ++ retryAfterMs, ++ tenantId: baseTenant, ++ key: baseKey, ++ }); ++ } ++ ++ const status = extractStatusCode(error, logger); ++ if (status !== 429) return undefined; ++ ++ const message = error instanceof Error ? error.message : "Rate limit exceeded"; ++ return new RateLimitedUpstreamError(message, { ++ metadata, ++ retryAfterMs, ++ tenantId, ++ key, ++ }); ++} +diff --git a/packages/core/src/traffic/traffic-rate-limiter.ts b/packages/core/src/traffic/traffic-rate-limiter.ts +new file mode 100644 +index 00000000..3e5aefbe +--- /dev/null ++++ b/packages/core/src/traffic/traffic-rate-limiter.ts +@@ -0,0 +1,295 @@ ++import type { Logger } from "../logger"; ++import type { ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++} from "./rate-limit-strategies/rate-limit-strategy"; ++import { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy"; ++import type { DispatchDecision, QueuedRequest } from "./traffic-controller-internal"; ++import type { RateLimitConfig, TrafficRequestMetadata } from "./traffic-types"; ++ ++export type { ++ RateLimitHeaderSnapshot, ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++} from "./rate-limit-strategies/rate-limit-strategy"; ++export { DefaultRateLimitStrategy } from "./rate-limit-strategies/default-rate-limit-strategy"; ++export { OpenAIWindowRateLimitStrategy } from "./rate-limit-strategies/openai-window-rate-limit-strategy"; ++export { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy"; ++ ++type SchedulerCallback = () => void; ++ ++export type RateLimitStrategyFactory = (key: string) => RateLimitStrategy; ++ ++type UsageCounters = { ++ inputTokens?: number; ++ outputTokens?: number; ++ totalTokens?: number; ++}; ++ ++type TokenRateState = { ++ capacity: number; ++ refillPerSecond: number; ++ tokens: number; ++ updatedAt: number; ++}; ++ ++export class TrafficRateLimiter { ++ private readonly strategies = new Map(); ++ private readonly tokenRates = new Map(); ++ private wakeUpTimeout?: ReturnType; ++ private wakeUpAt?: number; ++ private readonly onWakeUp: SchedulerCallback; ++ private readonly strategyFactory: RateLimitStrategyFactory; ++ private readonly rateLimits?: RateLimitConfig; ++ ++ constructor( ++ onWakeUp: SchedulerCallback, ++ options?: { strategyFactory?: RateLimitStrategyFactory; 
rateLimits?: RateLimitConfig }, ++ ) { ++ this.onWakeUp = onWakeUp; ++ this.rateLimits = options?.rateLimits; ++ this.strategyFactory = ++ options?.strategyFactory ?? ++ ((key) => new TokenBucketRateLimitStrategy(key, this.rateLimits?.[key])); ++ } ++ ++ resolve(next: QueuedRequest, key: string, logger?: Logger): DispatchDecision | null { ++ const strategy = this.strategies.get(key) ?? this.createStrategy(key, logger); ++ const requestDecision = strategy.resolve(next, logger); ++ if (requestDecision?.kind === "wait") { ++ const tokenDecision = strategy.handlesTokenLimits ++ ? null ++ : this.resolveTokenLimit(next, key, logger, false); ++ if (tokenDecision?.kind === "wait") { ++ const requestWakeUp = requestDecision.wakeUpAt; ++ const tokenWakeUp = tokenDecision.wakeUpAt; ++ if (tokenWakeUp !== undefined && requestWakeUp !== undefined) { ++ return { kind: "wait", wakeUpAt: Math.min(requestWakeUp, tokenWakeUp) }; ++ } ++ if (tokenWakeUp !== undefined && requestWakeUp === undefined) { ++ return tokenDecision; ++ } ++ } ++ return requestDecision; ++ } ++ ++ const tokenDecision = strategy.handlesTokenLimits ++ ? null ++ : this.resolveTokenLimit(next, key, logger, true); ++ if (tokenDecision?.kind === "wait") { ++ return tokenDecision; ++ } ++ ++ return requestDecision; ++ } ++ ++ notifyDispatch(key: string | undefined, logger?: Logger): void { ++ if (!key) return; ++ this.strategies.get(key)?.onDispatch(logger); ++ } ++ ++ scheduleWakeUpAt(wakeUpAt: number, logger?: Logger): void { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); ++ const target = Math.max(now, wakeUpAt); ++ ++ if (this.wakeUpTimeout && this.wakeUpAt !== undefined && this.wakeUpAt <= target) { ++ rateLimitLogger?.trace?.("Wakeup already scheduled earlier; skipping", { ++ currentWakeUpAt: this.wakeUpAt, ++ requestedWakeUpAt: target, ++ }); ++ return; ++ } ++ ++ if (this.wakeUpTimeout) clearTimeout(this.wakeUpTimeout); ++ ++ this.wakeUpAt = target; ++ rateLimitLogger?.debug?.("Scheduling rate limit wakeup", { ++ wakeUpAt: target, ++ inMs: Math.max(1, target - now), ++ }); ++ this.wakeUpTimeout = setTimeout( ++ () => { ++ this.wakeUpTimeout = undefined; ++ this.wakeUpAt = undefined; ++ rateLimitLogger?.debug?.("Rate limit wakeup fired"); ++ this.onWakeUp(); ++ }, ++ Math.max(1, target - now), ++ ); ++ } ++ ++ releaseReservation(key?: string, logger?: Logger): void { ++ if (!key) return; ++ this.strategies.get(key)?.onComplete(logger); ++ } ++ ++ recordUsage( ++ key: string | undefined, ++ usage: UsageCounters | Promise | undefined, ++ logger?: Logger, ++ reservedTokens?: number, ++ ): void { ++ if (!key || !usage) return; ++ if (typeof (usage as PromiseLike).then === "function") { ++ void (usage as Promise) ++ .then((resolved) => this.recordUsage(key, resolved, logger, reservedTokens)) ++ .catch(() => {}); ++ return; ++ } ++ ++ const strategy = this.strategies.get(key); ++ if (strategy?.recordUsage) { ++ strategy.recordUsage(usage, logger, reservedTokens); ++ return; ++ } ++ ++ const tokens = this.resolveTokenCount(usage); ++ if (tokens <= 0) return; ++ ++ const bucket = this.getTokenRateState(key, logger); ++ if (!bucket) return; ++ ++ const now = Date.now(); ++ this.refillTokenRate(bucket, now); ++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens); ++ const reserved = typeof reservedTokens === "number" ? 
reservedTokens : 0; ++ const delta = tokens - reserved; ++ if (delta > 0) { ++ bucket.tokens -= delta; ++ } else if (delta < 0) { ++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + Math.abs(delta)); ++ } ++ ++ if (bucket.tokens < 0 && bucket.refillPerSecond > 0) { ++ const waitMs = Math.max(1, Math.ceil((-bucket.tokens / bucket.refillPerSecond) * 1000)); ++ this.scheduleWakeUpAt(now + waitMs, logger); ++ } ++ } ++ ++ updateFromHeaders( ++ metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ key: string, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined { ++ const existing = this.strategies.get(key); ++ if (existing) return existing.updateFromHeaders(metadata, headers, logger); ++ ++ const created = this.strategyFactory(key); ++ const update = created.updateFromHeaders(metadata, headers, logger); ++ if (!update) return undefined; ++ this.strategies.set(key, created); ++ return update; ++ } ++ ++ private createStrategy(key: string, logger?: Logger): RateLimitStrategy { ++ const created = this.strategyFactory(key); ++ this.strategies.set(key, created); ++ logger?.child({ module: "rate-limiter" })?.trace?.("Created rate limit strategy", { ++ rateLimitKey: key, ++ strategy: created.constructor.name, ++ }); ++ return created; ++ } ++ ++ private resolveTokenLimit( ++ next: QueuedRequest, ++ key: string, ++ logger?: Logger, ++ reserveTokens = true, ++ ): DispatchDecision | null { ++ const bucket = this.getTokenRateState(key, logger); ++ if (!bucket) return null; ++ ++ const now = Date.now(); ++ this.refillTokenRate(bucket, now); ++ ++ if (bucket.capacity <= 0) { ++ logger?.child({ module: "rate-limiter" })?.debug?.("Token limit misconfigured; blocking", { ++ rateLimitKey: key, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ }); ++ return { kind: "wait" }; ++ } ++ ++ const estimatedTokens = next.estimatedTokens; ++ if (typeof estimatedTokens === "number" && estimatedTokens > 0) { ++ if (bucket.tokens >= estimatedTokens) { ++ if (reserveTokens) { ++ bucket.tokens -= estimatedTokens; ++ next.reservedTokens = estimatedTokens; ++ } ++ return null; ++ } ++ } else if (bucket.tokens >= 0) { ++ return null; ++ } ++ ++ if (bucket.refillPerSecond <= 0) { ++ logger?.child({ module: "rate-limiter" })?.debug?.("Token limit has no refill; blocking", { ++ rateLimitKey: key, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ }); ++ return { kind: "wait" }; ++ } ++ ++ const requiredTokens = ++ typeof estimatedTokens === "number" && estimatedTokens > 0 ++ ? Math.max(estimatedTokens - bucket.tokens, 1) ++ : -bucket.tokens; ++ const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000)); ++ return { kind: "wait", wakeUpAt: now + waitMs }; ++ } ++ ++ private getTokenRateState(key: string, logger?: Logger): TokenRateState | undefined { ++ const existing = this.tokenRates.get(key); ++ if (existing) return existing; ++ ++ const options = this.rateLimits?.[key]; ++ if (!options) return undefined; ++ ++ const tokensPerMinute = Number(options.tokensPerMinute); ++ if (!Number.isFinite(tokensPerMinute) || tokensPerMinute <= 0) { ++ return undefined; ++ } ++ ++ // Token pacing uses a 1-minute burst by default; request bursts are handled separately. 
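++ // Worked example, assuming a configured tokensPerMinute of 60_000: the bucket
++ // refills at 1_000 tokens/sec with a one-minute burst capacity of 60_000 tokens,
++ // so a request that overdraws it by 30_000 tokens (see recordUsage above)
++ // schedules a wakeup roughly 30 seconds out.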
++ const refillPerSecond = tokensPerMinute / 60; ++ const capacity = tokensPerMinute; ++ const now = Date.now(); ++ const created: TokenRateState = { ++ capacity, ++ refillPerSecond, ++ tokens: capacity, ++ updatedAt: now, ++ }; ++ this.tokenRates.set(key, created); ++ logger?.child({ module: "rate-limiter" })?.trace?.("Created token rate state", { ++ rateLimitKey: key, ++ capacity, ++ refillPerSecond, ++ }); ++ return created; ++ } ++ ++ private refillTokenRate(bucket: TokenRateState, now: number): void { ++ const elapsedMs = now - bucket.updatedAt; ++ if (elapsedMs <= 0) return; ++ bucket.updatedAt = now; ++ if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return; ++ const refill = (elapsedMs / 1000) * bucket.refillPerSecond; ++ if (refill <= 0) return; ++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill); ++ } ++ ++ private resolveTokenCount(usage: UsageCounters): number { ++ const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined; ++ if (total !== undefined) return total; ++ const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0; ++ const output = Number.isFinite(usage.outputTokens) ? usage.outputTokens : 0; ++ return input + output; ++ } ++} +diff --git a/packages/core/src/traffic/traffic-retry.spec.ts b/packages/core/src/traffic/traffic-retry.spec.ts +new file mode 100644 +index 00000000..2360ca10 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-retry.spec.ts +@@ -0,0 +1,45 @@ ++import { describe, expect, it, vi } from "vitest"; ++import { buildRetryPlan } from "./traffic-retry"; ++ ++describe("buildRetryPlan", () => { ++ it("respects Retry-After for 429s", () => { ++ const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0); ++ try { ++ const plan = buildRetryPlan( ++ { ++ status: 429, ++ response: { headers: { "retry-after": "2" } }, ++ }, ++ 1, ++ ); ++ ++ expect(plan).toBeTruthy(); ++ expect(plan?.reason).toBe("rateLimit"); ++ expect(plan?.delayMs).toBeGreaterThanOrEqual(2_000); ++ } finally { ++ randomSpy.mockRestore(); ++ } ++ }); ++ ++ it("parses HTTP-date Retry-After values", () => { ++ vi.useFakeTimers(); ++ const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0); ++ ++ try { ++ vi.setSystemTime(new Date("2020-01-01T00:00:00.000Z")); ++ const plan = buildRetryPlan( ++ { ++ statusCode: 429, ++ response: { headers: { "retry-after": "Wed, 01 Jan 2020 00:00:03 GMT" } }, ++ }, ++ 1, ++ ); ++ ++ expect(plan).toBeTruthy(); ++ expect(plan?.delayMs).toBeGreaterThanOrEqual(3_000); ++ } finally { ++ vi.useRealTimers(); ++ randomSpy.mockRestore(); ++ } ++ }); ++}); +diff --git a/packages/core/src/traffic/traffic-retry.ts b/packages/core/src/traffic/traffic-retry.ts +new file mode 100644 +index 00000000..9604dc53 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-retry.ts +@@ -0,0 +1,144 @@ ++import type { Logger } from "../logger"; ++import { ++ MAX_RETRY_ATTEMPTS, ++ RATE_LIMIT_BASE_BACKOFF_MS, ++ RATE_LIMIT_JITTER_FACTOR, ++ SERVER_ERROR_BASE_BACKOFF_MS, ++ SERVER_ERROR_JITTER_FACTOR, ++ TIMEOUT_BASE_BACKOFF_MS, ++ TIMEOUT_JITTER_FACTOR, ++ TIMEOUT_RETRY_ATTEMPTS, ++} from "./traffic-constants"; ++import { extractRetryAfterMs, extractStatusCode, isTimeoutError } from "./traffic-error-utils"; ++import { RateLimitedUpstreamError } from "./traffic-errors"; ++import type { ++ RetryPlan, ++ RetryPolicy, ++ RetryPolicyConfig, ++ RetryPolicyContext, ++ RetryReason, ++} from "./traffic-types"; ++ ++export type { ++ RetryPlan, ++ RetryPolicy, ++ RetryPolicyConfig, ++ RetryPolicyContext, ++ RetryReason, ++} 
from "./traffic-types"; ++ ++export function buildRetryPlan( ++ error: unknown, ++ attempt: number, ++ logger?: Logger, ++): RetryPlan | undefined { ++ const retryLogger = logger?.child({ module: "retry" }); ++ const reason = getRetryReason(error, retryLogger); ++ if (!reason) { ++ retryLogger?.debug?.("No retry reason detected; skipping retry", { attempt }); ++ return undefined; ++ } ++ ++ const max = reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS; ++ if (attempt >= max) { ++ retryLogger?.debug?.("Retry attempts exhausted; skipping retry", { ++ attempt, ++ max, ++ reason, ++ }); ++ return undefined; ++ } ++ ++ const computedDelayMs = computeBackoffDelay(reason, attempt); ++ const retryAfterMs = ++ reason === "rateLimit" ++ ? error instanceof RateLimitedUpstreamError ++ ? error.retryAfterMs ++ : extractRetryAfterMs(error, retryLogger) ++ : undefined; ++ const delayMs = ++ retryAfterMs === undefined ? computedDelayMs : Math.max(computedDelayMs, retryAfterMs); ++ ++ retryLogger?.debug?.("Retry plan built", { ++ attempt, ++ reason, ++ delayMs, ++ computedDelayMs, ++ retryAfterMs, ++ max, ++ }); ++ ++ return { ++ reason, ++ delayMs, ++ }; ++} ++ ++export function buildRetryPlanWithPolicy( ++ context: RetryPolicyContext, ++ policyConfig?: RetryPolicyConfig, ++): RetryPlan | undefined { ++ const retryLogger = context.logger?.child({ module: "retry" }); ++ const policy = resolveRetryPolicy(context, policyConfig); ++ if (policy) { ++ const planned = policy(context); ++ if (planned) { ++ retryLogger?.debug?.("Retry policy returned a plan", { ++ attempt: context.attempt, ++ reason: planned.reason, ++ delayMs: planned.delayMs, ++ }); ++ return planned; ++ } ++ retryLogger?.debug?.("Retry policy declined to retry", { attempt: context.attempt }); ++ } ++ ++ return buildRetryPlan(context.error, context.attempt, context.logger); ++} ++ ++function resolveRetryPolicy( ++ context: RetryPolicyContext, ++ config?: RetryPolicyConfig, ++): RetryPolicy | undefined { ++ if (!config) return undefined; ++ const modelPolicy = context.key ? config.models?.[context.key] : undefined; ++ if (modelPolicy) return modelPolicy; ++ const providerModelKey = ++ context.metadata?.provider && context.metadata?.model ++ ? `${context.metadata.provider}::${context.metadata.model}` ++ : undefined; ++ const providerModelPolicy = providerModelKey ? config.models?.[providerModelKey] : undefined; ++ if (providerModelPolicy) return providerModelPolicy; ++ const provider = context.metadata?.provider; ++ const providerPolicy = provider ? config.providers?.[provider] : undefined; ++ if (providerPolicy) return providerPolicy; ++ return config.default; ++} ++ ++function getRetryReason(error: unknown, logger?: Logger): RetryReason | undefined { ++ if (error instanceof RateLimitedUpstreamError) return "rateLimit"; ++ const status = extractStatusCode(error, logger); ++ if (status === 429) return "rateLimit"; ++ if (status && status >= 500) return "serverError"; ++ if (status === 408 || isTimeoutError(error, logger)) return "timeout"; ++ return undefined; ++} ++ ++function computeBackoffDelay(reason: RetryReason, attempt: number): number { ++ const base = ++ reason === "serverError" ++ ? SERVER_ERROR_BASE_BACKOFF_MS ++ : reason === "timeout" ++ ? TIMEOUT_BASE_BACKOFF_MS ++ : RATE_LIMIT_BASE_BACKOFF_MS; ++ ++ const jitter = ++ reason === "serverError" ++ ? SERVER_ERROR_JITTER_FACTOR ++ : reason === "timeout" ++ ? 
TIMEOUT_JITTER_FACTOR ++ : RATE_LIMIT_JITTER_FACTOR; ++ ++ const exp = base * 2 ** (attempt - 1); ++ return Math.round(exp + exp * jitter * Math.random()); ++} +diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts +new file mode 100644 +index 00000000..1d847e25 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-types.ts +@@ -0,0 +1,181 @@ ++import type { Logger } from "../logger"; ++ ++type BivariantFunction = { ++ bivarianceHack(...args: TArgs): TReturn; ++}["bivarianceHack"]; ++ ++type UsageCounters = { ++ inputTokens?: number; ++ outputTokens?: number; ++ totalTokens?: number; ++}; ++ ++export type RetryReason = "rateLimit" | "serverError" | "timeout"; ++ ++export type RetryPlan = { ++ delayMs: number; ++ reason: RetryReason; ++}; ++ ++export type RetryPolicyContext = { ++ error: unknown; ++ attempt: number; ++ metadata?: TrafficRequestMetadata; ++ key?: string; ++ logger?: Logger; ++}; ++ ++export type RetryPolicy = (context: RetryPolicyContext) => RetryPlan | undefined; ++ ++export type RetryPolicyConfig = { ++ default?: RetryPolicy; ++ providers?: Record; ++ models?: Record; ++}; ++ ++export type TrafficRequestType = "text" | "stream"; ++export type TrafficPriority = "P0" | "P1" | "P2"; ++ ++export interface TrafficRequestMetadata { ++ agentId?: string; ++ agentName?: string; ++ model?: string; ++ provider?: string; ++ priority?: TrafficPriority; ++ tenantId?: string; ++ apiKeyId?: string; ++ region?: string; ++ endpoint?: string; ++ tenantTier?: string; ++ taskType?: string; ++ fallbackPolicyId?: string; ++} ++ ++export type TrafficResponseMetadata = { ++ rateLimitKey?: string; ++ retryAfterMs?: number; ++ rateLimitRemaining?: number; ++ rateLimitResetAt?: number; ++ rateLimitResetInMs?: number; ++ queueEtaMs?: number; ++ tenantId?: string; ++ priority?: TrafficPriority; ++ taskType?: string; ++}; ++ ++export type FallbackTarget = { ++ provider?: string; ++ model: string; ++}; ++ ++export type ShortResponseFallbackTarget = { ++ kind: "short-response"; ++ text: string; ++}; ++ ++export type FallbackChainEntry = string | FallbackTarget | ShortResponseFallbackTarget; ++ ++export type FallbackPolicyMode = "fallback" | "wait"; ++ ++export type FallbackPolicy = { ++ mode: FallbackPolicyMode; ++}; ++ ++export type FallbackPolicyConfig = { ++ defaultPolicyId?: string; ++ policies?: Record; ++ taskTypePolicyIds?: Record; ++}; ++ ++export type ProviderModelConcurrencyLimit = ++ | number ++ | Record ++ | ((metadata: TrafficRequestMetadata | undefined, key: string) => number | undefined); ++ ++export type TenantConcurrencyLimit = ++ | number ++ | Record ++ | ((tenantId: string, metadata: TrafficRequestMetadata | undefined) => number | undefined); ++ ++export type PriorityBurstLimits = Partial>; ++export type PriorityWeights = Partial>; ++ ++export type AdaptiveLimiterConfig = { ++ windowMs?: number; ++ threshold?: number; ++ minPenaltyMs?: number; ++ maxPenaltyMs?: number; ++ penaltyMultiplier?: number; ++ decayMs?: number; ++}; ++ ++export interface TrafficRequest { ++ tenantId: string; ++ metadata?: TrafficRequestMetadata; ++ execute: () => Promise; ++ deadlineAt?: number; ++ maxQueueWaitMs?: number; ++ estimatedTokens?: number; ++ createFallbackRequest?: BivariantFunction< ++ [target: FallbackChainEntry], ++ TrafficRequest | undefined ++ >; ++ extractUsage?: BivariantFunction< ++ [response: TResponse], ++ Promise | UsageCounters | undefined ++ >; ++} ++ ++export interface TrafficControllerOptions { ++ maxConcurrent?: number; ++ 
maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit; ++ maxConcurrentPerTenant?: TenantConcurrencyLimit; ++ rateLimits?: RateLimitConfig; ++ priorityBurstLimits?: PriorityBurstLimits; ++ priorityWeights?: PriorityWeights; ++ adaptiveLimiter?: AdaptiveLimiterConfig; ++ /** ++ * Optional override for rate-limit key construction. ++ * Useful when you need to add new metadata fields without changing core logic. ++ */ ++ rateLimitKeyBuilder?: (metadata?: TrafficRequestMetadata) => string; ++ /** ++ * Optional retry policy overrides by provider/model. ++ * Models keys can use the rate-limit key or provider::model. ++ */ ++ retryPolicy?: RetryPolicyConfig; ++ /** ++ * Optional fallback policy selection by task type or explicit policy id. ++ */ ++ fallbackPolicy?: FallbackPolicyConfig; ++ /** ++ * Select a rate-limit strategy by provider/model. ++ * Example: ++ * { providers: { openai: "window" }, models: { "openai::gpt-4o": "window" } } ++ */ ++ rateLimitStrategy?: RateLimitStrategyConfig; ++ logger?: Logger; ++ fallbackChains?: Record; ++} ++ ++export type RateLimitStrategyKind = "window" | "token-bucket"; ++ ++export type RateLimitStrategyConfig = { ++ providers?: Record; ++ models?: Record; ++}; ++ ++export interface RateLimitOptions { ++ requestsPerMinute: number; ++ tokensPerMinute: number; ++ burstSize?: number; ++} ++ ++export type RateLimitKey = string; ++export type RateLimitConfig = Record; ++ ++export type TenantUsage = { ++ inputTokens: number; ++ outputTokens: number; ++ totalTokens: number; ++}; +diff --git a/packages/core/src/traffic/traffic-usage-tracker.ts b/packages/core/src/traffic/traffic-usage-tracker.ts +new file mode 100644 +index 00000000..c79b311a +--- /dev/null ++++ b/packages/core/src/traffic/traffic-usage-tracker.ts +@@ -0,0 +1,83 @@ ++import type { Logger } from "../logger"; ++import type { QueuedRequest } from "./traffic-controller-internal"; ++import { isPromiseLike } from "./traffic-error-utils"; ++import type { TenantUsage } from "./traffic-types"; ++ ++type UsageCounters = { ++ inputTokens?: number; ++ outputTokens?: number; ++ totalTokens?: number; ++}; ++ ++export class TrafficUsageTracker { ++ private readonly tenantUsage = new Map(); ++ ++ getTenantUsage(tenantId: string): TenantUsage | undefined { ++ const usage = this.tenantUsage.get(tenantId); ++ return usage ? { ...usage } : undefined; ++ } ++ ++ recordUsage( ++ item: QueuedRequest, ++ result: TResponse, ++ logger?: Logger, ++ ): UsageCounters | Promise | undefined { ++ const usageLogger = logger?.child({ module: "usage-tracker" }); ++ const extractor = item.extractUsage ?? item.request.extractUsage; ++ if (!extractor) { ++ usageLogger?.trace?.("No usage extractor; skipping usage", { tenantId: item.tenantId }); ++ return undefined; ++ } ++ ++ const usage = extractor(result); ++ if (!usage) { ++ usageLogger?.trace?.("Usage extractor returned empty; skipping usage", { ++ tenantId: item.tenantId, ++ }); ++ return undefined; ++ } ++ ++ if (isPromiseLike(usage)) { ++ usageLogger?.trace?.("Usage extractor returned promise; awaiting", { ++ tenantId: item.tenantId, ++ }); ++ void usage.then((u) => u && this.incrementTenantUsage(item.tenantId, u, usageLogger)); ++ return usage; ++ } ++ this.incrementTenantUsage(item.tenantId, usage, usageLogger); ++ return usage; ++ } ++ ++ private incrementTenantUsage(tenantId: string, usage: UsageCounters, logger?: Logger): void { ++ const current = this.tenantUsage.get(tenantId) ?? 
{ ++ inputTokens: 0, ++ outputTokens: 0, ++ totalTokens: 0, ++ }; ++ ++ const input = ++ typeof usage.inputTokens === "number" && Number.isFinite(usage.inputTokens) ++ ? usage.inputTokens ++ : 0; ++ const output = ++ typeof usage.outputTokens === "number" && Number.isFinite(usage.outputTokens) ++ ? usage.outputTokens ++ : 0; ++ const total = ++ typeof usage.totalTokens === "number" && Number.isFinite(usage.totalTokens) ++ ? usage.totalTokens ++ : input + output; ++ ++ this.tenantUsage.set(tenantId, { ++ inputTokens: current.inputTokens + input, ++ outputTokens: current.outputTokens + output, ++ totalTokens: current.totalTokens + total, ++ }); ++ ++ logger?.debug?.("Tenant usage incremented", { ++ tenantId, ++ delta: { inputTokens: input, outputTokens: output, totalTokens: total }, ++ total: this.tenantUsage.get(tenantId), ++ }); ++ } ++} +diff --git a/packages/core/src/workflow/core.ts b/packages/core/src/workflow/core.ts +index 3136511c..2b273d58 100644 +--- a/packages/core/src/workflow/core.ts ++++ b/packages/core/src/workflow/core.ts +@@ -827,6 +827,9 @@ export function createWorkflow< + + // Wrap entire execution in root span + const rootSpan = traceContext.getRootSpan(); ++ if (options?.tenantId) { ++ rootSpan.setAttribute("tenant.id", options.tenantId); ++ } + + // Add workflow state snapshot for remote observability + const workflowState = { +@@ -848,6 +851,7 @@ export function createWorkflow< + executionId, + userId: options?.userId, + conversationId: options?.conversationId, ++ tenantId: options?.tenantId, + traceId: rootSpan.spanContext().traceId, + spanId: rootSpan.spanContext().spanId, + }); +diff --git a/packages/core/src/workflow/internal/state.ts b/packages/core/src/workflow/internal/state.ts +index 71fa602d..2de12528 100644 +--- a/packages/core/src/workflow/internal/state.ts ++++ b/packages/core/src/workflow/internal/state.ts +@@ -23,6 +23,7 @@ export type WorkflowState = { + executionId: string; + conversationId?: string; + userId?: string; ++ tenantId?: string; + context?: UserContext; + active: number; + startAt: Date; +@@ -132,6 +133,7 @@ class WorkflowStateManagerInternal implements WorkflowStateManager + active: config?.active ?? 0, + userId: config?.userId, + conversationId: config?.conversationId, ++ tenantId: config?.tenantId, + context: config?.context, + startAt: new Date(), + endAt: null, +diff --git a/packages/core/src/workflow/internal/utils.ts b/packages/core/src/workflow/internal/utils.ts +index fc39530b..42250d82 100644 +--- a/packages/core/src/workflow/internal/utils.ts ++++ b/packages/core/src/workflow/internal/utils.ts +@@ -32,6 +32,7 @@ export function convertWorkflowStateToParam( + executionId: state.executionId, + conversationId: state.conversationId, + userId: state.userId, ++ tenantId: state.tenantId, + context: state.context, + active: state.active, + startAt: state.startAt, +diff --git a/packages/core/src/workflow/steps/and-agent.ts b/packages/core/src/workflow/steps/and-agent.ts +index bc46c148..14af9b8f 100644 +--- a/packages/core/src/workflow/steps/and-agent.ts ++++ b/packages/core/src/workflow/steps/and-agent.ts +@@ -66,6 +66,7 @@ export function andAgent( + context: restConfig.context ?? state.context, + conversationId: restConfig.conversationId ?? state.conversationId, + userId: restConfig.userId ?? state.userId, ++ tenantId: restConfig.tenantId ?? 
state.tenantId, + // No parentSpan when there's no workflow context + }); + // Accumulate usage if available (no workflow context) +@@ -92,6 +93,7 @@ export function andAgent( + context: restConfig.context ?? state.context, + conversationId: restConfig.conversationId ?? state.conversationId, + userId: restConfig.userId ?? state.userId, ++ tenantId: restConfig.tenantId ?? state.tenantId, + // Pass the current step span as parent for proper span hierarchy + parentSpan: state.workflowContext?.currentStepSpan, + }); +diff --git a/packages/core/src/workflow/types.ts b/packages/core/src/workflow/types.ts +index f7eed282..49bfd8cb 100644 +--- a/packages/core/src/workflow/types.ts ++++ b/packages/core/src/workflow/types.ts +@@ -214,6 +214,10 @@ export interface WorkflowRunOptions { + * The conversation ID, this can be used to track the current conversation in a workflow + */ + conversationId?: string; ++ /** ++ * Tenant identifier propagated to agent steps and subcalls ++ */ ++ tenantId?: string; + /** + * The user ID, this can be used to track the current user in a workflow + */ +diff --git a/packages/scorers/src/llm/answer-correctness.ts b/packages/scorers/src/llm/answer-correctness.ts +index 2111fa31..d66cc007 100644 +--- a/packages/scorers/src/llm/answer-correctness.ts ++++ b/packages/scorers/src/llm/answer-correctness.ts +@@ -7,6 +7,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const ANSWER_CORRECTNESS_PROMPT = `Given a ground truth and an answer, analyze each statement in the answer and classify them in one of the following categories: + +@@ -84,15 +85,17 @@ export function createAnswerCorrectnessScorer< + const agent = new Agent({ + name: "answer-correctness-classifier", + model, ++ trafficPriority: "P2", + instructions: "You classify statements for answer correctness evaluation", + }); + ++ const tenantId = extractTenantId(context); + const payload = resolvePayload(context, buildPayload); + const prompt = ANSWER_CORRECTNESS_PROMPT.replace("{{question}}", payload.input) + .replace("{{answer}}", payload.output) + .replace("{{ground_truth}}", payload.expected); + +- const response = await agent.generateObject(prompt, CLASSIFICATION_SCHEMA); ++ const response = await agent.generateObject(prompt, CLASSIFICATION_SCHEMA, { tenantId }); + const normalized = normalizeClassification(response.object); + + return { +diff --git a/packages/scorers/src/llm/answer-relevancy.ts b/packages/scorers/src/llm/answer-relevancy.ts +index a3de2237..d9bda1c9 100644 +--- a/packages/scorers/src/llm/answer-relevancy.ts ++++ b/packages/scorers/src/llm/answer-relevancy.ts +@@ -8,6 +8,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const QUESTION_GEN_PROMPT = `Generate a question for the given answer and Identify if answer is noncommittal. Give noncommittal as 1 if the answer is noncommittal and 0 if the answer is committal. A noncommittal answer is one that is evasive, vague, or ambiguous. 
For example, "I don't know" or "I'm not sure" are noncommittal answers + +@@ -119,9 +120,11 @@ export function createAnswerRelevancyScorer< + const agent = new Agent({ + name: "question-generator", + model, ++ trafficPriority: "P2", + instructions: "You generate questions from answers to evaluate relevancy", + }); + ++ const tenantId = extractTenantId(context); + const payload = resolvePayload(context, buildPayload); + const questions: GeneratedQuestion[] = []; + +@@ -131,7 +134,7 @@ export function createAnswerRelevancyScorer< + payload.context, + ); + +- const response = await agent.generateObject(prompt, QUESTION_SCHEMA); ++ const response = await agent.generateObject(prompt, QUESTION_SCHEMA, { tenantId }); + questions.push({ + question: response.object.question, + noncommittal: response.object.noncommittal === 1, +diff --git a/packages/scorers/src/llm/classifiers.ts b/packages/scorers/src/llm/classifiers.ts +index 1bca4239..a327e20d 100644 +--- a/packages/scorers/src/llm/classifiers.ts ++++ b/packages/scorers/src/llm/classifiers.ts +@@ -7,6 +7,7 @@ import { + } from "@voltagent/core"; + import { safeStringify } from "@voltagent/internal/utils"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + type ChoiceId = string; + +@@ -93,11 +94,14 @@ async function evaluateChoice(args: EvaluateChoiceArgs): Promise + const agent = new Agent({ + name: `${scorerId}-judge`, + model, ++ trafficPriority: "P2", + instructions: judgeInstructions ?? buildDefaultChoiceInstructions(Object.keys(choices)), + }); + ++ const tenantId = extractTenantId(context); + const response = await agent.generateObject(prompt, CHOICE_RESPONSE_SCHEMA, { + maxOutputTokens, ++ tenantId, + }); + + const { choice, reason } = extractChoiceFromResponse(response.object, choices, scorerId); +diff --git a/packages/scorers/src/llm/context-precision.ts b/packages/scorers/src/llm/context-precision.ts +index d31b5b85..ba680f56 100644 +--- a/packages/scorers/src/llm/context-precision.ts ++++ b/packages/scorers/src/llm/context-precision.ts +@@ -7,6 +7,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const CONTEXT_PRECISION_PROMPT = `Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output. + +@@ -109,6 +110,7 @@ export function createContextPrecisionScorer< + const agent = new Agent({ + name: "context-precision-evaluator", + model, ++ trafficPriority: "P2", + instructions: "You evaluate if context was useful for arriving at the answer", + }); + +@@ -116,12 +118,15 @@ export function createContextPrecisionScorer< + const contextText = Array.isArray(payload.context) + ? 
payload.context.join("\n") + : payload.context; ++ const tenantId = extractTenantId(context); + + const prompt = CONTEXT_PRECISION_PROMPT.replace("{{question}}", payload.input) + .replace("{{context}}", contextText) + .replace("{{answer}}", payload.output); + +- const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA); ++ const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA, { ++ tenantId, ++ }); + + context.results.raw.contextPrecisionVerdict = response.object; + +diff --git a/packages/scorers/src/llm/context-recall.ts b/packages/scorers/src/llm/context-recall.ts +index e6e86510..2c6053fc 100644 +--- a/packages/scorers/src/llm/context-recall.ts ++++ b/packages/scorers/src/llm/context-recall.ts +@@ -7,6 +7,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const CONTEXT_RECALL_EXTRACT_PROMPT = `Given the context and ground truth (expected output), extract all factual statements from the ground truth. + +@@ -120,6 +121,7 @@ export function createContextRecallScorer< + const agent = new Agent({ + name: "context-recall-evaluator", + model, ++ trafficPriority: "P2", + instructions: "You evaluate how well provided context supports factual statements", + }); + +@@ -127,6 +129,7 @@ export function createContextRecallScorer< + const contextText = Array.isArray(payload.context) + ? payload.context.join("\n") + : payload.context; ++ const tenantId = extractTenantId(context); + + // Extract statements from expected output + const extractPrompt = CONTEXT_RECALL_EXTRACT_PROMPT.replace( +@@ -134,7 +137,9 @@ export function createContextRecallScorer< + contextText, + ).replace("{{expected}}", payload.expected); + +- const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA); ++ const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA, { ++ tenantId, ++ }); + const statements = extractResponse.object.statements; + + if (statements.length === 0) { +@@ -152,7 +157,9 @@ export function createContextRecallScorer< + contextText, + ).replace("{{statement}}", statement); + +- const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA); ++ const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA, { ++ tenantId, ++ }); + verdicts.push({ + statement, + verdict: verifyResponse.object.verdict, +diff --git a/packages/scorers/src/llm/context-relevancy.ts b/packages/scorers/src/llm/context-relevancy.ts +index ee882b5b..aca608b2 100644 +--- a/packages/scorers/src/llm/context-relevancy.ts ++++ b/packages/scorers/src/llm/context-relevancy.ts +@@ -7,6 +7,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const CONTEXT_RELEVANCY_PROMPT = `Analyze the provided context and identify which parts are relevant to answering the given question. For each context sentence or passage, determine its relevance level. + +@@ -144,6 +145,7 @@ export function createContextRelevancyScorer< + const agent = new Agent({ + name: "context-relevancy-evaluator", + model, ++ trafficPriority: "P2", + instructions: "You evaluate how relevant provided context is to answering questions", + }); + +@@ -151,13 +153,16 @@ export function createContextRelevancyScorer< + const contextText = Array.isArray(payload.context) + ? 
payload.context.join("\n") + : payload.context; ++ const tenantId = extractTenantId(context); + + const prompt = CONTEXT_RELEVANCY_PROMPT.replace("{{question}}", payload.input).replace( + "{{context}}", + contextText, + ); + +- const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA); ++ const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA, { ++ tenantId, ++ }); + const evaluations = response.object.evaluations; + + context.results.raw.contextRelevancyEvaluations = evaluations; +diff --git a/packages/scorers/src/llm/moderation.ts b/packages/scorers/src/llm/moderation.ts +index 03563bfe..1055927f 100644 +--- a/packages/scorers/src/llm/moderation.ts ++++ b/packages/scorers/src/llm/moderation.ts +@@ -7,6 +7,7 @@ import { + } from "@voltagent/core"; + import { safeStringify } from "@voltagent/internal/utils"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + export interface ModerationScorerOptions { + id?: string; +@@ -220,6 +221,7 @@ async function runModerationJudge(args: { + typeof context.results.prepare === "string" + ? context.results.prepare + : normalizeText(context.payload.output); ++ const tenantId = extractTenantId(context); + + const prompt = await buildPrompt({ + output: normalizedOutput, +@@ -232,12 +234,14 @@ async function runModerationJudge(args: { + const agent = new Agent({ + name: "moderation-judge", + model, ++ trafficPriority: "P2", + instructions: + "You are a safety classifier. Respond with JSON that matches the provided schema containing flagged, scores, and reason.", + }); + + const response = await agent.generateObject(prompt, MODERATION_SCHEMA, { + maxOutputTokens, ++ tenantId, + }); + + const parsed = mapModerationResponse(response.object, threshold); +diff --git a/packages/scorers/src/llm/utils.ts b/packages/scorers/src/llm/utils.ts +new file mode 100644 +index 00000000..75e886e3 +--- /dev/null ++++ b/packages/scorers/src/llm/utils.ts +@@ -0,0 +1,14 @@ ++import type { BuilderPrepareContext, BuilderScoreContext } from "@voltagent/core"; ++ ++type TenantAwareContext = BuilderScoreContext, Record> & ++ BuilderPrepareContext, Record>; ++ ++export function extractTenantId( ++ context: ++ | BuilderScoreContext, Record> ++ | BuilderPrepareContext, Record> ++ | TenantAwareContext, ++): string | undefined { ++ const candidate = (context.payload as { tenantId?: unknown })?.tenantId; ++ return typeof candidate === "string" ? 
candidate : undefined; ++} +diff --git a/packages/server-core/src/handlers/agent.handlers.ts b/packages/server-core/src/handlers/agent.handlers.ts +index 00c0f2ee..37fbeaf4 100644 +--- a/packages/server-core/src/handlers/agent.handlers.ts ++++ b/packages/server-core/src/handlers/agent.handlers.ts +@@ -1,11 +1,70 @@ +-import { ClientHTTPError, type ServerProviderDeps } from "@voltagent/core"; +-import { convertUsage } from "@voltagent/core"; ++import { ++ ClientHTTPError, ++ type ServerProviderDeps, ++ type TrafficResponseMetadata, ++ convertUsage, ++} from "@voltagent/core"; + import { type Logger, safeStringify } from "@voltagent/internal"; + import { z } from "zod"; + import { convertJsonSchemaToZod } from "zod-from-json-schema"; + import { convertJsonSchemaToZod as convertJsonSchemaToZodV3 } from "zod-from-json-schema-v3"; + import type { ApiResponse } from "../types"; + import { processAgentOptions } from "../utils/options"; ++import { buildTrafficHeaders } from "../utils/traffic"; ++ ++function extractTrafficMetadata(value: unknown): TrafficResponseMetadata | undefined { ++ if (!value || typeof value !== "object") return undefined; ++ const traffic = (value as { traffic?: unknown }).traffic; ++ if (!traffic || typeof traffic !== "object") return undefined; ++ return traffic as TrafficResponseMetadata; ++} ++ ++function wrapStreamWithTraffic( ++ baseResponse: Response, ++ traffic?: TrafficResponseMetadata, ++): Response { ++ if (!traffic) return baseResponse; ++ const headers = new Headers(baseResponse.headers); ++ const trafficHeaders = buildTrafficHeaders(traffic); ++ for (const [key, value] of Object.entries(trafficHeaders)) { ++ headers.set(key, value); ++ } ++ const baseBody = baseResponse.body; ++ if (!baseBody) { ++ return new Response(baseBody, { ++ status: baseResponse.status, ++ headers, ++ }); ++ } ++ ++ const encoder = new TextEncoder(); ++ const stream = new ReadableStream({ ++ async start(controller) { ++ const trafficEvent = `data: ${safeStringify({ type: "traffic", traffic })}\n\n`; ++ controller.enqueue(encoder.encode(trafficEvent)); ++ const reader = baseBody.getReader(); ++ try { ++ while (true) { ++ const { done, value } = await reader.read(); ++ if (done) break; ++ if (value !== undefined) { ++ controller.enqueue(value); ++ } ++ } ++ } catch (error) { ++ controller.error(error); ++ } finally { ++ reader.releaseLock(); ++ controller.close(); ++ } ++ }, ++ }); ++ ++ return new Response(stream, { ++ status: baseResponse.status, ++ headers, ++ }); ++} + + /** + * Handler for listing all agents +@@ -79,6 +138,7 @@ export async function handleGenerateText( + const options = processAgentOptions(body, signal); + + const result = await agent.generateText(input, options); ++ const traffic = extractTrafficMetadata(result); + + // Convert usage format if present + const usage = result.usage ? convertUsage(result.usage) : undefined; +@@ -102,9 +162,11 @@ export async function handleGenerateText( + } + })(), + }, ++ traffic, + }; + } catch (error) { + logger.error("Failed to generate text", { error }); ++ const traffic = extractTrafficMetadata(error); + if (error instanceof ClientHTTPError) { + return { + success: false, +@@ -112,11 +174,13 @@ export async function handleGenerateText( + code: error.code, + name: error.name, + httpStatus: error.httpStatus, ++ traffic, + }; + } + return { + success: false, + error: error instanceof Error ? 
error.message : "Unknown error", ++ traffic, + }; + } + } +@@ -153,6 +217,7 @@ export async function handleStreamText( + const options = processAgentOptions(body, signal); + + const result = await agent.streamText(input, options); ++ const traffic = extractTrafficMetadata(result); + + // Access the fullStream property + const { fullStream } = result; +@@ -178,7 +243,7 @@ export async function handleStreamText( + }, + }); + +- return new Response(stream, { ++ const response = new Response(stream, { + status: 200, + headers: { + "Content-Type": "text/event-stream", +@@ -186,20 +251,25 @@ export async function handleStreamText( + Connection: "keep-alive", + }, + }); ++ return wrapStreamWithTraffic(response, traffic); + } catch (error) { + logger.error("Failed to handle stream text request", { error }); + + const errorMessage = error instanceof Error ? error.message : "Unknown error"; ++ const traffic = extractTrafficMetadata(error); ++ const trafficHeaders = buildTrafficHeaders(traffic); + + return new Response( + safeStringify({ + error: errorMessage, + message: errorMessage, ++ traffic, + }), + { + status: 500, + headers: { + "Content-Type": "application/json", ++ ...trafficHeaders, + }, + }, + ); +@@ -238,26 +308,32 @@ export async function handleChatStream( + const options = processAgentOptions(body, signal); + + const result = await agent.streamText(input, options); ++ const traffic = extractTrafficMetadata(result); + + // Use the built-in toUIMessageStreamResponse - it handles errors properly +- return result.toUIMessageStreamResponse({ ++ const response = result.toUIMessageStreamResponse({ + sendReasoning: true, + sendSources: true, + }); ++ return wrapStreamWithTraffic(response, traffic); + } catch (error) { + logger.error("Failed to handle chat stream request", { error }); + + const errorMessage = error instanceof Error ? error.message : "Unknown error"; ++ const traffic = extractTrafficMetadata(error); ++ const trafficHeaders = buildTrafficHeaders(traffic); + + return new Response( + safeStringify({ + error: errorMessage, + message: errorMessage, ++ traffic, + }), + { + status: 500, + headers: { + "Content-Type": "application/json", ++ ...trafficHeaders, + }, + }, + ); +@@ -293,16 +369,20 @@ export async function handleGenerateObject( + ) as any; + + const result = await agent.generateObject(input, zodSchema, options); ++ const traffic = extractTrafficMetadata(result); + + return { + success: true, + data: result.object, ++ traffic, + }; + } catch (error) { + logger.error("Failed to generate object", { error }); ++ const traffic = extractTrafficMetadata(error); + return { + success: false, + error: error instanceof Error ? error.message : "Unknown error", ++ traffic, + }; + } + } +@@ -344,23 +424,29 @@ export async function handleStreamObject( + ) as any; + + const result = await agent.streamObject(input, zodSchema, options); ++ const traffic = extractTrafficMetadata(result); + + // Use the built-in toTextStreamResponse - it handles errors properly +- return result.toTextStreamResponse(); ++ const response = result.toTextStreamResponse(); ++ return wrapStreamWithTraffic(response, traffic); + } catch (error) { + logger.error("Failed to handle stream object request", { error }); + + const errorMessage = error instanceof Error ? 
error.message : "Unknown error"; ++ const traffic = extractTrafficMetadata(error); ++ const trafficHeaders = buildTrafficHeaders(traffic); + + return new Response( + safeStringify({ + error: errorMessage, + message: errorMessage, ++ traffic, + }), + { + status: 500, + headers: { + "Content-Type": "application/json", ++ ...trafficHeaders, + }, + }, + ); +diff --git a/packages/server-core/src/index.ts b/packages/server-core/src/index.ts +index 1fe7e206..2f7ed826 100644 +--- a/packages/server-core/src/index.ts ++++ b/packages/server-core/src/index.ts +@@ -40,6 +40,7 @@ export * from "./utils/server-utils"; + export * from "./utils/ui-templates"; + export * from "./utils/response-mappers"; + export * from "./utils/sse"; ++export * from "./utils/traffic"; + export * from "./utils/announcements"; + + // Export WebSocket utilities +diff --git a/packages/server-core/src/schemas/agent.schemas.ts b/packages/server-core/src/schemas/agent.schemas.ts +index 52e80b83..41181e00 100644 +--- a/packages/server-core/src/schemas/agent.schemas.ts ++++ b/packages/server-core/src/schemas/agent.schemas.ts +@@ -77,6 +77,18 @@ export const GenerateOptionsSchema = z + .object({ + userId: z.string().optional().describe("Optional user ID for context tracking"), + conversationId: z.string().optional().describe("Optional conversation ID for context tracking"), ++ tenantId: z.string().optional().describe("Optional tenant ID for traffic limits"), ++ trafficPriority: z ++ .enum(["P0", "P1", "P2"]) ++ .optional() ++ .describe("Optional traffic priority for scheduling (P0, P1, P2)"), ++ apiKeyId: z.string().optional().describe("Optional API key identifier for traffic limits"), ++ region: z.string().optional().describe("Optional region identifier for traffic limits"), ++ endpoint: z.string().optional().describe("Optional endpoint identifier for traffic limits"), ++ tenantTier: z ++ .string() ++ .optional() ++ .describe("Optional tenant tier identifier for traffic limits"), + context: z + .record(z.string(), z.unknown()) + .nullish() +@@ -94,6 +106,14 @@ export const GenerateOptionsSchema = z + .positive() + .optional() + .describe("Maximum number of steps for this request"), ++ maxQueueWaitMs: z ++ .number() ++ .int() ++ .nonnegative() ++ .optional() ++ .describe("Maximum time to wait in the queue before timing out (ms)"), ++ taskType: z.string().optional().describe("Optional task classification for fallback policy"), ++ fallbackPolicyId: z.string().optional().describe("Optional explicit fallback policy id"), + temperature: z + .number() + .min(0) +diff --git a/packages/server-core/src/types/responses.ts b/packages/server-core/src/types/responses.ts +index 2098c2f6..4935a535 100644 +--- a/packages/server-core/src/types/responses.ts ++++ b/packages/server-core/src/types/responses.ts +@@ -1,10 +1,12 @@ + /** + * Framework-agnostic response types for server handlers + */ ++import type { TrafficResponseMetadata } from "@voltagent/core"; + + export interface SuccessResponse { + success: true; + data: T; ++ traffic?: TrafficResponseMetadata; + } + + export interface ErrorResponse { +@@ -13,6 +15,7 @@ export interface ErrorResponse { + httpStatus?: number; + code?: string; + name?: string; ++ traffic?: TrafficResponseMetadata; + } + + export type ApiResponse = SuccessResponse | ErrorResponse; +diff --git a/packages/server-core/src/utils/traffic.ts b/packages/server-core/src/utils/traffic.ts +new file mode 100644 +index 00000000..f9be1845 +--- /dev/null ++++ b/packages/server-core/src/utils/traffic.ts +@@ -0,0 +1,35 @@ ++import type 
{ TrafficResponseMetadata } from "@voltagent/core"; ++ ++export function buildTrafficHeaders(traffic?: TrafficResponseMetadata): Record { ++ if (!traffic) return {}; ++ ++ const headers: Record = {}; ++ ++ if (typeof traffic.retryAfterMs === "number" && Number.isFinite(traffic.retryAfterMs)) { ++ headers["Retry-After"] = String(Math.max(0, Math.ceil(traffic.retryAfterMs / 1000))); ++ } ++ ++ if (traffic.rateLimitRemaining !== undefined) { ++ headers["X-RateLimit-Remaining"] = String(traffic.rateLimitRemaining); ++ } ++ ++ if (typeof traffic.rateLimitResetAt === "number" && Number.isFinite(traffic.rateLimitResetAt)) { ++ headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(traffic.rateLimitResetAt / 1000))); ++ } else if ( ++ typeof traffic.rateLimitResetInMs === "number" && ++ Number.isFinite(traffic.rateLimitResetInMs) ++ ) { ++ const resetAt = Date.now() + Math.max(0, traffic.rateLimitResetInMs); ++ headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(resetAt / 1000))); ++ } ++ ++ if (traffic.queueEtaMs !== undefined) { ++ headers["X-Queue-ETA"] = String(traffic.queueEtaMs); ++ } ++ ++ if (traffic.rateLimitKey) { ++ headers["X-RateLimit-Key"] = traffic.rateLimitKey; ++ } ++ ++ return headers; ++} +diff --git a/packages/server-hono/src/routes/index.ts b/packages/server-hono/src/routes/index.ts +index a5af8214..336a5bf4 100644 +--- a/packages/server-hono/src/routes/index.ts ++++ b/packages/server-hono/src/routes/index.ts +@@ -2,6 +2,7 @@ import type { ServerProviderDeps } from "@voltagent/core"; + import type { Logger } from "@voltagent/internal"; + import { + UPDATE_ROUTES, ++ buildTrafficHeaders, + handleCancelWorkflow, + handleChatStream, + handleCheckUpdates, +@@ -87,11 +88,12 @@ export function registerAgentRoutes( + + const signal = c.req.raw.signal; + const response = await handleGenerateText(agentId, body, deps, logger, signal); ++ const trafficHeaders = buildTrafficHeaders(response.traffic); + if (!response.success) { + const { httpStatus, ...details } = response; +- return c.json(details, httpStatus || 500); ++ return c.json(details, httpStatus || 500, trafficHeaders); + } +- return c.json(response, 200); ++ return c.json(response, 200, trafficHeaders); + }); + + // POST /agents/:id/stream - Stream text (raw fullStream SSE) +@@ -131,11 +133,12 @@ export function registerAgentRoutes( + const body = await c.req.json(); + const signal = c.req.raw.signal; + const response = await handleGenerateObject(agentId, body, deps, logger, signal); ++ const trafficHeaders = buildTrafficHeaders(response.traffic); + if (!response.success) { + const { httpStatus, ...details } = response; +- return c.json(details, httpStatus || 500); ++ return c.json(details, httpStatus || 500, trafficHeaders); + } +- return c.json(response, 200); ++ return c.json(response, 200, trafficHeaders); + }); + + // POST /agents/:id/stream-object - Stream object +diff --git a/packages/serverless-hono/src/routes.ts b/packages/serverless-hono/src/routes.ts +index d377ce4b..39eabcf7 100644 +--- a/packages/serverless-hono/src/routes.ts ++++ b/packages/serverless-hono/src/routes.ts +@@ -28,6 +28,7 @@ import { + type TriggerHttpRequestContext, + UPDATE_ROUTES, + WORKFLOW_ROUTES, ++ buildTrafficHeaders, + executeA2ARequest, + executeTriggerHandler, + getConversationMessagesHandler, +@@ -165,7 +166,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger: + } + const signal = c.req.raw.signal; + const response = await handleGenerateText(agentId, body, deps, logger, signal); +- return 
c.json(response, response.success ? 200 : 500); ++ const trafficHeaders = buildTrafficHeaders(response.traffic); ++ return c.json(response, response.success ? 200 : 500, trafficHeaders); + }); + + app.post(AGENT_ROUTES.streamText.path, async (c) => { +@@ -197,7 +199,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger: + } + const signal = c.req.raw.signal; + const response = await handleGenerateObject(agentId, body, deps, logger, signal); +- return c.json(response, response.success ? 200 : 500); ++ const trafficHeaders = buildTrafficHeaders(response.traffic); ++ return c.json(response, response.success ? 200 : 500, trafficHeaders); + }); + + app.post(AGENT_ROUTES.streamObject.path, async (c) => { +diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml +index 20029de4..6671d8c1 100644 +--- a/pnpm-lock.yaml ++++ b/pnpm-lock.yaml +@@ -37,7 +37,7 @@ importers: + version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) + '@nx/plugin': + specifier: 20.4.6 +- version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2) ++ version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2) + '@nx/vite': + specifier: 20.4.6 + version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2)(vite@7.2.7)(vitest@3.2.4) +@@ -92,6 +92,9 @@ importers: + syncpack: + specifier: ^13.0.2 + version: 13.0.4(typescript@5.9.2) ++ ts-node: ++ specifier: ^10.9.2 ++ version: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) + tslib: + specifier: ^2.3.0 + version: 2.8.1 +@@ -99,7 +102,7 @@ importers: + specifier: ^8.5.0 + version: 8.5.0(@swc/core@1.5.29)(typescript@5.9.2) + typescript: +- specifier: ^5.8.2 ++ specifier: ^5.9.2 + version: 5.9.2 + vite: + specifier: ^7.2.7 +@@ -2750,6 +2753,61 @@ importers: + specifier: ^0.5.3 + version: 0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7) + ++ examples/with-viteval/dist: ++ dependencies: ++ '@ai-sdk/openai': ++ specifier: ^2.0.52 ++ version: 2.0.85(zod@3.25.76) ++ '@voltagent/cli': ++ specifier: ^0.1.16 ++ version: link:../../../packages/cli ++ '@voltagent/core': ++ specifier: ^1.2.15 ++ version: link:../../../packages/core ++ '@voltagent/libsql': ++ specifier: ^1.0.13 ++ version: link:../../../packages/libsql ++ '@voltagent/logger': ++ specifier: ^1.0.4 ++ version: link:../../../packages/logger ++ '@voltagent/server-hono': ++ specifier: ^1.2.5 ++ version: link:../../../packages/server-hono ++ ai: ++ specifier: ^5.0.76 ++ version: 5.0.113(zod@3.25.76) ++ consola: ++ specifier: ^3.4.2 ++ version: 3.4.2 ++ envalid: ++ specifier: ^8.1.0 ++ version: 8.1.0 ++ yargs: ++ specifier: ^18.0.0 ++ version: 18.0.0 ++ zod: ++ specifier: ^3.25.76 ++ version: 3.25.76 ++ devDependencies: ++ '@tsconfig/node24': ++ specifier: ^24.0.1 ++ version: 24.0.1 ++ '@types/yargs': ++ specifier: ^17.0.33 ++ version: 17.0.33 ++ dotenv: ++ specifier: ^16.4.5 ++ version: 16.6.1 ++ tsx: ++ specifier: ^4.19.3 ++ version: 4.20.4 ++ typescript: ++ specifier: ^5.8.2 ++ version: 5.9.2 ++ viteval: ++ specifier: ^0.5.3 ++ version: 
0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7) ++ + examples/with-voice-elevenlabs: + dependencies: + '@ai-sdk/openai': +@@ -3509,7 +3567,7 @@ importers: + version: 3.2.4(vitest@3.2.4) + jest: + specifier: ^29.5.0 +- version: 29.7.0(@types/node@24.2.1) ++ version: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + ts-jest: + specifier: ^29.1.0 + version: 29.4.1(@babel/core@7.28.5)(esbuild@0.25.10)(jest@29.7.0)(typescript@5.9.2) +@@ -9966,7 +10024,7 @@ packages: + slash: 3.0.0 + dev: true + +- /@jest/core@29.7.0: ++ /@jest/core@29.7.0(ts-node@10.9.2): + resolution: {integrity: sha512-n7aeXWKMnGtDA48y8TLWJPJmLmmZ642Ceo78cYWEpiD7FzDgmNDV/GCVRorPABdXLJZ/9wzzgZAlHjXjxDHGsg==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: +@@ -9987,7 +10045,7 @@ packages: + exit: 0.1.2 + graceful-fs: 4.2.11 + jest-changed-files: 29.7.0 +- jest-config: 29.7.0(@types/node@24.6.2) ++ jest-config: 29.7.0(@types/node@24.6.2)(ts-node@10.9.2) + jest-haste-map: 29.7.0 + jest-message-util: 29.7.0 + jest-regex-util: 29.6.3 +@@ -12403,7 +12461,7 @@ packages: + - verdaccio + dev: true + +- /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2): ++ /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2): + resolution: {integrity: sha512-yZOZJOQFtpdY3Fu/WYNoDx81TwvF9yfwvalFpLD19bz+2YGl7B89l0S1ZrtSRXFfKXA/w7gb0gmKwthJtQhx9Q==} + dependencies: + '@jest/reporters': 29.7.0 +@@ -12412,7 +12470,7 @@ packages: + '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) + '@phenomnomnominal/tsquery': 5.0.1(typescript@5.9.2) + identity-obj-proxy: 3.0.0 +- jest-config: 29.7.0(@types/node@24.2.1) ++ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + jest-resolve: 29.7.0 + jest-util: 29.7.0 + minimatch: 9.0.3 +@@ -12807,12 +12865,12 @@ packages: + dev: true + optional: true + +- /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2): ++ /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2): + resolution: {integrity: sha512-7Jlv+BVqGoO0BolQN7P5Z87160phuE1i7H6C8xFwQnlQ3ZfwQCJzk2dkg1UyzxDkWl6lvVsqBjZPXD55gFQ3+w==} + dependencies: + '@nx/devkit': 20.4.6(nx@20.8.2) + '@nx/eslint': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2) +- '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) ++ '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2) + '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) + tslib: 2.8.1 + transitivePeerDependencies: +@@ -17770,8 +17828,8 @@ packages: + '@babel/plugin-syntax-jsx': 7.27.1(@babel/core@7.28.5) + '@babel/plugin-syntax-typescript': 7.27.1(@babel/core@7.28.5) + '@babel/template': 7.27.2 +- '@babel/traverse': 7.28.4 +- '@babel/types': 7.28.4 ++ '@babel/traverse': 7.28.5 ++ '@babel/types': 7.28.5 + '@tanstack/react-router': 1.131.44(react-dom@19.2.3)(react@19.2.3) + '@tanstack/router-core': 1.131.44 + '@tanstack/router-generator': 1.131.44 +@@ -22783,7 +22841,7 @@ packages: + 
crc-32: 1.2.2 + readable-stream: 4.7.0 + +- /create-jest@29.7.0(@types/node@24.2.1): ++ /create-jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): + resolution: {integrity: sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + hasBin: true +@@ -22792,7 +22850,7 @@ packages: + chalk: 4.1.2 + exit: 0.1.2 + graceful-fs: 4.2.11 +- jest-config: 29.7.0(@types/node@24.2.1) ++ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + jest-util: 29.7.0 + prompts: 2.4.2 + transitivePeerDependencies: +@@ -27641,7 +27699,7 @@ packages: + - supports-color + dev: true + +- /jest-cli@29.7.0(@types/node@24.2.1): ++ /jest-cli@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): + resolution: {integrity: sha512-OVVobw2IubN/GSYsxETi+gOe7Ka59EFMR/twOU3Jb2GnKKeMGJB5SGUUrEz3SFVmJASUdZUzy83sLNNQ2gZslg==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + hasBin: true +@@ -27651,14 +27709,14 @@ packages: + node-notifier: + optional: true + dependencies: +- '@jest/core': 29.7.0 ++ '@jest/core': 29.7.0(ts-node@10.9.2) + '@jest/test-result': 29.7.0 + '@jest/types': 29.6.3 + chalk: 4.1.2 +- create-jest: 29.7.0(@types/node@24.2.1) ++ create-jest: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + exit: 0.1.2 + import-local: 3.2.0 +- jest-config: 29.7.0(@types/node@24.2.1) ++ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + jest-util: 29.7.0 + jest-validate: 29.7.0 + yargs: 17.7.2 +@@ -27669,7 +27727,7 @@ packages: + - ts-node + dev: true + +- /jest-config@29.7.0(@types/node@24.2.1): ++ /jest-config@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): + resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: +@@ -27704,12 +27762,13 @@ packages: + pretty-format: 29.7.0 + slash: 3.0.0 + strip-json-comments: 3.1.1 ++ ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) + transitivePeerDependencies: + - babel-plugin-macros + - supports-color + dev: true + +- /jest-config@29.7.0(@types/node@24.6.2): ++ /jest-config@29.7.0(@types/node@24.6.2)(ts-node@10.9.2): + resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: +@@ -27744,6 +27803,7 @@ packages: + pretty-format: 29.7.0 + slash: 3.0.0 + strip-json-comments: 3.1.1 ++ ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) + transitivePeerDependencies: + - babel-plugin-macros + - supports-color +@@ -28041,7 +28101,7 @@ packages: + supports-color: 8.1.1 + dev: true + +- /jest@29.7.0(@types/node@24.2.1): ++ /jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): + resolution: {integrity: sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + hasBin: true +@@ -28051,10 +28111,10 @@ packages: + node-notifier: + optional: true + dependencies: +- '@jest/core': 29.7.0 ++ '@jest/core': 29.7.0(ts-node@10.9.2) + '@jest/types': 29.6.3 + import-local: 3.2.0 +- jest-cli: 29.7.0(@types/node@24.2.1) ++ jest-cli: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + transitivePeerDependencies: + - '@types/node' + - babel-plugin-macros +@@ -36767,7 +36827,7 @@ packages: + esbuild: 0.25.10 + fast-json-stable-stringify: 2.1.0 + handlebars: 4.7.8 +- jest: 29.7.0(@types/node@24.2.1) ++ jest: 
29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + json5: 2.2.3 + lodash.memoize: 4.1.2 + make-error: 1.3.6 +diff --git a/tmp/test/traffic-concurrency.ts b/tmp/test/traffic-concurrency.ts +new file mode 100644 +index 00000000..d12fc5c9 +--- /dev/null ++++ b/tmp/test/traffic-concurrency.ts +@@ -0,0 +1,91 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController maxConcurrent scheduling. ++ * ++ * What to look for: ++ * - `inFlight` should never exceed `maxConcurrent`. ++ * - Requests should start in bursts up to `maxConcurrent`. ++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-concurrency.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-concurrency.ts (enable controller debug logs) ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++const maxConcurrent = 3; ++const controller = getTrafficController({ maxConcurrent }); ++ ++let inFlight = 0; ++let maxObserved = 0; ++ ++function makeModel(id: string, durationMs: number) { ++ return { ++ specificationVersion: "v2", ++ provider: "sim", ++ modelId: `concurrency-${id}`, ++ doGenerate: async () => { ++ inFlight += 1; ++ maxObserved = Math.max(maxObserved, inFlight); ++ console.log(`[${now()}] start ${id} inFlight=${inFlight}`); ++ ++ try { ++ await sleep(durationMs); ++ return { ++ content: [{ type: "text", text: `ok:${id}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId: `concurrency-${id}`, headers: {} }, ++ }; ++ } finally { ++ inFlight -= 1; ++ console.log(`[${now()}] end ${id} inFlight=${inFlight}`); ++ } ++ }, ++ }; ++} ++ ++async function main() { ++ console.log(`\n=== TrafficController concurrency (maxConcurrent=${maxConcurrent}) ===`); ++ void controller; ++ ++ const agent = new Agent({ ++ name: "traffic-concurrency", ++ instructions: "echo", ++ model: makeModel("base", 0), ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ const ids = ["A", "B", "C", "D", "E"]; ++ const jobs = ids.map((id) => ++ agent.generateText(id, { ++ tenantId: "default", ++ trafficPriority: "P1", ++ model: makeModel(id, 700), ++ }), ++ ); ++ ++ const settled = await Promise.allSettled(jobs); ++ console.log(`\n[done] maxObserved=${maxObserved}`); ++ console.log( ++ `[done] results=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), ++ )}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-fallback-chain.ts b/tmp/test/traffic-fallback-chain.ts +new file mode 100644 +index 00000000..0cd77b2b +--- /dev/null ++++ b/tmp/test/traffic-fallback-chain.ts +@@ -0,0 +1,168 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController circuit breaker + fallback chains. ++ * ++ * Scenarios: ++ * - Test 1: Open primary circuit (via repeated 429s), then route to fallback1. ++ * - Test 2: Open fallback1 circuit, then route to fallback2 (success). ++ * - Test 3: No fallback configured → CircuitBreakerOpenError. 
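++ *
++ * Routing rule these scenarios are meant to exercise, as a rough sketch
++ * (assumed semantics inferred from the scenarios above, not a spec):
++ *   route(model) = circuit(model) closed ? model
++ *     : first entry in fallbackChains[model] whose circuit is closed
++ *     : reject with CircuitBreakerOpenError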
++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-fallback-chain.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-fallback-chain.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { MockLanguageModelV2, MockProviderV2 } from "ai/test"; ++import { ++ Agent, ++ CircuitBreakerOpenError, ++ getTrafficController, ++} from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++type ModelId = "primary" | "fallback1" | "fallback2" | "no-fallback"; ++ ++const provider = "test-provider"; ++ ++const controller = getTrafficController({ ++ maxConcurrent: 1, ++ fallbackChains: { ++ primary: ["fallback1", "fallback2"], ++ fallback1: ["fallback2"], ++ }, ++}); ++ ++function makeAlways429Model(modelId: ModelId) { ++ let attempts = 0; ++ return new MockLanguageModelV2({ ++ provider, ++ modelId, ++ doGenerate: async () => { ++ attempts += 1; ++ console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> 429`); ++ await sleep(25); ++ const err: any = new Error(`forced 429 for model=${modelId} attempt=${attempts}`); ++ err.status = 429; ++ throw err; ++ }, ++ }); ++} ++ ++function makeAlwaysOkModel(modelId: ModelId) { ++ let attempts = 0; ++ return new MockLanguageModelV2({ ++ provider, ++ modelId, ++ doGenerate: async () => { ++ attempts += 1; ++ console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> ok`); ++ await sleep(25); ++ return { ++ content: [{ type: "text", text: `ok:${modelId}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }); ++} ++ ++const primaryModel = makeAlways429Model("primary"); ++const fallback1Model = makeAlways429Model("fallback1"); ++const fallback2Model = makeAlwaysOkModel("fallback2"); ++const noFallbackModel = makeAlways429Model("no-fallback"); ++ ++// Required so Agent fallbacks (string model IDs) resolve without network calls. ++(globalThis as any).AI_SDK_DEFAULT_PROVIDER = new MockProviderV2({ ++ languageModels: { ++ primary: primaryModel, ++ fallback1: fallback1Model, ++ fallback2: fallback2Model, ++ "no-fallback": noFallbackModel, ++ }, ++}); ++ ++const primaryAgent = new Agent({ ++ name: "traffic-fallback-primary", ++ instructions: "echo", ++ model: primaryModel, ++ temperature: 0, ++ maxOutputTokens: 32, ++}); ++ ++const noFallbackAgent = new Agent({ ++ name: "traffic-fallback-none", ++ instructions: "echo", ++ model: noFallbackModel, ++ temperature: 0, ++ maxOutputTokens: 32, ++}); ++ ++async function runOnce(label: string, agent: any) { ++ console.log(`\n--- ${label} ---`); ++ try { ++ const result = await agent.generateText(label, { ++ tenantId: "default", ++ trafficPriority: "P1", ++ }); ++ console.log( ++ `[${label}] success text=${result.text} responseModel=${result.response?.modelId ?? "n/a"}`, ++ ); ++ } catch (err: any) { ++ if (err instanceof CircuitBreakerOpenError) { ++ console.log( ++ `[${label}] CircuitBreakerOpenError retryAfterMs=${err.retryAfterMs} msg=${err.message}`, ++ ); ++ } else { ++ console.log( ++ `[${label}] failed name=${err?.name ?? "Error"} status=${err?.status ?? err?.statusCode ?? 
"n/a"} msg=${err?.message}`, ++ ); ++ } ++ } ++} ++ ++async function main() { ++ console.log("\n=== Circuit breaker + fallback chain ==="); ++ void controller; ++ ++ console.log("\n[Test 1] Open primary circuit, then route to fallback1"); ++ // Two calls * (up to 3 retries each) ≈ 6 failures → should open the circuit (threshold=5). ++ await runOnce("primary-warmup-1", primaryAgent); ++ await runOnce("primary-warmup-2", primaryAgent); ++ await runOnce("primary-after-open", primaryAgent); // should execute fallback1 (still closed) ++ ++ console.log("\n[Test 2] Open fallback1 circuit, then route to fallback2"); ++ // Build enough failures on fallback1 by routing multiple requests to it via primary circuit-open path. ++ await runOnce("fallback1-warmup-1-via-primary", primaryAgent); ++ await runOnce("fallback1-warmup-2-via-primary", primaryAgent); ++ await runOnce("primary-should-hit-fallback2", primaryAgent); // should execute fallback2 and succeed ++ ++ console.log("\n[Test 3] No fallback configured → CircuitBreakerOpenError"); ++ await runOnce("no-fallback-warmup-1", noFallbackAgent); ++ await runOnce("no-fallback-warmup-2", noFallbackAgent); ++ await runOnce("no-fallback-after-open", noFallbackAgent); ++ ++ console.log("\n[debug] model call counts:"); ++ console.log( ++ safeStringify({ ++ primary: primaryModel.doGenerateCalls?.length, ++ fallback1: fallback1Model.doGenerateCalls?.length, ++ fallback2: fallback2Model.doGenerateCalls?.length, ++ "no-fallback": noFallbackModel.doGenerateCalls?.length, ++ }), ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-priority-openai-real.ts b/tmp/test/traffic-priority-openai-real.ts +new file mode 100644 +index 00000000..223263ba +--- /dev/null ++++ b/tmp/test/traffic-priority-openai-real.ts +@@ -0,0 +1,117 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController + AI SDK with real OpenAI calls. ++ * ++ * What this exercises: ++ * - Priority scheduling (P0/P1/P2) with `maxConcurrent=1` ++ * - Rate limit header ingestion via `updateRateLimitFromHeaders()` (if headers are present) ++ * - Tenant usage aggregation via `extractUsage` + `getTenantUsage()` ++ * ++ * Prereqs: ++ * - Set `OPENAI_API_KEY` ++ * ++ * Run: ++ * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts ++ * - VERBOSE=1 OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts ++ * ++ * Notes: ++ * - This will make real network calls and may incur cost. ++ */ ++ ++import { openai } from "@ai-sdk/openai"; ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const apiKey = process.env.OPENAI_API_KEY; ++if (!apiKey) { ++ console.error("Missing OPENAI_API_KEY. Example:"); ++ console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts"); ++ process.exit(1); ++} ++ ++const _now = () => new Date().toISOString(); ++const preview = (value: unknown, max = 140) => { ++ if (typeof value !== "string") return String(value ?? ""); ++ return value.length > max ? `${value.slice(0, max)}…` : value; ++}; ++ ++const tenantId = process.env.TENANT_ID ?? "openai-real"; ++const defaultModelId = process.env.OPENAI_MODEL ?? 
"gpt-4o-mini"; ++ ++const controller = getTrafficController({ maxConcurrent: 1 }); ++ ++function getHeader(headers: any, name: string): string | undefined { ++ if (!headers) return undefined; ++ if (typeof headers.get === "function") { ++ const v = headers.get(name); ++ return v === null || v === undefined ? undefined : String(v); ++ } ++ const key = Object.keys(headers).find((k) => k.toLowerCase() === name.toLowerCase()); ++ if (!key) return undefined; ++ const v = headers[key]; ++ return v === null || v === undefined ? undefined : String(Array.isArray(v) ? v[0] : v); ++} ++ ++async function main() { ++ console.log( ++ `\n=== OpenAI real: priority scheduling (tenantId=${tenantId}, model=${defaultModelId}) ===`, ++ ); ++ void controller; ++ ++ const agent = new Agent({ ++ name: "openai-real-traffic", ++ instructions: "Reply exactly with the requested token.", ++ model: openai(defaultModelId), ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ // Enqueue in reverse priority order; controller should still execute P0 first. ++ const p2 = agent.generateText("Reply with only: P2", { tenantId, trafficPriority: "P2" }); ++ const p1 = agent.generateText("Reply with only: P1", { tenantId, trafficPriority: "P1" }); ++ const p0 = agent.generateText("Reply with only: P0", { tenantId, trafficPriority: "P0" }); ++ ++ const settled = await Promise.allSettled([p0, p1, p2]); ++ for (const result of settled) { ++ if (result.status !== "fulfilled") { ++ console.log(`[result] rejected=${result.reason?.message ?? String(result.reason)}`); ++ continue; ++ } ++ ++ const headers = result.value.response?.headers; ++ const limit = getHeader(headers, "x-ratelimit-limit-requests"); ++ const remaining = getHeader(headers, "x-ratelimit-remaining-requests"); ++ const reset = getHeader(headers, "x-ratelimit-reset-requests"); ++ ++ console.log( ++ `[result] text=${preview(result.value.text)} finishReason=${result.value.finishReason} usage=${safeStringify(result.value.usage)}`, ++ ); ++ console.log( ++ `[result] ratelimitHeaders=${safeStringify({ ++ limit, ++ remaining, ++ reset, ++ })}`, ++ ); ++ } ++ ++ console.log( ++ `\n[done] settled=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? preview(s.value.text) : s.reason?.message)), ++ )}`, ++ ); ++ ++ console.log( ++ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-priority-openai-sim.ts b/tmp/test/traffic-priority-openai-sim.ts +new file mode 100644 +index 00000000..9d36a7d1 +--- /dev/null ++++ b/tmp/test/traffic-priority-openai-sim.ts +@@ -0,0 +1,114 @@ ++// @ts-nocheck ++/** ++ * Manual test: Agent → TrafficController priority scheduling (OpenAI-like stub models). ++ * ++ * This keeps the Agent + AI SDK path, but avoids real network calls by using stub models ++ * that pretend to be `provider="openai"` with modelIds like `gpt-4o`/`gpt-4o-mini`. ++ * ++ * Scenarios: ++ * - Test 1: P0 runs before P1/P2 when all runnable. ++ * - Test 2: P0 request (gpt-4o) is rate-limited → P1 (gpt-4o-mini) proceeds. ++ * ++ * Note: ++ * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s. 
++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-priority-openai-sim.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++function makeOpenAIStubModel(modelId: string, delayMs: number) { ++ let calls = 0; ++ return { ++ specificationVersion: "v2", ++ provider: "openai", ++ modelId, ++ doGenerate: async () => { ++ calls += 1; ++ console.log(`[${now()}] [model] ${modelId} doGenerate call=${calls}`); ++ await sleep(delayMs); ++ return { ++ content: [{ type: "text", text: `ok:${modelId}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++const controller = getTrafficController({ maxConcurrent: 1 }); ++ ++const modelMini = makeOpenAIStubModel("gpt-4o-mini", 80); ++const modelBig = makeOpenAIStubModel("gpt-4o", 80); ++ ++const agent = new Agent({ ++ name: "priority-openai-sim", ++ instructions: "echo", ++ model: modelMini, ++ temperature: 0, ++ maxOutputTokens: 32, ++}); ++ ++async function test1_priorityOrder() { ++ console.log("\n=== Test 1: P0 ordering via Agent ==="); ++ ++ const p2 = agent.generateText("P2", { trafficPriority: "P2", tenantId: "sim" }); ++ const p1 = agent.generateText("P1", { trafficPriority: "P1", tenantId: "sim" }); ++ const p0 = agent.generateText("P0", { trafficPriority: "P0", tenantId: "sim" }); ++ ++ const results = await Promise.all([p0, p1, p2]); ++ console.log(`[Test 1] results=${safeStringify(results.map((r) => r.text))}`); ++} ++ ++async function test2_p1RunsWhenP0RateLimited() { ++ console.log("\n=== Test 2: P1 proceeds when P0 is rate-limited ==="); ++ ++ // Seed remaining=0 for openai::gpt-4o so the P0 head item initially waits. ++ const applied = controller.updateRateLimitFromHeaders( ++ { provider: "openai", model: "gpt-4o" }, ++ { ++ "x-ratelimit-limit-requests": "1", ++ "x-ratelimit-remaining-requests": "0", ++ "x-ratelimit-reset-requests": "1s", ++ }, ++ ); ++ console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`); ++ ++ const p0Blocked = agent.generateText("P0 (gpt-4o, rate-limited)", { ++ trafficPriority: "P0", ++ tenantId: "sim", ++ model: modelBig, // per-call model override (new in this branch) ++ }); ++ ++ const p1Free = agent.generateText("P1 (gpt-4o-mini)", { ++ trafficPriority: "P1", ++ tenantId: "sim", ++ model: modelMini, ++ }); ++ ++ const [r0, r1] = await Promise.all([p0Blocked, p1Free]); ++ console.log(`[Test 2] p0 text=${r0.text}`); ++ console.log(`[Test 2] p1 text=${r1.text}`); ++} ++ ++async function main() { ++ await test1_priorityOrder(); ++ await test2_p1RunsWhenP0RateLimited(); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-priority.ts b/tmp/test/traffic-priority.ts +new file mode 100644 +index 00000000..409e1078 +--- /dev/null ++++ b/tmp/test/traffic-priority.ts +@@ -0,0 +1,159 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController priority scheduling. ++ * ++ * Scenarios: ++ * - Test 1: P0 should run before P1/P2 when runnable. ++ * - Test 2: If a P0 request is rate-limited, a lower priority (P1) can proceed. 
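++ *
++ * Rough timing for Test 2 (assumed behavior, see the Note below): p0::m0 is
++ * seeded with remaining=0 and reset=1s, so the runnable P1 request on p1::m1
++ * should start first while the P0 request waits out the reset + probe delay.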
++ * ++ * Note: ++ * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s. ++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-priority.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-priority.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++const controller = getTrafficController({ maxConcurrent: 1 }); ++ ++function extractLabel(prompt: any): string { ++ if (!Array.isArray(prompt)) { ++ return "unknown"; ++ } ++ ++ for (let index = prompt.length - 1; index >= 0; index -= 1) { ++ const message = prompt[index]; ++ if (!message || message.role !== "user" || !Array.isArray(message.content)) { ++ continue; ++ } ++ ++ const textPart = message.content.find((part: any) => part?.type === "text"); ++ if (textPart?.text) { ++ return String(textPart.text); ++ } ++ } ++ ++ return "unknown"; ++} ++ ++function makeModel(provider: string, modelId: string, delayMs = 50) { ++ let calls = 0; ++ let lastStartAt = 0; ++ ++ return { ++ specificationVersion: "v2", ++ provider, ++ modelId, ++ doGenerate: async (options: any) => { ++ calls += 1; ++ const startAt = Date.now(); ++ const delta = lastStartAt ? startAt - lastStartAt : 0; ++ lastStartAt = startAt; ++ ++ const label = extractLabel(options?.prompt); ++ console.log( ++ `[${now()}] doGenerate start model=${provider}::${modelId} call=${calls} (+${delta}ms) input=${label}`, ++ ); ++ await sleep(delayMs); ++ console.log(`[${now()}] doGenerate end model=${provider}::${modelId} input=${label}`); ++ ++ return { ++ content: [{ type: "text", text: `ok:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++async function test1_priorityOrder() { ++ console.log("\n=== Test 1: priority order (P0 before P1/P2) ==="); ++ ++ const sharedModel = makeModel("p", "shared-model", 50); ++ const agent = new Agent({ ++ name: "traffic-priority", ++ instructions: "echo", ++ model: sharedModel, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ // Enqueue in reverse order; scheduler should still run P0 first. ++ const p2 = agent.generateText("P2", { tenantId: "default", trafficPriority: "P2" }); ++ const p1 = agent.generateText("P1", { tenantId: "default", trafficPriority: "P1" }); ++ const p0 = agent.generateText("P0", { tenantId: "default", trafficPriority: "P0" }); ++ ++ const settled = await Promise.allSettled([p0, p1, p2]); ++ console.log( ++ `[Test 1] results=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? 
s.value.text : s.reason?.message)), ++ )}`, ++ ); ++} ++ ++async function test2_lowerPriorityWhenP0RateLimited() { ++ console.log("\n=== Test 2: P1 proceeds when P0 rate-limited ==="); ++ ++ const applied = controller.updateRateLimitFromHeaders( ++ { provider: "p0", model: "m0" }, ++ { ++ "x-ratelimit-limit-requests": "1", ++ "x-ratelimit-remaining-requests": "0", ++ "x-ratelimit-reset-requests": "1s", ++ }, ++ ); ++ console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`); ++ ++ const modelP0 = makeModel("p0", "m0", 50); ++ const modelP1 = makeModel("p1", "m1", 50); ++ const agent = new Agent({ ++ name: "traffic-priority-rate-limit", ++ instructions: "echo", ++ model: modelP1, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ // Now the next P0 request is at the head of the queue but rate-limited, ++ // so a runnable P1 request should execute first. ++ const p0Blocked = agent.generateText("P0-blocked (rate limited)", { ++ tenantId: "default", ++ trafficPriority: "P0", ++ model: modelP0, ++ }); ++ const p1Free = agent.generateText("P1-free (should run first)", { ++ tenantId: "default", ++ trafficPriority: "P1", ++ model: modelP1, ++ }); ++ ++ const settled = await Promise.allSettled([p0Blocked, p1Free]); ++ console.log( ++ `[Test 2] results=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), ++ )}`, ++ ); ++} ++ ++async function main() { ++ await test1_priorityOrder(); ++ await test2_lowerPriorityWhenP0RateLimited(); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-rate-limit-from-headers.ts b/tmp/test/traffic-rate-limit-from-headers.ts +new file mode 100644 +index 00000000..d8262661 +--- /dev/null ++++ b/tmp/test/traffic-rate-limit-from-headers.ts +@@ -0,0 +1,158 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController dynamic rate limits from OpenAI response headers. ++ * ++ * This hits the real OpenAI model via Agent + AI SDK, and relies on the ++ * `x-ratelimit-*` response headers to seed/update the TrafficController. ++ * ++ * What to look for: ++ * - Each request prints the observed `x-ratelimit-*` headers (if present). ++ * - Agent should also log: "[Traffic] Applied rate limit from response headers". ++ * - With enough parallel requests, some requests may take longer due to controller throttling. ++ * ++ * Prereqs: ++ * - Set `OPENAI_API_KEY` ++ * ++ * Optional env: ++ * - `OPENAI_MODEL` (default: gpt-4o-mini) ++ * - `REQUESTS` (default: 10) ++ * - `MAX_CONCURRENT` (default: 50) ++ * - `TENANT_ID` (default: openai-rate-limit-headers) ++ * ++ * Run: ++ * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts ++ * - VERBOSE=1 OPENAI_API_KEY=... REQUESTS=30 pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts ++ */ ++ ++import { openai } from "@ai-sdk/openai"; ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const now = () => new Date().toISOString(); ++ ++const apiKey = process.env.OPENAI_API_KEY; ++if (!apiKey) { ++ console.error("Missing OPENAI_API_KEY. Example:"); ++ console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts"); ++ process.exit(1); ++} ++ ++const provider = "openai"; ++const modelId = process.env.OPENAI_MODEL ?? 
"gpt-4o-mini"; ++const tenantId = process.env.TENANT_ID ?? "openai-rate-limit-headers"; ++const requestCountRaw = Number(process.env.REQUESTS ?? "10"); ++const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50"); ++const requestCount = Number.isFinite(requestCountRaw) && requestCountRaw > 0 ? requestCountRaw : 10; ++const maxConcurrent = ++ Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? maxConcurrentRaw : 50; ++ ++const key = `${provider}::${modelId}`; ++const controller = getTrafficController({ maxConcurrent }); ++ ++function getHeader(headers: any, name: string): string | undefined { ++ if (!headers) return undefined; ++ if (typeof headers.get === "function") { ++ const v = headers.get(name); ++ return v === null || v === undefined ? undefined : String(v); ++ } ++ ++ const entries = Object.entries(headers as Record); ++ const target = name.toLowerCase(); ++ const match = entries.find(([k]) => String(k).toLowerCase() === target); ++ if (!match) return undefined; ++ ++ const value = match[1]; ++ if (Array.isArray(value)) { ++ const first = value[0]; ++ return first === null || first === undefined ? undefined : String(first); ++ } ++ ++ return value === null || value === undefined ? undefined : String(value); ++} ++ ++async function main() { ++ console.log( ++ `\n=== OpenAI rate limit headers → TrafficController (${key}, maxConcurrent=${maxConcurrent}, requests=${requestCount}) ===`, ++ ); ++ void controller; ++ ++ const agent = new Agent({ ++ name: "openai-rate-limit-from-headers", ++ instructions: "Reply with only the requested token.", ++ model: openai(modelId), ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ console.log("\n[seed] Making one request to capture headers..."); ++ const seedStartedAt = Date.now(); ++ const seed = await agent.generateText("Reply with only: seed", { ++ tenantId, ++ trafficPriority: "P1", ++ }); ++ const seedElapsedMs = Date.now() - seedStartedAt; ++ ++ const seedHeaders = seed.response?.headers; ++ console.log(`[seed] done in ${seedElapsedMs}ms text=${seed.text}`); ++ console.log( ++ `[seed] x-ratelimit-*=${safeStringify({ ++ limit: getHeader(seedHeaders, "x-ratelimit-limit-requests"), ++ remaining: getHeader(seedHeaders, "x-ratelimit-remaining-requests"), ++ reset: getHeader(seedHeaders, "x-ratelimit-reset-requests"), ++ })}`, ++ ); ++ ++ console.log(`\n[burst] Scheduling ${requestCount} parallel requests...`); ++ const jobs = Array.from({ length: requestCount }, (_, idx) => { ++ const label = `req-${idx + 1}`; ++ const enqueuedAt = Date.now(); ++ console.log(`[${now()}] enqueue ${label}`); ++ ++ return agent ++ .generateText(`Reply with only: ${label}`, { tenantId, trafficPriority: "P1" }) ++ .then((result) => { ++ const elapsedMs = Date.now() - enqueuedAt; ++ const headers = result.response?.headers; ++ console.log( ++ `[${now()}] done ${label} in ${elapsedMs}ms text=${result.text} x-ratelimit-remaining=${getHeader( ++ headers, ++ "x-ratelimit-remaining-requests", ++ )}`, ++ ); ++ return { ++ label, ++ elapsedMs, ++ text: result.text, ++ headers: { ++ limit: getHeader(headers, "x-ratelimit-limit-requests"), ++ remaining: getHeader(headers, "x-ratelimit-remaining-requests"), ++ reset: getHeader(headers, "x-ratelimit-reset-requests"), ++ }, ++ }; ++ }) ++ .catch((error) => { ++ const elapsedMs = Date.now() - enqueuedAt; ++ console.log( ++ `[${now()}] failed ${label} in ${elapsedMs}ms name=${error?.name ?? "Error"} status=${error?.status ?? error?.statusCode ?? 
"n/a"} msg=${error?.message}`, ++ ); ++ throw error; ++ }); ++ }); ++ ++ const settled = await Promise.allSettled(jobs); ++ ++ console.log(`\n[done] settled=${safeStringify(settled.map((s) => s.status))}`); ++ console.log( ++ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-rate-limit-openai-window-sim.ts b/tmp/test/traffic-rate-limit-openai-window-sim.ts +new file mode 100644 +index 00000000..35232faa +--- /dev/null ++++ b/tmp/test/traffic-rate-limit-openai-window-sim.ts +@@ -0,0 +1,247 @@ ++// @ts-nocheck ++/** ++ * Manual test (real network): Simulate OpenAI "window remaining + reset" semantics and watch ++ * TrafficController pace + probe behavior via logs. ++ * ++ * Why "simulate"? ++ * - Real OpenAI headers usually show very large remaining values, so pacing is hard to observe. ++ * - This script still hits the real OpenAI model, but it drives the controller state using ++ * synthetic `x-ratelimit-*` headers to force a small window (e.g. remaining=3, reset=30s). ++ * ++ * What this demonstrates (matches your Step 1–7): ++ * 1) We seed controller with remaining + reset window. ++ * 2) We enqueue many requests. ++ * 3) Controller subtracts `reserved` from `remaining` to avoid stampedes. ++ * 4) When `effectiveRemaining <= 1`, controller waits until `resetAt + probeDelay`. ++ * 5) When room exists, controller paces using `nextAllowedAt`. ++ * 6) When a request finishes, we release reservation (controller) and apply new headers (this script). ++ * 7) After reset, controller sends a probe even when remaining==0; probe "fetches" fresh headers and flow resumes. ++ * ++ * Prereqs: ++ * - Set `OPENAI_API_KEY` ++ * ++ * Suggested logging: ++ * - `VOLTAGENT_LOG_LEVEL=trace` (to see traffic controller internals) ++ * ++ * Run: ++ * - VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts ++ * ++ * Optional env: ++ * - OPENAI_MODEL (default: gpt-4o-mini) ++ * - WINDOW_SECONDS (default: 30) ++ * - REMAINING (default: 3) ++ * - REQUESTS (default: 10) ++ * - MAX_CONCURRENT (default: 50) ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { TrafficController } from "../../packages/core/dist/index.js"; ++ ++const apiKey = process.env.OPENAI_API_KEY; ++if (!apiKey) { ++ console.error("Missing OPENAI_API_KEY. Example:"); ++ console.error( ++ " VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts", ++ ); ++ process.exit(1); ++} ++ ++const now = () => new Date().toISOString(); ++ ++const modelId = process.env.OPENAI_MODEL ?? "gpt-4o-mini"; ++const windowSecondsRaw = Number(process.env.WINDOW_SECONDS ?? "30"); ++const remainingRaw = Number(process.env.REMAINING ?? "3"); ++const requestsRaw = Number(process.env.REQUESTS ?? "10"); ++const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50"); ++ ++const windowSeconds = ++ Number.isFinite(windowSecondsRaw) && windowSecondsRaw > 0 ? windowSecondsRaw : 30; ++const initialRemaining = ++ Number.isFinite(remainingRaw) && remainingRaw > 0 ? Math.floor(remainingRaw) : 3; ++const requestCount = Number.isFinite(requestsRaw) && requestsRaw > 0 ? Math.floor(requestsRaw) : 10; ++const maxConcurrent = ++ Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? 
Math.floor(maxConcurrentRaw) : 50; ++ ++const provider = "openai"; ++const tenantId = "openai-window-sim"; ++const windowMs = Math.round(windowSeconds * 1000); ++ ++async function callOpenAIResponses(label: string): Promise<{ ++ status: number; ++ headers: Record; ++ textPreview: string; ++}> { ++ const url = "https://api.openai.com/v1/responses"; ++ const body = safeStringify({ ++ model: modelId, ++ input: `Reply with only: ${label}`, ++ max_output_tokens: 16, ++ }); ++ ++ const startedAt = Date.now(); ++ const res = await fetch(url, { ++ method: "POST", ++ headers: { ++ authorization: `Bearer ${apiKey}`, ++ "content-type": "application/json", ++ }, ++ body, ++ }); ++ ++ const limit = res.headers.get("x-ratelimit-limit-requests") ?? undefined; ++ const remaining = res.headers.get("x-ratelimit-remaining-requests") ?? undefined; ++ const reset = res.headers.get("x-ratelimit-reset-requests") ?? undefined; ++ ++ if (!res.ok) { ++ const text = await res.text().catch(() => ""); ++ throw new Error( ++ `OpenAI error status=${res.status} elapsedMs=${Date.now() - startedAt} body=${text.slice(0, 280)}`, ++ ); ++ } ++ ++ const data: any = await res.json(); ++ const outputText = ++ data?.output?.[0]?.content?.find?.((c: any) => c?.type === "output_text")?.text ?? ++ data?.output_text ?? ++ data?.output?.[0]?.content?.[0]?.text ?? ++ ""; ++ ++ return { ++ status: res.status, ++ headers: { ++ "x-ratelimit-limit-requests": limit, ++ "x-ratelimit-remaining-requests": remaining, ++ "x-ratelimit-reset-requests": reset, ++ }, ++ textPreview: String(outputText).slice(0, 80), ++ }; ++} ++ ++async function main() { ++ console.log( ++ `\n=== OpenAI real + synthetic window rate limit (provider=${provider}, model=${modelId}) ===`, ++ ); ++ console.log( ++ `[config] maxConcurrent=${maxConcurrent} windowSeconds=${windowSeconds} initialRemaining=${initialRemaining} requests=${requestCount}`, ++ ); ++ console.log( ++ "[hint] Set VOLTAGENT_LOG_LEVEL=trace to see TrafficController internals (reserved/effectiveRemaining/nextAllowedAt).", ++ ); ++ ++ const controller = new TrafficController({ maxConcurrent }); ++ ++ // --- Step 1: seed "remaining + reset window" into controller --- ++ let windowResetAt = Date.now() + windowMs; ++ let remainingInWindow = initialRemaining; ++ ++ const applySyntheticHeaders = (source: string) => { ++ const resetMs = Math.max(1, windowResetAt - Date.now()); ++ const applied = controller.updateRateLimitFromHeaders( ++ { provider, model: modelId, tenantId }, ++ { ++ "x-ratelimit-limit-requests": String(initialRemaining), ++ "x-ratelimit-remaining-requests": String(Math.max(0, remainingInWindow)), ++ "x-ratelimit-reset-requests": `${resetMs}ms`, ++ }, ++ ); ++ console.log( ++ `[${now()}] [synthetic] source=${source} remaining=${remainingInWindow} resetInMs=${resetMs} applied=${safeStringify( ++ applied && { ++ key: applied.key, ++ state: { ++ remaining: applied.state.remaining, ++ reserved: applied.state.reserved, ++ resetAt: applied.state.resetAt, ++ nextAllowedAt: applied.state.nextAllowedAt, ++ }, ++ }, ++ )}`, ++ ); ++ }; ++ ++ applySyntheticHeaders("seed"); ++ ++ console.log("\n[seed] Making one real request to confirm connectivity + show real headers..."); ++ const seed = await callOpenAIResponses("seed"); ++ console.log( ++ `[${now()}] [seed] ok status=${seed.status} text=${seed.textPreview} realHeaders=${safeStringify( ++ seed.headers, ++ )}`, ++ ); ++ ++ console.log(`\n[burst] Enqueueing ${requestCount} controller-managed requests...`); ++ ++ const jobs = Array.from({ length: 
requestCount }, (_, index) => { ++ const label = `req-${index + 1}`; ++ const enqueuedAt = Date.now(); ++ console.log(`[${now()}] [enqueue] ${label}`); ++ ++ return controller ++ .handleText({ ++ tenantId, ++ metadata: { ++ tenantId, ++ provider, ++ model: modelId, ++ priority: "P1", ++ agentName: "openai-window-sim", ++ agentId: label, ++ }, ++ execute: async () => { ++ const startedAt = Date.now(); ++ console.log(`[${now()}] [execute-start] ${label}`); ++ ++ const result = await callOpenAIResponses(label); ++ ++ console.log( ++ `[${now()}] [execute-end] ${label} elapsedMs=${Date.now() - startedAt} realHeaders=${safeStringify( ++ result.headers, ++ )}`, ++ ); ++ ++ // --- Step 6: decrement remaining + apply new "headers" --- ++ const nowMs = Date.now(); ++ if (nowMs >= windowResetAt) { ++ // --- Step 7: reset happened; probe request fetched "fresh" headers for the next window --- ++ console.log( ++ `[${now()}] [reset] window elapsed; starting new synthetic window (windowSeconds=${windowSeconds})`, ++ ); ++ windowResetAt = nowMs + windowMs; ++ remainingInWindow = initialRemaining; ++ } ++ ++ remainingInWindow = Math.max(0, remainingInWindow - 1); ++ applySyntheticHeaders("response"); ++ ++ return result; ++ }, ++ }) ++ .then((r) => { ++ const totalElapsedMs = Date.now() - enqueuedAt; ++ console.log( ++ `[${now()}] [done] ${label} totalElapsedMs=${totalElapsedMs} text=${r.textPreview}`, ++ ); ++ return { label, totalElapsedMs, status: "fulfilled" as const }; ++ }) ++ .catch((error: any) => { ++ const totalElapsedMs = Date.now() - enqueuedAt; ++ console.log( ++ `[${now()}] [fail] ${label} totalElapsedMs=${totalElapsedMs} name=${error?.name ?? "Error"} msg=${ ++ error?.message ?? String(error) ++ }`, ++ ); ++ return { label, totalElapsedMs, status: "rejected" as const }; ++ }); ++ }); ++ ++ const settled = await Promise.all(jobs); ++ console.log(`\n[done] settled=${safeStringify(settled.map((s) => s.status))}`); ++ console.log( ++ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-rate-limit-static.ts b/tmp/test/traffic-rate-limit-static.ts +new file mode 100644 +index 00000000..3f91d5bb +--- /dev/null ++++ b/tmp/test/traffic-rate-limit-static.ts +@@ -0,0 +1,149 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController window-based rate limiting (simulated OpenAI headers). ++ * ++ * What to look for: ++ * - Requests should be paced out across the window (no steady "refill" math). ++ * - If responses arrive out-of-order, remaining headers might "increase"; controller should ++ * keep remaining monotonic within the same window. 
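++ *
++ * Back-of-envelope pacing (an assumed model of the controller, using the
++ * defaults LIMIT=6, WINDOW_MS=3000):
++ *   gapMs ≈ windowMs / max(1, remaining)   → 3000 / 6 = 500ms between starts
++ * so a burst of 10 jobs should show ~6 starts spread across the window and a
++ * pause until the reset before the remaining 4 run.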
++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-rate-limit-static.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-rate-limit-static.ts ++ * ++ * Optional env: ++ * - LIMIT=6 WINDOW_MS=3000 pnpm ts-node tmp/test/traffic-rate-limit-static.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++const provider = "sim"; ++const model = "rate-limited-model"; ++const key = `${provider}::${model}`; ++ ++const controller = getTrafficController({ maxConcurrent: 50 }); ++ ++const limit = Number(process.env.LIMIT ?? 6); ++const windowMs = Number(process.env.WINDOW_MS ?? 3000); ++let windowStartAt = Date.now(); ++let windowResetAt = windowStartAt + windowMs; ++let usedInWindow = 0; ++ ++function extractLabel(prompt: any): string { ++ if (!Array.isArray(prompt)) { ++ return "unknown"; ++ } ++ ++ for (let index = prompt.length - 1; index >= 0; index -= 1) { ++ const message = prompt[index]; ++ if (!message || message.role !== "user" || !Array.isArray(message.content)) { ++ continue; ++ } ++ ++ const textPart = message.content.find((part: any) => part?.type === "text"); ++ if (textPart?.text) { ++ return String(textPart.text); ++ } ++ } ++ ++ return "unknown"; ++} ++ ++async function main() { ++ console.log( ++ `\n=== Window rate limit for ${key} (limit=${limit}, windowMs=${windowMs}, jobs=10) ===`, ++ ); ++ ++ const seeded = controller.updateRateLimitFromHeaders( ++ { provider, model }, ++ { ++ "x-ratelimit-limit-requests": String(limit), ++ "x-ratelimit-remaining-requests": String(limit), ++ "x-ratelimit-reset-requests": `${windowMs}ms`, ++ }, ++ ); ++ console.log(`[seed] updateRateLimitFromHeaders=${safeStringify(seeded)}`); ++ ++ let calls = 0; ++ let lastStartAt = 0; ++ const rateLimitedModel = { ++ specificationVersion: "v2", ++ provider, ++ modelId: model, ++ doGenerate: async (options: any) => { ++ const simulatedLatencyMs = 10 + Math.floor(Math.random() * 120); ++ const nowMs = Date.now(); ++ if (nowMs >= windowResetAt) { ++ windowStartAt = nowMs; ++ windowResetAt = windowStartAt + windowMs; ++ usedInWindow = 0; ++ } ++ ++ calls += 1; ++ usedInWindow += 1; ++ const startAt = Date.now(); ++ const delta = lastStartAt ? 
startAt - lastStartAt : 0; ++ lastStartAt = startAt; ++ ++ const label = extractLabel(options?.prompt); ++ console.log( ++ `[${now()}] doGenerate start call=${calls} (+${delta}ms) input=${label} latencyMs=${simulatedLatencyMs}`, ++ ); ++ await sleep(simulatedLatencyMs); ++ console.log(`[${now()}] doGenerate end input=${label}`); ++ ++ const remainingAfterThis = Math.max(0, limit - usedInWindow); ++ const resetMs = Math.max(1, windowResetAt - Date.now()); ++ return { ++ content: [{ type: "text", text: `ok:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { ++ modelId: model, ++ headers: { ++ "x-ratelimit-limit-requests": String(limit), ++ "x-ratelimit-remaining-requests": String(remainingAfterThis), ++ "x-ratelimit-reset-requests": `${resetMs}ms`, ++ }, ++ }, ++ }; ++ }, ++ }; ++ ++ const agent = new Agent({ ++ name: "traffic-rate-limit-static", ++ instructions: "echo", ++ model: rateLimitedModel, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ const jobs = Array.from({ length: 10 }, (_, idx) => ++ agent.generateText(`req-${idx + 1}`, { ++ tenantId: "default", ++ trafficPriority: "P1", ++ }), ++ ); ++ ++ const settled = await Promise.allSettled(jobs); ++ console.log( ++ `\n[done] results=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), ++ )}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-retry-after.ts b/tmp/test/traffic-retry-after.ts +new file mode 100644 +index 00000000..c0c213eb +--- /dev/null ++++ b/tmp/test/traffic-retry-after.ts +@@ -0,0 +1,245 @@ ++// @ts-nocheck ++/** ++ * Manual test: Retry-After handling (429 retry + 200 OK header ingestion). ++ * ++ * What this exercises: ++ * - Retry-After on 429 errors increases retry delay (TrafficController retry plan). ++ * - Retry-After on successful responses throttles subsequent requests for the same provider::model. 
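++ *
++ * Concrete expectations asserted below: a 429 carrying `Retry-After: 1`
++ * should put >= 1000ms between the two attempt starts, and a 200 carrying
++ * `retry-after: 0.3` should delay the next request to the same
++ * provider::model by >= 300ms.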
++ * ++ * Run: ++ * - pnpm -C packages/core build ++ * - pnpm ts-node tmp/test/traffic-retry-after.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-retry-after.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { ++ Agent, ++ RateLimitedUpstreamError, ++ getTrafficController, ++} from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++function extractLabel(prompt: any): string { ++ if (!Array.isArray(prompt)) { ++ return "unknown"; ++ } ++ ++ for (let index = prompt.length - 1; index >= 0; index -= 1) { ++ const message = prompt[index]; ++ if (!message || message.role !== "user" || !Array.isArray(message.content)) { ++ continue; ++ } ++ ++ const textPart = message.content.find((part: any) => part?.type === "text"); ++ if (textPart?.text) { ++ return String(textPart.text); ++ } ++ } ++ ++ return "unknown"; ++} ++ ++function make429RetryAfterModel(args: { ++ provider: string; ++ modelId: string; ++ retryAfterSeconds: number; ++ mode: "headers" | "typedError"; ++}) { ++ const { provider, modelId, retryAfterSeconds, mode } = args; ++ let calls = 0; ++ const startedAt: number[] = []; ++ ++ return { ++ specificationVersion: "v2", ++ provider, ++ modelId, ++ startedAt, ++ doGenerate: async (options: any) => { ++ calls += 1; ++ const start = Date.now(); ++ startedAt.push(start); ++ ++ const label = extractLabel(options?.prompt); ++ console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`); ++ ++ if (calls === 1) { ++ const retryAfterValue = String(retryAfterSeconds); ++ ++ if (mode === "typedError") { ++ throw new RateLimitedUpstreamError( ++ `rate limited (typed) retry-after=${retryAfterValue}s`, ++ { provider, model: modelId }, ++ Math.round(retryAfterSeconds * 1000), ++ ); ++ } ++ ++ const err: any = new Error(`rate limited (headers) retry-after=${retryAfterValue}s`); ++ err.status = 429; ++ err.response = { ++ status: 429, ++ headers: { ++ "retry-after": retryAfterValue, ++ }, ++ }; ++ throw err; ++ } ++ ++ return { ++ content: [{ type: "text", text: `ok:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++function makeSuccessRetryAfterModel(args: { ++ provider: string; ++ modelId: string; ++ retryAfterSeconds: number; ++ latencyMs: number; ++}) { ++ const { provider, modelId, retryAfterSeconds, latencyMs } = args; ++ let calls = 0; ++ const startedAt: number[] = []; ++ const endedAt: number[] = []; ++ ++ return { ++ specificationVersion: "v2", ++ provider, ++ modelId, ++ startedAt, ++ endedAt, ++ doGenerate: async (options: any) => { ++ calls += 1; ++ const start = Date.now(); ++ startedAt.push(start); ++ ++ const label = extractLabel(options?.prompt); ++ console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`); ++ await sleep(latencyMs); ++ ++ const end = Date.now(); ++ endedAt.push(end); ++ console.log(`[${now()}] [model] ${provider}::${modelId} end call=${calls} input=${label}`); ++ ++ return { ++ content: [{ type: "text", text: `ok:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { ++ modelId, ++ headers: ++ calls === 1 ++ ? 
{ ++ "retry-after": String(retryAfterSeconds), ++ } ++ : {}, ++ }, ++ }; ++ }, ++ }; ++} ++ ++async function test_retryAfterOn429(mode: "headers" | "typedError") { ++ const retryAfterSeconds = 1; ++ const provider = `retry-after-429-${mode}`; ++ const modelId = "ra-429"; ++ const tenantId = `ra-429-${mode}`; ++ ++ const model = make429RetryAfterModel({ provider, modelId, retryAfterSeconds, mode }); ++ const agent = new Agent({ ++ name: `ra-429-${mode}`, ++ instructions: "echo", ++ model, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ console.log(`\n=== Test: Retry-After on 429 (${mode}) ===`); ++ const result = await agent.generateText("hello", { tenantId, trafficPriority: "P1" }); ++ ++ const times = model.startedAt; ++ const deltaMs = times.length >= 2 ? times[1] - times[0] : undefined; ++ ++ console.log( ++ `[result] text=${result.text} calls=${times.length} startedAt=${safeStringify(times)} deltaMs=${deltaMs}`, ++ ); ++ ++ if (deltaMs === undefined || deltaMs < retryAfterSeconds * 1000) { ++ throw new Error( ++ `Expected retry delay >= ${retryAfterSeconds * 1000}ms, got ${deltaMs ?? "n/a"}ms`, ++ ); ++ } ++} ++ ++async function test_retryAfterOnSuccessResponse() { ++ const retryAfterSeconds = 0.3; ++ const provider = "retry-after-200"; ++ const modelId = "ra-200"; ++ const tenantId = "ra-200"; ++ ++ const model = makeSuccessRetryAfterModel({ ++ provider, ++ modelId, ++ retryAfterSeconds, ++ latencyMs: 20, ++ }); ++ ++ const agent = new Agent({ ++ name: "ra-200", ++ instructions: "echo", ++ model, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ console.log("\n=== Test: Retry-After on 200 response headers ==="); ++ const first = agent.generateText("first", { tenantId, trafficPriority: "P1" }); ++ const second = agent.generateText("second", { tenantId, trafficPriority: "P1" }); ++ ++ const [r1, r2] = await Promise.all([first, second]); ++ ++ const end1 = model.endedAt[0]; ++ const start2 = model.startedAt[1]; ++ const enforcedDelayMs = start2 && end1 ? start2 - end1 : undefined; ++ ++ console.log( ++ `[result] texts=${safeStringify([r1.text, r2.text])} startedAt=${safeStringify( ++ model.startedAt, ++ )} endedAt=${safeStringify(model.endedAt)} enforcedDelayMs=${enforcedDelayMs}`, ++ ); ++ ++ if (enforcedDelayMs === undefined || enforcedDelayMs < retryAfterSeconds * 1000) { ++ throw new Error( ++ `Expected rate-limit delay >= ${retryAfterSeconds * 1000}ms, got ${enforcedDelayMs ?? "n/a"}ms`, ++ ); ++ } ++} ++ ++async function main() { ++ // Create controller early so all Agent calls share the same singleton. ++ getTrafficController({ maxConcurrent: 1 }); ++ ++ await test_retryAfterOn429("headers"); ++ await test_retryAfterOn429("typedError"); ++ await test_retryAfterOnSuccessResponse(); ++ ++ console.log("\n[done] All Retry-After manual checks passed."); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-retry-behavior.ts b/tmp/test/traffic-retry-behavior.ts +new file mode 100644 +index 00000000..273af55a +--- /dev/null ++++ b/tmp/test/traffic-retry-behavior.ts +@@ -0,0 +1,169 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController retry behavior via Agent + AI SDK path (stub model). 
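++ *
++ * Expected attempt counts, assuming the retry limits listed below hold:
++ * server-error / rate-limit fail twice and succeed on attempt 3; timeout
++ * fails once and succeeds on attempt 2; bad-request / forbidden make a
++ * single attempt and fail immediately.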
++ * ++ * Scenarios included: ++ * - 5xx retries (up to 3 attempts) ++ * - 429 retries (up to 3 attempts) ++ * - timeout retries (up to 2 attempts) ++ * - non-retriable 4xx does not retry ++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-retry-behavior.ts ++ * ++ * Notes: ++ * - Uses a stub LanguageModel; no network calls. ++ * - Watch the `[model] attempt=...` logs to confirm retries. ++ */ ++ ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++type Scenario = ++ | "server-error" ++ | "rate-limit" ++ | "timeout" ++ | "bad-request" ++ | "forbidden" ++ // Variations to hit different retry-detection branches. ++ | "server-error-status-string" ++ | "server-error-statusCode" ++ | "server-error-response-status" ++ | "server-error-cause-status" ++ | "rate-limit-statusCode" ++ | "timeout-code-only" ++ | "timeout-name-only" ++ | "timeout-message-only" ++ // Variations that should STOP retrying (hit max attempts). ++ | "server-error-exceed-max" ++ | "timeout-exceed-max"; ++ ++type RetryPlan = { ++ failCountBeforeSuccess: number; ++ status?: number | string; ++ statusCode?: number | string; ++ httpStatus?: number | string; ++ responseStatus?: number | string; ++ causeStatus?: number | string; ++ code?: string; ++ name?: string; ++ message?: string; ++}; ++ ++const plans: Record = { ++ "server-error": { failCountBeforeSuccess: 2, status: 500 }, ++ "rate-limit": { failCountBeforeSuccess: 2, status: 429 }, ++ timeout: { failCountBeforeSuccess: 1, status: 408, code: "ETIMEDOUT", message: "timeout" }, ++ "bad-request": { failCountBeforeSuccess: 10, status: 400 }, ++ forbidden: { failCountBeforeSuccess: 10, status: 403 }, ++ "server-error-status-string": { failCountBeforeSuccess: 2, status: "500" }, ++ "server-error-statusCode": { failCountBeforeSuccess: 2, statusCode: 502 }, ++ "server-error-response-status": { failCountBeforeSuccess: 2, responseStatus: 503 }, ++ "server-error-cause-status": { failCountBeforeSuccess: 2, causeStatus: 500 }, ++ "rate-limit-statusCode": { failCountBeforeSuccess: 2, statusCode: 429 }, ++ "timeout-code-only": { failCountBeforeSuccess: 1, code: "timeout" }, ++ "timeout-name-only": { failCountBeforeSuccess: 1, name: "TimeoutError" }, ++ "timeout-message-only": { failCountBeforeSuccess: 1, message: "this is a TIMEOUT" }, ++ "server-error-exceed-max": { failCountBeforeSuccess: 10, status: 500 }, ++ "timeout-exceed-max": { failCountBeforeSuccess: 10, message: "timeout" }, ++}; ++ ++function makeModel(modelId: string, plan: RetryPlan) { ++ let counter = 0; ++ let lastAttemptAt = 0; ++ ++ return { ++ specificationVersion: "v2", ++ provider: "retry-provider", ++ modelId, ++ doGenerate: async () => { ++ counter += 1; ++ const now = Date.now(); ++ const delta = lastAttemptAt ? now - lastAttemptAt : 0; ++ lastAttemptAt = now; ++ ++ console.log(`[model] modelId=${modelId} attempt=${counter} (+${delta}ms)`); ++ ++ if (counter <= plan.failCountBeforeSuccess) { ++ const err: any = new Error(plan.message ?? 
`forced failure ${counter} for ${modelId}`); ++ if (plan.status !== undefined) err.status = plan.status; ++ if (plan.statusCode !== undefined) err.statusCode = plan.statusCode; ++ if (plan.httpStatus !== undefined) err.httpStatus = plan.httpStatus; ++ if (plan.responseStatus !== undefined) err.response = { status: plan.responseStatus }; ++ if (plan.causeStatus !== undefined) err.cause = { status: plan.causeStatus }; ++ if (plan.code !== undefined) err.code = plan.code; ++ if (plan.name !== undefined) err.name = plan.name; ++ throw err; ++ } ++ ++ return { ++ content: [{ type: "text", text: "ok" }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++async function runScenario(name: Scenario) { ++ const plan = plans[name]; ++ const modelId = `retry-${name}`; ++ const model = makeModel(modelId, plan); ++ ++ const agent = new Agent({ ++ name: `RetryAgent-${name}`, ++ instructions: "echo", ++ model, ++ maxOutputTokens: 32, ++ temperature: 0, ++ }); ++ ++ console.log(`\n=== ${name} ===`); ++ try { ++ const result = await agent.generateText(name, { tenantId: "retry-test" }); ++ console.log(`[${name}] succeeded. text=${result.text}`); ++ } catch (err: any) { ++ console.log( ++ `[${name}] failed. status=${err?.status ?? err?.statusCode ?? err?.response?.status ?? "n/a"}`, ++ ); ++ } ++} ++ ++async function main() { ++ // Create controller early so all Agent calls share the same singleton. ++ getTrafficController({ maxConcurrent: 1 }); ++ ++ const runs: Scenario[] = [ ++ "server-error", ++ "rate-limit", ++ "timeout", ++ "bad-request", ++ "forbidden", ++ // Uncomment for additional coverage: ++ // "server-error-status-string", ++ // "server-error-statusCode", ++ // "server-error-response-status", ++ // "server-error-cause-status", ++ // "rate-limit-statusCode", ++ // "timeout-code-only", ++ // "timeout-name-only", ++ // "timeout-message-only", ++ // "server-error-exceed-max", ++ // "timeout-exceed-max", ++ ]; ++ ++ for (const name of runs) { ++ await runScenario(name); ++ } ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-tenant-usage.ts b/tmp/test/traffic-tenant-usage.ts +new file mode 100644 +index 00000000..801d7761 +--- /dev/null ++++ b/tmp/test/traffic-tenant-usage.ts +@@ -0,0 +1,71 @@ ++// @ts-nocheck ++/** ++ * Manual test: Tenant usage aggregation (via Agent → TrafficController). ++ * ++ * What to look for: ++ * - `getTenantUsage(tenantId)` should increase after each agent call. 
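++ *
++ * Worked expectation (the stub reports usage {input: 2, output: 3, total: 5}
++ * per call; assuming the controller simply sums usage): after A1 + A2,
++ * tenant-a ≈ 2 requests / 10 total tokens; after B1, tenant-b ≈ 1 request /
++ * 5 tokens.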
++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-tenant-usage.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++function makeModel(modelId: string) { ++ return { ++ specificationVersion: "v2", ++ provider: "usage-provider", ++ modelId, ++ doGenerate: async () => { ++ return { ++ content: [{ type: "text", text: `ok:${modelId}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 2, outputTokens: 3, totalTokens: 5 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++const controller = getTrafficController({ maxConcurrent: 10 }); ++ ++async function run(label: string, tenantId: string) { ++ const model = makeModel("tenant-usage-model"); ++ const agent = new Agent({ ++ name: `TenantUsageAgent-${label}`, ++ instructions: "echo", ++ model, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ console.log(`\n=== ${label} tenantId=${tenantId} ===`); ++ const result = await agent.generateText(`hello:${label}`, { tenantId }); ++ console.log(`[${label}] text=${result.text}`); ++ ++ const usage = controller.getTenantUsage(tenantId); ++ console.log(`[${label}] controller.getTenantUsage(${tenantId})=${safeStringify(usage)}`); ++} ++ ++async function main() { ++ await run("A1", "tenant-a"); ++ await run("A2", "tenant-a"); ++ await run("B1", "tenant-b"); ++ ++ console.log("\n=== Final usage snapshot ==="); ++ console.log(`tenant-a=${safeStringify(controller.getTenantUsage("tenant-a"))}`); ++ console.log(`tenant-b=${safeStringify(controller.getTenantUsage("tenant-b"))}`); ++ console.log(`default=${safeStringify(controller.getTenantUsage("default"))}`); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-text-vs-stream.ts b/tmp/test/traffic-text-vs-stream.ts +new file mode 100644 +index 00000000..41aa484d +--- /dev/null ++++ b/tmp/test/traffic-text-vs-stream.ts +@@ -0,0 +1,128 @@ ++// @ts-nocheck ++/** ++ * Manual test: Text + stream traffic share the same TrafficController queue. ++ * ++ * What to look for: ++ * - Stream and text requests should respect the same maxConcurrent + priority rules. 
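++ * - With maxConcurrent=1, the queued P0 text request should be dispatched before the
++ *   queued P1 text request once the in-flight stream releases its slot.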
++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-text-vs-stream.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-text-vs-stream.ts ++ */ ++ ++import { ReadableStream } from "node:stream/web"; ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++const controller = getTrafficController({ maxConcurrent: 1 }); ++ ++function extractLabel(prompt: any): string { ++ if (!Array.isArray(prompt)) { ++ return "unknown"; ++ } ++ ++ for (let index = prompt.length - 1; index >= 0; index -= 1) { ++ const message = prompt[index]; ++ if (!message || message.role !== "user" || !Array.isArray(message.content)) { ++ continue; ++ } ++ ++ const textPart = message.content.find((part: any) => part?.type === "text"); ++ if (textPart?.text) { ++ return String(textPart.text); ++ } ++ } ++ ++ return "unknown"; ++} ++ ++async function main() { ++ console.log("\n=== Text vs Stream (shared scheduler) ==="); ++ void controller; ++ ++ const provider = "sim"; ++ const modelId = "shared-queue"; ++ ++ const model = { ++ specificationVersion: "v2", ++ provider, ++ modelId, ++ doGenerate: async (options: any) => { ++ const label = extractLabel(options?.prompt); ++ console.log(`[${now()}] doGenerate start input=${label}`); ++ await sleep(50); ++ console.log(`[${now()}] doGenerate end input=${label}`); ++ return { ++ content: [{ type: "text", text: `text:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ doStream: async (options: any) => { ++ const label = extractLabel(options?.prompt); ++ console.log(`[${now()}] doStream start input=${label}`); ++ ++ // Hold the controller slot for a bit so ordering is visible. 
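++      // While this sleep runs the single slot stays occupied, so the queued requests'
++      // dispatch order below should reflect priority rather than arrival order.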
++      await sleep(400);
++
++      console.log(`[${now()}] doStream ready input=${label}`);
++      const streamId = `text-${label}`;
++      const text = `stream:${label}`;
++
++      const stream = new ReadableStream({
++        start(streamController) {
++          streamController.enqueue({ type: "stream-start", warnings: [] });
++          streamController.enqueue({ type: "text-start", id: streamId });
++          streamController.enqueue({ type: "text-delta", id: streamId, delta: text });
++          streamController.enqueue({ type: "text-end", id: streamId });
++          streamController.enqueue({
++            type: "finish",
++            usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
++            finishReason: "stop",
++          });
++          streamController.close();
++        },
++      });
++
++      return { stream, response: { headers: {} } };
++    },
++  };
++
++  const agent = new Agent({
++    name: "traffic-text-vs-stream",
++    instructions: "echo",
++    model,
++    temperature: 0,
++    maxOutputTokens: 32,
++  });
++
++  const streamP1 = agent.streamText("S1", { tenantId: "default", trafficPriority: "P1" });
++  const textP0 = agent.generateText("T0", { tenantId: "default", trafficPriority: "P0" });
++  const textP1 = agent.generateText("T1", { tenantId: "default", trafficPriority: "P1" });
++
++  const [streamResult, t0, t1] = await Promise.all([streamP1, textP0, textP1]);
++  const streamText = await streamResult.text;
++
++  console.log(
++    `\n[done] results=${safeStringify({
++      streamText,
++      textP0: t0.text,
++      textP1: t1.text,
++    })}`,
++  );
++}
++
++main().catch((error) => {
++  console.error("Fatal error:", error);
++  process.exit(1);
++});
diff --git a/examples/with-client-side-tools/next-env.d.ts b/examples/with-client-side-tools/next-env.d.ts
index 1b3be0840..9edff1c7c 100644
--- a/examples/with-client-side-tools/next-env.d.ts
+++ b/examples/with-client-side-tools/next-env.d.ts
@@ -1,5 +1,6 @@
 /// <reference types="next" />
 /// <reference types="next/image-types/global" />
+import "./.next/types/routes.d.ts";
 
 // NOTE: This file should not be edited
 // see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
diff --git a/examples/with-client-side-tools/tsconfig.json b/examples/with-client-side-tools/tsconfig.json index 3697fcb9b..0fca67d34 100644 --- a/examples/with-client-side-tools/tsconfig.json +++ b/examples/with-client-side-tools/tsconfig.json @@ -1,6 +1,10 @@ { "compilerOptions": { - "lib": ["dom", "dom.iterable", "esnext"], + "lib": [ + "dom", + "dom.iterable", + "esnext" + ], "allowJs": true, "skipLibCheck": true, "strict": true, @@ -11,7 +15,7 @@ "resolveJsonModule": true, "isolatedModules": true, "sourceMap": true, - "jsx": "preserve", + "jsx": "react-jsx", "incremental": true, "plugins": [ { @@ -19,10 +23,20 @@ } ], "paths": { - "@/*": ["./*"] + "@/*": [ + "./*" + ] }, "target": "ES2017" }, - "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], - "exclude": ["node_modules"] + "include": [ + "next-env.d.ts", + "**/*.ts", + "**/*.tsx", + ".next/types/**/*.ts", + ".next/dev/types/**/*.ts" + ], + "exclude": [ + "node_modules" + ] } diff --git a/examples/with-netlify-functions/netlify/functions/voltagent.js b/examples/with-netlify-functions/netlify/functions/voltagent.js new file mode 100644 index 000000000..0ec386b8f --- /dev/null +++ b/examples/with-netlify-functions/netlify/functions/voltagent.js @@ -0,0 +1,4 @@ +import { createNetlifyFunctionHandler } from "@voltagent/serverless-hono"; +import { getVoltAgent } from "../../src/index"; +const voltAgent = getVoltAgent(); +export const handler = createNetlifyFunctionHandler(voltAgent); diff --git a/examples/with-netlify-functions/src/index.js b/examples/with-netlify-functions/src/index.js new file mode 100644 index 000000000..af385b506 --- /dev/null +++ b/examples/with-netlify-functions/src/index.js @@ -0,0 +1,17 @@ +import { openai } from "@ai-sdk/openai"; +import { Agent, VoltAgent } from "@voltagent/core"; +import { serverlessHono } from "@voltagent/serverless-hono"; +import { weatherTool } from "./tools"; +const agent = new Agent({ + name: "netlify-function-agent", + instructions: "Help the user quickly and call tools when needed.", + model: openai("gpt-4o-mini"), + tools: [weatherTool], +}); +const voltAgent = new VoltAgent({ + agents: { agent }, + serverless: serverlessHono(), +}); +export function getVoltAgent() { + return voltAgent; +} diff --git a/examples/with-netlify-functions/src/tools/index.js b/examples/with-netlify-functions/src/tools/index.js new file mode 100644 index 000000000..d1c5bf43b --- /dev/null +++ b/examples/with-netlify-functions/src/tools/index.js @@ -0,0 +1,26 @@ +import { createTool } from "@voltagent/core"; +import z from "zod"; +export const weatherTool = createTool({ + id: "get-weather", + name: "getWeather", + description: "Return a mock weather report for the requested location", + parameters: z.object({ + location: z.string().describe("City or location to look up"), + }), + execute: async ({ location }, context) => { + context?.logger.info(`Fetching weather for ${location}`); + const mockWeatherData = { + location, + temperature: Math.floor(Math.random() * 30) + 5, + condition: ["Sunny", "Cloudy", "Rainy", "Snowy", "Partly Cloudy"][ + Math.floor(Math.random() * 5) + ], + humidity: Math.floor(Math.random() * 60) + 30, + windSpeed: Math.floor(Math.random() * 30), + }; + return { + weather: mockWeatherData, + message: `Current weather in ${location}: ${mockWeatherData.temperature}°C and ${mockWeatherData.condition.toLowerCase()} with ${mockWeatherData.humidity}% humidity and wind speed of ${mockWeatherData.windSpeed} km/h.`, + }; + }, +}); diff --git a/package.json b/package.json 
index 7c80f7c59..7e3ef8ba1 100644 --- a/package.json +++ b/package.json @@ -32,9 +32,10 @@ "publint": "^0.3.8", "rimraf": "^5.0.5", "syncpack": "^13.0.2", + "ts-node": "^10.9.2", "tslib": "^2.3.0", "tsup": "^8.5.0", - "typescript": "^5.8.2", + "typescript": "^5.9.2", "vite": "^7.2.7", "vitest": "^3.2.4" }, diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index 291bdf7fd..84343c041 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -48,6 +48,14 @@ import type { BaseRetriever } from "../retriever/retriever"; import type { Tool, Toolkit } from "../tool"; import { createTool } from "../tool"; import { ToolManager } from "../tool/manager"; +import { + type FallbackChainEntry, + type TrafficPriority, + type TrafficRequest, + type TrafficRequestMetadata, + getTrafficController, +} from "../traffic/traffic-controller"; +import { findHeaders } from "../traffic/traffic-error-utils"; import { randomUUID } from "../utils/id"; import { convertModelMessagesToUIMessages } from "../utils/message-converter"; import { NodeType, createNodeId } from "../utils/node-utils"; @@ -262,8 +270,42 @@ export interface BaseGenerationOptions extends Partial { // Context userId?: string; conversationId?: string; + tenantId?: string; + /** + * Optional key metadata for per-key rate limits. + */ + apiKeyId?: string; + /** + * Optional region metadata for per-region rate limits. + */ + region?: string; + /** + * Optional endpoint metadata for per-endpoint rate limits. + */ + endpoint?: string; + /** + * Optional tenant tier metadata for per-tier rate limits. + */ + tenantTier?: string; context?: ContextInput; elicitation?: (request: unknown) => Promise; + /** + * Optional priority override for scheduling. + * Defaults to agent-level priority when omitted. + */ + trafficPriority?: TrafficPriority; + /** + * Optional maximum time to wait in the queue before timing out. + */ + maxQueueWaitMs?: number; + /** + * Optional task classification for circuit-breaker fallback policies. + */ + taskType?: string; + /** + * Optional explicit fallback policy id. + */ + fallbackPolicyId?: string; // Parent tracking parentAgentId?: string; @@ -303,6 +345,8 @@ export interface BaseGenerationOptions extends Partial { // Provider-specific options providerOptions?: ProviderOptions; + // Optional per-call model override (used for fallbacks) + model?: LanguageModel; // Experimental output (for structured generation) experimental_output?: ReturnType | ReturnType; @@ -347,6 +391,7 @@ export class Agent { readonly voice?: Voice; readonly retriever?: BaseRetriever; readonly supervisorConfig?: SupervisorConfig; + private readonly trafficPriority: TrafficPriority; private readonly context?: Map; private readonly logger: Logger; @@ -372,6 +417,7 @@ export class Agent { this.temperature = options.temperature; this.maxOutputTokens = options.maxOutputTokens; this.maxSteps = options.maxSteps || 5; + this.trafficPriority = options.trafficPriority ?? "P1"; this.stopWhen = options.stopWhen; this.markdown = options.markdown ?? 
false; this.voice = options.voice; @@ -444,6 +490,47 @@ export class Agent { async generateText( input: string | UIMessage[] | BaseMessage[], options?: GenerateTextOptions, + ): Promise { + const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics + const tenantId = this.resolveTenantId(options); + const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { + const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); + const metadata = this.buildTrafficMetadata( + mergedOptions?.model, + mergedOptions, + providerOverride, + ); // Compute once per queued request (including per-call model overrides) + return { + tenantId, + metadata, + maxQueueWaitMs: options?.maxQueueWaitMs, + estimatedTokens: this.estimateTokens(input, mergedOptions), + execute: () => this.executeGenerateText(input, mergedOptions, metadata), // Defer actual execution so controller can schedule it + extractUsage: (result: GenerateTextResultWithContext) => + this.extractUsageFromResponse(result), + createFallbackRequest: (fallbackTarget) => { + if (this.isShortResponseFallback(fallbackTarget)) { + return this.buildShortTextFallbackRequest( + tenantId, + metadata, + mergedOptions, + fallbackTarget.text, + ); + } + const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = + this.resolveFallbackTarget(fallbackTarget); + return buildRequest(fallbackModel, fallbackProvider); + }, + }; + }; + + return controller.handleText(buildRequest(options?.model)); + } + + private async executeGenerateText( + input: string | UIMessage[] | BaseMessage[], + options?: GenerateTextOptions, + trafficMetadata?: TrafficRequestMetadata, ): Promise { const startTime = Date.now(); const oc = this.createOperationContext(input, options); @@ -471,7 +558,7 @@ export class Agent { options, ); - const modelName = this.getModelName(); + const modelName = this.getModelName(model); const contextLimit = options?.contextLimit; // Add model attributes and all options @@ -544,10 +631,20 @@ export class Agent { hooks, maxSteps: userMaxSteps, tools: userTools, + maxQueueWaitMs, + taskType, + fallbackPolicyId, experimental_output, providerOptions, + maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries) + model: _model, // Exclude model so aiSDKOptions doesn't override resolved model ...aiSDKOptions } = options || {}; + void _model; + void _maxRetries; + void maxQueueWaitMs; + void taskType; + void fallbackPolicyId; const llmSpan = this.createLLMSpan(oc, { operation: "generateText", @@ -567,6 +664,11 @@ export class Agent { let result!: GenerateTextResult; try { + methodLogger.info("[AI SDK] Calling generateText", { + messageCount: messages.length, + modelName, + tools: tools ? Object.keys(tools) : [], + }); result = await oc.traceContext.withSpan(llmSpan, () => generateText({ model, @@ -575,7 +677,7 @@ export class Agent { // Default values temperature: this.temperature, maxOutputTokens: this.maxOutputTokens, - maxRetries: 3, + maxRetries: 0, stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps), // User overrides from AI SDK options ...aiSDKOptions, @@ -588,7 +690,15 @@ export class Agent { onStepFinish: this.createStepHandler(oc, options), }), ); + methodLogger.info("[AI SDK] Received generateText result", { + finishReason: result.finishReason, + usage: result.usage ? safeStringify(result.usage) : undefined, + stepCount: result.steps?.length ?? 
0, + rawResult: safeStringify(result), + }); + this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger); } catch (error) { + this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger); finalizeLLMSpan(SpanStatusCode.ERROR, { message: (error as Error).message }); throw error; } @@ -771,6 +881,47 @@ export class Agent { async streamText( input: string | UIMessage[] | BaseMessage[], options?: StreamTextOptions, + ): Promise { + const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent + const tenantId = this.resolveTenantId(options); + const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { + const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); + const metadata = this.buildTrafficMetadata( + mergedOptions?.model, + mergedOptions, + providerOverride, + ); // Compute once per queued request (including per-call model overrides) + return { + tenantId, + metadata, + maxQueueWaitMs: options?.maxQueueWaitMs, + estimatedTokens: this.estimateTokens(input, mergedOptions), + execute: () => this.executeStreamText(input, mergedOptions, metadata), // Actual streaming work happens after the controller dequeues us + extractUsage: (result: StreamTextResultWithContext) => + this.extractUsageFromResponse(result), + createFallbackRequest: (fallbackTarget) => { + if (this.isShortResponseFallback(fallbackTarget)) { + return this.buildShortStreamTextFallbackRequest( + tenantId, + metadata, + mergedOptions, + fallbackTarget.text, + ); + } + const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = + this.resolveFallbackTarget(fallbackTarget); + return buildRequest(fallbackModel, fallbackProvider); + }, + }; + }; + + return controller.handleStream(buildRequest(options?.model)); + } + + private async executeStreamText( + input: string | UIMessage[] | BaseMessage[], + options?: StreamTextOptions, + trafficMetadata?: TrafficRequestMetadata, ): Promise { const startTime = Date.now(); const oc = this.createOperationContext(input, options); @@ -800,7 +951,7 @@ export class Agent { options, ); - const modelName = this.getModelName(); + const modelName = this.getModelName(model); const contextLimit = options?.contextLimit; // Add model attributes to root span if TraceContext exists @@ -868,10 +1019,20 @@ export class Agent { maxSteps: userMaxSteps, tools: userTools, onFinish: userOnFinish, + maxQueueWaitMs, + taskType, + fallbackPolicyId, experimental_output, providerOptions, + maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries) + model: _model, // Exclude model from aiSDKOptions to avoid overriding resolved model ...aiSDKOptions } = options || {}; + void _model; + void _maxRetries; + void maxQueueWaitMs; + void taskType; + void fallbackPolicyId; const guardrailStreamingEnabled = guardrailSet.output.length > 0; @@ -893,7 +1054,13 @@ export class Agent { }, }); const finalizeLLMSpan = this.createLLMSpanFinalizer(llmSpan); + const trafficController = getTrafficController({ logger: this.logger }); + methodLogger.info("[AI SDK] Calling streamText", { + messageCount: messages.length, + modelName, + tools: tools ? 
Object.keys(tools) : [], + }); const result = streamText({ model, messages, @@ -901,7 +1068,7 @@ export class Agent { // Default values temperature: this.temperature, maxOutputTokens: this.maxOutputTokens, - maxRetries: 3, + maxRetries: 0, // Retry via traffic controller to avoid provider-level storms stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps), // User overrides from AI SDK options ...aiSDKOptions, @@ -937,6 +1104,8 @@ export class Agent { modelName: this.getModelName(), }); + this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger); + trafficController.reportStreamFailure(trafficMetadata, actualError); finalizeLLMSpan(SpanStatusCode.ERROR, { message: (actualError as Error)?.message }); // History update removed - using OpenTelemetry only @@ -962,6 +1131,18 @@ export class Agent { .catch(() => {}); }, onFinish: async (finalResult) => { + methodLogger.info("[AI SDK] streamText finished", { + finishReason: finalResult.finishReason, + usage: finalResult.totalUsage ? safeStringify(finalResult.totalUsage) : undefined, + stepCount: finalResult.steps?.length ?? 0, + rawResult: safeStringify(finalResult), + }); + this.updateTrafficControllerRateLimits( + finalResult.response, + trafficMetadata, + methodLogger, + ); + trafficController.reportStreamSuccess(trafficMetadata); const providerUsage = finalResult.usage ? await Promise.resolve(finalResult.usage) : undefined; @@ -1428,6 +1609,49 @@ export class Agent { input: string | UIMessage[] | BaseMessage[], schema: T, options?: GenerateObjectOptions, + ): Promise>> { + const controller = getTrafficController({ logger: this.logger }); + const tenantId = this.resolveTenantId(options); + const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { + const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); + const metadata = this.buildTrafficMetadata( + mergedOptions?.model, + mergedOptions, + providerOverride, + ); // Compute once per queued request (including per-call model overrides) + return { + tenantId, + metadata, + maxQueueWaitMs: options?.maxQueueWaitMs, + estimatedTokens: this.estimateTokens(input, mergedOptions), + execute: () => this.executeGenerateObject(input, schema, mergedOptions, metadata), + extractUsage: (result: GenerateObjectResultWithContext>) => + this.extractUsageFromResponse(result), + createFallbackRequest: (fallbackTarget) => { + if (this.isShortResponseFallback(fallbackTarget)) { + return this.buildShortObjectFallbackRequest( + tenantId, + metadata, + schema, + mergedOptions, + fallbackTarget.text, + ); + } + const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = + this.resolveFallbackTarget(fallbackTarget); + return buildRequest(fallbackModel, fallbackProvider); + }, + }; + }; + + return controller.handleText(buildRequest(options?.model)); + } + + private async executeGenerateObject( + input: string | UIMessage[] | BaseMessage[], + schema: T, + options?: GenerateObjectOptions, + trafficMetadata?: TrafficRequestMetadata, ): Promise>> { const startTime = Date.now(); const oc = this.createOperationContext(input, options); @@ -1452,7 +1676,7 @@ export class Agent { options, ); - const modelName = this.getModelName(); + const modelName = this.getModelName(model); const schemaName = schema.description || "unknown"; // Add model attributes and all options @@ -1510,10 +1734,25 @@ export class Agent { hooks, maxSteps: userMaxSteps, tools: userTools, + taskType, + fallbackPolicyId, + maxQueueWaitMs, providerOptions, + 
maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries) + model: _model, // Exclude model so spread does not override resolved model ...aiSDKOptions } = options || {}; - + void _model; + void _maxRetries; + void taskType; + void fallbackPolicyId; + void maxQueueWaitMs; + + methodLogger.info("[AI SDK] Calling generateObject", { + messageCount: messages.length, + modelName, + schemaName, + }); const result = await generateObject({ model, messages, @@ -1522,7 +1761,7 @@ export class Agent { // Default values maxOutputTokens: this.maxOutputTokens, temperature: this.temperature, - maxRetries: 3, + maxRetries: 0, // User overrides from AI SDK options ...aiSDKOptions, // Provider-specific options @@ -1530,6 +1769,13 @@ export class Agent { // VoltAgent controlled abortSignal: oc.abortController.signal, }); + methodLogger.info("[AI SDK] Received generateObject result", { + finishReason: result.finishReason, + usage: result.usage ? safeStringify(result.usage) : undefined, + warnings: result.warnings, + rawResult: safeStringify(result), + }); + this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger); const usageInfo = convertUsage(result.usage); const finalObject = await executeOutputGuardrails({ @@ -1638,6 +1884,7 @@ export class Agent { context: oc.context, }; } catch (error) { + this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger); await this.flushPendingMessagesOnError(oc).catch(() => {}); return this.handleError(error as Error, oc, options, startTime); } finally { @@ -1655,6 +1902,49 @@ export class Agent { input: string | UIMessage[] | BaseMessage[], schema: T, options?: StreamObjectOptions, + ): Promise>> { + const controller = getTrafficController({ logger: this.logger }); + const tenantId = this.resolveTenantId(options); + const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { + const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); + const metadata = this.buildTrafficMetadata( + mergedOptions?.model, + mergedOptions, + providerOverride, + ); // Compute once per queued request (including per-call model overrides) + return { + tenantId, + metadata, + maxQueueWaitMs: options?.maxQueueWaitMs, + estimatedTokens: this.estimateTokens(input, mergedOptions), + execute: () => this.executeStreamObject(input, schema, mergedOptions, metadata), + extractUsage: (result: StreamObjectResultWithContext>) => + this.extractUsageFromResponse(result), + createFallbackRequest: (fallbackTarget) => { + if (this.isShortResponseFallback(fallbackTarget)) { + return this.buildShortStreamObjectFallbackRequest( + tenantId, + metadata, + schema, + mergedOptions, + fallbackTarget.text, + ); + } + const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = + this.resolveFallbackTarget(fallbackTarget); + return buildRequest(fallbackModel, fallbackProvider); + }, + }; + }; + + return controller.handleStream(buildRequest(options?.model)); + } + + private async executeStreamObject( + input: string | UIMessage[] | BaseMessage[], + schema: T, + options?: StreamObjectOptions, + trafficMetadata?: TrafficRequestMetadata, ): Promise>> { const startTime = Date.now(); const oc = this.createOperationContext(input, options); @@ -1680,7 +1970,7 @@ export class Agent { options, ); - const modelName = this.getModelName(); + const modelName = this.getModelName(model); const schemaName = schema.description || "unknown"; // Add model attributes and all options @@ -1739,14 +2029,30 @@ 
export class Agent { maxSteps: userMaxSteps, tools: userTools, onFinish: userOnFinish, + taskType, + fallbackPolicyId, + maxQueueWaitMs, providerOptions, + maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries) + model: _model, // Exclude model so aiSDKOptions cannot override resolved model ...aiSDKOptions } = options || {}; + void _model; + void _maxRetries; + void taskType; + void fallbackPolicyId; + void maxQueueWaitMs; let guardrailObjectPromise!: Promise>; let resolveGuardrailObject: ((value: z.infer) => void) | undefined; let rejectGuardrailObject: ((reason: unknown) => void) | undefined; + const trafficController = getTrafficController({ logger: this.logger }); + methodLogger.info("[AI SDK] Calling streamObject", { + messageCount: messages.length, + modelName, + schemaName, + }); const result = streamObject({ model, messages, @@ -1755,7 +2061,7 @@ export class Agent { // Default values maxOutputTokens: this.maxOutputTokens, temperature: this.temperature, - maxRetries: 3, + maxRetries: 0, // User overrides from AI SDK options ...aiSDKOptions, // Provider-specific options @@ -1771,9 +2077,11 @@ export class Agent { methodLogger.error("Stream object error occurred", { error: actualError, agentName: this.name, - modelName: this.getModelName(), + modelName: this.getModelName(model), schemaName: schemaName, }); + this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger); + trafficController.reportStreamFailure(trafficMetadata, actualError); // History update removed - using OpenTelemetry only @@ -1800,6 +2108,17 @@ export class Agent { }, onFinish: async (finalResult: any) => { try { + methodLogger.info("[AI SDK] streamObject finished", { + finishReason: finalResult.finishReason, + usage: finalResult.usage ? safeStringify(finalResult.usage) : undefined, + rawResult: safeStringify(finalResult), + }); + this.updateTrafficControllerRateLimits( + finalResult.response, + trafficMetadata, + methodLogger, + ); + trafficController.reportStreamSuccess(trafficMetadata); const usageInfo = convertUsage(finalResult.usage as any); let finalObject = finalResult.object as z.infer; if (guardrailSet.output.length > 0) { @@ -2021,8 +2340,9 @@ export class Agent { // Calculate maxSteps (use provided option or calculate based on subagents) const maxSteps = options?.maxSteps ?? this.calculateMaxSteps(); - // Resolve dynamic values - const model = await this.resolveValue(this.model, oc); + // Resolve dynamic values (allow per-call model override for fallbacks) + const selectedModel = options?.model ?? this.model; + const model = await this.resolveValue(selectedModel, oc); const dynamicToolList = (await this.resolveValue(this.dynamicTools, oc)) || []; // Merge agent tools with option tools @@ -2073,6 +2393,12 @@ export class Agent { ): OperationContext { const operationId = randomUUID(); const startTimeDate = new Date(); + const priority = this.resolveTrafficPriority(options); + const tenantId = this.resolveTenantId(options); + const apiKeyId = options?.apiKeyId ?? options?.parentOperationContext?.apiKeyId; + const region = options?.region ?? options?.parentOperationContext?.region; + const endpoint = options?.endpoint ?? options?.parentOperationContext?.endpoint; + const tenantTier = options?.tenantTier ?? 
options?.parentOperationContext?.tenantTier; // Prefer reusing an existing context instance to preserve reference across calls/subagents const runtimeContext = toContextMap(options?.context); @@ -2123,6 +2449,7 @@ export class Agent { operationId, userId: options?.userId, conversationId: options?.conversationId, + tenantId, executionId: operationId, }); @@ -2137,6 +2464,9 @@ export class Agent { parentAgentId: options?.parentAgentId, input, }); + if (tenantId) { + traceContext.getRootSpan().setAttribute("tenant.id", tenantId); + } traceContext.getRootSpan().setAttribute("voltagent.operation_id", operationId); // Use parent's AbortController if available, otherwise create new one @@ -2174,8 +2504,14 @@ export class Agent { logger, conversationSteps: options?.parentOperationContext?.conversationSteps || [], abortController, + priority, userId: options?.userId, conversationId: options?.conversationId, + tenantId, + apiKeyId, + region, + endpoint, + tenantTier, parentAgentId: options?.parentAgentId, traceContext, startTime: startTimeDate, @@ -3170,6 +3506,20 @@ export class Agent { return value; } + private mergeOptionsWithModel( + options: BaseGenerationOptions | undefined, + modelOverride?: LanguageModel, + ): BaseGenerationOptions | undefined { + if (!options && modelOverride === undefined) { + return undefined; + } + + return { + ...(options ?? {}), + ...(modelOverride !== undefined ? { model: modelOverride } : {}), + }; + } + /** * Prepare tools with execution context */ @@ -3822,17 +4172,622 @@ export class Agent { return this.subAgentManager.calculateMaxSteps(this.maxSteps); } + private resolveTrafficPriority(options?: BaseGenerationOptions): TrafficPriority { + const normalize = (value?: TrafficPriority): TrafficPriority | undefined => { + if (value === "P0" || value === "P1" || value === "P2") { + return value; + } + return undefined; + }; + + const parentPriority = normalize(options?.parentOperationContext?.priority); + const localPriority = normalize(options?.trafficPriority) ?? this.trafficPriority ?? "P1"; + + if (parentPriority) { + return this.pickHigherPriority(parentPriority, localPriority); + } + + return localPriority; + } + + private resolveTenantId(options?: BaseGenerationOptions): string { + const parentTenant = options?.parentOperationContext?.tenantId; + if (parentTenant) { + return parentTenant; + } + + if (options?.tenantId) { + return options.tenantId; + } + + return "default"; + } + + private pickHigherPriority(a: TrafficPriority, b: TrafficPriority): TrafficPriority { + const rank: Record = { P0: 0, P1: 1, P2: 2 }; + return rank[a] <= rank[b] ? a : b; + } + + private buildTrafficMetadata( + modelOverride?: LanguageModel | DynamicValue, + options?: BaseGenerationOptions, + providerOverride?: string, + ): TrafficRequestMetadata { + const provider = + providerOverride ?? + this.resolveProvider(modelOverride) ?? + this.resolveProvider(this.model) ?? + undefined; + const priority = this.resolveTrafficPriority(options); + const apiKeyId = options?.apiKeyId ?? options?.parentOperationContext?.apiKeyId; + const region = options?.region ?? options?.parentOperationContext?.region; + const endpoint = options?.endpoint ?? options?.parentOperationContext?.endpoint; + const tenantTier = options?.tenantTier ?? 
options?.parentOperationContext?.tenantTier; + + return { + agentId: this.id, // Identify which agent issued the request + agentName: this.name, // Human-readable label for logs/metrics + model: this.getModelName(modelOverride), // Used for future capacity policies + provider, // Allows per-provider throttling later + priority, + tenantId: this.resolveTenantId(options), + apiKeyId, + region, + endpoint, + tenantTier, + taskType: options?.taskType, + fallbackPolicyId: options?.fallbackPolicyId, + }; + } + + private estimateTokens( + input: string | UIMessage[] | BaseMessage[], + options?: BaseGenerationOptions, + ): number | undefined { + let text = ""; + if (typeof input === "string") { + text = input; + } else if (Array.isArray(input)) { + text = input + .map((message) => { + if (typeof message === "string") return message; + if (message && typeof message === "object") { + const content = (message as { content?: unknown }).content; + if (typeof content === "string") return content; + if (content !== undefined) return safeStringify(content); + return safeStringify(message); + } + return String(message ?? ""); + }) + .join(" "); + } else if (input) { + text = safeStringify(input); + } + + const inputTokens = text ? Math.ceil(text.length / 4) : 0; + const outputTokensRaw = + typeof options?.maxOutputTokens === "number" ? options.maxOutputTokens : this.maxOutputTokens; + const outputTokens = + typeof outputTokensRaw === "number" && Number.isFinite(outputTokensRaw) + ? Math.max(0, Math.floor(outputTokensRaw)) + : 0; + const total = inputTokens + outputTokens; + return total > 0 ? total : undefined; + } + + private resolveFallbackTarget(target: FallbackChainEntry): { + modelOverride?: LanguageModel; + providerOverride?: string; + } { + if (typeof target === "string") { + return { modelOverride: target }; + } + return { + modelOverride: target.model, + providerOverride: target.provider, + }; + } + + private isShortResponseFallback( + target: FallbackChainEntry, + ): target is { kind: "short-response"; text: string } { + return ( + typeof target === "object" && + target !== null && + "kind" in target && + (target as { kind?: string }).kind === "short-response" + ); + } + + private buildShortResponseMetadata( + baseMetadata: TrafficRequestMetadata | undefined, + ): TrafficRequestMetadata { + const metadata = baseMetadata ?? this.buildTrafficMetadata(); + return { + ...metadata, + provider: "short-response", + model: "short-response", + }; + } + + private createZeroUsage(): LanguageModelUsage { + return { inputTokens: 0, outputTokens: 0, totalTokens: 0 }; + } + + private createShortTextStream(text: string): AsyncIterableStream { + return createAsyncIterableReadable((controller) => { + controller.enqueue(text); + controller.close(); + }); + } + + private createShortFullStream(text: string): AsyncIterableStream { + const usage = this.createZeroUsage(); + const id = `short-response-${randomUUID()}`; + return createAsyncIterableReadable((controller) => { + controller.enqueue({ + type: "text-delta", + id, + delta: text, + text, + } as VoltAgentTextStreamPart); + controller.enqueue({ + type: "finish", + finishReason: "stop", + usage, + totalUsage: usage, + } as VoltAgentTextStreamPart); + controller.close(); + }); + } + + private createShortTextResult( + text: string, + options?: GenerateTextOptions, + ): GenerateTextResultWithContext { + const usage = this.createZeroUsage(); + const context = toContextMap(options?.context) ?? 
new Map(); + const createTextStream = (): AsyncIterableStream => this.createShortTextStream(text); + + return { + text, + content: [], + reasoning: [], + reasoningText: "", + files: [], + sources: [], + toolCalls: [], + staticToolCalls: [], + dynamicToolCalls: [], + toolResults: [], + staticToolResults: [], + dynamicToolResults: [], + usage, + totalUsage: usage, + warnings: [], + finishReason: "stop", + steps: [], + experimental_output: undefined, + response: { + id: "short-response", + modelId: "short-response", + timestamp: new Date(), + messages: [], + }, + context, + request: { + body: {}, + }, + providerMetadata: undefined, + experimental_providerMetadata: undefined, + pipeTextStreamToResponse: (response, init) => { + pipeTextStreamToResponse({ + response, + textStream: createTextStream(), + ...(init ?? {}), + }); + }, + toTextStreamResponse: (init) => { + return createTextStreamResponse({ + textStream: createTextStream(), + ...(init ?? {}), + }); + }, + toDataStream: () => createTextStream(), + toDataStreamResponse: (init) => { + return createTextStreamResponse({ + textStream: createTextStream(), + ...(init ?? {}), + }); + }, + pipeDataStreamToResponse: (response, init) => { + pipeTextStreamToResponse({ + response, + textStream: createTextStream(), + ...(init ?? {}), + }); + }, + } as GenerateTextResultWithContext; + } + + private createShortStreamTextResult( + text: string, + options?: StreamTextOptions, + ): StreamTextResultWithContext { + const usage = this.createZeroUsage(); + const context = toContextMap(options?.context) ?? new Map(); + const createTextStream = (): AsyncIterableStream => this.createShortTextStream(text); + const createFullStream = (): AsyncIterableStream => + this.createShortFullStream(text); + + const toUIMessageStream = (_options?: unknown) => + createUIMessageStream({ + execute: async ({ writer }) => { + writer.write({ type: "text", text } as any); + }, + onError: (error) => String(error), + }); + + const toUIMessageStreamResponse = (options?: ResponseInit) => { + const stream = toUIMessageStream(options); + const responseInit = options ? { ...options } : {}; + return createUIMessageStreamResponse({ + stream, + ...responseInit, + }); + }; + + const pipeUIMessageStreamToResponse = (response: any, init?: ResponseInit) => { + const stream = toUIMessageStream(init); + const initOptions = init ? { ...init } : {}; + pipeUIMessageStreamToResponse({ + response, + stream, + ...initOptions, + }); + }; + + return { + text: Promise.resolve(text), + get textStream() { + return createTextStream(); + }, + get fullStream() { + return createFullStream(); + }, + usage: Promise.resolve(usage), + finishReason: Promise.resolve("stop"), + experimental_partialOutputStream: undefined, + toUIMessageStream: toUIMessageStream as StreamTextResultWithContext["toUIMessageStream"], + toUIMessageStreamResponse: + toUIMessageStreamResponse as StreamTextResultWithContext["toUIMessageStreamResponse"], + pipeUIMessageStreamToResponse: + pipeUIMessageStreamToResponse as StreamTextResultWithContext["pipeUIMessageStreamToResponse"], + pipeTextStreamToResponse: (response, init) => { + pipeTextStreamToResponse({ + response, + textStream: createTextStream(), + ...(init ?? {}), + }); + }, + toTextStreamResponse: (init) => { + return createTextStreamResponse({ + textStream: createTextStream(), + ...(init ?? 
{}), + }); + }, + context, + }; + } + + private resolveShortResponseObject(schema: T, text: string): z.infer { + const candidates: unknown[] = []; + if (text.length > 0) { + try { + candidates.push(JSON.parse(text)); + } catch {} + } + candidates.push(text); + candidates.push({ text }); + for (const candidate of candidates) { + const parsed = schema.safeParse(candidate); + if (parsed.success) { + return parsed.data; + } + } + return (candidates[0] ?? text) as z.infer; + } + + private createShortObjectResult( + schema: T, + text: string, + options?: GenerateObjectOptions, + ): GenerateObjectResultWithContext> { + const object = this.resolveShortResponseObject(schema, text); + const usage = this.createZeroUsage(); + const context = toContextMap(options?.context) ?? new Map(); + + return { + object, + usage, + warnings: [], + finishReason: "stop", + response: { + id: "short-response", + modelId: "short-response", + timestamp: new Date(), + messages: [], + }, + context, + request: { + body: {}, + }, + reasoning: "", + providerMetadata: undefined, + toJsonResponse: (init?: ResponseInit) => { + const responseInit = init ? { ...init } : {}; + const headers = { + "content-type": "application/json", + ...(responseInit.headers ?? {}), + }; + return new Response(safeStringify(object), { + ...responseInit, + headers, + }); + }, + } as GenerateObjectResultWithContext>; + } + + private createShortStreamObjectResult( + schema: T, + text: string, + options?: StreamObjectOptions, + ): StreamObjectResultWithContext> { + const object = this.resolveShortResponseObject(schema, text); + const usage = this.createZeroUsage(); + const context = toContextMap(options?.context) ?? new Map(); + const textPayload = safeStringify(object); + const createTextStream = (): AsyncIterableStream => + this.createShortTextStream(textPayload); + + const partialObjectStream = new ReadableStream>>({ + start(controller) { + controller.enqueue(object); + controller.close(); + }, + }); + + return { + object: Promise.resolve(object), + partialObjectStream, + textStream: createTextStream(), + warnings: Promise.resolve(undefined), + usage: Promise.resolve(usage), + finishReason: Promise.resolve("stop"), + pipeTextStreamToResponse: (response, init) => { + pipeTextStreamToResponse({ + response, + textStream: createTextStream(), + ...(init ?? {}), + }); + }, + toTextStreamResponse: (init) => { + return createTextStreamResponse({ + textStream: createTextStream(), + ...(init ?? 
{}), + }); + }, + context, + }; + } + + private buildShortTextFallbackRequest( + tenantId: string, + metadata: TrafficRequestMetadata | undefined, + options: GenerateTextOptions | undefined, + text: string, + ): TrafficRequest { + const shortMetadata = this.buildShortResponseMetadata(metadata); + return { + tenantId, + metadata: shortMetadata, + maxQueueWaitMs: options?.maxQueueWaitMs, + estimatedTokens: 0, + execute: async () => this.createShortTextResult(text, options), + extractUsage: (result: GenerateTextResultWithContext) => + this.extractUsageFromResponse(result), + }; + } + + private buildShortStreamTextFallbackRequest( + tenantId: string, + metadata: TrafficRequestMetadata | undefined, + options: StreamTextOptions | undefined, + text: string, + ): TrafficRequest { + const shortMetadata = this.buildShortResponseMetadata(metadata); + return { + tenantId, + metadata: shortMetadata, + maxQueueWaitMs: options?.maxQueueWaitMs, + estimatedTokens: 0, + execute: async () => this.createShortStreamTextResult(text, options), + extractUsage: (result: StreamTextResultWithContext) => this.extractUsageFromResponse(result), + }; + } + + private buildShortObjectFallbackRequest( + tenantId: string, + metadata: TrafficRequestMetadata | undefined, + schema: T, + options: GenerateObjectOptions | undefined, + text: string, + ): TrafficRequest>> { + const shortMetadata = this.buildShortResponseMetadata(metadata); + return { + tenantId, + metadata: shortMetadata, + maxQueueWaitMs: options?.maxQueueWaitMs, + estimatedTokens: 0, + execute: async () => this.createShortObjectResult(schema, text, options), + extractUsage: (result: GenerateObjectResultWithContext>) => + this.extractUsageFromResponse(result), + }; + } + + private buildShortStreamObjectFallbackRequest( + tenantId: string, + metadata: TrafficRequestMetadata | undefined, + schema: T, + options: StreamObjectOptions | undefined, + text: string, + ): TrafficRequest>> { + const shortMetadata = this.buildShortResponseMetadata(metadata); + return { + tenantId, + metadata: shortMetadata, + maxQueueWaitMs: options?.maxQueueWaitMs, + estimatedTokens: 0, + execute: async () => this.createShortStreamObjectResult(schema, text, options), + extractUsage: (result: StreamObjectResultWithContext>) => + this.extractUsageFromResponse(result), + }; + } + + private updateTrafficControllerRateLimits( + response: unknown, + metadata: TrafficRequestMetadata | undefined, + logger?: Logger, + ): void { + const headerCandidates = findHeaders(response); + if (headerCandidates.length === 0) { + logger?.debug?.("[Traffic] No headers found for rate limit update"); + return; + } + + const controller = getTrafficController(); + const effectiveMetadata = metadata ?? 
this.buildTrafficMetadata();
+    let updateResult: ReturnType<typeof controller.updateRateLimitFromHeaders> | undefined;
+    for (const headers of headerCandidates) {
+      updateResult = controller.updateRateLimitFromHeaders(effectiveMetadata, headers);
+      if (updateResult) break;
+    }
+
+    if (!updateResult) {
+      logger?.debug?.("[Traffic] No rate limit headers applied from response");
+      return;
+    }
+
+    const now = Date.now();
+    const effectiveRemaining = Math.max(
+      0,
+      updateResult.state.remaining - updateResult.state.reserved,
+    );
+    const resetInMs = Math.max(0, updateResult.state.resetAt - now);
+    const nextAllowedInMs = Math.max(0, updateResult.state.nextAllowedAt - now);
+    logger?.info?.("[Traffic] Applied rate limit from response headers", {
+      rateLimitKey: updateResult.key,
+      limit: updateResult.state.limit,
+      remaining: updateResult.state.remaining,
+      reserved: updateResult.state.reserved,
+      effectiveRemaining,
+      resetAt: updateResult.state.resetAt,
+      resetInMs,
+      nextAllowedAt: updateResult.state.nextAllowedAt,
+      nextAllowedInMs,
+      headers: {
+        limitRequests: updateResult.headerSnapshot.limitRequests,
+        remainingRequests: updateResult.headerSnapshot.remainingRequests,
+        resetRequestsMs: updateResult.headerSnapshot.resetRequestsMs,
+      },
+    });
+  }
+
+  private extractUsageFromResponse(
+    result:
+      | {
+          usage?: LanguageModelUsage | Promise<LanguageModelUsage>;
+          totalUsage?: LanguageModelUsage | Promise<LanguageModelUsage>;
+        }
+      | undefined,
+  ): Promise<LanguageModelUsage | undefined> | LanguageModelUsage | undefined {
+    if (!result) {
+      return undefined;
+    }
+
+    const usageCandidate =
+      (result as { totalUsage?: LanguageModelUsage | Promise<LanguageModelUsage> })
+        ?.totalUsage ??
+      (result as { usage?: LanguageModelUsage | Promise<LanguageModelUsage> })?.usage;
+
+    if (!usageCandidate) {
+      return undefined;
+    }
+
+    const normalizeUsage = (
+      usage: LanguageModelUsage | undefined,
+    ): LanguageModelUsage | undefined => {
+      if (!usage) return undefined;
+      const input = Number.isFinite(usage.inputTokens) ? (usage.inputTokens as number) : undefined;
+      const output = Number.isFinite(usage.outputTokens)
+        ? (usage.outputTokens as number)
+        : undefined;
+      const total = Number.isFinite(usage.totalTokens) ? (usage.totalTokens as number) : undefined;
+
+      if (total === undefined && input === undefined && output === undefined) {
+        return undefined;
+      }
+
+      const safeInput = input ?? 0;
+      const safeOutput = output ?? 0;
+      const safeTotal = total ?? safeInput + safeOutput;
+
+      return {
+        ...usage,
+        inputTokens: safeInput,
+        outputTokens: safeOutput,
+        totalTokens: safeTotal,
+      };
+    };
+
+    if (
+      typeof (usageCandidate as PromiseLike<LanguageModelUsage>).then === "function"
+    ) {
+      return (usageCandidate as Promise<LanguageModelUsage>)
+        .then((usage) => normalizeUsage(usage))
+        .catch(() => undefined);
+    }
+
+    return normalizeUsage(usageCandidate as LanguageModelUsage);
+  }
+
+  private resolveProvider(
+    model: LanguageModel | DynamicValue<LanguageModel> | undefined,
+  ): string | undefined {
+    if (
+      model &&
+      typeof model === "object" &&
+      "provider" in model &&
+      typeof (model as any).provider === "string"
+    ) {
+      return (model as any).provider;
+    }
+
+    return undefined;
+  }
+
   /**
    * Get the model name
    */
-  public getModelName(): string {
-    if (typeof this.model === "function") {
+  public getModelName(modelOverride?: LanguageModel | DynamicValue<LanguageModel>): string {
+    const selectedModel = modelOverride ??
this.model; + if (typeof selectedModel === "function") { return "dynamic"; } - if (typeof this.model === "string") { - return this.model; + if (typeof selectedModel === "string") { + return selectedModel; } - return this.model.modelId || "unknown"; + return selectedModel.modelId || "unknown"; } /** diff --git a/packages/core/src/agent/eval.ts b/packages/core/src/agent/eval.ts index 9e4fe9f2e..de7125058 100644 --- a/packages/core/src/agent/eval.ts +++ b/packages/core/src/agent/eval.ts @@ -711,6 +711,7 @@ function buildEvalPayload( rawOutput: output, userId: oc.userId, conversationId: oc.conversationId, + tenantId: oc.tenantId, traceId: spanContext.traceId, spanId: spanContext.spanId, metadata, diff --git a/packages/core/src/agent/types.ts b/packages/core/src/agent/types.ts index dd5fb29d2..c70bd478e 100644 --- a/packages/core/src/agent/types.ts +++ b/packages/core/src/agent/types.ts @@ -29,6 +29,7 @@ import type { Logger } from "@voltagent/internal"; import type { LocalScorerDefinition, SamplingPolicy } from "../eval/runtime"; import type { MemoryOptions, MemoryStorageMetadata, WorkingMemorySummary } from "../memory/types"; import type { VoltAgentObservability } from "../observability"; +import type { TrafficPriority } from "../traffic/traffic-controller"; import type { DynamicValue, DynamicValueOptions, @@ -456,6 +457,11 @@ export type AgentOptions = { temperature?: number; maxOutputTokens?: number; maxSteps?: number; + /** + * Default scheduling priority for this agent's LLM calls. + * Defaults to P1 when unspecified. + */ + trafficPriority?: TrafficPriority; /** * Default stop condition for step execution (ai-sdk `stopWhen`). * Per-call `stopWhen` in method options overrides this. @@ -493,6 +499,7 @@ export interface AgentEvalPayload { rawOutput?: unknown; userId?: string; conversationId?: string; + tenantId?: string; traceId: string; spanId: string; metadata?: Record; @@ -890,6 +897,21 @@ export type OperationContext = { /** Optional conversation identifier associated with this operation */ conversationId?: string; + /** Optional tenant identifier propagated across nested operations */ + tenantId?: string; + + /** Optional key identifier for per-key traffic limits */ + apiKeyId?: string; + + /** Optional region identifier for per-region traffic limits */ + region?: string; + + /** Optional endpoint identifier for per-endpoint traffic limits */ + endpoint?: string; + + /** Optional tenant tier identifier for per-tier traffic limits */ + tenantTier?: string; + /** User-managed context map for this operation */ readonly context: Map; @@ -914,6 +936,9 @@ export type OperationContext = { /** Conversation steps for building full message history including tool calls/results */ conversationSteps?: StepWithContent[]; + /** Scheduling priority propagated from parent calls */ + priority?: TrafficPriority; + /** AbortController for cancelling the operation and accessing the signal */ abortController: AbortController; diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 8753f0391..9dee43331 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -21,6 +21,30 @@ export type { WorkflowTimelineEvent, RegisteredWorkflow, } from "./workflow"; +export { + // Surface traffic controller so downstream consumers can route agent calls through the shared scheduler + TrafficController, + CircuitBreakerOpenError, + QueueWaitTimeoutError, + RateLimitedUpstreamError, + getTrafficController, + type FallbackChainEntry, + type FallbackPolicy, + type FallbackPolicyConfig, + 
type FallbackPolicyMode, + type FallbackTarget, + type RateLimitConfig, + type RateLimitKey, + type RateLimitOptions, + type AdaptiveLimiterConfig, + type PriorityWeights, + type PriorityBurstLimits, + type TrafficRequest, + type TrafficRequestMetadata, + type TrafficResponseMetadata, + type TrafficPriority, + type TrafficRequestType, +} from "./traffic/traffic-controller"; // Export new Agent from agent.ts export { Agent, diff --git a/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts new file mode 100644 index 000000000..652b7e59a --- /dev/null +++ b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts @@ -0,0 +1,243 @@ +import type { Logger } from "../../logger"; +import { + RATE_LIMIT_EXHAUSTION_BUFFER, + RATE_LIMIT_MIN_PACE_INTERVAL_MS, + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS, + RATE_LIMIT_PROBE_DELAY_MS, +} from "../traffic-constants"; +import type { + DispatchDecision, + QueuedRequest, + RateLimitWindowState, +} from "../traffic-controller-internal"; +import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils"; +import type { TrafficRequestMetadata } from "../traffic-types"; +import type { + RateLimitHeaderSnapshot, + RateLimitStrategy, + RateLimitUpdateResult, +} from "./rate-limit-strategy"; +import { parseResetDurationToMs } from "./rate-limit-utils"; + +export class DefaultRateLimitStrategy implements RateLimitStrategy { + private state?: RateLimitWindowState; + private readonly key: string; + + constructor(key: string) { + this.key = key; + } + + resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const state = this.state; + if (!state) { + rateLimitLogger?.trace?.("Rate limit state missing; allow request", { + rateLimitKey: this.key, + }); + return null; + } + + const now = Date.now(); + const effectiveRemaining = Math.max(0, state.remaining - state.reserved); + const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; + + if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { + if (now < probeAt) { + rateLimitLogger?.debug?.("Rate limit exhausted; waiting for probe", { + rateLimitKey: this.key, + remaining: state.remaining, + reserved: state.reserved, + effectiveRemaining, + resetAt: state.resetAt, + probeAt, + }); + return { kind: "wait", wakeUpAt: probeAt }; + } + if (state.reserved > 0) { + rateLimitLogger?.debug?.("Rate limit exhausted but in-flight reservations exist; waiting", { + rateLimitKey: this.key, + remaining: state.remaining, + reserved: state.reserved, + effectiveRemaining, + resetAt: state.resetAt, + }); + return { kind: "wait" }; + } + } + + if (now < state.nextAllowedAt) { + rateLimitLogger?.debug?.("Rate limit pacing; waiting until nextAllowedAt", { + rateLimitKey: this.key, + nextAllowedAt: state.nextAllowedAt, + resetAt: state.resetAt, + waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now, + }); + return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; + } + + state.reserved += 1; + next.rateLimitKey = this.key; + rateLimitLogger?.trace?.("Reserved rate limit token", { + rateLimitKey: this.key, + reserved: state.reserved, + remaining: state.remaining, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + }); + + const remainingWindowMs = Math.max(0, state.resetAt - now); + const intervalMs = Math.max( + RATE_LIMIT_MIN_PACE_INTERVAL_MS, + 
Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), + ); + + const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); + if ( + state.nextAllowedAt <= now || + candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS + ) { + state.nextAllowedAt = candidateNext; + rateLimitLogger?.trace?.("Updated pacing nextAllowedAt", { + rateLimitKey: this.key, + nextAllowedAt: state.nextAllowedAt, + intervalMs, + remainingWindowMs, + effectiveRemaining, + }); + } + + return null; + } + + onDispatch(_logger?: Logger): void {} + + onComplete(logger?: Logger): void { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const state = this.state; + if (!state || state.reserved <= 0) return; + state.reserved -= 1; + rateLimitLogger?.trace?.("Released rate limit reservation", { + rateLimitKey: this.key, + reserved: state.reserved, + remaining: state.remaining, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + }); + } + + updateFromHeaders( + _metadata: TrafficRequestMetadata | undefined, + headers: unknown, + logger?: Logger, + ): RateLimitUpdateResult | undefined { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const limitRequests = readHeaderValue(headers, "x-ratelimit-limit-requests"); + const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests"); + const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests"); + const retryAfter = readHeaderValue(headers, "retry-after"); + const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined; + + const now = Date.now(); + const existing = this.state; + let state: RateLimitWindowState | undefined; + let headerSnapshot: RateLimitHeaderSnapshot | undefined; + + if (limitRequests && remainingRequests && resetRequests) { + const limit = Number(limitRequests); + const remaining = Number(remainingRequests); + if (!Number.isFinite(limit) || !Number.isFinite(remaining)) { + rateLimitLogger?.debug?.("Invalid rate limit numeric headers; skipping", { + rateLimitKey: this.key, + limitRequests, + remainingRequests, + }); + return undefined; + } + + const resetRequestsMs = parseResetDurationToMs(resetRequests); + if (resetRequestsMs === undefined) { + rateLimitLogger?.debug?.("Unable to parse reset duration; skipping", { + rateLimitKey: this.key, + resetRequests, + }); + return undefined; + } + + const parsedResetAt = now + resetRequestsMs; + const isSameWindow = !!existing && now < existing.resetAt; + const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; + const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; + const reserved = Math.max(0, existing?.reserved ?? 0); + + state = { + limit, + remaining: isSameWindow ? 
Math.min(existing.remaining, remaining) : remaining, + resetAt, + reserved, + nextAllowedAt, + }; + headerSnapshot = { + limitRequests, + remainingRequests, + resetRequests, + resetRequestsMs, + }; + } else if (retryAfterMs === undefined) { + rateLimitLogger?.trace?.("Missing rate limit headers; skipping", { + rateLimitKey: this.key, + hasLimit: !!limitRequests, + hasRemaining: !!remainingRequests, + hasReset: !!resetRequests, + hasRetryAfter: !!retryAfter, + }); + return undefined; + } + + if (!state) { + if (retryAfterMs === undefined) { + rateLimitLogger?.trace?.("Retry-After missing or unparsable; skipping", { + rateLimitKey: this.key, + retryAfter, + }); + return undefined; + } + const targetAt = now + retryAfterMs; + const isSameWindow = !!existing && now < existing.resetAt; + state = { + limit: existing?.limit ?? 1, + remaining: 0, + resetAt: isSameWindow ? Math.max(existing.resetAt, targetAt) : targetAt, + reserved: Math.max(0, existing?.reserved ?? 0), + nextAllowedAt: Math.max(existing?.nextAllowedAt ?? now, targetAt), + }; + headerSnapshot = { retryAfter, retryAfterMs }; + } else if (retryAfterMs !== undefined) { + const targetAt = now + retryAfterMs; + state = { + ...state, + remaining: 0, + resetAt: Math.max(state.resetAt, targetAt), + nextAllowedAt: Math.max(state.nextAllowedAt, targetAt), + }; + headerSnapshot = { ...headerSnapshot, retryAfter, retryAfterMs }; + } + + this.state = state; + rateLimitLogger?.debug?.("Applied rate limit headers to state", { + rateLimitKey: this.key, + limit: state.limit, + remaining: state.remaining, + effectiveRemaining: Math.max(0, state.remaining - state.reserved), + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + resetRequestsMs: headerSnapshot?.resetRequestsMs, + retryAfterMs: headerSnapshot?.retryAfterMs, + }); + + return { + key: this.key, + headerSnapshot: headerSnapshot ?? 
{}, + state, + }; + } +} diff --git a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts new file mode 100644 index 000000000..7cca0d260 --- /dev/null +++ b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts @@ -0,0 +1,379 @@ +import type { Logger } from "../../logger"; +import { + RATE_LIMIT_EXHAUSTION_BUFFER, + RATE_LIMIT_MIN_PACE_INTERVAL_MS, + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS, + RATE_LIMIT_PROBE_DELAY_MS, +} from "../traffic-constants"; +import type { + DispatchDecision, + QueuedRequest, + RateLimitWindowState, +} from "../traffic-controller-internal"; +import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils"; +import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types"; +import { DefaultRateLimitStrategy } from "./default-rate-limit-strategy"; +import type { + RateLimitStrategy, + RateLimitUpdateResult, + RateLimitUsage, +} from "./rate-limit-strategy"; +import { parseResetDurationToMs } from "./rate-limit-utils"; + +export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { + readonly handlesTokenLimits = true; + private readonly window: DefaultRateLimitStrategy; + private readonly key: string; + private readonly requestsPerMinute?: number; + private readonly tokensPerMinute?: number; + private requestState?: RateLimitWindowState; + private tokenState?: RateLimitWindowState; + private bootstrapReserved = 0; + private readonly windowMs = 60_000; + + constructor(key: string, options?: RateLimitOptions) { + this.key = key; + this.window = new DefaultRateLimitStrategy(key); + // Window strategy enforces fixed 60s windows; burstSize is intentionally ignored here. 
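+    // normalizeLimit treats non-finite and non-positive values as "not
+    // configured", so e.g. { requestsPerMinute: 0, tokensPerMinute: 2 }
+    // enables only the token window.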
+ this.requestsPerMinute = this.normalizeLimit(options?.requestsPerMinute); + this.tokensPerMinute = this.normalizeLimit(options?.tokensPerMinute); + } + + resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { + if (this.requestsPerMinute !== undefined) { + const requestDecision = this.resolveRequestWindow(next, logger); + if (requestDecision) return requestDecision; + } else { + const decision = this.window.resolve(next, logger); + if (decision) return decision; + + if (!next.rateLimitKey && this.tokensPerMinute === undefined) { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + if (this.bootstrapReserved >= 1) { + rateLimitLogger?.debug?.("OpenAI rate limit bootstrap active; waiting", { + rateLimitKey: this.key, + bootstrapReserved: this.bootstrapReserved, + }); + return { kind: "wait" }; + } + + this.bootstrapReserved += 1; + next.rateLimitKey = this.key; + rateLimitLogger?.debug?.("OpenAI rate limit bootstrap reserved", { + rateLimitKey: this.key, + bootstrapReserved: this.bootstrapReserved, + }); + } + } + + const tokenDecision = this.resolveTokenWindow(next, logger); + if (tokenDecision) return tokenDecision; + return null; + } + + onDispatch(logger?: Logger): void { + if (this.requestsPerMinute === undefined) { + this.window.onDispatch(logger); + } + } + + onComplete(logger?: Logger): void { + if (this.requestsPerMinute !== undefined) { + const now = Date.now(); + const state = this.ensureRequestState(now); + if (state.reserved > 0) { + state.reserved -= 1; + } + state.remaining = Math.max(0, state.remaining - 1); + return; + } + + if (this.bootstrapReserved > 0) { + this.bootstrapReserved -= 1; + } + this.window.onComplete(logger); + } + + recordUsage(usage: RateLimitUsage, logger?: Logger, reservedTokens?: number): void { + const tokens = this.resolveTokenCount(usage); + if (tokens <= 0) return; + + const now = Date.now(); + const state = this.ensureTokenState(now); + if (!state) return; + const reserved = typeof reservedTokens === "number" ? reservedTokens : 0; + const delta = tokens - reserved; + if (delta > 0) { + state.remaining = Math.max(0, state.remaining - delta); + } else if (delta < 0) { + state.remaining = Math.min(state.limit, state.remaining + Math.abs(delta)); + } + logger?.child({ module: "rate-limiter" })?.trace?.("OpenAI token usage recorded", { + rateLimitKey: this.key, + tokens, + remaining: state.remaining, + resetAt: state.resetAt, + }); + } + + updateFromHeaders( + metadata: TrafficRequestMetadata | undefined, + headers: unknown, + logger?: Logger, + ): RateLimitUpdateResult | undefined { + const update = + this.requestsPerMinute !== undefined + ? 
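+        // A locally configured RPM makes the fixed 60s request window
+        // authoritative, so header-driven request-window updates are skipped;
+        // token headers below still apply.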
undefined + : this.window.updateFromHeaders(metadata, headers, logger); + const tokenUpdate = this.applyTokenHeaderUpdates(headers, logger); + if (!update) { + return tokenUpdate; + } + if (tokenUpdate?.headerSnapshot) { + return { + ...update, + headerSnapshot: { ...update.headerSnapshot, ...tokenUpdate.headerSnapshot }, + }; + } + return update; + } + + private resolveRequestWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const now = Date.now(); + const state = this.ensureRequestState(now); + const effectiveRemaining = Math.max(0, state.remaining - state.reserved); + const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; + + if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { + if (now < probeAt) { + rateLimitLogger?.debug?.("OpenAI request window exhausted; waiting for probe", { + rateLimitKey: this.key, + remaining: state.remaining, + reserved: state.reserved, + effectiveRemaining, + resetAt: state.resetAt, + probeAt, + }); + return { kind: "wait", wakeUpAt: probeAt }; + } + if (state.reserved > 0) { + rateLimitLogger?.debug?.( + "OpenAI request window exhausted but in-flight reservations exist; waiting", + { + rateLimitKey: this.key, + remaining: state.remaining, + reserved: state.reserved, + effectiveRemaining, + resetAt: state.resetAt, + }, + ); + return { kind: "wait" }; + } + } + + if (now < state.nextAllowedAt) { + rateLimitLogger?.debug?.("OpenAI request window pacing; waiting until nextAllowedAt", { + rateLimitKey: this.key, + nextAllowedAt: state.nextAllowedAt, + resetAt: state.resetAt, + waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now, + }); + return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; + } + + state.reserved += 1; + next.rateLimitKey = this.key; + rateLimitLogger?.trace?.("Reserved OpenAI request window slot", { + rateLimitKey: this.key, + reserved: state.reserved, + remaining: state.remaining, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + }); + + const remainingWindowMs = Math.max(0, state.resetAt - now); + const intervalMs = Math.max( + RATE_LIMIT_MIN_PACE_INTERVAL_MS, + Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), + ); + + const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); + if ( + state.nextAllowedAt <= now || + candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS + ) { + state.nextAllowedAt = candidateNext; + rateLimitLogger?.trace?.("Updated OpenAI request pacing nextAllowedAt", { + rateLimitKey: this.key, + nextAllowedAt: state.nextAllowedAt, + intervalMs, + remainingWindowMs, + effectiveRemaining, + }); + } + + return null; + } + + private resolveTokenWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const now = Date.now(); + const state = this.ensureTokenState(now); + if (!state) return null; + const estimatedTokens = next.estimatedTokens; + + if (typeof estimatedTokens === "number" && estimatedTokens > 0) { + if (state.remaining >= estimatedTokens) { + state.remaining = Math.max(0, state.remaining - estimatedTokens); + next.reservedTokens = estimatedTokens; + return null; + } + } else if (state.remaining > 0) { + return null; + } + + const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; + rateLimitLogger?.debug?.("OpenAI token window exhausted; waiting", { + rateLimitKey: this.key, + remaining: state.remaining, + resetAt: 
state.resetAt, + probeAt, + }); + return { kind: "wait", wakeUpAt: probeAt }; + } + + private ensureRequestState(now: number): RateLimitWindowState { + const limit = this.requestsPerMinute ?? 0; + const state = this.requestState; + if (!state || now >= state.resetAt) { + this.requestState = { + limit, + remaining: limit, + resetAt: now + this.windowMs, + reserved: 0, + nextAllowedAt: now, + }; + return this.requestState; + } + return state; + } + + private ensureTokenState(now: number): RateLimitWindowState | undefined { + const configuredLimit = this.tokensPerMinute; + const state = this.tokenState; + if (!state) { + if (configuredLimit === undefined) return undefined; + this.tokenState = { + limit: configuredLimit, + remaining: configuredLimit, + resetAt: now + this.windowMs, + reserved: 0, + nextAllowedAt: now, + }; + return this.tokenState; + } + + if (now >= state.resetAt) { + const limit = configuredLimit ?? state.limit; + this.tokenState = { + limit, + remaining: limit, + resetAt: now + this.windowMs, + reserved: 0, + nextAllowedAt: now, + }; + return this.tokenState; + } + + if (configuredLimit !== undefined && configuredLimit !== state.limit) { + state.limit = configuredLimit; + state.remaining = Math.min(state.remaining, configuredLimit); + } + + return state; + } + + private normalizeLimit(value: number | undefined): number | undefined { + const numeric = typeof value === "number" ? value : Number(value); + return Number.isFinite(numeric) && numeric > 0 ? numeric : undefined; + } + + private applyTokenHeaderUpdates( + headers: unknown, + logger?: Logger, + ): RateLimitUpdateResult | undefined { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const limitTokens = readHeaderValue(headers, "x-ratelimit-limit-tokens"); + const remainingTokens = readHeaderValue(headers, "x-ratelimit-remaining-tokens"); + const resetTokens = readHeaderValue(headers, "x-ratelimit-reset-tokens"); + const retryAfter = readHeaderValue(headers, "retry-after"); + + const limit = Number(limitTokens); + const remaining = Number(remainingTokens); + const resetTokensMs = resetTokens ? parseResetDurationToMs(resetTokens) : undefined; + const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined; + + if (!Number.isFinite(limit) || !Number.isFinite(remaining) || resetTokensMs === undefined) { + rateLimitLogger?.trace?.("OpenAI token headers missing or invalid; skipping", { + rateLimitKey: this.key, + hasLimit: !!limitTokens, + hasRemaining: !!remainingTokens, + hasReset: !!resetTokens, + }); + return undefined; + } + + const now = Date.now(); + const configuredLimit = this.tokensPerMinute; + const effectiveLimit = configuredLimit === undefined ? limit : Math.min(configuredLimit, limit); + const clampedRemaining = Math.max(0, Math.min(remaining, effectiveLimit)); + const parsedResetAt = now + resetTokensMs; + const existing = this.tokenState; + const isSameWindow = !!existing && now < existing.resetAt; + const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; + const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; + const reserved = Math.max(0, existing?.reserved ?? 0); + const effectiveRemaining = isSameWindow + ? 
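+      // Same monotonic rule as the request window: within one window the
+      // token remaining can only shrink until resetAt passes.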
Math.min(existing.remaining, clampedRemaining) + : clampedRemaining; + + const state: RateLimitWindowState = { + limit: effectiveLimit, + remaining: effectiveRemaining, + resetAt, + reserved, + nextAllowedAt, + }; + this.tokenState = state; + + rateLimitLogger?.debug?.("OpenAI token headers applied", { + rateLimitKey: this.key, + limit: effectiveLimit, + remaining: effectiveRemaining, + resetAt, + retryAfterMs, + }); + + return { + key: this.key, + headerSnapshot: { + limitTokens, + remainingTokens, + resetTokens, + resetTokensMs, + retryAfter, + retryAfterMs, + }, + state, + }; + } + + private resolveTokenCount(usage: RateLimitUsage): number { + const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined; + if (total !== undefined) return total; + const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0; + const output = Number.isFinite(usage.outputTokens) ? usage.outputTokens : 0; + return input + output; + } +} diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts new file mode 100644 index 000000000..af398b25f --- /dev/null +++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts @@ -0,0 +1,45 @@ +import type { Logger } from "../../logger"; +import type { + DispatchDecision, + QueuedRequest, + RateLimitWindowState, +} from "../traffic-controller-internal"; +import type { TrafficRequestMetadata } from "../traffic-types"; + +export type RateLimitHeaderSnapshot = { + limitRequests?: string; + remainingRequests?: string; + resetRequests?: string; + resetRequestsMs?: number; + limitTokens?: string; + remainingTokens?: string; + resetTokens?: string; + resetTokensMs?: number; + retryAfter?: string; + retryAfterMs?: number; +}; + +export type RateLimitUpdateResult = { + key: string; + headerSnapshot: RateLimitHeaderSnapshot; + state: RateLimitWindowState; +}; + +export type RateLimitUsage = { + inputTokens?: number; + outputTokens?: number; + totalTokens?: number; +}; + +export interface RateLimitStrategy { + readonly handlesTokenLimits?: boolean; + resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null; + onDispatch(logger?: Logger): void; + onComplete(logger?: Logger): void; + recordUsage?(usage: RateLimitUsage, logger?: Logger, reservedTokens?: number): void; + updateFromHeaders( + metadata: TrafficRequestMetadata | undefined, + headers: unknown, + logger?: Logger, + ): RateLimitUpdateResult | undefined; +} diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts new file mode 100644 index 000000000..310c9a7e6 --- /dev/null +++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts @@ -0,0 +1,26 @@ +export function parseResetDurationToMs(raw: string): number | undefined { + const value = raw.trim(); + if (!value) return undefined; + + let totalMs = 0; + const regex = /(\d+(?:\.\d+)?)(ms|s|m|h|d)/g; + let matched = false; + for (const match of value.matchAll(regex)) { + matched = true; + const amount = Number.parseFloat(match[1] ?? 
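+    // Each matched segment contributes per unit, so a compound value such as
+    // "1m30.951s" accumulates 60_000 + 30_951 = 90_951ms.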
""); + if (!Number.isFinite(amount)) continue; + const unit = match[2]; + if (unit === "ms") totalMs += amount; + else if (unit === "s") totalMs += amount * 1000; + else if (unit === "m") totalMs += amount * 60_000; + else if (unit === "h") totalMs += amount * 3_600_000; + else if (unit === "d") totalMs += amount * 86_400_000; + } + + if (matched) { + return Math.round(totalMs); + } + + const n = Number(value); + return Number.isFinite(n) ? Math.round(n) : undefined; +} diff --git a/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts new file mode 100644 index 000000000..ee269ecd2 --- /dev/null +++ b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts @@ -0,0 +1,218 @@ +import type { Logger } from "../../logger"; +import type { + DispatchDecision, + QueuedRequest, + RateLimitWindowState, +} from "../traffic-controller-internal"; +import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils"; +import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types"; +import type { + RateLimitHeaderSnapshot, + RateLimitStrategy, + RateLimitUpdateResult, +} from "./rate-limit-strategy"; +import { parseResetDurationToMs } from "./rate-limit-utils"; + +type TokenBucketState = { + capacity: number; + refillPerSecond: number; + tokens: number; + updatedAt: number; +}; + +function normalizeTokenBucketOptions( + raw: RateLimitOptions | undefined, +): Omit | undefined { + const requestsPerMinuteRaw = raw?.requestsPerMinute; + const tokensPerMinuteRaw = raw?.tokensPerMinute; + const burstSizeRaw = raw?.burstSize; + + const requestsPerMinute = + typeof requestsPerMinuteRaw === "number" ? requestsPerMinuteRaw : Number(requestsPerMinuteRaw); + const tokensPerMinute = + typeof tokensPerMinuteRaw === "number" ? tokensPerMinuteRaw : Number(tokensPerMinuteRaw); + const burstSize = typeof burstSizeRaw === "number" ? burstSizeRaw : Number(burstSizeRaw); + + const safeRequestsPerMinute = Number.isFinite(requestsPerMinute) ? requestsPerMinute : 0; + const hasTokenLimit = Number.isFinite(tokensPerMinute) && tokensPerMinute > 0; + if (safeRequestsPerMinute <= 0 && hasTokenLimit) { + return undefined; + } + const safeBurst = Number.isFinite(burstSize) ? burstSize : safeRequestsPerMinute; + const refillPerSecond = safeRequestsPerMinute > 0 ? safeRequestsPerMinute / 60 : 0; + + return { + capacity: safeBurst > 0 ? 
Math.max(1, safeBurst) : 0, + refillPerSecond, + }; +} +function refillTokenBucket(bucket: TokenBucketState, now: number): void { + const elapsedMs = now - bucket.updatedAt; + if (elapsedMs <= 0) return; + bucket.updatedAt = now; + if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return; + + const refill = (elapsedMs / 1000) * bucket.refillPerSecond; + if (refill <= 0) return; + bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill); +} + +export class TokenBucketRateLimitStrategy implements RateLimitStrategy { + private readonly key: string; + private bucket?: TokenBucketState; + private cooldownUntil?: number; + + constructor(key: string, options?: RateLimitOptions) { + this.key = key; + if (!options) return; + const normalized = normalizeTokenBucketOptions(options); + if (!normalized) return; + const now = Date.now(); + this.bucket = { + ...normalized, + tokens: normalized.capacity, + updatedAt: now, + }; + } + + resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const now = Date.now(); + + if (this.cooldownUntil !== undefined && now < this.cooldownUntil) { + rateLimitLogger?.debug?.("Token bucket cooldown active; waiting", { + rateLimitKey: this.key, + cooldownUntil: this.cooldownUntil, + waitMs: this.cooldownUntil - now, + }); + return { kind: "wait", wakeUpAt: this.cooldownUntil }; + } + + const bucket = this.bucket; + if (!bucket) return null; + + refillTokenBucket(bucket, now); + + if (bucket.capacity <= 0) { + rateLimitLogger?.debug?.("Token bucket misconfigured; blocking", { + rateLimitKey: this.key, + capacity: bucket.capacity, + refillPerSecond: bucket.refillPerSecond, + }); + return { kind: "wait" }; + } + + if (bucket.tokens >= 1) { + bucket.tokens -= 1; + next.rateLimitKey = this.key; + rateLimitLogger?.trace?.("Consumed token bucket token", { + rateLimitKey: this.key, + tokens: bucket.tokens, + capacity: bucket.capacity, + refillPerSecond: bucket.refillPerSecond, + }); + return null; + } + + if (bucket.refillPerSecond <= 0) { + rateLimitLogger?.debug?.("Token bucket has no refill; blocking", { + rateLimitKey: this.key, + capacity: bucket.capacity, + refillPerSecond: bucket.refillPerSecond, + }); + return { kind: "wait" }; + } + + const requiredTokens = 1 - bucket.tokens; + const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000)); + const wakeUpAt = now + waitMs; + rateLimitLogger?.debug?.("Token bucket empty; waiting", { + rateLimitKey: this.key, + tokens: bucket.tokens, + capacity: bucket.capacity, + refillPerSecond: bucket.refillPerSecond, + wakeUpAt, + waitMs, + }); + return { kind: "wait", wakeUpAt }; + } + + onDispatch(_logger?: Logger): void {} + + onComplete(_logger?: Logger): void {} + + updateFromHeaders( + _metadata: TrafficRequestMetadata | undefined, + headers: unknown, + logger?: Logger, + ): RateLimitUpdateResult | undefined { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const now = Date.now(); + + const retryAfter = readHeaderValue(headers, "retry-after"); + const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter, now) : undefined; + + const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests"); + const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests"); + const resetRequestsMs = resetRequests ? 
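+    // Both Retry-After and an exhausted x-ratelimit window map to the same
+    // cooldown below: the bucket stops issuing tokens until the later
+    // deadline.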
parseResetDurationToMs(resetRequests) : undefined;
+
+    let appliedUntil: number | undefined;
+
+    if (retryAfterMs !== undefined) {
+      const targetAt = now + retryAfterMs;
+      this.cooldownUntil =
+        this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt);
+      appliedUntil = this.cooldownUntil;
+    }
+
+    if (remainingRequests && resetRequestsMs !== undefined) {
+      const remaining = Number(remainingRequests);
+      if (Number.isFinite(remaining) && remaining <= 0) {
+        const targetAt = now + resetRequestsMs;
+        this.cooldownUntil =
+          this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt);
+        appliedUntil = this.cooldownUntil;
+      }
+    }
+
+    if (appliedUntil === undefined) {
+      rateLimitLogger?.trace?.("No applicable cooldown headers; skipping", {
+        rateLimitKey: this.key,
+        hasRetryAfter: !!retryAfter,
+        hasRemainingRequests: !!remainingRequests,
+        hasResetRequests: !!resetRequests,
+      });
+      return undefined;
+    }
+
+    rateLimitLogger?.debug?.("Applied token bucket cooldown from headers", {
+      rateLimitKey: this.key,
+      cooldownUntil: appliedUntil,
+      inMs: Math.max(0, appliedUntil - now),
+      retryAfterMs,
+      resetRequestsMs,
+    });
+
+    const headerSnapshot: RateLimitHeaderSnapshot = {
+      remainingRequests,
+      resetRequests,
+      resetRequestsMs,
+      retryAfter,
+      retryAfterMs,
+    };
+
+    const state: RateLimitWindowState = {
+      limit: 1,
+      remaining: 0,
+      resetAt: appliedUntil,
+      reserved: 0,
+      nextAllowedAt: appliedUntil,
+    };
+
+    return {
+      key: this.key,
+      headerSnapshot,
+      state,
+    };
+  }
+}
diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts
new file mode 100644
index 000000000..20d166ca2
--- /dev/null
+++ b/packages/core/src/traffic/traffic-circuit-breaker.ts
@@ -0,0 +1,478 @@
+import type { Logger } from "../logger";
+import {
+  CIRCUIT_COOLDOWN_MS,
+  CIRCUIT_FAILURE_THRESHOLD,
+  CIRCUIT_FAILURE_WINDOW_MS,
+  CIRCUIT_PROBE_INTERVAL_MS,
+  CIRCUIT_TIMEOUT_THRESHOLD,
+  CIRCUIT_TIMEOUT_WINDOW_MS,
+  DEFAULT_FALLBACK_CHAINS,
+} from "./traffic-constants";
+import type {
+  CircuitState,
+  CircuitStateStatus,
+  DispatchDecision,
+  QueuedRequest,
+} from "./traffic-controller-internal";
+import { extractStatusCode, isTimeoutError } from "./traffic-error-utils";
+import { CircuitBreakerOpenError } from "./traffic-errors";
+import type {
+  FallbackChainEntry,
+  FallbackPolicy,
+  FallbackPolicyConfig,
+  FallbackTarget,
+  TrafficRequestMetadata,
+  TrafficResponseMetadata,
+} from "./traffic-types";
+
+export class TrafficCircuitBreaker {
+  private readonly circuitBreakers = new Map<string, CircuitState>();
+  private readonly fallbackChains: Map<string, FallbackChainEntry[]>;
+  private readonly fallbackPolicy?: FallbackPolicyConfig;
+  private readonly buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string;
+
+  constructor(options: {
+    fallbackChains?: Record<string, FallbackChainEntry[]>;
+    fallbackPolicy?: FallbackPolicyConfig;
+    buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string;
+  }) {
+    this.buildRateLimitKey = options.buildRateLimitKey;
+    const chains = options.fallbackChains ?? DEFAULT_FALLBACK_CHAINS;
+    this.fallbackChains = new Map(Object.entries(chains));
+    this.fallbackPolicy = options.fallbackPolicy;
+  }
+
+  resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
+    const circuitLogger = logger?.child({ module: "circuit-breaker" });
+    const visitedKeys = new Set<string>();
+
+    while (true) {
+      const key = this.buildRateLimitKey(next.request.metadata);
+      next.circuitKey = key;
+      visitedKeys.add(key);
+      circuitLogger?.trace?.("Circuit resolve step", {
+        circuitKey: key,
+        provider: next.request.metadata?.provider,
+        model: next.request.metadata?.model,
+      });
+
+      const evaluation = this.evaluateCircuitState(key, circuitLogger);
+      next.circuitStatus = evaluation.state;
+      circuitLogger?.debug?.("Circuit evaluated", {
+        circuitKey: key,
+        state: evaluation.state,
+        allowRequest: evaluation.allowRequest,
+        retryAfterMs: evaluation.retryAfterMs,
+      });
+
+      if (evaluation.allowRequest) return null;
+
+      const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata);
+      if (policy.mode === "wait") {
+        const wakeUpAt =
+          evaluation.retryAfterMs !== undefined ? Date.now() + evaluation.retryAfterMs : undefined;
+        circuitLogger?.debug?.("Circuit open; waiting per fallback policy", {
+          circuitKey: key,
+          policyId,
+          retryAfterMs: evaluation.retryAfterMs,
+          wakeUpAt,
+        });
+        return { kind: "wait", wakeUpAt };
+      }
+
+      const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger);
+      circuitLogger?.debug?.("Circuit open; attempting fallback", {
+        circuitKey: key,
+        currentModel: next.request.metadata?.model,
+        fallback,
+        visitedKeys: Array.from(visitedKeys),
+      });
+      if (!fallback || !next.request.createFallbackRequest) {
+        const error = new CircuitBreakerOpenError(
+          `Circuit open for ${key}`,
+          next.request.metadata,
+          evaluation.retryAfterMs,
+        );
+        const traffic: TrafficResponseMetadata = {
+          rateLimitKey: key,
+          retryAfterMs: evaluation.retryAfterMs,
+          tenantId: next.request.metadata?.tenantId ?? next.tenantId,
+          priority: next.request.metadata?.priority,
+          taskType: next.request.metadata?.taskType,
+        };
+        (error as Record<string, unknown>).traffic = traffic;
+        next.reject(error);
+        circuitLogger?.warn?.("No fallback available; rejecting request", {
+          circuitKey: key,
+          retryAfterMs: evaluation.retryAfterMs,
+        });
+        return { kind: "skip" };
+      }
+
+      const fallbackRequest = next.request.createFallbackRequest(fallback);
+      if (!fallbackRequest) {
+        circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", {
+          circuitKey: key,
+          fallback,
+        });
+        return { kind: "skip" };
+      }
+
+      this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, {
+        previousCircuitKey: key,
+        reason: "circuit-open",
+      });
+    }
+  }
+
+  tryFallback(next: QueuedRequest, reason: "queue-timeout", logger?: Logger): boolean {
+    const circuitLogger = logger?.child({ module: "circuit-breaker" });
+    const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata);
+    if (policy.mode === "wait") {
+      circuitLogger?.debug?.("Fallback skipped by policy", {
+        policyId,
+        reason,
+        provider: next.request.metadata?.provider,
+        model: next.request.metadata?.model,
+      });
+      return false;
+    }
+
+    const visitedKeys = new Set<string>();
+    const key = this.buildRateLimitKey(next.request.metadata);
+    visitedKeys.add(key);
+
+    const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger);
+    if (!fallback || !next.request.createFallbackRequest) {
+      circuitLogger?.debug?.("Fallback unavailable for request", {
+        reason,
+        provider: next.request.metadata?.provider,
+        model: next.request.metadata?.model,
+        fallback,
+      });
+      return false;
+    }
+
+    const fallbackRequest = next.request.createFallbackRequest(fallback);
+    if (!fallbackRequest) {
+      circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", {
+        reason,
+        fallback,
+      });
+      return false;
+    }
+
+    this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, {
+      previousCircuitKey: key,
+      reason,
+      policyId,
+    });
+    return true;
+  }
+
+  markTrial(item: QueuedRequest, logger?: Logger): void {
+    const circuitLogger = logger?.child({ module: "circuit-breaker" });
+    const key = item.circuitKey;
+    if (!key) return;
+    const state = this.circuitBreakers.get(key);
+    if (state && state.status === "half-open" && !state.trialInFlight) {
+      state.trialInFlight = true;
+      circuitLogger?.debug?.("Marked half-open trial in flight", { circuitKey: key });
+    }
+  }
+
+  recordSuccess(metadata?: TrafficRequestMetadata, logger?: Logger): void {
+    const circuitLogger = logger?.child({ module: "circuit-breaker" });
+    const key = this.buildRateLimitKey(metadata);
+    this.circuitBreakers.delete(key);
+    circuitLogger?.debug?.("Circuit success; cleared circuit state", {
+      circuitKey: key,
+      provider: metadata?.provider,
+      model: metadata?.model,
+    });
+  }
+
+  recordFailure(
+    metadata: TrafficRequestMetadata | undefined,
+    error: unknown,
+    logger?: Logger,
+  ): void {
+    const circuitLogger = logger?.child({ module: "circuit-breaker" });
+    const key = this.buildRateLimitKey(metadata);
+    const status = extractStatusCode(error, logger);
+    const isTimeout = status === 408 || isTimeoutError(error, logger);
+    const isStatusEligible = this.isCircuitBreakerStatus(status);
+    const isTimeoutEligible = !isStatusEligible && isTimeout;
+    const isEligible = isStatusEligible || isTimeoutEligible;
+
+    circuitLogger?.debug?.("Circuit failure observed", {
+      circuitKey: key,
+      status,
+      isTimeout,
+      eligible: isEligible,
+      provider: metadata?.provider,
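+      // Only 429s, 5xx responses, and timeouts are breaker-eligible; any
+      // other failure clears the circuit state below.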
model: metadata?.model, + }); + + if (!isEligible) { + this.circuitBreakers.delete(key); + circuitLogger?.debug?.("Failure not eligible for circuit breaker; cleared circuit state", { + circuitKey: key, + status, + isTimeout, + }); + return; + } + + const now = Date.now(); + const state = + this.circuitBreakers.get(key) ?? + ({ status: "closed", failureTimestamps: [], timeoutTimestamps: [] } as CircuitState); + + state.failureTimestamps = state.failureTimestamps.filter( + (t) => now - t <= CIRCUIT_FAILURE_WINDOW_MS, + ); + state.timeoutTimestamps = state.timeoutTimestamps.filter( + (t) => now - t <= CIRCUIT_TIMEOUT_WINDOW_MS, + ); + + state.failureTimestamps.push(now); + if (isTimeoutEligible) { + state.timeoutTimestamps.push(now); + } + + if ( + state.status === "half-open" || + state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD || + state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD + ) { + const openReasons: string[] = []; + if (state.status === "half-open") openReasons.push("half-open-failure"); + if (state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD) { + openReasons.push("failure-threshold"); + } + if (state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD) { + openReasons.push("timeout-threshold"); + } + + state.status = "open"; + state.openedAt = now; + state.trialInFlight = false; + state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS; + circuitLogger?.warn?.("Circuit opened", { + circuitKey: key, + openReasons, + status, + isTimeout, + failureCount: state.failureTimestamps.length, + failureThreshold: CIRCUIT_FAILURE_THRESHOLD, + timeoutCount: state.timeoutTimestamps.length, + timeoutThreshold: CIRCUIT_TIMEOUT_THRESHOLD, + openedAt: state.openedAt, + }); + } + + this.circuitBreakers.set(key, state); + circuitLogger?.trace?.("Circuit state updated", { + circuitKey: key, + status: state.status, + failureCount: state.failureTimestamps.length, + failureWindowMs: CIRCUIT_FAILURE_WINDOW_MS, + timeoutCount: state.timeoutTimestamps.length, + timeoutWindowMs: CIRCUIT_TIMEOUT_WINDOW_MS, + }); + } + + private evaluateCircuitState( + key: string, + logger?: Logger, + ): { + allowRequest: boolean; + state: CircuitStateStatus; + retryAfterMs?: number; + } { + const state = this.circuitBreakers.get(key); + if (!state) { + logger?.trace?.("Circuit state missing; allow request", { circuitKey: key }); + return { allowRequest: true, state: "closed" }; + } + + const now = Date.now(); + + if (state.status === "open") { + const elapsed = state.openedAt ? now - state.openedAt : 0; + if (state.nextProbeAt === undefined) { + state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS; + } + const cooldownRemaining = Math.max(0, CIRCUIT_COOLDOWN_MS - elapsed); + const probeRemaining = Math.max(0, state.nextProbeAt - now); + if (probeRemaining === 0 || cooldownRemaining === 0) { + state.status = "half-open"; + state.trialInFlight = false; + state.failureTimestamps = []; + state.timeoutTimestamps = []; + state.nextProbeAt = undefined; + logger?.debug?.("Circuit transitioned to half-open", { + circuitKey: key, + reason: cooldownRemaining === 0 ? 
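+          // An open circuit admits a single half-open trial once the 30s
+          // cooldown or the 5s probe interval elapses, whichever comes first.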
"cooldown" : "probe", + }); + return { allowRequest: true, state: "half-open" }; + } + return { + allowRequest: false, + state: "open", + retryAfterMs: Math.min(cooldownRemaining, probeRemaining), + }; + } + + if (state.status === "half-open" && state.trialInFlight) { + return { allowRequest: false, state: "half-open" }; + } + + return { allowRequest: true, state: state.status }; + } + + private resolveFallbackPolicy(metadata: TrafficRequestMetadata | undefined): { + policy: FallbackPolicy; + policyId?: string; + } { + const policyId = + metadata?.fallbackPolicyId ?? + (metadata?.taskType + ? this.fallbackPolicy?.taskTypePolicyIds?.[metadata.taskType] + : undefined) ?? + this.fallbackPolicy?.defaultPolicyId; + + const policy = policyId ? this.fallbackPolicy?.policies?.[policyId] : undefined; + return { + policy: policy ?? { mode: "fallback" }, + policyId, + }; + } + + private applyFallbackRequest( + next: QueuedRequest, + fallbackRequest: QueuedRequest["request"], + fallback: FallbackChainEntry, + logger?: Logger, + context?: { previousCircuitKey?: string; reason?: string; policyId?: string }, + ): void { + next.request = fallbackRequest; + next.attempt = 1; + next.estimatedTokens = fallbackRequest.estimatedTokens; + next.reservedTokens = undefined; + next.tenantConcurrencyKey = undefined; + next.providerModelConcurrencyKey = undefined; + next.rateLimitKey = undefined; + next.etaMs = undefined; + next.circuitKey = undefined; + next.circuitStatus = undefined; + next.extractUsage = fallbackRequest.extractUsage; + if (context?.reason === "queue-timeout") { + next.queueTimeoutDisabled = true; + } + logger?.debug?.("Switched to fallback request", { + previousCircuitKey: context?.previousCircuitKey, + fallbackModel: fallback, + reason: context?.reason, + policyId: context?.policyId, + }); + } + + private isShortResponseFallback( + candidate: FallbackChainEntry, + ): candidate is { kind: "short-response"; text: string } { + return ( + typeof candidate === "object" && + candidate !== null && + "kind" in candidate && + (candidate as { kind?: string }).kind === "short-response" + ); + } + + private findFallbackTarget( + metadata: TrafficRequestMetadata | undefined, + visitedKeys: Set, + logger?: Logger, + ): FallbackChainEntry | undefined { + const currentModel = metadata?.model; + if (!currentModel) { + logger?.trace?.("No current model; no fallback", {}); + return undefined; + } + + const provider = metadata?.provider; + const chain = this.resolveFallbackChain(provider, currentModel); + if (!chain) { + logger?.trace?.("No fallback chain for model", { + currentModel, + provider, + }); + return undefined; + } + + for (const candidate of chain) { + if (this.isShortResponseFallback(candidate)) { + logger?.debug?.("Selected short-response fallback", { + currentModel, + currentProvider: provider, + }); + return candidate; + } + const target = this.normalizeFallbackTarget(candidate, provider); + const candidateMetadata: TrafficRequestMetadata = { + ...(metadata ?? {}), + provider: target.provider ?? provider, + model: target.model, + }; + const candidateKey = this.buildRateLimitKey(candidateMetadata); + if (visitedKeys.has(candidateKey)) { + continue; + } + + const evaluation = this.evaluateCircuitState(candidateKey, logger); + if (evaluation.allowRequest) { + visitedKeys.add(candidateKey); + logger?.debug?.("Selected fallback target", { + currentModel, + currentProvider: provider, + fallbackModel: target.model, + fallbackProvider: target.provider ?? 
provider, + fallbackCircuitKey: candidateKey, + }); + return candidate; + } + } + + return undefined; + } + + private resolveFallbackChain( + provider: string | undefined, + model: string, + ): FallbackChainEntry[] | undefined { + const providerKey = provider ? `${provider}::${model}` : undefined; + if (providerKey) { + const providerChain = this.fallbackChains.get(providerKey); + if (providerChain) return providerChain; + } + return this.fallbackChains.get(model); + } + + private normalizeFallbackTarget( + candidate: FallbackChainEntry, + provider: string | undefined, + ): FallbackTarget { + if (typeof candidate === "string") { + return { provider, model: candidate }; + } + return { + provider: candidate.provider ?? provider, + model: candidate.model, + }; + } + + private isCircuitBreakerStatus(status?: number): boolean { + return status === 429 || (status !== undefined && status >= 500); + } +} diff --git a/packages/core/src/traffic/traffic-concurrency-limiter.ts b/packages/core/src/traffic/traffic-concurrency-limiter.ts new file mode 100644 index 000000000..e15256127 --- /dev/null +++ b/packages/core/src/traffic/traffic-concurrency-limiter.ts @@ -0,0 +1,235 @@ +import type { Logger } from "../logger"; +import type { QueuedRequest } from "./traffic-controller-internal"; +import type { + ProviderModelConcurrencyLimit, + TenantConcurrencyLimit, + TrafficRequestMetadata, +} from "./traffic-types"; + +export type ConcurrencyBlockReason = + | { + gate: "providerModel"; + key: string; + inFlight: number; + limit: number; + } + | { + gate: "tenant"; + key: string; + inFlight: number; + limit: number; + }; + +export type ConcurrencyDecision = + | { kind: "allow" } + | { kind: "wait"; reasons: ConcurrencyBlockReason[] }; + +function toNonNegativeIntegerLimit(raw: unknown): number | undefined { + if (raw === undefined || raw === null) return undefined; + const n = typeof raw === "number" ? raw : Number(raw); + if (!Number.isFinite(n)) return undefined; + if (n <= 0) return 0; + return Math.floor(n); +} + +function getInFlight(map: Map, key: string): number { + return map.get(key) ?? 
0; +} + +function incrementInFlight(map: Map, key: string): void { + map.set(key, getInFlight(map, key) + 1); +} + +function decrementInFlight(map: Map, key: string): void { + const current = getInFlight(map, key); + if (current <= 1) { + map.delete(key); + return; + } + map.set(key, current - 1); +} + +export class TrafficConcurrencyLimiter { + private readonly inFlightByProviderModel = new Map(); + private readonly inFlightByTenant = new Map(); + + private readonly buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string; + private readonly providerModelLimit?: ProviderModelConcurrencyLimit; + private readonly tenantLimit?: TenantConcurrencyLimit; + private readonly providerModelEnabled: boolean; + private readonly tenantEnabled: boolean; + + constructor(options: { + buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string; + maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit; + maxConcurrentPerTenant?: TenantConcurrencyLimit; + }) { + this.buildProviderModelKey = options.buildProviderModelKey; + this.providerModelLimit = options.maxConcurrentPerProviderModel; + this.tenantLimit = options.maxConcurrentPerTenant; + this.providerModelEnabled = options.maxConcurrentPerProviderModel !== undefined; + this.tenantEnabled = options.maxConcurrentPerTenant !== undefined; + } + + resolve(next: QueuedRequest, logger?: Logger): ConcurrencyDecision { + if (!this.providerModelEnabled && !this.tenantEnabled) return { kind: "allow" }; + const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); + const reasons: ConcurrencyBlockReason[] = []; + + if (this.providerModelEnabled) { + const providerModelKey = this.buildProviderModelKey(next.request.metadata); + const providerModelLimit = this.resolveProviderModelLimit( + providerModelKey, + next.request.metadata, + concurrencyLogger, + ); + if (providerModelLimit !== undefined) { + const inFlight = getInFlight(this.inFlightByProviderModel, providerModelKey); + if (inFlight >= providerModelLimit) { + reasons.push({ + gate: "providerModel", + key: providerModelKey, + inFlight, + limit: providerModelLimit, + }); + } + } + } + + if (this.tenantEnabled) { + const tenantKey = next.tenantId; + const tenantLimit = this.resolveTenantLimit( + tenantKey, + next.request.metadata, + concurrencyLogger, + ); + if (tenantLimit !== undefined) { + const inFlight = getInFlight(this.inFlightByTenant, tenantKey); + if (inFlight >= tenantLimit) { + reasons.push({ + gate: "tenant", + key: tenantKey, + inFlight, + limit: tenantLimit, + }); + } + } + } + + if (reasons.length === 0) return { kind: "allow" }; + + concurrencyLogger?.trace?.("Concurrency gate blocked request", { + tenantId: next.tenantId, + reasons, + }); + return { kind: "wait", reasons }; + } + + acquire(next: QueuedRequest, logger?: Logger): void { + if (!this.providerModelEnabled && !this.tenantEnabled) return; + const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); + + let tenantKey: string | undefined; + if (this.tenantEnabled) { + tenantKey = next.tenantId; + next.tenantConcurrencyKey = tenantKey; + incrementInFlight(this.inFlightByTenant, tenantKey); + } + + let providerModelKey: string | undefined; + if (this.providerModelEnabled) { + providerModelKey = this.buildProviderModelKey(next.request.metadata); + next.providerModelConcurrencyKey = providerModelKey; + incrementInFlight(this.inFlightByProviderModel, providerModelKey); + } + + concurrencyLogger?.trace?.("Concurrency slots acquired", { + tenantId: tenantKey, + tenantInFlight: 
tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined, + providerModelKey, + providerModelInFlight: providerModelKey + ? getInFlight(this.inFlightByProviderModel, providerModelKey) + : undefined, + }); + } + + release(next: QueuedRequest, logger?: Logger): void { + const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); + const tenantKey = next.tenantConcurrencyKey; + const providerModelKey = next.providerModelConcurrencyKey; + + if (tenantKey) { + decrementInFlight(this.inFlightByTenant, tenantKey); + } + + if (providerModelKey) { + decrementInFlight(this.inFlightByProviderModel, providerModelKey); + } + + if (tenantKey || providerModelKey) { + concurrencyLogger?.trace?.("Concurrency slots released", { + tenantId: tenantKey, + tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined, + providerModelKey, + providerModelInFlight: providerModelKey + ? getInFlight(this.inFlightByProviderModel, providerModelKey) + : undefined, + }); + } + + next.tenantConcurrencyKey = undefined; + next.providerModelConcurrencyKey = undefined; + } + + private resolveTenantLimit( + tenantId: string, + metadata: TrafficRequestMetadata | undefined, + logger?: Logger, + ): number | undefined { + const policy = this.tenantLimit; + if (policy === undefined) return undefined; + + if (typeof policy === "number") return toNonNegativeIntegerLimit(policy); + if (typeof policy === "function") { + try { + return toNonNegativeIntegerLimit(policy(tenantId, metadata)); + } catch (error) { + logger?.warn?.("Tenant concurrency resolver threw; ignoring", { + tenantId, + errorName: (error as { name?: unknown } | null)?.name, + errorMessage: (error as { message?: unknown } | null)?.message, + }); + return undefined; + } + } + + return toNonNegativeIntegerLimit(policy[tenantId]); + } + + private resolveProviderModelLimit( + key: string, + metadata: TrafficRequestMetadata | undefined, + logger?: Logger, + ): number | undefined { + const policy = this.providerModelLimit; + if (policy === undefined) return undefined; + + if (typeof policy === "number") return toNonNegativeIntegerLimit(policy); + if (typeof policy === "function") { + try { + return toNonNegativeIntegerLimit(policy(metadata, key)); + } catch (error) { + logger?.warn?.("Provider/model concurrency resolver threw; ignoring", { + key, + provider: metadata?.provider, + model: metadata?.model, + errorName: (error as { name?: unknown } | null)?.name, + errorMessage: (error as { message?: unknown } | null)?.message, + }); + return undefined; + } + } + + return toNonNegativeIntegerLimit(policy[key]); + } +} diff --git a/packages/core/src/traffic/traffic-constants.ts b/packages/core/src/traffic/traffic-constants.ts new file mode 100644 index 000000000..68d99df78 --- /dev/null +++ b/packages/core/src/traffic/traffic-constants.ts @@ -0,0 +1,26 @@ +export const MAX_RETRY_ATTEMPTS = 3; +export const TIMEOUT_RETRY_ATTEMPTS = 2; + +export const RATE_LIMIT_BASE_BACKOFF_MS = 500; +export const SERVER_ERROR_BASE_BACKOFF_MS = 1000; +export const TIMEOUT_BASE_BACKOFF_MS = 750; + +export const RATE_LIMIT_JITTER_FACTOR = 0.35; +export const SERVER_ERROR_JITTER_FACTOR = 0.8; +export const TIMEOUT_JITTER_FACTOR = 0.5; + +export const CIRCUIT_FAILURE_THRESHOLD = 5; +export const CIRCUIT_FAILURE_WINDOW_MS = 10_000; +export const CIRCUIT_TIMEOUT_THRESHOLD = CIRCUIT_FAILURE_THRESHOLD; +export const CIRCUIT_TIMEOUT_WINDOW_MS = CIRCUIT_FAILURE_WINDOW_MS; +export const CIRCUIT_COOLDOWN_MS = 30_000; +export const CIRCUIT_PROBE_INTERVAL_MS 
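+// While open, a circuit schedules a half-open probe every
+// CIRCUIT_PROBE_INTERVAL_MS; a full cooldown forces one after
+// CIRCUIT_COOLDOWN_MS regardless.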
= 5_000;
+
+export const RATE_LIMIT_EXHAUSTION_BUFFER = 1;
+export const RATE_LIMIT_PROBE_DELAY_MS = 50;
+export const RATE_LIMIT_MIN_PACE_INTERVAL_MS = 10;
+export const RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS = 10;
+
+export const DEFAULT_FALLBACK_CHAINS: Record<string, string[]> = {
+  "gpt-4o": ["gpt-4o-mini", "gpt-3.5"],
+};
diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts
new file mode 100644
index 000000000..fd2012cf5
--- /dev/null
+++ b/packages/core/src/traffic/traffic-controller-internal.ts
@@ -0,0 +1,57 @@
+import type { TrafficPriority, TrafficRequest, TrafficRequestType } from "./traffic-types";
+
+export type Scheduler = (callback: () => void) => void;
+
+export type DispatchDecision =
+  | { kind: "dispatch" }
+  | { kind: "skip" }
+  | { kind: "wait"; wakeUpAt?: number };
+
+export type CircuitStateStatus = "closed" | "open" | "half-open";
+
+export interface CircuitState {
+  status: CircuitStateStatus;
+  failureTimestamps: number[];
+  timeoutTimestamps: number[];
+  openedAt?: number;
+  trialInFlight?: boolean;
+  nextProbeAt?: number;
+}
+
+export interface RateLimitWindowState {
+  limit: number;
+  remaining: number;
+  resetAt: number;
+  reserved: number;
+  nextAllowedAt: number;
+}
+
+type BivariantHandler<TArgs extends unknown[]> = {
+  bivarianceHack(...args: TArgs): void;
+}["bivarianceHack"];
+
+export interface QueuedRequest<TResponse = unknown> {
+  type: TrafficRequestType;
+  request: TrafficRequest;
+  resolve: BivariantHandler<[TResponse | PromiseLike<TResponse>]>;
+  reject: BivariantHandler<[reason?: unknown]>;
+  attempt: number;
+  priority: TrafficPriority;
+  tenantId: string;
+  enqueuedAt: number;
+  dispatchedAt?: number;
+  estimatedTokens?: number;
+  reservedTokens?: number;
+  queueTimeoutDisabled?: boolean;
+
+  tenantConcurrencyKey?: string;
+  providerModelConcurrencyKey?: string;
+
+  rateLimitKey?: string;
+  etaMs?: number;
+
+  circuitKey?: string;
+  circuitStatus?: CircuitStateStatus;
+
+  extractUsage?: TrafficRequest["extractUsage"];
+}
diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
new file mode 100644
index 000000000..dee0719f8
--- /dev/null
+++ b/packages/core/src/traffic/traffic-controller.spec.ts
@@ -0,0 +1,804 @@
+import { describe, expect, it, vi } from "vitest";
+import { CIRCUIT_FAILURE_THRESHOLD, RATE_LIMIT_PROBE_DELAY_MS } from "./traffic-constants";
+import { TrafficController } from "./traffic-controller";
+
+describe("TrafficController priority scheduling", () => {
+  it("prioritizes P0 over lower priorities when runnable", async () => {
+    const controller = new TrafficController({ maxConcurrent: 1 });
+    const order: string[] = [];
+
+    const p1 = controller.handleText({
+      metadata: { provider: "p", model: "m1", priority: "P1" },
+      execute: async () => {
+        order.push("P1");
+        return "P1";
+      },
+    });
+
+    const p2 = controller.handleText({
+      metadata: { provider: "p", model: "m2", priority: "P2" },
+      execute: async () => {
+        order.push("P2");
+        return "P2";
+      },
+    });
+
+    const p0 = controller.handleText({
+      metadata: { provider: "p", model: "m0", priority: "P0" },
+      execute: async () => {
+        order.push("P0");
+        return "P0";
+      },
+    });
+
+    await Promise.all([p0, p1, p2]);
+
+    expect(order[0]).toBe("P0");
+    expect(order).toEqual(["P0", "P1", "P2"]);
+  });
+
+  it("allows lower priorities to proceed when a higher priority request is rate limited", async () => {
+    vi.useFakeTimers();
+
+    try {
+      vi.setSystemTime(new Date(0));
+      const controller = new TrafficController({ maxConcurrent: 1 });
+      controller.updateRateLimitFromHeaders(
+        { provider: "p0", model: "m0" },
+        {
+          "x-ratelimit-limit-requests": "1",
+          "x-ratelimit-remaining-requests": "0",
+          "x-ratelimit-reset-requests": "1s",
+        },
+      );
+
+      const order: string[] = [];
+
+      const p0 = controller.handleText({
+        metadata: { provider: "p0", model: "m0", priority: "P0" },
+        execute: async () => {
+          order.push("P0");
+          return "P0";
+        },
+      });
+
+      const p1 = controller.handleText({
+        metadata: { provider: "p1", model: "m1", priority: "P1" },
+        execute: async () => {
+          order.push("P1");
+          return "P1";
+        },
+      });
+
+      await vi.runAllTimersAsync();
+      await Promise.all([p0, p1]);
+
+      expect(order[0]).toBe("P1");
+      expect(order[1]).toBe("P0");
+    } finally {
+      vi.useRealTimers();
+    }
+  });
+});
+
+describe("TrafficController concurrency limits", () => {
+  it("shares provider/model limits across tenants", async () => {
+    const controller = new TrafficController({
+      maxConcurrent: 2,
+      maxConcurrentPerProviderModel: 1,
+    });
+    const started: string[] = [];
+    let releaseFirst!: () => void;
+    const firstGate = new Promise<void>((resolve) => {
+      releaseFirst = resolve;
+    });
+
+    const first = controller.handleText({
+      tenantId: "tenant-a",
+      metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+      execute: async () => {
+        started.push("tenant-a");
+        await firstGate;
+        return "a";
+      },
+    });
+
+    const second = controller.handleText({
+      tenantId: "tenant-b",
+      metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+      execute: async () => {
+        started.push("tenant-b");
+        return "b";
+      },
+    });
+
+    await new Promise((resolve) => setTimeout(resolve, 0));
+    expect(started).toEqual(["tenant-a"]);
+
+    releaseFirst();
+    await Promise.all([first, second]);
+    expect(started).toEqual(["tenant-a", "tenant-b"]);
+  });
+});
+
+describe("TrafficController rate limit headers", () => {
+  it("parses OpenAI-style compound reset durations (e.g. 
1m30.951s)", () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(1_000_000)); + const controller = new TrafficController({ maxConcurrent: 1 }); + const now = Date.now(); + + const result = controller.updateRateLimitFromHeaders( + { provider: "openai.responses", model: "gpt-4o-mini" }, + { + "x-ratelimit-limit-requests": "10000", + "x-ratelimit-remaining-requests": "9989", + "x-ratelimit-reset-requests": "1m30.951s", + }, + ); + + expect(result).toBeTruthy(); + expect(result?.headerSnapshot.resetRequestsMs).toBeCloseTo(90_951, 6); + expect(result?.state.limit).toBe(10000); + expect(result?.state.remaining).toBe(9989); + expect(result?.state.resetAt).toBe(now + 90_951); + expect(result?.state.reserved).toBe(0); + expect(result?.state.nextAllowedAt).toBe(now); + } finally { + vi.useRealTimers(); + } + }); + + it("keeps resetAt monotonic when headers shorten the reset duration", () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ maxConcurrent: 1 }); + + const first = controller.updateRateLimitFromHeaders( + { provider: "openai.responses", model: "gpt-4o-mini" }, + { + "x-ratelimit-limit-requests": "10000", + "x-ratelimit-remaining-requests": "9999", + "x-ratelimit-reset-requests": "60s", + }, + ); + + expect(first).toBeTruthy(); + expect(first?.state.resetAt).toBe(60_000); + + vi.setSystemTime(new Date(10_000)); + const second = controller.updateRateLimitFromHeaders( + { provider: "openai.responses", model: "gpt-4o-mini" }, + { + "x-ratelimit-limit-requests": "10000", + "x-ratelimit-remaining-requests": "9998", + "x-ratelimit-reset-requests": "5s", + }, + ); + + expect(second).toBeTruthy(); + expect(second?.state.resetAt).toBe(60_000); + } finally { + vi.useRealTimers(); + } + }); + + it("never increases remaining within the same window", () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ maxConcurrent: 1 }); + + const first = controller.updateRateLimitFromHeaders( + { provider: "openai.responses", model: "gpt-4o-mini" }, + { + "x-ratelimit-limit-requests": "10", + "x-ratelimit-remaining-requests": "9", + "x-ratelimit-reset-requests": "60s", + }, + ); + + expect(first?.state.remaining).toBe(9); + expect(first?.state.resetAt).toBe(60_000); + + vi.setSystemTime(new Date(10_000)); + const second = controller.updateRateLimitFromHeaders( + { provider: "openai.responses", model: "gpt-4o-mini" }, + { + "x-ratelimit-limit-requests": "10", + "x-ratelimit-remaining-requests": "8", + "x-ratelimit-reset-requests": "50s", + }, + ); + + expect(second?.state.remaining).toBe(8); + expect(second?.state.resetAt).toBe(60_000); + + vi.setSystemTime(new Date(20_000)); + const third = controller.updateRateLimitFromHeaders( + { provider: "openai.responses", model: "gpt-4o-mini" }, + { + "x-ratelimit-limit-requests": "10", + "x-ratelimit-remaining-requests": "9", + "x-ratelimit-reset-requests": "40s", + }, + ); + + expect(third?.state.remaining).toBe(8); + expect(third?.state.resetAt).toBe(60_000); + } finally { + vi.useRealTimers(); + } + }); + + it("applies Retry-After even when x-ratelimit headers are missing", async () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ maxConcurrent: 1 }); + const order: string[] = []; + + controller.updateRateLimitFromHeaders( + { provider: "p", model: "m" }, + { + "retry-after": "2", + }, + ); + + const p0 = controller.handleText({ + metadata: { provider: "p", model: 
"m", priority: "P0" }, + execute: async () => { + order.push("P0"); + return "P0"; + }, + }); + + await vi.advanceTimersByTimeAsync(1_999); + expect(order).toEqual([]); + + await vi.advanceTimersByTimeAsync(1); + await vi.runAllTimersAsync(); + await p0; + expect(order).toEqual(["P0"]); + } finally { + vi.useRealTimers(); + } + }); + + it("shares rate limits across tenants for the same provider/model", async () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ maxConcurrent: 1 }); + controller.updateRateLimitFromHeaders( + { provider: "openai", model: "gpt-4o", tenantId: "tenant-a" }, + { + "x-ratelimit-limit-requests": "1", + "x-ratelimit-remaining-requests": "0", + "x-ratelimit-reset-requests": "1s", + }, + ); + + const order: string[] = []; + const request = controller.handleText({ + tenantId: "tenant-b", + metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, + execute: async () => { + order.push("tenant-b"); + return "ok"; + }, + }); + + await vi.advanceTimersByTimeAsync(999); + await Promise.resolve(); + expect(order).toEqual([]); + + await vi.advanceTimersByTimeAsync(1); + await vi.runAllTimersAsync(); + await request; + + expect(order).toEqual(["tenant-b"]); + } finally { + vi.useRealTimers(); + } + }); +}); + +describe("TrafficController token limits", () => { + it("blocks OpenAI when the token window is exhausted even without RPM config", async () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ + maxConcurrent: 1, + rateLimits: { + "openai::gpt-4o": { + requestsPerMinute: 0, + tokensPerMinute: 2, + }, + }, + }); + const order: string[] = []; + + const first = controller.handleText({ + tenantId: "tenant-a", + metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, + execute: async () => { + order.push("first"); + return "first"; + }, + extractUsage: () => ({ totalTokens: 2 }), + }); + + const second = controller.handleText({ + tenantId: "tenant-b", + metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, + execute: async () => { + order.push("second"); + return "second"; + }, + extractUsage: () => ({ totalTokens: 1 }), + }); + + await first; + expect(order).toEqual(["first"]); + + await vi.advanceTimersByTimeAsync(60_000 + RATE_LIMIT_PROBE_DELAY_MS - 1); + expect(order).toEqual(["first"]); + + await vi.advanceTimersByTimeAsync(1); + await vi.runAllTimersAsync(); + await second; + expect(order).toEqual(["first", "second"]); + } finally { + vi.useRealTimers(); + } + }); + + it("reserves estimated tokens before dispatch", async () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ + maxConcurrent: 2, + rateLimits: { + "openai::gpt-4o": { + requestsPerMinute: 0, + tokensPerMinute: 2, + }, + }, + }); + const order: string[] = []; + let releaseFirst!: () => void; + const firstGate = new Promise((resolve) => { + releaseFirst = resolve; + }); + + const first = controller.handleText({ + tenantId: "tenant-a", + metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, + estimatedTokens: 2, + execute: async () => { + order.push("first"); + await firstGate; + return "first"; + }, + extractUsage: () => ({ totalTokens: 2 }), + }); + + const second = controller.handleText({ + tenantId: "tenant-b", + metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, + estimatedTokens: 1, + execute: async () => { + order.push("second"); + return "second"; + }, 
+ extractUsage: () => ({ totalTokens: 1 }), + }); + + await Promise.resolve(); + expect(order).toEqual(["first"]); + + await vi.advanceTimersByTimeAsync(60_000 + RATE_LIMIT_PROBE_DELAY_MS - 1); + await Promise.resolve(); + expect(order).toEqual(["first"]); + + await vi.advanceTimersByTimeAsync(1); + await vi.runAllTimersAsync(); + await Promise.resolve(); + expect(order).toEqual(["first", "second"]); + + releaseFirst(); + await Promise.all([first, second]); + } finally { + vi.useRealTimers(); + } + }); + + it("allows token-only configs on non-OpenAI providers", async () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ + maxConcurrent: 2, + rateLimits: { + "p::m": { + requestsPerMinute: 0, + tokensPerMinute: 2, + }, + }, + }); + const order: string[] = []; + + const first = controller.handleText({ + tenantId: "tenant-a", + metadata: { provider: "p", model: "m", priority: "P1" }, + estimatedTokens: 2, + execute: async () => { + order.push("first"); + return "first"; + }, + extractUsage: () => ({ totalTokens: 2 }), + }); + + const second = controller.handleText({ + tenantId: "tenant-b", + metadata: { provider: "p", model: "m", priority: "P1" }, + estimatedTokens: 1, + execute: async () => { + order.push("second"); + return "second"; + }, + extractUsage: () => ({ totalTokens: 1 }), + }); + + await first; + expect(order).toEqual(["first"]); + + await vi.advanceTimersByTimeAsync(29_999); + await Promise.resolve(); + expect(order).toEqual(["first"]); + + await vi.advanceTimersByTimeAsync(1); + await vi.runAllTimersAsync(); + await second; + expect(order).toEqual(["first", "second"]); + } finally { + vi.useRealTimers(); + } + }); + + it("honors OpenAI token headers even without token config", async () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ maxConcurrent: 1 }); + controller.updateRateLimitFromHeaders( + { provider: "openai", model: "gpt-4o" }, + { + "x-ratelimit-limit-tokens": "2", + "x-ratelimit-remaining-tokens": "0", + "x-ratelimit-reset-tokens": "1s", + }, + ); + + const order: string[] = []; + const request = controller.handleText({ + tenantId: "tenant-a", + metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, + estimatedTokens: 1, + execute: async () => { + order.push("run"); + return "ok"; + }, + }); + + await Promise.resolve(); + expect(order).toEqual([]); + + await vi.advanceTimersByTimeAsync(1_000 + RATE_LIMIT_PROBE_DELAY_MS - 1); + await Promise.resolve(); + expect(order).toEqual([]); + + await vi.advanceTimersByTimeAsync(1); + await vi.runAllTimersAsync(); + await request; + expect(order).toEqual(["run"]); + } finally { + vi.useRealTimers(); + } + }); +}); + +describe("TrafficController stream reporting", () => { + it("holds concurrency slots for streams until completion", async () => { + const controller = new TrafficController({ maxConcurrent: 1 }); + const order: string[] = []; + const firstMetadata = { + provider: "p", + model: "m", + priority: "P1" as const, + tenantId: "tenant-a", + }; + const secondMetadata = { + provider: "p", + model: "m", + priority: "P1" as const, + tenantId: "tenant-a", + }; + + const first = controller.handleStream({ + tenantId: "tenant-a", + metadata: firstMetadata, + execute: async () => { + order.push("first"); + return "first"; + }, + }); + + const second = controller.handleStream({ + tenantId: "tenant-a", + metadata: secondMetadata, + execute: async () => { + order.push("second"); + return "second"; 
+ }, + }); + + await first; + await Promise.resolve(); + expect(order).toEqual(["first"]); + + controller.reportStreamSuccess(firstMetadata); + await Promise.resolve(); + expect(order).toEqual(["first", "second"]); + + controller.reportStreamSuccess(secondMetadata); + await Promise.all([first, second]); + }); + + it("slows down after stream 429 errors", async () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ + maxConcurrent: 1, + adaptiveLimiter: { + windowMs: 1_000, + threshold: 1, + minPenaltyMs: 10, + maxPenaltyMs: 10, + penaltyMultiplier: 1, + decayMs: 1_000, + }, + }); + const metadata = { + provider: "p", + model: "m", + priority: "P1" as const, + tenantId: "tenant-a", + }; + + controller.reportStreamFailure( + metadata, + Object.assign(new Error("rate limit"), { status: 429 }), + ); + + const order: string[] = []; + const request = controller.handleText({ + tenantId: "tenant-a", + metadata, + execute: async () => { + order.push("run"); + return "ok"; + }, + }); + + await Promise.resolve(); + expect(order).toEqual([]); + + await vi.advanceTimersByTimeAsync(9); + await Promise.resolve(); + expect(order).toEqual([]); + + await vi.advanceTimersByTimeAsync(1); + await vi.runAllTimersAsync(); + await request; + expect(order).toEqual(["run"]); + } finally { + vi.useRealTimers(); + } + }); + + it("treats post-start stream failures as circuit breaker failures", async () => { + const controller = new TrafficController({ + maxConcurrent: 1, + fallbackChains: { + primary: ["fallback"], + }, + }); + const tenantId = "tenant-1"; + const metadata = { provider: "p", model: "primary", priority: "P1" as const }; + + await controller.handleStream({ + tenantId, + metadata, + execute: async () => ({ ok: true }), + }); + + for (let i = 0; i < CIRCUIT_FAILURE_THRESHOLD; i += 1) { + controller.reportStreamFailure(metadata, new Error("stream-failure")); + } + + const order: string[] = []; + await controller.handleStream({ + tenantId, + metadata, + execute: async () => { + order.push("primary"); + return "primary"; + }, + createFallbackRequest: (target) => ({ + tenantId, + metadata: { + provider: "p", + model: typeof target === "string" ? target : target.model, + priority: "P1", + }, + execute: async () => { + const modelId = typeof target === "string" ? 
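/* fallback target: either a bare model id string or an object carrying a model field */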
target : target.model; + order.push(modelId); + return modelId; + }, + }), + }); + + expect(order).toEqual(["fallback"]); + }); +}); + +describe("TrafficController queue timeouts", () => { + it("times out queued requests even when max concurrency is saturated", async () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ maxConcurrent: 1 }); + const order: string[] = []; + let releaseFirst!: () => void; + const firstGate = new Promise<void>((resolve) => { + releaseFirst = resolve; + }); + + const first = controller.handleText({ + tenantId: "tenant-a", + metadata: { provider: "p", model: "m", priority: "P1" }, + execute: async () => { + order.push("first"); + await firstGate; + return "first"; + }, + }); + + const second = controller.handleText({ + tenantId: "tenant-a", + metadata: { provider: "p", model: "m", priority: "P1" }, + maxQueueWaitMs: 1, + execute: async () => { + order.push("second"); + return "second"; + }, + }); + const secondExpectation = expect(second).rejects.toHaveProperty( + "name", + "QueueWaitTimeoutError", + ); + + await Promise.resolve(); + expect(order).toEqual(["first"]); + + await vi.advanceTimersByTimeAsync(2); + await vi.runAllTimersAsync(); + await secondExpectation; + expect(order).toEqual(["first"]); + + releaseFirst(); + await vi.runAllTimersAsync(); + await first; + } finally { + vi.useRealTimers(); + } + }); + + it("lets fallback requests wait after queue timeout without rejecting", async () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ + maxConcurrent: 1, + fallbackChains: { + "p::m": ["m-fallback"], + }, + }); + const order: string[] = []; + let releaseFirst!: () => void; + const firstGate = new Promise<void>((resolve) => { + releaseFirst = resolve; + }); + + const first = controller.handleText({ + tenantId: "tenant-a", + metadata: { provider: "p", model: "m", priority: "P1" }, + execute: async () => { + order.push("first"); + await firstGate; + return "first"; + }, + }); + + const second = controller.handleText({ + tenantId: "tenant-a", + metadata: { provider: "p", model: "m", priority: "P1" }, + maxQueueWaitMs: 1, + execute: async () => { + order.push("primary"); + return "primary"; + }, + createFallbackRequest: (target) => ({ + tenantId: "tenant-a", + metadata: { + provider: "p", + model: typeof target === "string" ? 
target : target.model, + priority: "P1", + }, + maxQueueWaitMs: 1, + execute: async () => { + order.push("fallback"); + return "fallback"; + }, + }), + }); + + await Promise.resolve(); + expect(order).toEqual(["first"]); + + await vi.advanceTimersByTimeAsync(2); + + const third = controller.handleText({ + tenantId: "tenant-a", + metadata: { provider: "p", model: "other", priority: "P1" }, + execute: async () => { + order.push("third"); + return "third"; + }, + }); + + await Promise.resolve(); + expect(order).toEqual(["first"]); + + releaseFirst(); + await vi.runAllTimersAsync(); + + await expect(second).resolves.toBe("fallback"); + await Promise.all([first, third]); + + expect(order).toEqual(["first", "fallback", "third"]); + } finally { + vi.useRealTimers(); + } + }); +}); diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts new file mode 100644 index 000000000..90d56037c --- /dev/null +++ b/packages/core/src/traffic/traffic-controller.ts @@ -0,0 +1,1408 @@ +import type { Logger } from "../logger"; +import { LoggerProxy } from "../logger"; +import { randomUUID } from "../utils/id"; +import { TrafficCircuitBreaker } from "./traffic-circuit-breaker"; +import { TrafficConcurrencyLimiter } from "./traffic-concurrency-limiter"; +import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal"; +import { + CircuitBreakerOpenError, + QueueWaitTimeoutError, + RateLimitedUpstreamError, + normalizeRateLimitError, +} from "./traffic-errors"; +import { + OpenAIWindowRateLimitStrategy, + type RateLimitUpdateResult, + TokenBucketRateLimitStrategy, + TrafficRateLimiter, +} from "./traffic-rate-limiter"; +import { buildRetryPlanWithPolicy } from "./traffic-retry"; +import type { + AdaptiveLimiterConfig, + FallbackChainEntry, + FallbackPolicy, + FallbackPolicyConfig, + FallbackPolicyMode, + FallbackTarget, + PriorityBurstLimits, + PriorityWeights, + ProviderModelConcurrencyLimit, + RateLimitConfig, + RateLimitKey, + RateLimitStrategyConfig, + RateLimitStrategyKind, + RetryPlan, + RetryPolicyConfig, + TenantConcurrencyLimit, + TenantUsage, + TrafficControllerOptions, + TrafficPriority, + TrafficRequest, + TrafficRequestMetadata, + TrafficRequestType, + TrafficResponseMetadata, +} from "./traffic-types"; +import { TrafficUsageTracker } from "./traffic-usage-tracker"; + +/* ============================================================ + * Traffic Controller + * ============================================================ + */ + +export type { + AdaptiveLimiterConfig, + FallbackChainEntry, + FallbackPolicy, + FallbackPolicyConfig, + FallbackPolicyMode, + FallbackTarget, + PriorityBurstLimits, + PriorityWeights, + ProviderModelConcurrencyLimit, + RateLimitConfig, + RateLimitKey, + RateLimitStrategyConfig, + RateLimitStrategyKind, + TenantConcurrencyLimit, + TenantUsage, + TrafficControllerOptions, + TrafficPriority, + TrafficRequest, + TrafficRequestMetadata, + TrafficResponseMetadata, + TrafficRequestType, +}; + +export { CircuitBreakerOpenError }; +export { QueueWaitTimeoutError }; +export { RateLimitedUpstreamError }; + +type TenantQueueState = { + order: string[]; + index: number; + queues: Map<string, QueuedRequest[]>; +}; + +type RateLimitSnapshot = { + limit?: number; + remaining?: number; + resetAt?: number; + nextAllowedAt?: number; + retryAfterMs?: number; +}; + +type AdaptiveLimiterState = { + recent429s: number[]; + penaltyMs: number; + cooldownUntil?: number; + last429At?: number; +}; + +const DEFAULT_PRIORITY_WEIGHTS: Record<TrafficPriority, number> = {
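+ // Default weighted round-robin credits: one credit is spent per dispatch and all are refilled once every priority with queued work runs out.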
+ P0: 5, + P1: 3, + P2: 2, +}; + +const DEFAULT_ADAPTIVE_LIMITER: Required<AdaptiveLimiterConfig> = { + windowMs: 30_000, + threshold: 3, + minPenaltyMs: 500, + maxPenaltyMs: 10_000, + penaltyMultiplier: 2, + decayMs: 10_000, +}; + +export class TrafficController { + /* ---------- Core ---------- */ + + private readonly scheduler: Scheduler; + private readonly maxConcurrent: number; + private readonly rateLimitKeyBuilder: (metadata?: TrafficRequestMetadata) => string; + private readonly retryPolicy?: RetryPolicyConfig; + private readonly logger: Logger; + private readonly trafficLogger: Logger; + private readonly controllerLogger: Logger; + private readonly concurrencyLimiter: TrafficConcurrencyLimiter; + + private readonly queues: Record<TrafficPriority, TenantQueueState> = { + P0: { order: [], index: 0, queues: new Map() }, + P1: { order: [], index: 0, queues: new Map() }, + P2: { order: [], index: 0, queues: new Map() }, + }; + private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"]; + private readonly priorityWeights: Record<TrafficPriority, number>; + private readonly priorityCredits: Record<TrafficPriority, number>; + + private activeCount = 0; + private drainScheduled = false; + private readonly inFlightStreams = new Map<string, QueuedRequest>(); + + /* ---------- Rate limits ---------- */ + private readonly rateLimiter: TrafficRateLimiter; + + /* ---------- Circuit breakers ---------- */ + private readonly circuitBreaker: TrafficCircuitBreaker; + + /* ---------- Usage ---------- */ + private readonly usageTracker = new TrafficUsageTracker(); + + /* ---------- Traffic metadata ---------- */ + private readonly rateLimitSnapshots = new Map<string, RateLimitSnapshot>(); + + /* ---------- Adaptive limiter ---------- */ + private readonly adaptiveLimiterConfig: Required<AdaptiveLimiterConfig>; + private readonly adaptiveLimiterState = new Map<string, AdaptiveLimiterState>(); + + constructor(options: TrafficControllerOptions = {}) { + this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; + this.scheduler = this.createScheduler(); + this.rateLimitKeyBuilder = options.rateLimitKeyBuilder ?? buildRateLimitKeyFromMetadata; + this.retryPolicy = options.retryPolicy; + const priorityOverrides = options.priorityWeights ?? options.priorityBurstLimits; + const priorityWeights = { + ...DEFAULT_PRIORITY_WEIGHTS, + ...(priorityOverrides ?? {}), + }; + this.priorityWeights = { + P0: Math.max(0, Math.floor(priorityWeights.P0)), + P1: Math.max(0, Math.floor(priorityWeights.P1)), + P2: Math.max(0, Math.floor(priorityWeights.P2)), + }; + this.priorityCredits = { ...this.priorityWeights }; + this.adaptiveLimiterConfig = { + ...DEFAULT_ADAPTIVE_LIMITER, + ...(options.adaptiveLimiter ?? 
{}), + }; + this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); + this.trafficLogger = this.logger.child({ subsystem: "traffic" }); + this.controllerLogger = this.trafficLogger.child({ module: "controller" }); + const rateLimits = options.rateLimits; + const rateLimitStrategy = options.rateLimitStrategy; + this.rateLimiter = new TrafficRateLimiter(() => this.scheduleDrain(), { + rateLimits, + strategyFactory: (key) => { + const strategyKind = this.resolveRateLimitStrategy(key, rateLimitStrategy); + if (strategyKind === "window") { + return new OpenAIWindowRateLimitStrategy(key, rateLimits?.[key]); + } + return new TokenBucketRateLimitStrategy(key, rateLimits?.[key]); + }, + }); + this.circuitBreaker = new TrafficCircuitBreaker({ + fallbackChains: options.fallbackChains, + fallbackPolicy: options.fallbackPolicy, + buildRateLimitKey: (metadata) => this.buildRateLimitKey(metadata), + }); + this.concurrencyLimiter = new TrafficConcurrencyLimiter({ + buildProviderModelKey: (metadata) => buildProviderModelKeyFromMetadata(metadata), + maxConcurrentPerProviderModel: options.maxConcurrentPerProviderModel, + maxConcurrentPerTenant: options.maxConcurrentPerTenant, + }); + + this.controllerLogger.debug("Initialized TrafficController", { + maxConcurrent: this.maxConcurrent, + hasFallbackChains: !!options.fallbackChains, + hasFallbackPolicy: options.fallbackPolicy !== undefined, + hasProviderModelConcurrency: options.maxConcurrentPerProviderModel !== undefined, + hasTenantConcurrency: options.maxConcurrentPerTenant !== undefined, + hasConfigRateLimits: options.rateLimits !== undefined, + hasStrategyOverrides: options.rateLimitStrategy !== undefined, + hasRetryPolicy: options.retryPolicy !== undefined, + hasPriorityBurstLimits: options.priorityBurstLimits !== undefined, + hasPriorityWeights: options.priorityWeights !== undefined, + hasAdaptiveLimiter: options.adaptiveLimiter !== undefined, + }); + } + + /* ============================================================ + * Public API + * ============================================================ + */ + + handleText<T>(request: TrafficRequest<T>): Promise<T> { + this.controllerLogger.trace("handleText called", { + tenantId: request.tenantId, + provider: request.metadata?.provider, + model: request.metadata?.model, + priority: request.metadata?.priority, + }); + return this.enqueue("text", request); + } + + handleStream<T>(request: TrafficRequest<T>): Promise<T> { + this.controllerLogger.trace("handleStream called", { + tenantId: request.tenantId, + provider: request.metadata?.provider, + model: request.metadata?.model, + priority: request.metadata?.priority, + }); + return this.enqueue("stream", request); + } + + reportStreamSuccess(metadata?: TrafficRequestMetadata): void { + this.controllerLogger.debug("Stream reported success", { + provider: metadata?.provider, + model: metadata?.model, + tenantId: metadata?.tenantId, + priority: metadata?.priority, + }); + this.circuitBreaker.recordSuccess(metadata, this.trafficLogger); + const rateLimitKey = this.buildRateLimitKey(metadata); + const adaptiveKey = this.buildAdaptiveKey( + metadata, + metadata?.tenantId ?? 
"default", + rateLimitKey, + ); + this.recordAdaptiveSuccess(adaptiveKey); + this.releaseStreamSlot(metadata, "success"); + } + + reportStreamFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void { + const rateLimitKey = this.buildRateLimitKey(metadata); + const normalizedRateLimitError = normalizeRateLimitError({ + error, + metadata, + tenantId: metadata?.tenantId, + key: rateLimitKey, + logger: this.trafficLogger, + }); + const errorForHandling = normalizedRateLimitError ?? error; + + this.controllerLogger.warn("Stream reported failure", { + provider: metadata?.provider, + model: metadata?.model, + tenantId: metadata?.tenantId, + priority: metadata?.priority, + errorName: (error as { name?: unknown } | null)?.name, + errorMessage: (error as { message?: unknown } | null)?.message, + status: (error as { status?: unknown } | null)?.status, + statusCode: (error as { statusCode?: unknown } | null)?.statusCode, + }); + this.circuitBreaker.recordFailure(metadata, errorForHandling, this.trafficLogger); + const adaptiveKey = this.buildAdaptiveKey( + metadata, + metadata?.tenantId ?? "default", + rateLimitKey, + ); + if (errorForHandling instanceof RateLimitedUpstreamError) { + this.recordAdaptiveRateLimitHit(adaptiveKey, errorForHandling.retryAfterMs); + } + const traffic = this.buildTrafficResponseMetadataFromMetadata( + metadata, + rateLimitKey, + Date.now(), + errorForHandling, + ); + this.attachTrafficMetadata(errorForHandling, traffic); + if (errorForHandling !== error) { + this.attachTrafficMetadata(error, traffic); + } + this.releaseStreamSlot(metadata, "failure"); + } + + updateRateLimitFromHeaders( + metadata: TrafficRequestMetadata | undefined, + headers: unknown, + ): RateLimitUpdateResult | undefined { + const key = this.buildRateLimitKey(metadata); + this.controllerLogger.debug("updateRateLimitFromHeaders called", { + rateLimitKey: key, + provider: metadata?.provider, + model: metadata?.model, + }); + + const update = this.rateLimiter.updateFromHeaders(metadata, headers, key, this.trafficLogger); + if (!update) { + this.controllerLogger.debug("updateRateLimitFromHeaders skipped (no headers applied)", { + rateLimitKey: key, + }); + return undefined; + } + + this.controllerLogger.debug("Rate limit headers applied", { + rateLimitKey: update.key, + limit: update.state.limit, + remaining: update.state.remaining, + reserved: update.state.reserved, + resetAt: update.state.resetAt, + nextAllowedAt: update.state.nextAllowedAt, + resetRequestsMs: update.headerSnapshot.resetRequestsMs, + resetTokensMs: update.headerSnapshot.resetTokensMs, + }); + + this.rateLimitSnapshots.set(update.key, { + limit: update.state.limit, + remaining: update.state.remaining, + resetAt: update.state.resetAt, + nextAllowedAt: update.state.nextAllowedAt, + retryAfterMs: update.headerSnapshot.retryAfterMs, + }); + + return update; + } + + getTenantUsage(tenantId: string): TenantUsage | undefined { + this.controllerLogger.trace("getTenantUsage called", { tenantId }); + return this.usageTracker.getTenantUsage(tenantId); + } + + /* ============================================================ + * Scheduler & Queue + * ============================================================ + */ + + private createScheduler(): Scheduler { + return typeof queueMicrotask === "function" ? 
queueMicrotask : (cb) => setTimeout(cb, 0); + } + + private enqueue<T>( + type: TrafficRequestType, + request: TrafficRequest<T>, + ): Promise<T> { + return new Promise<T>((resolve, reject) => { + const normalizedRequest = this.ensureStreamRequestId(type, request); + const priority = this.resolvePriority(normalizedRequest.metadata); + const tenantId = this.resolveTenantId(normalizedRequest); + this.controllerLogger.debug("Enqueue request", { + type, + tenantId, + priority, + provider: normalizedRequest.metadata?.provider, + model: normalizedRequest.metadata?.model, + }); + this.enqueueItem({ + type, + request: normalizedRequest, + resolve, + reject, + attempt: 1, + priority, + tenantId, + enqueuedAt: Date.now(), + estimatedTokens: normalizedRequest.estimatedTokens, + extractUsage: normalizedRequest.extractUsage, + }); + this.scheduleDrain(); + }); + } + + private scheduleDrain(): void { + if (this.drainScheduled) return; + this.drainScheduled = true; + + this.controllerLogger.trace("Drain scheduled"); + this.scheduler(() => { + this.drainScheduled = false; + this.controllerLogger.trace("Drain tick"); + this.drainQueue(); + }); + } + + private drainQueue(): void { + this.controllerLogger.trace("Drain start", { + activeCount: this.activeCount, + maxConcurrent: this.maxConcurrent, + queuedP0: this.getQueuedCount("P0"), + queuedP1: this.getQueuedCount("P1"), + queuedP2: this.getQueuedCount("P2"), + }); + while (true) { + const decision = this.tryDispatchNext(); + this.controllerLogger.trace("Dispatch decision", decision); + if (decision.kind === "dispatch" || decision.kind === "skip") continue; + if (decision.kind === "wait") { + if (decision.wakeUpAt) { + this.controllerLogger.debug("Rate limit wait; scheduling wakeup", { + wakeUpAt: decision.wakeUpAt, + inMs: Math.max(0, decision.wakeUpAt - Date.now()), + }); + this.scheduleRateLimitWakeUpAt(decision.wakeUpAt); + } + return; + } + return; + } + } + + /* ============================================================ + * Dispatch + * ============================================================ + */ + + private tryDispatchNext(): DispatchDecision { + if (this.activeCount >= this.maxConcurrent) { + const timeoutSweep = this.processQueueTimeoutsOnly(Date.now()); + if (timeoutSweep.evicted) return { kind: "skip" }; + return timeoutSweep.wakeUpAt !== undefined + ? { kind: "wait", wakeUpAt: timeoutSweep.wakeUpAt } + : { kind: "wait" }; + } + + let earliestWakeUpAt: number | undefined; + + const observeWakeUpAt = (candidate?: number): void => { + if (candidate === undefined) return; + earliestWakeUpAt = + earliestWakeUpAt === undefined ? candidate : Math.min(earliestWakeUpAt, candidate); + }; + + const priorities = this.getPriorityDispatchOrder(); + for (const priority of priorities) { + const state = this.queues[priority]; + if (state.order.length === 0) continue; + + let attempts = 0; + const maxAttempts = state.order.length; + + while (attempts < maxAttempts) { + const candidate = this.getNextTenantCandidate(priority); + if (!candidate) break; + attempts += 1; + + const now = Date.now(); + const result = this.processQueuedCandidate(priority, candidate, now); + observeWakeUpAt(result.wakeUpAt); + if (result.action === "dispatch") return { kind: "dispatch" }; + if (result.action === "skip") return { kind: "skip" }; + } + } + + return earliestWakeUpAt !== undefined + ? 
{ kind: "wait", wakeUpAt: earliestWakeUpAt } + : { kind: "wait" }; + } + + private startRequest(item: QueuedRequest, queue: QueuedRequest[], tenantId: string): void { + this.controllerLogger.debug("Start request", { + priority: item.priority, + type: item.type, + tenantId: item.tenantId, + attempt: item.attempt, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + }); + item.dispatchedAt = Date.now(); + queue.shift(); + this.cleanupTenantQueue(item.priority, tenantId, queue); + this.recordPriorityDispatch(item.priority); + this.activeCount++; + this.concurrencyLimiter.acquire(item, this.trafficLogger); + this.rateLimiter.notifyDispatch(item.rateLimitKey, this.trafficLogger); + this.circuitBreaker.markTrial(item, this.trafficLogger); + void this.executeRequest(item); + } + + /* ============================================================ + * Execution + * ============================================================ + */ + + private async executeRequest(item: QueuedRequest): Promise { + const startedAt = Date.now(); + let streamHeld = false; + try { + this.controllerLogger.debug("Execute request", { + priority: item.priority, + type: item.type, + tenantId: item.tenantId, + attempt: item.attempt, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + rateLimitKey: item.rateLimitKey, + circuitKey: item.circuitKey, + circuitStatus: item.circuitStatus, + activeCount: this.activeCount, + }); + const result = await item.request.execute(); + const rateLimitKey = item.rateLimitKey ?? this.buildRateLimitKey(item.request.metadata); + const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey); + this.controllerLogger.debug("Request succeeded", { + tenantId: item.tenantId, + attempt: item.attempt, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + elapsedMs: Date.now() - startedAt, + }); + if (item.type === "stream") { + this.controllerLogger.trace("Stream started successfully", { + tenantId: item.tenantId, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + }); + } else { + this.circuitBreaker.recordSuccess(item.request.metadata, this.trafficLogger); + } + const usage = this.usageTracker.recordUsage(item, result, this.trafficLogger); + this.rateLimiter.recordUsage(rateLimitKey, usage, this.trafficLogger, item.reservedTokens); + this.recordAdaptiveSuccess(adaptiveKey); + this.attachTrafficMetadata( + result, + this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now()), + ); + if (item.type === "stream") { + const requestId = item.request.metadata?.requestId; + if (!requestId) { + this.controllerLogger.warn("Stream missing requestId; releasing slot immediately", { + tenantId: item.tenantId, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + }); + } else { + this.inFlightStreams.set(requestId, item); + streamHeld = true; + this.controllerLogger.debug("Stream registered; holding slot", { + requestId, + tenantId: item.tenantId, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + }); + } + } + item.resolve(result); + } catch (error) { + const rateLimitKey = item.rateLimitKey ?? this.buildRateLimitKey(item.request.metadata); + const normalizedRateLimitError = normalizeRateLimitError({ + error, + metadata: item.request.metadata, + tenantId: item.tenantId, + key: rateLimitKey, + logger: this.trafficLogger, + }); + const errorForHandling = normalizedRateLimitError ?? 
error; + const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey); + if (typeof item.reservedTokens === "number" && item.reservedTokens > 0) { + this.rateLimiter.recordUsage( + rateLimitKey, + { totalTokens: 0 }, + this.trafficLogger, + item.reservedTokens, + ); + } + if (errorForHandling instanceof RateLimitedUpstreamError) { + this.recordAdaptiveRateLimitHit(adaptiveKey, errorForHandling.retryAfterMs); + } + + this.controllerLogger.warn("Request failed", { + tenantId: item.tenantId, + attempt: item.attempt, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + elapsedMs: Date.now() - startedAt, + errorName: (error as { name?: unknown } | null)?.name, + errorMessage: (error as { message?: unknown } | null)?.message, + status: (error as { status?: unknown } | null)?.status, + statusCode: (error as { statusCode?: unknown } | null)?.statusCode, + }); + this.circuitBreaker.recordFailure( + item.request.metadata, + errorForHandling, + this.trafficLogger, + ); + this.attachTrafficMetadata( + errorForHandling, + this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now(), errorForHandling), + ); + + const retry = buildRetryPlanWithPolicy( + { + error: errorForHandling, + attempt: item.attempt, + metadata: item.request.metadata, + key: rateLimitKey, + logger: this.trafficLogger, + }, + this.retryPolicy, + ); + if (retry) { + if (!this.canRetryWithinDeadline(item, retry.delayMs)) { + this.controllerLogger.debug("Retry skipped; deadline exceeded", { + tenantId: item.tenantId, + attempt: item.attempt, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + deadlineAt: item.request.deadlineAt, + delayMs: retry.delayMs, + }); + item.reject(errorForHandling); + } else { + this.controllerLogger.debug("Retrying request", { + tenantId: item.tenantId, + attempt: item.attempt, + nextAttempt: item.attempt + 1, + reason: retry.reason, + delayMs: retry.delayMs, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + }); + this.scheduleRetry(item, retry); + } + } else { + this.controllerLogger.debug("No retry plan; rejecting request", { + tenantId: item.tenantId, + attempt: item.attempt, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + }); + item.reject(errorForHandling); + } + } finally { + if (!(item.type === "stream" && streamHeld)) { + this.releaseActiveSlot(item, "completed"); + } + } + } + + /* ============================================================ + * Retry logic + * ============================================================ + */ + + private scheduleRetry(item: QueuedRequest, plan: RetryPlan): void { + this.controllerLogger.debug("Schedule retry", { + tenantId: item.tenantId, + priority: item.priority, + currentAttempt: item.attempt, + nextAttempt: item.attempt + 1, + reason: plan.reason, + delayMs: plan.delayMs, + }); + setTimeout(() => { + this.controllerLogger.debug("Retry timer fired", { + tenantId: item.tenantId, + priority: item.priority, + nextAttempt: item.attempt + 1, + }); + this.enqueueItem({ + ...item, + attempt: item.attempt + 1, + enqueuedAt: Date.now(), + dispatchedAt: undefined, + reservedTokens: undefined, + tenantConcurrencyKey: undefined, + providerModelConcurrencyKey: undefined, + rateLimitKey: undefined, + etaMs: undefined, + circuitKey: undefined, + circuitStatus: undefined, + }); + this.scheduleDrain(); + }, plan.delayMs); + } + + private canRetryWithinDeadline(item: QueuedRequest, delayMs: number): 
boolean { + const deadlineAt = item.request.deadlineAt; + if (!deadlineAt) return true; + const nextAttemptAt = Date.now() + delayMs; + return nextAttemptAt <= deadlineAt; + } + + /* ============================================================ + * Rate limiting (verbatim logic) + * ============================================================ + */ + + private resolveRateLimit(next: QueuedRequest): DispatchDecision | null { + const key = this.buildRateLimitKey(next.request.metadata); + return this.rateLimiter.resolve(next, key, this.trafficLogger); + } + + private scheduleRateLimitWakeUpAt(wakeUpAt: number): void { + this.rateLimiter.scheduleWakeUpAt(wakeUpAt, this.trafficLogger); + } + + /* ============================================================ + * Circuit breakers (verbatim logic, linearized) + * ============================================================ + */ + + private resolveCircuit(next: QueuedRequest): DispatchDecision | null { + return this.circuitBreaker.resolve(next, this.trafficLogger); + } + + /* ============================================================ + * Utilities + * ============================================================ + */ + + private resolveQueueTimeoutAt(next: QueuedRequest): number | undefined { + if (next.queueTimeoutDisabled) { + return next.request.deadlineAt; + } + const maxQueueWaitMs = next.request.maxQueueWaitMs; + const normalizedMaxWait = + typeof maxQueueWaitMs === "number" && Number.isFinite(maxQueueWaitMs) + ? Math.max(0, maxQueueWaitMs) + : undefined; + const timeoutAt = + normalizedMaxWait !== undefined ? next.enqueuedAt + normalizedMaxWait : undefined; + const deadlineAt = next.request.deadlineAt; + if (timeoutAt === undefined) return deadlineAt; + if (deadlineAt === undefined) return timeoutAt; + return Math.min(timeoutAt, deadlineAt); + } + + private handleQueueTimeout( + next: QueuedRequest, + queue: QueuedRequest[], + index: number, + now: number, + queueTimeoutAt?: number, + ): "none" | "expired" | "rejected" { + if (queueTimeoutAt === undefined) return "none"; + if (now < queueTimeoutAt) return "none"; + + const fallbackApplied = this.circuitBreaker.tryFallback( + next, + "queue-timeout", + this.trafficLogger, + ); + if (fallbackApplied) { + return "none"; + } + + const timeoutError = this.createQueueTimeoutError(next, now); + this.attachTrafficMetadata( + timeoutError, + this.buildTrafficResponseMetadata( + next, + timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), + now, + timeoutError, + ), + ); + this.controllerLogger.warn("Queue wait timed out; rejecting request", { + tenantId: next.tenantId, + waitedMs: timeoutError.waitedMs, + maxQueueWaitMs: timeoutError.maxQueueWaitMs, + deadlineAt: timeoutError.deadlineAt, + provider: next.request.metadata?.provider, + model: next.request.metadata?.model, + rateLimitKey: timeoutError.rateLimitKey, + }); + queue.splice(index, 1); + next.reject(timeoutError); + return "rejected"; + } + + private rejectIfQueueTimedOut( + queueTimeoutExpired: boolean, + next: QueuedRequest, + queue: QueuedRequest[], + index: number, + now: number, + reason: string, + ): boolean { + if (!queueTimeoutExpired) return false; + const timeoutError = this.createQueueTimeoutError(next, now); + this.attachTrafficMetadata( + timeoutError, + this.buildTrafficResponseMetadata( + next, + timeoutError.rateLimitKey ?? 
this.buildRateLimitKey(next.request.metadata), + now, + timeoutError, + ), + ); + this.controllerLogger.warn("Queue wait timed out during gate wait", { + tenantId: next.tenantId, + waitedMs: timeoutError.waitedMs, + maxQueueWaitMs: timeoutError.maxQueueWaitMs, + deadlineAt: timeoutError.deadlineAt, + provider: next.request.metadata?.provider, + model: next.request.metadata?.model, + rateLimitKey: timeoutError.rateLimitKey, + reason, + }); + queue.splice(index, 1); + next.reject(timeoutError); + return true; + } + + private createQueueTimeoutError(next: QueuedRequest, now: number): QueueWaitTimeoutError { + const waitedMs = Math.max(0, now - next.enqueuedAt); + return new QueueWaitTimeoutError({ + waitedMs, + maxQueueWaitMs: next.request.maxQueueWaitMs, + deadlineAt: next.request.deadlineAt, + metadata: next.request.metadata, + rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), + }); + } + + private resolveTenantId(request: TrafficRequest): string { + return request.tenantId ?? request.metadata?.tenantId ?? "default"; + } + + private enqueueItem(item: QueuedRequest): void { + const state = this.queues[item.priority]; + const tenantId = item.tenantId; + let queue = state.queues.get(tenantId); + if (!queue) { + queue = []; + state.queues.set(tenantId, queue); + state.order.push(tenantId); + } + queue.push(item); + } + + private getQueuedCount(priority: TrafficPriority): number { + const state = this.queues[priority]; + let total = 0; + for (const queue of state.queues.values()) { + total += queue.length; + } + return total; + } + + private refillPriorityCredits(): void { + this.priorityCredits.P0 = this.priorityWeights.P0; + this.priorityCredits.P1 = this.priorityWeights.P1; + this.priorityCredits.P2 = this.priorityWeights.P2; + } + + private recordPriorityDispatch(priority: TrafficPriority): void { + if (this.priorityCredits[priority] > 0) { + this.priorityCredits[priority] -= 1; + } + } + + private getPriorityDispatchOrder(): TrafficPriority[] { + const prioritiesWithWork = this.priorityOrder.filter( + (priority) => this.getQueuedCount(priority) > 0, + ); + if (prioritiesWithWork.length === 0) return []; + + let available = prioritiesWithWork.filter((priority) => this.priorityCredits[priority] > 0); + if (available.length === 0) { + this.refillPriorityCredits(); + available = prioritiesWithWork.filter((priority) => this.priorityCredits[priority] > 0); + } + + return available.length === 0 ? 
prioritiesWithWork : available; + } + + private getNextTenantCandidate( + priority: TrafficPriority, + ): { item: QueuedRequest; queue: QueuedRequest[]; tenantId: string } | undefined { + const state = this.queues[priority]; + if (state.order.length === 0) return undefined; + const maxAttempts = state.order.length; + let attempts = 0; + + while (attempts < maxAttempts && state.order.length > 0) { + const index = state.index % state.order.length; + const tenantId = state.order[index]; + const queue = state.queues.get(tenantId); + attempts += 1; + + if (!queue || queue.length === 0) { + this.removeTenantQueue(priority, tenantId); + continue; + } + + state.index = (index + 1) % state.order.length; + return { item: queue[0], queue, tenantId }; + } + + return undefined; + } + + private cleanupTenantQueue( + priority: TrafficPriority, + tenantId: string, + queue: QueuedRequest[], + ): void { + if (queue.length > 0) return; + this.removeTenantQueue(priority, tenantId); + } + + private removeTenantQueue(priority: TrafficPriority, tenantId: string): void { + const state = this.queues[priority]; + state.queues.delete(tenantId); + const index = state.order.indexOf(tenantId); + if (index === -1) return; + state.order.splice(index, 1); + if (state.order.length === 0) { + state.index = 0; + return; + } + if (state.index > index) { + state.index -= 1; + } + if (state.index >= state.order.length) { + state.index = 0; + } + } + + private resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority { + return metadata?.priority ?? "P1"; + } + + private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { + return this.rateLimitKeyBuilder(metadata); + } + + private processQueueTimeoutsOnly(now: number): { evicted: boolean; wakeUpAt?: number } { + let evicted = false; + let wakeUpAt: number | undefined; + + const observeWakeUpAt = (candidate?: number): void => { + if (candidate === undefined) return; + wakeUpAt = wakeUpAt === undefined ? 
candidate : Math.min(wakeUpAt, candidate); + }; + + for (const priority of this.priorityOrder) { + const state = this.queues[priority]; + if (state.order.length === 0) continue; + + for (const tenantId of [...state.order]) { + const queue = state.queues.get(tenantId); + if (!queue || queue.length === 0) { + this.removeTenantQueue(priority, tenantId); + continue; + } + + const next = queue[0]; + const queueTimeoutAt = this.resolveQueueTimeoutAt(next); + if (queueTimeoutAt !== undefined && now < queueTimeoutAt) { + observeWakeUpAt(queueTimeoutAt); + } + + const queueTimeoutTriggered = this.handleQueueTimeout(next, queue, 0, now, queueTimeoutAt); + if (queueTimeoutTriggered === "rejected") { + evicted = true; + this.cleanupTenantQueue(priority, tenantId, queue); + } + } + } + + return { evicted, wakeUpAt }; + } + + private processQueuedCandidate( + priority: TrafficPriority, + candidate: { item: QueuedRequest; queue: QueuedRequest[]; tenantId: string }, + now: number, + ): { action: "dispatch" | "skip" | "continue"; wakeUpAt?: number } { + const { item: next, queue, tenantId } = candidate; + let wakeUpAt: number | undefined; + const queueTimeoutAt = this.resolveQueueTimeoutAt(next); + const queueTimeoutTriggered = this.handleQueueTimeout(next, queue, 0, now, queueTimeoutAt); + if (queueTimeoutTriggered === "rejected") { + this.cleanupTenantQueue(priority, tenantId, queue); + return { action: "skip" }; + } + if (queueTimeoutAt !== undefined && now < queueTimeoutAt) { + wakeUpAt = queueTimeoutAt; + } + const queueTimeoutExpired = queueTimeoutTriggered === "expired"; + + this.controllerLogger.trace("Evaluate next queued request", { + priority, + tenantId: next.tenantId, + type: next.type, + attempt: next.attempt, + provider: next.request.metadata?.provider, + model: next.request.metadata?.model, + queueLength: queue.length, + }); + + const circuit = this.resolveCircuit(next); + if (circuit) { + this.controllerLogger.trace("Circuit resolution returned decision", { + priority, + decision: circuit, + circuitKey: next.circuitKey, + circuitStatus: next.circuitStatus, + }); + if (circuit.kind === "skip") { + queue.shift(); + this.cleanupTenantQueue(priority, tenantId, queue); + return { action: "skip", wakeUpAt }; + } + if (circuit.kind === "wait") { + if (this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "circuit wait")) { + this.cleanupTenantQueue(priority, tenantId, queue); + return { action: "skip", wakeUpAt }; + } + next.etaMs = + circuit.wakeUpAt !== undefined ? 
Math.max(0, circuit.wakeUpAt - now) : undefined; + return { action: "continue", wakeUpAt: this.pickEarlierWakeUp(wakeUpAt, circuit.wakeUpAt) }; + } + } + + const concurrency = this.concurrencyLimiter.resolve(next, this.trafficLogger); + if (concurrency.kind === "wait") { + this.controllerLogger.trace("Concurrency gate blocked request", { + priority, + tenantId: next.tenantId, + provider: next.request.metadata?.provider, + model: next.request.metadata?.model, + reasons: concurrency.reasons, + }); + if ( + this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "concurrency wait") + ) { + this.cleanupTenantQueue(priority, tenantId, queue); + return { action: "skip", wakeUpAt }; + } + next.etaMs = undefined; + return { action: "continue", wakeUpAt }; + } + + const adaptive = this.resolveAdaptiveLimit(next, now); + if (adaptive?.kind === "wait") { + if (this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "adaptive wait")) { + this.cleanupTenantQueue(priority, tenantId, queue); + return { action: "skip", wakeUpAt }; + } + next.etaMs = + adaptive.wakeUpAt !== undefined ? Math.max(0, adaptive.wakeUpAt - now) : undefined; + return { action: "continue", wakeUpAt: this.pickEarlierWakeUp(wakeUpAt, adaptive.wakeUpAt) }; + } + + const rateLimit = this.resolveRateLimit(next); + if (rateLimit) { + this.controllerLogger.trace("Rate limit resolution returned decision", { + priority, + decision: rateLimit, + rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), + }); + if (rateLimit.kind === "wait") { + if ( + this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "rate limit wait") + ) { + this.cleanupTenantQueue(priority, tenantId, queue); + return { action: "skip", wakeUpAt }; + } + next.etaMs = + rateLimit.wakeUpAt !== undefined ? Math.max(0, rateLimit.wakeUpAt - now) : undefined; + return { + action: "continue", + wakeUpAt: this.pickEarlierWakeUp(wakeUpAt, rateLimit.wakeUpAt), + }; + } + return { action: "continue", wakeUpAt }; + } + + if (queueTimeoutExpired) { + const timeoutError = this.createQueueTimeoutError(next, now); + this.attachTrafficMetadata( + timeoutError, + this.buildTrafficResponseMetadata( + next, + timeoutError.rateLimitKey ?? 
this.buildRateLimitKey(next.request.metadata), + now, + timeoutError, + ), + ); + this.controllerLogger.warn("Queue wait timed out before dispatch", { + tenantId: next.tenantId, + waitedMs: timeoutError.waitedMs, + maxQueueWaitMs: timeoutError.maxQueueWaitMs, + deadlineAt: timeoutError.deadlineAt, + provider: next.request.metadata?.provider, + model: next.request.metadata?.model, + rateLimitKey: timeoutError.rateLimitKey, + }); + queue.shift(); + this.cleanupTenantQueue(priority, tenantId, queue); + next.reject(timeoutError); + return { action: "skip", wakeUpAt }; + } + + this.startRequest(next, queue, tenantId); + return { action: "dispatch", wakeUpAt }; + } + + private pickEarlierWakeUp( + current: number | undefined, + candidate: number | undefined, + ): number | undefined { + if (candidate === undefined) return current; + if (current === undefined) return candidate; + return Math.min(current, candidate); + } + + private ensureStreamRequestId( + type: TrafficRequestType, + request: TrafficRequest, + ): TrafficRequest { + if (type !== "stream") return request; + const metadata = request.metadata; + if (metadata?.requestId) return request; + + const requestId = randomUUID(); + if (metadata && typeof metadata === "object") { + (metadata as TrafficRequestMetadata).requestId = requestId; + return request; + } + + return { + ...request, + metadata: { + ...(metadata ?? {}), + requestId, + }, + }; + } + + private releaseStreamSlot( + metadata: TrafficRequestMetadata | undefined, + outcome: "success" | "failure", + ): void { + const requestId = metadata?.requestId; + if (!requestId) { + this.controllerLogger.debug("Stream completion missing requestId; slot not released", { + outcome, + }); + return; + } + const item = this.inFlightStreams.get(requestId); + if (!item) { + this.controllerLogger.debug("Stream completion missing in-flight entry", { + requestId, + outcome, + }); + return; + } + this.inFlightStreams.delete(requestId); + this.controllerLogger.debug("Stream completed; releasing slot", { + requestId, + tenantId: item.tenantId, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + outcome, + }); + this.releaseActiveSlot(item, `stream-${outcome}`); + } + + private releaseActiveSlot(item: QueuedRequest, reason: string): void { + this.rateLimiter.releaseReservation(item.rateLimitKey, this.trafficLogger); + this.concurrencyLimiter.release(item, this.trafficLogger); + this.activeCount = Math.max(0, this.activeCount - 1); + this.controllerLogger.trace("Request finished; slot released", { + tenantId: item.tenantId, + activeCount: this.activeCount, + maxConcurrent: this.maxConcurrent, + reason, + }); + this.scheduleDrain(); + } + + private resolveAdaptiveLimit(next: QueuedRequest, now: number): DispatchDecision | null { + const rateLimitKey = next.rateLimitKey ?? 
this.buildRateLimitKey(next.request.metadata); + const adaptiveKey = this.buildAdaptiveKey(next.request.metadata, next.tenantId, rateLimitKey); + const state = this.adaptiveLimiterState.get(adaptiveKey); + if (!state) return null; + + this.applyAdaptiveDecay(state, now); + if (state.cooldownUntil !== undefined && now < state.cooldownUntil) { + return { kind: "wait", wakeUpAt: state.cooldownUntil }; + } + + return null; + } + + private recordAdaptiveRateLimitHit(key: string, retryAfterMs?: number): void { + const state = this.getAdaptiveState(key); + const now = Date.now(); + const { windowMs, threshold, minPenaltyMs, maxPenaltyMs, penaltyMultiplier } = + this.adaptiveLimiterConfig; + + state.last429At = now; + state.recent429s = state.recent429s.filter((timestamp) => now - timestamp <= windowMs); + state.recent429s.push(now); + + if (state.recent429s.length < threshold) { + return; + } + + const basePenalty = state.penaltyMs > 0 ? state.penaltyMs : minPenaltyMs; + const nextPenalty = Math.min( + maxPenaltyMs, + Math.max(minPenaltyMs, Math.round(basePenalty * penaltyMultiplier)), + ); + state.penaltyMs = nextPenalty; + const retryPenalty = typeof retryAfterMs === "number" ? retryAfterMs : 0; + const cooldownMs = Math.max(nextPenalty, retryPenalty); + state.cooldownUntil = now + cooldownMs; + } + + private recordAdaptiveSuccess(key: string): void { + const state = this.adaptiveLimiterState.get(key); + if (!state) return; + + const now = Date.now(); + this.applyAdaptiveDecay(state, now); + if (state.penaltyMs === 0) { + state.cooldownUntil = undefined; + state.recent429s = []; + state.last429At = undefined; + } + } + + private applyAdaptiveDecay(state: AdaptiveLimiterState, now: number): void { + const { decayMs, penaltyMultiplier } = this.adaptiveLimiterConfig; + if (state.last429At && now - state.last429At < decayMs) { + return; + } + + if (state.penaltyMs > 0) { + state.penaltyMs = Math.max(0, Math.floor(state.penaltyMs / penaltyMultiplier)); + } + } + + private getAdaptiveState(key: string): AdaptiveLimiterState { + const existing = this.adaptiveLimiterState.get(key); + if (existing) return existing; + const created: AdaptiveLimiterState = { + recent429s: [], + penaltyMs: 0, + }; + this.adaptiveLimiterState.set(key, created); + return created; + } + + private buildAdaptiveKey( + metadata: TrafficRequestMetadata | undefined, + tenantId: string, + rateLimitKey: string, + ): string { + if (rateLimitKey.includes("tenant=")) { + return rateLimitKey; + } + const tenant = metadata?.tenantId ?? tenantId ?? "default"; + return `${rateLimitKey}::tenant=${encodeURIComponent(tenant)}`; + } + + private buildTrafficResponseMetadata( + item: QueuedRequest, + rateLimitKey: string, + now: number, + error?: unknown, + ): TrafficResponseMetadata { + const snapshot = this.rateLimitSnapshots.get(rateLimitKey); + const retryAfterMs = this.resolveRetryAfterMs(error, snapshot); + const queuedForMs = + item.dispatchedAt !== undefined ? item.dispatchedAt - item.enqueuedAt : now - item.enqueuedAt; + const queueEtaMs = item.etaMs ?? Math.max(0, queuedForMs); + + return { + rateLimitKey, + retryAfterMs, + rateLimitRemaining: snapshot?.remaining, + rateLimitResetAt: snapshot?.resetAt, + rateLimitResetInMs: + snapshot?.resetAt !== undefined ? 
Math.max(0, snapshot.resetAt - now) : undefined, + queueEtaMs, + tenantId: item.tenantId, + priority: item.request.metadata?.priority, + taskType: item.request.metadata?.taskType, + }; + } + + private buildTrafficResponseMetadataFromMetadata( + metadata: TrafficRequestMetadata | undefined, + rateLimitKey: string, + now: number, + error?: unknown, + ): TrafficResponseMetadata { + const snapshot = this.rateLimitSnapshots.get(rateLimitKey); + const retryAfterMs = this.resolveRetryAfterMs(error, snapshot); + + return { + rateLimitKey, + retryAfterMs, + rateLimitRemaining: snapshot?.remaining, + rateLimitResetAt: snapshot?.resetAt, + rateLimitResetInMs: + snapshot?.resetAt !== undefined ? Math.max(0, snapshot.resetAt - now) : undefined, + tenantId: metadata?.tenantId, + priority: metadata?.priority, + taskType: metadata?.taskType, + }; + } + + private attachTrafficMetadata(target: unknown, info: TrafficResponseMetadata): void { + if (!target || typeof target !== "object") return; + (target as Record<string, unknown>).traffic = info; + } + + private resolveRetryAfterMs( + error: unknown | undefined, + snapshot?: RateLimitSnapshot, + ): number | undefined { + if (error && typeof error === "object" && "retryAfterMs" in error) { + const candidate = (error as { retryAfterMs?: unknown }).retryAfterMs; + if (typeof candidate === "number" && Number.isFinite(candidate)) { + return candidate; + } + } + if (snapshot?.retryAfterMs !== undefined) { + return snapshot.retryAfterMs; + } + return undefined; + } + + private resolveRateLimitStrategy( + key: string, + config?: RateLimitStrategyConfig, + ): RateLimitStrategyKind { + const modelOverride = config?.models?.[key]; + if (modelOverride) return modelOverride; + const provider = key.split("::")[0] ?? ""; + const providerOverride = config?.providers?.[provider]; + if (providerOverride) return providerOverride; + if (provider.startsWith("openai")) return "window"; + return "token-bucket"; + } +} + +/* ============================================================ + * Singleton + key builders + * ============================================================ + */ + +let singletonController: TrafficController | undefined; + +export function getTrafficController(options?: TrafficControllerOptions): TrafficController { + if (!singletonController) { + singletonController = new TrafficController(options); + } + return singletonController; +} + +function buildRateLimitKeyFromMetadata(metadata?: TrafficRequestMetadata): string { + const provider = metadata?.provider ?? "default-provider"; + const model = metadata?.model ?? "default-model"; + const parts = [provider, model]; + + // SOP: Add new metadata fields in one place with a stable label and ordering. + // 1) Add the optional field to TrafficRequestMetadata. + // 2) Add it here with a stable label so keys stay predictable. + // Example: { label: "org", value: metadata?.orgId } + const optionalFields: Array<{ label: string; value?: string }> = [ + { label: "apiKey", value: metadata?.apiKeyId }, + { label: "region", value: metadata?.region }, + { label: "endpoint", value: metadata?.endpoint }, + // Intentionally exclude tenantId to enforce provider/model limits across tenants. + // Use rateLimitKeyBuilder to include tenant for per-tenant rate limits. 
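+ // Hypothetical sketch of such an override, passed as options.rateLimitKeyBuilder: + // (m) => [m?.provider ?? "default-provider", m?.model ?? "default-model", `tenant=${encodeURIComponent(m?.tenantId ?? "default")}`].join("::")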
+ { label: "tenantTier", value: metadata?.tenantTier }, + { label: "taskType", value: metadata?.taskType }, + ]; + + for (const field of optionalFields) { + if (!field.value) continue; + parts.push(`${field.label}=${encodeURIComponent(field.value)}`); + } + + return parts.join("::"); +} + +function buildProviderModelKeyFromMetadata(metadata?: TrafficRequestMetadata): string { + const provider = metadata?.provider ?? "default-provider"; + const model = metadata?.model ?? "default-model"; + return `${provider}::${model}`; +} diff --git a/packages/core/src/traffic/traffic-error-utils.ts b/packages/core/src/traffic/traffic-error-utils.ts new file mode 100644 index 000000000..4cbb98b52 --- /dev/null +++ b/packages/core/src/traffic/traffic-error-utils.ts @@ -0,0 +1,148 @@ +import type { Logger } from "../logger"; + +function readObjectProperty(value: unknown, key: string): unknown { + if (!value || typeof value !== "object") return undefined; + return (value as Record)[key]; +} + +export function findHeaders(value: unknown): unknown[] { + const candidates: unknown[] = [ + readObjectProperty(value, "headers"), + readObjectProperty(readObjectProperty(value, "response"), "headers"), + readObjectProperty(readObjectProperty(value, "cause"), "headers"), + readObjectProperty( + readObjectProperty(readObjectProperty(value, "cause"), "response"), + "headers", + ), + ]; + + return candidates.filter((candidate) => candidate !== undefined && candidate !== null); +} + +export function readHeaderValue(headers: unknown, name: string): string | undefined { + if (!headers) return undefined; + + if (typeof (headers as { get?: unknown }).get === "function") { + const v = (headers as { get: (name: string) => unknown }).get(name); + return v === null || v === undefined ? undefined : String(v); + } + + if (typeof headers !== "object") return undefined; + + const entries = Object.entries(headers as Record); + const target = name.toLowerCase(); + const match = entries.find(([k]) => String(k).toLowerCase() === target); + if (!match) return undefined; + + const value = match[1]; + if (Array.isArray(value)) { + const first = value[0]; + return first === null || first === undefined ? undefined : String(first); + } + return value === null || value === undefined ? undefined : String(value); +} + +export function parseRetryAfterMs(value: string, nowMs: number = Date.now()): number | undefined { + const raw = value.trim(); + if (!raw) return undefined; + + const seconds = Number(raw); + if (Number.isFinite(seconds)) { + return Math.max(0, Math.round(seconds * 1000)); + } + + const parsedAt = Date.parse(raw); + if (Number.isFinite(parsedAt)) { + return Math.max(0, parsedAt - nowMs); + } + + return undefined; +} + +export function coerceStatus(value: unknown): number | undefined { + const n = Number(value); + return Number.isFinite(n) ? n : undefined; +} + +export function extractStatusCode(error: unknown, logger?: Logger): number | undefined { + const status = + coerceStatus(readObjectProperty(error, "status")) ?? + coerceStatus(readObjectProperty(error, "statusCode")) ?? + coerceStatus(readObjectProperty(error, "httpStatus")) ?? + coerceStatus(readObjectProperty(readObjectProperty(error, "response"), "status")) ?? 
+ coerceStatus(readObjectProperty(readObjectProperty(error, "cause"), "status")); + + logger?.trace?.("Extracted status code", { + status, + hasStatus: readObjectProperty(error, "status") !== undefined, + hasStatusCode: readObjectProperty(error, "statusCode") !== undefined, + hasHttpStatus: readObjectProperty(error, "httpStatus") !== undefined, + hasResponseStatus: + readObjectProperty(readObjectProperty(error, "response"), "status") !== undefined, + hasCauseStatus: readObjectProperty(readObjectProperty(error, "cause"), "status") !== undefined, + }); + + return status; +} + +export function extractRetryAfterMs(error: unknown, logger?: Logger): number | undefined { + const retryAfterLogger = logger?.child({ module: "retry-after" }); + const candidates = findHeaders(error); + + for (const headers of candidates) { + const raw = readHeaderValue(headers, "retry-after"); + if (!raw) continue; + const parsed = parseRetryAfterMs(raw); + retryAfterLogger?.trace?.("Parsed Retry-After header", { raw, parsedMs: parsed }); + if (parsed !== undefined) return parsed; + } + + retryAfterLogger?.trace?.("Retry-After header missing or unparsable"); + return undefined; +} + +export function isTimeoutError(error: unknown, logger?: Logger): boolean { + const candidates: unknown[] = [error]; + + const cause = readObjectProperty(error, "cause"); + if (cause) { + candidates.push(cause); + const nestedCause = readObjectProperty(cause, "cause"); + if (nestedCause) candidates.push(nestedCause); + } + + for (const candidate of candidates) { + const code = readObjectProperty(candidate, "code"); + const name = readObjectProperty(candidate, "name"); + const message = readObjectProperty(candidate, "message"); + + const codeText = String(code ?? "").toLowerCase(); + const nameText = String(name ?? "").toLowerCase(); + const messageText = String(message ?? "").toLowerCase(); + + const isTimeout = + codeText.includes("timeout") || + codeText.includes("timedout") || + nameText.includes("timeout") || + nameText.includes("timedout") || + messageText.includes("timeout") || + messageText.includes("timedout") || + messageText.includes("timed out"); + + logger?.trace?.("Checked timeout error", { + isTimeout, + code, + name, + messagePreview: typeof message === "string" ? 
message.slice(0, 160) : message,
+      hasCause: candidate !== error,
+    });
+
+    if (isTimeout) return true;
+  }
+
+  return false;
+}
+
+export function isPromiseLike(value: unknown): value is PromiseLike<unknown> {
+  return !!value && typeof (value as { then?: unknown }).then === "function";
+}
diff --git a/packages/core/src/traffic/traffic-errors.ts b/packages/core/src/traffic/traffic-errors.ts
new file mode 100644
index 000000000..4943c89fd
--- /dev/null
+++ b/packages/core/src/traffic/traffic-errors.ts
@@ -0,0 +1,141 @@
+import type { Logger } from "../logger";
+import { extractRetryAfterMs, extractStatusCode } from "./traffic-error-utils";
+import type { TrafficRequestMetadata } from "./traffic-types";
+
+export type RateLimitErrorOptions = {
+  metadata?: TrafficRequestMetadata;
+  retryAfterMs?: number;
+  tenantId?: string;
+  key?: string;
+};
+
+export class CircuitBreakerOpenError extends Error {
+  readonly retryAfterMs?: number;
+  readonly metadata?: TrafficRequestMetadata;
+
+  constructor(message: string, metadata?: TrafficRequestMetadata, retryAfterMs?: number) {
+    super(message);
+    this.name = "CircuitBreakerOpenError";
+    this.metadata = metadata;
+    this.retryAfterMs = retryAfterMs;
+  }
+}
+
+export class QueueWaitTimeoutError extends Error {
+  readonly waitedMs: number;
+  readonly maxQueueWaitMs?: number;
+  readonly deadlineAt?: number;
+  readonly metadata?: TrafficRequestMetadata;
+  readonly rateLimitKey?: string;
+
+  constructor(options: {
+    waitedMs: number;
+    maxQueueWaitMs?: number;
+    deadlineAt?: number;
+    metadata?: TrafficRequestMetadata;
+    rateLimitKey?: string;
+  }) {
+    super("Queue wait time exceeded");
+    this.name = "QueueWaitTimeoutError";
+    this.waitedMs = options.waitedMs;
+    this.maxQueueWaitMs = options.maxQueueWaitMs;
+    this.deadlineAt = options.deadlineAt;
+    this.metadata = options.metadata;
+    this.rateLimitKey = options.rateLimitKey;
+  }
+}
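+// Caller-side sketch (illustrative only; `run`, `sleep`, and `logger` are
+// hypothetical helpers, not part of this patch):
+//
+//   try {
+//     return await run();
+//   } catch (error) {
+//     if (error instanceof QueueWaitTimeoutError) {
+//       logger.warn(`queue wait ${error.waitedMs}ms exceeded ${error.maxQueueWaitMs ?? "?"}ms`);
+//     } else if (error instanceof CircuitBreakerOpenError && error.retryAfterMs !== undefined) {
+//       await sleep(error.retryAfterMs); // honor the breaker's cool-down hint before one retry
+//       return run();
+//     }
+//     throw error;
+//   }
+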
+export class RateLimitedUpstreamError extends Error {
+  readonly status = 429;
+  readonly retryAfterMs?: number;
+  readonly metadata?: TrafficRequestMetadata;
+  readonly provider?: string;
+  readonly model?: string;
+  readonly tenantId?: string;
+  readonly key?: string;
+
+  constructor(
+    message: string,
+    metadata?: TrafficRequestMetadata,
+    retryAfterMs?: number,
+    options?: { tenantId?: string; key?: string },
+  );
+  constructor(message: string, options?: RateLimitErrorOptions);
+  constructor(
+    message: string,
+    metadataOrOptions?: TrafficRequestMetadata | RateLimitErrorOptions,
+    retryAfterMs?: number,
+    legacyOptions?: { tenantId?: string; key?: string },
+  ) {
+    super(message);
+    this.name = "RateLimitedUpstreamError";
+    const isOptions =
+      metadataOrOptions &&
+      (Object.prototype.hasOwnProperty.call(metadataOrOptions, "metadata") ||
+        Object.prototype.hasOwnProperty.call(metadataOrOptions, "retryAfterMs") ||
+        Object.prototype.hasOwnProperty.call(metadataOrOptions, "key"));
+
+    const metadata = isOptions
+      ? (metadataOrOptions as RateLimitErrorOptions).metadata
+      : (metadataOrOptions as TrafficRequestMetadata | undefined);
+    const retryAfter = isOptions
+      ? (metadataOrOptions as RateLimitErrorOptions).retryAfterMs
+      : retryAfterMs;
+    const tenantId = isOptions
+      ? (metadataOrOptions as RateLimitErrorOptions).tenantId
+      : legacyOptions?.tenantId;
+    const key = isOptions ? (metadataOrOptions as RateLimitErrorOptions).key : legacyOptions?.key;
+
+    this.metadata = metadata;
+    this.retryAfterMs = retryAfter;
+    this.provider = metadata?.provider;
+    this.model = metadata?.model;
+    this.tenantId = tenantId ?? metadata?.tenantId;
+    this.key = key;
+  }
+}
+
+export function normalizeRateLimitError(options: {
+  error: unknown;
+  metadata?: TrafficRequestMetadata;
+  tenantId?: string;
+  key?: string;
+  logger?: Logger;
+}): RateLimitedUpstreamError | undefined {
+  const { error, metadata, tenantId, key, logger } = options;
+  const retryAfterMs =
+    error instanceof RateLimitedUpstreamError
+      ? (error.retryAfterMs ?? extractRetryAfterMs(error, logger))
+      : extractRetryAfterMs(error, logger);
+
+  if (error instanceof RateLimitedUpstreamError) {
+    const baseMetadata = metadata ?? error.metadata;
+    const baseTenant = tenantId ?? error.tenantId;
+    const baseKey = key ?? error.key;
+    if (
+      error.metadata === baseMetadata &&
+      error.retryAfterMs === retryAfterMs &&
+      error.tenantId === baseTenant &&
+      error.key === baseKey
+    ) {
+      return error;
+    }
+    return new RateLimitedUpstreamError(error.message, {
+      metadata: baseMetadata,
+      retryAfterMs,
+      tenantId: baseTenant,
+      key: baseKey,
+    });
+  }
+
+  const status = extractStatusCode(error, logger);
+  if (status !== 429) return undefined;
+
+  const message = error instanceof Error ? error.message : "Rate limit exceeded";
+  return new RateLimitedUpstreamError(message, {
+    metadata,
+    retryAfterMs,
+    tenantId,
+    key,
+  });
+}
diff --git a/packages/core/src/traffic/traffic-rate-limiter.ts b/packages/core/src/traffic/traffic-rate-limiter.ts
new file mode 100644
index 000000000..3e5aefbed
--- /dev/null
+++ b/packages/core/src/traffic/traffic-rate-limiter.ts
@@ -0,0 +1,295 @@
+import type { Logger } from "../logger";
+import type {
+  RateLimitStrategy,
+  RateLimitUpdateResult,
+} from "./rate-limit-strategies/rate-limit-strategy";
+import { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy";
+import type { DispatchDecision, QueuedRequest } from "./traffic-controller-internal";
+import type { RateLimitConfig, TrafficRequestMetadata } from "./traffic-types";
+
+export type {
+  RateLimitHeaderSnapshot,
+  RateLimitStrategy,
+  RateLimitUpdateResult,
+} from "./rate-limit-strategies/rate-limit-strategy";
+export { DefaultRateLimitStrategy } from "./rate-limit-strategies/default-rate-limit-strategy";
+export { OpenAIWindowRateLimitStrategy } from "./rate-limit-strategies/openai-window-rate-limit-strategy";
+export { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy";
+
+type SchedulerCallback = () => void;
+
+export type RateLimitStrategyFactory = (key: string) => RateLimitStrategy;
+
+type UsageCounters = {
+  inputTokens?: number;
+  outputTokens?: number;
+  totalTokens?: number;
+};
+
+type TokenRateState = {
+  capacity: number;
+  refillPerSecond: number;
+  tokens: number;
+  updatedAt: number;
+};
+
+export class TrafficRateLimiter {
+  private readonly strategies = new Map<string, RateLimitStrategy>();
+  private readonly tokenRates = new Map<string, TokenRateState>();
+  private wakeUpTimeout?: ReturnType<typeof setTimeout>;
+  private wakeUpAt?: number;
+  private readonly onWakeUp: SchedulerCallback;
+  private readonly strategyFactory: RateLimitStrategyFactory;
+  private readonly rateLimits?: RateLimitConfig;
+
+  constructor(
+    onWakeUp: SchedulerCallback,
+    options?: { strategyFactory?: RateLimitStrategyFactory; rateLimits?: RateLimitConfig },
+  ) {
+    this.onWakeUp = onWakeUp;
+    this.rateLimits = options?.rateLimits;
+    this.strategyFactory =
+      options?.strategyFactory ??
+      ((key) => new TokenBucketRateLimitStrategy(key, this.rateLimits?.[key]));
+  }
+
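+  // Illustrative factory override (the OpenAIWindowRateLimitStrategy constructor
+  // arguments here are an assumption; only the token-bucket signature is shown
+  // in this file):
+  //
+  //   const limiter = new TrafficRateLimiter(onWakeUp, {
+  //     rateLimits,
+  //     strategyFactory: (key) =>
+  //       key.startsWith("openai::")
+  //         ? new OpenAIWindowRateLimitStrategy(key)
+  //         : new TokenBucketRateLimitStrategy(key, rateLimits?.[key]),
+  //   });
+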
+  resolve(next: QueuedRequest, key: string, logger?: Logger): DispatchDecision | null {
+    const strategy = this.strategies.get(key) ?? this.createStrategy(key, logger);
+    const requestDecision = strategy.resolve(next, logger);
+    if (requestDecision?.kind === "wait") {
+      const tokenDecision = strategy.handlesTokenLimits
+        ? null
+        : this.resolveTokenLimit(next, key, logger, false);
+      if (tokenDecision?.kind === "wait") {
+        const requestWakeUp = requestDecision.wakeUpAt;
+        const tokenWakeUp = tokenDecision.wakeUpAt;
+        if (tokenWakeUp !== undefined && requestWakeUp !== undefined) {
+          return { kind: "wait", wakeUpAt: Math.min(requestWakeUp, tokenWakeUp) };
+        }
+        if (tokenWakeUp !== undefined && requestWakeUp === undefined) {
+          return tokenDecision;
+        }
+      }
+      return requestDecision;
+    }
+
+    const tokenDecision = strategy.handlesTokenLimits
+      ? null
+      : this.resolveTokenLimit(next, key, logger, true);
+    if (tokenDecision?.kind === "wait") {
+      return tokenDecision;
+    }
+
+    return requestDecision;
+  }
+
+  notifyDispatch(key: string | undefined, logger?: Logger): void {
+    if (!key) return;
+    this.strategies.get(key)?.onDispatch(logger);
+  }
+
+  scheduleWakeUpAt(wakeUpAt: number, logger?: Logger): void {
+    const rateLimitLogger = logger?.child({ module: "rate-limiter" });
+    const now = Date.now();
+    const target = Math.max(now, wakeUpAt);
+
+    if (this.wakeUpTimeout && this.wakeUpAt !== undefined && this.wakeUpAt <= target) {
+      rateLimitLogger?.trace?.("Wakeup already scheduled earlier; skipping", {
+        currentWakeUpAt: this.wakeUpAt,
+        requestedWakeUpAt: target,
+      });
+      return;
+    }
+
+    if (this.wakeUpTimeout) clearTimeout(this.wakeUpTimeout);
+
+    this.wakeUpAt = target;
+    rateLimitLogger?.debug?.("Scheduling rate limit wakeup", {
+      wakeUpAt: target,
+      inMs: Math.max(1, target - now),
+    });
+    this.wakeUpTimeout = setTimeout(
+      () => {
+        this.wakeUpTimeout = undefined;
+        this.wakeUpAt = undefined;
+        rateLimitLogger?.debug?.("Rate limit wakeup fired");
+        this.onWakeUp();
+      },
+      Math.max(1, target - now),
+    );
+  }
+
+  releaseReservation(key?: string, logger?: Logger): void {
+    if (!key) return;
+    this.strategies.get(key)?.onComplete(logger);
+  }
+
+  recordUsage(
+    key: string | undefined,
+    usage: UsageCounters | Promise<UsageCounters> | undefined,
+    logger?: Logger,
+    reservedTokens?: number,
+  ): void {
+    if (!key || !usage) return;
+    if (typeof (usage as PromiseLike<UsageCounters>).then === "function") {
+      void (usage as Promise<UsageCounters>)
+        .then((resolved) => this.recordUsage(key, resolved, logger, reservedTokens))
+        .catch(() => {});
+      return;
+    }
+
+    const strategy = this.strategies.get(key);
+    if (strategy?.recordUsage) {
+      strategy.recordUsage(usage, logger, reservedTokens);
+      return;
+    }
+
+    const tokens = this.resolveTokenCount(usage);
+    if (tokens <= 0) return;
+
+    const bucket = this.getTokenRateState(key, logger);
+    if (!bucket) return;
+
+    const now = Date.now();
+    this.refillTokenRate(bucket, now);
+    bucket.tokens = Math.min(bucket.capacity, bucket.tokens);
+    const reserved = typeof reservedTokens === "number" ?
reservedTokens : 0; + const delta = tokens - reserved; + if (delta > 0) { + bucket.tokens -= delta; + } else if (delta < 0) { + bucket.tokens = Math.min(bucket.capacity, bucket.tokens + Math.abs(delta)); + } + + if (bucket.tokens < 0 && bucket.refillPerSecond > 0) { + const waitMs = Math.max(1, Math.ceil((-bucket.tokens / bucket.refillPerSecond) * 1000)); + this.scheduleWakeUpAt(now + waitMs, logger); + } + } + + updateFromHeaders( + metadata: TrafficRequestMetadata | undefined, + headers: unknown, + key: string, + logger?: Logger, + ): RateLimitUpdateResult | undefined { + const existing = this.strategies.get(key); + if (existing) return existing.updateFromHeaders(metadata, headers, logger); + + const created = this.strategyFactory(key); + const update = created.updateFromHeaders(metadata, headers, logger); + if (!update) return undefined; + this.strategies.set(key, created); + return update; + } + + private createStrategy(key: string, logger?: Logger): RateLimitStrategy { + const created = this.strategyFactory(key); + this.strategies.set(key, created); + logger?.child({ module: "rate-limiter" })?.trace?.("Created rate limit strategy", { + rateLimitKey: key, + strategy: created.constructor.name, + }); + return created; + } + + private resolveTokenLimit( + next: QueuedRequest, + key: string, + logger?: Logger, + reserveTokens = true, + ): DispatchDecision | null { + const bucket = this.getTokenRateState(key, logger); + if (!bucket) return null; + + const now = Date.now(); + this.refillTokenRate(bucket, now); + + if (bucket.capacity <= 0) { + logger?.child({ module: "rate-limiter" })?.debug?.("Token limit misconfigured; blocking", { + rateLimitKey: key, + capacity: bucket.capacity, + refillPerSecond: bucket.refillPerSecond, + }); + return { kind: "wait" }; + } + + const estimatedTokens = next.estimatedTokens; + if (typeof estimatedTokens === "number" && estimatedTokens > 0) { + if (bucket.tokens >= estimatedTokens) { + if (reserveTokens) { + bucket.tokens -= estimatedTokens; + next.reservedTokens = estimatedTokens; + } + return null; + } + } else if (bucket.tokens >= 0) { + return null; + } + + if (bucket.refillPerSecond <= 0) { + logger?.child({ module: "rate-limiter" })?.debug?.("Token limit has no refill; blocking", { + rateLimitKey: key, + capacity: bucket.capacity, + refillPerSecond: bucket.refillPerSecond, + }); + return { kind: "wait" }; + } + + const requiredTokens = + typeof estimatedTokens === "number" && estimatedTokens > 0 + ? Math.max(estimatedTokens - bucket.tokens, 1) + : -bucket.tokens; + const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000)); + return { kind: "wait", wakeUpAt: now + waitMs }; + } + + private getTokenRateState(key: string, logger?: Logger): TokenRateState | undefined { + const existing = this.tokenRates.get(key); + if (existing) return existing; + + const options = this.rateLimits?.[key]; + if (!options) return undefined; + + const tokensPerMinute = Number(options.tokensPerMinute); + if (!Number.isFinite(tokensPerMinute) || tokensPerMinute <= 0) { + return undefined; + } + + // Token pacing uses a 1-minute burst by default; request bursts are handled separately. 
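+    // Worked example (illustrative): with tokensPerMinute = 60_000 the bucket
+    // starts at capacity 60_000 and refills at 1_000 tokens/s, so a request that
+    // consumes 30_000 tokens drains half the bucket and is paid back by roughly
+    // 30 seconds of refill.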
+ const refillPerSecond = tokensPerMinute / 60; + const capacity = tokensPerMinute; + const now = Date.now(); + const created: TokenRateState = { + capacity, + refillPerSecond, + tokens: capacity, + updatedAt: now, + }; + this.tokenRates.set(key, created); + logger?.child({ module: "rate-limiter" })?.trace?.("Created token rate state", { + rateLimitKey: key, + capacity, + refillPerSecond, + }); + return created; + } + + private refillTokenRate(bucket: TokenRateState, now: number): void { + const elapsedMs = now - bucket.updatedAt; + if (elapsedMs <= 0) return; + bucket.updatedAt = now; + if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return; + const refill = (elapsedMs / 1000) * bucket.refillPerSecond; + if (refill <= 0) return; + bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill); + } + + private resolveTokenCount(usage: UsageCounters): number { + const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined; + if (total !== undefined) return total; + const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0; + const output = Number.isFinite(usage.outputTokens) ? usage.outputTokens : 0; + return input + output; + } +} diff --git a/packages/core/src/traffic/traffic-retry.spec.ts b/packages/core/src/traffic/traffic-retry.spec.ts new file mode 100644 index 000000000..2360ca109 --- /dev/null +++ b/packages/core/src/traffic/traffic-retry.spec.ts @@ -0,0 +1,45 @@ +import { describe, expect, it, vi } from "vitest"; +import { buildRetryPlan } from "./traffic-retry"; + +describe("buildRetryPlan", () => { + it("respects Retry-After for 429s", () => { + const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0); + try { + const plan = buildRetryPlan( + { + status: 429, + response: { headers: { "retry-after": "2" } }, + }, + 1, + ); + + expect(plan).toBeTruthy(); + expect(plan?.reason).toBe("rateLimit"); + expect(plan?.delayMs).toBeGreaterThanOrEqual(2_000); + } finally { + randomSpy.mockRestore(); + } + }); + + it("parses HTTP-date Retry-After values", () => { + vi.useFakeTimers(); + const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0); + + try { + vi.setSystemTime(new Date("2020-01-01T00:00:00.000Z")); + const plan = buildRetryPlan( + { + statusCode: 429, + response: { headers: { "retry-after": "Wed, 01 Jan 2020 00:00:03 GMT" } }, + }, + 1, + ); + + expect(plan).toBeTruthy(); + expect(plan?.delayMs).toBeGreaterThanOrEqual(3_000); + } finally { + vi.useRealTimers(); + randomSpy.mockRestore(); + } + }); +}); diff --git a/packages/core/src/traffic/traffic-retry.ts b/packages/core/src/traffic/traffic-retry.ts new file mode 100644 index 000000000..9604dc53a --- /dev/null +++ b/packages/core/src/traffic/traffic-retry.ts @@ -0,0 +1,144 @@ +import type { Logger } from "../logger"; +import { + MAX_RETRY_ATTEMPTS, + RATE_LIMIT_BASE_BACKOFF_MS, + RATE_LIMIT_JITTER_FACTOR, + SERVER_ERROR_BASE_BACKOFF_MS, + SERVER_ERROR_JITTER_FACTOR, + TIMEOUT_BASE_BACKOFF_MS, + TIMEOUT_JITTER_FACTOR, + TIMEOUT_RETRY_ATTEMPTS, +} from "./traffic-constants"; +import { extractRetryAfterMs, extractStatusCode, isTimeoutError } from "./traffic-error-utils"; +import { RateLimitedUpstreamError } from "./traffic-errors"; +import type { + RetryPlan, + RetryPolicy, + RetryPolicyConfig, + RetryPolicyContext, + RetryReason, +} from "./traffic-types"; + +export type { + RetryPlan, + RetryPolicy, + RetryPolicyConfig, + RetryPolicyContext, + RetryReason, +} from "./traffic-types"; + +export function buildRetryPlan( + error: unknown, + attempt: number, + logger?: Logger, 
+): RetryPlan | undefined { + const retryLogger = logger?.child({ module: "retry" }); + const reason = getRetryReason(error, retryLogger); + if (!reason) { + retryLogger?.debug?.("No retry reason detected; skipping retry", { attempt }); + return undefined; + } + + const max = reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS; + if (attempt >= max) { + retryLogger?.debug?.("Retry attempts exhausted; skipping retry", { + attempt, + max, + reason, + }); + return undefined; + } + + const computedDelayMs = computeBackoffDelay(reason, attempt); + const retryAfterMs = + reason === "rateLimit" + ? error instanceof RateLimitedUpstreamError + ? error.retryAfterMs + : extractRetryAfterMs(error, retryLogger) + : undefined; + const delayMs = + retryAfterMs === undefined ? computedDelayMs : Math.max(computedDelayMs, retryAfterMs); + + retryLogger?.debug?.("Retry plan built", { + attempt, + reason, + delayMs, + computedDelayMs, + retryAfterMs, + max, + }); + + return { + reason, + delayMs, + }; +} + +export function buildRetryPlanWithPolicy( + context: RetryPolicyContext, + policyConfig?: RetryPolicyConfig, +): RetryPlan | undefined { + const retryLogger = context.logger?.child({ module: "retry" }); + const policy = resolveRetryPolicy(context, policyConfig); + if (policy) { + const planned = policy(context); + if (planned) { + retryLogger?.debug?.("Retry policy returned a plan", { + attempt: context.attempt, + reason: planned.reason, + delayMs: planned.delayMs, + }); + return planned; + } + retryLogger?.debug?.("Retry policy declined to retry", { attempt: context.attempt }); + } + + return buildRetryPlan(context.error, context.attempt, context.logger); +} + +function resolveRetryPolicy( + context: RetryPolicyContext, + config?: RetryPolicyConfig, +): RetryPolicy | undefined { + if (!config) return undefined; + const modelPolicy = context.key ? config.models?.[context.key] : undefined; + if (modelPolicy) return modelPolicy; + const providerModelKey = + context.metadata?.provider && context.metadata?.model + ? `${context.metadata.provider}::${context.metadata.model}` + : undefined; + const providerModelPolicy = providerModelKey ? config.models?.[providerModelKey] : undefined; + if (providerModelPolicy) return providerModelPolicy; + const provider = context.metadata?.provider; + const providerPolicy = provider ? config.providers?.[provider] : undefined; + if (providerPolicy) return providerPolicy; + return config.default; +} + +function getRetryReason(error: unknown, logger?: Logger): RetryReason | undefined { + if (error instanceof RateLimitedUpstreamError) return "rateLimit"; + const status = extractStatusCode(error, logger); + if (status === 429) return "rateLimit"; + if (status && status >= 500) return "serverError"; + if (status === 408 || isTimeoutError(error, logger)) return "timeout"; + return undefined; +} + +function computeBackoffDelay(reason: RetryReason, attempt: number): number { + const base = + reason === "serverError" + ? SERVER_ERROR_BASE_BACKOFF_MS + : reason === "timeout" + ? TIMEOUT_BASE_BACKOFF_MS + : RATE_LIMIT_BASE_BACKOFF_MS; + + const jitter = + reason === "serverError" + ? SERVER_ERROR_JITTER_FACTOR + : reason === "timeout" + ? 
TIMEOUT_JITTER_FACTOR
+        : RATE_LIMIT_JITTER_FACTOR;
+
+  const exp = base * 2 ** (attempt - 1);
+  return Math.round(exp + exp * jitter * Math.random());
+}
diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts
new file mode 100644
index 000000000..396fbf07c
--- /dev/null
+++ b/packages/core/src/traffic/traffic-types.ts
@@ -0,0 +1,182 @@
+import type { Logger } from "../logger";
+
+type BivariantFunction<TArgs extends unknown[], TReturn> = {
+  bivarianceHack(...args: TArgs): TReturn;
+}["bivarianceHack"];
+
+type UsageCounters = {
+  inputTokens?: number;
+  outputTokens?: number;
+  totalTokens?: number;
+};
+
+export type RetryReason = "rateLimit" | "serverError" | "timeout";
+
+export type RetryPlan = {
+  delayMs: number;
+  reason: RetryReason;
+};
+
+export type RetryPolicyContext = {
+  error: unknown;
+  attempt: number;
+  metadata?: TrafficRequestMetadata;
+  key?: string;
+  logger?: Logger;
+};
+
+export type RetryPolicy = (context: RetryPolicyContext) => RetryPlan | undefined;
+
+export type RetryPolicyConfig = {
+  default?: RetryPolicy;
+  providers?: Record<string, RetryPolicy>;
+  models?: Record<string, RetryPolicy>;
+};
+
+export type TrafficRequestType = "text" | "stream";
+export type TrafficPriority = "P0" | "P1" | "P2";
+
+export interface TrafficRequestMetadata {
+  agentId?: string;
+  agentName?: string;
+  model?: string;
+  provider?: string;
+  requestId?: string;
+  priority?: TrafficPriority;
+  tenantId?: string;
+  apiKeyId?: string;
+  region?: string;
+  endpoint?: string;
+  tenantTier?: string;
+  taskType?: string;
+  fallbackPolicyId?: string;
+}
+
+export type TrafficResponseMetadata = {
+  rateLimitKey?: string;
+  retryAfterMs?: number;
+  rateLimitRemaining?: number;
+  rateLimitResetAt?: number;
+  rateLimitResetInMs?: number;
+  queueEtaMs?: number;
+  tenantId?: string;
+  priority?: TrafficPriority;
+  taskType?: string;
+};
+
+export type FallbackTarget = {
+  provider?: string;
+  model: string;
+};
+
+export type ShortResponseFallbackTarget = {
+  kind: "short-response";
+  text: string;
+};
+
+export type FallbackChainEntry = string | FallbackTarget | ShortResponseFallbackTarget;
+
+export type FallbackPolicyMode = "fallback" | "wait";
+
+export type FallbackPolicy = {
+  mode: FallbackPolicyMode;
+};
+
+export type FallbackPolicyConfig = {
+  defaultPolicyId?: string;
+  policies?: Record<string, FallbackPolicy>;
+  taskTypePolicyIds?: Record<string, string>;
+};
+
+export type ProviderModelConcurrencyLimit =
+  | number
+  | Record<string, number>
+  | ((metadata: TrafficRequestMetadata | undefined, key: string) => number | undefined);
+
+export type TenantConcurrencyLimit =
+  | number
+  | Record<string, number>
+  | ((tenantId: string, metadata: TrafficRequestMetadata | undefined) => number | undefined);
+
+export type PriorityBurstLimits = Partial<Record<TrafficPriority, number>>;
+export type PriorityWeights = Partial<Record<TrafficPriority, number>>;
+
+export type AdaptiveLimiterConfig = {
+  windowMs?: number;
+  threshold?: number;
+  minPenaltyMs?: number;
+  maxPenaltyMs?: number;
+  penaltyMultiplier?: number;
+  decayMs?: number;
+};
+
+export interface TrafficRequest<TResponse = unknown> {
+  tenantId: string;
+  metadata?: TrafficRequestMetadata;
+  execute: () => Promise<TResponse>;
+  deadlineAt?: number;
+  maxQueueWaitMs?: number;
+  estimatedTokens?: number;
+  createFallbackRequest?: BivariantFunction<
+    [target: FallbackChainEntry],
+    TrafficRequest<TResponse> | undefined
+  >;
+  extractUsage?: BivariantFunction<
+    [response: TResponse],
+    Promise<UsageCounters> | UsageCounters | undefined
+  >;
+}
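+// Example (illustrative sketch; `callUpstream` is a hypothetical helper):
+//
+//   const request: TrafficRequest<{ text: string; usage?: { totalTokens?: number } }> = {
+//     tenantId: "tenant-a",
+//     metadata: { provider: "openai", model: "gpt-4o-mini", priority: "P1" },
+//     estimatedTokens: 1_500,
+//     maxQueueWaitMs: 10_000,
+//     execute: () => callUpstream(),
+//     extractUsage: (response) => ({ totalTokens: response.usage?.totalTokens }),
+//   };
+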
+export interface TrafficControllerOptions {
+  maxConcurrent?: number;
+  maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit;
+  maxConcurrentPerTenant?: TenantConcurrencyLimit;
+  rateLimits?: RateLimitConfig;
+  priorityBurstLimits?: PriorityBurstLimits;
+  priorityWeights?: PriorityWeights;
+  adaptiveLimiter?: AdaptiveLimiterConfig;
+  /**
+   * Optional override for rate-limit key construction.
+   * Useful when you need to add new metadata fields without changing core logic.
+   */
+  rateLimitKeyBuilder?: (metadata?: TrafficRequestMetadata) => string;
+  /**
+   * Optional retry policy overrides by provider/model.
+   * Model keys can use either the rate-limit key or provider::model.
+   */
+  retryPolicy?: RetryPolicyConfig;
+  /**
+   * Optional fallback policy selection by task type or explicit policy id.
+   */
+  fallbackPolicy?: FallbackPolicyConfig;
+  /**
+   * Select a rate-limit strategy by provider/model.
+   * Example:
+   *   { providers: { openai: "window" }, models: { "openai::gpt-4o": "window" } }
+   */
+  rateLimitStrategy?: RateLimitStrategyConfig;
+  logger?: Logger;
+  fallbackChains?: Record<string, FallbackChainEntry[]>;
+}
+
+export type RateLimitStrategyKind = "window" | "token-bucket";
+
+export type RateLimitStrategyConfig = {
+  providers?: Record<string, RateLimitStrategyKind>;
+  models?: Record<string, RateLimitStrategyKind>;
+};
+
+export interface RateLimitOptions {
+  requestsPerMinute: number;
+  tokensPerMinute: number;
+  burstSize?: number;
+}
+
+export type RateLimitKey = string;
+export type RateLimitConfig = Record<RateLimitKey, RateLimitOptions>;
+
+export type TenantUsage = {
+  inputTokens: number;
+  outputTokens: number;
+  totalTokens: number;
+};
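Taken together, a controller configuration might look like the following sketch (the limits, keys, and fallback text are illustrative assumptions, not defaults shipped by this patch):

const options: TrafficControllerOptions = {
  maxConcurrent: 32,
  maxConcurrentPerTenant: { "tenant-a": 8 },
  priorityWeights: { P0: 6, P1: 3, P2: 1 },
  rateLimits: {
    "openai::gpt-4o-mini": { requestsPerMinute: 500, tokensPerMinute: 200_000 },
  },
  rateLimitStrategy: { providers: { openai: "window" } },
  fallbackChains: {
    "openai::gpt-4o": [
      "openai::gpt-4o-mini",
      { kind: "short-response", text: "High load, please retry shortly." },
    ],
  },
};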
diff --git a/packages/core/src/traffic/traffic-usage-tracker.ts b/packages/core/src/traffic/traffic-usage-tracker.ts
new file mode 100644
index 000000000..b75f602a1
--- /dev/null
+++ b/packages/core/src/traffic/traffic-usage-tracker.ts
@@ -0,0 +1,101 @@
+import type { Logger } from "../logger";
+import type { QueuedRequest } from "./traffic-controller-internal";
+import { isPromiseLike } from "./traffic-error-utils";
+import type { TenantUsage } from "./traffic-types";
+
+type UsageCounters = {
+  inputTokens?: number;
+  outputTokens?: number;
+  totalTokens?: number;
+};
+
+export class TrafficUsageTracker {
+  private readonly tenantUsage = new Map<string, TenantUsage>();
+
+  getTenantUsage(tenantId: string): TenantUsage | undefined {
+    const usage = this.tenantUsage.get(tenantId);
+    return usage ? { ...usage } : undefined;
+  }
+
+  recordUsage<TResponse>(
+    item: QueuedRequest,
+    result: TResponse,
+    logger?: Logger,
+  ): UsageCounters | Promise<UsageCounters> | undefined {
+    const usageLogger = logger?.child({ module: "usage-tracker" });
+    const extractor = item.extractUsage ?? item.request.extractUsage;
+    if (!extractor) {
+      usageLogger?.trace?.("No usage extractor; skipping usage", { tenantId: item.tenantId });
+      return undefined;
+    }
+
+    let usage: UsageCounters | Promise<UsageCounters> | undefined;
+    try {
+      usage = extractor(result);
+    } catch (error) {
+      usageLogger?.warn?.("Usage extractor threw; skipping usage", {
+        tenantId: item.tenantId,
+        errorName: (error as { name?: unknown } | null)?.name,
+        errorMessage: (error as { message?: unknown } | null)?.message,
+      });
+      return undefined;
+    }
+    if (!usage) {
+      usageLogger?.trace?.("Usage extractor returned empty; skipping usage", {
+        tenantId: item.tenantId,
+      });
+      return undefined;
+    }
+
+    if (isPromiseLike(usage)) {
+      usageLogger?.trace?.("Usage extractor returned promise; awaiting", {
+        tenantId: item.tenantId,
+      });
+      void usage
+        .then((u) => u && this.incrementTenantUsage(item.tenantId, u, usageLogger))
+        .catch((error) => {
+          usageLogger?.warn?.("Usage extractor promise rejected; skipping usage", {
+            tenantId: item.tenantId,
+            errorName: (error as { name?: unknown } | null)?.name,
+            errorMessage: (error as { message?: unknown } | null)?.message,
+          });
+        });
+      return usage;
+    }
+    this.incrementTenantUsage(item.tenantId, usage, usageLogger);
+    return usage;
+  }
+
+  private incrementTenantUsage(tenantId: string, usage: UsageCounters, logger?: Logger): void {
+    const current = this.tenantUsage.get(tenantId) ?? {
+      inputTokens: 0,
+      outputTokens: 0,
+      totalTokens: 0,
+    };
+
+    const input =
+      typeof usage.inputTokens === "number" && Number.isFinite(usage.inputTokens)
+        ? usage.inputTokens
+        : 0;
+    const output =
+      typeof usage.outputTokens === "number" && Number.isFinite(usage.outputTokens)
+        ? usage.outputTokens
+        : 0;
+    const total =
+      typeof usage.totalTokens === "number" && Number.isFinite(usage.totalTokens)
+        ? usage.totalTokens
+        : input + output;
+
+    this.tenantUsage.set(tenantId, {
+      inputTokens: current.inputTokens + input,
+      outputTokens: current.outputTokens + output,
+      totalTokens: current.totalTokens + total,
+    });
+
+    logger?.debug?.("Tenant usage incremented", {
+      tenantId,
+      delta: { inputTokens: input, outputTokens: output, totalTokens: total },
+      total: this.tenantUsage.get(tenantId),
+    });
+  }
+}
diff --git a/packages/core/src/workflow/core.ts b/packages/core/src/workflow/core.ts
index 3136511ca..2b273d588 100644
--- a/packages/core/src/workflow/core.ts
+++ b/packages/core/src/workflow/core.ts
@@ -827,6 +827,9 @@ export function createWorkflow<
   // Wrap entire execution in root span
   const rootSpan = traceContext.getRootSpan();
+  if (options?.tenantId) {
+    rootSpan.setAttribute("tenant.id", options.tenantId);
+  }
 
   // Add workflow state snapshot for remote observability
   const workflowState = {
@@ -848,6 +851,7 @@
     executionId,
     userId: options?.userId,
     conversationId: options?.conversationId,
+    tenantId: options?.tenantId,
     traceId: rootSpan.spanContext().traceId,
     spanId: rootSpan.spanContext().spanId,
   });
diff --git a/packages/core/src/workflow/internal/state.ts b/packages/core/src/workflow/internal/state.ts
index 71fa602d4..2de12528c 100644
--- a/packages/core/src/workflow/internal/state.ts
+++ b/packages/core/src/workflow/internal/state.ts
@@ -23,6 +23,7 @@ export type WorkflowState = {
   executionId: string;
   conversationId?: string;
   userId?: string;
+  tenantId?: string;
   context?: UserContext;
   active: number;
   startAt: Date;
@@ -132,6 +133,7 @@ class WorkflowStateManagerInternal implements WorkflowStateManager
   active: config?.active ?? 
0, userId: config?.userId, conversationId: config?.conversationId, + tenantId: config?.tenantId, context: config?.context, startAt: new Date(), endAt: null, diff --git a/packages/core/src/workflow/internal/utils.ts b/packages/core/src/workflow/internal/utils.ts index fc39530b5..42250d828 100644 --- a/packages/core/src/workflow/internal/utils.ts +++ b/packages/core/src/workflow/internal/utils.ts @@ -32,6 +32,7 @@ export function convertWorkflowStateToParam( executionId: state.executionId, conversationId: state.conversationId, userId: state.userId, + tenantId: state.tenantId, context: state.context, active: state.active, startAt: state.startAt, diff --git a/packages/core/src/workflow/steps/and-agent.ts b/packages/core/src/workflow/steps/and-agent.ts index bc46c1480..14af9b8f6 100644 --- a/packages/core/src/workflow/steps/and-agent.ts +++ b/packages/core/src/workflow/steps/and-agent.ts @@ -66,6 +66,7 @@ export function andAgent( context: restConfig.context ?? state.context, conversationId: restConfig.conversationId ?? state.conversationId, userId: restConfig.userId ?? state.userId, + tenantId: restConfig.tenantId ?? state.tenantId, // No parentSpan when there's no workflow context }); // Accumulate usage if available (no workflow context) @@ -92,6 +93,7 @@ export function andAgent( context: restConfig.context ?? state.context, conversationId: restConfig.conversationId ?? state.conversationId, userId: restConfig.userId ?? state.userId, + tenantId: restConfig.tenantId ?? state.tenantId, // Pass the current step span as parent for proper span hierarchy parentSpan: state.workflowContext?.currentStepSpan, }); diff --git a/packages/core/src/workflow/types.ts b/packages/core/src/workflow/types.ts index f7eed2823..49bfd8cb4 100644 --- a/packages/core/src/workflow/types.ts +++ b/packages/core/src/workflow/types.ts @@ -214,6 +214,10 @@ export interface WorkflowRunOptions { * The conversation ID, this can be used to track the current conversation in a workflow */ conversationId?: string; + /** + * Tenant identifier propagated to agent steps and subcalls + */ + tenantId?: string; /** * The user ID, this can be used to track the current user in a workflow */ diff --git a/packages/scorers/src/llm/answer-correctness.ts b/packages/scorers/src/llm/answer-correctness.ts index 2111fa31c..d66cc0079 100644 --- a/packages/scorers/src/llm/answer-correctness.ts +++ b/packages/scorers/src/llm/answer-correctness.ts @@ -7,6 +7,7 @@ import { import { safeStringify } from "@voltagent/internal/utils"; import type { LanguageModel } from "ai"; import { z } from "zod"; +import { extractTenantId } from "./utils"; const ANSWER_CORRECTNESS_PROMPT = `Given a ground truth and an answer, analyze each statement in the answer and classify them in one of the following categories: @@ -84,15 +85,17 @@ export function createAnswerCorrectnessScorer< const agent = new Agent({ name: "answer-correctness-classifier", model, + trafficPriority: "P2", instructions: "You classify statements for answer correctness evaluation", }); + const tenantId = extractTenantId(context); const payload = resolvePayload(context, buildPayload); const prompt = ANSWER_CORRECTNESS_PROMPT.replace("{{question}}", payload.input) .replace("{{answer}}", payload.output) .replace("{{ground_truth}}", payload.expected); - const response = await agent.generateObject(prompt, CLASSIFICATION_SCHEMA); + const response = await agent.generateObject(prompt, CLASSIFICATION_SCHEMA, { tenantId }); const normalized = normalizeClassification(response.object); return { diff --git 
a/packages/scorers/src/llm/answer-relevancy.ts b/packages/scorers/src/llm/answer-relevancy.ts index a3de2237c..d9bda1c9a 100644 --- a/packages/scorers/src/llm/answer-relevancy.ts +++ b/packages/scorers/src/llm/answer-relevancy.ts @@ -8,6 +8,7 @@ import { import { safeStringify } from "@voltagent/internal/utils"; import type { LanguageModel } from "ai"; import { z } from "zod"; +import { extractTenantId } from "./utils"; const QUESTION_GEN_PROMPT = `Generate a question for the given answer and Identify if answer is noncommittal. Give noncommittal as 1 if the answer is noncommittal and 0 if the answer is committal. A noncommittal answer is one that is evasive, vague, or ambiguous. For example, "I don't know" or "I'm not sure" are noncommittal answers @@ -119,9 +120,11 @@ export function createAnswerRelevancyScorer< const agent = new Agent({ name: "question-generator", model, + trafficPriority: "P2", instructions: "You generate questions from answers to evaluate relevancy", }); + const tenantId = extractTenantId(context); const payload = resolvePayload(context, buildPayload); const questions: GeneratedQuestion[] = []; @@ -131,7 +134,7 @@ export function createAnswerRelevancyScorer< payload.context, ); - const response = await agent.generateObject(prompt, QUESTION_SCHEMA); + const response = await agent.generateObject(prompt, QUESTION_SCHEMA, { tenantId }); questions.push({ question: response.object.question, noncommittal: response.object.noncommittal === 1, diff --git a/packages/scorers/src/llm/classifiers.ts b/packages/scorers/src/llm/classifiers.ts index 1bca42393..a327e20d4 100644 --- a/packages/scorers/src/llm/classifiers.ts +++ b/packages/scorers/src/llm/classifiers.ts @@ -7,6 +7,7 @@ import { } from "@voltagent/core"; import { safeStringify } from "@voltagent/internal/utils"; import { z } from "zod"; +import { extractTenantId } from "./utils"; type ChoiceId = string; @@ -93,11 +94,14 @@ async function evaluateChoice(args: EvaluateChoiceArgs): Promise const agent = new Agent({ name: `${scorerId}-judge`, model, + trafficPriority: "P2", instructions: judgeInstructions ?? buildDefaultChoiceInstructions(Object.keys(choices)), }); + const tenantId = extractTenantId(context); const response = await agent.generateObject(prompt, CHOICE_RESPONSE_SCHEMA, { maxOutputTokens, + tenantId, }); const { choice, reason } = extractChoiceFromResponse(response.object, choices, scorerId); diff --git a/packages/scorers/src/llm/context-precision.ts b/packages/scorers/src/llm/context-precision.ts index d31b5b851..ba680f560 100644 --- a/packages/scorers/src/llm/context-precision.ts +++ b/packages/scorers/src/llm/context-precision.ts @@ -7,6 +7,7 @@ import { import { safeStringify } from "@voltagent/internal/utils"; import type { LanguageModel } from "ai"; import { z } from "zod"; +import { extractTenantId } from "./utils"; const CONTEXT_PRECISION_PROMPT = `Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output. @@ -109,6 +110,7 @@ export function createContextPrecisionScorer< const agent = new Agent({ name: "context-precision-evaluator", model, + trafficPriority: "P2", instructions: "You evaluate if context was useful for arriving at the answer", }); @@ -116,12 +118,15 @@ export function createContextPrecisionScorer< const contextText = Array.isArray(payload.context) ? 
payload.context.join("\n") : payload.context; + const tenantId = extractTenantId(context); const prompt = CONTEXT_PRECISION_PROMPT.replace("{{question}}", payload.input) .replace("{{context}}", contextText) .replace("{{answer}}", payload.output); - const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA); + const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA, { + tenantId, + }); context.results.raw.contextPrecisionVerdict = response.object; diff --git a/packages/scorers/src/llm/context-recall.ts b/packages/scorers/src/llm/context-recall.ts index e6e865106..2c6053fc9 100644 --- a/packages/scorers/src/llm/context-recall.ts +++ b/packages/scorers/src/llm/context-recall.ts @@ -7,6 +7,7 @@ import { import { safeStringify } from "@voltagent/internal/utils"; import type { LanguageModel } from "ai"; import { z } from "zod"; +import { extractTenantId } from "./utils"; const CONTEXT_RECALL_EXTRACT_PROMPT = `Given the context and ground truth (expected output), extract all factual statements from the ground truth. @@ -120,6 +121,7 @@ export function createContextRecallScorer< const agent = new Agent({ name: "context-recall-evaluator", model, + trafficPriority: "P2", instructions: "You evaluate how well provided context supports factual statements", }); @@ -127,6 +129,7 @@ export function createContextRecallScorer< const contextText = Array.isArray(payload.context) ? payload.context.join("\n") : payload.context; + const tenantId = extractTenantId(context); // Extract statements from expected output const extractPrompt = CONTEXT_RECALL_EXTRACT_PROMPT.replace( @@ -134,7 +137,9 @@ export function createContextRecallScorer< contextText, ).replace("{{expected}}", payload.expected); - const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA); + const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA, { + tenantId, + }); const statements = extractResponse.object.statements; if (statements.length === 0) { @@ -152,7 +157,9 @@ export function createContextRecallScorer< contextText, ).replace("{{statement}}", statement); - const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA); + const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA, { + tenantId, + }); verdicts.push({ statement, verdict: verifyResponse.object.verdict, diff --git a/packages/scorers/src/llm/context-relevancy.ts b/packages/scorers/src/llm/context-relevancy.ts index ee882b5b1..aca608b25 100644 --- a/packages/scorers/src/llm/context-relevancy.ts +++ b/packages/scorers/src/llm/context-relevancy.ts @@ -7,6 +7,7 @@ import { import { safeStringify } from "@voltagent/internal/utils"; import type { LanguageModel } from "ai"; import { z } from "zod"; +import { extractTenantId } from "./utils"; const CONTEXT_RELEVANCY_PROMPT = `Analyze the provided context and identify which parts are relevant to answering the given question. For each context sentence or passage, determine its relevance level. @@ -144,6 +145,7 @@ export function createContextRelevancyScorer< const agent = new Agent({ name: "context-relevancy-evaluator", model, + trafficPriority: "P2", instructions: "You evaluate how relevant provided context is to answering questions", }); @@ -151,13 +153,16 @@ export function createContextRelevancyScorer< const contextText = Array.isArray(payload.context) ? 
payload.context.join("\n")
+          : payload.context;
+        const tenantId = extractTenantId(context);
 
         const prompt = CONTEXT_RELEVANCY_PROMPT.replace("{{question}}", payload.input).replace(
           "{{context}}",
           contextText,
         );
 
-        const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA);
+        const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA, {
+          tenantId,
+        });
 
         const evaluations = response.object.evaluations;
         context.results.raw.contextRelevancyEvaluations = evaluations;
diff --git a/packages/scorers/src/llm/moderation.ts b/packages/scorers/src/llm/moderation.ts
index 03563bfe4..1055927f2 100644
--- a/packages/scorers/src/llm/moderation.ts
+++ b/packages/scorers/src/llm/moderation.ts
@@ -7,6 +7,7 @@ import {
 } from "@voltagent/core";
 import { safeStringify } from "@voltagent/internal/utils";
 import { z } from "zod";
+import { extractTenantId } from "./utils";
 
 export interface ModerationScorerOptions {
   id?: string;
@@ -220,6 +221,7 @@ async function runModerationJudge(args: {
     typeof context.results.prepare === "string"
       ? context.results.prepare
       : normalizeText(context.payload.output);
+  const tenantId = extractTenantId(context);
 
   const prompt = await buildPrompt({
     output: normalizedOutput,
@@ -232,12 +234,14 @@ async function runModerationJudge(args: {
   const agent = new Agent({
     name: "moderation-judge",
     model,
+    trafficPriority: "P2",
     instructions:
       "You are a safety classifier. Respond with JSON that matches the provided schema containing flagged, scores, and reason.",
   });
 
   const response = await agent.generateObject(prompt, MODERATION_SCHEMA, {
     maxOutputTokens,
+    tenantId,
   });
 
   const parsed = mapModerationResponse(response.object, threshold);
diff --git a/packages/scorers/src/llm/utils.ts b/packages/scorers/src/llm/utils.ts
new file mode 100644
index 000000000..75e886e3e
--- /dev/null
+++ b/packages/scorers/src/llm/utils.ts
@@ -0,0 +1,14 @@
+import type { BuilderPrepareContext, BuilderScoreContext } from "@voltagent/core";
+
+type TenantAwareContext = BuilderScoreContext<Record<string, unknown>, Record<string, unknown>> &
+  BuilderPrepareContext<Record<string, unknown>, Record<string, unknown>>;
+
+export function extractTenantId(
+  context:
+    | BuilderScoreContext<Record<string, unknown>, Record<string, unknown>>
+    | BuilderPrepareContext<Record<string, unknown>, Record<string, unknown>>
+    | TenantAwareContext,
+): string | undefined {
+  const candidate = (context.payload as { tenantId?: unknown })?.tenantId;
+  return typeof candidate === "string" ?
candidate : undefined; +} diff --git a/packages/server-core/src/handlers/agent.handlers.ts b/packages/server-core/src/handlers/agent.handlers.ts index 00c0f2ee9..74d479962 100644 --- a/packages/server-core/src/handlers/agent.handlers.ts +++ b/packages/server-core/src/handlers/agent.handlers.ts @@ -1,11 +1,74 @@ -import { ClientHTTPError, type ServerProviderDeps } from "@voltagent/core"; -import { convertUsage } from "@voltagent/core"; +import { + ClientHTTPError, + type ServerProviderDeps, + type TrafficResponseMetadata, + convertUsage, +} from "@voltagent/core"; import { type Logger, safeStringify } from "@voltagent/internal"; import { z } from "zod"; import { convertJsonSchemaToZod } from "zod-from-json-schema"; import { convertJsonSchemaToZod as convertJsonSchemaToZodV3 } from "zod-from-json-schema-v3"; import type { ApiResponse } from "../types"; import { processAgentOptions } from "../utils/options"; +import { buildTrafficHeaders } from "../utils/traffic"; + +function extractTrafficMetadata(value: unknown): TrafficResponseMetadata | undefined { + if (!value || typeof value !== "object") return undefined; + const traffic = (value as { traffic?: unknown }).traffic; + if (!traffic || typeof traffic !== "object") return undefined; + return traffic as TrafficResponseMetadata; +} + +function wrapStreamWithTraffic( + baseResponse: Response, + traffic?: TrafficResponseMetadata, +): Response { + if (!traffic) return baseResponse; + const headers = new Headers(baseResponse.headers); + const trafficHeaders = buildTrafficHeaders(traffic); + for (const [key, value] of Object.entries(trafficHeaders)) { + headers.set(key, value); + } + const baseBody = baseResponse.body; + if (!baseBody) { + return new Response(baseBody, { + status: baseResponse.status, + headers, + }); + } + + const encoder = new TextEncoder(); + const stream = new ReadableStream({ + async start(controller) { + const trafficEvent = `data: ${safeStringify({ type: "traffic", traffic })}\n\n`; + controller.enqueue(encoder.encode(trafficEvent)); + const reader = baseBody.getReader(); + let didError = false; + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + if (value !== undefined) { + controller.enqueue(value); + } + } + } catch (error) { + didError = true; + controller.error(error); + } finally { + reader.releaseLock(); + if (!didError) { + controller.close(); + } + } + }, + }); + + return new Response(stream, { + status: baseResponse.status, + headers, + }); +} /** * Handler for listing all agents @@ -79,6 +142,7 @@ export async function handleGenerateText( const options = processAgentOptions(body, signal); const result = await agent.generateText(input, options); + const traffic = extractTrafficMetadata(result); // Convert usage format if present const usage = result.usage ? convertUsage(result.usage) : undefined; @@ -102,9 +166,11 @@ export async function handleGenerateText( } })(), }, + traffic, }; } catch (error) { logger.error("Failed to generate text", { error }); + const traffic = extractTrafficMetadata(error); if (error instanceof ClientHTTPError) { return { success: false, @@ -112,11 +178,13 @@ export async function handleGenerateText( code: error.code, name: error.name, httpStatus: error.httpStatus, + traffic, }; } return { success: false, error: error instanceof Error ? 
error.message : "Unknown error", + traffic, }; } } @@ -153,6 +221,7 @@ export async function handleStreamText( const options = processAgentOptions(body, signal); const result = await agent.streamText(input, options); + const traffic = extractTrafficMetadata(result); // Access the fullStream property const { fullStream } = result; @@ -178,7 +247,7 @@ export async function handleStreamText( }, }); - return new Response(stream, { + const response = new Response(stream, { status: 200, headers: { "Content-Type": "text/event-stream", @@ -186,20 +255,25 @@ export async function handleStreamText( Connection: "keep-alive", }, }); + return wrapStreamWithTraffic(response, traffic); } catch (error) { logger.error("Failed to handle stream text request", { error }); const errorMessage = error instanceof Error ? error.message : "Unknown error"; + const traffic = extractTrafficMetadata(error); + const trafficHeaders = buildTrafficHeaders(traffic); return new Response( safeStringify({ error: errorMessage, message: errorMessage, + traffic, }), { status: 500, headers: { "Content-Type": "application/json", + ...trafficHeaders, }, }, ); @@ -238,26 +312,32 @@ export async function handleChatStream( const options = processAgentOptions(body, signal); const result = await agent.streamText(input, options); + const traffic = extractTrafficMetadata(result); // Use the built-in toUIMessageStreamResponse - it handles errors properly - return result.toUIMessageStreamResponse({ + const response = result.toUIMessageStreamResponse({ sendReasoning: true, sendSources: true, }); + return wrapStreamWithTraffic(response, traffic); } catch (error) { logger.error("Failed to handle chat stream request", { error }); const errorMessage = error instanceof Error ? error.message : "Unknown error"; + const traffic = extractTrafficMetadata(error); + const trafficHeaders = buildTrafficHeaders(traffic); return new Response( safeStringify({ error: errorMessage, message: errorMessage, + traffic, }), { status: 500, headers: { "Content-Type": "application/json", + ...trafficHeaders, }, }, ); @@ -293,16 +373,20 @@ export async function handleGenerateObject( ) as any; const result = await agent.generateObject(input, zodSchema, options); + const traffic = extractTrafficMetadata(result); return { success: true, data: result.object, + traffic, }; } catch (error) { logger.error("Failed to generate object", { error }); + const traffic = extractTrafficMetadata(error); return { success: false, error: error instanceof Error ? error.message : "Unknown error", + traffic, }; } } @@ -344,23 +428,29 @@ export async function handleStreamObject( ) as any; const result = await agent.streamObject(input, zodSchema, options); + const traffic = extractTrafficMetadata(result); // Use the built-in toTextStreamResponse - it handles errors properly - return result.toTextStreamResponse(); + const response = result.toTextStreamResponse(); + return wrapStreamWithTraffic(response, traffic); } catch (error) { logger.error("Failed to handle stream object request", { error }); const errorMessage = error instanceof Error ? 
error.message : "Unknown error"; + const traffic = extractTrafficMetadata(error); + const trafficHeaders = buildTrafficHeaders(traffic); return new Response( safeStringify({ error: errorMessage, message: errorMessage, + traffic, }), { status: 500, headers: { "Content-Type": "application/json", + ...trafficHeaders, }, }, ); diff --git a/packages/server-core/src/index.ts b/packages/server-core/src/index.ts index 1fe7e206a..2f7ed826a 100644 --- a/packages/server-core/src/index.ts +++ b/packages/server-core/src/index.ts @@ -40,6 +40,7 @@ export * from "./utils/server-utils"; export * from "./utils/ui-templates"; export * from "./utils/response-mappers"; export * from "./utils/sse"; +export * from "./utils/traffic"; export * from "./utils/announcements"; // Export WebSocket utilities diff --git a/packages/server-core/src/schemas/agent.schemas.ts b/packages/server-core/src/schemas/agent.schemas.ts index 52e80b832..41181e00f 100644 --- a/packages/server-core/src/schemas/agent.schemas.ts +++ b/packages/server-core/src/schemas/agent.schemas.ts @@ -77,6 +77,18 @@ export const GenerateOptionsSchema = z .object({ userId: z.string().optional().describe("Optional user ID for context tracking"), conversationId: z.string().optional().describe("Optional conversation ID for context tracking"), + tenantId: z.string().optional().describe("Optional tenant ID for traffic limits"), + trafficPriority: z + .enum(["P0", "P1", "P2"]) + .optional() + .describe("Optional traffic priority for scheduling (P0, P1, P2)"), + apiKeyId: z.string().optional().describe("Optional API key identifier for traffic limits"), + region: z.string().optional().describe("Optional region identifier for traffic limits"), + endpoint: z.string().optional().describe("Optional endpoint identifier for traffic limits"), + tenantTier: z + .string() + .optional() + .describe("Optional tenant tier identifier for traffic limits"), context: z .record(z.string(), z.unknown()) .nullish() @@ -94,6 +106,14 @@ export const GenerateOptionsSchema = z .positive() .optional() .describe("Maximum number of steps for this request"), + maxQueueWaitMs: z + .number() + .int() + .nonnegative() + .optional() + .describe("Maximum time to wait in the queue before timing out (ms)"), + taskType: z.string().optional().describe("Optional task classification for fallback policy"), + fallbackPolicyId: z.string().optional().describe("Optional explicit fallback policy id"), temperature: z .number() .min(0) diff --git a/packages/server-core/src/types/responses.ts b/packages/server-core/src/types/responses.ts index 2098c2f64..4935a535b 100644 --- a/packages/server-core/src/types/responses.ts +++ b/packages/server-core/src/types/responses.ts @@ -1,10 +1,12 @@ /** * Framework-agnostic response types for server handlers */ +import type { TrafficResponseMetadata } from "@voltagent/core"; export interface SuccessResponse { success: true; data: T; + traffic?: TrafficResponseMetadata; } export interface ErrorResponse { @@ -13,6 +15,7 @@ export interface ErrorResponse { httpStatus?: number; code?: string; name?: string; + traffic?: TrafficResponseMetadata; } export type ApiResponse = SuccessResponse | ErrorResponse; diff --git a/packages/server-core/src/utils/traffic.ts b/packages/server-core/src/utils/traffic.ts new file mode 100644 index 000000000..f9be1845a --- /dev/null +++ b/packages/server-core/src/utils/traffic.ts @@ -0,0 +1,35 @@ +import type { TrafficResponseMetadata } from "@voltagent/core"; + +export function buildTrafficHeaders(traffic?: TrafficResponseMetadata): Record { + 
diff --git a/packages/server-core/src/utils/traffic.ts b/packages/server-core/src/utils/traffic.ts
new file mode 100644
index 000000000..f9be1845a
--- /dev/null
+++ b/packages/server-core/src/utils/traffic.ts
@@ -0,0 +1,35 @@
+import type { TrafficResponseMetadata } from "@voltagent/core";
+
+export function buildTrafficHeaders(traffic?: TrafficResponseMetadata): Record<string, string> {
+  if (!traffic) return {};
+
+  const headers: Record<string, string> = {};
+
+  if (typeof traffic.retryAfterMs === "number" && Number.isFinite(traffic.retryAfterMs)) {
+    headers["Retry-After"] = String(Math.max(0, Math.ceil(traffic.retryAfterMs / 1000)));
+  }
+
+  if (traffic.rateLimitRemaining !== undefined) {
+    headers["X-RateLimit-Remaining"] = String(traffic.rateLimitRemaining);
+  }
+
+  if (typeof traffic.rateLimitResetAt === "number" && Number.isFinite(traffic.rateLimitResetAt)) {
+    headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(traffic.rateLimitResetAt / 1000)));
+  } else if (
+    typeof traffic.rateLimitResetInMs === "number" &&
+    Number.isFinite(traffic.rateLimitResetInMs)
+  ) {
+    const resetAt = Date.now() + Math.max(0, traffic.rateLimitResetInMs);
+    headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(resetAt / 1000)));
+  }
+
+  if (traffic.queueEtaMs !== undefined) {
+    headers["X-Queue-ETA"] = String(traffic.queueEtaMs);
+  }
+
+  if (traffic.rateLimitKey) {
+    headers["X-RateLimit-Key"] = traffic.rateLimitKey;
+  }
+
+  return headers;
+}
diff --git a/packages/server-hono/src/routes/index.ts b/packages/server-hono/src/routes/index.ts
index a5af82146..336a5bf47 100644
--- a/packages/server-hono/src/routes/index.ts
+++ b/packages/server-hono/src/routes/index.ts
@@ -2,6 +2,7 @@ import type { ServerProviderDeps } from "@voltagent/core";
 import type { Logger } from "@voltagent/internal";
 import {
   UPDATE_ROUTES,
+  buildTrafficHeaders,
   handleCancelWorkflow,
   handleChatStream,
   handleCheckUpdates,
@@ -87,11 +88,12 @@ export function registerAgentRoutes(
     const signal = c.req.raw.signal;
 
     const response = await handleGenerateText(agentId, body, deps, logger, signal);
+    const trafficHeaders = buildTrafficHeaders(response.traffic);
     if (!response.success) {
       const { httpStatus, ...details } = response;
-      return c.json(details, httpStatus || 500);
+      return c.json(details, httpStatus || 500, trafficHeaders);
     }
-    return c.json(response, 200);
+    return c.json(response, 200, trafficHeaders);
   });
 
   // POST /agents/:id/stream - Stream text (raw fullStream SSE)
@@ -131,11 +133,12 @@ export function registerAgentRoutes(
     const body = await c.req.json();
     const signal = c.req.raw.signal;
     const response = await handleGenerateObject(agentId, body, deps, logger, signal);
+    const trafficHeaders = buildTrafficHeaders(response.traffic);
     if (!response.success) {
       const { httpStatus, ...details } = response;
-      return c.json(details, httpStatus || 500);
+      return c.json(details, httpStatus || 500, trafficHeaders);
     }
-    return c.json(response, 200);
+    return c.json(response, 200, trafficHeaders);
   });
 
   // POST /agents/:id/stream-object - Stream object
diff --git a/packages/serverless-hono/src/routes.ts b/packages/serverless-hono/src/routes.ts
index d377ce4b3..39eabcf76 100644
--- a/packages/serverless-hono/src/routes.ts
+++ b/packages/serverless-hono/src/routes.ts
@@ -28,6 +28,7 @@ import {
   type TriggerHttpRequestContext,
   UPDATE_ROUTES,
   WORKFLOW_ROUTES,
+  buildTrafficHeaders,
   executeA2ARequest,
   executeTriggerHandler,
   getConversationMessagesHandler,
@@ -165,7 +166,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger:
     }
     const signal = c.req.raw.signal;
     const response = await handleGenerateText(agentId, body, deps, logger, signal);
-    return c.json(response, response.success ? 200 : 500);
+    const trafficHeaders = buildTrafficHeaders(response.traffic);
+    return c.json(response, response.success ?
200 : 500, trafficHeaders); }); app.post(AGENT_ROUTES.streamText.path, async (c) => { @@ -197,7 +199,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger: } const signal = c.req.raw.signal; const response = await handleGenerateObject(agentId, body, deps, logger, signal); - return c.json(response, response.success ? 200 : 500); + const trafficHeaders = buildTrafficHeaders(response.traffic); + return c.json(response, response.success ? 200 : 500, trafficHeaders); }); app.post(AGENT_ROUTES.streamObject.path, async (c) => { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 20029de49..6671d8c17 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -37,7 +37,7 @@ importers: version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) '@nx/plugin': specifier: 20.4.6 - version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2) + version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2) '@nx/vite': specifier: 20.4.6 version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2)(vite@7.2.7)(vitest@3.2.4) @@ -92,6 +92,9 @@ importers: syncpack: specifier: ^13.0.2 version: 13.0.4(typescript@5.9.2) + ts-node: + specifier: ^10.9.2 + version: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) tslib: specifier: ^2.3.0 version: 2.8.1 @@ -99,7 +102,7 @@ importers: specifier: ^8.5.0 version: 8.5.0(@swc/core@1.5.29)(typescript@5.9.2) typescript: - specifier: ^5.8.2 + specifier: ^5.9.2 version: 5.9.2 vite: specifier: ^7.2.7 @@ -2750,6 +2753,61 @@ importers: specifier: ^0.5.3 version: 0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7) + examples/with-viteval/dist: + dependencies: + '@ai-sdk/openai': + specifier: ^2.0.52 + version: 2.0.85(zod@3.25.76) + '@voltagent/cli': + specifier: ^0.1.16 + version: link:../../../packages/cli + '@voltagent/core': + specifier: ^1.2.15 + version: link:../../../packages/core + '@voltagent/libsql': + specifier: ^1.0.13 + version: link:../../../packages/libsql + '@voltagent/logger': + specifier: ^1.0.4 + version: link:../../../packages/logger + '@voltagent/server-hono': + specifier: ^1.2.5 + version: link:../../../packages/server-hono + ai: + specifier: ^5.0.76 + version: 5.0.113(zod@3.25.76) + consola: + specifier: ^3.4.2 + version: 3.4.2 + envalid: + specifier: ^8.1.0 + version: 8.1.0 + yargs: + specifier: ^18.0.0 + version: 18.0.0 + zod: + specifier: ^3.25.76 + version: 3.25.76 + devDependencies: + '@tsconfig/node24': + specifier: ^24.0.1 + version: 24.0.1 + '@types/yargs': + specifier: ^17.0.33 + version: 17.0.33 + dotenv: + specifier: ^16.4.5 + version: 16.6.1 + tsx: + specifier: ^4.19.3 + version: 4.20.4 + typescript: + specifier: ^5.8.2 + version: 5.9.2 + viteval: + specifier: ^0.5.3 + version: 0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7) + examples/with-voice-elevenlabs: dependencies: '@ai-sdk/openai': @@ -3509,7 +3567,7 @@ importers: version: 3.2.4(vitest@3.2.4) jest: specifier: ^29.5.0 - version: 29.7.0(@types/node@24.2.1) + version: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) ts-jest: specifier: ^29.1.0 version: 
29.4.1(@babel/core@7.28.5)(esbuild@0.25.10)(jest@29.7.0)(typescript@5.9.2) @@ -9966,7 +10024,7 @@ packages: slash: 3.0.0 dev: true - /@jest/core@29.7.0: + /@jest/core@29.7.0(ts-node@10.9.2): resolution: {integrity: sha512-n7aeXWKMnGtDA48y8TLWJPJmLmmZ642Ceo78cYWEpiD7FzDgmNDV/GCVRorPABdXLJZ/9wzzgZAlHjXjxDHGsg==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} peerDependencies: @@ -9987,7 +10045,7 @@ packages: exit: 0.1.2 graceful-fs: 4.2.11 jest-changed-files: 29.7.0 - jest-config: 29.7.0(@types/node@24.6.2) + jest-config: 29.7.0(@types/node@24.6.2)(ts-node@10.9.2) jest-haste-map: 29.7.0 jest-message-util: 29.7.0 jest-regex-util: 29.6.3 @@ -12403,7 +12461,7 @@ packages: - verdaccio dev: true - /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2): + /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2): resolution: {integrity: sha512-yZOZJOQFtpdY3Fu/WYNoDx81TwvF9yfwvalFpLD19bz+2YGl7B89l0S1ZrtSRXFfKXA/w7gb0gmKwthJtQhx9Q==} dependencies: '@jest/reporters': 29.7.0 @@ -12412,7 +12470,7 @@ packages: '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) '@phenomnomnominal/tsquery': 5.0.1(typescript@5.9.2) identity-obj-proxy: 3.0.0 - jest-config: 29.7.0(@types/node@24.2.1) + jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) jest-resolve: 29.7.0 jest-util: 29.7.0 minimatch: 9.0.3 @@ -12807,12 +12865,12 @@ packages: dev: true optional: true - /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2): + /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2): resolution: {integrity: sha512-7Jlv+BVqGoO0BolQN7P5Z87160phuE1i7H6C8xFwQnlQ3ZfwQCJzk2dkg1UyzxDkWl6lvVsqBjZPXD55gFQ3+w==} dependencies: '@nx/devkit': 20.4.6(nx@20.8.2) '@nx/eslint': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2) - '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) + '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2) '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) tslib: 2.8.1 transitivePeerDependencies: @@ -17770,8 +17828,8 @@ packages: '@babel/plugin-syntax-jsx': 7.27.1(@babel/core@7.28.5) '@babel/plugin-syntax-typescript': 7.27.1(@babel/core@7.28.5) '@babel/template': 7.27.2 - '@babel/traverse': 7.28.4 - '@babel/types': 7.28.4 + '@babel/traverse': 7.28.5 + '@babel/types': 7.28.5 '@tanstack/react-router': 1.131.44(react-dom@19.2.3)(react@19.2.3) '@tanstack/router-core': 1.131.44 '@tanstack/router-generator': 1.131.44 @@ -22783,7 +22841,7 @@ packages: crc-32: 1.2.2 readable-stream: 4.7.0 - /create-jest@29.7.0(@types/node@24.2.1): + /create-jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): resolution: {integrity: sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} hasBin: true @@ -22792,7 +22850,7 @@ packages: chalk: 4.1.2 exit: 0.1.2 graceful-fs: 4.2.11 - jest-config: 29.7.0(@types/node@24.2.1) + jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) jest-util: 29.7.0 prompts: 2.4.2 transitivePeerDependencies: @@ -27641,7 +27699,7 @@ packages: - 
supports-color dev: true - /jest-cli@29.7.0(@types/node@24.2.1): + /jest-cli@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): resolution: {integrity: sha512-OVVobw2IubN/GSYsxETi+gOe7Ka59EFMR/twOU3Jb2GnKKeMGJB5SGUUrEz3SFVmJASUdZUzy83sLNNQ2gZslg==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} hasBin: true @@ -27651,14 +27709,14 @@ packages: node-notifier: optional: true dependencies: - '@jest/core': 29.7.0 + '@jest/core': 29.7.0(ts-node@10.9.2) '@jest/test-result': 29.7.0 '@jest/types': 29.6.3 chalk: 4.1.2 - create-jest: 29.7.0(@types/node@24.2.1) + create-jest: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) exit: 0.1.2 import-local: 3.2.0 - jest-config: 29.7.0(@types/node@24.2.1) + jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) jest-util: 29.7.0 jest-validate: 29.7.0 yargs: 17.7.2 @@ -27669,7 +27727,7 @@ packages: - ts-node dev: true - /jest-config@29.7.0(@types/node@24.2.1): + /jest-config@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} peerDependencies: @@ -27704,12 +27762,13 @@ packages: pretty-format: 29.7.0 slash: 3.0.0 strip-json-comments: 3.1.1 + ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) transitivePeerDependencies: - babel-plugin-macros - supports-color dev: true - /jest-config@29.7.0(@types/node@24.6.2): + /jest-config@29.7.0(@types/node@24.6.2)(ts-node@10.9.2): resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} peerDependencies: @@ -27744,6 +27803,7 @@ packages: pretty-format: 29.7.0 slash: 3.0.0 strip-json-comments: 3.1.1 + ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) transitivePeerDependencies: - babel-plugin-macros - supports-color @@ -28041,7 +28101,7 @@ packages: supports-color: 8.1.1 dev: true - /jest@29.7.0(@types/node@24.2.1): + /jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): resolution: {integrity: sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} hasBin: true @@ -28051,10 +28111,10 @@ packages: node-notifier: optional: true dependencies: - '@jest/core': 29.7.0 + '@jest/core': 29.7.0(ts-node@10.9.2) '@jest/types': 29.6.3 import-local: 3.2.0 - jest-cli: 29.7.0(@types/node@24.2.1) + jest-cli: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) transitivePeerDependencies: - '@types/node' - babel-plugin-macros @@ -36767,7 +36827,7 @@ packages: esbuild: 0.25.10 fast-json-stable-stringify: 2.1.0 handlebars: 4.7.8 - jest: 29.7.0(@types/node@24.2.1) + jest: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) json5: 2.2.3 lodash.memoize: 4.1.2 make-error: 1.3.6 diff --git a/tmp/test/traffic-concurrency.ts b/tmp/test/traffic-concurrency.ts new file mode 100644 index 000000000..d12fc5c9f --- /dev/null +++ b/tmp/test/traffic-concurrency.ts @@ -0,0 +1,91 @@ +// @ts-nocheck +/** + * Manual test: TrafficController maxConcurrent scheduling. + * + * What to look for: + * - `inFlight` should never exceed `maxConcurrent`. + * - Requests should start in bursts up to `maxConcurrent`. 
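+ * + * Expected shape, derived from the stub timings below (5 jobs, ~700ms each, maxConcurrent=3): + * A/B/C start back-to-back (inFlight climbs to 3), D/E start only as earlier jobs finish, + * and the final `maxObserved` log should settle at exactly 3.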
+ * + * Run: + * - pnpm ts-node tmp/test/traffic-concurrency.ts + * - VERBOSE=1 pnpm ts-node tmp/test/traffic-concurrency.ts (enable controller debug logs) + */ + +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); +const now = () => new Date().toISOString(); + +const maxConcurrent = 3; +const controller = getTrafficController({ maxConcurrent }); + +let inFlight = 0; +let maxObserved = 0; + +function makeModel(id: string, durationMs: number) { + return { + specificationVersion: "v2", + provider: "sim", + modelId: `concurrency-${id}`, + doGenerate: async () => { + inFlight += 1; + maxObserved = Math.max(maxObserved, inFlight); + console.log(`[${now()}] start ${id} inFlight=${inFlight}`); + + try { + await sleep(durationMs); + return { + content: [{ type: "text", text: `ok:${id}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { modelId: `concurrency-${id}`, headers: {} }, + }; + } finally { + inFlight -= 1; + console.log(`[${now()}] end ${id} inFlight=${inFlight}`); + } + }, + }; +} + +async function main() { + console.log(`\n=== TrafficController concurrency (maxConcurrent=${maxConcurrent}) ===`); + void controller; + + const agent = new Agent({ + name: "traffic-concurrency", + instructions: "echo", + model: makeModel("base", 0), + temperature: 0, + maxOutputTokens: 32, + }); + + const ids = ["A", "B", "C", "D", "E"]; + const jobs = ids.map((id) => + agent.generateText(id, { + tenantId: "default", + trafficPriority: "P1", + model: makeModel(id, 700), + }), + ); + + const settled = await Promise.allSettled(jobs); + console.log(`\n[done] maxObserved=${maxObserved}`); + console.log( + `[done] results=${safeStringify( + settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), + )}`, + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-fallback-chain.ts b/tmp/test/traffic-fallback-chain.ts new file mode 100644 index 000000000..0cd77b2ba --- /dev/null +++ b/tmp/test/traffic-fallback-chain.ts @@ -0,0 +1,168 @@ +// @ts-nocheck +/** + * Manual test: TrafficController circuit breaker + fallback chains. + * + * Scenarios: + * - Test 1: Open primary circuit (via repeated 429s), then route to fallback1. + * - Test 2: Open fallback1 circuit, then route to fallback2 (success). + * - Test 3: No fallback configured → CircuitBreakerOpenError. 
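+ * + * Assumed breaker semantics (mirroring the inline comments in main() below): roughly five + * consecutive failures open a model's circuit; an open circuit reroutes to its configured + * fallback chain, and CircuitBreakerOpenError is thrown when no fallback remains.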
+ * + * Run: + * - pnpm ts-node tmp/test/traffic-fallback-chain.ts + * - VERBOSE=1 pnpm ts-node tmp/test/traffic-fallback-chain.ts + */ + +import { safeStringify } from "@voltagent/internal"; +import { MockLanguageModelV2, MockProviderV2 } from "ai/test"; +import { + Agent, + CircuitBreakerOpenError, + getTrafficController, +} from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); +const now = () => new Date().toISOString(); + +type ModelId = "primary" | "fallback1" | "fallback2" | "no-fallback"; + +const provider = "test-provider"; + +const controller = getTrafficController({ + maxConcurrent: 1, + fallbackChains: { + primary: ["fallback1", "fallback2"], + fallback1: ["fallback2"], + }, +}); + +function makeAlways429Model(modelId: ModelId) { + let attempts = 0; + return new MockLanguageModelV2({ + provider, + modelId, + doGenerate: async () => { + attempts += 1; + console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> 429`); + await sleep(25); + const err: any = new Error(`forced 429 for model=${modelId} attempt=${attempts}`); + err.status = 429; + throw err; + }, + }); +} + +function makeAlwaysOkModel(modelId: ModelId) { + let attempts = 0; + return new MockLanguageModelV2({ + provider, + modelId, + doGenerate: async () => { + attempts += 1; + console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> ok`); + await sleep(25); + return { + content: [{ type: "text", text: `ok:${modelId}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { modelId, headers: {} }, + }; + }, + }); +} + +const primaryModel = makeAlways429Model("primary"); +const fallback1Model = makeAlways429Model("fallback1"); +const fallback2Model = makeAlwaysOkModel("fallback2"); +const noFallbackModel = makeAlways429Model("no-fallback"); + +// Required so Agent fallbacks (string model IDs) resolve without network calls. +(globalThis as any).AI_SDK_DEFAULT_PROVIDER = new MockProviderV2({ + languageModels: { + primary: primaryModel, + fallback1: fallback1Model, + fallback2: fallback2Model, + "no-fallback": noFallbackModel, + }, +}); + +const primaryAgent = new Agent({ + name: "traffic-fallback-primary", + instructions: "echo", + model: primaryModel, + temperature: 0, + maxOutputTokens: 32, +}); + +const noFallbackAgent = new Agent({ + name: "traffic-fallback-none", + instructions: "echo", + model: noFallbackModel, + temperature: 0, + maxOutputTokens: 32, +}); + +async function runOnce(label: string, agent: any) { + console.log(`\n--- ${label} ---`); + try { + const result = await agent.generateText(label, { + tenantId: "default", + trafficPriority: "P1", + }); + console.log( + `[${label}] success text=${result.text} responseModel=${result.response?.modelId ?? "n/a"}`, + ); + } catch (err: any) { + if (err instanceof CircuitBreakerOpenError) { + console.log( + `[${label}] CircuitBreakerOpenError retryAfterMs=${err.retryAfterMs} msg=${err.message}`, + ); + } else { + console.log( + `[${label}] failed name=${err?.name ?? "Error"} status=${err?.status ?? err?.statusCode ?? 
"n/a"} msg=${err?.message}`, + ); + } + } +} + +async function main() { + console.log("\n=== Circuit breaker + fallback chain ==="); + void controller; + + console.log("\n[Test 1] Open primary circuit, then route to fallback1"); + // Two calls * (up to 3 retries each) ≈ 6 failures → should open the circuit (threshold=5). + await runOnce("primary-warmup-1", primaryAgent); + await runOnce("primary-warmup-2", primaryAgent); + await runOnce("primary-after-open", primaryAgent); // should execute fallback1 (still closed) + + console.log("\n[Test 2] Open fallback1 circuit, then route to fallback2"); + // Build enough failures on fallback1 by routing multiple requests to it via primary circuit-open path. + await runOnce("fallback1-warmup-1-via-primary", primaryAgent); + await runOnce("fallback1-warmup-2-via-primary", primaryAgent); + await runOnce("primary-should-hit-fallback2", primaryAgent); // should execute fallback2 and succeed + + console.log("\n[Test 3] No fallback configured → CircuitBreakerOpenError"); + await runOnce("no-fallback-warmup-1", noFallbackAgent); + await runOnce("no-fallback-warmup-2", noFallbackAgent); + await runOnce("no-fallback-after-open", noFallbackAgent); + + console.log("\n[debug] model call counts:"); + console.log( + safeStringify({ + primary: primaryModel.doGenerateCalls?.length, + fallback1: fallback1Model.doGenerateCalls?.length, + fallback2: fallback2Model.doGenerateCalls?.length, + "no-fallback": noFallbackModel.doGenerateCalls?.length, + }), + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-priority-openai-real.ts b/tmp/test/traffic-priority-openai-real.ts new file mode 100644 index 000000000..223263ba8 --- /dev/null +++ b/tmp/test/traffic-priority-openai-real.ts @@ -0,0 +1,117 @@ +// @ts-nocheck +/** + * Manual test: TrafficController + AI SDK with real OpenAI calls. + * + * What this exercises: + * - Priority scheduling (P0/P1/P2) with `maxConcurrent=1` + * - Rate limit header ingestion via `updateRateLimitFromHeaders()` (if headers are present) + * - Tenant usage aggregation via `extractUsage` + `getTenantUsage()` + * + * Prereqs: + * - Set `OPENAI_API_KEY` + * + * Run: + * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts + * - VERBOSE=1 OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts + * + * Notes: + * - This will make real network calls and may incur cost. + */ + +import { openai } from "@ai-sdk/openai"; +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const apiKey = process.env.OPENAI_API_KEY; +if (!apiKey) { + console.error("Missing OPENAI_API_KEY. Example:"); + console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts"); + process.exit(1); +} + +const _now = () => new Date().toISOString(); +const preview = (value: unknown, max = 140) => { + if (typeof value !== "string") return String(value ?? ""); + return value.length > max ? `${value.slice(0, max)}…` : value; +}; + +const tenantId = process.env.TENANT_ID ?? "openai-real"; +const defaultModelId = process.env.OPENAI_MODEL ?? 
"gpt-4o-mini"; + +const controller = getTrafficController({ maxConcurrent: 1 }); + +function getHeader(headers: any, name: string): string | undefined { + if (!headers) return undefined; + if (typeof headers.get === "function") { + const v = headers.get(name); + return v === null || v === undefined ? undefined : String(v); + } + const key = Object.keys(headers).find((k) => k.toLowerCase() === name.toLowerCase()); + if (!key) return undefined; + const v = headers[key]; + return v === null || v === undefined ? undefined : String(Array.isArray(v) ? v[0] : v); +} + +async function main() { + console.log( + `\n=== OpenAI real: priority scheduling (tenantId=${tenantId}, model=${defaultModelId}) ===`, + ); + void controller; + + const agent = new Agent({ + name: "openai-real-traffic", + instructions: "Reply exactly with the requested token.", + model: openai(defaultModelId), + temperature: 0, + maxOutputTokens: 32, + }); + + // Enqueue in reverse priority order; controller should still execute P0 first. + const p2 = agent.generateText("Reply with only: P2", { tenantId, trafficPriority: "P2" }); + const p1 = agent.generateText("Reply with only: P1", { tenantId, trafficPriority: "P1" }); + const p0 = agent.generateText("Reply with only: P0", { tenantId, trafficPriority: "P0" }); + + const settled = await Promise.allSettled([p0, p1, p2]); + for (const result of settled) { + if (result.status !== "fulfilled") { + console.log(`[result] rejected=${result.reason?.message ?? String(result.reason)}`); + continue; + } + + const headers = result.value.response?.headers; + const limit = getHeader(headers, "x-ratelimit-limit-requests"); + const remaining = getHeader(headers, "x-ratelimit-remaining-requests"); + const reset = getHeader(headers, "x-ratelimit-reset-requests"); + + console.log( + `[result] text=${preview(result.value.text)} finishReason=${result.value.finishReason} usage=${safeStringify(result.value.usage)}`, + ); + console.log( + `[result] ratelimitHeaders=${safeStringify({ + limit, + remaining, + reset, + })}`, + ); + } + + console.log( + `\n[done] settled=${safeStringify( + settled.map((s) => (s.status === "fulfilled" ? preview(s.value.text) : s.reason?.message)), + )}`, + ); + + console.log( + `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-priority-openai-sim.ts b/tmp/test/traffic-priority-openai-sim.ts new file mode 100644 index 000000000..9d36a7d14 --- /dev/null +++ b/tmp/test/traffic-priority-openai-sim.ts @@ -0,0 +1,114 @@ +// @ts-nocheck +/** + * Manual test: Agent → TrafficController priority scheduling (OpenAI-like stub models). + * + * This keeps the Agent + AI SDK path, but avoids real network calls by using stub models + * that pretend to be `provider="openai"` with modelIds like `gpt-4o`/`gpt-4o-mini`. + * + * Scenarios: + * - Test 1: P0 runs before P1/P2 when all runnable. + * - Test 2: P0 request (gpt-4o) is rate-limited → P1 (gpt-4o-mini) proceeds. + * + * Note: + * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s. 
+ * + * Run: + * - pnpm ts-node tmp/test/traffic-priority-openai-sim.ts + */ + +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); +const now = () => new Date().toISOString(); + +function makeOpenAIStubModel(modelId: string, delayMs: number) { + let calls = 0; + return { + specificationVersion: "v2", + provider: "openai", + modelId, + doGenerate: async () => { + calls += 1; + console.log(`[${now()}] [model] ${modelId} doGenerate call=${calls}`); + await sleep(delayMs); + return { + content: [{ type: "text", text: `ok:${modelId}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { modelId, headers: {} }, + }; + }, + }; +} + +const controller = getTrafficController({ maxConcurrent: 1 }); + +const modelMini = makeOpenAIStubModel("gpt-4o-mini", 80); +const modelBig = makeOpenAIStubModel("gpt-4o", 80); + +const agent = new Agent({ + name: "priority-openai-sim", + instructions: "echo", + model: modelMini, + temperature: 0, + maxOutputTokens: 32, +}); + +async function test1_priorityOrder() { + console.log("\n=== Test 1: P0 ordering via Agent ==="); + + const p2 = agent.generateText("P2", { trafficPriority: "P2", tenantId: "sim" }); + const p1 = agent.generateText("P1", { trafficPriority: "P1", tenantId: "sim" }); + const p0 = agent.generateText("P0", { trafficPriority: "P0", tenantId: "sim" }); + + const results = await Promise.all([p0, p1, p2]); + console.log(`[Test 1] results=${safeStringify(results.map((r) => r.text))}`); +} + +async function test2_p1RunsWhenP0RateLimited() { + console.log("\n=== Test 2: P1 proceeds when P0 is rate-limited ==="); + + // Seed remaining=0 for openai::gpt-4o so the P0 head item initially waits. + const applied = controller.updateRateLimitFromHeaders( + { provider: "openai", model: "gpt-4o" }, + { + "x-ratelimit-limit-requests": "1", + "x-ratelimit-remaining-requests": "0", + "x-ratelimit-reset-requests": "1s", + }, + ); + console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`); + + const p0Blocked = agent.generateText("P0 (gpt-4o, rate-limited)", { + trafficPriority: "P0", + tenantId: "sim", + model: modelBig, // per-call model override (new in this branch) + }); + + const p1Free = agent.generateText("P1 (gpt-4o-mini)", { + trafficPriority: "P1", + tenantId: "sim", + model: modelMini, + }); + + const [r0, r1] = await Promise.all([p0Blocked, p1Free]); + console.log(`[Test 2] p0 text=${r0.text}`); + console.log(`[Test 2] p1 text=${r1.text}`); +} + +async function main() { + await test1_priorityOrder(); + await test2_p1RunsWhenP0RateLimited(); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-priority.ts b/tmp/test/traffic-priority.ts new file mode 100644 index 000000000..409e10782 --- /dev/null +++ b/tmp/test/traffic-priority.ts @@ -0,0 +1,159 @@ +// @ts-nocheck +/** + * Manual test: TrafficController priority scheduling. + * + * Scenarios: + * - Test 1: P0 should run before P1/P2 when runnable. + * - Test 2: If a P0 request is rate-limited, a lower priority (P1) can proceed. + * + * Note: + * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s. 
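+ * + * With maxConcurrent=1 execution is fully serialized, so the Test 1 doGenerate logs should + * read P0 -> P1 -> P2 even though the calls are enqueued in reverse order.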
+ * + * Run: + * - pnpm ts-node tmp/test/traffic-priority.ts + * - VERBOSE=1 pnpm ts-node tmp/test/traffic-priority.ts + */ + +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); +const now = () => new Date().toISOString(); + +const controller = getTrafficController({ maxConcurrent: 1 }); + +function extractLabel(prompt: any): string { + if (!Array.isArray(prompt)) { + return "unknown"; + } + + for (let index = prompt.length - 1; index >= 0; index -= 1) { + const message = prompt[index]; + if (!message || message.role !== "user" || !Array.isArray(message.content)) { + continue; + } + + const textPart = message.content.find((part: any) => part?.type === "text"); + if (textPart?.text) { + return String(textPart.text); + } + } + + return "unknown"; +} + +function makeModel(provider: string, modelId: string, delayMs = 50) { + let calls = 0; + let lastStartAt = 0; + + return { + specificationVersion: "v2", + provider, + modelId, + doGenerate: async (options: any) => { + calls += 1; + const startAt = Date.now(); + const delta = lastStartAt ? startAt - lastStartAt : 0; + lastStartAt = startAt; + + const label = extractLabel(options?.prompt); + console.log( + `[${now()}] doGenerate start model=${provider}::${modelId} call=${calls} (+${delta}ms) input=${label}`, + ); + await sleep(delayMs); + console.log(`[${now()}] doGenerate end model=${provider}::${modelId} input=${label}`); + + return { + content: [{ type: "text", text: `ok:${label}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { modelId, headers: {} }, + }; + }, + }; +} + +async function test1_priorityOrder() { + console.log("\n=== Test 1: priority order (P0 before P1/P2) ==="); + + const sharedModel = makeModel("p", "shared-model", 50); + const agent = new Agent({ + name: "traffic-priority", + instructions: "echo", + model: sharedModel, + temperature: 0, + maxOutputTokens: 32, + }); + + // Enqueue in reverse order; scheduler should still run P0 first. + const p2 = agent.generateText("P2", { tenantId: "default", trafficPriority: "P2" }); + const p1 = agent.generateText("P1", { tenantId: "default", trafficPriority: "P1" }); + const p0 = agent.generateText("P0", { tenantId: "default", trafficPriority: "P0" }); + + const settled = await Promise.allSettled([p0, p1, p2]); + console.log( + `[Test 1] results=${safeStringify( + settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), + )}`, + ); +} + +async function test2_lowerPriorityWhenP0RateLimited() { + console.log("\n=== Test 2: P1 proceeds when P0 rate-limited ==="); + + const applied = controller.updateRateLimitFromHeaders( + { provider: "p0", model: "m0" }, + { + "x-ratelimit-limit-requests": "1", + "x-ratelimit-remaining-requests": "0", + "x-ratelimit-reset-requests": "1s", + }, + ); + console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`); + + const modelP0 = makeModel("p0", "m0", 50); + const modelP1 = makeModel("p1", "m1", 50); + const agent = new Agent({ + name: "traffic-priority-rate-limit", + instructions: "echo", + model: modelP1, + temperature: 0, + maxOutputTokens: 32, + }); + + // Now the next P0 request is at the head of the queue but rate-limited, + // so a runnable P1 request should execute first. 
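+ // Expected log order (given the 1s synthetic reset seeded above): p1::m1 starts first, + // then p0::m0 runs once the window resets plus the controller's small probe delay.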
+ const p0Blocked = agent.generateText("P0-blocked (rate limited)", { + tenantId: "default", + trafficPriority: "P0", + model: modelP0, + }); + const p1Free = agent.generateText("P1-free (should run first)", { + tenantId: "default", + trafficPriority: "P1", + model: modelP1, + }); + + const settled = await Promise.allSettled([p0Blocked, p1Free]); + console.log( + `[Test 2] results=${safeStringify( + settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), + )}`, + ); +} + +async function main() { + await test1_priorityOrder(); + await test2_lowerPriorityWhenP0RateLimited(); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-rate-limit-from-headers.ts b/tmp/test/traffic-rate-limit-from-headers.ts new file mode 100644 index 000000000..d82626611 --- /dev/null +++ b/tmp/test/traffic-rate-limit-from-headers.ts @@ -0,0 +1,158 @@ +// @ts-nocheck +/** + * Manual test: TrafficController dynamic rate limits from OpenAI response headers. + * + * This hits the real OpenAI model via Agent + AI SDK, and relies on the + * `x-ratelimit-*` response headers to seed/update the TrafficController. + * + * What to look for: + * - Each request prints the observed `x-ratelimit-*` headers (if present). + * - Agent should also log: "[Traffic] Applied rate limit from response headers". + * - With enough parallel requests, some requests may take longer due to controller throttling. + * + * Prereqs: + * - Set `OPENAI_API_KEY` + * + * Optional env: + * - `OPENAI_MODEL` (default: gpt-4o-mini) + * - `REQUESTS` (default: 10) + * - `MAX_CONCURRENT` (default: 50) + * - `TENANT_ID` (default: openai-rate-limit-headers) + * + * Run: + * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts + * - VERBOSE=1 OPENAI_API_KEY=... REQUESTS=30 pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts + */ + +import { openai } from "@ai-sdk/openai"; +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const now = () => new Date().toISOString(); + +const apiKey = process.env.OPENAI_API_KEY; +if (!apiKey) { + console.error("Missing OPENAI_API_KEY. Example:"); + console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts"); + process.exit(1); +} + +const provider = "openai"; +const modelId = process.env.OPENAI_MODEL ?? "gpt-4o-mini"; +const tenantId = process.env.TENANT_ID ?? "openai-rate-limit-headers"; +const requestCountRaw = Number(process.env.REQUESTS ?? "10"); +const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50"); +const requestCount = Number.isFinite(requestCountRaw) && requestCountRaw > 0 ? requestCountRaw : 10; +const maxConcurrent = + Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? maxConcurrentRaw : 50; + +const key = `${provider}::${modelId}`; +const controller = getTrafficController({ maxConcurrent }); + +function getHeader(headers: any, name: string): string | undefined { + if (!headers) return undefined; + if (typeof headers.get === "function") { + const v = headers.get(name); + return v === null || v === undefined ? 
undefined : String(v); + } + + const entries = Object.entries(headers as Record&lt;string, unknown&gt;); + const target = name.toLowerCase(); + const match = entries.find(([k]) => String(k).toLowerCase() === target); + if (!match) return undefined; + + const value = match[1]; + if (Array.isArray(value)) { + const first = value[0]; + return first === null || first === undefined ? undefined : String(first); + } + + return value === null || value === undefined ? undefined : String(value); +} + +async function main() { + console.log( + `\n=== OpenAI rate limit headers → TrafficController (${key}, maxConcurrent=${maxConcurrent}, requests=${requestCount}) ===`, + ); + void controller; + + const agent = new Agent({ + name: "openai-rate-limit-from-headers", + instructions: "Reply with only the requested token.", + model: openai(modelId), + temperature: 0, + maxOutputTokens: 32, + }); + + console.log("\n[seed] Making one request to capture headers..."); + const seedStartedAt = Date.now(); + const seed = await agent.generateText("Reply with only: seed", { + tenantId, + trafficPriority: "P1", + }); + const seedElapsedMs = Date.now() - seedStartedAt; + + const seedHeaders = seed.response?.headers; + console.log(`[seed] done in ${seedElapsedMs}ms text=${seed.text}`); + console.log( + `[seed] x-ratelimit-*=${safeStringify({ + limit: getHeader(seedHeaders, "x-ratelimit-limit-requests"), + remaining: getHeader(seedHeaders, "x-ratelimit-remaining-requests"), + reset: getHeader(seedHeaders, "x-ratelimit-reset-requests"), + })}`, + ); + + console.log(`\n[burst] Scheduling ${requestCount} parallel requests...`); + const jobs = Array.from({ length: requestCount }, (_, idx) => { + const label = `req-${idx + 1}`; + const enqueuedAt = Date.now(); + console.log(`[${now()}] enqueue ${label}`); + + return agent + .generateText(`Reply with only: ${label}`, { tenantId, trafficPriority: "P1" }) + .then((result) => { + const elapsedMs = Date.now() - enqueuedAt; + const headers = result.response?.headers; + console.log( + `[${now()}] done ${label} in ${elapsedMs}ms text=${result.text} x-ratelimit-remaining=${getHeader( + headers, + "x-ratelimit-remaining-requests", + )}`, + ); + return { + label, + elapsedMs, + text: result.text, + headers: { + limit: getHeader(headers, "x-ratelimit-limit-requests"), + remaining: getHeader(headers, "x-ratelimit-remaining-requests"), + reset: getHeader(headers, "x-ratelimit-reset-requests"), + }, + }; + }) + .catch((error) => { + const elapsedMs = Date.now() - enqueuedAt; + console.log( + `[${now()}] failed ${label} in ${elapsedMs}ms name=${error?.name ?? "Error"} status=${error?.status ?? error?.statusCode ?? "n/a"} msg=${error?.message}`, + ); + throw error; + }); + }); + + const settled = await Promise.allSettled(jobs); + + console.log(`\n[done] settled=${safeStringify(settled.map((s) => s.status))}`); + console.log( + `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-rate-limit-openai-window-sim.ts b/tmp/test/traffic-rate-limit-openai-window-sim.ts new file mode 100644 index 000000000..35232faa0 --- /dev/null +++ b/tmp/test/traffic-rate-limit-openai-window-sim.ts @@ -0,0 +1,247 @@ +// @ts-nocheck +/** + * Manual test (real network): Simulate OpenAI "window remaining + reset" semantics and watch + * TrafficController pace + probe behavior via logs. + * + * Why "simulate"? 
+ * - Real OpenAI headers usually show very large remaining values, so pacing is hard to observe. + * - This script still hits the real OpenAI model, but it drives the controller state using + * synthetic `x-ratelimit-*` headers to force a small window (e.g. remaining=3, reset=30s). + * + * What this demonstrates (Steps 1–7 below): + * 1) We seed the controller with remaining + reset window. + * 2) We enqueue many requests. + * 3) Controller subtracts `reserved` from `remaining` to avoid stampedes. + * 4) When `effectiveRemaining <= 1`, controller waits until `resetAt + probeDelay`. + * 5) When room exists, controller paces using `nextAllowedAt`. + * 6) When a request finishes, we release reservation (controller) and apply new headers (this script). + * 7) After reset, controller sends a probe even when remaining==0; probe "fetches" fresh headers and flow resumes. + * + * Prereqs: + * - Set `OPENAI_API_KEY` + * + * Suggested logging: + * - `VOLTAGENT_LOG_LEVEL=trace` (to see traffic controller internals) + * + * Run: + * - VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts + * + * Optional env: + * - OPENAI_MODEL (default: gpt-4o-mini) + * - WINDOW_SECONDS (default: 30) + * - REMAINING (default: 3) + * - REQUESTS (default: 10) + * - MAX_CONCURRENT (default: 50) + */ + +import { safeStringify } from "@voltagent/internal"; +import { TrafficController } from "../../packages/core/dist/index.js"; + +const apiKey = process.env.OPENAI_API_KEY; +if (!apiKey) { + console.error("Missing OPENAI_API_KEY. Example:"); + console.error( + " VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts", + ); + process.exit(1); +} + +const now = () => new Date().toISOString(); + +const modelId = process.env.OPENAI_MODEL ?? "gpt-4o-mini"; +const windowSecondsRaw = Number(process.env.WINDOW_SECONDS ?? "30"); +const remainingRaw = Number(process.env.REMAINING ?? "3"); +const requestsRaw = Number(process.env.REQUESTS ?? "10"); +const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50"); + +const windowSeconds = + Number.isFinite(windowSecondsRaw) && windowSecondsRaw > 0 ? windowSecondsRaw : 30; +const initialRemaining = + Number.isFinite(remainingRaw) && remainingRaw > 0 ? Math.floor(remainingRaw) : 3; +const requestCount = Number.isFinite(requestsRaw) && requestsRaw > 0 ? Math.floor(requestsRaw) : 10; +const maxConcurrent = + Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? Math.floor(maxConcurrentRaw) : 50; + +const provider = "openai"; +const tenantId = "openai-window-sim"; +const windowMs = Math.round(windowSeconds * 1000); + +async function callOpenAIResponses(label: string): Promise<{ + status: number; + headers: Record&lt;string, string | undefined&gt;; + textPreview: string; +}> { + const url = "https://api.openai.com/v1/responses"; + const body = safeStringify({ + model: modelId, + input: `Reply with only: ${label}`, + max_output_tokens: 16, + }); + + const startedAt = Date.now(); + const res = await fetch(url, { + method: "POST", + headers: { + authorization: `Bearer ${apiKey}`, + "content-type": "application/json", + }, + body, + }); + + const limit = res.headers.get("x-ratelimit-limit-requests") ?? undefined; + const remaining = res.headers.get("x-ratelimit-remaining-requests") ?? undefined; + const reset = res.headers.get("x-ratelimit-reset-requests") ?? 
undefined; + + if (!res.ok) { + const text = await res.text().catch(() => ""); + throw new Error( + `OpenAI error status=${res.status} elapsedMs=${Date.now() - startedAt} body=${text.slice(0, 280)}`, + ); + } + + const data: any = await res.json(); + const outputText = + data?.output?.[0]?.content?.find?.((c: any) => c?.type === "output_text")?.text ?? + data?.output_text ?? + data?.output?.[0]?.content?.[0]?.text ?? + ""; + + return { + status: res.status, + headers: { + "x-ratelimit-limit-requests": limit, + "x-ratelimit-remaining-requests": remaining, + "x-ratelimit-reset-requests": reset, + }, + textPreview: String(outputText).slice(0, 80), + }; +} + +async function main() { + console.log( + `\n=== OpenAI real + synthetic window rate limit (provider=${provider}, model=${modelId}) ===`, + ); + console.log( + `[config] maxConcurrent=${maxConcurrent} windowSeconds=${windowSeconds} initialRemaining=${initialRemaining} requests=${requestCount}`, + ); + console.log( + "[hint] Set VOLTAGENT_LOG_LEVEL=trace to see TrafficController internals (reserved/effectiveRemaining/nextAllowedAt).", + ); + + const controller = new TrafficController({ maxConcurrent }); + + // --- Step 1: seed "remaining + reset window" into controller --- + let windowResetAt = Date.now() + windowMs; + let remainingInWindow = initialRemaining; + + const applySyntheticHeaders = (source: string) => { + const resetMs = Math.max(1, windowResetAt - Date.now()); + const applied = controller.updateRateLimitFromHeaders( + { provider, model: modelId, tenantId }, + { + "x-ratelimit-limit-requests": String(initialRemaining), + "x-ratelimit-remaining-requests": String(Math.max(0, remainingInWindow)), + "x-ratelimit-reset-requests": `${resetMs}ms`, + }, + ); + console.log( + `[${now()}] [synthetic] source=${source} remaining=${remainingInWindow} resetInMs=${resetMs} applied=${safeStringify( + applied && { + key: applied.key, + state: { + remaining: applied.state.remaining, + reserved: applied.state.reserved, + resetAt: applied.state.resetAt, + nextAllowedAt: applied.state.nextAllowedAt, + }, + }, + )}`, + ); + }; + + applySyntheticHeaders("seed"); + + console.log("\n[seed] Making one real request to confirm connectivity + show real headers..."); + const seed = await callOpenAIResponses("seed"); + console.log( + `[${now()}] [seed] ok status=${seed.status} text=${seed.textPreview} realHeaders=${safeStringify( + seed.headers, + )}`, + ); + + console.log(`\n[burst] Enqueueing ${requestCount} controller-managed requests...`); + + const jobs = Array.from({ length: requestCount }, (_, index) => { + const label = `req-${index + 1}`; + const enqueuedAt = Date.now(); + console.log(`[${now()}] [enqueue] ${label}`); + + return controller + .handleText({ + tenantId, + metadata: { + tenantId, + provider, + model: modelId, + priority: "P1", + agentName: "openai-window-sim", + agentId: label, + }, + execute: async () => { + const startedAt = Date.now(); + console.log(`[${now()}] [execute-start] ${label}`); + + const result = await callOpenAIResponses(label); + + console.log( + `[${now()}] [execute-end] ${label} elapsedMs=${Date.now() - startedAt} realHeaders=${safeStringify( + result.headers, + )}`, + ); + + // --- Step 6: decrement remaining + apply new "headers" --- + const nowMs = Date.now(); + if (nowMs >= windowResetAt) { + // --- Step 7: reset happened; probe request fetched "fresh" headers for the next window --- + console.log( + `[${now()}] [reset] window elapsed; starting new synthetic window (windowSeconds=${windowSeconds})`, + ); + 
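// New synthetic window: restoring the budget means the applySyntheticHeaders("response") + // call below reports remaining > 0 again, which is what lets the controller's probe + // resume the queue. +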
windowResetAt = nowMs + windowMs; + remainingInWindow = initialRemaining; + } + + remainingInWindow = Math.max(0, remainingInWindow - 1); + applySyntheticHeaders("response"); + + return result; + }, + }) + .then((r) => { + const totalElapsedMs = Date.now() - enqueuedAt; + console.log( + `[${now()}] [done] ${label} totalElapsedMs=${totalElapsedMs} text=${r.textPreview}`, + ); + return { label, totalElapsedMs, status: "fulfilled" as const }; + }) + .catch((error: any) => { + const totalElapsedMs = Date.now() - enqueuedAt; + console.log( + `[${now()}] [fail] ${label} totalElapsedMs=${totalElapsedMs} name=${error?.name ?? "Error"} msg=${ + error?.message ?? String(error) + }`, + ); + return { label, totalElapsedMs, status: "rejected" as const }; + }); + }); + + const settled = await Promise.all(jobs); + console.log(`\n[done] settled=${safeStringify(settled.map((s) => s.status))}`); + console.log( + `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-rate-limit-static.ts b/tmp/test/traffic-rate-limit-static.ts new file mode 100644 index 000000000..3f91d5bbb --- /dev/null +++ b/tmp/test/traffic-rate-limit-static.ts @@ -0,0 +1,149 @@ +// @ts-nocheck +/** + * Manual test: TrafficController window-based rate limiting (simulated OpenAI headers). + * + * What to look for: + * - Requests should be paced out across the window (no steady "refill" math). + * - If responses arrive out-of-order, remaining headers might "increase"; controller should + * keep remaining monotonic within the same window. + * + * Run: + * - pnpm ts-node tmp/test/traffic-rate-limit-static.ts + * - VERBOSE=1 pnpm ts-node tmp/test/traffic-rate-limit-static.ts + * + * Optional env: + * - LIMIT=6 WINDOW_MS=3000 pnpm ts-node tmp/test/traffic-rate-limit-static.ts + */ + +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); +const now = () => new Date().toISOString(); + +const provider = "sim"; +const model = "rate-limited-model"; +const key = `${provider}::${model}`; + +const controller = getTrafficController({ maxConcurrent: 50 }); + +const limit = Number(process.env.LIMIT ?? 6); +const windowMs = Number(process.env.WINDOW_MS ?? 
3000); +let windowStartAt = Date.now(); +let windowResetAt = windowStartAt + windowMs; +let usedInWindow = 0; + +function extractLabel(prompt: any): string { + if (!Array.isArray(prompt)) { + return "unknown"; + } + + for (let index = prompt.length - 1; index >= 0; index -= 1) { + const message = prompt[index]; + if (!message || message.role !== "user" || !Array.isArray(message.content)) { + continue; + } + + const textPart = message.content.find((part: any) => part?.type === "text"); + if (textPart?.text) { + return String(textPart.text); + } + } + + return "unknown"; +} + +async function main() { + console.log( + `\n=== Window rate limit for ${key} (limit=${limit}, windowMs=${windowMs}, jobs=10) ===`, + ); + + const seeded = controller.updateRateLimitFromHeaders( + { provider, model }, + { + "x-ratelimit-limit-requests": String(limit), + "x-ratelimit-remaining-requests": String(limit), + "x-ratelimit-reset-requests": `${windowMs}ms`, + }, + ); + console.log(`[seed] updateRateLimitFromHeaders=${safeStringify(seeded)}`); + + let calls = 0; + let lastStartAt = 0; + const rateLimitedModel = { + specificationVersion: "v2", + provider, + modelId: model, + doGenerate: async (options: any) => { + const simulatedLatencyMs = 10 + Math.floor(Math.random() * 120); + const nowMs = Date.now(); + if (nowMs >= windowResetAt) { + windowStartAt = nowMs; + windowResetAt = windowStartAt + windowMs; + usedInWindow = 0; + } + + calls += 1; + usedInWindow += 1; + const startAt = Date.now(); + const delta = lastStartAt ? startAt - lastStartAt : 0; + lastStartAt = startAt; + + const label = extractLabel(options?.prompt); + console.log( + `[${now()}] doGenerate start call=${calls} (+${delta}ms) input=${label} latencyMs=${simulatedLatencyMs}`, + ); + await sleep(simulatedLatencyMs); + console.log(`[${now()}] doGenerate end input=${label}`); + + const remainingAfterThis = Math.max(0, limit - usedInWindow); + const resetMs = Math.max(1, windowResetAt - Date.now()); + return { + content: [{ type: "text", text: `ok:${label}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { + modelId: model, + headers: { + "x-ratelimit-limit-requests": String(limit), + "x-ratelimit-remaining-requests": String(remainingAfterThis), + "x-ratelimit-reset-requests": `${resetMs}ms`, + }, + }, + }; + }, + }; + + const agent = new Agent({ + name: "traffic-rate-limit-static", + instructions: "echo", + model: rateLimitedModel, + temperature: 0, + maxOutputTokens: 32, + }); + + const jobs = Array.from({ length: 10 }, (_, idx) => + agent.generateText(`req-${idx + 1}`, { + tenantId: "default", + trafficPriority: "P1", + }), + ); + + const settled = await Promise.allSettled(jobs); + console.log( + `\n[done] results=${safeStringify( + settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), + )}`, + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-retry-after.ts b/tmp/test/traffic-retry-after.ts new file mode 100644 index 000000000..c0c213ebe --- /dev/null +++ b/tmp/test/traffic-retry-after.ts @@ -0,0 +1,245 @@ +// @ts-nocheck +/** + * Manual test: Retry-After handling (429 retry + 200 OK header ingestion). + * + * What this exercises: + * - Retry-After on 429 errors increases retry delay (TrafficController retry plan). + * - Retry-After on successful responses throttles subsequent requests for the same provider::model. 
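+ * + * Both checks assert a lower bound (observed delay >= the advertised Retry-After) rather + * than an exact value, since queueing and timer resolution add jitter on top of it.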
+ * + * Run: + * - pnpm -C packages/core build + * - pnpm ts-node tmp/test/traffic-retry-after.ts + * - VERBOSE=1 pnpm ts-node tmp/test/traffic-retry-after.ts + */ + +import { safeStringify } from "@voltagent/internal"; +import { + Agent, + RateLimitedUpstreamError, + getTrafficController, +} from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); +const now = () => new Date().toISOString(); + +function extractLabel(prompt: any): string { + if (!Array.isArray(prompt)) { + return "unknown"; + } + + for (let index = prompt.length - 1; index >= 0; index -= 1) { + const message = prompt[index]; + if (!message || message.role !== "user" || !Array.isArray(message.content)) { + continue; + } + + const textPart = message.content.find((part: any) => part?.type === "text"); + if (textPart?.text) { + return String(textPart.text); + } + } + + return "unknown"; +} + +function make429RetryAfterModel(args: { + provider: string; + modelId: string; + retryAfterSeconds: number; + mode: "headers" | "typedError"; +}) { + const { provider, modelId, retryAfterSeconds, mode } = args; + let calls = 0; + const startedAt: number[] = []; + + return { + specificationVersion: "v2", + provider, + modelId, + startedAt, + doGenerate: async (options: any) => { + calls += 1; + const start = Date.now(); + startedAt.push(start); + + const label = extractLabel(options?.prompt); + console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`); + + if (calls === 1) { + const retryAfterValue = String(retryAfterSeconds); + + if (mode === "typedError") { + throw new RateLimitedUpstreamError( + `rate limited (typed) retry-after=${retryAfterValue}s`, + { provider, model: modelId }, + Math.round(retryAfterSeconds * 1000), + ); + } + + const err: any = new Error(`rate limited (headers) retry-after=${retryAfterValue}s`); + err.status = 429; + err.response = { + status: 429, + headers: { + "retry-after": retryAfterValue, + }, + }; + throw err; + } + + return { + content: [{ type: "text", text: `ok:${label}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { modelId, headers: {} }, + }; + }, + }; +} + +function makeSuccessRetryAfterModel(args: { + provider: string; + modelId: string; + retryAfterSeconds: number; + latencyMs: number; +}) { + const { provider, modelId, retryAfterSeconds, latencyMs } = args; + let calls = 0; + const startedAt: number[] = []; + const endedAt: number[] = []; + + return { + specificationVersion: "v2", + provider, + modelId, + startedAt, + endedAt, + doGenerate: async (options: any) => { + calls += 1; + const start = Date.now(); + startedAt.push(start); + + const label = extractLabel(options?.prompt); + console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`); + await sleep(latencyMs); + + const end = Date.now(); + endedAt.push(end); + console.log(`[${now()}] [model] ${provider}::${modelId} end call=${calls} input=${label}`); + + return { + content: [{ type: "text", text: `ok:${label}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { + modelId, + headers: + calls === 1 + ? 
{ + "retry-after": String(retryAfterSeconds), + } + : {}, + }, + }; + }, + }; +} + +async function test_retryAfterOn429(mode: "headers" | "typedError") { + const retryAfterSeconds = 1; + const provider = `retry-after-429-${mode}`; + const modelId = "ra-429"; + const tenantId = `ra-429-${mode}`; + + const model = make429RetryAfterModel({ provider, modelId, retryAfterSeconds, mode }); + const agent = new Agent({ + name: `ra-429-${mode}`, + instructions: "echo", + model, + temperature: 0, + maxOutputTokens: 32, + }); + + console.log(`\n=== Test: Retry-After on 429 (${mode}) ===`); + const result = await agent.generateText("hello", { tenantId, trafficPriority: "P1" }); + + const times = model.startedAt; + const deltaMs = times.length >= 2 ? times[1] - times[0] : undefined; + + console.log( + `[result] text=${result.text} calls=${times.length} startedAt=${safeStringify(times)} deltaMs=${deltaMs}`, + ); + + if (deltaMs === undefined || deltaMs < retryAfterSeconds * 1000) { + throw new Error( + `Expected retry delay >= ${retryAfterSeconds * 1000}ms, got ${deltaMs ?? "n/a"}ms`, + ); + } +} + +async function test_retryAfterOnSuccessResponse() { + const retryAfterSeconds = 0.3; + const provider = "retry-after-200"; + const modelId = "ra-200"; + const tenantId = "ra-200"; + + const model = makeSuccessRetryAfterModel({ + provider, + modelId, + retryAfterSeconds, + latencyMs: 20, + }); + + const agent = new Agent({ + name: "ra-200", + instructions: "echo", + model, + temperature: 0, + maxOutputTokens: 32, + }); + + console.log("\n=== Test: Retry-After on 200 response headers ==="); + const first = agent.generateText("first", { tenantId, trafficPriority: "P1" }); + const second = agent.generateText("second", { tenantId, trafficPriority: "P1" }); + + const [r1, r2] = await Promise.all([first, second]); + + const end1 = model.endedAt[0]; + const start2 = model.startedAt[1]; + const enforcedDelayMs = start2 && end1 ? start2 - end1 : undefined; + + console.log( + `[result] texts=${safeStringify([r1.text, r2.text])} startedAt=${safeStringify( + model.startedAt, + )} endedAt=${safeStringify(model.endedAt)} enforcedDelayMs=${enforcedDelayMs}`, + ); + + if (enforcedDelayMs === undefined || enforcedDelayMs < retryAfterSeconds * 1000) { + throw new Error( + `Expected rate-limit delay >= ${retryAfterSeconds * 1000}ms, got ${enforcedDelayMs ?? "n/a"}ms`, + ); + } +} + +async function main() { + // Create controller early so all Agent calls share the same singleton. + getTrafficController({ maxConcurrent: 1 }); + + await test_retryAfterOn429("headers"); + await test_retryAfterOn429("typedError"); + await test_retryAfterOnSuccessResponse(); + + console.log("\n[done] All Retry-After manual checks passed."); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-retry-behavior.ts b/tmp/test/traffic-retry-behavior.ts new file mode 100644 index 000000000..273af55ab --- /dev/null +++ b/tmp/test/traffic-retry-behavior.ts @@ -0,0 +1,169 @@ +// @ts-nocheck +/** + * Manual test: TrafficController retry behavior via Agent + AI SDK path (stub model). + * + * Scenarios included: + * - 5xx retries (up to 3 attempts) + * - 429 retries (up to 3 attempts) + * - timeout retries (up to 2 attempts) + * - non-retriable 4xx does not retry + * + * Run: + * - pnpm ts-node tmp/test/traffic-retry-behavior.ts + * + * Notes: + * - Uses a stub LanguageModel; no network calls. + * - Watch the `[model] attempt=...` logs to confirm retries. 
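+ * + * Expected outcomes given the plans below: "server-error", "rate-limit" and "timeout" + * recover within their retry budget and log `succeeded`, while "bad-request" and + * "forbidden" are treated as non-retriable and log `failed` after a single attempt.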
+ */ + +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +type Scenario = + | "server-error" + | "rate-limit" + | "timeout" + | "bad-request" + | "forbidden" + // Variations to hit different retry-detection branches. + | "server-error-status-string" + | "server-error-statusCode" + | "server-error-response-status" + | "server-error-cause-status" + | "rate-limit-statusCode" + | "timeout-code-only" + | "timeout-name-only" + | "timeout-message-only" + // Variations that should STOP retrying (hit max attempts). + | "server-error-exceed-max" + | "timeout-exceed-max"; + +type RetryPlan = { + failCountBeforeSuccess: number; + status?: number | string; + statusCode?: number | string; + httpStatus?: number | string; + responseStatus?: number | string; + causeStatus?: number | string; + code?: string; + name?: string; + message?: string; +}; + +const plans: Record = { + "server-error": { failCountBeforeSuccess: 2, status: 500 }, + "rate-limit": { failCountBeforeSuccess: 2, status: 429 }, + timeout: { failCountBeforeSuccess: 1, status: 408, code: "ETIMEDOUT", message: "timeout" }, + "bad-request": { failCountBeforeSuccess: 10, status: 400 }, + forbidden: { failCountBeforeSuccess: 10, status: 403 }, + "server-error-status-string": { failCountBeforeSuccess: 2, status: "500" }, + "server-error-statusCode": { failCountBeforeSuccess: 2, statusCode: 502 }, + "server-error-response-status": { failCountBeforeSuccess: 2, responseStatus: 503 }, + "server-error-cause-status": { failCountBeforeSuccess: 2, causeStatus: 500 }, + "rate-limit-statusCode": { failCountBeforeSuccess: 2, statusCode: 429 }, + "timeout-code-only": { failCountBeforeSuccess: 1, code: "timeout" }, + "timeout-name-only": { failCountBeforeSuccess: 1, name: "TimeoutError" }, + "timeout-message-only": { failCountBeforeSuccess: 1, message: "this is a TIMEOUT" }, + "server-error-exceed-max": { failCountBeforeSuccess: 10, status: 500 }, + "timeout-exceed-max": { failCountBeforeSuccess: 10, message: "timeout" }, +}; + +function makeModel(modelId: string, plan: RetryPlan) { + let counter = 0; + let lastAttemptAt = 0; + + return { + specificationVersion: "v2", + provider: "retry-provider", + modelId, + doGenerate: async () => { + counter += 1; + const now = Date.now(); + const delta = lastAttemptAt ? now - lastAttemptAt : 0; + lastAttemptAt = now; + + console.log(`[model] modelId=${modelId} attempt=${counter} (+${delta}ms)`); + + if (counter <= plan.failCountBeforeSuccess) { + const err: any = new Error(plan.message ?? 
`forced failure ${counter} for ${modelId}`); + if (plan.status !== undefined) err.status = plan.status; + if (plan.statusCode !== undefined) err.statusCode = plan.statusCode; + if (plan.httpStatus !== undefined) err.httpStatus = plan.httpStatus; + if (plan.responseStatus !== undefined) err.response = { status: plan.responseStatus }; + if (plan.causeStatus !== undefined) err.cause = { status: plan.causeStatus }; + if (plan.code !== undefined) err.code = plan.code; + if (plan.name !== undefined) err.name = plan.name; + throw err; + } + + return { + content: [{ type: "text", text: "ok" }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { modelId, headers: {} }, + }; + }, + }; +} + +async function runScenario(name: Scenario) { + const plan = plans[name]; + const modelId = `retry-${name}`; + const model = makeModel(modelId, plan); + + const agent = new Agent({ + name: `RetryAgent-${name}`, + instructions: "echo", + model, + maxOutputTokens: 32, + temperature: 0, + }); + + console.log(`\n=== ${name} ===`); + try { + const result = await agent.generateText(name, { tenantId: "retry-test" }); + console.log(`[${name}] succeeded. text=${result.text}`); + } catch (err: any) { + console.log( + `[${name}] failed. status=${err?.status ?? err?.statusCode ?? err?.response?.status ?? "n/a"}`, + ); + } +} + +async function main() { + // Create controller early so all Agent calls share the same singleton. + getTrafficController({ maxConcurrent: 1 }); + + const runs: Scenario[] = [ + "server-error", + "rate-limit", + "timeout", + "bad-request", + "forbidden", + // Uncomment for additional coverage: + // "server-error-status-string", + // "server-error-statusCode", + // "server-error-response-status", + // "server-error-cause-status", + // "rate-limit-statusCode", + // "timeout-code-only", + // "timeout-name-only", + // "timeout-message-only", + // "server-error-exceed-max", + // "timeout-exceed-max", + ]; + + for (const name of runs) { + await runScenario(name); + } +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-tenant-usage.ts b/tmp/test/traffic-tenant-usage.ts new file mode 100644 index 000000000..801d7761c --- /dev/null +++ b/tmp/test/traffic-tenant-usage.ts @@ -0,0 +1,71 @@ +// @ts-nocheck +/** + * Manual test: Tenant usage aggregation (via Agent → TrafficController). + * + * What to look for: + * - `getTenantUsage(tenantId)` should increase after each agent call. 
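+ * - With the stub usage below (5 total tokens per call), tenant-a should end around two + * requests / 10 total tokens and tenant-b around one request / 5 tokens, assuming the + * controller simply sums reported usage per tenant.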
+ *
+ * Run:
+ * - pnpm ts-node tmp/test/traffic-tenant-usage.ts
+ */
+
+import { safeStringify } from "@voltagent/internal";
+import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
+
+const verbose = process.env.VERBOSE === "1";
+if (!verbose) {
+  console.debug = () => {};
+}
+
+function makeModel(modelId: string) {
+  return {
+    specificationVersion: "v2",
+    provider: "usage-provider",
+    modelId,
+    doGenerate: async () => {
+      return {
+        content: [{ type: "text", text: `ok:${modelId}` }],
+        finishReason: "stop",
+        usage: { inputTokens: 2, outputTokens: 3, totalTokens: 5 },
+        warnings: [],
+        response: { modelId, headers: {} },
+      };
+    },
+  };
+}
+
+const controller = getTrafficController({ maxConcurrent: 10 });
+
+async function run(label: string, tenantId: string) {
+  const model = makeModel("tenant-usage-model");
+  const agent = new Agent({
+    name: `TenantUsageAgent-${label}`,
+    instructions: "echo",
+    model,
+    temperature: 0,
+    maxOutputTokens: 32,
+  });
+
+  console.log(`\n=== ${label} tenantId=${tenantId} ===`);
+  const result = await agent.generateText(`hello:${label}`, { tenantId });
+  console.log(`[${label}] text=${result.text}`);
+
+  const usage = controller.getTenantUsage(tenantId);
+  console.log(`[${label}] controller.getTenantUsage(${tenantId})=${safeStringify(usage)}`);
+}
+
+async function main() {
+  await run("A1", "tenant-a");
+  await run("A2", "tenant-a");
+  await run("B1", "tenant-b");
+
+  console.log("\n=== Final usage snapshot ===");
+  console.log(`tenant-a=${safeStringify(controller.getTenantUsage("tenant-a"))}`);
+  console.log(`tenant-b=${safeStringify(controller.getTenantUsage("tenant-b"))}`);
+  console.log(`default=${safeStringify(controller.getTenantUsage("default"))}`);
+}
+
+main().catch((error) => {
+  console.error("Fatal error:", error);
+  process.exit(1);
+});
diff --git a/tmp/test/traffic-text-vs-stream.ts b/tmp/test/traffic-text-vs-stream.ts
new file mode 100644
index 000000000..41aa484d4
--- /dev/null
+++ b/tmp/test/traffic-text-vs-stream.ts
@@ -0,0 +1,132 @@
+// @ts-nocheck
+/**
+ * Manual test: Text + stream traffic share the same TrafficController queue.
+ *
+ * What to look for:
+ * - Stream and text requests should respect the same maxConcurrent + priority rules.
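+ * - With maxConcurrent=1 and the priorities below, a plausible order is:
+ *   S1 grabs the only slot first, then T0 (P0) should be scheduled ahead of
+ *   T1 (P1) even though T0 was submitted after the stream -- assuming P0
+ *   outranks P1, which the timestamped logs should confirm.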
+ *
+ * Run:
+ * - pnpm ts-node tmp/test/traffic-text-vs-stream.ts
+ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-text-vs-stream.ts
+ */
+
+import { ReadableStream } from "node:stream/web";
+import { safeStringify } from "@voltagent/internal";
+import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
+
+const verbose = process.env.VERBOSE === "1";
+if (!verbose) {
+  console.debug = () => {};
+}
+
+const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
+const now = () => new Date().toISOString();
+
+const controller = getTrafficController({ maxConcurrent: 1 });
+
+function extractLabel(prompt: any): string {
+  if (!Array.isArray(prompt)) {
+    return "unknown";
+  }
+
+  for (let index = prompt.length - 1; index >= 0; index -= 1) {
+    const message = prompt[index];
+    if (!message || message.role !== "user" || !Array.isArray(message.content)) {
+      continue;
+    }
+
+    const textPart = message.content.find((part: any) => part?.type === "text");
+    if (textPart?.text) {
+      return String(textPart.text);
+    }
+  }
+
+  return "unknown";
+}
+
+async function main() {
+  console.log("\n=== Text vs Stream (shared scheduler) ===");
+  void controller;
+
+  const provider = "sim";
+  const modelId = "shared-queue";
+
+  const model = {
+    specificationVersion: "v2",
+    provider,
+    modelId,
+    doGenerate: async (options: any) => {
+      const label = extractLabel(options?.prompt);
+      console.log(`[${now()}] doGenerate start input=${label}`);
+      await sleep(50);
+      console.log(`[${now()}] doGenerate end input=${label}`);
+      return {
+        content: [{ type: "text", text: `text:${label}` }],
+        finishReason: "stop",
+        usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
+        warnings: [],
+        response: { modelId, headers: {} },
+      };
+    },
+    doStream: async (options: any) => {
+      const label = extractLabel(options?.prompt);
+      console.log(`[${now()}] doStream start input=${label}`);
+
+      // Hold the controller slot for a bit so ordering is visible.
+      await sleep(400);
+
+      console.log(`[${now()}] doStream ready input=${label}`);
+      const streamId = `text-${label}`;
+      const text = `stream:${label}`;
+
+      const stream = new ReadableStream({
+        start(streamController) {
+          streamController.enqueue({ type: "stream-start", warnings: [] });
+          streamController.enqueue({ type: "text-start", id: streamId });
+          streamController.enqueue({ type: "text-delta", id: streamId, delta: text });
+          streamController.enqueue({ type: "text-end", id: streamId });
+          streamController.enqueue({
+            type: "finish",
+            usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
+            finishReason: "stop",
+          });
+          streamController.close();
+        },
+      });
+
+      return { stream, response: { headers: {} } };
+    },
+  };
+
+  const agent = new Agent({
+    name: "traffic-text-vs-stream",
+    instructions: "echo",
+    model,
+    temperature: 0,
+    maxOutputTokens: 32,
+  });
+
+  const streamP1 = agent.streamText("S1", { tenantId: "default", trafficPriority: "P1" });
+  const textP0 = agent.generateText("T0", { tenantId: "default", trafficPriority: "P0" });
+  const textP1 = agent.generateText("T1", { tenantId: "default", trafficPriority: "P1" });
+
+  const [streamResult, t0, t1] = await Promise.all([streamP1, textP0, textP1]);
+  const streamText = await streamResult.text;
+
+  console.log(
+    `\n[done] results=${safeStringify({
+      streamText,
+      textP0: t0.text,
+      textP1: t1.text,
+    })}`,
+  );
+}
+
+main().catch((error) => {
+  console.error("Fatal error:", error);
+  process.exit(1);
+});