diff --git a/commits.txt b/commits.txt
new file mode 100644
index 000000000..73fd43c52
--- /dev/null
+++ b/commits.txt
@@ -0,0 +1,6 @@
+e8443df2
+9503a0a6
+293fe825
+a88ecd67
+66d74dd2
+53f34370
\ No newline at end of file
diff --git a/diff.txt b/diff.txt
new file mode 100644
index 000000000..b393df88f
--- /dev/null
+++ b/diff.txt
@@ -0,0 +1,9297 @@
+diff --git a/commits.txt b/commits.txt
+new file mode 100644
+index 00000000..73fd43c5
+--- /dev/null
++++ b/commits.txt
+@@ -0,0 +1,6 @@
++e8443df2
++9503a0a6
++293fe825
++a88ecd67
++66d74dd2
++53f34370
+\ No newline at end of file
+diff --git a/examples/with-client-side-tools/next-env.d.ts b/examples/with-client-side-tools/next-env.d.ts
+index 1b3be084..9edff1c7 100644
+--- a/examples/with-client-side-tools/next-env.d.ts
++++ b/examples/with-client-side-tools/next-env.d.ts
+@@ -1,5 +1,6 @@
+ /// <reference types="next" />
+ /// <reference types="next/image-types/global" />
++import "./.next/types/routes.d.ts";
+
+ // NOTE: This file should not be edited
+ // see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
+diff --git a/examples/with-client-side-tools/tsconfig.json b/examples/with-client-side-tools/tsconfig.json
+index 3697fcb9..0fca67d3 100644
+--- a/examples/with-client-side-tools/tsconfig.json
++++ b/examples/with-client-side-tools/tsconfig.json
+@@ -1,6 +1,10 @@
+ {
+ "compilerOptions": {
+- "lib": ["dom", "dom.iterable", "esnext"],
++ "lib": [
++ "dom",
++ "dom.iterable",
++ "esnext"
++ ],
+ "allowJs": true,
+ "skipLibCheck": true,
+ "strict": true,
+@@ -11,7 +15,7 @@
+ "resolveJsonModule": true,
+ "isolatedModules": true,
+ "sourceMap": true,
+- "jsx": "preserve",
++ "jsx": "react-jsx",
+ "incremental": true,
+ "plugins": [
+ {
+@@ -19,10 +23,20 @@
+ }
+ ],
+ "paths": {
+- "@/*": ["./*"]
++ "@/*": [
++ "./*"
++ ]
+ },
+ "target": "ES2017"
+ },
+- "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
+- "exclude": ["node_modules"]
++ "include": [
++ "next-env.d.ts",
++ "**/*.ts",
++ "**/*.tsx",
++ ".next/types/**/*.ts",
++ ".next/dev/types/**/*.ts"
++ ],
++ "exclude": [
++ "node_modules"
++ ]
+ }
+diff --git a/examples/with-netlify-functions/netlify/functions/voltagent.js b/examples/with-netlify-functions/netlify/functions/voltagent.js
+new file mode 100644
+index 00000000..0ec386b8
+--- /dev/null
++++ b/examples/with-netlify-functions/netlify/functions/voltagent.js
+@@ -0,0 +1,4 @@
++import { createNetlifyFunctionHandler } from "@voltagent/serverless-hono";
++import { getVoltAgent } from "../../src/index";
++const voltAgent = getVoltAgent();
++export const handler = createNetlifyFunctionHandler(voltAgent);
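Note: for a quick local check, the exported handler can be called the way Netlify would call it, with an (event, context) pair. A minimal sketch, assuming a standard Netlify event shape; the route, body, and return shape here are hypothetical:

// smoke-test.mjs — hypothetical local harness, not part of this patch.
import { handler } from "./netlify/functions/voltagent.js";

const mockEvent = {
  httpMethod: "GET",
  path: "/", // assumed: whatever route the serverless Hono app exposes
  headers: {},
  body: null,
};

const response = await handler(mockEvent, {});
console.log(response?.statusCode, response?.body);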
+diff --git a/examples/with-netlify-functions/src/index.js b/examples/with-netlify-functions/src/index.js
+new file mode 100644
+index 00000000..af385b50
+--- /dev/null
++++ b/examples/with-netlify-functions/src/index.js
+@@ -0,0 +1,17 @@
++import { openai } from "@ai-sdk/openai";
++import { Agent, VoltAgent } from "@voltagent/core";
++import { serverlessHono } from "@voltagent/serverless-hono";
++import { weatherTool } from "./tools";
++const agent = new Agent({
++ name: "netlify-function-agent",
++ instructions: "Help the user quickly and call tools when needed.",
++ model: openai("gpt-4o-mini"),
++ tools: [weatherTool],
++});
++const voltAgent = new VoltAgent({
++ agents: { agent },
++ serverless: serverlessHono(),
++});
++export function getVoltAgent() {
++ return voltAgent;
++}
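Note: the same Agent configuration can be exercised outside the function runtime. A minimal sketch that re-creates an equivalent agent for local testing (the prompt is illustrative):

import { openai } from "@ai-sdk/openai";
import { Agent } from "@voltagent/core";

// Equivalent local agent; mirrors the options used in src/index.js above.
const agent = new Agent({
  name: "local-test-agent",
  instructions: "Help the user quickly and call tools when needed.",
  model: openai("gpt-4o-mini"),
});

const result = await agent.generateText("Hello, who are you?");
console.log(result.text);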
+diff --git a/examples/with-netlify-functions/src/tools/index.js b/examples/with-netlify-functions/src/tools/index.js
+new file mode 100644
+index 00000000..d1c5bf43
+--- /dev/null
++++ b/examples/with-netlify-functions/src/tools/index.js
+@@ -0,0 +1,26 @@
++import { createTool } from "@voltagent/core";
++import z from "zod";
++export const weatherTool = createTool({
++ id: "get-weather",
++ name: "getWeather",
++ description: "Return a mock weather report for the requested location",
++ parameters: z.object({
++ location: z.string().describe("City or location to look up"),
++ }),
++ execute: async ({ location }, context) => {
++ context?.logger.info(`Fetching weather for ${location}`);
++ const mockWeatherData = {
++ location,
++ temperature: Math.floor(Math.random() * 30) + 5,
++ condition: ["Sunny", "Cloudy", "Rainy", "Snowy", "Partly Cloudy"][
++ Math.floor(Math.random() * 5)
++ ],
++ humidity: Math.floor(Math.random() * 60) + 30,
++ windSpeed: Math.floor(Math.random() * 30),
++ };
++ return {
++ weather: mockWeatherData,
++ message: `Current weather in ${location}: ${mockWeatherData.temperature}°C and ${mockWeatherData.condition.toLowerCase()} with ${mockWeatherData.humidity}% humidity and wind speed of ${mockWeatherData.windSpeed} km/h.`,
++ };
++ },
++});
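Note: since execute is a plain async function, the tool can be unit-tested without an agent. A sketch, where the console stub stands in for the real tool context's logger:

import { weatherTool } from "./src/tools/index.js";

// Direct invocation; the second argument mimics the tool context shape.
const result = await weatherTool.execute(
  { location: "Istanbul" },
  { logger: console },
);
console.log(result.message);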
+diff --git a/package.json b/package.json
+index 7c80f7c5..7e3ef8ba 100644
+--- a/package.json
++++ b/package.json
+@@ -32,9 +32,10 @@
+ "publint": "^0.3.8",
+ "rimraf": "^5.0.5",
+ "syncpack": "^13.0.2",
++ "ts-node": "^10.9.2",
+ "tslib": "^2.3.0",
+ "tsup": "^8.5.0",
+- "typescript": "^5.8.2",
++ "typescript": "^5.9.2",
+ "vite": "^7.2.7",
+ "vitest": "^3.2.4"
+ },
+diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts
+index 291bdf7f..84343c04 100644
+--- a/packages/core/src/agent/agent.ts
++++ b/packages/core/src/agent/agent.ts
+@@ -48,6 +48,14 @@ import type { BaseRetriever } from "../retriever/retriever";
+ import type { Tool, Toolkit } from "../tool";
+ import { createTool } from "../tool";
+ import { ToolManager } from "../tool/manager";
++import {
++ type FallbackChainEntry,
++ type TrafficPriority,
++ type TrafficRequest,
++ type TrafficRequestMetadata,
++ getTrafficController,
++} from "../traffic/traffic-controller";
++import { findHeaders } from "../traffic/traffic-error-utils";
+ import { randomUUID } from "../utils/id";
+ import { convertModelMessagesToUIMessages } from "../utils/message-converter";
+ import { NodeType, createNodeId } from "../utils/node-utils";
+@@ -262,8 +270,42 @@ export interface BaseGenerationOptions extends Partial {
+ // Context
+ userId?: string;
+ conversationId?: string;
++ tenantId?: string;
++ /**
++ * Optional key metadata for per-key rate limits.
++ */
++ apiKeyId?: string;
++ /**
++ * Optional region metadata for per-region rate limits.
++ */
++ region?: string;
++ /**
++ * Optional endpoint metadata for per-endpoint rate limits.
++ */
++ endpoint?: string;
++ /**
++ * Optional tenant tier metadata for per-tier rate limits.
++ */
++ tenantTier?: string;
+ context?: ContextInput;
+ elicitation?: (request: unknown) => Promise<unknown>;
++ /**
++ * Optional priority override for scheduling.
++ * Defaults to agent-level priority when omitted.
++ */
++ trafficPriority?: TrafficPriority;
++ /**
++ * Optional maximum time to wait in the queue before timing out.
++ */
++ maxQueueWaitMs?: number;
++ /**
++ * Optional task classification for circuit-breaker fallback policies.
++ */
++ taskType?: string;
++ /**
++ * Optional explicit fallback policy id.
++ */
++ fallbackPolicyId?: string;
+
+ // Parent tracking
+ parentAgentId?: string;
+@@ -303,6 +345,8 @@ export interface BaseGenerationOptions extends Partial {
+
+ // Provider-specific options
+ providerOptions?: ProviderOptions;
++ // Optional per-call model override (used for fallbacks)
++ model?: LanguageModel;
+
+ // Experimental output (for structured generation)
+ experimental_output?: ReturnType | ReturnType;
+@@ -347,6 +391,7 @@ export class Agent {
+ readonly voice?: Voice;
+ readonly retriever?: BaseRetriever;
+ readonly supervisorConfig?: SupervisorConfig;
++ private readonly trafficPriority: TrafficPriority;
+ private readonly context?: Map<string | symbol, unknown>;
+
+ private readonly logger: Logger;
+@@ -372,6 +417,7 @@ export class Agent {
+ this.temperature = options.temperature;
+ this.maxOutputTokens = options.maxOutputTokens;
+ this.maxSteps = options.maxSteps || 5;
++ this.trafficPriority = options.trafficPriority ?? "P1";
+ this.stopWhen = options.stopWhen;
+ this.markdown = options.markdown ?? false;
+ this.voice = options.voice;
+@@ -444,6 +490,47 @@ export class Agent {
+ async generateText(
+ input: string | UIMessage[] | BaseMessage[],
+ options?: GenerateTextOptions,
+ ): Promise<GenerateTextResultWithContext> {
++ const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics
++ const tenantId = this.resolveTenantId(options);
++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => {
++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride);
++ const metadata = this.buildTrafficMetadata(
++ mergedOptions?.model,
++ mergedOptions,
++ providerOverride,
++ ); // Compute once per queued request (including per-call model overrides)
++ return {
++ tenantId,
++ metadata,
++ maxQueueWaitMs: options?.maxQueueWaitMs,
++ estimatedTokens: this.estimateTokens(input, mergedOptions),
++ execute: () => this.executeGenerateText(input, mergedOptions, metadata), // Defer actual execution so controller can schedule it
++ extractUsage: (result: GenerateTextResultWithContext) =>
++ this.extractUsageFromResponse(result),
++ createFallbackRequest: (fallbackTarget) => {
++ if (this.isShortResponseFallback(fallbackTarget)) {
++ return this.buildShortTextFallbackRequest(
++ tenantId,
++ metadata,
++ mergedOptions,
++ fallbackTarget.text,
++ );
++ }
++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } =
++ this.resolveFallbackTarget(fallbackTarget);
++ return buildRequest(fallbackModel, fallbackProvider);
++ },
++ };
++ };
++
++ return controller.handleText(buildRequest(options?.model));
++ }
++
++ private async executeGenerateText(
++ input: string | UIMessage[] | BaseMessage[],
++ options?: GenerateTextOptions,
++ trafficMetadata?: TrafficRequestMetadata,
+ ): Promise<GenerateTextResultWithContext> {
+ const startTime = Date.now();
+ const oc = this.createOperationContext(input, options);
+@@ -471,7 +558,7 @@ export class Agent {
+ options,
+ );
+
+- const modelName = this.getModelName();
++ const modelName = this.getModelName(model);
+ const contextLimit = options?.contextLimit;
+
+ // Add model attributes and all options
+@@ -544,10 +631,20 @@ export class Agent {
+ hooks,
+ maxSteps: userMaxSteps,
+ tools: userTools,
++ maxQueueWaitMs,
++ taskType,
++ fallbackPolicyId,
+ experimental_output,
+ providerOptions,
++ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries)
++ model: _model, // Exclude model so aiSDKOptions doesn't override resolved model
+ ...aiSDKOptions
+ } = options || {};
++ void _model;
++ void _maxRetries;
++ void maxQueueWaitMs;
++ void taskType;
++ void fallbackPolicyId;
+
+ const llmSpan = this.createLLMSpan(oc, {
+ operation: "generateText",
+@@ -567,6 +664,11 @@ export class Agent {
+
+ let result!: GenerateTextResult;
+ try {
++ methodLogger.info("[AI SDK] Calling generateText", {
++ messageCount: messages.length,
++ modelName,
++ tools: tools ? Object.keys(tools) : [],
++ });
+ result = await oc.traceContext.withSpan(llmSpan, () =>
+ generateText({
+ model,
+@@ -575,7 +677,7 @@ export class Agent {
+ // Default values
+ temperature: this.temperature,
+ maxOutputTokens: this.maxOutputTokens,
+- maxRetries: 3,
++ maxRetries: 0,
+ stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps),
+ // User overrides from AI SDK options
+ ...aiSDKOptions,
+@@ -588,7 +690,15 @@ export class Agent {
+ onStepFinish: this.createStepHandler(oc, options),
+ }),
+ );
++ methodLogger.info("[AI SDK] Received generateText result", {
++ finishReason: result.finishReason,
++ usage: result.usage ? safeStringify(result.usage) : undefined,
++ stepCount: result.steps?.length ?? 0,
++ rawResult: safeStringify(result),
++ });
++ this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger);
+ } catch (error) {
++ this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger);
+ finalizeLLMSpan(SpanStatusCode.ERROR, { message: (error as Error).message });
+ throw error;
+ }
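Note: conceptually, the wrapper hands the controller a self-describing unit of work. A sketch of the request shape built by buildRequest; runGenerateText and buildFallbackRequest are placeholders for the private methods above:

import type { TrafficRequest, TrafficRequestMetadata } from "@voltagent/core";

declare function runGenerateText(): Promise<unknown>; // stands in for executeGenerateText
declare function buildFallbackRequest(target: unknown): TrafficRequest<unknown>;
declare const metadata: TrafficRequestMetadata;

const request: TrafficRequest<unknown> = {
  tenantId: "acme",
  metadata, // agent, model, provider, priority tags
  maxQueueWaitMs: 5_000,
  estimatedTokens: 1_200, // input chars / 4 plus the output budget
  execute: () => runGenerateText(), // deferred so the controller schedules it
  extractUsage: () => undefined,
  createFallbackRequest: (target) => buildFallbackRequest(target),
};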
+@@ -771,6 +881,47 @@ export class Agent {
+ async streamText(
+ input: string | UIMessage[] | BaseMessage[],
+ options?: StreamTextOptions,
+ ): Promise<StreamTextResultWithContext> {
++ const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent
++ const tenantId = this.resolveTenantId(options);
++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => {
++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride);
++ const metadata = this.buildTrafficMetadata(
++ mergedOptions?.model,
++ mergedOptions,
++ providerOverride,
++ ); // Compute once per queued request (including per-call model overrides)
++ return {
++ tenantId,
++ metadata,
++ maxQueueWaitMs: options?.maxQueueWaitMs,
++ estimatedTokens: this.estimateTokens(input, mergedOptions),
++ execute: () => this.executeStreamText(input, mergedOptions, metadata), // Actual streaming work happens after the controller dequeues us
++ extractUsage: (result: StreamTextResultWithContext) =>
++ this.extractUsageFromResponse(result),
++ createFallbackRequest: (fallbackTarget) => {
++ if (this.isShortResponseFallback(fallbackTarget)) {
++ return this.buildShortStreamTextFallbackRequest(
++ tenantId,
++ metadata,
++ mergedOptions,
++ fallbackTarget.text,
++ );
++ }
++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } =
++ this.resolveFallbackTarget(fallbackTarget);
++ return buildRequest(fallbackModel, fallbackProvider);
++ },
++ };
++ };
++
++ return controller.handleStream(buildRequest(options?.model));
++ }
++
++ private async executeStreamText(
++ input: string | UIMessage[] | BaseMessage[],
++ options?: StreamTextOptions,
++ trafficMetadata?: TrafficRequestMetadata,
+ ): Promise<StreamTextResultWithContext> {
+ const startTime = Date.now();
+ const oc = this.createOperationContext(input, options);
+@@ -800,7 +951,7 @@ export class Agent {
+ options,
+ );
+
+- const modelName = this.getModelName();
++ const modelName = this.getModelName(model);
+ const contextLimit = options?.contextLimit;
+
+ // Add model attributes to root span if TraceContext exists
+@@ -868,10 +1019,20 @@ export class Agent {
+ maxSteps: userMaxSteps,
+ tools: userTools,
+ onFinish: userOnFinish,
++ maxQueueWaitMs,
++ taskType,
++ fallbackPolicyId,
+ experimental_output,
+ providerOptions,
++ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries)
++ model: _model, // Exclude model from aiSDKOptions to avoid overriding resolved model
+ ...aiSDKOptions
+ } = options || {};
++ void _model;
++ void _maxRetries;
++ void maxQueueWaitMs;
++ void taskType;
++ void fallbackPolicyId;
+
+ const guardrailStreamingEnabled = guardrailSet.output.length > 0;
+
+@@ -893,7 +1054,13 @@ export class Agent {
+ },
+ });
+ const finalizeLLMSpan = this.createLLMSpanFinalizer(llmSpan);
++ const trafficController = getTrafficController({ logger: this.logger });
+
++ methodLogger.info("[AI SDK] Calling streamText", {
++ messageCount: messages.length,
++ modelName,
++ tools: tools ? Object.keys(tools) : [],
++ });
+ const result = streamText({
+ model,
+ messages,
+@@ -901,7 +1068,7 @@ export class Agent {
+ // Default values
+ temperature: this.temperature,
+ maxOutputTokens: this.maxOutputTokens,
+- maxRetries: 3,
++ maxRetries: 0, // Retry via traffic controller to avoid provider-level storms
+ stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps),
+ // User overrides from AI SDK options
+ ...aiSDKOptions,
+@@ -937,6 +1104,8 @@ export class Agent {
+ modelName: this.getModelName(),
+ });
+
++ this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger);
++ trafficController.reportStreamFailure(trafficMetadata, actualError);
+ finalizeLLMSpan(SpanStatusCode.ERROR, { message: (actualError as Error)?.message });
+
+ // History update removed - using OpenTelemetry only
+@@ -962,6 +1131,18 @@ export class Agent {
+ .catch(() => {});
+ },
+ onFinish: async (finalResult) => {
++ methodLogger.info("[AI SDK] streamText finished", {
++ finishReason: finalResult.finishReason,
++ usage: finalResult.totalUsage ? safeStringify(finalResult.totalUsage) : undefined,
++ stepCount: finalResult.steps?.length ?? 0,
++ rawResult: safeStringify(finalResult),
++ });
++ this.updateTrafficControllerRateLimits(
++ finalResult.response,
++ trafficMetadata,
++ methodLogger,
++ );
++ trafficController.reportStreamSuccess(trafficMetadata);
+ const providerUsage = finalResult.usage
+ ? await Promise.resolve(finalResult.usage)
+ : undefined;
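Note the asymmetry with generateText: a stream's outcome is only known from its callbacks, so controller bookkeeping happens in onError/onFinish rather than around the call. Condensed from the wiring above (streamText, logger, and trafficMetadata assumed in scope):

// reportStreamFailure / reportStreamSuccess close out the queued request
// once the stream actually settles.
const trafficController = getTrafficController({ logger });
const result = streamText({
  // ...model, messages, tools...
  onError: ({ error }) => {
    trafficController.reportStreamFailure(trafficMetadata, error);
  },
  onFinish: async (finalResult) => {
    trafficController.reportStreamSuccess(trafficMetadata);
  },
});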
+@@ -1428,6 +1609,49 @@ export class Agent {
+ input: string | UIMessage[] | BaseMessage[],
+ schema: T,
+ options?: GenerateObjectOptions,
+ ): Promise<GenerateObjectResultWithContext<z.infer<T>>> {
++ const controller = getTrafficController({ logger: this.logger });
++ const tenantId = this.resolveTenantId(options);
++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => {
++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride);
++ const metadata = this.buildTrafficMetadata(
++ mergedOptions?.model,
++ mergedOptions,
++ providerOverride,
++ ); // Compute once per queued request (including per-call model overrides)
++ return {
++ tenantId,
++ metadata,
++ maxQueueWaitMs: options?.maxQueueWaitMs,
++ estimatedTokens: this.estimateTokens(input, mergedOptions),
++ execute: () => this.executeGenerateObject(input, schema, mergedOptions, metadata),
++ extractUsage: (result: GenerateObjectResultWithContext<z.infer<T>>) =>
++ this.extractUsageFromResponse(result),
++ createFallbackRequest: (fallbackTarget) => {
++ if (this.isShortResponseFallback(fallbackTarget)) {
++ return this.buildShortObjectFallbackRequest(
++ tenantId,
++ metadata,
++ schema,
++ mergedOptions,
++ fallbackTarget.text,
++ );
++ }
++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } =
++ this.resolveFallbackTarget(fallbackTarget);
++ return buildRequest(fallbackModel, fallbackProvider);
++ },
++ };
++ };
++
++ return controller.handleText(buildRequest(options?.model));
++ }
++
++ private async executeGenerateObject<T extends z.ZodType>(
++ input: string | UIMessage[] | BaseMessage[],
++ schema: T,
++ options?: GenerateObjectOptions,
++ trafficMetadata?: TrafficRequestMetadata,
+ ): Promise<GenerateObjectResultWithContext<z.infer<T>>> {
+ const startTime = Date.now();
+ const oc = this.createOperationContext(input, options);
+@@ -1452,7 +1676,7 @@ export class Agent {
+ options,
+ );
+
+- const modelName = this.getModelName();
++ const modelName = this.getModelName(model);
+ const schemaName = schema.description || "unknown";
+
+ // Add model attributes and all options
+@@ -1510,10 +1734,25 @@ export class Agent {
+ hooks,
+ maxSteps: userMaxSteps,
+ tools: userTools,
++ taskType,
++ fallbackPolicyId,
++ maxQueueWaitMs,
+ providerOptions,
++ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries)
++ model: _model, // Exclude model so spread does not override resolved model
+ ...aiSDKOptions
+ } = options || {};
+-
++ void _model;
++ void _maxRetries;
++ void taskType;
++ void fallbackPolicyId;
++ void maxQueueWaitMs;
++
++ methodLogger.info("[AI SDK] Calling generateObject", {
++ messageCount: messages.length,
++ modelName,
++ schemaName,
++ });
+ const result = await generateObject({
+ model,
+ messages,
+@@ -1522,7 +1761,7 @@ export class Agent {
+ // Default values
+ maxOutputTokens: this.maxOutputTokens,
+ temperature: this.temperature,
+- maxRetries: 3,
++ maxRetries: 0,
+ // User overrides from AI SDK options
+ ...aiSDKOptions,
+ // Provider-specific options
+@@ -1530,6 +1769,13 @@ export class Agent {
+ // VoltAgent controlled
+ abortSignal: oc.abortController.signal,
+ });
++ methodLogger.info("[AI SDK] Received generateObject result", {
++ finishReason: result.finishReason,
++ usage: result.usage ? safeStringify(result.usage) : undefined,
++ warnings: result.warnings,
++ rawResult: safeStringify(result),
++ });
++ this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger);
+
+ const usageInfo = convertUsage(result.usage);
+ const finalObject = await executeOutputGuardrails({
+@@ -1638,6 +1884,7 @@ export class Agent {
+ context: oc.context,
+ };
+ } catch (error) {
++ this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger);
+ await this.flushPendingMessagesOnError(oc).catch(() => {});
+ return this.handleError(error as Error, oc, options, startTime);
+ } finally {
+@@ -1655,6 +1902,49 @@ export class Agent {
+ input: string | UIMessage[] | BaseMessage[],
+ schema: T,
+ options?: StreamObjectOptions,
+ ): Promise<StreamObjectResultWithContext<z.infer<T>>> {
++ const controller = getTrafficController({ logger: this.logger });
++ const tenantId = this.resolveTenantId(options);
++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => {
++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride);
++ const metadata = this.buildTrafficMetadata(
++ mergedOptions?.model,
++ mergedOptions,
++ providerOverride,
++ ); // Compute once per queued request (including per-call model overrides)
++ return {
++ tenantId,
++ metadata,
++ maxQueueWaitMs: options?.maxQueueWaitMs,
++ estimatedTokens: this.estimateTokens(input, mergedOptions),
++ execute: () => this.executeStreamObject(input, schema, mergedOptions, metadata),
++ extractUsage: (result: StreamObjectResultWithContext<z.infer<T>>) =>
++ this.extractUsageFromResponse(result),
++ createFallbackRequest: (fallbackTarget) => {
++ if (this.isShortResponseFallback(fallbackTarget)) {
++ return this.buildShortStreamObjectFallbackRequest(
++ tenantId,
++ metadata,
++ schema,
++ mergedOptions,
++ fallbackTarget.text,
++ );
++ }
++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } =
++ this.resolveFallbackTarget(fallbackTarget);
++ return buildRequest(fallbackModel, fallbackProvider);
++ },
++ };
++ };
++
++ return controller.handleStream(buildRequest(options?.model));
++ }
++
++ private async executeStreamObject<T extends z.ZodType>(
++ input: string | UIMessage[] | BaseMessage[],
++ schema: T,
++ options?: StreamObjectOptions,
++ trafficMetadata?: TrafficRequestMetadata,
+ ): Promise<StreamObjectResultWithContext<z.infer<T>>> {
+ const startTime = Date.now();
+ const oc = this.createOperationContext(input, options);
+@@ -1680,7 +1970,7 @@ export class Agent {
+ options,
+ );
+
+- const modelName = this.getModelName();
++ const modelName = this.getModelName(model);
+ const schemaName = schema.description || "unknown";
+
+ // Add model attributes and all options
+@@ -1739,14 +2029,30 @@ export class Agent {
+ maxSteps: userMaxSteps,
+ tools: userTools,
+ onFinish: userOnFinish,
++ taskType,
++ fallbackPolicyId,
++ maxQueueWaitMs,
+ providerOptions,
++ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries)
++ model: _model, // Exclude model so aiSDKOptions cannot override resolved model
+ ...aiSDKOptions
+ } = options || {};
++ void _model;
++ void _maxRetries;
++ void taskType;
++ void fallbackPolicyId;
++ void maxQueueWaitMs;
+
+ let guardrailObjectPromise!: Promise<z.infer<T>>;
+ let resolveGuardrailObject: ((value: z.infer<T>) => void) | undefined;
+ let rejectGuardrailObject: ((reason: unknown) => void) | undefined;
++ const trafficController = getTrafficController({ logger: this.logger });
+
++ methodLogger.info("[AI SDK] Calling streamObject", {
++ messageCount: messages.length,
++ modelName,
++ schemaName,
++ });
+ const result = streamObject({
+ model,
+ messages,
+@@ -1755,7 +2061,7 @@ export class Agent {
+ // Default values
+ maxOutputTokens: this.maxOutputTokens,
+ temperature: this.temperature,
+- maxRetries: 3,
++ maxRetries: 0,
+ // User overrides from AI SDK options
+ ...aiSDKOptions,
+ // Provider-specific options
+@@ -1771,9 +2077,11 @@ export class Agent {
+ methodLogger.error("Stream object error occurred", {
+ error: actualError,
+ agentName: this.name,
+- modelName: this.getModelName(),
++ modelName: this.getModelName(model),
+ schemaName: schemaName,
+ });
++ this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger);
++ trafficController.reportStreamFailure(trafficMetadata, actualError);
+
+ // History update removed - using OpenTelemetry only
+
+@@ -1800,6 +2108,17 @@ export class Agent {
+ },
+ onFinish: async (finalResult: any) => {
+ try {
++ methodLogger.info("[AI SDK] streamObject finished", {
++ finishReason: finalResult.finishReason,
++ usage: finalResult.usage ? safeStringify(finalResult.usage) : undefined,
++ rawResult: safeStringify(finalResult),
++ });
++ this.updateTrafficControllerRateLimits(
++ finalResult.response,
++ trafficMetadata,
++ methodLogger,
++ );
++ trafficController.reportStreamSuccess(trafficMetadata);
+ const usageInfo = convertUsage(finalResult.usage as any);
+ let finalObject = finalResult.object as z.infer<T>;
+ if (guardrailSet.output.length > 0) {
+@@ -2021,8 +2340,9 @@ export class Agent {
+ // Calculate maxSteps (use provided option or calculate based on subagents)
+ const maxSteps = options?.maxSteps ?? this.calculateMaxSteps();
+
+- // Resolve dynamic values
+- const model = await this.resolveValue(this.model, oc);
++ // Resolve dynamic values (allow per-call model override for fallbacks)
++ const selectedModel = options?.model ?? this.model;
++ const model = await this.resolveValue(selectedModel, oc);
+ const dynamicToolList = (await this.resolveValue(this.dynamicTools, oc)) || [];
+
+ // Merge agent tools with option tools
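Note: because the model is now resolved per call, a fallback chain (or any caller) can swap models without touching the agent definition. For example:

import { openai } from "@ai-sdk/openai";

// Per-call override: this request runs on gpt-4o-mini even if the agent
// was constructed with a different default model.
const result = await agent.generateText("Draft a status update", {
  model: openai("gpt-4o-mini"),
});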
+@@ -2073,6 +2393,12 @@ export class Agent {
+ ): OperationContext {
+ const operationId = randomUUID();
+ const startTimeDate = new Date();
++ const priority = this.resolveTrafficPriority(options);
++ const tenantId = this.resolveTenantId(options);
++ const apiKeyId = options?.apiKeyId ?? options?.parentOperationContext?.apiKeyId;
++ const region = options?.region ?? options?.parentOperationContext?.region;
++ const endpoint = options?.endpoint ?? options?.parentOperationContext?.endpoint;
++ const tenantTier = options?.tenantTier ?? options?.parentOperationContext?.tenantTier;
+
+ // Prefer reusing an existing context instance to preserve reference across calls/subagents
+ const runtimeContext = toContextMap(options?.context);
+@@ -2123,6 +2449,7 @@ export class Agent {
+ operationId,
+ userId: options?.userId,
+ conversationId: options?.conversationId,
++ tenantId,
+ executionId: operationId,
+ });
+
+@@ -2137,6 +2464,9 @@ export class Agent {
+ parentAgentId: options?.parentAgentId,
+ input,
+ });
++ if (tenantId) {
++ traceContext.getRootSpan().setAttribute("tenant.id", tenantId);
++ }
+ traceContext.getRootSpan().setAttribute("voltagent.operation_id", operationId);
+
+ // Use parent's AbortController if available, otherwise create new one
+@@ -2174,8 +2504,14 @@ export class Agent {
+ logger,
+ conversationSteps: options?.parentOperationContext?.conversationSteps || [],
+ abortController,
++ priority,
+ userId: options?.userId,
+ conversationId: options?.conversationId,
++ tenantId,
++ apiKeyId,
++ region,
++ endpoint,
++ tenantTier,
+ parentAgentId: options?.parentAgentId,
+ traceContext,
+ startTime: startTimeDate,
+@@ -3170,6 +3506,20 @@ export class Agent {
+ return value;
+ }
+
++ private mergeOptionsWithModel(
++ options: BaseGenerationOptions | undefined,
++ modelOverride?: LanguageModel,
++ ): BaseGenerationOptions | undefined {
++ if (!options && modelOverride === undefined) {
++ return undefined;
++ }
++
++ return {
++ ...(options ?? {}),
++ ...(modelOverride !== undefined ? { model: modelOverride } : {}),
++ };
++ }
++
+ /**
+ * Prepare tools with execution context
+ */
+@@ -3822,17 +4172,622 @@ export class Agent {
+ return this.subAgentManager.calculateMaxSteps(this.maxSteps);
+ }
+
++ private resolveTrafficPriority(options?: BaseGenerationOptions): TrafficPriority {
++ const normalize = (value?: TrafficPriority): TrafficPriority | undefined => {
++ if (value === "P0" || value === "P1" || value === "P2") {
++ return value;
++ }
++ return undefined;
++ };
++
++ const parentPriority = normalize(options?.parentOperationContext?.priority);
++ const localPriority = normalize(options?.trafficPriority) ?? this.trafficPriority ?? "P1";
++
++ if (parentPriority) {
++ return this.pickHigherPriority(parentPriority, localPriority);
++ }
++
++ return localPriority;
++ }
++
++ private resolveTenantId(options?: BaseGenerationOptions): string {
++ const parentTenant = options?.parentOperationContext?.tenantId;
++ if (parentTenant) {
++ return parentTenant;
++ }
++
++ if (options?.tenantId) {
++ return options.tenantId;
++ }
++
++ return "default";
++ }
++
++ private pickHigherPriority(a: TrafficPriority, b: TrafficPriority): TrafficPriority {
++ const rank: Record<TrafficPriority, number> = { P0: 0, P1: 1, P2: 2 };
++ return rank[a] <= rank[b] ? a : b;
++ }
++
++ private buildTrafficMetadata(
++ modelOverride?: LanguageModel | DynamicValue<LanguageModel>,
++ options?: BaseGenerationOptions,
++ providerOverride?: string,
++ ): TrafficRequestMetadata {
++ const provider =
++ providerOverride ??
++ this.resolveProvider(modelOverride) ??
++ this.resolveProvider(this.model) ??
++ undefined;
++ const priority = this.resolveTrafficPriority(options);
++ const apiKeyId = options?.apiKeyId ?? options?.parentOperationContext?.apiKeyId;
++ const region = options?.region ?? options?.parentOperationContext?.region;
++ const endpoint = options?.endpoint ?? options?.parentOperationContext?.endpoint;
++ const tenantTier = options?.tenantTier ?? options?.parentOperationContext?.tenantTier;
++
++ return {
++ agentId: this.id, // Identify which agent issued the request
++ agentName: this.name, // Human-readable label for logs/metrics
++ model: this.getModelName(modelOverride), // Used for future capacity policies
++ provider, // Allows per-provider throttling later
++ priority,
++ tenantId: this.resolveTenantId(options),
++ apiKeyId,
++ region,
++ endpoint,
++ tenantTier,
++ taskType: options?.taskType,
++ fallbackPolicyId: options?.fallbackPolicyId,
++ };
++ }
++
++ private estimateTokens(
++ input: string | UIMessage[] | BaseMessage[],
++ options?: BaseGenerationOptions,
++ ): number | undefined {
++ let text = "";
++ if (typeof input === "string") {
++ text = input;
++ } else if (Array.isArray(input)) {
++ text = input
++ .map((message) => {
++ if (typeof message === "string") return message;
++ if (message && typeof message === "object") {
++ const content = (message as { content?: unknown }).content;
++ if (typeof content === "string") return content;
++ if (content !== undefined) return safeStringify(content);
++ return safeStringify(message);
++ }
++ return String(message ?? "");
++ })
++ .join(" ");
++ } else if (input) {
++ text = safeStringify(input);
++ }
++
++ const inputTokens = text ? Math.ceil(text.length / 4) : 0;
++ const outputTokensRaw =
++ typeof options?.maxOutputTokens === "number" ? options.maxOutputTokens : this.maxOutputTokens;
++ const outputTokens =
++ typeof outputTokensRaw === "number" && Number.isFinite(outputTokensRaw)
++ ? Math.max(0, Math.floor(outputTokensRaw))
++ : 0;
++ const total = inputTokens + outputTokens;
++ return total > 0 ? total : undefined;
++ }
++
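Note: the estimate is deliberately crude — about one token per four input characters, plus the configured output budget. Worked through:

const text = "Hello, world!"; // 13 characters
const inputTokens = Math.ceil(text.length / 4); // 4
const maxOutputTokens = 256; // agent-level or per-call budget
const estimate = inputTokens + maxOutputTokens; // 260 tokens reserved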
++ private resolveFallbackTarget(target: FallbackChainEntry): {
++ modelOverride?: LanguageModel;
++ providerOverride?: string;
++ } {
++ if (typeof target === "string") {
++ return { modelOverride: target };
++ }
++ return {
++ modelOverride: target.model,
++ providerOverride: target.provider,
++ };
++ }
++
++ private isShortResponseFallback(
++ target: FallbackChainEntry,
++ ): target is { kind: "short-response"; text: string } {
++ return (
++ typeof target === "object" &&
++ target !== null &&
++ "kind" in target &&
++ (target as { kind?: string }).kind === "short-response"
++ );
++ }
++
++ private buildShortResponseMetadata(
++ baseMetadata: TrafficRequestMetadata | undefined,
++ ): TrafficRequestMetadata {
++ const metadata = baseMetadata ?? this.buildTrafficMetadata();
++ return {
++ ...metadata,
++ provider: "short-response",
++ model: "short-response",
++ };
++ }
++
++ private createZeroUsage(): LanguageModelUsage {
++ return { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
++ }
++
++ private createShortTextStream(text: string): AsyncIterableStream<string> {
++ return createAsyncIterableReadable((controller) => {
++ controller.enqueue(text);
++ controller.close();
++ });
++ }
++
++ private createShortFullStream(text: string): AsyncIterableStream<VoltAgentTextStreamPart> {
++ const usage = this.createZeroUsage();
++ const id = `short-response-${randomUUID()}`;
++ return createAsyncIterableReadable((controller) => {
++ controller.enqueue({
++ type: "text-delta",
++ id,
++ delta: text,
++ text,
++ } as VoltAgentTextStreamPart);
++ controller.enqueue({
++ type: "finish",
++ finishReason: "stop",
++ usage,
++ totalUsage: usage,
++ } as VoltAgentTextStreamPart);
++ controller.close();
++ });
++ }
++
++ private createShortTextResult(
++ text: string,
++ options?: GenerateTextOptions,
++ ): GenerateTextResultWithContext {
++ const usage = this.createZeroUsage();
++ const context = toContextMap(options?.context) ?? new Map();
++ const createTextStream = (): AsyncIterableStream<string> => this.createShortTextStream(text);
++
++ return {
++ text,
++ content: [],
++ reasoning: [],
++ reasoningText: "",
++ files: [],
++ sources: [],
++ toolCalls: [],
++ staticToolCalls: [],
++ dynamicToolCalls: [],
++ toolResults: [],
++ staticToolResults: [],
++ dynamicToolResults: [],
++ usage,
++ totalUsage: usage,
++ warnings: [],
++ finishReason: "stop",
++ steps: [],
++ experimental_output: undefined,
++ response: {
++ id: "short-response",
++ modelId: "short-response",
++ timestamp: new Date(),
++ messages: [],
++ },
++ context,
++ request: {
++ body: {},
++ },
++ providerMetadata: undefined,
++ experimental_providerMetadata: undefined,
++ pipeTextStreamToResponse: (response, init) => {
++ pipeTextStreamToResponse({
++ response,
++ textStream: createTextStream(),
++ ...(init ?? {}),
++ });
++ },
++ toTextStreamResponse: (init) => {
++ return createTextStreamResponse({
++ textStream: createTextStream(),
++ ...(init ?? {}),
++ });
++ },
++ toDataStream: () => createTextStream(),
++ toDataStreamResponse: (init) => {
++ return createTextStreamResponse({
++ textStream: createTextStream(),
++ ...(init ?? {}),
++ });
++ },
++ pipeDataStreamToResponse: (response, init) => {
++ pipeTextStreamToResponse({
++ response,
++ textStream: createTextStream(),
++ ...(init ?? {}),
++ });
++ },
++ } as GenerateTextResultWithContext;
++ }
++
++ private createShortStreamTextResult(
++ text: string,
++ options?: StreamTextOptions,
++ ): StreamTextResultWithContext {
++ const usage = this.createZeroUsage();
++ const context = toContextMap(options?.context) ?? new Map();
++ const createTextStream = (): AsyncIterableStream<string> => this.createShortTextStream(text);
++ const createFullStream = (): AsyncIterableStream<VoltAgentTextStreamPart> =>
++ this.createShortFullStream(text);
++
++ const toUIMessageStream = (_options?: unknown) =>
++ createUIMessageStream({
++ execute: async ({ writer }) => {
++ writer.write({ type: "text", text } as any);
++ },
++ onError: (error) => String(error),
++ });
++
++ const toUIMessageStreamResponse = (options?: ResponseInit) => {
++ const stream = toUIMessageStream(options);
++ const responseInit = options ? { ...options } : {};
++ return createUIMessageStreamResponse({
++ stream,
++ ...responseInit,
++ });
++ };
++
++ const pipeUIMessageStreamToResponse = (response: any, init?: ResponseInit) => {
++ const stream = toUIMessageStream(init);
++ const initOptions = init ? { ...init } : {};
++ pipeUIMessageStreamToResponse({
++ response,
++ stream,
++ ...initOptions,
++ });
++ };
++
++ return {
++ text: Promise.resolve(text),
++ get textStream() {
++ return createTextStream();
++ },
++ get fullStream() {
++ return createFullStream();
++ },
++ usage: Promise.resolve(usage),
++ finishReason: Promise.resolve("stop"),
++ experimental_partialOutputStream: undefined,
++ toUIMessageStream: toUIMessageStream as StreamTextResultWithContext["toUIMessageStream"],
++ toUIMessageStreamResponse:
++ toUIMessageStreamResponse as StreamTextResultWithContext["toUIMessageStreamResponse"],
++ pipeUIMessageStreamToResponse:
++ pipeUIMessageStreamToResponse as StreamTextResultWithContext["pipeUIMessageStreamToResponse"],
++ pipeTextStreamToResponse: (response, init) => {
++ pipeTextStreamToResponse({
++ response,
++ textStream: createTextStream(),
++ ...(init ?? {}),
++ });
++ },
++ toTextStreamResponse: (init) => {
++ return createTextStreamResponse({
++ textStream: createTextStream(),
++ ...(init ?? {}),
++ });
++ },
++ context,
++ };
++ }
++
++ private resolveShortResponseObject<T extends z.ZodType>(schema: T, text: string): z.infer<T> {
++ const candidates: unknown[] = [];
++ if (text.length > 0) {
++ try {
++ candidates.push(JSON.parse(text));
++ } catch {}
++ }
++ candidates.push(text);
++ candidates.push({ text });
++ for (const candidate of candidates) {
++ const parsed = schema.safeParse(candidate);
++ if (parsed.success) {
++ return parsed.data;
++ }
++ }
++ return (candidates[0] ?? text) as z.infer<T>;
++ }
++
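Note: the candidate order matters — parsed JSON first, then the raw string, then a { text } wrapper — and the first candidate the schema accepts wins. For example, a plain-prose short response fails JSON.parse and the raw-string candidate against an object schema, but the wrapper succeeds:

import { z } from "zod";

const schema = z.object({ text: z.string() });
// "service degraded" is not JSON and not an object, so the { text }
// wrapper is the first candidate that satisfies the schema.
const parsed = schema.safeParse({ text: "service degraded" });
console.log(parsed.success); // true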
++ private createShortObjectResult<T extends z.ZodType>(
++ schema: T,
++ text: string,
++ options?: GenerateObjectOptions,
++ ): GenerateObjectResultWithContext<z.infer<T>> {
++ const object = this.resolveShortResponseObject(schema, text);
++ const usage = this.createZeroUsage();
++ const context = toContextMap(options?.context) ?? new Map();
++
++ return {
++ object,
++ usage,
++ warnings: [],
++ finishReason: "stop",
++ response: {
++ id: "short-response",
++ modelId: "short-response",
++ timestamp: new Date(),
++ messages: [],
++ },
++ context,
++ request: {
++ body: {},
++ },
++ reasoning: "",
++ providerMetadata: undefined,
++ toJsonResponse: (init?: ResponseInit) => {
++ const responseInit = init ? { ...init } : {};
++ const headers = {
++ "content-type": "application/json",
++ ...(responseInit.headers ?? {}),
++ };
++ return new Response(safeStringify(object), {
++ ...responseInit,
++ headers,
++ });
++ },
++ } as GenerateObjectResultWithContext<z.infer<T>>;
++ }
++
++ private createShortStreamObjectResult<T extends z.ZodType>(
++ schema: T,
++ text: string,
++ options?: StreamObjectOptions,
++ ): StreamObjectResultWithContext<z.infer<T>> {
++ const object = this.resolveShortResponseObject(schema, text);
++ const usage = this.createZeroUsage();
++ const context = toContextMap(options?.context) ?? new Map();
++ const textPayload = safeStringify(object);
++ const createTextStream = (): AsyncIterableStream<string> =>
++ this.createShortTextStream(textPayload);
++
++ const partialObjectStream = new ReadableStream<Partial<z.infer<T>>>({
++ start(controller) {
++ controller.enqueue(object);
++ controller.close();
++ },
++ });
++
++ return {
++ object: Promise.resolve(object),
++ partialObjectStream,
++ textStream: createTextStream(),
++ warnings: Promise.resolve(undefined),
++ usage: Promise.resolve(usage),
++ finishReason: Promise.resolve("stop"),
++ pipeTextStreamToResponse: (response, init) => {
++ pipeTextStreamToResponse({
++ response,
++ textStream: createTextStream(),
++ ...(init ?? {}),
++ });
++ },
++ toTextStreamResponse: (init) => {
++ return createTextStreamResponse({
++ textStream: createTextStream(),
++ ...(init ?? {}),
++ });
++ },
++ context,
++ };
++ }
++
++ private buildShortTextFallbackRequest(
++ tenantId: string,
++ metadata: TrafficRequestMetadata | undefined,
++ options: GenerateTextOptions | undefined,
++ text: string,
++ ): TrafficRequest<GenerateTextResultWithContext> {
++ const shortMetadata = this.buildShortResponseMetadata(metadata);
++ return {
++ tenantId,
++ metadata: shortMetadata,
++ maxQueueWaitMs: options?.maxQueueWaitMs,
++ estimatedTokens: 0,
++ execute: async () => this.createShortTextResult(text, options),
++ extractUsage: (result: GenerateTextResultWithContext) =>
++ this.extractUsageFromResponse(result),
++ };
++ }
++
++ private buildShortStreamTextFallbackRequest(
++ tenantId: string,
++ metadata: TrafficRequestMetadata | undefined,
++ options: StreamTextOptions | undefined,
++ text: string,
++ ): TrafficRequest<StreamTextResultWithContext> {
++ const shortMetadata = this.buildShortResponseMetadata(metadata);
++ return {
++ tenantId,
++ metadata: shortMetadata,
++ maxQueueWaitMs: options?.maxQueueWaitMs,
++ estimatedTokens: 0,
++ execute: async () => this.createShortStreamTextResult(text, options),
++ extractUsage: (result: StreamTextResultWithContext) => this.extractUsageFromResponse(result),
++ };
++ }
++
++ private buildShortObjectFallbackRequest<T extends z.ZodType>(
++ tenantId: string,
++ metadata: TrafficRequestMetadata | undefined,
++ schema: T,
++ options: GenerateObjectOptions | undefined,
++ text: string,
++ ): TrafficRequest<GenerateObjectResultWithContext<z.infer<T>>> {
++ const shortMetadata = this.buildShortResponseMetadata(metadata);
++ return {
++ tenantId,
++ metadata: shortMetadata,
++ maxQueueWaitMs: options?.maxQueueWaitMs,
++ estimatedTokens: 0,
++ execute: async () => this.createShortObjectResult(schema, text, options),
++ extractUsage: (result: GenerateObjectResultWithContext<z.infer<T>>) =>
++ this.extractUsageFromResponse(result),
++ };
++ }
++
++ private buildShortStreamObjectFallbackRequest<T extends z.ZodType>(
++ tenantId: string,
++ metadata: TrafficRequestMetadata | undefined,
++ schema: T,
++ options: StreamObjectOptions | undefined,
++ text: string,
++ ): TrafficRequest<StreamObjectResultWithContext<z.infer<T>>> {
++ const shortMetadata = this.buildShortResponseMetadata(metadata);
++ return {
++ tenantId,
++ metadata: shortMetadata,
++ maxQueueWaitMs: options?.maxQueueWaitMs,
++ estimatedTokens: 0,
++ execute: async () => this.createShortStreamObjectResult(schema, text, options),
++ extractUsage: (result: StreamObjectResultWithContext<z.infer<T>>) =>
++ this.extractUsageFromResponse(result),
++ };
++ }
++
++ private updateTrafficControllerRateLimits(
++ response: unknown,
++ metadata: TrafficRequestMetadata | undefined,
++ logger?: Logger,
++ ): void {
++ const headerCandidates = findHeaders(response);
++ if (headerCandidates.length === 0) {
++ logger?.debug?.("[Traffic] No headers found for rate limit update");
++ return;
++ }
++
++ const controller = getTrafficController();
++ const effectiveMetadata = metadata ?? this.buildTrafficMetadata();
++ let updateResult: ReturnType<typeof controller.updateRateLimitFromHeaders> | undefined;
++ for (const headers of headerCandidates) {
++ updateResult = controller.updateRateLimitFromHeaders(effectiveMetadata, headers);
++ if (updateResult) break;
++ }
++
++ if (!updateResult) {
++ logger?.debug?.("[Traffic] No rate limit headers applied from response");
++ return;
++ }
++
++ const now = Date.now();
++ const effectiveRemaining = Math.max(
++ 0,
++ updateResult.state.remaining - updateResult.state.reserved,
++ );
++ const resetInMs = Math.max(0, updateResult.state.resetAt - now);
++ const nextAllowedInMs = Math.max(0, updateResult.state.nextAllowedAt - now);
++ logger?.info?.("[Traffic] Applied rate limit from response headers", {
++ rateLimitKey: updateResult.key,
++ limit: updateResult.state.limit,
++ remaining: updateResult.state.remaining,
++ reserved: updateResult.state.reserved,
++ effectiveRemaining,
++ resetAt: updateResult.state.resetAt,
++ resetInMs,
++ nextAllowedAt: updateResult.state.nextAllowedAt,
++ nextAllowedInMs,
++ headers: {
++ limitRequests: updateResult.headerSnapshot.limitRequests,
++ remainingRequests: updateResult.headerSnapshot.remainingRequests,
++ resetRequestsMs: updateResult.headerSnapshot.resetRequestsMs,
++ },
++ });
++ }
++
++ private extractUsageFromResponse(
++ result:
++ | {
++ usage?: LanguageModelUsage | Promise<LanguageModelUsage>;
++ totalUsage?: LanguageModelUsage | Promise<LanguageModelUsage>;
++ }
++ | undefined,
++ ): Promise<LanguageModelUsage | undefined> | LanguageModelUsage | undefined {
++ if (!result) {
++ return undefined;
++ }
++
++ const usageCandidate =
++ (result as { totalUsage?: LanguageModelUsage | Promise<LanguageModelUsage> })
++ ?.totalUsage ??
++ (result as { usage?: LanguageModelUsage | Promise<LanguageModelUsage> })?.usage;
++
++ if (!usageCandidate) {
++ return undefined;
++ }
++
++ const normalizeUsage = (
++ usage: LanguageModelUsage | undefined,
++ ): LanguageModelUsage | undefined => {
++ if (!usage) return undefined;
++ const input = Number.isFinite(usage.inputTokens) ? (usage.inputTokens as number) : undefined;
++ const output = Number.isFinite(usage.outputTokens)
++ ? (usage.outputTokens as number)
++ : undefined;
++ const total = Number.isFinite(usage.totalTokens) ? (usage.totalTokens as number) : undefined;
++
++ if (total === undefined && input === undefined && output === undefined) {
++ return undefined;
++ }
++
++ const safeInput = input ?? 0;
++ const safeOutput = output ?? 0;
++ const safeTotal = total ?? safeInput + safeOutput;
++
++ return {
++ ...usage,
++ inputTokens: safeInput,
++ outputTokens: safeOutput,
++ totalTokens: safeTotal,
++ };
++ };
++
++ if (
++ typeof (usageCandidate as PromiseLike<LanguageModelUsage>).then === "function"
++ ) {
++ return (usageCandidate as Promise<LanguageModelUsage>)
++ .then((usage) => normalizeUsage(usage))
++ .catch(() => undefined);
++ }
++
++ return normalizeUsage(usageCandidate as LanguageModelUsage);
++ }
++
++ private resolveProvider(
++ model: LanguageModel | DynamicValue<LanguageModel> | undefined,
++ ): string | undefined {
++ if (
++ model &&
++ typeof model === "object" &&
++ "provider" in model &&
++ typeof (model as any).provider === "string"
++ ) {
++ return (model as any).provider;
++ }
++
++ return undefined;
++ }
++
+ /**
+ * Get the model name
+ */
+- public getModelName(): string {
+- if (typeof this.model === "function") {
++ public getModelName(modelOverride?: LanguageModel | DynamicValue<LanguageModel>): string {
++ const selectedModel = modelOverride ?? this.model;
++ if (typeof selectedModel === "function") {
+ return "dynamic";
+ }
+- if (typeof this.model === "string") {
+- return this.model;
++ if (typeof selectedModel === "string") {
++ return selectedModel;
+ }
+- return this.model.modelId || "unknown";
++ return selectedModel.modelId || "unknown";
+ }
+
+ /**
+diff --git a/packages/core/src/agent/eval.ts b/packages/core/src/agent/eval.ts
+index 9e4fe9f2..de712505 100644
+--- a/packages/core/src/agent/eval.ts
++++ b/packages/core/src/agent/eval.ts
+@@ -711,6 +711,7 @@ function buildEvalPayload(
+ rawOutput: output,
+ userId: oc.userId,
+ conversationId: oc.conversationId,
++ tenantId: oc.tenantId,
+ traceId: spanContext.traceId,
+ spanId: spanContext.spanId,
+ metadata,
+diff --git a/packages/core/src/agent/types.ts b/packages/core/src/agent/types.ts
+index dd5fb29d..c70bd478 100644
+--- a/packages/core/src/agent/types.ts
++++ b/packages/core/src/agent/types.ts
+@@ -29,6 +29,7 @@ import type { Logger } from "@voltagent/internal";
+ import type { LocalScorerDefinition, SamplingPolicy } from "../eval/runtime";
+ import type { MemoryOptions, MemoryStorageMetadata, WorkingMemorySummary } from "../memory/types";
+ import type { VoltAgentObservability } from "../observability";
++import type { TrafficPriority } from "../traffic/traffic-controller";
+ import type {
+ DynamicValue,
+ DynamicValueOptions,
+@@ -456,6 +457,11 @@ export type AgentOptions = {
+ temperature?: number;
+ maxOutputTokens?: number;
+ maxSteps?: number;
++ /**
++ * Default scheduling priority for this agent's LLM calls.
++ * Defaults to P1 when unspecified.
++ */
++ trafficPriority?: TrafficPriority;
+ /**
+ * Default stop condition for step execution (ai-sdk `stopWhen`).
+ * Per-call `stopWhen` in method options overrides this.
+@@ -493,6 +499,7 @@ export interface AgentEvalPayload {
+ rawOutput?: unknown;
+ userId?: string;
+ conversationId?: string;
++ tenantId?: string;
+ traceId: string;
+ spanId: string;
+ metadata?: Record<string, unknown>;
+@@ -890,6 +897,21 @@ export type OperationContext = {
+ /** Optional conversation identifier associated with this operation */
+ conversationId?: string;
+
++ /** Optional tenant identifier propagated across nested operations */
++ tenantId?: string;
++
++ /** Optional key identifier for per-key traffic limits */
++ apiKeyId?: string;
++
++ /** Optional region identifier for per-region traffic limits */
++ region?: string;
++
++ /** Optional endpoint identifier for per-endpoint traffic limits */
++ endpoint?: string;
++
++ /** Optional tenant tier identifier for per-tier traffic limits */
++ tenantTier?: string;
++
+ /** User-managed context map for this operation */
+ readonly context: Map;
+
+@@ -914,6 +936,9 @@ export type OperationContext = {
+ /** Conversation steps for building full message history including tool calls/results */
+ conversationSteps?: StepWithContent[];
+
++ /** Scheduling priority propagated from parent calls */
++ priority?: TrafficPriority;
++
+ /** AbortController for cancelling the operation and accessing the signal */
+ abortController: AbortController;
+
+diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
+index 8753f039..9dee4333 100644
+--- a/packages/core/src/index.ts
++++ b/packages/core/src/index.ts
+@@ -21,6 +21,30 @@ export type {
+ WorkflowTimelineEvent,
+ RegisteredWorkflow,
+ } from "./workflow";
++export {
++ // Surface traffic controller so downstream consumers can route agent calls through the shared scheduler
++ TrafficController,
++ CircuitBreakerOpenError,
++ QueueWaitTimeoutError,
++ RateLimitedUpstreamError,
++ getTrafficController,
++ type FallbackChainEntry,
++ type FallbackPolicy,
++ type FallbackPolicyConfig,
++ type FallbackPolicyMode,
++ type FallbackTarget,
++ type RateLimitConfig,
++ type RateLimitKey,
++ type RateLimitOptions,
++ type AdaptiveLimiterConfig,
++ type PriorityWeights,
++ type PriorityBurstLimits,
++ type TrafficRequest,
++ type TrafficRequestMetadata,
++ type TrafficResponseMetadata,
++ type TrafficPriority,
++ type TrafficRequestType,
++} from "./traffic/traffic-controller";
+ // Export new Agent from agent.ts
+ export {
+ Agent,
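Note: with these exports, downstream packages can reach the same process-wide controller the agents use. A sketch:

import { getTrafficController, type TrafficPriority } from "@voltagent/core";

// Singleton: queues, metrics, and rate-limit state are shared with every
// Agent call in this runtime.
const controller = getTrafficController();
const priority: TrafficPriority = "P0";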
+diff --git a/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts
+new file mode 100644
+index 00000000..652b7e59
+--- /dev/null
++++ b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts
+@@ -0,0 +1,243 @@
++import type { Logger } from "../../logger";
++import {
++ RATE_LIMIT_EXHAUSTION_BUFFER,
++ RATE_LIMIT_MIN_PACE_INTERVAL_MS,
++ RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS,
++ RATE_LIMIT_PROBE_DELAY_MS,
++} from "../traffic-constants";
++import type {
++ DispatchDecision,
++ QueuedRequest,
++ RateLimitWindowState,
++} from "../traffic-controller-internal";
++import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils";
++import type { TrafficRequestMetadata } from "../traffic-types";
++import type {
++ RateLimitHeaderSnapshot,
++ RateLimitStrategy,
++ RateLimitUpdateResult,
++} from "./rate-limit-strategy";
++import { parseResetDurationToMs } from "./rate-limit-utils";
++
++export class DefaultRateLimitStrategy implements RateLimitStrategy {
++ private state?: RateLimitWindowState;
++ private readonly key: string;
++
++ constructor(key: string) {
++ this.key = key;
++ }
++
++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
++ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
++ const state = this.state;
++ if (!state) {
++ rateLimitLogger?.trace?.("Rate limit state missing; allow request", {
++ rateLimitKey: this.key,
++ });
++ return null;
++ }
++
++ const now = Date.now();
++ const effectiveRemaining = Math.max(0, state.remaining - state.reserved);
++ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS;
++
++ if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) {
++ if (now < probeAt) {
++ rateLimitLogger?.debug?.("Rate limit exhausted; waiting for probe", {
++ rateLimitKey: this.key,
++ remaining: state.remaining,
++ reserved: state.reserved,
++ effectiveRemaining,
++ resetAt: state.resetAt,
++ probeAt,
++ });
++ return { kind: "wait", wakeUpAt: probeAt };
++ }
++ if (state.reserved > 0) {
++ rateLimitLogger?.debug?.("Rate limit exhausted but in-flight reservations exist; waiting", {
++ rateLimitKey: this.key,
++ remaining: state.remaining,
++ reserved: state.reserved,
++ effectiveRemaining,
++ resetAt: state.resetAt,
++ });
++ return { kind: "wait" };
++ }
++ }
++
++ if (now < state.nextAllowedAt) {
++ rateLimitLogger?.debug?.("Rate limit pacing; waiting until nextAllowedAt", {
++ rateLimitKey: this.key,
++ nextAllowedAt: state.nextAllowedAt,
++ resetAt: state.resetAt,
++ waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now,
++ });
++ return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) };
++ }
++
++ state.reserved += 1;
++ next.rateLimitKey = this.key;
++ rateLimitLogger?.trace?.("Reserved rate limit token", {
++ rateLimitKey: this.key,
++ reserved: state.reserved,
++ remaining: state.remaining,
++ resetAt: state.resetAt,
++ nextAllowedAt: state.nextAllowedAt,
++ });
++
++ const remainingWindowMs = Math.max(0, state.resetAt - now);
++ const intervalMs = Math.max(
++ RATE_LIMIT_MIN_PACE_INTERVAL_MS,
++ Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)),
++ );
++
++ const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs);
++ if (
++ state.nextAllowedAt <= now ||
++ candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS
++ ) {
++ state.nextAllowedAt = candidateNext;
++ rateLimitLogger?.trace?.("Updated pacing nextAllowedAt", {
++ rateLimitKey: this.key,
++ nextAllowedAt: state.nextAllowedAt,
++ intervalMs,
++ remainingWindowMs,
++ effectiveRemaining,
++ });
++ }
++
++ return null;
++ }
++
++ onDispatch(_logger?: Logger): void {}
++
++ onComplete(logger?: Logger): void {
++ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
++ const state = this.state;
++ if (!state || state.reserved <= 0) return;
++ state.reserved -= 1;
++ rateLimitLogger?.trace?.("Released rate limit reservation", {
++ rateLimitKey: this.key,
++ reserved: state.reserved,
++ remaining: state.remaining,
++ resetAt: state.resetAt,
++ nextAllowedAt: state.nextAllowedAt,
++ });
++ }
++
++ updateFromHeaders(
++ _metadata: TrafficRequestMetadata | undefined,
++ headers: unknown,
++ logger?: Logger,
++ ): RateLimitUpdateResult | undefined {
++ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
++ const limitRequests = readHeaderValue(headers, "x-ratelimit-limit-requests");
++ const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests");
++ const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests");
++ const retryAfter = readHeaderValue(headers, "retry-after");
++ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined;
++
++ const now = Date.now();
++ const existing = this.state;
++ let state: RateLimitWindowState | undefined;
++ let headerSnapshot: RateLimitHeaderSnapshot | undefined;
++
++ if (limitRequests && remainingRequests && resetRequests) {
++ const limit = Number(limitRequests);
++ const remaining = Number(remainingRequests);
++ if (!Number.isFinite(limit) || !Number.isFinite(remaining)) {
++ rateLimitLogger?.debug?.("Invalid rate limit numeric headers; skipping", {
++ rateLimitKey: this.key,
++ limitRequests,
++ remainingRequests,
++ });
++ return undefined;
++ }
++
++ const resetRequestsMs = parseResetDurationToMs(resetRequests);
++ if (resetRequestsMs === undefined) {
++ rateLimitLogger?.debug?.("Unable to parse reset duration; skipping", {
++ rateLimitKey: this.key,
++ resetRequests,
++ });
++ return undefined;
++ }
++
++ const parsedResetAt = now + resetRequestsMs;
++ const isSameWindow = !!existing && now < existing.resetAt;
++ const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt;
++ const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now;
++ const reserved = Math.max(0, existing?.reserved ?? 0);
++
++ state = {
++ limit,
++ remaining: isSameWindow ? Math.min(existing.remaining, remaining) : remaining,
++ resetAt,
++ reserved,
++ nextAllowedAt,
++ };
++ headerSnapshot = {
++ limitRequests,
++ remainingRequests,
++ resetRequests,
++ resetRequestsMs,
++ };
++ } else if (retryAfterMs === undefined) {
++ rateLimitLogger?.trace?.("Missing rate limit headers; skipping", {
++ rateLimitKey: this.key,
++ hasLimit: !!limitRequests,
++ hasRemaining: !!remainingRequests,
++ hasReset: !!resetRequests,
++ hasRetryAfter: !!retryAfter,
++ });
++ return undefined;
++ }
++
++ if (!state) {
++ if (retryAfterMs === undefined) {
++ rateLimitLogger?.trace?.("Retry-After missing or unparsable; skipping", {
++ rateLimitKey: this.key,
++ retryAfter,
++ });
++ return undefined;
++ }
++ const targetAt = now + retryAfterMs;
++ const isSameWindow = !!existing && now < existing.resetAt;
++ state = {
++ limit: existing?.limit ?? 1,
++ remaining: 0,
++ resetAt: isSameWindow ? Math.max(existing.resetAt, targetAt) : targetAt,
++ reserved: Math.max(0, existing?.reserved ?? 0),
++ nextAllowedAt: Math.max(existing?.nextAllowedAt ?? now, targetAt),
++ };
++ headerSnapshot = { retryAfter, retryAfterMs };
++ } else if (retryAfterMs !== undefined) {
++ const targetAt = now + retryAfterMs;
++ state = {
++ ...state,
++ remaining: 0,
++ resetAt: Math.max(state.resetAt, targetAt),
++ nextAllowedAt: Math.max(state.nextAllowedAt, targetAt),
++ };
++ headerSnapshot = { ...headerSnapshot, retryAfter, retryAfterMs };
++ }
++
++ this.state = state;
++ rateLimitLogger?.debug?.("Applied rate limit headers to state", {
++ rateLimitKey: this.key,
++ limit: state.limit,
++ remaining: state.remaining,
++ effectiveRemaining: Math.max(0, state.remaining - state.reserved),
++ resetAt: state.resetAt,
++ nextAllowedAt: state.nextAllowedAt,
++ resetRequestsMs: headerSnapshot?.resetRequestsMs,
++ retryAfterMs: headerSnapshot?.retryAfterMs,
++ });
++
++ return {
++ key: this.key,
++ headerSnapshot: headerSnapshot ?? {},
++ state,
++ };
++ }
++}
+diff --git a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
+new file mode 100644
+index 00000000..fdb1c7a8
+--- /dev/null
++++ b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
+@@ -0,0 +1,353 @@
++import type { Logger } from "../../logger";
++import {
++ RATE_LIMIT_EXHAUSTION_BUFFER,
++ RATE_LIMIT_MIN_PACE_INTERVAL_MS,
++ RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS,
++ RATE_LIMIT_PROBE_DELAY_MS,
++} from "../traffic-constants";
++import type {
++ DispatchDecision,
++ QueuedRequest,
++ RateLimitWindowState,
++} from "../traffic-controller-internal";
++import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils";
++import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types";
++import { DefaultRateLimitStrategy } from "./default-rate-limit-strategy";
++import type {
++ RateLimitStrategy,
++ RateLimitUpdateResult,
++ RateLimitUsage,
++} from "./rate-limit-strategy";
++import { parseResetDurationToMs } from "./rate-limit-utils";
++
++export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy {
++ readonly handlesTokenLimits = true;
++ private readonly window: DefaultRateLimitStrategy;
++ private readonly key: string;
++ private readonly requestsPerMinute?: number;
++ private readonly tokensPerMinute?: number;
++ private requestState?: RateLimitWindowState;
++ private tokenState?: RateLimitWindowState;
++ private bootstrapReserved = 0;
++ private readonly windowMs = 60_000;
++
++ constructor(key: string, options?: RateLimitOptions) {
++ this.key = key;
++ this.window = new DefaultRateLimitStrategy(key);
++ // Window strategy enforces fixed 60s windows; burstSize is intentionally ignored here.
++ this.requestsPerMinute = this.normalizeLimit(options?.requestsPerMinute);
++ this.tokensPerMinute = this.normalizeLimit(options?.tokensPerMinute);
++ }
++
++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
++ if (this.requestsPerMinute !== undefined) {
++ const requestDecision = this.resolveRequestWindow(next, logger);
++ if (requestDecision) return requestDecision;
++ } else {
++ const decision = this.window.resolve(next, logger);
++ if (decision) return decision;
++
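++ // No configured limits and no window state yet: allow a single "bootstrap" request
++ // through so its response headers can seed the window, and hold the rest behind it.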
++ if (!next.rateLimitKey && this.tokensPerMinute === undefined) {
++ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
++ if (this.bootstrapReserved >= 1) {
++ rateLimitLogger?.debug?.("OpenAI rate limit bootstrap active; waiting", {
++ rateLimitKey: this.key,
++ bootstrapReserved: this.bootstrapReserved,
++ });
++ return { kind: "wait" };
++ }
++
++ this.bootstrapReserved += 1;
++ next.rateLimitKey = this.key;
++ rateLimitLogger?.debug?.("OpenAI rate limit bootstrap reserved", {
++ rateLimitKey: this.key,
++ bootstrapReserved: this.bootstrapReserved,
++ });
++ }
++ }
++
++ const tokenDecision = this.resolveTokenWindow(next, logger);
++ if (tokenDecision) return tokenDecision;
++ return null;
++ }
++
++ onDispatch(logger?: Logger): void {
++ if (this.requestsPerMinute === undefined) {
++ this.window.onDispatch(logger);
++ }
++ }
++
++ onComplete(logger?: Logger): void {
++ if (this.requestsPerMinute !== undefined) {
++ const now = Date.now();
++ const state = this.ensureRequestState(now);
++ if (state.reserved > 0) {
++ state.reserved -= 1;
++ }
++ state.remaining = Math.max(0, state.remaining - 1);
++ return;
++ }
++
++ if (this.bootstrapReserved > 0) {
++ this.bootstrapReserved -= 1;
++ }
++ this.window.onComplete(logger);
++ }
++
++ recordUsage(usage: RateLimitUsage, logger?: Logger, reservedTokens?: number): void {
++ const tokens = this.resolveTokenCount(usage);
++ if (tokens <= 0) return;
++
++ const now = Date.now();
++ const state = this.ensureTokenState(now);
++ if (!state) return;
++ const reserved = typeof reservedTokens === "number" ? reservedTokens : 0;
++ const delta = tokens - reserved;
++ if (delta > 0) {
++ state.remaining = Math.max(0, state.remaining - delta);
++ } else if (delta < 0) {
++ state.remaining = Math.min(state.limit, state.remaining + Math.abs(delta));
++ }
++ logger?.child({ module: "rate-limiter" })?.trace?.("OpenAI token usage recorded", {
++ rateLimitKey: this.key,
++ tokens,
++ remaining: state.remaining,
++ resetAt: state.resetAt,
++ });
++ }
++
++ updateFromHeaders(
++ metadata: TrafficRequestMetadata | undefined,
++ headers: unknown,
++ logger?: Logger,
++ ): RateLimitUpdateResult | undefined {
++ const update =
++ this.requestsPerMinute !== undefined
++ ? undefined
++ : this.window.updateFromHeaders(metadata, headers, logger);
++ this.applyTokenHeaderUpdates(headers, logger);
++ return update;
++ }
++
++ private resolveRequestWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
++ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
++ const now = Date.now();
++ const state = this.ensureRequestState(now);
++ const effectiveRemaining = Math.max(0, state.remaining - state.reserved);
++ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS;
++
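++ // Treat the window as exhausted while a small safety buffer remains, and after the
++ // advertised reset only re-probe with a short delay instead of bursting immediately.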
++ if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) {
++ if (now < probeAt) {
++ rateLimitLogger?.debug?.("OpenAI request window exhausted; waiting for probe", {
++ rateLimitKey: this.key,
++ remaining: state.remaining,
++ reserved: state.reserved,
++ effectiveRemaining,
++ resetAt: state.resetAt,
++ probeAt,
++ });
++ return { kind: "wait", wakeUpAt: probeAt };
++ }
++ if (state.reserved > 0) {
++ rateLimitLogger?.debug?.(
++ "OpenAI request window exhausted but in-flight reservations exist; waiting",
++ {
++ rateLimitKey: this.key,
++ remaining: state.remaining,
++ reserved: state.reserved,
++ effectiveRemaining,
++ resetAt: state.resetAt,
++ },
++ );
++ return { kind: "wait" };
++ }
++ }
++
++ if (now < state.nextAllowedAt) {
++ rateLimitLogger?.debug?.("OpenAI request window pacing; waiting until nextAllowedAt", {
++ rateLimitKey: this.key,
++ nextAllowedAt: state.nextAllowedAt,
++ resetAt: state.resetAt,
++ waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now,
++ });
++ return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) };
++ }
++
++ state.reserved += 1;
++ next.rateLimitKey = this.key;
++ rateLimitLogger?.trace?.("Reserved OpenAI request window slot", {
++ rateLimitKey: this.key,
++ reserved: state.reserved,
++ remaining: state.remaining,
++ resetAt: state.resetAt,
++ nextAllowedAt: state.nextAllowedAt,
++ });
++
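++ // Pace the rest of the window: spread the remaining budget evenly over the time
++ // left so an early burst cannot consume the whole minute's allowance up front.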
++ const remainingWindowMs = Math.max(0, state.resetAt - now);
++ const intervalMs = Math.max(
++ RATE_LIMIT_MIN_PACE_INTERVAL_MS,
++ Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)),
++ );
++
++ const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs);
++ if (
++ state.nextAllowedAt <= now ||
++ candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS
++ ) {
++ state.nextAllowedAt = candidateNext;
++ rateLimitLogger?.trace?.("Updated OpenAI request pacing nextAllowedAt", {
++ rateLimitKey: this.key,
++ nextAllowedAt: state.nextAllowedAt,
++ intervalMs,
++ remainingWindowMs,
++ effectiveRemaining,
++ });
++ }
++
++ return null;
++ }
++
++ private resolveTokenWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
++ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
++ const now = Date.now();
++ const state = this.ensureTokenState(now);
++ if (!state) return null;
++ const estimatedTokens = next.estimatedTokens;
++
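++ // Reserve the caller's estimate up front; recordUsage later reconciles the actual
++ // token usage against this reservation once the response arrives.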
++ if (typeof estimatedTokens === "number" && estimatedTokens > 0) {
++ if (state.remaining >= estimatedTokens) {
++ state.remaining = Math.max(0, state.remaining - estimatedTokens);
++ next.reservedTokens = estimatedTokens;
++ return null;
++ }
++ } else if (state.remaining > 0) {
++ return null;
++ }
++
++ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS;
++ rateLimitLogger?.debug?.("OpenAI token window exhausted; waiting", {
++ rateLimitKey: this.key,
++ remaining: state.remaining,
++ resetAt: state.resetAt,
++ probeAt,
++ });
++ return { kind: "wait", wakeUpAt: probeAt };
++ }
++
++ private ensureRequestState(now: number): RateLimitWindowState {
++ const limit = this.requestsPerMinute ?? 0;
++ const state = this.requestState;
++ if (!state || now >= state.resetAt) {
++ this.requestState = {
++ limit,
++ remaining: limit,
++ resetAt: now + this.windowMs,
++ reserved: 0,
++ nextAllowedAt: now,
++ };
++ return this.requestState;
++ }
++ return state;
++ }
++
++ private ensureTokenState(now: number): RateLimitWindowState | undefined {
++ const configuredLimit = this.tokensPerMinute;
++ const state = this.tokenState;
++ if (!state) {
++ if (configuredLimit === undefined) return undefined;
++ this.tokenState = {
++ limit: configuredLimit,
++ remaining: configuredLimit,
++ resetAt: now + this.windowMs,
++ reserved: 0,
++ nextAllowedAt: now,
++ };
++ return this.tokenState;
++ }
++
++ if (now >= state.resetAt) {
++ const limit = configuredLimit ?? state.limit;
++ this.tokenState = {
++ limit,
++ remaining: limit,
++ resetAt: now + this.windowMs,
++ reserved: 0,
++ nextAllowedAt: now,
++ };
++ return this.tokenState;
++ }
++
++ if (configuredLimit !== undefined && configuredLimit !== state.limit) {
++ state.limit = configuredLimit;
++ state.remaining = Math.min(state.remaining, configuredLimit);
++ }
++
++ return state;
++ }
++
++ private normalizeLimit(value: number | undefined): number | undefined {
++ const numeric = typeof value === "number" ? value : Number(value);
++ return Number.isFinite(numeric) && numeric > 0 ? numeric : undefined;
++ }
++
++ private applyTokenHeaderUpdates(headers: unknown, logger?: Logger): void {
++ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
++ const limitTokens = readHeaderValue(headers, "x-ratelimit-limit-tokens");
++ const remainingTokens = readHeaderValue(headers, "x-ratelimit-remaining-tokens");
++ const resetTokens = readHeaderValue(headers, "x-ratelimit-reset-tokens");
++ const retryAfter = readHeaderValue(headers, "retry-after");
++
++ const limit = Number(limitTokens);
++ const remaining = Number(remainingTokens);
++ const resetTokensMs = resetTokens ? parseResetDurationToMs(resetTokens) : undefined;
++ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined;
++
++ if (!Number.isFinite(limit) || !Number.isFinite(remaining) || resetTokensMs === undefined) {
++ rateLimitLogger?.trace?.("OpenAI token headers missing or invalid; skipping", {
++ rateLimitKey: this.key,
++ hasLimit: !!limitTokens,
++ hasRemaining: !!remainingTokens,
++ hasReset: !!resetTokens,
++ });
++ return;
++ }
++
++ const now = Date.now();
++ const configuredLimit = this.tokensPerMinute;
++ const effectiveLimit = configuredLimit === undefined ? limit : Math.min(configuredLimit, limit);
++ const clampedRemaining = Math.max(0, Math.min(remaining, effectiveLimit));
++ const parsedResetAt = now + resetTokensMs;
++ const existing = this.tokenState;
++ const isSameWindow = !!existing && now < existing.resetAt;
++ const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt;
++ const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now;
++ const reserved = Math.max(0, existing?.reserved ?? 0);
++ const effectiveRemaining = isSameWindow
++ ? Math.min(existing.remaining, clampedRemaining)
++ : clampedRemaining;
++
++ this.tokenState = {
++ limit: effectiveLimit,
++ remaining: effectiveRemaining,
++ resetAt,
++ reserved,
++ nextAllowedAt,
++ };
++
++ rateLimitLogger?.debug?.("OpenAI token headers applied", {
++ rateLimitKey: this.key,
++ limit: effectiveLimit,
++ remaining: effectiveRemaining,
++ resetAt,
++ retryAfterMs,
++ });
++ }
++
++ private resolveTokenCount(usage: RateLimitUsage): number {
++ const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined;
++ if (total !== undefined) return total;
++ const input = Number.isFinite(usage.inputTokens) ? (usage.inputTokens as number) : 0;
++ const output = Number.isFinite(usage.outputTokens) ? (usage.outputTokens as number) : 0;
++ return input + output;
++ }
++}
+diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts
+new file mode 100644
+index 00000000..653fdaf2
+--- /dev/null
++++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts
+@@ -0,0 +1,41 @@
++import type { Logger } from "../../logger";
++import type {
++ DispatchDecision,
++ QueuedRequest,
++ RateLimitWindowState,
++} from "../traffic-controller-internal";
++import type { TrafficRequestMetadata } from "../traffic-types";
++
++export type RateLimitHeaderSnapshot = {
++ limitRequests?: string;
++ remainingRequests?: string;
++ resetRequests?: string;
++ resetRequestsMs?: number;
++ retryAfter?: string;
++ retryAfterMs?: number;
++};
++
++export type RateLimitUpdateResult = {
++ key: string;
++ headerSnapshot: RateLimitHeaderSnapshot;
++ state: RateLimitWindowState;
++};
++
++export type RateLimitUsage = {
++ inputTokens?: number;
++ outputTokens?: number;
++ totalTokens?: number;
++};
++
++export interface RateLimitStrategy {
++ readonly handlesTokenLimits?: boolean;
++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null;
++ onDispatch(logger?: Logger): void;
++ onComplete(logger?: Logger): void;
++ recordUsage?(usage: RateLimitUsage, logger?: Logger, reservedTokens?: number): void;
++ updateFromHeaders(
++ metadata: TrafficRequestMetadata | undefined,
++ headers: unknown,
++ logger?: Logger,
++ ): RateLimitUpdateResult | undefined;
++}
+diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts
+new file mode 100644
+index 00000000..310c9a7e
+--- /dev/null
++++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts
+@@ -0,0 +1,26 @@
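++// Parses OpenAI-style compound reset durations such as "1m30.951s" or "120ms";
++// a bare numeric string falls through below and is read as milliseconds.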
++export function parseResetDurationToMs(raw: string): number | undefined {
++ const value = raw.trim();
++ if (!value) return undefined;
++
++ let totalMs = 0;
++ const regex = /(\d+(?:\.\d+)?)(ms|s|m|h|d)/g;
++ let matched = false;
++ for (const match of value.matchAll(regex)) {
++ matched = true;
++ const amount = Number.parseFloat(match[1] ?? "");
++ if (!Number.isFinite(amount)) continue;
++ const unit = match[2];
++ if (unit === "ms") totalMs += amount;
++ else if (unit === "s") totalMs += amount * 1000;
++ else if (unit === "m") totalMs += amount * 60_000;
++ else if (unit === "h") totalMs += amount * 3_600_000;
++ else if (unit === "d") totalMs += amount * 86_400_000;
++ }
++
++ if (matched) {
++ return Math.round(totalMs);
++ }
++
++ const n = Number(value);
++ return Number.isFinite(n) ? Math.round(n) : undefined;
++}
+diff --git a/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts
+new file mode 100644
+index 00000000..ee269ecd
+--- /dev/null
++++ b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts
+@@ -0,0 +1,218 @@
++import type { Logger } from "../../logger";
++import type {
++ DispatchDecision,
++ QueuedRequest,
++ RateLimitWindowState,
++} from "../traffic-controller-internal";
++import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils";
++import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types";
++import type {
++ RateLimitHeaderSnapshot,
++ RateLimitStrategy,
++ RateLimitUpdateResult,
++} from "./rate-limit-strategy";
++import { parseResetDurationToMs } from "./rate-limit-utils";
++
++type TokenBucketState = {
++ capacity: number;
++ refillPerSecond: number;
++ tokens: number;
++ updatedAt: number;
++};
++
++function normalizeTokenBucketOptions(
++ raw: RateLimitOptions | undefined,
++): Omit<TokenBucketState, "tokens" | "updatedAt"> | undefined {
++ const requestsPerMinuteRaw = raw?.requestsPerMinute;
++ const tokensPerMinuteRaw = raw?.tokensPerMinute;
++ const burstSizeRaw = raw?.burstSize;
++
++ const requestsPerMinute =
++ typeof requestsPerMinuteRaw === "number" ? requestsPerMinuteRaw : Number(requestsPerMinuteRaw);
++ const tokensPerMinute =
++ typeof tokensPerMinuteRaw === "number" ? tokensPerMinuteRaw : Number(tokensPerMinuteRaw);
++ const burstSize = typeof burstSizeRaw === "number" ? burstSizeRaw : Number(burstSizeRaw);
++
++ const safeRequestsPerMinute = Number.isFinite(requestsPerMinute) ? requestsPerMinute : 0;
++ const hasTokenLimit = Number.isFinite(tokensPerMinute) && tokensPerMinute > 0;
++ if (safeRequestsPerMinute <= 0 && hasTokenLimit) {
++ return undefined;
++ }
++ const safeBurst = Number.isFinite(burstSize) ? burstSize : safeRequestsPerMinute;
++ const refillPerSecond = safeRequestsPerMinute > 0 ? safeRequestsPerMinute / 60 : 0;
++
++ return {
++ capacity: safeBurst > 0 ? Math.max(1, safeBurst) : 0,
++ refillPerSecond,
++ };
++}
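++
++// Continuous refill: tokens accrue in proportion to elapsed time and are capped at
++// the bucket capacity.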
++function refillTokenBucket(bucket: TokenBucketState, now: number): void {
++ const elapsedMs = now - bucket.updatedAt;
++ if (elapsedMs <= 0) return;
++ bucket.updatedAt = now;
++ if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return;
++
++ const refill = (elapsedMs / 1000) * bucket.refillPerSecond;
++ if (refill <= 0) return;
++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill);
++}
++
++export class TokenBucketRateLimitStrategy implements RateLimitStrategy {
++ private readonly key: string;
++ private bucket?: TokenBucketState;
++ private cooldownUntil?: number;
++
++ constructor(key: string, options?: RateLimitOptions) {
++ this.key = key;
++ if (!options) return;
++ const normalized = normalizeTokenBucketOptions(options);
++ if (!normalized) return;
++ const now = Date.now();
++ this.bucket = {
++ ...normalized,
++ tokens: normalized.capacity,
++ updatedAt: now,
++ };
++ }
++
++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
++ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
++ const now = Date.now();
++
++ if (this.cooldownUntil !== undefined && now < this.cooldownUntil) {
++ rateLimitLogger?.debug?.("Token bucket cooldown active; waiting", {
++ rateLimitKey: this.key,
++ cooldownUntil: this.cooldownUntil,
++ waitMs: this.cooldownUntil - now,
++ });
++ return { kind: "wait", wakeUpAt: this.cooldownUntil };
++ }
++
++ const bucket = this.bucket;
++ if (!bucket) return null;
++
++ refillTokenBucket(bucket, now);
++
++ if (bucket.capacity <= 0) {
++ rateLimitLogger?.debug?.("Token bucket misconfigured; blocking", {
++ rateLimitKey: this.key,
++ capacity: bucket.capacity,
++ refillPerSecond: bucket.refillPerSecond,
++ });
++ return { kind: "wait" };
++ }
++
++ if (bucket.tokens >= 1) {
++ bucket.tokens -= 1;
++ next.rateLimitKey = this.key;
++ rateLimitLogger?.trace?.("Consumed token bucket token", {
++ rateLimitKey: this.key,
++ tokens: bucket.tokens,
++ capacity: bucket.capacity,
++ refillPerSecond: bucket.refillPerSecond,
++ });
++ return null;
++ }
++
++ if (bucket.refillPerSecond <= 0) {
++ rateLimitLogger?.debug?.("Token bucket has no refill; blocking", {
++ rateLimitKey: this.key,
++ capacity: bucket.capacity,
++ refillPerSecond: bucket.refillPerSecond,
++ });
++ return { kind: "wait" };
++ }
++
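++ // Bucket is empty but refilling: wait exactly as long as it takes for one whole
++ // token to accrue, then wake up and try again.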
++ const requiredTokens = 1 - bucket.tokens;
++ const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000));
++ const wakeUpAt = now + waitMs;
++ rateLimitLogger?.debug?.("Token bucket empty; waiting", {
++ rateLimitKey: this.key,
++ tokens: bucket.tokens,
++ capacity: bucket.capacity,
++ refillPerSecond: bucket.refillPerSecond,
++ wakeUpAt,
++ waitMs,
++ });
++ return { kind: "wait", wakeUpAt };
++ }
++
++ onDispatch(_logger?: Logger): void {}
++
++ onComplete(_logger?: Logger): void {}
++
++ updateFromHeaders(
++ _metadata: TrafficRequestMetadata | undefined,
++ headers: unknown,
++ logger?: Logger,
++ ): RateLimitUpdateResult | undefined {
++ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
++ const now = Date.now();
++
++ const retryAfter = readHeaderValue(headers, "retry-after");
++ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter, now) : undefined;
++
++ const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests");
++ const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests");
++ const resetRequestsMs = resetRequests ? parseResetDurationToMs(resetRequests) : undefined;
++
++ let appliedUntil: number | undefined;
++
++ if (retryAfterMs !== undefined) {
++ const targetAt = now + retryAfterMs;
++ this.cooldownUntil =
++ this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt);
++ appliedUntil = this.cooldownUntil;
++ }
++
++ if (remainingRequests && resetRequestsMs !== undefined) {
++ const remaining = Number(remainingRequests);
++ if (Number.isFinite(remaining) && remaining <= 0) {
++ const targetAt = now + resetRequestsMs;
++ this.cooldownUntil =
++ this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt);
++ appliedUntil = this.cooldownUntil;
++ }
++ }
++
++ if (appliedUntil === undefined) {
++ rateLimitLogger?.trace?.("No applicable cooldown headers; skipping", {
++ rateLimitKey: this.key,
++ hasRetryAfter: !!retryAfter,
++ hasRemainingRequests: !!remainingRequests,
++ hasResetRequests: !!resetRequests,
++ });
++ return undefined;
++ }
++
++ rateLimitLogger?.debug?.("Applied token bucket cooldown from headers", {
++ rateLimitKey: this.key,
++ cooldownUntil: appliedUntil,
++ inMs: Math.max(0, appliedUntil - now),
++ retryAfterMs,
++ resetRequestsMs,
++ });
++
++ const headerSnapshot: RateLimitHeaderSnapshot = {
++ remainingRequests,
++ resetRequests,
++ resetRequestsMs,
++ retryAfter,
++ retryAfterMs,
++ };
++
++ const state: RateLimitWindowState = {
++ limit: 1,
++ remaining: 0,
++ resetAt: appliedUntil,
++ reserved: 0,
++ nextAllowedAt: appliedUntil,
++ };
++
++ return {
++ key: this.key,
++ headerSnapshot,
++ state,
++ };
++ }
++}
+diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts
+new file mode 100644
+index 00000000..20d166ca
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-circuit-breaker.ts
+@@ -0,0 +1,478 @@
++import type { Logger } from "../logger";
++import {
++ CIRCUIT_COOLDOWN_MS,
++ CIRCUIT_FAILURE_THRESHOLD,
++ CIRCUIT_FAILURE_WINDOW_MS,
++ CIRCUIT_PROBE_INTERVAL_MS,
++ CIRCUIT_TIMEOUT_THRESHOLD,
++ CIRCUIT_TIMEOUT_WINDOW_MS,
++ DEFAULT_FALLBACK_CHAINS,
++} from "./traffic-constants";
++import type {
++ CircuitState,
++ CircuitStateStatus,
++ DispatchDecision,
++ QueuedRequest,
++} from "./traffic-controller-internal";
++import { extractStatusCode, isTimeoutError } from "./traffic-error-utils";
++import { CircuitBreakerOpenError } from "./traffic-errors";
++import type {
++ FallbackChainEntry,
++ FallbackPolicy,
++ FallbackPolicyConfig,
++ FallbackTarget,
++ TrafficRequestMetadata,
++ TrafficResponseMetadata,
++} from "./traffic-types";
++
++export class TrafficCircuitBreaker {
++ private readonly circuitBreakers = new Map<string, CircuitState>();
++ private readonly fallbackChains: Map<string, FallbackChainEntry[]>;
++ private readonly fallbackPolicy?: FallbackPolicyConfig;
++ private readonly buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string;
++
++ constructor(options: {
++ fallbackChains?: Record;
++ fallbackPolicy?: FallbackPolicyConfig;
++ buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string;
++ }) {
++ this.buildRateLimitKey = options.buildRateLimitKey;
++ const chains = options.fallbackChains ?? DEFAULT_FALLBACK_CHAINS;
++ this.fallbackChains = new Map(Object.entries(chains));
++ this.fallbackPolicy = options.fallbackPolicy;
++ }
++
++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
++ const circuitLogger = logger?.child({ module: "circuit-breaker" });
++ const visitedKeys = new Set<string>();
++
++ while (true) {
++ const key = this.buildRateLimitKey(next.request.metadata);
++ next.circuitKey = key;
++ visitedKeys.add(key);
++ circuitLogger?.trace?.("Circuit resolve step", {
++ circuitKey: key,
++ provider: next.request.metadata?.provider,
++ model: next.request.metadata?.model,
++ });
++
++ const evaluation = this.evaluateCircuitState(key, circuitLogger);
++ next.circuitStatus = evaluation.state;
++ circuitLogger?.debug?.("Circuit evaluated", {
++ circuitKey: key,
++ state: evaluation.state,
++ allowRequest: evaluation.allowRequest,
++ retryAfterMs: evaluation.retryAfterMs,
++ });
++
++ if (evaluation.allowRequest) return null;
++
++ const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata);
++ if (policy.mode === "wait") {
++ const wakeUpAt =
++ evaluation.retryAfterMs !== undefined ? Date.now() + evaluation.retryAfterMs : undefined;
++ circuitLogger?.debug?.("Circuit open; waiting per fallback policy", {
++ circuitKey: key,
++ policyId,
++ retryAfterMs: evaluation.retryAfterMs,
++ wakeUpAt,
++ });
++ return { kind: "wait", wakeUpAt };
++ }
++
++ const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger);
++ circuitLogger?.debug?.("Circuit open; attempting fallback", {
++ circuitKey: key,
++ currentModel: next.request.metadata?.model,
++ fallback,
++ visitedKeys: Array.from(visitedKeys),
++ });
++ if (!fallback || !next.request.createFallbackRequest) {
++ const error = new CircuitBreakerOpenError(
++ `Circuit open for ${key}`,
++ next.request.metadata,
++ evaluation.retryAfterMs,
++ );
++ const traffic: TrafficResponseMetadata = {
++ rateLimitKey: key,
++ retryAfterMs: evaluation.retryAfterMs,
++ tenantId: next.request.metadata?.tenantId ?? next.tenantId,
++ priority: next.request.metadata?.priority,
++ taskType: next.request.metadata?.taskType,
++ };
++ (error as Record<string, unknown>).traffic = traffic;
++ next.reject(error);
++ circuitLogger?.warn?.("No fallback available; rejecting request", {
++ circuitKey: key,
++ retryAfterMs: evaluation.retryAfterMs,
++ });
++ return { kind: "skip" };
++ }
++
++ const fallbackRequest = next.request.createFallbackRequest(fallback);
++ if (!fallbackRequest) {
++ circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", {
++ circuitKey: key,
++ fallback,
++ });
++ return { kind: "skip" };
++ }
++
++ this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, {
++ previousCircuitKey: key,
++ reason: "circuit-open",
++ });
++ }
++ }
++
++ tryFallback(next: QueuedRequest, reason: "queue-timeout", logger?: Logger): boolean {
++ const circuitLogger = logger?.child({ module: "circuit-breaker" });
++ const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata);
++ if (policy.mode === "wait") {
++ circuitLogger?.debug?.("Fallback skipped by policy", {
++ policyId,
++ reason,
++ provider: next.request.metadata?.provider,
++ model: next.request.metadata?.model,
++ });
++ return false;
++ }
++
++ const visitedKeys = new Set<string>();
++ const key = this.buildRateLimitKey(next.request.metadata);
++ visitedKeys.add(key);
++
++ const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger);
++ if (!fallback || !next.request.createFallbackRequest) {
++ circuitLogger?.debug?.("Fallback unavailable for request", {
++ reason,
++ provider: next.request.metadata?.provider,
++ model: next.request.metadata?.model,
++ fallback,
++ });
++ return false;
++ }
++
++ const fallbackRequest = next.request.createFallbackRequest(fallback);
++ if (!fallbackRequest) {
++ circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", {
++ reason,
++ fallback,
++ });
++ return false;
++ }
++
++ this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, {
++ previousCircuitKey: key,
++ reason,
++ policyId,
++ });
++ return true;
++ }
++
++ markTrial(item: QueuedRequest, logger?: Logger): void {
++ const circuitLogger = logger?.child({ module: "circuit-breaker" });
++ const key = item.circuitKey;
++ if (!key) return;
++ const state = this.circuitBreakers.get(key);
++ if (state && state.status === "half-open" && !state.trialInFlight) {
++ state.trialInFlight = true;
++ circuitLogger?.debug?.("Marked half-open trial in flight", { circuitKey: key });
++ }
++ }
++
++ recordSuccess(metadata?: TrafficRequestMetadata, logger?: Logger): void {
++ const circuitLogger = logger?.child({ module: "circuit-breaker" });
++ const key = this.buildRateLimitKey(metadata);
++ this.circuitBreakers.delete(key);
++ circuitLogger?.debug?.("Circuit success; cleared circuit state", {
++ circuitKey: key,
++ provider: metadata?.provider,
++ model: metadata?.model,
++ });
++ }
++
++ recordFailure(
++ metadata: TrafficRequestMetadata | undefined,
++ error: unknown,
++ logger?: Logger,
++ ): void {
++ const circuitLogger = logger?.child({ module: "circuit-breaker" });
++ const key = this.buildRateLimitKey(metadata);
++ const status = extractStatusCode(error, logger);
++ const isTimeout = status === 408 || isTimeoutError(error, logger);
++ const isStatusEligible = this.isCircuitBreakerStatus(status);
++ const isTimeoutEligible = !isStatusEligible && isTimeout;
++ const isEligible = isStatusEligible || isTimeoutEligible;
++
++ circuitLogger?.debug?.("Circuit failure observed", {
++ circuitKey: key,
++ status,
++ isTimeout,
++ eligible: isEligible,
++ provider: metadata?.provider,
++ model: metadata?.model,
++ });
++
++ if (!isEligible) {
++ this.circuitBreakers.delete(key);
++ circuitLogger?.debug?.("Failure not eligible for circuit breaker; cleared circuit state", {
++ circuitKey: key,
++ status,
++ isTimeout,
++ });
++ return;
++ }
++
++ const now = Date.now();
++ const state =
++ this.circuitBreakers.get(key) ??
++ ({ status: "closed", failureTimestamps: [], timeoutTimestamps: [] } as CircuitState);
++
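++ // Slide both observation windows forward before counting, so only recent failures
++ // and timeouts can trip the breaker.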
++ state.failureTimestamps = state.failureTimestamps.filter(
++ (t) => now - t <= CIRCUIT_FAILURE_WINDOW_MS,
++ );
++ state.timeoutTimestamps = state.timeoutTimestamps.filter(
++ (t) => now - t <= CIRCUIT_TIMEOUT_WINDOW_MS,
++ );
++
++ state.failureTimestamps.push(now);
++ if (isTimeoutEligible) {
++ state.timeoutTimestamps.push(now);
++ }
++
++ if (
++ state.status === "half-open" ||
++ state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD ||
++ state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD
++ ) {
++ const openReasons: string[] = [];
++ if (state.status === "half-open") openReasons.push("half-open-failure");
++ if (state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD) {
++ openReasons.push("failure-threshold");
++ }
++ if (state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD) {
++ openReasons.push("timeout-threshold");
++ }
++
++ state.status = "open";
++ state.openedAt = now;
++ state.trialInFlight = false;
++ state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS;
++ circuitLogger?.warn?.("Circuit opened", {
++ circuitKey: key,
++ openReasons,
++ status,
++ isTimeout,
++ failureCount: state.failureTimestamps.length,
++ failureThreshold: CIRCUIT_FAILURE_THRESHOLD,
++ timeoutCount: state.timeoutTimestamps.length,
++ timeoutThreshold: CIRCUIT_TIMEOUT_THRESHOLD,
++ openedAt: state.openedAt,
++ });
++ }
++
++ this.circuitBreakers.set(key, state);
++ circuitLogger?.trace?.("Circuit state updated", {
++ circuitKey: key,
++ status: state.status,
++ failureCount: state.failureTimestamps.length,
++ failureWindowMs: CIRCUIT_FAILURE_WINDOW_MS,
++ timeoutCount: state.timeoutTimestamps.length,
++ timeoutWindowMs: CIRCUIT_TIMEOUT_WINDOW_MS,
++ });
++ }
++
++ private evaluateCircuitState(
++ key: string,
++ logger?: Logger,
++ ): {
++ allowRequest: boolean;
++ state: CircuitStateStatus;
++ retryAfterMs?: number;
++ } {
++ const state = this.circuitBreakers.get(key);
++ if (!state) {
++ logger?.trace?.("Circuit state missing; allow request", { circuitKey: key });
++ return { allowRequest: true, state: "closed" };
++ }
++
++ const now = Date.now();
++
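++ // An open circuit admits a single half-open trial once either the cooldown or the
++ // probe interval elapses; the trial's outcome decides whether it closes or re-opens.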
++ if (state.status === "open") {
++ const elapsed = state.openedAt ? now - state.openedAt : 0;
++ if (state.nextProbeAt === undefined) {
++ state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS;
++ }
++ const cooldownRemaining = Math.max(0, CIRCUIT_COOLDOWN_MS - elapsed);
++ const probeRemaining = Math.max(0, state.nextProbeAt - now);
++ if (probeRemaining === 0 || cooldownRemaining === 0) {
++ state.status = "half-open";
++ state.trialInFlight = false;
++ state.failureTimestamps = [];
++ state.timeoutTimestamps = [];
++ state.nextProbeAt = undefined;
++ logger?.debug?.("Circuit transitioned to half-open", {
++ circuitKey: key,
++ reason: cooldownRemaining === 0 ? "cooldown" : "probe",
++ });
++ return { allowRequest: true, state: "half-open" };
++ }
++ return {
++ allowRequest: false,
++ state: "open",
++ retryAfterMs: Math.min(cooldownRemaining, probeRemaining),
++ };
++ }
++
++ if (state.status === "half-open" && state.trialInFlight) {
++ return { allowRequest: false, state: "half-open" };
++ }
++
++ return { allowRequest: true, state: state.status };
++ }
++
++ private resolveFallbackPolicy(metadata: TrafficRequestMetadata | undefined): {
++ policy: FallbackPolicy;
++ policyId?: string;
++ } {
++ const policyId =
++ metadata?.fallbackPolicyId ??
++ (metadata?.taskType
++ ? this.fallbackPolicy?.taskTypePolicyIds?.[metadata.taskType]
++ : undefined) ??
++ this.fallbackPolicy?.defaultPolicyId;
++
++ const policy = policyId ? this.fallbackPolicy?.policies?.[policyId] : undefined;
++ return {
++ policy: policy ?? { mode: "fallback" },
++ policyId,
++ };
++ }
++
++ private applyFallbackRequest(
++ next: QueuedRequest,
++ fallbackRequest: QueuedRequest["request"],
++ fallback: FallbackChainEntry,
++ logger?: Logger,
++ context?: { previousCircuitKey?: string; reason?: string; policyId?: string },
++ ): void {
++ next.request = fallbackRequest;
++ next.attempt = 1;
++ next.estimatedTokens = fallbackRequest.estimatedTokens;
++ next.reservedTokens = undefined;
++ next.tenantConcurrencyKey = undefined;
++ next.providerModelConcurrencyKey = undefined;
++ next.rateLimitKey = undefined;
++ next.etaMs = undefined;
++ next.circuitKey = undefined;
++ next.circuitStatus = undefined;
++ next.extractUsage = fallbackRequest.extractUsage;
++ if (context?.reason === "queue-timeout") {
++ next.queueTimeoutDisabled = true;
++ }
++ logger?.debug?.("Switched to fallback request", {
++ previousCircuitKey: context?.previousCircuitKey,
++ fallbackModel: fallback,
++ reason: context?.reason,
++ policyId: context?.policyId,
++ });
++ }
++
++ private isShortResponseFallback(
++ candidate: FallbackChainEntry,
++ ): candidate is { kind: "short-response"; text: string } {
++ return (
++ typeof candidate === "object" &&
++ candidate !== null &&
++ "kind" in candidate &&
++ (candidate as { kind?: string }).kind === "short-response"
++ );
++ }
++
++ private findFallbackTarget(
++ metadata: TrafficRequestMetadata | undefined,
++ visitedKeys: Set,
++ logger?: Logger,
++ ): FallbackChainEntry | undefined {
++ const currentModel = metadata?.model;
++ if (!currentModel) {
++ logger?.trace?.("No current model; no fallback", {});
++ return undefined;
++ }
++
++ const provider = metadata?.provider;
++ const chain = this.resolveFallbackChain(provider, currentModel);
++ if (!chain) {
++ logger?.trace?.("No fallback chain for model", {
++ currentModel,
++ provider,
++ });
++ return undefined;
++ }
++
++ for (const candidate of chain) {
++ if (this.isShortResponseFallback(candidate)) {
++ logger?.debug?.("Selected short-response fallback", {
++ currentModel,
++ currentProvider: provider,
++ });
++ return candidate;
++ }
++ const target = this.normalizeFallbackTarget(candidate, provider);
++ const candidateMetadata: TrafficRequestMetadata = {
++ ...(metadata ?? {}),
++ provider: target.provider ?? provider,
++ model: target.model,
++ };
++ const candidateKey = this.buildRateLimitKey(candidateMetadata);
++ if (visitedKeys.has(candidateKey)) {
++ continue;
++ }
++
++ const evaluation = this.evaluateCircuitState(candidateKey, logger);
++ if (evaluation.allowRequest) {
++ visitedKeys.add(candidateKey);
++ logger?.debug?.("Selected fallback target", {
++ currentModel,
++ currentProvider: provider,
++ fallbackModel: target.model,
++ fallbackProvider: target.provider ?? provider,
++ fallbackCircuitKey: candidateKey,
++ });
++ return candidate;
++ }
++ }
++
++ return undefined;
++ }
++
++ private resolveFallbackChain(
++ provider: string | undefined,
++ model: string,
++ ): FallbackChainEntry[] | undefined {
++ const providerKey = provider ? `${provider}::${model}` : undefined;
++ if (providerKey) {
++ const providerChain = this.fallbackChains.get(providerKey);
++ if (providerChain) return providerChain;
++ }
++ return this.fallbackChains.get(model);
++ }
++
++ private normalizeFallbackTarget(
++ candidate: FallbackChainEntry,
++ provider: string | undefined,
++ ): FallbackTarget {
++ if (typeof candidate === "string") {
++ return { provider, model: candidate };
++ }
++ return {
++ provider: candidate.provider ?? provider,
++ model: candidate.model,
++ };
++ }
++
++ private isCircuitBreakerStatus(status?: number): boolean {
++ return status === 429 || (status !== undefined && status >= 500);
++ }
++}
+diff --git a/packages/core/src/traffic/traffic-concurrency-limiter.ts b/packages/core/src/traffic/traffic-concurrency-limiter.ts
+new file mode 100644
+index 00000000..e1525612
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-concurrency-limiter.ts
+@@ -0,0 +1,235 @@
++import type { Logger } from "../logger";
++import type { QueuedRequest } from "./traffic-controller-internal";
++import type {
++ ProviderModelConcurrencyLimit,
++ TenantConcurrencyLimit,
++ TrafficRequestMetadata,
++} from "./traffic-types";
++
++export type ConcurrencyBlockReason =
++ | {
++ gate: "providerModel";
++ key: string;
++ inFlight: number;
++ limit: number;
++ }
++ | {
++ gate: "tenant";
++ key: string;
++ inFlight: number;
++ limit: number;
++ };
++
++export type ConcurrencyDecision =
++ | { kind: "allow" }
++ | { kind: "wait"; reasons: ConcurrencyBlockReason[] };
++
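++// Normalizes a configured limit: non-numeric values disable the gate entirely,
++// values <= 0 clamp to zero (block everything), and fractions round down.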
++function toNonNegativeIntegerLimit(raw: unknown): number | undefined {
++ if (raw === undefined || raw === null) return undefined;
++ const n = typeof raw === "number" ? raw : Number(raw);
++ if (!Number.isFinite(n)) return undefined;
++ if (n <= 0) return 0;
++ return Math.floor(n);
++}
++
++function getInFlight(map: Map<string, number>, key: string): number {
++ return map.get(key) ?? 0;
++}
++
++function incrementInFlight(map: Map<string, number>, key: string): void {
++ map.set(key, getInFlight(map, key) + 1);
++}
++
++function decrementInFlight(map: Map<string, number>, key: string): void {
++ const current = getInFlight(map, key);
++ if (current <= 1) {
++ map.delete(key);
++ return;
++ }
++ map.set(key, current - 1);
++}
++
++export class TrafficConcurrencyLimiter {
++ private readonly inFlightByProviderModel = new Map<string, number>();
++ private readonly inFlightByTenant = new Map<string, number>();
++
++ private readonly buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string;
++ private readonly providerModelLimit?: ProviderModelConcurrencyLimit;
++ private readonly tenantLimit?: TenantConcurrencyLimit;
++ private readonly providerModelEnabled: boolean;
++ private readonly tenantEnabled: boolean;
++
++ constructor(options: {
++ buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string;
++ maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit;
++ maxConcurrentPerTenant?: TenantConcurrencyLimit;
++ }) {
++ this.buildProviderModelKey = options.buildProviderModelKey;
++ this.providerModelLimit = options.maxConcurrentPerProviderModel;
++ this.tenantLimit = options.maxConcurrentPerTenant;
++ this.providerModelEnabled = options.maxConcurrentPerProviderModel !== undefined;
++ this.tenantEnabled = options.maxConcurrentPerTenant !== undefined;
++ }
++
++ resolve(next: QueuedRequest, logger?: Logger): ConcurrencyDecision {
++ if (!this.providerModelEnabled && !this.tenantEnabled) return { kind: "allow" };
++ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" });
++ const reasons: ConcurrencyBlockReason[] = [];
++
++ if (this.providerModelEnabled) {
++ const providerModelKey = this.buildProviderModelKey(next.request.metadata);
++ const providerModelLimit = this.resolveProviderModelLimit(
++ providerModelKey,
++ next.request.metadata,
++ concurrencyLogger,
++ );
++ if (providerModelLimit !== undefined) {
++ const inFlight = getInFlight(this.inFlightByProviderModel, providerModelKey);
++ if (inFlight >= providerModelLimit) {
++ reasons.push({
++ gate: "providerModel",
++ key: providerModelKey,
++ inFlight,
++ limit: providerModelLimit,
++ });
++ }
++ }
++ }
++
++ if (this.tenantEnabled) {
++ const tenantKey = next.tenantId;
++ const tenantLimit = this.resolveTenantLimit(
++ tenantKey,
++ next.request.metadata,
++ concurrencyLogger,
++ );
++ if (tenantLimit !== undefined) {
++ const inFlight = getInFlight(this.inFlightByTenant, tenantKey);
++ if (inFlight >= tenantLimit) {
++ reasons.push({
++ gate: "tenant",
++ key: tenantKey,
++ inFlight,
++ limit: tenantLimit,
++ });
++ }
++ }
++ }
++
++ if (reasons.length === 0) return { kind: "allow" };
++
++ concurrencyLogger?.trace?.("Concurrency gate blocked request", {
++ tenantId: next.tenantId,
++ reasons,
++ });
++ return { kind: "wait", reasons };
++ }
++
++ acquire(next: QueuedRequest, logger?: Logger): void {
++ if (!this.providerModelEnabled && !this.tenantEnabled) return;
++ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" });
++
++ let tenantKey: string | undefined;
++ if (this.tenantEnabled) {
++ tenantKey = next.tenantId;
++ next.tenantConcurrencyKey = tenantKey;
++ incrementInFlight(this.inFlightByTenant, tenantKey);
++ }
++
++ let providerModelKey: string | undefined;
++ if (this.providerModelEnabled) {
++ providerModelKey = this.buildProviderModelKey(next.request.metadata);
++ next.providerModelConcurrencyKey = providerModelKey;
++ incrementInFlight(this.inFlightByProviderModel, providerModelKey);
++ }
++
++ concurrencyLogger?.trace?.("Concurrency slots acquired", {
++ tenantId: tenantKey,
++ tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined,
++ providerModelKey,
++ providerModelInFlight: providerModelKey
++ ? getInFlight(this.inFlightByProviderModel, providerModelKey)
++ : undefined,
++ });
++ }
++
++ release(next: QueuedRequest, logger?: Logger): void {
++ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" });
++ const tenantKey = next.tenantConcurrencyKey;
++ const providerModelKey = next.providerModelConcurrencyKey;
++
++ if (tenantKey) {
++ decrementInFlight(this.inFlightByTenant, tenantKey);
++ }
++
++ if (providerModelKey) {
++ decrementInFlight(this.inFlightByProviderModel, providerModelKey);
++ }
++
++ if (tenantKey || providerModelKey) {
++ concurrencyLogger?.trace?.("Concurrency slots released", {
++ tenantId: tenantKey,
++ tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined,
++ providerModelKey,
++ providerModelInFlight: providerModelKey
++ ? getInFlight(this.inFlightByProviderModel, providerModelKey)
++ : undefined,
++ });
++ }
++
++ next.tenantConcurrencyKey = undefined;
++ next.providerModelConcurrencyKey = undefined;
++ }
++
++ private resolveTenantLimit(
++ tenantId: string,
++ metadata: TrafficRequestMetadata | undefined,
++ logger?: Logger,
++ ): number | undefined {
++ const policy = this.tenantLimit;
++ if (policy === undefined) return undefined;
++
++ if (typeof policy === "number") return toNonNegativeIntegerLimit(policy);
++ if (typeof policy === "function") {
++ try {
++ return toNonNegativeIntegerLimit(policy(tenantId, metadata));
++ } catch (error) {
++ logger?.warn?.("Tenant concurrency resolver threw; ignoring", {
++ tenantId,
++ errorName: (error as { name?: unknown } | null)?.name,
++ errorMessage: (error as { message?: unknown } | null)?.message,
++ });
++ return undefined;
++ }
++ }
++
++ return toNonNegativeIntegerLimit(policy[tenantId]);
++ }
++
++ private resolveProviderModelLimit(
++ key: string,
++ metadata: TrafficRequestMetadata | undefined,
++ logger?: Logger,
++ ): number | undefined {
++ const policy = this.providerModelLimit;
++ if (policy === undefined) return undefined;
++
++ if (typeof policy === "number") return toNonNegativeIntegerLimit(policy);
++ if (typeof policy === "function") {
++ try {
++ return toNonNegativeIntegerLimit(policy(metadata, key));
++ } catch (error) {
++ logger?.warn?.("Provider/model concurrency resolver threw; ignoring", {
++ key,
++ provider: metadata?.provider,
++ model: metadata?.model,
++ errorName: (error as { name?: unknown } | null)?.name,
++ errorMessage: (error as { message?: unknown } | null)?.message,
++ });
++ return undefined;
++ }
++ }
++
++ return toNonNegativeIntegerLimit(policy[key]);
++ }
++}
+diff --git a/packages/core/src/traffic/traffic-constants.ts b/packages/core/src/traffic/traffic-constants.ts
+new file mode 100644
+index 00000000..68d99df7
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-constants.ts
+@@ -0,0 +1,26 @@
++export const MAX_RETRY_ATTEMPTS = 3;
++export const TIMEOUT_RETRY_ATTEMPTS = 2;
++
++export const RATE_LIMIT_BASE_BACKOFF_MS = 500;
++export const SERVER_ERROR_BASE_BACKOFF_MS = 1000;
++export const TIMEOUT_BASE_BACKOFF_MS = 750;
++
++export const RATE_LIMIT_JITTER_FACTOR = 0.35;
++export const SERVER_ERROR_JITTER_FACTOR = 0.8;
++export const TIMEOUT_JITTER_FACTOR = 0.5;
++
++export const CIRCUIT_FAILURE_THRESHOLD = 5;
++export const CIRCUIT_FAILURE_WINDOW_MS = 10_000;
++export const CIRCUIT_TIMEOUT_THRESHOLD = CIRCUIT_FAILURE_THRESHOLD;
++export const CIRCUIT_TIMEOUT_WINDOW_MS = CIRCUIT_FAILURE_WINDOW_MS;
++export const CIRCUIT_COOLDOWN_MS = 30_000;
++export const CIRCUIT_PROBE_INTERVAL_MS = 5_000;
++
++export const RATE_LIMIT_EXHAUSTION_BUFFER = 1;
++export const RATE_LIMIT_PROBE_DELAY_MS = 50;
++export const RATE_LIMIT_MIN_PACE_INTERVAL_MS = 10;
++export const RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS = 10;
++
++export const DEFAULT_FALLBACK_CHAINS: Record<string, string[]> = {
++ "gpt-4o": ["gpt-4o-mini", "gpt-3.5"],
++};
+diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts
+new file mode 100644
+index 00000000..fd2012cf
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-controller-internal.ts
+@@ -0,0 +1,57 @@
++import type { TrafficPriority, TrafficRequest, TrafficRequestType } from "./traffic-types";
++
++export type Scheduler = (callback: () => void) => void;
++
++export type DispatchDecision =
++ | { kind: "dispatch" }
++ | { kind: "skip" }
++ | { kind: "wait"; wakeUpAt?: number };
++
++export type CircuitStateStatus = "closed" | "open" | "half-open";
++
++export interface CircuitState {
++ status: CircuitStateStatus;
++ failureTimestamps: number[];
++ timeoutTimestamps: number[];
++ openedAt?: number;
++ trialInFlight?: boolean;
++ nextProbeAt?: number;
++}
++
++export interface RateLimitWindowState {
++ limit: number;
++ remaining: number;
++ resetAt: number;
++ reserved: number;
++ nextAllowedAt: number;
++}
++
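++// Declaring the handler with method syntax keeps its parameters bivariant, so a
++// QueuedRequest with a concrete TResponse can live in collections typed as
++// QueuedRequest<unknown>.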
++type BivariantHandler<TArgs extends unknown[]> = {
++ bivarianceHack(...args: TArgs): void;
++}["bivarianceHack"];
++
++export interface QueuedRequest<TResponse = unknown> {
++ type: TrafficRequestType;
++ request: TrafficRequest<TResponse>;
++ resolve: BivariantHandler<[TResponse | PromiseLike<TResponse>]>;
++ reject: BivariantHandler<[reason?: unknown]>;
++ attempt: number;
++ priority: TrafficPriority;
++ tenantId: string;
++ enqueuedAt: number;
++ dispatchedAt?: number;
++ estimatedTokens?: number;
++ reservedTokens?: number;
++ queueTimeoutDisabled?: boolean;
++
++ tenantConcurrencyKey?: string;
++ providerModelConcurrencyKey?: string;
++
++ rateLimitKey?: string;
++ etaMs?: number;
++
++ circuitKey?: string;
++ circuitStatus?: CircuitStateStatus;
++
++ extractUsage?: TrafficRequest["extractUsage"];
++}
+diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
+new file mode 100644
+index 00000000..8f0a2c47
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-controller.spec.ts
+@@ -0,0 +1,706 @@
++import { describe, expect, it, vi } from "vitest";
++import { CIRCUIT_FAILURE_THRESHOLD, RATE_LIMIT_PROBE_DELAY_MS } from "./traffic-constants";
++import { TrafficController } from "./traffic-controller";
++
++describe("TrafficController priority scheduling", () => {
++ it("prioritizes P0 over lower priorities when runnable", async () => {
++ const controller = new TrafficController({ maxConcurrent: 1 });
++ const order: string[] = [];
++
++ const p1 = controller.handleText({
++ metadata: { provider: "p", model: "m1", priority: "P1" },
++ execute: async () => {
++ order.push("P1");
++ return "P1";
++ },
++ });
++
++ const p2 = controller.handleText({
++ metadata: { provider: "p", model: "m2", priority: "P2" },
++ execute: async () => {
++ order.push("P2");
++ return "P2";
++ },
++ });
++
++ const p0 = controller.handleText({
++ metadata: { provider: "p", model: "m0", priority: "P0" },
++ execute: async () => {
++ order.push("P0");
++ return "P0";
++ },
++ });
++
++ await Promise.all([p0, p1, p2]);
++
++ expect(order[0]).toBe("P0");
++ expect(order).toEqual(["P0", "P1", "P2"]);
++ });
++
++ it("allows lower priorities to proceed when a higher priority request is rate limited", async () => {
++ vi.useFakeTimers();
++
++ try {
++ vi.setSystemTime(new Date(0));
++ const controller = new TrafficController({ maxConcurrent: 1 });
++ controller.updateRateLimitFromHeaders(
++ { provider: "p0", model: "m0" },
++ {
++ "x-ratelimit-limit-requests": "1",
++ "x-ratelimit-remaining-requests": "0",
++ "x-ratelimit-reset-requests": "1s",
++ },
++ );
++
++ const order: string[] = [];
++
++ const p0 = controller.handleText({
++ metadata: { provider: "p0", model: "m0", priority: "P0" },
++ execute: async () => {
++ order.push("P0");
++ return "P0";
++ },
++ });
++
++ const p1 = controller.handleText({
++ metadata: { provider: "p1", model: "m1", priority: "P1" },
++ execute: async () => {
++ order.push("P1");
++ return "P1";
++ },
++ });
++
++ await vi.runAllTimersAsync();
++ await Promise.all([p0, p1]);
++
++ expect(order[0]).toBe("P1");
++ expect(order[1]).toBe("P0");
++ } finally {
++ vi.useRealTimers();
++ }
++ });
++});
++
++describe("TrafficController concurrency limits", () => {
++ it("shares provider/model limits across tenants", async () => {
++ const controller = new TrafficController({
++ maxConcurrent: 2,
++ maxConcurrentPerProviderModel: 1,
++ });
++ const started: string[] = [];
++ let releaseFirst!: () => void;
++ const firstGate = new Promise<void>((resolve) => {
++ releaseFirst = resolve;
++ });
++
++ const first = controller.handleText({
++ tenantId: "tenant-a",
++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
++ execute: async () => {
++ started.push("tenant-a");
++ await firstGate;
++ return "a";
++ },
++ });
++
++ const second = controller.handleText({
++ tenantId: "tenant-b",
++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
++ execute: async () => {
++ started.push("tenant-b");
++ return "b";
++ },
++ });
++
++ await new Promise<void>((resolve) => setTimeout(resolve, 0));
++ expect(started).toEqual(["tenant-a"]);
++
++ releaseFirst();
++ await Promise.all([first, second]);
++ expect(started).toEqual(["tenant-a", "tenant-b"]);
++ });
++});
++
++describe("TrafficController rate limit headers", () => {
++ it("parses OpenAI-style compound reset durations (e.g. 1m30.951s)", () => {
++ vi.useFakeTimers();
++
++ try {
++ vi.setSystemTime(new Date(1_000_000));
++ const controller = new TrafficController({ maxConcurrent: 1 });
++ const now = Date.now();
++
++ const result = controller.updateRateLimitFromHeaders(
++ { provider: "openai.responses", model: "gpt-4o-mini" },
++ {
++ "x-ratelimit-limit-requests": "10000",
++ "x-ratelimit-remaining-requests": "9989",
++ "x-ratelimit-reset-requests": "1m30.951s",
++ },
++ );
++
++ expect(result).toBeTruthy();
++ expect(result?.headerSnapshot.resetRequestsMs).toBeCloseTo(90_951, 6);
++ expect(result?.state.limit).toBe(10000);
++ expect(result?.state.remaining).toBe(9989);
++ expect(result?.state.resetAt).toBe(now + 90_951);
++ expect(result?.state.reserved).toBe(0);
++ expect(result?.state.nextAllowedAt).toBe(now);
++ } finally {
++ vi.useRealTimers();
++ }
++ });
++
++ it("keeps resetAt monotonic when headers shorten the reset duration", () => {
++ vi.useFakeTimers();
++
++ try {
++ vi.setSystemTime(new Date(0));
++ const controller = new TrafficController({ maxConcurrent: 1 });
++
++ const first = controller.updateRateLimitFromHeaders(
++ { provider: "openai.responses", model: "gpt-4o-mini" },
++ {
++ "x-ratelimit-limit-requests": "10000",
++ "x-ratelimit-remaining-requests": "9999",
++ "x-ratelimit-reset-requests": "60s",
++ },
++ );
++
++ expect(first).toBeTruthy();
++ expect(first?.state.resetAt).toBe(60_000);
++
++ vi.setSystemTime(new Date(10_000));
++ const second = controller.updateRateLimitFromHeaders(
++ { provider: "openai.responses", model: "gpt-4o-mini" },
++ {
++ "x-ratelimit-limit-requests": "10000",
++ "x-ratelimit-remaining-requests": "9998",
++ "x-ratelimit-reset-requests": "5s",
++ },
++ );
++
++ expect(second).toBeTruthy();
++ expect(second?.state.resetAt).toBe(60_000);
++ } finally {
++ vi.useRealTimers();
++ }
++ });
++
++ it("never increases remaining within the same window", () => {
++ vi.useFakeTimers();
++
++ try {
++ vi.setSystemTime(new Date(0));
++ const controller = new TrafficController({ maxConcurrent: 1 });
++
++ const first = controller.updateRateLimitFromHeaders(
++ { provider: "openai.responses", model: "gpt-4o-mini" },
++ {
++ "x-ratelimit-limit-requests": "10",
++ "x-ratelimit-remaining-requests": "9",
++ "x-ratelimit-reset-requests": "60s",
++ },
++ );
++
++ expect(first?.state.remaining).toBe(9);
++ expect(first?.state.resetAt).toBe(60_000);
++
++ vi.setSystemTime(new Date(10_000));
++ const second = controller.updateRateLimitFromHeaders(
++ { provider: "openai.responses", model: "gpt-4o-mini" },
++ {
++ "x-ratelimit-limit-requests": "10",
++ "x-ratelimit-remaining-requests": "8",
++ "x-ratelimit-reset-requests": "50s",
++ },
++ );
++
++ expect(second?.state.remaining).toBe(8);
++ expect(second?.state.resetAt).toBe(60_000);
++
++ vi.setSystemTime(new Date(20_000));
++ const third = controller.updateRateLimitFromHeaders(
++ { provider: "openai.responses", model: "gpt-4o-mini" },
++ {
++ "x-ratelimit-limit-requests": "10",
++ "x-ratelimit-remaining-requests": "9",
++ "x-ratelimit-reset-requests": "40s",
++ },
++ );
++
++ expect(third?.state.remaining).toBe(8);
++ expect(third?.state.resetAt).toBe(60_000);
++ } finally {
++ vi.useRealTimers();
++ }
++ });
++
++ it("applies Retry-After even when x-ratelimit headers are missing", async () => {
++ vi.useFakeTimers();
++
++ try {
++ vi.setSystemTime(new Date(0));
++ const controller = new TrafficController({ maxConcurrent: 1 });
++ const order: string[] = [];
++
++ controller.updateRateLimitFromHeaders(
++ { provider: "p", model: "m" },
++ {
++ "retry-after": "2",
++ },
++ );
++
++ const p0 = controller.handleText({
++ metadata: { provider: "p", model: "m", priority: "P0" },
++ execute: async () => {
++ order.push("P0");
++ return "P0";
++ },
++ });
++
++ await vi.advanceTimersByTimeAsync(1_999);
++ expect(order).toEqual([]);
++
++ await vi.advanceTimersByTimeAsync(1);
++ await vi.runAllTimersAsync();
++ await p0;
++ expect(order).toEqual(["P0"]);
++ } finally {
++ vi.useRealTimers();
++ }
++ });
++
++ it("shares rate limits across tenants for the same provider/model", async () => {
++ vi.useFakeTimers();
++
++ try {
++ vi.setSystemTime(new Date(0));
++ const controller = new TrafficController({ maxConcurrent: 1 });
++ controller.updateRateLimitFromHeaders(
++ { provider: "openai", model: "gpt-4o", tenantId: "tenant-a" },
++ {
++ "x-ratelimit-limit-requests": "1",
++ "x-ratelimit-remaining-requests": "0",
++ "x-ratelimit-reset-requests": "1s",
++ },
++ );
++
++ const order: string[] = [];
++ const request = controller.handleText({
++ tenantId: "tenant-b",
++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
++ execute: async () => {
++ order.push("tenant-b");
++ return "ok";
++ },
++ });
++
++ await vi.advanceTimersByTimeAsync(999);
++ await Promise.resolve();
++ expect(order).toEqual([]);
++
++ await vi.advanceTimersByTimeAsync(1);
++ await vi.runAllTimersAsync();
++ await request;
++
++ expect(order).toEqual(["tenant-b"]);
++ } finally {
++ vi.useRealTimers();
++ }
++ });
++});
++
++describe("TrafficController token limits", () => {
++ it("blocks OpenAI when the token window is exhausted even without RPM config", async () => {
++ vi.useFakeTimers();
++
++ try {
++ vi.setSystemTime(new Date(0));
++ const controller = new TrafficController({
++ maxConcurrent: 1,
++ rateLimits: {
++ "openai::gpt-4o": {
++ requestsPerMinute: 0,
++ tokensPerMinute: 2,
++ },
++ },
++ });
++ const order: string[] = [];
++
++ const first = controller.handleText({
++ tenantId: "tenant-a",
++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
++ execute: async () => {
++ order.push("first");
++ return "first";
++ },
++ extractUsage: () => ({ totalTokens: 2 }),
++ });
++
++ const second = controller.handleText({
++ tenantId: "tenant-b",
++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
++ execute: async () => {
++ order.push("second");
++ return "second";
++ },
++ extractUsage: () => ({ totalTokens: 1 }),
++ });
++
++ await first;
++ expect(order).toEqual(["first"]);
++
++ await vi.advanceTimersByTimeAsync(60_000 + RATE_LIMIT_PROBE_DELAY_MS - 1);
++ expect(order).toEqual(["first"]);
++
++ await vi.advanceTimersByTimeAsync(1);
++ await vi.runAllTimersAsync();
++ await second;
++ expect(order).toEqual(["first", "second"]);
++ } finally {
++ vi.useRealTimers();
++ }
++ });
++
++ it("reserves estimated tokens before dispatch", async () => {
++ vi.useFakeTimers();
++
++ try {
++ vi.setSystemTime(new Date(0));
++ const controller = new TrafficController({
++ maxConcurrent: 2,
++ rateLimits: {
++ "openai::gpt-4o": {
++ requestsPerMinute: 0,
++ tokensPerMinute: 2,
++ },
++ },
++ });
++ const order: string[] = [];
++ let releaseFirst!: () => void;
++ const firstGate = new Promise<void>((resolve) => {
++ releaseFirst = resolve;
++ });
++
++ const first = controller.handleText({
++ tenantId: "tenant-a",
++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
++ estimatedTokens: 2,
++ execute: async () => {
++ order.push("first");
++ await firstGate;
++ return "first";
++ },
++ extractUsage: () => ({ totalTokens: 2 }),
++ });
++
++ const second = controller.handleText({
++ tenantId: "tenant-b",
++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
++ estimatedTokens: 1,
++ execute: async () => {
++ order.push("second");
++ return "second";
++ },
++ extractUsage: () => ({ totalTokens: 1 }),
++ });
++
++ await Promise.resolve();
++ expect(order).toEqual(["first"]);
++
++ await vi.advanceTimersByTimeAsync(60_000 + RATE_LIMIT_PROBE_DELAY_MS - 1);
++ await Promise.resolve();
++ expect(order).toEqual(["first"]);
++
++ await vi.advanceTimersByTimeAsync(1);
++ await vi.runAllTimersAsync();
++ await Promise.resolve();
++ expect(order).toEqual(["first", "second"]);
++
++ releaseFirst();
++ await Promise.all([first, second]);
++ } finally {
++ vi.useRealTimers();
++ }
++ });
++
++ it("allows token-only configs on non-OpenAI providers", async () => {
++ vi.useFakeTimers();
++
++ try {
++ vi.setSystemTime(new Date(0));
++ const controller = new TrafficController({
++ maxConcurrent: 2,
++ rateLimits: {
++ "p::m": {
++ requestsPerMinute: 0,
++ tokensPerMinute: 2,
++ },
++ },
++ });
++ const order: string[] = [];
++
++ const first = controller.handleText({
++ tenantId: "tenant-a",
++ metadata: { provider: "p", model: "m", priority: "P1" },
++ estimatedTokens: 2,
++ execute: async () => {
++ order.push("first");
++ return "first";
++ },
++ extractUsage: () => ({ totalTokens: 2 }),
++ });
++
++ const second = controller.handleText({
++ tenantId: "tenant-b",
++ metadata: { provider: "p", model: "m", priority: "P1" },
++ estimatedTokens: 1,
++ execute: async () => {
++ order.push("second");
++ return "second";
++ },
++ extractUsage: () => ({ totalTokens: 1 }),
++ });
++
++ await first;
++ expect(order).toEqual(["first"]);
++
++ await vi.advanceTimersByTimeAsync(29_999);
++ await Promise.resolve();
++ expect(order).toEqual(["first"]);
++
++ await vi.advanceTimersByTimeAsync(1);
++ await vi.runAllTimersAsync();
++ await second;
++ expect(order).toEqual(["first", "second"]);
++ } finally {
++ vi.useRealTimers();
++ }
++ });
++
++ it("honors OpenAI token headers even without token config", async () => {
++ vi.useFakeTimers();
++
++ try {
++ vi.setSystemTime(new Date(0));
++ const controller = new TrafficController({ maxConcurrent: 1 });
++ controller.updateRateLimitFromHeaders(
++ { provider: "openai", model: "gpt-4o" },
++ {
++ "x-ratelimit-limit-tokens": "2",
++ "x-ratelimit-remaining-tokens": "0",
++ "x-ratelimit-reset-tokens": "1s",
++ },
++ );
++
++ const order: string[] = [];
++ const request = controller.handleText({
++ tenantId: "tenant-a",
++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
++ estimatedTokens: 1,
++ execute: async () => {
++ order.push("run");
++ return "ok";
++ },
++ });
++
++ await Promise.resolve();
++ expect(order).toEqual([]);
++
++ await vi.advanceTimersByTimeAsync(1_000 + RATE_LIMIT_PROBE_DELAY_MS - 1);
++ await Promise.resolve();
++ expect(order).toEqual([]);
++
++ await vi.advanceTimersByTimeAsync(1);
++ await vi.runAllTimersAsync();
++ await request;
++ expect(order).toEqual(["run"]);
++ } finally {
++ vi.useRealTimers();
++ }
++ });
++});
++
++describe("TrafficController stream reporting", () => {
++ it("slows down after stream 429 errors", async () => {
++ vi.useFakeTimers();
++
++ try {
++ vi.setSystemTime(new Date(0));
++ const controller = new TrafficController({
++ maxConcurrent: 1,
++ adaptiveLimiter: {
++ windowMs: 1_000,
++ threshold: 1,
++ minPenaltyMs: 10,
++ maxPenaltyMs: 10,
++ penaltyMultiplier: 1,
++ decayMs: 1_000,
++ },
++ });
++ const metadata = {
++ provider: "p",
++ model: "m",
++ priority: "P1" as const,
++ tenantId: "tenant-a",
++ };
++
++ controller.reportStreamFailure(
++ metadata,
++ Object.assign(new Error("rate limit"), { status: 429 }),
++ );
++
++ const order: string[] = [];
++ const request = controller.handleText({
++ tenantId: "tenant-a",
++ metadata,
++ execute: async () => {
++ order.push("run");
++ return "ok";
++ },
++ });
++
++ await Promise.resolve();
++ expect(order).toEqual([]);
++
++ await vi.advanceTimersByTimeAsync(9);
++ await Promise.resolve();
++ expect(order).toEqual([]);
++
++ await vi.advanceTimersByTimeAsync(1);
++ await vi.runAllTimersAsync();
++ await request;
++ expect(order).toEqual(["run"]);
++ } finally {
++ vi.useRealTimers();
++ }
++ });
++
++ it("treats post-start stream failures as circuit breaker failures", async () => {
++ const controller = new TrafficController({
++ maxConcurrent: 1,
++ fallbackChains: {
++ primary: ["fallback"],
++ },
++ });
++ const tenantId = "tenant-1";
++ const metadata = { provider: "p", model: "primary", priority: "P1" as const };
++
++ await controller.handleStream({
++ tenantId,
++ metadata,
++ execute: async () => ({ ok: true }),
++ });
++
++ for (let i = 0; i < CIRCUIT_FAILURE_THRESHOLD; i += 1) {
++ controller.reportStreamFailure(metadata, new Error("stream-failure"));
++ }
++
++ const order: string[] = [];
++ await controller.handleStream({
++ tenantId,
++ metadata,
++ execute: async () => {
++ order.push("primary");
++ return "primary";
++ },
++ createFallbackRequest: (target) => ({
++ tenantId,
++ metadata: {
++ provider: "p",
++ model: typeof target === "string" ? target : target.model,
++ priority: "P1",
++ },
++ execute: async () => {
++ const modelId = typeof target === "string" ? target : target.model;
++ order.push(modelId);
++ return modelId;
++ },
++ }),
++ });
++
++ expect(order).toEqual(["fallback"]);
++ });
++});
++
++describe("TrafficController queue timeouts", () => {
++ it("lets fallback requests wait after queue timeout without rejecting", async () => {
++ vi.useFakeTimers();
++
++ try {
++ vi.setSystemTime(new Date(0));
++ const controller = new TrafficController({
++ maxConcurrent: 1,
++ fallbackChains: {
++ "p::m": ["m-fallback"],
++ },
++ });
++ const order: string[] = [];
++ let releaseFirst!: () => void;
++ const firstGate = new Promise<void>((resolve) => {
++ releaseFirst = resolve;
++ });
++
++ const first = controller.handleText({
++ tenantId: "tenant-a",
++ metadata: { provider: "p", model: "m", priority: "P1" },
++ execute: async () => {
++ order.push("first");
++ await firstGate;
++ return "first";
++ },
++ });
++
++ const second = controller.handleText({
++ tenantId: "tenant-a",
++ metadata: { provider: "p", model: "m", priority: "P1" },
++ maxQueueWaitMs: 1,
++ execute: async () => {
++ order.push("primary");
++ return "primary";
++ },
++ createFallbackRequest: (target) => ({
++ tenantId: "tenant-a",
++ metadata: {
++ provider: "p",
++ model: typeof target === "string" ? target : target.model,
++ priority: "P1",
++ },
++ maxQueueWaitMs: 1,
++ execute: async () => {
++ order.push("fallback");
++ return "fallback";
++ },
++ }),
++ });
++
++ await Promise.resolve();
++ expect(order).toEqual(["first"]);
++
++ await vi.advanceTimersByTimeAsync(2);
++
++ const third = controller.handleText({
++ tenantId: "tenant-a",
++ metadata: { provider: "p", model: "other", priority: "P1" },
++ execute: async () => {
++ order.push("third");
++ return "third";
++ },
++ });
++
++ await Promise.resolve();
++ expect(order).toEqual(["first"]);
++
++ releaseFirst();
++ await vi.runAllTimersAsync();
++
++ await expect(second).resolves.toBe("fallback");
++ await Promise.all([first, third]);
++
++ expect(order).toEqual(["first", "fallback", "third"]);
++ } finally {
++ vi.useRealTimers();
++ }
++ });
++});
+diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts
+new file mode 100644
+index 00000000..269304d9
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-controller.ts
+@@ -0,0 +1,1268 @@
++import type { Logger } from "../logger";
++import { LoggerProxy } from "../logger";
++import { TrafficCircuitBreaker } from "./traffic-circuit-breaker";
++import { TrafficConcurrencyLimiter } from "./traffic-concurrency-limiter";
++import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal";
++import {
++ CircuitBreakerOpenError,
++ QueueWaitTimeoutError,
++ RateLimitedUpstreamError,
++ normalizeRateLimitError,
++} from "./traffic-errors";
++import {
++ OpenAIWindowRateLimitStrategy,
++ type RateLimitUpdateResult,
++ TokenBucketRateLimitStrategy,
++ TrafficRateLimiter,
++} from "./traffic-rate-limiter";
++import { buildRetryPlanWithPolicy } from "./traffic-retry";
++import type {
++ AdaptiveLimiterConfig,
++ FallbackChainEntry,
++ FallbackPolicy,
++ FallbackPolicyConfig,
++ FallbackPolicyMode,
++ FallbackTarget,
++ PriorityBurstLimits,
++ PriorityWeights,
++ ProviderModelConcurrencyLimit,
++ RateLimitConfig,
++ RateLimitKey,
++ RateLimitStrategyConfig,
++ RateLimitStrategyKind,
++ RetryPlan,
++ RetryPolicyConfig,
++ TenantConcurrencyLimit,
++ TenantUsage,
++ TrafficControllerOptions,
++ TrafficPriority,
++ TrafficRequest,
++ TrafficRequestMetadata,
++ TrafficRequestType,
++ TrafficResponseMetadata,
++} from "./traffic-types";
++import { TrafficUsageTracker } from "./traffic-usage-tracker";
++
++/* ============================================================
++ * Traffic Controller
++ * ============================================================
++ */
++
++export type {
++ AdaptiveLimiterConfig,
++ FallbackChainEntry,
++ FallbackPolicy,
++ FallbackPolicyConfig,
++ FallbackPolicyMode,
++ FallbackTarget,
++ PriorityBurstLimits,
++ PriorityWeights,
++ ProviderModelConcurrencyLimit,
++ RateLimitConfig,
++ RateLimitKey,
++ RateLimitStrategyConfig,
++ RateLimitStrategyKind,
++ TenantConcurrencyLimit,
++ TenantUsage,
++ TrafficControllerOptions,
++ TrafficPriority,
++ TrafficRequest,
++ TrafficRequestMetadata,
++ TrafficResponseMetadata,
++ TrafficRequestType,
++};
++
++export { CircuitBreakerOpenError, QueueWaitTimeoutError, RateLimitedUpstreamError };
++
++type TenantQueueState = {
++ order: string[];
++ index: number;
++ queues: Map<string, QueuedRequest[]>;
++};
++
++type RateLimitSnapshot = {
++ limit?: number;
++ remaining?: number;
++ resetAt?: number;
++ nextAllowedAt?: number;
++ retryAfterMs?: number;
++};
++
++type AdaptiveLimiterState = {
++ recent429s: number[];
++ penaltyMs: number;
++ cooldownUntil?: number;
++ last429At?: number;
++};
++
++const DEFAULT_PRIORITY_WEIGHTS: Record<TrafficPriority, number> = {
++ P0: 5,
++ P1: 3,
++ P2: 2,
++};
++
++const DEFAULT_ADAPTIVE_LIMITER: Required<AdaptiveLimiterConfig> = {
++ windowMs: 30_000,
++ threshold: 3,
++ minPenaltyMs: 500,
++ maxPenaltyMs: 10_000,
++ penaltyMultiplier: 2,
++ decayMs: 10_000,
++};
++
++export class TrafficController {
++ /* ---------- Core ---------- */
++
++ private readonly scheduler: Scheduler;
++ private readonly maxConcurrent: number;
++ private readonly rateLimitKeyBuilder: (metadata?: TrafficRequestMetadata) => string;
++ private readonly retryPolicy?: RetryPolicyConfig;
++ private readonly logger: Logger;
++ private readonly trafficLogger: Logger;
++ private readonly controllerLogger: Logger;
++ private readonly concurrencyLimiter: TrafficConcurrencyLimiter;
++
++ private readonly queues: Record<TrafficPriority, TenantQueueState> = {
++ P0: { order: [], index: 0, queues: new Map() },
++ P1: { order: [], index: 0, queues: new Map() },
++ P2: { order: [], index: 0, queues: new Map() },
++ };
++ private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"];
++ private readonly priorityWeights: Record<TrafficPriority, number>;
++ private readonly priorityCredits: Record<TrafficPriority, number>;
++
++ private activeCount = 0;
++ private drainScheduled = false;
++
++ /* ---------- Rate limits ---------- */
++ private readonly rateLimiter: TrafficRateLimiter;
++
++ /* ---------- Circuit breakers ---------- */
++ private readonly circuitBreaker: TrafficCircuitBreaker;
++
++ /* ---------- Usage ---------- */
++ private readonly usageTracker = new TrafficUsageTracker();
++
++ /* ---------- Traffic metadata ---------- */
++ private readonly rateLimitSnapshots = new Map<string, RateLimitSnapshot>();
++
++ /* ---------- Adaptive limiter ---------- */
++ private readonly adaptiveLimiterConfig: Required<AdaptiveLimiterConfig>;
++ private readonly adaptiveLimiterState = new Map<string, AdaptiveLimiterState>();
++
++ constructor(options: TrafficControllerOptions = {}) {
++ this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY;
++ this.scheduler = this.createScheduler();
++ this.rateLimitKeyBuilder = options.rateLimitKeyBuilder ?? buildRateLimitKeyFromMetadata;
++ this.retryPolicy = options.retryPolicy;
++ const priorityOverrides = options.priorityWeights ?? options.priorityBurstLimits;
++ const priorityWeights = {
++ ...DEFAULT_PRIORITY_WEIGHTS,
++ ...(priorityOverrides ?? {}),
++ };
++ this.priorityWeights = {
++ P0: Math.max(0, Math.floor(priorityWeights.P0)),
++ P1: Math.max(0, Math.floor(priorityWeights.P1)),
++ P2: Math.max(0, Math.floor(priorityWeights.P2)),
++ };
++ this.priorityCredits = { ...this.priorityWeights };
++ this.adaptiveLimiterConfig = {
++ ...DEFAULT_ADAPTIVE_LIMITER,
++ ...(options.adaptiveLimiter ?? {}),
++ };
++ this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger);
++ this.trafficLogger = this.logger.child({ subsystem: "traffic" });
++ this.controllerLogger = this.trafficLogger.child({ module: "controller" });
++ const rateLimits = options.rateLimits;
++ const rateLimitStrategy = options.rateLimitStrategy;
++ this.rateLimiter = new TrafficRateLimiter(() => this.scheduleDrain(), {
++ rateLimits,
++ strategyFactory: (key) => {
++ const strategyKind = this.resolveRateLimitStrategy(key, rateLimitStrategy);
++ if (strategyKind === "window") {
++ return new OpenAIWindowRateLimitStrategy(key, rateLimits?.[key]);
++ }
++ return new TokenBucketRateLimitStrategy(key, rateLimits?.[key]);
++ },
++ });
++ this.circuitBreaker = new TrafficCircuitBreaker({
++ fallbackChains: options.fallbackChains,
++ fallbackPolicy: options.fallbackPolicy,
++ buildRateLimitKey: (metadata) => this.buildRateLimitKey(metadata),
++ });
++ this.concurrencyLimiter = new TrafficConcurrencyLimiter({
++ buildProviderModelKey: (metadata) => buildProviderModelKeyFromMetadata(metadata),
++ maxConcurrentPerProviderModel: options.maxConcurrentPerProviderModel,
++ maxConcurrentPerTenant: options.maxConcurrentPerTenant,
++ });
++
++ this.controllerLogger.debug("Initialized TrafficController", {
++ maxConcurrent: this.maxConcurrent,
++ hasFallbackChains: !!options.fallbackChains,
++ hasFallbackPolicy: options.fallbackPolicy !== undefined,
++ hasProviderModelConcurrency: options.maxConcurrentPerProviderModel !== undefined,
++ hasTenantConcurrency: options.maxConcurrentPerTenant !== undefined,
++ hasConfigRateLimits: options.rateLimits !== undefined,
++ hasStrategyOverrides: options.rateLimitStrategy !== undefined,
++ hasRetryPolicy: options.retryPolicy !== undefined,
++ hasPriorityBurstLimits: options.priorityBurstLimits !== undefined,
++ hasPriorityWeights: options.priorityWeights !== undefined,
++ hasAdaptiveLimiter: options.adaptiveLimiter !== undefined,
++ });
++ }
++
++ /* ============================================================
++ * Public API
++ * ============================================================
++ */
++
++ handleText(request: TrafficRequest): Promise<unknown> {
++ this.controllerLogger.trace("handleText called", {
++ tenantId: request.tenantId,
++ provider: request.metadata?.provider,
++ model: request.metadata?.model,
++ priority: request.metadata?.priority,
++ });
++ return this.enqueue("text", request);
++ }
++
++ handleStream(request: TrafficRequest): Promise<unknown> {
++ this.controllerLogger.trace("handleStream called", {
++ tenantId: request.tenantId,
++ provider: request.metadata?.provider,
++ model: request.metadata?.model,
++ priority: request.metadata?.priority,
++ });
++ return this.enqueue("stream", request);
++ }
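++
++ /* A minimal usage sketch (illustrative only; `callModel` is a hypothetical
++ * provider call and the option values are not defaults):
++ *
++ * const controller = new TrafficController({ maxConcurrent: 2 });
++ * const reply = await controller.handleText({
++ * tenantId: "tenant-a",
++ * metadata: { provider: "openai", model: "gpt-4o-mini", priority: "P1" },
++ * estimatedTokens: 64,
++ * execute: async () => callModel(),
++ * extractUsage: () => ({ totalTokens: 64 }),
++ * });
++ */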
++
++ reportStreamSuccess(metadata?: TrafficRequestMetadata): void {
++ this.controllerLogger.debug("Stream reported success", {
++ provider: metadata?.provider,
++ model: metadata?.model,
++ tenantId: metadata?.tenantId,
++ priority: metadata?.priority,
++ });
++ this.circuitBreaker.recordSuccess(metadata, this.trafficLogger);
++ const rateLimitKey = this.buildRateLimitKey(metadata);
++ const adaptiveKey = this.buildAdaptiveKey(
++ metadata,
++ metadata?.tenantId ?? "default",
++ rateLimitKey,
++ );
++ this.recordAdaptiveSuccess(adaptiveKey);
++ }
++
++ reportStreamFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void {
++ const rateLimitKey = this.buildRateLimitKey(metadata);
++ const normalizedRateLimitError = normalizeRateLimitError({
++ error,
++ metadata,
++ tenantId: metadata?.tenantId,
++ key: rateLimitKey,
++ logger: this.trafficLogger,
++ });
++ const errorForHandling = normalizedRateLimitError ?? error;
++
++ this.controllerLogger.warn("Stream reported failure", {
++ provider: metadata?.provider,
++ model: metadata?.model,
++ tenantId: metadata?.tenantId,
++ priority: metadata?.priority,
++ errorName: (error as { name?: unknown } | null)?.name,
++ errorMessage: (error as { message?: unknown } | null)?.message,
++ status: (error as { status?: unknown } | null)?.status,
++ statusCode: (error as { statusCode?: unknown } | null)?.statusCode,
++ });
++ this.circuitBreaker.recordFailure(metadata, errorForHandling, this.trafficLogger);
++ const adaptiveKey = this.buildAdaptiveKey(
++ metadata,
++ metadata?.tenantId ?? "default",
++ rateLimitKey,
++ );
++ if (errorForHandling instanceof RateLimitedUpstreamError) {
++ this.recordAdaptiveRateLimitHit(adaptiveKey, errorForHandling.retryAfterMs);
++ }
++ const traffic = this.buildTrafficResponseMetadataFromMetadata(
++ metadata,
++ rateLimitKey,
++ Date.now(),
++ errorForHandling,
++ );
++ this.attachTrafficMetadata(errorForHandling, traffic);
++ if (errorForHandling !== error) {
++ this.attachTrafficMetadata(error, traffic);
++ }
++ }
++
++ updateRateLimitFromHeaders(
++ metadata: TrafficRequestMetadata | undefined,
++ headers: unknown,
++ ): RateLimitUpdateResult | undefined {
++ const key = this.buildRateLimitKey(metadata);
++ this.controllerLogger.debug("updateRateLimitFromHeaders called", {
++ rateLimitKey: key,
++ provider: metadata?.provider,
++ model: metadata?.model,
++ });
++
++ const update = this.rateLimiter.updateFromHeaders(metadata, headers, key, this.trafficLogger);
++ if (!update) {
++ this.controllerLogger.debug("updateRateLimitFromHeaders skipped (no headers applied)", {
++ rateLimitKey: key,
++ });
++ return undefined;
++ }
++
++ this.controllerLogger.debug("Rate limit headers applied", {
++ rateLimitKey: update.key,
++ limit: update.state.limit,
++ remaining: update.state.remaining,
++ reserved: update.state.reserved,
++ resetAt: update.state.resetAt,
++ nextAllowedAt: update.state.nextAllowedAt,
++ resetRequestsMs: update.headerSnapshot.resetRequestsMs,
++ });
++
++ this.rateLimitSnapshots.set(update.key, {
++ limit: update.state.limit,
++ remaining: update.state.remaining,
++ resetAt: update.state.resetAt,
++ nextAllowedAt: update.state.nextAllowedAt,
++ retryAfterMs: update.headerSnapshot.retryAfterMs,
++ });
++
++ return update;
++ }
++
++ getTenantUsage(tenantId: string): TenantUsage | undefined {
++ this.controllerLogger.trace("getTenantUsage called", { tenantId });
++ return this.usageTracker.getTenantUsage(tenantId);
++ }
++
++ /* ============================================================
++ * Scheduler & Queue
++ * ============================================================
++ */
++
++ private createScheduler(): Scheduler {
++ return typeof queueMicrotask === "function" ? queueMicrotask : (cb) => setTimeout(cb, 0);
++ }
++
++ private enqueue(
++ type: TrafficRequestType,
++ request: TrafficRequest,
++ ): Promise<unknown> {
++ return new Promise((resolve, reject) => {
++ const priority = this.resolvePriority(request.metadata);
++ const tenantId = this.resolveTenantId(request);
++ this.controllerLogger.debug("Enqueue request", {
++ type,
++ tenantId,
++ priority,
++ provider: request.metadata?.provider,
++ model: request.metadata?.model,
++ });
++ this.enqueueItem({
++ type,
++ request,
++ resolve,
++ reject,
++ attempt: 1,
++ priority,
++ tenantId,
++ enqueuedAt: Date.now(),
++ estimatedTokens: request.estimatedTokens,
++ extractUsage: request.extractUsage,
++ });
++ this.scheduleDrain();
++ });
++ }
++
++ private scheduleDrain(): void {
++ if (this.drainScheduled) return;
++ this.drainScheduled = true;
++
++ this.controllerLogger.trace("Drain scheduled");
++ this.scheduler(() => {
++ this.drainScheduled = false;
++ this.controllerLogger.trace("Drain tick");
++ this.drainQueue();
++ });
++ }
++
++ private drainQueue(): void {
++ this.controllerLogger.trace("Drain start", {
++ activeCount: this.activeCount,
++ maxConcurrent: this.maxConcurrent,
++ queuedP0: this.getQueuedCount("P0"),
++ queuedP1: this.getQueuedCount("P1"),
++ queuedP2: this.getQueuedCount("P2"),
++ });
++ while (true) {
++ const decision = this.tryDispatchNext();
++ this.controllerLogger.trace("Dispatch decision", decision);
++ if (decision.kind === "dispatch" || decision.kind === "skip") continue;
++ if (decision.kind === "wait") {
++ if (decision.wakeUpAt) {
++ this.controllerLogger.debug("Rate limit wait; scheduling wakeup", {
++ wakeUpAt: decision.wakeUpAt,
++ inMs: Math.max(0, decision.wakeUpAt - Date.now()),
++ });
++ this.scheduleRateLimitWakeUpAt(decision.wakeUpAt);
++ }
++ return;
++ }
++ return;
++ }
++ }
++
++ /* ============================================================
++ * Dispatch
++ * ============================================================
++ */
++
++ private tryDispatchNext(): DispatchDecision {
++ if (this.activeCount >= this.maxConcurrent) return { kind: "wait" };
++
++ let earliestWakeUpAt: number | undefined;
++
++ const observeWakeUpAt = (candidate?: number): void => {
++ if (candidate === undefined) return;
++ earliestWakeUpAt =
++ earliestWakeUpAt === undefined ? candidate : Math.min(earliestWakeUpAt, candidate);
++ };
++
++ const priorities = this.getPriorityDispatchOrder();
++ for (const priority of priorities) {
++ const state = this.queues[priority];
++ if (state.order.length === 0) continue;
++
++ let attempts = 0;
++ const maxAttempts = state.order.length;
++
++ while (attempts < maxAttempts) {
++ const candidate = this.getNextTenantCandidate(priority);
++ if (!candidate) break;
++ attempts += 1;
++
++ const { item: next, queue, tenantId } = candidate;
++ const now = Date.now();
++ const queueTimeoutAt = this.resolveQueueTimeoutAt(next);
++ const queueTimeoutTriggered = this.handleQueueTimeout(next, queue, 0, now, queueTimeoutAt);
++ if (queueTimeoutTriggered === "rejected") {
++ this.cleanupTenantQueue(priority, tenantId, queue);
++ return { kind: "skip" };
++ }
++ if (queueTimeoutAt !== undefined && now < queueTimeoutAt) {
++ observeWakeUpAt(queueTimeoutAt);
++ }
++ const queueTimeoutExpired = queueTimeoutTriggered === "expired";
++
++ this.controllerLogger.trace("Evaluate next queued request", {
++ priority,
++ tenantId: next.tenantId,
++ type: next.type,
++ attempt: next.attempt,
++ provider: next.request.metadata?.provider,
++ model: next.request.metadata?.model,
++ queueLength: queue.length,
++ });
++
++ const circuit = this.resolveCircuit(next);
++ if (circuit) {
++ this.controllerLogger.trace("Circuit resolution returned decision", {
++ priority,
++ decision: circuit,
++ circuitKey: next.circuitKey,
++ circuitStatus: next.circuitStatus,
++ });
++ if (circuit.kind === "skip") {
++ queue.shift();
++ this.cleanupTenantQueue(priority, tenantId, queue);
++ return { kind: "skip" };
++ }
++ if (circuit.kind === "wait") {
++ if (
++ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "circuit wait")
++ ) {
++ this.cleanupTenantQueue(priority, tenantId, queue);
++ return { kind: "skip" };
++ }
++ next.etaMs =
++ circuit.wakeUpAt !== undefined ? Math.max(0, circuit.wakeUpAt - now) : undefined;
++ observeWakeUpAt(circuit.wakeUpAt);
++ continue;
++ }
++ }
++
++ const concurrency = this.concurrencyLimiter.resolve(next, this.trafficLogger);
++ if (concurrency.kind === "wait") {
++ this.controllerLogger.trace("Concurrency gate blocked request", {
++ priority,
++ tenantId: next.tenantId,
++ provider: next.request.metadata?.provider,
++ model: next.request.metadata?.model,
++ reasons: concurrency.reasons,
++ });
++ if (
++ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "concurrency wait")
++ ) {
++ this.cleanupTenantQueue(priority, tenantId, queue);
++ return { kind: "skip" };
++ }
++ next.etaMs = undefined;
++ continue;
++ }
++
++ const adaptive = this.resolveAdaptiveLimit(next, now);
++ if (adaptive?.kind === "wait") {
++ if (
++ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "adaptive wait")
++ ) {
++ this.cleanupTenantQueue(priority, tenantId, queue);
++ return { kind: "skip" };
++ }
++ next.etaMs =
++ adaptive.wakeUpAt !== undefined ? Math.max(0, adaptive.wakeUpAt - now) : undefined;
++ observeWakeUpAt(adaptive.wakeUpAt);
++ continue;
++ }
++
++ const rateLimit = this.resolveRateLimit(next);
++ if (rateLimit) {
++ this.controllerLogger.trace("Rate limit resolution returned decision", {
++ priority,
++ decision: rateLimit,
++ rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata),
++ });
++ if (rateLimit.kind === "wait") {
++ if (
++ this.rejectIfQueueTimedOut(
++ queueTimeoutExpired,
++ next,
++ queue,
++ 0,
++ now,
++ "rate limit wait",
++ )
++ ) {
++ this.cleanupTenantQueue(priority, tenantId, queue);
++ return { kind: "skip" };
++ }
++ next.etaMs =
++ rateLimit.wakeUpAt !== undefined ? Math.max(0, rateLimit.wakeUpAt - now) : undefined;
++ observeWakeUpAt(rateLimit.wakeUpAt);
++ }
++ continue;
++ }
++
++ if (queueTimeoutExpired) {
++ const timeoutError = this.createQueueTimeoutError(next, now);
++ this.attachTrafficMetadata(
++ timeoutError,
++ this.buildTrafficResponseMetadata(
++ next,
++ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata),
++ now,
++ timeoutError,
++ ),
++ );
++ this.controllerLogger.warn("Queue wait timed out before dispatch", {
++ tenantId: next.tenantId,
++ waitedMs: timeoutError.waitedMs,
++ maxQueueWaitMs: timeoutError.maxQueueWaitMs,
++ deadlineAt: timeoutError.deadlineAt,
++ provider: next.request.metadata?.provider,
++ model: next.request.metadata?.model,
++ rateLimitKey: timeoutError.rateLimitKey,
++ });
++ queue.shift();
++ this.cleanupTenantQueue(priority, tenantId, queue);
++ next.reject(timeoutError);
++ return { kind: "skip" };
++ }
++
++ this.startRequest(next, queue, tenantId);
++ return { kind: "dispatch" };
++ }
++ }
++
++ return earliestWakeUpAt !== undefined
++ ? { kind: "wait", wakeUpAt: earliestWakeUpAt }
++ : { kind: "wait" };
++ }
++
++ private startRequest(item: QueuedRequest, queue: QueuedRequest[], tenantId: string): void {
++ this.controllerLogger.debug("Start request", {
++ priority: item.priority,
++ type: item.type,
++ tenantId: item.tenantId,
++ attempt: item.attempt,
++ provider: item.request.metadata?.provider,
++ model: item.request.metadata?.model,
++ });
++ item.dispatchedAt = Date.now();
++ queue.shift();
++ this.cleanupTenantQueue(item.priority, tenantId, queue);
++ this.recordPriorityDispatch(item.priority);
++ this.activeCount++;
++ this.concurrencyLimiter.acquire(item, this.trafficLogger);
++ this.rateLimiter.notifyDispatch(item.rateLimitKey, this.trafficLogger);
++ this.circuitBreaker.markTrial(item, this.trafficLogger);
++ void this.executeRequest(item);
++ }
++
++ /* ============================================================
++ * Execution
++ * ============================================================
++ */
++
++ private async executeRequest(item: QueuedRequest): Promise<void> {
++ const startedAt = Date.now();
++ try {
++ this.controllerLogger.debug("Execute request", {
++ priority: item.priority,
++ type: item.type,
++ tenantId: item.tenantId,
++ attempt: item.attempt,
++ provider: item.request.metadata?.provider,
++ model: item.request.metadata?.model,
++ rateLimitKey: item.rateLimitKey,
++ circuitKey: item.circuitKey,
++ circuitStatus: item.circuitStatus,
++ activeCount: this.activeCount,
++ });
++ const result = await item.request.execute();
++ const rateLimitKey = item.rateLimitKey ?? this.buildRateLimitKey(item.request.metadata);
++ const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey);
++ this.controllerLogger.debug("Request succeeded", {
++ tenantId: item.tenantId,
++ attempt: item.attempt,
++ provider: item.request.metadata?.provider,
++ model: item.request.metadata?.model,
++ elapsedMs: Date.now() - startedAt,
++ });
++ if (item.type === "stream") {
++ this.controllerLogger.trace("Stream started successfully", {
++ tenantId: item.tenantId,
++ provider: item.request.metadata?.provider,
++ model: item.request.metadata?.model,
++ });
++ } else {
++ this.circuitBreaker.recordSuccess(item.request.metadata, this.trafficLogger);
++ }
++ const usage = this.usageTracker.recordUsage(item, result, this.trafficLogger);
++ this.rateLimiter.recordUsage(rateLimitKey, usage, this.trafficLogger, item.reservedTokens);
++ this.recordAdaptiveSuccess(adaptiveKey);
++ this.attachTrafficMetadata(
++ result,
++ this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now()),
++ );
++ item.resolve(result);
++ } catch (error) {
++ const rateLimitKey = item.rateLimitKey ?? this.buildRateLimitKey(item.request.metadata);
++ const normalizedRateLimitError = normalizeRateLimitError({
++ error,
++ metadata: item.request.metadata,
++ tenantId: item.tenantId,
++ key: rateLimitKey,
++ logger: this.trafficLogger,
++ });
++ const errorForHandling = normalizedRateLimitError ?? error;
++ const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey);
++ if (typeof item.reservedTokens === "number" && item.reservedTokens > 0) {
++ this.rateLimiter.recordUsage(
++ rateLimitKey,
++ { totalTokens: 0 },
++ this.trafficLogger,
++ item.reservedTokens,
++ );
++ }
++ if (errorForHandling instanceof RateLimitedUpstreamError) {
++ this.recordAdaptiveRateLimitHit(adaptiveKey, errorForHandling.retryAfterMs);
++ }
++
++ this.controllerLogger.warn("Request failed", {
++ tenantId: item.tenantId,
++ attempt: item.attempt,
++ provider: item.request.metadata?.provider,
++ model: item.request.metadata?.model,
++ elapsedMs: Date.now() - startedAt,
++ errorName: (error as { name?: unknown } | null)?.name,
++ errorMessage: (error as { message?: unknown } | null)?.message,
++ status: (error as { status?: unknown } | null)?.status,
++ statusCode: (error as { statusCode?: unknown } | null)?.statusCode,
++ });
++ this.circuitBreaker.recordFailure(
++ item.request.metadata,
++ errorForHandling,
++ this.trafficLogger,
++ );
++ this.attachTrafficMetadata(
++ errorForHandling,
++ this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now(), errorForHandling),
++ );
++
++ const retry = buildRetryPlanWithPolicy(
++ {
++ error: errorForHandling,
++ attempt: item.attempt,
++ metadata: item.request.metadata,
++ key: rateLimitKey,
++ logger: this.trafficLogger,
++ },
++ this.retryPolicy,
++ );
++ if (retry) {
++ if (!this.canRetryWithinDeadline(item, retry.delayMs)) {
++ this.controllerLogger.debug("Retry skipped; deadline exceeded", {
++ tenantId: item.tenantId,
++ attempt: item.attempt,
++ provider: item.request.metadata?.provider,
++ model: item.request.metadata?.model,
++ deadlineAt: item.request.deadlineAt,
++ delayMs: retry.delayMs,
++ });
++ item.reject(errorForHandling);
++ } else {
++ this.controllerLogger.debug("Retrying request", {
++ tenantId: item.tenantId,
++ attempt: item.attempt,
++ nextAttempt: item.attempt + 1,
++ reason: retry.reason,
++ delayMs: retry.delayMs,
++ provider: item.request.metadata?.provider,
++ model: item.request.metadata?.model,
++ });
++ this.scheduleRetry(item, retry);
++ }
++ } else {
++ this.controllerLogger.debug("No retry plan; rejecting request", {
++ tenantId: item.tenantId,
++ attempt: item.attempt,
++ provider: item.request.metadata?.provider,
++ model: item.request.metadata?.model,
++ });
++ item.reject(errorForHandling);
++ }
++ } finally {
++ this.rateLimiter.releaseReservation(item.rateLimitKey, this.trafficLogger);
++ this.concurrencyLimiter.release(item, this.trafficLogger);
++ this.activeCount = Math.max(0, this.activeCount - 1);
++ this.controllerLogger.trace("Request finished; slot released", {
++ tenantId: item.tenantId,
++ activeCount: this.activeCount,
++ maxConcurrent: this.maxConcurrent,
++ });
++ this.scheduleDrain();
++ }
++ }
++
++ /* ============================================================
++ * Retry logic
++ * ============================================================
++ */
++
++ private scheduleRetry(item: QueuedRequest, plan: RetryPlan): void {
++ this.controllerLogger.debug("Schedule retry", {
++ tenantId: item.tenantId,
++ priority: item.priority,
++ currentAttempt: item.attempt,
++ nextAttempt: item.attempt + 1,
++ reason: plan.reason,
++ delayMs: plan.delayMs,
++ });
++ setTimeout(() => {
++ this.controllerLogger.debug("Retry timer fired", {
++ tenantId: item.tenantId,
++ priority: item.priority,
++ nextAttempt: item.attempt + 1,
++ });
++ this.enqueueItem({
++ ...item,
++ attempt: item.attempt + 1,
++ enqueuedAt: Date.now(),
++ dispatchedAt: undefined,
++ reservedTokens: undefined,
++ tenantConcurrencyKey: undefined,
++ providerModelConcurrencyKey: undefined,
++ rateLimitKey: undefined,
++ etaMs: undefined,
++ circuitKey: undefined,
++ circuitStatus: undefined,
++ });
++ this.scheduleDrain();
++ }, plan.delayMs);
++ }
++
++ private canRetryWithinDeadline(item: QueuedRequest, delayMs: number): boolean {
++ const deadlineAt = item.request.deadlineAt;
++ if (!deadlineAt) return true;
++ const nextAttemptAt = Date.now() + delayMs;
++ return nextAttemptAt <= deadlineAt;
++ }
++
++ /* ============================================================
++ * Rate limiting (verbatim logic)
++ * ============================================================
++ */
++
++ private resolveRateLimit(next: QueuedRequest): DispatchDecision | null {
++ const key = this.buildRateLimitKey(next.request.metadata);
++ return this.rateLimiter.resolve(next, key, this.trafficLogger);
++ }
++
++ private scheduleRateLimitWakeUpAt(wakeUpAt: number): void {
++ this.rateLimiter.scheduleWakeUpAt(wakeUpAt, this.trafficLogger);
++ }
++
++ /* ============================================================
++ * Circuit breakers (verbatim logic, linearized)
++ * ============================================================
++ */
++
++ private resolveCircuit(next: QueuedRequest): DispatchDecision | null {
++ return this.circuitBreaker.resolve(next, this.trafficLogger);
++ }
++
++ /* ============================================================
++ * Utilities
++ * ============================================================
++ */
++
++ private resolveQueueTimeoutAt(next: QueuedRequest): number | undefined {
++ if (next.queueTimeoutDisabled) {
++ return next.request.deadlineAt;
++ }
++ const maxQueueWaitMs = next.request.maxQueueWaitMs;
++ const normalizedMaxWait =
++ typeof maxQueueWaitMs === "number" && Number.isFinite(maxQueueWaitMs)
++ ? Math.max(0, maxQueueWaitMs)
++ : undefined;
++ const timeoutAt =
++ normalizedMaxWait !== undefined ? next.enqueuedAt + normalizedMaxWait : undefined;
++ const deadlineAt = next.request.deadlineAt;
++ if (timeoutAt === undefined) return deadlineAt;
++ if (deadlineAt === undefined) return timeoutAt;
++ return Math.min(timeoutAt, deadlineAt);
++ }
++
++ private handleQueueTimeout(
++ next: QueuedRequest,
++ queue: QueuedRequest[],
++ index: number,
++ now: number,
++ queueTimeoutAt?: number,
++ ): "none" | "expired" | "rejected" {
++ if (queueTimeoutAt === undefined) return "none";
++ if (now < queueTimeoutAt) return "none";
++
++ const fallbackApplied = this.circuitBreaker.tryFallback(
++ next,
++ "queue-timeout",
++ this.trafficLogger,
++ );
++ if (fallbackApplied) {
++ return "none";
++ }
++
++ const timeoutError = this.createQueueTimeoutError(next, now);
++ this.attachTrafficMetadata(
++ timeoutError,
++ this.buildTrafficResponseMetadata(
++ next,
++ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata),
++ now,
++ timeoutError,
++ ),
++ );
++ this.controllerLogger.warn("Queue wait timed out; rejecting request", {
++ tenantId: next.tenantId,
++ waitedMs: timeoutError.waitedMs,
++ maxQueueWaitMs: timeoutError.maxQueueWaitMs,
++ deadlineAt: timeoutError.deadlineAt,
++ provider: next.request.metadata?.provider,
++ model: next.request.metadata?.model,
++ rateLimitKey: timeoutError.rateLimitKey,
++ });
++ queue.splice(index, 1);
++ next.reject(timeoutError);
++ return "rejected";
++ }
++
++ private rejectIfQueueTimedOut(
++ queueTimeoutExpired: boolean,
++ next: QueuedRequest,
++ queue: QueuedRequest[],
++ index: number,
++ now: number,
++ reason: string,
++ ): boolean {
++ if (!queueTimeoutExpired) return false;
++ const timeoutError = this.createQueueTimeoutError(next, now);
++ this.attachTrafficMetadata(
++ timeoutError,
++ this.buildTrafficResponseMetadata(
++ next,
++ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata),
++ now,
++ timeoutError,
++ ),
++ );
++ this.controllerLogger.warn("Queue wait timed out during gate wait", {
++ tenantId: next.tenantId,
++ waitedMs: timeoutError.waitedMs,
++ maxQueueWaitMs: timeoutError.maxQueueWaitMs,
++ deadlineAt: timeoutError.deadlineAt,
++ provider: next.request.metadata?.provider,
++ model: next.request.metadata?.model,
++ rateLimitKey: timeoutError.rateLimitKey,
++ reason,
++ });
++ queue.splice(index, 1);
++ next.reject(timeoutError);
++ return true;
++ }
++
++ private createQueueTimeoutError(next: QueuedRequest, now: number): QueueWaitTimeoutError {
++ const waitedMs = Math.max(0, now - next.enqueuedAt);
++ return new QueueWaitTimeoutError({
++ waitedMs,
++ maxQueueWaitMs: next.request.maxQueueWaitMs,
++ deadlineAt: next.request.deadlineAt,
++ metadata: next.request.metadata,
++ rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata),
++ });
++ }
++
++ private resolveTenantId(request: TrafficRequest): string {
++ return request.tenantId ?? request.metadata?.tenantId ?? "default";
++ }
++
++ private enqueueItem(item: QueuedRequest): void {
++ const state = this.queues[item.priority];
++ const tenantId = item.tenantId;
++ let queue = state.queues.get(tenantId);
++ if (!queue) {
++ queue = [];
++ state.queues.set(tenantId, queue);
++ state.order.push(tenantId);
++ }
++ queue.push(item);
++ }
++
++ private getQueuedCount(priority: TrafficPriority): number {
++ const state = this.queues[priority];
++ let total = 0;
++ for (const queue of state.queues.values()) {
++ total += queue.length;
++ }
++ return total;
++ }
++
++ private refillPriorityCredits(): void {
++ this.priorityCredits.P0 = this.priorityWeights.P0;
++ this.priorityCredits.P1 = this.priorityWeights.P1;
++ this.priorityCredits.P2 = this.priorityWeights.P2;
++ }
++
++ private recordPriorityDispatch(priority: TrafficPriority): void {
++ if (this.priorityCredits[priority] > 0) {
++ this.priorityCredits[priority] -= 1;
++ }
++ }
++
++ private getPriorityDispatchOrder(): TrafficPriority[] {
++ const prioritiesWithWork = this.priorityOrder.filter(
++ (priority) => this.getQueuedCount(priority) > 0,
++ );
++ if (prioritiesWithWork.length === 0) return [];
++
++ let available = prioritiesWithWork.filter((priority) => this.priorityCredits[priority] > 0);
++ if (available.length === 0) {
++ this.refillPriorityCredits();
++ available = prioritiesWithWork.filter((priority) => this.priorityCredits[priority] > 0);
++ }
++
++ return available.length === 0 ? prioritiesWithWork : available;
++ }
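++
++ // Worked example with the default weights (P0:5, P1:3, P2:2): one full credit
++ // cycle admits up to five P0, three P1, and two P2 dispatches before
++ // refillPriorityCredits() resets the counters, so P1/P2 still make progress
++ // under sustained P0 pressure instead of starving.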
++
++ private getNextTenantCandidate(
++ priority: TrafficPriority,
++ ): { item: QueuedRequest; queue: QueuedRequest[]; tenantId: string } | undefined {
++ const state = this.queues[priority];
++ if (state.order.length === 0) return undefined;
++ const maxAttempts = state.order.length;
++ let attempts = 0;
++
++ while (attempts < maxAttempts && state.order.length > 0) {
++ const index = state.index % state.order.length;
++ const tenantId = state.order[index];
++ const queue = state.queues.get(tenantId);
++ attempts += 1;
++
++ if (!queue || queue.length === 0) {
++ this.removeTenantQueue(priority, tenantId);
++ continue;
++ }
++
++ state.index = (index + 1) % state.order.length;
++ return { item: queue[0], queue, tenantId };
++ }
++
++ return undefined;
++ }
++
++ private cleanupTenantQueue(
++ priority: TrafficPriority,
++ tenantId: string,
++ queue: QueuedRequest[],
++ ): void {
++ if (queue.length > 0) return;
++ this.removeTenantQueue(priority, tenantId);
++ }
++
++ private removeTenantQueue(priority: TrafficPriority, tenantId: string): void {
++ const state = this.queues[priority];
++ state.queues.delete(tenantId);
++ const index = state.order.indexOf(tenantId);
++ if (index === -1) return;
++ state.order.splice(index, 1);
++ if (state.order.length === 0) {
++ state.index = 0;
++ return;
++ }
++ if (state.index > index) {
++ state.index -= 1;
++ }
++ if (state.index >= state.order.length) {
++ state.index = 0;
++ }
++ }
++
++ private resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority {
++ return metadata?.priority ?? "P1";
++ }
++
++ private buildRateLimitKey(metadata?: TrafficRequestMetadata): string {
++ return this.rateLimitKeyBuilder(metadata);
++ }
++
++ private resolveAdaptiveLimit(next: QueuedRequest, now: number): DispatchDecision | null {
++ const rateLimitKey = next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata);
++ const adaptiveKey = this.buildAdaptiveKey(next.request.metadata, next.tenantId, rateLimitKey);
++ const state = this.adaptiveLimiterState.get(adaptiveKey);
++ if (!state) return null;
++
++ this.applyAdaptiveDecay(state, now);
++ if (state.cooldownUntil !== undefined && now < state.cooldownUntil) {
++ return { kind: "wait", wakeUpAt: state.cooldownUntil };
++ }
++
++ return null;
++ }
++
++ private recordAdaptiveRateLimitHit(key: string, retryAfterMs?: number): void {
++ const state = this.getAdaptiveState(key);
++ const now = Date.now();
++ const { windowMs, threshold, minPenaltyMs, maxPenaltyMs, penaltyMultiplier } =
++ this.adaptiveLimiterConfig;
++
++ state.last429At = now;
++ state.recent429s = state.recent429s.filter((timestamp) => now - timestamp <= windowMs);
++ state.recent429s.push(now);
++
++ if (state.recent429s.length < threshold) {
++ return;
++ }
++
++ const basePenalty = state.penaltyMs > 0 ? state.penaltyMs : minPenaltyMs;
++ const nextPenalty = Math.min(
++ maxPenaltyMs,
++ Math.max(minPenaltyMs, Math.round(basePenalty * penaltyMultiplier)),
++ );
++ state.penaltyMs = nextPenalty;
++ const retryPenalty = typeof retryAfterMs === "number" ? retryAfterMs : 0;
++ const cooldownMs = Math.max(nextPenalty, retryPenalty);
++ state.cooldownUntil = now + cooldownMs;
++ }
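++
++ // Worked example with the default limiter config: the third 429 inside the
++ // 30s window sets the penalty to min(10_000, max(500, 500 * 2)) = 1_000ms,
++ // the next trip doubles it to 2_000ms, and so on up to the 10_000ms cap; a
++ // Retry-After hint larger than the penalty extends the cooldown further.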
++
++ private recordAdaptiveSuccess(key: string): void {
++ const state = this.adaptiveLimiterState.get(key);
++ if (!state) return;
++
++ const now = Date.now();
++ this.applyAdaptiveDecay(state, now);
++ if (state.penaltyMs === 0) {
++ state.cooldownUntil = undefined;
++ state.recent429s = [];
++ state.last429At = undefined;
++ }
++ }
++
++ private applyAdaptiveDecay(state: AdaptiveLimiterState, now: number): void {
++ const { decayMs, penaltyMultiplier } = this.adaptiveLimiterConfig;
++ if (state.last429At && now - state.last429At < decayMs) {
++ return;
++ }
++
++ if (state.penaltyMs > 0) {
++ state.penaltyMs = Math.max(0, Math.floor(state.penaltyMs / penaltyMultiplier));
++ }
++ }
++
++ private getAdaptiveState(key: string): AdaptiveLimiterState {
++ const existing = this.adaptiveLimiterState.get(key);
++ if (existing) return existing;
++ const created: AdaptiveLimiterState = {
++ recent429s: [],
++ penaltyMs: 0,
++ };
++ this.adaptiveLimiterState.set(key, created);
++ return created;
++ }
++
++ private buildAdaptiveKey(
++ metadata: TrafficRequestMetadata | undefined,
++ tenantId: string,
++ rateLimitKey: string,
++ ): string {
++ if (rateLimitKey.includes("tenant=")) {
++ return rateLimitKey;
++ }
++ const tenant = metadata?.tenantId ?? tenantId ?? "default";
++ return `${rateLimitKey}::tenant=${encodeURIComponent(tenant)}`;
++ }
++
++ private buildTrafficResponseMetadata(
++ item: QueuedRequest,
++ rateLimitKey: string,
++ now: number,
++ error?: unknown,
++ ): TrafficResponseMetadata {
++ const snapshot = this.rateLimitSnapshots.get(rateLimitKey);
++ const retryAfterMs = this.resolveRetryAfterMs(error, snapshot);
++ const queuedForMs =
++ item.dispatchedAt !== undefined ? item.dispatchedAt - item.enqueuedAt : now - item.enqueuedAt;
++ const queueEtaMs = item.etaMs ?? Math.max(0, queuedForMs);
++
++ return {
++ rateLimitKey,
++ retryAfterMs,
++ rateLimitRemaining: snapshot?.remaining,
++ rateLimitResetAt: snapshot?.resetAt,
++ rateLimitResetInMs:
++ snapshot?.resetAt !== undefined ? Math.max(0, snapshot.resetAt - now) : undefined,
++ queueEtaMs,
++ tenantId: item.tenantId,
++ priority: item.request.metadata?.priority,
++ taskType: item.request.metadata?.taskType,
++ };
++ }
++
++ private buildTrafficResponseMetadataFromMetadata(
++ metadata: TrafficRequestMetadata | undefined,
++ rateLimitKey: string,
++ now: number,
++ error?: unknown,
++ ): TrafficResponseMetadata {
++ const snapshot = this.rateLimitSnapshots.get(rateLimitKey);
++ const retryAfterMs = this.resolveRetryAfterMs(error, snapshot);
++
++ return {
++ rateLimitKey,
++ retryAfterMs,
++ rateLimitRemaining: snapshot?.remaining,
++ rateLimitResetAt: snapshot?.resetAt,
++ rateLimitResetInMs:
++ snapshot?.resetAt !== undefined ? Math.max(0, snapshot.resetAt - now) : undefined,
++ tenantId: metadata?.tenantId,
++ priority: metadata?.priority,
++ taskType: metadata?.taskType,
++ };
++ }
++
++ private attachTrafficMetadata(target: unknown, info: TrafficResponseMetadata): void {
++ if (!target || typeof target !== "object") return;
++ (target as Record<string, unknown>).traffic = info;
++ }
++
++ private resolveRetryAfterMs(
++ error: unknown | undefined,
++ snapshot?: RateLimitSnapshot,
++ ): number | undefined {
++ if (error && typeof error === "object" && "retryAfterMs" in error) {
++ const candidate = (error as { retryAfterMs?: unknown }).retryAfterMs;
++ if (typeof candidate === "number" && Number.isFinite(candidate)) {
++ return candidate;
++ }
++ }
++ if (snapshot?.retryAfterMs !== undefined) {
++ return snapshot.retryAfterMs;
++ }
++ return undefined;
++ }
++
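++ // Strategy resolution order (per the logic below): a per-key model override
++ // (config.models[key]) wins, then a per-provider override, then an "openai*"
++ // provider-prefix heuristic that picks the header-driven window strategy;
++ // everything else falls back to the token bucket. For example, a key like
++ // "openai.responses::gpt-4o-mini" resolves to "window".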
++ private resolveRateLimitStrategy(
++ key: string,
++ config?: RateLimitStrategyConfig,
++ ): RateLimitStrategyKind {
++ const modelOverride = config?.models?.[key];
++ if (modelOverride) return modelOverride;
++ const provider = key.split("::")[0] ?? "";
++ const providerOverride = config?.providers?.[provider];
++ if (providerOverride) return providerOverride;
++ if (provider.startsWith("openai")) return "window";
++ return "token-bucket";
++ }
++}
++
++/* ============================================================
++ * Error + Singleton
++ * ============================================================
++ */
++
++let singletonController: TrafficController | undefined;
++
++export function getTrafficController(options?: TrafficControllerOptions): TrafficController {
++ if (!singletonController) {
++ singletonController = new TrafficController(options);
++ }
++ return singletonController;
++}
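++
++// Note: options are only applied on the first call; later calls return the
++// existing singleton unchanged. Sketch:
++// const controller = getTrafficController({ maxConcurrent: 4 });
++// const same = getTrafficController(); // same instance; options ignored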
++
++function buildRateLimitKeyFromMetadata(metadata?: TrafficRequestMetadata): string {
++ const provider = metadata?.provider ?? "default-provider";
++ const model = metadata?.model ?? "default-model";
++ const parts = [provider, model];
++
++ // SOP: Add new metadata fields in one place with a stable label and ordering.
++ // 1) Add the optional field to TrafficRequestMetadata.
++ // 2) Add it here with a stable label so keys stay predictable.
++ // Example: { label: "org", value: metadata?.orgId }
++ const optionalFields: Array<{ label: string; value?: string }> = [
++ { label: "apiKey", value: metadata?.apiKeyId },
++ { label: "region", value: metadata?.region },
++ { label: "endpoint", value: metadata?.endpoint },
++ // Intentionally exclude tenantId to enforce provider/model limits across tenants.
++ // Use rateLimitKeyBuilder to include tenant for per-tenant rate limits.
++ { label: "tenantTier", value: metadata?.tenantTier },
++ { label: "taskType", value: metadata?.taskType },
++ ];
++
++ for (const field of optionalFields) {
++ if (!field.value) continue;
++ parts.push(`${field.label}=${encodeURIComponent(field.value)}`);
++ }
++
++ return parts.join("::");
++}
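++
++// Illustrative keys from the default builder:
++// { provider: "openai", model: "gpt-4o" } -> "openai::gpt-4o"
++// { provider: "openai", model: "gpt-4o", region: "eu-west-1", taskType: "chat" }
++// -> "openai::gpt-4o::region=eu-west-1::taskType=chat"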
++
++function buildProviderModelKeyFromMetadata(metadata?: TrafficRequestMetadata): string {
++ const provider = metadata?.provider ?? "default-provider";
++ const model = metadata?.model ?? "default-model";
++ return `${provider}::${model}`;
++}
+diff --git a/packages/core/src/traffic/traffic-error-utils.ts b/packages/core/src/traffic/traffic-error-utils.ts
+new file mode 100644
+index 00000000..4cbb98b5
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-error-utils.ts
+@@ -0,0 +1,148 @@
++import type { Logger } from "../logger";
++
++function readObjectProperty(value: unknown, key: string): unknown {
++ if (!value || typeof value !== "object") return undefined;
++ return (value as Record<string, unknown>)[key];
++}
++
++export function findHeaders(value: unknown): unknown[] {
++ const candidates: unknown[] = [
++ readObjectProperty(value, "headers"),
++ readObjectProperty(readObjectProperty(value, "response"), "headers"),
++ readObjectProperty(readObjectProperty(value, "cause"), "headers"),
++ readObjectProperty(
++ readObjectProperty(readObjectProperty(value, "cause"), "response"),
++ "headers",
++ ),
++ ];
++
++ return candidates.filter((candidate) => candidate !== undefined && candidate !== null);
++}
++
++export function readHeaderValue(headers: unknown, name: string): string | undefined {
++ if (!headers) return undefined;
++
++ if (typeof (headers as { get?: unknown }).get === "function") {
++ const v = (headers as { get: (name: string) => unknown }).get(name);
++ return v === null || v === undefined ? undefined : String(v);
++ }
++
++ if (typeof headers !== "object") return undefined;
++
++ const entries = Object.entries(headers as Record<string, unknown>);
++ const target = name.toLowerCase();
++ const match = entries.find(([k]) => String(k).toLowerCase() === target);
++ if (!match) return undefined;
++
++ const value = match[1];
++ if (Array.isArray(value)) {
++ const first = value[0];
++ return first === null || first === undefined ? undefined : String(first);
++ }
++ return value === null || value === undefined ? undefined : String(value);
++}
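++
++// Lookup is case-insensitive for plain header records and also supports
++// Headers-like objects exposing .get(); for example (illustrative):
++// readHeaderValue({ "Retry-After": ["3"] }, "retry-after") === "3"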
++
++export function parseRetryAfterMs(value: string, nowMs: number = Date.now()): number | undefined {
++ const raw = value.trim();
++ if (!raw) return undefined;
++
++ const seconds = Number(raw);
++ if (Number.isFinite(seconds)) {
++ return Math.max(0, Math.round(seconds * 1000));
++ }
++
++ const parsedAt = Date.parse(raw);
++ if (Number.isFinite(parsedAt)) {
++ return Math.max(0, parsedAt - nowMs);
++ }
++
++ return undefined;
++}
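++
++// Examples: parseRetryAfterMs("2") === 2_000 (delta-seconds form), while an
++// HTTP-date value resolves to its distance from `nowMs`, clamped to 0 once
++// the date has passed.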
++
++export function coerceStatus(value: unknown): number | undefined {
++ const n = Number(value);
++ return Number.isFinite(n) ? n : undefined;
++}
++
++export function extractStatusCode(error: unknown, logger?: Logger): number | undefined {
++ const status =
++ coerceStatus(readObjectProperty(error, "status")) ??
++ coerceStatus(readObjectProperty(error, "statusCode")) ??
++ coerceStatus(readObjectProperty(error, "httpStatus")) ??
++ coerceStatus(readObjectProperty(readObjectProperty(error, "response"), "status")) ??
++ coerceStatus(readObjectProperty(readObjectProperty(error, "cause"), "status"));
++
++ logger?.trace?.("Extracted status code", {
++ status,
++ hasStatus: readObjectProperty(error, "status") !== undefined,
++ hasStatusCode: readObjectProperty(error, "statusCode") !== undefined,
++ hasHttpStatus: readObjectProperty(error, "httpStatus") !== undefined,
++ hasResponseStatus:
++ readObjectProperty(readObjectProperty(error, "response"), "status") !== undefined,
++ hasCauseStatus: readObjectProperty(readObjectProperty(error, "cause"), "status") !== undefined,
++ });
++
++ return status;
++}
++
++export function extractRetryAfterMs(error: unknown, logger?: Logger): number | undefined {
++ const retryAfterLogger = logger?.child({ module: "retry-after" });
++ const candidates = findHeaders(error);
++
++ for (const headers of candidates) {
++ const raw = readHeaderValue(headers, "retry-after");
++ if (!raw) continue;
++ const parsed = parseRetryAfterMs(raw);
++ retryAfterLogger?.trace?.("Parsed Retry-After header", { raw, parsedMs: parsed });
++ if (parsed !== undefined) return parsed;
++ }
++
++ retryAfterLogger?.trace?.("Retry-After header missing or unparsable");
++ return undefined;
++}
++
++export function isTimeoutError(error: unknown, logger?: Logger): boolean {
++ const candidates: unknown[] = [error];
++
++ const cause = readObjectProperty(error, "cause");
++ if (cause) {
++ candidates.push(cause);
++ const nestedCause = readObjectProperty(cause, "cause");
++ if (nestedCause) candidates.push(nestedCause);
++ }
++
++ for (const candidate of candidates) {
++ const code = readObjectProperty(candidate, "code");
++ const name = readObjectProperty(candidate, "name");
++ const message = readObjectProperty(candidate, "message");
++
++ const codeText = String(code ?? "").toLowerCase();
++ const nameText = String(name ?? "").toLowerCase();
++ const messageText = String(message ?? "").toLowerCase();
++
++ const isTimeout =
++ codeText.includes("timeout") ||
++ codeText.includes("timedout") ||
++ nameText.includes("timeout") ||
++ nameText.includes("timedout") ||
++ messageText.includes("timeout") ||
++ messageText.includes("timedout") ||
++ messageText.includes("timed out");
++
++ logger?.trace?.("Checked timeout error", {
++ isTimeout,
++ code,
++ name,
++ messagePreview: typeof message === "string" ? message.slice(0, 160) : message,
++ hasCause: candidate !== error,
++ });
++
++ if (isTimeout) return true;
++ }
++
++ return false;
++}
++
++export function isPromiseLike(value: unknown): value is PromiseLike<unknown> {
++ return !!value && typeof (value as { then?: unknown }).then === "function";
++}
+diff --git a/packages/core/src/traffic/traffic-errors.ts b/packages/core/src/traffic/traffic-errors.ts
+new file mode 100644
+index 00000000..4943c89f
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-errors.ts
+@@ -0,0 +1,141 @@
++import type { Logger } from "../logger";
++import { extractRetryAfterMs, extractStatusCode } from "./traffic-error-utils";
++import type { TrafficRequestMetadata } from "./traffic-types";
++
++export type RateLimitErrorOptions = {
++ metadata?: TrafficRequestMetadata;
++ retryAfterMs?: number;
++ tenantId?: string;
++ key?: string;
++};
++
++export class CircuitBreakerOpenError extends Error {
++ readonly retryAfterMs?: number;
++ readonly metadata?: TrafficRequestMetadata;
++
++ constructor(message: string, metadata?: TrafficRequestMetadata, retryAfterMs?: number) {
++ super(message);
++ this.name = "CircuitBreakerOpenError";
++ this.metadata = metadata;
++ this.retryAfterMs = retryAfterMs;
++ }
++}
++
++export class QueueWaitTimeoutError extends Error {
++ readonly waitedMs: number;
++ readonly maxQueueWaitMs?: number;
++ readonly deadlineAt?: number;
++ readonly metadata?: TrafficRequestMetadata;
++ readonly rateLimitKey?: string;
++
++ constructor(options: {
++ waitedMs: number;
++ maxQueueWaitMs?: number;
++ deadlineAt?: number;
++ metadata?: TrafficRequestMetadata;
++ rateLimitKey?: string;
++ }) {
++ super("Queue wait time exceeded");
++ this.name = "QueueWaitTimeoutError";
++ this.waitedMs = options.waitedMs;
++ this.maxQueueWaitMs = options.maxQueueWaitMs;
++ this.deadlineAt = options.deadlineAt;
++ this.metadata = options.metadata;
++ this.rateLimitKey = options.rateLimitKey;
++ }
++}
++
++export class RateLimitedUpstreamError extends Error {
++ readonly status = 429;
++ readonly retryAfterMs?: number;
++ readonly metadata?: TrafficRequestMetadata;
++ readonly provider?: string;
++ readonly model?: string;
++ readonly tenantId?: string;
++ readonly key?: string;
++
++ constructor(
++ message: string,
++ metadata?: TrafficRequestMetadata,
++ retryAfterMs?: number,
++ options?: { tenantId?: string; key?: string },
++ );
++ constructor(message: string, options?: RateLimitErrorOptions);
++ constructor(
++ message: string,
++ metadataOrOptions?: TrafficRequestMetadata | RateLimitErrorOptions,
++ retryAfterMs?: number,
++ legacyOptions?: { tenantId?: string; key?: string },
++ ) {
++ super(message);
++ this.name = "RateLimitedUpstreamError";
++ const isOptions =
++ metadataOrOptions &&
++ (Object.prototype.hasOwnProperty.call(metadataOrOptions, "metadata") ||
++ Object.prototype.hasOwnProperty.call(metadataOrOptions, "retryAfterMs") ||
++ Object.prototype.hasOwnProperty.call(metadataOrOptions, "key"));
++
++ const metadata = isOptions
++ ? (metadataOrOptions as RateLimitErrorOptions).metadata
++ : (metadataOrOptions as TrafficRequestMetadata | undefined);
++ const retryAfter = isOptions
++ ? (metadataOrOptions as RateLimitErrorOptions).retryAfterMs
++ : retryAfterMs;
++ const tenantId = isOptions
++ ? (metadataOrOptions as RateLimitErrorOptions).tenantId
++ : legacyOptions?.tenantId;
++ const key = isOptions ? (metadataOrOptions as RateLimitErrorOptions).key : legacyOptions?.key;
++
++ this.metadata = metadata;
++ this.retryAfterMs = retryAfter;
++ this.provider = metadata?.provider;
++ this.model = metadata?.model;
++ this.tenantId = tenantId ?? metadata?.tenantId;
++ this.key = key;
++ }
++}
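++
++// Both constructor forms are accepted: the options object is the current shape,
++// while the positional form is kept for backwards compatibility (see
++// legacyOptions above). Illustrative calls with invented field values:
++//
++//   new RateLimitedUpstreamError("429 from upstream", {
++//     metadata: { provider: "openai", model: "gpt-4o-mini" },
++//     retryAfterMs: 2_000,
++//     tenantId: "tenant-a",
++//   });
++//   new RateLimitedUpstreamError("429 from upstream", metadata, 2_000, { tenantId: "tenant-a" });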
++
++export function normalizeRateLimitError(options: {
++ error: unknown;
++ metadata?: TrafficRequestMetadata;
++ tenantId?: string;
++ key?: string;
++ logger?: Logger;
++}): RateLimitedUpstreamError | undefined {
++ const { error, metadata, tenantId, key, logger } = options;
++ const retryAfterMs =
++ error instanceof RateLimitedUpstreamError
++ ? (error.retryAfterMs ?? extractRetryAfterMs(error, logger))
++ : extractRetryAfterMs(error, logger);
++
++ if (error instanceof RateLimitedUpstreamError) {
++ const baseMetadata = metadata ?? error.metadata;
++ const baseTenant = tenantId ?? error.tenantId;
++ const baseKey = key ?? error.key;
++ if (
++ error.metadata === baseMetadata &&
++ error.retryAfterMs === retryAfterMs &&
++ error.tenantId === baseTenant &&
++ error.key === baseKey
++ ) {
++ return error;
++ }
++ return new RateLimitedUpstreamError(error.message, {
++ metadata: baseMetadata,
++ retryAfterMs,
++ tenantId: baseTenant,
++ key: baseKey,
++ });
++ }
++
++ const status = extractStatusCode(error, logger);
++ if (status !== 429) return undefined;
++
++ const message = error instanceof Error ? error.message : "Rate limit exceeded";
++ return new RateLimitedUpstreamError(message, {
++ metadata,
++ retryAfterMs,
++ tenantId,
++ key,
++ });
++}
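++
++// Sketch of the expected call site (assumed, not from the original docs):
++// callers pass whatever the provider threw, and only errors that look like a
++// 429 come back normalized; everything else yields undefined.
++//
++//   const normalized = normalizeRateLimitError({
++//     error: { status: 429, response: { headers: { "retry-after": "2" } } },
++//     metadata: { provider: "openai", model: "gpt-4o-mini" },
++//   });
++//   // normalized instanceof RateLimitedUpstreamError, with retryAfterMs extracted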
+diff --git a/packages/core/src/traffic/traffic-rate-limiter.ts b/packages/core/src/traffic/traffic-rate-limiter.ts
+new file mode 100644
+index 00000000..3e5aefbe
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-rate-limiter.ts
+@@ -0,0 +1,295 @@
++import type { Logger } from "../logger";
++import type {
++ RateLimitStrategy,
++ RateLimitUpdateResult,
++} from "./rate-limit-strategies/rate-limit-strategy";
++import { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy";
++import type { DispatchDecision, QueuedRequest } from "./traffic-controller-internal";
++import type { RateLimitConfig, TrafficRequestMetadata } from "./traffic-types";
++
++export type {
++ RateLimitHeaderSnapshot,
++ RateLimitStrategy,
++ RateLimitUpdateResult,
++} from "./rate-limit-strategies/rate-limit-strategy";
++export { DefaultRateLimitStrategy } from "./rate-limit-strategies/default-rate-limit-strategy";
++export { OpenAIWindowRateLimitStrategy } from "./rate-limit-strategies/openai-window-rate-limit-strategy";
++export { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy";
++
++type SchedulerCallback = () => void;
++
++export type RateLimitStrategyFactory = (key: string) => RateLimitStrategy;
++
++type UsageCounters = {
++ inputTokens?: number;
++ outputTokens?: number;
++ totalTokens?: number;
++};
++
++type TokenRateState = {
++ capacity: number;
++ refillPerSecond: number;
++ tokens: number;
++ updatedAt: number;
++};
++
++export class TrafficRateLimiter {
++ private readonly strategies = new Map<string, RateLimitStrategy>();
++ private readonly tokenRates = new Map<string, TokenRateState>();
++ private wakeUpTimeout?: ReturnType<typeof setTimeout>;
++ private wakeUpAt?: number;
++ private readonly onWakeUp: SchedulerCallback;
++ private readonly strategyFactory: RateLimitStrategyFactory;
++ private readonly rateLimits?: RateLimitConfig;
++
++ constructor(
++ onWakeUp: SchedulerCallback,
++ options?: { strategyFactory?: RateLimitStrategyFactory; rateLimits?: RateLimitConfig },
++ ) {
++ this.onWakeUp = onWakeUp;
++ this.rateLimits = options?.rateLimits;
++ this.strategyFactory =
++ options?.strategyFactory ??
++ ((key) => new TokenBucketRateLimitStrategy(key, this.rateLimits?.[key]));
++ }
++
++ resolve(next: QueuedRequest, key: string, logger?: Logger): DispatchDecision | null {
++ const strategy = this.strategies.get(key) ?? this.createStrategy(key, logger);
++ const requestDecision = strategy.resolve(next, logger);
++ if (requestDecision?.kind === "wait") {
++ const tokenDecision = strategy.handlesTokenLimits
++ ? null
++ : this.resolveTokenLimit(next, key, logger, false);
++ if (tokenDecision?.kind === "wait") {
++ const requestWakeUp = requestDecision.wakeUpAt;
++ const tokenWakeUp = tokenDecision.wakeUpAt;
++ if (tokenWakeUp !== undefined && requestWakeUp !== undefined) {
++ return { kind: "wait", wakeUpAt: Math.min(requestWakeUp, tokenWakeUp) };
++ }
++ if (tokenWakeUp !== undefined && requestWakeUp === undefined) {
++ return tokenDecision;
++ }
++ }
++ return requestDecision;
++ }
++
++ const tokenDecision = strategy.handlesTokenLimits
++ ? null
++ : this.resolveTokenLimit(next, key, logger, true);
++ if (tokenDecision?.kind === "wait") {
++ return tokenDecision;
++ }
++
++ return requestDecision;
++ }
++
++ notifyDispatch(key: string | undefined, logger?: Logger): void {
++ if (!key) return;
++ this.strategies.get(key)?.onDispatch(logger);
++ }
++
++ scheduleWakeUpAt(wakeUpAt: number, logger?: Logger): void {
++ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
++ const now = Date.now();
++ const target = Math.max(now, wakeUpAt);
++
++ if (this.wakeUpTimeout && this.wakeUpAt !== undefined && this.wakeUpAt <= target) {
++ rateLimitLogger?.trace?.("Wakeup already scheduled earlier; skipping", {
++ currentWakeUpAt: this.wakeUpAt,
++ requestedWakeUpAt: target,
++ });
++ return;
++ }
++
++ if (this.wakeUpTimeout) clearTimeout(this.wakeUpTimeout);
++
++ this.wakeUpAt = target;
++ rateLimitLogger?.debug?.("Scheduling rate limit wakeup", {
++ wakeUpAt: target,
++ inMs: Math.max(1, target - now),
++ });
++ this.wakeUpTimeout = setTimeout(
++ () => {
++ this.wakeUpTimeout = undefined;
++ this.wakeUpAt = undefined;
++ rateLimitLogger?.debug?.("Rate limit wakeup fired");
++ this.onWakeUp();
++ },
++ Math.max(1, target - now),
++ );
++ }
++
++ releaseReservation(key?: string, logger?: Logger): void {
++ if (!key) return;
++ this.strategies.get(key)?.onComplete(logger);
++ }
++
++ recordUsage(
++ key: string | undefined,
++ usage: UsageCounters | Promise<UsageCounters | undefined> | undefined,
++ logger?: Logger,
++ reservedTokens?: number,
++ ): void {
++ if (!key || !usage) return;
++ if (typeof (usage as PromiseLike<unknown>).then === "function") {
++ void (usage as Promise<UsageCounters | undefined>)
++ .then((resolved) => this.recordUsage(key, resolved, logger, reservedTokens))
++ .catch(() => {});
++ return;
++ }
++
++ const strategy = this.strategies.get(key);
++ if (strategy?.recordUsage) {
++ strategy.recordUsage(usage, logger, reservedTokens);
++ return;
++ }
++
++ const tokens = this.resolveTokenCount(usage);
++ if (tokens <= 0) return;
++
++ const bucket = this.getTokenRateState(key, logger);
++ if (!bucket) return;
++
++ const now = Date.now();
++ this.refillTokenRate(bucket, now);
++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens);
++ const reserved = typeof reservedTokens === "number" ? reservedTokens : 0;
++ const delta = tokens - reserved;
++ if (delta > 0) {
++ bucket.tokens -= delta;
++ } else if (delta < 0) {
++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + Math.abs(delta));
++ }
++
++ if (bucket.tokens < 0 && bucket.refillPerSecond > 0) {
++ const waitMs = Math.max(1, Math.ceil((-bucket.tokens / bucket.refillPerSecond) * 1000));
++ this.scheduleWakeUpAt(now + waitMs, logger);
++ }
++ }
++
++ updateFromHeaders(
++ metadata: TrafficRequestMetadata | undefined,
++ headers: unknown,
++ key: string,
++ logger?: Logger,
++ ): RateLimitUpdateResult | undefined {
++ const existing = this.strategies.get(key);
++ if (existing) return existing.updateFromHeaders(metadata, headers, logger);
++
++ const created = this.strategyFactory(key);
++ const update = created.updateFromHeaders(metadata, headers, logger);
++ if (!update) return undefined;
++ this.strategies.set(key, created);
++ return update;
++ }
++
++ private createStrategy(key: string, logger?: Logger): RateLimitStrategy {
++ const created = this.strategyFactory(key);
++ this.strategies.set(key, created);
++ logger?.child({ module: "rate-limiter" })?.trace?.("Created rate limit strategy", {
++ rateLimitKey: key,
++ strategy: created.constructor.name,
++ });
++ return created;
++ }
++
++ private resolveTokenLimit(
++ next: QueuedRequest,
++ key: string,
++ logger?: Logger,
++ reserveTokens = true,
++ ): DispatchDecision | null {
++ const bucket = this.getTokenRateState(key, logger);
++ if (!bucket) return null;
++
++ const now = Date.now();
++ this.refillTokenRate(bucket, now);
++
++ if (bucket.capacity <= 0) {
++ logger?.child({ module: "rate-limiter" })?.debug?.("Token limit misconfigured; blocking", {
++ rateLimitKey: key,
++ capacity: bucket.capacity,
++ refillPerSecond: bucket.refillPerSecond,
++ });
++ return { kind: "wait" };
++ }
++
++ const estimatedTokens = next.estimatedTokens;
++ if (typeof estimatedTokens === "number" && estimatedTokens > 0) {
++ if (bucket.tokens >= estimatedTokens) {
++ if (reserveTokens) {
++ bucket.tokens -= estimatedTokens;
++ next.reservedTokens = estimatedTokens;
++ }
++ return null;
++ }
++ } else if (bucket.tokens >= 0) {
++ return null;
++ }
++
++ if (bucket.refillPerSecond <= 0) {
++ logger?.child({ module: "rate-limiter" })?.debug?.("Token limit has no refill; blocking", {
++ rateLimitKey: key,
++ capacity: bucket.capacity,
++ refillPerSecond: bucket.refillPerSecond,
++ });
++ return { kind: "wait" };
++ }
++
++ const requiredTokens =
++ typeof estimatedTokens === "number" && estimatedTokens > 0
++ ? Math.max(estimatedTokens - bucket.tokens, 1)
++ : -bucket.tokens;
++ const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000));
++ return { kind: "wait", wakeUpAt: now + waitMs };
++ }
++
++ private getTokenRateState(key: string, logger?: Logger): TokenRateState | undefined {
++ const existing = this.tokenRates.get(key);
++ if (existing) return existing;
++
++ const options = this.rateLimits?.[key];
++ if (!options) return undefined;
++
++ const tokensPerMinute = Number(options.tokensPerMinute);
++ if (!Number.isFinite(tokensPerMinute) || tokensPerMinute <= 0) {
++ return undefined;
++ }
++
++ // Token pacing uses a 1-minute burst by default; request bursts are handled separately.
++ const refillPerSecond = tokensPerMinute / 60;
++ const capacity = tokensPerMinute;
++ const now = Date.now();
++ const created: TokenRateState = {
++ capacity,
++ refillPerSecond,
++ tokens: capacity,
++ updatedAt: now,
++ };
++ this.tokenRates.set(key, created);
++ logger?.child({ module: "rate-limiter" })?.trace?.("Created token rate state", {
++ rateLimitKey: key,
++ capacity,
++ refillPerSecond,
++ });
++ return created;
++ }
++
++ private refillTokenRate(bucket: TokenRateState, now: number): void {
++ const elapsedMs = now - bucket.updatedAt;
++ if (elapsedMs <= 0) return;
++ bucket.updatedAt = now;
++ if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return;
++ const refill = (elapsedMs / 1000) * bucket.refillPerSecond;
++ if (refill <= 0) return;
++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill);
++ }
++
++ private resolveTokenCount(usage: UsageCounters): number {
++ const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined;
++ if (total !== undefined) return total;
++ const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0;
++ const output = Number.isFinite(usage.outputTokens) ? usage.outputTokens : 0;
++ return input + output;
++ }
++}
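++
++// Pacing math sketch: with rateLimits[key] = { requestsPerMinute: 60,
++// tokensPerMinute: 60_000 }, the token bucket starts full at 60_000 tokens and
++// refills at 1_000 tokens/second (tokensPerMinute / 60). A queued request with
++// estimatedTokens: 5_000 dispatches immediately while the bucket holds at least
++// 5_000 tokens; once drained, resolveTokenLimit returns { kind: "wait", wakeUpAt }
++// roughly 5 seconds out. (The "provider::model" key format used here is an
++// assumption based on the retry-policy docs in traffic-types.ts.)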
+diff --git a/packages/core/src/traffic/traffic-retry.spec.ts b/packages/core/src/traffic/traffic-retry.spec.ts
+new file mode 100644
+index 00000000..2360ca10
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-retry.spec.ts
+@@ -0,0 +1,45 @@
++import { describe, expect, it, vi } from "vitest";
++import { buildRetryPlan } from "./traffic-retry";
++
++describe("buildRetryPlan", () => {
++ it("respects Retry-After for 429s", () => {
++ const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0);
++ try {
++ const plan = buildRetryPlan(
++ {
++ status: 429,
++ response: { headers: { "retry-after": "2" } },
++ },
++ 1,
++ );
++
++ expect(plan).toBeTruthy();
++ expect(plan?.reason).toBe("rateLimit");
++ expect(plan?.delayMs).toBeGreaterThanOrEqual(2_000);
++ } finally {
++ randomSpy.mockRestore();
++ }
++ });
++
++ it("parses HTTP-date Retry-After values", () => {
++ vi.useFakeTimers();
++ const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0);
++
++ try {
++ vi.setSystemTime(new Date("2020-01-01T00:00:00.000Z"));
++ const plan = buildRetryPlan(
++ {
++ statusCode: 429,
++ response: { headers: { "retry-after": "Wed, 01 Jan 2020 00:00:03 GMT" } },
++ },
++ 1,
++ );
++
++ expect(plan).toBeTruthy();
++ expect(plan?.delayMs).toBeGreaterThanOrEqual(3_000);
++ } finally {
++ vi.useRealTimers();
++ randomSpy.mockRestore();
++ }
++ });
++});
+diff --git a/packages/core/src/traffic/traffic-retry.ts b/packages/core/src/traffic/traffic-retry.ts
+new file mode 100644
+index 00000000..9604dc53
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-retry.ts
+@@ -0,0 +1,144 @@
++import type { Logger } from "../logger";
++import {
++ MAX_RETRY_ATTEMPTS,
++ RATE_LIMIT_BASE_BACKOFF_MS,
++ RATE_LIMIT_JITTER_FACTOR,
++ SERVER_ERROR_BASE_BACKOFF_MS,
++ SERVER_ERROR_JITTER_FACTOR,
++ TIMEOUT_BASE_BACKOFF_MS,
++ TIMEOUT_JITTER_FACTOR,
++ TIMEOUT_RETRY_ATTEMPTS,
++} from "./traffic-constants";
++import { extractRetryAfterMs, extractStatusCode, isTimeoutError } from "./traffic-error-utils";
++import { RateLimitedUpstreamError } from "./traffic-errors";
++import type {
++ RetryPlan,
++ RetryPolicy,
++ RetryPolicyConfig,
++ RetryPolicyContext,
++ RetryReason,
++} from "./traffic-types";
++
++export type {
++ RetryPlan,
++ RetryPolicy,
++ RetryPolicyConfig,
++ RetryPolicyContext,
++ RetryReason,
++} from "./traffic-types";
++
++export function buildRetryPlan(
++ error: unknown,
++ attempt: number,
++ logger?: Logger,
++): RetryPlan | undefined {
++ const retryLogger = logger?.child({ module: "retry" });
++ const reason = getRetryReason(error, retryLogger);
++ if (!reason) {
++ retryLogger?.debug?.("No retry reason detected; skipping retry", { attempt });
++ return undefined;
++ }
++
++ const max = reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS;
++ if (attempt >= max) {
++ retryLogger?.debug?.("Retry attempts exhausted; skipping retry", {
++ attempt,
++ max,
++ reason,
++ });
++ return undefined;
++ }
++
++ const computedDelayMs = computeBackoffDelay(reason, attempt);
++ const retryAfterMs =
++ reason === "rateLimit"
++ ? error instanceof RateLimitedUpstreamError
++ ? error.retryAfterMs
++ : extractRetryAfterMs(error, retryLogger)
++ : undefined;
++ const delayMs =
++ retryAfterMs === undefined ? computedDelayMs : Math.max(computedDelayMs, retryAfterMs);
++
++ retryLogger?.debug?.("Retry plan built", {
++ attempt,
++ reason,
++ delayMs,
++ computedDelayMs,
++ retryAfterMs,
++ max,
++ });
++
++ return {
++ reason,
++ delayMs,
++ };
++}
++
++export function buildRetryPlanWithPolicy(
++ context: RetryPolicyContext,
++ policyConfig?: RetryPolicyConfig,
++): RetryPlan | undefined {
++ const retryLogger = context.logger?.child({ module: "retry" });
++ const policy = resolveRetryPolicy(context, policyConfig);
++ if (policy) {
++ const planned = policy(context);
++ if (planned) {
++ retryLogger?.debug?.("Retry policy returned a plan", {
++ attempt: context.attempt,
++ reason: planned.reason,
++ delayMs: planned.delayMs,
++ });
++ return planned;
++ }
++ retryLogger?.debug?.("Retry policy declined to retry", { attempt: context.attempt });
++ }
++
++ return buildRetryPlan(context.error, context.attempt, context.logger);
++}
++
++function resolveRetryPolicy(
++ context: RetryPolicyContext,
++ config?: RetryPolicyConfig,
++): RetryPolicy | undefined {
++ if (!config) return undefined;
++ const modelPolicy = context.key ? config.models?.[context.key] : undefined;
++ if (modelPolicy) return modelPolicy;
++ const providerModelKey =
++ context.metadata?.provider && context.metadata?.model
++ ? `${context.metadata.provider}::${context.metadata.model}`
++ : undefined;
++ const providerModelPolicy = providerModelKey ? config.models?.[providerModelKey] : undefined;
++ if (providerModelPolicy) return providerModelPolicy;
++ const provider = context.metadata?.provider;
++ const providerPolicy = provider ? config.providers?.[provider] : undefined;
++ if (providerPolicy) return providerPolicy;
++ return config.default;
++}
++
++function getRetryReason(error: unknown, logger?: Logger): RetryReason | undefined {
++ if (error instanceof RateLimitedUpstreamError) return "rateLimit";
++ const status = extractStatusCode(error, logger);
++ if (status === 429) return "rateLimit";
++ if (status && status >= 500) return "serverError";
++ if (status === 408 || isTimeoutError(error, logger)) return "timeout";
++ return undefined;
++}
++
++function computeBackoffDelay(reason: RetryReason, attempt: number): number {
++ const base =
++ reason === "serverError"
++ ? SERVER_ERROR_BASE_BACKOFF_MS
++ : reason === "timeout"
++ ? TIMEOUT_BASE_BACKOFF_MS
++ : RATE_LIMIT_BASE_BACKOFF_MS;
++
++ const jitter =
++ reason === "serverError"
++ ? SERVER_ERROR_JITTER_FACTOR
++ : reason === "timeout"
++ ? TIMEOUT_JITTER_FACTOR
++ : RATE_LIMIT_JITTER_FACTOR;
++
++ const exp = base * 2 ** (attempt - 1);
++ return Math.round(exp + exp * jitter * Math.random());
++}
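++
++// Backoff sketch (the actual constants live in traffic-constants.ts): the delay
++// doubles per attempt and then gains proportional jitter. Assuming a 1_000 ms
++// rate-limit base and a 0.25 jitter factor purely for illustration:
++//
++//   attempt 1 -> 1_000 ms + up to 250 ms jitter
++//   attempt 2 -> 2_000 ms + up to 500 ms jitter
++//   attempt 3 -> 4_000 ms + up to 1_000 ms jitter
++//
++// A Retry-After header wins whenever it is larger than the computed delay.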
+diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts
+new file mode 100644
+index 00000000..1d847e25
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-types.ts
+@@ -0,0 +1,181 @@
++import type { Logger } from "../logger";
++
++type BivariantFunction<TArgs extends unknown[], TReturn> = {
++ bivarianceHack(...args: TArgs): TReturn;
++}["bivarianceHack"];
++
++type UsageCounters = {
++ inputTokens?: number;
++ outputTokens?: number;
++ totalTokens?: number;
++};
++
++export type RetryReason = "rateLimit" | "serverError" | "timeout";
++
++export type RetryPlan = {
++ delayMs: number;
++ reason: RetryReason;
++};
++
++export type RetryPolicyContext = {
++ error: unknown;
++ attempt: number;
++ metadata?: TrafficRequestMetadata;
++ key?: string;
++ logger?: Logger;
++};
++
++export type RetryPolicy = (context: RetryPolicyContext) => RetryPlan | undefined;
++
++export type RetryPolicyConfig = {
++ default?: RetryPolicy;
++ providers?: Record<string, RetryPolicy>;
++ models?: Record<string, RetryPolicy>;
++};
++
++export type TrafficRequestType = "text" | "stream";
++export type TrafficPriority = "P0" | "P1" | "P2";
++
++export interface TrafficRequestMetadata {
++ agentId?: string;
++ agentName?: string;
++ model?: string;
++ provider?: string;
++ priority?: TrafficPriority;
++ tenantId?: string;
++ apiKeyId?: string;
++ region?: string;
++ endpoint?: string;
++ tenantTier?: string;
++ taskType?: string;
++ fallbackPolicyId?: string;
++}
++
++export type TrafficResponseMetadata = {
++ rateLimitKey?: string;
++ retryAfterMs?: number;
++ rateLimitRemaining?: number;
++ rateLimitResetAt?: number;
++ rateLimitResetInMs?: number;
++ queueEtaMs?: number;
++ tenantId?: string;
++ priority?: TrafficPriority;
++ taskType?: string;
++};
++
++export type FallbackTarget = {
++ provider?: string;
++ model: string;
++};
++
++export type ShortResponseFallbackTarget = {
++ kind: "short-response";
++ text: string;
++};
++
++export type FallbackChainEntry = string | FallbackTarget | ShortResponseFallbackTarget;
++
++export type FallbackPolicyMode = "fallback" | "wait";
++
++export type FallbackPolicy = {
++ mode: FallbackPolicyMode;
++};
++
++export type FallbackPolicyConfig = {
++ defaultPolicyId?: string;
++ policies?: Record;
++ taskTypePolicyIds?: Record;
++};
++
++export type ProviderModelConcurrencyLimit =
++ | number
++ | Record<string, number>
++ | ((metadata: TrafficRequestMetadata | undefined, key: string) => number | undefined);
++
++export type TenantConcurrencyLimit =
++ | number
++ | Record<string, number>
++ | ((tenantId: string, metadata: TrafficRequestMetadata | undefined) => number | undefined);
++
++export type PriorityBurstLimits = Partial<Record<TrafficPriority, number>>;
++export type PriorityWeights = Partial<Record<TrafficPriority, number>>;
++
++export type AdaptiveLimiterConfig = {
++ windowMs?: number;
++ threshold?: number;
++ minPenaltyMs?: number;
++ maxPenaltyMs?: number;
++ penaltyMultiplier?: number;
++ decayMs?: number;
++};
++
++export interface TrafficRequest<TResponse = unknown> {
++ tenantId: string;
++ metadata?: TrafficRequestMetadata;
++ execute: () => Promise<TResponse>;
++ deadlineAt?: number;
++ maxQueueWaitMs?: number;
++ estimatedTokens?: number;
++ createFallbackRequest?: BivariantFunction<
++ [target: FallbackChainEntry],
++ TrafficRequest<TResponse> | undefined
++ >;
++ extractUsage?: BivariantFunction<
++ [response: TResponse],
++ Promise<UsageCounters | undefined> | UsageCounters | undefined
++ >;
++}
++
++export interface TrafficControllerOptions {
++ maxConcurrent?: number;
++ maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit;
++ maxConcurrentPerTenant?: TenantConcurrencyLimit;
++ rateLimits?: RateLimitConfig;
++ priorityBurstLimits?: PriorityBurstLimits;
++ priorityWeights?: PriorityWeights;
++ adaptiveLimiter?: AdaptiveLimiterConfig;
++ /**
++ * Optional override for rate-limit key construction.
++ * Useful when you need to add new metadata fields without changing core logic.
++ */
++ rateLimitKeyBuilder?: (metadata?: TrafficRequestMetadata) => string;
++ /**
++ * Optional retry policy overrides by provider/model.
++ * Model keys can use the rate-limit key or provider::model.
++ */
++ retryPolicy?: RetryPolicyConfig;
++ /**
++ * Optional fallback policy selection by task type or explicit policy id.
++ */
++ fallbackPolicy?: FallbackPolicyConfig;
++ /**
++ * Select a rate-limit strategy by provider/model.
++ * Example:
++ * { providers: { openai: "window" }, models: { "openai::gpt-4o": "window" } }
++ */
++ rateLimitStrategy?: RateLimitStrategyConfig;
++ logger?: Logger;
++ fallbackChains?: Record<string, FallbackChainEntry[]>;
++}
++
++export type RateLimitStrategyKind = "window" | "token-bucket";
++
++export type RateLimitStrategyConfig = {
++ providers?: Record<string, RateLimitStrategyKind>;
++ models?: Record<string, RateLimitStrategyKind>;
++};
++
++export interface RateLimitOptions {
++ requestsPerMinute: number;
++ tokensPerMinute: number;
++ burstSize?: number;
++}
++
++export type RateLimitKey = string;
++export type RateLimitConfig = Record<RateLimitKey, RateLimitOptions>;
++
++export type TenantUsage = {
++ inputTokens: number;
++ outputTokens: number;
++ totalTokens: number;
++};
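++
++// A minimal TrafficControllerOptions sketch (tenant names, model keys, and
++// numbers are invented for illustration):
++//
++//   const options: TrafficControllerOptions = {
++//     maxConcurrent: 32,
++//     maxConcurrentPerTenant: { "tenant-a": 8 },
++//     rateLimits: { "openai::gpt-4o": { requestsPerMinute: 60, tokensPerMinute: 60_000 } },
++//     priorityWeights: { P0: 6, P1: 3, P2: 1 },
++//   };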
+diff --git a/packages/core/src/traffic/traffic-usage-tracker.ts b/packages/core/src/traffic/traffic-usage-tracker.ts
+new file mode 100644
+index 00000000..c79b311a
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-usage-tracker.ts
+@@ -0,0 +1,83 @@
++import type { Logger } from "../logger";
++import type { QueuedRequest } from "./traffic-controller-internal";
++import { isPromiseLike } from "./traffic-error-utils";
++import type { TenantUsage } from "./traffic-types";
++
++type UsageCounters = {
++ inputTokens?: number;
++ outputTokens?: number;
++ totalTokens?: number;
++};
++
++export class TrafficUsageTracker {
++ private readonly tenantUsage = new Map<string, TenantUsage>();
++
++ getTenantUsage(tenantId: string): TenantUsage | undefined {
++ const usage = this.tenantUsage.get(tenantId);
++ return usage ? { ...usage } : undefined;
++ }
++
++ recordUsage<TResponse>(
++ item: QueuedRequest,
++ result: TResponse,
++ logger?: Logger,
++ ): UsageCounters | Promise<UsageCounters | undefined> | undefined {
++ const usageLogger = logger?.child({ module: "usage-tracker" });
++ const extractor = item.extractUsage ?? item.request.extractUsage;
++ if (!extractor) {
++ usageLogger?.trace?.("No usage extractor; skipping usage", { tenantId: item.tenantId });
++ return undefined;
++ }
++
++ const usage = extractor(result);
++ if (!usage) {
++ usageLogger?.trace?.("Usage extractor returned empty; skipping usage", {
++ tenantId: item.tenantId,
++ });
++ return undefined;
++ }
++
++ if (isPromiseLike(usage)) {
++ usageLogger?.trace?.("Usage extractor returned promise; awaiting", {
++ tenantId: item.tenantId,
++ });
++ void usage.then((u) => u && this.incrementTenantUsage(item.tenantId, u, usageLogger));
++ return usage;
++ }
++ this.incrementTenantUsage(item.tenantId, usage, usageLogger);
++ return usage;
++ }
++
++ private incrementTenantUsage(tenantId: string, usage: UsageCounters, logger?: Logger): void {
++ const current = this.tenantUsage.get(tenantId) ?? {
++ inputTokens: 0,
++ outputTokens: 0,
++ totalTokens: 0,
++ };
++
++ const input =
++ typeof usage.inputTokens === "number" && Number.isFinite(usage.inputTokens)
++ ? usage.inputTokens
++ : 0;
++ const output =
++ typeof usage.outputTokens === "number" && Number.isFinite(usage.outputTokens)
++ ? usage.outputTokens
++ : 0;
++ const total =
++ typeof usage.totalTokens === "number" && Number.isFinite(usage.totalTokens)
++ ? usage.totalTokens
++ : input + output;
++
++ this.tenantUsage.set(tenantId, {
++ inputTokens: current.inputTokens + input,
++ outputTokens: current.outputTokens + output,
++ totalTokens: current.totalTokens + total,
++ });
++
++ logger?.debug?.("Tenant usage incremented", {
++ tenantId,
++ delta: { inputTokens: input, outputTokens: output, totalTokens: total },
++ total: this.tenantUsage.get(tenantId),
++ });
++ }
++}
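++
++// Usage sketch (assumes a QueuedRequest whose request.extractUsage maps a
++// provider response to token counters; both names are stand-ins):
++//
++//   const tracker = new TrafficUsageTracker();
++//   tracker.recordUsage(queuedItem, providerResponse, logger);
++//   tracker.getTenantUsage("tenant-a");
++//   // => { inputTokens, outputTokens, totalTokens } accumulated across calls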
+diff --git a/packages/core/src/workflow/core.ts b/packages/core/src/workflow/core.ts
+index 3136511c..2b273d58 100644
+--- a/packages/core/src/workflow/core.ts
++++ b/packages/core/src/workflow/core.ts
+@@ -827,6 +827,9 @@ export function createWorkflow<
+
+ // Wrap entire execution in root span
+ const rootSpan = traceContext.getRootSpan();
++ if (options?.tenantId) {
++ rootSpan.setAttribute("tenant.id", options.tenantId);
++ }
+
+ // Add workflow state snapshot for remote observability
+ const workflowState = {
+@@ -848,6 +851,7 @@ export function createWorkflow<
+ executionId,
+ userId: options?.userId,
+ conversationId: options?.conversationId,
++ tenantId: options?.tenantId,
+ traceId: rootSpan.spanContext().traceId,
+ spanId: rootSpan.spanContext().spanId,
+ });
+diff --git a/packages/core/src/workflow/internal/state.ts b/packages/core/src/workflow/internal/state.ts
+index 71fa602d..2de12528 100644
+--- a/packages/core/src/workflow/internal/state.ts
++++ b/packages/core/src/workflow/internal/state.ts
+@@ -23,6 +23,7 @@ export type WorkflowState = {
+ executionId: string;
+ conversationId?: string;
+ userId?: string;
++ tenantId?: string;
+ context?: UserContext;
+ active: number;
+ startAt: Date;
+@@ -132,6 +133,7 @@ class WorkflowStateManagerInternal implements WorkflowStateManager
+ active: config?.active ?? 0,
+ userId: config?.userId,
+ conversationId: config?.conversationId,
++ tenantId: config?.tenantId,
+ context: config?.context,
+ startAt: new Date(),
+ endAt: null,
+diff --git a/packages/core/src/workflow/internal/utils.ts b/packages/core/src/workflow/internal/utils.ts
+index fc39530b..42250d82 100644
+--- a/packages/core/src/workflow/internal/utils.ts
++++ b/packages/core/src/workflow/internal/utils.ts
+@@ -32,6 +32,7 @@ export function convertWorkflowStateToParam(
+ executionId: state.executionId,
+ conversationId: state.conversationId,
+ userId: state.userId,
++ tenantId: state.tenantId,
+ context: state.context,
+ active: state.active,
+ startAt: state.startAt,
+diff --git a/packages/core/src/workflow/steps/and-agent.ts b/packages/core/src/workflow/steps/and-agent.ts
+index bc46c148..14af9b8f 100644
+--- a/packages/core/src/workflow/steps/and-agent.ts
++++ b/packages/core/src/workflow/steps/and-agent.ts
+@@ -66,6 +66,7 @@ export function andAgent(
+ context: restConfig.context ?? state.context,
+ conversationId: restConfig.conversationId ?? state.conversationId,
+ userId: restConfig.userId ?? state.userId,
++ tenantId: restConfig.tenantId ?? state.tenantId,
+ // No parentSpan when there's no workflow context
+ });
+ // Accumulate usage if available (no workflow context)
+@@ -92,6 +93,7 @@ export function andAgent(
+ context: restConfig.context ?? state.context,
+ conversationId: restConfig.conversationId ?? state.conversationId,
+ userId: restConfig.userId ?? state.userId,
++ tenantId: restConfig.tenantId ?? state.tenantId,
+ // Pass the current step span as parent for proper span hierarchy
+ parentSpan: state.workflowContext?.currentStepSpan,
+ });
+diff --git a/packages/core/src/workflow/types.ts b/packages/core/src/workflow/types.ts
+index f7eed282..49bfd8cb 100644
+--- a/packages/core/src/workflow/types.ts
++++ b/packages/core/src/workflow/types.ts
+@@ -214,6 +214,10 @@ export interface WorkflowRunOptions {
+ * The conversation ID, this can be used to track the current conversation in a workflow
+ */
+ conversationId?: string;
++ /**
++ * Tenant identifier propagated to agent steps and subcalls
++ */
++ tenantId?: string;
+ /**
+ * The user ID, this can be used to track the current user in a workflow
+ */
+diff --git a/packages/scorers/src/llm/answer-correctness.ts b/packages/scorers/src/llm/answer-correctness.ts
+index 2111fa31..d66cc007 100644
+--- a/packages/scorers/src/llm/answer-correctness.ts
++++ b/packages/scorers/src/llm/answer-correctness.ts
+@@ -7,6 +7,7 @@ import {
+ import { safeStringify } from "@voltagent/internal/utils";
+ import type { LanguageModel } from "ai";
+ import { z } from "zod";
++import { extractTenantId } from "./utils";
+
+ const ANSWER_CORRECTNESS_PROMPT = `Given a ground truth and an answer, analyze each statement in the answer and classify them in one of the following categories:
+
+@@ -84,15 +85,17 @@ export function createAnswerCorrectnessScorer<
+ const agent = new Agent({
+ name: "answer-correctness-classifier",
+ model,
++ trafficPriority: "P2",
+ instructions: "You classify statements for answer correctness evaluation",
+ });
+
++ const tenantId = extractTenantId(context);
+ const payload = resolvePayload(context, buildPayload);
+ const prompt = ANSWER_CORRECTNESS_PROMPT.replace("{{question}}", payload.input)
+ .replace("{{answer}}", payload.output)
+ .replace("{{ground_truth}}", payload.expected);
+
+- const response = await agent.generateObject(prompt, CLASSIFICATION_SCHEMA);
++ const response = await agent.generateObject(prompt, CLASSIFICATION_SCHEMA, { tenantId });
+ const normalized = normalizeClassification(response.object);
+
+ return {
+diff --git a/packages/scorers/src/llm/answer-relevancy.ts b/packages/scorers/src/llm/answer-relevancy.ts
+index a3de2237..d9bda1c9 100644
+--- a/packages/scorers/src/llm/answer-relevancy.ts
++++ b/packages/scorers/src/llm/answer-relevancy.ts
+@@ -8,6 +8,7 @@ import {
+ import { safeStringify } from "@voltagent/internal/utils";
+ import type { LanguageModel } from "ai";
+ import { z } from "zod";
++import { extractTenantId } from "./utils";
+
+ const QUESTION_GEN_PROMPT = `Generate a question for the given answer and Identify if answer is noncommittal. Give noncommittal as 1 if the answer is noncommittal and 0 if the answer is committal. A noncommittal answer is one that is evasive, vague, or ambiguous. For example, "I don't know" or "I'm not sure" are noncommittal answers
+
+@@ -119,9 +120,11 @@ export function createAnswerRelevancyScorer<
+ const agent = new Agent({
+ name: "question-generator",
+ model,
++ trafficPriority: "P2",
+ instructions: "You generate questions from answers to evaluate relevancy",
+ });
+
++ const tenantId = extractTenantId(context);
+ const payload = resolvePayload(context, buildPayload);
+ const questions: GeneratedQuestion[] = [];
+
+@@ -131,7 +134,7 @@ export function createAnswerRelevancyScorer<
+ payload.context,
+ );
+
+- const response = await agent.generateObject(prompt, QUESTION_SCHEMA);
++ const response = await agent.generateObject(prompt, QUESTION_SCHEMA, { tenantId });
+ questions.push({
+ question: response.object.question,
+ noncommittal: response.object.noncommittal === 1,
+diff --git a/packages/scorers/src/llm/classifiers.ts b/packages/scorers/src/llm/classifiers.ts
+index 1bca4239..a327e20d 100644
+--- a/packages/scorers/src/llm/classifiers.ts
++++ b/packages/scorers/src/llm/classifiers.ts
+@@ -7,6 +7,7 @@ import {
+ } from "@voltagent/core";
+ import { safeStringify } from "@voltagent/internal/utils";
+ import { z } from "zod";
++import { extractTenantId } from "./utils";
+
+ type ChoiceId = string;
+
+@@ -93,11 +94,14 @@ async function evaluateChoice(args: EvaluateChoiceArgs): Promise
+ const agent = new Agent({
+ name: `${scorerId}-judge`,
+ model,
++ trafficPriority: "P2",
+ instructions: judgeInstructions ?? buildDefaultChoiceInstructions(Object.keys(choices)),
+ });
+
++ const tenantId = extractTenantId(context);
+ const response = await agent.generateObject(prompt, CHOICE_RESPONSE_SCHEMA, {
+ maxOutputTokens,
++ tenantId,
+ });
+
+ const { choice, reason } = extractChoiceFromResponse(response.object, choices, scorerId);
+diff --git a/packages/scorers/src/llm/context-precision.ts b/packages/scorers/src/llm/context-precision.ts
+index d31b5b85..ba680f56 100644
+--- a/packages/scorers/src/llm/context-precision.ts
++++ b/packages/scorers/src/llm/context-precision.ts
+@@ -7,6 +7,7 @@ import {
+ import { safeStringify } from "@voltagent/internal/utils";
+ import type { LanguageModel } from "ai";
+ import { z } from "zod";
++import { extractTenantId } from "./utils";
+
+ const CONTEXT_PRECISION_PROMPT = `Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.
+
+@@ -109,6 +110,7 @@ export function createContextPrecisionScorer<
+ const agent = new Agent({
+ name: "context-precision-evaluator",
+ model,
++ trafficPriority: "P2",
+ instructions: "You evaluate if context was useful for arriving at the answer",
+ });
+
+@@ -116,12 +118,15 @@ export function createContextPrecisionScorer<
+ const contextText = Array.isArray(payload.context)
+ ? payload.context.join("\n")
+ : payload.context;
++ const tenantId = extractTenantId(context);
+
+ const prompt = CONTEXT_PRECISION_PROMPT.replace("{{question}}", payload.input)
+ .replace("{{context}}", contextText)
+ .replace("{{answer}}", payload.output);
+
+- const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA);
++ const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA, {
++ tenantId,
++ });
+
+ context.results.raw.contextPrecisionVerdict = response.object;
+
+diff --git a/packages/scorers/src/llm/context-recall.ts b/packages/scorers/src/llm/context-recall.ts
+index e6e86510..2c6053fc 100644
+--- a/packages/scorers/src/llm/context-recall.ts
++++ b/packages/scorers/src/llm/context-recall.ts
+@@ -7,6 +7,7 @@ import {
+ import { safeStringify } from "@voltagent/internal/utils";
+ import type { LanguageModel } from "ai";
+ import { z } from "zod";
++import { extractTenantId } from "./utils";
+
+ const CONTEXT_RECALL_EXTRACT_PROMPT = `Given the context and ground truth (expected output), extract all factual statements from the ground truth.
+
+@@ -120,6 +121,7 @@ export function createContextRecallScorer<
+ const agent = new Agent({
+ name: "context-recall-evaluator",
+ model,
++ trafficPriority: "P2",
+ instructions: "You evaluate how well provided context supports factual statements",
+ });
+
+@@ -127,6 +129,7 @@ export function createContextRecallScorer<
+ const contextText = Array.isArray(payload.context)
+ ? payload.context.join("\n")
+ : payload.context;
++ const tenantId = extractTenantId(context);
+
+ // Extract statements from expected output
+ const extractPrompt = CONTEXT_RECALL_EXTRACT_PROMPT.replace(
+@@ -134,7 +137,9 @@ export function createContextRecallScorer<
+ contextText,
+ ).replace("{{expected}}", payload.expected);
+
+- const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA);
++ const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA, {
++ tenantId,
++ });
+ const statements = extractResponse.object.statements;
+
+ if (statements.length === 0) {
+@@ -152,7 +157,9 @@ export function createContextRecallScorer<
+ contextText,
+ ).replace("{{statement}}", statement);
+
+- const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA);
++ const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA, {
++ tenantId,
++ });
+ verdicts.push({
+ statement,
+ verdict: verifyResponse.object.verdict,
+diff --git a/packages/scorers/src/llm/context-relevancy.ts b/packages/scorers/src/llm/context-relevancy.ts
+index ee882b5b..aca608b2 100644
+--- a/packages/scorers/src/llm/context-relevancy.ts
++++ b/packages/scorers/src/llm/context-relevancy.ts
+@@ -7,6 +7,7 @@ import {
+ import { safeStringify } from "@voltagent/internal/utils";
+ import type { LanguageModel } from "ai";
+ import { z } from "zod";
++import { extractTenantId } from "./utils";
+
+ const CONTEXT_RELEVANCY_PROMPT = `Analyze the provided context and identify which parts are relevant to answering the given question. For each context sentence or passage, determine its relevance level.
+
+@@ -144,6 +145,7 @@ export function createContextRelevancyScorer<
+ const agent = new Agent({
+ name: "context-relevancy-evaluator",
+ model,
++ trafficPriority: "P2",
+ instructions: "You evaluate how relevant provided context is to answering questions",
+ });
+
+@@ -151,13 +153,16 @@ export function createContextRelevancyScorer<
+ const contextText = Array.isArray(payload.context)
+ ? payload.context.join("\n")
+ : payload.context;
++ const tenantId = extractTenantId(context);
+
+ const prompt = CONTEXT_RELEVANCY_PROMPT.replace("{{question}}", payload.input).replace(
+ "{{context}}",
+ contextText,
+ );
+
+- const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA);
++ const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA, {
++ tenantId,
++ });
+ const evaluations = response.object.evaluations;
+
+ context.results.raw.contextRelevancyEvaluations = evaluations;
+diff --git a/packages/scorers/src/llm/moderation.ts b/packages/scorers/src/llm/moderation.ts
+index 03563bfe..1055927f 100644
+--- a/packages/scorers/src/llm/moderation.ts
++++ b/packages/scorers/src/llm/moderation.ts
+@@ -7,6 +7,7 @@ import {
+ } from "@voltagent/core";
+ import { safeStringify } from "@voltagent/internal/utils";
+ import { z } from "zod";
++import { extractTenantId } from "./utils";
+
+ export interface ModerationScorerOptions {
+ id?: string;
+@@ -220,6 +221,7 @@ async function runModerationJudge(args: {
+ typeof context.results.prepare === "string"
+ ? context.results.prepare
+ : normalizeText(context.payload.output);
++ const tenantId = extractTenantId(context);
+
+ const prompt = await buildPrompt({
+ output: normalizedOutput,
+@@ -232,12 +234,14 @@ async function runModerationJudge(args: {
+ const agent = new Agent({
+ name: "moderation-judge",
+ model,
++ trafficPriority: "P2",
+ instructions:
+ "You are a safety classifier. Respond with JSON that matches the provided schema containing flagged, scores, and reason.",
+ });
+
+ const response = await agent.generateObject(prompt, MODERATION_SCHEMA, {
+ maxOutputTokens,
++ tenantId,
+ });
+
+ const parsed = mapModerationResponse(response.object, threshold);
+diff --git a/packages/scorers/src/llm/utils.ts b/packages/scorers/src/llm/utils.ts
+new file mode 100644
+index 00000000..75e886e3
+--- /dev/null
++++ b/packages/scorers/src/llm/utils.ts
+@@ -0,0 +1,14 @@
++import type { BuilderPrepareContext, BuilderScoreContext } from "@voltagent/core";
++
++type TenantAwareContext = BuilderScoreContext<Record<string, unknown>, Record<string, unknown>> &
++ BuilderPrepareContext<Record<string, unknown>, Record<string, unknown>>;
++
++export function extractTenantId(
++ context:
++ | BuilderScoreContext<Record<string, unknown>, Record<string, unknown>>
++ | BuilderPrepareContext<Record<string, unknown>, Record<string, unknown>>
++ | TenantAwareContext,
++): string | undefined {
++ const candidate = (context.payload as { tenantId?: unknown })?.tenantId;
++ return typeof candidate === "string" ? candidate : undefined;
++}
+diff --git a/packages/server-core/src/handlers/agent.handlers.ts b/packages/server-core/src/handlers/agent.handlers.ts
+index 00c0f2ee..37fbeaf4 100644
+--- a/packages/server-core/src/handlers/agent.handlers.ts
++++ b/packages/server-core/src/handlers/agent.handlers.ts
+@@ -1,11 +1,70 @@
+-import { ClientHTTPError, type ServerProviderDeps } from "@voltagent/core";
+-import { convertUsage } from "@voltagent/core";
++import {
++ ClientHTTPError,
++ type ServerProviderDeps,
++ type TrafficResponseMetadata,
++ convertUsage,
++} from "@voltagent/core";
+ import { type Logger, safeStringify } from "@voltagent/internal";
+ import { z } from "zod";
+ import { convertJsonSchemaToZod } from "zod-from-json-schema";
+ import { convertJsonSchemaToZod as convertJsonSchemaToZodV3 } from "zod-from-json-schema-v3";
+ import type { ApiResponse } from "../types";
+ import { processAgentOptions } from "../utils/options";
++import { buildTrafficHeaders } from "../utils/traffic";
++
++function extractTrafficMetadata(value: unknown): TrafficResponseMetadata | undefined {
++ if (!value || typeof value !== "object") return undefined;
++ const traffic = (value as { traffic?: unknown }).traffic;
++ if (!traffic || typeof traffic !== "object") return undefined;
++ return traffic as TrafficResponseMetadata;
++}
++
++function wrapStreamWithTraffic(
++ baseResponse: Response,
++ traffic?: TrafficResponseMetadata,
++): Response {
++ if (!traffic) return baseResponse;
++ const headers = new Headers(baseResponse.headers);
++ const trafficHeaders = buildTrafficHeaders(traffic);
++ for (const [key, value] of Object.entries(trafficHeaders)) {
++ headers.set(key, value);
++ }
++ const baseBody = baseResponse.body;
++ if (!baseBody) {
++ return new Response(baseBody, {
++ status: baseResponse.status,
++ headers,
++ });
++ }
++
++ const encoder = new TextEncoder();
++ const stream = new ReadableStream({
++ async start(controller) {
++ const trafficEvent = `data: ${safeStringify({ type: "traffic", traffic })}\n\n`;
++ controller.enqueue(encoder.encode(trafficEvent));
++ const reader = baseBody.getReader();
++ try {
++ while (true) {
++ const { done, value } = await reader.read();
++ if (done) break;
++ if (value !== undefined) {
++ controller.enqueue(value);
++ }
++ }
++ } catch (error) {
++ controller.error(error);
++ } finally {
++ reader.releaseLock();
++ controller.close();
++ }
++ },
++ });
++
++ return new Response(stream, {
++ status: baseResponse.status,
++ headers,
++ });
++}
+
+ /**
+ * Handler for listing all agents
+@@ -79,6 +138,7 @@ export async function handleGenerateText(
+ const options = processAgentOptions(body, signal);
+
+ const result = await agent.generateText(input, options);
++ const traffic = extractTrafficMetadata(result);
+
+ // Convert usage format if present
+ const usage = result.usage ? convertUsage(result.usage) : undefined;
+@@ -102,9 +162,11 @@ export async function handleGenerateText(
+ }
+ })(),
+ },
++ traffic,
+ };
+ } catch (error) {
+ logger.error("Failed to generate text", { error });
++ const traffic = extractTrafficMetadata(error);
+ if (error instanceof ClientHTTPError) {
+ return {
+ success: false,
+@@ -112,11 +174,13 @@ export async function handleGenerateText(
+ code: error.code,
+ name: error.name,
+ httpStatus: error.httpStatus,
++ traffic,
+ };
+ }
+ return {
+ success: false,
+ error: error instanceof Error ? error.message : "Unknown error",
++ traffic,
+ };
+ }
+ }
+@@ -153,6 +217,7 @@ export async function handleStreamText(
+ const options = processAgentOptions(body, signal);
+
+ const result = await agent.streamText(input, options);
++ const traffic = extractTrafficMetadata(result);
+
+ // Access the fullStream property
+ const { fullStream } = result;
+@@ -178,7 +243,7 @@ export async function handleStreamText(
+ },
+ });
+
+- return new Response(stream, {
++ const response = new Response(stream, {
+ status: 200,
+ headers: {
+ "Content-Type": "text/event-stream",
+@@ -186,20 +251,25 @@ export async function handleStreamText(
+ Connection: "keep-alive",
+ },
+ });
++ return wrapStreamWithTraffic(response, traffic);
+ } catch (error) {
+ logger.error("Failed to handle stream text request", { error });
+
+ const errorMessage = error instanceof Error ? error.message : "Unknown error";
++ const traffic = extractTrafficMetadata(error);
++ const trafficHeaders = buildTrafficHeaders(traffic);
+
+ return new Response(
+ safeStringify({
+ error: errorMessage,
+ message: errorMessage,
++ traffic,
+ }),
+ {
+ status: 500,
+ headers: {
+ "Content-Type": "application/json",
++ ...trafficHeaders,
+ },
+ },
+ );
+@@ -238,26 +308,32 @@ export async function handleChatStream(
+ const options = processAgentOptions(body, signal);
+
+ const result = await agent.streamText(input, options);
++ const traffic = extractTrafficMetadata(result);
+
+ // Use the built-in toUIMessageStreamResponse - it handles errors properly
+- return result.toUIMessageStreamResponse({
++ const response = result.toUIMessageStreamResponse({
+ sendReasoning: true,
+ sendSources: true,
+ });
++ return wrapStreamWithTraffic(response, traffic);
+ } catch (error) {
+ logger.error("Failed to handle chat stream request", { error });
+
+ const errorMessage = error instanceof Error ? error.message : "Unknown error";
++ const traffic = extractTrafficMetadata(error);
++ const trafficHeaders = buildTrafficHeaders(traffic);
+
+ return new Response(
+ safeStringify({
+ error: errorMessage,
+ message: errorMessage,
++ traffic,
+ }),
+ {
+ status: 500,
+ headers: {
+ "Content-Type": "application/json",
++ ...trafficHeaders,
+ },
+ },
+ );
+@@ -293,16 +369,20 @@ export async function handleGenerateObject(
+ ) as any;
+
+ const result = await agent.generateObject(input, zodSchema, options);
++ const traffic = extractTrafficMetadata(result);
+
+ return {
+ success: true,
+ data: result.object,
++ traffic,
+ };
+ } catch (error) {
+ logger.error("Failed to generate object", { error });
++ const traffic = extractTrafficMetadata(error);
+ return {
+ success: false,
+ error: error instanceof Error ? error.message : "Unknown error",
++ traffic,
+ };
+ }
+ }
+@@ -344,23 +424,29 @@ export async function handleStreamObject(
+ ) as any;
+
+ const result = await agent.streamObject(input, zodSchema, options);
++ const traffic = extractTrafficMetadata(result);
+
+ // Use the built-in toTextStreamResponse - it handles errors properly
+- return result.toTextStreamResponse();
++ const response = result.toTextStreamResponse();
++ return wrapStreamWithTraffic(response, traffic);
+ } catch (error) {
+ logger.error("Failed to handle stream object request", { error });
+
+ const errorMessage = error instanceof Error ? error.message : "Unknown error";
++ const traffic = extractTrafficMetadata(error);
++ const trafficHeaders = buildTrafficHeaders(traffic);
+
+ return new Response(
+ safeStringify({
+ error: errorMessage,
+ message: errorMessage,
++ traffic,
+ }),
+ {
+ status: 500,
+ headers: {
+ "Content-Type": "application/json",
++ ...trafficHeaders,
+ },
+ },
+ );
+diff --git a/packages/server-core/src/index.ts b/packages/server-core/src/index.ts
+index 1fe7e206..2f7ed826 100644
+--- a/packages/server-core/src/index.ts
++++ b/packages/server-core/src/index.ts
+@@ -40,6 +40,7 @@ export * from "./utils/server-utils";
+ export * from "./utils/ui-templates";
+ export * from "./utils/response-mappers";
+ export * from "./utils/sse";
++export * from "./utils/traffic";
+ export * from "./utils/announcements";
+
+ // Export WebSocket utilities
+diff --git a/packages/server-core/src/schemas/agent.schemas.ts b/packages/server-core/src/schemas/agent.schemas.ts
+index 52e80b83..41181e00 100644
+--- a/packages/server-core/src/schemas/agent.schemas.ts
++++ b/packages/server-core/src/schemas/agent.schemas.ts
+@@ -77,6 +77,18 @@ export const GenerateOptionsSchema = z
+ .object({
+ userId: z.string().optional().describe("Optional user ID for context tracking"),
+ conversationId: z.string().optional().describe("Optional conversation ID for context tracking"),
++ tenantId: z.string().optional().describe("Optional tenant ID for traffic limits"),
++ trafficPriority: z
++ .enum(["P0", "P1", "P2"])
++ .optional()
++ .describe("Optional traffic priority for scheduling (P0, P1, P2)"),
++ apiKeyId: z.string().optional().describe("Optional API key identifier for traffic limits"),
++ region: z.string().optional().describe("Optional region identifier for traffic limits"),
++ endpoint: z.string().optional().describe("Optional endpoint identifier for traffic limits"),
++ tenantTier: z
++ .string()
++ .optional()
++ .describe("Optional tenant tier identifier for traffic limits"),
+ context: z
+ .record(z.string(), z.unknown())
+ .nullish()
+@@ -94,6 +106,14 @@ export const GenerateOptionsSchema = z
+ .positive()
+ .optional()
+ .describe("Maximum number of steps for this request"),
++ maxQueueWaitMs: z
++ .number()
++ .int()
++ .nonnegative()
++ .optional()
++ .describe("Maximum time to wait in the queue before timing out (ms)"),
++ taskType: z.string().optional().describe("Optional task classification for fallback policy"),
++ fallbackPolicyId: z.string().optional().describe("Optional explicit fallback policy id"),
+ temperature: z
+ .number()
+ .min(0)
+diff --git a/packages/server-core/src/types/responses.ts b/packages/server-core/src/types/responses.ts
+index 2098c2f6..4935a535 100644
+--- a/packages/server-core/src/types/responses.ts
++++ b/packages/server-core/src/types/responses.ts
+@@ -1,10 +1,12 @@
+ /**
+ * Framework-agnostic response types for server handlers
+ */
++import type { TrafficResponseMetadata } from "@voltagent/core";
+
+ export interface SuccessResponse<T = unknown> {
+ success: true;
+ data: T;
++ traffic?: TrafficResponseMetadata;
+ }
+
+ export interface ErrorResponse {
+@@ -13,6 +15,7 @@ export interface ErrorResponse {
+ httpStatus?: number;
+ code?: string;
+ name?: string;
++ traffic?: TrafficResponseMetadata;
+ }
+
+ export type ApiResponse<T = unknown> = SuccessResponse<T> | ErrorResponse;
+diff --git a/packages/server-core/src/utils/traffic.ts b/packages/server-core/src/utils/traffic.ts
+new file mode 100644
+index 00000000..f9be1845
+--- /dev/null
++++ b/packages/server-core/src/utils/traffic.ts
+@@ -0,0 +1,35 @@
++import type { TrafficResponseMetadata } from "@voltagent/core";
++
++export function buildTrafficHeaders(traffic?: TrafficResponseMetadata): Record<string, string> {
++ if (!traffic) return {};
++
++ const headers: Record<string, string> = {};
++
++ if (typeof traffic.retryAfterMs === "number" && Number.isFinite(traffic.retryAfterMs)) {
++ headers["Retry-After"] = String(Math.max(0, Math.ceil(traffic.retryAfterMs / 1000)));
++ }
++
++ if (traffic.rateLimitRemaining !== undefined) {
++ headers["X-RateLimit-Remaining"] = String(traffic.rateLimitRemaining);
++ }
++
++ if (typeof traffic.rateLimitResetAt === "number" && Number.isFinite(traffic.rateLimitResetAt)) {
++ headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(traffic.rateLimitResetAt / 1000)));
++ } else if (
++ typeof traffic.rateLimitResetInMs === "number" &&
++ Number.isFinite(traffic.rateLimitResetInMs)
++ ) {
++ const resetAt = Date.now() + Math.max(0, traffic.rateLimitResetInMs);
++ headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(resetAt / 1000)));
++ }
++
++ if (traffic.queueEtaMs !== undefined) {
++ headers["X-Queue-ETA"] = String(traffic.queueEtaMs);
++ }
++
++ if (traffic.rateLimitKey) {
++ headers["X-RateLimit-Key"] = traffic.rateLimitKey;
++ }
++
++ return headers;
++}
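++
++// Mapping sketch: retryAfterMs is rounded up to whole seconds for Retry-After,
++// and reset times become unix-second X-RateLimit-Reset values. For example,
++// with an illustrative input:
++//
++//   buildTrafficHeaders({ retryAfterMs: 1_500, rateLimitRemaining: 0, queueEtaMs: 250 });
++//   // => { "Retry-After": "2", "X-RateLimit-Remaining": "0", "X-Queue-ETA": "250" }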
+diff --git a/packages/server-hono/src/routes/index.ts b/packages/server-hono/src/routes/index.ts
+index a5af8214..336a5bf4 100644
+--- a/packages/server-hono/src/routes/index.ts
++++ b/packages/server-hono/src/routes/index.ts
+@@ -2,6 +2,7 @@ import type { ServerProviderDeps } from "@voltagent/core";
+ import type { Logger } from "@voltagent/internal";
+ import {
+ UPDATE_ROUTES,
++ buildTrafficHeaders,
+ handleCancelWorkflow,
+ handleChatStream,
+ handleCheckUpdates,
+@@ -87,11 +88,12 @@ export function registerAgentRoutes(
+
+ const signal = c.req.raw.signal;
+ const response = await handleGenerateText(agentId, body, deps, logger, signal);
++ const trafficHeaders = buildTrafficHeaders(response.traffic);
+ if (!response.success) {
+ const { httpStatus, ...details } = response;
+- return c.json(details, httpStatus || 500);
++ return c.json(details, httpStatus || 500, trafficHeaders);
+ }
+- return c.json(response, 200);
++ return c.json(response, 200, trafficHeaders);
+ });
+
+ // POST /agents/:id/stream - Stream text (raw fullStream SSE)
+@@ -131,11 +133,12 @@ export function registerAgentRoutes(
+ const body = await c.req.json();
+ const signal = c.req.raw.signal;
+ const response = await handleGenerateObject(agentId, body, deps, logger, signal);
++ const trafficHeaders = buildTrafficHeaders(response.traffic);
+ if (!response.success) {
+ const { httpStatus, ...details } = response;
+- return c.json(details, httpStatus || 500);
++ return c.json(details, httpStatus || 500, trafficHeaders);
+ }
+- return c.json(response, 200);
++ return c.json(response, 200, trafficHeaders);
+ });
+
+ // POST /agents/:id/stream-object - Stream object
+diff --git a/packages/serverless-hono/src/routes.ts b/packages/serverless-hono/src/routes.ts
+index d377ce4b..39eabcf7 100644
+--- a/packages/serverless-hono/src/routes.ts
++++ b/packages/serverless-hono/src/routes.ts
+@@ -28,6 +28,7 @@ import {
+ type TriggerHttpRequestContext,
+ UPDATE_ROUTES,
+ WORKFLOW_ROUTES,
++ buildTrafficHeaders,
+ executeA2ARequest,
+ executeTriggerHandler,
+ getConversationMessagesHandler,
+@@ -165,7 +166,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger:
+ }
+ const signal = c.req.raw.signal;
+ const response = await handleGenerateText(agentId, body, deps, logger, signal);
+- return c.json(response, response.success ? 200 : 500);
++ const trafficHeaders = buildTrafficHeaders(response.traffic);
++ return c.json(response, response.success ? 200 : 500, trafficHeaders);
+ });
+
+ app.post(AGENT_ROUTES.streamText.path, async (c) => {
+@@ -197,7 +199,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger:
+ }
+ const signal = c.req.raw.signal;
+ const response = await handleGenerateObject(agentId, body, deps, logger, signal);
+- return c.json(response, response.success ? 200 : 500);
++ const trafficHeaders = buildTrafficHeaders(response.traffic);
++ return c.json(response, response.success ? 200 : 500, trafficHeaders);
+ });
+
+ app.post(AGENT_ROUTES.streamObject.path, async (c) => {
+diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
+index 20029de4..6671d8c1 100644
+--- a/pnpm-lock.yaml
++++ b/pnpm-lock.yaml
+@@ -37,7 +37,7 @@ importers:
+ version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2)
+ '@nx/plugin':
+ specifier: 20.4.6
+- version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2)
++ version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2)
+ '@nx/vite':
+ specifier: 20.4.6
+ version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2)(vite@7.2.7)(vitest@3.2.4)
+@@ -92,6 +92,9 @@ importers:
+ syncpack:
+ specifier: ^13.0.2
+ version: 13.0.4(typescript@5.9.2)
++ ts-node:
++ specifier: ^10.9.2
++ version: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2)
+ tslib:
+ specifier: ^2.3.0
+ version: 2.8.1
+@@ -99,7 +102,7 @@ importers:
+ specifier: ^8.5.0
+ version: 8.5.0(@swc/core@1.5.29)(typescript@5.9.2)
+ typescript:
+- specifier: ^5.8.2
++ specifier: ^5.9.2
+ version: 5.9.2
+ vite:
+ specifier: ^7.2.7
+@@ -2750,6 +2753,61 @@ importers:
+ specifier: ^0.5.3
+ version: 0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7)
+
++ examples/with-viteval/dist:
++ dependencies:
++ '@ai-sdk/openai':
++ specifier: ^2.0.52
++ version: 2.0.85(zod@3.25.76)
++ '@voltagent/cli':
++ specifier: ^0.1.16
++ version: link:../../../packages/cli
++ '@voltagent/core':
++ specifier: ^1.2.15
++ version: link:../../../packages/core
++ '@voltagent/libsql':
++ specifier: ^1.0.13
++ version: link:../../../packages/libsql
++ '@voltagent/logger':
++ specifier: ^1.0.4
++ version: link:../../../packages/logger
++ '@voltagent/server-hono':
++ specifier: ^1.2.5
++ version: link:../../../packages/server-hono
++ ai:
++ specifier: ^5.0.76
++ version: 5.0.113(zod@3.25.76)
++ consola:
++ specifier: ^3.4.2
++ version: 3.4.2
++ envalid:
++ specifier: ^8.1.0
++ version: 8.1.0
++ yargs:
++ specifier: ^18.0.0
++ version: 18.0.0
++ zod:
++ specifier: ^3.25.76
++ version: 3.25.76
++ devDependencies:
++ '@tsconfig/node24':
++ specifier: ^24.0.1
++ version: 24.0.1
++ '@types/yargs':
++ specifier: ^17.0.33
++ version: 17.0.33
++ dotenv:
++ specifier: ^16.4.5
++ version: 16.6.1
++ tsx:
++ specifier: ^4.19.3
++ version: 4.20.4
++ typescript:
++ specifier: ^5.8.2
++ version: 5.9.2
++ viteval:
++ specifier: ^0.5.3
++ version: 0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7)
++
+ examples/with-voice-elevenlabs:
+ dependencies:
+ '@ai-sdk/openai':
+@@ -3509,7 +3567,7 @@ importers:
+ version: 3.2.4(vitest@3.2.4)
+ jest:
+ specifier: ^29.5.0
+- version: 29.7.0(@types/node@24.2.1)
++ version: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2)
+ ts-jest:
+ specifier: ^29.1.0
+ version: 29.4.1(@babel/core@7.28.5)(esbuild@0.25.10)(jest@29.7.0)(typescript@5.9.2)
+@@ -9966,7 +10024,7 @@ packages:
+ slash: 3.0.0
+ dev: true
+
+- /@jest/core@29.7.0:
++ /@jest/core@29.7.0(ts-node@10.9.2):
+ resolution: {integrity: sha512-n7aeXWKMnGtDA48y8TLWJPJmLmmZ642Ceo78cYWEpiD7FzDgmNDV/GCVRorPABdXLJZ/9wzzgZAlHjXjxDHGsg==}
+ engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
+ peerDependencies:
+@@ -9987,7 +10045,7 @@ packages:
+ exit: 0.1.2
+ graceful-fs: 4.2.11
+ jest-changed-files: 29.7.0
+- jest-config: 29.7.0(@types/node@24.6.2)
++ jest-config: 29.7.0(@types/node@24.6.2)(ts-node@10.9.2)
+ jest-haste-map: 29.7.0
+ jest-message-util: 29.7.0
+ jest-regex-util: 29.6.3
+@@ -12403,7 +12461,7 @@ packages:
+ - verdaccio
+ dev: true
+
+- /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2):
++ /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2):
+ resolution: {integrity: sha512-yZOZJOQFtpdY3Fu/WYNoDx81TwvF9yfwvalFpLD19bz+2YGl7B89l0S1ZrtSRXFfKXA/w7gb0gmKwthJtQhx9Q==}
+ dependencies:
+ '@jest/reporters': 29.7.0
+@@ -12412,7 +12470,7 @@ packages:
+ '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2)
+ '@phenomnomnominal/tsquery': 5.0.1(typescript@5.9.2)
+ identity-obj-proxy: 3.0.0
+- jest-config: 29.7.0(@types/node@24.2.1)
++ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2)
+ jest-resolve: 29.7.0
+ jest-util: 29.7.0
+ minimatch: 9.0.3
+@@ -12807,12 +12865,12 @@ packages:
+ dev: true
+ optional: true
+
+- /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2):
++ /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2):
+ resolution: {integrity: sha512-7Jlv+BVqGoO0BolQN7P5Z87160phuE1i7H6C8xFwQnlQ3ZfwQCJzk2dkg1UyzxDkWl6lvVsqBjZPXD55gFQ3+w==}
+ dependencies:
+ '@nx/devkit': 20.4.6(nx@20.8.2)
+ '@nx/eslint': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)
+- '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2)
++ '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2)
+ '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2)
+ tslib: 2.8.1
+ transitivePeerDependencies:
+@@ -17770,8 +17828,8 @@ packages:
+ '@babel/plugin-syntax-jsx': 7.27.1(@babel/core@7.28.5)
+ '@babel/plugin-syntax-typescript': 7.27.1(@babel/core@7.28.5)
+ '@babel/template': 7.27.2
+- '@babel/traverse': 7.28.4
+- '@babel/types': 7.28.4
++ '@babel/traverse': 7.28.5
++ '@babel/types': 7.28.5
+ '@tanstack/react-router': 1.131.44(react-dom@19.2.3)(react@19.2.3)
+ '@tanstack/router-core': 1.131.44
+ '@tanstack/router-generator': 1.131.44
+@@ -22783,7 +22841,7 @@ packages:
+ crc-32: 1.2.2
+ readable-stream: 4.7.0
+
+- /create-jest@29.7.0(@types/node@24.2.1):
++ /create-jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2):
+ resolution: {integrity: sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==}
+ engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
+ hasBin: true
+@@ -22792,7 +22850,7 @@ packages:
+ chalk: 4.1.2
+ exit: 0.1.2
+ graceful-fs: 4.2.11
+- jest-config: 29.7.0(@types/node@24.2.1)
++ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2)
+ jest-util: 29.7.0
+ prompts: 2.4.2
+ transitivePeerDependencies:
+@@ -27641,7 +27699,7 @@ packages:
+ - supports-color
+ dev: true
+
+- /jest-cli@29.7.0(@types/node@24.2.1):
++ /jest-cli@29.7.0(@types/node@24.2.1)(ts-node@10.9.2):
+ resolution: {integrity: sha512-OVVobw2IubN/GSYsxETi+gOe7Ka59EFMR/twOU3Jb2GnKKeMGJB5SGUUrEz3SFVmJASUdZUzy83sLNNQ2gZslg==}
+ engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
+ hasBin: true
+@@ -27651,14 +27709,14 @@ packages:
+ node-notifier:
+ optional: true
+ dependencies:
+- '@jest/core': 29.7.0
++ '@jest/core': 29.7.0(ts-node@10.9.2)
+ '@jest/test-result': 29.7.0
+ '@jest/types': 29.6.3
+ chalk: 4.1.2
+- create-jest: 29.7.0(@types/node@24.2.1)
++ create-jest: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2)
+ exit: 0.1.2
+ import-local: 3.2.0
+- jest-config: 29.7.0(@types/node@24.2.1)
++ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2)
+ jest-util: 29.7.0
+ jest-validate: 29.7.0
+ yargs: 17.7.2
+@@ -27669,7 +27727,7 @@ packages:
+ - ts-node
+ dev: true
+
+- /jest-config@29.7.0(@types/node@24.2.1):
++ /jest-config@29.7.0(@types/node@24.2.1)(ts-node@10.9.2):
+ resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==}
+ engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
+ peerDependencies:
+@@ -27704,12 +27762,13 @@ packages:
+ pretty-format: 29.7.0
+ slash: 3.0.0
+ strip-json-comments: 3.1.1
++ ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2)
+ transitivePeerDependencies:
+ - babel-plugin-macros
+ - supports-color
+ dev: true
+
+- /jest-config@29.7.0(@types/node@24.6.2):
++ /jest-config@29.7.0(@types/node@24.6.2)(ts-node@10.9.2):
+ resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==}
+ engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
+ peerDependencies:
+@@ -27744,6 +27803,7 @@ packages:
+ pretty-format: 29.7.0
+ slash: 3.0.0
+ strip-json-comments: 3.1.1
++ ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2)
+ transitivePeerDependencies:
+ - babel-plugin-macros
+ - supports-color
+@@ -28041,7 +28101,7 @@ packages:
+ supports-color: 8.1.1
+ dev: true
+
+- /jest@29.7.0(@types/node@24.2.1):
++ /jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2):
+ resolution: {integrity: sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==}
+ engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
+ hasBin: true
+@@ -28051,10 +28111,10 @@ packages:
+ node-notifier:
+ optional: true
+ dependencies:
+- '@jest/core': 29.7.0
++ '@jest/core': 29.7.0(ts-node@10.9.2)
+ '@jest/types': 29.6.3
+ import-local: 3.2.0
+- jest-cli: 29.7.0(@types/node@24.2.1)
++ jest-cli: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2)
+ transitivePeerDependencies:
+ - '@types/node'
+ - babel-plugin-macros
+@@ -36767,7 +36827,7 @@ packages:
+ esbuild: 0.25.10
+ fast-json-stable-stringify: 2.1.0
+ handlebars: 4.7.8
+- jest: 29.7.0(@types/node@24.2.1)
++ jest: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2)
+ json5: 2.2.3
+ lodash.memoize: 4.1.2
+ make-error: 1.3.6
+diff --git a/tmp/test/traffic-concurrency.ts b/tmp/test/traffic-concurrency.ts
+new file mode 100644
+index 00000000..d12fc5c9
+--- /dev/null
++++ b/tmp/test/traffic-concurrency.ts
+@@ -0,0 +1,91 @@
++// @ts-nocheck
++/**
++ * Manual test: TrafficController maxConcurrent scheduling.
++ *
++ * What to look for:
++ * - `inFlight` should never exceed `maxConcurrent`.
++ * - Requests should start in bursts up to `maxConcurrent`.
++ *
++ * Run:
++ * - pnpm ts-node tmp/test/traffic-concurrency.ts
++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-concurrency.ts (enable controller debug logs)
++ */
++
++import { safeStringify } from "@voltagent/internal";
++import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++ console.debug = () => {};
++}
++
++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
++const now = () => new Date().toISOString();
++
++const maxConcurrent = 3;
++const controller = getTrafficController({ maxConcurrent });
++
++let inFlight = 0;
++let maxObserved = 0;
++
++function makeModel(id: string, durationMs: number) {
++ return {
++ specificationVersion: "v2",
++ provider: "sim",
++ modelId: `concurrency-${id}`,
++ doGenerate: async () => {
++ inFlight += 1;
++ maxObserved = Math.max(maxObserved, inFlight);
++ console.log(`[${now()}] start ${id} inFlight=${inFlight}`);
++
++ try {
++ await sleep(durationMs);
++ return {
++ content: [{ type: "text", text: `ok:${id}` }],
++ finishReason: "stop",
++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
++ warnings: [],
++ response: { modelId: `concurrency-${id}`, headers: {} },
++ };
++ } finally {
++ inFlight -= 1;
++ console.log(`[${now()}] end ${id} inFlight=${inFlight}`);
++ }
++ },
++ };
++}
++
++async function main() {
++ console.log(`\n=== TrafficController concurrency (maxConcurrent=${maxConcurrent}) ===`);
++ void controller;
++
++ const agent = new Agent({
++ name: "traffic-concurrency",
++ instructions: "echo",
++ model: makeModel("base", 0),
++ temperature: 0,
++ maxOutputTokens: 32,
++ });
++
++ const ids = ["A", "B", "C", "D", "E"];
++ const jobs = ids.map((id) =>
++ agent.generateText(id, {
++ tenantId: "default",
++ trafficPriority: "P1",
++ model: makeModel(id, 700),
++ }),
++ );
++
++ const settled = await Promise.allSettled(jobs);
++ console.log(`\n[done] maxObserved=${maxObserved}`);
++ console.log(
++ `[done] results=${safeStringify(
++ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)),
++ )}`,
++ );
++}
++
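++// A minimal standalone sketch of the gate this test observes. Assumption:
++// the real TrafficController is far more involved (priorities, rate limits,
++// fallbacks); this only illustrates the `maxConcurrent` invariant above.
++function makeConcurrencyGateSketch(limit: number) {
++  let active = 0;
++  const waiters: Array<() => void> = [];
++
++  return async function run<T>(task: () => Promise<T>): Promise<T> {
++    // Queue up while the gate is full or others are already waiting.
++    if (active >= limit || waiters.length > 0) {
++      await new Promise<void>((resolve) => waiters.push(resolve));
++    }
++    active += 1;
++    try {
++      return await task();
++    } finally {
++      active -= 1;
++      waiters.shift()?.(); // wake exactly one queued task
++    }
++  };
++}
++void makeConcurrencyGateSketch; // illustrative only; not used by the test
++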
++main().catch((error) => {
++ console.error("Fatal error:", error);
++ process.exit(1);
++});
+diff --git a/tmp/test/traffic-fallback-chain.ts b/tmp/test/traffic-fallback-chain.ts
+new file mode 100644
+index 00000000..0cd77b2b
+--- /dev/null
++++ b/tmp/test/traffic-fallback-chain.ts
+@@ -0,0 +1,168 @@
++// @ts-nocheck
++/**
++ * Manual test: TrafficController circuit breaker + fallback chains.
++ *
++ * Scenarios:
++ * - Test 1: Open primary circuit (via repeated 429s), then route to fallback1.
++ * - Test 2: Open fallback1 circuit, then route to fallback2 (success).
++ * - Test 3: No fallback configured → CircuitBreakerOpenError.
++ *
++ * Run:
++ * - pnpm ts-node tmp/test/traffic-fallback-chain.ts
++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-fallback-chain.ts
++ */
++
++import { safeStringify } from "@voltagent/internal";
++import { MockLanguageModelV2, MockProviderV2 } from "ai/test";
++import {
++ Agent,
++ CircuitBreakerOpenError,
++ getTrafficController,
++} from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++ console.debug = () => {};
++}
++
++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
++const now = () => new Date().toISOString();
++
++type ModelId = "primary" | "fallback1" | "fallback2" | "no-fallback";
++
++const provider = "test-provider";
++
++const controller = getTrafficController({
++ maxConcurrent: 1,
++ fallbackChains: {
++ primary: ["fallback1", "fallback2"],
++ fallback1: ["fallback2"],
++ },
++});
++
++function makeAlways429Model(modelId: ModelId) {
++ let attempts = 0;
++ return new MockLanguageModelV2({
++ provider,
++ modelId,
++ doGenerate: async () => {
++ attempts += 1;
++ console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> 429`);
++ await sleep(25);
++ const err: any = new Error(`forced 429 for model=${modelId} attempt=${attempts}`);
++ err.status = 429;
++ throw err;
++ },
++ });
++}
++
++function makeAlwaysOkModel(modelId: ModelId) {
++ let attempts = 0;
++ return new MockLanguageModelV2({
++ provider,
++ modelId,
++ doGenerate: async () => {
++ attempts += 1;
++ console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> ok`);
++ await sleep(25);
++ return {
++ content: [{ type: "text", text: `ok:${modelId}` }],
++ finishReason: "stop",
++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
++ warnings: [],
++ response: { modelId, headers: {} },
++ };
++ },
++ });
++}
++
++const primaryModel = makeAlways429Model("primary");
++const fallback1Model = makeAlways429Model("fallback1");
++const fallback2Model = makeAlwaysOkModel("fallback2");
++const noFallbackModel = makeAlways429Model("no-fallback");
++
++// Required so Agent fallbacks (string model IDs) resolve without network calls.
++(globalThis as any).AI_SDK_DEFAULT_PROVIDER = new MockProviderV2({
++ languageModels: {
++ primary: primaryModel,
++ fallback1: fallback1Model,
++ fallback2: fallback2Model,
++ "no-fallback": noFallbackModel,
++ },
++});
++
++const primaryAgent = new Agent({
++ name: "traffic-fallback-primary",
++ instructions: "echo",
++ model: primaryModel,
++ temperature: 0,
++ maxOutputTokens: 32,
++});
++
++const noFallbackAgent = new Agent({
++ name: "traffic-fallback-none",
++ instructions: "echo",
++ model: noFallbackModel,
++ temperature: 0,
++ maxOutputTokens: 32,
++});
++
++async function runOnce(label: string, agent: any) {
++ console.log(`\n--- ${label} ---`);
++ try {
++ const result = await agent.generateText(label, {
++ tenantId: "default",
++ trafficPriority: "P1",
++ });
++ console.log(
++ `[${label}] success text=${result.text} responseModel=${result.response?.modelId ?? "n/a"}`,
++ );
++ } catch (err: any) {
++ if (err instanceof CircuitBreakerOpenError) {
++ console.log(
++ `[${label}] CircuitBreakerOpenError retryAfterMs=${err.retryAfterMs} msg=${err.message}`,
++ );
++ } else {
++ console.log(
++ `[${label}] failed name=${err?.name ?? "Error"} status=${err?.status ?? err?.statusCode ?? "n/a"} msg=${err?.message}`,
++ );
++ }
++ }
++}
++
++async function main() {
++ console.log("\n=== Circuit breaker + fallback chain ===");
++ void controller;
++
++ console.log("\n[Test 1] Open primary circuit, then route to fallback1");
++ // Two calls × up to 3 attempts each ≈ 6 failures → should open the circuit (threshold=5).
++ await runOnce("primary-warmup-1", primaryAgent);
++ await runOnce("primary-warmup-2", primaryAgent);
++ await runOnce("primary-after-open", primaryAgent); // should execute fallback1 (still closed)
++
++ console.log("\n[Test 2] Open fallback1 circuit, then route to fallback2");
++ // Build up enough failures on fallback1 by routing more requests through the primary's circuit-open fallback path.
++ await runOnce("fallback1-warmup-1-via-primary", primaryAgent);
++ await runOnce("fallback1-warmup-2-via-primary", primaryAgent);
++ await runOnce("primary-should-hit-fallback2", primaryAgent); // should execute fallback2 and succeed
++
++ console.log("\n[Test 3] No fallback configured → CircuitBreakerOpenError");
++ await runOnce("no-fallback-warmup-1", noFallbackAgent);
++ await runOnce("no-fallback-warmup-2", noFallbackAgent);
++ await runOnce("no-fallback-after-open", noFallbackAgent);
++
++ console.log("\n[debug] model call counts:");
++ console.log(
++ safeStringify({
++ primary: primaryModel.doGenerateCalls?.length,
++ fallback1: fallback1Model.doGenerateCalls?.length,
++ fallback2: fallback2Model.doGenerateCalls?.length,
++ "no-fallback": noFallbackModel.doGenerateCalls?.length,
++ }),
++ );
++}
++
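++// Minimal sketch of the breaker cycle these tests drive (closed → open →
++// half-open probe). Assumption: names and shapes here are illustrative; the
++// controller's actual breaker tracks more state than this.
++function makeBreakerSketch(threshold: number, cooldownMs: number) {
++  let consecutiveFailures = 0;
++  let openedAt: number | undefined;
++
++  return {
++    canExecute(now = Date.now()): boolean {
++      if (openedAt === undefined) return true; // closed
++      return now - openedAt >= cooldownMs; // half-open: allow one probe
++    },
++    onSuccess() {
++      consecutiveFailures = 0;
++      openedAt = undefined; // close again
++    },
++    onFailure(now = Date.now()) {
++      consecutiveFailures += 1;
++      if (consecutiveFailures >= threshold) openedAt = now; // open
++    },
++  };
++}
++void makeBreakerSketch; // illustrative only
++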
++main().catch((error) => {
++ console.error("Fatal error:", error);
++ process.exit(1);
++});
+diff --git a/tmp/test/traffic-priority-openai-real.ts b/tmp/test/traffic-priority-openai-real.ts
+new file mode 100644
+index 00000000..223263ba
+--- /dev/null
++++ b/tmp/test/traffic-priority-openai-real.ts
+@@ -0,0 +1,117 @@
++// @ts-nocheck
++/**
++ * Manual test: TrafficController + AI SDK with real OpenAI calls.
++ *
++ * What this exercises:
++ * - Priority scheduling (P0/P1/P2) with `maxConcurrent=1`
++ * - Rate limit header ingestion via `updateRateLimitFromHeaders()` (if headers are present)
++ * - Tenant usage aggregation via `extractUsage` + `getTenantUsage()`
++ *
++ * Prereqs:
++ * - Set `OPENAI_API_KEY`
++ *
++ * Run:
++ * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts
++ * - VERBOSE=1 OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts
++ *
++ * Notes:
++ * - This will make real network calls and may incur cost.
++ */
++
++import { openai } from "@ai-sdk/openai";
++import { safeStringify } from "@voltagent/internal";
++import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++ console.debug = () => {};
++}
++
++const apiKey = process.env.OPENAI_API_KEY;
++if (!apiKey) {
++ console.error("Missing OPENAI_API_KEY. Example:");
++ console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts");
++ process.exit(1);
++}
++
++const _now = () => new Date().toISOString();
++const preview = (value: unknown, max = 140) => {
++ if (typeof value !== "string") return String(value ?? "");
++ return value.length > max ? `${value.slice(0, max)}…` : value;
++};
++
++const tenantId = process.env.TENANT_ID ?? "openai-real";
++const defaultModelId = process.env.OPENAI_MODEL ?? "gpt-4o-mini";
++
++const controller = getTrafficController({ maxConcurrent: 1 });
++
++function getHeader(headers: any, name: string): string | undefined {
++ if (!headers) return undefined;
++ if (typeof headers.get === "function") {
++ const v = headers.get(name);
++ return v === null || v === undefined ? undefined : String(v);
++ }
++ const key = Object.keys(headers).find((k) => k.toLowerCase() === name.toLowerCase());
++ if (!key) return undefined;
++ const v = headers[key];
++ return v === null || v === undefined ? undefined : String(Array.isArray(v) ? v[0] : v);
++}
++
++async function main() {
++ console.log(
++ `\n=== OpenAI real: priority scheduling (tenantId=${tenantId}, model=${defaultModelId}) ===`,
++ );
++ void controller;
++
++ const agent = new Agent({
++ name: "openai-real-traffic",
++ instructions: "Reply exactly with the requested token.",
++ model: openai(defaultModelId),
++ temperature: 0,
++ maxOutputTokens: 32,
++ });
++
++ // Enqueue in reverse priority order; controller should still execute P0 first.
++ const p2 = agent.generateText("Reply with only: P2", { tenantId, trafficPriority: "P2" });
++ const p1 = agent.generateText("Reply with only: P1", { tenantId, trafficPriority: "P1" });
++ const p0 = agent.generateText("Reply with only: P0", { tenantId, trafficPriority: "P0" });
++
++ const settled = await Promise.allSettled([p0, p1, p2]);
++ for (const result of settled) {
++ if (result.status !== "fulfilled") {
++ console.log(`[result] rejected=${result.reason?.message ?? String(result.reason)}`);
++ continue;
++ }
++
++ const headers = result.value.response?.headers;
++ const limit = getHeader(headers, "x-ratelimit-limit-requests");
++ const remaining = getHeader(headers, "x-ratelimit-remaining-requests");
++ const reset = getHeader(headers, "x-ratelimit-reset-requests");
++
++ console.log(
++ `[result] text=${preview(result.value.text)} finishReason=${result.value.finishReason} usage=${safeStringify(result.value.usage)}`,
++ );
++ console.log(
++ `[result] ratelimitHeaders=${safeStringify({
++ limit,
++ remaining,
++ reset,
++ })}`,
++ );
++ }
++
++ console.log(
++ `\n[done] settled=${safeStringify(
++ settled.map((s) => (s.status === "fulfilled" ? preview(s.value.text) : s.reason?.message)),
++ )}`,
++ );
++
++ console.log(
++ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`,
++ );
++}
++
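++// The ordering property exercised above, as a standalone sketch: with
++// maxConcurrent=1 and all three requests enqueued before the worker drains,
++// items run by priority rank, FIFO within a rank (Array#sort is stable).
++// Assumption: the real scheduler's data structure differs; only the
++// observable order matters here.
++type PrioritySketchItem = { priority: "P0" | "P1" | "P2" };
++function dequeueByPrioritySketch(queue: PrioritySketchItem[]): PrioritySketchItem | undefined {
++  const rank = { P0: 0, P1: 1, P2: 2 } as const;
++  queue.sort((a, b) => rank[a.priority] - rank[b.priority]);
++  return queue.shift();
++}
++void dequeueByPrioritySketch; // illustrative only
++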
++main().catch((error) => {
++ console.error("Fatal error:", error);
++ process.exit(1);
++});
+diff --git a/tmp/test/traffic-priority-openai-sim.ts b/tmp/test/traffic-priority-openai-sim.ts
+new file mode 100644
+index 00000000..9d36a7d1
+--- /dev/null
++++ b/tmp/test/traffic-priority-openai-sim.ts
+@@ -0,0 +1,114 @@
++// @ts-nocheck
++/**
++ * Manual test: Agent → TrafficController priority scheduling (OpenAI-like stub models).
++ *
++ * This keeps the Agent + AI SDK path, but avoids real network calls by using stub models
++ * that pretend to be `provider="openai"` with modelIds like `gpt-4o`/`gpt-4o-mini`.
++ *
++ * Scenarios:
++ * - Test 1: P0 runs before P1/P2 when all runnable.
++ * - Test 2: P0 request (gpt-4o) is rate-limited → P1 (gpt-4o-mini) proceeds.
++ *
++ * Note:
++ * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s.
++ *
++ * Run:
++ * - pnpm ts-node tmp/test/traffic-priority-openai-sim.ts
++ */
++
++import { safeStringify } from "@voltagent/internal";
++import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++ console.debug = () => {};
++}
++
++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
++const now = () => new Date().toISOString();
++
++function makeOpenAIStubModel(modelId: string, delayMs: number) {
++ let calls = 0;
++ return {
++ specificationVersion: "v2",
++ provider: "openai",
++ modelId,
++ doGenerate: async () => {
++ calls += 1;
++ console.log(`[${now()}] [model] ${modelId} doGenerate call=${calls}`);
++ await sleep(delayMs);
++ return {
++ content: [{ type: "text", text: `ok:${modelId}` }],
++ finishReason: "stop",
++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
++ warnings: [],
++ response: { modelId, headers: {} },
++ };
++ },
++ };
++}
++
++const controller = getTrafficController({ maxConcurrent: 1 });
++
++const modelMini = makeOpenAIStubModel("gpt-4o-mini", 80);
++const modelBig = makeOpenAIStubModel("gpt-4o", 80);
++
++const agent = new Agent({
++ name: "priority-openai-sim",
++ instructions: "echo",
++ model: modelMini,
++ temperature: 0,
++ maxOutputTokens: 32,
++});
++
++async function test1_priorityOrder() {
++ console.log("\n=== Test 1: P0 ordering via Agent ===");
++
++ const p2 = agent.generateText("P2", { trafficPriority: "P2", tenantId: "sim" });
++ const p1 = agent.generateText("P1", { trafficPriority: "P1", tenantId: "sim" });
++ const p0 = agent.generateText("P0", { trafficPriority: "P0", tenantId: "sim" });
++
++ const results = await Promise.all([p0, p1, p2]);
++ console.log(`[Test 1] results=${safeStringify(results.map((r) => r.text))}`);
++}
++
++async function test2_p1RunsWhenP0RateLimited() {
++ console.log("\n=== Test 2: P1 proceeds when P0 is rate-limited ===");
++
++ // Seed remaining=0 for openai::gpt-4o so the P0 head item initially waits.
++ const applied = controller.updateRateLimitFromHeaders(
++ { provider: "openai", model: "gpt-4o" },
++ {
++ "x-ratelimit-limit-requests": "1",
++ "x-ratelimit-remaining-requests": "0",
++ "x-ratelimit-reset-requests": "1s",
++ },
++ );
++ console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`);
++
++ const p0Blocked = agent.generateText("P0 (gpt-4o, rate-limited)", {
++ trafficPriority: "P0",
++ tenantId: "sim",
++ model: modelBig, // per-call model override (new in this branch)
++ });
++
++ const p1Free = agent.generateText("P1 (gpt-4o-mini)", {
++ trafficPriority: "P1",
++ tenantId: "sim",
++ model: modelMini,
++ });
++
++ const [r0, r1] = await Promise.all([p0Blocked, p1Free]);
++ console.log(`[Test 2] p0 text=${r0.text}`);
++ console.log(`[Test 2] p1 text=${r1.text}`);
++}
++
++async function main() {
++ await test1_priorityOrder();
++ await test2_p1RunsWhenP0RateLimited();
++}
++
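++// Sketch of the Test 2 behaviour: pick the first *runnable* item scanning in
++// priority order, so a rate-limited P0 at the head does not block a runnable
++// P1. Assumption: the controller's real runnability check also consults
++// reservations and pacing, not just a timestamp.
++type SimQueueItem = { priority: "P0" | "P1" | "P2"; runnableAt: number };
++function pickNextRunnableSketch(queue: SimQueueItem[], now = Date.now()): SimQueueItem | undefined {
++  const rank = { P0: 0, P1: 1, P2: 2 } as const;
++  const ordered = [...queue].sort((a, b) => rank[a.priority] - rank[b.priority]);
++  return ordered.find((item) => item.runnableAt <= now); // skip blocked heads
++}
++void pickNextRunnableSketch; // illustrative only
++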
++main().catch((error) => {
++ console.error("Fatal error:", error);
++ process.exit(1);
++});
+diff --git a/tmp/test/traffic-priority.ts b/tmp/test/traffic-priority.ts
+new file mode 100644
+index 00000000..409e1078
+--- /dev/null
++++ b/tmp/test/traffic-priority.ts
+@@ -0,0 +1,159 @@
++// @ts-nocheck
++/**
++ * Manual test: TrafficController priority scheduling.
++ *
++ * Scenarios:
++ * - Test 1: P0 should run before P1/P2 when runnable.
++ * - Test 2: If a P0 request is rate-limited, a lower priority (P1) can proceed.
++ *
++ * Note:
++ * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s.
++ *
++ * Run:
++ * - pnpm ts-node tmp/test/traffic-priority.ts
++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-priority.ts
++ */
++
++import { safeStringify } from "@voltagent/internal";
++import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++ console.debug = () => {};
++}
++
++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
++const now = () => new Date().toISOString();
++
++const controller = getTrafficController({ maxConcurrent: 1 });
++
++function extractLabel(prompt: any): string {
++ if (!Array.isArray(prompt)) {
++ return "unknown";
++ }
++
++ for (let index = prompt.length - 1; index >= 0; index -= 1) {
++ const message = prompt[index];
++ if (!message || message.role !== "user" || !Array.isArray(message.content)) {
++ continue;
++ }
++
++ const textPart = message.content.find((part: any) => part?.type === "text");
++ if (textPart?.text) {
++ return String(textPart.text);
++ }
++ }
++
++ return "unknown";
++}
++
++function makeModel(provider: string, modelId: string, delayMs = 50) {
++ let calls = 0;
++ let lastStartAt = 0;
++
++ return {
++ specificationVersion: "v2",
++ provider,
++ modelId,
++ doGenerate: async (options: any) => {
++ calls += 1;
++ const startAt = Date.now();
++ const delta = lastStartAt ? startAt - lastStartAt : 0;
++ lastStartAt = startAt;
++
++ const label = extractLabel(options?.prompt);
++ console.log(
++ `[${now()}] doGenerate start model=${provider}::${modelId} call=${calls} (+${delta}ms) input=${label}`,
++ );
++ await sleep(delayMs);
++ console.log(`[${now()}] doGenerate end model=${provider}::${modelId} input=${label}`);
++
++ return {
++ content: [{ type: "text", text: `ok:${label}` }],
++ finishReason: "stop",
++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
++ warnings: [],
++ response: { modelId, headers: {} },
++ };
++ },
++ };
++}
++
++async function test1_priorityOrder() {
++ console.log("\n=== Test 1: priority order (P0 before P1/P2) ===");
++
++ const sharedModel = makeModel("p", "shared-model", 50);
++ const agent = new Agent({
++ name: "traffic-priority",
++ instructions: "echo",
++ model: sharedModel,
++ temperature: 0,
++ maxOutputTokens: 32,
++ });
++
++ // Enqueue in reverse order; scheduler should still run P0 first.
++ const p2 = agent.generateText("P2", { tenantId: "default", trafficPriority: "P2" });
++ const p1 = agent.generateText("P1", { tenantId: "default", trafficPriority: "P1" });
++ const p0 = agent.generateText("P0", { tenantId: "default", trafficPriority: "P0" });
++
++ const settled = await Promise.allSettled([p0, p1, p2]);
++ console.log(
++ `[Test 1] results=${safeStringify(
++ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)),
++ )}`,
++ );
++}
++
++async function test2_lowerPriorityWhenP0RateLimited() {
++ console.log("\n=== Test 2: P1 proceeds when P0 rate-limited ===");
++
++ const applied = controller.updateRateLimitFromHeaders(
++ { provider: "p0", model: "m0" },
++ {
++ "x-ratelimit-limit-requests": "1",
++ "x-ratelimit-remaining-requests": "0",
++ "x-ratelimit-reset-requests": "1s",
++ },
++ );
++ console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`);
++
++ const modelP0 = makeModel("p0", "m0", 50);
++ const modelP1 = makeModel("p1", "m1", 50);
++ const agent = new Agent({
++ name: "traffic-priority-rate-limit",
++ instructions: "echo",
++ model: modelP1,
++ temperature: 0,
++ maxOutputTokens: 32,
++ });
++
++ // Now the next P0 request is at the head of the queue but rate-limited,
++ // so a runnable P1 request should execute first.
++ const p0Blocked = agent.generateText("P0-blocked (rate limited)", {
++ tenantId: "default",
++ trafficPriority: "P0",
++ model: modelP0,
++ });
++ const p1Free = agent.generateText("P1-free (should run first)", {
++ tenantId: "default",
++ trafficPriority: "P1",
++ model: modelP1,
++ });
++
++ const settled = await Promise.allSettled([p0Blocked, p1Free]);
++ console.log(
++ `[Test 2] results=${safeStringify(
++ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)),
++ )}`,
++ );
++}
++
++async function main() {
++ await test1_priorityOrder();
++ await test2_lowerPriorityWhenP0RateLimited();
++}
++
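++// Sketch of parsing an OpenAI-style reset duration (as in the synthetic
++// "x-ratelimit-reset-requests": "1s" header above) into milliseconds.
++// Assumption: the controller's actual parser may accept more formats.
++function parseResetToMsSketch(value: string): number | undefined {
++  const unitMs = { ms: 1, s: 1000, m: 60_000, h: 3_600_000 } as const;
++  let totalMs = 0;
++  let matched = false;
++  for (const [, amount, unit] of value.matchAll(/(\d+(?:\.\d+)?)(ms|s|m|h)/g)) {
++    totalMs += Number(amount) * unitMs[unit as keyof typeof unitMs];
++    matched = true;
++  }
++  return matched ? Math.round(totalMs) : undefined; // e.g. "1s" -> 1000, "1m30s" -> 90000
++}
++void parseResetToMsSketch; // illustrative only
++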
++main().catch((error) => {
++ console.error("Fatal error:", error);
++ process.exit(1);
++});
+diff --git a/tmp/test/traffic-rate-limit-from-headers.ts b/tmp/test/traffic-rate-limit-from-headers.ts
+new file mode 100644
+index 00000000..d8262661
+--- /dev/null
++++ b/tmp/test/traffic-rate-limit-from-headers.ts
+@@ -0,0 +1,158 @@
++// @ts-nocheck
++/**
++ * Manual test: TrafficController dynamic rate limits from OpenAI response headers.
++ *
++ * This hits the real OpenAI model via Agent + AI SDK, and relies on the
++ * `x-ratelimit-*` response headers to seed/update the TrafficController.
++ *
++ * What to look for:
++ * - Each request prints the observed `x-ratelimit-*` headers (if present).
++ * - Agent should also log: "[Traffic] Applied rate limit from response headers".
++ * - With enough parallel requests, some requests may take longer due to controller throttling.
++ *
++ * Prereqs:
++ * - Set `OPENAI_API_KEY`
++ *
++ * Optional env:
++ * - `OPENAI_MODEL` (default: gpt-4o-mini)
++ * - `REQUESTS` (default: 10)
++ * - `MAX_CONCURRENT` (default: 50)
++ * - `TENANT_ID` (default: openai-rate-limit-headers)
++ *
++ * Run:
++ * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts
++ * - VERBOSE=1 OPENAI_API_KEY=... REQUESTS=30 pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts
++ */
++
++import { openai } from "@ai-sdk/openai";
++import { safeStringify } from "@voltagent/internal";
++import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++ console.debug = () => {};
++}
++
++const now = () => new Date().toISOString();
++
++const apiKey = process.env.OPENAI_API_KEY;
++if (!apiKey) {
++ console.error("Missing OPENAI_API_KEY. Example:");
++ console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts");
++ process.exit(1);
++}
++
++const provider = "openai";
++const modelId = process.env.OPENAI_MODEL ?? "gpt-4o-mini";
++const tenantId = process.env.TENANT_ID ?? "openai-rate-limit-headers";
++const requestCountRaw = Number(process.env.REQUESTS ?? "10");
++const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50");
++const requestCount = Number.isFinite(requestCountRaw) && requestCountRaw > 0 ? requestCountRaw : 10;
++const maxConcurrent =
++ Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? maxConcurrentRaw : 50;
++
++const key = `${provider}::${modelId}`;
++const controller = getTrafficController({ maxConcurrent });
++
++function getHeader(headers: any, name: string): string | undefined {
++ if (!headers) return undefined;
++ if (typeof headers.get === "function") {
++ const v = headers.get(name);
++ return v === null || v === undefined ? undefined : String(v);
++ }
++
++ const entries = Object.entries(headers as Record<string, unknown>);
++ const target = name.toLowerCase();
++ const match = entries.find(([k]) => String(k).toLowerCase() === target);
++ if (!match) return undefined;
++
++ const value = match[1];
++ if (Array.isArray(value)) {
++ const first = value[0];
++ return first === null || first === undefined ? undefined : String(first);
++ }
++
++ return value === null || value === undefined ? undefined : String(value);
++}
++
++async function main() {
++ console.log(
++ `\n=== OpenAI rate limit headers → TrafficController (${key}, maxConcurrent=${maxConcurrent}, requests=${requestCount}) ===`,
++ );
++ void controller;
++
++ const agent = new Agent({
++ name: "openai-rate-limit-from-headers",
++ instructions: "Reply with only the requested token.",
++ model: openai(modelId),
++ temperature: 0,
++ maxOutputTokens: 32,
++ });
++
++ console.log("\n[seed] Making one request to capture headers...");
++ const seedStartedAt = Date.now();
++ const seed = await agent.generateText("Reply with only: seed", {
++ tenantId,
++ trafficPriority: "P1",
++ });
++ const seedElapsedMs = Date.now() - seedStartedAt;
++
++ const seedHeaders = seed.response?.headers;
++ console.log(`[seed] done in ${seedElapsedMs}ms text=${seed.text}`);
++ console.log(
++ `[seed] x-ratelimit-*=${safeStringify({
++ limit: getHeader(seedHeaders, "x-ratelimit-limit-requests"),
++ remaining: getHeader(seedHeaders, "x-ratelimit-remaining-requests"),
++ reset: getHeader(seedHeaders, "x-ratelimit-reset-requests"),
++ })}`,
++ );
++
++ console.log(`\n[burst] Scheduling ${requestCount} parallel requests...`);
++ const jobs = Array.from({ length: requestCount }, (_, idx) => {
++ const label = `req-${idx + 1}`;
++ const enqueuedAt = Date.now();
++ console.log(`[${now()}] enqueue ${label}`);
++
++ return agent
++ .generateText(`Reply with only: ${label}`, { tenantId, trafficPriority: "P1" })
++ .then((result) => {
++ const elapsedMs = Date.now() - enqueuedAt;
++ const headers = result.response?.headers;
++ console.log(
++ `[${now()}] done ${label} in ${elapsedMs}ms text=${result.text} x-ratelimit-remaining=${getHeader(
++ headers,
++ "x-ratelimit-remaining-requests",
++ )}`,
++ );
++ return {
++ label,
++ elapsedMs,
++ text: result.text,
++ headers: {
++ limit: getHeader(headers, "x-ratelimit-limit-requests"),
++ remaining: getHeader(headers, "x-ratelimit-remaining-requests"),
++ reset: getHeader(headers, "x-ratelimit-reset-requests"),
++ },
++ };
++ })
++ .catch((error) => {
++ const elapsedMs = Date.now() - enqueuedAt;
++ console.log(
++ `[${now()}] failed ${label} in ${elapsedMs}ms name=${error?.name ?? "Error"} status=${error?.status ?? error?.statusCode ?? "n/a"} msg=${error?.message}`,
++ );
++ throw error;
++ });
++ });
++
++ const settled = await Promise.allSettled(jobs);
++
++ console.log(`\n[done] settled=${safeStringify(settled.map((s) => s.status))}`);
++ console.log(
++ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`,
++ );
++}
++
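++// Sketch of the pacing rule the throttling above relies on: spread the
++// remaining request budget evenly across the time left in the window.
++// Assumption: the controller also reserves in-flight slots and adds a probe
++// delay; this shows only the basic spacing idea.
++function paceDelayMsSketch(remaining: number, resetAtMs: number, now = Date.now()): number {
++  if (remaining <= 0) return Math.max(0, resetAtMs - now); // wait for the reset
++  return Math.max(0, Math.floor((resetAtMs - now) / remaining));
++}
++void paceDelayMsSketch; // illustrative only
++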
++main().catch((error) => {
++ console.error("Fatal error:", error);
++ process.exit(1);
++});
+diff --git a/tmp/test/traffic-rate-limit-openai-window-sim.ts b/tmp/test/traffic-rate-limit-openai-window-sim.ts
+new file mode 100644
+index 00000000..35232faa
+--- /dev/null
++++ b/tmp/test/traffic-rate-limit-openai-window-sim.ts
+@@ -0,0 +1,247 @@
++// @ts-nocheck
++/**
++ * Manual test (real network): Simulate OpenAI "window remaining + reset" semantics and watch
++ * TrafficController pace + probe behavior via logs.
++ *
++ * Why "simulate"?
++ * - Real OpenAI headers usually show very large remaining values, so pacing is hard to observe.
++ * - This script still hits the real OpenAI model, but it drives the controller state using
++ * synthetic `x-ratelimit-*` headers to force a small window (e.g. remaining=3, reset=30s).
++ *
++ * What this demonstrates (matches your Step 1–7):
++ * 1) We seed controller with remaining + reset window.
++ * 2) We enqueue many requests.
++ * 3) Controller subtracts `reserved` from `remaining` to avoid stampedes.
++ * 4) When `effectiveRemaining <= 1`, controller waits until `resetAt + probeDelay`.
++ * 5) When room exists, controller paces using `nextAllowedAt`.
++ * 6) When a request finishes, we release reservation (controller) and apply new headers (this script).
++ * 7) After reset, controller sends a probe even when remaining==0; probe "fetches" fresh headers and flow resumes.
++ *
++ * Prereqs:
++ * - Set `OPENAI_API_KEY`
++ *
++ * Suggested logging:
++ * - `VOLTAGENT_LOG_LEVEL=trace` (to see traffic controller internals)
++ *
++ * Run:
++ * - VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts
++ *
++ * Optional env:
++ * - OPENAI_MODEL (default: gpt-4o-mini)
++ * - WINDOW_SECONDS (default: 30)
++ * - REMAINING (default: 3)
++ * - REQUESTS (default: 10)
++ * - MAX_CONCURRENT (default: 50)
++ */
++
++import { safeStringify } from "@voltagent/internal";
++import { TrafficController } from "../../packages/core/dist/index.js";
++
++const apiKey = process.env.OPENAI_API_KEY;
++if (!apiKey) {
++ console.error("Missing OPENAI_API_KEY. Example:");
++ console.error(
++ " VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts",
++ );
++ process.exit(1);
++}
++
++const now = () => new Date().toISOString();
++
++const modelId = process.env.OPENAI_MODEL ?? "gpt-4o-mini";
++const windowSecondsRaw = Number(process.env.WINDOW_SECONDS ?? "30");
++const remainingRaw = Number(process.env.REMAINING ?? "3");
++const requestsRaw = Number(process.env.REQUESTS ?? "10");
++const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50");
++
++const windowSeconds =
++ Number.isFinite(windowSecondsRaw) && windowSecondsRaw > 0 ? windowSecondsRaw : 30;
++const initialRemaining =
++ Number.isFinite(remainingRaw) && remainingRaw > 0 ? Math.floor(remainingRaw) : 3;
++const requestCount = Number.isFinite(requestsRaw) && requestsRaw > 0 ? Math.floor(requestsRaw) : 10;
++const maxConcurrent =
++ Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? Math.floor(maxConcurrentRaw) : 50;
++
++const provider = "openai";
++const tenantId = "openai-window-sim";
++const windowMs = Math.round(windowSeconds * 1000);
++
++async function callOpenAIResponses(label: string): Promise<{
++ status: number;
++ headers: Record<string, string | undefined>;
++ textPreview: string;
++}> {
++ const url = "https://api.openai.com/v1/responses";
++ const body = safeStringify({
++ model: modelId,
++ input: `Reply with only: ${label}`,
++ max_output_tokens: 16,
++ });
++
++ const startedAt = Date.now();
++ const res = await fetch(url, {
++ method: "POST",
++ headers: {
++ authorization: `Bearer ${apiKey}`,
++ "content-type": "application/json",
++ },
++ body,
++ });
++
++ const limit = res.headers.get("x-ratelimit-limit-requests") ?? undefined;
++ const remaining = res.headers.get("x-ratelimit-remaining-requests") ?? undefined;
++ const reset = res.headers.get("x-ratelimit-reset-requests") ?? undefined;
++
++ if (!res.ok) {
++ const text = await res.text().catch(() => "");
++ throw new Error(
++ `OpenAI error status=${res.status} elapsedMs=${Date.now() - startedAt} body=${text.slice(0, 280)}`,
++ );
++ }
++
++ const data: any = await res.json();
++ const outputText =
++ data?.output?.[0]?.content?.find?.((c: any) => c?.type === "output_text")?.text ??
++ data?.output_text ??
++ data?.output?.[0]?.content?.[0]?.text ??
++ "";
++
++ return {
++ status: res.status,
++ headers: {
++ "x-ratelimit-limit-requests": limit,
++ "x-ratelimit-remaining-requests": remaining,
++ "x-ratelimit-reset-requests": reset,
++ },
++ textPreview: String(outputText).slice(0, 80),
++ };
++}
++
++async function main() {
++ console.log(
++ `\n=== OpenAI real + synthetic window rate limit (provider=${provider}, model=${modelId}) ===`,
++ );
++ console.log(
++ `[config] maxConcurrent=${maxConcurrent} windowSeconds=${windowSeconds} initialRemaining=${initialRemaining} requests=${requestCount}`,
++ );
++ console.log(
++ "[hint] Set VOLTAGENT_LOG_LEVEL=trace to see TrafficController internals (reserved/effectiveRemaining/nextAllowedAt).",
++ );
++
++ const controller = new TrafficController({ maxConcurrent });
++
++ // --- Step 1: seed "remaining + reset window" into controller ---
++ let windowResetAt = Date.now() + windowMs;
++ let remainingInWindow = initialRemaining;
++
++ const applySyntheticHeaders = (source: string) => {
++ const resetMs = Math.max(1, windowResetAt - Date.now());
++ const applied = controller.updateRateLimitFromHeaders(
++ { provider, model: modelId, tenantId },
++ {
++ "x-ratelimit-limit-requests": String(initialRemaining),
++ "x-ratelimit-remaining-requests": String(Math.max(0, remainingInWindow)),
++ "x-ratelimit-reset-requests": `${resetMs}ms`,
++ },
++ );
++ console.log(
++ `[${now()}] [synthetic] source=${source} remaining=${remainingInWindow} resetInMs=${resetMs} applied=${safeStringify(
++ applied && {
++ key: applied.key,
++ state: {
++ remaining: applied.state.remaining,
++ reserved: applied.state.reserved,
++ resetAt: applied.state.resetAt,
++ nextAllowedAt: applied.state.nextAllowedAt,
++ },
++ },
++ )}`,
++ );
++ };
++
++ applySyntheticHeaders("seed");
++
++ console.log("\n[seed] Making one real request to confirm connectivity + show real headers...");
++ const seed = await callOpenAIResponses("seed");
++ console.log(
++ `[${now()}] [seed] ok status=${seed.status} text=${seed.textPreview} realHeaders=${safeStringify(
++ seed.headers,
++ )}`,
++ );
++
++ console.log(`\n[burst] Enqueueing ${requestCount} controller-managed requests...`);
++
++ const jobs = Array.from({ length: requestCount }, (_, index) => {
++ const label = `req-${index + 1}`;
++ const enqueuedAt = Date.now();
++ console.log(`[${now()}] [enqueue] ${label}`);
++
++ return controller
++ .handleText({
++ tenantId,
++ metadata: {
++ tenantId,
++ provider,
++ model: modelId,
++ priority: "P1",
++ agentName: "openai-window-sim",
++ agentId: label,
++ },
++ execute: async () => {
++ const startedAt = Date.now();
++ console.log(`[${now()}] [execute-start] ${label}`);
++
++ const result = await callOpenAIResponses(label);
++
++ console.log(
++ `[${now()}] [execute-end] ${label} elapsedMs=${Date.now() - startedAt} realHeaders=${safeStringify(
++ result.headers,
++ )}`,
++ );
++
++ // --- Step 6: decrement remaining + apply new "headers" ---
++ const nowMs = Date.now();
++ if (nowMs >= windowResetAt) {
++ // --- Step 7: reset happened; probe request fetched "fresh" headers for the next window ---
++ console.log(
++ `[${now()}] [reset] window elapsed; starting new synthetic window (windowSeconds=${windowSeconds})`,
++ );
++ windowResetAt = nowMs + windowMs;
++ remainingInWindow = initialRemaining;
++ }
++
++ remainingInWindow = Math.max(0, remainingInWindow - 1);
++ applySyntheticHeaders("response");
++
++ return result;
++ },
++ })
++ .then((r) => {
++ const totalElapsedMs = Date.now() - enqueuedAt;
++ console.log(
++ `[${now()}] [done] ${label} totalElapsedMs=${totalElapsedMs} text=${r.textPreview}`,
++ );
++ return { label, totalElapsedMs, status: "fulfilled" as const };
++ })
++ .catch((error: any) => {
++ const totalElapsedMs = Date.now() - enqueuedAt;
++ console.log(
++ `[${now()}] [fail] ${label} totalElapsedMs=${totalElapsedMs} name=${error?.name ?? "Error"} msg=${
++ error?.message ?? String(error)
++ }`,
++ );
++ return { label, totalElapsedMs, status: "rejected" as const };
++ });
++ });
++
++ const settled = await Promise.all(jobs);
++ console.log(`\n[done] settled=${safeStringify(settled.map((s) => s.status))}`);
++ console.log(
++ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`,
++ );
++}
++
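++// Minimal admission sketch for Steps 3–5 above. Assumption: the field names
++// are illustrative, not the controller's actual internals.
++type WindowSketch = { remaining: number; reserved: number; resetAt: number; probeDelayMs: number };
++function admissionDelayMsSketch(state: WindowSketch, now = Date.now()): number {
++  const effectiveRemaining = state.remaining - state.reserved; // Step 3
++  if (effectiveRemaining <= 1) {
++    // Step 4: budget exhausted; wait for the reset, then allow one probe.
++    return Math.max(0, state.resetAt + state.probeDelayMs - now);
++  }
++  // Step 5: room exists; pace the rest of the window evenly.
++  return Math.max(0, Math.floor((state.resetAt - now) / effectiveRemaining));
++}
++void admissionDelayMsSketch; // illustrative only
++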
++main().catch((error) => {
++ console.error("Fatal error:", error);
++ process.exit(1);
++});
+diff --git a/tmp/test/traffic-rate-limit-static.ts b/tmp/test/traffic-rate-limit-static.ts
+new file mode 100644
+index 00000000..3f91d5bb
+--- /dev/null
++++ b/tmp/test/traffic-rate-limit-static.ts
+@@ -0,0 +1,149 @@
++// @ts-nocheck
++/**
++ * Manual test: TrafficController window-based rate limiting (simulated OpenAI headers).
++ *
++ * What to look for:
++ * - Requests should be paced out across the window (no steady "refill" math).
++ * - If responses arrive out-of-order, remaining headers might "increase"; controller should
++ * keep remaining monotonic within the same window.
++ *
++ * Run:
++ * - pnpm ts-node tmp/test/traffic-rate-limit-static.ts
++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-rate-limit-static.ts
++ *
++ * Optional env:
++ * - LIMIT=6 WINDOW_MS=3000 pnpm ts-node tmp/test/traffic-rate-limit-static.ts
++ */
++
++import { safeStringify } from "@voltagent/internal";
++import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++ console.debug = () => {};
++}
++
++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
++const now = () => new Date().toISOString();
++
++const provider = "sim";
++const model = "rate-limited-model";
++const key = `${provider}::${model}`;
++
++const controller = getTrafficController({ maxConcurrent: 50 });
++
++const limit = Number(process.env.LIMIT ?? 6);
++const windowMs = Number(process.env.WINDOW_MS ?? 3000);
++let windowStartAt = Date.now();
++let windowResetAt = windowStartAt + windowMs;
++let usedInWindow = 0;
++
++function extractLabel(prompt: any): string {
++ if (!Array.isArray(prompt)) {
++ return "unknown";
++ }
++
++ for (let index = prompt.length - 1; index >= 0; index -= 1) {
++ const message = prompt[index];
++ if (!message || message.role !== "user" || !Array.isArray(message.content)) {
++ continue;
++ }
++
++ const textPart = message.content.find((part: any) => part?.type === "text");
++ if (textPart?.text) {
++ return String(textPart.text);
++ }
++ }
++
++ return "unknown";
++}
++
++async function main() {
++ console.log(
++ `\n=== Window rate limit for ${key} (limit=${limit}, windowMs=${windowMs}, jobs=10) ===`,
++ );
++
++ const seeded = controller.updateRateLimitFromHeaders(
++ { provider, model },
++ {
++ "x-ratelimit-limit-requests": String(limit),
++ "x-ratelimit-remaining-requests": String(limit),
++ "x-ratelimit-reset-requests": `${windowMs}ms`,
++ },
++ );
++ console.log(`[seed] updateRateLimitFromHeaders=${safeStringify(seeded)}`);
++
++ let calls = 0;
++ let lastStartAt = 0;
++ const rateLimitedModel = {
++ specificationVersion: "v2",
++ provider,
++ modelId: model,
++ doGenerate: async (options: any) => {
++ const simulatedLatencyMs = 10 + Math.floor(Math.random() * 120);
++ const nowMs = Date.now();
++ if (nowMs >= windowResetAt) {
++ windowStartAt = nowMs;
++ windowResetAt = windowStartAt + windowMs;
++ usedInWindow = 0;
++ }
++
++ calls += 1;
++ usedInWindow += 1;
++ const startAt = Date.now();
++ const delta = lastStartAt ? startAt - lastStartAt : 0;
++ lastStartAt = startAt;
++
++ const label = extractLabel(options?.prompt);
++ console.log(
++ `[${now()}] doGenerate start call=${calls} (+${delta}ms) input=${label} latencyMs=${simulatedLatencyMs}`,
++ );
++ await sleep(simulatedLatencyMs);
++ console.log(`[${now()}] doGenerate end input=${label}`);
++
++ const remainingAfterThis = Math.max(0, limit - usedInWindow);
++ const resetMs = Math.max(1, windowResetAt - Date.now());
++ return {
++ content: [{ type: "text", text: `ok:${label}` }],
++ finishReason: "stop",
++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
++ warnings: [],
++ response: {
++ modelId: model,
++ headers: {
++ "x-ratelimit-limit-requests": String(limit),
++ "x-ratelimit-remaining-requests": String(remainingAfterThis),
++ "x-ratelimit-reset-requests": `${resetMs}ms`,
++ },
++ },
++ };
++ },
++ };
++
++ const agent = new Agent({
++ name: "traffic-rate-limit-static",
++ instructions: "echo",
++ model: rateLimitedModel,
++ temperature: 0,
++ maxOutputTokens: 32,
++ });
++
++ const jobs = Array.from({ length: 10 }, (_, idx) =>
++ agent.generateText(`req-${idx + 1}`, {
++ tenantId: "default",
++ trafficPriority: "P1",
++ }),
++ );
++
++ const settled = await Promise.allSettled(jobs);
++ console.log(
++ `\n[done] results=${safeStringify(
++ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)),
++ )}`,
++ );
++}
++
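++// Sketch of the monotonicity rule noted above: within one window, a later
++// header may only lower `remaining`, never raise it, so an out-of-order
++// response cannot re-inflate the budget. Assumption: the same-window check
++// here (resetAt proximity) is a simplification.
++type ObservedWindowSketch = { remaining: number; resetAt: number };
++function mergeRemainingSketch(
++  current: ObservedWindowSketch | undefined,
++  incoming: ObservedWindowSketch,
++): ObservedWindowSketch {
++  if (current && Math.abs(current.resetAt - incoming.resetAt) < 1000) {
++    // Same window: keep `remaining` monotonically non-increasing.
++    return { ...incoming, remaining: Math.min(current.remaining, incoming.remaining) };
++  }
++  return incoming; // a new window starts with a fresh budget
++}
++void mergeRemainingSketch; // illustrative only
++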
++main().catch((error) => {
++ console.error("Fatal error:", error);
++ process.exit(1);
++});
+diff --git a/tmp/test/traffic-retry-after.ts b/tmp/test/traffic-retry-after.ts
+new file mode 100644
+index 00000000..c0c213eb
+--- /dev/null
++++ b/tmp/test/traffic-retry-after.ts
+@@ -0,0 +1,245 @@
++// @ts-nocheck
++/**
++ * Manual test: Retry-After handling (429 retry + 200 OK header ingestion).
++ *
++ * What this exercises:
++ * - Retry-After on 429 errors increases retry delay (TrafficController retry plan).
++ * - Retry-After on successful responses throttles subsequent requests for the same provider::model.
++ *
++ * Run:
++ * - pnpm -C packages/core build
++ * - pnpm ts-node tmp/test/traffic-retry-after.ts
++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-retry-after.ts
++ */
++
++import { safeStringify } from "@voltagent/internal";
++import {
++ Agent,
++ RateLimitedUpstreamError,
++ getTrafficController,
++} from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++ console.debug = () => {};
++}
++
++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
++const now = () => new Date().toISOString();
++
++function extractLabel(prompt: any): string {
++ if (!Array.isArray(prompt)) {
++ return "unknown";
++ }
++
++ for (let index = prompt.length - 1; index >= 0; index -= 1) {
++ const message = prompt[index];
++ if (!message || message.role !== "user" || !Array.isArray(message.content)) {
++ continue;
++ }
++
++ const textPart = message.content.find((part: any) => part?.type === "text");
++ if (textPart?.text) {
++ return String(textPart.text);
++ }
++ }
++
++ return "unknown";
++}
++
++function make429RetryAfterModel(args: {
++ provider: string;
++ modelId: string;
++ retryAfterSeconds: number;
++ mode: "headers" | "typedError";
++}) {
++ const { provider, modelId, retryAfterSeconds, mode } = args;
++ let calls = 0;
++ const startedAt: number[] = [];
++
++ return {
++ specificationVersion: "v2",
++ provider,
++ modelId,
++ startedAt,
++ doGenerate: async (options: any) => {
++ calls += 1;
++ const start = Date.now();
++ startedAt.push(start);
++
++ const label = extractLabel(options?.prompt);
++ console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`);
++
++ if (calls === 1) {
++ const retryAfterValue = String(retryAfterSeconds);
++
++ if (mode === "typedError") {
++ throw new RateLimitedUpstreamError(
++ `rate limited (typed) retry-after=${retryAfterValue}s`,
++ { provider, model: modelId },
++ Math.round(retryAfterSeconds * 1000),
++ );
++ }
++
++ const err: any = new Error(`rate limited (headers) retry-after=${retryAfterValue}s`);
++ err.status = 429;
++ err.response = {
++ status: 429,
++ headers: {
++ "retry-after": retryAfterValue,
++ },
++ };
++ throw err;
++ }
++
++ return {
++ content: [{ type: "text", text: `ok:${label}` }],
++ finishReason: "stop",
++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
++ warnings: [],
++ response: { modelId, headers: {} },
++ };
++ },
++ };
++}
++
++function makeSuccessRetryAfterModel(args: {
++ provider: string;
++ modelId: string;
++ retryAfterSeconds: number;
++ latencyMs: number;
++}) {
++ const { provider, modelId, retryAfterSeconds, latencyMs } = args;
++ let calls = 0;
++ const startedAt: number[] = [];
++ const endedAt: number[] = [];
++
++ return {
++ specificationVersion: "v2",
++ provider,
++ modelId,
++ startedAt,
++ endedAt,
++ doGenerate: async (options: any) => {
++ calls += 1;
++ const start = Date.now();
++ startedAt.push(start);
++
++ const label = extractLabel(options?.prompt);
++ console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`);
++ await sleep(latencyMs);
++
++ const end = Date.now();
++ endedAt.push(end);
++ console.log(`[${now()}] [model] ${provider}::${modelId} end call=${calls} input=${label}`);
++
++ return {
++ content: [{ type: "text", text: `ok:${label}` }],
++ finishReason: "stop",
++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
++ warnings: [],
++ response: {
++ modelId,
++ headers:
++ calls === 1
++ ? {
++ "retry-after": String(retryAfterSeconds),
++ }
++ : {},
++ },
++ };
++ },
++ };
++}
++
++async function test_retryAfterOn429(mode: "headers" | "typedError") {
++ const retryAfterSeconds = 1;
++ const provider = `retry-after-429-${mode}`;
++ const modelId = "ra-429";
++ const tenantId = `ra-429-${mode}`;
++
++ const model = make429RetryAfterModel({ provider, modelId, retryAfterSeconds, mode });
++ const agent = new Agent({
++ name: `ra-429-${mode}`,
++ instructions: "echo",
++ model,
++ temperature: 0,
++ maxOutputTokens: 32,
++ });
++
++ console.log(`\n=== Test: Retry-After on 429 (${mode}) ===`);
++ const result = await agent.generateText("hello", { tenantId, trafficPriority: "P1" });
++
++ const times = model.startedAt;
++ const deltaMs = times.length >= 2 ? times[1] - times[0] : undefined;
++
++ console.log(
++ `[result] text=${result.text} calls=${times.length} startedAt=${safeStringify(times)} deltaMs=${deltaMs}`,
++ );
++
++ if (deltaMs === undefined || deltaMs < retryAfterSeconds * 1000) {
++ throw new Error(
++ `Expected retry delay >= ${retryAfterSeconds * 1000}ms, got ${deltaMs ?? "n/a"}ms`,
++ );
++ }
++}
++
++async function test_retryAfterOnSuccessResponse() {
++ const retryAfterSeconds = 0.3;
++ const provider = "retry-after-200";
++ const modelId = "ra-200";
++ const tenantId = "ra-200";
++
++ const model = makeSuccessRetryAfterModel({
++ provider,
++ modelId,
++ retryAfterSeconds,
++ latencyMs: 20,
++ });
++
++ const agent = new Agent({
++ name: "ra-200",
++ instructions: "echo",
++ model,
++ temperature: 0,
++ maxOutputTokens: 32,
++ });
++
++ console.log("\n=== Test: Retry-After on 200 response headers ===");
++ const first = agent.generateText("first", { tenantId, trafficPriority: "P1" });
++ const second = agent.generateText("second", { tenantId, trafficPriority: "P1" });
++
++ const [r1, r2] = await Promise.all([first, second]);
++
++ const end1 = model.endedAt[0];
++ const start2 = model.startedAt[1];
++ const enforcedDelayMs = start2 && end1 ? start2 - end1 : undefined;
++
++ console.log(
++ `[result] texts=${safeStringify([r1.text, r2.text])} startedAt=${safeStringify(
++ model.startedAt,
++ )} endedAt=${safeStringify(model.endedAt)} enforcedDelayMs=${enforcedDelayMs}`,
++ );
++
++ if (enforcedDelayMs === undefined || enforcedDelayMs < retryAfterSeconds * 1000) {
++ throw new Error(
++ `Expected rate-limit delay >= ${retryAfterSeconds * 1000}ms, got ${enforcedDelayMs ?? "n/a"}ms`,
++ );
++ }
++}
++
++async function main() {
++ // Create the controller early so all Agent calls share the same singleton.
++ getTrafficController({ maxConcurrent: 1 });
++
++ await test_retryAfterOn429("headers");
++ await test_retryAfterOn429("typedError");
++ await test_retryAfterOnSuccessResponse();
++
++ console.log("\n[done] All Retry-After manual checks passed.");
++}
++
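++// Sketch of reading a Retry-After value, which HTTP allows as either
++// delta-seconds ("1") or an HTTP-date. Assumption: the controller's actual
++// parser may be stricter about formats than this.
++function retryAfterToMsSketch(value: string, now = Date.now()): number | undefined {
++  const seconds = Number(value);
++  if (Number.isFinite(seconds)) return Math.max(0, Math.round(seconds * 1000));
++  const dateMs = Date.parse(value); // e.g. "Wed, 21 Oct 2015 07:28:00 GMT"
++  return Number.isNaN(dateMs) ? undefined : Math.max(0, dateMs - now);
++}
++void retryAfterToMsSketch; // illustrative only
++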
++main().catch((error) => {
++ console.error("Fatal error:", error);
++ process.exit(1);
++});
+diff --git a/tmp/test/traffic-retry-behavior.ts b/tmp/test/traffic-retry-behavior.ts
+new file mode 100644
+index 00000000..273af55a
+--- /dev/null
++++ b/tmp/test/traffic-retry-behavior.ts
+@@ -0,0 +1,169 @@
++// @ts-nocheck
++/**
++ * Manual test: TrafficController retry behavior via Agent + AI SDK path (stub model).
++ *
++ * Scenarios included:
++ * - 5xx retries (up to 3 attempts)
++ * - 429 retries (up to 3 attempts)
++ * - timeout retries (up to 2 attempts)
++ * - non-retriable 4xx does not retry
++ *
++ * Run:
++ * - pnpm ts-node tmp/test/traffic-retry-behavior.ts
++ *
++ * Notes:
++ * - Uses a stub LanguageModel; no network calls.
++ * - Watch the `[model] attempt=...` logs to confirm retries.
++ */
++
++import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++ console.debug = () => {};
++}
++
++type Scenario =
++ | "server-error"
++ | "rate-limit"
++ | "timeout"
++ | "bad-request"
++ | "forbidden"
++ // Variations to hit different retry-detection branches.
++ | "server-error-status-string"
++ | "server-error-statusCode"
++ | "server-error-response-status"
++ | "server-error-cause-status"
++ | "rate-limit-statusCode"
++ | "timeout-code-only"
++ | "timeout-name-only"
++ | "timeout-message-only"
++ // Variations that should STOP retrying (hit max attempts).
++ | "server-error-exceed-max"
++ | "timeout-exceed-max";
++
++type RetryPlan = {
++ failCountBeforeSuccess: number;
++ status?: number | string;
++ statusCode?: number | string;
++ httpStatus?: number | string;
++ responseStatus?: number | string;
++ causeStatus?: number | string;
++ code?: string;
++ name?: string;
++ message?: string;
++};
++
++const plans: Record<Scenario, RetryPlan> = {
++ "server-error": { failCountBeforeSuccess: 2, status: 500 },
++ "rate-limit": { failCountBeforeSuccess: 2, status: 429 },
++ timeout: { failCountBeforeSuccess: 1, status: 408, code: "ETIMEDOUT", message: "timeout" },
++ "bad-request": { failCountBeforeSuccess: 10, status: 400 },
++ forbidden: { failCountBeforeSuccess: 10, status: 403 },
++ "server-error-status-string": { failCountBeforeSuccess: 2, status: "500" },
++ "server-error-statusCode": { failCountBeforeSuccess: 2, statusCode: 502 },
++ "server-error-response-status": { failCountBeforeSuccess: 2, responseStatus: 503 },
++ "server-error-cause-status": { failCountBeforeSuccess: 2, causeStatus: 500 },
++ "rate-limit-statusCode": { failCountBeforeSuccess: 2, statusCode: 429 },
++ "timeout-code-only": { failCountBeforeSuccess: 1, code: "timeout" },
++ "timeout-name-only": { failCountBeforeSuccess: 1, name: "TimeoutError" },
++ "timeout-message-only": { failCountBeforeSuccess: 1, message: "this is a TIMEOUT" },
++ "server-error-exceed-max": { failCountBeforeSuccess: 10, status: 500 },
++ "timeout-exceed-max": { failCountBeforeSuccess: 10, message: "timeout" },
++};
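++
++// The plan variations above target different retry-detection branches. A
++// minimal sketch of the kind of check they exercise (an assumption — not the
++// actual TrafficController internals):
++// function isRetriable(err: any): boolean {
++//   const status = Number(
++//     err?.status ?? err?.statusCode ?? err?.response?.status ?? err?.cause?.status,
++//   );
++//   if (status === 429 || (status >= 500 && status < 600)) return true;
++//   const hint = `${err?.code ?? ""} ${err?.name ?? ""} ${err?.message ?? ""}`;
++//   return /timeout/i.test(hint);
++// }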
++
++function makeModel(modelId: string, plan: RetryPlan) {
++ let counter = 0;
++ let lastAttemptAt = 0;
++
++ return {
++ specificationVersion: "v2",
++ provider: "retry-provider",
++ modelId,
++ doGenerate: async () => {
++ counter += 1;
++ const now = Date.now();
++ const delta = lastAttemptAt ? now - lastAttemptAt : 0;
++ lastAttemptAt = now;
++
++ console.log(`[model] modelId=${modelId} attempt=${counter} (+${delta}ms)`);
++
++ if (counter <= plan.failCountBeforeSuccess) {
++ const err: any = new Error(plan.message ?? `forced failure ${counter} for ${modelId}`);
++ if (plan.status !== undefined) err.status = plan.status;
++ if (plan.statusCode !== undefined) err.statusCode = plan.statusCode;
++ if (plan.httpStatus !== undefined) err.httpStatus = plan.httpStatus;
++ if (plan.responseStatus !== undefined) err.response = { status: plan.responseStatus };
++ if (plan.causeStatus !== undefined) err.cause = { status: plan.causeStatus };
++ if (plan.code !== undefined) err.code = plan.code;
++ if (plan.name !== undefined) err.name = plan.name;
++ throw err;
++ }
++
++ return {
++ content: [{ type: "text", text: "ok" }],
++ finishReason: "stop",
++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
++ warnings: [],
++ response: { modelId, headers: {} },
++ };
++ },
++ };
++}
++
++async function runScenario(name: Scenario) {
++ const plan = plans[name];
++ const modelId = `retry-${name}`;
++ const model = makeModel(modelId, plan);
++
++ const agent = new Agent({
++ name: `RetryAgent-${name}`,
++ instructions: "echo",
++ model,
++ maxOutputTokens: 32,
++ temperature: 0,
++ });
++
++ console.log(`\n=== ${name} ===`);
++ try {
++ const result = await agent.generateText(name, { tenantId: "retry-test" });
++ console.log(`[${name}] succeeded. text=${result.text}`);
++ } catch (err: any) {
++ console.log(
++ `[${name}] failed. status=${err?.status ?? err?.statusCode ?? err?.response?.status ?? "n/a"}`,
++ );
++ }
++}
++
++async function main() {
++ // Create controller early so all Agent calls share the same singleton.
++ getTrafficController({ maxConcurrent: 1 });
++
++ const runs: Scenario[] = [
++ "server-error",
++ "rate-limit",
++ "timeout",
++ "bad-request",
++ "forbidden",
++ // Uncomment for additional coverage:
++ // "server-error-status-string",
++ // "server-error-statusCode",
++ // "server-error-response-status",
++ // "server-error-cause-status",
++ // "rate-limit-statusCode",
++ // "timeout-code-only",
++ // "timeout-name-only",
++ // "timeout-message-only",
++ // "server-error-exceed-max",
++ // "timeout-exceed-max",
++ ];
++
++ for (const name of runs) {
++ await runScenario(name);
++ }
++}
++
++main().catch((error) => {
++ console.error("Fatal error:", error);
++ process.exit(1);
++});
+diff --git a/tmp/test/traffic-tenant-usage.ts b/tmp/test/traffic-tenant-usage.ts
+new file mode 100644
+index 00000000..801d7761
+--- /dev/null
++++ b/tmp/test/traffic-tenant-usage.ts
+@@ -0,0 +1,71 @@
++// @ts-nocheck
++/**
++ * Manual test: Tenant usage aggregation (via Agent → TrafficController).
++ *
++ * What to look for:
++ * - `getTenantUsage(tenantId)` should increase after each agent call.
++ *
++ * Run:
++ * - pnpm ts-node tmp/test/traffic-tenant-usage.ts
++ */
++
++import { safeStringify } from "@voltagent/internal";
++import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++ console.debug = () => {};
++}
++
++function makeModel(modelId: string) {
++ return {
++ specificationVersion: "v2",
++ provider: "usage-provider",
++ modelId,
++ doGenerate: async () => {
++ return {
++ content: [{ type: "text", text: `ok:${modelId}` }],
++ finishReason: "stop",
++ usage: { inputTokens: 2, outputTokens: 3, totalTokens: 5 },
++ warnings: [],
++ response: { modelId, headers: {} },
++ };
++ },
++ };
++}
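++
++// Each stub call reports { inputTokens: 2, outputTokens: 3, totalTokens: 5 },
++// so after A1 + A2 the tenant-a aggregate should reflect two calls (about 10
++// total tokens) and tenant-b exactly one; the snapshot shape itself depends on
++// the controller's aggregation.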
++
++const controller = getTrafficController({ maxConcurrent: 10 });
++
++async function run(label: string, tenantId: string) {
++ const model = makeModel("tenant-usage-model");
++ const agent = new Agent({
++ name: `TenantUsageAgent-${label}`,
++ instructions: "echo",
++ model,
++ temperature: 0,
++ maxOutputTokens: 32,
++ });
++
++ console.log(`\n=== ${label} tenantId=${tenantId} ===`);
++ const result = await agent.generateText(`hello:${label}`, { tenantId });
++ console.log(`[${label}] text=${result.text}`);
++
++ const usage = controller.getTenantUsage(tenantId);
++ console.log(`[${label}] controller.getTenantUsage(${tenantId})=${safeStringify(usage)}`);
++}
++
++async function main() {
++ await run("A1", "tenant-a");
++ await run("A2", "tenant-a");
++ await run("B1", "tenant-b");
++
++ console.log("\n=== Final usage snapshot ===");
++ console.log(`tenant-a=${safeStringify(controller.getTenantUsage("tenant-a"))}`);
++ console.log(`tenant-b=${safeStringify(controller.getTenantUsage("tenant-b"))}`);
++ console.log(`default=${safeStringify(controller.getTenantUsage("default"))}`);
++}
++
++main().catch((error) => {
++ console.error("Fatal error:", error);
++ process.exit(1);
++});
+diff --git a/tmp/test/traffic-text-vs-stream.ts b/tmp/test/traffic-text-vs-stream.ts
+new file mode 100644
+index 00000000..41aa484d
+--- /dev/null
++++ b/tmp/test/traffic-text-vs-stream.ts
+@@ -0,0 +1,128 @@
++// @ts-nocheck
++/**
++ * Manual test: Text + stream traffic share the same TrafficController queue.
++ *
++ * What to look for:
++ * - Stream and text requests should respect the same maxConcurrent + priority rules.
++ *
++ * Run:
++ * - pnpm ts-node tmp/test/traffic-text-vs-stream.ts
++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-text-vs-stream.ts
++ */
++
++import { ReadableStream } from "node:stream/web";
++import { safeStringify } from "@voltagent/internal";
++import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++ console.debug = () => {};
++}
++
++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
++const now = () => new Date().toISOString();
++
++const controller = getTrafficController({ maxConcurrent: 1 });
++
++function extractLabel(prompt: any): string {
++ if (!Array.isArray(prompt)) {
++ return "unknown";
++ }
++
++ for (let index = prompt.length - 1; index >= 0; index -= 1) {
++ const message = prompt[index];
++ if (!message || message.role !== "user" || !Array.isArray(message.content)) {
++ continue;
++ }
++
++ const textPart = message.content.find((part: any) => part?.type === "text");
++ if (textPart?.text) {
++ return String(textPart.text);
++ }
++ }
++
++ return "unknown";
++}
++
++async function main() {
++ console.log("\n=== Text vs Stream (shared scheduler) ===");
++ void controller;
++
++ const provider = "sim";
++ const modelId = "shared-queue";
++
++ const model = {
++ specificationVersion: "v2",
++ provider,
++ modelId,
++ doGenerate: async (options: any) => {
++ const label = extractLabel(options?.prompt);
++ console.log(`[${now()}] doGenerate start input=${label}`);
++ await sleep(50);
++ console.log(`[${now()}] doGenerate end input=${label}`);
++ return {
++ content: [{ type: "text", text: `text:${label}` }],
++ finishReason: "stop",
++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
++ warnings: [],
++ response: { modelId, headers: {} },
++ };
++ },
++ doStream: async (options: any) => {
++ const label = extractLabel(options?.prompt);
++ console.log(`[${now()}] doStream start input=${label}`);
++
++ // Hold the controller slot for a bit so ordering is visible.
++ await sleep(400);
++
++ console.log(`[${now()}] doStream ready input=${label}`);
++ const streamId = `text-${label}`;
++ const text = `stream:${label}`;
++
++ const stream = new ReadableStream({
++ start(streamController) {
++ streamController.enqueue({ type: "stream-start", warnings: [] });
++ streamController.enqueue({ type: "text-start", id: streamId });
++ streamController.enqueue({ type: "text-delta", id: streamId, delta: text });
++ streamController.enqueue({ type: "text-end", id: streamId });
++ streamController.enqueue({
++ type: "finish",
++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
++ finishReason: "stop",
++ });
++ streamController.close();
++ },
++ });
++
++ return { stream, response: { headers: {} } };
++ },
++ };
++
++ const agent = new Agent({
++ name: "traffic-text-vs-stream",
++ instructions: "echo",
++ model,
++ temperature: 0,
++ maxOutputTokens: 32,
++ });
++
++ const streamP1 = agent.streamText("S1", { tenantId: "default", trafficPriority: "P1" });
++ const textP0 = agent.generateText("T0", { tenantId: "default", trafficPriority: "P0" });
++ const textP1 = agent.generateText("T1", { tenantId: "default", trafficPriority: "P1" });
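++  // Expected ordering with maxConcurrent=1: S1 takes the only slot first; once
++  // it releases, the queued P0 request (T0) should dispatch before the queued
++  // P1 request (T1), because text and stream calls share one priority queue.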
++
++ const [streamResult, t0, t1] = await Promise.all([streamP1, textP0, textP1]);
++ const streamText = await streamResult.text;
++
++ console.log(
++ `\n[done] results=${safeStringify({
++ streamText,
++ textP0: t0.text,
++ textP1: t1.text,
++ })}`,
++ );
++}
++
++main().catch((error) => {
++ console.error("Fatal error:", error);
++ process.exit(1);
++});
diff --git a/examples/with-client-side-tools/next-env.d.ts b/examples/with-client-side-tools/next-env.d.ts
index 1b3be0840..9edff1c7c 100644
--- a/examples/with-client-side-tools/next-env.d.ts
+++ b/examples/with-client-side-tools/next-env.d.ts
@@ -1,5 +1,6 @@
 /// <reference types="next" />
 /// <reference types="next/image-types/global" />
+import "./.next/types/routes.d.ts";
// NOTE: This file should not be edited
// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
diff --git a/examples/with-client-side-tools/tsconfig.json b/examples/with-client-side-tools/tsconfig.json
index 3697fcb9b..0fca67d34 100644
--- a/examples/with-client-side-tools/tsconfig.json
+++ b/examples/with-client-side-tools/tsconfig.json
@@ -1,6 +1,10 @@
{
"compilerOptions": {
- "lib": ["dom", "dom.iterable", "esnext"],
+ "lib": [
+ "dom",
+ "dom.iterable",
+ "esnext"
+ ],
"allowJs": true,
"skipLibCheck": true,
"strict": true,
@@ -11,7 +15,7 @@
"resolveJsonModule": true,
"isolatedModules": true,
"sourceMap": true,
- "jsx": "preserve",
+ "jsx": "react-jsx",
"incremental": true,
"plugins": [
{
@@ -19,10 +23,20 @@
}
],
"paths": {
- "@/*": ["./*"]
+ "@/*": [
+ "./*"
+ ]
},
"target": "ES2017"
},
- "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
- "exclude": ["node_modules"]
+ "include": [
+ "next-env.d.ts",
+ "**/*.ts",
+ "**/*.tsx",
+ ".next/types/**/*.ts",
+ ".next/dev/types/**/*.ts"
+ ],
+ "exclude": [
+ "node_modules"
+ ]
}
diff --git a/examples/with-netlify-functions/netlify/functions/voltagent.js b/examples/with-netlify-functions/netlify/functions/voltagent.js
new file mode 100644
index 000000000..0ec386b8f
--- /dev/null
+++ b/examples/with-netlify-functions/netlify/functions/voltagent.js
@@ -0,0 +1,4 @@
+import { createNetlifyFunctionHandler } from "@voltagent/serverless-hono";
+import { getVoltAgent } from "../../src/index";
+const voltAgent = getVoltAgent();
+export const handler = createNetlifyFunctionHandler(voltAgent);
diff --git a/examples/with-netlify-functions/src/index.js b/examples/with-netlify-functions/src/index.js
new file mode 100644
index 000000000..af385b506
--- /dev/null
+++ b/examples/with-netlify-functions/src/index.js
@@ -0,0 +1,17 @@
+import { openai } from "@ai-sdk/openai";
+import { Agent, VoltAgent } from "@voltagent/core";
+import { serverlessHono } from "@voltagent/serverless-hono";
+import { weatherTool } from "./tools";
+const agent = new Agent({
+ name: "netlify-function-agent",
+ instructions: "Help the user quickly and call tools when needed.",
+ model: openai("gpt-4o-mini"),
+ tools: [weatherTool],
+});
+const voltAgent = new VoltAgent({
+ agents: { agent },
+ serverless: serverlessHono(),
+});
+export function getVoltAgent() {
+ return voltAgent;
+}
diff --git a/examples/with-netlify-functions/src/tools/index.js b/examples/with-netlify-functions/src/tools/index.js
new file mode 100644
index 000000000..d1c5bf43b
--- /dev/null
+++ b/examples/with-netlify-functions/src/tools/index.js
@@ -0,0 +1,26 @@
+import { createTool } from "@voltagent/core";
+import z from "zod";
+export const weatherTool = createTool({
+ id: "get-weather",
+ name: "getWeather",
+ description: "Return a mock weather report for the requested location",
+ parameters: z.object({
+ location: z.string().describe("City or location to look up"),
+ }),
+ execute: async ({ location }, context) => {
+ context?.logger.info(`Fetching weather for ${location}`);
+ const mockWeatherData = {
+ location,
+ temperature: Math.floor(Math.random() * 30) + 5,
+ condition: ["Sunny", "Cloudy", "Rainy", "Snowy", "Partly Cloudy"][
+ Math.floor(Math.random() * 5)
+ ],
+ humidity: Math.floor(Math.random() * 60) + 30,
+ windSpeed: Math.floor(Math.random() * 30),
+ };
+ return {
+ weather: mockWeatherData,
+ message: `Current weather in ${location}: ${mockWeatherData.temperature}°C and ${mockWeatherData.condition.toLowerCase()} with ${mockWeatherData.humidity}% humidity and wind speed of ${mockWeatherData.windSpeed} km/h.`,
+ };
+ },
+});
diff --git a/package.json b/package.json
index 7c80f7c59..7e3ef8ba1 100644
--- a/package.json
+++ b/package.json
@@ -32,9 +32,10 @@
"publint": "^0.3.8",
"rimraf": "^5.0.5",
"syncpack": "^13.0.2",
+ "ts-node": "^10.9.2",
"tslib": "^2.3.0",
"tsup": "^8.5.0",
- "typescript": "^5.8.2",
+ "typescript": "^5.9.2",
"vite": "^7.2.7",
"vitest": "^3.2.4"
},
diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts
index 291bdf7fd..84343c041 100644
--- a/packages/core/src/agent/agent.ts
+++ b/packages/core/src/agent/agent.ts
@@ -48,6 +48,14 @@ import type { BaseRetriever } from "../retriever/retriever";
import type { Tool, Toolkit } from "../tool";
import { createTool } from "../tool";
import { ToolManager } from "../tool/manager";
+import {
+ type FallbackChainEntry,
+ type TrafficPriority,
+ type TrafficRequest,
+ type TrafficRequestMetadata,
+ getTrafficController,
+} from "../traffic/traffic-controller";
+import { findHeaders } from "../traffic/traffic-error-utils";
import { randomUUID } from "../utils/id";
import { convertModelMessagesToUIMessages } from "../utils/message-converter";
import { NodeType, createNodeId } from "../utils/node-utils";
@@ -262,8 +270,42 @@ export interface BaseGenerationOptions extends Partial {
// Context
userId?: string;
conversationId?: string;
+ tenantId?: string;
+ /**
+ * Optional key metadata for per-key rate limits.
+ */
+ apiKeyId?: string;
+ /**
+ * Optional region metadata for per-region rate limits.
+ */
+ region?: string;
+ /**
+ * Optional endpoint metadata for per-endpoint rate limits.
+ */
+ endpoint?: string;
+ /**
+ * Optional tenant tier metadata for per-tier rate limits.
+ */
+ tenantTier?: string;
context?: ContextInput;
  elicitation?: (request: unknown) => Promise<unknown>;
+ /**
+ * Optional priority override for scheduling.
+ * Defaults to agent-level priority when omitted.
+ */
+ trafficPriority?: TrafficPriority;
+ /**
+ * Optional maximum time to wait in the queue before timing out.
+ */
+ maxQueueWaitMs?: number;
+ /**
+ * Optional task classification for circuit-breaker fallback policies.
+ */
+ taskType?: string;
+ /**
+ * Optional explicit fallback policy id.
+ */
+ fallbackPolicyId?: string;
// Parent tracking
parentAgentId?: string;
@@ -303,6 +345,8 @@ export interface BaseGenerationOptions extends Partial {
// Provider-specific options
providerOptions?: ProviderOptions;
+ // Optional per-call model override (used for fallbacks)
+ model?: LanguageModel;
// Experimental output (for structured generation)
experimental_output?: ReturnType | ReturnType;
@@ -347,6 +391,7 @@ export class Agent {
readonly voice?: Voice;
readonly retriever?: BaseRetriever;
readonly supervisorConfig?: SupervisorConfig;
+ private readonly trafficPriority: TrafficPriority;
  private readonly context?: Map<string | symbol, unknown>;
private readonly logger: Logger;
@@ -372,6 +417,7 @@ export class Agent {
this.temperature = options.temperature;
this.maxOutputTokens = options.maxOutputTokens;
this.maxSteps = options.maxSteps || 5;
+ this.trafficPriority = options.trafficPriority ?? "P1";
this.stopWhen = options.stopWhen;
this.markdown = options.markdown ?? false;
this.voice = options.voice;
@@ -444,6 +490,47 @@ export class Agent {
async generateText(
input: string | UIMessage[] | BaseMessage[],
options?: GenerateTextOptions,
+  ): Promise<GenerateTextResultWithContext> {
+ const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics
+ const tenantId = this.resolveTenantId(options);
+ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => {
+ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride);
+ const metadata = this.buildTrafficMetadata(
+ mergedOptions?.model,
+ mergedOptions,
+ providerOverride,
+ ); // Compute once per queued request (including per-call model overrides)
+ return {
+ tenantId,
+ metadata,
+ maxQueueWaitMs: options?.maxQueueWaitMs,
+ estimatedTokens: this.estimateTokens(input, mergedOptions),
+ execute: () => this.executeGenerateText(input, mergedOptions, metadata), // Defer actual execution so controller can schedule it
+ extractUsage: (result: GenerateTextResultWithContext) =>
+ this.extractUsageFromResponse(result),
+ createFallbackRequest: (fallbackTarget) => {
+ if (this.isShortResponseFallback(fallbackTarget)) {
+ return this.buildShortTextFallbackRequest(
+ tenantId,
+ metadata,
+ mergedOptions,
+ fallbackTarget.text,
+ );
+ }
+ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } =
+ this.resolveFallbackTarget(fallbackTarget);
+ return buildRequest(fallbackModel, fallbackProvider);
+ },
+ };
+ };
+
+ return controller.handleText(buildRequest(options?.model));
+ }
+
+ private async executeGenerateText(
+ input: string | UIMessage[] | BaseMessage[],
+ options?: GenerateTextOptions,
+ trafficMetadata?: TrafficRequestMetadata,
  ): Promise<GenerateTextResultWithContext> {
const startTime = Date.now();
const oc = this.createOperationContext(input, options);
@@ -471,7 +558,7 @@ export class Agent {
options,
);
- const modelName = this.getModelName();
+ const modelName = this.getModelName(model);
const contextLimit = options?.contextLimit;
// Add model attributes and all options
@@ -544,10 +631,20 @@ export class Agent {
hooks,
maxSteps: userMaxSteps,
tools: userTools,
+ maxQueueWaitMs,
+ taskType,
+ fallbackPolicyId,
experimental_output,
providerOptions,
+ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries)
+ model: _model, // Exclude model so aiSDKOptions doesn't override resolved model
...aiSDKOptions
} = options || {};
+ void _model;
+ void _maxRetries;
+ void maxQueueWaitMs;
+ void taskType;
+ void fallbackPolicyId;
const llmSpan = this.createLLMSpan(oc, {
operation: "generateText",
@@ -567,6 +664,11 @@ export class Agent {
let result!: GenerateTextResult;
try {
+ methodLogger.info("[AI SDK] Calling generateText", {
+ messageCount: messages.length,
+ modelName,
+ tools: tools ? Object.keys(tools) : [],
+ });
result = await oc.traceContext.withSpan(llmSpan, () =>
generateText({
model,
@@ -575,7 +677,7 @@ export class Agent {
// Default values
temperature: this.temperature,
maxOutputTokens: this.maxOutputTokens,
- maxRetries: 3,
+ maxRetries: 0,
stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps),
// User overrides from AI SDK options
...aiSDKOptions,
@@ -588,7 +690,15 @@ export class Agent {
onStepFinish: this.createStepHandler(oc, options),
}),
);
+ methodLogger.info("[AI SDK] Received generateText result", {
+ finishReason: result.finishReason,
+ usage: result.usage ? safeStringify(result.usage) : undefined,
+ stepCount: result.steps?.length ?? 0,
+ rawResult: safeStringify(result),
+ });
+ this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger);
} catch (error) {
+ this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger);
finalizeLLMSpan(SpanStatusCode.ERROR, { message: (error as Error).message });
throw error;
}
@@ -771,6 +881,47 @@ export class Agent {
async streamText(
input: string | UIMessage[] | BaseMessage[],
options?: StreamTextOptions,
  ): Promise<StreamTextResultWithContext> {
+ const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent
+ const tenantId = this.resolveTenantId(options);
+ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => {
+ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride);
+ const metadata = this.buildTrafficMetadata(
+ mergedOptions?.model,
+ mergedOptions,
+ providerOverride,
+ ); // Compute once per queued request (including per-call model overrides)
+ return {
+ tenantId,
+ metadata,
+ maxQueueWaitMs: options?.maxQueueWaitMs,
+ estimatedTokens: this.estimateTokens(input, mergedOptions),
+ execute: () => this.executeStreamText(input, mergedOptions, metadata), // Actual streaming work happens after the controller dequeues us
+ extractUsage: (result: StreamTextResultWithContext) =>
+ this.extractUsageFromResponse(result),
+ createFallbackRequest: (fallbackTarget) => {
+ if (this.isShortResponseFallback(fallbackTarget)) {
+ return this.buildShortStreamTextFallbackRequest(
+ tenantId,
+ metadata,
+ mergedOptions,
+ fallbackTarget.text,
+ );
+ }
+ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } =
+ this.resolveFallbackTarget(fallbackTarget);
+ return buildRequest(fallbackModel, fallbackProvider);
+ },
+ };
+ };
+
+ return controller.handleStream(buildRequest(options?.model));
+ }
+
+ private async executeStreamText(
+ input: string | UIMessage[] | BaseMessage[],
+ options?: StreamTextOptions,
+ trafficMetadata?: TrafficRequestMetadata,
  ): Promise<StreamTextResultWithContext> {
const startTime = Date.now();
const oc = this.createOperationContext(input, options);
@@ -800,7 +951,7 @@ export class Agent {
options,
);
- const modelName = this.getModelName();
+ const modelName = this.getModelName(model);
const contextLimit = options?.contextLimit;
// Add model attributes to root span if TraceContext exists
@@ -868,10 +1019,20 @@ export class Agent {
maxSteps: userMaxSteps,
tools: userTools,
onFinish: userOnFinish,
+ maxQueueWaitMs,
+ taskType,
+ fallbackPolicyId,
experimental_output,
providerOptions,
+ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries)
+ model: _model, // Exclude model from aiSDKOptions to avoid overriding resolved model
...aiSDKOptions
} = options || {};
+ void _model;
+ void _maxRetries;
+ void maxQueueWaitMs;
+ void taskType;
+ void fallbackPolicyId;
const guardrailStreamingEnabled = guardrailSet.output.length > 0;
@@ -893,7 +1054,13 @@ export class Agent {
},
});
const finalizeLLMSpan = this.createLLMSpanFinalizer(llmSpan);
+ const trafficController = getTrafficController({ logger: this.logger });
+ methodLogger.info("[AI SDK] Calling streamText", {
+ messageCount: messages.length,
+ modelName,
+ tools: tools ? Object.keys(tools) : [],
+ });
const result = streamText({
model,
messages,
@@ -901,7 +1068,7 @@ export class Agent {
// Default values
temperature: this.temperature,
maxOutputTokens: this.maxOutputTokens,
- maxRetries: 3,
+ maxRetries: 0, // Retry via traffic controller to avoid provider-level storms
stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps),
// User overrides from AI SDK options
...aiSDKOptions,
@@ -937,6 +1104,8 @@ export class Agent {
modelName: this.getModelName(),
});
+ this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger);
+ trafficController.reportStreamFailure(trafficMetadata, actualError);
finalizeLLMSpan(SpanStatusCode.ERROR, { message: (actualError as Error)?.message });
// History update removed - using OpenTelemetry only
@@ -962,6 +1131,18 @@ export class Agent {
.catch(() => {});
},
onFinish: async (finalResult) => {
+ methodLogger.info("[AI SDK] streamText finished", {
+ finishReason: finalResult.finishReason,
+ usage: finalResult.totalUsage ? safeStringify(finalResult.totalUsage) : undefined,
+ stepCount: finalResult.steps?.length ?? 0,
+ rawResult: safeStringify(finalResult),
+ });
+ this.updateTrafficControllerRateLimits(
+ finalResult.response,
+ trafficMetadata,
+ methodLogger,
+ );
+ trafficController.reportStreamSuccess(trafficMetadata);
const providerUsage = finalResult.usage
? await Promise.resolve(finalResult.usage)
: undefined;
@@ -1428,6 +1609,49 @@ export class Agent {
input: string | UIMessage[] | BaseMessage[],
schema: T,
options?: GenerateObjectOptions,
  ): Promise<GenerateObjectResultWithContext<z.infer<T>>> {
+ const controller = getTrafficController({ logger: this.logger });
+ const tenantId = this.resolveTenantId(options);
+ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => {
+ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride);
+ const metadata = this.buildTrafficMetadata(
+ mergedOptions?.model,
+ mergedOptions,
+ providerOverride,
+ ); // Compute once per queued request (including per-call model overrides)
+ return {
+ tenantId,
+ metadata,
+ maxQueueWaitMs: options?.maxQueueWaitMs,
+ estimatedTokens: this.estimateTokens(input, mergedOptions),
+ execute: () => this.executeGenerateObject(input, schema, mergedOptions, metadata),
+        extractUsage: (result: GenerateObjectResultWithContext<z.infer<T>>) =>
+ this.extractUsageFromResponse(result),
+ createFallbackRequest: (fallbackTarget) => {
+ if (this.isShortResponseFallback(fallbackTarget)) {
+ return this.buildShortObjectFallbackRequest(
+ tenantId,
+ metadata,
+ schema,
+ mergedOptions,
+ fallbackTarget.text,
+ );
+ }
+ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } =
+ this.resolveFallbackTarget(fallbackTarget);
+ return buildRequest(fallbackModel, fallbackProvider);
+ },
+ };
+ };
+
+ return controller.handleText(buildRequest(options?.model));
+ }
+
+ private async executeGenerateObject(
+ input: string | UIMessage[] | BaseMessage[],
+ schema: T,
+ options?: GenerateObjectOptions,
+ trafficMetadata?: TrafficRequestMetadata,
  ): Promise<GenerateObjectResultWithContext<z.infer<T>>> {
const startTime = Date.now();
const oc = this.createOperationContext(input, options);
@@ -1452,7 +1676,7 @@ export class Agent {
options,
);
- const modelName = this.getModelName();
+ const modelName = this.getModelName(model);
const schemaName = schema.description || "unknown";
// Add model attributes and all options
@@ -1510,10 +1734,25 @@ export class Agent {
hooks,
maxSteps: userMaxSteps,
tools: userTools,
+ taskType,
+ fallbackPolicyId,
+ maxQueueWaitMs,
providerOptions,
+ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries)
+ model: _model, // Exclude model so spread does not override resolved model
...aiSDKOptions
} = options || {};
-
+ void _model;
+ void _maxRetries;
+ void taskType;
+ void fallbackPolicyId;
+ void maxQueueWaitMs;
+
+ methodLogger.info("[AI SDK] Calling generateObject", {
+ messageCount: messages.length,
+ modelName,
+ schemaName,
+ });
const result = await generateObject({
model,
messages,
@@ -1522,7 +1761,7 @@ export class Agent {
// Default values
maxOutputTokens: this.maxOutputTokens,
temperature: this.temperature,
- maxRetries: 3,
+ maxRetries: 0,
// User overrides from AI SDK options
...aiSDKOptions,
// Provider-specific options
@@ -1530,6 +1769,13 @@ export class Agent {
// VoltAgent controlled
abortSignal: oc.abortController.signal,
});
+ methodLogger.info("[AI SDK] Received generateObject result", {
+ finishReason: result.finishReason,
+ usage: result.usage ? safeStringify(result.usage) : undefined,
+ warnings: result.warnings,
+ rawResult: safeStringify(result),
+ });
+ this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger);
const usageInfo = convertUsage(result.usage);
const finalObject = await executeOutputGuardrails({
@@ -1638,6 +1884,7 @@ export class Agent {
context: oc.context,
};
} catch (error) {
+ this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger);
await this.flushPendingMessagesOnError(oc).catch(() => {});
return this.handleError(error as Error, oc, options, startTime);
} finally {
@@ -1655,6 +1902,49 @@ export class Agent {
input: string | UIMessage[] | BaseMessage[],
schema: T,
options?: StreamObjectOptions,
  ): Promise<StreamObjectResultWithContext<z.infer<T>>> {
+ const controller = getTrafficController({ logger: this.logger });
+ const tenantId = this.resolveTenantId(options);
+ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => {
+ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride);
+ const metadata = this.buildTrafficMetadata(
+ mergedOptions?.model,
+ mergedOptions,
+ providerOverride,
+ ); // Compute once per queued request (including per-call model overrides)
+ return {
+ tenantId,
+ metadata,
+ maxQueueWaitMs: options?.maxQueueWaitMs,
+ estimatedTokens: this.estimateTokens(input, mergedOptions),
+ execute: () => this.executeStreamObject(input, schema, mergedOptions, metadata),
+        extractUsage: (result: StreamObjectResultWithContext<z.infer<T>>) =>
+ this.extractUsageFromResponse(result),
+ createFallbackRequest: (fallbackTarget) => {
+ if (this.isShortResponseFallback(fallbackTarget)) {
+ return this.buildShortStreamObjectFallbackRequest(
+ tenantId,
+ metadata,
+ schema,
+ mergedOptions,
+ fallbackTarget.text,
+ );
+ }
+ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } =
+ this.resolveFallbackTarget(fallbackTarget);
+ return buildRequest(fallbackModel, fallbackProvider);
+ },
+ };
+ };
+
+ return controller.handleStream(buildRequest(options?.model));
+ }
+
+ private async executeStreamObject(
+ input: string | UIMessage[] | BaseMessage[],
+ schema: T,
+ options?: StreamObjectOptions,
+ trafficMetadata?: TrafficRequestMetadata,
  ): Promise<StreamObjectResultWithContext<z.infer<T>>> {
const startTime = Date.now();
const oc = this.createOperationContext(input, options);
@@ -1680,7 +1970,7 @@ export class Agent {
options,
);
- const modelName = this.getModelName();
+ const modelName = this.getModelName(model);
const schemaName = schema.description || "unknown";
// Add model attributes and all options
@@ -1739,14 +2029,30 @@ export class Agent {
maxSteps: userMaxSteps,
tools: userTools,
onFinish: userOnFinish,
+ taskType,
+ fallbackPolicyId,
+ maxQueueWaitMs,
providerOptions,
+ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries)
+ model: _model, // Exclude model so aiSDKOptions cannot override resolved model
...aiSDKOptions
} = options || {};
+ void _model;
+ void _maxRetries;
+ void taskType;
+ void fallbackPolicyId;
+ void maxQueueWaitMs;
    let guardrailObjectPromise!: Promise<z.infer<T>>;
    let resolveGuardrailObject: ((value: z.infer<T>) => void) | undefined;
let rejectGuardrailObject: ((reason: unknown) => void) | undefined;
+ const trafficController = getTrafficController({ logger: this.logger });
+ methodLogger.info("[AI SDK] Calling streamObject", {
+ messageCount: messages.length,
+ modelName,
+ schemaName,
+ });
const result = streamObject({
model,
messages,
@@ -1755,7 +2061,7 @@ export class Agent {
// Default values
maxOutputTokens: this.maxOutputTokens,
temperature: this.temperature,
- maxRetries: 3,
+ maxRetries: 0,
// User overrides from AI SDK options
...aiSDKOptions,
// Provider-specific options
@@ -1771,9 +2077,11 @@ export class Agent {
methodLogger.error("Stream object error occurred", {
error: actualError,
agentName: this.name,
- modelName: this.getModelName(),
+ modelName: this.getModelName(model),
schemaName: schemaName,
});
+ this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger);
+ trafficController.reportStreamFailure(trafficMetadata, actualError);
// History update removed - using OpenTelemetry only
@@ -1800,6 +2108,17 @@ export class Agent {
},
onFinish: async (finalResult: any) => {
try {
+ methodLogger.info("[AI SDK] streamObject finished", {
+ finishReason: finalResult.finishReason,
+ usage: finalResult.usage ? safeStringify(finalResult.usage) : undefined,
+ rawResult: safeStringify(finalResult),
+ });
+ this.updateTrafficControllerRateLimits(
+ finalResult.response,
+ trafficMetadata,
+ methodLogger,
+ );
+ trafficController.reportStreamSuccess(trafficMetadata);
const usageInfo = convertUsage(finalResult.usage as any);
let finalObject = finalResult.object as z.infer;
if (guardrailSet.output.length > 0) {
@@ -2021,8 +2340,9 @@ export class Agent {
// Calculate maxSteps (use provided option or calculate based on subagents)
const maxSteps = options?.maxSteps ?? this.calculateMaxSteps();
- // Resolve dynamic values
- const model = await this.resolveValue(this.model, oc);
+ // Resolve dynamic values (allow per-call model override for fallbacks)
+ const selectedModel = options?.model ?? this.model;
+ const model = await this.resolveValue(selectedModel, oc);
const dynamicToolList = (await this.resolveValue(this.dynamicTools, oc)) || [];
// Merge agent tools with option tools
@@ -2073,6 +2393,12 @@ export class Agent {
): OperationContext {
const operationId = randomUUID();
const startTimeDate = new Date();
+ const priority = this.resolveTrafficPriority(options);
+ const tenantId = this.resolveTenantId(options);
+ const apiKeyId = options?.apiKeyId ?? options?.parentOperationContext?.apiKeyId;
+ const region = options?.region ?? options?.parentOperationContext?.region;
+ const endpoint = options?.endpoint ?? options?.parentOperationContext?.endpoint;
+ const tenantTier = options?.tenantTier ?? options?.parentOperationContext?.tenantTier;
// Prefer reusing an existing context instance to preserve reference across calls/subagents
const runtimeContext = toContextMap(options?.context);
@@ -2123,6 +2449,7 @@ export class Agent {
operationId,
userId: options?.userId,
conversationId: options?.conversationId,
+ tenantId,
executionId: operationId,
});
@@ -2137,6 +2464,9 @@ export class Agent {
parentAgentId: options?.parentAgentId,
input,
});
+ if (tenantId) {
+ traceContext.getRootSpan().setAttribute("tenant.id", tenantId);
+ }
traceContext.getRootSpan().setAttribute("voltagent.operation_id", operationId);
// Use parent's AbortController if available, otherwise create new one
@@ -2174,8 +2504,14 @@ export class Agent {
logger,
conversationSteps: options?.parentOperationContext?.conversationSteps || [],
abortController,
+ priority,
userId: options?.userId,
conversationId: options?.conversationId,
+ tenantId,
+ apiKeyId,
+ region,
+ endpoint,
+ tenantTier,
parentAgentId: options?.parentAgentId,
traceContext,
startTime: startTimeDate,
@@ -3170,6 +3506,20 @@ export class Agent {
return value;
}
+ private mergeOptionsWithModel(
+ options: BaseGenerationOptions | undefined,
+ modelOverride?: LanguageModel,
+ ): BaseGenerationOptions | undefined {
+ if (!options && modelOverride === undefined) {
+ return undefined;
+ }
+
+ return {
+ ...(options ?? {}),
+ ...(modelOverride !== undefined ? { model: modelOverride } : {}),
+ };
+ }
+
/**
* Prepare tools with execution context
*/
@@ -3822,17 +4172,622 @@ export class Agent {
return this.subAgentManager.calculateMaxSteps(this.maxSteps);
}
+ private resolveTrafficPriority(options?: BaseGenerationOptions): TrafficPriority {
+ const normalize = (value?: TrafficPriority): TrafficPriority | undefined => {
+ if (value === "P0" || value === "P1" || value === "P2") {
+ return value;
+ }
+ return undefined;
+ };
+
+ const parentPriority = normalize(options?.parentOperationContext?.priority);
+ const localPriority = normalize(options?.trafficPriority) ?? this.trafficPriority ?? "P1";
+
+ if (parentPriority) {
+ return this.pickHigherPriority(parentPriority, localPriority);
+ }
+
+ return localPriority;
+ }
+
+ private resolveTenantId(options?: BaseGenerationOptions): string {
+ const parentTenant = options?.parentOperationContext?.tenantId;
+ if (parentTenant) {
+ return parentTenant;
+ }
+
+ if (options?.tenantId) {
+ return options.tenantId;
+ }
+
+ return "default";
+ }
+
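+  // Lower rank value = more urgent. Example (illustrative):
+  // pickHigherPriority("P2", "P0") returns "P0", so a nested call inherits its
+  // parent's urgency whenever the parent is the more urgent of the two.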
+ private pickHigherPriority(a: TrafficPriority, b: TrafficPriority): TrafficPriority {
+    const rank: Record<TrafficPriority, number> = { P0: 0, P1: 1, P2: 2 };
+ return rank[a] <= rank[b] ? a : b;
+ }
+
+ private buildTrafficMetadata(
+    modelOverride?: LanguageModel | DynamicValue<LanguageModel>,
+ options?: BaseGenerationOptions,
+ providerOverride?: string,
+ ): TrafficRequestMetadata {
+ const provider =
+ providerOverride ??
+ this.resolveProvider(modelOverride) ??
+ this.resolveProvider(this.model) ??
+ undefined;
+ const priority = this.resolveTrafficPriority(options);
+ const apiKeyId = options?.apiKeyId ?? options?.parentOperationContext?.apiKeyId;
+ const region = options?.region ?? options?.parentOperationContext?.region;
+ const endpoint = options?.endpoint ?? options?.parentOperationContext?.endpoint;
+ const tenantTier = options?.tenantTier ?? options?.parentOperationContext?.tenantTier;
+
+ return {
+ agentId: this.id, // Identify which agent issued the request
+ agentName: this.name, // Human-readable label for logs/metrics
+ model: this.getModelName(modelOverride), // Used for future capacity policies
+ provider, // Allows per-provider throttling later
+ priority,
+ tenantId: this.resolveTenantId(options),
+ apiKeyId,
+ region,
+ endpoint,
+ tenantTier,
+ taskType: options?.taskType,
+ fallbackPolicyId: options?.fallbackPolicyId,
+ };
+ }
+
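+  // Rough sizing heuristic: ~4 characters per token for the prompt plus the
+  // full configured maxOutputTokens as worst-case output. Example
+  // (illustrative): a 400-character prompt with maxOutputTokens = 256 reserves
+  // ceil(400 / 4) + 256 = 356 tokens.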
+ private estimateTokens(
+ input: string | UIMessage[] | BaseMessage[],
+ options?: BaseGenerationOptions,
+ ): number | undefined {
+ let text = "";
+ if (typeof input === "string") {
+ text = input;
+ } else if (Array.isArray(input)) {
+ text = input
+ .map((message) => {
+ if (typeof message === "string") return message;
+ if (message && typeof message === "object") {
+ const content = (message as { content?: unknown }).content;
+ if (typeof content === "string") return content;
+ if (content !== undefined) return safeStringify(content);
+ return safeStringify(message);
+ }
+ return String(message ?? "");
+ })
+ .join(" ");
+ } else if (input) {
+ text = safeStringify(input);
+ }
+
+ const inputTokens = text ? Math.ceil(text.length / 4) : 0;
+ const outputTokensRaw =
+ typeof options?.maxOutputTokens === "number" ? options.maxOutputTokens : this.maxOutputTokens;
+ const outputTokens =
+ typeof outputTokensRaw === "number" && Number.isFinite(outputTokensRaw)
+ ? Math.max(0, Math.floor(outputTokensRaw))
+ : 0;
+ const total = inputTokens + outputTokens;
+ return total > 0 ? total : undefined;
+ }
+
+ private resolveFallbackTarget(target: FallbackChainEntry): {
+ modelOverride?: LanguageModel;
+ providerOverride?: string;
+ } {
+ if (typeof target === "string") {
+ return { modelOverride: target };
+ }
+ return {
+ modelOverride: target.model,
+ providerOverride: target.provider,
+ };
+ }
+
+ private isShortResponseFallback(
+ target: FallbackChainEntry,
+ ): target is { kind: "short-response"; text: string } {
+ return (
+ typeof target === "object" &&
+ target !== null &&
+ "kind" in target &&
+ (target as { kind?: string }).kind === "short-response"
+ );
+ }
+
+ private buildShortResponseMetadata(
+ baseMetadata: TrafficRequestMetadata | undefined,
+ ): TrafficRequestMetadata {
+ const metadata = baseMetadata ?? this.buildTrafficMetadata();
+ return {
+ ...metadata,
+ provider: "short-response",
+ model: "short-response",
+ };
+ }
+
+ private createZeroUsage(): LanguageModelUsage {
+ return { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
+ }
+
+  private createShortTextStream(text: string): AsyncIterableStream<string> {
+ return createAsyncIterableReadable((controller) => {
+ controller.enqueue(text);
+ controller.close();
+ });
+ }
+
+  private createShortFullStream(text: string): AsyncIterableStream<VoltAgentTextStreamPart> {
+ const usage = this.createZeroUsage();
+ const id = `short-response-${randomUUID()}`;
+ return createAsyncIterableReadable((controller) => {
+ controller.enqueue({
+ type: "text-delta",
+ id,
+ delta: text,
+ text,
+ } as VoltAgentTextStreamPart);
+ controller.enqueue({
+ type: "finish",
+ finishReason: "stop",
+ usage,
+ totalUsage: usage,
+ } as VoltAgentTextStreamPart);
+ controller.close();
+ });
+ }
+
+ private createShortTextResult(
+ text: string,
+ options?: GenerateTextOptions,
+ ): GenerateTextResultWithContext {
+ const usage = this.createZeroUsage();
+ const context = toContextMap(options?.context) ?? new Map();
+    const createTextStream = (): AsyncIterableStream<string> => this.createShortTextStream(text);
+
+ return {
+ text,
+ content: [],
+ reasoning: [],
+ reasoningText: "",
+ files: [],
+ sources: [],
+ toolCalls: [],
+ staticToolCalls: [],
+ dynamicToolCalls: [],
+ toolResults: [],
+ staticToolResults: [],
+ dynamicToolResults: [],
+ usage,
+ totalUsage: usage,
+ warnings: [],
+ finishReason: "stop",
+ steps: [],
+ experimental_output: undefined,
+ response: {
+ id: "short-response",
+ modelId: "short-response",
+ timestamp: new Date(),
+ messages: [],
+ },
+ context,
+ request: {
+ body: {},
+ },
+ providerMetadata: undefined,
+ experimental_providerMetadata: undefined,
+ pipeTextStreamToResponse: (response, init) => {
+ pipeTextStreamToResponse({
+ response,
+ textStream: createTextStream(),
+ ...(init ?? {}),
+ });
+ },
+ toTextStreamResponse: (init) => {
+ return createTextStreamResponse({
+ textStream: createTextStream(),
+ ...(init ?? {}),
+ });
+ },
+ toDataStream: () => createTextStream(),
+ toDataStreamResponse: (init) => {
+ return createTextStreamResponse({
+ textStream: createTextStream(),
+ ...(init ?? {}),
+ });
+ },
+ pipeDataStreamToResponse: (response, init) => {
+ pipeTextStreamToResponse({
+ response,
+ textStream: createTextStream(),
+ ...(init ?? {}),
+ });
+ },
+ } as GenerateTextResultWithContext;
+ }
+
+ private createShortStreamTextResult(
+ text: string,
+ options?: StreamTextOptions,
+ ): StreamTextResultWithContext {
+ const usage = this.createZeroUsage();
+ const context = toContextMap(options?.context) ?? new Map();
+    const createTextStream = (): AsyncIterableStream<string> => this.createShortTextStream(text);
+    const createFullStream = (): AsyncIterableStream<VoltAgentTextStreamPart> =>
+ this.createShortFullStream(text);
+
+ const toUIMessageStream = (_options?: unknown) =>
+ createUIMessageStream({
+ execute: async ({ writer }) => {
+ writer.write({ type: "text", text } as any);
+ },
+ onError: (error) => String(error),
+ });
+
+ const toUIMessageStreamResponse = (options?: ResponseInit) => {
+ const stream = toUIMessageStream(options);
+ const responseInit = options ? { ...options } : {};
+ return createUIMessageStreamResponse({
+ stream,
+ ...responseInit,
+ });
+ };
+
+    // Distinct local name so the inner call resolves to the ai-sdk
+    // pipeUIMessageStreamToResponse import instead of recursing into this helper.
+    const pipeShortUIMessageStreamToResponse = (response: any, init?: ResponseInit) => {
+      const stream = toUIMessageStream(init);
+      const initOptions = init ? { ...init } : {};
+      pipeUIMessageStreamToResponse({
+        response,
+        stream,
+        ...initOptions,
+      });
+    };
+
+ return {
+ text: Promise.resolve(text),
+ get textStream() {
+ return createTextStream();
+ },
+ get fullStream() {
+ return createFullStream();
+ },
+ usage: Promise.resolve(usage),
+ finishReason: Promise.resolve("stop"),
+ experimental_partialOutputStream: undefined,
+ toUIMessageStream: toUIMessageStream as StreamTextResultWithContext["toUIMessageStream"],
+ toUIMessageStreamResponse:
+ toUIMessageStreamResponse as StreamTextResultWithContext["toUIMessageStreamResponse"],
+ pipeUIMessageStreamToResponse:
+        pipeShortUIMessageStreamToResponse as StreamTextResultWithContext["pipeUIMessageStreamToResponse"],
+ pipeTextStreamToResponse: (response, init) => {
+ pipeTextStreamToResponse({
+ response,
+ textStream: createTextStream(),
+ ...(init ?? {}),
+ });
+ },
+ toTextStreamResponse: (init) => {
+ return createTextStreamResponse({
+ textStream: createTextStream(),
+ ...(init ?? {}),
+ });
+ },
+ context,
+ };
+ }
+
+  private resolveShortResponseObject<T extends z.ZodType>(schema: T, text: string): z.infer<T> {
+ const candidates: unknown[] = [];
+ if (text.length > 0) {
+ try {
+ candidates.push(JSON.parse(text));
+ } catch {}
+ }
+ candidates.push(text);
+ candidates.push({ text });
+ for (const candidate of candidates) {
+ const parsed = schema.safeParse(candidate);
+ if (parsed.success) {
+ return parsed.data;
+ }
+ }
+    return (candidates[0] ?? text) as z.infer<T>;
+ }
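+  // Example (illustrative): for schema z.object({ text: z.string() }) and the
+  // fallback text "service busy", JSON.parse throws and the raw string fails
+  // the schema, so the { text } candidate is the one returned.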
+
+  private createShortObjectResult<T extends z.ZodType>(
+    schema: T,
+    text: string,
+    options?: GenerateObjectOptions,
+  ): GenerateObjectResultWithContext<z.infer<T>> {
+ const object = this.resolveShortResponseObject(schema, text);
+ const usage = this.createZeroUsage();
+ const context = toContextMap(options?.context) ?? new Map();
+
+ return {
+ object,
+ usage,
+ warnings: [],
+ finishReason: "stop",
+ response: {
+ id: "short-response",
+ modelId: "short-response",
+ timestamp: new Date(),
+ messages: [],
+ },
+ context,
+ request: {
+ body: {},
+ },
+ reasoning: "",
+ providerMetadata: undefined,
+ toJsonResponse: (init?: ResponseInit) => {
+ const responseInit = init ? { ...init } : {};
+ const headers = {
+ "content-type": "application/json",
+ ...(responseInit.headers ?? {}),
+ };
+ return new Response(safeStringify(object), {
+ ...responseInit,
+ headers,
+ });
+ },
+    } as GenerateObjectResultWithContext<z.infer<T>>;
+ }
+
+  private createShortStreamObjectResult<T extends z.ZodType>(
+    schema: T,
+    text: string,
+    options?: StreamObjectOptions,
+  ): StreamObjectResultWithContext<z.infer<T>> {
+ const object = this.resolveShortResponseObject(schema, text);
+ const usage = this.createZeroUsage();
+ const context = toContextMap(options?.context) ?? new Map();
+ const textPayload = safeStringify(object);
+    const createTextStream = (): AsyncIterableStream<string> =>
+ this.createShortTextStream(textPayload);
+
+    const partialObjectStream = new ReadableStream<Partial<z.infer<T>>>({
+ start(controller) {
+ controller.enqueue(object);
+ controller.close();
+ },
+ });
+
+ return {
+ object: Promise.resolve(object),
+ partialObjectStream,
+ textStream: createTextStream(),
+ warnings: Promise.resolve(undefined),
+ usage: Promise.resolve(usage),
+ finishReason: Promise.resolve("stop"),
+ pipeTextStreamToResponse: (response, init) => {
+ pipeTextStreamToResponse({
+ response,
+ textStream: createTextStream(),
+ ...(init ?? {}),
+ });
+ },
+ toTextStreamResponse: (init) => {
+ return createTextStreamResponse({
+ textStream: createTextStream(),
+ ...(init ?? {}),
+ });
+ },
+ context,
+ };
+ }
+
+ private buildShortTextFallbackRequest(
+ tenantId: string,
+ metadata: TrafficRequestMetadata | undefined,
+ options: GenerateTextOptions | undefined,
+ text: string,
+  ): TrafficRequest<GenerateTextResultWithContext> {
+ const shortMetadata = this.buildShortResponseMetadata(metadata);
+ return {
+ tenantId,
+ metadata: shortMetadata,
+ maxQueueWaitMs: options?.maxQueueWaitMs,
+ estimatedTokens: 0,
+ execute: async () => this.createShortTextResult(text, options),
+ extractUsage: (result: GenerateTextResultWithContext) =>
+ this.extractUsageFromResponse(result),
+ };
+ }
+
+ private buildShortStreamTextFallbackRequest(
+ tenantId: string,
+ metadata: TrafficRequestMetadata | undefined,
+ options: StreamTextOptions | undefined,
+ text: string,
+  ): TrafficRequest<StreamTextResultWithContext> {
+ const shortMetadata = this.buildShortResponseMetadata(metadata);
+ return {
+ tenantId,
+ metadata: shortMetadata,
+ maxQueueWaitMs: options?.maxQueueWaitMs,
+ estimatedTokens: 0,
+ execute: async () => this.createShortStreamTextResult(text, options),
+ extractUsage: (result: StreamTextResultWithContext) => this.extractUsageFromResponse(result),
+ };
+ }
+
+  private buildShortObjectFallbackRequest<T extends z.ZodType>(
+    tenantId: string,
+    metadata: TrafficRequestMetadata | undefined,
+    schema: T,
+    options: GenerateObjectOptions | undefined,
+    text: string,
+  ): TrafficRequest<GenerateObjectResultWithContext<z.infer<T>>> {
+ const shortMetadata = this.buildShortResponseMetadata(metadata);
+ return {
+ tenantId,
+ metadata: shortMetadata,
+ maxQueueWaitMs: options?.maxQueueWaitMs,
+ estimatedTokens: 0,
+ execute: async () => this.createShortObjectResult(schema, text, options),
+      extractUsage: (result: GenerateObjectResultWithContext<z.infer<T>>) =>
+ this.extractUsageFromResponse(result),
+ };
+ }
+
+  private buildShortStreamObjectFallbackRequest<T extends z.ZodType>(
+    tenantId: string,
+    metadata: TrafficRequestMetadata | undefined,
+    schema: T,
+    options: StreamObjectOptions | undefined,
+    text: string,
+  ): TrafficRequest<StreamObjectResultWithContext<z.infer<T>>> {
+ const shortMetadata = this.buildShortResponseMetadata(metadata);
+ return {
+ tenantId,
+ metadata: shortMetadata,
+ maxQueueWaitMs: options?.maxQueueWaitMs,
+ estimatedTokens: 0,
+ execute: async () => this.createShortStreamObjectResult(schema, text, options),
+      extractUsage: (result: StreamObjectResultWithContext<z.infer<T>>) =>
+ this.extractUsageFromResponse(result),
+ };
+ }
+
+ private updateTrafficControllerRateLimits(
+ response: unknown,
+ metadata: TrafficRequestMetadata | undefined,
+ logger?: Logger,
+ ): void {
+ const headerCandidates = findHeaders(response);
+ if (headerCandidates.length === 0) {
+ logger?.debug?.("[Traffic] No headers found for rate limit update");
+ return;
+ }
+
+ const controller = getTrafficController();
+ const effectiveMetadata = metadata ?? this.buildTrafficMetadata();
+    let updateResult: ReturnType<typeof controller.updateRateLimitFromHeaders> | undefined;
+ for (const headers of headerCandidates) {
+ updateResult = controller.updateRateLimitFromHeaders(effectiveMetadata, headers);
+ if (updateResult) break;
+ }
+
+ if (!updateResult) {
+ logger?.debug?.("[Traffic] No rate limit headers applied from response");
+ return;
+ }
+
+ const now = Date.now();
+ const effectiveRemaining = Math.max(
+ 0,
+ updateResult.state.remaining - updateResult.state.reserved,
+ );
+ const resetInMs = Math.max(0, updateResult.state.resetAt - now);
+ const nextAllowedInMs = Math.max(0, updateResult.state.nextAllowedAt - now);
+ logger?.info?.("[Traffic] Applied rate limit from response headers", {
+ rateLimitKey: updateResult.key,
+ limit: updateResult.state.limit,
+ remaining: updateResult.state.remaining,
+ reserved: updateResult.state.reserved,
+ effectiveRemaining,
+ resetAt: updateResult.state.resetAt,
+ resetInMs,
+ nextAllowedAt: updateResult.state.nextAllowedAt,
+ nextAllowedInMs,
+ headers: {
+ limitRequests: updateResult.headerSnapshot.limitRequests,
+ remainingRequests: updateResult.headerSnapshot.remainingRequests,
+ resetRequestsMs: updateResult.headerSnapshot.resetRequestsMs,
+ },
+ });
+ }
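+  // The headerSnapshot fields above appear to correspond to OpenAI-style
+  // rate-limit headers (an assumption about the parser):
+  // x-ratelimit-limit-requests, x-ratelimit-remaining-requests, and
+  // x-ratelimit-reset-requests normalized to a millisecond reset window.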
+
+ private extractUsageFromResponse(
+ result:
+ | {
+          usage?: LanguageModelUsage | Promise<LanguageModelUsage>;
+          totalUsage?: LanguageModelUsage | Promise<LanguageModelUsage>;
+        }
+      | undefined,
+  ): Promise<LanguageModelUsage | undefined> | LanguageModelUsage | undefined {
+ if (!result) {
+ return undefined;
+ }
+
+ const usageCandidate =
+      (result as { totalUsage?: LanguageModelUsage | Promise<LanguageModelUsage> })
+        ?.totalUsage ??
+      (result as { usage?: LanguageModelUsage | Promise<LanguageModelUsage> })?.usage;
+
+ if (!usageCandidate) {
+ return undefined;
+ }
+
+ const normalizeUsage = (
+ usage: LanguageModelUsage | undefined,
+ ): LanguageModelUsage | undefined => {
+ if (!usage) return undefined;
+ const input = Number.isFinite(usage.inputTokens) ? (usage.inputTokens as number) : undefined;
+ const output = Number.isFinite(usage.outputTokens)
+ ? (usage.outputTokens as number)
+ : undefined;
+ const total = Number.isFinite(usage.totalTokens) ? (usage.totalTokens as number) : undefined;
+
+ if (total === undefined && input === undefined && output === undefined) {
+ return undefined;
+ }
+
+ const safeInput = input ?? 0;
+ const safeOutput = output ?? 0;
+ const safeTotal = total ?? safeInput + safeOutput;
+
+ return {
+ ...usage,
+ inputTokens: safeInput,
+ outputTokens: safeOutput,
+ totalTokens: safeTotal,
+ };
+ };
+
+ if (
+      typeof (usageCandidate as PromiseLike<LanguageModelUsage>).then === "function"
+    ) {
+      return (usageCandidate as Promise<LanguageModelUsage>)
+ .then((usage) => normalizeUsage(usage))
+ .catch(() => undefined);
+ }
+
+ return normalizeUsage(usageCandidate as LanguageModelUsage);
+ }
+
+ private resolveProvider(
+    model: LanguageModel | DynamicValue<LanguageModel> | undefined,
+ ): string | undefined {
+ if (
+ model &&
+ typeof model === "object" &&
+ "provider" in model &&
+ typeof (model as any).provider === "string"
+ ) {
+ return (model as any).provider;
+ }
+
+ return undefined;
+ }
+
/**
* Get the model name
*/
- public getModelName(): string {
- if (typeof this.model === "function") {
+  public getModelName(modelOverride?: LanguageModel | DynamicValue<LanguageModel>): string {
+ const selectedModel = modelOverride ?? this.model;
+ if (typeof selectedModel === "function") {
return "dynamic";
}
- if (typeof this.model === "string") {
- return this.model;
+ if (typeof selectedModel === "string") {
+ return selectedModel;
}
- return this.model.modelId || "unknown";
+ return selectedModel.modelId || "unknown";
}
/**
diff --git a/packages/core/src/agent/eval.ts b/packages/core/src/agent/eval.ts
index 9e4fe9f2e..de7125058 100644
--- a/packages/core/src/agent/eval.ts
+++ b/packages/core/src/agent/eval.ts
@@ -711,6 +711,7 @@ function buildEvalPayload(
rawOutput: output,
userId: oc.userId,
conversationId: oc.conversationId,
+ tenantId: oc.tenantId,
traceId: spanContext.traceId,
spanId: spanContext.spanId,
metadata,
diff --git a/packages/core/src/agent/types.ts b/packages/core/src/agent/types.ts
index dd5fb29d2..c70bd478e 100644
--- a/packages/core/src/agent/types.ts
+++ b/packages/core/src/agent/types.ts
@@ -29,6 +29,7 @@ import type { Logger } from "@voltagent/internal";
import type { LocalScorerDefinition, SamplingPolicy } from "../eval/runtime";
import type { MemoryOptions, MemoryStorageMetadata, WorkingMemorySummary } from "../memory/types";
import type { VoltAgentObservability } from "../observability";
+import type { TrafficPriority } from "../traffic/traffic-controller";
import type {
DynamicValue,
DynamicValueOptions,
@@ -456,6 +457,11 @@ export type AgentOptions = {
temperature?: number;
maxOutputTokens?: number;
maxSteps?: number;
+ /**
+ * Default scheduling priority for this agent's LLM calls.
+ * Defaults to P1 when unspecified.
+ */
+ trafficPriority?: TrafficPriority;
/**
* Default stop condition for step execution (ai-sdk `stopWhen`).
* Per-call `stopWhen` in method options overrides this.
@@ -493,6 +499,7 @@ export interface AgentEvalPayload {
rawOutput?: unknown;
userId?: string;
conversationId?: string;
+ tenantId?: string;
traceId: string;
spanId: string;
  metadata?: Record<string, unknown>;
@@ -890,6 +897,21 @@ export type OperationContext = {
/** Optional conversation identifier associated with this operation */
conversationId?: string;
+ /** Optional tenant identifier propagated across nested operations */
+ tenantId?: string;
+
+ /** Optional key identifier for per-key traffic limits */
+ apiKeyId?: string;
+
+ /** Optional region identifier for per-region traffic limits */
+ region?: string;
+
+ /** Optional endpoint identifier for per-endpoint traffic limits */
+ endpoint?: string;
+
+ /** Optional tenant tier identifier for per-tier traffic limits */
+ tenantTier?: string;
+
/** User-managed context map for this operation */
readonly context: Map<string | symbol, unknown>;
@@ -914,6 +936,9 @@ export type OperationContext = {
/** Conversation steps for building full message history including tool calls/results */
conversationSteps?: StepWithContent[];
+ /** Scheduling priority propagated from parent calls */
+ priority?: TrafficPriority;
+
/** AbortController for cancelling the operation and accessing the signal */
abortController: AbortController;
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 8753f0391..9dee43331 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -21,6 +21,30 @@ export type {
WorkflowTimelineEvent,
RegisteredWorkflow,
} from "./workflow";
+export {
+ // Surface traffic controller so downstream consumers can route agent calls through the shared scheduler
+ TrafficController,
+ CircuitBreakerOpenError,
+ QueueWaitTimeoutError,
+ RateLimitedUpstreamError,
+ getTrafficController,
+ type FallbackChainEntry,
+ type FallbackPolicy,
+ type FallbackPolicyConfig,
+ type FallbackPolicyMode,
+ type FallbackTarget,
+ type RateLimitConfig,
+ type RateLimitKey,
+ type RateLimitOptions,
+ type AdaptiveLimiterConfig,
+ type PriorityWeights,
+ type PriorityBurstLimits,
+ type TrafficRequest,
+ type TrafficRequestMetadata,
+ type TrafficResponseMetadata,
+ type TrafficPriority,
+ type TrafficRequestType,
+} from "./traffic/traffic-controller";
// Export new Agent from agent.ts
export {
Agent,
diff --git a/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts
new file mode 100644
index 000000000..652b7e59a
--- /dev/null
+++ b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts
@@ -0,0 +1,243 @@
+import type { Logger } from "../../logger";
+import {
+ RATE_LIMIT_EXHAUSTION_BUFFER,
+ RATE_LIMIT_MIN_PACE_INTERVAL_MS,
+ RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS,
+ RATE_LIMIT_PROBE_DELAY_MS,
+} from "../traffic-constants";
+import type {
+ DispatchDecision,
+ QueuedRequest,
+ RateLimitWindowState,
+} from "../traffic-controller-internal";
+import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils";
+import type { TrafficRequestMetadata } from "../traffic-types";
+import type {
+ RateLimitHeaderSnapshot,
+ RateLimitStrategy,
+ RateLimitUpdateResult,
+} from "./rate-limit-strategy";
+import { parseResetDurationToMs } from "./rate-limit-utils";
+
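+// Usage sketch (illustrative): the controller consults `resolve` before each
+// dispatch and feeds response headers back via `updateFromHeaders`, so the
+// next decision reflects the provider's latest window. For example:
+//
+//   const strategy = new DefaultRateLimitStrategy("openai::gpt-4o");
+//   strategy.updateFromHeaders(undefined, {
+//     "x-ratelimit-limit-requests": "100",
+//     "x-ratelimit-remaining-requests": "0",
+//     "x-ratelimit-reset-requests": "30s",
+//   });
+//   // resolve(...) now waits until resetAt + RATE_LIMIT_PROBE_DELAY_MS.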
+export class DefaultRateLimitStrategy implements RateLimitStrategy {
+ private state?: RateLimitWindowState;
+ private readonly key: string;
+
+ constructor(key: string) {
+ this.key = key;
+ }
+
+ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
+ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
+ const state = this.state;
+ if (!state) {
+ rateLimitLogger?.trace?.("Rate limit state missing; allow request", {
+ rateLimitKey: this.key,
+ });
+ return null;
+ }
+
+ const now = Date.now();
+ const effectiveRemaining = Math.max(0, state.remaining - state.reserved);
+ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS;
+
+ if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) {
+ if (now < probeAt) {
+ rateLimitLogger?.debug?.("Rate limit exhausted; waiting for probe", {
+ rateLimitKey: this.key,
+ remaining: state.remaining,
+ reserved: state.reserved,
+ effectiveRemaining,
+ resetAt: state.resetAt,
+ probeAt,
+ });
+ return { kind: "wait", wakeUpAt: probeAt };
+ }
+ if (state.reserved > 0) {
+ rateLimitLogger?.debug?.("Rate limit exhausted but in-flight reservations exist; waiting", {
+ rateLimitKey: this.key,
+ remaining: state.remaining,
+ reserved: state.reserved,
+ effectiveRemaining,
+ resetAt: state.resetAt,
+ });
+ return { kind: "wait" };
+ }
+ }
+
+ if (now < state.nextAllowedAt) {
+ rateLimitLogger?.debug?.("Rate limit pacing; waiting until nextAllowedAt", {
+ rateLimitKey: this.key,
+ nextAllowedAt: state.nextAllowedAt,
+ resetAt: state.resetAt,
+ waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now,
+ });
+ return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) };
+ }
+
+ state.reserved += 1;
+ next.rateLimitKey = this.key;
+ rateLimitLogger?.trace?.("Reserved rate limit token", {
+ rateLimitKey: this.key,
+ reserved: state.reserved,
+ remaining: state.remaining,
+ resetAt: state.resetAt,
+ nextAllowedAt: state.nextAllowedAt,
+ });
+
+ const remainingWindowMs = Math.max(0, state.resetAt - now);
+ const intervalMs = Math.max(
+ RATE_LIMIT_MIN_PACE_INTERVAL_MS,
+ Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)),
+ );
+
+ const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs);
+ if (
+ state.nextAllowedAt <= now ||
+ candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS
+ ) {
+ state.nextAllowedAt = candidateNext;
+ rateLimitLogger?.trace?.("Updated pacing nextAllowedAt", {
+ rateLimitKey: this.key,
+ nextAllowedAt: state.nextAllowedAt,
+ intervalMs,
+ remainingWindowMs,
+ effectiveRemaining,
+ });
+ }
+
+ return null;
+ }
+
+ onDispatch(_logger?: Logger): void {}
+
+ onComplete(logger?: Logger): void {
+ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
+ const state = this.state;
+ if (!state || state.reserved <= 0) return;
+ state.reserved -= 1;
+ rateLimitLogger?.trace?.("Released rate limit reservation", {
+ rateLimitKey: this.key,
+ reserved: state.reserved,
+ remaining: state.remaining,
+ resetAt: state.resetAt,
+ nextAllowedAt: state.nextAllowedAt,
+ });
+ }
+
+ updateFromHeaders(
+ _metadata: TrafficRequestMetadata | undefined,
+ headers: unknown,
+ logger?: Logger,
+ ): RateLimitUpdateResult | undefined {
+ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
+ const limitRequests = readHeaderValue(headers, "x-ratelimit-limit-requests");
+ const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests");
+ const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests");
+ const retryAfter = readHeaderValue(headers, "retry-after");
+ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined;
+
+ const now = Date.now();
+ const existing = this.state;
+ let state: RateLimitWindowState | undefined;
+ let headerSnapshot: RateLimitHeaderSnapshot | undefined;
+
+ if (limitRequests && remainingRequests && resetRequests) {
+ const limit = Number(limitRequests);
+ const remaining = Number(remainingRequests);
+ if (!Number.isFinite(limit) || !Number.isFinite(remaining)) {
+ rateLimitLogger?.debug?.("Invalid rate limit numeric headers; skipping", {
+ rateLimitKey: this.key,
+ limitRequests,
+ remainingRequests,
+ });
+ return undefined;
+ }
+
+ const resetRequestsMs = parseResetDurationToMs(resetRequests);
+ if (resetRequestsMs === undefined) {
+ rateLimitLogger?.debug?.("Unable to parse reset duration; skipping", {
+ rateLimitKey: this.key,
+ resetRequests,
+ });
+ return undefined;
+ }
+
+ const parsedResetAt = now + resetRequestsMs;
+ const isSameWindow = !!existing && now < existing.resetAt;
+ const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt;
+ const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now;
+ const reserved = Math.max(0, existing?.reserved ?? 0);
+
+ state = {
+ limit,
+ remaining: isSameWindow ? Math.min(existing.remaining, remaining) : remaining,
+ resetAt,
+ reserved,
+ nextAllowedAt,
+ };
+ headerSnapshot = {
+ limitRequests,
+ remainingRequests,
+ resetRequests,
+ resetRequestsMs,
+ };
+ } else if (retryAfterMs === undefined) {
+ rateLimitLogger?.trace?.("Missing rate limit headers; skipping", {
+ rateLimitKey: this.key,
+ hasLimit: !!limitRequests,
+ hasRemaining: !!remainingRequests,
+ hasReset: !!resetRequests,
+ hasRetryAfter: !!retryAfter,
+ });
+ return undefined;
+ }
+
+ if (!state) {
+ if (retryAfterMs === undefined) {
+ rateLimitLogger?.trace?.("Retry-After missing or unparsable; skipping", {
+ rateLimitKey: this.key,
+ retryAfter,
+ });
+ return undefined;
+ }
+ const targetAt = now + retryAfterMs;
+ const isSameWindow = !!existing && now < existing.resetAt;
+ state = {
+ limit: existing?.limit ?? 1,
+ remaining: 0,
+ resetAt: isSameWindow ? Math.max(existing.resetAt, targetAt) : targetAt,
+ reserved: Math.max(0, existing?.reserved ?? 0),
+ nextAllowedAt: Math.max(existing?.nextAllowedAt ?? now, targetAt),
+ };
+ headerSnapshot = { retryAfter, retryAfterMs };
+ } else if (retryAfterMs !== undefined) {
+ const targetAt = now + retryAfterMs;
+ state = {
+ ...state,
+ remaining: 0,
+ resetAt: Math.max(state.resetAt, targetAt),
+ nextAllowedAt: Math.max(state.nextAllowedAt, targetAt),
+ };
+ headerSnapshot = { ...headerSnapshot, retryAfter, retryAfterMs };
+ }
+
+ this.state = state;
+ rateLimitLogger?.debug?.("Applied rate limit headers to state", {
+ rateLimitKey: this.key,
+ limit: state.limit,
+ remaining: state.remaining,
+ effectiveRemaining: Math.max(0, state.remaining - state.reserved),
+ resetAt: state.resetAt,
+ nextAllowedAt: state.nextAllowedAt,
+ resetRequestsMs: headerSnapshot?.resetRequestsMs,
+ retryAfterMs: headerSnapshot?.retryAfterMs,
+ });
+
+ return {
+ key: this.key,
+ headerSnapshot: headerSnapshot ?? {},
+ state,
+ };
+ }
+}
diff --git a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
new file mode 100644
index 000000000..7cca0d260
--- /dev/null
+++ b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
@@ -0,0 +1,379 @@
+import type { Logger } from "../../logger";
+import {
+ RATE_LIMIT_EXHAUSTION_BUFFER,
+ RATE_LIMIT_MIN_PACE_INTERVAL_MS,
+ RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS,
+ RATE_LIMIT_PROBE_DELAY_MS,
+} from "../traffic-constants";
+import type {
+ DispatchDecision,
+ QueuedRequest,
+ RateLimitWindowState,
+} from "../traffic-controller-internal";
+import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils";
+import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types";
+import { DefaultRateLimitStrategy } from "./default-rate-limit-strategy";
+import type {
+ RateLimitStrategy,
+ RateLimitUpdateResult,
+ RateLimitUsage,
+} from "./rate-limit-strategy";
+import { parseResetDurationToMs } from "./rate-limit-utils";
+
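+// Configuration sketch (illustrative): explicit per-minute limits switch this
+// strategy to fixed 60s client-side windows; otherwise it defers to the
+// header-driven default strategy plus a one-request bootstrap reservation.
+//
+//   const strategy = new OpenAIWindowRateLimitStrategy("openai::gpt-4o", {
+//     requestsPerMinute: 500,
+//     tokensPerMinute: 30_000,
+//   });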
+export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy {
+ readonly handlesTokenLimits = true;
+ private readonly window: DefaultRateLimitStrategy;
+ private readonly key: string;
+ private readonly requestsPerMinute?: number;
+ private readonly tokensPerMinute?: number;
+ private requestState?: RateLimitWindowState;
+ private tokenState?: RateLimitWindowState;
+ private bootstrapReserved = 0;
+ private readonly windowMs = 60_000;
+
+ constructor(key: string, options?: RateLimitOptions) {
+ this.key = key;
+ this.window = new DefaultRateLimitStrategy(key);
+ // Window strategy enforces fixed 60s windows; burstSize is intentionally ignored here.
+ this.requestsPerMinute = this.normalizeLimit(options?.requestsPerMinute);
+ this.tokensPerMinute = this.normalizeLimit(options?.tokensPerMinute);
+ }
+
+ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
+ if (this.requestsPerMinute !== undefined) {
+ const requestDecision = this.resolveRequestWindow(next, logger);
+ if (requestDecision) return requestDecision;
+ } else {
+ const decision = this.window.resolve(next, logger);
+ if (decision) return decision;
+
+ if (!next.rateLimitKey && this.tokensPerMinute === undefined) {
+ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
+ if (this.bootstrapReserved >= 1) {
+ rateLimitLogger?.debug?.("OpenAI rate limit bootstrap active; waiting", {
+ rateLimitKey: this.key,
+ bootstrapReserved: this.bootstrapReserved,
+ });
+ return { kind: "wait" };
+ }
+
+ this.bootstrapReserved += 1;
+ next.rateLimitKey = this.key;
+ rateLimitLogger?.debug?.("OpenAI rate limit bootstrap reserved", {
+ rateLimitKey: this.key,
+ bootstrapReserved: this.bootstrapReserved,
+ });
+ }
+ }
+
+ const tokenDecision = this.resolveTokenWindow(next, logger);
+ if (tokenDecision) return tokenDecision;
+ return null;
+ }
+
+ onDispatch(logger?: Logger): void {
+ if (this.requestsPerMinute === undefined) {
+ this.window.onDispatch(logger);
+ }
+ }
+
+ onComplete(logger?: Logger): void {
+ if (this.requestsPerMinute !== undefined) {
+ const now = Date.now();
+ const state = this.ensureRequestState(now);
+ if (state.reserved > 0) {
+ state.reserved -= 1;
+ }
+ state.remaining = Math.max(0, state.remaining - 1);
+ return;
+ }
+
+ if (this.bootstrapReserved > 0) {
+ this.bootstrapReserved -= 1;
+ }
+ this.window.onComplete(logger);
+ }
+
+ recordUsage(usage: RateLimitUsage, logger?: Logger, reservedTokens?: number): void {
+ const tokens = this.resolveTokenCount(usage);
+ if (tokens <= 0) return;
+
+ const now = Date.now();
+ const state = this.ensureTokenState(now);
+ if (!state) return;
+ const reserved = typeof reservedTokens === "number" ? reservedTokens : 0;
+ const delta = tokens - reserved;
+ if (delta > 0) {
+ state.remaining = Math.max(0, state.remaining - delta);
+ } else if (delta < 0) {
+ state.remaining = Math.min(state.limit, state.remaining + Math.abs(delta));
+ }
+ logger?.child({ module: "rate-limiter" })?.trace?.("OpenAI token usage recorded", {
+ rateLimitKey: this.key,
+ tokens,
+ remaining: state.remaining,
+ resetAt: state.resetAt,
+ });
+ }
+
+ updateFromHeaders(
+ metadata: TrafficRequestMetadata | undefined,
+ headers: unknown,
+ logger?: Logger,
+ ): RateLimitUpdateResult | undefined {
+ const update =
+ this.requestsPerMinute !== undefined
+ ? undefined
+ : this.window.updateFromHeaders(metadata, headers, logger);
+ const tokenUpdate = this.applyTokenHeaderUpdates(headers, logger);
+ if (!update) {
+ return tokenUpdate;
+ }
+ if (tokenUpdate?.headerSnapshot) {
+ return {
+ ...update,
+ headerSnapshot: { ...update.headerSnapshot, ...tokenUpdate.headerSnapshot },
+ };
+ }
+ return update;
+ }
+
+ private resolveRequestWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
+ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
+ const now = Date.now();
+ const state = this.ensureRequestState(now);
+ const effectiveRemaining = Math.max(0, state.remaining - state.reserved);
+ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS;
+
+ if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) {
+ if (now < probeAt) {
+ rateLimitLogger?.debug?.("OpenAI request window exhausted; waiting for probe", {
+ rateLimitKey: this.key,
+ remaining: state.remaining,
+ reserved: state.reserved,
+ effectiveRemaining,
+ resetAt: state.resetAt,
+ probeAt,
+ });
+ return { kind: "wait", wakeUpAt: probeAt };
+ }
+ if (state.reserved > 0) {
+ rateLimitLogger?.debug?.(
+ "OpenAI request window exhausted but in-flight reservations exist; waiting",
+ {
+ rateLimitKey: this.key,
+ remaining: state.remaining,
+ reserved: state.reserved,
+ effectiveRemaining,
+ resetAt: state.resetAt,
+ },
+ );
+ return { kind: "wait" };
+ }
+ }
+
+ if (now < state.nextAllowedAt) {
+ rateLimitLogger?.debug?.("OpenAI request window pacing; waiting until nextAllowedAt", {
+ rateLimitKey: this.key,
+ nextAllowedAt: state.nextAllowedAt,
+ resetAt: state.resetAt,
+ waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now,
+ });
+ return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) };
+ }
+
+ state.reserved += 1;
+ next.rateLimitKey = this.key;
+ rateLimitLogger?.trace?.("Reserved OpenAI request window slot", {
+ rateLimitKey: this.key,
+ reserved: state.reserved,
+ remaining: state.remaining,
+ resetAt: state.resetAt,
+ nextAllowedAt: state.nextAllowedAt,
+ });
+
+ const remainingWindowMs = Math.max(0, state.resetAt - now);
+ const intervalMs = Math.max(
+ RATE_LIMIT_MIN_PACE_INTERVAL_MS,
+ Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)),
+ );
+
+ const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs);
+ if (
+ state.nextAllowedAt <= now ||
+ candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS
+ ) {
+ state.nextAllowedAt = candidateNext;
+ rateLimitLogger?.trace?.("Updated OpenAI request pacing nextAllowedAt", {
+ rateLimitKey: this.key,
+ nextAllowedAt: state.nextAllowedAt,
+ intervalMs,
+ remainingWindowMs,
+ effectiveRemaining,
+ });
+ }
+
+ return null;
+ }
+
+ private resolveTokenWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
+ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
+ const now = Date.now();
+ const state = this.ensureTokenState(now);
+ if (!state) return null;
+ const estimatedTokens = next.estimatedTokens;
+
+ if (typeof estimatedTokens === "number" && estimatedTokens > 0) {
+ if (state.remaining >= estimatedTokens) {
+ state.remaining = Math.max(0, state.remaining - estimatedTokens);
+ next.reservedTokens = estimatedTokens;
+ return null;
+ }
+ } else if (state.remaining > 0) {
+ return null;
+ }
+
+ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS;
+ rateLimitLogger?.debug?.("OpenAI token window exhausted; waiting", {
+ rateLimitKey: this.key,
+ remaining: state.remaining,
+ resetAt: state.resetAt,
+ probeAt,
+ });
+ return { kind: "wait", wakeUpAt: probeAt };
+ }
+
+ private ensureRequestState(now: number): RateLimitWindowState {
+ const limit = this.requestsPerMinute ?? 0;
+ const state = this.requestState;
+ if (!state || now >= state.resetAt) {
+ this.requestState = {
+ limit,
+ remaining: limit,
+ resetAt: now + this.windowMs,
+ reserved: 0,
+ nextAllowedAt: now,
+ };
+ return this.requestState;
+ }
+ return state;
+ }
+
+ private ensureTokenState(now: number): RateLimitWindowState | undefined {
+ const configuredLimit = this.tokensPerMinute;
+ const state = this.tokenState;
+ if (!state) {
+ if (configuredLimit === undefined) return undefined;
+ this.tokenState = {
+ limit: configuredLimit,
+ remaining: configuredLimit,
+ resetAt: now + this.windowMs,
+ reserved: 0,
+ nextAllowedAt: now,
+ };
+ return this.tokenState;
+ }
+
+ if (now >= state.resetAt) {
+ const limit = configuredLimit ?? state.limit;
+ this.tokenState = {
+ limit,
+ remaining: limit,
+ resetAt: now + this.windowMs,
+ reserved: 0,
+ nextAllowedAt: now,
+ };
+ return this.tokenState;
+ }
+
+ if (configuredLimit !== undefined && configuredLimit !== state.limit) {
+ state.limit = configuredLimit;
+ state.remaining = Math.min(state.remaining, configuredLimit);
+ }
+
+ return state;
+ }
+
+ private normalizeLimit(value: number | undefined): number | undefined {
+ const numeric = typeof value === "number" ? value : Number(value);
+ return Number.isFinite(numeric) && numeric > 0 ? numeric : undefined;
+ }
+
+ private applyTokenHeaderUpdates(
+ headers: unknown,
+ logger?: Logger,
+ ): RateLimitUpdateResult | undefined {
+ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
+ const limitTokens = readHeaderValue(headers, "x-ratelimit-limit-tokens");
+ const remainingTokens = readHeaderValue(headers, "x-ratelimit-remaining-tokens");
+ const resetTokens = readHeaderValue(headers, "x-ratelimit-reset-tokens");
+ const retryAfter = readHeaderValue(headers, "retry-after");
+
+ const limit = Number(limitTokens);
+ const remaining = Number(remainingTokens);
+ const resetTokensMs = resetTokens ? parseResetDurationToMs(resetTokens) : undefined;
+ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined;
+
+ if (!Number.isFinite(limit) || !Number.isFinite(remaining) || resetTokensMs === undefined) {
+ rateLimitLogger?.trace?.("OpenAI token headers missing or invalid; skipping", {
+ rateLimitKey: this.key,
+ hasLimit: !!limitTokens,
+ hasRemaining: !!remainingTokens,
+ hasReset: !!resetTokens,
+ });
+ return undefined;
+ }
+
+ const now = Date.now();
+ const configuredLimit = this.tokensPerMinute;
+ const effectiveLimit = configuredLimit === undefined ? limit : Math.min(configuredLimit, limit);
+ const clampedRemaining = Math.max(0, Math.min(remaining, effectiveLimit));
+ const parsedResetAt = now + resetTokensMs;
+ const existing = this.tokenState;
+ const isSameWindow = !!existing && now < existing.resetAt;
+ const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt;
+ const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now;
+ const reserved = Math.max(0, existing?.reserved ?? 0);
+ const effectiveRemaining = isSameWindow
+ ? Math.min(existing.remaining, clampedRemaining)
+ : clampedRemaining;
+
+ const state: RateLimitWindowState = {
+ limit: effectiveLimit,
+ remaining: effectiveRemaining,
+ resetAt,
+ reserved,
+ nextAllowedAt,
+ };
+ this.tokenState = state;
+
+ rateLimitLogger?.debug?.("OpenAI token headers applied", {
+ rateLimitKey: this.key,
+ limit: effectiveLimit,
+ remaining: effectiveRemaining,
+ resetAt,
+ retryAfterMs,
+ });
+
+ return {
+ key: this.key,
+ headerSnapshot: {
+ limitTokens,
+ remainingTokens,
+ resetTokens,
+ resetTokensMs,
+ retryAfter,
+ retryAfterMs,
+ },
+ state,
+ };
+ }
+
+ private resolveTokenCount(usage: RateLimitUsage): number {
+ const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined;
+ if (total !== undefined) return total;
+ const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0;
+ const output = Number.isFinite(usage.outputTokens) ? usage.outputTokens : 0;
+ return input + output;
+ }
+}
diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts
new file mode 100644
index 000000000..af398b25f
--- /dev/null
+++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts
@@ -0,0 +1,45 @@
+import type { Logger } from "../../logger";
+import type {
+ DispatchDecision,
+ QueuedRequest,
+ RateLimitWindowState,
+} from "../traffic-controller-internal";
+import type { TrafficRequestMetadata } from "../traffic-types";
+
+export type RateLimitHeaderSnapshot = {
+ limitRequests?: string;
+ remainingRequests?: string;
+ resetRequests?: string;
+ resetRequestsMs?: number;
+ limitTokens?: string;
+ remainingTokens?: string;
+ resetTokens?: string;
+ resetTokensMs?: number;
+ retryAfter?: string;
+ retryAfterMs?: number;
+};
+
+export type RateLimitUpdateResult = {
+ key: string;
+ headerSnapshot: RateLimitHeaderSnapshot;
+ state: RateLimitWindowState;
+};
+
+export type RateLimitUsage = {
+ inputTokens?: number;
+ outputTokens?: number;
+ totalTokens?: number;
+};
+
+export interface RateLimitStrategy {
+ readonly handlesTokenLimits?: boolean;
+ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null;
+ onDispatch(logger?: Logger): void;
+ onComplete(logger?: Logger): void;
+ recordUsage?(usage: RateLimitUsage, logger?: Logger, reservedTokens?: number): void;
+ updateFromHeaders(
+ metadata: TrafficRequestMetadata | undefined,
+ headers: unknown,
+ logger?: Logger,
+ ): RateLimitUpdateResult | undefined;
+}
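+
+// Minimal conforming strategy (illustrative sketch): never blocks and ignores
+// headers; a starting point for custom strategies.
+//
+//   class NoopRateLimitStrategy implements RateLimitStrategy {
+//     resolve() { return null; } // null = no objection to dispatch
+//     onDispatch() {}
+//     onComplete() {}
+//     updateFromHeaders() { return undefined; }
+//   }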
diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts
new file mode 100644
index 000000000..310c9a7e6
--- /dev/null
+++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts
@@ -0,0 +1,26 @@
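+// Parses OpenAI-style compound reset durations into milliseconds, e.g.
+// parseResetDurationToMs("1m30.951s") === 90_951 (60_000ms + 30_951ms).
+// Bare numeric strings are treated as milliseconds: "250" → 250.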
+export function parseResetDurationToMs(raw: string): number | undefined {
+ const value = raw.trim();
+ if (!value) return undefined;
+
+ let totalMs = 0;
+ const regex = /(\d+(?:\.\d+)?)(ms|s|m|h|d)/g;
+ let matched = false;
+ for (const match of value.matchAll(regex)) {
+ matched = true;
+ const amount = Number.parseFloat(match[1] ?? "");
+ if (!Number.isFinite(amount)) continue;
+ const unit = match[2];
+ if (unit === "ms") totalMs += amount;
+ else if (unit === "s") totalMs += amount * 1000;
+ else if (unit === "m") totalMs += amount * 60_000;
+ else if (unit === "h") totalMs += amount * 3_600_000;
+ else if (unit === "d") totalMs += amount * 86_400_000;
+ }
+
+ if (matched) {
+ return Math.round(totalMs);
+ }
+
+ const n = Number(value);
+ return Number.isFinite(n) ? Math.round(n) : undefined;
+}
diff --git a/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts
new file mode 100644
index 000000000..ee269ecd2
--- /dev/null
+++ b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts
@@ -0,0 +1,218 @@
+import type { Logger } from "../../logger";
+import type {
+ DispatchDecision,
+ QueuedRequest,
+ RateLimitWindowState,
+} from "../traffic-controller-internal";
+import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils";
+import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types";
+import type {
+ RateLimitHeaderSnapshot,
+ RateLimitStrategy,
+ RateLimitUpdateResult,
+} from "./rate-limit-strategy";
+import { parseResetDurationToMs } from "./rate-limit-utils";
+
+type TokenBucketState = {
+ capacity: number;
+ refillPerSecond: number;
+ tokens: number;
+ updatedAt: number;
+};
+
+function normalizeTokenBucketOptions(
+ raw: RateLimitOptions | undefined,
+): Omit<TokenBucketState, "tokens" | "updatedAt"> | undefined {
+ const requestsPerMinuteRaw = raw?.requestsPerMinute;
+ const tokensPerMinuteRaw = raw?.tokensPerMinute;
+ const burstSizeRaw = raw?.burstSize;
+
+ const requestsPerMinute =
+ typeof requestsPerMinuteRaw === "number" ? requestsPerMinuteRaw : Number(requestsPerMinuteRaw);
+ const tokensPerMinute =
+ typeof tokensPerMinuteRaw === "number" ? tokensPerMinuteRaw : Number(tokensPerMinuteRaw);
+ const burstSize = typeof burstSizeRaw === "number" ? burstSizeRaw : Number(burstSizeRaw);
+
+ const safeRequestsPerMinute = Number.isFinite(requestsPerMinute) ? requestsPerMinute : 0;
+ const hasTokenLimit = Number.isFinite(tokensPerMinute) && tokensPerMinute > 0;
+ if (safeRequestsPerMinute <= 0 && hasTokenLimit) {
+ return undefined;
+ }
+ const safeBurst = Number.isFinite(burstSize) ? burstSize : safeRequestsPerMinute;
+ const refillPerSecond = safeRequestsPerMinute > 0 ? safeRequestsPerMinute / 60 : 0;
+
+ return {
+ capacity: safeBurst > 0 ? Math.max(1, safeBurst) : 0,
+ refillPerSecond,
+ };
+}
+
+function refillTokenBucket(bucket: TokenBucketState, now: number): void {
+ const elapsedMs = now - bucket.updatedAt;
+ if (elapsedMs <= 0) return;
+ bucket.updatedAt = now;
+ if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return;
+
+ const refill = (elapsedMs / 1000) * bucket.refillPerSecond;
+ if (refill <= 0) return;
+ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill);
+}
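+// Refill arithmetic (illustrative): with requestsPerMinute = 120 the bucket
+// refills at 2 tokens/s, so 250ms of elapsed time restores 0.5 tokens,
+// capped at `capacity`.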
+
+export class TokenBucketRateLimitStrategy implements RateLimitStrategy {
+ private readonly key: string;
+ private bucket?: TokenBucketState;
+ private cooldownUntil?: number;
+
+ constructor(key: string, options?: RateLimitOptions) {
+ this.key = key;
+ if (!options) return;
+ const normalized = normalizeTokenBucketOptions(options);
+ if (!normalized) return;
+ const now = Date.now();
+ this.bucket = {
+ ...normalized,
+ tokens: normalized.capacity,
+ updatedAt: now,
+ };
+ }
+
+ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
+ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
+ const now = Date.now();
+
+ if (this.cooldownUntil !== undefined && now < this.cooldownUntil) {
+ rateLimitLogger?.debug?.("Token bucket cooldown active; waiting", {
+ rateLimitKey: this.key,
+ cooldownUntil: this.cooldownUntil,
+ waitMs: this.cooldownUntil - now,
+ });
+ return { kind: "wait", wakeUpAt: this.cooldownUntil };
+ }
+
+ const bucket = this.bucket;
+ if (!bucket) return null;
+
+ refillTokenBucket(bucket, now);
+
+ if (bucket.capacity <= 0) {
+ rateLimitLogger?.debug?.("Token bucket misconfigured; blocking", {
+ rateLimitKey: this.key,
+ capacity: bucket.capacity,
+ refillPerSecond: bucket.refillPerSecond,
+ });
+ return { kind: "wait" };
+ }
+
+ if (bucket.tokens >= 1) {
+ bucket.tokens -= 1;
+ next.rateLimitKey = this.key;
+ rateLimitLogger?.trace?.("Consumed token bucket token", {
+ rateLimitKey: this.key,
+ tokens: bucket.tokens,
+ capacity: bucket.capacity,
+ refillPerSecond: bucket.refillPerSecond,
+ });
+ return null;
+ }
+
+ if (bucket.refillPerSecond <= 0) {
+ rateLimitLogger?.debug?.("Token bucket has no refill; blocking", {
+ rateLimitKey: this.key,
+ capacity: bucket.capacity,
+ refillPerSecond: bucket.refillPerSecond,
+ });
+ return { kind: "wait" };
+ }
+
+ const requiredTokens = 1 - bucket.tokens;
+ const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000));
+ const wakeUpAt = now + waitMs;
+ rateLimitLogger?.debug?.("Token bucket empty; waiting", {
+ rateLimitKey: this.key,
+ tokens: bucket.tokens,
+ capacity: bucket.capacity,
+ refillPerSecond: bucket.refillPerSecond,
+ wakeUpAt,
+ waitMs,
+ });
+ return { kind: "wait", wakeUpAt };
+ }
+
+ onDispatch(_logger?: Logger): void {}
+
+ onComplete(_logger?: Logger): void {}
+
+ updateFromHeaders(
+ _metadata: TrafficRequestMetadata | undefined,
+ headers: unknown,
+ logger?: Logger,
+ ): RateLimitUpdateResult | undefined {
+ const rateLimitLogger = logger?.child({ module: "rate-limiter" });
+ const now = Date.now();
+
+ const retryAfter = readHeaderValue(headers, "retry-after");
+ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter, now) : undefined;
+
+ const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests");
+ const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests");
+ const resetRequestsMs = resetRequests ? parseResetDurationToMs(resetRequests) : undefined;
+
+ let appliedUntil: number | undefined;
+
+ if (retryAfterMs !== undefined) {
+ const targetAt = now + retryAfterMs;
+ this.cooldownUntil =
+ this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt);
+ appliedUntil = this.cooldownUntil;
+ }
+
+ if (remainingRequests && resetRequestsMs !== undefined) {
+ const remaining = Number(remainingRequests);
+ if (Number.isFinite(remaining) && remaining <= 0) {
+ const targetAt = now + resetRequestsMs;
+ this.cooldownUntil =
+ this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt);
+ appliedUntil = this.cooldownUntil;
+ }
+ }
+
+ if (appliedUntil === undefined) {
+ rateLimitLogger?.trace?.("No applicable cooldown headers; skipping", {
+ rateLimitKey: this.key,
+ hasRetryAfter: !!retryAfter,
+ hasRemainingRequests: !!remainingRequests,
+ hasResetRequests: !!resetRequests,
+ });
+ return undefined;
+ }
+
+ rateLimitLogger?.debug?.("Applied token bucket cooldown from headers", {
+ rateLimitKey: this.key,
+ cooldownUntil: appliedUntil,
+ inMs: Math.max(0, appliedUntil - now),
+ retryAfterMs,
+ resetRequestsMs,
+ });
+
+ const headerSnapshot: RateLimitHeaderSnapshot = {
+ remainingRequests,
+ resetRequests,
+ resetRequestsMs,
+ retryAfter,
+ retryAfterMs,
+ };
+
+ const state: RateLimitWindowState = {
+ limit: 1,
+ remaining: 0,
+ resetAt: appliedUntil,
+ reserved: 0,
+ nextAllowedAt: appliedUntil,
+ };
+
+ return {
+ key: this.key,
+ headerSnapshot,
+ state,
+ };
+ }
+}
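+
+// Usage sketch (illustrative): a smoothed limiter with headroom for bursts.
+//
+//   const strategy = new TokenBucketRateLimitStrategy("openai::gpt-4o", {
+//     requestsPerMinute: 60, // steady refill of 1 token/s
+//     burstSize: 10,         // up to 10 back-to-back dispatches after idle time
+//   });
+//
+// Unlike the fixed-window strategies, rate-limit headers here only install a
+// cooldown; steady-state pacing comes from the refill rate.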
diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts
new file mode 100644
index 000000000..20d166ca2
--- /dev/null
+++ b/packages/core/src/traffic/traffic-circuit-breaker.ts
@@ -0,0 +1,478 @@
+import type { Logger } from "../logger";
+import {
+ CIRCUIT_COOLDOWN_MS,
+ CIRCUIT_FAILURE_THRESHOLD,
+ CIRCUIT_FAILURE_WINDOW_MS,
+ CIRCUIT_PROBE_INTERVAL_MS,
+ CIRCUIT_TIMEOUT_THRESHOLD,
+ CIRCUIT_TIMEOUT_WINDOW_MS,
+ DEFAULT_FALLBACK_CHAINS,
+} from "./traffic-constants";
+import type {
+ CircuitState,
+ CircuitStateStatus,
+ DispatchDecision,
+ QueuedRequest,
+} from "./traffic-controller-internal";
+import { extractStatusCode, isTimeoutError } from "./traffic-error-utils";
+import { CircuitBreakerOpenError } from "./traffic-errors";
+import type {
+ FallbackChainEntry,
+ FallbackPolicy,
+ FallbackPolicyConfig,
+ FallbackTarget,
+ TrafficRequestMetadata,
+ TrafficResponseMetadata,
+} from "./traffic-types";
+
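+// Configuration sketch (illustrative; the fallback names are placeholders):
+// chains may mix bare model names, provider-qualified targets, and a terminal
+// canned response.
+//
+//   new TrafficCircuitBreaker({
+//     buildRateLimitKey: (m) => `${m?.provider}::${m?.model}`,
+//     fallbackChains: {
+//       "gpt-4o": [
+//         "gpt-4o-mini",
+//         { provider: "other-provider", model: "other-model" },
+//         { kind: "short-response", text: "Service busy, please retry." },
+//       ],
+//     },
+//     fallbackPolicy: {
+//       defaultPolicyId: "fallback",
+//       policies: { fallback: { mode: "fallback" }, strict: { mode: "wait" } },
+//     },
+//   });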
+export class TrafficCircuitBreaker {
+ private readonly circuitBreakers = new Map<string, CircuitState>();
+ private readonly fallbackChains: Map<string, FallbackChainEntry[]>;
+ private readonly fallbackPolicy?: FallbackPolicyConfig;
+ private readonly buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string;
+
+ constructor(options: {
+ fallbackChains?: Record<string, FallbackChainEntry[]>;
+ fallbackPolicy?: FallbackPolicyConfig;
+ buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string;
+ }) {
+ this.buildRateLimitKey = options.buildRateLimitKey;
+ const chains = options.fallbackChains ?? DEFAULT_FALLBACK_CHAINS;
+ this.fallbackChains = new Map(Object.entries(chains));
+ this.fallbackPolicy = options.fallbackPolicy;
+ }
+
+ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
+ const circuitLogger = logger?.child({ module: "circuit-breaker" });
+ const visitedKeys = new Set<string>();
+
+ while (true) {
+ const key = this.buildRateLimitKey(next.request.metadata);
+ next.circuitKey = key;
+ visitedKeys.add(key);
+ circuitLogger?.trace?.("Circuit resolve step", {
+ circuitKey: key,
+ provider: next.request.metadata?.provider,
+ model: next.request.metadata?.model,
+ });
+
+ const evaluation = this.evaluateCircuitState(key, circuitLogger);
+ next.circuitStatus = evaluation.state;
+ circuitLogger?.debug?.("Circuit evaluated", {
+ circuitKey: key,
+ state: evaluation.state,
+ allowRequest: evaluation.allowRequest,
+ retryAfterMs: evaluation.retryAfterMs,
+ });
+
+ if (evaluation.allowRequest) return null;
+
+ const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata);
+ if (policy.mode === "wait") {
+ const wakeUpAt =
+ evaluation.retryAfterMs !== undefined ? Date.now() + evaluation.retryAfterMs : undefined;
+ circuitLogger?.debug?.("Circuit open; waiting per fallback policy", {
+ circuitKey: key,
+ policyId,
+ retryAfterMs: evaluation.retryAfterMs,
+ wakeUpAt,
+ });
+ return { kind: "wait", wakeUpAt };
+ }
+
+ const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger);
+ circuitLogger?.debug?.("Circuit open; attempting fallback", {
+ circuitKey: key,
+ currentModel: next.request.metadata?.model,
+ fallback,
+ visitedKeys: Array.from(visitedKeys),
+ });
+ if (!fallback || !next.request.createFallbackRequest) {
+ const error = new CircuitBreakerOpenError(
+ `Circuit open for ${key}`,
+ next.request.metadata,
+ evaluation.retryAfterMs,
+ );
+ const traffic: TrafficResponseMetadata = {
+ rateLimitKey: key,
+ retryAfterMs: evaluation.retryAfterMs,
+ tenantId: next.request.metadata?.tenantId ?? next.tenantId,
+ priority: next.request.metadata?.priority,
+ taskType: next.request.metadata?.taskType,
+ };
+ (error as Record<string, unknown>).traffic = traffic;
+ next.reject(error);
+ circuitLogger?.warn?.("No fallback available; rejecting request", {
+ circuitKey: key,
+ retryAfterMs: evaluation.retryAfterMs,
+ });
+ return { kind: "skip" };
+ }
+
+ const fallbackRequest = next.request.createFallbackRequest(fallback);
+ if (!fallbackRequest) {
+ circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", {
+ circuitKey: key,
+ fallback,
+ });
+ return { kind: "skip" };
+ }
+
+ this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, {
+ previousCircuitKey: key,
+ reason: "circuit-open",
+ });
+ }
+ }
+
+ tryFallback(next: QueuedRequest, reason: "queue-timeout", logger?: Logger): boolean {
+ const circuitLogger = logger?.child({ module: "circuit-breaker" });
+ const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata);
+ if (policy.mode === "wait") {
+ circuitLogger?.debug?.("Fallback skipped by policy", {
+ policyId,
+ reason,
+ provider: next.request.metadata?.provider,
+ model: next.request.metadata?.model,
+ });
+ return false;
+ }
+
+ const visitedKeys = new Set<string>();
+ const key = this.buildRateLimitKey(next.request.metadata);
+ visitedKeys.add(key);
+
+ const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger);
+ if (!fallback || !next.request.createFallbackRequest) {
+ circuitLogger?.debug?.("Fallback unavailable for request", {
+ reason,
+ provider: next.request.metadata?.provider,
+ model: next.request.metadata?.model,
+ fallback,
+ });
+ return false;
+ }
+
+ const fallbackRequest = next.request.createFallbackRequest(fallback);
+ if (!fallbackRequest) {
+ circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", {
+ reason,
+ fallback,
+ });
+ return false;
+ }
+
+ this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, {
+ previousCircuitKey: key,
+ reason,
+ policyId,
+ });
+ return true;
+ }
+
+ markTrial(item: QueuedRequest, logger?: Logger): void {
+ const circuitLogger = logger?.child({ module: "circuit-breaker" });
+ const key = item.circuitKey;
+ if (!key) return;
+ const state = this.circuitBreakers.get(key);
+ if (state && state.status === "half-open" && !state.trialInFlight) {
+ state.trialInFlight = true;
+ circuitLogger?.debug?.("Marked half-open trial in flight", { circuitKey: key });
+ }
+ }
+
+ recordSuccess(metadata?: TrafficRequestMetadata, logger?: Logger): void {
+ const circuitLogger = logger?.child({ module: "circuit-breaker" });
+ const key = this.buildRateLimitKey(metadata);
+ this.circuitBreakers.delete(key);
+ circuitLogger?.debug?.("Circuit success; cleared circuit state", {
+ circuitKey: key,
+ provider: metadata?.provider,
+ model: metadata?.model,
+ });
+ }
+
+ recordFailure(
+ metadata: TrafficRequestMetadata | undefined,
+ error: unknown,
+ logger?: Logger,
+ ): void {
+ const circuitLogger = logger?.child({ module: "circuit-breaker" });
+ const key = this.buildRateLimitKey(metadata);
+ const status = extractStatusCode(error, logger);
+ const isTimeout = status === 408 || isTimeoutError(error, logger);
+ const isStatusEligible = this.isCircuitBreakerStatus(status);
+ const isTimeoutEligible = !isStatusEligible && isTimeout;
+ const isEligible = isStatusEligible || isTimeoutEligible;
+
+ circuitLogger?.debug?.("Circuit failure observed", {
+ circuitKey: key,
+ status,
+ isTimeout,
+ eligible: isEligible,
+ provider: metadata?.provider,
+ model: metadata?.model,
+ });
+
+ if (!isEligible) {
+ this.circuitBreakers.delete(key);
+ circuitLogger?.debug?.("Failure not eligible for circuit breaker; cleared circuit state", {
+ circuitKey: key,
+ status,
+ isTimeout,
+ });
+ return;
+ }
+
+ const now = Date.now();
+ const state =
+ this.circuitBreakers.get(key) ??
+ ({ status: "closed", failureTimestamps: [], timeoutTimestamps: [] } as CircuitState);
+
+ state.failureTimestamps = state.failureTimestamps.filter(
+ (t) => now - t <= CIRCUIT_FAILURE_WINDOW_MS,
+ );
+ state.timeoutTimestamps = state.timeoutTimestamps.filter(
+ (t) => now - t <= CIRCUIT_TIMEOUT_WINDOW_MS,
+ );
+
+ state.failureTimestamps.push(now);
+ if (isTimeoutEligible) {
+ state.timeoutTimestamps.push(now);
+ }
+
+ if (
+ state.status === "half-open" ||
+ state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD ||
+ state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD
+ ) {
+ const openReasons: string[] = [];
+ if (state.status === "half-open") openReasons.push("half-open-failure");
+ if (state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD) {
+ openReasons.push("failure-threshold");
+ }
+ if (state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD) {
+ openReasons.push("timeout-threshold");
+ }
+
+ state.status = "open";
+ state.openedAt = now;
+ state.trialInFlight = false;
+ state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS;
+ circuitLogger?.warn?.("Circuit opened", {
+ circuitKey: key,
+ openReasons,
+ status,
+ isTimeout,
+ failureCount: state.failureTimestamps.length,
+ failureThreshold: CIRCUIT_FAILURE_THRESHOLD,
+ timeoutCount: state.timeoutTimestamps.length,
+ timeoutThreshold: CIRCUIT_TIMEOUT_THRESHOLD,
+ openedAt: state.openedAt,
+ });
+ }
+
+ this.circuitBreakers.set(key, state);
+ circuitLogger?.trace?.("Circuit state updated", {
+ circuitKey: key,
+ status: state.status,
+ failureCount: state.failureTimestamps.length,
+ failureWindowMs: CIRCUIT_FAILURE_WINDOW_MS,
+ timeoutCount: state.timeoutTimestamps.length,
+ timeoutWindowMs: CIRCUIT_TIMEOUT_WINDOW_MS,
+ });
+ }
+
+ private evaluateCircuitState(
+ key: string,
+ logger?: Logger,
+ ): {
+ allowRequest: boolean;
+ state: CircuitStateStatus;
+ retryAfterMs?: number;
+ } {
+ const state = this.circuitBreakers.get(key);
+ if (!state) {
+ logger?.trace?.("Circuit state missing; allow request", { circuitKey: key });
+ return { allowRequest: true, state: "closed" };
+ }
+
+ const now = Date.now();
+
+ if (state.status === "open") {
+ const elapsed = state.openedAt ? now - state.openedAt : 0;
+ if (state.nextProbeAt === undefined) {
+ state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS;
+ }
+ const cooldownRemaining = Math.max(0, CIRCUIT_COOLDOWN_MS - elapsed);
+ const probeRemaining = Math.max(0, state.nextProbeAt - now);
+ if (probeRemaining === 0 || cooldownRemaining === 0) {
+ state.status = "half-open";
+ state.trialInFlight = false;
+ state.failureTimestamps = [];
+ state.timeoutTimestamps = [];
+ state.nextProbeAt = undefined;
+ logger?.debug?.("Circuit transitioned to half-open", {
+ circuitKey: key,
+ reason: cooldownRemaining === 0 ? "cooldown" : "probe",
+ });
+ return { allowRequest: true, state: "half-open" };
+ }
+ return {
+ allowRequest: false,
+ state: "open",
+ retryAfterMs: Math.min(cooldownRemaining, probeRemaining),
+ };
+ }
+
+ if (state.status === "half-open" && state.trialInFlight) {
+ return { allowRequest: false, state: "half-open" };
+ }
+
+ return { allowRequest: true, state: state.status };
+ }
+
+ private resolveFallbackPolicy(metadata: TrafficRequestMetadata | undefined): {
+ policy: FallbackPolicy;
+ policyId?: string;
+ } {
+ const policyId =
+ metadata?.fallbackPolicyId ??
+ (metadata?.taskType
+ ? this.fallbackPolicy?.taskTypePolicyIds?.[metadata.taskType]
+ : undefined) ??
+ this.fallbackPolicy?.defaultPolicyId;
+
+ const policy = policyId ? this.fallbackPolicy?.policies?.[policyId] : undefined;
+ return {
+ policy: policy ?? { mode: "fallback" },
+ policyId,
+ };
+ }
+
+ private applyFallbackRequest(
+ next: QueuedRequest,
+ fallbackRequest: QueuedRequest["request"],
+ fallback: FallbackChainEntry,
+ logger?: Logger,
+ context?: { previousCircuitKey?: string; reason?: string; policyId?: string },
+ ): void {
+ next.request = fallbackRequest;
+ next.attempt = 1;
+ next.estimatedTokens = fallbackRequest.estimatedTokens;
+ next.reservedTokens = undefined;
+ next.tenantConcurrencyKey = undefined;
+ next.providerModelConcurrencyKey = undefined;
+ next.rateLimitKey = undefined;
+ next.etaMs = undefined;
+ next.circuitKey = undefined;
+ next.circuitStatus = undefined;
+ next.extractUsage = fallbackRequest.extractUsage;
+ if (context?.reason === "queue-timeout") {
+ next.queueTimeoutDisabled = true;
+ }
+ logger?.debug?.("Switched to fallback request", {
+ previousCircuitKey: context?.previousCircuitKey,
+ fallbackModel: fallback,
+ reason: context?.reason,
+ policyId: context?.policyId,
+ });
+ }
+
+ private isShortResponseFallback(
+ candidate: FallbackChainEntry,
+ ): candidate is { kind: "short-response"; text: string } {
+ return (
+ typeof candidate === "object" &&
+ candidate !== null &&
+ "kind" in candidate &&
+ (candidate as { kind?: string }).kind === "short-response"
+ );
+ }
+
+ private findFallbackTarget(
+ metadata: TrafficRequestMetadata | undefined,
+ visitedKeys: Set<string>,
+ logger?: Logger,
+ ): FallbackChainEntry | undefined {
+ const currentModel = metadata?.model;
+ if (!currentModel) {
+ logger?.trace?.("No current model; no fallback", {});
+ return undefined;
+ }
+
+ const provider = metadata?.provider;
+ const chain = this.resolveFallbackChain(provider, currentModel);
+ if (!chain) {
+ logger?.trace?.("No fallback chain for model", {
+ currentModel,
+ provider,
+ });
+ return undefined;
+ }
+
+ for (const candidate of chain) {
+ if (this.isShortResponseFallback(candidate)) {
+ logger?.debug?.("Selected short-response fallback", {
+ currentModel,
+ currentProvider: provider,
+ });
+ return candidate;
+ }
+ const target = this.normalizeFallbackTarget(candidate, provider);
+ const candidateMetadata: TrafficRequestMetadata = {
+ ...(metadata ?? {}),
+ provider: target.provider ?? provider,
+ model: target.model,
+ };
+ const candidateKey = this.buildRateLimitKey(candidateMetadata);
+ if (visitedKeys.has(candidateKey)) {
+ continue;
+ }
+
+ const evaluation = this.evaluateCircuitState(candidateKey, logger);
+ if (evaluation.allowRequest) {
+ visitedKeys.add(candidateKey);
+ logger?.debug?.("Selected fallback target", {
+ currentModel,
+ currentProvider: provider,
+ fallbackModel: target.model,
+ fallbackProvider: target.provider ?? provider,
+ fallbackCircuitKey: candidateKey,
+ });
+ return candidate;
+ }
+ }
+
+ return undefined;
+ }
+
+ private resolveFallbackChain(
+ provider: string | undefined,
+ model: string,
+ ): FallbackChainEntry[] | undefined {
+ const providerKey = provider ? `${provider}::${model}` : undefined;
+ if (providerKey) {
+ const providerChain = this.fallbackChains.get(providerKey);
+ if (providerChain) return providerChain;
+ }
+ return this.fallbackChains.get(model);
+ }
+
+ private normalizeFallbackTarget(
+ candidate: FallbackChainEntry,
+ provider: string | undefined,
+ ): FallbackTarget {
+ if (typeof candidate === "string") {
+ return { provider, model: candidate };
+ }
+ return {
+ provider: candidate.provider ?? provider,
+ model: candidate.model,
+ };
+ }
+
+ private isCircuitBreakerStatus(status?: number): boolean {
+ return status === 429 || (status !== undefined && status >= 500);
+ }
+}
diff --git a/packages/core/src/traffic/traffic-concurrency-limiter.ts b/packages/core/src/traffic/traffic-concurrency-limiter.ts
new file mode 100644
index 000000000..e15256127
--- /dev/null
+++ b/packages/core/src/traffic/traffic-concurrency-limiter.ts
@@ -0,0 +1,235 @@
+import type { Logger } from "../logger";
+import type { QueuedRequest } from "./traffic-controller-internal";
+import type {
+ ProviderModelConcurrencyLimit,
+ TenantConcurrencyLimit,
+ TrafficRequestMetadata,
+} from "./traffic-types";
+
+export type ConcurrencyBlockReason =
+ | {
+ gate: "providerModel";
+ key: string;
+ inFlight: number;
+ limit: number;
+ }
+ | {
+ gate: "tenant";
+ key: string;
+ inFlight: number;
+ limit: number;
+ };
+
+export type ConcurrencyDecision =
+ | { kind: "allow" }
+ | { kind: "wait"; reasons: ConcurrencyBlockReason[] };
+
+function toNonNegativeIntegerLimit(raw: unknown): number | undefined {
+ if (raw === undefined || raw === null) return undefined;
+ const n = typeof raw === "number" ? raw : Number(raw);
+ if (!Number.isFinite(n)) return undefined;
+ if (n <= 0) return 0;
+ return Math.floor(n);
+}
+
+function getInFlight(map: Map<string, number>, key: string): number {
+ return map.get(key) ?? 0;
+}
+
+function incrementInFlight(map: Map<string, number>, key: string): void {
+ map.set(key, getInFlight(map, key) + 1);
+}
+
+function decrementInFlight(map: Map<string, number>, key: string): void {
+ const current = getInFlight(map, key);
+ if (current <= 1) {
+ map.delete(key);
+ return;
+ }
+ map.set(key, current - 1);
+}
+
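+// Limit shapes (illustrative): each limit accepts a plain number, a lookup
+// record, or a resolver function; non-finite values disable the gate and
+// values <= 0 block dispatch entirely.
+//
+//   new TrafficConcurrencyLimiter({
+//     buildProviderModelKey: (m) => `${m?.provider}::${m?.model}`,
+//     maxConcurrentPerProviderModel: { "openai::gpt-4o": 4 },
+//     maxConcurrentPerTenant: (tenantId) => (tenantId === "free" ? 1 : 8),
+//   });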
+export class TrafficConcurrencyLimiter {
+ private readonly inFlightByProviderModel = new Map<string, number>();
+ private readonly inFlightByTenant = new Map<string, number>();
+
+ private readonly buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string;
+ private readonly providerModelLimit?: ProviderModelConcurrencyLimit;
+ private readonly tenantLimit?: TenantConcurrencyLimit;
+ private readonly providerModelEnabled: boolean;
+ private readonly tenantEnabled: boolean;
+
+ constructor(options: {
+ buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string;
+ maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit;
+ maxConcurrentPerTenant?: TenantConcurrencyLimit;
+ }) {
+ this.buildProviderModelKey = options.buildProviderModelKey;
+ this.providerModelLimit = options.maxConcurrentPerProviderModel;
+ this.tenantLimit = options.maxConcurrentPerTenant;
+ this.providerModelEnabled = options.maxConcurrentPerProviderModel !== undefined;
+ this.tenantEnabled = options.maxConcurrentPerTenant !== undefined;
+ }
+
+ resolve(next: QueuedRequest, logger?: Logger): ConcurrencyDecision {
+ if (!this.providerModelEnabled && !this.tenantEnabled) return { kind: "allow" };
+ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" });
+ const reasons: ConcurrencyBlockReason[] = [];
+
+ if (this.providerModelEnabled) {
+ const providerModelKey = this.buildProviderModelKey(next.request.metadata);
+ const providerModelLimit = this.resolveProviderModelLimit(
+ providerModelKey,
+ next.request.metadata,
+ concurrencyLogger,
+ );
+ if (providerModelLimit !== undefined) {
+ const inFlight = getInFlight(this.inFlightByProviderModel, providerModelKey);
+ if (inFlight >= providerModelLimit) {
+ reasons.push({
+ gate: "providerModel",
+ key: providerModelKey,
+ inFlight,
+ limit: providerModelLimit,
+ });
+ }
+ }
+ }
+
+ if (this.tenantEnabled) {
+ const tenantKey = next.tenantId;
+ const tenantLimit = this.resolveTenantLimit(
+ tenantKey,
+ next.request.metadata,
+ concurrencyLogger,
+ );
+ if (tenantLimit !== undefined) {
+ const inFlight = getInFlight(this.inFlightByTenant, tenantKey);
+ if (inFlight >= tenantLimit) {
+ reasons.push({
+ gate: "tenant",
+ key: tenantKey,
+ inFlight,
+ limit: tenantLimit,
+ });
+ }
+ }
+ }
+
+ if (reasons.length === 0) return { kind: "allow" };
+
+ concurrencyLogger?.trace?.("Concurrency gate blocked request", {
+ tenantId: next.tenantId,
+ reasons,
+ });
+ return { kind: "wait", reasons };
+ }
+
+ acquire(next: QueuedRequest, logger?: Logger): void {
+ if (!this.providerModelEnabled && !this.tenantEnabled) return;
+ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" });
+
+ let tenantKey: string | undefined;
+ if (this.tenantEnabled) {
+ tenantKey = next.tenantId;
+ next.tenantConcurrencyKey = tenantKey;
+ incrementInFlight(this.inFlightByTenant, tenantKey);
+ }
+
+ let providerModelKey: string | undefined;
+ if (this.providerModelEnabled) {
+ providerModelKey = this.buildProviderModelKey(next.request.metadata);
+ next.providerModelConcurrencyKey = providerModelKey;
+ incrementInFlight(this.inFlightByProviderModel, providerModelKey);
+ }
+
+ concurrencyLogger?.trace?.("Concurrency slots acquired", {
+ tenantId: tenantKey,
+ tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined,
+ providerModelKey,
+ providerModelInFlight: providerModelKey
+ ? getInFlight(this.inFlightByProviderModel, providerModelKey)
+ : undefined,
+ });
+ }
+
+ release(next: QueuedRequest, logger?: Logger): void {
+ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" });
+ const tenantKey = next.tenantConcurrencyKey;
+ const providerModelKey = next.providerModelConcurrencyKey;
+
+ if (tenantKey) {
+ decrementInFlight(this.inFlightByTenant, tenantKey);
+ }
+
+ if (providerModelKey) {
+ decrementInFlight(this.inFlightByProviderModel, providerModelKey);
+ }
+
+ if (tenantKey || providerModelKey) {
+ concurrencyLogger?.trace?.("Concurrency slots released", {
+ tenantId: tenantKey,
+ tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined,
+ providerModelKey,
+ providerModelInFlight: providerModelKey
+ ? getInFlight(this.inFlightByProviderModel, providerModelKey)
+ : undefined,
+ });
+ }
+
+ next.tenantConcurrencyKey = undefined;
+ next.providerModelConcurrencyKey = undefined;
+ }
+
+ private resolveTenantLimit(
+ tenantId: string,
+ metadata: TrafficRequestMetadata | undefined,
+ logger?: Logger,
+ ): number | undefined {
+ const policy = this.tenantLimit;
+ if (policy === undefined) return undefined;
+
+ if (typeof policy === "number") return toNonNegativeIntegerLimit(policy);
+ if (typeof policy === "function") {
+ try {
+ return toNonNegativeIntegerLimit(policy(tenantId, metadata));
+ } catch (error) {
+ logger?.warn?.("Tenant concurrency resolver threw; ignoring", {
+ tenantId,
+ errorName: (error as { name?: unknown } | null)?.name,
+ errorMessage: (error as { message?: unknown } | null)?.message,
+ });
+ return undefined;
+ }
+ }
+
+ return toNonNegativeIntegerLimit(policy[tenantId]);
+ }
+
+ private resolveProviderModelLimit(
+ key: string,
+ metadata: TrafficRequestMetadata | undefined,
+ logger?: Logger,
+ ): number | undefined {
+ const policy = this.providerModelLimit;
+ if (policy === undefined) return undefined;
+
+ if (typeof policy === "number") return toNonNegativeIntegerLimit(policy);
+ if (typeof policy === "function") {
+ try {
+ return toNonNegativeIntegerLimit(policy(metadata, key));
+ } catch (error) {
+ logger?.warn?.("Provider/model concurrency resolver threw; ignoring", {
+ key,
+ provider: metadata?.provider,
+ model: metadata?.model,
+ errorName: (error as { name?: unknown } | null)?.name,
+ errorMessage: (error as { message?: unknown } | null)?.message,
+ });
+ return undefined;
+ }
+ }
+
+ return toNonNegativeIntegerLimit(policy[key]);
+ }
+}
diff --git a/packages/core/src/traffic/traffic-constants.ts b/packages/core/src/traffic/traffic-constants.ts
new file mode 100644
index 000000000..68d99df78
--- /dev/null
+++ b/packages/core/src/traffic/traffic-constants.ts
@@ -0,0 +1,26 @@
+export const MAX_RETRY_ATTEMPTS = 3;
+export const TIMEOUT_RETRY_ATTEMPTS = 2;
+
+export const RATE_LIMIT_BASE_BACKOFF_MS = 500;
+export const SERVER_ERROR_BASE_BACKOFF_MS = 1000;
+export const TIMEOUT_BASE_BACKOFF_MS = 750;
+
+export const RATE_LIMIT_JITTER_FACTOR = 0.35;
+export const SERVER_ERROR_JITTER_FACTOR = 0.8;
+export const TIMEOUT_JITTER_FACTOR = 0.5;
+
+export const CIRCUIT_FAILURE_THRESHOLD = 5;
+export const CIRCUIT_FAILURE_WINDOW_MS = 10_000;
+export const CIRCUIT_TIMEOUT_THRESHOLD = CIRCUIT_FAILURE_THRESHOLD;
+export const CIRCUIT_TIMEOUT_WINDOW_MS = CIRCUIT_FAILURE_WINDOW_MS;
+export const CIRCUIT_COOLDOWN_MS = 30_000;
+export const CIRCUIT_PROBE_INTERVAL_MS = 5_000;
+
+export const RATE_LIMIT_EXHAUSTION_BUFFER = 1;
+export const RATE_LIMIT_PROBE_DELAY_MS = 50;
+export const RATE_LIMIT_MIN_PACE_INTERVAL_MS = 10;
+export const RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS = 10;
+
+export const DEFAULT_FALLBACK_CHAINS: Record<string, string[]> = {
+ "gpt-4o": ["gpt-4o-mini", "gpt-3.5"],
+};
diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts
new file mode 100644
index 000000000..fd2012cf5
--- /dev/null
+++ b/packages/core/src/traffic/traffic-controller-internal.ts
@@ -0,0 +1,57 @@
+import type { TrafficPriority, TrafficRequest, TrafficRequestType } from "./traffic-types";
+
+export type Scheduler = (callback: () => void) => void;
+
+export type DispatchDecision =
+ | { kind: "dispatch" }
+ | { kind: "skip" }
+ | { kind: "wait"; wakeUpAt?: number };
+
+export type CircuitStateStatus = "closed" | "open" | "half-open";
+
+export interface CircuitState {
+ status: CircuitStateStatus;
+ failureTimestamps: number[];
+ timeoutTimestamps: number[];
+ openedAt?: number;
+ trialInFlight?: boolean;
+ nextProbeAt?: number;
+}
+
+export interface RateLimitWindowState {
+ limit: number;
+ remaining: number;
+ resetAt: number;
+ reserved: number;
+ nextAllowedAt: number;
+}
+
+type BivariantHandler = {
+ bivarianceHack(...args: TArgs): void;
+}["bivarianceHack"];
+
+export interface QueuedRequest<TResponse = unknown> {
+ type: TrafficRequestType;
+ request: TrafficRequest;
+ resolve: BivariantHandler<[TResponse | PromiseLike<TResponse>]>;
+ reject: BivariantHandler<[reason?: unknown]>;
+ attempt: number;
+ priority: TrafficPriority;
+ tenantId: string;
+ enqueuedAt: number;
+ dispatchedAt?: number;
+ estimatedTokens?: number;
+ reservedTokens?: number;
+ queueTimeoutDisabled?: boolean;
+
+ tenantConcurrencyKey?: string;
+ providerModelConcurrencyKey?: string;
+
+ rateLimitKey?: string;
+ etaMs?: number;
+
+ circuitKey?: string;
+ circuitStatus?: CircuitStateStatus;
+
+ extractUsage?: TrafficRequest["extractUsage"];
+}
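+
+// Note: BivariantHandler uses method syntax on purpose so resolve/reject are
+// checked bivariantly, letting a QueuedRequest<string> live in collections
+// typed as QueuedRequest<unknown> without strict variance errors.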
diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
new file mode 100644
index 000000000..dee0719f8
--- /dev/null
+++ b/packages/core/src/traffic/traffic-controller.spec.ts
@@ -0,0 +1,804 @@
+import { describe, expect, it, vi } from "vitest";
+import { CIRCUIT_FAILURE_THRESHOLD, RATE_LIMIT_PROBE_DELAY_MS } from "./traffic-constants";
+import { TrafficController } from "./traffic-controller";
+
+describe("TrafficController priority scheduling", () => {
+ it("prioritizes P0 over lower priorities when runnable", async () => {
+ const controller = new TrafficController({ maxConcurrent: 1 });
+ const order: string[] = [];
+
+ const p1 = controller.handleText({
+ metadata: { provider: "p", model: "m1", priority: "P1" },
+ execute: async () => {
+ order.push("P1");
+ return "P1";
+ },
+ });
+
+ const p2 = controller.handleText({
+ metadata: { provider: "p", model: "m2", priority: "P2" },
+ execute: async () => {
+ order.push("P2");
+ return "P2";
+ },
+ });
+
+ const p0 = controller.handleText({
+ metadata: { provider: "p", model: "m0", priority: "P0" },
+ execute: async () => {
+ order.push("P0");
+ return "P0";
+ },
+ });
+
+ await Promise.all([p0, p1, p2]);
+
+ expect(order[0]).toBe("P0");
+ expect(order).toEqual(["P0", "P1", "P2"]);
+ });
+
+ it("allows lower priorities to proceed when a higher priority request is rate limited", async () => {
+ vi.useFakeTimers();
+
+ try {
+ vi.setSystemTime(new Date(0));
+ const controller = new TrafficController({ maxConcurrent: 1 });
+ controller.updateRateLimitFromHeaders(
+ { provider: "p0", model: "m0" },
+ {
+ "x-ratelimit-limit-requests": "1",
+ "x-ratelimit-remaining-requests": "0",
+ "x-ratelimit-reset-requests": "1s",
+ },
+ );
+
+ const order: string[] = [];
+
+ const p0 = controller.handleText({
+ metadata: { provider: "p0", model: "m0", priority: "P0" },
+ execute: async () => {
+ order.push("P0");
+ return "P0";
+ },
+ });
+
+ const p1 = controller.handleText({
+ metadata: { provider: "p1", model: "m1", priority: "P1" },
+ execute: async () => {
+ order.push("P1");
+ return "P1";
+ },
+ });
+
+ await vi.runAllTimersAsync();
+ await Promise.all([p0, p1]);
+
+ expect(order[0]).toBe("P1");
+ expect(order[1]).toBe("P0");
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+});
+
+describe("TrafficController concurrency limits", () => {
+ it("shares provider/model limits across tenants", async () => {
+ const controller = new TrafficController({
+ maxConcurrent: 2,
+ maxConcurrentPerProviderModel: 1,
+ });
+ const started: string[] = [];
+ let releaseFirst!: () => void;
+ const firstGate = new Promise<void>((resolve) => {
+ releaseFirst = resolve;
+ });
+
+ const first = controller.handleText({
+ tenantId: "tenant-a",
+ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+ execute: async () => {
+ started.push("tenant-a");
+ await firstGate;
+ return "a";
+ },
+ });
+
+ const second = controller.handleText({
+ tenantId: "tenant-b",
+ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+ execute: async () => {
+ started.push("tenant-b");
+ return "b";
+ },
+ });
+
+ await new Promise((resolve) => setTimeout(resolve, 0));
+ expect(started).toEqual(["tenant-a"]);
+
+ releaseFirst();
+ await Promise.all([first, second]);
+ expect(started).toEqual(["tenant-a", "tenant-b"]);
+ });
+});
+
+describe("TrafficController rate limit headers", () => {
+ it("parses OpenAI-style compound reset durations (e.g. 1m30.951s)", () => {
+ vi.useFakeTimers();
+
+ try {
+ vi.setSystemTime(new Date(1_000_000));
+ const controller = new TrafficController({ maxConcurrent: 1 });
+ const now = Date.now();
+
+ const result = controller.updateRateLimitFromHeaders(
+ { provider: "openai.responses", model: "gpt-4o-mini" },
+ {
+ "x-ratelimit-limit-requests": "10000",
+ "x-ratelimit-remaining-requests": "9989",
+ "x-ratelimit-reset-requests": "1m30.951s",
+ },
+ );
+
+ expect(result).toBeTruthy();
+ expect(result?.headerSnapshot.resetRequestsMs).toBeCloseTo(90_951, 6);
+ expect(result?.state.limit).toBe(10000);
+ expect(result?.state.remaining).toBe(9989);
+ expect(result?.state.resetAt).toBe(now + 90_951);
+ expect(result?.state.reserved).toBe(0);
+ expect(result?.state.nextAllowedAt).toBe(now);
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+
+ it("keeps resetAt monotonic when headers shorten the reset duration", () => {
+ vi.useFakeTimers();
+
+ try {
+ vi.setSystemTime(new Date(0));
+ const controller = new TrafficController({ maxConcurrent: 1 });
+
+ const first = controller.updateRateLimitFromHeaders(
+ { provider: "openai.responses", model: "gpt-4o-mini" },
+ {
+ "x-ratelimit-limit-requests": "10000",
+ "x-ratelimit-remaining-requests": "9999",
+ "x-ratelimit-reset-requests": "60s",
+ },
+ );
+
+ expect(first).toBeTruthy();
+ expect(first?.state.resetAt).toBe(60_000);
+
+ vi.setSystemTime(new Date(10_000));
+ const second = controller.updateRateLimitFromHeaders(
+ { provider: "openai.responses", model: "gpt-4o-mini" },
+ {
+ "x-ratelimit-limit-requests": "10000",
+ "x-ratelimit-remaining-requests": "9998",
+ "x-ratelimit-reset-requests": "5s",
+ },
+ );
+
+ expect(second).toBeTruthy();
+ expect(second?.state.resetAt).toBe(60_000);
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+
+ it("never increases remaining within the same window", () => {
+ vi.useFakeTimers();
+
+ try {
+ vi.setSystemTime(new Date(0));
+ const controller = new TrafficController({ maxConcurrent: 1 });
+
+ const first = controller.updateRateLimitFromHeaders(
+ { provider: "openai.responses", model: "gpt-4o-mini" },
+ {
+ "x-ratelimit-limit-requests": "10",
+ "x-ratelimit-remaining-requests": "9",
+ "x-ratelimit-reset-requests": "60s",
+ },
+ );
+
+ expect(first?.state.remaining).toBe(9);
+ expect(first?.state.resetAt).toBe(60_000);
+
+ vi.setSystemTime(new Date(10_000));
+ const second = controller.updateRateLimitFromHeaders(
+ { provider: "openai.responses", model: "gpt-4o-mini" },
+ {
+ "x-ratelimit-limit-requests": "10",
+ "x-ratelimit-remaining-requests": "8",
+ "x-ratelimit-reset-requests": "50s",
+ },
+ );
+
+ expect(second?.state.remaining).toBe(8);
+ expect(second?.state.resetAt).toBe(60_000);
+
+ vi.setSystemTime(new Date(20_000));
+ const third = controller.updateRateLimitFromHeaders(
+ { provider: "openai.responses", model: "gpt-4o-mini" },
+ {
+ "x-ratelimit-limit-requests": "10",
+ "x-ratelimit-remaining-requests": "9",
+ "x-ratelimit-reset-requests": "40s",
+ },
+ );
+
+ expect(third?.state.remaining).toBe(8);
+ expect(third?.state.resetAt).toBe(60_000);
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+
+ it("applies Retry-After even when x-ratelimit headers are missing", async () => {
+ vi.useFakeTimers();
+
+ try {
+ vi.setSystemTime(new Date(0));
+ const controller = new TrafficController({ maxConcurrent: 1 });
+ const order: string[] = [];
+
+ controller.updateRateLimitFromHeaders(
+ { provider: "p", model: "m" },
+ {
+ "retry-after": "2",
+ },
+ );
+
+ const p0 = controller.handleText({
+ metadata: { provider: "p", model: "m", priority: "P0" },
+ execute: async () => {
+ order.push("P0");
+ return "P0";
+ },
+ });
+
+ await vi.advanceTimersByTimeAsync(1_999);
+ expect(order).toEqual([]);
+
+ await vi.advanceTimersByTimeAsync(1);
+ await vi.runAllTimersAsync();
+ await p0;
+ expect(order).toEqual(["P0"]);
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+
+ it("shares rate limits across tenants for the same provider/model", async () => {
+ vi.useFakeTimers();
+
+ try {
+ vi.setSystemTime(new Date(0));
+ const controller = new TrafficController({ maxConcurrent: 1 });
+ controller.updateRateLimitFromHeaders(
+ { provider: "openai", model: "gpt-4o", tenantId: "tenant-a" },
+ {
+ "x-ratelimit-limit-requests": "1",
+ "x-ratelimit-remaining-requests": "0",
+ "x-ratelimit-reset-requests": "1s",
+ },
+ );
+
+ const order: string[] = [];
+ const request = controller.handleText({
+ tenantId: "tenant-b",
+ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+ execute: async () => {
+ order.push("tenant-b");
+ return "ok";
+ },
+ });
+
+ await vi.advanceTimersByTimeAsync(999);
+ await Promise.resolve();
+ expect(order).toEqual([]);
+
+ await vi.advanceTimersByTimeAsync(1);
+ await vi.runAllTimersAsync();
+ await request;
+
+ expect(order).toEqual(["tenant-b"]);
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+});
+
+describe("TrafficController token limits", () => {
+ it("blocks OpenAI when the token window is exhausted even without RPM config", async () => {
+ vi.useFakeTimers();
+
+ try {
+ vi.setSystemTime(new Date(0));
+ const controller = new TrafficController({
+ maxConcurrent: 1,
+ rateLimits: {
+ "openai::gpt-4o": {
+ requestsPerMinute: 0,
+ tokensPerMinute: 2,
+ },
+ },
+ });
+ const order: string[] = [];
+
+ const first = controller.handleText({
+ tenantId: "tenant-a",
+ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+ execute: async () => {
+ order.push("first");
+ return "first";
+ },
+ extractUsage: () => ({ totalTokens: 2 }),
+ });
+
+ const second = controller.handleText({
+ tenantId: "tenant-b",
+ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+ execute: async () => {
+ order.push("second");
+ return "second";
+ },
+ extractUsage: () => ({ totalTokens: 1 }),
+ });
+
+ await first;
+ expect(order).toEqual(["first"]);
+
+ await vi.advanceTimersByTimeAsync(60_000 + RATE_LIMIT_PROBE_DELAY_MS - 1);
+ expect(order).toEqual(["first"]);
+
+ await vi.advanceTimersByTimeAsync(1);
+ await vi.runAllTimersAsync();
+ await second;
+ expect(order).toEqual(["first", "second"]);
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+
+ it("reserves estimated tokens before dispatch", async () => {
+ vi.useFakeTimers();
+
+ try {
+ vi.setSystemTime(new Date(0));
+ const controller = new TrafficController({
+ maxConcurrent: 2,
+ rateLimits: {
+ "openai::gpt-4o": {
+ requestsPerMinute: 0,
+ tokensPerMinute: 2,
+ },
+ },
+ });
+ const order: string[] = [];
+ let releaseFirst!: () => void;
+ const firstGate = new Promise<void>((resolve) => {
+ releaseFirst = resolve;
+ });
+
+ const first = controller.handleText({
+ tenantId: "tenant-a",
+ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+ estimatedTokens: 2,
+ execute: async () => {
+ order.push("first");
+ await firstGate;
+ return "first";
+ },
+ extractUsage: () => ({ totalTokens: 2 }),
+ });
+
+ const second = controller.handleText({
+ tenantId: "tenant-b",
+ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+ estimatedTokens: 1,
+ execute: async () => {
+ order.push("second");
+ return "second";
+ },
+ extractUsage: () => ({ totalTokens: 1 }),
+ });
+
+ await Promise.resolve();
+ expect(order).toEqual(["first"]);
+
+ await vi.advanceTimersByTimeAsync(60_000 + RATE_LIMIT_PROBE_DELAY_MS - 1);
+ await Promise.resolve();
+ expect(order).toEqual(["first"]);
+
+ await vi.advanceTimersByTimeAsync(1);
+ await vi.runAllTimersAsync();
+ await Promise.resolve();
+ expect(order).toEqual(["first", "second"]);
+
+ releaseFirst();
+ await Promise.all([first, second]);
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+
+ it("allows token-only configs on non-OpenAI providers", async () => {
+ vi.useFakeTimers();
+
+ try {
+ vi.setSystemTime(new Date(0));
+ const controller = new TrafficController({
+ maxConcurrent: 2,
+ rateLimits: {
+ "p::m": {
+ requestsPerMinute: 0,
+ tokensPerMinute: 2,
+ },
+ },
+ });
+ const order: string[] = [];
+
+ const first = controller.handleText({
+ tenantId: "tenant-a",
+ metadata: { provider: "p", model: "m", priority: "P1" },
+ estimatedTokens: 2,
+ execute: async () => {
+ order.push("first");
+ return "first";
+ },
+ extractUsage: () => ({ totalTokens: 2 }),
+ });
+
+ const second = controller.handleText({
+ tenantId: "tenant-b",
+ metadata: { provider: "p", model: "m", priority: "P1" },
+ estimatedTokens: 1,
+ execute: async () => {
+ order.push("second");
+ return "second";
+ },
+ extractUsage: () => ({ totalTokens: 1 }),
+ });
+
+ await first;
+ expect(order).toEqual(["first"]);
+
+ await vi.advanceTimersByTimeAsync(29_999);
+ await Promise.resolve();
+ expect(order).toEqual(["first"]);
+
+ await vi.advanceTimersByTimeAsync(1);
+ await vi.runAllTimersAsync();
+ await second;
+ expect(order).toEqual(["first", "second"]);
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+
+ it("honors OpenAI token headers even without token config", async () => {
+ vi.useFakeTimers();
+
+ try {
+ vi.setSystemTime(new Date(0));
+ const controller = new TrafficController({ maxConcurrent: 1 });
+ controller.updateRateLimitFromHeaders(
+ { provider: "openai", model: "gpt-4o" },
+ {
+ "x-ratelimit-limit-tokens": "2",
+ "x-ratelimit-remaining-tokens": "0",
+ "x-ratelimit-reset-tokens": "1s",
+ },
+ );
+
+ const order: string[] = [];
+ const request = controller.handleText({
+ tenantId: "tenant-a",
+ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+ estimatedTokens: 1,
+ execute: async () => {
+ order.push("run");
+ return "ok";
+ },
+ });
+
+ await Promise.resolve();
+ expect(order).toEqual([]);
+
+ await vi.advanceTimersByTimeAsync(1_000 + RATE_LIMIT_PROBE_DELAY_MS - 1);
+ await Promise.resolve();
+ expect(order).toEqual([]);
+
+ await vi.advanceTimersByTimeAsync(1);
+ await vi.runAllTimersAsync();
+ await request;
+ expect(order).toEqual(["run"]);
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+});
+
+describe("TrafficController stream reporting", () => {
+ it("holds concurrency slots for streams until completion", async () => {
+ const controller = new TrafficController({ maxConcurrent: 1 });
+ const order: string[] = [];
+ const firstMetadata = {
+ provider: "p",
+ model: "m",
+ priority: "P1" as const,
+ tenantId: "tenant-a",
+ };
+ const secondMetadata = {
+ provider: "p",
+ model: "m",
+ priority: "P1" as const,
+ tenantId: "tenant-a",
+ };
+
+ const first = controller.handleStream({
+ tenantId: "tenant-a",
+ metadata: firstMetadata,
+ execute: async () => {
+ order.push("first");
+ return "first";
+ },
+ });
+
+ const second = controller.handleStream({
+ tenantId: "tenant-a",
+ metadata: secondMetadata,
+ execute: async () => {
+ order.push("second");
+ return "second";
+ },
+ });
+
+ await first;
+ await Promise.resolve();
+ expect(order).toEqual(["first"]);
+
+ controller.reportStreamSuccess(firstMetadata);
+ await Promise.resolve();
+ expect(order).toEqual(["first", "second"]);
+
+ controller.reportStreamSuccess(secondMetadata);
+ await Promise.all([first, second]);
+ });
+
+ it("slows down after stream 429 errors", async () => {
+ vi.useFakeTimers();
+
+ try {
+ vi.setSystemTime(new Date(0));
+ const controller = new TrafficController({
+ maxConcurrent: 1,
+ adaptiveLimiter: {
+ windowMs: 1_000,
+ threshold: 1,
+ minPenaltyMs: 10,
+ maxPenaltyMs: 10,
+ penaltyMultiplier: 1,
+ decayMs: 1_000,
+ },
+ });
+ const metadata = {
+ provider: "p",
+ model: "m",
+ priority: "P1" as const,
+ tenantId: "tenant-a",
+ };
+
+ controller.reportStreamFailure(
+ metadata,
+ Object.assign(new Error("rate limit"), { status: 429 }),
+ );
+
+ const order: string[] = [];
+ const request = controller.handleText({
+ tenantId: "tenant-a",
+ metadata,
+ execute: async () => {
+ order.push("run");
+ return "ok";
+ },
+ });
+
+ await Promise.resolve();
+ expect(order).toEqual([]);
+
+ await vi.advanceTimersByTimeAsync(9);
+ await Promise.resolve();
+ expect(order).toEqual([]);
+
+ await vi.advanceTimersByTimeAsync(1);
+ await vi.runAllTimersAsync();
+ await request;
+ expect(order).toEqual(["run"]);
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+
+ it("treats post-start stream failures as circuit breaker failures", async () => {
+ const controller = new TrafficController({
+ maxConcurrent: 1,
+ fallbackChains: {
+ primary: ["fallback"],
+ },
+ });
+ const tenantId = "tenant-1";
+ const metadata = { provider: "p", model: "primary", priority: "P1" as const };
+
+ await controller.handleStream({
+ tenantId,
+ metadata,
+ execute: async () => ({ ok: true }),
+ });
+
+ for (let i = 0; i < CIRCUIT_FAILURE_THRESHOLD; i += 1) {
+ controller.reportStreamFailure(metadata, new Error("stream-failure"));
+ }
+
+ const order: string[] = [];
+ await controller.handleStream({
+ tenantId,
+ metadata,
+ execute: async () => {
+ order.push("primary");
+ return "primary";
+ },
+ createFallbackRequest: (target) => ({
+ tenantId,
+ metadata: {
+ provider: "p",
+ model: typeof target === "string" ? target : target.model,
+ priority: "P1",
+ },
+ execute: async () => {
+ const modelId = typeof target === "string" ? target : target.model;
+ order.push(modelId);
+ return modelId;
+ },
+ }),
+ });
+
+ expect(order).toEqual(["fallback"]);
+ });
+});
+
+describe("TrafficController queue timeouts", () => {
+ it("times out queued requests even when max concurrency is saturated", async () => {
+ vi.useFakeTimers();
+
+ try {
+ vi.setSystemTime(new Date(0));
+ const controller = new TrafficController({ maxConcurrent: 1 });
+ const order: string[] = [];
+ let releaseFirst!: () => void;
+ const firstGate = new Promise<void>((resolve) => {
+ releaseFirst = resolve;
+ });
+
+ const first = controller.handleText({
+ tenantId: "tenant-a",
+ metadata: { provider: "p", model: "m", priority: "P1" },
+ execute: async () => {
+ order.push("first");
+ await firstGate;
+ return "first";
+ },
+ });
+
+ const second = controller.handleText({
+ tenantId: "tenant-a",
+ metadata: { provider: "p", model: "m", priority: "P1" },
+ maxQueueWaitMs: 1,
+ execute: async () => {
+ order.push("second");
+ return "second";
+ },
+ });
+ const secondExpectation = expect(second).rejects.toHaveProperty(
+ "name",
+ "QueueWaitTimeoutError",
+ );
+
+ await Promise.resolve();
+ expect(order).toEqual(["first"]);
+
+ await vi.advanceTimersByTimeAsync(2);
+ await vi.runAllTimersAsync();
+ await secondExpectation;
+ expect(order).toEqual(["first"]);
+
+ releaseFirst();
+ await vi.runAllTimersAsync();
+ await first;
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+
+ it("lets fallback requests wait after queue timeout without rejecting", async () => {
+ vi.useFakeTimers();
+
+ try {
+ vi.setSystemTime(new Date(0));
+ const controller = new TrafficController({
+ maxConcurrent: 1,
+ fallbackChains: {
+ "p::m": ["m-fallback"],
+ },
+ });
+ const order: string[] = [];
+ let releaseFirst!: () => void;
+ const firstGate = new Promise<void>((resolve) => {
+ releaseFirst = resolve;
+ });
+
+ const first = controller.handleText({
+ tenantId: "tenant-a",
+ metadata: { provider: "p", model: "m", priority: "P1" },
+ execute: async () => {
+ order.push("first");
+ await firstGate;
+ return "first";
+ },
+ });
+
+ const second = controller.handleText({
+ tenantId: "tenant-a",
+ metadata: { provider: "p", model: "m", priority: "P1" },
+ maxQueueWaitMs: 1,
+ execute: async () => {
+ order.push("primary");
+ return "primary";
+ },
+ createFallbackRequest: (target) => ({
+ tenantId: "tenant-a",
+ metadata: {
+ provider: "p",
+ model: typeof target === "string" ? target : target.model,
+ priority: "P1",
+ },
+ maxQueueWaitMs: 1,
+ execute: async () => {
+ order.push("fallback");
+ return "fallback";
+ },
+ }),
+ });
+
+ await Promise.resolve();
+ expect(order).toEqual(["first"]);
+
+ await vi.advanceTimersByTimeAsync(2);
+
+ const third = controller.handleText({
+ tenantId: "tenant-a",
+ metadata: { provider: "p", model: "other", priority: "P1" },
+ execute: async () => {
+ order.push("third");
+ return "third";
+ },
+ });
+
+ await Promise.resolve();
+ expect(order).toEqual(["first"]);
+
+ releaseFirst();
+ await vi.runAllTimersAsync();
+
+ await expect(second).resolves.toBe("fallback");
+ await Promise.all([first, third]);
+
+ expect(order).toEqual(["first", "fallback", "third"]);
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+});
diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts
new file mode 100644
index 000000000..90d56037c
--- /dev/null
+++ b/packages/core/src/traffic/traffic-controller.ts
@@ -0,0 +1,1408 @@
+import type { Logger } from "../logger";
+import { LoggerProxy } from "../logger";
+import { randomUUID } from "../utils/id";
+import { TrafficCircuitBreaker } from "./traffic-circuit-breaker";
+import { TrafficConcurrencyLimiter } from "./traffic-concurrency-limiter";
+import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal";
+import {
+ CircuitBreakerOpenError,
+ QueueWaitTimeoutError,
+ RateLimitedUpstreamError,
+ normalizeRateLimitError,
+} from "./traffic-errors";
+import {
+ OpenAIWindowRateLimitStrategy,
+ type RateLimitUpdateResult,
+ TokenBucketRateLimitStrategy,
+ TrafficRateLimiter,
+} from "./traffic-rate-limiter";
+import { buildRetryPlanWithPolicy } from "./traffic-retry";
+import type {
+ AdaptiveLimiterConfig,
+ FallbackChainEntry,
+ FallbackPolicy,
+ FallbackPolicyConfig,
+ FallbackPolicyMode,
+ FallbackTarget,
+ PriorityBurstLimits,
+ PriorityWeights,
+ ProviderModelConcurrencyLimit,
+ RateLimitConfig,
+ RateLimitKey,
+ RateLimitStrategyConfig,
+ RateLimitStrategyKind,
+ RetryPlan,
+ RetryPolicyConfig,
+ TenantConcurrencyLimit,
+ TenantUsage,
+ TrafficControllerOptions,
+ TrafficPriority,
+ TrafficRequest,
+ TrafficRequestMetadata,
+ TrafficRequestType,
+ TrafficResponseMetadata,
+} from "./traffic-types";
+import { TrafficUsageTracker } from "./traffic-usage-tracker";
+
+/* ============================================================
+ * Traffic Controller
+ * ============================================================
+ */
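+//
+// Example (mirroring traffic-controller.spec.ts; callProvider is a
+// placeholder for an actual provider call):
+//
+//   const controller = new TrafficController({ maxConcurrent: 2 });
+//   const reply = await controller.handleText({
+//     tenantId: "tenant-a",
+//     metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+//     execute: async () => callProvider(),
+//   });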
+
+export type {
+ AdaptiveLimiterConfig,
+ FallbackChainEntry,
+ FallbackPolicy,
+ FallbackPolicyConfig,
+ FallbackPolicyMode,
+ FallbackTarget,
+ PriorityBurstLimits,
+ PriorityWeights,
+ ProviderModelConcurrencyLimit,
+ RateLimitConfig,
+ RateLimitKey,
+ RateLimitStrategyConfig,
+ RateLimitStrategyKind,
+ TenantConcurrencyLimit,
+ TenantUsage,
+ TrafficControllerOptions,
+ TrafficPriority,
+ TrafficRequest,
+ TrafficRequestMetadata,
+ TrafficResponseMetadata,
+ TrafficRequestType,
+};
+
+export { CircuitBreakerOpenError, QueueWaitTimeoutError, RateLimitedUpstreamError };
+
+type TenantQueueState = {
+ order: string[];
+ index: number;
+ queues: Map<string, QueuedRequest<unknown>[]>;
+};
+
+type RateLimitSnapshot = {
+ limit?: number;
+ remaining?: number;
+ resetAt?: number;
+ nextAllowedAt?: number;
+ retryAfterMs?: number;
+};
+
+type AdaptiveLimiterState = {
+ recent429s: number[];
+ penaltyMs: number;
+ cooldownUntil?: number;
+ last429At?: number;
+};
+
+const DEFAULT_PRIORITY_WEIGHTS: Record<TrafficPriority, number> = {
+ P0: 5,
+ P1: 3,
+ P2: 2,
+};
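+// Scheduling weights: each priority earns dispatch credits in this ratio, so
+// P0 work drains fastest while P1/P2 still make progress (see priorityCredits
+// below).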
+
+const DEFAULT_ADAPTIVE_LIMITER: Required<AdaptiveLimiterConfig> = {
+ windowMs: 30_000,
+ threshold: 3,
+ minPenaltyMs: 500,
+ maxPenaltyMs: 10_000,
+ penaltyMultiplier: 2,
+ decayMs: 10_000,
+};
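+// Default adaptive-limiter tuning: three 429s within a 30s window trigger a
+// dispatch penalty starting at 500ms, doubling on repeat up to 10s, and the
+// penalty decays once 10s pass without another 429.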
+
+export class TrafficController {
+ /* ---------- Core ---------- */
+
+ private readonly scheduler: Scheduler;
+ private readonly maxConcurrent: number;
+ private readonly rateLimitKeyBuilder: (metadata?: TrafficRequestMetadata) => string;
+ private readonly retryPolicy?: RetryPolicyConfig;
+ private readonly logger: Logger;
+ private readonly trafficLogger: Logger;
+ private readonly controllerLogger: Logger;
+ private readonly concurrencyLimiter: TrafficConcurrencyLimiter;
+
+ private readonly queues: Record<TrafficPriority, TenantQueueState> = {
+ P0: { order: [], index: 0, queues: new Map() },
+ P1: { order: [], index: 0, queues: new Map() },
+ P2: { order: [], index: 0, queues: new Map() },
+ };
+ private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"];
+ private readonly priorityWeights: Record<TrafficPriority, number>;
+ private readonly priorityCredits: Record<TrafficPriority, number>;
+
+ private activeCount = 0;
+ private drainScheduled = false;
+ private readonly inFlightStreams = new Map();
+
+ /* ---------- Rate limits ---------- */
+ private readonly rateLimiter: TrafficRateLimiter;
+
+ /* ---------- Circuit breakers ---------- */
+ private readonly circuitBreaker: TrafficCircuitBreaker;
+
+ /* ---------- Usage ---------- */
+ private readonly usageTracker = new TrafficUsageTracker();
+
+ /* ---------- Traffic metadata ---------- */
+ private readonly rateLimitSnapshots = new Map<string, RateLimitSnapshot>();
+
+ /* ---------- Adaptive limiter ---------- */
+ private readonly adaptiveLimiterConfig: Required<AdaptiveLimiterConfig>;