From 96e17a2448e5bf883ab18eb96957fdecc1a306ea Mon Sep 17 00:00:00 2001 From: riturajFi Date: Fri, 28 Nov 2025 18:03:57 +0530 Subject: [PATCH 01/41] feat: rate limit v1 --- packages/core/src/agent/agent.ts | 41 +++ packages/core/src/index.ts | 8 + .../core/src/traffic/traffic-controller.ts | 267 ++++++++++++++++++ 3 files changed, 316 insertions(+) create mode 100644 packages/core/src/traffic/traffic-controller.ts diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index 291bdf7fd..b69db86f7 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -48,6 +48,7 @@ import type { BaseRetriever } from "../retriever/retriever"; import type { Tool, Toolkit } from "../tool"; import { createTool } from "../tool"; import { ToolManager } from "../tool/manager"; +import { type TrafficRequestMetadata, getTrafficController } from "../traffic/traffic-controller"; import { randomUUID } from "../utils/id"; import { convertModelMessagesToUIMessages } from "../utils/message-converter"; import { NodeType, createNodeId } from "../utils/node-utils"; @@ -444,6 +445,17 @@ export class Agent { async generateText( input: string | UIMessage[] | BaseMessage[], options?: GenerateTextOptions, + ): Promise { + const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics + return controller.handleText({ + metadata: this.buildTrafficMetadata(), // Pass model/provider info for future rate limiting keys + execute: () => this.executeGenerateText(input, options), // Defer actual execution so controller can schedule it + }); + } + + private async executeGenerateText( + input: string | UIMessage[] | BaseMessage[], + options?: GenerateTextOptions, ): Promise { const startTime = Date.now(); const oc = this.createOperationContext(input, options); @@ -771,6 +783,17 @@ export class Agent { async streamText( input: string | UIMessage[] | BaseMessage[], options?: StreamTextOptions, + ): Promise { + const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent + return controller.handleStream({ + metadata: this.buildTrafficMetadata(), // Include identifiers to support per-provider/model policies later + execute: () => this.executeStreamText(input, options), // Actual streaming work happens after the controller dequeues us + }); + } + + private async executeStreamText( + input: string | UIMessage[] | BaseMessage[], + options?: StreamTextOptions, ): Promise { const startTime = Date.now(); const oc = this.createOperationContext(input, options); @@ -3822,6 +3845,24 @@ export class Agent { return this.subAgentManager.calculateMaxSteps(this.maxSteps); } + private buildTrafficMetadata(): TrafficRequestMetadata { + // Capture provider if the model object exposes it; fallback is undefined to avoid bad assumptions + const provider = + typeof this.model === "object" && + this.model !== null && + "provider" in this.model && + typeof (this.model as any).provider === "string" + ? 
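      // Duck-typed sniff: AI SDK language-model objects generally expose a string
      // `provider` field; plain string ids and dynamic factories fall through to
      // undefined instead of guessing a provider.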
((this.model as any).provider as string) + : undefined; + + return { + agentId: this.id, // Identify which agent issued the request + agentName: this.name, // Human-readable label for logs/metrics + model: this.getModelName(), // Used for future capacity policies + provider, // Allows per-provider throttling later + }; + } + /** * Get the model name */ diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 8753f0391..e1e5ddb8e 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -21,6 +21,14 @@ export type { WorkflowTimelineEvent, RegisteredWorkflow, } from "./workflow"; +export { + // Surface traffic controller so downstream consumers can route agent calls through the shared scheduler + TrafficController, + getTrafficController, + type TrafficRequest, + type TrafficRequestMetadata, + type TrafficRequestType, +} from "./traffic/traffic-controller"; // Export new Agent from agent.ts export { Agent, diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts new file mode 100644 index 000000000..a87beb3b7 --- /dev/null +++ b/packages/core/src/traffic/traffic-controller.ts @@ -0,0 +1,267 @@ +import type { Logger } from "@voltagent/internal"; +import { randomUUID } from "../utils/id"; + +type Scheduler = (callback: () => void) => void; +type BivariantHandler = { + bivarianceHack(...args: TArgs): void; +}["bivarianceHack"]; + +interface RateLimitBucket { + tokens: number; + capacity: number; + refillPerMs: number; + lastRefill: number; +} + +export interface RateLimitOptions { + capacity: number; + refillPerSecond: number; +} + +export type TrafficRequestType = "text" | "stream"; + +export interface TrafficRequestMetadata { + agentId?: string; + agentName?: string; + model?: string; + provider?: string; +} + +export interface TrafficRequest { + metadata?: TrafficRequestMetadata; + execute: () => Promise; +} + +interface QueuedRequest { + id: string; + type: TrafficRequestType; + request: TrafficRequest; + resolve: BivariantHandler<[TResponse | PromiseLike]>; + reject: BivariantHandler<[reason?: unknown]>; +} + +export interface TrafficControllerOptions { + logger?: Logger; + maxConcurrent?: number; + rateLimit?: RateLimitOptions; +} + +// Centralized traffic controller responsible for scheduling LLM calls. +// Provides a FIFO queue with a non-blocking scheduler and entrypoints +// for text and stream traffic. +export class TrafficController { + private readonly scheduler: Scheduler; + private readonly maxConcurrent: number; + private readonly rateLimit?: { capacity: number; refillPerMs: number }; + private readonly rateLimitBuckets = new Map(); + private logger?: Logger; + private queue: QueuedRequest[] = []; + private activeCount = 0; + private drainScheduled = false; + private refillTimeout?: ReturnType; + + constructor(options: TrafficControllerOptions = {}) { + this.logger = options.logger; // Allow caller to plug in their logger for observability + this.maxConcurrent = options.maxConcurrent ?? 
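    // Worked example, assuming a hypothetical { capacity: 10, refillPerSecond: 2 }:
    // refillPerMs = 2 / 1000 = 0.002, so 1500ms of idle time restores
    // 1500 * 0.002 = 3 tokens (capped at capacity); each dispatch consumes one.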
Number.POSITIVE_INFINITY; // Concurrency guard; defaults to no cap for now + this.scheduler = this.createScheduler(); // Select scheduler once so the rest of the code can stay simple + if ( + options.rateLimit && + options.rateLimit.capacity > 0 && + options.rateLimit.refillPerSecond > 0 + ) { + this.rateLimit = { + capacity: options.rateLimit.capacity, + refillPerMs: options.rateLimit.refillPerSecond / 1000, // Convert to ms once so the math later stays simple + }; + } + } + + setLogger(logger?: Logger): void { + this.logger = logger; // Update logger when the singleton is reused across agents + } + + handleText(request: TrafficRequest): Promise { + // Route text generation requests into the queue so all LLM calls share the same scheduler + return this.enqueue("text", request); + } + + handleStream(request: TrafficRequest): Promise { + // Route streaming requests through the same queue to preserve ordering/backpressure rules + return this.enqueue("stream", request); + } + + private createScheduler(): Scheduler { + // Prefer queueMicrotask to keep the drain loop snappy without starving the event loop + if (typeof queueMicrotask === "function") { + return queueMicrotask; + } + + return (callback: () => void) => setTimeout(callback, 0); + } + + private enqueue( + type: TrafficRequestType, + request: TrafficRequest, + ): Promise { + // Each request gets a promise so callers can await their own result + return new Promise((resolve, reject) => { + // Collect the work item and metadata + this.queue.push({ + id: randomUUID(), + type, + request, + resolve, + reject, + }); + + // Emit trace-friendly breadcrumb for observability + this.logger?.debug?.("[TrafficController] enqueued", { + type, + queueSize: this.queue.length, + metadata: request.metadata, + }); + + // Kick the drain loop to start handling work + this.scheduleDrain(); + }); + } + + private scheduleDrain(): void { + if (this.drainScheduled) { + return; + } + + this.drainScheduled = true; // Prevent redundant scheduling when many requests arrive at once + this.scheduler(() => { + this.drainScheduled = false; + this.drainQueue(); // Drain asynchronously so we never block the caller's tick + }); + } + + private drainQueue(): void { + // Pull as many items as we can until we hit capacity or rate limits + while (this.queue.length > 0) { + const next = this.queue[0]; // Peek without removing so we only dequeue when we can process + if (!next) { + break; + } + if (!this.canProcess(next)) { + return; // Stop early; drain will be rescheduled once capacity frees up + } + + this.queue.shift(); // Remove after we've confirmed we can process + this.activeCount++; // Track in-flight work to enforce concurrency guard + + this.logger?.debug?.("[TrafficController] dispatch", { + type: next.type, + queueSize: this.queue.length, + active: this.activeCount, + metadata: next.request.metadata, + }); + + void this.runRequest(next); // Fire off processing without blocking the loop + } + } + + private canProcess(next: QueuedRequest): boolean { + if (this.activeCount >= this.maxConcurrent) { + return false; + } + + if (!this.rateLimit) { + return true; // No rate limit configured + } + + // Token bucket guard: only proceed when a token is available + const bucket = this.getRateLimitBucket(next.request.metadata); + if (bucket.tokens < 1) { + this.scheduleRefill(); // Ensure we retry as soon as tokens are replenished + return false; + } + + bucket.tokens -= 1; // Consume a token for this dispatch + return true; + } + + private getRateLimitBucket(metadata?: 
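  // Buckets are keyed via buildRateLimitKey as `${provider}::${model}`, so e.g.
  // "openai::gpt-4o" and "openai::gpt-4o-mini" refill and throttle independently.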
TrafficRequestMetadata): RateLimitBucket { + const rateLimit = this.rateLimit; + if (!rateLimit) { + throw new Error("Rate limit bucket requested without rate limit configuration"); + } + + const key = this.buildRateLimitKey(metadata); // Group by provider+model so they share limits + const now = Date.now(); // Snapshot time once to avoid drift within this method + let bucket = this.rateLimitBuckets.get(key); // Reuse the bucket if it already exists + + if (!bucket) { + // First request for this key: create a fresh bucket at full capacity + bucket = { + tokens: rateLimit.capacity, + capacity: rateLimit.capacity, + refillPerMs: rateLimit.refillPerMs, + lastRefill: now, + }; + this.rateLimitBuckets.set(key, bucket); + return bucket; + } + + const elapsedMs = Math.max(0, now - bucket.lastRefill); + if (elapsedMs > 0 && bucket.tokens < bucket.capacity) { + const refilled = elapsedMs * bucket.refillPerMs; // Refill based on elapsed time + bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refilled); // Cap at bucket capacity + bucket.lastRefill = now; // Mark refill time for the next calculation + } + + return bucket; + } + + private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { + const provider = metadata?.provider ?? "default-provider"; + const model = metadata?.model ?? "default-model"; + return `${provider}::${model}`; + } + + private scheduleRefill(): void { + if (this.refillTimeout || !this.rateLimit) { + return; + } + + const delayMs = Math.max(1, Math.ceil(1 / this.rateLimit.refillPerMs)); // Wait long enough for at least one token + this.refillTimeout = setTimeout(() => { + this.refillTimeout = undefined; // Allow future refills to be scheduled + this.scheduleDrain(); // Try draining again now that tokens should exist + }, delayMs); + } + + private async runRequest(item: QueuedRequest): Promise { + try { + const result = await item.request.execute(); // Execute the user's operation + item.resolve(result); // Deliver successful result back to the waiting caller + } catch (error) { + item.reject(error); // Surface failures to the caller + } finally { + this.activeCount = Math.max(0, this.activeCount - 1); // Ensure counter never underflows + this.scheduleDrain(); // Immediately try to pull the next request + } + } +} + +declare global { + // eslint-disable-next-line no-var + var ___voltagent_traffic_controller: TrafficController | undefined; +} + +/** + * Retrieve the shared traffic controller instance. 
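 * @example
 * // Hypothetical bootstrap; the numbers are illustrative, not defaults.
 * const controller = getTrafficController({
 *   maxConcurrent: 4,
 *   rateLimit: { capacity: 10, refillPerSecond: 2 },
 * });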
+ */ +export function getTrafficController(options?: TrafficControllerOptions): TrafficController { + if (!globalThis.___voltagent_traffic_controller) { + // Create a singleton controller so all agents share the same queue/scheduling behavior + globalThis.___voltagent_traffic_controller = new TrafficController(options); + } else if (options?.logger) { + // Update logger when caller provides a new one, keeping the singleton instance alive + globalThis.___voltagent_traffic_controller.setLogger(options.logger); + } + + return globalThis.___voltagent_traffic_controller; +} From cf6846e050641c5ad8204e3ab675aef2a2d6e176 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Sat, 29 Nov 2025 15:06:25 +0530 Subject: [PATCH 02/41] feat: introduced RPM --- packages/core/src/agent/agent.ts | 107 +++- packages/core/src/index.ts | 3 + .../core/src/traffic/traffic-controller.ts | 476 ++++++++++++++++-- 3 files changed, 526 insertions(+), 60 deletions(-) diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index b69db86f7..f6ac232e3 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -446,16 +446,18 @@ export class Agent { input: string | UIMessage[] | BaseMessage[], options?: GenerateTextOptions, ): Promise { - const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics + const controller = getTrafficController(); // Use shared controller so all agent calls flow through central queue/metrics + const trafficMetadata = this.buildTrafficMetadata(); return controller.handleText({ - metadata: this.buildTrafficMetadata(), // Pass model/provider info for future rate limiting keys - execute: () => this.executeGenerateText(input, options), // Defer actual execution so controller can schedule it + metadata: trafficMetadata, // Pass model/provider info for future rate limiting keys + execute: () => this.executeGenerateText(input, options, trafficMetadata), // Defer actual execution so controller can schedule it }); } private async executeGenerateText( input: string | UIMessage[] | BaseMessage[], options?: GenerateTextOptions, + trafficMetadata?: TrafficRequestMetadata, ): Promise { const startTime = Date.now(); const oc = this.createOperationContext(input, options); @@ -579,6 +581,11 @@ export class Agent { let result!: GenerateTextResult; try { + methodLogger.info("[AI SDK] Calling generateText", { + messageCount: messages.length, + modelName, + tools: tools ? Object.keys(tools) : [], + }); result = await oc.traceContext.withSpan(llmSpan, () => generateText({ model, @@ -600,6 +607,13 @@ export class Agent { onStepFinish: this.createStepHandler(oc, options), }), ); + methodLogger.info("[AI SDK] Received generateText result", { + finishReason: result.finishReason, + usage: result.usage ? safeStringify(result.usage) : undefined, + stepCount: result.steps?.length ?? 
0, + rawResult: safeStringify(result), + }); + this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger); } catch (error) { finalizeLLMSpan(SpanStatusCode.ERROR, { message: (error as Error).message }); throw error; @@ -784,16 +798,18 @@ export class Agent { input: string | UIMessage[] | BaseMessage[], options?: StreamTextOptions, ): Promise { - const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent + const controller = getTrafficController(); // Same controller handles streaming to keep ordering/backpressure consistent + const trafficMetadata = this.buildTrafficMetadata(); return controller.handleStream({ - metadata: this.buildTrafficMetadata(), // Include identifiers to support per-provider/model policies later - execute: () => this.executeStreamText(input, options), // Actual streaming work happens after the controller dequeues us + metadata: trafficMetadata, // Include identifiers to support per-provider/model policies later + execute: () => this.executeStreamText(input, options, trafficMetadata), // Actual streaming work happens after the controller dequeues us }); } private async executeStreamText( input: string | UIMessage[] | BaseMessage[], options?: StreamTextOptions, + trafficMetadata?: TrafficRequestMetadata, ): Promise { const startTime = Date.now(); const oc = this.createOperationContext(input, options); @@ -917,6 +933,11 @@ export class Agent { }); const finalizeLLMSpan = this.createLLMSpanFinalizer(llmSpan); + methodLogger.info("[AI SDK] Calling streamText", { + messageCount: messages.length, + modelName, + tools: tools ? Object.keys(tools) : [], + }); const result = streamText({ model, messages, @@ -985,6 +1006,17 @@ export class Agent { .catch(() => {}); }, onFinish: async (finalResult) => { + methodLogger.info("[AI SDK] streamText finished", { + finishReason: finalResult.finishReason, + usage: finalResult.totalUsage ? safeStringify(finalResult.totalUsage) : undefined, + stepCount: finalResult.steps?.length ?? 0, + rawResult: safeStringify(finalResult), + }); + this.updateTrafficControllerRateLimits( + finalResult.response, + trafficMetadata, + methodLogger, + ); const providerUsage = finalResult.usage ? await Promise.resolve(finalResult.usage) : undefined; @@ -1537,6 +1569,11 @@ export class Agent { ...aiSDKOptions } = options || {}; + methodLogger.info("[AI SDK] Calling generateObject", { + messageCount: messages.length, + modelName, + schemaName, + }); const result = await generateObject({ model, messages, @@ -1553,6 +1590,12 @@ export class Agent { // VoltAgent controlled abortSignal: oc.abortController.signal, }); + methodLogger.info("[AI SDK] Received generateObject result", { + finishReason: result.finishReason, + usage: result.usage ? 
safeStringify(result.usage) : undefined, + warnings: result.warnings, + rawResult: safeStringify(result), + }); const usageInfo = convertUsage(result.usage); const finalObject = await executeOutputGuardrails({ @@ -1770,6 +1813,11 @@ export class Agent { let resolveGuardrailObject: ((value: z.infer) => void) | undefined; let rejectGuardrailObject: ((reason: unknown) => void) | undefined; + methodLogger.info("[AI SDK] Calling streamObject", { + messageCount: messages.length, + modelName, + schemaName, + }); const result = streamObject({ model, messages, @@ -1823,6 +1871,11 @@ export class Agent { }, onFinish: async (finalResult: any) => { try { + methodLogger.info("[AI SDK] streamObject finished", { + finishReason: finalResult.finishReason, + usage: finalResult.usage ? safeStringify(finalResult.usage) : undefined, + rawResult: safeStringify(finalResult), + }); const usageInfo = convertUsage(finalResult.usage as any); let finalObject = finalResult.object as z.infer; if (guardrailSet.output.length > 0) { @@ -3863,6 +3916,48 @@ export class Agent { }; } + private updateTrafficControllerRateLimits( + response: unknown, + metadata: TrafficRequestMetadata | undefined, + logger?: Logger, + ): void { + if (!response || typeof response !== "object") { + logger?.debug?.("[Traffic] No response object available for rate limit update"); + return; + } + + const responseWithHeaders = response as { headers?: unknown } | null; + const headers = responseWithHeaders?.headers; + if (!headers) { + logger?.debug?.("[Traffic] Response missing headers; skipping rate limit update"); + return; + } + + const controller = getTrafficController(); + const updateResult = controller.updateRateLimitFromHeaders( + metadata ?? this.buildTrafficMetadata(), + headers, + ); + + if (!updateResult) { + logger?.debug?.("[Traffic] No rate limit headers applied from response"); + return; + } + + const refillPerSecond = updateResult.normalized.refillPerMs * 1000; + logger?.info?.("[Traffic] Applied rate limit from response headers", { + rateLimitKey: updateResult.key, + capacity: updateResult.normalized.capacity, + refillPerSecond, + appliedTokens: updateResult.appliedTokens, + headers: { + limitRequests: updateResult.headerSnapshot.limitRequests, + remainingRequests: updateResult.headerSnapshot.remainingRequests, + resetRequestsMs: updateResult.headerSnapshot.resetRequestsMs, + }, + }); + } + /** * Get the model name */ diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index e1e5ddb8e..665bfa424 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -25,6 +25,9 @@ export { // Surface traffic controller so downstream consumers can route agent calls through the shared scheduler TrafficController, getTrafficController, + type RateLimitConfig, + type RateLimitKey, + type RateLimitOptions, type TrafficRequest, type TrafficRequestMetadata, type TrafficRequestType, diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index a87beb3b7..c412b1389 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -1,6 +1,3 @@ -import type { Logger } from "@voltagent/internal"; -import { randomUUID } from "../utils/id"; - type Scheduler = (callback: () => void) => void; type BivariantHandler = { bivarianceHack(...args: TArgs): void; @@ -13,11 +10,32 @@ interface RateLimitBucket { lastRefill: number; } +type NormalizedRateLimit = { + capacity: number; + refillPerMs: number; +}; + export interface 
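// Per-key bucket settings consumed by RateLimitConfig below; keys follow the
// `${provider}::${model}` convention. A hypothetical config:
//   { "openai::gpt-4o": { capacity: 500, refillPerSecond: 8 } }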
RateLimitOptions { capacity: number; refillPerSecond: number; } +export type RateLimitKey = string; +export type RateLimitConfig = Record; + +type RateLimitHeaderSnapshot = { + limitRequests: number; + remainingRequests?: number; + resetRequestsMs: number; +}; + +export type RateLimitUpdateResult = { + key: string; + headerSnapshot: RateLimitHeaderSnapshot; + normalized: NormalizedRateLimit; + appliedTokens: number; +}; + export type TrafficRequestType = "text" | "stream"; export interface TrafficRequestMetadata { @@ -33,17 +51,17 @@ export interface TrafficRequest { } interface QueuedRequest { - id: string; type: TrafficRequestType; request: TrafficRequest; resolve: BivariantHandler<[TResponse | PromiseLike]>; reject: BivariantHandler<[reason?: unknown]>; + etaMs?: number; + rateLimitKey?: string; } export interface TrafficControllerOptions { - logger?: Logger; maxConcurrent?: number; - rateLimit?: RateLimitOptions; + rateLimits?: RateLimitConfig; } // Centralized traffic controller responsible for scheduling LLM calls. @@ -52,32 +70,27 @@ export interface TrafficControllerOptions { export class TrafficController { private readonly scheduler: Scheduler; private readonly maxConcurrent: number; - private readonly rateLimit?: { capacity: number; refillPerMs: number }; + private rateLimits?: Map; private readonly rateLimitBuckets = new Map(); - private logger?: Logger; private queue: QueuedRequest[] = []; private activeCount = 0; private drainScheduled = false; private refillTimeout?: ReturnType; - constructor(options: TrafficControllerOptions = {}) { - this.logger = options.logger; // Allow caller to plug in their logger for observability - this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; // Concurrency guard; defaults to no cap for now - this.scheduler = this.createScheduler(); // Select scheduler once so the rest of the code can stay simple - if ( - options.rateLimit && - options.rateLimit.capacity > 0 && - options.rateLimit.refillPerSecond > 0 - ) { - this.rateLimit = { - capacity: options.rateLimit.capacity, - refillPerMs: options.rateLimit.refillPerSecond / 1000, // Convert to ms once so the math later stays simple - }; + private logDebug(message: string, details?: Record): void { + if (typeof console?.debug === "function") { + console.debug(message, details); } } - setLogger(logger?: Logger): void { - this.logger = logger; // Update logger when the singleton is reused across agents + constructor(options: TrafficControllerOptions = {}) { + this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; // Concurrency guard; defaults to no cap for now + this.rateLimits = this.normalizeRateLimits(options.rateLimits); + this.scheduler = this.createScheduler(); // Select scheduler once so the rest of the code can stay simple + this.logDebug("[TrafficController] init", { + maxConcurrent: this.maxConcurrent, + rateLimits: this.rateLimits ? 
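      // Entries log as [key, { capacity, refillPerMs }] pairs; configs with a
      // non-positive capacity or refillPerSecond were already dropped by
      // normalizeRateLimits, so everything listed here is enforceable.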
Array.from(this.rateLimits.entries()) : undefined, + }); } handleText(request: TrafficRequest): Promise { @@ -107,15 +120,13 @@ export class TrafficController { return new Promise((resolve, reject) => { // Collect the work item and metadata this.queue.push({ - id: randomUUID(), type, request, resolve, reject, }); - // Emit trace-friendly breadcrumb for observability - this.logger?.debug?.("[TrafficController] enqueued", { + this.logDebug("[TrafficController] enqueue", { type, queueSize: this.queue.length, metadata: request.metadata, @@ -132,8 +143,13 @@ export class TrafficController { } this.drainScheduled = true; // Prevent redundant scheduling when many requests arrive at once + this.logDebug("[TrafficController] scheduleDrain", { queueSize: this.queue.length }); this.scheduler(() => { this.drainScheduled = false; + this.logDebug("[TrafficController] drainLoopStart", { + queueSize: this.queue.length, + active: this.activeCount, + }); this.drainQueue(); // Drain asynchronously so we never block the caller's tick }); } @@ -152,10 +168,12 @@ export class TrafficController { this.queue.shift(); // Remove after we've confirmed we can process this.activeCount++; // Track in-flight work to enforce concurrency guard - this.logger?.debug?.("[TrafficController] dispatch", { + this.logDebug("[TrafficController] dispatch", { type: next.type, queueSize: this.queue.length, active: this.activeCount, + etaMs: next.etaMs, + rateLimitKey: next.rateLimitKey, metadata: next.request.metadata, }); @@ -165,103 +183,453 @@ export class TrafficController { private canProcess(next: QueuedRequest): boolean { if (this.activeCount >= this.maxConcurrent) { + this.logDebug("[TrafficController] throttle concurrency", { + active: this.activeCount, + maxConcurrent: this.maxConcurrent, + }); return false; } - if (!this.rateLimit) { - return true; // No rate limit configured + const rateLimitConfig = this.getRateLimitConfig(next.request.metadata); + if (!rateLimitConfig) { + this.logDebug("[TrafficController] no rate limit match", { + metadata: next.request.metadata, + }); + next.rateLimitKey = undefined; + next.etaMs = 0; + return true; // No rate limit configured for this key } - // Token bucket guard: only proceed when a token is available - const bucket = this.getRateLimitBucket(next.request.metadata); + const queuedAhead = this.countQueuedAheadWithKey( + rateLimitConfig.key, + next, + /*logDetails*/ true, + ); + const bucket = this.getRateLimitBucket(rateLimitConfig.key, rateLimitConfig.limit); if (bucket.tokens < 1) { - this.scheduleRefill(); // Ensure we retry as soon as tokens are replenished + next.rateLimitKey = rateLimitConfig.key; + next.etaMs = this.computeEtaMs( + bucket, + rateLimitConfig.limit, + rateLimitConfig.key, + next, + queuedAhead, + ); + this.logDebug("[TrafficController] throttle rate", { + key: rateLimitConfig.key, + tokens: bucket.tokens, + etaMs: next.etaMs, + queuedAhead, + }); + this.scheduleRefill(rateLimitConfig.limit); // Ensure we retry as soon as tokens are replenished return false; } bucket.tokens -= 1; // Consume a token for this dispatch + this.logDebug("[TrafficController] token consumed", { + key: rateLimitConfig.key, + remaining: bucket.tokens, + capacity: bucket.capacity, + }); + next.rateLimitKey = rateLimitConfig.key; + next.etaMs = 0; return true; } - private getRateLimitBucket(metadata?: TrafficRequestMetadata): RateLimitBucket { - const rateLimit = this.rateLimit; - if (!rateLimit) { - throw new Error("Rate limit bucket requested without rate limit configuration"); + private 
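  // Dispatch gate order (see canProcess above): (1) concurrency cap, (2) per-key
  // config lookup, (3) token availability; a miss at (3) records an ETA on the
  // queued item and schedules a refill-driven retry of the drain loop.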
getRateLimitConfig( + metadata?: TrafficRequestMetadata, + ): { key: string; limit: NormalizedRateLimit } | undefined { + if (!this.rateLimits || this.rateLimits.size === 0) { + return undefined; + } + + const key = this.buildRateLimitKey(metadata); + const limit = this.rateLimits.get(key); + if (!limit) { + return undefined; } - const key = this.buildRateLimitKey(metadata); // Group by provider+model so they share limits + this.logDebug("[TrafficController] rateLimitConfig hit", { key }); + return { key, limit }; + } + + private getRateLimitBucket(key: string, limit: NormalizedRateLimit): RateLimitBucket { const now = Date.now(); // Snapshot time once to avoid drift within this method let bucket = this.rateLimitBuckets.get(key); // Reuse the bucket if it already exists if (!bucket) { - // First request for this key: create a fresh bucket at full capacity bucket = { - tokens: rateLimit.capacity, - capacity: rateLimit.capacity, - refillPerMs: rateLimit.refillPerMs, + tokens: limit.capacity, + capacity: limit.capacity, + refillPerMs: limit.refillPerMs, lastRefill: now, }; this.rateLimitBuckets.set(key, bucket); + this.logDebug("[TrafficController] bucket create", { + key, + capacity: bucket.capacity, + refillPerMs: bucket.refillPerMs, + }); return bucket; } + if ( + bucket.capacity !== limit.capacity || + Math.abs(bucket.refillPerMs - limit.refillPerMs) > Number.EPSILON + ) { + bucket.capacity = limit.capacity; + bucket.refillPerMs = limit.refillPerMs; + bucket.tokens = Math.min(bucket.tokens, bucket.capacity); + bucket.lastRefill = now; + this.logDebug("[TrafficController] bucket sync with new limit", { + key, + capacity: bucket.capacity, + refillPerMs: bucket.refillPerMs, + }); + } + const elapsedMs = Math.max(0, now - bucket.lastRefill); if (elapsedMs > 0 && bucket.tokens < bucket.capacity) { const refilled = elapsedMs * bucket.refillPerMs; // Refill based on elapsed time bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refilled); // Cap at bucket capacity bucket.lastRefill = now; // Mark refill time for the next calculation + this.logDebug("[TrafficController] bucket refill", { + key, + elapsedMs, + tokens: bucket.tokens, + }); } return bucket; } + private computeEtaMs( + bucket: RateLimitBucket, + limit: NormalizedRateLimit, + key: string, + current: QueuedRequest, + queuedAhead?: number, + ): number { + const missingTokens = Math.max(0, 1 - bucket.tokens); + const waitForToken = + missingTokens > 0 && limit.refillPerMs > 0 ? Math.ceil(missingTokens / limit.refillPerMs) : 0; + const aheadCount = + typeof queuedAhead === "number" + ? queuedAhead + : this.countQueuedAheadWithKey(key, current, /*logDetails*/ false); + const extraForQueue = + aheadCount > 0 && limit.refillPerMs > 0 ? 
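    // Worked example, assuming refillPerMs = 0.002 and an empty bucket:
    // waitForToken = ceil(1 / 0.002) = 500ms; with two same-key requests queued
    // ahead, extraForQueue = ceil(2 / 0.002) = 1000ms, so eta = 1500ms.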
Math.ceil(aheadCount / limit.refillPerMs) : 0; + this.logDebug("[TrafficController] computeEtaMs", { + key, + missingTokens, + waitForToken, + aheadCount, + extraForQueue, + eta: waitForToken + extraForQueue, + }); + return waitForToken + extraForQueue; + } + + private countQueuedAheadWithKey(key: string, current: QueuedRequest, logDetails = false): number { + let count = 0; + for (const item of this.queue) { + if (item === current) { + break; + } + + const itemKey = this.buildRateLimitKey(item.request.metadata); + if (itemKey === key) { + count += 1; + } + } + if (logDetails) { + this.logDebug("[TrafficController] countQueuedAheadWithKey", { + key, + count, + queueSize: this.queue.length, + }); + } + return count; + } + + private normalizeRateLimits( + rateLimits?: RateLimitConfig, + ): Map | undefined { + if (!rateLimits) { + return undefined; + } + + const normalized = new Map(); + for (const [key, config] of Object.entries(rateLimits)) { + if (config.capacity > 0 && config.refillPerSecond > 0) { + normalized.set(key, { + capacity: config.capacity, + refillPerMs: config.refillPerSecond / 1000, + }); + } + } + + return normalized.size > 0 ? normalized : undefined; + } + private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { const provider = metadata?.provider ?? "default-provider"; const model = metadata?.model ?? "default-model"; return `${provider}::${model}`; } - private scheduleRefill(): void { - if (this.refillTimeout || !this.rateLimit) { + /** + * Update (or bootstrap) rate limit buckets based on provider response headers. + * This lets the controller adopt server-issued limits without static config. + */ + updateRateLimitFromHeaders( + metadata: TrafficRequestMetadata | undefined, + headers: unknown, + ): RateLimitUpdateResult | undefined { + const headerInfo = this.extractRateLimitHeaders(headers); + if (!headerInfo) { + this.logDebug("[TrafficController] no rate limit headers found on response", { + metadata, + }); + return undefined; + } + + const normalized = this.normalizeHeaderRateLimit(headerInfo); + if (!normalized) { + this.logDebug("[TrafficController] rate limit headers present but invalid", { + headerInfo, + }); + return undefined; + } + + const key = this.buildRateLimitKey(metadata); + if (!this.rateLimits) { + this.rateLimits = new Map(); + } + this.rateLimits.set(key, normalized); + + const now = Date.now(); + const remainingTokens = this.coerceRemaining(headerInfo.remainingRequests, normalized.capacity); + const existingBucket = this.rateLimitBuckets.get(key); + const tokens = remainingTokens ?? existingBucket?.tokens ?? normalized.capacity; + + if (existingBucket) { + existingBucket.capacity = normalized.capacity; + existingBucket.refillPerMs = normalized.refillPerMs; + existingBucket.tokens = Math.min(tokens, normalized.capacity); + existingBucket.lastRefill = now; + } else { + this.rateLimitBuckets.set(key, { + tokens: Math.min(tokens, normalized.capacity), + capacity: normalized.capacity, + refillPerMs: normalized.refillPerMs, + lastRefill: now, + }); + } + + this.logDebug("[TrafficController] rateLimit updated from headers", { + key, + capacity: normalized.capacity, + refillPerMs: normalized.refillPerMs, + remaining: remainingTokens, + }); + + // If we just refilled tokens, try draining again. 
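    // Example, assuming OpenAI-style headers: "x-ratelimit-limit-requests: 500"
    // with "x-ratelimit-reset-requests: 60s" normalizes to capacity 500 and
    // refillPerMs = 500 / 60000 ≈ 0.0083 tokens/ms (about 8.3 requests/second).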
+ this.scheduleDrain(); + + return { + key, + headerSnapshot: headerInfo, + normalized, + appliedTokens: Math.min(tokens, normalized.capacity), + }; + } + + private extractRateLimitHeaders(headers: unknown): RateLimitHeaderSnapshot | undefined { + const getHeader = this.createHeaderLookup(headers); + if (!getHeader) { + return undefined; + } + + const limitRequests = this.parseNumberHeader(getHeader, "x-ratelimit-limit-requests"); + const resetRequestsMs = this.parseDurationHeaderToMs(getHeader, "x-ratelimit-reset-requests"); + + if ( + limitRequests === undefined || + limitRequests <= 0 || + resetRequestsMs === undefined || + resetRequestsMs <= 0 + ) { + return undefined; + } + + const remainingRequests = this.parseNumberHeader(getHeader, "x-ratelimit-remaining-requests"); + + return { + limitRequests, + remainingRequests, + resetRequestsMs, + }; + } + + private normalizeHeaderRateLimit( + snapshot: RateLimitHeaderSnapshot, + ): NormalizedRateLimit | undefined { + if (snapshot.limitRequests <= 0 || snapshot.resetRequestsMs <= 0) { + return undefined; + } + + return { + capacity: snapshot.limitRequests, + refillPerMs: snapshot.limitRequests / snapshot.resetRequestsMs, + }; + } + + private coerceRemaining(remaining: number | undefined, capacity: number): number | undefined { + if (remaining === undefined) { + return undefined; + } + + const parsed = Number(remaining); + if (!Number.isFinite(parsed)) { + return undefined; + } + + return Math.max(0, Math.min(capacity, Math.floor(parsed))); + } + + private createHeaderLookup(headers: unknown): ((name: string) => string | undefined) | undefined { + if (!headers) { + return undefined; + } + + const maybeHeaders = headers as { get?: (name: string) => unknown }; + if (typeof maybeHeaders?.get === "function") { + return (name: string) => { + const value = maybeHeaders.get?.(name); + return value === undefined || value === null ? undefined : String(value); + }; + } + + if (typeof headers === "object") { + const entries = Object.entries(headers as Record); + if (entries.length === 0) { + return undefined; + } + + return (name: string) => { + const target = name.toLowerCase(); + for (const [key, value] of entries) { + if (typeof key === "string" && key.toLowerCase() === target) { + if (Array.isArray(value)) { + const first = value[0]; + return first === undefined || first === null ? undefined : String(first); + } + return value === undefined || value === null ? undefined : String(value); + } + } + return undefined; + }; + } + + return undefined; + } + + private parseNumberHeader( + getHeader: (name: string) => string | undefined, + name: string, + ): number | undefined { + const raw = getHeader(name); + if (raw === undefined) { + return undefined; + } + + const parsed = Number(raw); + return Number.isFinite(parsed) ? parsed : undefined; + } + + private parseDurationHeaderToMs( + getHeader: (name: string) => string | undefined, + name: string, + ): number | undefined { + const raw = getHeader(name); + if (!raw) { + return undefined; + } + + const trimmed = raw.trim(); + const match = trimmed.match(/^(-?\d+(?:\.\d+)?)(ms|s)?$/i); + if (!match) { + return undefined; + } + + const value = Number(match[1]); + if (!Number.isFinite(value) || value <= 0) { + return undefined; + } + + const unit = (match[2] || "s").toLowerCase(); + return unit === "ms" ? 
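    // Accepted forms per the regex above: "250ms" -> 250, "6s" -> 6000, and a
    // bare "6" -> 6000, since seconds are the assumed default unit.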
value : value * 1000; + } + + private scheduleRefill(limit: NormalizedRateLimit): void { + if (this.refillTimeout) { return; } - const delayMs = Math.max(1, Math.ceil(1 / this.rateLimit.refillPerMs)); // Wait long enough for at least one token + const delayMs = Math.max(1, Math.ceil(1 / limit.refillPerMs)); // Wait long enough for at least one token + this.logDebug("[TrafficController] scheduleRefill", { delayMs }); this.refillTimeout = setTimeout(() => { this.refillTimeout = undefined; // Allow future refills to be scheduled + this.logDebug("[TrafficController] refillTimeoutFired", { + queueSize: this.queue.length, + active: this.activeCount, + }); this.scheduleDrain(); // Try draining again now that tokens should exist }, delayMs); } private async runRequest(item: QueuedRequest): Promise { + this.logDebug("[TrafficController] runRequest start", { + type: item.type, + rateLimitKey: item.rateLimitKey, + etaMs: item.etaMs, + active: this.activeCount, + queueSize: this.queue.length, + }); try { const result = await item.request.execute(); // Execute the user's operation + // Log raw result coming back from the underlying handler (e.g., AI SDK) + this.logDebug("[TrafficController] runRequest result", { + type: item.type, + rateLimitKey: item.rateLimitKey, + result, + }); item.resolve(result); // Deliver successful result back to the waiting caller } catch (error) { item.reject(error); // Surface failures to the caller } finally { this.activeCount = Math.max(0, this.activeCount - 1); // Ensure counter never underflows + this.logDebug("[TrafficController] runRequest complete", { + type: item.type, + active: this.activeCount, + queueSize: this.queue.length, + }); this.scheduleDrain(); // Immediately try to pull the next request } } } -declare global { - // eslint-disable-next-line no-var - var ___voltagent_traffic_controller: TrafficController | undefined; -} +let singletonController: TrafficController | undefined; /** * Retrieve the shared traffic controller instance. */ export function getTrafficController(options?: TrafficControllerOptions): TrafficController { - if (!globalThis.___voltagent_traffic_controller) { + if (!singletonController) { // Create a singleton controller so all agents share the same queue/scheduling behavior - globalThis.___voltagent_traffic_controller = new TrafficController(options); - } else if (options?.logger) { - // Update logger when caller provides a new one, keeping the singleton instance alive - globalThis.___voltagent_traffic_controller.setLogger(options.logger); + singletonController = new TrafficController(options); } - return globalThis.___voltagent_traffic_controller; + return singletonController; } From 1624c01213da34897651964d4fc1d936a7964922 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Sat, 29 Nov 2025 17:05:55 +0530 Subject: [PATCH 03/41] feat: normalize rate limits --- packages/core/src/agent/agent.ts | 32 ++- .../core/src/traffic/traffic-controller.ts | 196 +++++++++++++++++- 2 files changed, 220 insertions(+), 8 deletions(-) diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index f6ac232e3..bb2149973 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -594,7 +594,7 @@ export class Agent { // Default values temperature: this.temperature, maxOutputTokens: this.maxOutputTokens, - maxRetries: 3, + maxRetries: 0, stopWhen: options?.stopWhen ?? this.stopWhen ?? 
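      // maxRetries drops from 3 to 0 on purpose: the controller's retry plan
      // (429 / 5xx / timeout with jittered backoff) now owns re-dispatch, and a
      // provider-level retry on top of it would multiply attempts per failure.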
stepCountIs(maxSteps), // User overrides from AI SDK options ...aiSDKOptions, @@ -945,7 +945,7 @@ export class Agent { // Default values temperature: this.temperature, maxOutputTokens: this.maxOutputTokens, - maxRetries: 3, + maxRetries: 0, // Retry via traffic controller to avoid provider-level storms stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps), // User overrides from AI SDK options ...aiSDKOptions, @@ -1483,6 +1483,18 @@ export class Agent { input: string | UIMessage[] | BaseMessage[], schema: T, options?: GenerateObjectOptions, + ): Promise>> { + const controller = getTrafficController({ logger: this.logger }); + return controller.handleText({ + metadata: this.buildTrafficMetadata(), + execute: () => this.executeGenerateObject(input, schema, options), + }); + } + + private async executeGenerateObject( + input: string | UIMessage[] | BaseMessage[], + schema: T, + options?: GenerateObjectOptions, ): Promise>> { const startTime = Date.now(); const oc = this.createOperationContext(input, options); @@ -1582,7 +1594,7 @@ export class Agent { // Default values maxOutputTokens: this.maxOutputTokens, temperature: this.temperature, - maxRetries: 3, + maxRetries: 0, // User overrides from AI SDK options ...aiSDKOptions, // Provider-specific options @@ -1721,6 +1733,18 @@ export class Agent { input: string | UIMessage[] | BaseMessage[], schema: T, options?: StreamObjectOptions, + ): Promise>> { + const controller = getTrafficController({ logger: this.logger }); + return controller.handleStream({ + metadata: this.buildTrafficMetadata(), + execute: () => this.executeStreamObject(input, schema, options), + }); + } + + private async executeStreamObject( + input: string | UIMessage[] | BaseMessage[], + schema: T, + options?: StreamObjectOptions, ): Promise>> { const startTime = Date.now(); const oc = this.createOperationContext(input, options); @@ -1826,7 +1850,7 @@ export class Agent { // Default values maxOutputTokens: this.maxOutputTokens, temperature: this.temperature, - maxRetries: 3, + maxRetries: 0, // User overrides from AI SDK options ...aiSDKOptions, // Provider-specific options diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index c412b1389..45a7578f8 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -1,8 +1,22 @@ +import type { Logger } from "../logger"; +import { LoggerProxy } from "../logger"; + type Scheduler = (callback: () => void) => void; type BivariantHandler = { bivarianceHack(...args: TArgs): void; }["bivarianceHack"]; +type RetryReason = "rateLimit" | "serverError" | "timeout"; + +const MAX_RETRY_ATTEMPTS = 3; +const TIMEOUT_RETRY_ATTEMPTS = 2; +const RATE_LIMIT_BASE_BACKOFF_MS = 500; +const SERVER_ERROR_BASE_BACKOFF_MS = 1000; +const TIMEOUT_BASE_BACKOFF_MS = 750; +const RATE_LIMIT_JITTER_FACTOR = 0.35; +const SERVER_ERROR_JITTER_FACTOR = 0.8; +const TIMEOUT_JITTER_FACTOR = 0.5; + interface RateLimitBucket { tokens: number; capacity: number; @@ -57,11 +71,13 @@ interface QueuedRequest { reject: BivariantHandler<[reason?: unknown]>; etaMs?: number; rateLimitKey?: string; + attempt?: number; } export interface TrafficControllerOptions { maxConcurrent?: number; rateLimits?: RateLimitConfig; + logger?: Logger; } // Centralized traffic controller responsible for scheduling LLM calls. 
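// Backoff sketch from the constants above: exponential = base * 2^(attempt - 1),
// plus uniform jitter of up to exponential * jitterFactor. A second rateLimit
// attempt therefore waits 500 * 2 = 1000ms plus up to 350ms of jitter, while a
// second serverError attempt waits 2000ms plus up to 1600ms.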
@@ -76,6 +92,7 @@ export class TrafficController { private activeCount = 0; private drainScheduled = false; private refillTimeout?: ReturnType; + private readonly logger: Logger; private logDebug(message: string, details?: Record): void { if (typeof console?.debug === "function") { @@ -84,10 +101,15 @@ export class TrafficController { } constructor(options: TrafficControllerOptions = {}) { - this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; // Concurrency guard; defaults to no cap for now + this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; this.rateLimits = this.normalizeRateLimits(options.rateLimits); - this.scheduler = this.createScheduler(); // Select scheduler once so the rest of the code can stay simple - this.logDebug("[TrafficController] init", { + this.scheduler = this.createScheduler(); + + // NEW LOGGER (from c2 commit) + this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); + + // INIT LOG (from HEAD) — rewritten to use the new logger + this.logger.debug("[TrafficController] init", { maxConcurrent: this.maxConcurrent, rateLimits: this.rateLimits ? Array.from(this.rateLimits.entries()) : undefined, }); @@ -124,6 +146,7 @@ export class TrafficController { request, resolve, reject, + attempt: 1, }); this.logDebug("[TrafficController] enqueue", { @@ -590,6 +613,9 @@ export class TrafficController { } private async runRequest(item: QueuedRequest): Promise { + + const attempt = item.attempt ?? 1; + this.logDebug("[TrafficController] runRequest start", { type: item.type, rateLimitKey: item.rateLimitKey, @@ -597,6 +623,7 @@ export class TrafficController { active: this.activeCount, queueSize: this.queue.length, }); + try { const result = await item.request.execute(); // Execute the user's operation // Log raw result coming back from the underlying handler (e.g., AI SDK) @@ -607,7 +634,12 @@ export class TrafficController { }); item.resolve(result); // Deliver successful result back to the waiting caller } catch (error) { - item.reject(error); // Surface failures to the caller + const retryPlan = this.buildRetryPlan(error, attempt); + if (retryPlan) { + this.scheduleRetry(item, attempt + 1, retryPlan.delayMs, retryPlan.reason); + } else { + item.reject(error); // Surface failures to the caller + } } finally { this.activeCount = Math.max(0, this.activeCount - 1); // Ensure counter never underflows this.logDebug("[TrafficController] runRequest complete", { @@ -618,6 +650,162 @@ export class TrafficController { this.scheduleDrain(); // Immediately try to pull the next request } } + + private buildRetryPlan( + error: unknown, + attempt: number, + ): { delayMs: number; reason: RetryReason } | undefined { + const reason = this.getRetryReason(error); + if (!reason) { + return undefined; + } + + const maxAttempts = reason === "timeout" ? 
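    // Attempt numbering starts at 1 (set in enqueue), so timeouts allow one
    // retry (two attempts total) and rate-limit/server errors allow two (three
    // attempts total) before the error is surfaced to the caller.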
TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS; + if (attempt >= maxAttempts) { + return undefined; + } + + return { + reason, + delayMs: this.computeBackoffDelay(reason, attempt), + }; + } + + private getRetryReason(error: unknown): RetryReason | undefined { + const statusCode = this.extractStatusCode(error); + if (statusCode === 429) { + return "rateLimit"; + } + + if (statusCode !== undefined && statusCode >= 500 && statusCode < 600) { + return "serverError"; + } + + if (statusCode === 408 || this.isTimeoutError(error)) { + return "timeout"; + } + + return undefined; + } + + private extractStatusCode(error: unknown): number | undefined { + if (!error || typeof error !== "object") { + return undefined; + } + + const candidate = error as { status?: unknown; statusCode?: unknown; httpStatus?: unknown }; + const directStatus = + this.coerceStatus(candidate.status) ?? + this.coerceStatus(candidate.statusCode) ?? + this.coerceStatus(candidate.httpStatus); + if (directStatus !== undefined) { + return directStatus; + } + + const responseStatus = (error as { response?: { status?: unknown } }).response?.status; + const normalizedResponseStatus = this.coerceStatus(responseStatus); + if (normalizedResponseStatus !== undefined) { + return normalizedResponseStatus; + } + + const causeStatus = (error as { cause?: { status?: unknown; statusCode?: unknown } }).cause; + if (causeStatus) { + const normalizedCauseStatus = + this.coerceStatus(causeStatus.status) ?? this.coerceStatus(causeStatus.statusCode); + if (normalizedCauseStatus !== undefined) { + return normalizedCauseStatus; + } + } + + return undefined; + } + + private isTimeoutError(error: unknown): boolean { + const candidates = [error, (error as { cause?: unknown })?.cause]; + + for (const candidate of candidates) { + if (!candidate || typeof candidate !== "object") { + continue; + } + + const timeoutCode = (candidate as { code?: unknown }).code; + if (typeof timeoutCode === "string" && timeoutCode.toLowerCase().includes("timeout")) { + return true; + } + + const name = (candidate as { name?: unknown }).name; + if (typeof name === "string" && name.toLowerCase().includes("timeout")) { + return true; + } + + const message = (candidate as { message?: unknown }).message; + if (typeof message === "string" && message.toLowerCase().includes("timeout")) { + return true; + } + } + + return false; + } + + private coerceStatus(value: unknown): number | undefined { + if (typeof value === "number" && Number.isFinite(value)) { + return value; + } + + if (typeof value === "string") { + const parsed = Number(value); + if (Number.isFinite(parsed)) { + return parsed; + } + } + + return undefined; + } + + private computeBackoffDelay(reason: RetryReason, attempt: number): number { + const base = + reason === "serverError" + ? SERVER_ERROR_BASE_BACKOFF_MS + : reason === "timeout" + ? TIMEOUT_BASE_BACKOFF_MS + : RATE_LIMIT_BASE_BACKOFF_MS; + + const jitterFactor = + reason === "serverError" + ? SERVER_ERROR_JITTER_FACTOR + : reason === "timeout" + ? TIMEOUT_JITTER_FACTOR + : RATE_LIMIT_JITTER_FACTOR; + + const exponential = base * 2 ** Math.max(0, attempt - 1); + const jitter = exponential * jitterFactor * Math.random(); + return Math.max(1, Math.round(exponential + jitter)); + } + + private scheduleRetry( + item: QueuedRequest, + nextAttempt: number, + delayMs: number, + reason: RetryReason, + ): void { + this.logger.debug("Retrying request through controller", { + reason, + delayMs, + attempt: nextAttempt, + maxAttempts: reason === "timeout" ? 
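      // Status discovery order in extractStatusCode: err.status / err.statusCode /
      // err.httpStatus, then err.response.status, then err.cause.status or
      // err.cause.statusCode; string values such as "429" are coerced via Number().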
TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS, + metadata: item.request.metadata, + }); + + setTimeout(() => { + this.queue.push({ + ...item, + attempt: nextAttempt, + etaMs: undefined, + rateLimitKey: undefined, + }); + this.scheduleDrain(); + }, delayMs); + } } let singletonController: TrafficController | undefined; From 17fe4421625d528b60b76c0aff3ca150a14908c5 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Mon, 1 Dec 2025 12:37:59 +0530 Subject: [PATCH 04/41] feat: circuit breaker v1 + simple fallback --- packages/core/src/agent/agent.ts | 129 +++++--- packages/core/src/index.ts | 1 + .../core/src/traffic/traffic-controller.ts | 299 +++++++++++++++++- 3 files changed, 386 insertions(+), 43 deletions(-) diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index bb2149973..835fa2a35 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -304,6 +304,8 @@ export interface BaseGenerationOptions extends Partial { // Provider-specific options providerOptions?: ProviderOptions; + // Optional per-call model override (used for fallbacks) + model?: LanguageModel; // Experimental output (for structured generation) experimental_output?: ReturnType | ReturnType; @@ -446,12 +448,15 @@ export class Agent { input: string | UIMessage[] | BaseMessage[], options?: GenerateTextOptions, ): Promise { - const controller = getTrafficController(); // Use shared controller so all agent calls flow through central queue/metrics - const trafficMetadata = this.buildTrafficMetadata(); - return controller.handleText({ - metadata: trafficMetadata, // Pass model/provider info for future rate limiting keys - execute: () => this.executeGenerateText(input, options, trafficMetadata), // Defer actual execution so controller can schedule it + const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics + const buildRequest = (modelOverride?: LanguageModel) => ({ + metadata: this.buildTrafficMetadata(modelOverride ?? 
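      // Fallback wiring (sketched from DEFAULT_FALLBACK_CHAINS in the controller
      // below): on a tripped circuit the controller can invoke
      // createFallbackRequest("gpt-4o-mini"), which re-enters buildRequest so both
      // metadata and execution adopt the substitute model id.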
options?.model), // Pass model/provider info for future rate limiting keys + execute: () => + this.executeGenerateText(input, this.mergeOptionsWithModel(options, modelOverride)), // Defer actual execution so controller can schedule it + createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), }); + + return controller.handleText(buildRequest(options?.model)); } private async executeGenerateText( @@ -485,7 +490,7 @@ export class Agent { options, ); - const modelName = this.getModelName(); + const modelName = this.getModelName(model); const contextLimit = options?.contextLimit; // Add model attributes and all options @@ -560,8 +565,10 @@ export class Agent { tools: userTools, experimental_output, providerOptions, + model: _model, // Exclude model so aiSDKOptions doesn't override resolved model ...aiSDKOptions } = options || {}; + void _model; const llmSpan = this.createLLMSpan(oc, { operation: "generateText", @@ -798,12 +805,15 @@ export class Agent { input: string | UIMessage[] | BaseMessage[], options?: StreamTextOptions, ): Promise { - const controller = getTrafficController(); // Same controller handles streaming to keep ordering/backpressure consistent - const trafficMetadata = this.buildTrafficMetadata(); - return controller.handleStream({ - metadata: trafficMetadata, // Include identifiers to support per-provider/model policies later - execute: () => this.executeStreamText(input, options, trafficMetadata), // Actual streaming work happens after the controller dequeues us + const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent + const buildRequest = (modelOverride?: LanguageModel) => ({ + metadata: this.buildTrafficMetadata(modelOverride ?? options?.model), // Include identifiers to support per-provider/model policies later + execute: () => + this.executeStreamText(input, this.mergeOptionsWithModel(options, modelOverride)), // Actual streaming work happens after the controller dequeues us + createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), }); + + return controller.handleStream(buildRequest(options?.model)); } private async executeStreamText( @@ -839,7 +849,7 @@ export class Agent { options, ); - const modelName = this.getModelName(); + const modelName = this.getModelName(model); const contextLimit = options?.contextLimit; // Add model attributes to root span if TraceContext exists @@ -909,8 +919,10 @@ export class Agent { onFinish: userOnFinish, experimental_output, providerOptions, + model: _model, // Exclude model from aiSDKOptions to avoid overriding resolved model ...aiSDKOptions } = options || {}; + void _model; const guardrailStreamingEnabled = guardrailSet.output.length > 0; @@ -1485,10 +1497,18 @@ export class Agent { options?: GenerateObjectOptions, ): Promise>> { const controller = getTrafficController({ logger: this.logger }); - return controller.handleText({ - metadata: this.buildTrafficMetadata(), - execute: () => this.executeGenerateObject(input, schema, options), + const buildRequest = (modelOverride?: LanguageModel) => ({ + metadata: this.buildTrafficMetadata(modelOverride ?? 
options?.model), + execute: () => + this.executeGenerateObject( + input, + schema, + this.mergeOptionsWithModel(options, modelOverride), + ), + createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), }); + + return controller.handleText(buildRequest(options?.model)); } private async executeGenerateObject( @@ -1519,7 +1539,7 @@ export class Agent { options, ); - const modelName = this.getModelName(); + const modelName = this.getModelName(model); const schemaName = schema.description || "unknown"; // Add model attributes and all options @@ -1578,8 +1598,10 @@ export class Agent { maxSteps: userMaxSteps, tools: userTools, providerOptions, + model: _model, // Exclude model so spread does not override resolved model ...aiSDKOptions } = options || {}; + void _model; methodLogger.info("[AI SDK] Calling generateObject", { messageCount: messages.length, @@ -1735,10 +1757,14 @@ export class Agent { options?: StreamObjectOptions, ): Promise>> { const controller = getTrafficController({ logger: this.logger }); - return controller.handleStream({ - metadata: this.buildTrafficMetadata(), - execute: () => this.executeStreamObject(input, schema, options), + const buildRequest = (modelOverride?: LanguageModel) => ({ + metadata: this.buildTrafficMetadata(modelOverride ?? options?.model), + execute: () => + this.executeStreamObject(input, schema, this.mergeOptionsWithModel(options, modelOverride)), + createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), }); + + return controller.handleStream(buildRequest(options?.model)); } private async executeStreamObject( @@ -1770,7 +1796,7 @@ export class Agent { options, ); - const modelName = this.getModelName(); + const modelName = this.getModelName(model); const schemaName = schema.description || "unknown"; // Add model attributes and all options @@ -1830,8 +1856,10 @@ export class Agent { tools: userTools, onFinish: userOnFinish, providerOptions, + model: _model, // Exclude model so aiSDKOptions cannot override resolved model ...aiSDKOptions } = options || {}; + void _model; let guardrailObjectPromise!: Promise>; let resolveGuardrailObject: ((value: z.infer) => void) | undefined; @@ -1866,7 +1894,7 @@ export class Agent { methodLogger.error("Stream object error occurred", { error: actualError, agentName: this.name, - modelName: this.getModelName(), + modelName: this.getModelName(model), schemaName: schemaName, }); @@ -2121,8 +2149,9 @@ export class Agent { // Calculate maxSteps (use provided option or calculate based on subagents) const maxSteps = options?.maxSteps ?? this.calculateMaxSteps(); - // Resolve dynamic values - const model = await this.resolveValue(this.model, oc); + // Resolve dynamic values (allow per-call model override for fallbacks) + const selectedModel = options?.model ?? this.model; + const model = await this.resolveValue(selectedModel, oc); const dynamicToolList = (await this.resolveValue(this.dynamicTools, oc)) || []; // Merge agent tools with option tools @@ -3270,6 +3299,20 @@ export class Agent { return value; } + private mergeOptionsWithModel( + options: BaseGenerationOptions | undefined, + modelOverride?: LanguageModel, + ): BaseGenerationOptions | undefined { + if (!options && modelOverride === undefined) { + return undefined; + } + + return { + ...(options ?? {}), + ...(modelOverride !== undefined ? 
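    // Merge semantics: an explicit modelOverride always wins over options.model;
    // with neither options nor an override the method returns undefined so the
    // agent's configured defaults stay untouched.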
{ model: modelOverride } : {}), + }; + } + /** * Prepare tools with execution context */ @@ -3922,20 +3965,16 @@ export class Agent { return this.subAgentManager.calculateMaxSteps(this.maxSteps); } - private buildTrafficMetadata(): TrafficRequestMetadata { - // Capture provider if the model object exposes it; fallback is undefined to avoid bad assumptions + private buildTrafficMetadata( + modelOverride?: LanguageModel | DynamicValue, + ): TrafficRequestMetadata { const provider = - typeof this.model === "object" && - this.model !== null && - "provider" in this.model && - typeof (this.model as any).provider === "string" - ? ((this.model as any).provider as string) - : undefined; + this.resolveProvider(modelOverride) ?? this.resolveProvider(this.model) ?? undefined; return { agentId: this.id, // Identify which agent issued the request agentName: this.name, // Human-readable label for logs/metrics - model: this.getModelName(), // Used for future capacity policies + model: this.getModelName(modelOverride), // Used for future capacity policies provider, // Allows per-provider throttling later }; } @@ -3982,17 +4021,33 @@ export class Agent { }); } + private resolveProvider( + model: LanguageModel | DynamicValue | undefined, + ): string | undefined { + if ( + model && + typeof model === "object" && + "provider" in model && + typeof (model as any).provider === "string" + ) { + return (model as any).provider; + } + + return undefined; + } + /** * Get the model name */ - public getModelName(): string { - if (typeof this.model === "function") { + public getModelName(modelOverride?: LanguageModel | DynamicValue): string { + const selectedModel = modelOverride ?? this.model; + if (typeof selectedModel === "function") { return "dynamic"; } - if (typeof this.model === "string") { - return this.model; + if (typeof selectedModel === "string") { + return selectedModel; } - return this.model.modelId || "unknown"; + return selectedModel.modelId || "unknown"; } /** diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 665bfa424..5db71f10e 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -24,6 +24,7 @@ export type { export { // Surface traffic controller so downstream consumers can route agent calls through the shared scheduler TrafficController, + CircuitBreakerOpenError, getTrafficController, type RateLimitConfig, type RateLimitKey, diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 45a7578f8..b9bc0babc 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -11,11 +11,17 @@ type RetryReason = "rateLimit" | "serverError" | "timeout"; const MAX_RETRY_ATTEMPTS = 3; const TIMEOUT_RETRY_ATTEMPTS = 2; const RATE_LIMIT_BASE_BACKOFF_MS = 500; +const CIRCUIT_FAILURE_THRESHOLD = 5; +const CIRCUIT_FAILURE_WINDOW_MS = 10_000; +const CIRCUIT_COOLDOWN_MS = 30_000; const SERVER_ERROR_BASE_BACKOFF_MS = 1000; const TIMEOUT_BASE_BACKOFF_MS = 750; const RATE_LIMIT_JITTER_FACTOR = 0.35; const SERVER_ERROR_JITTER_FACTOR = 0.8; const TIMEOUT_JITTER_FACTOR = 0.5; +const DEFAULT_FALLBACK_CHAINS: Record = { + "gpt-4o": ["gpt-4o-mini", "gpt-3.5"], +}; interface RateLimitBucket { tokens: number; @@ -62,6 +68,16 @@ export interface TrafficRequestMetadata { export interface TrafficRequest { metadata?: TrafficRequestMetadata; execute: () => Promise; + createFallbackRequest?: (modelId: string) => TrafficRequest | undefined; +} + +type CircuitStateStatus = "closed" 
| "open" | "half-open"; + +interface CircuitState { + status: CircuitStateStatus; + failureTimestamps: number[]; + openedAt?: number; + trialInFlight?: boolean; } interface QueuedRequest { @@ -72,14 +88,19 @@ interface QueuedRequest { etaMs?: number; rateLimitKey?: string; attempt?: number; + circuitKey?: string; + circuitStatus?: CircuitStateStatus; } export interface TrafficControllerOptions { maxConcurrent?: number; rateLimits?: RateLimitConfig; logger?: Logger; + fallbackChains?: Record; } +type ProcessDecision = "process" | "skip" | "wait"; + // Centralized traffic controller responsible for scheduling LLM calls. // Provides a FIFO queue with a non-blocking scheduler and entrypoints // for text and stream traffic. @@ -88,6 +109,8 @@ export class TrafficController { private readonly maxConcurrent: number; private rateLimits?: Map; private readonly rateLimitBuckets = new Map(); + private readonly circuitBreakers = new Map(); + private readonly fallbackChains: Map; private queue: QueuedRequest[] = []; private activeCount = 0; private drainScheduled = false; @@ -103,6 +126,7 @@ export class TrafficController { constructor(options: TrafficControllerOptions = {}) { this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; this.rateLimits = this.normalizeRateLimits(options.rateLimits); + this.fallbackChains = this.normalizeFallbackChains(options.fallbackChains); this.scheduler = this.createScheduler(); // NEW LOGGER (from c2 commit) @@ -184,12 +208,19 @@ export class TrafficController { if (!next) { break; } - if (!this.canProcess(next)) { + + const decision = this.getProcessDecision(next); + if (decision === "wait") { return; // Stop early; drain will be rescheduled once capacity frees up } this.queue.shift(); // Remove after we've confirmed we can process + if (decision === "skip") { + continue; // Already handled (e.g., circuit open with no fallback) + } + this.activeCount++; // Track in-flight work to enforce concurrency guard + this.markCircuitTrial(next); // Reserve the half-open trial slot if needed this.logDebug("[TrafficController] dispatch", { type: next.type, @@ -204,13 +235,18 @@ export class TrafficController { } } - private canProcess(next: QueuedRequest): boolean { + private getProcessDecision(next: QueuedRequest): ProcessDecision { + const circuitDecision = this.evaluateCircuitBreaker(next); + if (circuitDecision !== "process") { + return circuitDecision; + } + if (this.activeCount >= this.maxConcurrent) { this.logDebug("[TrafficController] throttle concurrency", { active: this.activeCount, maxConcurrent: this.maxConcurrent, }); - return false; + return "wait"; } const rateLimitConfig = this.getRateLimitConfig(next.request.metadata); @@ -220,7 +256,7 @@ export class TrafficController { }); next.rateLimitKey = undefined; next.etaMs = 0; - return true; // No rate limit configured for this key + return "process"; // No rate limit configured for this key } const queuedAhead = this.countQueuedAheadWithKey( @@ -245,7 +281,7 @@ export class TrafficController { queuedAhead, }); this.scheduleRefill(rateLimitConfig.limit); // Ensure we retry as soon as tokens are replenished - return false; + return "wait"; } bucket.tokens -= 1; // Consume a token for this dispatch @@ -256,7 +292,7 @@ export class TrafficController { }); next.rateLimitKey = rateLimitConfig.key; next.etaMs = 0; - return true; + return "process"; } private getRateLimitConfig( @@ -375,6 +411,146 @@ export class TrafficController { return count; } + private evaluateCircuitBreaker(next: QueuedRequest): 
ProcessDecision { + return this.evaluateCircuitBreakerForRequest(next, new Set()); + } + + private evaluateCircuitBreakerForRequest( + next: QueuedRequest, + visitedModels: Set, + ): ProcessDecision { + const key = this.buildRateLimitKey(next.request.metadata); + next.circuitKey = key; + + const currentModel = next.request.metadata?.model; + if (currentModel) { + visitedModels.add(currentModel); + } + + const evaluation = this.evaluateCircuitState(key); + next.circuitStatus = evaluation.state; + + if (evaluation.allowRequest) { + return "process"; + } + + const fallbackModel = this.findFallbackModel(next.request.metadata, visitedModels); + if (fallbackModel && next.request.createFallbackRequest) { + const fallbackRequest = next.request.createFallbackRequest(fallbackModel); + if (fallbackRequest) { + this.logger.warn("Circuit open; attempting fallback model", { + fromModel: currentModel, + fallbackModel, + provider: next.request.metadata?.provider, + }); + next.request = fallbackRequest; + next.attempt = 1; + next.rateLimitKey = undefined; + next.etaMs = undefined; + next.circuitKey = undefined; + next.circuitStatus = undefined; + return this.evaluateCircuitBreakerForRequest(next, visitedModels); + } + } + + const retryAfterMs = evaluation.retryAfterMs ?? CIRCUIT_COOLDOWN_MS; + this.logger.warn("Circuit open; rejecting request", { + circuitKey: key, + retryAfterMs, + metadata: next.request.metadata, + }); + next.reject( + new CircuitBreakerOpenError( + `Circuit open for ${key}; retry after ${retryAfterMs}ms`, + next.request.metadata, + retryAfterMs, + ), + ); + return "skip"; + } + + private evaluateCircuitState(key: string): { + allowRequest: boolean; + state: CircuitStateStatus; + retryAfterMs?: number; + } { + const state = this.circuitBreakers.get(key); + if (!state) { + return { allowRequest: true, state: "closed" }; + } + + const now = Date.now(); + + if (state.status === "open") { + const elapsed = state.openedAt ? 
now - state.openedAt : 0; + if (elapsed >= CIRCUIT_COOLDOWN_MS) { + state.status = "half-open"; + state.trialInFlight = false; + state.failureTimestamps = []; + this.circuitBreakers.set(key, state); + return { allowRequest: true, state: state.status }; + } + return { + allowRequest: false, + state: state.status, + retryAfterMs: Math.max(0, CIRCUIT_COOLDOWN_MS - elapsed), + }; + } + + if (state.status === "half-open") { + if (state.trialInFlight) { + return { allowRequest: false, state: state.status }; + } + return { allowRequest: true, state: state.status }; + } + + return { allowRequest: true, state: state.status }; + } + + private findFallbackModel( + metadata: TrafficRequestMetadata | undefined, + visitedModels: Set, + ): string | undefined { + const currentModel = metadata?.model; + if (!currentModel) { + return undefined; + } + + const chain = this.fallbackChains.get(currentModel); + if (!chain) { + return undefined; + } + + const provider = metadata?.provider; + for (const candidate of chain) { + if (visitedModels.has(candidate)) { + continue; + } + + const candidateKey = this.buildRateLimitKey({ provider, model: candidate }); + const evaluation = this.evaluateCircuitState(candidateKey); + if (evaluation.allowRequest) { + visitedModels.add(candidate); + return candidate; + } + } + + return undefined; + } + + private markCircuitTrial(next: QueuedRequest): void { + const key = next.circuitKey; + if (!key) { + return; + } + + const state = this.circuitBreakers.get(key); + if (state && state.status === "half-open" && !state.trialInFlight) { + state.trialInFlight = true; + this.circuitBreakers.set(key, state); + } + } + private normalizeRateLimits( rateLimits?: RateLimitConfig, ): Map | undefined { @@ -395,6 +571,21 @@ export class TrafficController { return normalized.size > 0 ? normalized : undefined; } + private normalizeFallbackChains( + fallbackChains?: Record, + ): Map { + const configuredChains = fallbackChains ?? DEFAULT_FALLBACK_CHAINS; + const normalized = new Map(); + + for (const [model, chain] of Object.entries(configuredChains)) { + if (Array.isArray(chain) && chain.length > 0) { + normalized.set(model, [...chain]); + } + } + + return normalized; + } + private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { const provider = metadata?.provider ?? "default-provider"; const model = metadata?.model ?? "default-model"; @@ -612,6 +803,86 @@ export class TrafficController { }, delayMs); } + private recordCircuitSuccess(metadata?: TrafficRequestMetadata): void { + const key = this.buildRateLimitKey(metadata); + if (this.circuitBreakers.has(key)) { + this.circuitBreakers.delete(key); + } + } + + private recordCircuitFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void { + const status = this.extractStatusCode(error); + if (!this.isCircuitBreakerStatus(status)) { + this.resetCircuitFailures(metadata); + return; + } + + const key = this.buildRateLimitKey(metadata); + const now = Date.now(); + const state = + this.circuitBreakers.get(key) ?? 
+ ({ + status: "closed", + failureTimestamps: [], + } as CircuitState); + + const recentFailures = state.failureTimestamps.filter( + (timestamp) => now - timestamp <= CIRCUIT_FAILURE_WINDOW_MS, + ); + recentFailures.push(now); + + if (state.status === "half-open") { + state.status = "open"; + state.openedAt = now; + state.trialInFlight = false; + state.failureTimestamps = [now]; + this.circuitBreakers.set(key, state); + this.logger.warn("Circuit reopened after half-open failure", { + circuitKey: key, + statusCode: status, + }); + return; + } + + state.failureTimestamps = recentFailures; + if (state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD) { + state.status = "open"; + state.openedAt = now; + state.trialInFlight = false; + this.logger.warn("Circuit opened after consecutive failures", { + circuitKey: key, + failureCount: state.failureTimestamps.length, + statusCode: status, + }); + } + + this.circuitBreakers.set(key, state); + } + + private resetCircuitFailures(metadata?: TrafficRequestMetadata): void { + const key = this.buildRateLimitKey(metadata); + const state = this.circuitBreakers.get(key); + if (!state) { + return; + } + + state.failureTimestamps = []; + if (state.status !== "open") { + state.status = "closed"; + state.trialInFlight = false; + } + + this.circuitBreakers.set(key, state); + } + + private isCircuitBreakerStatus(status?: number): boolean { + if (status === 429) { + return true; + } + + return status !== undefined && status >= 500 && status < 600; + } + private async runRequest(item: QueuedRequest): Promise { const attempt = item.attempt ?? 1; @@ -626,6 +897,7 @@ export class TrafficController { try { const result = await item.request.execute(); // Execute the user's operation + this.recordCircuitSuccess(item.request.metadata); // Log raw result coming back from the underlying handler (e.g., AI SDK) this.logDebug("[TrafficController] runRequest result", { type: item.type, @@ -634,6 +906,7 @@ export class TrafficController { }); item.resolve(result); // Deliver successful result back to the waiting caller } catch (error) { + this.recordCircuitFailure(item.request.metadata, error); const retryPlan = this.buildRetryPlan(error, attempt); if (retryPlan) { this.scheduleRetry(item, attempt + 1, retryPlan.delayMs, retryPlan.reason); @@ -802,6 +1075,8 @@ export class TrafficController { attempt: nextAttempt, etaMs: undefined, rateLimitKey: undefined, + circuitKey: undefined, + circuitStatus: undefined, }); this.scheduleDrain(); }, delayMs); @@ -810,6 +1085,18 @@ export class TrafficController { let singletonController: TrafficController | undefined; +export class CircuitBreakerOpenError extends Error { + readonly retryAfterMs?: number; + readonly metadata?: TrafficRequestMetadata; + + constructor(message: string, metadata?: TrafficRequestMetadata, retryAfterMs?: number) { + super(message); + this.name = "CircuitBreakerOpenError"; + this.metadata = metadata; + this.retryAfterMs = retryAfterMs; + } +} + /** * Retrieve the shared traffic controller instance. 
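 *
 * A minimal consumer sketch (hedged: `run` and `retryLater` are hypothetical
 * helpers and the provider/model strings are placeholders; the exports are the
 * ones introduced in this patch):
 *
 * @example
 * import { CircuitBreakerOpenError, getTrafficController } from "@voltagent/core";
 *
 * const controller = getTrafficController();
 * try {
 *   await controller.handleText({
 *     metadata: { provider: "openai", model: "gpt-4o" },
 *     execute: run, // hypothetical () => Promise<string> performing the LLM call
 *   });
 * } catch (error) {
 *   if (error instanceof CircuitBreakerOpenError) {
 *     // Circuit is open and no healthy fallback model remained; honor the cooldown hint.
 *     setTimeout(retryLater, error.retryAfterMs ?? 30_000);
 *   }
 * }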
*/ From 229e48fa2bbff6e598250c2e71b5fdf631224369 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Mon, 1 Dec 2025 14:04:35 +0530 Subject: [PATCH 05/41] feat: priority queue --- packages/core/src/agent/agent.ts | 49 ++++++- packages/core/src/agent/types.ts | 9 ++ packages/core/src/index.ts | 1 + .../src/traffic/traffic-controller.spec.ts | 87 +++++++++++ .../core/src/traffic/traffic-controller.ts | 136 ++++++++++++------ .../scorers/src/llm/answer-correctness.ts | 1 + packages/scorers/src/llm/answer-relevancy.ts | 1 + packages/scorers/src/llm/classifiers.ts | 1 + packages/scorers/src/llm/context-precision.ts | 1 + packages/scorers/src/llm/context-recall.ts | 1 + packages/scorers/src/llm/context-relevancy.ts | 1 + packages/scorers/src/llm/moderation.ts | 1 + 12 files changed, 244 insertions(+), 45 deletions(-) create mode 100644 packages/core/src/traffic/traffic-controller.spec.ts diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index 835fa2a35..862c0b3e1 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -48,7 +48,11 @@ import type { BaseRetriever } from "../retriever/retriever"; import type { Tool, Toolkit } from "../tool"; import { createTool } from "../tool"; import { ToolManager } from "../tool/manager"; -import { type TrafficRequestMetadata, getTrafficController } from "../traffic/traffic-controller"; +import { + type TrafficPriority, + type TrafficRequestMetadata, + getTrafficController, +} from "../traffic/traffic-controller"; import { randomUUID } from "../utils/id"; import { convertModelMessagesToUIMessages } from "../utils/message-converter"; import { NodeType, createNodeId } from "../utils/node-utils"; @@ -265,6 +269,11 @@ export interface BaseGenerationOptions extends Partial { conversationId?: string; context?: ContextInput; elicitation?: (request: unknown) => Promise; + /** + * Optional priority override for scheduling. + * Defaults to agent-level priority when omitted. + */ + trafficPriority?: TrafficPriority; // Parent tracking parentAgentId?: string; @@ -350,6 +359,7 @@ export class Agent { readonly voice?: Voice; readonly retriever?: BaseRetriever; readonly supervisorConfig?: SupervisorConfig; + private readonly trafficPriority: TrafficPriority; private readonly context?: Map; private readonly logger: Logger; @@ -375,6 +385,7 @@ export class Agent { this.temperature = options.temperature; this.maxOutputTokens = options.maxOutputTokens; this.maxSteps = options.maxSteps || 5; + this.trafficPriority = options.trafficPriority ?? "P1"; this.stopWhen = options.stopWhen; this.markdown = options.markdown ?? false; this.voice = options.voice; @@ -450,7 +461,7 @@ export class Agent { ): Promise { const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics const buildRequest = (modelOverride?: LanguageModel) => ({ - metadata: this.buildTrafficMetadata(modelOverride ?? options?.model), // Pass model/provider info for future rate limiting keys + metadata: this.buildTrafficMetadata(modelOverride ?? 
options?.model, options), // Pass model/provider info for future rate limiting keys execute: () => this.executeGenerateText(input, this.mergeOptionsWithModel(options, modelOverride)), // Defer actual execution so controller can schedule it createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), @@ -807,7 +818,7 @@ export class Agent { ): Promise { const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent const buildRequest = (modelOverride?: LanguageModel) => ({ - metadata: this.buildTrafficMetadata(modelOverride ?? options?.model), // Include identifiers to support per-provider/model policies later + metadata: this.buildTrafficMetadata(modelOverride ?? options?.model, options), // Include identifiers to support per-provider/model policies later execute: () => this.executeStreamText(input, this.mergeOptionsWithModel(options, modelOverride)), // Actual streaming work happens after the controller dequeues us createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), @@ -1498,7 +1509,7 @@ export class Agent { ): Promise>> { const controller = getTrafficController({ logger: this.logger }); const buildRequest = (modelOverride?: LanguageModel) => ({ - metadata: this.buildTrafficMetadata(modelOverride ?? options?.model), + metadata: this.buildTrafficMetadata(modelOverride ?? options?.model, options), execute: () => this.executeGenerateObject( input, @@ -1758,7 +1769,7 @@ export class Agent { ): Promise>> { const controller = getTrafficController({ logger: this.logger }); const buildRequest = (modelOverride?: LanguageModel) => ({ - metadata: this.buildTrafficMetadata(modelOverride ?? options?.model), + metadata: this.buildTrafficMetadata(modelOverride ?? options?.model, options), execute: () => this.executeStreamObject(input, schema, this.mergeOptionsWithModel(options, modelOverride)), createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), @@ -2202,6 +2213,7 @@ export class Agent { ): OperationContext { const operationId = randomUUID(); const startTimeDate = new Date(); + const priority = this.resolveTrafficPriority(options); // Prefer reusing an existing context instance to preserve reference across calls/subagents const runtimeContext = toContextMap(options?.context); @@ -2303,6 +2315,7 @@ export class Agent { logger, conversationSteps: options?.parentOperationContext?.conversationSteps || [], abortController, + priority, userId: options?.userId, conversationId: options?.conversationId, parentAgentId: options?.parentAgentId, @@ -3965,17 +3978,43 @@ export class Agent { return this.subAgentManager.calculateMaxSteps(this.maxSteps); } + private resolveTrafficPriority(options?: BaseGenerationOptions): TrafficPriority { + const normalize = (value?: TrafficPriority): TrafficPriority | undefined => { + if (value === "P0" || value === "P1" || value === "P2") { + return value; + } + return undefined; + }; + + const parentPriority = normalize(options?.parentOperationContext?.priority); + const localPriority = normalize(options?.trafficPriority) ?? this.trafficPriority ?? "P1"; + + if (parentPriority) { + return this.pickHigherPriority(parentPriority, localPriority); + } + + return localPriority; + } + + private pickHigherPriority(a: TrafficPriority, b: TrafficPriority): TrafficPriority { + const rank: Record = { P0: 0, P1: 1, P2: 2 }; + return rank[a] <= rank[b] ? 
a : b;
+  }
+
   private buildTrafficMetadata(
     modelOverride?: LanguageModel | DynamicValue<LanguageModel>,
+    options?: BaseGenerationOptions,
   ): TrafficRequestMetadata {
     const provider =
       this.resolveProvider(modelOverride) ?? this.resolveProvider(this.model) ?? undefined;
+    const priority = this.resolveTrafficPriority(options);
 
     return {
       agentId: this.id, // Identify which agent issued the request
       agentName: this.name, // Human-readable label for logs/metrics
       model: this.getModelName(modelOverride), // Used for future capacity policies
       provider, // Allows per-provider throttling later
+      priority,
     };
   }
 
diff --git a/packages/core/src/agent/types.ts b/packages/core/src/agent/types.ts
index dd5fb29d2..21b33d0ec 100644
--- a/packages/core/src/agent/types.ts
+++ b/packages/core/src/agent/types.ts
@@ -29,6 +29,7 @@ import type { Logger } from "@voltagent/internal";
 import type { LocalScorerDefinition, SamplingPolicy } from "../eval/runtime";
 import type { MemoryOptions, MemoryStorageMetadata, WorkingMemorySummary } from "../memory/types";
 import type { VoltAgentObservability } from "../observability";
+import type { TrafficPriority } from "../traffic/traffic-controller";
 import type {
   DynamicValue,
   DynamicValueOptions,
@@ -456,6 +457,11 @@ export type AgentOptions = {
   temperature?: number;
   maxOutputTokens?: number;
   maxSteps?: number;
+  /**
+   * Default scheduling priority for this agent's LLM calls.
+   * Defaults to P1 when unspecified.
+   */
+  trafficPriority?: TrafficPriority;
   /**
    * Default stop condition for step execution (ai-sdk `stopWhen`).
    * Per-call `stopWhen` in method options overrides this.
@@ -914,6 +920,9 @@ export type OperationContext = {
   /** Conversation steps for building full message history including tool calls/results */
   conversationSteps?: StepWithContent[];
 
+  /** Scheduling priority propagated from parent calls */
+  priority?: TrafficPriority;
+
   /** AbortController for cancelling the operation and accessing the signal */
   abortController: AbortController;
 
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 5db71f10e..0aef165ad 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -31,6 +31,7 @@ export {
   type RateLimitOptions,
   type TrafficRequest,
   type TrafficRequestMetadata,
+  type TrafficPriority,
   type TrafficRequestType,
 } from "./traffic/traffic-controller";
 // Export new Agent from agent.ts
diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
new file mode 100644
index 000000000..9b89d4b85
--- /dev/null
+++ b/packages/core/src/traffic/traffic-controller.spec.ts
@@ -0,0 +1,87 @@
+import { describe, expect, it, vi } from "vitest";
+import { TrafficController } from "./traffic-controller";
+
+describe("TrafficController priority scheduling", () => {
+  it("prioritizes P0 over lower priorities when runnable", async () => {
+    const controller = new TrafficController({ maxConcurrent: 1 });
+    const order: string[] = [];
+
+    const p1 = controller.handleText({
+      metadata: { provider: "p", model: "m1", priority: "P1" },
+      execute: async () => {
+        order.push("P1");
+        return "P1";
+      },
+    });
+
+    const p2 = controller.handleText({
+      metadata: { provider: "p", model: "m2", priority: "P2" },
+      execute: async () => {
+        order.push("P2");
+        return "P2";
+      },
+    });
+
+    const p0 = controller.handleText({
+      metadata: { provider: "p", model: "m0", priority: "P0" },
+      execute: async () => {
+        order.push("P0");
+        return "P0";
+      },
+    });
+
+    await Promise.all([p0, p1, p2]);
+
+    expect(order[0]).toBe("P0");
+    expect(order).toEqual(["P0", "P1", "P2"]);
+  });
+
+  it("allows lower priorities to proceed when a higher priority request is rate limited", async () => {
+    vi.useFakeTimers();
+
+    try {
+      const controller = new TrafficController({
+        maxConcurrent: 1,
+        rateLimits: {
+          "p0::m0": { capacity: 1, refillPerSecond: 1 },
+        },
+      });
+
+      // Exhaust the bucket for the P0 key so it initially waits
+      const buckets = (controller as unknown as { rateLimitBuckets: Map<string, unknown> })
+        .rateLimitBuckets;
+      buckets.set("p0::m0", {
+        tokens: 0,
+        capacity: 1,
+        refillPerMs: 1 / 1000,
+        lastRefill: Date.now(),
+      });
+
+      const order: string[] = [];
+
+      const p0 = controller.handleText({
+        metadata: { provider: "p0", model: "m0", priority: "P0" },
+        execute: async () => {
+          order.push("P0");
+          return "P0";
+        },
+      });
+
+      const p1 = controller.handleText({
+        metadata: { provider: "p1", model: "m1", priority: "P1" },
+        execute: async () => {
+          order.push("P1");
+          return "P1";
+        },
+      });
+
+      await vi.runAllTimersAsync();
+      await Promise.all([p0, p1]);
+
+      expect(order[0]).toBe("P1");
+      expect(order[1]).toBe("P0");
+    } finally {
+      vi.useRealTimers();
+    }
+  });
+});
diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts
index b9bc0babc..2432c1471 100644
--- a/packages/core/src/traffic/traffic-controller.ts
+++ b/packages/core/src/traffic/traffic-controller.ts
@@ -58,11 +58,14 @@ export type RateLimitUpdateResult = {
 
 export type TrafficRequestType = "text" | "stream";
 
+export type TrafficPriority = "P0" | "P1" | "P2";
+
 export interface TrafficRequestMetadata {
   agentId?: string;
   agentName?: string;
   model?: string;
   provider?: string;
+  priority?: TrafficPriority;
 }
 
 export interface TrafficRequest<TResponse> {
@@ -90,6 +93,7 @@ interface QueuedRequest<TResponse> {
   attempt?: number;
   circuitKey?: string;
   circuitStatus?: CircuitStateStatus;
+  priority: TrafficPriority;
 }
 
 export interface TrafficControllerOptions {
@@ -111,7 +115,12 @@ export class TrafficController {
   private readonly rateLimitBuckets = new Map<string, RateLimitBucket>();
   private readonly circuitBreakers = new Map<string, CircuitState>();
   private readonly fallbackChains: Map<string, string[]>;
-  private queue: QueuedRequest<unknown>[] = [];
+  private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"];
+  private readonly queues: Record<TrafficPriority, QueuedRequest<unknown>[]> = {
+    P0: [],
+    P1: [],
+    P2: [],
+  };
   private activeCount = 0;
   private drainScheduled = false;
   private refillTimeout?: ReturnType<typeof setTimeout>;
@@ -164,18 +173,20 @@
   ): Promise<TResponse> {
     // Each request gets a promise so callers can await their own result
     return new Promise<TResponse>((resolve, reject) => {
+      const priority = this.resolvePriority(request.metadata);
       // Collect the work item and metadata
-      this.queue.push({
+      this.getQueue(priority).push({
         type,
         request,
         resolve,
         reject,
         attempt: 1,
+        priority,
       });
 
       this.logDebug("[TrafficController] enqueue", {
         type,
-        queueSize: this.queue.length,
+        queueSize: this.getQueueSize(),
         metadata: request.metadata,
       });
 
@@ -190,11 +201,11 @@
     }
 
     this.drainScheduled = true; // Prevent redundant scheduling when many requests arrive at once
-    this.logDebug("[TrafficController] scheduleDrain", { queueSize: this.queue.length });
+    this.logDebug("[TrafficController] scheduleDrain", { queueSize: this.getQueueSize() });
     this.scheduler(() => {
       this.drainScheduled = false;
       this.logDebug("[TrafficController] drainLoopStart", {
-        queueSize: this.queue.length,
+        queueSize: this.getQueueSize(),
         active: this.activeCount,
       });
       this.drainQueue(); // Drain asynchronously so we never block the caller's tick
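
The hunks above introduce the `trafficPriority` option and the per-priority queues; the hunk below rewrites the drain loop around them. A minimal usage sketch of the scheduling contract (hedged: the agent name, instructions, and `judgeModel` are placeholders, not part of this patch):

import { Agent } from "@voltagent/core";

// Background evaluators can default to the lowest class so they yield to user traffic.
const judge = new Agent({
  name: "background-judge",
  model: judgeModel, // placeholder LanguageModel instance
  instructions: "You score outputs offline",
  trafficPriority: "P2", // agent-level default used when a call omits trafficPriority
});

// Per-call overrides win, and resolveTrafficPriority keeps the stricter of the
// parent operation's priority and the local one, so sub-agent calls never run
// at a lower priority than the workflow that spawned them.
await judge.generateText("score this answer", { trafficPriority: "P0" });
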
@@ -203,35 +214,52 @@ export class TrafficController { private drainQueue(): void { // Pull as many items as we can until we hit capacity or rate limits - while (this.queue.length > 0) { - const next = this.queue[0]; // Peek without removing so we only dequeue when we can process - if (!next) { - break; + while (this.hasQueuedWork()) { + if (this.activeCount >= this.maxConcurrent) { + return; } - const decision = this.getProcessDecision(next); - if (decision === "wait") { - return; // Stop early; drain will be rescheduled once capacity frees up - } + let selected: { item: QueuedRequest; priority: TrafficPriority } | undefined; + let skippedItem = false; + + for (const priority of this.priorityOrder) { + const queue = this.getQueue(priority); + if (queue.length === 0) { + continue; + } + + const candidate = queue[0]; + const decision = this.getProcessDecision(candidate); + if (decision === "process") { + selected = { item: candidate, priority }; + break; + } + + if (decision === "skip") { + queue.shift(); // Remove rejected item + skippedItem = true; + break; // Re-evaluate from highest priority after removing + } - this.queue.shift(); // Remove after we've confirmed we can process - if (decision === "skip") { - continue; // Already handled (e.g., circuit open with no fallback) + // If wait, try lower priorities in the same drain cycle } - this.activeCount++; // Track in-flight work to enforce concurrency guard - this.markCircuitTrial(next); // Reserve the half-open trial slot if needed + if (selected) { + const { item, priority } = selected; + this.getQueue(priority).shift(); + this.activeCount++; // Track in-flight work to enforce concurrency guard + this.markCircuitTrial(item); // Reserve the half-open trial slot if needed - this.logDebug("[TrafficController] dispatch", { - type: next.type, - queueSize: this.queue.length, - active: this.activeCount, - etaMs: next.etaMs, - rateLimitKey: next.rateLimitKey, - metadata: next.request.metadata, - }); + void this.runRequest(item); // Fire off processing without blocking the loop + continue; + } + + if (skippedItem) { + continue; // We removed a blocked item; re-evaluate queues + } - void this.runRequest(next); // Fire off processing without blocking the loop + // No runnable work right now; exit until capacity/rate-limit changes + return; } } @@ -391,21 +419,24 @@ export class TrafficController { private countQueuedAheadWithKey(key: string, current: QueuedRequest, logDetails = false): number { let count = 0; - for (const item of this.queue) { - if (item === current) { - break; - } + for (const priority of this.priorityOrder) { + const queue = this.getQueue(priority); + for (const item of queue) { + if (item === current) { + return count; + } - const itemKey = this.buildRateLimitKey(item.request.metadata); - if (itemKey === key) { - count += 1; + const itemKey = this.buildRateLimitKey(item.request.metadata); + if (itemKey === key) { + count += 1; + } } } if (logDetails) { this.logDebug("[TrafficController] countQueuedAheadWithKey", { key, count, - queueSize: this.queue.length, + queueSize: this.getQueueSize(), }); } return count; @@ -786,6 +817,31 @@ export class TrafficController { return unit === "ms" ? 
value : value * 1000; } + private resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority { + const candidate = metadata?.priority; + if (candidate === "P0" || candidate === "P1" || candidate === "P2") { + return candidate; + } + + return "P1"; + } + + private getQueue(priority: TrafficPriority): QueuedRequest[] { + return this.queues[priority]; + } + + private hasQueuedWork(): boolean { + return this.priorityOrder.some((priority) => this.getQueue(priority).length > 0); + } + + private getQueueSize(): number { + let size = 0; + for (const priority of this.priorityOrder) { + size += this.getQueue(priority).length; + } + return size; + } + private scheduleRefill(limit: NormalizedRateLimit): void { if (this.refillTimeout) { return; @@ -796,7 +852,7 @@ export class TrafficController { this.refillTimeout = setTimeout(() => { this.refillTimeout = undefined; // Allow future refills to be scheduled this.logDebug("[TrafficController] refillTimeoutFired", { - queueSize: this.queue.length, + queueSize: this.getQueueSize(), active: this.activeCount, }); this.scheduleDrain(); // Try draining again now that tokens should exist @@ -884,7 +940,6 @@ export class TrafficController { } private async runRequest(item: QueuedRequest): Promise { - const attempt = item.attempt ?? 1; this.logDebug("[TrafficController] runRequest start", { @@ -892,7 +947,7 @@ export class TrafficController { rateLimitKey: item.rateLimitKey, etaMs: item.etaMs, active: this.activeCount, - queueSize: this.queue.length, + queueSize: this.getQueueSize(), }); try { @@ -918,7 +973,7 @@ export class TrafficController { this.logDebug("[TrafficController] runRequest complete", { type: item.type, active: this.activeCount, - queueSize: this.queue.length, + queueSize: this.getQueueSize(), }); this.scheduleDrain(); // Immediately try to pull the next request } @@ -1070,7 +1125,8 @@ export class TrafficController { }); setTimeout(() => { - this.queue.push({ + const retryPriority = item.priority; + this.getQueue(retryPriority).push({ ...item, attempt: nextAttempt, etaMs: undefined, diff --git a/packages/scorers/src/llm/answer-correctness.ts b/packages/scorers/src/llm/answer-correctness.ts index 2111fa31c..bfea8c70c 100644 --- a/packages/scorers/src/llm/answer-correctness.ts +++ b/packages/scorers/src/llm/answer-correctness.ts @@ -84,6 +84,7 @@ export function createAnswerCorrectnessScorer< const agent = new Agent({ name: "answer-correctness-classifier", model, + trafficPriority: "P2", instructions: "You classify statements for answer correctness evaluation", }); diff --git a/packages/scorers/src/llm/answer-relevancy.ts b/packages/scorers/src/llm/answer-relevancy.ts index a3de2237c..efe3ed83a 100644 --- a/packages/scorers/src/llm/answer-relevancy.ts +++ b/packages/scorers/src/llm/answer-relevancy.ts @@ -119,6 +119,7 @@ export function createAnswerRelevancyScorer< const agent = new Agent({ name: "question-generator", model, + trafficPriority: "P2", instructions: "You generate questions from answers to evaluate relevancy", }); diff --git a/packages/scorers/src/llm/classifiers.ts b/packages/scorers/src/llm/classifiers.ts index 1bca42393..81d9b1235 100644 --- a/packages/scorers/src/llm/classifiers.ts +++ b/packages/scorers/src/llm/classifiers.ts @@ -93,6 +93,7 @@ async function evaluateChoice(args: EvaluateChoiceArgs): Promise const agent = new Agent({ name: `${scorerId}-judge`, model, + trafficPriority: "P2", instructions: judgeInstructions ?? 
buildDefaultChoiceInstructions(Object.keys(choices)), }); diff --git a/packages/scorers/src/llm/context-precision.ts b/packages/scorers/src/llm/context-precision.ts index d31b5b851..9483c1a94 100644 --- a/packages/scorers/src/llm/context-precision.ts +++ b/packages/scorers/src/llm/context-precision.ts @@ -109,6 +109,7 @@ export function createContextPrecisionScorer< const agent = new Agent({ name: "context-precision-evaluator", model, + trafficPriority: "P2", instructions: "You evaluate if context was useful for arriving at the answer", }); diff --git a/packages/scorers/src/llm/context-recall.ts b/packages/scorers/src/llm/context-recall.ts index e6e865106..15a981155 100644 --- a/packages/scorers/src/llm/context-recall.ts +++ b/packages/scorers/src/llm/context-recall.ts @@ -120,6 +120,7 @@ export function createContextRecallScorer< const agent = new Agent({ name: "context-recall-evaluator", model, + trafficPriority: "P2", instructions: "You evaluate how well provided context supports factual statements", }); diff --git a/packages/scorers/src/llm/context-relevancy.ts b/packages/scorers/src/llm/context-relevancy.ts index ee882b5b1..2004e68b6 100644 --- a/packages/scorers/src/llm/context-relevancy.ts +++ b/packages/scorers/src/llm/context-relevancy.ts @@ -144,6 +144,7 @@ export function createContextRelevancyScorer< const agent = new Agent({ name: "context-relevancy-evaluator", model, + trafficPriority: "P2", instructions: "You evaluate how relevant provided context is to answering questions", }); diff --git a/packages/scorers/src/llm/moderation.ts b/packages/scorers/src/llm/moderation.ts index 03563bfe4..cfcb46041 100644 --- a/packages/scorers/src/llm/moderation.ts +++ b/packages/scorers/src/llm/moderation.ts @@ -232,6 +232,7 @@ async function runModerationJudge(args: { const agent = new Agent({ name: "moderation-judge", model, + trafficPriority: "P2", instructions: "You are a safety classifier. 
Respond with JSON that matches the provided schema containing flagged, scores, and reason.", }); From 72fc58ab63d9be50578d9601bebdf8dc33ec3d7a Mon Sep 17 00:00:00 2001 From: riturajFi Date: Mon, 1 Dec 2025 15:46:17 +0530 Subject: [PATCH 06/41] feat: tenant id --- packages/core/src/agent/agent.ts | 66 +++++++++++ packages/core/src/agent/eval.ts | 1 + packages/core/src/agent/types.ts | 4 + .../core/src/traffic/traffic-controller.ts | 106 +++++++++++++++++- packages/core/src/workflow/core.ts | 4 + packages/core/src/workflow/internal/state.ts | 2 + packages/core/src/workflow/internal/utils.ts | 1 + packages/core/src/workflow/steps/and-agent.ts | 2 + packages/core/src/workflow/types.ts | 4 + .../scorers/src/llm/answer-correctness.ts | 4 +- packages/scorers/src/llm/answer-relevancy.ts | 4 +- packages/scorers/src/llm/classifiers.ts | 3 + packages/scorers/src/llm/context-precision.ts | 6 +- packages/scorers/src/llm/context-recall.ts | 10 +- packages/scorers/src/llm/context-relevancy.ts | 6 +- packages/scorers/src/llm/moderation.ts | 3 + packages/scorers/src/llm/utils.ts | 14 +++ 17 files changed, 228 insertions(+), 12 deletions(-) create mode 100644 packages/scorers/src/llm/utils.ts diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index 862c0b3e1..51d097011 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -267,6 +267,7 @@ export interface BaseGenerationOptions extends Partial { // Context userId?: string; conversationId?: string; + tenantId?: string; context?: ContextInput; elicitation?: (request: unknown) => Promise; /** @@ -460,10 +461,14 @@ export class Agent { options?: GenerateTextOptions, ): Promise { const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics + const tenantId = this.resolveTenantId(options); const buildRequest = (modelOverride?: LanguageModel) => ({ + tenantId, metadata: this.buildTrafficMetadata(modelOverride ?? options?.model, options), // Pass model/provider info for future rate limiting keys execute: () => this.executeGenerateText(input, this.mergeOptionsWithModel(options, modelOverride)), // Defer actual execution so controller can schedule it + extractUsage: (result: GenerateTextResultWithContext) => + this.extractUsageFromResponse(result), createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), }); @@ -817,10 +822,13 @@ export class Agent { options?: StreamTextOptions, ): Promise { const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent + const tenantId = this.resolveTenantId(options); const buildRequest = (modelOverride?: LanguageModel) => ({ + tenantId, metadata: this.buildTrafficMetadata(modelOverride ?? 
options?.model, options), // Include identifiers to support per-provider/model policies later execute: () => this.executeStreamText(input, this.mergeOptionsWithModel(options, modelOverride)), // Actual streaming work happens after the controller dequeues us + extractUsage: (result: StreamTextResultWithContext) => this.extractUsageFromResponse(result), createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), }); @@ -1508,7 +1516,9 @@ export class Agent { options?: GenerateObjectOptions, ): Promise>> { const controller = getTrafficController({ logger: this.logger }); + const tenantId = this.resolveTenantId(options); const buildRequest = (modelOverride?: LanguageModel) => ({ + tenantId, metadata: this.buildTrafficMetadata(modelOverride ?? options?.model, options), execute: () => this.executeGenerateObject( @@ -1516,6 +1526,8 @@ export class Agent { schema, this.mergeOptionsWithModel(options, modelOverride), ), + extractUsage: (result: GenerateObjectResultWithContext>) => + this.extractUsageFromResponse(result), createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), }); @@ -1768,10 +1780,14 @@ export class Agent { options?: StreamObjectOptions, ): Promise>> { const controller = getTrafficController({ logger: this.logger }); + const tenantId = this.resolveTenantId(options); const buildRequest = (modelOverride?: LanguageModel) => ({ + tenantId, metadata: this.buildTrafficMetadata(modelOverride ?? options?.model, options), execute: () => this.executeStreamObject(input, schema, this.mergeOptionsWithModel(options, modelOverride)), + extractUsage: (result: StreamObjectResultWithContext>) => + this.extractUsageFromResponse(result), createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), }); @@ -2214,6 +2230,7 @@ export class Agent { const operationId = randomUUID(); const startTimeDate = new Date(); const priority = this.resolveTrafficPriority(options); + const tenantId = this.resolveTenantId(options); // Prefer reusing an existing context instance to preserve reference across calls/subagents const runtimeContext = toContextMap(options?.context); @@ -2264,6 +2281,7 @@ export class Agent { operationId, userId: options?.userId, conversationId: options?.conversationId, + tenantId, executionId: operationId, }); @@ -2278,6 +2296,9 @@ export class Agent { parentAgentId: options?.parentAgentId, input, }); + if (tenantId) { + traceContext.getRootSpan().setAttribute("tenant.id", tenantId); + } traceContext.getRootSpan().setAttribute("voltagent.operation_id", operationId); // Use parent's AbortController if available, otherwise create new one @@ -2318,6 +2339,7 @@ export class Agent { priority, userId: options?.userId, conversationId: options?.conversationId, + tenantId, parentAgentId: options?.parentAgentId, traceContext, startTime: startTimeDate, @@ -3996,6 +4018,19 @@ export class Agent { return localPriority; } + private resolveTenantId(options?: BaseGenerationOptions): string { + const parentTenant = options?.parentOperationContext?.tenantId; + if (parentTenant) { + return parentTenant; + } + + if (options?.tenantId) { + return options.tenantId; + } + + return "default"; + } + private pickHigherPriority(a: TrafficPriority, b: TrafficPriority): TrafficPriority { const rank: Record = { P0: 0, P1: 1, P2: 2 }; return rank[a] <= rank[b] ? 
a : b; @@ -4015,6 +4050,7 @@ export class Agent { model: this.getModelName(modelOverride), // Used for future capacity policies provider, // Allows per-provider throttling later priority, + tenantId: this.resolveTenantId(options), }; } @@ -4060,6 +4096,36 @@ export class Agent { }); } + private extractUsageFromResponse( + result: + | { + usage?: LanguageModelUsage | Promise; + totalUsage?: LanguageModelUsage | Promise; + } + | undefined, + ): Promise | LanguageModelUsage | undefined { + if (!result) { + return undefined; + } + + const usageCandidate = + (result as { totalUsage?: LanguageModelUsage | Promise }) + ?.totalUsage ?? + (result as { usage?: LanguageModelUsage | Promise })?.usage; + + if (!usageCandidate) { + return undefined; + } + + if ( + typeof (usageCandidate as PromiseLike).then === "function" + ) { + return (usageCandidate as Promise).catch(() => undefined); + } + + return usageCandidate as LanguageModelUsage; + } + private resolveProvider( model: LanguageModel | DynamicValue | undefined, ): string | undefined { diff --git a/packages/core/src/agent/eval.ts b/packages/core/src/agent/eval.ts index 9e4fe9f2e..de7125058 100644 --- a/packages/core/src/agent/eval.ts +++ b/packages/core/src/agent/eval.ts @@ -711,6 +711,7 @@ function buildEvalPayload( rawOutput: output, userId: oc.userId, conversationId: oc.conversationId, + tenantId: oc.tenantId, traceId: spanContext.traceId, spanId: spanContext.spanId, metadata, diff --git a/packages/core/src/agent/types.ts b/packages/core/src/agent/types.ts index 21b33d0ec..add69edfd 100644 --- a/packages/core/src/agent/types.ts +++ b/packages/core/src/agent/types.ts @@ -499,6 +499,7 @@ export interface AgentEvalPayload { rawOutput?: unknown; userId?: string; conversationId?: string; + tenantId?: string; traceId: string; spanId: string; metadata?: Record; @@ -896,6 +897,9 @@ export type OperationContext = { /** Optional conversation identifier associated with this operation */ conversationId?: string; + /** Optional tenant identifier propagated across nested operations */ + tenantId?: string; + /** User-managed context map for this operation */ readonly context: Map; diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 2432c1471..8d82e8a5d 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -5,6 +5,9 @@ type Scheduler = (callback: () => void) => void; type BivariantHandler = { bivarianceHack(...args: TArgs): void; }["bivarianceHack"]; +type BivariantFunction = { + bivarianceHack(...args: TArgs): TReturn; +}["bivarianceHack"]; type RetryReason = "rateLimit" | "serverError" | "timeout"; @@ -40,6 +43,18 @@ export interface RateLimitOptions { refillPerSecond: number; } +export type TenantUsage = { + inputTokens: number; + outputTokens: number; + totalTokens: number; +}; + +type UsageCounters = { + inputTokens?: number; + outputTokens?: number; + totalTokens?: number; +}; + export type RateLimitKey = string; export type RateLimitConfig = Record; @@ -66,12 +81,18 @@ export interface TrafficRequestMetadata { model?: string; provider?: string; priority?: TrafficPriority; + tenantId?: string; } export interface TrafficRequest { + tenantId: string; metadata?: TrafficRequestMetadata; execute: () => Promise; createFallbackRequest?: (modelId: string) => TrafficRequest | undefined; + extractUsage?: BivariantFunction< + [response: TResponse], + Promise | UsageCounters | undefined + >; } type CircuitStateStatus = "closed" | "open" | 
"half-open"; @@ -94,6 +115,11 @@ interface QueuedRequest { circuitKey?: string; circuitStatus?: CircuitStateStatus; priority: TrafficPriority; + tenantId: string; + extractUsage?: BivariantFunction< + [response: TResponse], + Promise | UsageCounters | undefined + >; } export interface TrafficControllerOptions { @@ -124,6 +150,7 @@ export class TrafficController { private activeCount = 0; private drainScheduled = false; private refillTimeout?: ReturnType; + private readonly tenantUsage = new Map(); private readonly logger: Logger; private logDebug(message: string, details?: Record): void { @@ -158,6 +185,11 @@ export class TrafficController { return this.enqueue("stream", request); } + getTenantUsage(tenantId: string): TenantUsage | undefined { + const usage = this.tenantUsage.get(tenantId); + return usage ? { ...usage } : undefined; + } + private createScheduler(): Scheduler { // Prefer queueMicrotask to keep the drain loop snappy without starving the event loop if (typeof queueMicrotask === "function") { @@ -174,6 +206,11 @@ export class TrafficController { // Each request gets a promise so callers can await their own result return new Promise((resolve, reject) => { const priority = this.resolvePriority(request.metadata); + this.logger.debug("Enqueuing LLM request", { + tenantId: request.tenantId, + type, + priority, + }); // Collect the work item and metadata this.getQueue(priority).push({ type, @@ -182,6 +219,8 @@ export class TrafficController { reject, attempt: 1, priority, + tenantId: request.tenantId, + extractUsage: request.extractUsage, }); this.logDebug("[TrafficController] enqueue", { @@ -931,6 +970,66 @@ export class TrafficController { this.circuitBreakers.set(key, state); } + private recordUsageFromResult( + item: QueuedRequest, + result: TResponse, + ): void { + const extractor = item.extractUsage ?? item.request.extractUsage; + if (!extractor) { + return; + } + + try { + const usageCandidate = extractor(result); + if (!usageCandidate) { + return; + } + + if (this.isPromiseLike(usageCandidate)) { + void Promise.resolve(usageCandidate) + .then((usage) => { + if (usage) { + this.incrementTenantUsage(item.tenantId, usage); + } + }) + .catch((error) => { + this.logger.debug("Failed to record tenant usage", { tenantId: item.tenantId, error }); + }); + return; + } + + this.incrementTenantUsage(item.tenantId, usageCandidate as UsageCounters); + } catch (error) { + this.logger.debug("Failed to record tenant usage", { tenantId: item.tenantId, error }); + } + } + + private incrementTenantUsage(tenantId: string, usage: UsageCounters): void { + const current = this.tenantUsage.get(tenantId) ?? { + inputTokens: 0, + outputTokens: 0, + totalTokens: 0, + }; + const inputTokens = usage.inputTokens ?? 0; + const outputTokens = usage.outputTokens ?? 0; + const totalTokens = usage.totalTokens ?? 
inputTokens + outputTokens; + const updated: TenantUsage = { + inputTokens: current.inputTokens + inputTokens, + outputTokens: current.outputTokens + outputTokens, + totalTokens: current.totalTokens + totalTokens, + }; + this.tenantUsage.set(tenantId, updated); + this.logger.debug("Recorded tenant usage", { tenantId, usage: updated }); + } + + private isPromiseLike(value: unknown): value is PromiseLike { + return ( + typeof value === "object" && + value !== null && + typeof (value as PromiseLike).then === "function" + ); + } + private isCircuitBreakerStatus(status?: number): boolean { if (status === 429) { return true; @@ -953,12 +1052,7 @@ export class TrafficController { try { const result = await item.request.execute(); // Execute the user's operation this.recordCircuitSuccess(item.request.metadata); - // Log raw result coming back from the underlying handler (e.g., AI SDK) - this.logDebug("[TrafficController] runRequest result", { - type: item.type, - rateLimitKey: item.rateLimitKey, - result, - }); + this.recordUsageFromResult(item, result); item.resolve(result); // Deliver successful result back to the waiting caller } catch (error) { this.recordCircuitFailure(item.request.metadata, error); diff --git a/packages/core/src/workflow/core.ts b/packages/core/src/workflow/core.ts index 3136511ca..2b273d588 100644 --- a/packages/core/src/workflow/core.ts +++ b/packages/core/src/workflow/core.ts @@ -827,6 +827,9 @@ export function createWorkflow< // Wrap entire execution in root span const rootSpan = traceContext.getRootSpan(); + if (options?.tenantId) { + rootSpan.setAttribute("tenant.id", options.tenantId); + } // Add workflow state snapshot for remote observability const workflowState = { @@ -848,6 +851,7 @@ export function createWorkflow< executionId, userId: options?.userId, conversationId: options?.conversationId, + tenantId: options?.tenantId, traceId: rootSpan.spanContext().traceId, spanId: rootSpan.spanContext().spanId, }); diff --git a/packages/core/src/workflow/internal/state.ts b/packages/core/src/workflow/internal/state.ts index 71fa602d4..2de12528c 100644 --- a/packages/core/src/workflow/internal/state.ts +++ b/packages/core/src/workflow/internal/state.ts @@ -23,6 +23,7 @@ export type WorkflowState = { executionId: string; conversationId?: string; userId?: string; + tenantId?: string; context?: UserContext; active: number; startAt: Date; @@ -132,6 +133,7 @@ class WorkflowStateManagerInternal implements WorkflowStateManager active: config?.active ?? 0, userId: config?.userId, conversationId: config?.conversationId, + tenantId: config?.tenantId, context: config?.context, startAt: new Date(), endAt: null, diff --git a/packages/core/src/workflow/internal/utils.ts b/packages/core/src/workflow/internal/utils.ts index fc39530b5..42250d828 100644 --- a/packages/core/src/workflow/internal/utils.ts +++ b/packages/core/src/workflow/internal/utils.ts @@ -32,6 +32,7 @@ export function convertWorkflowStateToParam( executionId: state.executionId, conversationId: state.conversationId, userId: state.userId, + tenantId: state.tenantId, context: state.context, active: state.active, startAt: state.startAt, diff --git a/packages/core/src/workflow/steps/and-agent.ts b/packages/core/src/workflow/steps/and-agent.ts index bc46c1480..14af9b8f6 100644 --- a/packages/core/src/workflow/steps/and-agent.ts +++ b/packages/core/src/workflow/steps/and-agent.ts @@ -66,6 +66,7 @@ export function andAgent( context: restConfig.context ?? state.context, conversationId: restConfig.conversationId ?? 
state.conversationId, userId: restConfig.userId ?? state.userId, + tenantId: restConfig.tenantId ?? state.tenantId, // No parentSpan when there's no workflow context }); // Accumulate usage if available (no workflow context) @@ -92,6 +93,7 @@ export function andAgent( context: restConfig.context ?? state.context, conversationId: restConfig.conversationId ?? state.conversationId, userId: restConfig.userId ?? state.userId, + tenantId: restConfig.tenantId ?? state.tenantId, // Pass the current step span as parent for proper span hierarchy parentSpan: state.workflowContext?.currentStepSpan, }); diff --git a/packages/core/src/workflow/types.ts b/packages/core/src/workflow/types.ts index f7eed2823..49bfd8cb4 100644 --- a/packages/core/src/workflow/types.ts +++ b/packages/core/src/workflow/types.ts @@ -214,6 +214,10 @@ export interface WorkflowRunOptions { * The conversation ID, this can be used to track the current conversation in a workflow */ conversationId?: string; + /** + * Tenant identifier propagated to agent steps and subcalls + */ + tenantId?: string; /** * The user ID, this can be used to track the current user in a workflow */ diff --git a/packages/scorers/src/llm/answer-correctness.ts b/packages/scorers/src/llm/answer-correctness.ts index bfea8c70c..d66cc0079 100644 --- a/packages/scorers/src/llm/answer-correctness.ts +++ b/packages/scorers/src/llm/answer-correctness.ts @@ -7,6 +7,7 @@ import { import { safeStringify } from "@voltagent/internal/utils"; import type { LanguageModel } from "ai"; import { z } from "zod"; +import { extractTenantId } from "./utils"; const ANSWER_CORRECTNESS_PROMPT = `Given a ground truth and an answer, analyze each statement in the answer and classify them in one of the following categories: @@ -88,12 +89,13 @@ export function createAnswerCorrectnessScorer< instructions: "You classify statements for answer correctness evaluation", }); + const tenantId = extractTenantId(context); const payload = resolvePayload(context, buildPayload); const prompt = ANSWER_CORRECTNESS_PROMPT.replace("{{question}}", payload.input) .replace("{{answer}}", payload.output) .replace("{{ground_truth}}", payload.expected); - const response = await agent.generateObject(prompt, CLASSIFICATION_SCHEMA); + const response = await agent.generateObject(prompt, CLASSIFICATION_SCHEMA, { tenantId }); const normalized = normalizeClassification(response.object); return { diff --git a/packages/scorers/src/llm/answer-relevancy.ts b/packages/scorers/src/llm/answer-relevancy.ts index efe3ed83a..d9bda1c9a 100644 --- a/packages/scorers/src/llm/answer-relevancy.ts +++ b/packages/scorers/src/llm/answer-relevancy.ts @@ -8,6 +8,7 @@ import { import { safeStringify } from "@voltagent/internal/utils"; import type { LanguageModel } from "ai"; import { z } from "zod"; +import { extractTenantId } from "./utils"; const QUESTION_GEN_PROMPT = `Generate a question for the given answer and Identify if answer is noncommittal. Give noncommittal as 1 if the answer is noncommittal and 0 if the answer is committal. A noncommittal answer is one that is evasive, vague, or ambiguous. 
For example, "I don't know" or "I'm not sure" are noncommittal answers @@ -123,6 +124,7 @@ export function createAnswerRelevancyScorer< instructions: "You generate questions from answers to evaluate relevancy", }); + const tenantId = extractTenantId(context); const payload = resolvePayload(context, buildPayload); const questions: GeneratedQuestion[] = []; @@ -132,7 +134,7 @@ export function createAnswerRelevancyScorer< payload.context, ); - const response = await agent.generateObject(prompt, QUESTION_SCHEMA); + const response = await agent.generateObject(prompt, QUESTION_SCHEMA, { tenantId }); questions.push({ question: response.object.question, noncommittal: response.object.noncommittal === 1, diff --git a/packages/scorers/src/llm/classifiers.ts b/packages/scorers/src/llm/classifiers.ts index 81d9b1235..a327e20d4 100644 --- a/packages/scorers/src/llm/classifiers.ts +++ b/packages/scorers/src/llm/classifiers.ts @@ -7,6 +7,7 @@ import { } from "@voltagent/core"; import { safeStringify } from "@voltagent/internal/utils"; import { z } from "zod"; +import { extractTenantId } from "./utils"; type ChoiceId = string; @@ -97,8 +98,10 @@ async function evaluateChoice(args: EvaluateChoiceArgs): Promise instructions: judgeInstructions ?? buildDefaultChoiceInstructions(Object.keys(choices)), }); + const tenantId = extractTenantId(context); const response = await agent.generateObject(prompt, CHOICE_RESPONSE_SCHEMA, { maxOutputTokens, + tenantId, }); const { choice, reason } = extractChoiceFromResponse(response.object, choices, scorerId); diff --git a/packages/scorers/src/llm/context-precision.ts b/packages/scorers/src/llm/context-precision.ts index 9483c1a94..ba680f560 100644 --- a/packages/scorers/src/llm/context-precision.ts +++ b/packages/scorers/src/llm/context-precision.ts @@ -7,6 +7,7 @@ import { import { safeStringify } from "@voltagent/internal/utils"; import type { LanguageModel } from "ai"; import { z } from "zod"; +import { extractTenantId } from "./utils"; const CONTEXT_PRECISION_PROMPT = `Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output. @@ -117,12 +118,15 @@ export function createContextPrecisionScorer< const contextText = Array.isArray(payload.context) ? payload.context.join("\n") : payload.context; + const tenantId = extractTenantId(context); const prompt = CONTEXT_PRECISION_PROMPT.replace("{{question}}", payload.input) .replace("{{context}}", contextText) .replace("{{answer}}", payload.output); - const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA); + const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA, { + tenantId, + }); context.results.raw.contextPrecisionVerdict = response.object; diff --git a/packages/scorers/src/llm/context-recall.ts b/packages/scorers/src/llm/context-recall.ts index 15a981155..2c6053fc9 100644 --- a/packages/scorers/src/llm/context-recall.ts +++ b/packages/scorers/src/llm/context-recall.ts @@ -7,6 +7,7 @@ import { import { safeStringify } from "@voltagent/internal/utils"; import type { LanguageModel } from "ai"; import { z } from "zod"; +import { extractTenantId } from "./utils"; const CONTEXT_RECALL_EXTRACT_PROMPT = `Given the context and ground truth (expected output), extract all factual statements from the ground truth. @@ -128,6 +129,7 @@ export function createContextRecallScorer< const contextText = Array.isArray(payload.context) ? 
payload.context.join("\n") : payload.context; + const tenantId = extractTenantId(context); // Extract statements from expected output const extractPrompt = CONTEXT_RECALL_EXTRACT_PROMPT.replace( @@ -135,7 +137,9 @@ export function createContextRecallScorer< contextText, ).replace("{{expected}}", payload.expected); - const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA); + const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA, { + tenantId, + }); const statements = extractResponse.object.statements; if (statements.length === 0) { @@ -153,7 +157,9 @@ export function createContextRecallScorer< contextText, ).replace("{{statement}}", statement); - const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA); + const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA, { + tenantId, + }); verdicts.push({ statement, verdict: verifyResponse.object.verdict, diff --git a/packages/scorers/src/llm/context-relevancy.ts b/packages/scorers/src/llm/context-relevancy.ts index 2004e68b6..aca608b25 100644 --- a/packages/scorers/src/llm/context-relevancy.ts +++ b/packages/scorers/src/llm/context-relevancy.ts @@ -7,6 +7,7 @@ import { import { safeStringify } from "@voltagent/internal/utils"; import type { LanguageModel } from "ai"; import { z } from "zod"; +import { extractTenantId } from "./utils"; const CONTEXT_RELEVANCY_PROMPT = `Analyze the provided context and identify which parts are relevant to answering the given question. For each context sentence or passage, determine its relevance level. @@ -152,13 +153,16 @@ export function createContextRelevancyScorer< const contextText = Array.isArray(payload.context) ? payload.context.join("\n") : payload.context; + const tenantId = extractTenantId(context); const prompt = CONTEXT_RELEVANCY_PROMPT.replace("{{question}}", payload.input).replace( "{{context}}", contextText, ); - const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA); + const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA, { + tenantId, + }); const evaluations = response.object.evaluations; context.results.raw.contextRelevancyEvaluations = evaluations; diff --git a/packages/scorers/src/llm/moderation.ts b/packages/scorers/src/llm/moderation.ts index cfcb46041..1055927f2 100644 --- a/packages/scorers/src/llm/moderation.ts +++ b/packages/scorers/src/llm/moderation.ts @@ -7,6 +7,7 @@ import { } from "@voltagent/core"; import { safeStringify } from "@voltagent/internal/utils"; import { z } from "zod"; +import { extractTenantId } from "./utils"; export interface ModerationScorerOptions { id?: string; @@ -220,6 +221,7 @@ async function runModerationJudge(args: { typeof context.results.prepare === "string" ? 
context.results.prepare : normalizeText(context.payload.output);
+  const tenantId = extractTenantId(context);
 
   const prompt = await buildPrompt({
     output: normalizedOutput,
@@ -239,6 +241,7 @@ async function runModerationJudge(args: {
 
   const response = await agent.generateObject(prompt, MODERATION_SCHEMA, {
     maxOutputTokens,
+    tenantId,
   });
 
   const parsed = mapModerationResponse(response.object, threshold);
diff --git a/packages/scorers/src/llm/utils.ts b/packages/scorers/src/llm/utils.ts
new file mode 100644
index 000000000..75e886e3e
--- /dev/null
+++ b/packages/scorers/src/llm/utils.ts
@@ -0,0 +1,14 @@
+import type { BuilderPrepareContext, BuilderScoreContext } from "@voltagent/core";
+
+type TenantAwareContext = BuilderScoreContext<Record<string, unknown>, Record<string, unknown>> &
+  BuilderPrepareContext<Record<string, unknown>, Record<string, unknown>>;
+
+export function extractTenantId(
+  context:
+    | BuilderScoreContext<Record<string, unknown>, Record<string, unknown>>
+    | BuilderPrepareContext<Record<string, unknown>, Record<string, unknown>>
+    | TenantAwareContext,
+): string | undefined {
+  const candidate = (context.payload as { tenantId?: unknown })?.tenantId;
+  return typeof candidate === "string" ? candidate : undefined;
+}

From 3e517752542a0d7836aa57c4539f4d579c1bc6ff Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Sat, 6 Dec 2025 16:15:50 +0530
Subject: [PATCH 07/41] feat: minor fix

---
 commits.txt                                   |  6 +++++
 examples/with-client-side-tools/next-env.d.ts |  1 +
 examples/with-client-side-tools/tsconfig.json | 24 +++++++++++++----
 .../netlify/functions/voltagent.js            |  4 +++
 examples/with-netlify-functions/src/index.js  | 17 ++++++++++++
 .../with-netlify-functions/src/tools/index.js | 26 +++++++++++++++++++
 6 files changed, 73 insertions(+), 5 deletions(-)
 create mode 100644 commits.txt
 create mode 100644 examples/with-netlify-functions/netlify/functions/voltagent.js
 create mode 100644 examples/with-netlify-functions/src/index.js
 create mode 100644 examples/with-netlify-functions/src/tools/index.js

diff --git a/commits.txt b/commits.txt
new file mode 100644
index 000000000..73fd43c52
--- /dev/null
+++ b/commits.txt
@@ -0,0 +1,6 @@
+e8443df2
+9503a0a6
+293fe825
+a88ecd67
+66d74dd2
+53f34370
\ No newline at end of file
diff --git a/examples/with-client-side-tools/next-env.d.ts b/examples/with-client-side-tools/next-env.d.ts
index 1b3be0840..9edff1c7c 100644
--- a/examples/with-client-side-tools/next-env.d.ts
+++ b/examples/with-client-side-tools/next-env.d.ts
@@ -1,5 +1,6 @@
 /// <reference types="next" />
 /// <reference types="next/image-types/global" />
+import "./.next/types/routes.d.ts";
 
 // NOTE: This file should not be edited
 // see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
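
A quick aside on the `extractTenantId` helper that the previous patch added in `packages/scorers/src/llm/utils.ts`: the pattern is small enough to show standalone. In the sketch below, `ScorerContext` is a simplified stand-in for the real `BuilderScoreContext`/`BuilderPrepareContext` generics (an assumption for illustration only), and the judge call is left as a comment because it needs a live agent:

```ts
// Minimal stand-in for the builder context types; only `payload` matters here.
type ScorerContext = { payload: Record<string, unknown> };

function extractTenantId(context: ScorerContext): string | undefined {
  const candidate = (context.payload as { tenantId?: unknown })?.tenantId;
  return typeof candidate === "string" ? candidate : undefined;
}

const scored: ScorerContext = { payload: { tenantId: "acme", output: "..." } };
const malformed: ScorerContext = { payload: { tenantId: 42 } };

console.log(extractTenantId(scored)); // "acme"
console.log(extractTenantId(malformed)); // undefined; non-string values are ignored
// A scorer then forwards the value on its judge call:
// await agent.generateObject(prompt, SCHEMA, { tenantId: extractTenantId(context) });
```

Returning `undefined` for non-string values means a malformed payload degrades to untagged usage rather than failing the evaluation mid-run.
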
diff --git a/examples/with-client-side-tools/tsconfig.json b/examples/with-client-side-tools/tsconfig.json index 3697fcb9b..0fca67d34 100644 --- a/examples/with-client-side-tools/tsconfig.json +++ b/examples/with-client-side-tools/tsconfig.json @@ -1,6 +1,10 @@ { "compilerOptions": { - "lib": ["dom", "dom.iterable", "esnext"], + "lib": [ + "dom", + "dom.iterable", + "esnext" + ], "allowJs": true, "skipLibCheck": true, "strict": true, @@ -11,7 +15,7 @@ "resolveJsonModule": true, "isolatedModules": true, "sourceMap": true, - "jsx": "preserve", + "jsx": "react-jsx", "incremental": true, "plugins": [ { @@ -19,10 +23,20 @@ } ], "paths": { - "@/*": ["./*"] + "@/*": [ + "./*" + ] }, "target": "ES2017" }, - "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], - "exclude": ["node_modules"] + "include": [ + "next-env.d.ts", + "**/*.ts", + "**/*.tsx", + ".next/types/**/*.ts", + ".next/dev/types/**/*.ts" + ], + "exclude": [ + "node_modules" + ] } diff --git a/examples/with-netlify-functions/netlify/functions/voltagent.js b/examples/with-netlify-functions/netlify/functions/voltagent.js new file mode 100644 index 000000000..0ec386b8f --- /dev/null +++ b/examples/with-netlify-functions/netlify/functions/voltagent.js @@ -0,0 +1,4 @@ +import { createNetlifyFunctionHandler } from "@voltagent/serverless-hono"; +import { getVoltAgent } from "../../src/index"; +const voltAgent = getVoltAgent(); +export const handler = createNetlifyFunctionHandler(voltAgent); diff --git a/examples/with-netlify-functions/src/index.js b/examples/with-netlify-functions/src/index.js new file mode 100644 index 000000000..af385b506 --- /dev/null +++ b/examples/with-netlify-functions/src/index.js @@ -0,0 +1,17 @@ +import { openai } from "@ai-sdk/openai"; +import { Agent, VoltAgent } from "@voltagent/core"; +import { serverlessHono } from "@voltagent/serverless-hono"; +import { weatherTool } from "./tools"; +const agent = new Agent({ + name: "netlify-function-agent", + instructions: "Help the user quickly and call tools when needed.", + model: openai("gpt-4o-mini"), + tools: [weatherTool], +}); +const voltAgent = new VoltAgent({ + agents: { agent }, + serverless: serverlessHono(), +}); +export function getVoltAgent() { + return voltAgent; +} diff --git a/examples/with-netlify-functions/src/tools/index.js b/examples/with-netlify-functions/src/tools/index.js new file mode 100644 index 000000000..d1c5bf43b --- /dev/null +++ b/examples/with-netlify-functions/src/tools/index.js @@ -0,0 +1,26 @@ +import { createTool } from "@voltagent/core"; +import z from "zod"; +export const weatherTool = createTool({ + id: "get-weather", + name: "getWeather", + description: "Return a mock weather report for the requested location", + parameters: z.object({ + location: z.string().describe("City or location to look up"), + }), + execute: async ({ location }, context) => { + context?.logger.info(`Fetching weather for ${location}`); + const mockWeatherData = { + location, + temperature: Math.floor(Math.random() * 30) + 5, + condition: ["Sunny", "Cloudy", "Rainy", "Snowy", "Partly Cloudy"][ + Math.floor(Math.random() * 5) + ], + humidity: Math.floor(Math.random() * 60) + 30, + windSpeed: Math.floor(Math.random() * 30), + }; + return { + weather: mockWeatherData, + message: `Current weather in ${location}: ${mockWeatherData.temperature}°C and ${mockWeatherData.condition.toLowerCase()} with ${mockWeatherData.humidity}% humidity and wind speed of ${mockWeatherData.windSpeed} km/h.`, + }; + }, +}); From 
1ee665757a902f31f47d0f40803f0405246cd873 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Sat, 13 Dec 2025 13:09:36 +0530 Subject: [PATCH 08/41] feat: final v0 --- diff.txt | 2570 +++++++++++++++++ packages/core/src/agent/agent.ts | 19 +- .../src/traffic/traffic-controller.spec.ts | 140 +- .../core/src/traffic/traffic-controller.ts | 468 ++- tmp/test/traffic-concurrency.ts | 91 + tmp/test/traffic-fallback-chain.ts | 168 ++ tmp/test/traffic-priority-openai-real.ts | 117 + tmp/test/traffic-priority-openai-sim.ts | 111 + tmp/test/traffic-priority.ts | 156 + tmp/test/traffic-rate-limit-from-headers.ts | 158 + tmp/test/traffic-rate-limit-static.ts | 144 + tmp/test/traffic-retry-behavior.ts | 169 ++ tmp/test/traffic-tenant-usage.ts | 71 + tmp/test/traffic-text-vs-stream.ts | 128 + 14 files changed, 4256 insertions(+), 254 deletions(-) create mode 100644 diff.txt create mode 100644 tmp/test/traffic-concurrency.ts create mode 100644 tmp/test/traffic-fallback-chain.ts create mode 100644 tmp/test/traffic-priority-openai-real.ts create mode 100644 tmp/test/traffic-priority-openai-sim.ts create mode 100644 tmp/test/traffic-priority.ts create mode 100644 tmp/test/traffic-rate-limit-from-headers.ts create mode 100644 tmp/test/traffic-rate-limit-static.ts create mode 100644 tmp/test/traffic-retry-behavior.ts create mode 100644 tmp/test/traffic-tenant-usage.ts create mode 100644 tmp/test/traffic-text-vs-stream.ts diff --git a/diff.txt b/diff.txt new file mode 100644 index 000000000..8e81b0333 --- /dev/null +++ b/diff.txt @@ -0,0 +1,2570 @@ +diff --git a/commits.txt b/commits.txt +new file mode 100644 +index 00000000..73fd43c5 +--- /dev/null ++++ b/commits.txt +@@ -0,0 +1,6 @@ ++e8443df2 ++9503a0a6 ++293fe825 ++a88ecd67 ++66d74dd2 ++53f34370 +\ No newline at end of file +diff --git a/examples/with-client-side-tools/next-env.d.ts b/examples/with-client-side-tools/next-env.d.ts +index 1b3be084..9edff1c7 100644 +--- a/examples/with-client-side-tools/next-env.d.ts ++++ b/examples/with-client-side-tools/next-env.d.ts +@@ -1,5 +1,6 @@ + /// + /// ++import "./.next/types/routes.d.ts"; + + // NOTE: This file should not be edited + // see https://nextjs.org/docs/app/api-reference/config/typescript for more information. 
+diff --git a/examples/with-client-side-tools/tsconfig.json b/examples/with-client-side-tools/tsconfig.json +index 3697fcb9..0fca67d3 100644 +--- a/examples/with-client-side-tools/tsconfig.json ++++ b/examples/with-client-side-tools/tsconfig.json +@@ -1,6 +1,10 @@ + { + "compilerOptions": { +- "lib": ["dom", "dom.iterable", "esnext"], ++ "lib": [ ++ "dom", ++ "dom.iterable", ++ "esnext" ++ ], + "allowJs": true, + "skipLibCheck": true, + "strict": true, +@@ -11,7 +15,7 @@ + "resolveJsonModule": true, + "isolatedModules": true, + "sourceMap": true, +- "jsx": "preserve", ++ "jsx": "react-jsx", + "incremental": true, + "plugins": [ + { +@@ -19,10 +23,20 @@ + } + ], + "paths": { +- "@/*": ["./*"] ++ "@/*": [ ++ "./*" ++ ] + }, + "target": "ES2017" + }, +- "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], +- "exclude": ["node_modules"] ++ "include": [ ++ "next-env.d.ts", ++ "**/*.ts", ++ "**/*.tsx", ++ ".next/types/**/*.ts", ++ ".next/dev/types/**/*.ts" ++ ], ++ "exclude": [ ++ "node_modules" ++ ] + } +diff --git a/examples/with-netlify-functions/netlify/functions/voltagent.js b/examples/with-netlify-functions/netlify/functions/voltagent.js +new file mode 100644 +index 00000000..0ec386b8 +--- /dev/null ++++ b/examples/with-netlify-functions/netlify/functions/voltagent.js +@@ -0,0 +1,4 @@ ++import { createNetlifyFunctionHandler } from "@voltagent/serverless-hono"; ++import { getVoltAgent } from "../../src/index"; ++const voltAgent = getVoltAgent(); ++export const handler = createNetlifyFunctionHandler(voltAgent); +diff --git a/examples/with-netlify-functions/src/index.js b/examples/with-netlify-functions/src/index.js +new file mode 100644 +index 00000000..af385b50 +--- /dev/null ++++ b/examples/with-netlify-functions/src/index.js +@@ -0,0 +1,17 @@ ++import { openai } from "@ai-sdk/openai"; ++import { Agent, VoltAgent } from "@voltagent/core"; ++import { serverlessHono } from "@voltagent/serverless-hono"; ++import { weatherTool } from "./tools"; ++const agent = new Agent({ ++ name: "netlify-function-agent", ++ instructions: "Help the user quickly and call tools when needed.", ++ model: openai("gpt-4o-mini"), ++ tools: [weatherTool], ++}); ++const voltAgent = new VoltAgent({ ++ agents: { agent }, ++ serverless: serverlessHono(), ++}); ++export function getVoltAgent() { ++ return voltAgent; ++} +diff --git a/examples/with-netlify-functions/src/tools/index.js b/examples/with-netlify-functions/src/tools/index.js +new file mode 100644 +index 00000000..d1c5bf43 +--- /dev/null ++++ b/examples/with-netlify-functions/src/tools/index.js +@@ -0,0 +1,26 @@ ++import { createTool } from "@voltagent/core"; ++import z from "zod"; ++export const weatherTool = createTool({ ++ id: "get-weather", ++ name: "getWeather", ++ description: "Return a mock weather report for the requested location", ++ parameters: z.object({ ++ location: z.string().describe("City or location to look up"), ++ }), ++ execute: async ({ location }, context) => { ++ context?.logger.info(`Fetching weather for ${location}`); ++ const mockWeatherData = { ++ location, ++ temperature: Math.floor(Math.random() * 30) + 5, ++ condition: ["Sunny", "Cloudy", "Rainy", "Snowy", "Partly Cloudy"][ ++ Math.floor(Math.random() * 5) ++ ], ++ humidity: Math.floor(Math.random() * 60) + 30, ++ windSpeed: Math.floor(Math.random() * 30), ++ }; ++ return { ++ weather: mockWeatherData, ++ message: `Current weather in ${location}: ${mockWeatherData.temperature}°C and ${mockWeatherData.condition.toLowerCase()} with ${mockWeatherData.humidity}% 
humidity and wind speed of ${mockWeatherData.windSpeed} km/h.`, ++ }; ++ }, ++}); +diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts +index 95a6a413..2486335c 100644 +--- a/packages/core/src/agent/agent.ts ++++ b/packages/core/src/agent/agent.ts +@@ -48,6 +48,11 @@ import type { BaseRetriever } from "../retriever/retriever"; + import type { Tool, Toolkit } from "../tool"; + import { createTool } from "../tool"; + import { ToolManager } from "../tool/manager"; ++import { ++ type TrafficPriority, ++ type TrafficRequestMetadata, ++ getTrafficController, ++} from "../traffic/traffic-controller"; + import { randomUUID } from "../utils/id"; + import { convertModelMessagesToUIMessages } from "../utils/message-converter"; + import { NodeType, createNodeId } from "../utils/node-utils"; +@@ -262,8 +267,14 @@ export interface BaseGenerationOptions extends Partial { + // Context + userId?: string; + conversationId?: string; ++ tenantId?: string; + context?: ContextInput; + elicitation?: (request: unknown) => Promise; ++ /** ++ * Optional priority override for scheduling. ++ * Defaults to agent-level priority when omitted. ++ */ ++ trafficPriority?: TrafficPriority; + + // Parent tracking + parentAgentId?: string; +@@ -303,6 +314,8 @@ export interface BaseGenerationOptions extends Partial { + + // Provider-specific options + providerOptions?: ProviderOptions; ++ // Optional per-call model override (used for fallbacks) ++ model?: LanguageModel; + + // Experimental output (for structured generation) + experimental_output?: ReturnType | ReturnType; +@@ -347,6 +360,7 @@ export class Agent { + readonly voice?: Voice; + readonly retriever?: BaseRetriever; + readonly supervisorConfig?: SupervisorConfig; ++ private readonly trafficPriority: TrafficPriority; + private readonly context?: Map; + + private readonly logger: Logger; +@@ -372,6 +386,7 @@ export class Agent { + this.temperature = options.temperature; + this.maxOutputTokens = options.maxOutputTokens; + this.maxSteps = options.maxSteps || 5; ++ this.trafficPriority = options.trafficPriority ?? "P1"; + this.stopWhen = options.stopWhen; + this.markdown = options.markdown ?? false; + this.voice = options.voice; +@@ -444,6 +459,26 @@ export class Agent { + async generateText( + input: string | UIMessage[] | BaseMessage[], + options?: GenerateTextOptions, ++ ): Promise { ++ const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics ++ const tenantId = this.resolveTenantId(options); ++ const buildRequest = (modelOverride?: LanguageModel) => ({ ++ tenantId, ++ metadata: this.buildTrafficMetadata(modelOverride ?? 
options?.model, options), // Pass model/provider info for future rate limiting keys ++ execute: () => ++ this.executeGenerateText(input, this.mergeOptionsWithModel(options, modelOverride)), // Defer actual execution so controller can schedule it ++ extractUsage: (result: GenerateTextResultWithContext) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), ++ }); ++ ++ return controller.handleText(buildRequest(options?.model)); ++ } ++ ++ private async executeGenerateText( ++ input: string | UIMessage[] | BaseMessage[], ++ options?: GenerateTextOptions, ++ trafficMetadata?: TrafficRequestMetadata, + ): Promise { + const startTime = Date.now(); + const oc = this.createOperationContext(input, options); +@@ -471,7 +506,7 @@ export class Agent { + options, + ); + +- const modelName = this.getModelName(); ++ const modelName = this.getModelName(model); + const contextLimit = options?.contextLimit; + + // Add model attributes and all options +@@ -546,8 +581,10 @@ export class Agent { + tools: userTools, + experimental_output, + providerOptions, ++ model: _model, // Exclude model so aiSDKOptions doesn't override resolved model + ...aiSDKOptions + } = options || {}; ++ void _model; + + const llmSpan = this.createLLMSpan(oc, { + operation: "generateText", +@@ -567,6 +604,11 @@ export class Agent { + + let result!: GenerateTextResult; + try { ++ methodLogger.info("[AI SDK] Calling generateText", { ++ messageCount: messages.length, ++ modelName, ++ tools: tools ? Object.keys(tools) : [], ++ }); + result = await oc.traceContext.withSpan(llmSpan, () => + generateText({ + model, +@@ -575,7 +617,7 @@ export class Agent { + // Default values + temperature: this.temperature, + maxOutputTokens: this.maxOutputTokens, +- maxRetries: 3, ++ maxRetries: 0, + stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps), + // User overrides from AI SDK options + ...aiSDKOptions, +@@ -588,6 +630,13 @@ export class Agent { + onStepFinish: this.createStepHandler(oc, options), + }), + ); ++ methodLogger.info("[AI SDK] Received generateText result", { ++ finishReason: result.finishReason, ++ usage: result.usage ? safeStringify(result.usage) : undefined, ++ stepCount: result.steps?.length ?? 0, ++ rawResult: safeStringify(result), ++ }); ++ this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger); + } catch (error) { + finalizeLLMSpan(SpanStatusCode.ERROR, { message: (error as Error).message }); + throw error; +@@ -771,6 +820,25 @@ export class Agent { + async streamText( + input: string | UIMessage[] | BaseMessage[], + options?: StreamTextOptions, ++ ): Promise { ++ const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent ++ const tenantId = this.resolveTenantId(options); ++ const buildRequest = (modelOverride?: LanguageModel) => ({ ++ tenantId, ++ metadata: this.buildTrafficMetadata(modelOverride ?? 
options?.model, options), // Include identifiers to support per-provider/model policies later ++ execute: () => ++ this.executeStreamText(input, this.mergeOptionsWithModel(options, modelOverride)), // Actual streaming work happens after the controller dequeues us ++ extractUsage: (result: StreamTextResultWithContext) => this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), ++ }); ++ ++ return controller.handleStream(buildRequest(options?.model)); ++ } ++ ++ private async executeStreamText( ++ input: string | UIMessage[] | BaseMessage[], ++ options?: StreamTextOptions, ++ trafficMetadata?: TrafficRequestMetadata, + ): Promise { + const startTime = Date.now(); + const oc = this.createOperationContext(input, options); +@@ -800,7 +868,7 @@ export class Agent { + options, + ); + +- const modelName = this.getModelName(); ++ const modelName = this.getModelName(model); + const contextLimit = options?.contextLimit; + + // Add model attributes to root span if TraceContext exists +@@ -870,8 +938,10 @@ export class Agent { + onFinish: userOnFinish, + experimental_output, + providerOptions, ++ model: _model, // Exclude model from aiSDKOptions to avoid overriding resolved model + ...aiSDKOptions + } = options || {}; ++ void _model; + + const guardrailStreamingEnabled = guardrailSet.output.length > 0; + +@@ -894,6 +964,11 @@ export class Agent { + }); + const finalizeLLMSpan = this.createLLMSpanFinalizer(llmSpan); + ++ methodLogger.info("[AI SDK] Calling streamText", { ++ messageCount: messages.length, ++ modelName, ++ tools: tools ? Object.keys(tools) : [], ++ }); + const result = streamText({ + model, + messages, +@@ -901,7 +976,7 @@ export class Agent { + // Default values + temperature: this.temperature, + maxOutputTokens: this.maxOutputTokens, +- maxRetries: 3, ++ maxRetries: 0, // Retry via traffic controller to avoid provider-level storms + stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps), + // User overrides from AI SDK options + ...aiSDKOptions, +@@ -962,6 +1037,17 @@ export class Agent { + .catch(() => {}); + }, + onFinish: async (finalResult) => { ++ methodLogger.info("[AI SDK] streamText finished", { ++ finishReason: finalResult.finishReason, ++ usage: finalResult.totalUsage ? safeStringify(finalResult.totalUsage) : undefined, ++ stepCount: finalResult.steps?.length ?? 0, ++ rawResult: safeStringify(finalResult), ++ }); ++ this.updateTrafficControllerRateLimits( ++ finalResult.response, ++ trafficMetadata, ++ methodLogger, ++ ); + const providerUsage = finalResult.usage + ? await Promise.resolve(finalResult.usage) + : undefined; +@@ -1428,6 +1514,30 @@ export class Agent { + input: string | UIMessage[] | BaseMessage[], + schema: T, + options?: GenerateObjectOptions, ++ ): Promise>> { ++ const controller = getTrafficController({ logger: this.logger }); ++ const tenantId = this.resolveTenantId(options); ++ const buildRequest = (modelOverride?: LanguageModel) => ({ ++ tenantId, ++ metadata: this.buildTrafficMetadata(modelOverride ?? 
options?.model, options), ++ execute: () => ++ this.executeGenerateObject( ++ input, ++ schema, ++ this.mergeOptionsWithModel(options, modelOverride), ++ ), ++ extractUsage: (result: GenerateObjectResultWithContext>) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), ++ }); ++ ++ return controller.handleText(buildRequest(options?.model)); ++ } ++ ++ private async executeGenerateObject( ++ input: string | UIMessage[] | BaseMessage[], ++ schema: T, ++ options?: GenerateObjectOptions, + ): Promise>> { + const startTime = Date.now(); + const oc = this.createOperationContext(input, options); +@@ -1452,7 +1562,7 @@ export class Agent { + options, + ); + +- const modelName = this.getModelName(); ++ const modelName = this.getModelName(model); + const schemaName = schema.description || "unknown"; + + // Add model attributes and all options +@@ -1511,9 +1621,16 @@ export class Agent { + maxSteps: userMaxSteps, + tools: userTools, + providerOptions, ++ model: _model, // Exclude model so spread does not override resolved model + ...aiSDKOptions + } = options || {}; ++ void _model; + ++ methodLogger.info("[AI SDK] Calling generateObject", { ++ messageCount: messages.length, ++ modelName, ++ schemaName, ++ }); + const result = await generateObject({ + model, + messages, +@@ -1522,7 +1639,7 @@ export class Agent { + // Default values + maxOutputTokens: this.maxOutputTokens, + temperature: this.temperature, +- maxRetries: 3, ++ maxRetries: 0, + // User overrides from AI SDK options + ...aiSDKOptions, + // Provider-specific options +@@ -1530,6 +1647,12 @@ export class Agent { + // VoltAgent controlled + abortSignal: oc.abortController.signal, + }); ++ methodLogger.info("[AI SDK] Received generateObject result", { ++ finishReason: result.finishReason, ++ usage: result.usage ? safeStringify(result.usage) : undefined, ++ warnings: result.warnings, ++ rawResult: safeStringify(result), ++ }); + + const usageInfo = convertUsage(result.usage); + const finalObject = await executeOutputGuardrails({ +@@ -1655,6 +1778,26 @@ export class Agent { + input: string | UIMessage[] | BaseMessage[], + schema: T, + options?: StreamObjectOptions, ++ ): Promise>> { ++ const controller = getTrafficController({ logger: this.logger }); ++ const tenantId = this.resolveTenantId(options); ++ const buildRequest = (modelOverride?: LanguageModel) => ({ ++ tenantId, ++ metadata: this.buildTrafficMetadata(modelOverride ?? 
options?.model, options), ++ execute: () => ++ this.executeStreamObject(input, schema, this.mergeOptionsWithModel(options, modelOverride)), ++ extractUsage: (result: StreamObjectResultWithContext>) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), ++ }); ++ ++ return controller.handleStream(buildRequest(options?.model)); ++ } ++ ++ private async executeStreamObject( ++ input: string | UIMessage[] | BaseMessage[], ++ schema: T, ++ options?: StreamObjectOptions, + ): Promise>> { + const startTime = Date.now(); + const oc = this.createOperationContext(input, options); +@@ -1680,7 +1823,7 @@ export class Agent { + options, + ); + +- const modelName = this.getModelName(); ++ const modelName = this.getModelName(model); + const schemaName = schema.description || "unknown"; + + // Add model attributes and all options +@@ -1740,13 +1883,20 @@ export class Agent { + tools: userTools, + onFinish: userOnFinish, + providerOptions, ++ model: _model, // Exclude model so aiSDKOptions cannot override resolved model + ...aiSDKOptions + } = options || {}; ++ void _model; + + let guardrailObjectPromise!: Promise>; + let resolveGuardrailObject: ((value: z.infer) => void) | undefined; + let rejectGuardrailObject: ((reason: unknown) => void) | undefined; + ++ methodLogger.info("[AI SDK] Calling streamObject", { ++ messageCount: messages.length, ++ modelName, ++ schemaName, ++ }); + const result = streamObject({ + model, + messages, +@@ -1755,7 +1905,7 @@ export class Agent { + // Default values + maxOutputTokens: this.maxOutputTokens, + temperature: this.temperature, +- maxRetries: 3, ++ maxRetries: 0, + // User overrides from AI SDK options + ...aiSDKOptions, + // Provider-specific options +@@ -1771,7 +1921,7 @@ export class Agent { + methodLogger.error("Stream object error occurred", { + error: actualError, + agentName: this.name, +- modelName: this.getModelName(), ++ modelName: this.getModelName(model), + schemaName: schemaName, + }); + +@@ -1800,6 +1950,11 @@ export class Agent { + }, + onFinish: async (finalResult: any) => { + try { ++ methodLogger.info("[AI SDK] streamObject finished", { ++ finishReason: finalResult.finishReason, ++ usage: finalResult.usage ? safeStringify(finalResult.usage) : undefined, ++ rawResult: safeStringify(finalResult), ++ }); + const usageInfo = convertUsage(finalResult.usage as any); + let finalObject = finalResult.object as z.infer; + if (guardrailSet.output.length > 0) { +@@ -2021,8 +2176,9 @@ export class Agent { + // Calculate maxSteps (use provided option or calculate based on subagents) + const maxSteps = options?.maxSteps ?? this.calculateMaxSteps(); + +- // Resolve dynamic values +- const model = await this.resolveValue(this.model, oc); ++ // Resolve dynamic values (allow per-call model override for fallbacks) ++ const selectedModel = options?.model ?? 
this.model; ++ const model = await this.resolveValue(selectedModel, oc); + const dynamicToolList = (await this.resolveValue(this.dynamicTools, oc)) || []; + + // Merge agent tools with option tools +@@ -2073,6 +2229,8 @@ export class Agent { + ): OperationContext { + const operationId = randomUUID(); + const startTimeDate = new Date(); ++ const priority = this.resolveTrafficPriority(options); ++ const tenantId = this.resolveTenantId(options); + + // Prefer reusing an existing context instance to preserve reference across calls/subagents + const runtimeContext = toContextMap(options?.context); +@@ -2123,6 +2281,7 @@ export class Agent { + operationId, + userId: options?.userId, + conversationId: options?.conversationId, ++ tenantId, + executionId: operationId, + }); + +@@ -2137,6 +2296,9 @@ export class Agent { + parentAgentId: options?.parentAgentId, + input, + }); ++ if (tenantId) { ++ traceContext.getRootSpan().setAttribute("tenant.id", tenantId); ++ } + traceContext.getRootSpan().setAttribute("voltagent.operation_id", operationId); + + // Use parent's AbortController if available, otherwise create new one +@@ -2174,8 +2336,10 @@ export class Agent { + logger, + conversationSteps: options?.parentOperationContext?.conversationSteps || [], + abortController, ++ priority, + userId: options?.userId, + conversationId: options?.conversationId, ++ tenantId, + parentAgentId: options?.parentAgentId, + traceContext, + startTime: startTimeDate, +@@ -3147,6 +3311,20 @@ export class Agent { + return value; + } + ++ private mergeOptionsWithModel( ++ options: BaseGenerationOptions | undefined, ++ modelOverride?: LanguageModel, ++ ): BaseGenerationOptions | undefined { ++ if (!options && modelOverride === undefined) { ++ return undefined; ++ } ++ ++ return { ++ ...(options ?? {}), ++ ...(modelOverride !== undefined ? { model: modelOverride } : {}), ++ }; ++ } ++ + /** + * Prepare tools with execution context + */ +@@ -3799,17 +3977,159 @@ export class Agent { + return this.subAgentManager.calculateMaxSteps(this.maxSteps); + } + ++ private resolveTrafficPriority(options?: BaseGenerationOptions): TrafficPriority { ++ const normalize = (value?: TrafficPriority): TrafficPriority | undefined => { ++ if (value === "P0" || value === "P1" || value === "P2") { ++ return value; ++ } ++ return undefined; ++ }; ++ ++ const parentPriority = normalize(options?.parentOperationContext?.priority); ++ const localPriority = normalize(options?.trafficPriority) ?? this.trafficPriority ?? "P1"; ++ ++ if (parentPriority) { ++ return this.pickHigherPriority(parentPriority, localPriority); ++ } ++ ++ return localPriority; ++ } ++ ++ private resolveTenantId(options?: BaseGenerationOptions): string { ++ const parentTenant = options?.parentOperationContext?.tenantId; ++ if (parentTenant) { ++ return parentTenant; ++ } ++ ++ if (options?.tenantId) { ++ return options.tenantId; ++ } ++ ++ return "default"; ++ } ++ ++ private pickHigherPriority(a: TrafficPriority, b: TrafficPriority): TrafficPriority { ++ const rank: Record = { P0: 0, P1: 1, P2: 2 }; ++ return rank[a] <= rank[b] ? a : b; ++ } ++ ++ private buildTrafficMetadata( ++ modelOverride?: LanguageModel | DynamicValue, ++ options?: BaseGenerationOptions, ++ ): TrafficRequestMetadata { ++ const provider = ++ this.resolveProvider(modelOverride) ?? this.resolveProvider(this.model) ?? 
undefined; ++ const priority = this.resolveTrafficPriority(options); ++ ++ return { ++ agentId: this.id, // Identify which agent issued the request ++ agentName: this.name, // Human-readable label for logs/metrics ++ model: this.getModelName(modelOverride), // Used for future capacity policies ++ provider, // Allows per-provider throttling later ++ priority, ++ tenantId: this.resolveTenantId(options), ++ }; ++ } ++ ++ private updateTrafficControllerRateLimits( ++ response: unknown, ++ metadata: TrafficRequestMetadata | undefined, ++ logger?: Logger, ++ ): void { ++ if (!response || typeof response !== "object") { ++ logger?.debug?.("[Traffic] No response object available for rate limit update"); ++ return; ++ } ++ ++ const responseWithHeaders = response as { headers?: unknown } | null; ++ const headers = responseWithHeaders?.headers; ++ if (!headers) { ++ logger?.debug?.("[Traffic] Response missing headers; skipping rate limit update"); ++ return; ++ } ++ ++ const controller = getTrafficController(); ++ const updateResult = controller.updateRateLimitFromHeaders( ++ metadata ?? this.buildTrafficMetadata(), ++ headers, ++ ); ++ ++ if (!updateResult) { ++ logger?.debug?.("[Traffic] No rate limit headers applied from response"); ++ return; ++ } ++ ++ const refillPerSecond = updateResult.normalized.refillPerMs * 1000; ++ logger?.info?.("[Traffic] Applied rate limit from response headers", { ++ rateLimitKey: updateResult.key, ++ capacity: updateResult.normalized.capacity, ++ refillPerSecond, ++ appliedTokens: updateResult.appliedTokens, ++ headers: { ++ limitRequests: updateResult.headerSnapshot.limitRequests, ++ remainingRequests: updateResult.headerSnapshot.remainingRequests, ++ resetRequestsMs: updateResult.headerSnapshot.resetRequestsMs, ++ }, ++ }); ++ } ++ ++ private extractUsageFromResponse( ++ result: ++ | { ++ usage?: LanguageModelUsage | Promise; ++ totalUsage?: LanguageModelUsage | Promise; ++ } ++ | undefined, ++ ): Promise | LanguageModelUsage | undefined { ++ if (!result) { ++ return undefined; ++ } ++ ++ const usageCandidate = ++ (result as { totalUsage?: LanguageModelUsage | Promise }) ++ ?.totalUsage ?? ++ (result as { usage?: LanguageModelUsage | Promise })?.usage; ++ ++ if (!usageCandidate) { ++ return undefined; ++ } ++ ++ if ( ++ typeof (usageCandidate as PromiseLike).then === "function" ++ ) { ++ return (usageCandidate as Promise).catch(() => undefined); ++ } ++ ++ return usageCandidate as LanguageModelUsage; ++ } ++ ++ private resolveProvider( ++ model: LanguageModel | DynamicValue | undefined, ++ ): string | undefined { ++ if ( ++ model && ++ typeof model === "object" && ++ "provider" in model && ++ typeof (model as any).provider === "string" ++ ) { ++ return (model as any).provider; ++ } ++ ++ return undefined; ++ } ++ + /** + * Get the model name + */ +- public getModelName(): string { +- if (typeof this.model === "function") { ++ public getModelName(modelOverride?: LanguageModel | DynamicValue): string { ++ const selectedModel = modelOverride ?? 
this.model; ++ if (typeof selectedModel === "function") { + return "dynamic"; + } +- if (typeof this.model === "string") { +- return this.model; ++ if (typeof selectedModel === "string") { ++ return selectedModel; + } +- return this.model.modelId || "unknown"; ++ return selectedModel.modelId || "unknown"; + } + + /** +diff --git a/packages/core/src/agent/eval.ts b/packages/core/src/agent/eval.ts +index 9e4fe9f2..de712505 100644 +--- a/packages/core/src/agent/eval.ts ++++ b/packages/core/src/agent/eval.ts +@@ -711,6 +711,7 @@ function buildEvalPayload( + rawOutput: output, + userId: oc.userId, + conversationId: oc.conversationId, ++ tenantId: oc.tenantId, + traceId: spanContext.traceId, + spanId: spanContext.spanId, + metadata, +diff --git a/packages/core/src/agent/types.ts b/packages/core/src/agent/types.ts +index dd5fb29d..add69edf 100644 +--- a/packages/core/src/agent/types.ts ++++ b/packages/core/src/agent/types.ts +@@ -29,6 +29,7 @@ import type { Logger } from "@voltagent/internal"; + import type { LocalScorerDefinition, SamplingPolicy } from "../eval/runtime"; + import type { MemoryOptions, MemoryStorageMetadata, WorkingMemorySummary } from "../memory/types"; + import type { VoltAgentObservability } from "../observability"; ++import type { TrafficPriority } from "../traffic/traffic-controller"; + import type { + DynamicValue, + DynamicValueOptions, +@@ -456,6 +457,11 @@ export type AgentOptions = { + temperature?: number; + maxOutputTokens?: number; + maxSteps?: number; ++ /** ++ * Default scheduling priority for this agent's LLM calls. ++ * Defaults to P1 when unspecified. ++ */ ++ trafficPriority?: TrafficPriority; + /** + * Default stop condition for step execution (ai-sdk `stopWhen`). + * Per-call `stopWhen` in method options overrides this. 
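
To make the precedence encoded by `resolveTrafficPriority` and `pickHigherPriority` above concrete, here is a minimal, self-contained sketch; `resolvePriority` is a hypothetical condensation of those agent methods for illustration, not part of the patch:

```ts
type TrafficPriority = "P0" | "P1" | "P2";

// Lower rank value means more urgent; ties keep the first argument.
const rank: Record<TrafficPriority, number> = { P0: 0, P1: 1, P2: 2 };

function pickHigherPriority(a: TrafficPriority, b: TrafficPriority): TrafficPriority {
  return rank[a] <= rank[b] ? a : b;
}

function resolvePriority(opts: {
  parentPriority?: TrafficPriority; // inherited via parentOperationContext
  callPriority?: TrafficPriority; // trafficPriority on the call options
  agentDefault?: TrafficPriority; // trafficPriority on the Agent config
}): TrafficPriority {
  const local = opts.callPriority ?? opts.agentDefault ?? "P1";
  return opts.parentPriority ? pickHigherPriority(opts.parentPriority, local) : local;
}

console.log(resolvePriority({})); // "P1" (global default)
console.log(resolvePriority({ agentDefault: "P2", callPriority: "P0" })); // "P0"
console.log(resolvePriority({ parentPriority: "P0", agentDefault: "P2" })); // "P0"
```

Note that the parent comparison only ever escalates: a sub-agent call inherits P0 from a P0 parent even if the sub-agent's own default is P2, while a P2 parent cannot demote a call explicitly marked P0.
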
+@@ -493,6 +499,7 @@ export interface AgentEvalPayload { + rawOutput?: unknown; + userId?: string; + conversationId?: string; ++ tenantId?: string; + traceId: string; + spanId: string; + metadata?: Record; +@@ -890,6 +897,9 @@ export type OperationContext = { + /** Optional conversation identifier associated with this operation */ + conversationId?: string; + ++ /** Optional tenant identifier propagated across nested operations */ ++ tenantId?: string; ++ + /** User-managed context map for this operation */ + readonly context: Map; + +@@ -914,6 +924,9 @@ export type OperationContext = { + /** Conversation steps for building full message history including tool calls/results */ + conversationSteps?: StepWithContent[]; + ++ /** Scheduling priority propagated from parent calls */ ++ priority?: TrafficPriority; ++ + /** AbortController for cancelling the operation and accessing the signal */ + abortController: AbortController; + +diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts +index 8753f039..0aef165a 100644 +--- a/packages/core/src/index.ts ++++ b/packages/core/src/index.ts +@@ -21,6 +21,19 @@ export type { + WorkflowTimelineEvent, + RegisteredWorkflow, + } from "./workflow"; ++export { ++ // Surface traffic controller so downstream consumers can route agent calls through the shared scheduler ++ TrafficController, ++ CircuitBreakerOpenError, ++ getTrafficController, ++ type RateLimitConfig, ++ type RateLimitKey, ++ type RateLimitOptions, ++ type TrafficRequest, ++ type TrafficRequestMetadata, ++ type TrafficPriority, ++ type TrafficRequestType, ++} from "./traffic/traffic-controller"; + // Export new Agent from agent.ts + export { + Agent, +diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts +new file mode 100644 +index 00000000..9b89d4b8 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-controller.spec.ts +@@ -0,0 +1,87 @@ ++import { describe, expect, it, vi } from "vitest"; ++import { TrafficController } from "./traffic-controller"; ++ ++describe("TrafficController priority scheduling", () => { ++ it("prioritizes P0 over lower priorities when runnable", async () => { ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ const order: string[] = []; ++ ++ const p1 = controller.handleText({ ++ metadata: { provider: "p", model: "m1", priority: "P1" }, ++ execute: async () => { ++ order.push("P1"); ++ return "P1"; ++ }, ++ }); ++ ++ const p2 = controller.handleText({ ++ metadata: { provider: "p", model: "m2", priority: "P2" }, ++ execute: async () => { ++ order.push("P2"); ++ return "P2"; ++ }, ++ }); ++ ++ const p0 = controller.handleText({ ++ metadata: { provider: "p", model: "m0", priority: "P0" }, ++ execute: async () => { ++ order.push("P0"); ++ return "P0"; ++ }, ++ }); ++ ++ await Promise.all([p0, p1, p2]); ++ ++ expect(order[0]).toBe("P0"); ++ expect(order).toEqual(["P0", "P1", "P2"]); ++ }); ++ ++ it("allows lower priorities to proceed when a higher priority request is rate limited", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ const controller = new TrafficController({ ++ maxConcurrent: 1, ++ rateLimits: { ++ "p0::m0": { capacity: 1, refillPerSecond: 1 }, ++ }, ++ }); ++ ++ // Exhaust the bucket for the P0 key so it initially waits ++ const buckets = (controller as unknown as { rateLimitBuckets: Map }) ++ .rateLimitBuckets; ++ buckets.set("p0::m0", { ++ tokens: 0, ++ capacity: 1, ++ refillPerMs: 1 / 1000, ++ lastRefill: Date.now(), ++ }); ++ ++ const order: string[] = 
[]; ++ ++ const p0 = controller.handleText({ ++ metadata: { provider: "p0", model: "m0", priority: "P0" }, ++ execute: async () => { ++ order.push("P0"); ++ return "P0"; ++ }, ++ }); ++ ++ const p1 = controller.handleText({ ++ metadata: { provider: "p1", model: "m1", priority: "P1" }, ++ execute: async () => { ++ order.push("P1"); ++ return "P1"; ++ }, ++ }); ++ ++ await vi.runAllTimersAsync(); ++ await Promise.all([p0, p1]); ++ ++ expect(order[0]).toBe("P1"); ++ expect(order[1]).toBe("P0"); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++}); +diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts +new file mode 100644 +index 00000000..8d82e8a5 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-controller.ts +@@ -0,0 +1,1260 @@ ++import type { Logger } from "../logger"; ++import { LoggerProxy } from "../logger"; ++ ++type Scheduler = (callback: () => void) => void; ++type BivariantHandler = { ++ bivarianceHack(...args: TArgs): void; ++}["bivarianceHack"]; ++type BivariantFunction = { ++ bivarianceHack(...args: TArgs): TReturn; ++}["bivarianceHack"]; ++ ++type RetryReason = "rateLimit" | "serverError" | "timeout"; ++ ++const MAX_RETRY_ATTEMPTS = 3; ++const TIMEOUT_RETRY_ATTEMPTS = 2; ++const RATE_LIMIT_BASE_BACKOFF_MS = 500; ++const CIRCUIT_FAILURE_THRESHOLD = 5; ++const CIRCUIT_FAILURE_WINDOW_MS = 10_000; ++const CIRCUIT_COOLDOWN_MS = 30_000; ++const SERVER_ERROR_BASE_BACKOFF_MS = 1000; ++const TIMEOUT_BASE_BACKOFF_MS = 750; ++const RATE_LIMIT_JITTER_FACTOR = 0.35; ++const SERVER_ERROR_JITTER_FACTOR = 0.8; ++const TIMEOUT_JITTER_FACTOR = 0.5; ++const DEFAULT_FALLBACK_CHAINS: Record = { ++ "gpt-4o": ["gpt-4o-mini", "gpt-3.5"], ++}; ++ ++interface RateLimitBucket { ++ tokens: number; ++ capacity: number; ++ refillPerMs: number; ++ lastRefill: number; ++} ++ ++type NormalizedRateLimit = { ++ capacity: number; ++ refillPerMs: number; ++}; ++ ++export interface RateLimitOptions { ++ capacity: number; ++ refillPerSecond: number; ++} ++ ++export type TenantUsage = { ++ inputTokens: number; ++ outputTokens: number; ++ totalTokens: number; ++}; ++ ++type UsageCounters = { ++ inputTokens?: number; ++ outputTokens?: number; ++ totalTokens?: number; ++}; ++ ++export type RateLimitKey = string; ++export type RateLimitConfig = Record; ++ ++type RateLimitHeaderSnapshot = { ++ limitRequests: number; ++ remainingRequests?: number; ++ resetRequestsMs: number; ++}; ++ ++export type RateLimitUpdateResult = { ++ key: string; ++ headerSnapshot: RateLimitHeaderSnapshot; ++ normalized: NormalizedRateLimit; ++ appliedTokens: number; ++}; ++ ++export type TrafficRequestType = "text" | "stream"; ++ ++export type TrafficPriority = "P0" | "P1" | "P2"; ++ ++export interface TrafficRequestMetadata { ++ agentId?: string; ++ agentName?: string; ++ model?: string; ++ provider?: string; ++ priority?: TrafficPriority; ++ tenantId?: string; ++} ++ ++export interface TrafficRequest { ++ tenantId: string; ++ metadata?: TrafficRequestMetadata; ++ execute: () => Promise; ++ createFallbackRequest?: (modelId: string) => TrafficRequest | undefined; ++ extractUsage?: BivariantFunction< ++ [response: TResponse], ++ Promise | UsageCounters | undefined ++ >; ++} ++ ++type CircuitStateStatus = "closed" | "open" | "half-open"; ++ ++interface CircuitState { ++ status: CircuitStateStatus; ++ failureTimestamps: number[]; ++ openedAt?: number; ++ trialInFlight?: boolean; ++} ++ ++interface QueuedRequest { ++ type: TrafficRequestType; ++ request: TrafficRequest; ++ resolve: 
BivariantHandler<[TResponse | PromiseLike]>; ++ reject: BivariantHandler<[reason?: unknown]>; ++ etaMs?: number; ++ rateLimitKey?: string; ++ attempt?: number; ++ circuitKey?: string; ++ circuitStatus?: CircuitStateStatus; ++ priority: TrafficPriority; ++ tenantId: string; ++ extractUsage?: BivariantFunction< ++ [response: TResponse], ++ Promise | UsageCounters | undefined ++ >; ++} ++ ++export interface TrafficControllerOptions { ++ maxConcurrent?: number; ++ rateLimits?: RateLimitConfig; ++ logger?: Logger; ++ fallbackChains?: Record; ++} ++ ++type ProcessDecision = "process" | "skip" | "wait"; ++ ++// Centralized traffic controller responsible for scheduling LLM calls. ++// Provides a FIFO queue with a non-blocking scheduler and entrypoints ++// for text and stream traffic. ++export class TrafficController { ++ private readonly scheduler: Scheduler; ++ private readonly maxConcurrent: number; ++ private rateLimits?: Map; ++ private readonly rateLimitBuckets = new Map(); ++ private readonly circuitBreakers = new Map(); ++ private readonly fallbackChains: Map; ++ private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"]; ++ private readonly queues: Record = { ++ P0: [], ++ P1: [], ++ P2: [], ++ }; ++ private activeCount = 0; ++ private drainScheduled = false; ++ private refillTimeout?: ReturnType; ++ private readonly tenantUsage = new Map(); ++ private readonly logger: Logger; ++ ++ private logDebug(message: string, details?: Record): void { ++ if (typeof console?.debug === "function") { ++ console.debug(message, details); ++ } ++ } ++ ++ constructor(options: TrafficControllerOptions = {}) { ++ this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; ++ this.rateLimits = this.normalizeRateLimits(options.rateLimits); ++ this.fallbackChains = this.normalizeFallbackChains(options.fallbackChains); ++ this.scheduler = this.createScheduler(); ++ ++ // NEW LOGGER (from c2 commit) ++ this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); ++ ++ // INIT LOG (from HEAD) — rewritten to use the new logger ++ this.logger.debug("[TrafficController] init", { ++ maxConcurrent: this.maxConcurrent, ++ rateLimits: this.rateLimits ? Array.from(this.rateLimits.entries()) : undefined, ++ }); ++ } ++ ++ handleText(request: TrafficRequest): Promise { ++ // Route text generation requests into the queue so all LLM calls share the same scheduler ++ return this.enqueue("text", request); ++ } ++ ++ handleStream(request: TrafficRequest): Promise { ++ // Route streaming requests through the same queue to preserve ordering/backpressure rules ++ return this.enqueue("stream", request); ++ } ++ ++ getTenantUsage(tenantId: string): TenantUsage | undefined { ++ const usage = this.tenantUsage.get(tenantId); ++ return usage ? 
{ ...usage } : undefined; ++ } ++ ++ private createScheduler(): Scheduler { ++ // Prefer queueMicrotask to keep the drain loop snappy without starving the event loop ++ if (typeof queueMicrotask === "function") { ++ return queueMicrotask; ++ } ++ ++ return (callback: () => void) => setTimeout(callback, 0); ++ } ++ ++ private enqueue( ++ type: TrafficRequestType, ++ request: TrafficRequest, ++ ): Promise { ++ // Each request gets a promise so callers can await their own result ++ return new Promise((resolve, reject) => { ++ const priority = this.resolvePriority(request.metadata); ++ this.logger.debug("Enqueuing LLM request", { ++ tenantId: request.tenantId, ++ type, ++ priority, ++ }); ++ // Collect the work item and metadata ++ this.getQueue(priority).push({ ++ type, ++ request, ++ resolve, ++ reject, ++ attempt: 1, ++ priority, ++ tenantId: request.tenantId, ++ extractUsage: request.extractUsage, ++ }); ++ ++ this.logDebug("[TrafficController] enqueue", { ++ type, ++ queueSize: this.getQueueSize(), ++ metadata: request.metadata, ++ }); ++ ++ // Kick the drain loop to start handling work ++ this.scheduleDrain(); ++ }); ++ } ++ ++ private scheduleDrain(): void { ++ if (this.drainScheduled) { ++ return; ++ } ++ ++ this.drainScheduled = true; // Prevent redundant scheduling when many requests arrive at once ++ this.logDebug("[TrafficController] scheduleDrain", { queueSize: this.getQueueSize() }); ++ this.scheduler(() => { ++ this.drainScheduled = false; ++ this.logDebug("[TrafficController] drainLoopStart", { ++ queueSize: this.getQueueSize(), ++ active: this.activeCount, ++ }); ++ this.drainQueue(); // Drain asynchronously so we never block the caller's tick ++ }); ++ } ++ ++ private drainQueue(): void { ++ // Pull as many items as we can until we hit capacity or rate limits ++ while (this.hasQueuedWork()) { ++ if (this.activeCount >= this.maxConcurrent) { ++ return; ++ } ++ ++ let selected: { item: QueuedRequest; priority: TrafficPriority } | undefined; ++ let skippedItem = false; ++ ++ for (const priority of this.priorityOrder) { ++ const queue = this.getQueue(priority); ++ if (queue.length === 0) { ++ continue; ++ } ++ ++ const candidate = queue[0]; ++ const decision = this.getProcessDecision(candidate); ++ if (decision === "process") { ++ selected = { item: candidate, priority }; ++ break; ++ } ++ ++ if (decision === "skip") { ++ queue.shift(); // Remove rejected item ++ skippedItem = true; ++ break; // Re-evaluate from highest priority after removing ++ } ++ ++ // If wait, try lower priorities in the same drain cycle ++ } ++ ++ if (selected) { ++ const { item, priority } = selected; ++ this.getQueue(priority).shift(); ++ this.activeCount++; // Track in-flight work to enforce concurrency guard ++ this.markCircuitTrial(item); // Reserve the half-open trial slot if needed ++ ++ void this.runRequest(item); // Fire off processing without blocking the loop ++ continue; ++ } ++ ++ if (skippedItem) { ++ continue; // We removed a blocked item; re-evaluate queues ++ } ++ ++ // No runnable work right now; exit until capacity/rate-limit changes ++ return; ++ } ++ } ++ ++ private getProcessDecision(next: QueuedRequest): ProcessDecision { ++ const circuitDecision = this.evaluateCircuitBreaker(next); ++ if (circuitDecision !== "process") { ++ return circuitDecision; ++ } ++ ++ if (this.activeCount >= this.maxConcurrent) { ++ this.logDebug("[TrafficController] throttle concurrency", { ++ active: this.activeCount, ++ maxConcurrent: this.maxConcurrent, ++ }); ++ return "wait"; ++ } ++ ++ const 
rateLimitConfig = this.getRateLimitConfig(next.request.metadata); ++ if (!rateLimitConfig) { ++ this.logDebug("[TrafficController] no rate limit match", { ++ metadata: next.request.metadata, ++ }); ++ next.rateLimitKey = undefined; ++ next.etaMs = 0; ++ return "process"; // No rate limit configured for this key ++ } ++ ++ const queuedAhead = this.countQueuedAheadWithKey( ++ rateLimitConfig.key, ++ next, ++ /*logDetails*/ true, ++ ); ++ const bucket = this.getRateLimitBucket(rateLimitConfig.key, rateLimitConfig.limit); ++ if (bucket.tokens < 1) { ++ next.rateLimitKey = rateLimitConfig.key; ++ next.etaMs = this.computeEtaMs( ++ bucket, ++ rateLimitConfig.limit, ++ rateLimitConfig.key, ++ next, ++ queuedAhead, ++ ); ++ this.logDebug("[TrafficController] throttle rate", { ++ key: rateLimitConfig.key, ++ tokens: bucket.tokens, ++ etaMs: next.etaMs, ++ queuedAhead, ++ }); ++ this.scheduleRefill(rateLimitConfig.limit); // Ensure we retry as soon as tokens are replenished ++ return "wait"; ++ } ++ ++ bucket.tokens -= 1; // Consume a token for this dispatch ++ this.logDebug("[TrafficController] token consumed", { ++ key: rateLimitConfig.key, ++ remaining: bucket.tokens, ++ capacity: bucket.capacity, ++ }); ++ next.rateLimitKey = rateLimitConfig.key; ++ next.etaMs = 0; ++ return "process"; ++ } ++ ++ private getRateLimitConfig( ++ metadata?: TrafficRequestMetadata, ++ ): { key: string; limit: NormalizedRateLimit } | undefined { ++ if (!this.rateLimits || this.rateLimits.size === 0) { ++ return undefined; ++ } ++ ++ const key = this.buildRateLimitKey(metadata); ++ const limit = this.rateLimits.get(key); ++ if (!limit) { ++ return undefined; ++ } ++ ++ this.logDebug("[TrafficController] rateLimitConfig hit", { key }); ++ return { key, limit }; ++ } ++ ++ private getRateLimitBucket(key: string, limit: NormalizedRateLimit): RateLimitBucket { ++ const now = Date.now(); // Snapshot time once to avoid drift within this method ++ let bucket = this.rateLimitBuckets.get(key); // Reuse the bucket if it already exists ++ ++ if (!bucket) { ++ bucket = { ++ tokens: limit.capacity, ++ capacity: limit.capacity, ++ refillPerMs: limit.refillPerMs, ++ lastRefill: now, ++ }; ++ this.rateLimitBuckets.set(key, bucket); ++ this.logDebug("[TrafficController] bucket create", { ++ key, ++ capacity: bucket.capacity, ++ refillPerMs: bucket.refillPerMs, ++ }); ++ return bucket; ++ } ++ ++ if ( ++ bucket.capacity !== limit.capacity || ++ Math.abs(bucket.refillPerMs - limit.refillPerMs) > Number.EPSILON ++ ) { ++ bucket.capacity = limit.capacity; ++ bucket.refillPerMs = limit.refillPerMs; ++ bucket.tokens = Math.min(bucket.tokens, bucket.capacity); ++ bucket.lastRefill = now; ++ this.logDebug("[TrafficController] bucket sync with new limit", { ++ key, ++ capacity: bucket.capacity, ++ refillPerMs: bucket.refillPerMs, ++ }); ++ } ++ ++ const elapsedMs = Math.max(0, now - bucket.lastRefill); ++ if (elapsedMs > 0 && bucket.tokens < bucket.capacity) { ++ const refilled = elapsedMs * bucket.refillPerMs; // Refill based on elapsed time ++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refilled); // Cap at bucket capacity ++ bucket.lastRefill = now; // Mark refill time for the next calculation ++ this.logDebug("[TrafficController] bucket refill", { ++ key, ++ elapsedMs, ++ tokens: bucket.tokens, ++ }); ++ } ++ ++ return bucket; ++ } ++ ++ private computeEtaMs( ++ bucket: RateLimitBucket, ++ limit: NormalizedRateLimit, ++ key: string, ++ current: QueuedRequest, ++ queuedAhead?: number, ++ ): number { ++ const missingTokens = 
Math.max(0, 1 - bucket.tokens); ++ const waitForToken = ++ missingTokens > 0 && limit.refillPerMs > 0 ? Math.ceil(missingTokens / limit.refillPerMs) : 0; ++ const aheadCount = ++ typeof queuedAhead === "number" ++ ? queuedAhead ++ : this.countQueuedAheadWithKey(key, current, /*logDetails*/ false); ++ const extraForQueue = ++ aheadCount > 0 && limit.refillPerMs > 0 ? Math.ceil(aheadCount / limit.refillPerMs) : 0; ++ this.logDebug("[TrafficController] computeEtaMs", { ++ key, ++ missingTokens, ++ waitForToken, ++ aheadCount, ++ extraForQueue, ++ eta: waitForToken + extraForQueue, ++ }); ++ return waitForToken + extraForQueue; ++ } ++ ++ private countQueuedAheadWithKey(key: string, current: QueuedRequest, logDetails = false): number { ++ let count = 0; ++ for (const priority of this.priorityOrder) { ++ const queue = this.getQueue(priority); ++ for (const item of queue) { ++ if (item === current) { ++ return count; ++ } ++ ++ const itemKey = this.buildRateLimitKey(item.request.metadata); ++ if (itemKey === key) { ++ count += 1; ++ } ++ } ++ } ++ if (logDetails) { ++ this.logDebug("[TrafficController] countQueuedAheadWithKey", { ++ key, ++ count, ++ queueSize: this.getQueueSize(), ++ }); ++ } ++ return count; ++ } ++ ++ private evaluateCircuitBreaker(next: QueuedRequest): ProcessDecision { ++ return this.evaluateCircuitBreakerForRequest(next, new Set()); ++ } ++ ++ private evaluateCircuitBreakerForRequest( ++ next: QueuedRequest, ++ visitedModels: Set, ++ ): ProcessDecision { ++ const key = this.buildRateLimitKey(next.request.metadata); ++ next.circuitKey = key; ++ ++ const currentModel = next.request.metadata?.model; ++ if (currentModel) { ++ visitedModels.add(currentModel); ++ } ++ ++ const evaluation = this.evaluateCircuitState(key); ++ next.circuitStatus = evaluation.state; ++ ++ if (evaluation.allowRequest) { ++ return "process"; ++ } ++ ++ const fallbackModel = this.findFallbackModel(next.request.metadata, visitedModels); ++ if (fallbackModel && next.request.createFallbackRequest) { ++ const fallbackRequest = next.request.createFallbackRequest(fallbackModel); ++ if (fallbackRequest) { ++ this.logger.warn("Circuit open; attempting fallback model", { ++ fromModel: currentModel, ++ fallbackModel, ++ provider: next.request.metadata?.provider, ++ }); ++ next.request = fallbackRequest; ++ next.attempt = 1; ++ next.rateLimitKey = undefined; ++ next.etaMs = undefined; ++ next.circuitKey = undefined; ++ next.circuitStatus = undefined; ++ return this.evaluateCircuitBreakerForRequest(next, visitedModels); ++ } ++ } ++ ++ const retryAfterMs = evaluation.retryAfterMs ?? CIRCUIT_COOLDOWN_MS; ++ this.logger.warn("Circuit open; rejecting request", { ++ circuitKey: key, ++ retryAfterMs, ++ metadata: next.request.metadata, ++ }); ++ next.reject( ++ new CircuitBreakerOpenError( ++ `Circuit open for ${key}; retry after ${retryAfterMs}ms`, ++ next.request.metadata, ++ retryAfterMs, ++ ), ++ ); ++ return "skip"; ++ } ++ ++ private evaluateCircuitState(key: string): { ++ allowRequest: boolean; ++ state: CircuitStateStatus; ++ retryAfterMs?: number; ++ } { ++ const state = this.circuitBreakers.get(key); ++ if (!state) { ++ return { allowRequest: true, state: "closed" }; ++ } ++ ++ const now = Date.now(); ++ ++ if (state.status === "open") { ++ const elapsed = state.openedAt ? 
now - state.openedAt : 0; ++ if (elapsed >= CIRCUIT_COOLDOWN_MS) { ++ state.status = "half-open"; ++ state.trialInFlight = false; ++ state.failureTimestamps = []; ++ this.circuitBreakers.set(key, state); ++ return { allowRequest: true, state: state.status }; ++ } ++ return { ++ allowRequest: false, ++ state: state.status, ++ retryAfterMs: Math.max(0, CIRCUIT_COOLDOWN_MS - elapsed), ++ }; ++ } ++ ++ if (state.status === "half-open") { ++ if (state.trialInFlight) { ++ return { allowRequest: false, state: state.status }; ++ } ++ return { allowRequest: true, state: state.status }; ++ } ++ ++ return { allowRequest: true, state: state.status }; ++ } ++ ++ private findFallbackModel( ++ metadata: TrafficRequestMetadata | undefined, ++ visitedModels: Set, ++ ): string | undefined { ++ const currentModel = metadata?.model; ++ if (!currentModel) { ++ return undefined; ++ } ++ ++ const chain = this.fallbackChains.get(currentModel); ++ if (!chain) { ++ return undefined; ++ } ++ ++ const provider = metadata?.provider; ++ for (const candidate of chain) { ++ if (visitedModels.has(candidate)) { ++ continue; ++ } ++ ++ const candidateKey = this.buildRateLimitKey({ provider, model: candidate }); ++ const evaluation = this.evaluateCircuitState(candidateKey); ++ if (evaluation.allowRequest) { ++ visitedModels.add(candidate); ++ return candidate; ++ } ++ } ++ ++ return undefined; ++ } ++ ++ private markCircuitTrial(next: QueuedRequest): void { ++ const key = next.circuitKey; ++ if (!key) { ++ return; ++ } ++ ++ const state = this.circuitBreakers.get(key); ++ if (state && state.status === "half-open" && !state.trialInFlight) { ++ state.trialInFlight = true; ++ this.circuitBreakers.set(key, state); ++ } ++ } ++ ++ private normalizeRateLimits( ++ rateLimits?: RateLimitConfig, ++ ): Map | undefined { ++ if (!rateLimits) { ++ return undefined; ++ } ++ ++ const normalized = new Map(); ++ for (const [key, config] of Object.entries(rateLimits)) { ++ if (config.capacity > 0 && config.refillPerSecond > 0) { ++ normalized.set(key, { ++ capacity: config.capacity, ++ refillPerMs: config.refillPerSecond / 1000, ++ }); ++ } ++ } ++ ++ return normalized.size > 0 ? normalized : undefined; ++ } ++ ++ private normalizeFallbackChains( ++ fallbackChains?: Record, ++ ): Map { ++ const configuredChains = fallbackChains ?? DEFAULT_FALLBACK_CHAINS; ++ const normalized = new Map(); ++ ++ for (const [model, chain] of Object.entries(configuredChains)) { ++ if (Array.isArray(chain) && chain.length > 0) { ++ normalized.set(model, [...chain]); ++ } ++ } ++ ++ return normalized; ++ } ++ ++ private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { ++ const provider = metadata?.provider ?? "default-provider"; ++ const model = metadata?.model ?? "default-model"; ++ return `${provider}::${model}`; ++ } ++ ++ /** ++ * Update (or bootstrap) rate limit buckets based on provider response headers. ++ * This lets the controller adopt server-issued limits without static config. 
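++ *
++ * A minimal usage sketch. The URL, `init`, and the provider/model values are
++ * illustrative, not part of the public API; any fetch-style Headers object or
++ * plain record is accepted.
++ *
++ * @example
++ * const controller = getTrafficController();
++ * const response = await fetch(providerUrl, init); // illustrative provider call
++ * controller.updateRateLimitFromHeaders(
++ *   { provider: "openai.responses", model: "gpt-4o-mini" },
++ *   response.headers,
++ * );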
++ */ ++ updateRateLimitFromHeaders( ++ metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ ): RateLimitUpdateResult | undefined { ++ const headerInfo = this.extractRateLimitHeaders(headers); ++ if (!headerInfo) { ++ this.logDebug("[TrafficController] no rate limit headers found on response", { ++ metadata, ++ }); ++ return undefined; ++ } ++ ++ const normalized = this.normalizeHeaderRateLimit(headerInfo); ++ if (!normalized) { ++ this.logDebug("[TrafficController] rate limit headers present but invalid", { ++ headerInfo, ++ }); ++ return undefined; ++ } ++ ++ const key = this.buildRateLimitKey(metadata); ++ if (!this.rateLimits) { ++ this.rateLimits = new Map(); ++ } ++ this.rateLimits.set(key, normalized); ++ ++ const now = Date.now(); ++ const remainingTokens = this.coerceRemaining(headerInfo.remainingRequests, normalized.capacity); ++ const existingBucket = this.rateLimitBuckets.get(key); ++ const tokens = remainingTokens ?? existingBucket?.tokens ?? normalized.capacity; ++ ++ if (existingBucket) { ++ existingBucket.capacity = normalized.capacity; ++ existingBucket.refillPerMs = normalized.refillPerMs; ++ existingBucket.tokens = Math.min(tokens, normalized.capacity); ++ existingBucket.lastRefill = now; ++ } else { ++ this.rateLimitBuckets.set(key, { ++ tokens: Math.min(tokens, normalized.capacity), ++ capacity: normalized.capacity, ++ refillPerMs: normalized.refillPerMs, ++ lastRefill: now, ++ }); ++ } ++ ++ this.logDebug("[TrafficController] rateLimit updated from headers", { ++ key, ++ capacity: normalized.capacity, ++ refillPerMs: normalized.refillPerMs, ++ remaining: remainingTokens, ++ }); ++ ++ // If we just refilled tokens, try draining again. ++ this.scheduleDrain(); ++ ++ return { ++ key, ++ headerSnapshot: headerInfo, ++ normalized, ++ appliedTokens: Math.min(tokens, normalized.capacity), ++ }; ++ } ++ ++ private extractRateLimitHeaders(headers: unknown): RateLimitHeaderSnapshot | undefined { ++ const getHeader = this.createHeaderLookup(headers); ++ if (!getHeader) { ++ return undefined; ++ } ++ ++ const limitRequests = this.parseNumberHeader(getHeader, "x-ratelimit-limit-requests"); ++ const resetRequestsMs = this.parseDurationHeaderToMs(getHeader, "x-ratelimit-reset-requests"); ++ ++ if ( ++ limitRequests === undefined || ++ limitRequests <= 0 || ++ resetRequestsMs === undefined || ++ resetRequestsMs <= 0 ++ ) { ++ return undefined; ++ } ++ ++ const remainingRequests = this.parseNumberHeader(getHeader, "x-ratelimit-remaining-requests"); ++ ++ return { ++ limitRequests, ++ remainingRequests, ++ resetRequestsMs, ++ }; ++ } ++ ++ private normalizeHeaderRateLimit( ++ snapshot: RateLimitHeaderSnapshot, ++ ): NormalizedRateLimit | undefined { ++ if (snapshot.limitRequests <= 0 || snapshot.resetRequestsMs <= 0) { ++ return undefined; ++ } ++ ++ return { ++ capacity: snapshot.limitRequests, ++ refillPerMs: snapshot.limitRequests / snapshot.resetRequestsMs, ++ }; ++ } ++ ++ private coerceRemaining(remaining: number | undefined, capacity: number): number | undefined { ++ if (remaining === undefined) { ++ return undefined; ++ } ++ ++ const parsed = Number(remaining); ++ if (!Number.isFinite(parsed)) { ++ return undefined; ++ } ++ ++ return Math.max(0, Math.min(capacity, Math.floor(parsed))); ++ } ++ ++ private createHeaderLookup(headers: unknown): ((name: string) => string | undefined) | undefined { ++ if (!headers) { ++ return undefined; ++ } ++ ++ const maybeHeaders = headers as { get?: (name: string) => unknown }; ++ if (typeof maybeHeaders?.get === "function") { 
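++ // Fetch-style Headers object: delegate to its get(), which is case-insensitive per spec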
++ return (name: string) => { ++ const value = maybeHeaders.get?.(name); ++ return value === undefined || value === null ? undefined : String(value); ++ }; ++ } ++ ++ if (typeof headers === "object") { ++ const entries = Object.entries(headers as Record); ++ if (entries.length === 0) { ++ return undefined; ++ } ++ ++ return (name: string) => { ++ const target = name.toLowerCase(); ++ for (const [key, value] of entries) { ++ if (typeof key === "string" && key.toLowerCase() === target) { ++ if (Array.isArray(value)) { ++ const first = value[0]; ++ return first === undefined || first === null ? undefined : String(first); ++ } ++ return value === undefined || value === null ? undefined : String(value); ++ } ++ } ++ return undefined; ++ }; ++ } ++ ++ return undefined; ++ } ++ ++ private parseNumberHeader( ++ getHeader: (name: string) => string | undefined, ++ name: string, ++ ): number | undefined { ++ const raw = getHeader(name); ++ if (raw === undefined) { ++ return undefined; ++ } ++ ++ const parsed = Number(raw); ++ return Number.isFinite(parsed) ? parsed : undefined; ++ } ++ ++ private parseDurationHeaderToMs( ++ getHeader: (name: string) => string | undefined, ++ name: string, ++ ): number | undefined { ++ const raw = getHeader(name); ++ if (!raw) { ++ return undefined; ++ } ++ ++ const trimmed = raw.trim(); ++ const match = trimmed.match(/^(-?\d+(?:\.\d+)?)(ms|s)?$/i); ++ if (!match) { ++ return undefined; ++ } ++ ++ const value = Number(match[1]); ++ if (!Number.isFinite(value) || value <= 0) { ++ return undefined; ++ } ++ ++ const unit = (match[2] || "s").toLowerCase(); ++ return unit === "ms" ? value : value * 1000; ++ } ++ ++ private resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority { ++ const candidate = metadata?.priority; ++ if (candidate === "P0" || candidate === "P1" || candidate === "P2") { ++ return candidate; ++ } ++ ++ return "P1"; ++ } ++ ++ private getQueue(priority: TrafficPriority): QueuedRequest[] { ++ return this.queues[priority]; ++ } ++ ++ private hasQueuedWork(): boolean { ++ return this.priorityOrder.some((priority) => this.getQueue(priority).length > 0); ++ } ++ ++ private getQueueSize(): number { ++ let size = 0; ++ for (const priority of this.priorityOrder) { ++ size += this.getQueue(priority).length; ++ } ++ return size; ++ } ++ ++ private scheduleRefill(limit: NormalizedRateLimit): void { ++ if (this.refillTimeout) { ++ return; ++ } ++ ++ const delayMs = Math.max(1, Math.ceil(1 / limit.refillPerMs)); // Wait long enough for at least one token ++ this.logDebug("[TrafficController] scheduleRefill", { delayMs }); ++ this.refillTimeout = setTimeout(() => { ++ this.refillTimeout = undefined; // Allow future refills to be scheduled ++ this.logDebug("[TrafficController] refillTimeoutFired", { ++ queueSize: this.getQueueSize(), ++ active: this.activeCount, ++ }); ++ this.scheduleDrain(); // Try draining again now that tokens should exist ++ }, delayMs); ++ } ++ ++ private recordCircuitSuccess(metadata?: TrafficRequestMetadata): void { ++ const key = this.buildRateLimitKey(metadata); ++ if (this.circuitBreakers.has(key)) { ++ this.circuitBreakers.delete(key); ++ } ++ } ++ ++ private recordCircuitFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void { ++ const status = this.extractStatusCode(error); ++ if (!this.isCircuitBreakerStatus(status)) { ++ this.resetCircuitFailures(metadata); ++ return; ++ } ++ ++ const key = this.buildRateLimitKey(metadata); ++ const now = Date.now(); ++ const state = ++ this.circuitBreakers.get(key) ?? 
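++ // no prior state for this key: start from a fresh closed breaker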
++ ({ ++ status: "closed", ++ failureTimestamps: [], ++ } as CircuitState); ++ ++ const recentFailures = state.failureTimestamps.filter( ++ (timestamp) => now - timestamp <= CIRCUIT_FAILURE_WINDOW_MS, ++ ); ++ recentFailures.push(now); ++ ++ if (state.status === "half-open") { ++ state.status = "open"; ++ state.openedAt = now; ++ state.trialInFlight = false; ++ state.failureTimestamps = [now]; ++ this.circuitBreakers.set(key, state); ++ this.logger.warn("Circuit reopened after half-open failure", { ++ circuitKey: key, ++ statusCode: status, ++ }); ++ return; ++ } ++ ++ state.failureTimestamps = recentFailures; ++ if (state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD) { ++ state.status = "open"; ++ state.openedAt = now; ++ state.trialInFlight = false; ++ this.logger.warn("Circuit opened after consecutive failures", { ++ circuitKey: key, ++ failureCount: state.failureTimestamps.length, ++ statusCode: status, ++ }); ++ } ++ ++ this.circuitBreakers.set(key, state); ++ } ++ ++ private resetCircuitFailures(metadata?: TrafficRequestMetadata): void { ++ const key = this.buildRateLimitKey(metadata); ++ const state = this.circuitBreakers.get(key); ++ if (!state) { ++ return; ++ } ++ ++ state.failureTimestamps = []; ++ if (state.status !== "open") { ++ state.status = "closed"; ++ state.trialInFlight = false; ++ } ++ ++ this.circuitBreakers.set(key, state); ++ } ++ ++ private recordUsageFromResult( ++ item: QueuedRequest, ++ result: TResponse, ++ ): void { ++ const extractor = item.extractUsage ?? item.request.extractUsage; ++ if (!extractor) { ++ return; ++ } ++ ++ try { ++ const usageCandidate = extractor(result); ++ if (!usageCandidate) { ++ return; ++ } ++ ++ if (this.isPromiseLike(usageCandidate)) { ++ void Promise.resolve(usageCandidate) ++ .then((usage) => { ++ if (usage) { ++ this.incrementTenantUsage(item.tenantId, usage); ++ } ++ }) ++ .catch((error) => { ++ this.logger.debug("Failed to record tenant usage", { tenantId: item.tenantId, error }); ++ }); ++ return; ++ } ++ ++ this.incrementTenantUsage(item.tenantId, usageCandidate as UsageCounters); ++ } catch (error) { ++ this.logger.debug("Failed to record tenant usage", { tenantId: item.tenantId, error }); ++ } ++ } ++ ++ private incrementTenantUsage(tenantId: string, usage: UsageCounters): void { ++ const current = this.tenantUsage.get(tenantId) ?? { ++ inputTokens: 0, ++ outputTokens: 0, ++ totalTokens: 0, ++ }; ++ const inputTokens = usage.inputTokens ?? 0; ++ const outputTokens = usage.outputTokens ?? 0; ++ const totalTokens = usage.totalTokens ?? inputTokens + outputTokens; ++ const updated: TenantUsage = { ++ inputTokens: current.inputTokens + inputTokens, ++ outputTokens: current.outputTokens + outputTokens, ++ totalTokens: current.totalTokens + totalTokens, ++ }; ++ this.tenantUsage.set(tenantId, updated); ++ this.logger.debug("Recorded tenant usage", { tenantId, usage: updated }); ++ } ++ ++ private isPromiseLike(value: unknown): value is PromiseLike { ++ return ( ++ typeof value === "object" && ++ value !== null && ++ typeof (value as PromiseLike).then === "function" ++ ); ++ } ++ ++ private isCircuitBreakerStatus(status?: number): boolean { ++ if (status === 429) { ++ return true; ++ } ++ ++ return status !== undefined && status >= 500 && status < 600; ++ } ++ ++ private async runRequest(item: QueuedRequest): Promise { ++ const attempt = item.attempt ?? 
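++ // items enter the queue without an attempt count; treat them as attempt 1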
1; ++ ++ this.logDebug("[TrafficController] runRequest start", { ++ type: item.type, ++ rateLimitKey: item.rateLimitKey, ++ etaMs: item.etaMs, ++ active: this.activeCount, ++ queueSize: this.getQueueSize(), ++ }); ++ ++ try { ++ const result = await item.request.execute(); // Execute the user's operation ++ this.recordCircuitSuccess(item.request.metadata); ++ this.recordUsageFromResult(item, result); ++ item.resolve(result); // Deliver successful result back to the waiting caller ++ } catch (error) { ++ this.recordCircuitFailure(item.request.metadata, error); ++ const retryPlan = this.buildRetryPlan(error, attempt); ++ if (retryPlan) { ++ this.scheduleRetry(item, attempt + 1, retryPlan.delayMs, retryPlan.reason); ++ } else { ++ item.reject(error); // Surface failures to the caller ++ } ++ } finally { ++ this.activeCount = Math.max(0, this.activeCount - 1); // Ensure counter never underflows ++ this.logDebug("[TrafficController] runRequest complete", { ++ type: item.type, ++ active: this.activeCount, ++ queueSize: this.getQueueSize(), ++ }); ++ this.scheduleDrain(); // Immediately try to pull the next request ++ } ++ } ++ ++ private buildRetryPlan( ++ error: unknown, ++ attempt: number, ++ ): { delayMs: number; reason: RetryReason } | undefined { ++ const reason = this.getRetryReason(error); ++ if (!reason) { ++ return undefined; ++ } ++ ++ const maxAttempts = reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS; ++ if (attempt >= maxAttempts) { ++ return undefined; ++ } ++ ++ return { ++ reason, ++ delayMs: this.computeBackoffDelay(reason, attempt), ++ }; ++ } ++ ++ private getRetryReason(error: unknown): RetryReason | undefined { ++ const statusCode = this.extractStatusCode(error); ++ if (statusCode === 429) { ++ return "rateLimit"; ++ } ++ ++ if (statusCode !== undefined && statusCode >= 500 && statusCode < 600) { ++ return "serverError"; ++ } ++ ++ if (statusCode === 408 || this.isTimeoutError(error)) { ++ return "timeout"; ++ } ++ ++ return undefined; ++ } ++ ++ private extractStatusCode(error: unknown): number | undefined { ++ if (!error || typeof error !== "object") { ++ return undefined; ++ } ++ ++ const candidate = error as { status?: unknown; statusCode?: unknown; httpStatus?: unknown }; ++ const directStatus = ++ this.coerceStatus(candidate.status) ?? ++ this.coerceStatus(candidate.statusCode) ?? ++ this.coerceStatus(candidate.httpStatus); ++ if (directStatus !== undefined) { ++ return directStatus; ++ } ++ ++ const responseStatus = (error as { response?: { status?: unknown } }).response?.status; ++ const normalizedResponseStatus = this.coerceStatus(responseStatus); ++ if (normalizedResponseStatus !== undefined) { ++ return normalizedResponseStatus; ++ } ++ ++ const causeStatus = (error as { cause?: { status?: unknown; statusCode?: unknown } }).cause; ++ if (causeStatus) { ++ const normalizedCauseStatus = ++ this.coerceStatus(causeStatus.status) ?? 
this.coerceStatus(causeStatus.statusCode); ++ if (normalizedCauseStatus !== undefined) { ++ return normalizedCauseStatus; ++ } ++ } ++ ++ return undefined; ++ } ++ ++ private isTimeoutError(error: unknown): boolean { ++ const candidates = [error, (error as { cause?: unknown })?.cause]; ++ ++ for (const candidate of candidates) { ++ if (!candidate || typeof candidate !== "object") { ++ continue; ++ } ++ ++ const timeoutCode = (candidate as { code?: unknown }).code; ++ if (typeof timeoutCode === "string" && timeoutCode.toLowerCase().includes("timeout")) { ++ return true; ++ } ++ ++ const name = (candidate as { name?: unknown }).name; ++ if (typeof name === "string" && name.toLowerCase().includes("timeout")) { ++ return true; ++ } ++ ++ const message = (candidate as { message?: unknown }).message; ++ if (typeof message === "string" && message.toLowerCase().includes("timeout")) { ++ return true; ++ } ++ } ++ ++ return false; ++ } ++ ++ private coerceStatus(value: unknown): number | undefined { ++ if (typeof value === "number" && Number.isFinite(value)) { ++ return value; ++ } ++ ++ if (typeof value === "string") { ++ const parsed = Number(value); ++ if (Number.isFinite(parsed)) { ++ return parsed; ++ } ++ } ++ ++ return undefined; ++ } ++ ++ private computeBackoffDelay(reason: RetryReason, attempt: number): number { ++ const base = ++ reason === "serverError" ++ ? SERVER_ERROR_BASE_BACKOFF_MS ++ : reason === "timeout" ++ ? TIMEOUT_BASE_BACKOFF_MS ++ : RATE_LIMIT_BASE_BACKOFF_MS; ++ ++ const jitterFactor = ++ reason === "serverError" ++ ? SERVER_ERROR_JITTER_FACTOR ++ : reason === "timeout" ++ ? TIMEOUT_JITTER_FACTOR ++ : RATE_LIMIT_JITTER_FACTOR; ++ ++ const exponential = base * 2 ** Math.max(0, attempt - 1); ++ const jitter = exponential * jitterFactor * Math.random(); ++ return Math.max(1, Math.round(exponential + jitter)); ++ } ++ ++ private scheduleRetry( ++ item: QueuedRequest, ++ nextAttempt: number, ++ delayMs: number, ++ reason: RetryReason, ++ ): void { ++ this.logger.debug("Retrying request through controller", { ++ reason, ++ delayMs, ++ attempt: nextAttempt, ++ maxAttempts: reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS, ++ metadata: item.request.metadata, ++ }); ++ ++ setTimeout(() => { ++ const retryPriority = item.priority; ++ this.getQueue(retryPriority).push({ ++ ...item, ++ attempt: nextAttempt, ++ etaMs: undefined, ++ rateLimitKey: undefined, ++ circuitKey: undefined, ++ circuitStatus: undefined, ++ }); ++ this.scheduleDrain(); ++ }, delayMs); ++ } ++} ++ ++let singletonController: TrafficController | undefined; ++ ++export class CircuitBreakerOpenError extends Error { ++ readonly retryAfterMs?: number; ++ readonly metadata?: TrafficRequestMetadata; ++ ++ constructor(message: string, metadata?: TrafficRequestMetadata, retryAfterMs?: number) { ++ super(message); ++ this.name = "CircuitBreakerOpenError"; ++ this.metadata = metadata; ++ this.retryAfterMs = retryAfterMs; ++ } ++} ++ ++/** ++ * Retrieve the shared traffic controller instance. 
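++ *
++ * Options are only honored on the call that constructs the singleton; later
++ * calls return the existing instance unchanged, so configure it once at
++ * startup. A small sketch (the option value is illustrative):
++ *
++ * @example
++ * const controller = getTrafficController({ maxConcurrent: 4 });
++ * const same = getTrafficController({ maxConcurrent: 99 }); // options ignored
++ * console.assert(controller === same);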
++ */ ++export function getTrafficController(options?: TrafficControllerOptions): TrafficController { ++ if (!singletonController) { ++ // Create a singleton controller so all agents share the same queue/scheduling behavior ++ singletonController = new TrafficController(options); ++ } ++ ++ return singletonController; ++} +diff --git a/packages/core/src/workflow/core.ts b/packages/core/src/workflow/core.ts +index 3136511c..2b273d58 100644 +--- a/packages/core/src/workflow/core.ts ++++ b/packages/core/src/workflow/core.ts +@@ -827,6 +827,9 @@ export function createWorkflow< + + // Wrap entire execution in root span + const rootSpan = traceContext.getRootSpan(); ++ if (options?.tenantId) { ++ rootSpan.setAttribute("tenant.id", options.tenantId); ++ } + + // Add workflow state snapshot for remote observability + const workflowState = { +@@ -848,6 +851,7 @@ export function createWorkflow< + executionId, + userId: options?.userId, + conversationId: options?.conversationId, ++ tenantId: options?.tenantId, + traceId: rootSpan.spanContext().traceId, + spanId: rootSpan.spanContext().spanId, + }); +diff --git a/packages/core/src/workflow/internal/state.ts b/packages/core/src/workflow/internal/state.ts +index 71fa602d..2de12528 100644 +--- a/packages/core/src/workflow/internal/state.ts ++++ b/packages/core/src/workflow/internal/state.ts +@@ -23,6 +23,7 @@ export type WorkflowState = { + executionId: string; + conversationId?: string; + userId?: string; ++ tenantId?: string; + context?: UserContext; + active: number; + startAt: Date; +@@ -132,6 +133,7 @@ class WorkflowStateManagerInternal implements WorkflowStateManager + active: config?.active ?? 0, + userId: config?.userId, + conversationId: config?.conversationId, ++ tenantId: config?.tenantId, + context: config?.context, + startAt: new Date(), + endAt: null, +diff --git a/packages/core/src/workflow/internal/utils.ts b/packages/core/src/workflow/internal/utils.ts +index fc39530b..42250d82 100644 +--- a/packages/core/src/workflow/internal/utils.ts ++++ b/packages/core/src/workflow/internal/utils.ts +@@ -32,6 +32,7 @@ export function convertWorkflowStateToParam( + executionId: state.executionId, + conversationId: state.conversationId, + userId: state.userId, ++ tenantId: state.tenantId, + context: state.context, + active: state.active, + startAt: state.startAt, +diff --git a/packages/core/src/workflow/steps/and-agent.ts b/packages/core/src/workflow/steps/and-agent.ts +index bc46c148..14af9b8f 100644 +--- a/packages/core/src/workflow/steps/and-agent.ts ++++ b/packages/core/src/workflow/steps/and-agent.ts +@@ -66,6 +66,7 @@ export function andAgent( + context: restConfig.context ?? state.context, + conversationId: restConfig.conversationId ?? state.conversationId, + userId: restConfig.userId ?? state.userId, ++ tenantId: restConfig.tenantId ?? state.tenantId, + // No parentSpan when there's no workflow context + }); + // Accumulate usage if available (no workflow context) +@@ -92,6 +93,7 @@ export function andAgent( + context: restConfig.context ?? state.context, + conversationId: restConfig.conversationId ?? state.conversationId, + userId: restConfig.userId ?? state.userId, ++ tenantId: restConfig.tenantId ?? 
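+ // step-level config wins; otherwise inherit the workflow run's tenant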
state.tenantId, + // Pass the current step span as parent for proper span hierarchy + parentSpan: state.workflowContext?.currentStepSpan, + }); +diff --git a/packages/core/src/workflow/types.ts b/packages/core/src/workflow/types.ts +index f7eed282..49bfd8cb 100644 +--- a/packages/core/src/workflow/types.ts ++++ b/packages/core/src/workflow/types.ts +@@ -214,6 +214,10 @@ export interface WorkflowRunOptions { + * The conversation ID, this can be used to track the current conversation in a workflow + */ + conversationId?: string; ++ /** ++ * Tenant identifier propagated to agent steps and subcalls ++ */ ++ tenantId?: string; + /** + * The user ID, this can be used to track the current user in a workflow + */ +diff --git a/packages/scorers/src/llm/answer-correctness.ts b/packages/scorers/src/llm/answer-correctness.ts +index 2111fa31..d66cc007 100644 +--- a/packages/scorers/src/llm/answer-correctness.ts ++++ b/packages/scorers/src/llm/answer-correctness.ts +@@ -7,6 +7,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const ANSWER_CORRECTNESS_PROMPT = `Given a ground truth and an answer, analyze each statement in the answer and classify them in one of the following categories: + +@@ -84,15 +85,17 @@ export function createAnswerCorrectnessScorer< + const agent = new Agent({ + name: "answer-correctness-classifier", + model, ++ trafficPriority: "P2", + instructions: "You classify statements for answer correctness evaluation", + }); + ++ const tenantId = extractTenantId(context); + const payload = resolvePayload(context, buildPayload); + const prompt = ANSWER_CORRECTNESS_PROMPT.replace("{{question}}", payload.input) + .replace("{{answer}}", payload.output) + .replace("{{ground_truth}}", payload.expected); + +- const response = await agent.generateObject(prompt, CLASSIFICATION_SCHEMA); ++ const response = await agent.generateObject(prompt, CLASSIFICATION_SCHEMA, { tenantId }); + const normalized = normalizeClassification(response.object); + + return { +diff --git a/packages/scorers/src/llm/answer-relevancy.ts b/packages/scorers/src/llm/answer-relevancy.ts +index a3de2237..d9bda1c9 100644 +--- a/packages/scorers/src/llm/answer-relevancy.ts ++++ b/packages/scorers/src/llm/answer-relevancy.ts +@@ -8,6 +8,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const QUESTION_GEN_PROMPT = `Generate a question for the given answer and Identify if answer is noncommittal. Give noncommittal as 1 if the answer is noncommittal and 0 if the answer is committal. A noncommittal answer is one that is evasive, vague, or ambiguous. 
For example, "I don't know" or "I'm not sure" are noncommittal answers + +@@ -119,9 +120,11 @@ export function createAnswerRelevancyScorer< + const agent = new Agent({ + name: "question-generator", + model, ++ trafficPriority: "P2", + instructions: "You generate questions from answers to evaluate relevancy", + }); + ++ const tenantId = extractTenantId(context); + const payload = resolvePayload(context, buildPayload); + const questions: GeneratedQuestion[] = []; + +@@ -131,7 +134,7 @@ export function createAnswerRelevancyScorer< + payload.context, + ); + +- const response = await agent.generateObject(prompt, QUESTION_SCHEMA); ++ const response = await agent.generateObject(prompt, QUESTION_SCHEMA, { tenantId }); + questions.push({ + question: response.object.question, + noncommittal: response.object.noncommittal === 1, +diff --git a/packages/scorers/src/llm/classifiers.ts b/packages/scorers/src/llm/classifiers.ts +index 1bca4239..a327e20d 100644 +--- a/packages/scorers/src/llm/classifiers.ts ++++ b/packages/scorers/src/llm/classifiers.ts +@@ -7,6 +7,7 @@ import { + } from "@voltagent/core"; + import { safeStringify } from "@voltagent/internal/utils"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + type ChoiceId = string; + +@@ -93,11 +94,14 @@ async function evaluateChoice(args: EvaluateChoiceArgs): Promise + const agent = new Agent({ + name: `${scorerId}-judge`, + model, ++ trafficPriority: "P2", + instructions: judgeInstructions ?? buildDefaultChoiceInstructions(Object.keys(choices)), + }); + ++ const tenantId = extractTenantId(context); + const response = await agent.generateObject(prompt, CHOICE_RESPONSE_SCHEMA, { + maxOutputTokens, ++ tenantId, + }); + + const { choice, reason } = extractChoiceFromResponse(response.object, choices, scorerId); +diff --git a/packages/scorers/src/llm/context-precision.ts b/packages/scorers/src/llm/context-precision.ts +index d31b5b85..ba680f56 100644 +--- a/packages/scorers/src/llm/context-precision.ts ++++ b/packages/scorers/src/llm/context-precision.ts +@@ -7,6 +7,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const CONTEXT_PRECISION_PROMPT = `Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output. + +@@ -109,6 +110,7 @@ export function createContextPrecisionScorer< + const agent = new Agent({ + name: "context-precision-evaluator", + model, ++ trafficPriority: "P2", + instructions: "You evaluate if context was useful for arriving at the answer", + }); + +@@ -116,12 +118,15 @@ export function createContextPrecisionScorer< + const contextText = Array.isArray(payload.context) + ? 
payload.context.join("\n") + : payload.context; ++ const tenantId = extractTenantId(context); + + const prompt = CONTEXT_PRECISION_PROMPT.replace("{{question}}", payload.input) + .replace("{{context}}", contextText) + .replace("{{answer}}", payload.output); + +- const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA); ++ const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA, { ++ tenantId, ++ }); + + context.results.raw.contextPrecisionVerdict = response.object; + +diff --git a/packages/scorers/src/llm/context-recall.ts b/packages/scorers/src/llm/context-recall.ts +index e6e86510..2c6053fc 100644 +--- a/packages/scorers/src/llm/context-recall.ts ++++ b/packages/scorers/src/llm/context-recall.ts +@@ -7,6 +7,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const CONTEXT_RECALL_EXTRACT_PROMPT = `Given the context and ground truth (expected output), extract all factual statements from the ground truth. + +@@ -120,6 +121,7 @@ export function createContextRecallScorer< + const agent = new Agent({ + name: "context-recall-evaluator", + model, ++ trafficPriority: "P2", + instructions: "You evaluate how well provided context supports factual statements", + }); + +@@ -127,6 +129,7 @@ export function createContextRecallScorer< + const contextText = Array.isArray(payload.context) + ? payload.context.join("\n") + : payload.context; ++ const tenantId = extractTenantId(context); + + // Extract statements from expected output + const extractPrompt = CONTEXT_RECALL_EXTRACT_PROMPT.replace( +@@ -134,7 +137,9 @@ export function createContextRecallScorer< + contextText, + ).replace("{{expected}}", payload.expected); + +- const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA); ++ const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA, { ++ tenantId, ++ }); + const statements = extractResponse.object.statements; + + if (statements.length === 0) { +@@ -152,7 +157,9 @@ export function createContextRecallScorer< + contextText, + ).replace("{{statement}}", statement); + +- const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA); ++ const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA, { ++ tenantId, ++ }); + verdicts.push({ + statement, + verdict: verifyResponse.object.verdict, +diff --git a/packages/scorers/src/llm/context-relevancy.ts b/packages/scorers/src/llm/context-relevancy.ts +index ee882b5b..aca608b2 100644 +--- a/packages/scorers/src/llm/context-relevancy.ts ++++ b/packages/scorers/src/llm/context-relevancy.ts +@@ -7,6 +7,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const CONTEXT_RELEVANCY_PROMPT = `Analyze the provided context and identify which parts are relevant to answering the given question. For each context sentence or passage, determine its relevance level. + +@@ -144,6 +145,7 @@ export function createContextRelevancyScorer< + const agent = new Agent({ + name: "context-relevancy-evaluator", + model, ++ trafficPriority: "P2", + instructions: "You evaluate how relevant provided context is to answering questions", + }); + +@@ -151,13 +153,16 @@ export function createContextRelevancyScorer< + const contextText = Array.isArray(payload.context) + ? 
payload.context.join("\n") + : payload.context; ++ const tenantId = extractTenantId(context); + + const prompt = CONTEXT_RELEVANCY_PROMPT.replace("{{question}}", payload.input).replace( + "{{context}}", + contextText, + ); + +- const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA); ++ const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA, { ++ tenantId, ++ }); + const evaluations = response.object.evaluations; + + context.results.raw.contextRelevancyEvaluations = evaluations; +diff --git a/packages/scorers/src/llm/moderation.ts b/packages/scorers/src/llm/moderation.ts +index 03563bfe..1055927f 100644 +--- a/packages/scorers/src/llm/moderation.ts ++++ b/packages/scorers/src/llm/moderation.ts +@@ -7,6 +7,7 @@ import { + } from "@voltagent/core"; + import { safeStringify } from "@voltagent/internal/utils"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + export interface ModerationScorerOptions { + id?: string; +@@ -220,6 +221,7 @@ async function runModerationJudge(args: { + typeof context.results.prepare === "string" + ? context.results.prepare + : normalizeText(context.payload.output); ++ const tenantId = extractTenantId(context); + + const prompt = await buildPrompt({ + output: normalizedOutput, +@@ -232,12 +234,14 @@ async function runModerationJudge(args: { + const agent = new Agent({ + name: "moderation-judge", + model, ++ trafficPriority: "P2", + instructions: + "You are a safety classifier. Respond with JSON that matches the provided schema containing flagged, scores, and reason.", + }); + + const response = await agent.generateObject(prompt, MODERATION_SCHEMA, { + maxOutputTokens, ++ tenantId, + }); + + const parsed = mapModerationResponse(response.object, threshold); +diff --git a/packages/scorers/src/llm/utils.ts b/packages/scorers/src/llm/utils.ts +new file mode 100644 +index 00000000..75e886e3 +--- /dev/null ++++ b/packages/scorers/src/llm/utils.ts +@@ -0,0 +1,14 @@ ++import type { BuilderPrepareContext, BuilderScoreContext } from "@voltagent/core"; ++ ++type TenantAwareContext = BuilderScoreContext, Record> & ++ BuilderPrepareContext, Record>; ++ ++export function extractTenantId( ++ context: ++ | BuilderScoreContext, Record> ++ | BuilderPrepareContext, Record> ++ | TenantAwareContext, ++): string | undefined { ++ const candidate = (context.payload as { tenantId?: unknown })?.tenantId; ++ return typeof candidate === "string" ? 
candidate : undefined; ++} diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index 51d097011..adf6e92f9 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -4082,12 +4082,23 @@ export class Agent { return; } - const refillPerSecond = updateResult.normalized.refillPerMs * 1000; + const now = Date.now(); + const effectiveRemaining = Math.max( + 0, + updateResult.state.remaining - updateResult.state.reserved, + ); + const resetInMs = Math.max(0, updateResult.state.resetAt - now); + const nextAllowedInMs = Math.max(0, updateResult.state.nextAllowedAt - now); logger?.info?.("[Traffic] Applied rate limit from response headers", { rateLimitKey: updateResult.key, - capacity: updateResult.normalized.capacity, - refillPerSecond, - appliedTokens: updateResult.appliedTokens, + limit: updateResult.state.limit, + remaining: updateResult.state.remaining, + reserved: updateResult.state.reserved, + effectiveRemaining, + resetAt: updateResult.state.resetAt, + resetInMs, + nextAllowedAt: updateResult.state.nextAllowedAt, + nextAllowedInMs, headers: { limitRequests: updateResult.headerSnapshot.limitRequests, remainingRequests: updateResult.headerSnapshot.remainingRequests, diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts index 9b89d4b85..aa7fba6d0 100644 --- a/packages/core/src/traffic/traffic-controller.spec.ts +++ b/packages/core/src/traffic/traffic-controller.spec.ts @@ -40,22 +40,16 @@ describe("TrafficController priority scheduling", () => { vi.useFakeTimers(); try { - const controller = new TrafficController({ - maxConcurrent: 1, - rateLimits: { - "p0::m0": { capacity: 1, refillPerSecond: 1 }, + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ maxConcurrent: 1 }); + controller.updateRateLimitFromHeaders( + { provider: "p0", model: "m0" }, + { + "x-ratelimit-limit-requests": "1", + "x-ratelimit-remaining-requests": "0", + "x-ratelimit-reset-requests": "1s", }, - }); - - // Exhaust the bucket for the P0 key so it initially waits - const buckets = (controller as unknown as { rateLimitBuckets: Map }) - .rateLimitBuckets; - buckets.set("p0::m0", { - tokens: 0, - capacity: 1, - refillPerMs: 1 / 1000, - lastRefill: Date.now(), - }); + ); const order: string[] = []; @@ -85,3 +79,119 @@ describe("TrafficController priority scheduling", () => { } }); }); + +describe("TrafficController rate limit headers", () => { + it("parses OpenAI-style compound reset durations (e.g. 
1m30.951s)", () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(1_000_000)); + const controller = new TrafficController({ maxConcurrent: 1 }); + const now = Date.now(); + + const result = controller.updateRateLimitFromHeaders( + { provider: "openai.responses", model: "gpt-4o-mini" }, + { + "x-ratelimit-limit-requests": "10000", + "x-ratelimit-remaining-requests": "9989", + "x-ratelimit-reset-requests": "1m30.951s", + }, + ); + + expect(result).toBeTruthy(); + expect(result?.headerSnapshot.resetRequestsMs).toBeCloseTo(90_951, 6); + expect(result?.state.limit).toBe(10000); + expect(result?.state.remaining).toBe(9989); + expect(result?.state.resetAt).toBe(now + 90_951); + expect(result?.state.reserved).toBe(0); + expect(result?.state.nextAllowedAt).toBe(now); + } finally { + vi.useRealTimers(); + } + }); + + it("keeps resetAt monotonic when headers shorten the reset duration", () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ maxConcurrent: 1 }); + + const first = controller.updateRateLimitFromHeaders( + { provider: "openai.responses", model: "gpt-4o-mini" }, + { + "x-ratelimit-limit-requests": "10000", + "x-ratelimit-remaining-requests": "9999", + "x-ratelimit-reset-requests": "60s", + }, + ); + + expect(first).toBeTruthy(); + expect(first?.state.resetAt).toBe(60_000); + + vi.setSystemTime(new Date(10_000)); + const second = controller.updateRateLimitFromHeaders( + { provider: "openai.responses", model: "gpt-4o-mini" }, + { + "x-ratelimit-limit-requests": "10000", + "x-ratelimit-remaining-requests": "9998", + "x-ratelimit-reset-requests": "5s", + }, + ); + + expect(second).toBeTruthy(); + expect(second?.state.resetAt).toBe(60_000); + } finally { + vi.useRealTimers(); + } + }); + + it("never increases remaining within the same window", () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ maxConcurrent: 1 }); + + const first = controller.updateRateLimitFromHeaders( + { provider: "openai.responses", model: "gpt-4o-mini" }, + { + "x-ratelimit-limit-requests": "10", + "x-ratelimit-remaining-requests": "9", + "x-ratelimit-reset-requests": "60s", + }, + ); + + expect(first?.state.remaining).toBe(9); + expect(first?.state.resetAt).toBe(60_000); + + vi.setSystemTime(new Date(10_000)); + const second = controller.updateRateLimitFromHeaders( + { provider: "openai.responses", model: "gpt-4o-mini" }, + { + "x-ratelimit-limit-requests": "10", + "x-ratelimit-remaining-requests": "8", + "x-ratelimit-reset-requests": "50s", + }, + ); + + expect(second?.state.remaining).toBe(8); + expect(second?.state.resetAt).toBe(60_000); + + vi.setSystemTime(new Date(20_000)); + const third = controller.updateRateLimitFromHeaders( + { provider: "openai.responses", model: "gpt-4o-mini" }, + { + "x-ratelimit-limit-requests": "10", + "x-ratelimit-remaining-requests": "9", + "x-ratelimit-reset-requests": "40s", + }, + ); + + expect(third?.state.remaining).toBe(8); + expect(third?.state.resetAt).toBe(60_000); + } finally { + vi.useRealTimers(); + } + }); +}); diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 8d82e8a5d..1cf76109b 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -22,20 +22,20 @@ const TIMEOUT_BASE_BACKOFF_MS = 750; const RATE_LIMIT_JITTER_FACTOR = 0.35; const SERVER_ERROR_JITTER_FACTOR = 0.8; const TIMEOUT_JITTER_FACTOR = 0.5; 
+const RATE_LIMIT_EXHAUSTION_BUFFER = 1; +const RATE_LIMIT_PROBE_DELAY_MS = 50; +const RATE_LIMIT_MIN_PACE_INTERVAL_MS = 10; +const RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS = 10; const DEFAULT_FALLBACK_CHAINS: Record = { "gpt-4o": ["gpt-4o-mini", "gpt-3.5"], }; -interface RateLimitBucket { - tokens: number; - capacity: number; - refillPerMs: number; - lastRefill: number; -} - -type NormalizedRateLimit = { - capacity: number; - refillPerMs: number; +type RateLimitWindowState = { + limit: number; + remaining: number; + resetAt: number; + reserved: number; + nextAllowedAt: number; }; export interface RateLimitOptions { @@ -60,15 +60,14 @@ export type RateLimitConfig = Record; type RateLimitHeaderSnapshot = { limitRequests: number; - remainingRequests?: number; + remainingRequests: number; resetRequestsMs: number; }; export type RateLimitUpdateResult = { key: string; headerSnapshot: RateLimitHeaderSnapshot; - normalized: NormalizedRateLimit; - appliedTokens: number; + state: RateLimitWindowState; }; export type TrafficRequestType = "text" | "stream"; @@ -137,8 +136,7 @@ type ProcessDecision = "process" | "skip" | "wait"; export class TrafficController { private readonly scheduler: Scheduler; private readonly maxConcurrent: number; - private rateLimits?: Map; - private readonly rateLimitBuckets = new Map(); + private readonly rateLimitStates = new Map(); private readonly circuitBreakers = new Map(); private readonly fallbackChains: Map; private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"]; @@ -149,7 +147,8 @@ export class TrafficController { }; private activeCount = 0; private drainScheduled = false; - private refillTimeout?: ReturnType; + private wakeUpTimeout?: ReturnType; + private wakeUpAt?: number; private readonly tenantUsage = new Map(); private readonly logger: Logger; @@ -161,7 +160,6 @@ export class TrafficController { constructor(options: TrafficControllerOptions = {}) { this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; - this.rateLimits = this.normalizeRateLimits(options.rateLimits); this.fallbackChains = this.normalizeFallbackChains(options.fallbackChains); this.scheduler = this.createScheduler(); @@ -171,7 +169,7 @@ export class TrafficController { // INIT LOG (from HEAD) — rewritten to use the new logger this.logger.debug("[TrafficController] init", { maxConcurrent: this.maxConcurrent, - rateLimits: this.rateLimits ? 
Array.from(this.rateLimits.entries()) : undefined, + rateLimitKeys: Array.from(this.rateLimitStates.keys()), }); } @@ -316,8 +314,8 @@ export class TrafficController { return "wait"; } - const rateLimitConfig = this.getRateLimitConfig(next.request.metadata); - if (!rateLimitConfig) { + const rateLimitState = this.getRateLimitState(next.request.metadata); + if (!rateLimitState) { this.logDebug("[TrafficController] no rate limit match", { metadata: next.request.metadata, }); @@ -326,159 +324,93 @@ export class TrafficController { return "process"; // No rate limit configured for this key } - const queuedAhead = this.countQueuedAheadWithKey( - rateLimitConfig.key, - next, - /*logDetails*/ true, - ); - const bucket = this.getRateLimitBucket(rateLimitConfig.key, rateLimitConfig.limit); - if (bucket.tokens < 1) { - next.rateLimitKey = rateLimitConfig.key; - next.etaMs = this.computeEtaMs( - bucket, - rateLimitConfig.limit, - rateLimitConfig.key, - next, - queuedAhead, - ); + const { key, state } = rateLimitState; + const now = Date.now(); + const effectiveRemaining = Math.max(0, state.remaining - state.reserved); + const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; + + if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { + next.rateLimitKey = key; + next.etaMs = Math.max(0, probeAt - now); this.logDebug("[TrafficController] throttle rate", { - key: rateLimitConfig.key, - tokens: bucket.tokens, + key, + remaining: state.remaining, + reserved: state.reserved, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + effectiveRemaining, etaMs: next.etaMs, - queuedAhead, }); - this.scheduleRefill(rateLimitConfig.limit); // Ensure we retry as soon as tokens are replenished - return "wait"; - } - - bucket.tokens -= 1; // Consume a token for this dispatch - this.logDebug("[TrafficController] token consumed", { - key: rateLimitConfig.key, - remaining: bucket.tokens, - capacity: bucket.capacity, - }); - next.rateLimitKey = rateLimitConfig.key; - next.etaMs = 0; - return "process"; - } - private getRateLimitConfig( - metadata?: TrafficRequestMetadata, - ): { key: string; limit: NormalizedRateLimit } | undefined { - if (!this.rateLimits || this.rateLimits.size === 0) { - return undefined; - } + if (now < probeAt) { + this.scheduleRateLimitWakeUpAt(probeAt); + return "wait"; + } - const key = this.buildRateLimitKey(metadata); - const limit = this.rateLimits.get(key); - if (!limit) { - return undefined; + // Window has expired, but we have not observed a newer header snapshot yet. + // Allow a single probe request (no in-flight reservations) to refresh headers. 
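+ // An in-flight request for this key will carry fresh headers back itself,
+ // so while anything is reserved we keep waiting rather than stacking probes.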
+ if (state.reserved > 0) { + return "wait"; + } } - this.logDebug("[TrafficController] rateLimitConfig hit", { key }); - return { key, limit }; - } - - private getRateLimitBucket(key: string, limit: NormalizedRateLimit): RateLimitBucket { - const now = Date.now(); // Snapshot time once to avoid drift within this method - let bucket = this.rateLimitBuckets.get(key); // Reuse the bucket if it already exists - - if (!bucket) { - bucket = { - tokens: limit.capacity, - capacity: limit.capacity, - refillPerMs: limit.refillPerMs, - lastRefill: now, - }; - this.rateLimitBuckets.set(key, bucket); - this.logDebug("[TrafficController] bucket create", { + if (now < state.nextAllowedAt) { + next.rateLimitKey = key; + next.etaMs = Math.max(0, state.nextAllowedAt - now); + this.logDebug("[TrafficController] throttle rate", { key, - capacity: bucket.capacity, - refillPerMs: bucket.refillPerMs, + remaining: state.remaining, + reserved: state.reserved, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + effectiveRemaining, + etaMs: next.etaMs, }); - return bucket; + this.scheduleRateLimitWakeUpAt(Math.min(state.resetAt, state.nextAllowedAt)); + return "wait"; } - if ( - bucket.capacity !== limit.capacity || - Math.abs(bucket.refillPerMs - limit.refillPerMs) > Number.EPSILON - ) { - bucket.capacity = limit.capacity; - bucket.refillPerMs = limit.refillPerMs; - bucket.tokens = Math.min(bucket.tokens, bucket.capacity); - bucket.lastRefill = now; - this.logDebug("[TrafficController] bucket sync with new limit", { - key, - capacity: bucket.capacity, - refillPerMs: bucket.refillPerMs, - }); - } + // Allow request: reserve one slot until we receive headers (or completion). + state.reserved += 1; + next.rateLimitKey = key; + next.etaMs = 0; - const elapsedMs = Math.max(0, now - bucket.lastRefill); - if (elapsedMs > 0 && bucket.tokens < bucket.capacity) { - const refilled = elapsedMs * bucket.refillPerMs; // Refill based on elapsed time - bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refilled); // Cap at bucket capacity - bucket.lastRefill = now; // Mark refill time for the next calculation - this.logDebug("[TrafficController] bucket refill", { - key, - elapsedMs, - tokens: bucket.tokens, - }); + const remainingWindowMs = Math.max(0, state.resetAt - now); + const intervalMs = Math.max( + RATE_LIMIT_MIN_PACE_INTERVAL_MS, + Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), + ); + const candidateNextAllowedAt = Math.max(state.nextAllowedAt, now + intervalMs); + const shouldUpdateNextAllowedAt = + state.nextAllowedAt <= now || + candidateNextAllowedAt >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS; + if (shouldUpdateNextAllowedAt) { + state.nextAllowedAt = candidateNextAllowedAt; } - return bucket; - } - - private computeEtaMs( - bucket: RateLimitBucket, - limit: NormalizedRateLimit, - key: string, - current: QueuedRequest, - queuedAhead?: number, - ): number { - const missingTokens = Math.max(0, 1 - bucket.tokens); - const waitForToken = - missingTokens > 0 && limit.refillPerMs > 0 ? Math.ceil(missingTokens / limit.refillPerMs) : 0; - const aheadCount = - typeof queuedAhead === "number" - ? queuedAhead - : this.countQueuedAheadWithKey(key, current, /*logDetails*/ false); - const extraForQueue = - aheadCount > 0 && limit.refillPerMs > 0 ? 
Math.ceil(aheadCount / limit.refillPerMs) : 0; - this.logDebug("[TrafficController] computeEtaMs", { + this.logDebug("[TrafficController] rate limit reserved", { key, - missingTokens, - waitForToken, - aheadCount, - extraForQueue, - eta: waitForToken + extraForQueue, + remaining: state.remaining, + reserved: state.reserved, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + intervalMs, + effectiveRemaining, }); - return waitForToken + extraForQueue; + return "process"; } - private countQueuedAheadWithKey(key: string, current: QueuedRequest, logDetails = false): number { - let count = 0; - for (const priority of this.priorityOrder) { - const queue = this.getQueue(priority); - for (const item of queue) { - if (item === current) { - return count; - } - - const itemKey = this.buildRateLimitKey(item.request.metadata); - if (itemKey === key) { - count += 1; - } - } - } - if (logDetails) { - this.logDebug("[TrafficController] countQueuedAheadWithKey", { - key, - count, - queueSize: this.getQueueSize(), - }); + private getRateLimitState( + metadata?: TrafficRequestMetadata, + ): { key: string; state: RateLimitWindowState } | undefined { + const key = this.buildRateLimitKey(metadata); + const state = this.rateLimitStates.get(key); + if (!state) { + return undefined; } - return count; + + this.logDebug("[TrafficController] rateLimitState hit", { key }); + return { key, state }; } private evaluateCircuitBreaker(next: QueuedRequest): ProcessDecision { @@ -621,26 +553,6 @@ export class TrafficController { } } - private normalizeRateLimits( - rateLimits?: RateLimitConfig, - ): Map | undefined { - if (!rateLimits) { - return undefined; - } - - const normalized = new Map(); - for (const [key, config] of Object.entries(rateLimits)) { - if (config.capacity > 0 && config.refillPerSecond > 0) { - normalized.set(key, { - capacity: config.capacity, - refillPerMs: config.refillPerSecond / 1000, - }); - } - } - - return normalized.size > 0 ? normalized : undefined; - } - private normalizeFallbackChains( fallbackChains?: Record, ): Map { @@ -663,7 +575,7 @@ export class TrafficController { } /** - * Update (or bootstrap) rate limit buckets based on provider response headers. + * Update (or bootstrap) rate limit window state based on provider response headers. * This lets the controller adopt server-issued limits without static config. */ updateRateLimitFromHeaders( @@ -678,54 +590,54 @@ export class TrafficController { return undefined; } - const normalized = this.normalizeHeaderRateLimit(headerInfo); - if (!normalized) { + const key = this.buildRateLimitKey(metadata); + const now = Date.now(); + const limit = headerInfo.limitRequests; + const remaining = this.coerceRemaining(headerInfo.remainingRequests, limit); + if (remaining === undefined) { this.logDebug("[TrafficController] rate limit headers present but invalid", { headerInfo, }); return undefined; } - const key = this.buildRateLimitKey(metadata); - if (!this.rateLimits) { - this.rateLimits = new Map(); - } - this.rateLimits.set(key, normalized); + const existing = this.rateLimitStates.get(key); + const resetAtCandidate = now + headerInfo.resetRequestsMs; + const resetAt = existing ? Math.max(existing.resetAt, resetAtCandidate) : resetAtCandidate; + const reserved = Math.max(0, existing?.reserved ?? 0); + const remainingFromHeaders = Math.min(limit, remaining); + const isSameWindow = Boolean(existing && now < existing.resetAt); + const nextRemaining = isSameWindow + ? Math.min(existing?.remaining ?? 
remainingFromHeaders, remainingFromHeaders) + : remainingFromHeaders; + const effectiveRemaining = Math.max(0, nextRemaining - reserved); + const state: RateLimitWindowState = { + limit, + remaining: nextRemaining, + resetAt, + reserved, + nextAllowedAt: existing?.nextAllowedAt ?? now, + }; - const now = Date.now(); - const remainingTokens = this.coerceRemaining(headerInfo.remainingRequests, normalized.capacity); - const existingBucket = this.rateLimitBuckets.get(key); - const tokens = remainingTokens ?? existingBucket?.tokens ?? normalized.capacity; - - if (existingBucket) { - existingBucket.capacity = normalized.capacity; - existingBucket.refillPerMs = normalized.refillPerMs; - existingBucket.tokens = Math.min(tokens, normalized.capacity); - existingBucket.lastRefill = now; - } else { - this.rateLimitBuckets.set(key, { - tokens: Math.min(tokens, normalized.capacity), - capacity: normalized.capacity, - refillPerMs: normalized.refillPerMs, - lastRefill: now, - }); - } + this.rateLimitStates.set(key, state); this.logDebug("[TrafficController] rateLimit updated from headers", { key, - capacity: normalized.capacity, - refillPerMs: normalized.refillPerMs, - remaining: remainingTokens, + limit: state.limit, + remaining: state.remaining, + reserved: state.reserved, + effectiveRemaining, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, }); - // If we just refilled tokens, try draining again. + // Try draining again in case this update unblocks queued work. this.scheduleDrain(); return { key, headerSnapshot: headerInfo, - normalized, - appliedTokens: Math.min(tokens, normalized.capacity), + state, }; } @@ -736,19 +648,19 @@ export class TrafficController { } const limitRequests = this.parseNumberHeader(getHeader, "x-ratelimit-limit-requests"); + const remainingRequests = this.parseNumberHeader(getHeader, "x-ratelimit-remaining-requests"); const resetRequestsMs = this.parseDurationHeaderToMs(getHeader, "x-ratelimit-reset-requests"); if ( limitRequests === undefined || limitRequests <= 0 || + remainingRequests === undefined || resetRequestsMs === undefined || resetRequestsMs <= 0 ) { return undefined; } - const remainingRequests = this.parseNumberHeader(getHeader, "x-ratelimit-remaining-requests"); - return { limitRequests, remainingRequests, @@ -756,19 +668,6 @@ export class TrafficController { }; } - private normalizeHeaderRateLimit( - snapshot: RateLimitHeaderSnapshot, - ): NormalizedRateLimit | undefined { - if (snapshot.limitRequests <= 0 || snapshot.resetRequestsMs <= 0) { - return undefined; - } - - return { - capacity: snapshot.limitRequests, - refillPerMs: snapshot.limitRequests / snapshot.resetRequestsMs, - }; - } - private coerceRemaining(remaining: number | undefined, capacity: number): number | undefined { if (remaining === undefined) { return undefined; @@ -842,18 +741,78 @@ export class TrafficController { } const trimmed = raw.trim(); - const match = trimmed.match(/^(-?\d+(?:\.\d+)?)(ms|s)?$/i); - if (!match) { + if (!trimmed) { return undefined; } - const value = Number(match[1]); - if (!Number.isFinite(value) || value <= 0) { + const simpleMatch = trimmed.match(/^(\d+(?:\.\d+)?)(ms|s|m|h)?$/i); + if (simpleMatch) { + const value = Number(simpleMatch[1]); + if (!Number.isFinite(value) || value <= 0) { + return undefined; + } + + const unit = (simpleMatch[2] ?? 
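+ // bare numeric values carry no unit suffix; default to seconds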
"s").toLowerCase(); + switch (unit) { + case "ms": + return value; + case "s": + return value * 1000; + case "m": + return value * 60 * 1000; + case "h": + return value * 60 * 60 * 1000; + default: + return undefined; + } + } + + // Compound durations like "1m30.951s" + const segmentRegex = /(\d+(?:\.\d+)?)(ms|s|m|h)/gi; + let totalMs = 0; + let matched = false; + + segmentRegex.lastIndex = 0; + let segment: RegExpExecArray | null = segmentRegex.exec(trimmed); + while (segment !== null) { + matched = true; + const value = Number(segment[1]); + if (!Number.isFinite(value) || value < 0) { + return undefined; + } + + const unit = segment[2].toLowerCase(); + switch (unit) { + case "ms": + totalMs += value; + break; + case "s": + totalMs += value * 1000; + break; + case "m": + totalMs += value * 60 * 1000; + break; + case "h": + totalMs += value * 60 * 60 * 1000; + break; + default: + return undefined; + } + + segment = segmentRegex.exec(trimmed); + } + + if (!matched || totalMs <= 0) { + return undefined; + } + + segmentRegex.lastIndex = 0; + const leftover = trimmed.replace(segmentRegex, "").trim(); + if (leftover) { return undefined; } - const unit = (match[2] || "s").toLowerCase(); - return unit === "ms" ? value : value * 1000; + return totalMs; } private resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority { @@ -881,23 +840,61 @@ export class TrafficController { return size; } - private scheduleRefill(limit: NormalizedRateLimit): void { - if (this.refillTimeout) { + private scheduleRateLimitWakeUpAt(wakeUpAt: number): void { + if (!Number.isFinite(wakeUpAt)) { return; } - const delayMs = Math.max(1, Math.ceil(1 / limit.refillPerMs)); // Wait long enough for at least one token - this.logDebug("[TrafficController] scheduleRefill", { delayMs }); - this.refillTimeout = setTimeout(() => { - this.refillTimeout = undefined; // Allow future refills to be scheduled - this.logDebug("[TrafficController] refillTimeoutFired", { + const now = Date.now(); + const targetAt = Math.max(now, wakeUpAt); + + if (this.wakeUpTimeout && this.wakeUpAt !== undefined && this.wakeUpAt <= targetAt) { + return; + } + + if (this.wakeUpTimeout) { + clearTimeout(this.wakeUpTimeout); + this.wakeUpTimeout = undefined; + } + + this.wakeUpAt = targetAt; + const delayMs = Math.max(1, Math.ceil(targetAt - now)); + this.logDebug("[TrafficController] scheduleRateLimitWakeUp", { delayMs, wakeUpAt: targetAt }); + this.wakeUpTimeout = setTimeout(() => { + this.wakeUpTimeout = undefined; + this.wakeUpAt = undefined; + this.logDebug("[TrafficController] rateLimitWakeUpFired", { queueSize: this.getQueueSize(), active: this.activeCount, }); - this.scheduleDrain(); // Try draining again now that tokens should exist + this.scheduleDrain(); }, delayMs); } + private releaseRateLimitReservation(key: string | undefined): void { + if (!key) { + return; + } + + const state = this.rateLimitStates.get(key); + if (!state) { + return; + } + + if (state.reserved <= 0) { + return; + } + + state.reserved = Math.max(0, state.reserved - 1); + this.logDebug("[TrafficController] rate limit released", { + key, + reserved: state.reserved, + remaining: state.remaining, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + }); + } + private recordCircuitSuccess(metadata?: TrafficRequestMetadata): void { const key = this.buildRateLimitKey(metadata); if (this.circuitBreakers.has(key)) { @@ -1063,6 +1060,7 @@ export class TrafficController { item.reject(error); // Surface failures to the caller } } finally { + 
this.releaseRateLimitReservation(item.rateLimitKey); this.activeCount = Math.max(0, this.activeCount - 1); // Ensure counter never underflows this.logDebug("[TrafficController] runRequest complete", { type: item.type, diff --git a/tmp/test/traffic-concurrency.ts b/tmp/test/traffic-concurrency.ts new file mode 100644 index 000000000..d12fc5c9f --- /dev/null +++ b/tmp/test/traffic-concurrency.ts @@ -0,0 +1,91 @@ +// @ts-nocheck +/** + * Manual test: TrafficController maxConcurrent scheduling. + * + * What to look for: + * - `inFlight` should never exceed `maxConcurrent`. + * - Requests should start in bursts up to `maxConcurrent`. + * + * Run: + * - pnpm ts-node tmp/test/traffic-concurrency.ts + * - VERBOSE=1 pnpm ts-node tmp/test/traffic-concurrency.ts (enable controller debug logs) + */ + +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); +const now = () => new Date().toISOString(); + +const maxConcurrent = 3; +const controller = getTrafficController({ maxConcurrent }); + +let inFlight = 0; +let maxObserved = 0; + +function makeModel(id: string, durationMs: number) { + return { + specificationVersion: "v2", + provider: "sim", + modelId: `concurrency-${id}`, + doGenerate: async () => { + inFlight += 1; + maxObserved = Math.max(maxObserved, inFlight); + console.log(`[${now()}] start ${id} inFlight=${inFlight}`); + + try { + await sleep(durationMs); + return { + content: [{ type: "text", text: `ok:${id}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { modelId: `concurrency-${id}`, headers: {} }, + }; + } finally { + inFlight -= 1; + console.log(`[${now()}] end ${id} inFlight=${inFlight}`); + } + }, + }; +} + +async function main() { + console.log(`\n=== TrafficController concurrency (maxConcurrent=${maxConcurrent}) ===`); + void controller; + + const agent = new Agent({ + name: "traffic-concurrency", + instructions: "echo", + model: makeModel("base", 0), + temperature: 0, + maxOutputTokens: 32, + }); + + const ids = ["A", "B", "C", "D", "E"]; + const jobs = ids.map((id) => + agent.generateText(id, { + tenantId: "default", + trafficPriority: "P1", + model: makeModel(id, 700), + }), + ); + + const settled = await Promise.allSettled(jobs); + console.log(`\n[done] maxObserved=${maxObserved}`); + console.log( + `[done] results=${safeStringify( + settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), + )}`, + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-fallback-chain.ts b/tmp/test/traffic-fallback-chain.ts new file mode 100644 index 000000000..0cd77b2ba --- /dev/null +++ b/tmp/test/traffic-fallback-chain.ts @@ -0,0 +1,168 @@ +// @ts-nocheck +/** + * Manual test: TrafficController circuit breaker + fallback chains. + * + * Scenarios: + * - Test 1: Open primary circuit (via repeated 429s), then route to fallback1. + * - Test 2: Open fallback1 circuit, then route to fallback2 (success). + * - Test 3: No fallback configured → CircuitBreakerOpenError. 
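+ *
+ * Circuit lifecycle (a sketch, based on the defaults visible in this branch:
+ * CIRCUIT_FAILURE_THRESHOLD=5 failures within a 10s window, 30s cooldown):
+ *
+ *   closed --(5+ failures in 10s)--> open --(30s cooldown)--> half-open
+ *   half-open admits one trial; success should close the circuit again,
+ *   another failure should reopen it.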
+ * + * Run: + * - pnpm ts-node tmp/test/traffic-fallback-chain.ts + * - VERBOSE=1 pnpm ts-node tmp/test/traffic-fallback-chain.ts + */ + +import { safeStringify } from "@voltagent/internal"; +import { MockLanguageModelV2, MockProviderV2 } from "ai/test"; +import { + Agent, + CircuitBreakerOpenError, + getTrafficController, +} from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); +const now = () => new Date().toISOString(); + +type ModelId = "primary" | "fallback1" | "fallback2" | "no-fallback"; + +const provider = "test-provider"; + +const controller = getTrafficController({ + maxConcurrent: 1, + fallbackChains: { + primary: ["fallback1", "fallback2"], + fallback1: ["fallback2"], + }, +}); + +function makeAlways429Model(modelId: ModelId) { + let attempts = 0; + return new MockLanguageModelV2({ + provider, + modelId, + doGenerate: async () => { + attempts += 1; + console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> 429`); + await sleep(25); + const err: any = new Error(`forced 429 for model=${modelId} attempt=${attempts}`); + err.status = 429; + throw err; + }, + }); +} + +function makeAlwaysOkModel(modelId: ModelId) { + let attempts = 0; + return new MockLanguageModelV2({ + provider, + modelId, + doGenerate: async () => { + attempts += 1; + console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> ok`); + await sleep(25); + return { + content: [{ type: "text", text: `ok:${modelId}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { modelId, headers: {} }, + }; + }, + }); +} + +const primaryModel = makeAlways429Model("primary"); +const fallback1Model = makeAlways429Model("fallback1"); +const fallback2Model = makeAlwaysOkModel("fallback2"); +const noFallbackModel = makeAlways429Model("no-fallback"); + +// Required so Agent fallbacks (string model IDs) resolve without network calls. +(globalThis as any).AI_SDK_DEFAULT_PROVIDER = new MockProviderV2({ + languageModels: { + primary: primaryModel, + fallback1: fallback1Model, + fallback2: fallback2Model, + "no-fallback": noFallbackModel, + }, +}); + +const primaryAgent = new Agent({ + name: "traffic-fallback-primary", + instructions: "echo", + model: primaryModel, + temperature: 0, + maxOutputTokens: 32, +}); + +const noFallbackAgent = new Agent({ + name: "traffic-fallback-none", + instructions: "echo", + model: noFallbackModel, + temperature: 0, + maxOutputTokens: 32, +}); + +async function runOnce(label: string, agent: any) { + console.log(`\n--- ${label} ---`); + try { + const result = await agent.generateText(label, { + tenantId: "default", + trafficPriority: "P1", + }); + console.log( + `[${label}] success text=${result.text} responseModel=${result.response?.modelId ?? "n/a"}`, + ); + } catch (err: any) { + if (err instanceof CircuitBreakerOpenError) { + console.log( + `[${label}] CircuitBreakerOpenError retryAfterMs=${err.retryAfterMs} msg=${err.message}`, + ); + } else { + console.log( + `[${label}] failed name=${err?.name ?? "Error"} status=${err?.status ?? err?.statusCode ?? 
"n/a"} msg=${err?.message}`, + ); + } + } +} + +async function main() { + console.log("\n=== Circuit breaker + fallback chain ==="); + void controller; + + console.log("\n[Test 1] Open primary circuit, then route to fallback1"); + // Two calls * (up to 3 retries each) ≈ 6 failures → should open the circuit (threshold=5). + await runOnce("primary-warmup-1", primaryAgent); + await runOnce("primary-warmup-2", primaryAgent); + await runOnce("primary-after-open", primaryAgent); // should execute fallback1 (still closed) + + console.log("\n[Test 2] Open fallback1 circuit, then route to fallback2"); + // Build enough failures on fallback1 by routing multiple requests to it via primary circuit-open path. + await runOnce("fallback1-warmup-1-via-primary", primaryAgent); + await runOnce("fallback1-warmup-2-via-primary", primaryAgent); + await runOnce("primary-should-hit-fallback2", primaryAgent); // should execute fallback2 and succeed + + console.log("\n[Test 3] No fallback configured → CircuitBreakerOpenError"); + await runOnce("no-fallback-warmup-1", noFallbackAgent); + await runOnce("no-fallback-warmup-2", noFallbackAgent); + await runOnce("no-fallback-after-open", noFallbackAgent); + + console.log("\n[debug] model call counts:"); + console.log( + safeStringify({ + primary: primaryModel.doGenerateCalls?.length, + fallback1: fallback1Model.doGenerateCalls?.length, + fallback2: fallback2Model.doGenerateCalls?.length, + "no-fallback": noFallbackModel.doGenerateCalls?.length, + }), + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-priority-openai-real.ts b/tmp/test/traffic-priority-openai-real.ts new file mode 100644 index 000000000..223263ba8 --- /dev/null +++ b/tmp/test/traffic-priority-openai-real.ts @@ -0,0 +1,117 @@ +// @ts-nocheck +/** + * Manual test: TrafficController + AI SDK with real OpenAI calls. + * + * What this exercises: + * - Priority scheduling (P0/P1/P2) with `maxConcurrent=1` + * - Rate limit header ingestion via `updateRateLimitFromHeaders()` (if headers are present) + * - Tenant usage aggregation via `extractUsage` + `getTenantUsage()` + * + * Prereqs: + * - Set `OPENAI_API_KEY` + * + * Run: + * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts + * - VERBOSE=1 OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts + * + * Notes: + * - This will make real network calls and may incur cost. + */ + +import { openai } from "@ai-sdk/openai"; +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const apiKey = process.env.OPENAI_API_KEY; +if (!apiKey) { + console.error("Missing OPENAI_API_KEY. Example:"); + console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts"); + process.exit(1); +} + +const _now = () => new Date().toISOString(); +const preview = (value: unknown, max = 140) => { + if (typeof value !== "string") return String(value ?? ""); + return value.length > max ? `${value.slice(0, max)}…` : value; +}; + +const tenantId = process.env.TENANT_ID ?? "openai-real"; +const defaultModelId = process.env.OPENAI_MODEL ?? 
"gpt-4o-mini"; + +const controller = getTrafficController({ maxConcurrent: 1 }); + +function getHeader(headers: any, name: string): string | undefined { + if (!headers) return undefined; + if (typeof headers.get === "function") { + const v = headers.get(name); + return v === null || v === undefined ? undefined : String(v); + } + const key = Object.keys(headers).find((k) => k.toLowerCase() === name.toLowerCase()); + if (!key) return undefined; + const v = headers[key]; + return v === null || v === undefined ? undefined : String(Array.isArray(v) ? v[0] : v); +} + +async function main() { + console.log( + `\n=== OpenAI real: priority scheduling (tenantId=${tenantId}, model=${defaultModelId}) ===`, + ); + void controller; + + const agent = new Agent({ + name: "openai-real-traffic", + instructions: "Reply exactly with the requested token.", + model: openai(defaultModelId), + temperature: 0, + maxOutputTokens: 32, + }); + + // Enqueue in reverse priority order; controller should still execute P0 first. + const p2 = agent.generateText("Reply with only: P2", { tenantId, trafficPriority: "P2" }); + const p1 = agent.generateText("Reply with only: P1", { tenantId, trafficPriority: "P1" }); + const p0 = agent.generateText("Reply with only: P0", { tenantId, trafficPriority: "P0" }); + + const settled = await Promise.allSettled([p0, p1, p2]); + for (const result of settled) { + if (result.status !== "fulfilled") { + console.log(`[result] rejected=${result.reason?.message ?? String(result.reason)}`); + continue; + } + + const headers = result.value.response?.headers; + const limit = getHeader(headers, "x-ratelimit-limit-requests"); + const remaining = getHeader(headers, "x-ratelimit-remaining-requests"); + const reset = getHeader(headers, "x-ratelimit-reset-requests"); + + console.log( + `[result] text=${preview(result.value.text)} finishReason=${result.value.finishReason} usage=${safeStringify(result.value.usage)}`, + ); + console.log( + `[result] ratelimitHeaders=${safeStringify({ + limit, + remaining, + reset, + })}`, + ); + } + + console.log( + `\n[done] settled=${safeStringify( + settled.map((s) => (s.status === "fulfilled" ? preview(s.value.text) : s.reason?.message)), + )}`, + ); + + console.log( + `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-priority-openai-sim.ts b/tmp/test/traffic-priority-openai-sim.ts new file mode 100644 index 000000000..a9ef2a368 --- /dev/null +++ b/tmp/test/traffic-priority-openai-sim.ts @@ -0,0 +1,111 @@ +// @ts-nocheck +/** + * Manual test: Agent → TrafficController priority scheduling (OpenAI-like stub models). + * + * This keeps the Agent + AI SDK path, but avoids real network calls by using stub models + * that pretend to be `provider="openai"` with modelIds like `gpt-4o`/`gpt-4o-mini`. + * + * Scenarios: + * - Test 1: P0 runs before P1/P2 when all runnable. + * - Test 2: P0 request (gpt-4o) is rate-limited → P1 (gpt-4o-mini) proceeds. 
+ * + * Run: + * - pnpm ts-node tmp/test/traffic-priority-openai-sim.ts + */ + +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); +const now = () => new Date().toISOString(); + +function makeOpenAIStubModel(modelId: string, delayMs: number) { + let calls = 0; + return { + specificationVersion: "v2", + provider: "openai", + modelId, + doGenerate: async () => { + calls += 1; + console.log(`[${now()}] [model] ${modelId} doGenerate call=${calls}`); + await sleep(delayMs); + return { + content: [{ type: "text", text: `ok:${modelId}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { modelId, headers: {} }, + }; + }, + }; +} + +const controller = getTrafficController({ maxConcurrent: 1 }); + +const modelMini = makeOpenAIStubModel("gpt-4o-mini", 80); +const modelBig = makeOpenAIStubModel("gpt-4o", 80); + +const agent = new Agent({ + name: "priority-openai-sim", + instructions: "echo", + model: modelMini, + temperature: 0, + maxOutputTokens: 32, +}); + +async function test1_priorityOrder() { + console.log("\n=== Test 1: P0 ordering via Agent ==="); + + const p2 = agent.generateText("P2", { trafficPriority: "P2", tenantId: "sim" }); + const p1 = agent.generateText("P1", { trafficPriority: "P1", tenantId: "sim" }); + const p0 = agent.generateText("P0", { trafficPriority: "P0", tenantId: "sim" }); + + const results = await Promise.all([p0, p1, p2]); + console.log(`[Test 1] results=${safeStringify(results.map((r) => r.text))}`); +} + +async function test2_p1RunsWhenP0RateLimited() { + console.log("\n=== Test 2: P1 proceeds when P0 is rate-limited ==="); + + // Seed remaining=0 for openai::gpt-4o so the P0 head item initially waits. + const applied = controller.updateRateLimitFromHeaders( + { provider: "openai", model: "gpt-4o" }, + { + "x-ratelimit-limit-requests": "1", + "x-ratelimit-remaining-requests": "0", + "x-ratelimit-reset-requests": "1s", + }, + ); + console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`); + + const p0Blocked = agent.generateText("P0 (gpt-4o, rate-limited)", { + trafficPriority: "P0", + tenantId: "sim", + model: modelBig, // per-call model override (new in this branch) + }); + + const p1Free = agent.generateText("P1 (gpt-4o-mini)", { + trafficPriority: "P1", + tenantId: "sim", + model: modelMini, + }); + + const [r0, r1] = await Promise.all([p0Blocked, p1Free]); + console.log(`[Test 2] p0 text=${r0.text}`); + console.log(`[Test 2] p1 text=${r1.text}`); +} + +async function main() { + await test1_priorityOrder(); + await test2_p1RunsWhenP0RateLimited(); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-priority.ts b/tmp/test/traffic-priority.ts new file mode 100644 index 000000000..28934051a --- /dev/null +++ b/tmp/test/traffic-priority.ts @@ -0,0 +1,156 @@ +// @ts-nocheck +/** + * Manual test: TrafficController priority scheduling. + * + * Scenarios: + * - Test 1: P0 should run before P1/P2 when runnable. + * - Test 2: If a P0 request is rate-limited, a lower priority (P1) can proceed. 
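+ *
+ * Drain-loop sketch (as implemented in this branch): the P0 queue head is
+ * inspected first; a head that must WAIT (e.g. rate-limited) does not block
+ * the pass, so runnable P1/P2 items are tried in the same drain cycle, while
+ * a SKIP (e.g. open circuit) rejects and removes the head before
+ * re-evaluating from P0.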
+ * + * Run: + * - pnpm ts-node tmp/test/traffic-priority.ts + * - VERBOSE=1 pnpm ts-node tmp/test/traffic-priority.ts + */ + +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); +const now = () => new Date().toISOString(); + +const controller = getTrafficController({ maxConcurrent: 1 }); + +function extractLabel(prompt: any): string { + if (!Array.isArray(prompt)) { + return "unknown"; + } + + for (let index = prompt.length - 1; index >= 0; index -= 1) { + const message = prompt[index]; + if (!message || message.role !== "user" || !Array.isArray(message.content)) { + continue; + } + + const textPart = message.content.find((part: any) => part?.type === "text"); + if (textPart?.text) { + return String(textPart.text); + } + } + + return "unknown"; +} + +function makeModel(provider: string, modelId: string, delayMs = 50) { + let calls = 0; + let lastStartAt = 0; + + return { + specificationVersion: "v2", + provider, + modelId, + doGenerate: async (options: any) => { + calls += 1; + const startAt = Date.now(); + const delta = lastStartAt ? startAt - lastStartAt : 0; + lastStartAt = startAt; + + const label = extractLabel(options?.prompt); + console.log( + `[${now()}] doGenerate start model=${provider}::${modelId} call=${calls} (+${delta}ms) input=${label}`, + ); + await sleep(delayMs); + console.log(`[${now()}] doGenerate end model=${provider}::${modelId} input=${label}`); + + return { + content: [{ type: "text", text: `ok:${label}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { modelId, headers: {} }, + }; + }, + }; +} + +async function test1_priorityOrder() { + console.log("\n=== Test 1: priority order (P0 before P1/P2) ==="); + + const sharedModel = makeModel("p", "shared-model", 50); + const agent = new Agent({ + name: "traffic-priority", + instructions: "echo", + model: sharedModel, + temperature: 0, + maxOutputTokens: 32, + }); + + // Enqueue in reverse order; scheduler should still run P0 first. + const p2 = agent.generateText("P2", { tenantId: "default", trafficPriority: "P2" }); + const p1 = agent.generateText("P1", { tenantId: "default", trafficPriority: "P1" }); + const p0 = agent.generateText("P0", { tenantId: "default", trafficPriority: "P0" }); + + const settled = await Promise.allSettled([p0, p1, p2]); + console.log( + `[Test 1] results=${safeStringify( + settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), + )}`, + ); +} + +async function test2_lowerPriorityWhenP0RateLimited() { + console.log("\n=== Test 2: P1 proceeds when P0 rate-limited ==="); + + const applied = controller.updateRateLimitFromHeaders( + { provider: "p0", model: "m0" }, + { + "x-ratelimit-limit-requests": "1", + "x-ratelimit-remaining-requests": "0", + "x-ratelimit-reset-requests": "1s", + }, + ); + console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`); + + const modelP0 = makeModel("p0", "m0", 50); + const modelP1 = makeModel("p1", "m1", 50); + const agent = new Agent({ + name: "traffic-priority-rate-limit", + instructions: "echo", + model: modelP1, + temperature: 0, + maxOutputTokens: 32, + }); + + // Now the next P0 request is at the head of the queue but rate-limited, + // so a runnable P1 request should execute first. 
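+  // Expected timing (approximate): "p1::m1" should start almost immediately,
+  // while "p0::m0" waits out the seeded 1s window plus a small probe delay
+  // (~50ms in this branch) before its doGenerate fires.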
+ const p0Blocked = agent.generateText("P0-blocked (rate limited)", { + tenantId: "default", + trafficPriority: "P0", + model: modelP0, + }); + const p1Free = agent.generateText("P1-free (should run first)", { + tenantId: "default", + trafficPriority: "P1", + model: modelP1, + }); + + const settled = await Promise.allSettled([p0Blocked, p1Free]); + console.log( + `[Test 2] results=${safeStringify( + settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), + )}`, + ); +} + +async function main() { + await test1_priorityOrder(); + await test2_lowerPriorityWhenP0RateLimited(); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-rate-limit-from-headers.ts b/tmp/test/traffic-rate-limit-from-headers.ts new file mode 100644 index 000000000..d82626611 --- /dev/null +++ b/tmp/test/traffic-rate-limit-from-headers.ts @@ -0,0 +1,158 @@ +// @ts-nocheck +/** + * Manual test: TrafficController dynamic rate limits from OpenAI response headers. + * + * This hits the real OpenAI model via Agent + AI SDK, and relies on the + * `x-ratelimit-*` response headers to seed/update the TrafficController. + * + * What to look for: + * - Each request prints the observed `x-ratelimit-*` headers (if present). + * - Agent should also log: "[Traffic] Applied rate limit from response headers". + * - With enough parallel requests, some requests may take longer due to controller throttling. + * + * Prereqs: + * - Set `OPENAI_API_KEY` + * + * Optional env: + * - `OPENAI_MODEL` (default: gpt-4o-mini) + * - `REQUESTS` (default: 10) + * - `MAX_CONCURRENT` (default: 50) + * - `TENANT_ID` (default: openai-rate-limit-headers) + * + * Run: + * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts + * - VERBOSE=1 OPENAI_API_KEY=... REQUESTS=30 pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts + */ + +import { openai } from "@ai-sdk/openai"; +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const now = () => new Date().toISOString(); + +const apiKey = process.env.OPENAI_API_KEY; +if (!apiKey) { + console.error("Missing OPENAI_API_KEY. Example:"); + console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts"); + process.exit(1); +} + +const provider = "openai"; +const modelId = process.env.OPENAI_MODEL ?? "gpt-4o-mini"; +const tenantId = process.env.TENANT_ID ?? "openai-rate-limit-headers"; +const requestCountRaw = Number(process.env.REQUESTS ?? "10"); +const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50"); +const requestCount = Number.isFinite(requestCountRaw) && requestCountRaw > 0 ? requestCountRaw : 10; +const maxConcurrent = + Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? maxConcurrentRaw : 50; + +const key = `${provider}::${modelId}`; +const controller = getTrafficController({ maxConcurrent }); + +function getHeader(headers: any, name: string): string | undefined { + if (!headers) return undefined; + if (typeof headers.get === "function") { + const v = headers.get(name); + return v === null || v === undefined ? 
undefined : String(v); + } + + const entries = Object.entries(headers as Record); + const target = name.toLowerCase(); + const match = entries.find(([k]) => String(k).toLowerCase() === target); + if (!match) return undefined; + + const value = match[1]; + if (Array.isArray(value)) { + const first = value[0]; + return first === null || first === undefined ? undefined : String(first); + } + + return value === null || value === undefined ? undefined : String(value); +} + +async function main() { + console.log( + `\n=== OpenAI rate limit headers → TrafficController (${key}, maxConcurrent=${maxConcurrent}, requests=${requestCount}) ===`, + ); + void controller; + + const agent = new Agent({ + name: "openai-rate-limit-from-headers", + instructions: "Reply with only the requested token.", + model: openai(modelId), + temperature: 0, + maxOutputTokens: 32, + }); + + console.log("\n[seed] Making one request to capture headers..."); + const seedStartedAt = Date.now(); + const seed = await agent.generateText("Reply with only: seed", { + tenantId, + trafficPriority: "P1", + }); + const seedElapsedMs = Date.now() - seedStartedAt; + + const seedHeaders = seed.response?.headers; + console.log(`[seed] done in ${seedElapsedMs}ms text=${seed.text}`); + console.log( + `[seed] x-ratelimit-*=${safeStringify({ + limit: getHeader(seedHeaders, "x-ratelimit-limit-requests"), + remaining: getHeader(seedHeaders, "x-ratelimit-remaining-requests"), + reset: getHeader(seedHeaders, "x-ratelimit-reset-requests"), + })}`, + ); + + console.log(`\n[burst] Scheduling ${requestCount} parallel requests...`); + const jobs = Array.from({ length: requestCount }, (_, idx) => { + const label = `req-${idx + 1}`; + const enqueuedAt = Date.now(); + console.log(`[${now()}] enqueue ${label}`); + + return agent + .generateText(`Reply with only: ${label}`, { tenantId, trafficPriority: "P1" }) + .then((result) => { + const elapsedMs = Date.now() - enqueuedAt; + const headers = result.response?.headers; + console.log( + `[${now()}] done ${label} in ${elapsedMs}ms text=${result.text} x-ratelimit-remaining=${getHeader( + headers, + "x-ratelimit-remaining-requests", + )}`, + ); + return { + label, + elapsedMs, + text: result.text, + headers: { + limit: getHeader(headers, "x-ratelimit-limit-requests"), + remaining: getHeader(headers, "x-ratelimit-remaining-requests"), + reset: getHeader(headers, "x-ratelimit-reset-requests"), + }, + }; + }) + .catch((error) => { + const elapsedMs = Date.now() - enqueuedAt; + console.log( + `[${now()}] failed ${label} in ${elapsedMs}ms name=${error?.name ?? "Error"} status=${error?.status ?? error?.statusCode ?? "n/a"} msg=${error?.message}`, + ); + throw error; + }); + }); + + const settled = await Promise.allSettled(jobs); + + console.log(`\n[done] settled=${safeStringify(settled.map((s) => s.status))}`); + console.log( + `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-rate-limit-static.ts b/tmp/test/traffic-rate-limit-static.ts new file mode 100644 index 000000000..d06427a3b --- /dev/null +++ b/tmp/test/traffic-rate-limit-static.ts @@ -0,0 +1,144 @@ +// @ts-nocheck +/** + * Manual test: TrafficController window-based rate limiting (simulated OpenAI headers). + * + * What to look for: + * - Requests should be paced out across the window (no steady "refill" math). 
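+ *
+ * Pacing sketch (derived from this branch's reservation logic): each admitted
+ * request advances nextAllowedAt by roughly
+ *
+ *   intervalMs = max(10, ceil(remainingWindowMs / effectiveRemaining))
+ *
+ * so with the defaults LIMIT=6 / WINDOW_MS=3000 the first starts should land
+ * about 3000/6 = 500ms apart rather than as a single burst.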
+ * + * Run: + * - pnpm ts-node tmp/test/traffic-rate-limit-static.ts + * - VERBOSE=1 pnpm ts-node tmp/test/traffic-rate-limit-static.ts + * + * Optional env: + * - LIMIT=6 WINDOW_MS=3000 pnpm ts-node tmp/test/traffic-rate-limit-static.ts + */ + +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); +const now = () => new Date().toISOString(); + +const provider = "sim"; +const model = "rate-limited-model"; +const key = `${provider}::${model}`; + +const controller = getTrafficController({ maxConcurrent: 50 }); + +const limit = Number(process.env.LIMIT ?? 6); +const windowMs = Number(process.env.WINDOW_MS ?? 3000); +let windowStartAt = Date.now(); +let windowResetAt = windowStartAt + windowMs; +let usedInWindow = 0; + +function extractLabel(prompt: any): string { + if (!Array.isArray(prompt)) { + return "unknown"; + } + + for (let index = prompt.length - 1; index >= 0; index -= 1) { + const message = prompt[index]; + if (!message || message.role !== "user" || !Array.isArray(message.content)) { + continue; + } + + const textPart = message.content.find((part: any) => part?.type === "text"); + if (textPart?.text) { + return String(textPart.text); + } + } + + return "unknown"; +} + +async function main() { + console.log( + `\n=== Window rate limit for ${key} (limit=${limit}, windowMs=${windowMs}, jobs=10) ===`, + ); + + const seeded = controller.updateRateLimitFromHeaders( + { provider, model }, + { + "x-ratelimit-limit-requests": String(limit), + "x-ratelimit-remaining-requests": String(limit), + "x-ratelimit-reset-requests": `${windowMs}ms`, + }, + ); + console.log(`[seed] updateRateLimitFromHeaders=${safeStringify(seeded)}`); + + let calls = 0; + let lastStartAt = 0; + const rateLimitedModel = { + specificationVersion: "v2", + provider, + modelId: model, + doGenerate: async (options: any) => { + const nowMs = Date.now(); + if (nowMs >= windowResetAt) { + windowStartAt = nowMs; + windowResetAt = windowStartAt + windowMs; + usedInWindow = 0; + } + + calls += 1; + usedInWindow += 1; + const startAt = Date.now(); + const delta = lastStartAt ? 
startAt - lastStartAt : 0; + lastStartAt = startAt; + + const label = extractLabel(options?.prompt); + console.log(`[${now()}] doGenerate start call=${calls} (+${delta}ms) input=${label}`); + await sleep(50); + console.log(`[${now()}] doGenerate end input=${label}`); + + const remainingAfterThis = Math.max(0, limit - usedInWindow); + const resetMs = Math.max(1, windowResetAt - Date.now()); + return { + content: [{ type: "text", text: `ok:${label}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { + modelId: model, + headers: { + "x-ratelimit-limit-requests": String(limit), + "x-ratelimit-remaining-requests": String(remainingAfterThis), + "x-ratelimit-reset-requests": `${resetMs}ms`, + }, + }, + }; + }, + }; + + const agent = new Agent({ + name: "traffic-rate-limit-static", + instructions: "echo", + model: rateLimitedModel, + temperature: 0, + maxOutputTokens: 32, + }); + + const jobs = Array.from({ length: 10 }, (_, idx) => + agent.generateText(`req-${idx + 1}`, { + tenantId: "default", + trafficPriority: "P1", + }), + ); + + const settled = await Promise.allSettled(jobs); + console.log( + `\n[done] results=${safeStringify( + settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), + )}`, + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-retry-behavior.ts b/tmp/test/traffic-retry-behavior.ts new file mode 100644 index 000000000..273af55ab --- /dev/null +++ b/tmp/test/traffic-retry-behavior.ts @@ -0,0 +1,169 @@ +// @ts-nocheck +/** + * Manual test: TrafficController retry behavior via Agent + AI SDK path (stub model). + * + * Scenarios included: + * - 5xx retries (up to 3 attempts) + * - 429 retries (up to 3 attempts) + * - timeout retries (up to 2 attempts) + * - non-retriable 4xx does not retry + * + * Run: + * - pnpm ts-node tmp/test/traffic-retry-behavior.ts + * + * Notes: + * - Uses a stub LanguageModel; no network calls. + * - Watch the `[model] attempt=...` logs to confirm retries. + */ + +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +type Scenario = + | "server-error" + | "rate-limit" + | "timeout" + | "bad-request" + | "forbidden" + // Variations to hit different retry-detection branches. + | "server-error-status-string" + | "server-error-statusCode" + | "server-error-response-status" + | "server-error-cause-status" + | "rate-limit-statusCode" + | "timeout-code-only" + | "timeout-name-only" + | "timeout-message-only" + // Variations that should STOP retrying (hit max attempts). 
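+  // These exhaust the retry budget on purpose: failCountBeforeSuccess=10 is
+  // beyond MAX_RETRY_ATTEMPTS=3 (429/5xx) and TIMEOUT_RETRY_ATTEMPTS=2.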
+ | "server-error-exceed-max" + | "timeout-exceed-max"; + +type RetryPlan = { + failCountBeforeSuccess: number; + status?: number | string; + statusCode?: number | string; + httpStatus?: number | string; + responseStatus?: number | string; + causeStatus?: number | string; + code?: string; + name?: string; + message?: string; +}; + +const plans: Record = { + "server-error": { failCountBeforeSuccess: 2, status: 500 }, + "rate-limit": { failCountBeforeSuccess: 2, status: 429 }, + timeout: { failCountBeforeSuccess: 1, status: 408, code: "ETIMEDOUT", message: "timeout" }, + "bad-request": { failCountBeforeSuccess: 10, status: 400 }, + forbidden: { failCountBeforeSuccess: 10, status: 403 }, + "server-error-status-string": { failCountBeforeSuccess: 2, status: "500" }, + "server-error-statusCode": { failCountBeforeSuccess: 2, statusCode: 502 }, + "server-error-response-status": { failCountBeforeSuccess: 2, responseStatus: 503 }, + "server-error-cause-status": { failCountBeforeSuccess: 2, causeStatus: 500 }, + "rate-limit-statusCode": { failCountBeforeSuccess: 2, statusCode: 429 }, + "timeout-code-only": { failCountBeforeSuccess: 1, code: "timeout" }, + "timeout-name-only": { failCountBeforeSuccess: 1, name: "TimeoutError" }, + "timeout-message-only": { failCountBeforeSuccess: 1, message: "this is a TIMEOUT" }, + "server-error-exceed-max": { failCountBeforeSuccess: 10, status: 500 }, + "timeout-exceed-max": { failCountBeforeSuccess: 10, message: "timeout" }, +}; + +function makeModel(modelId: string, plan: RetryPlan) { + let counter = 0; + let lastAttemptAt = 0; + + return { + specificationVersion: "v2", + provider: "retry-provider", + modelId, + doGenerate: async () => { + counter += 1; + const now = Date.now(); + const delta = lastAttemptAt ? now - lastAttemptAt : 0; + lastAttemptAt = now; + + console.log(`[model] modelId=${modelId} attempt=${counter} (+${delta}ms)`); + + if (counter <= plan.failCountBeforeSuccess) { + const err: any = new Error(plan.message ?? `forced failure ${counter} for ${modelId}`); + if (plan.status !== undefined) err.status = plan.status; + if (plan.statusCode !== undefined) err.statusCode = plan.statusCode; + if (plan.httpStatus !== undefined) err.httpStatus = plan.httpStatus; + if (plan.responseStatus !== undefined) err.response = { status: plan.responseStatus }; + if (plan.causeStatus !== undefined) err.cause = { status: plan.causeStatus }; + if (plan.code !== undefined) err.code = plan.code; + if (plan.name !== undefined) err.name = plan.name; + throw err; + } + + return { + content: [{ type: "text", text: "ok" }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { modelId, headers: {} }, + }; + }, + }; +} + +async function runScenario(name: Scenario) { + const plan = plans[name]; + const modelId = `retry-${name}`; + const model = makeModel(modelId, plan); + + const agent = new Agent({ + name: `RetryAgent-${name}`, + instructions: "echo", + model, + maxOutputTokens: 32, + temperature: 0, + }); + + console.log(`\n=== ${name} ===`); + try { + const result = await agent.generateText(name, { tenantId: "retry-test" }); + console.log(`[${name}] succeeded. text=${result.text}`); + } catch (err: any) { + console.log( + `[${name}] failed. status=${err?.status ?? err?.statusCode ?? err?.response?.status ?? "n/a"}`, + ); + } +} + +async function main() { + // Create controller early so all Agent calls share the same singleton. 
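+  // Assumption: getTrafficController is first-call-wins, i.e. later calls
+  // return the existing singleton, so maxConcurrent=1 set here governs every
+  // scenario below.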
+ getTrafficController({ maxConcurrent: 1 }); + + const runs: Scenario[] = [ + "server-error", + "rate-limit", + "timeout", + "bad-request", + "forbidden", + // Uncomment for additional coverage: + // "server-error-status-string", + // "server-error-statusCode", + // "server-error-response-status", + // "server-error-cause-status", + // "rate-limit-statusCode", + // "timeout-code-only", + // "timeout-name-only", + // "timeout-message-only", + // "server-error-exceed-max", + // "timeout-exceed-max", + ]; + + for (const name of runs) { + await runScenario(name); + } +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-tenant-usage.ts b/tmp/test/traffic-tenant-usage.ts new file mode 100644 index 000000000..801d7761c --- /dev/null +++ b/tmp/test/traffic-tenant-usage.ts @@ -0,0 +1,71 @@ +// @ts-nocheck +/** + * Manual test: Tenant usage aggregation (via Agent → TrafficController). + * + * What to look for: + * - `getTenantUsage(tenantId)` should increase after each agent call. + * + * Run: + * - pnpm ts-node tmp/test/traffic-tenant-usage.ts + */ + +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +function makeModel(modelId: string) { + return { + specificationVersion: "v2", + provider: "usage-provider", + modelId, + doGenerate: async () => { + return { + content: [{ type: "text", text: `ok:${modelId}` }], + finishReason: "stop", + usage: { inputTokens: 2, outputTokens: 3, totalTokens: 5 }, + warnings: [], + response: { modelId, headers: {} }, + }; + }, + }; +} + +const controller = getTrafficController({ maxConcurrent: 10 }); + +async function run(label: string, tenantId: string) { + const model = makeModel("tenant-usage-model"); + const agent = new Agent({ + name: `TenantUsageAgent-${label}`, + instructions: "echo", + model, + temperature: 0, + maxOutputTokens: 32, + }); + + console.log(`\n=== ${label} tenantId=${tenantId} ===`); + const result = await agent.generateText(`hello:${label}`, { tenantId }); + console.log(`[${label}] text=${result.text}`); + + const usage = controller.getTenantUsage(tenantId); + console.log(`[${label}] controller.getTenantUsage(${tenantId})=${safeStringify(usage)}`); +} + +async function main() { + await run("A1", "tenant-a"); + await run("A2", "tenant-a"); + await run("B1", "tenant-b"); + + console.log("\n=== Final usage snapshot ==="); + console.log(`tenant-a=${safeStringify(controller.getTenantUsage("tenant-a"))}`); + console.log(`tenant-b=${safeStringify(controller.getTenantUsage("tenant-b"))}`); + console.log(`default=${safeStringify(controller.getTenantUsage("default"))}`); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-text-vs-stream.ts b/tmp/test/traffic-text-vs-stream.ts new file mode 100644 index 000000000..41aa484d4 --- /dev/null +++ b/tmp/test/traffic-text-vs-stream.ts @@ -0,0 +1,128 @@ +// @ts-nocheck +/** + * Manual test: Text + stream traffic share the same TrafficController queue. + * + * What to look for: + * - Stream and text requests should respect the same maxConcurrent + priority rules. 
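+ *
+ * Context: handleText and handleStream both funnel into the same
+ * enqueue/drain path in this branch, so a stream occupies a concurrency slot
+ * exactly like a text request; the artificial 400ms delay in doStream below
+ * makes the slot occupancy visible.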
+ * + * Run: + * - pnpm ts-node tmp/test/traffic-text-vs-stream.ts + * - VERBOSE=1 pnpm ts-node tmp/test/traffic-text-vs-stream.ts + */ + +import { ReadableStream } from "node:stream/web"; +import { safeStringify } from "@voltagent/internal"; +import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); +const now = () => new Date().toISOString(); + +const controller = getTrafficController({ maxConcurrent: 1 }); + +function extractLabel(prompt: any): string { + if (!Array.isArray(prompt)) { + return "unknown"; + } + + for (let index = prompt.length - 1; index >= 0; index -= 1) { + const message = prompt[index]; + if (!message || message.role !== "user" || !Array.isArray(message.content)) { + continue; + } + + const textPart = message.content.find((part: any) => part?.type === "text"); + if (textPart?.text) { + return String(textPart.text); + } + } + + return "unknown"; +} + +async function main() { + console.log("\n=== Text vs Stream (shared scheduler) ==="); + void controller; + + const provider = "sim"; + const modelId = "shared-queue"; + + const model = { + specificationVersion: "v2", + provider, + modelId, + doGenerate: async (options: any) => { + const label = extractLabel(options?.prompt); + console.log(`[${now()}] doGenerate start input=${label}`); + await sleep(50); + console.log(`[${now()}] doGenerate end input=${label}`); + return { + content: [{ type: "text", text: `text:${label}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { modelId, headers: {} }, + }; + }, + doStream: async (options: any) => { + const label = extractLabel(options?.prompt); + console.log(`[${now()}] doStream start input=${label}`); + + // Hold the controller slot for a bit so ordering is visible. 
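+      // With maxConcurrent=1 this sleep keeps the only slot busy, so queued
+      // text work (P0 ahead of P1) cannot start until the stream releases it.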
+ await sleep(400); + + console.log(`[${now()}] doStream ready input=${label}`); + const streamId = `text-${label}`; + const text = `stream:${label}`; + + const stream = new ReadableStream({ + start(streamController) { + streamController.enqueue({ type: "stream-start", warnings: [] }); + streamController.enqueue({ type: "text-start", id: streamId }); + streamController.enqueue({ type: "text-delta", id: streamId, delta: text }); + streamController.enqueue({ type: "text-end", id: streamId }); + streamController.enqueue({ + type: "finish", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + finishReason: "stop", + }); + streamController.close(); + }, + }); + + return { stream, response: { headers: {} } }; + }, + }; + + const agent = new Agent({ + name: "traffic-text-vs-stream", + instructions: "echo", + model, + temperature: 0, + maxOutputTokens: 32, + }); + + const streamP1 = agent.streamText("S1", { tenantId: "default", trafficPriority: "P1" }); + const textP0 = agent.generateText("T0", { tenantId: "default", trafficPriority: "P0" }); + const textP1 = agent.generateText("T1", { tenantId: "default", trafficPriority: "P1" }); + + const [streamResult, t0, t1] = await Promise.all([streamP1, textP0, textP1]); + const streamText = await streamResult.text; + + console.log( + `\n[done] results=${safeStringify({ + streamText, + textP0: t0.text, + textP1: t1.text, + })}`, + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); From aa53fd5732324525245be66ab3b2b80a4d163ef0 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Sat, 13 Dec 2025 13:48:41 +0530 Subject: [PATCH 09/41] feat: final v0 --- .../core/src/traffic/traffic-controller.ts | 1434 ++++++----------- 1 file changed, 464 insertions(+), 970 deletions(-) diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 1cf76109b..e633792e8 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -1,77 +1,24 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ + import type { Logger } from "../logger"; import { LoggerProxy } from "../logger"; +/* ============================================================ + * Types + * ============================================================ + */ + type Scheduler = (callback: () => void) => void; + type BivariantHandler = { bivarianceHack(...args: TArgs): void; }["bivarianceHack"]; + type BivariantFunction = { bivarianceHack(...args: TArgs): TReturn; }["bivarianceHack"]; -type RetryReason = "rateLimit" | "serverError" | "timeout"; - -const MAX_RETRY_ATTEMPTS = 3; -const TIMEOUT_RETRY_ATTEMPTS = 2; -const RATE_LIMIT_BASE_BACKOFF_MS = 500; -const CIRCUIT_FAILURE_THRESHOLD = 5; -const CIRCUIT_FAILURE_WINDOW_MS = 10_000; -const CIRCUIT_COOLDOWN_MS = 30_000; -const SERVER_ERROR_BASE_BACKOFF_MS = 1000; -const TIMEOUT_BASE_BACKOFF_MS = 750; -const RATE_LIMIT_JITTER_FACTOR = 0.35; -const SERVER_ERROR_JITTER_FACTOR = 0.8; -const TIMEOUT_JITTER_FACTOR = 0.5; -const RATE_LIMIT_EXHAUSTION_BUFFER = 1; -const RATE_LIMIT_PROBE_DELAY_MS = 50; -const RATE_LIMIT_MIN_PACE_INTERVAL_MS = 10; -const RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS = 10; -const DEFAULT_FALLBACK_CHAINS: Record = { - "gpt-4o": ["gpt-4o-mini", "gpt-3.5"], -}; - -type RateLimitWindowState = { - limit: number; - remaining: number; - resetAt: number; - reserved: number; - nextAllowedAt: number; -}; - -export interface RateLimitOptions { - capacity: number; - refillPerSecond: number; -} 
- -export type TenantUsage = { - inputTokens: number; - outputTokens: number; - totalTokens: number; -}; - -type UsageCounters = { - inputTokens?: number; - outputTokens?: number; - totalTokens?: number; -}; - -export type RateLimitKey = string; -export type RateLimitConfig = Record; - -type RateLimitHeaderSnapshot = { - limitRequests: number; - remainingRequests: number; - resetRequestsMs: number; -}; - -export type RateLimitUpdateResult = { - key: string; - headerSnapshot: RateLimitHeaderSnapshot; - state: RateLimitWindowState; -}; - export type TrafficRequestType = "text" | "stream"; - export type TrafficPriority = "P0" | "P1" | "P2"; export interface TrafficRequestMetadata { @@ -94,31 +41,22 @@ export interface TrafficRequest { >; } -type CircuitStateStatus = "closed" | "open" | "half-open"; - -interface CircuitState { - status: CircuitStateStatus; - failureTimestamps: number[]; - openedAt?: number; - trialInFlight?: boolean; -} - interface QueuedRequest { type: TrafficRequestType; request: TrafficRequest; resolve: BivariantHandler<[TResponse | PromiseLike]>; reject: BivariantHandler<[reason?: unknown]>; - etaMs?: number; + attempt: number; + priority: TrafficPriority; + tenantId: string; + rateLimitKey?: string; - attempt?: number; + etaMs?: number; + circuitKey?: string; circuitStatus?: CircuitStateStatus; - priority: TrafficPriority; - tenantId: string; - extractUsage?: BivariantFunction< - [response: TResponse], - Promise | UsageCounters | undefined - >; + + extractUsage?: TrafficRequest["extractUsage"]; } export interface TrafficControllerOptions { @@ -128,58 +66,153 @@ export interface TrafficControllerOptions { fallbackChains?: Record; } -type ProcessDecision = "process" | "skip" | "wait"; +/* ============================================================ + * Rate limiting + * ============================================================ + */ + +export interface RateLimitOptions { + capacity: number; + refillPerSecond: number; +} + +export type RateLimitKey = string; +export type RateLimitConfig = Record; + +interface RateLimitWindowState { + limit: number; + remaining: number; + resetAt: number; + reserved: number; + nextAllowedAt: number; +} + +/* ============================================================ + * Circuit breaker + * ============================================================ + */ + +type CircuitStateStatus = "closed" | "open" | "half-open"; + +interface CircuitState { + status: CircuitStateStatus; + failureTimestamps: number[]; + openedAt?: number; + trialInFlight?: boolean; +} + +/* ============================================================ + * Usage + * ============================================================ + */ + +export type TenantUsage = { + inputTokens: number; + outputTokens: number; + totalTokens: number; +}; + +type UsageCounters = { + inputTokens?: number; + outputTokens?: number; + totalTokens?: number; +}; + +/* ============================================================ + * Internal constants + * ============================================================ + */ + +type RetryReason = "rateLimit" | "serverError" | "timeout"; + +const MAX_RETRY_ATTEMPTS = 3; +const TIMEOUT_RETRY_ATTEMPTS = 2; + +const RATE_LIMIT_BASE_BACKOFF_MS = 500; +const SERVER_ERROR_BASE_BACKOFF_MS = 1000; +const TIMEOUT_BASE_BACKOFF_MS = 750; + +const RATE_LIMIT_JITTER_FACTOR = 0.35; +const SERVER_ERROR_JITTER_FACTOR = 0.8; +const TIMEOUT_JITTER_FACTOR = 0.5; + +const CIRCUIT_FAILURE_THRESHOLD = 5; +const CIRCUIT_FAILURE_WINDOW_MS = 10_000; +const CIRCUIT_COOLDOWN_MS = 
30_000; + +const RATE_LIMIT_EXHAUSTION_BUFFER = 1; +const RATE_LIMIT_PROBE_DELAY_MS = 50; +const RATE_LIMIT_MIN_PACE_INTERVAL_MS = 10; +const RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS = 10; + +const DEFAULT_FALLBACK_CHAINS: Record = { + "gpt-4o": ["gpt-4o-mini", "gpt-3.5"], +}; + +/* ============================================================ + * Dispatch decisions + * ============================================================ + */ + +type DispatchDecision = + | { kind: "dispatch" } + | { kind: "skip" } + | { kind: "wait"; wakeUpAt?: number }; + +/* ============================================================ + * Traffic Controller + * ============================================================ + */ -// Centralized traffic controller responsible for scheduling LLM calls. -// Provides a FIFO queue with a non-blocking scheduler and entrypoints -// for text and stream traffic. export class TrafficController { + /* ---------- Core ---------- */ + private readonly scheduler: Scheduler; private readonly maxConcurrent: number; - private readonly rateLimitStates = new Map(); - private readonly circuitBreakers = new Map(); - private readonly fallbackChains: Map; - private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"]; + private readonly logger: Logger; + private readonly queues: Record = { P0: [], P1: [], P2: [], }; + private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"]; + private activeCount = 0; private drainScheduled = false; + + /* ---------- Rate limits ---------- */ + + private readonly rateLimitStates = new Map(); private wakeUpTimeout?: ReturnType; private wakeUpAt?: number; - private readonly tenantUsage = new Map(); - private readonly logger: Logger; - private logDebug(message: string, details?: Record): void { - if (typeof console?.debug === "function") { - console.debug(message, details); - } - } + /* ---------- Circuit breakers ---------- */ + + private readonly circuitBreakers = new Map(); + private readonly fallbackChains: Map; + + /* ---------- Usage ---------- */ + + private readonly tenantUsage = new Map(); constructor(options: TrafficControllerOptions = {}) { this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; - this.fallbackChains = this.normalizeFallbackChains(options.fallbackChains); this.scheduler = this.createScheduler(); - - // NEW LOGGER (from c2 commit) + this.fallbackChains = this.normalizeFallbackChains(options.fallbackChains); this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); - - // INIT LOG (from HEAD) — rewritten to use the new logger - this.logger.debug("[TrafficController] init", { - maxConcurrent: this.maxConcurrent, - rateLimitKeys: Array.from(this.rateLimitStates.keys()), - }); } + /* ============================================================ + * Public API + * ============================================================ + */ + handleText(request: TrafficRequest): Promise { - // Route text generation requests into the queue so all LLM calls share the same scheduler return this.enqueue("text", request); } handleStream(request: TrafficRequest): Promise { - // Route streaming requests through the same queue to preserve ordering/backpressure rules return this.enqueue("stream", request); } @@ -188,29 +221,22 @@ export class TrafficController { return usage ? 
{ ...usage } : undefined; } - private createScheduler(): Scheduler { - // Prefer queueMicrotask to keep the drain loop snappy without starving the event loop - if (typeof queueMicrotask === "function") { - return queueMicrotask; - } + /* ============================================================ + * Scheduler & Queue + * ============================================================ + */ - return (callback: () => void) => setTimeout(callback, 0); + private createScheduler(): Scheduler { + return typeof queueMicrotask === "function" ? queueMicrotask : (cb) => setTimeout(cb, 0); } private enqueue( type: TrafficRequestType, request: TrafficRequest, ): Promise { - // Each request gets a promise so callers can await their own result - return new Promise((resolve, reject) => { + return new Promise((resolve, reject) => { const priority = this.resolvePriority(request.metadata); - this.logger.debug("Enqueuing LLM request", { - tenantId: request.tenantId, - type, - priority, - }); - // Collect the work item and metadata - this.getQueue(priority).push({ + this.queues[priority].push({ type, request, resolve, @@ -220,692 +246,335 @@ export class TrafficController { tenantId: request.tenantId, extractUsage: request.extractUsage, }); - - this.logDebug("[TrafficController] enqueue", { - type, - queueSize: this.getQueueSize(), - metadata: request.metadata, - }); - - // Kick the drain loop to start handling work this.scheduleDrain(); }); } private scheduleDrain(): void { - if (this.drainScheduled) { - return; - } + if (this.drainScheduled) return; + this.drainScheduled = true; - this.drainScheduled = true; // Prevent redundant scheduling when many requests arrive at once - this.logDebug("[TrafficController] scheduleDrain", { queueSize: this.getQueueSize() }); this.scheduler(() => { this.drainScheduled = false; - this.logDebug("[TrafficController] drainLoopStart", { - queueSize: this.getQueueSize(), - active: this.activeCount, - }); - this.drainQueue(); // Drain asynchronously so we never block the caller's tick + this.drainQueue(); }); } private drainQueue(): void { - // Pull as many items as we can until we hit capacity or rate limits - while (this.hasQueuedWork()) { - if (this.activeCount >= this.maxConcurrent) { + while (true) { + const decision = this.tryDispatchNext(); + if (decision.kind === "dispatch" || decision.kind === "skip") continue; + if (decision.kind === "wait") { + if (decision.wakeUpAt) this.scheduleRateLimitWakeUpAt(decision.wakeUpAt); return; } - - let selected: { item: QueuedRequest; priority: TrafficPriority } | undefined; - let skippedItem = false; - - for (const priority of this.priorityOrder) { - const queue = this.getQueue(priority); - if (queue.length === 0) { - continue; - } - - const candidate = queue[0]; - const decision = this.getProcessDecision(candidate); - if (decision === "process") { - selected = { item: candidate, priority }; - break; - } - - if (decision === "skip") { - queue.shift(); // Remove rejected item - skippedItem = true; - break; // Re-evaluate from highest priority after removing - } - - // If wait, try lower priorities in the same drain cycle - } - - if (selected) { - const { item, priority } = selected; - this.getQueue(priority).shift(); - this.activeCount++; // Track in-flight work to enforce concurrency guard - this.markCircuitTrial(item); // Reserve the half-open trial slot if needed - - void this.runRequest(item); // Fire off processing without blocking the loop - continue; - } - - if (skippedItem) { - continue; // We removed a blocked item; re-evaluate 
queues - } - - // No runnable work right now; exit until capacity/rate-limit changes return; } } - private getProcessDecision(next: QueuedRequest): ProcessDecision { - const circuitDecision = this.evaluateCircuitBreaker(next); - if (circuitDecision !== "process") { - return circuitDecision; - } - - if (this.activeCount >= this.maxConcurrent) { - this.logDebug("[TrafficController] throttle concurrency", { - active: this.activeCount, - maxConcurrent: this.maxConcurrent, - }); - return "wait"; - } - - const rateLimitState = this.getRateLimitState(next.request.metadata); - if (!rateLimitState) { - this.logDebug("[TrafficController] no rate limit match", { - metadata: next.request.metadata, - }); - next.rateLimitKey = undefined; - next.etaMs = 0; - return "process"; // No rate limit configured for this key - } - - const { key, state } = rateLimitState; - const now = Date.now(); - const effectiveRemaining = Math.max(0, state.remaining - state.reserved); - const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; - - if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { - next.rateLimitKey = key; - next.etaMs = Math.max(0, probeAt - now); - this.logDebug("[TrafficController] throttle rate", { - key, - remaining: state.remaining, - reserved: state.reserved, - resetAt: state.resetAt, - nextAllowedAt: state.nextAllowedAt, - effectiveRemaining, - etaMs: next.etaMs, - }); - - if (now < probeAt) { - this.scheduleRateLimitWakeUpAt(probeAt); - return "wait"; - } - - // Window has expired, but we have not observed a newer header snapshot yet. - // Allow a single probe request (no in-flight reservations) to refresh headers. - if (state.reserved > 0) { - return "wait"; - } - } + /* ============================================================ + * Dispatch + * ============================================================ + */ - if (now < state.nextAllowedAt) { - next.rateLimitKey = key; - next.etaMs = Math.max(0, state.nextAllowedAt - now); - this.logDebug("[TrafficController] throttle rate", { - key, - remaining: state.remaining, - reserved: state.reserved, - resetAt: state.resetAt, - nextAllowedAt: state.nextAllowedAt, - effectiveRemaining, - etaMs: next.etaMs, - }); - this.scheduleRateLimitWakeUpAt(Math.min(state.resetAt, state.nextAllowedAt)); - return "wait"; - } + private tryDispatchNext(): DispatchDecision { + const next = this.peekNext(); + if (!next) return { kind: "wait" }; + if (this.activeCount >= this.maxConcurrent) return { kind: "wait" }; - // Allow request: reserve one slot until we receive headers (or completion). 
- state.reserved += 1; - next.rateLimitKey = key; - next.etaMs = 0; + const circuit = this.resolveCircuit(next); + if (circuit) return circuit; - const remainingWindowMs = Math.max(0, state.resetAt - now); - const intervalMs = Math.max( - RATE_LIMIT_MIN_PACE_INTERVAL_MS, - Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), - ); - const candidateNextAllowedAt = Math.max(state.nextAllowedAt, now + intervalMs); - const shouldUpdateNextAllowedAt = - state.nextAllowedAt <= now || - candidateNextAllowedAt >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS; - if (shouldUpdateNextAllowedAt) { - state.nextAllowedAt = candidateNextAllowedAt; - } + const rateLimit = this.resolveRateLimit(next); + if (rateLimit) return rateLimit; - this.logDebug("[TrafficController] rate limit reserved", { - key, - remaining: state.remaining, - reserved: state.reserved, - resetAt: state.resetAt, - nextAllowedAt: state.nextAllowedAt, - intervalMs, - effectiveRemaining, - }); - return "process"; + this.startRequest(next); + return { kind: "dispatch" }; } - private getRateLimitState( - metadata?: TrafficRequestMetadata, - ): { key: string; state: RateLimitWindowState } | undefined { - const key = this.buildRateLimitKey(metadata); - const state = this.rateLimitStates.get(key); - if (!state) { - return undefined; + private peekNext(): QueuedRequest | undefined { + for (const p of this.priorityOrder) { + if (this.queues[p].length > 0) return this.queues[p][0]; } - - this.logDebug("[TrafficController] rateLimitState hit", { key }); - return { key, state }; - } - - private evaluateCircuitBreaker(next: QueuedRequest): ProcessDecision { - return this.evaluateCircuitBreakerForRequest(next, new Set()); - } - - private evaluateCircuitBreakerForRequest( - next: QueuedRequest, - visitedModels: Set, - ): ProcessDecision { - const key = this.buildRateLimitKey(next.request.metadata); - next.circuitKey = key; - - const currentModel = next.request.metadata?.model; - if (currentModel) { - visitedModels.add(currentModel); - } - - const evaluation = this.evaluateCircuitState(key); - next.circuitStatus = evaluation.state; - - if (evaluation.allowRequest) { - return "process"; - } - - const fallbackModel = this.findFallbackModel(next.request.metadata, visitedModels); - if (fallbackModel && next.request.createFallbackRequest) { - const fallbackRequest = next.request.createFallbackRequest(fallbackModel); - if (fallbackRequest) { - this.logger.warn("Circuit open; attempting fallback model", { - fromModel: currentModel, - fallbackModel, - provider: next.request.metadata?.provider, - }); - next.request = fallbackRequest; - next.attempt = 1; - next.rateLimitKey = undefined; - next.etaMs = undefined; - next.circuitKey = undefined; - next.circuitStatus = undefined; - return this.evaluateCircuitBreakerForRequest(next, visitedModels); - } - } - - const retryAfterMs = evaluation.retryAfterMs ?? 
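The evaluation being linearized here refuses work with a retryAfterMs hint while a circuit is open. A numeric sketch of the cooldown gate on its own (assumes the 30s CIRCUIT_COOLDOWN_MS from this patch):

interface CircuitSnapshot {
  status: "closed" | "open" | "half-open";
  openedAt?: number;
  trialInFlight?: boolean;
}

const COOLDOWN_MS = 30_000;

function allowRequest(circuit: CircuitSnapshot, now: number): boolean {
  if (circuit.status === "open") {
    const elapsed = now - (circuit.openedAt ?? now);
    if (elapsed < COOLDOWN_MS) return false; // still cooling down
    circuit.status = "half-open"; // cooldown elapsed: permit exactly one trial request
    circuit.trialInFlight = false;
  }
  return !(circuit.status === "half-open" && circuit.trialInFlight);
}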
CIRCUIT_COOLDOWN_MS; - this.logger.warn("Circuit open; rejecting request", { - circuitKey: key, - retryAfterMs, - metadata: next.request.metadata, - }); - next.reject( - new CircuitBreakerOpenError( - `Circuit open for ${key}; retry after ${retryAfterMs}ms`, - next.request.metadata, - retryAfterMs, - ), - ); - return "skip"; + return undefined; } - private evaluateCircuitState(key: string): { - allowRequest: boolean; - state: CircuitStateStatus; - retryAfterMs?: number; - } { - const state = this.circuitBreakers.get(key); - if (!state) { - return { allowRequest: true, state: "closed" }; - } - - const now = Date.now(); - - if (state.status === "open") { - const elapsed = state.openedAt ? now - state.openedAt : 0; - if (elapsed >= CIRCUIT_COOLDOWN_MS) { - state.status = "half-open"; - state.trialInFlight = false; - state.failureTimestamps = []; - this.circuitBreakers.set(key, state); - return { allowRequest: true, state: state.status }; - } - return { - allowRequest: false, - state: state.status, - retryAfterMs: Math.max(0, CIRCUIT_COOLDOWN_MS - elapsed), - }; - } - - if (state.status === "half-open") { - if (state.trialInFlight) { - return { allowRequest: false, state: state.status }; - } - return { allowRequest: true, state: state.status }; - } - - return { allowRequest: true, state: state.status }; + private startRequest(item: QueuedRequest): void { + this.queues[item.priority].shift(); + this.activeCount++; + this.markCircuitTrial(item); + void this.executeRequest(item); } - private findFallbackModel( - metadata: TrafficRequestMetadata | undefined, - visitedModels: Set, - ): string | undefined { - const currentModel = metadata?.model; - if (!currentModel) { - return undefined; - } - - const chain = this.fallbackChains.get(currentModel); - if (!chain) { - return undefined; - } + /* ============================================================ + * Execution + * ============================================================ + */ - const provider = metadata?.provider; - for (const candidate of chain) { - if (visitedModels.has(candidate)) { - continue; - } + private async executeRequest(item: QueuedRequest): Promise { + try { + const result = await item.request.execute(); + this.recordCircuitSuccess(item.request.metadata); + this.recordUsage(item, result); + item.resolve(result); + } catch (error) { + this.recordCircuitFailure(item.request.metadata, error); - const candidateKey = this.buildRateLimitKey({ provider, model: candidate }); - const evaluation = this.evaluateCircuitState(candidateKey); - if (evaluation.allowRequest) { - visitedModels.add(candidate); - return candidate; + const retry = this.buildRetryPlan(error, item.attempt); + if (retry) { + this.scheduleRetry(item, retry); + } else { + item.reject(error); } + } finally { + this.releaseRateLimitReservation(item.rateLimitKey); + this.activeCount = Math.max(0, this.activeCount - 1); + this.scheduleDrain(); } - - return undefined; } - private markCircuitTrial(next: QueuedRequest): void { - const key = next.circuitKey; - if (!key) { - return; - } + /* ============================================================ + * Retry logic + * ============================================================ + */ - const state = this.circuitBreakers.get(key); - if (state && state.status === "half-open" && !state.trialInFlight) { - state.trialInFlight = true; - this.circuitBreakers.set(key, state); - } + private scheduleRetry( + item: QueuedRequest, + plan: { delayMs: number; reason: RetryReason }, + ): void { + setTimeout(() => { + 
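The retry path re-enqueues a copy of the item rather than re-executing inline, so a retried request waits its turn behind newer work. A reduced sketch of that shape (generic over the queued item; clearing of stale keys happens in the spread shown next):

function requeueAfter<T extends { attempt: number }>(
  queue: T[],
  item: T,
  delayMs: number,
  kickDrain: () => void,
): void {
  setTimeout(() => {
    queue.push({ ...item, attempt: item.attempt + 1 }); // next attempt re-enters FIFO order
    kickDrain(); // the drain loop, not the timer, decides when it actually runs
  }, delayMs);
}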
this.queues[item.priority].push({ + ...item, + attempt: item.attempt + 1, + rateLimitKey: undefined, + etaMs: undefined, + circuitKey: undefined, + circuitStatus: undefined, + }); + this.scheduleDrain(); + }, plan.delayMs); } - private normalizeFallbackChains( - fallbackChains?: Record, - ): Map { - const configuredChains = fallbackChains ?? DEFAULT_FALLBACK_CHAINS; - const normalized = new Map(); - - for (const [model, chain] of Object.entries(configuredChains)) { - if (Array.isArray(chain) && chain.length > 0) { - normalized.set(model, [...chain]); - } - } + private buildRetryPlan( + error: unknown, + attempt: number, + ): { delayMs: number; reason: RetryReason } | undefined { + const reason = this.getRetryReason(error); + if (!reason) return undefined; - return normalized; - } + const max = reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS; + if (attempt >= max) return undefined; - private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { - const provider = metadata?.provider ?? "default-provider"; - const model = metadata?.model ?? "default-model"; - return `${provider}::${model}`; + return { + reason, + delayMs: this.computeBackoffDelay(reason, attempt), + }; } - /** - * Update (or bootstrap) rate limit window state based on provider response headers. - * This lets the controller adopt server-issued limits without static config. + /* ============================================================ + * Rate limiting (verbatim logic) + * ============================================================ */ - updateRateLimitFromHeaders( - metadata: TrafficRequestMetadata | undefined, - headers: unknown, - ): RateLimitUpdateResult | undefined { - const headerInfo = this.extractRateLimitHeaders(headers); - if (!headerInfo) { - this.logDebug("[TrafficController] no rate limit headers found on response", { - metadata, - }); - return undefined; - } - const key = this.buildRateLimitKey(metadata); - const now = Date.now(); - const limit = headerInfo.limitRequests; - const remaining = this.coerceRemaining(headerInfo.remainingRequests, limit); - if (remaining === undefined) { - this.logDebug("[TrafficController] rate limit headers present but invalid", { - headerInfo, - }); - return undefined; - } - - const existing = this.rateLimitStates.get(key); - const resetAtCandidate = now + headerInfo.resetRequestsMs; - const resetAt = existing ? Math.max(existing.resetAt, resetAtCandidate) : resetAtCandidate; - const reserved = Math.max(0, existing?.reserved ?? 0); - const remainingFromHeaders = Math.min(limit, remaining); - const isSameWindow = Boolean(existing && now < existing.resetAt); - const nextRemaining = isSameWindow - ? Math.min(existing?.remaining ?? remainingFromHeaders, remainingFromHeaders) - : remainingFromHeaders; - const effectiveRemaining = Math.max(0, nextRemaining - reserved); - const state: RateLimitWindowState = { - limit, - remaining: nextRemaining, - resetAt, - reserved, - nextAllowedAt: existing?.nextAllowedAt ?? now, - }; - - this.rateLimitStates.set(key, state); - - this.logDebug("[TrafficController] rateLimit updated from headers", { - key, - limit: state.limit, - remaining: state.remaining, - reserved: state.reserved, - effectiveRemaining, - resetAt: state.resetAt, - nextAllowedAt: state.nextAllowedAt, - }); - - // Try draining again in case this update unblocks queued work. 
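Header-driven updates let the controller bootstrap windows without static configuration. A trimmed sketch of the snapshot-to-state step (header field names as used in this patch; the same-window merging and reservation carry-over of the real code are elided):

interface HeaderSnapshot {
  limitRequests: number;
  remainingRequests: number;
  resetRequestsMs: number;
}

function toWindowState(snapshot: HeaderSnapshot, now: number) {
  return {
    limit: snapshot.limitRequests,
    remaining: Math.min(snapshot.limitRequests, snapshot.remainingRequests),
    resetAt: now + snapshot.resetRequestsMs, // later snapshots only push this forward
    reserved: 0, // preserved from the prior state in the real code
    nextAllowedAt: now,
  };
}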
- this.scheduleDrain(); + private resolveRateLimit(next: QueuedRequest): DispatchDecision | null { + const key = this.buildRateLimitKey(next.request.metadata); + const state = this.rateLimitStates.get(key); + if (!state) return null; - return { - key, - headerSnapshot: headerInfo, - state, - }; - } + const now = Date.now(); + const effectiveRemaining = Math.max(0, state.remaining - state.reserved); + const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; - private extractRateLimitHeaders(headers: unknown): RateLimitHeaderSnapshot | undefined { - const getHeader = this.createHeaderLookup(headers); - if (!getHeader) { - return undefined; + if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { + if (now < probeAt) { + return { kind: "wait", wakeUpAt: probeAt }; + } + if (state.reserved > 0) { + return { kind: "wait" }; + } } - const limitRequests = this.parseNumberHeader(getHeader, "x-ratelimit-limit-requests"); - const remainingRequests = this.parseNumberHeader(getHeader, "x-ratelimit-remaining-requests"); - const resetRequestsMs = this.parseDurationHeaderToMs(getHeader, "x-ratelimit-reset-requests"); - - if ( - limitRequests === undefined || - limitRequests <= 0 || - remainingRequests === undefined || - resetRequestsMs === undefined || - resetRequestsMs <= 0 - ) { - return undefined; + if (now < state.nextAllowedAt) { + return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; } - return { - limitRequests, - remainingRequests, - resetRequestsMs, - }; - } + state.reserved += 1; + next.rateLimitKey = key; - private coerceRemaining(remaining: number | undefined, capacity: number): number | undefined { - if (remaining === undefined) { - return undefined; - } + const remainingWindowMs = Math.max(0, state.resetAt - now); + const intervalMs = Math.max( + RATE_LIMIT_MIN_PACE_INTERVAL_MS, + Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), + ); - const parsed = Number(remaining); - if (!Number.isFinite(parsed)) { - return undefined; + const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); + if ( + state.nextAllowedAt <= now || + candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS + ) { + state.nextAllowedAt = candidateNext; } - return Math.max(0, Math.min(capacity, Math.floor(parsed))); + return null; } - private createHeaderLookup(headers: unknown): ((name: string) => string | undefined) | undefined { - if (!headers) { - return undefined; - } + private scheduleRateLimitWakeUpAt(wakeUpAt: number): void { + const now = Date.now(); + const target = Math.max(now, wakeUpAt); - const maybeHeaders = headers as { get?: (name: string) => unknown }; - if (typeof maybeHeaders?.get === "function") { - return (name: string) => { - const value = maybeHeaders.get?.(name); - return value === undefined || value === null ? undefined : String(value); - }; + if (this.wakeUpTimeout && this.wakeUpAt !== undefined && this.wakeUpAt <= target) { + return; } - if (typeof headers === "object") { - const entries = Object.entries(headers as Record); - if (entries.length === 0) { - return undefined; - } - - return (name: string) => { - const target = name.toLowerCase(); - for (const [key, value] of entries) { - if (typeof key === "string" && key.toLowerCase() === target) { - if (Array.isArray(value)) { - const first = value[0]; - return first === undefined || first === null ? undefined : String(first); - } - return value === undefined || value === null ? 
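The pacing step in resolveRateLimit above spreads the remaining budget evenly across the remaining window. Worked numbers under assumed inputs:

const MIN_PACE_INTERVAL_MS = 10;

function paceIntervalMs(remainingWindowMs: number, effectiveRemaining: number): number {
  return Math.max(
    MIN_PACE_INTERVAL_MS,
    Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)),
  );
}

paceIntervalMs(10_000, 20); // 500: one request every 500ms drains 20 slots in 10s
paceIntervalMs(10_000, 1);  // 10_000: the last slot is held until the window closes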
undefined : String(value); - } - } - return undefined; - }; - } + if (this.wakeUpTimeout) clearTimeout(this.wakeUpTimeout); - return undefined; + this.wakeUpAt = target; + this.wakeUpTimeout = setTimeout( + () => { + this.wakeUpTimeout = undefined; + this.wakeUpAt = undefined; + this.scheduleDrain(); + }, + Math.max(1, target - now), + ); } - private parseNumberHeader( - getHeader: (name: string) => string | undefined, - name: string, - ): number | undefined { - const raw = getHeader(name); - if (raw === undefined) { - return undefined; - } - - const parsed = Number(raw); - return Number.isFinite(parsed) ? parsed : undefined; + private releaseRateLimitReservation(key?: string): void { + if (!key) return; + const state = this.rateLimitStates.get(key); + if (!state || state.reserved <= 0) return; + state.reserved -= 1; } - private parseDurationHeaderToMs( - getHeader: (name: string) => string | undefined, - name: string, - ): number | undefined { - const raw = getHeader(name); - if (!raw) { - return undefined; - } - - const trimmed = raw.trim(); - if (!trimmed) { - return undefined; - } - - const simpleMatch = trimmed.match(/^(\d+(?:\.\d+)?)(ms|s|m|h)?$/i); - if (simpleMatch) { - const value = Number(simpleMatch[1]); - if (!Number.isFinite(value) || value <= 0) { - return undefined; - } + /* ============================================================ + * Circuit breakers (verbatim logic, linearized) + * ============================================================ + */ - const unit = (simpleMatch[2] ?? "s").toLowerCase(); - switch (unit) { - case "ms": - return value; - case "s": - return value * 1000; - case "m": - return value * 60 * 1000; - case "h": - return value * 60 * 60 * 1000; - default: - return undefined; - } - } + private resolveCircuit(next: QueuedRequest): DispatchDecision | null { + const visited = new Set(); - // Compound durations like "1m30.951s" - const segmentRegex = /(\d+(?:\.\d+)?)(ms|s|m|h)/gi; - let totalMs = 0; - let matched = false; - - segmentRegex.lastIndex = 0; - let segment: RegExpExecArray | null = segmentRegex.exec(trimmed); - while (segment !== null) { - matched = true; - const value = Number(segment[1]); - if (!Number.isFinite(value) || value < 0) { - return undefined; - } + while (true) { + const key = this.buildRateLimitKey(next.request.metadata); + next.circuitKey = key; - const unit = segment[2].toLowerCase(); - switch (unit) { - case "ms": - totalMs += value; - break; - case "s": - totalMs += value * 1000; - break; - case "m": - totalMs += value * 60 * 1000; - break; - case "h": - totalMs += value * 60 * 60 * 1000; - break; - default: - return undefined; - } + const model = next.request.metadata?.model; + if (model) visited.add(model); - segment = segmentRegex.exec(trimmed); - } + const evaluation = this.evaluateCircuitState(key); + next.circuitStatus = evaluation.state; - if (!matched || totalMs <= 0) { - return undefined; - } + if (evaluation.allowRequest) return null; - segmentRegex.lastIndex = 0; - const leftover = trimmed.replace(segmentRegex, "").trim(); - if (leftover) { - return undefined; - } + const fallback = this.findFallbackModel(next.request.metadata, visited); + if (!fallback || !next.request.createFallbackRequest) { + next.reject( + new CircuitBreakerOpenError( + `Circuit open for ${key}`, + next.request.metadata, + evaluation.retryAfterMs, + ), + ); + return { kind: "skip" }; + } - return totalMs; - } + const fallbackRequest = next.request.createFallbackRequest(fallback); + if (!fallbackRequest) return { kind: "skip" }; - private 
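The deleted parser above accepted compound reset durations such as "1m30.951s". A condensed sketch of the same conversion (drops the strict leftover-text checks of the original for brevity):

function parseResetDurationMs(raw: string): number | undefined {
  const unitMs: Record<string, number> = { ms: 1, s: 1_000, m: 60_000, h: 3_600_000 };
  let totalMs = 0;
  let matched = false;
  for (const [, value, unit] of raw.trim().matchAll(/(\d+(?:\.\d+)?)(ms|s|m|h)/gi)) {
    totalMs += Number(value) * unitMs[unit.toLowerCase()];
    matched = true;
  }
  return matched && totalMs > 0 ? totalMs : undefined;
}

parseResetDurationMs("1m30.951s"); // 90951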
resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority { - const candidate = metadata?.priority; - if (candidate === "P0" || candidate === "P1" || candidate === "P2") { - return candidate; + next.request = fallbackRequest; + next.attempt = 1; + next.rateLimitKey = undefined; + next.etaMs = undefined; + next.circuitKey = undefined; + next.circuitStatus = undefined; } - - return "P1"; } - private getQueue(priority: TrafficPriority): QueuedRequest[] { - return this.queues[priority]; - } + private evaluateCircuitState(key: string): { + allowRequest: boolean; + state: CircuitStateStatus; + retryAfterMs?: number; + } { + const state = this.circuitBreakers.get(key); + if (!state) return { allowRequest: true, state: "closed" }; - private hasQueuedWork(): boolean { - return this.priorityOrder.some((priority) => this.getQueue(priority).length > 0); - } + const now = Date.now(); - private getQueueSize(): number { - let size = 0; - for (const priority of this.priorityOrder) { - size += this.getQueue(priority).length; + if (state.status === "open") { + const elapsed = state.openedAt ? now - state.openedAt : 0; + if (elapsed >= CIRCUIT_COOLDOWN_MS) { + state.status = "half-open"; + state.trialInFlight = false; + state.failureTimestamps = []; + return { allowRequest: true, state: "half-open" }; + } + return { + allowRequest: false, + state: "open", + retryAfterMs: CIRCUIT_COOLDOWN_MS - elapsed, + }; } - return size; - } - private scheduleRateLimitWakeUpAt(wakeUpAt: number): void { - if (!Number.isFinite(wakeUpAt)) { - return; + if (state.status === "half-open" && state.trialInFlight) { + return { allowRequest: false, state: "half-open" }; } - const now = Date.now(); - const targetAt = Math.max(now, wakeUpAt); + return { allowRequest: true, state: state.status }; + } - if (this.wakeUpTimeout && this.wakeUpAt !== undefined && this.wakeUpAt <= targetAt) { - return; + private findFallbackModel( + metadata: TrafficRequestMetadata | undefined, + visitedModels: Set, + ): string | undefined { + const currentModel = metadata?.model; + if (!currentModel) { + return undefined; } - if (this.wakeUpTimeout) { - clearTimeout(this.wakeUpTimeout); - this.wakeUpTimeout = undefined; + const chain = this.fallbackChains.get(currentModel); + if (!chain) { + return undefined; } - this.wakeUpAt = targetAt; - const delayMs = Math.max(1, Math.ceil(targetAt - now)); - this.logDebug("[TrafficController] scheduleRateLimitWakeUp", { delayMs, wakeUpAt: targetAt }); - this.wakeUpTimeout = setTimeout(() => { - this.wakeUpTimeout = undefined; - this.wakeUpAt = undefined; - this.logDebug("[TrafficController] rateLimitWakeUpFired", { - queueSize: this.getQueueSize(), - active: this.activeCount, + const provider = metadata?.provider; + for (const candidate of chain) { + if (visitedModels.has(candidate)) { + continue; + } + + const candidateKey = this.buildRateLimitKey({ + provider, + model: candidate, }); - this.scheduleDrain(); - }, delayMs); - } - private releaseRateLimitReservation(key: string | undefined): void { - if (!key) { - return; + const evaluation = this.evaluateCircuitState(candidateKey); + if (evaluation.allowRequest) { + visitedModels.add(candidate); + return candidate; + } } - const state = this.rateLimitStates.get(key); - if (!state) { - return; - } + return undefined; + } - if (state.reserved <= 0) { - return; + private markCircuitTrial(item: QueuedRequest): void { + const key = item.circuitKey; + if (!key) return; + const state = this.circuitBreakers.get(key); + if (state && state.status === "half-open" && 
!state.trialInFlight) { + state.trialInFlight = true; } - - state.reserved = Math.max(0, state.reserved - 1); - this.logDebug("[TrafficController] rate limit released", { - key, - reserved: state.reserved, - remaining: state.remaining, - resetAt: state.resetAt, - nextAllowedAt: state.nextAllowedAt, - }); } private recordCircuitSuccess(metadata?: TrafficRequestMetadata): void { const key = this.buildRateLimitKey(metadata); - if (this.circuitBreakers.has(key)) { - this.circuitBreakers.delete(key); - } + this.circuitBreakers.delete(key); } private recordCircuitFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void { const status = this.extractStatusCode(error); if (!this.isCircuitBreakerStatus(status)) { - this.resetCircuitFailures(metadata); + this.circuitBreakers.delete(this.buildRateLimitKey(metadata)); return; } @@ -913,91 +582,41 @@ export class TrafficController { const now = Date.now(); const state = this.circuitBreakers.get(key) ?? - ({ - status: "closed", - failureTimestamps: [], - } as CircuitState); + ({ status: "closed", failureTimestamps: [] } as CircuitState); - const recentFailures = state.failureTimestamps.filter( - (timestamp) => now - timestamp <= CIRCUIT_FAILURE_WINDOW_MS, + state.failureTimestamps = state.failureTimestamps.filter( + (t) => now - t <= CIRCUIT_FAILURE_WINDOW_MS, ); - recentFailures.push(now); + state.failureTimestamps.push(now); - if (state.status === "half-open") { - state.status = "open"; - state.openedAt = now; - state.trialInFlight = false; - state.failureTimestamps = [now]; - this.circuitBreakers.set(key, state); - this.logger.warn("Circuit reopened after half-open failure", { - circuitKey: key, - statusCode: status, - }); - return; - } - - state.failureTimestamps = recentFailures; - if (state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD) { + if ( + state.status === "half-open" || + state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD + ) { state.status = "open"; state.openedAt = now; state.trialInFlight = false; - this.logger.warn("Circuit opened after consecutive failures", { - circuitKey: key, - failureCount: state.failureTimestamps.length, - statusCode: status, - }); } this.circuitBreakers.set(key, state); } - private resetCircuitFailures(metadata?: TrafficRequestMetadata): void { - const key = this.buildRateLimitKey(metadata); - const state = this.circuitBreakers.get(key); - if (!state) { - return; - } - - state.failureTimestamps = []; - if (state.status !== "open") { - state.status = "closed"; - state.trialInFlight = false; - } - - this.circuitBreakers.set(key, state); - } + /* ============================================================ + * Usage + * ============================================================ + */ - private recordUsageFromResult( - item: QueuedRequest, - result: TResponse, - ): void { + private recordUsage(item: QueuedRequest, result: TResponse): void { const extractor = item.extractUsage ?? 
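The sliding window above means failures must cluster to open the circuit; stale failures age out. A numeric illustration (assumes CIRCUIT_FAILURE_THRESHOLD = 5 and CIRCUIT_FAILURE_WINDOW_MS = 10s):

const WINDOW_MS = 10_000;
const THRESHOLD = 5;

function wouldOpen(failureTimestamps: number[], now: number): boolean {
  const recent = failureTimestamps.filter((t) => now - t <= WINDOW_MS);
  return recent.length >= THRESHOLD;
}

wouldOpen([0, 2_000, 4_000, 6_000, 8_000], 9_000);  // true: five failures inside the 10s window
wouldOpen([0, 2_000, 4_000, 6_000, 8_000], 15_000); // false: the oldest failures have aged out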
item.request.extractUsage; - if (!extractor) { - return; - } - - try { - const usageCandidate = extractor(result); - if (!usageCandidate) { - return; - } + if (!extractor) return; - if (this.isPromiseLike(usageCandidate)) { - void Promise.resolve(usageCandidate) - .then((usage) => { - if (usage) { - this.incrementTenantUsage(item.tenantId, usage); - } - }) - .catch((error) => { - this.logger.debug("Failed to record tenant usage", { tenantId: item.tenantId, error }); - }); - return; - } + const usage = extractor(result); + if (!usage) return; - this.incrementTenantUsage(item.tenantId, usageCandidate as UsageCounters); - } catch (error) { - this.logger.debug("Failed to record tenant usage", { tenantId: item.tenantId, error }); + if (this.isPromiseLike(usage)) { + void usage.then((u) => u && this.incrementTenantUsage(item.tenantId, u)); + } else { + this.incrementTenantUsage(item.tenantId, usage); } } @@ -1007,178 +626,43 @@ export class TrafficController { outputTokens: 0, totalTokens: 0, }; - const inputTokens = usage.inputTokens ?? 0; - const outputTokens = usage.outputTokens ?? 0; - const totalTokens = usage.totalTokens ?? inputTokens + outputTokens; - const updated: TenantUsage = { - inputTokens: current.inputTokens + inputTokens, - outputTokens: current.outputTokens + outputTokens, - totalTokens: current.totalTokens + totalTokens, - }; - this.tenantUsage.set(tenantId, updated); - this.logger.debug("Recorded tenant usage", { tenantId, usage: updated }); - } - - private isPromiseLike(value: unknown): value is PromiseLike { - return ( - typeof value === "object" && - value !== null && - typeof (value as PromiseLike).then === "function" - ); - } - - private isCircuitBreakerStatus(status?: number): boolean { - if (status === 429) { - return true; - } - - return status !== undefined && status >= 500 && status < 600; - } - private async runRequest(item: QueuedRequest): Promise { - const attempt = item.attempt ?? 1; + const input = usage.inputTokens ?? 0; + const output = usage.outputTokens ?? 0; + const total = usage.totalTokens ?? 
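recordUsage accepts both plain and promise-returning extractors. A sketch of a caller-supplied extractor (the promptTokens/completionTokens response shape is an assumption for illustration, not part of this patch):

interface UsageCounters {
  inputTokens?: number;
  outputTokens?: number;
  totalTokens?: number;
}

const extractUsage = (response: {
  usage?: { promptTokens: number; completionTokens: number };
}): UsageCounters | undefined =>
  response.usage && {
    inputTokens: response.usage.promptTokens,
    outputTokens: response.usage.completionTokens,
    // totalTokens omitted: the tracker falls back to input + output
  };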
input + output; - this.logDebug("[TrafficController] runRequest start", { - type: item.type, - rateLimitKey: item.rateLimitKey, - etaMs: item.etaMs, - active: this.activeCount, - queueSize: this.getQueueSize(), + this.tenantUsage.set(tenantId, { + inputTokens: current.inputTokens + input, + outputTokens: current.outputTokens + output, + totalTokens: current.totalTokens + total, }); - - try { - const result = await item.request.execute(); // Execute the user's operation - this.recordCircuitSuccess(item.request.metadata); - this.recordUsageFromResult(item, result); - item.resolve(result); // Deliver successful result back to the waiting caller - } catch (error) { - this.recordCircuitFailure(item.request.metadata, error); - const retryPlan = this.buildRetryPlan(error, attempt); - if (retryPlan) { - this.scheduleRetry(item, attempt + 1, retryPlan.delayMs, retryPlan.reason); - } else { - item.reject(error); // Surface failures to the caller - } - } finally { - this.releaseRateLimitReservation(item.rateLimitKey); - this.activeCount = Math.max(0, this.activeCount - 1); // Ensure counter never underflows - this.logDebug("[TrafficController] runRequest complete", { - type: item.type, - active: this.activeCount, - queueSize: this.getQueueSize(), - }); - this.scheduleDrain(); // Immediately try to pull the next request - } - } - - private buildRetryPlan( - error: unknown, - attempt: number, - ): { delayMs: number; reason: RetryReason } | undefined { - const reason = this.getRetryReason(error); - if (!reason) { - return undefined; - } - - const maxAttempts = reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS; - if (attempt >= maxAttempts) { - return undefined; - } - - return { - reason, - delayMs: this.computeBackoffDelay(reason, attempt), - }; } - private getRetryReason(error: unknown): RetryReason | undefined { - const statusCode = this.extractStatusCode(error); - if (statusCode === 429) { - return "rateLimit"; - } - - if (statusCode !== undefined && statusCode >= 500 && statusCode < 600) { - return "serverError"; - } - - if (statusCode === 408 || this.isTimeoutError(error)) { - return "timeout"; - } + /* ============================================================ + * Utilities + * ============================================================ + */ - return undefined; + private resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority { + return metadata?.priority ?? "P1"; } - private extractStatusCode(error: unknown): number | undefined { - if (!error || typeof error !== "object") { - return undefined; - } - - const candidate = error as { status?: unknown; statusCode?: unknown; httpStatus?: unknown }; - const directStatus = - this.coerceStatus(candidate.status) ?? - this.coerceStatus(candidate.statusCode) ?? - this.coerceStatus(candidate.httpStatus); - if (directStatus !== undefined) { - return directStatus; - } - - const responseStatus = (error as { response?: { status?: unknown } }).response?.status; - const normalizedResponseStatus = this.coerceStatus(responseStatus); - if (normalizedResponseStatus !== undefined) { - return normalizedResponseStatus; - } - - const causeStatus = (error as { cause?: { status?: unknown; statusCode?: unknown } }).cause; - if (causeStatus) { - const normalizedCauseStatus = - this.coerceStatus(causeStatus.status) ?? 
this.coerceStatus(causeStatus.statusCode); - if (normalizedCauseStatus !== undefined) { - return normalizedCauseStatus; - } - } - - return undefined; + private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { + return `${metadata?.provider ?? "default-provider"}::${metadata?.model ?? "default-model"}`; } - private isTimeoutError(error: unknown): boolean { - const candidates = [error, (error as { cause?: unknown })?.cause]; - - for (const candidate of candidates) { - if (!candidate || typeof candidate !== "object") { - continue; - } - - const timeoutCode = (candidate as { code?: unknown }).code; - if (typeof timeoutCode === "string" && timeoutCode.toLowerCase().includes("timeout")) { - return true; - } - - const name = (candidate as { name?: unknown }).name; - if (typeof name === "string" && name.toLowerCase().includes("timeout")) { - return true; - } - - const message = (candidate as { message?: unknown }).message; - if (typeof message === "string" && message.toLowerCase().includes("timeout")) { - return true; - } - } - - return false; + private normalizeFallbackChains( + fallbackChains?: Record, + ): Map { + const chains = fallbackChains ?? DEFAULT_FALLBACK_CHAINS; + return new Map(Object.entries(chains)); } - private coerceStatus(value: unknown): number | undefined { - if (typeof value === "number" && Number.isFinite(value)) { - return value; - } - - if (typeof value === "string") { - const parsed = Number(value); - if (Number.isFinite(parsed)) { - return parsed; - } - } - + private getRetryReason(error: unknown): RetryReason | undefined { + const status = this.extractStatusCode(error); + if (status === 429) return "rateLimit"; + if (status && status >= 500) return "serverError"; + if (status === 408 || this.isTimeoutError(error)) return "timeout"; return undefined; } @@ -1190,48 +674,61 @@ export class TrafficController { ? TIMEOUT_BASE_BACKOFF_MS : RATE_LIMIT_BASE_BACKOFF_MS; - const jitterFactor = + const jitter = reason === "serverError" ? SERVER_ERROR_JITTER_FACTOR : reason === "timeout" ? TIMEOUT_JITTER_FACTOR : RATE_LIMIT_JITTER_FACTOR; - const exponential = base * 2 ** Math.max(0, attempt - 1); - const jitter = exponential * jitterFactor * Math.random(); - return Math.max(1, Math.round(exponential + jitter)); + const exp = base * 2 ** (attempt - 1); + return Math.round(exp + exp * jitter * Math.random()); } - private scheduleRetry( - item: QueuedRequest, - nextAttempt: number, - delayMs: number, - reason: RetryReason, - ): void { - this.logger.debug("Retrying request through controller", { - reason, - delayMs, - attempt: nextAttempt, - maxAttempts: reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS, - metadata: item.request.metadata, - }); + private extractStatusCode(error: unknown): number | undefined { + const e = error as any; + return ( + this.coerceStatus(e?.status) ?? + this.coerceStatus(e?.statusCode) ?? + this.coerceStatus(e?.httpStatus) ?? + this.coerceStatus(e?.response?.status) ?? + this.coerceStatus(e?.cause?.status) + ); + } - setTimeout(() => { - const retryPriority = item.priority; - this.getQueue(retryPriority).push({ - ...item, - attempt: nextAttempt, - etaMs: undefined, - rateLimitKey: undefined, - circuitKey: undefined, - circuitStatus: undefined, - }); - this.scheduleDrain(); - }, delayMs); + private isTimeoutError(error: unknown): boolean { + const e = error as any; + return ( + String(e?.code ?? "") + .toLowerCase() + .includes("timeout") || + String(e?.name ?? 
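The collapsed backoff above keeps the exponential-plus-jitter shape. Worked ranges for the rate-limit case (base 500ms, jitter factor 0.35, per the constants in this patch):

function backoffDelayMs(baseMs: number, jitterFactor: number, attempt: number): number {
  const exponential = baseMs * 2 ** (attempt - 1);
  return Math.round(exponential + exponential * jitterFactor * Math.random());
}

backoffDelayMs(500, 0.35, 1); // 500-675ms
backoffDelayMs(500, 0.35, 2); // 1,000-1,350ms
backoffDelayMs(500, 0.35, 3); // 2,000-2,700ms; attempts are capped by MAX_RETRY_ATTEMPTS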
"") + .toLowerCase() + .includes("timeout") || + String(e?.message ?? "") + .toLowerCase() + .includes("timeout") + ); + } + + private isCircuitBreakerStatus(status?: number): boolean { + return status === 429 || (status !== undefined && status >= 500); + } + + private coerceStatus(value: unknown): number | undefined { + const n = Number(value); + return Number.isFinite(n) ? n : undefined; + } + + private isPromiseLike(value: unknown): value is PromiseLike { + return !!value && typeof (value as any).then === "function"; } } -let singletonController: TrafficController | undefined; +/* ============================================================ + * Error + Singleton + * ============================================================ + */ export class CircuitBreakerOpenError extends Error { readonly retryAfterMs?: number; @@ -1245,14 +742,11 @@ export class CircuitBreakerOpenError extends Error { } } -/** - * Retrieve the shared traffic controller instance. - */ +let singletonController: TrafficController | undefined; + export function getTrafficController(options?: TrafficControllerOptions): TrafficController { if (!singletonController) { - // Create a singleton controller so all agents share the same queue/scheduling behavior singletonController = new TrafficController(options); } - return singletonController; } From e9780530eb173c4890a0f2b57c25ead7d06ccbb4 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Sat, 13 Dec 2025 14:23:03 +0530 Subject: [PATCH 10/41] feat: refactotr --- .../src/traffic/traffic-circuit-breaker.ts | 184 +++++ .../core/src/traffic/traffic-constants.ts | 23 + .../traffic/traffic-controller-internal.ts | 52 ++ .../core/src/traffic/traffic-controller.ts | 629 +++--------------- .../core/src/traffic/traffic-error-utils.ts | 40 ++ packages/core/src/traffic/traffic-errors.ts | 13 + .../core/src/traffic/traffic-rate-limiter.ts | 206 ++++++ packages/core/src/traffic/traffic-retry.ts | 56 ++ packages/core/src/traffic/traffic-types.ts | 55 ++ .../core/src/traffic/traffic-usage-tracker.ts | 50 ++ 10 files changed, 772 insertions(+), 536 deletions(-) create mode 100644 packages/core/src/traffic/traffic-circuit-breaker.ts create mode 100644 packages/core/src/traffic/traffic-constants.ts create mode 100644 packages/core/src/traffic/traffic-controller-internal.ts create mode 100644 packages/core/src/traffic/traffic-error-utils.ts create mode 100644 packages/core/src/traffic/traffic-errors.ts create mode 100644 packages/core/src/traffic/traffic-rate-limiter.ts create mode 100644 packages/core/src/traffic/traffic-retry.ts create mode 100644 packages/core/src/traffic/traffic-types.ts create mode 100644 packages/core/src/traffic/traffic-usage-tracker.ts diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts new file mode 100644 index 000000000..9cc407995 --- /dev/null +++ b/packages/core/src/traffic/traffic-circuit-breaker.ts @@ -0,0 +1,184 @@ +import { + CIRCUIT_COOLDOWN_MS, + CIRCUIT_FAILURE_THRESHOLD, + CIRCUIT_FAILURE_WINDOW_MS, + DEFAULT_FALLBACK_CHAINS, +} from "./traffic-constants"; +import type { + CircuitState, + CircuitStateStatus, + DispatchDecision, + QueuedRequest, +} from "./traffic-controller-internal"; +import { extractStatusCode } from "./traffic-error-utils"; +import { CircuitBreakerOpenError } from "./traffic-errors"; +import type { TrafficRequestMetadata } from "./traffic-types"; + +export class TrafficCircuitBreaker { + private readonly circuitBreakers = new Map(); + private readonly fallbackChains: Map; 
+ private readonly buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string; + + constructor(options: { + fallbackChains?: Record; + buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string; + }) { + this.buildRateLimitKey = options.buildRateLimitKey; + const chains = options.fallbackChains ?? DEFAULT_FALLBACK_CHAINS; + this.fallbackChains = new Map(Object.entries(chains)); + } + + resolve(next: QueuedRequest): DispatchDecision | null { + const visited = new Set(); + + while (true) { + const key = this.buildRateLimitKey(next.request.metadata); + next.circuitKey = key; + + const model = next.request.metadata?.model; + if (model) visited.add(model); + + const evaluation = this.evaluateCircuitState(key); + next.circuitStatus = evaluation.state; + + if (evaluation.allowRequest) return null; + + const fallback = this.findFallbackModel(next.request.metadata, visited); + if (!fallback || !next.request.createFallbackRequest) { + next.reject( + new CircuitBreakerOpenError( + `Circuit open for ${key}`, + next.request.metadata, + evaluation.retryAfterMs, + ), + ); + return { kind: "skip" }; + } + + const fallbackRequest = next.request.createFallbackRequest(fallback); + if (!fallbackRequest) return { kind: "skip" }; + + next.request = fallbackRequest; + next.attempt = 1; + next.rateLimitKey = undefined; + next.etaMs = undefined; + next.circuitKey = undefined; + next.circuitStatus = undefined; + } + } + + markTrial(item: QueuedRequest): void { + const key = item.circuitKey; + if (!key) return; + const state = this.circuitBreakers.get(key); + if (state && state.status === "half-open" && !state.trialInFlight) { + state.trialInFlight = true; + } + } + + recordSuccess(metadata?: TrafficRequestMetadata): void { + const key = this.buildRateLimitKey(metadata); + this.circuitBreakers.delete(key); + } + + recordFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void { + const status = extractStatusCode(error); + if (!this.isCircuitBreakerStatus(status)) { + this.circuitBreakers.delete(this.buildRateLimitKey(metadata)); + return; + } + + const key = this.buildRateLimitKey(metadata); + const now = Date.now(); + const state = + this.circuitBreakers.get(key) ?? + ({ status: "closed", failureTimestamps: [] } as CircuitState); + + state.failureTimestamps = state.failureTimestamps.filter( + (t) => now - t <= CIRCUIT_FAILURE_WINDOW_MS, + ); + state.failureTimestamps.push(now); + + if ( + state.status === "half-open" || + state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD + ) { + state.status = "open"; + state.openedAt = now; + state.trialInFlight = false; + } + + this.circuitBreakers.set(key, state); + } + + private evaluateCircuitState(key: string): { + allowRequest: boolean; + state: CircuitStateStatus; + retryAfterMs?: number; + } { + const state = this.circuitBreakers.get(key); + if (!state) return { allowRequest: true, state: "closed" }; + + const now = Date.now(); + + if (state.status === "open") { + const elapsed = state.openedAt ? 
now - state.openedAt : 0; + if (elapsed >= CIRCUIT_COOLDOWN_MS) { + state.status = "half-open"; + state.trialInFlight = false; + state.failureTimestamps = []; + return { allowRequest: true, state: "half-open" }; + } + return { + allowRequest: false, + state: "open", + retryAfterMs: CIRCUIT_COOLDOWN_MS - elapsed, + }; + } + + if (state.status === "half-open" && state.trialInFlight) { + return { allowRequest: false, state: "half-open" }; + } + + return { allowRequest: true, state: state.status }; + } + + private findFallbackModel( + metadata: TrafficRequestMetadata | undefined, + visitedModels: Set, + ): string | undefined { + const currentModel = metadata?.model; + if (!currentModel) { + return undefined; + } + + const chain = this.fallbackChains.get(currentModel); + if (!chain) { + return undefined; + } + + const provider = metadata?.provider; + for (const candidate of chain) { + if (visitedModels.has(candidate)) { + continue; + } + + const candidateKey = this.buildRateLimitKey({ + provider, + model: candidate, + }); + + const evaluation = this.evaluateCircuitState(candidateKey); + if (evaluation.allowRequest) { + visitedModels.add(candidate); + return candidate; + } + } + + return undefined; + } + + private isCircuitBreakerStatus(status?: number): boolean { + return status === 429 || (status !== undefined && status >= 500); + } +} diff --git a/packages/core/src/traffic/traffic-constants.ts b/packages/core/src/traffic/traffic-constants.ts new file mode 100644 index 000000000..74845ab65 --- /dev/null +++ b/packages/core/src/traffic/traffic-constants.ts @@ -0,0 +1,23 @@ +export const MAX_RETRY_ATTEMPTS = 3; +export const TIMEOUT_RETRY_ATTEMPTS = 2; + +export const RATE_LIMIT_BASE_BACKOFF_MS = 500; +export const SERVER_ERROR_BASE_BACKOFF_MS = 1000; +export const TIMEOUT_BASE_BACKOFF_MS = 750; + +export const RATE_LIMIT_JITTER_FACTOR = 0.35; +export const SERVER_ERROR_JITTER_FACTOR = 0.8; +export const TIMEOUT_JITTER_FACTOR = 0.5; + +export const CIRCUIT_FAILURE_THRESHOLD = 5; +export const CIRCUIT_FAILURE_WINDOW_MS = 10_000; +export const CIRCUIT_COOLDOWN_MS = 30_000; + +export const RATE_LIMIT_EXHAUSTION_BUFFER = 1; +export const RATE_LIMIT_PROBE_DELAY_MS = 50; +export const RATE_LIMIT_MIN_PACE_INTERVAL_MS = 10; +export const RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS = 10; + +export const DEFAULT_FALLBACK_CHAINS: Record = { + "gpt-4o": ["gpt-4o-mini", "gpt-3.5"], +}; diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts new file mode 100644 index 000000000..3b9c01244 --- /dev/null +++ b/packages/core/src/traffic/traffic-controller-internal.ts @@ -0,0 +1,52 @@ +import type { + TrafficPriority, + TrafficRequest, + TrafficRequestMetadata, + TrafficRequestType, +} from "./traffic-types"; + +export type Scheduler = (callback: () => void) => void; + +export type DispatchDecision = + | { kind: "dispatch" } + | { kind: "skip" } + | { kind: "wait"; wakeUpAt?: number }; + +export type CircuitStateStatus = "closed" | "open" | "half-open"; + +export interface CircuitState { + status: CircuitStateStatus; + failureTimestamps: number[]; + openedAt?: number; + trialInFlight?: boolean; +} + +export interface RateLimitWindowState { + limit: number; + remaining: number; + resetAt: number; + reserved: number; + nextAllowedAt: number; +} + +type BivariantHandler = { + bivarianceHack(...args: TArgs): void; +}["bivarianceHack"]; + +export interface QueuedRequest { + type: TrafficRequestType; + request: TrafficRequest; + resolve: 
BivariantHandler<[TResponse | PromiseLike]>; + reject: BivariantHandler<[reason?: unknown]>; + attempt: number; + priority: TrafficPriority; + tenantId: string; + + rateLimitKey?: string; + etaMs?: number; + + circuitKey?: string; + circuitStatus?: CircuitStateStatus; + + extractUsage?: TrafficRequest["extractUsage"]; +} diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index e633792e8..2169c36f0 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -1,168 +1,41 @@ -/* eslint-disable @typescript-eslint/no-explicit-any */ - import type { Logger } from "../logger"; import { LoggerProxy } from "../logger"; +import { TrafficCircuitBreaker } from "./traffic-circuit-breaker"; +import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal"; +import { CircuitBreakerOpenError } from "./traffic-errors"; +import { type RateLimitUpdateResult, TrafficRateLimiter } from "./traffic-rate-limiter"; +import { type RetryReason, buildRetryPlan } from "./traffic-retry"; +import type { + RateLimitConfig, + RateLimitKey, + RateLimitOptions, + TenantUsage, + TrafficControllerOptions, + TrafficPriority, + TrafficRequest, + TrafficRequestMetadata, + TrafficRequestType, +} from "./traffic-types"; +import { TrafficUsageTracker } from "./traffic-usage-tracker"; /* ============================================================ - * Types - * ============================================================ - */ - -type Scheduler = (callback: () => void) => void; - -type BivariantHandler = { - bivarianceHack(...args: TArgs): void; -}["bivarianceHack"]; - -type BivariantFunction = { - bivarianceHack(...args: TArgs): TReturn; -}["bivarianceHack"]; - -export type TrafficRequestType = "text" | "stream"; -export type TrafficPriority = "P0" | "P1" | "P2"; - -export interface TrafficRequestMetadata { - agentId?: string; - agentName?: string; - model?: string; - provider?: string; - priority?: TrafficPriority; - tenantId?: string; -} - -export interface TrafficRequest { - tenantId: string; - metadata?: TrafficRequestMetadata; - execute: () => Promise; - createFallbackRequest?: (modelId: string) => TrafficRequest | undefined; - extractUsage?: BivariantFunction< - [response: TResponse], - Promise | UsageCounters | undefined - >; -} - -interface QueuedRequest { - type: TrafficRequestType; - request: TrafficRequest; - resolve: BivariantHandler<[TResponse | PromiseLike]>; - reject: BivariantHandler<[reason?: unknown]>; - attempt: number; - priority: TrafficPriority; - tenantId: string; - - rateLimitKey?: string; - etaMs?: number; - - circuitKey?: string; - circuitStatus?: CircuitStateStatus; - - extractUsage?: TrafficRequest["extractUsage"]; -} - -export interface TrafficControllerOptions { - maxConcurrent?: number; - rateLimits?: RateLimitConfig; - logger?: Logger; - fallbackChains?: Record; -} - -/* ============================================================ - * Rate limiting - * ============================================================ - */ - -export interface RateLimitOptions { - capacity: number; - refillPerSecond: number; -} - -export type RateLimitKey = string; -export type RateLimitConfig = Record; - -interface RateLimitWindowState { - limit: number; - remaining: number; - resetAt: number; - reserved: number; - nextAllowedAt: number; -} - -/* ============================================================ - * Circuit breaker - * 
============================================================ - */ - -type CircuitStateStatus = "closed" | "open" | "half-open"; - -interface CircuitState { - status: CircuitStateStatus; - failureTimestamps: number[]; - openedAt?: number; - trialInFlight?: boolean; -} - -/* ============================================================ - * Usage - * ============================================================ - */ - -export type TenantUsage = { - inputTokens: number; - outputTokens: number; - totalTokens: number; -}; - -type UsageCounters = { - inputTokens?: number; - outputTokens?: number; - totalTokens?: number; -}; - -/* ============================================================ - * Internal constants + * Traffic Controller * ============================================================ */ -type RetryReason = "rateLimit" | "serverError" | "timeout"; - -const MAX_RETRY_ATTEMPTS = 3; -const TIMEOUT_RETRY_ATTEMPTS = 2; - -const RATE_LIMIT_BASE_BACKOFF_MS = 500; -const SERVER_ERROR_BASE_BACKOFF_MS = 1000; -const TIMEOUT_BASE_BACKOFF_MS = 750; - -const RATE_LIMIT_JITTER_FACTOR = 0.35; -const SERVER_ERROR_JITTER_FACTOR = 0.8; -const TIMEOUT_JITTER_FACTOR = 0.5; - -const CIRCUIT_FAILURE_THRESHOLD = 5; -const CIRCUIT_FAILURE_WINDOW_MS = 10_000; -const CIRCUIT_COOLDOWN_MS = 30_000; - -const RATE_LIMIT_EXHAUSTION_BUFFER = 1; -const RATE_LIMIT_PROBE_DELAY_MS = 50; -const RATE_LIMIT_MIN_PACE_INTERVAL_MS = 10; -const RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS = 10; - -const DEFAULT_FALLBACK_CHAINS: Record = { - "gpt-4o": ["gpt-4o-mini", "gpt-3.5"], +export type { + RateLimitConfig, + RateLimitKey, + RateLimitOptions, + TenantUsage, + TrafficControllerOptions, + TrafficPriority, + TrafficRequest, + TrafficRequestMetadata, + TrafficRequestType, }; -/* ============================================================ - * Dispatch decisions - * ============================================================ - */ - -type DispatchDecision = - | { kind: "dispatch" } - | { kind: "skip" } - | { kind: "wait"; wakeUpAt?: number }; - -/* ============================================================ - * Traffic Controller - * ============================================================ - */ +export { CircuitBreakerOpenError }; export class TrafficController { /* ---------- Core ---------- */ @@ -182,25 +55,23 @@ export class TrafficController { private drainScheduled = false; /* ---------- Rate limits ---------- */ - - private readonly rateLimitStates = new Map(); - private wakeUpTimeout?: ReturnType; - private wakeUpAt?: number; + private readonly rateLimiter: TrafficRateLimiter; /* ---------- Circuit breakers ---------- */ - - private readonly circuitBreakers = new Map(); - private readonly fallbackChains: Map; + private readonly circuitBreaker: TrafficCircuitBreaker; /* ---------- Usage ---------- */ - - private readonly tenantUsage = new Map(); + private readonly usageTracker = new TrafficUsageTracker(); constructor(options: TrafficControllerOptions = {}) { this.maxConcurrent = options.maxConcurrent ?? 
Number.POSITIVE_INFINITY; this.scheduler = this.createScheduler(); - this.fallbackChains = this.normalizeFallbackChains(options.fallbackChains); this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); + this.rateLimiter = new TrafficRateLimiter(() => this.scheduleDrain()); + this.circuitBreaker = new TrafficCircuitBreaker({ + fallbackChains: options.fallbackChains, + buildRateLimitKey: (metadata) => this.buildRateLimitKey(metadata), + }); } /* ============================================================ @@ -216,9 +87,16 @@ export class TrafficController { return this.enqueue("stream", request); } + updateRateLimitFromHeaders( + metadata: TrafficRequestMetadata | undefined, + headers: unknown, + ): RateLimitUpdateResult | undefined { + const key = this.buildRateLimitKey(metadata); + return this.rateLimiter.updateFromHeaders(metadata, headers, key); + } + getTenantUsage(tenantId: string): TenantUsage | undefined { - const usage = this.tenantUsage.get(tenantId); - return usage ? { ...usage } : undefined; + return this.usageTracker.getTenantUsage(tenantId); } /* ============================================================ @@ -278,31 +156,55 @@ export class TrafficController { */ private tryDispatchNext(): DispatchDecision { - const next = this.peekNext(); - if (!next) return { kind: "wait" }; if (this.activeCount >= this.maxConcurrent) return { kind: "wait" }; - const circuit = this.resolveCircuit(next); - if (circuit) return circuit; - - const rateLimit = this.resolveRateLimit(next); - if (rateLimit) return rateLimit; + let earliestWakeUpAt: number | undefined; + + for (const priority of this.priorityOrder) { + const next = this.queues[priority][0]; + if (!next) continue; + + const circuit = this.resolveCircuit(next); + if (circuit) { + if (circuit.kind === "skip") { + this.queues[priority].shift(); + return { kind: "skip" }; + } + if (circuit.kind === "wait") { + if (circuit.wakeUpAt !== undefined) { + earliestWakeUpAt = + earliestWakeUpAt === undefined + ? circuit.wakeUpAt + : Math.min(earliestWakeUpAt, circuit.wakeUpAt); + } + continue; + } + } - this.startRequest(next); - return { kind: "dispatch" }; - } + const rateLimit = this.resolveRateLimit(next); + if (rateLimit) { + if (rateLimit.kind === "wait" && rateLimit.wakeUpAt !== undefined) { + earliestWakeUpAt = + earliestWakeUpAt === undefined + ? rateLimit.wakeUpAt + : Math.min(earliestWakeUpAt, rateLimit.wakeUpAt); + } + continue; + } - private peekNext(): QueuedRequest | undefined { - for (const p of this.priorityOrder) { - if (this.queues[p].length > 0) return this.queues[p][0]; + this.startRequest(next); + return { kind: "dispatch" }; } - return undefined; + + return earliestWakeUpAt !== undefined + ? 
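Folding wake-ups to the earliest timestamp keeps a single timer armed across all priority queues. A reduced sketch of the min-fold used above:

function earliestWakeUp(candidates: Array<number | undefined>): number | undefined {
  return candidates.reduce<number | undefined>(
    (earliest, at) =>
      at === undefined ? earliest : earliest === undefined ? at : Math.min(earliest, at),
    undefined,
  );
}

earliestWakeUp([undefined, 1_800, 1_200]); // 1_200: the soonest throttle expiry wins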
{ kind: "wait", wakeUpAt: earliestWakeUpAt } + : { kind: "wait" }; } private startRequest(item: QueuedRequest): void { this.queues[item.priority].shift(); this.activeCount++; - this.markCircuitTrial(item); + this.circuitBreaker.markTrial(item); void this.executeRequest(item); } @@ -314,20 +216,20 @@ export class TrafficController { private async executeRequest(item: QueuedRequest): Promise { try { const result = await item.request.execute(); - this.recordCircuitSuccess(item.request.metadata); - this.recordUsage(item, result); + this.circuitBreaker.recordSuccess(item.request.metadata); + this.usageTracker.recordUsage(item, result); item.resolve(result); } catch (error) { - this.recordCircuitFailure(item.request.metadata, error); + this.circuitBreaker.recordFailure(item.request.metadata, error); - const retry = this.buildRetryPlan(error, item.attempt); + const retry = buildRetryPlan(error, item.attempt); if (retry) { this.scheduleRetry(item, retry); } else { item.reject(error); } } finally { - this.releaseRateLimitReservation(item.rateLimitKey); + this.rateLimiter.releaseReservation(item.rateLimitKey); this.activeCount = Math.max(0, this.activeCount - 1); this.scheduleDrain(); } @@ -355,22 +257,6 @@ export class TrafficController { }, plan.delayMs); } - private buildRetryPlan( - error: unknown, - attempt: number, - ): { delayMs: number; reason: RetryReason } | undefined { - const reason = this.getRetryReason(error); - if (!reason) return undefined; - - const max = reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS; - if (attempt >= max) return undefined; - - return { - reason, - delayMs: this.computeBackoffDelay(reason, attempt), - }; - } - /* ============================================================ * Rate limiting (verbatim logic) * ============================================================ @@ -378,72 +264,11 @@ export class TrafficController { private resolveRateLimit(next: QueuedRequest): DispatchDecision | null { const key = this.buildRateLimitKey(next.request.metadata); - const state = this.rateLimitStates.get(key); - if (!state) return null; - - const now = Date.now(); - const effectiveRemaining = Math.max(0, state.remaining - state.reserved); - const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; - - if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { - if (now < probeAt) { - return { kind: "wait", wakeUpAt: probeAt }; - } - if (state.reserved > 0) { - return { kind: "wait" }; - } - } - - if (now < state.nextAllowedAt) { - return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; - } - - state.reserved += 1; - next.rateLimitKey = key; - - const remainingWindowMs = Math.max(0, state.resetAt - now); - const intervalMs = Math.max( - RATE_LIMIT_MIN_PACE_INTERVAL_MS, - Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), - ); - - const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); - if ( - state.nextAllowedAt <= now || - candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS - ) { - state.nextAllowedAt = candidateNext; - } - - return null; + return this.rateLimiter.resolve(next, key); } private scheduleRateLimitWakeUpAt(wakeUpAt: number): void { - const now = Date.now(); - const target = Math.max(now, wakeUpAt); - - if (this.wakeUpTimeout && this.wakeUpAt !== undefined && this.wakeUpAt <= target) { - return; - } - - if (this.wakeUpTimeout) clearTimeout(this.wakeUpTimeout); - - this.wakeUpAt = target; - this.wakeUpTimeout = setTimeout( - () => { - this.wakeUpTimeout = 
undefined; - this.wakeUpAt = undefined; - this.scheduleDrain(); - }, - Math.max(1, target - now), - ); - } - - private releaseRateLimitReservation(key?: string): void { - if (!key) return; - const state = this.rateLimitStates.get(key); - if (!state || state.reserved <= 0) return; - state.reserved -= 1; + this.rateLimiter.scheduleWakeUpAt(wakeUpAt); } /* ============================================================ @@ -452,190 +277,7 @@ export class TrafficController { */ private resolveCircuit(next: QueuedRequest): DispatchDecision | null { - const visited = new Set(); - - while (true) { - const key = this.buildRateLimitKey(next.request.metadata); - next.circuitKey = key; - - const model = next.request.metadata?.model; - if (model) visited.add(model); - - const evaluation = this.evaluateCircuitState(key); - next.circuitStatus = evaluation.state; - - if (evaluation.allowRequest) return null; - - const fallback = this.findFallbackModel(next.request.metadata, visited); - if (!fallback || !next.request.createFallbackRequest) { - next.reject( - new CircuitBreakerOpenError( - `Circuit open for ${key}`, - next.request.metadata, - evaluation.retryAfterMs, - ), - ); - return { kind: "skip" }; - } - - const fallbackRequest = next.request.createFallbackRequest(fallback); - if (!fallbackRequest) return { kind: "skip" }; - - next.request = fallbackRequest; - next.attempt = 1; - next.rateLimitKey = undefined; - next.etaMs = undefined; - next.circuitKey = undefined; - next.circuitStatus = undefined; - } - } - - private evaluateCircuitState(key: string): { - allowRequest: boolean; - state: CircuitStateStatus; - retryAfterMs?: number; - } { - const state = this.circuitBreakers.get(key); - if (!state) return { allowRequest: true, state: "closed" }; - - const now = Date.now(); - - if (state.status === "open") { - const elapsed = state.openedAt ? 
now - state.openedAt : 0; - if (elapsed >= CIRCUIT_COOLDOWN_MS) { - state.status = "half-open"; - state.trialInFlight = false; - state.failureTimestamps = []; - return { allowRequest: true, state: "half-open" }; - } - return { - allowRequest: false, - state: "open", - retryAfterMs: CIRCUIT_COOLDOWN_MS - elapsed, - }; - } - - if (state.status === "half-open" && state.trialInFlight) { - return { allowRequest: false, state: "half-open" }; - } - - return { allowRequest: true, state: state.status }; - } - - private findFallbackModel( - metadata: TrafficRequestMetadata | undefined, - visitedModels: Set, - ): string | undefined { - const currentModel = metadata?.model; - if (!currentModel) { - return undefined; - } - - const chain = this.fallbackChains.get(currentModel); - if (!chain) { - return undefined; - } - - const provider = metadata?.provider; - for (const candidate of chain) { - if (visitedModels.has(candidate)) { - continue; - } - - const candidateKey = this.buildRateLimitKey({ - provider, - model: candidate, - }); - - const evaluation = this.evaluateCircuitState(candidateKey); - if (evaluation.allowRequest) { - visitedModels.add(candidate); - return candidate; - } - } - - return undefined; - } - - private markCircuitTrial(item: QueuedRequest): void { - const key = item.circuitKey; - if (!key) return; - const state = this.circuitBreakers.get(key); - if (state && state.status === "half-open" && !state.trialInFlight) { - state.trialInFlight = true; - } - } - - private recordCircuitSuccess(metadata?: TrafficRequestMetadata): void { - const key = this.buildRateLimitKey(metadata); - this.circuitBreakers.delete(key); - } - - private recordCircuitFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void { - const status = this.extractStatusCode(error); - if (!this.isCircuitBreakerStatus(status)) { - this.circuitBreakers.delete(this.buildRateLimitKey(metadata)); - return; - } - - const key = this.buildRateLimitKey(metadata); - const now = Date.now(); - const state = - this.circuitBreakers.get(key) ?? - ({ status: "closed", failureTimestamps: [] } as CircuitState); - - state.failureTimestamps = state.failureTimestamps.filter( - (t) => now - t <= CIRCUIT_FAILURE_WINDOW_MS, - ); - state.failureTimestamps.push(now); - - if ( - state.status === "half-open" || - state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD - ) { - state.status = "open"; - state.openedAt = now; - state.trialInFlight = false; - } - - this.circuitBreakers.set(key, state); - } - - /* ============================================================ - * Usage - * ============================================================ - */ - - private recordUsage(item: QueuedRequest, result: TResponse): void { - const extractor = item.extractUsage ?? item.request.extractUsage; - if (!extractor) return; - - const usage = extractor(result); - if (!usage) return; - - if (this.isPromiseLike(usage)) { - void usage.then((u) => u && this.incrementTenantUsage(item.tenantId, u)); - } else { - this.incrementTenantUsage(item.tenantId, usage); - } - } - - private incrementTenantUsage(tenantId: string, usage: UsageCounters): void { - const current = this.tenantUsage.get(tenantId) ?? { - inputTokens: 0, - outputTokens: 0, - totalTokens: 0, - }; - - const input = usage.inputTokens ?? 0; - const output = usage.outputTokens ?? 0; - const total = usage.totalTokens ?? 
input + output; - - this.tenantUsage.set(tenantId, { - inputTokens: current.inputTokens + input, - outputTokens: current.outputTokens + output, - totalTokens: current.totalTokens + total, - }); + return this.circuitBreaker.resolve(next); } /* ============================================================ @@ -650,79 +292,6 @@ export class TrafficController { private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { return `${metadata?.provider ?? "default-provider"}::${metadata?.model ?? "default-model"}`; } - - private normalizeFallbackChains( - fallbackChains?: Record, - ): Map { - const chains = fallbackChains ?? DEFAULT_FALLBACK_CHAINS; - return new Map(Object.entries(chains)); - } - - private getRetryReason(error: unknown): RetryReason | undefined { - const status = this.extractStatusCode(error); - if (status === 429) return "rateLimit"; - if (status && status >= 500) return "serverError"; - if (status === 408 || this.isTimeoutError(error)) return "timeout"; - return undefined; - } - - private computeBackoffDelay(reason: RetryReason, attempt: number): number { - const base = - reason === "serverError" - ? SERVER_ERROR_BASE_BACKOFF_MS - : reason === "timeout" - ? TIMEOUT_BASE_BACKOFF_MS - : RATE_LIMIT_BASE_BACKOFF_MS; - - const jitter = - reason === "serverError" - ? SERVER_ERROR_JITTER_FACTOR - : reason === "timeout" - ? TIMEOUT_JITTER_FACTOR - : RATE_LIMIT_JITTER_FACTOR; - - const exp = base * 2 ** (attempt - 1); - return Math.round(exp + exp * jitter * Math.random()); - } - - private extractStatusCode(error: unknown): number | undefined { - const e = error as any; - return ( - this.coerceStatus(e?.status) ?? - this.coerceStatus(e?.statusCode) ?? - this.coerceStatus(e?.httpStatus) ?? - this.coerceStatus(e?.response?.status) ?? - this.coerceStatus(e?.cause?.status) - ); - } - - private isTimeoutError(error: unknown): boolean { - const e = error as any; - return ( - String(e?.code ?? "") - .toLowerCase() - .includes("timeout") || - String(e?.name ?? "") - .toLowerCase() - .includes("timeout") || - String(e?.message ?? "") - .toLowerCase() - .includes("timeout") - ); - } - - private isCircuitBreakerStatus(status?: number): boolean { - return status === 429 || (status !== undefined && status >= 500); - } - - private coerceStatus(value: unknown): number | undefined { - const n = Number(value); - return Number.isFinite(n) ? 
n : undefined; - } - - private isPromiseLike(value: unknown): value is PromiseLike { - return !!value && typeof (value as any).then === "function"; - } } /* ============================================================ @@ -730,18 +299,6 @@ export class TrafficController { * ============================================================ */ -export class CircuitBreakerOpenError extends Error { - readonly retryAfterMs?: number; - readonly metadata?: TrafficRequestMetadata; - - constructor(message: string, metadata?: TrafficRequestMetadata, retryAfterMs?: number) { - super(message); - this.name = "CircuitBreakerOpenError"; - this.metadata = metadata; - this.retryAfterMs = retryAfterMs; - } -} - let singletonController: TrafficController | undefined; export function getTrafficController(options?: TrafficControllerOptions): TrafficController { diff --git a/packages/core/src/traffic/traffic-error-utils.ts b/packages/core/src/traffic/traffic-error-utils.ts new file mode 100644 index 000000000..653329a99 --- /dev/null +++ b/packages/core/src/traffic/traffic-error-utils.ts @@ -0,0 +1,40 @@ +function readObjectProperty(value: unknown, key: string): unknown { + if (!value || typeof value !== "object") return undefined; + return (value as Record)[key]; +} + +export function coerceStatus(value: unknown): number | undefined { + const n = Number(value); + return Number.isFinite(n) ? n : undefined; +} + +export function extractStatusCode(error: unknown): number | undefined { + return ( + coerceStatus(readObjectProperty(error, "status")) ?? + coerceStatus(readObjectProperty(error, "statusCode")) ?? + coerceStatus(readObjectProperty(error, "httpStatus")) ?? + coerceStatus(readObjectProperty(readObjectProperty(error, "response"), "status")) ?? + coerceStatus(readObjectProperty(readObjectProperty(error, "cause"), "status")) + ); +} + +export function isTimeoutError(error: unknown): boolean { + const code = readObjectProperty(error, "code"); + const name = readObjectProperty(error, "name"); + const message = readObjectProperty(error, "message"); + return ( + String(code ?? "") + .toLowerCase() + .includes("timeout") || + String(name ?? "") + .toLowerCase() + .includes("timeout") || + String(message ?? 
"") + .toLowerCase() + .includes("timeout") + ); +} + +export function isPromiseLike(value: unknown): value is PromiseLike { + return !!value && typeof (value as { then?: unknown }).then === "function"; +} diff --git a/packages/core/src/traffic/traffic-errors.ts b/packages/core/src/traffic/traffic-errors.ts new file mode 100644 index 000000000..1c6166027 --- /dev/null +++ b/packages/core/src/traffic/traffic-errors.ts @@ -0,0 +1,13 @@ +import type { TrafficRequestMetadata } from "./traffic-types"; + +export class CircuitBreakerOpenError extends Error { + readonly retryAfterMs?: number; + readonly metadata?: TrafficRequestMetadata; + + constructor(message: string, metadata?: TrafficRequestMetadata, retryAfterMs?: number) { + super(message); + this.name = "CircuitBreakerOpenError"; + this.metadata = metadata; + this.retryAfterMs = retryAfterMs; + } +} diff --git a/packages/core/src/traffic/traffic-rate-limiter.ts b/packages/core/src/traffic/traffic-rate-limiter.ts new file mode 100644 index 000000000..fbcb9b224 --- /dev/null +++ b/packages/core/src/traffic/traffic-rate-limiter.ts @@ -0,0 +1,206 @@ +import { + RATE_LIMIT_EXHAUSTION_BUFFER, + RATE_LIMIT_MIN_PACE_INTERVAL_MS, + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS, + RATE_LIMIT_PROBE_DELAY_MS, +} from "./traffic-constants"; +import type { + DispatchDecision, + QueuedRequest, + RateLimitWindowState, +} from "./traffic-controller-internal"; +import type { TrafficRequestMetadata } from "./traffic-types"; + +export type RateLimitHeaderSnapshot = { + limitRequests: string; + remainingRequests: string; + resetRequests: string; + resetRequestsMs: number; +}; + +export type RateLimitUpdateResult = { + key: string; + headerSnapshot: RateLimitHeaderSnapshot; + state: RateLimitWindowState; +}; + +type SchedulerCallback = () => void; + +function readHeader(headers: unknown, name: string): string | undefined { + if (!headers) return undefined; + + if (typeof (headers as { get?: unknown }).get === "function") { + const v = (headers as { get: (name: string) => unknown }).get(name); + return v === null || v === undefined ? undefined : String(v); + } + + if (typeof headers !== "object") return undefined; + + const entries = Object.entries(headers as Record); + const target = name.toLowerCase(); + const match = entries.find(([k]) => String(k).toLowerCase() === target); + if (!match) return undefined; + + const value = match[1]; + if (Array.isArray(value)) { + const first = value[0]; + return first === null || first === undefined ? undefined : String(first); + } + return value === null || value === undefined ? undefined : String(value); +} + +function parseResetDurationToMs(raw: string): number | undefined { + const value = raw.trim(); + if (!value) return undefined; + + let totalMs = 0; + const regex = /(\d+(?:\.\d+)?)(ms|s|m|h|d)/g; + let matched = false; + for (const match of value.matchAll(regex)) { + matched = true; + const amount = Number.parseFloat(match[1] ?? ""); + if (!Number.isFinite(amount)) continue; + const unit = match[2]; + if (unit === "ms") totalMs += amount; + else if (unit === "s") totalMs += amount * 1000; + else if (unit === "m") totalMs += amount * 60_000; + else if (unit === "h") totalMs += amount * 3_600_000; + else if (unit === "d") totalMs += amount * 86_400_000; + } + + if (matched) { + return Math.round(totalMs); + } + + const n = Number(value); + return Number.isFinite(n) ? 
Math.round(n) : undefined; +} + +export class TrafficRateLimiter { + private readonly rateLimitStates = new Map(); + private wakeUpTimeout?: ReturnType; + private wakeUpAt?: number; + private readonly onWakeUp: SchedulerCallback; + + constructor(onWakeUp: SchedulerCallback) { + this.onWakeUp = onWakeUp; + } + + resolve(next: QueuedRequest, key: string): DispatchDecision | null { + const state = this.rateLimitStates.get(key); + if (!state) return null; + + const now = Date.now(); + const effectiveRemaining = Math.max(0, state.remaining - state.reserved); + const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; + + if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { + if (now < probeAt) { + return { kind: "wait", wakeUpAt: probeAt }; + } + if (state.reserved > 0) { + return { kind: "wait" }; + } + } + + if (now < state.nextAllowedAt) { + return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; + } + + state.reserved += 1; + next.rateLimitKey = key; + + const remainingWindowMs = Math.max(0, state.resetAt - now); + const intervalMs = Math.max( + RATE_LIMIT_MIN_PACE_INTERVAL_MS, + Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), + ); + + const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); + if ( + state.nextAllowedAt <= now || + candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS + ) { + state.nextAllowedAt = candidateNext; + } + + return null; + } + + scheduleWakeUpAt(wakeUpAt: number): void { + const now = Date.now(); + const target = Math.max(now, wakeUpAt); + + if (this.wakeUpTimeout && this.wakeUpAt !== undefined && this.wakeUpAt <= target) { + return; + } + + if (this.wakeUpTimeout) clearTimeout(this.wakeUpTimeout); + + this.wakeUpAt = target; + this.wakeUpTimeout = setTimeout( + () => { + this.wakeUpTimeout = undefined; + this.wakeUpAt = undefined; + this.onWakeUp(); + }, + Math.max(1, target - now), + ); + } + + releaseReservation(key?: string): void { + if (!key) return; + const state = this.rateLimitStates.get(key); + if (!state || state.reserved <= 0) return; + state.reserved -= 1; + } + + updateFromHeaders( + _metadata: TrafficRequestMetadata | undefined, + headers: unknown, + key: string, + ): RateLimitUpdateResult | undefined { + const limitRequests = readHeader(headers, "x-ratelimit-limit-requests"); + const remainingRequests = readHeader(headers, "x-ratelimit-remaining-requests"); + const resetRequests = readHeader(headers, "x-ratelimit-reset-requests"); + + if (!limitRequests || !remainingRequests || !resetRequests) return undefined; + + const limit = Number(limitRequests); + const remaining = Number(remainingRequests); + if (!Number.isFinite(limit) || !Number.isFinite(remaining)) return undefined; + + const resetRequestsMs = parseResetDurationToMs(resetRequests); + if (resetRequestsMs === undefined) return undefined; + + const now = Date.now(); + const parsedResetAt = now + resetRequestsMs; + + const existing = this.rateLimitStates.get(key); + const isSameWindow = !!existing && now < existing.resetAt; + const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; + const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; + const reserved = Math.max(0, existing?.reserved ?? 0); + + const state: RateLimitWindowState = { + limit, + remaining: isSameWindow ? 
Math.min(existing.remaining, remaining) : remaining, + resetAt, + reserved, + nextAllowedAt, + }; + + this.rateLimitStates.set(key, state); + + return { + key, + headerSnapshot: { + limitRequests, + remainingRequests, + resetRequests, + resetRequestsMs, + }, + state, + }; + } +} diff --git a/packages/core/src/traffic/traffic-retry.ts b/packages/core/src/traffic/traffic-retry.ts new file mode 100644 index 000000000..992ca4413 --- /dev/null +++ b/packages/core/src/traffic/traffic-retry.ts @@ -0,0 +1,56 @@ +import { + MAX_RETRY_ATTEMPTS, + RATE_LIMIT_BASE_BACKOFF_MS, + RATE_LIMIT_JITTER_FACTOR, + SERVER_ERROR_BASE_BACKOFF_MS, + SERVER_ERROR_JITTER_FACTOR, + TIMEOUT_BASE_BACKOFF_MS, + TIMEOUT_JITTER_FACTOR, + TIMEOUT_RETRY_ATTEMPTS, +} from "./traffic-constants"; +import { extractStatusCode, isTimeoutError } from "./traffic-error-utils"; + +export type RetryReason = "rateLimit" | "serverError" | "timeout"; + +export function buildRetryPlan( + error: unknown, + attempt: number, +): { delayMs: number; reason: RetryReason } | undefined { + const reason = getRetryReason(error); + if (!reason) return undefined; + + const max = reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS; + if (attempt >= max) return undefined; + + return { + reason, + delayMs: computeBackoffDelay(reason, attempt), + }; +} + +function getRetryReason(error: unknown): RetryReason | undefined { + const status = extractStatusCode(error); + if (status === 429) return "rateLimit"; + if (status && status >= 500) return "serverError"; + if (status === 408 || isTimeoutError(error)) return "timeout"; + return undefined; +} + +function computeBackoffDelay(reason: RetryReason, attempt: number): number { + const base = + reason === "serverError" + ? SERVER_ERROR_BASE_BACKOFF_MS + : reason === "timeout" + ? TIMEOUT_BASE_BACKOFF_MS + : RATE_LIMIT_BASE_BACKOFF_MS; + + const jitter = + reason === "serverError" + ? SERVER_ERROR_JITTER_FACTOR + : reason === "timeout" + ? 
TIMEOUT_JITTER_FACTOR + : RATE_LIMIT_JITTER_FACTOR; + + const exp = base * 2 ** (attempt - 1); + return Math.round(exp + exp * jitter * Math.random()); +} diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts new file mode 100644 index 000000000..ed3b4f316 --- /dev/null +++ b/packages/core/src/traffic/traffic-types.ts @@ -0,0 +1,55 @@ +import type { Logger } from "../logger"; + +type BivariantFunction = { + bivarianceHack(...args: TArgs): TReturn; +}["bivarianceHack"]; + +type UsageCounters = { + inputTokens?: number; + outputTokens?: number; + totalTokens?: number; +}; + +export type TrafficRequestType = "text" | "stream"; +export type TrafficPriority = "P0" | "P1" | "P2"; + +export interface TrafficRequestMetadata { + agentId?: string; + agentName?: string; + model?: string; + provider?: string; + priority?: TrafficPriority; + tenantId?: string; +} + +export interface TrafficRequest { + tenantId: string; + metadata?: TrafficRequestMetadata; + execute: () => Promise; + createFallbackRequest?: (modelId: string) => TrafficRequest | undefined; + extractUsage?: BivariantFunction< + [response: TResponse], + Promise | UsageCounters | undefined + >; +} + +export interface TrafficControllerOptions { + maxConcurrent?: number; + rateLimits?: RateLimitConfig; + logger?: Logger; + fallbackChains?: Record; +} + +export interface RateLimitOptions { + capacity: number; + refillPerSecond: number; +} + +export type RateLimitKey = string; +export type RateLimitConfig = Record; + +export type TenantUsage = { + inputTokens: number; + outputTokens: number; + totalTokens: number; +}; diff --git a/packages/core/src/traffic/traffic-usage-tracker.ts b/packages/core/src/traffic/traffic-usage-tracker.ts new file mode 100644 index 000000000..204f53808 --- /dev/null +++ b/packages/core/src/traffic/traffic-usage-tracker.ts @@ -0,0 +1,50 @@ +import type { QueuedRequest } from "./traffic-controller-internal"; +import { isPromiseLike } from "./traffic-error-utils"; +import type { TenantUsage } from "./traffic-types"; + +type UsageCounters = { + inputTokens?: number; + outputTokens?: number; + totalTokens?: number; +}; + +export class TrafficUsageTracker { + private readonly tenantUsage = new Map(); + + getTenantUsage(tenantId: string): TenantUsage | undefined { + const usage = this.tenantUsage.get(tenantId); + return usage ? { ...usage } : undefined; + } + + recordUsage(item: QueuedRequest, result: TResponse): void { + const extractor = item.extractUsage ?? item.request.extractUsage; + if (!extractor) return; + + const usage = extractor(result); + if (!usage) return; + + if (isPromiseLike(usage)) { + void usage.then((u) => u && this.incrementTenantUsage(item.tenantId, u)); + } else { + this.incrementTenantUsage(item.tenantId, usage); + } + } + + private incrementTenantUsage(tenantId: string, usage: UsageCounters): void { + const current = this.tenantUsage.get(tenantId) ?? { + inputTokens: 0, + outputTokens: 0, + totalTokens: 0, + }; + + const input = usage.inputTokens ?? 0; + const output = usage.outputTokens ?? 0; + const total = usage.totalTokens ?? 
input + output;
+
+    this.tenantUsage.set(tenantId, {
+      inputTokens: current.inputTokens + input,
+      outputTokens: current.outputTokens + output,
+      totalTokens: current.totalTokens + total,
+    });
+  }
+}

From ecf14a09ea98357b33ae9d81646b8bcc2d4a6079 Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Sun, 14 Dec 2025 14:44:11 +0530
Subject: [PATCH 11/41] feat: refactor

---
 package.json | 3 +-
 .../src/traffic/traffic-circuit-breaker.ts | 105 +++++++-
 .../traffic/traffic-controller-internal.ts | 7 +-
 .../core/src/traffic/traffic-controller.ts | 184 ++++++++++++-
 .../core/src/traffic/traffic-error-utils.ts | 37 ++-
 .../core/src/traffic/traffic-rate-limiter.ts | 236 ++++++++++++++---
 packages/core/src/traffic/traffic-retry.ts | 30 ++-
 .../core/src/traffic/traffic-usage-tracker.ts | 31 ++-
 pnpm-lock.yaml | 106 ++++++--
 tmp/test/traffic-priority-openai-sim.ts | 3 +
 tmp/test/traffic-priority.ts | 3 +
 .../traffic-rate-limit-openai-window-sim.ts | 247 ++++++++++++++++++
 tmp/test/traffic-rate-limit-static.ts | 9 +-
 13 files changed, 885 insertions(+), 116 deletions(-)
 create mode 100644 tmp/test/traffic-rate-limit-openai-window-sim.ts

diff --git a/package.json b/package.json
index 7c80f7c59..7e3ef8ba1 100644
--- a/package.json
+++ b/package.json
@@ -32,9 +32,10 @@
     "publint": "^0.3.8",
     "rimraf": "^5.0.5",
     "syncpack": "^13.0.2",
+    "ts-node": "^10.9.2",
     "tslib": "^2.3.0",
     "tsup": "^8.5.0",
-    "typescript": "^5.8.2",
+    "typescript": "^5.9.2",
     "vite": "^7.2.7",
     "vitest": "^3.2.4"
   },
diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts
index 9cc407995..be11b6d36 100644
--- a/packages/core/src/traffic/traffic-circuit-breaker.ts
+++ b/packages/core/src/traffic/traffic-circuit-breaker.ts
@@ -1,3 +1,4 @@
+import type { Logger } from "../logger";
 import {
   CIRCUIT_COOLDOWN_MS,
   CIRCUIT_FAILURE_THRESHOLD,
@@ -28,22 +29,40 @@ export class TrafficCircuitBreaker {
     this.fallbackChains = new Map(Object.entries(chains));
   }

-  resolve(next: QueuedRequest): DispatchDecision | null {
+  resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
+    const circuitLogger = logger?.child({ module: "circuit-breaker" });
     const visited = new Set();

     while (true) {
       const key = this.buildRateLimitKey(next.request.metadata);
       next.circuitKey = key;
+      circuitLogger?.trace?.("Circuit resolve step", {
+        circuitKey: key,
+        provider: next.request.metadata?.provider,
+        model: next.request.metadata?.model,
+      });

       const model = next.request.metadata?.model;
       if (model) visited.add(model);

-      const evaluation = this.evaluateCircuitState(key);
+      const evaluation = this.evaluateCircuitState(key, circuitLogger);
       next.circuitStatus = evaluation.state;
+      circuitLogger?.debug?.("Circuit evaluated", {
+        circuitKey: key,
+        state: evaluation.state,
+        allowRequest: evaluation.allowRequest,
+        retryAfterMs: evaluation.retryAfterMs,
+      });

       if (evaluation.allowRequest) return null;

-      const fallback = this.findFallbackModel(next.request.metadata, visited);
+      const fallback = this.findFallbackModel(next.request.metadata, visited, circuitLogger);
+      circuitLogger?.debug?.("Circuit open; attempting fallback", {
+        circuitKey: key,
+        currentModel: next.request.metadata?.model,
+        fallback,
+        visitedModels: Array.from(visited),
+      });
       if (!fallback || !next.request.createFallbackRequest) {
         next.reject(
           new CircuitBreakerOpenError(
             `Circuit open for ${key}`,
             next.request.metadata,
             evaluation.retryAfterMs,
           ),
         );
+        circuitLogger?.warn?.("No fallback available; rejecting request", {
+          circuitKey: 
key, + retryAfterMs: evaluation.retryAfterMs, + }); return { kind: "skip" }; } const fallbackRequest = next.request.createFallbackRequest(fallback); - if (!fallbackRequest) return { kind: "skip" }; + if (!fallbackRequest) { + circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", { + circuitKey: key, + fallback, + }); + return { kind: "skip" }; + } next.request = fallbackRequest; next.attempt = 1; @@ -64,27 +93,54 @@ export class TrafficCircuitBreaker { next.etaMs = undefined; next.circuitKey = undefined; next.circuitStatus = undefined; + circuitLogger?.debug?.("Switched to fallback request", { + previousCircuitKey: key, + fallbackModel: fallback, + }); } } - markTrial(item: QueuedRequest): void { + markTrial(item: QueuedRequest, logger?: Logger): void { + const circuitLogger = logger?.child({ module: "circuit-breaker" }); const key = item.circuitKey; if (!key) return; const state = this.circuitBreakers.get(key); if (state && state.status === "half-open" && !state.trialInFlight) { state.trialInFlight = true; + circuitLogger?.debug?.("Marked half-open trial in flight", { circuitKey: key }); } } - recordSuccess(metadata?: TrafficRequestMetadata): void { + recordSuccess(metadata?: TrafficRequestMetadata, logger?: Logger): void { + const circuitLogger = logger?.child({ module: "circuit-breaker" }); const key = this.buildRateLimitKey(metadata); this.circuitBreakers.delete(key); + circuitLogger?.debug?.("Circuit success; cleared circuit state", { + circuitKey: key, + provider: metadata?.provider, + model: metadata?.model, + }); } - recordFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void { - const status = extractStatusCode(error); + recordFailure( + metadata: TrafficRequestMetadata | undefined, + error: unknown, + logger?: Logger, + ): void { + const circuitLogger = logger?.child({ module: "circuit-breaker" }); + const status = extractStatusCode(error, logger); + circuitLogger?.debug?.("Circuit failure observed", { + circuitKey: this.buildRateLimitKey(metadata), + status, + provider: metadata?.provider, + model: metadata?.model, + }); if (!this.isCircuitBreakerStatus(status)) { this.circuitBreakers.delete(this.buildRateLimitKey(metadata)); + circuitLogger?.debug?.("Failure not eligible for circuit breaker; cleared circuit state", { + circuitKey: this.buildRateLimitKey(metadata), + status, + }); return; } @@ -106,18 +162,36 @@ export class TrafficCircuitBreaker { state.status = "open"; state.openedAt = now; state.trialInFlight = false; + circuitLogger?.warn?.("Circuit opened", { + circuitKey: key, + failureCount: state.failureTimestamps.length, + threshold: CIRCUIT_FAILURE_THRESHOLD, + openedAt: state.openedAt, + }); } this.circuitBreakers.set(key, state); + circuitLogger?.trace?.("Circuit state updated", { + circuitKey: key, + status: state.status, + failureCount: state.failureTimestamps.length, + windowMs: CIRCUIT_FAILURE_WINDOW_MS, + }); } - private evaluateCircuitState(key: string): { + private evaluateCircuitState( + key: string, + logger?: Logger, + ): { allowRequest: boolean; state: CircuitStateStatus; retryAfterMs?: number; } { const state = this.circuitBreakers.get(key); - if (!state) return { allowRequest: true, state: "closed" }; + if (!state) { + logger?.trace?.("Circuit state missing; allow request", { circuitKey: key }); + return { allowRequest: true, state: "closed" }; + } const now = Date.now(); @@ -127,6 +201,7 @@ export class TrafficCircuitBreaker { state.status = "half-open"; state.trialInFlight = false; state.failureTimestamps = 
[]; + logger?.debug?.("Circuit transitioned to half-open", { circuitKey: key }); return { allowRequest: true, state: "half-open" }; } return { @@ -146,14 +221,17 @@ export class TrafficCircuitBreaker { private findFallbackModel( metadata: TrafficRequestMetadata | undefined, visitedModels: Set, + logger?: Logger, ): string | undefined { const currentModel = metadata?.model; if (!currentModel) { + logger?.trace?.("No current model; no fallback", {}); return undefined; } const chain = this.fallbackChains.get(currentModel); if (!chain) { + logger?.trace?.("No fallback chain for model", { currentModel }); return undefined; } @@ -168,9 +246,14 @@ export class TrafficCircuitBreaker { model: candidate, }); - const evaluation = this.evaluateCircuitState(candidateKey); + const evaluation = this.evaluateCircuitState(candidateKey, logger); if (evaluation.allowRequest) { visitedModels.add(candidate); + logger?.debug?.("Selected fallback model", { + currentModel, + fallbackModel: candidate, + fallbackCircuitKey: candidateKey, + }); return candidate; } } diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts index 3b9c01244..48a3b9894 100644 --- a/packages/core/src/traffic/traffic-controller-internal.ts +++ b/packages/core/src/traffic/traffic-controller-internal.ts @@ -1,9 +1,4 @@ -import type { - TrafficPriority, - TrafficRequest, - TrafficRequestMetadata, - TrafficRequestType, -} from "./traffic-types"; +import type { TrafficPriority, TrafficRequest, TrafficRequestType } from "./traffic-types"; export type Scheduler = (callback: () => void) => void; diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 2169c36f0..e2d24acb0 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -43,6 +43,8 @@ export class TrafficController { private readonly scheduler: Scheduler; private readonly maxConcurrent: number; private readonly logger: Logger; + private readonly trafficLogger: Logger; + private readonly controllerLogger: Logger; private readonly queues: Record = { P0: [], @@ -67,11 +69,18 @@ export class TrafficController { this.maxConcurrent = options.maxConcurrent ?? 
Number.POSITIVE_INFINITY; this.scheduler = this.createScheduler(); this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); + this.trafficLogger = this.logger.child({ subsystem: "traffic" }); + this.controllerLogger = this.trafficLogger.child({ module: "controller" }); this.rateLimiter = new TrafficRateLimiter(() => this.scheduleDrain()); this.circuitBreaker = new TrafficCircuitBreaker({ fallbackChains: options.fallbackChains, buildRateLimitKey: (metadata) => this.buildRateLimitKey(metadata), }); + + this.controllerLogger.debug("Initialized TrafficController", { + maxConcurrent: this.maxConcurrent, + hasFallbackChains: !!options.fallbackChains, + }); } /* ============================================================ @@ -80,10 +89,22 @@ export class TrafficController { */ handleText(request: TrafficRequest): Promise { + this.controllerLogger.trace("handleText called", { + tenantId: request.tenantId, + provider: request.metadata?.provider, + model: request.metadata?.model, + priority: request.metadata?.priority, + }); return this.enqueue("text", request); } handleStream(request: TrafficRequest): Promise { + this.controllerLogger.trace("handleStream called", { + tenantId: request.tenantId, + provider: request.metadata?.provider, + model: request.metadata?.model, + priority: request.metadata?.priority, + }); return this.enqueue("stream", request); } @@ -92,10 +113,35 @@ export class TrafficController { headers: unknown, ): RateLimitUpdateResult | undefined { const key = this.buildRateLimitKey(metadata); - return this.rateLimiter.updateFromHeaders(metadata, headers, key); + this.controllerLogger.debug("updateRateLimitFromHeaders called", { + rateLimitKey: key, + provider: metadata?.provider, + model: metadata?.model, + }); + + const update = this.rateLimiter.updateFromHeaders(metadata, headers, key, this.trafficLogger); + if (!update) { + this.controllerLogger.debug("updateRateLimitFromHeaders skipped (no headers applied)", { + rateLimitKey: key, + }); + return undefined; + } + + this.controllerLogger.debug("Rate limit headers applied", { + rateLimitKey: update.key, + limit: update.state.limit, + remaining: update.state.remaining, + reserved: update.state.reserved, + resetAt: update.state.resetAt, + nextAllowedAt: update.state.nextAllowedAt, + resetRequestsMs: update.headerSnapshot.resetRequestsMs, + }); + + return update; } getTenantUsage(tenantId: string): TenantUsage | undefined { + this.controllerLogger.trace("getTenantUsage called", { tenantId }); return this.usageTracker.getTenantUsage(tenantId); } @@ -114,6 +160,13 @@ export class TrafficController { ): Promise { return new Promise((resolve, reject) => { const priority = this.resolvePriority(request.metadata); + this.controllerLogger.debug("Enqueue request", { + type, + tenantId: request.tenantId, + priority, + provider: request.metadata?.provider, + model: request.metadata?.model, + }); this.queues[priority].push({ type, request, @@ -132,18 +185,34 @@ export class TrafficController { if (this.drainScheduled) return; this.drainScheduled = true; + this.controllerLogger.trace("Drain scheduled"); this.scheduler(() => { this.drainScheduled = false; + this.controllerLogger.trace("Drain tick"); this.drainQueue(); }); } private drainQueue(): void { + this.controllerLogger.trace("Drain start", { + activeCount: this.activeCount, + maxConcurrent: this.maxConcurrent, + queuedP0: this.queues.P0.length, + queuedP1: this.queues.P1.length, + queuedP2: this.queues.P2.length, + }); while (true) { const decision = 
this.tryDispatchNext(); + this.controllerLogger.trace("Dispatch decision", decision); if (decision.kind === "dispatch" || decision.kind === "skip") continue; if (decision.kind === "wait") { - if (decision.wakeUpAt) this.scheduleRateLimitWakeUpAt(decision.wakeUpAt); + if (decision.wakeUpAt) { + this.controllerLogger.debug("Rate limit wait; scheduling wakeup", { + wakeUpAt: decision.wakeUpAt, + inMs: Math.max(0, decision.wakeUpAt - Date.now()), + }); + this.scheduleRateLimitWakeUpAt(decision.wakeUpAt); + } return; } return; @@ -164,8 +233,23 @@ export class TrafficController { const next = this.queues[priority][0]; if (!next) continue; + this.controllerLogger.trace("Evaluate next queued request", { + priority, + type: next.type, + tenantId: next.tenantId, + attempt: next.attempt, + provider: next.request.metadata?.provider, + model: next.request.metadata?.model, + }); + const circuit = this.resolveCircuit(next); if (circuit) { + this.controllerLogger.trace("Circuit resolution returned decision", { + priority, + decision: circuit, + circuitKey: next.circuitKey, + circuitStatus: next.circuitStatus, + }); if (circuit.kind === "skip") { this.queues[priority].shift(); return { kind: "skip" }; @@ -183,6 +267,11 @@ export class TrafficController { const rateLimit = this.resolveRateLimit(next); if (rateLimit) { + this.controllerLogger.trace("Rate limit resolution returned decision", { + priority, + decision: rateLimit, + rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), + }); if (rateLimit.kind === "wait" && rateLimit.wakeUpAt !== undefined) { earliestWakeUpAt = earliestWakeUpAt === undefined @@ -202,9 +291,18 @@ export class TrafficController { } private startRequest(item: QueuedRequest): void { + this.controllerLogger.debug("Start request", { + priority: item.priority, + type: item.type, + tenantId: item.tenantId, + attempt: item.attempt, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + }); this.queues[item.priority].shift(); this.activeCount++; - this.circuitBreaker.markTrial(item); + this.rateLimiter.notifyDispatch(item.rateLimitKey, this.trafficLogger); + this.circuitBreaker.markTrial(item, this.trafficLogger); void this.executeRequest(item); } @@ -214,23 +312,74 @@ export class TrafficController { */ private async executeRequest(item: QueuedRequest): Promise { + const startedAt = Date.now(); try { + this.controllerLogger.debug("Execute request", { + priority: item.priority, + type: item.type, + tenantId: item.tenantId, + attempt: item.attempt, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + rateLimitKey: item.rateLimitKey, + circuitKey: item.circuitKey, + circuitStatus: item.circuitStatus, + activeCount: this.activeCount, + }); const result = await item.request.execute(); - this.circuitBreaker.recordSuccess(item.request.metadata); - this.usageTracker.recordUsage(item, result); + this.controllerLogger.debug("Request succeeded", { + tenantId: item.tenantId, + attempt: item.attempt, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + elapsedMs: Date.now() - startedAt, + }); + this.circuitBreaker.recordSuccess(item.request.metadata, this.trafficLogger); + this.usageTracker.recordUsage(item, result, this.trafficLogger); item.resolve(result); } catch (error) { - this.circuitBreaker.recordFailure(item.request.metadata, error); + this.controllerLogger.warn("Request failed", { + tenantId: item.tenantId, + attempt: item.attempt, + provider: 
item.request.metadata?.provider, + model: item.request.metadata?.model, + elapsedMs: Date.now() - startedAt, + errorName: (error as { name?: unknown } | null)?.name, + errorMessage: (error as { message?: unknown } | null)?.message, + status: (error as { status?: unknown } | null)?.status, + statusCode: (error as { statusCode?: unknown } | null)?.statusCode, + }); + this.circuitBreaker.recordFailure(item.request.metadata, error, this.trafficLogger); - const retry = buildRetryPlan(error, item.attempt); + const retry = buildRetryPlan(error, item.attempt, this.trafficLogger); if (retry) { + this.controllerLogger.debug("Retrying request", { + tenantId: item.tenantId, + attempt: item.attempt, + nextAttempt: item.attempt + 1, + reason: retry.reason, + delayMs: retry.delayMs, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + }); this.scheduleRetry(item, retry); } else { + this.controllerLogger.debug("No retry plan; rejecting request", { + tenantId: item.tenantId, + attempt: item.attempt, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + }); item.reject(error); } } finally { - this.rateLimiter.releaseReservation(item.rateLimitKey); + this.rateLimiter.releaseReservation(item.rateLimitKey, this.trafficLogger); this.activeCount = Math.max(0, this.activeCount - 1); + this.controllerLogger.trace("Request finished; slot released", { + tenantId: item.tenantId, + activeCount: this.activeCount, + maxConcurrent: this.maxConcurrent, + }); this.scheduleDrain(); } } @@ -244,7 +393,20 @@ export class TrafficController { item: QueuedRequest, plan: { delayMs: number; reason: RetryReason }, ): void { + this.controllerLogger.debug("Schedule retry", { + tenantId: item.tenantId, + priority: item.priority, + currentAttempt: item.attempt, + nextAttempt: item.attempt + 1, + reason: plan.reason, + delayMs: plan.delayMs, + }); setTimeout(() => { + this.controllerLogger.debug("Retry timer fired", { + tenantId: item.tenantId, + priority: item.priority, + nextAttempt: item.attempt + 1, + }); this.queues[item.priority].push({ ...item, attempt: item.attempt + 1, @@ -264,11 +426,11 @@ export class TrafficController { private resolveRateLimit(next: QueuedRequest): DispatchDecision | null { const key = this.buildRateLimitKey(next.request.metadata); - return this.rateLimiter.resolve(next, key); + return this.rateLimiter.resolve(next, key, this.trafficLogger); } private scheduleRateLimitWakeUpAt(wakeUpAt: number): void { - this.rateLimiter.scheduleWakeUpAt(wakeUpAt); + this.rateLimiter.scheduleWakeUpAt(wakeUpAt, this.trafficLogger); } /* ============================================================ @@ -277,7 +439,7 @@ export class TrafficController { */ private resolveCircuit(next: QueuedRequest): DispatchDecision | null { - return this.circuitBreaker.resolve(next); + return this.circuitBreaker.resolve(next, this.trafficLogger); } /* ============================================================ diff --git a/packages/core/src/traffic/traffic-error-utils.ts b/packages/core/src/traffic/traffic-error-utils.ts index 653329a99..d765e8ae4 100644 --- a/packages/core/src/traffic/traffic-error-utils.ts +++ b/packages/core/src/traffic/traffic-error-utils.ts @@ -1,3 +1,5 @@ +import type { Logger } from "../logger"; + function readObjectProperty(value: unknown, key: string): unknown { if (!value || typeof value !== "object") return undefined; return (value as Record)[key]; @@ -8,21 +10,32 @@ export function coerceStatus(value: unknown): number | undefined { return 
Number.isFinite(n) ? n : undefined; } -export function extractStatusCode(error: unknown): number | undefined { - return ( +export function extractStatusCode(error: unknown, logger?: Logger): number | undefined { + const status = coerceStatus(readObjectProperty(error, "status")) ?? coerceStatus(readObjectProperty(error, "statusCode")) ?? coerceStatus(readObjectProperty(error, "httpStatus")) ?? coerceStatus(readObjectProperty(readObjectProperty(error, "response"), "status")) ?? - coerceStatus(readObjectProperty(readObjectProperty(error, "cause"), "status")) - ); + coerceStatus(readObjectProperty(readObjectProperty(error, "cause"), "status")); + + logger?.trace?.("Extracted status code", { + status, + hasStatus: readObjectProperty(error, "status") !== undefined, + hasStatusCode: readObjectProperty(error, "statusCode") !== undefined, + hasHttpStatus: readObjectProperty(error, "httpStatus") !== undefined, + hasResponseStatus: + readObjectProperty(readObjectProperty(error, "response"), "status") !== undefined, + hasCauseStatus: readObjectProperty(readObjectProperty(error, "cause"), "status") !== undefined, + }); + + return status; } -export function isTimeoutError(error: unknown): boolean { +export function isTimeoutError(error: unknown, logger?: Logger): boolean { const code = readObjectProperty(error, "code"); const name = readObjectProperty(error, "name"); const message = readObjectProperty(error, "message"); - return ( + const isTimeout = String(code ?? "") .toLowerCase() .includes("timeout") || @@ -31,8 +44,16 @@ export function isTimeoutError(error: unknown): boolean { .includes("timeout") || String(message ?? "") .toLowerCase() - .includes("timeout") - ); + .includes("timeout"); + + logger?.trace?.("Checked timeout error", { + isTimeout, + code, + name, + messagePreview: typeof message === "string" ? message.slice(0, 160) : message, + }); + + return isTimeout; } export function isPromiseLike(value: unknown): value is PromiseLike { diff --git a/packages/core/src/traffic/traffic-rate-limiter.ts b/packages/core/src/traffic/traffic-rate-limiter.ts index fbcb9b224..ff24a221d 100644 --- a/packages/core/src/traffic/traffic-rate-limiter.ts +++ b/packages/core/src/traffic/traffic-rate-limiter.ts @@ -1,3 +1,4 @@ +import type { Logger } from "../logger"; import { RATE_LIMIT_EXHAUSTION_BUFFER, RATE_LIMIT_MIN_PACE_INTERVAL_MS, @@ -26,6 +27,17 @@ export type RateLimitUpdateResult = { type SchedulerCallback = () => void; +export interface RateLimitStrategy { + resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null; + onDispatch(logger?: Logger): void; + onComplete(logger?: Logger): void; + updateFromHeaders( + metadata: TrafficRequestMetadata | undefined, + headers: unknown, + logger?: Logger, + ): RateLimitUpdateResult | undefined; +} + function readHeader(headers: unknown, name: string): string | undefined { if (!headers) return undefined; @@ -76,19 +88,23 @@ function parseResetDurationToMs(raw: string): number | undefined { return Number.isFinite(n) ? 
Math.round(n) : undefined; } -export class TrafficRateLimiter { - private readonly rateLimitStates = new Map(); - private wakeUpTimeout?: ReturnType; - private wakeUpAt?: number; - private readonly onWakeUp: SchedulerCallback; +export class DefaultRateLimitStrategy implements RateLimitStrategy { + private state?: RateLimitWindowState; + private readonly key: string; - constructor(onWakeUp: SchedulerCallback) { - this.onWakeUp = onWakeUp; + constructor(key: string) { + this.key = key; } - resolve(next: QueuedRequest, key: string): DispatchDecision | null { - const state = this.rateLimitStates.get(key); - if (!state) return null; + resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const state = this.state; + if (!state) { + rateLimitLogger?.trace?.("Rate limit state missing; allow request", { + rateLimitKey: this.key, + }); + return null; + } const now = Date.now(); const effectiveRemaining = Math.max(0, state.remaining - state.reserved); @@ -96,19 +112,47 @@ export class TrafficRateLimiter { if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { if (now < probeAt) { + rateLimitLogger?.debug?.("Rate limit exhausted; waiting for probe", { + rateLimitKey: this.key, + remaining: state.remaining, + reserved: state.reserved, + effectiveRemaining, + resetAt: state.resetAt, + probeAt, + }); return { kind: "wait", wakeUpAt: probeAt }; } if (state.reserved > 0) { + rateLimitLogger?.debug?.("Rate limit exhausted but in-flight reservations exist; waiting", { + rateLimitKey: this.key, + remaining: state.remaining, + reserved: state.reserved, + effectiveRemaining, + resetAt: state.resetAt, + }); return { kind: "wait" }; } } if (now < state.nextAllowedAt) { + rateLimitLogger?.debug?.("Rate limit pacing; waiting until nextAllowedAt", { + rateLimitKey: this.key, + nextAllowedAt: state.nextAllowedAt, + resetAt: state.resetAt, + waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now, + }); return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; } state.reserved += 1; - next.rateLimitKey = key; + next.rateLimitKey = this.key; + rateLimitLogger?.trace?.("Reserved rate limit token", { + rateLimitKey: this.key, + reserved: state.reserved, + remaining: state.remaining, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + }); const remainingWindowMs = Math.max(0, state.resetAt - now); const intervalMs = Math.max( @@ -122,61 +166,78 @@ export class TrafficRateLimiter { candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS ) { state.nextAllowedAt = candidateNext; + rateLimitLogger?.trace?.("Updated pacing nextAllowedAt", { + rateLimitKey: this.key, + nextAllowedAt: state.nextAllowedAt, + intervalMs, + remainingWindowMs, + effectiveRemaining, + }); } return null; } - scheduleWakeUpAt(wakeUpAt: number): void { - const now = Date.now(); - const target = Math.max(now, wakeUpAt); - - if (this.wakeUpTimeout && this.wakeUpAt !== undefined && this.wakeUpAt <= target) { - return; - } + onDispatch(_logger?: Logger): void {} - if (this.wakeUpTimeout) clearTimeout(this.wakeUpTimeout); - - this.wakeUpAt = target; - this.wakeUpTimeout = setTimeout( - () => { - this.wakeUpTimeout = undefined; - this.wakeUpAt = undefined; - this.onWakeUp(); - }, - Math.max(1, target - now), - ); - } - - releaseReservation(key?: string): void { - if (!key) return; - const state = this.rateLimitStates.get(key); + onComplete(logger?: Logger): void { + const rateLimitLogger = 
logger?.child({ module: "rate-limiter" }); + const state = this.state; if (!state || state.reserved <= 0) return; state.reserved -= 1; + rateLimitLogger?.trace?.("Released rate limit reservation", { + rateLimitKey: this.key, + reserved: state.reserved, + remaining: state.remaining, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + }); } updateFromHeaders( _metadata: TrafficRequestMetadata | undefined, headers: unknown, - key: string, + logger?: Logger, ): RateLimitUpdateResult | undefined { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); const limitRequests = readHeader(headers, "x-ratelimit-limit-requests"); const remainingRequests = readHeader(headers, "x-ratelimit-remaining-requests"); const resetRequests = readHeader(headers, "x-ratelimit-reset-requests"); - if (!limitRequests || !remainingRequests || !resetRequests) return undefined; + if (!limitRequests || !remainingRequests || !resetRequests) { + rateLimitLogger?.trace?.("Missing rate limit headers; skipping", { + rateLimitKey: this.key, + hasLimit: !!limitRequests, + hasRemaining: !!remainingRequests, + hasReset: !!resetRequests, + }); + return undefined; + } const limit = Number(limitRequests); const remaining = Number(remainingRequests); - if (!Number.isFinite(limit) || !Number.isFinite(remaining)) return undefined; + if (!Number.isFinite(limit) || !Number.isFinite(remaining)) { + rateLimitLogger?.debug?.("Invalid rate limit numeric headers; skipping", { + rateLimitKey: this.key, + limitRequests, + remainingRequests, + }); + return undefined; + } const resetRequestsMs = parseResetDurationToMs(resetRequests); - if (resetRequestsMs === undefined) return undefined; + if (resetRequestsMs === undefined) { + rateLimitLogger?.debug?.("Unable to parse reset duration; skipping", { + rateLimitKey: this.key, + resetRequests, + }); + return undefined; + } const now = Date.now(); const parsedResetAt = now + resetRequestsMs; - const existing = this.rateLimitStates.get(key); + const existing = this.state; const isSameWindow = !!existing && now < existing.resetAt; const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; @@ -190,10 +251,21 @@ export class TrafficRateLimiter { nextAllowedAt, }; - this.rateLimitStates.set(key, state); + this.state = state; + rateLimitLogger?.debug?.("Applied rate limit headers to state", { + rateLimitKey: this.key, + limit, + remaining, + effectiveRemaining: Math.max(0, state.remaining - state.reserved), + resetAt, + nextAllowedAt, + isSameWindow, + parsedResetAt, + resetRequestsMs, + }); return { - key, + key: this.key, headerSnapshot: { limitRequests, remainingRequests, @@ -204,3 +276,85 @@ export class TrafficRateLimiter { }; } } + +export type RateLimitStrategyFactory = (key: string) => RateLimitStrategy; + +export class TrafficRateLimiter { + private readonly strategies = new Map(); + private wakeUpTimeout?: ReturnType; + private wakeUpAt?: number; + private readonly onWakeUp: SchedulerCallback; + private readonly strategyFactory: RateLimitStrategyFactory; + + constructor(onWakeUp: SchedulerCallback, strategyFactory?: RateLimitStrategyFactory) { + this.onWakeUp = onWakeUp; + this.strategyFactory = strategyFactory ?? 
((key) => new DefaultRateLimitStrategy(key)); + } + + resolve(next: QueuedRequest, key: string, logger?: Logger): DispatchDecision | null { + const strategy = this.strategies.get(key); + if (!strategy) { + logger + ?.child({ module: "rate-limiter" }) + ?.trace?.("Rate limit state missing; allow request", { rateLimitKey: key }); + return null; + } + return strategy.resolve(next, logger); + } + + notifyDispatch(key: string | undefined, logger?: Logger): void { + if (!key) return; + this.strategies.get(key)?.onDispatch(logger); + } + + scheduleWakeUpAt(wakeUpAt: number, logger?: Logger): void { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const now = Date.now(); + const target = Math.max(now, wakeUpAt); + + if (this.wakeUpTimeout && this.wakeUpAt !== undefined && this.wakeUpAt <= target) { + rateLimitLogger?.trace?.("Wakeup already scheduled earlier; skipping", { + currentWakeUpAt: this.wakeUpAt, + requestedWakeUpAt: target, + }); + return; + } + + if (this.wakeUpTimeout) clearTimeout(this.wakeUpTimeout); + + this.wakeUpAt = target; + rateLimitLogger?.debug?.("Scheduling rate limit wakeup", { + wakeUpAt: target, + inMs: Math.max(1, target - now), + }); + this.wakeUpTimeout = setTimeout( + () => { + this.wakeUpTimeout = undefined; + this.wakeUpAt = undefined; + rateLimitLogger?.debug?.("Rate limit wakeup fired"); + this.onWakeUp(); + }, + Math.max(1, target - now), + ); + } + + releaseReservation(key?: string, logger?: Logger): void { + if (!key) return; + this.strategies.get(key)?.onComplete(logger); + } + + updateFromHeaders( + metadata: TrafficRequestMetadata | undefined, + headers: unknown, + key: string, + logger?: Logger, + ): RateLimitUpdateResult | undefined { + const existing = this.strategies.get(key); + if (existing) return existing.updateFromHeaders(metadata, headers, logger); + const created = this.strategyFactory(key); + const update = created.updateFromHeaders(metadata, headers, logger); + if (!update) return undefined; + this.strategies.set(key, created); + return update; + } +} diff --git a/packages/core/src/traffic/traffic-retry.ts b/packages/core/src/traffic/traffic-retry.ts index 992ca4413..056a552d3 100644 --- a/packages/core/src/traffic/traffic-retry.ts +++ b/packages/core/src/traffic/traffic-retry.ts @@ -1,3 +1,4 @@ +import type { Logger } from "../logger"; import { MAX_RETRY_ATTEMPTS, RATE_LIMIT_BASE_BACKOFF_MS, @@ -15,24 +16,39 @@ export type RetryReason = "rateLimit" | "serverError" | "timeout"; export function buildRetryPlan( error: unknown, attempt: number, + logger?: Logger, ): { delayMs: number; reason: RetryReason } | undefined { - const reason = getRetryReason(error); - if (!reason) return undefined; + const retryLogger = logger?.child({ module: "retry" }); + const reason = getRetryReason(error, retryLogger); + if (!reason) { + retryLogger?.debug?.("No retry reason detected; skipping retry", { attempt }); + return undefined; + } const max = reason === "timeout" ? 
TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS; - if (attempt >= max) return undefined; + if (attempt >= max) { + retryLogger?.debug?.("Retry attempts exhausted; skipping retry", { + attempt, + max, + reason, + }); + return undefined; + } + + const delayMs = computeBackoffDelay(reason, attempt); + retryLogger?.debug?.("Retry plan built", { attempt, reason, delayMs, max }); return { reason, - delayMs: computeBackoffDelay(reason, attempt), + delayMs, }; } -function getRetryReason(error: unknown): RetryReason | undefined { - const status = extractStatusCode(error); +function getRetryReason(error: unknown, logger?: Logger): RetryReason | undefined { + const status = extractStatusCode(error, logger); if (status === 429) return "rateLimit"; if (status && status >= 500) return "serverError"; - if (status === 408 || isTimeoutError(error)) return "timeout"; + if (status === 408 || isTimeoutError(error, logger)) return "timeout"; return undefined; } diff --git a/packages/core/src/traffic/traffic-usage-tracker.ts b/packages/core/src/traffic/traffic-usage-tracker.ts index 204f53808..e875d21c2 100644 --- a/packages/core/src/traffic/traffic-usage-tracker.ts +++ b/packages/core/src/traffic/traffic-usage-tracker.ts @@ -1,3 +1,4 @@ +import type { Logger } from "../logger"; import type { QueuedRequest } from "./traffic-controller-internal"; import { isPromiseLike } from "./traffic-error-utils"; import type { TenantUsage } from "./traffic-types"; @@ -16,21 +17,33 @@ export class TrafficUsageTracker { return usage ? { ...usage } : undefined; } - recordUsage(item: QueuedRequest, result: TResponse): void { + recordUsage(item: QueuedRequest, result: TResponse, logger?: Logger): void { + const usageLogger = logger?.child({ module: "usage-tracker" }); const extractor = item.extractUsage ?? item.request.extractUsage; - if (!extractor) return; + if (!extractor) { + usageLogger?.trace?.("No usage extractor; skipping usage", { tenantId: item.tenantId }); + return; + } const usage = extractor(result); - if (!usage) return; + if (!usage) { + usageLogger?.trace?.("Usage extractor returned empty; skipping usage", { + tenantId: item.tenantId, + }); + return; + } if (isPromiseLike(usage)) { - void usage.then((u) => u && this.incrementTenantUsage(item.tenantId, u)); + usageLogger?.trace?.("Usage extractor returned promise; awaiting", { + tenantId: item.tenantId, + }); + void usage.then((u) => u && this.incrementTenantUsage(item.tenantId, u, usageLogger)); } else { - this.incrementTenantUsage(item.tenantId, usage); + this.incrementTenantUsage(item.tenantId, usage, usageLogger); } } - private incrementTenantUsage(tenantId: string, usage: UsageCounters): void { + private incrementTenantUsage(tenantId: string, usage: UsageCounters, logger?: Logger): void { const current = this.tenantUsage.get(tenantId) ?? 
{ inputTokens: 0, outputTokens: 0, @@ -46,5 +59,11 @@ export class TrafficUsageTracker { outputTokens: current.outputTokens + output, totalTokens: current.totalTokens + total, }); + + logger?.debug?.("Tenant usage incremented", { + tenantId, + delta: { inputTokens: input, outputTokens: output, totalTokens: total }, + total: this.tenantUsage.get(tenantId), + }); } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 20029de49..6671d8c17 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -37,7 +37,7 @@ importers: version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) '@nx/plugin': specifier: 20.4.6 - version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2) + version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2) '@nx/vite': specifier: 20.4.6 version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2)(vite@7.2.7)(vitest@3.2.4) @@ -92,6 +92,9 @@ importers: syncpack: specifier: ^13.0.2 version: 13.0.4(typescript@5.9.2) + ts-node: + specifier: ^10.9.2 + version: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) tslib: specifier: ^2.3.0 version: 2.8.1 @@ -99,7 +102,7 @@ importers: specifier: ^8.5.0 version: 8.5.0(@swc/core@1.5.29)(typescript@5.9.2) typescript: - specifier: ^5.8.2 + specifier: ^5.9.2 version: 5.9.2 vite: specifier: ^7.2.7 @@ -2750,6 +2753,61 @@ importers: specifier: ^0.5.3 version: 0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7) + examples/with-viteval/dist: + dependencies: + '@ai-sdk/openai': + specifier: ^2.0.52 + version: 2.0.85(zod@3.25.76) + '@voltagent/cli': + specifier: ^0.1.16 + version: link:../../../packages/cli + '@voltagent/core': + specifier: ^1.2.15 + version: link:../../../packages/core + '@voltagent/libsql': + specifier: ^1.0.13 + version: link:../../../packages/libsql + '@voltagent/logger': + specifier: ^1.0.4 + version: link:../../../packages/logger + '@voltagent/server-hono': + specifier: ^1.2.5 + version: link:../../../packages/server-hono + ai: + specifier: ^5.0.76 + version: 5.0.113(zod@3.25.76) + consola: + specifier: ^3.4.2 + version: 3.4.2 + envalid: + specifier: ^8.1.0 + version: 8.1.0 + yargs: + specifier: ^18.0.0 + version: 18.0.0 + zod: + specifier: ^3.25.76 + version: 3.25.76 + devDependencies: + '@tsconfig/node24': + specifier: ^24.0.1 + version: 24.0.1 + '@types/yargs': + specifier: ^17.0.33 + version: 17.0.33 + dotenv: + specifier: ^16.4.5 + version: 16.6.1 + tsx: + specifier: ^4.19.3 + version: 4.20.4 + typescript: + specifier: ^5.8.2 + version: 5.9.2 + viteval: + specifier: ^0.5.3 + version: 0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7) + examples/with-voice-elevenlabs: dependencies: '@ai-sdk/openai': @@ -3509,7 +3567,7 @@ importers: version: 3.2.4(vitest@3.2.4) jest: specifier: ^29.5.0 - version: 29.7.0(@types/node@24.2.1) + version: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) ts-jest: specifier: ^29.1.0 version: 29.4.1(@babel/core@7.28.5)(esbuild@0.25.10)(jest@29.7.0)(typescript@5.9.2) @@ -9966,7 +10024,7 @@ packages: slash: 3.0.0 dev: true - /@jest/core@29.7.0: + /@jest/core@29.7.0(ts-node@10.9.2): 
resolution: {integrity: sha512-n7aeXWKMnGtDA48y8TLWJPJmLmmZ642Ceo78cYWEpiD7FzDgmNDV/GCVRorPABdXLJZ/9wzzgZAlHjXjxDHGsg==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} peerDependencies: @@ -9987,7 +10045,7 @@ packages: exit: 0.1.2 graceful-fs: 4.2.11 jest-changed-files: 29.7.0 - jest-config: 29.7.0(@types/node@24.6.2) + jest-config: 29.7.0(@types/node@24.6.2)(ts-node@10.9.2) jest-haste-map: 29.7.0 jest-message-util: 29.7.0 jest-regex-util: 29.6.3 @@ -12403,7 +12461,7 @@ packages: - verdaccio dev: true - /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2): + /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2): resolution: {integrity: sha512-yZOZJOQFtpdY3Fu/WYNoDx81TwvF9yfwvalFpLD19bz+2YGl7B89l0S1ZrtSRXFfKXA/w7gb0gmKwthJtQhx9Q==} dependencies: '@jest/reporters': 29.7.0 @@ -12412,7 +12470,7 @@ packages: '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) '@phenomnomnominal/tsquery': 5.0.1(typescript@5.9.2) identity-obj-proxy: 3.0.0 - jest-config: 29.7.0(@types/node@24.2.1) + jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) jest-resolve: 29.7.0 jest-util: 29.7.0 minimatch: 9.0.3 @@ -12807,12 +12865,12 @@ packages: dev: true optional: true - /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2): + /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2): resolution: {integrity: sha512-7Jlv+BVqGoO0BolQN7P5Z87160phuE1i7H6C8xFwQnlQ3ZfwQCJzk2dkg1UyzxDkWl6lvVsqBjZPXD55gFQ3+w==} dependencies: '@nx/devkit': 20.4.6(nx@20.8.2) '@nx/eslint': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2) - '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) + '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2) '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) tslib: 2.8.1 transitivePeerDependencies: @@ -17770,8 +17828,8 @@ packages: '@babel/plugin-syntax-jsx': 7.27.1(@babel/core@7.28.5) '@babel/plugin-syntax-typescript': 7.27.1(@babel/core@7.28.5) '@babel/template': 7.27.2 - '@babel/traverse': 7.28.4 - '@babel/types': 7.28.4 + '@babel/traverse': 7.28.5 + '@babel/types': 7.28.5 '@tanstack/react-router': 1.131.44(react-dom@19.2.3)(react@19.2.3) '@tanstack/router-core': 1.131.44 '@tanstack/router-generator': 1.131.44 @@ -22783,7 +22841,7 @@ packages: crc-32: 1.2.2 readable-stream: 4.7.0 - /create-jest@29.7.0(@types/node@24.2.1): + /create-jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): resolution: {integrity: sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} hasBin: true @@ -22792,7 +22850,7 @@ packages: chalk: 4.1.2 exit: 0.1.2 graceful-fs: 4.2.11 - jest-config: 29.7.0(@types/node@24.2.1) + jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) jest-util: 29.7.0 prompts: 2.4.2 transitivePeerDependencies: @@ -27641,7 +27699,7 @@ packages: - supports-color dev: true - /jest-cli@29.7.0(@types/node@24.2.1): + /jest-cli@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): resolution: {integrity: 
sha512-OVVobw2IubN/GSYsxETi+gOe7Ka59EFMR/twOU3Jb2GnKKeMGJB5SGUUrEz3SFVmJASUdZUzy83sLNNQ2gZslg==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} hasBin: true @@ -27651,14 +27709,14 @@ packages: node-notifier: optional: true dependencies: - '@jest/core': 29.7.0 + '@jest/core': 29.7.0(ts-node@10.9.2) '@jest/test-result': 29.7.0 '@jest/types': 29.6.3 chalk: 4.1.2 - create-jest: 29.7.0(@types/node@24.2.1) + create-jest: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) exit: 0.1.2 import-local: 3.2.0 - jest-config: 29.7.0(@types/node@24.2.1) + jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) jest-util: 29.7.0 jest-validate: 29.7.0 yargs: 17.7.2 @@ -27669,7 +27727,7 @@ packages: - ts-node dev: true - /jest-config@29.7.0(@types/node@24.2.1): + /jest-config@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} peerDependencies: @@ -27704,12 +27762,13 @@ packages: pretty-format: 29.7.0 slash: 3.0.0 strip-json-comments: 3.1.1 + ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) transitivePeerDependencies: - babel-plugin-macros - supports-color dev: true - /jest-config@29.7.0(@types/node@24.6.2): + /jest-config@29.7.0(@types/node@24.6.2)(ts-node@10.9.2): resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} peerDependencies: @@ -27744,6 +27803,7 @@ packages: pretty-format: 29.7.0 slash: 3.0.0 strip-json-comments: 3.1.1 + ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) transitivePeerDependencies: - babel-plugin-macros - supports-color @@ -28041,7 +28101,7 @@ packages: supports-color: 8.1.1 dev: true - /jest@29.7.0(@types/node@24.2.1): + /jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): resolution: {integrity: sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} hasBin: true @@ -28051,10 +28111,10 @@ packages: node-notifier: optional: true dependencies: - '@jest/core': 29.7.0 + '@jest/core': 29.7.0(ts-node@10.9.2) '@jest/types': 29.6.3 import-local: 3.2.0 - jest-cli: 29.7.0(@types/node@24.2.1) + jest-cli: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) transitivePeerDependencies: - '@types/node' - babel-plugin-macros @@ -36767,7 +36827,7 @@ packages: esbuild: 0.25.10 fast-json-stable-stringify: 2.1.0 handlebars: 4.7.8 - jest: 29.7.0(@types/node@24.2.1) + jest: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) json5: 2.2.3 lodash.memoize: 4.1.2 make-error: 1.3.6 diff --git a/tmp/test/traffic-priority-openai-sim.ts b/tmp/test/traffic-priority-openai-sim.ts index a9ef2a368..9d36a7d14 100644 --- a/tmp/test/traffic-priority-openai-sim.ts +++ b/tmp/test/traffic-priority-openai-sim.ts @@ -9,6 +9,9 @@ * - Test 1: P0 runs before P1/P2 when all runnable. * - Test 2: P0 request (gpt-4o) is rate-limited → P1 (gpt-4o-mini) proceeds. * + * Note: + * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s. + * * Run: * - pnpm ts-node tmp/test/traffic-priority-openai-sim.ts */ diff --git a/tmp/test/traffic-priority.ts b/tmp/test/traffic-priority.ts index 28934051a..409e10782 100644 --- a/tmp/test/traffic-priority.ts +++ b/tmp/test/traffic-priority.ts @@ -6,6 +6,9 @@ * - Test 1: P0 should run before P1/P2 when runnable. 
* - Test 2: If a P0 request is rate-limited, a lower priority (P1) can proceed. * + * Note: + * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s. + * * Run: * - pnpm ts-node tmp/test/traffic-priority.ts * - VERBOSE=1 pnpm ts-node tmp/test/traffic-priority.ts diff --git a/tmp/test/traffic-rate-limit-openai-window-sim.ts b/tmp/test/traffic-rate-limit-openai-window-sim.ts new file mode 100644 index 000000000..35232faa0 --- /dev/null +++ b/tmp/test/traffic-rate-limit-openai-window-sim.ts @@ -0,0 +1,247 @@ +// @ts-nocheck +/** + * Manual test (real network): Simulate OpenAI "window remaining + reset" semantics and watch + * TrafficController pace + probe behavior via logs. + * + * Why "simulate"? + * - Real OpenAI headers usually show very large remaining values, so pacing is hard to observe. + * - This script still hits the real OpenAI model, but it drives the controller state using + * synthetic `x-ratelimit-*` headers to force a small window (e.g. remaining=3, reset=30s). + * + * What this demonstrates (matches your Step 1–7): + * 1) We seed controller with remaining + reset window. + * 2) We enqueue many requests. + * 3) Controller subtracts `reserved` from `remaining` to avoid stampedes. + * 4) When `effectiveRemaining <= 1`, controller waits until `resetAt + probeDelay`. + * 5) When room exists, controller paces using `nextAllowedAt`. + * 6) When a request finishes, we release reservation (controller) and apply new headers (this script). + * 7) After reset, controller sends a probe even when remaining==0; probe "fetches" fresh headers and flow resumes. + * + * Prereqs: + * - Set `OPENAI_API_KEY` + * + * Suggested logging: + * - `VOLTAGENT_LOG_LEVEL=trace` (to see traffic controller internals) + * + * Run: + * - VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts + * + * Optional env: + * - OPENAI_MODEL (default: gpt-4o-mini) + * - WINDOW_SECONDS (default: 30) + * - REMAINING (default: 3) + * - REQUESTS (default: 10) + * - MAX_CONCURRENT (default: 50) + */ + +import { safeStringify } from "@voltagent/internal"; +import { TrafficController } from "../../packages/core/dist/index.js"; + +const apiKey = process.env.OPENAI_API_KEY; +if (!apiKey) { + console.error("Missing OPENAI_API_KEY. Example:"); + console.error( + " VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts", + ); + process.exit(1); +} + +const now = () => new Date().toISOString(); + +const modelId = process.env.OPENAI_MODEL ?? "gpt-4o-mini"; +const windowSecondsRaw = Number(process.env.WINDOW_SECONDS ?? "30"); +const remainingRaw = Number(process.env.REMAINING ?? "3"); +const requestsRaw = Number(process.env.REQUESTS ?? "10"); +const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50"); + +const windowSeconds = + Number.isFinite(windowSecondsRaw) && windowSecondsRaw > 0 ? windowSecondsRaw : 30; +const initialRemaining = + Number.isFinite(remainingRaw) && remainingRaw > 0 ? Math.floor(remainingRaw) : 3; +const requestCount = Number.isFinite(requestsRaw) && requestsRaw > 0 ? Math.floor(requestsRaw) : 10; +const maxConcurrent = + Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? 
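/* Editor's illustration (not part of the patch): Steps 3-5 from the header above,
   reduced to a sketch. Field names mirror RateLimitWindowState; canDispatch and
   probeDelayMs are invented for the outline:

   interface WindowSketch {
     limit: number; remaining: number; reserved: number;
     resetAt: number; nextAllowedAt: number;
   }

   function canDispatch(s: WindowSketch, now: number, probeDelayMs: number) {
     const effectiveRemaining = Math.max(0, s.remaining - s.reserved); // Step 3: discount in-flight reservations
     if (effectiveRemaining <= 1) {
       return { ok: false, wakeUpAt: s.resetAt + probeDelayMs };       // Step 4: wait for reset, then probe
     }
     if (now < s.nextAllowedAt) {
       return { ok: false, wakeUpAt: s.nextAllowedAt };                // Step 5: pace within the window
     }
     return { ok: true as const };
   }
*/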
Math.floor(maxConcurrentRaw) : 50; + +const provider = "openai"; +const tenantId = "openai-window-sim"; +const windowMs = Math.round(windowSeconds * 1000); + +async function callOpenAIResponses(label: string): Promise<{ + status: number; + headers: Record; + textPreview: string; +}> { + const url = "https://api.openai.com/v1/responses"; + const body = safeStringify({ + model: modelId, + input: `Reply with only: ${label}`, + max_output_tokens: 16, + }); + + const startedAt = Date.now(); + const res = await fetch(url, { + method: "POST", + headers: { + authorization: `Bearer ${apiKey}`, + "content-type": "application/json", + }, + body, + }); + + const limit = res.headers.get("x-ratelimit-limit-requests") ?? undefined; + const remaining = res.headers.get("x-ratelimit-remaining-requests") ?? undefined; + const reset = res.headers.get("x-ratelimit-reset-requests") ?? undefined; + + if (!res.ok) { + const text = await res.text().catch(() => ""); + throw new Error( + `OpenAI error status=${res.status} elapsedMs=${Date.now() - startedAt} body=${text.slice(0, 280)}`, + ); + } + + const data: any = await res.json(); + const outputText = + data?.output?.[0]?.content?.find?.((c: any) => c?.type === "output_text")?.text ?? + data?.output_text ?? + data?.output?.[0]?.content?.[0]?.text ?? + ""; + + return { + status: res.status, + headers: { + "x-ratelimit-limit-requests": limit, + "x-ratelimit-remaining-requests": remaining, + "x-ratelimit-reset-requests": reset, + }, + textPreview: String(outputText).slice(0, 80), + }; +} + +async function main() { + console.log( + `\n=== OpenAI real + synthetic window rate limit (provider=${provider}, model=${modelId}) ===`, + ); + console.log( + `[config] maxConcurrent=${maxConcurrent} windowSeconds=${windowSeconds} initialRemaining=${initialRemaining} requests=${requestCount}`, + ); + console.log( + "[hint] Set VOLTAGENT_LOG_LEVEL=trace to see TrafficController internals (reserved/effectiveRemaining/nextAllowedAt).", + ); + + const controller = new TrafficController({ maxConcurrent }); + + // --- Step 1: seed "remaining + reset window" into controller --- + let windowResetAt = Date.now() + windowMs; + let remainingInWindow = initialRemaining; + + const applySyntheticHeaders = (source: string) => { + const resetMs = Math.max(1, windowResetAt - Date.now()); + const applied = controller.updateRateLimitFromHeaders( + { provider, model: modelId, tenantId }, + { + "x-ratelimit-limit-requests": String(initialRemaining), + "x-ratelimit-remaining-requests": String(Math.max(0, remainingInWindow)), + "x-ratelimit-reset-requests": `${resetMs}ms`, + }, + ); + console.log( + `[${now()}] [synthetic] source=${source} remaining=${remainingInWindow} resetInMs=${resetMs} applied=${safeStringify( + applied && { + key: applied.key, + state: { + remaining: applied.state.remaining, + reserved: applied.state.reserved, + resetAt: applied.state.resetAt, + nextAllowedAt: applied.state.nextAllowedAt, + }, + }, + )}`, + ); + }; + + applySyntheticHeaders("seed"); + + console.log("\n[seed] Making one real request to confirm connectivity + show real headers..."); + const seed = await callOpenAIResponses("seed"); + console.log( + `[${now()}] [seed] ok status=${seed.status} text=${seed.textPreview} realHeaders=${safeStringify( + seed.headers, + )}`, + ); + + console.log(`\n[burst] Enqueueing ${requestCount} controller-managed requests...`); + + const jobs = Array.from({ length: requestCount }, (_, index) => { + const label = `req-${index + 1}`; + const enqueuedAt = Date.now(); + 
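/* Editor's illustration (not part of the patch): the minimal call shape that
   applySyntheticHeaders above builds. Header names are the x-ratelimit-* keys the
   default strategy parses; the values are invented for the example:

   const applied = controller.updateRateLimitFromHeaders(
     { provider: "openai", model: "gpt-4o-mini", tenantId: "openai-window-sim" },
     {
       "x-ratelimit-limit-requests": "3",
       "x-ratelimit-remaining-requests": "2",
       "x-ratelimit-reset-requests": "30000ms",
     },
   );
   // applied?.state then carries { limit, remaining, reserved, resetAt, nextAllowedAt }.
*/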
console.log(`[${now()}] [enqueue] ${label}`); + + return controller + .handleText({ + tenantId, + metadata: { + tenantId, + provider, + model: modelId, + priority: "P1", + agentName: "openai-window-sim", + agentId: label, + }, + execute: async () => { + const startedAt = Date.now(); + console.log(`[${now()}] [execute-start] ${label}`); + + const result = await callOpenAIResponses(label); + + console.log( + `[${now()}] [execute-end] ${label} elapsedMs=${Date.now() - startedAt} realHeaders=${safeStringify( + result.headers, + )}`, + ); + + // --- Step 6: decrement remaining + apply new "headers" --- + const nowMs = Date.now(); + if (nowMs >= windowResetAt) { + // --- Step 7: reset happened; probe request fetched "fresh" headers for the next window --- + console.log( + `[${now()}] [reset] window elapsed; starting new synthetic window (windowSeconds=${windowSeconds})`, + ); + windowResetAt = nowMs + windowMs; + remainingInWindow = initialRemaining; + } + + remainingInWindow = Math.max(0, remainingInWindow - 1); + applySyntheticHeaders("response"); + + return result; + }, + }) + .then((r) => { + const totalElapsedMs = Date.now() - enqueuedAt; + console.log( + `[${now()}] [done] ${label} totalElapsedMs=${totalElapsedMs} text=${r.textPreview}`, + ); + return { label, totalElapsedMs, status: "fulfilled" as const }; + }) + .catch((error: any) => { + const totalElapsedMs = Date.now() - enqueuedAt; + console.log( + `[${now()}] [fail] ${label} totalElapsedMs=${totalElapsedMs} name=${error?.name ?? "Error"} msg=${ + error?.message ?? String(error) + }`, + ); + return { label, totalElapsedMs, status: "rejected" as const }; + }); + }); + + const settled = await Promise.all(jobs); + console.log(`\n[done] settled=${safeStringify(settled.map((s) => s.status))}`); + console.log( + `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/tmp/test/traffic-rate-limit-static.ts b/tmp/test/traffic-rate-limit-static.ts index d06427a3b..3f91d5bbb 100644 --- a/tmp/test/traffic-rate-limit-static.ts +++ b/tmp/test/traffic-rate-limit-static.ts @@ -4,6 +4,8 @@ * * What to look for: * - Requests should be paced out across the window (no steady "refill" math). + * - If responses arrive out-of-order, remaining headers might "increase"; controller should + * keep remaining monotonic within the same window. 
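/* Editor's illustration (not part of the patch): "keep remaining monotonic" is the
   same-window clamp applied by the controller's rate-limit strategy, roughly:

   const isSameWindow = !!existing && now < existing.resetAt;
   const remaining = isSameWindow
     ? Math.min(existing.remaining, parsedRemaining) // a stale, out-of-order header cannot raise remaining
     : parsedRemaining;                              // a fresh window trusts the new value
*/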
* * Run: * - pnpm ts-node tmp/test/traffic-rate-limit-static.ts @@ -78,6 +80,7 @@ async function main() { provider, modelId: model, doGenerate: async (options: any) => { + const simulatedLatencyMs = 10 + Math.floor(Math.random() * 120); const nowMs = Date.now(); if (nowMs >= windowResetAt) { windowStartAt = nowMs; @@ -92,8 +95,10 @@ async function main() { lastStartAt = startAt; const label = extractLabel(options?.prompt); - console.log(`[${now()}] doGenerate start call=${calls} (+${delta}ms) input=${label}`); - await sleep(50); + console.log( + `[${now()}] doGenerate start call=${calls} (+${delta}ms) input=${label} latencyMs=${simulatedLatencyMs}`, + ); + await sleep(simulatedLatencyMs); console.log(`[${now()}] doGenerate end input=${label}`); const remainingAfterThis = Math.max(0, limit - usedInWindow); From 32276d732133452ab65c52ea25941b70e693c3b4 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Mon, 15 Dec 2025 00:25:43 +0530 Subject: [PATCH 12/41] feat: retry after header --- packages/core/src/index.ts | 1 + .../src/traffic/traffic-controller.spec.ts | 35 +++ .../core/src/traffic/traffic-controller.ts | 3 +- .../core/src/traffic/traffic-error-utils.ts | 64 +++++ packages/core/src/traffic/traffic-errors.ts | 13 + .../core/src/traffic/traffic-rate-limiter.ts | 172 ++++++------ .../core/src/traffic/traffic-retry.spec.ts | 45 ++++ packages/core/src/traffic/traffic-retry.ts | 24 +- tmp/test/traffic-retry-after.ts | 245 ++++++++++++++++++ 9 files changed, 519 insertions(+), 83 deletions(-) create mode 100644 packages/core/src/traffic/traffic-retry.spec.ts create mode 100644 tmp/test/traffic-retry-after.ts diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 0aef165ad..f9dd9fef3 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -25,6 +25,7 @@ export { // Surface traffic controller so downstream consumers can route agent calls through the shared scheduler TrafficController, CircuitBreakerOpenError, + RateLimitedUpstreamError, getTrafficController, type RateLimitConfig, type RateLimitKey, diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts index aa7fba6d0..6640e0324 100644 --- a/packages/core/src/traffic/traffic-controller.spec.ts +++ b/packages/core/src/traffic/traffic-controller.spec.ts @@ -194,4 +194,39 @@ describe("TrafficController rate limit headers", () => { vi.useRealTimers(); } }); + + it("applies Retry-After even when x-ratelimit headers are missing", async () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ maxConcurrent: 1 }); + const order: string[] = []; + + controller.updateRateLimitFromHeaders( + { provider: "p", model: "m" }, + { + "retry-after": "2", + }, + ); + + const p0 = controller.handleText({ + metadata: { provider: "p", model: "m", priority: "P0" }, + execute: async () => { + order.push("P0"); + return "P0"; + }, + }); + + await vi.advanceTimersByTimeAsync(1_999); + expect(order).toEqual([]); + + await vi.advanceTimersByTimeAsync(1); + await vi.runAllTimersAsync(); + await p0; + expect(order).toEqual(["P0"]); + } finally { + vi.useRealTimers(); + } + }); }); diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index e2d24acb0..8dcb9a350 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -2,7 +2,7 @@ import type { Logger } from "../logger"; import { LoggerProxy } from 
"../logger"; import { TrafficCircuitBreaker } from "./traffic-circuit-breaker"; import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal"; -import { CircuitBreakerOpenError } from "./traffic-errors"; +import { CircuitBreakerOpenError, RateLimitedUpstreamError } from "./traffic-errors"; import { type RateLimitUpdateResult, TrafficRateLimiter } from "./traffic-rate-limiter"; import { type RetryReason, buildRetryPlan } from "./traffic-retry"; import type { @@ -36,6 +36,7 @@ export type { }; export { CircuitBreakerOpenError }; +export { RateLimitedUpstreamError }; export class TrafficController { /* ---------- Core ---------- */ diff --git a/packages/core/src/traffic/traffic-error-utils.ts b/packages/core/src/traffic/traffic-error-utils.ts index d765e8ae4..69d15d84c 100644 --- a/packages/core/src/traffic/traffic-error-utils.ts +++ b/packages/core/src/traffic/traffic-error-utils.ts @@ -5,6 +5,46 @@ function readObjectProperty(value: unknown, key: string): unknown { return (value as Record)[key]; } +export function readHeaderValue(headers: unknown, name: string): string | undefined { + if (!headers) return undefined; + + if (typeof (headers as { get?: unknown }).get === "function") { + const v = (headers as { get: (name: string) => unknown }).get(name); + return v === null || v === undefined ? undefined : String(v); + } + + if (typeof headers !== "object") return undefined; + + const entries = Object.entries(headers as Record); + const target = name.toLowerCase(); + const match = entries.find(([k]) => String(k).toLowerCase() === target); + if (!match) return undefined; + + const value = match[1]; + if (Array.isArray(value)) { + const first = value[0]; + return first === null || first === undefined ? undefined : String(first); + } + return value === null || value === undefined ? undefined : String(value); +} + +export function parseRetryAfterMs(value: string, nowMs: number = Date.now()): number | undefined { + const raw = value.trim(); + if (!raw) return undefined; + + const seconds = Number(raw); + if (Number.isFinite(seconds)) { + return Math.max(0, Math.round(seconds * 1000)); + } + + const parsedAt = Date.parse(raw); + if (Number.isFinite(parsedAt)) { + return Math.max(0, parsedAt - nowMs); + } + + return undefined; +} + export function coerceStatus(value: unknown): number | undefined { const n = Number(value); return Number.isFinite(n) ? 
n : undefined; @@ -31,6 +71,30 @@ export function extractStatusCode(error: unknown, logger?: Logger): number | und return status; } +export function extractRetryAfterMs(error: unknown, logger?: Logger): number | undefined { + const retryAfterLogger = logger?.child({ module: "retry-after" }); + const candidates: unknown[] = [ + readObjectProperty(error, "headers"), + readObjectProperty(readObjectProperty(error, "response"), "headers"), + readObjectProperty(readObjectProperty(error, "cause"), "headers"), + readObjectProperty( + readObjectProperty(readObjectProperty(error, "cause"), "response"), + "headers", + ), + ]; + + for (const headers of candidates) { + const raw = readHeaderValue(headers, "retry-after"); + if (!raw) continue; + const parsed = parseRetryAfterMs(raw); + retryAfterLogger?.trace?.("Parsed Retry-After header", { raw, parsedMs: parsed }); + if (parsed !== undefined) return parsed; + } + + retryAfterLogger?.trace?.("Retry-After header missing or unparsable"); + return undefined; +} + export function isTimeoutError(error: unknown, logger?: Logger): boolean { const code = readObjectProperty(error, "code"); const name = readObjectProperty(error, "name"); diff --git a/packages/core/src/traffic/traffic-errors.ts b/packages/core/src/traffic/traffic-errors.ts index 1c6166027..2fd93890b 100644 --- a/packages/core/src/traffic/traffic-errors.ts +++ b/packages/core/src/traffic/traffic-errors.ts @@ -11,3 +11,16 @@ export class CircuitBreakerOpenError extends Error { this.retryAfterMs = retryAfterMs; } } + +export class RateLimitedUpstreamError extends Error { + readonly status = 429; + readonly retryAfterMs?: number; + readonly metadata?: TrafficRequestMetadata; + + constructor(message: string, metadata?: TrafficRequestMetadata, retryAfterMs?: number) { + super(message); + this.name = "RateLimitedUpstreamError"; + this.metadata = metadata; + this.retryAfterMs = retryAfterMs; + } +} diff --git a/packages/core/src/traffic/traffic-rate-limiter.ts b/packages/core/src/traffic/traffic-rate-limiter.ts index ff24a221d..f05396f52 100644 --- a/packages/core/src/traffic/traffic-rate-limiter.ts +++ b/packages/core/src/traffic/traffic-rate-limiter.ts @@ -10,13 +10,16 @@ import type { QueuedRequest, RateLimitWindowState, } from "./traffic-controller-internal"; +import { parseRetryAfterMs, readHeaderValue } from "./traffic-error-utils"; import type { TrafficRequestMetadata } from "./traffic-types"; export type RateLimitHeaderSnapshot = { - limitRequests: string; - remainingRequests: string; - resetRequests: string; - resetRequestsMs: number; + limitRequests?: string; + remainingRequests?: string; + resetRequests?: string; + resetRequestsMs?: number; + retryAfter?: string; + retryAfterMs?: number; }; export type RateLimitUpdateResult = { @@ -38,29 +41,6 @@ export interface RateLimitStrategy { ): RateLimitUpdateResult | undefined; } -function readHeader(headers: unknown, name: string): string | undefined { - if (!headers) return undefined; - - if (typeof (headers as { get?: unknown }).get === "function") { - const v = (headers as { get: (name: string) => unknown }).get(name); - return v === null || v === undefined ? 
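/* Editor's illustration (not part of the patch): two usage notes for the helpers
   above, with invented example values.

   parseRetryAfterMs handles both Retry-After forms:

   parseRetryAfterMs("2");    // 2000 (delta-seconds)
   parseRetryAfterMs("0.5");  // 500 (fractional seconds round to whole milliseconds)
   parseRetryAfterMs(
     "Wed, 01 Jan 2020 00:00:03 GMT",
     Date.parse("2020-01-01T00:00:00.000Z"),
   );                         // 3000 (HTTP-date minus the supplied "now")
   parseRetryAfterMs("soon"); // undefined (neither a number nor a date)

   extractRetryAfterMs probes error.headers, error.response.headers,
   error.cause.headers and error.cause.response.headers in turn, so a fetch-style
   error like this one (shape assumed) is readable:

   const err = Object.assign(new Error("rate limited"), {
     status: 429,
     response: { status: 429, headers: { "Retry-After": "2" } },
   });
   // extractRetryAfterMs(err) === 2000; readHeaderValue matches header names case-insensitively.
*/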
undefined : String(v); - } - - if (typeof headers !== "object") return undefined; - - const entries = Object.entries(headers as Record); - const target = name.toLowerCase(); - const match = entries.find(([k]) => String(k).toLowerCase() === target); - if (!match) return undefined; - - const value = match[1]; - if (Array.isArray(value)) { - const first = value[0]; - return first === null || first === undefined ? undefined : String(first); - } - return value === null || value === undefined ? undefined : String(value); -} - function parseResetDurationToMs(raw: string): number | undefined { const value = raw.trim(); if (!value) return undefined; @@ -200,78 +180,112 @@ export class DefaultRateLimitStrategy implements RateLimitStrategy { logger?: Logger, ): RateLimitUpdateResult | undefined { const rateLimitLogger = logger?.child({ module: "rate-limiter" }); - const limitRequests = readHeader(headers, "x-ratelimit-limit-requests"); - const remainingRequests = readHeader(headers, "x-ratelimit-remaining-requests"); - const resetRequests = readHeader(headers, "x-ratelimit-reset-requests"); + const limitRequests = readHeaderValue(headers, "x-ratelimit-limit-requests"); + const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests"); + const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests"); + const retryAfter = readHeaderValue(headers, "retry-after"); + const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined; - if (!limitRequests || !remainingRequests || !resetRequests) { + const now = Date.now(); + const existing = this.state; + let state: RateLimitWindowState | undefined; + let headerSnapshot: RateLimitHeaderSnapshot | undefined; + + if (limitRequests && remainingRequests && resetRequests) { + const limit = Number(limitRequests); + const remaining = Number(remainingRequests); + if (!Number.isFinite(limit) || !Number.isFinite(remaining)) { + rateLimitLogger?.debug?.("Invalid rate limit numeric headers; skipping", { + rateLimitKey: this.key, + limitRequests, + remainingRequests, + }); + return undefined; + } + + const resetRequestsMs = parseResetDurationToMs(resetRequests); + if (resetRequestsMs === undefined) { + rateLimitLogger?.debug?.("Unable to parse reset duration; skipping", { + rateLimitKey: this.key, + resetRequests, + }); + return undefined; + } + + const parsedResetAt = now + resetRequestsMs; + const isSameWindow = !!existing && now < existing.resetAt; + const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; + const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; + const reserved = Math.max(0, existing?.reserved ?? 0); + + state = { + limit, + remaining: isSameWindow ? 
Math.min(existing.remaining, remaining) : remaining, + resetAt, + reserved, + nextAllowedAt, + }; + headerSnapshot = { + limitRequests, + remainingRequests, + resetRequests, + resetRequestsMs, + }; + } else if (retryAfterMs === undefined) { rateLimitLogger?.trace?.("Missing rate limit headers; skipping", { rateLimitKey: this.key, hasLimit: !!limitRequests, hasRemaining: !!remainingRequests, hasReset: !!resetRequests, + hasRetryAfter: !!retryAfter, }); return undefined; } - const limit = Number(limitRequests); - const remaining = Number(remainingRequests); - if (!Number.isFinite(limit) || !Number.isFinite(remaining)) { - rateLimitLogger?.debug?.("Invalid rate limit numeric headers; skipping", { - rateLimitKey: this.key, - limitRequests, - remainingRequests, - }); - return undefined; - } - - const resetRequestsMs = parseResetDurationToMs(resetRequests); - if (resetRequestsMs === undefined) { - rateLimitLogger?.debug?.("Unable to parse reset duration; skipping", { - rateLimitKey: this.key, - resetRequests, - }); - return undefined; + if (!state) { + if (retryAfterMs === undefined) { + rateLimitLogger?.trace?.("Retry-After missing or unparsable; skipping", { + rateLimitKey: this.key, + retryAfter, + }); + return undefined; + } + const targetAt = now + retryAfterMs; + const isSameWindow = !!existing && now < existing.resetAt; + state = { + limit: existing?.limit ?? 1, + remaining: 0, + resetAt: isSameWindow ? Math.max(existing.resetAt, targetAt) : targetAt, + reserved: Math.max(0, existing?.reserved ?? 0), + nextAllowedAt: Math.max(existing?.nextAllowedAt ?? now, targetAt), + }; + headerSnapshot = { retryAfter, retryAfterMs }; + } else if (retryAfterMs !== undefined) { + const targetAt = now + retryAfterMs; + state = { + ...state, + remaining: 0, + resetAt: Math.max(state.resetAt, targetAt), + nextAllowedAt: Math.max(state.nextAllowedAt, targetAt), + }; + headerSnapshot = { ...headerSnapshot, retryAfter, retryAfterMs }; } - const now = Date.now(); - const parsedResetAt = now + resetRequestsMs; - - const existing = this.state; - const isSameWindow = !!existing && now < existing.resetAt; - const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; - const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; - const reserved = Math.max(0, existing?.reserved ?? 0); - - const state: RateLimitWindowState = { - limit, - remaining: isSameWindow ? Math.min(existing.remaining, remaining) : remaining, - resetAt, - reserved, - nextAllowedAt, - }; - this.state = state; rateLimitLogger?.debug?.("Applied rate limit headers to state", { rateLimitKey: this.key, - limit, - remaining, + limit: state.limit, + remaining: state.remaining, effectiveRemaining: Math.max(0, state.remaining - state.reserved), - resetAt, - nextAllowedAt, - isSameWindow, - parsedResetAt, - resetRequestsMs, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + resetRequestsMs: headerSnapshot?.resetRequestsMs, + retryAfterMs: headerSnapshot?.retryAfterMs, }); return { key: this.key, - headerSnapshot: { - limitRequests, - remainingRequests, - resetRequests, - resetRequestsMs, - }, + headerSnapshot: headerSnapshot ?? 
{}, state, }; } diff --git a/packages/core/src/traffic/traffic-retry.spec.ts b/packages/core/src/traffic/traffic-retry.spec.ts new file mode 100644 index 000000000..2360ca109 --- /dev/null +++ b/packages/core/src/traffic/traffic-retry.spec.ts @@ -0,0 +1,45 @@ +import { describe, expect, it, vi } from "vitest"; +import { buildRetryPlan } from "./traffic-retry"; + +describe("buildRetryPlan", () => { + it("respects Retry-After for 429s", () => { + const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0); + try { + const plan = buildRetryPlan( + { + status: 429, + response: { headers: { "retry-after": "2" } }, + }, + 1, + ); + + expect(plan).toBeTruthy(); + expect(plan?.reason).toBe("rateLimit"); + expect(plan?.delayMs).toBeGreaterThanOrEqual(2_000); + } finally { + randomSpy.mockRestore(); + } + }); + + it("parses HTTP-date Retry-After values", () => { + vi.useFakeTimers(); + const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0); + + try { + vi.setSystemTime(new Date("2020-01-01T00:00:00.000Z")); + const plan = buildRetryPlan( + { + statusCode: 429, + response: { headers: { "retry-after": "Wed, 01 Jan 2020 00:00:03 GMT" } }, + }, + 1, + ); + + expect(plan).toBeTruthy(); + expect(plan?.delayMs).toBeGreaterThanOrEqual(3_000); + } finally { + vi.useRealTimers(); + randomSpy.mockRestore(); + } + }); +}); diff --git a/packages/core/src/traffic/traffic-retry.ts b/packages/core/src/traffic/traffic-retry.ts index 056a552d3..a1cd363c1 100644 --- a/packages/core/src/traffic/traffic-retry.ts +++ b/packages/core/src/traffic/traffic-retry.ts @@ -9,7 +9,8 @@ import { TIMEOUT_JITTER_FACTOR, TIMEOUT_RETRY_ATTEMPTS, } from "./traffic-constants"; -import { extractStatusCode, isTimeoutError } from "./traffic-error-utils"; +import { extractRetryAfterMs, extractStatusCode, isTimeoutError } from "./traffic-error-utils"; +import { RateLimitedUpstreamError } from "./traffic-errors"; export type RetryReason = "rateLimit" | "serverError" | "timeout"; @@ -35,8 +36,24 @@ export function buildRetryPlan( return undefined; } - const delayMs = computeBackoffDelay(reason, attempt); - retryLogger?.debug?.("Retry plan built", { attempt, reason, delayMs, max }); + const computedDelayMs = computeBackoffDelay(reason, attempt); + const retryAfterMs = + reason === "rateLimit" + ? error instanceof RateLimitedUpstreamError + ? error.retryAfterMs + : extractRetryAfterMs(error, retryLogger) + : undefined; + const delayMs = + retryAfterMs === undefined ? computedDelayMs : Math.max(computedDelayMs, retryAfterMs); + + retryLogger?.debug?.("Retry plan built", { + attempt, + reason, + delayMs, + computedDelayMs, + retryAfterMs, + max, + }); return { reason, @@ -45,6 +62,7 @@ export function buildRetryPlan( } function getRetryReason(error: unknown, logger?: Logger): RetryReason | undefined { + if (error instanceof RateLimitedUpstreamError) return "rateLimit"; const status = extractStatusCode(error, logger); if (status === 429) return "rateLimit"; if (status && status >= 500) return "serverError"; diff --git a/tmp/test/traffic-retry-after.ts b/tmp/test/traffic-retry-after.ts new file mode 100644 index 000000000..c0c213ebe --- /dev/null +++ b/tmp/test/traffic-retry-after.ts @@ -0,0 +1,245 @@ +// @ts-nocheck +/** + * Manual test: Retry-After handling (429 retry + 200 OK header ingestion). + * + * What this exercises: + * - Retry-After on 429 errors increases retry delay (TrafficController retry plan). + * - Retry-After on successful responses throttles subsequent requests for the same provider::model. 
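/* Editor's worked example (not part of the patch): buildRetryPlan now takes
   Math.max(computedDelayMs, retryAfterMs). Assuming an attempt-1 backoff of 500ms
   and a "retry-after: 2" header:

   const computedDelayMs = 500;   // assumed backoff for attempt 1
   const retryAfterMs = 2_000;    // parsed from the header
   const delayMs = Math.max(computedDelayMs, retryAfterMs); // 2000: the server hint wins when larger
*/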
+ * + * Run: + * - pnpm -C packages/core build + * - pnpm ts-node tmp/test/traffic-retry-after.ts + * - VERBOSE=1 pnpm ts-node tmp/test/traffic-retry-after.ts + */ + +import { safeStringify } from "@voltagent/internal"; +import { + Agent, + RateLimitedUpstreamError, + getTrafficController, +} from "../../packages/core/dist/index.js"; + +const verbose = process.env.VERBOSE === "1"; +if (!verbose) { + console.debug = () => {}; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); +const now = () => new Date().toISOString(); + +function extractLabel(prompt: any): string { + if (!Array.isArray(prompt)) { + return "unknown"; + } + + for (let index = prompt.length - 1; index >= 0; index -= 1) { + const message = prompt[index]; + if (!message || message.role !== "user" || !Array.isArray(message.content)) { + continue; + } + + const textPart = message.content.find((part: any) => part?.type === "text"); + if (textPart?.text) { + return String(textPart.text); + } + } + + return "unknown"; +} + +function make429RetryAfterModel(args: { + provider: string; + modelId: string; + retryAfterSeconds: number; + mode: "headers" | "typedError"; +}) { + const { provider, modelId, retryAfterSeconds, mode } = args; + let calls = 0; + const startedAt: number[] = []; + + return { + specificationVersion: "v2", + provider, + modelId, + startedAt, + doGenerate: async (options: any) => { + calls += 1; + const start = Date.now(); + startedAt.push(start); + + const label = extractLabel(options?.prompt); + console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`); + + if (calls === 1) { + const retryAfterValue = String(retryAfterSeconds); + + if (mode === "typedError") { + throw new RateLimitedUpstreamError( + `rate limited (typed) retry-after=${retryAfterValue}s`, + { provider, model: modelId }, + Math.round(retryAfterSeconds * 1000), + ); + } + + const err: any = new Error(`rate limited (headers) retry-after=${retryAfterValue}s`); + err.status = 429; + err.response = { + status: 429, + headers: { + "retry-after": retryAfterValue, + }, + }; + throw err; + } + + return { + content: [{ type: "text", text: `ok:${label}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { modelId, headers: {} }, + }; + }, + }; +} + +function makeSuccessRetryAfterModel(args: { + provider: string; + modelId: string; + retryAfterSeconds: number; + latencyMs: number; +}) { + const { provider, modelId, retryAfterSeconds, latencyMs } = args; + let calls = 0; + const startedAt: number[] = []; + const endedAt: number[] = []; + + return { + specificationVersion: "v2", + provider, + modelId, + startedAt, + endedAt, + doGenerate: async (options: any) => { + calls += 1; + const start = Date.now(); + startedAt.push(start); + + const label = extractLabel(options?.prompt); + console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`); + await sleep(latencyMs); + + const end = Date.now(); + endedAt.push(end); + console.log(`[${now()}] [model] ${provider}::${modelId} end call=${calls} input=${label}`); + + return { + content: [{ type: "text", text: `ok:${label}` }], + finishReason: "stop", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + warnings: [], + response: { + modelId, + headers: + calls === 1 + ? 
{ + "retry-after": String(retryAfterSeconds), + } + : {}, + }, + }; + }, + }; +} + +async function test_retryAfterOn429(mode: "headers" | "typedError") { + const retryAfterSeconds = 1; + const provider = `retry-after-429-${mode}`; + const modelId = "ra-429"; + const tenantId = `ra-429-${mode}`; + + const model = make429RetryAfterModel({ provider, modelId, retryAfterSeconds, mode }); + const agent = new Agent({ + name: `ra-429-${mode}`, + instructions: "echo", + model, + temperature: 0, + maxOutputTokens: 32, + }); + + console.log(`\n=== Test: Retry-After on 429 (${mode}) ===`); + const result = await agent.generateText("hello", { tenantId, trafficPriority: "P1" }); + + const times = model.startedAt; + const deltaMs = times.length >= 2 ? times[1] - times[0] : undefined; + + console.log( + `[result] text=${result.text} calls=${times.length} startedAt=${safeStringify(times)} deltaMs=${deltaMs}`, + ); + + if (deltaMs === undefined || deltaMs < retryAfterSeconds * 1000) { + throw new Error( + `Expected retry delay >= ${retryAfterSeconds * 1000}ms, got ${deltaMs ?? "n/a"}ms`, + ); + } +} + +async function test_retryAfterOnSuccessResponse() { + const retryAfterSeconds = 0.3; + const provider = "retry-after-200"; + const modelId = "ra-200"; + const tenantId = "ra-200"; + + const model = makeSuccessRetryAfterModel({ + provider, + modelId, + retryAfterSeconds, + latencyMs: 20, + }); + + const agent = new Agent({ + name: "ra-200", + instructions: "echo", + model, + temperature: 0, + maxOutputTokens: 32, + }); + + console.log("\n=== Test: Retry-After on 200 response headers ==="); + const first = agent.generateText("first", { tenantId, trafficPriority: "P1" }); + const second = agent.generateText("second", { tenantId, trafficPriority: "P1" }); + + const [r1, r2] = await Promise.all([first, second]); + + const end1 = model.endedAt[0]; + const start2 = model.startedAt[1]; + const enforcedDelayMs = start2 && end1 ? start2 - end1 : undefined; + + console.log( + `[result] texts=${safeStringify([r1.text, r2.text])} startedAt=${safeStringify( + model.startedAt, + )} endedAt=${safeStringify(model.endedAt)} enforcedDelayMs=${enforcedDelayMs}`, + ); + + if (enforcedDelayMs === undefined || enforcedDelayMs < retryAfterSeconds * 1000) { + throw new Error( + `Expected rate-limit delay >= ${retryAfterSeconds * 1000}ms, got ${enforcedDelayMs ?? "n/a"}ms`, + ); + } +} + +async function main() { + // Create controller early so all Agent calls share the same singleton. 
+ getTrafficController({ maxConcurrent: 1 }); + + await test_retryAfterOn429("headers"); + await test_retryAfterOn429("typedError"); + await test_retryAfterOnSuccessResponse(); + + console.log("\n[done] All Retry-After manual checks passed."); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); From 006f1a056f99c17c163a90a0558a8d5ccc95dfcb Mon Sep 17 00:00:00 2001 From: riturajFi Date: Mon, 15 Dec 2025 17:40:42 +0530 Subject: [PATCH 13/41] feat: added timeout to ckt breaker --- .../src/traffic/traffic-circuit-breaker.ts | 56 ++++++++++++++---- .../core/src/traffic/traffic-constants.ts | 2 + .../traffic/traffic-controller-internal.ts | 1 + .../core/src/traffic/traffic-error-utils.ts | 59 ++++++++++++------- 4 files changed, 87 insertions(+), 31 deletions(-) diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts index be11b6d36..d44c2e332 100644 --- a/packages/core/src/traffic/traffic-circuit-breaker.ts +++ b/packages/core/src/traffic/traffic-circuit-breaker.ts @@ -3,6 +3,8 @@ import { CIRCUIT_COOLDOWN_MS, CIRCUIT_FAILURE_THRESHOLD, CIRCUIT_FAILURE_WINDOW_MS, + CIRCUIT_TIMEOUT_THRESHOLD, + CIRCUIT_TIMEOUT_WINDOW_MS, DEFAULT_FALLBACK_CHAINS, } from "./traffic-constants"; import type { @@ -11,7 +13,7 @@ import type { DispatchDecision, QueuedRequest, } from "./traffic-controller-internal"; -import { extractStatusCode } from "./traffic-error-utils"; +import { extractStatusCode, isTimeoutError } from "./traffic-error-utils"; import { CircuitBreakerOpenError } from "./traffic-errors"; import type { TrafficRequestMetadata } from "./traffic-types"; @@ -128,44 +130,75 @@ export class TrafficCircuitBreaker { logger?: Logger, ): void { const circuitLogger = logger?.child({ module: "circuit-breaker" }); + const key = this.buildRateLimitKey(metadata); const status = extractStatusCode(error, logger); + const isTimeout = status === 408 || isTimeoutError(error, logger); + const isStatusEligible = this.isCircuitBreakerStatus(status); + const isTimeoutEligible = !isStatusEligible && isTimeout; + const isEligible = isStatusEligible || isTimeoutEligible; + circuitLogger?.debug?.("Circuit failure observed", { - circuitKey: this.buildRateLimitKey(metadata), + circuitKey: key, status, + isTimeout, + eligible: isEligible, provider: metadata?.provider, model: metadata?.model, }); - if (!this.isCircuitBreakerStatus(status)) { - this.circuitBreakers.delete(this.buildRateLimitKey(metadata)); + + if (!isEligible) { + this.circuitBreakers.delete(key); circuitLogger?.debug?.("Failure not eligible for circuit breaker; cleared circuit state", { - circuitKey: this.buildRateLimitKey(metadata), + circuitKey: key, status, + isTimeout, }); return; } - const key = this.buildRateLimitKey(metadata); const now = Date.now(); const state = this.circuitBreakers.get(key) ?? 
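/* Editor's summary (not part of the patch): after this change the circuit opens
   when any of three conditions holds; thresholds come from traffic-constants.ts
   (see the diff below):

   const shouldOpen =
     state.status === "half-open" ||                                // the half-open trial call failed
     state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD || // 5 failures within a 10s window
     state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD;   // 5 timeouts within 10s (same defaults)
*/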
- ({ status: "closed", failureTimestamps: [] } as CircuitState); + ({ status: "closed", failureTimestamps: [], timeoutTimestamps: [] } as CircuitState); state.failureTimestamps = state.failureTimestamps.filter( (t) => now - t <= CIRCUIT_FAILURE_WINDOW_MS, ); + state.timeoutTimestamps = state.timeoutTimestamps.filter( + (t) => now - t <= CIRCUIT_TIMEOUT_WINDOW_MS, + ); + state.failureTimestamps.push(now); + if (isTimeoutEligible) { + state.timeoutTimestamps.push(now); + } if ( state.status === "half-open" || - state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD + state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD || + state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD ) { + const openReasons: string[] = []; + if (state.status === "half-open") openReasons.push("half-open-failure"); + if (state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD) { + openReasons.push("failure-threshold"); + } + if (state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD) { + openReasons.push("timeout-threshold"); + } + state.status = "open"; state.openedAt = now; state.trialInFlight = false; circuitLogger?.warn?.("Circuit opened", { circuitKey: key, + openReasons, + status, + isTimeout, failureCount: state.failureTimestamps.length, - threshold: CIRCUIT_FAILURE_THRESHOLD, + failureThreshold: CIRCUIT_FAILURE_THRESHOLD, + timeoutCount: state.timeoutTimestamps.length, + timeoutThreshold: CIRCUIT_TIMEOUT_THRESHOLD, openedAt: state.openedAt, }); } @@ -175,7 +208,9 @@ export class TrafficCircuitBreaker { circuitKey: key, status: state.status, failureCount: state.failureTimestamps.length, - windowMs: CIRCUIT_FAILURE_WINDOW_MS, + failureWindowMs: CIRCUIT_FAILURE_WINDOW_MS, + timeoutCount: state.timeoutTimestamps.length, + timeoutWindowMs: CIRCUIT_TIMEOUT_WINDOW_MS, }); } @@ -201,6 +236,7 @@ export class TrafficCircuitBreaker { state.status = "half-open"; state.trialInFlight = false; state.failureTimestamps = []; + state.timeoutTimestamps = []; logger?.debug?.("Circuit transitioned to half-open", { circuitKey: key }); return { allowRequest: true, state: "half-open" }; } diff --git a/packages/core/src/traffic/traffic-constants.ts b/packages/core/src/traffic/traffic-constants.ts index 74845ab65..c21f8a17e 100644 --- a/packages/core/src/traffic/traffic-constants.ts +++ b/packages/core/src/traffic/traffic-constants.ts @@ -11,6 +11,8 @@ export const TIMEOUT_JITTER_FACTOR = 0.5; export const CIRCUIT_FAILURE_THRESHOLD = 5; export const CIRCUIT_FAILURE_WINDOW_MS = 10_000; +export const CIRCUIT_TIMEOUT_THRESHOLD = CIRCUIT_FAILURE_THRESHOLD; +export const CIRCUIT_TIMEOUT_WINDOW_MS = CIRCUIT_FAILURE_WINDOW_MS; export const CIRCUIT_COOLDOWN_MS = 30_000; export const RATE_LIMIT_EXHAUSTION_BUFFER = 1; diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts index 48a3b9894..cc71fe74a 100644 --- a/packages/core/src/traffic/traffic-controller-internal.ts +++ b/packages/core/src/traffic/traffic-controller-internal.ts @@ -12,6 +12,7 @@ export type CircuitStateStatus = "closed" | "open" | "half-open"; export interface CircuitState { status: CircuitStateStatus; failureTimestamps: number[]; + timeoutTimestamps: number[]; openedAt?: number; trialInFlight?: boolean; } diff --git a/packages/core/src/traffic/traffic-error-utils.ts b/packages/core/src/traffic/traffic-error-utils.ts index 69d15d84c..946f1c0be 100644 --- a/packages/core/src/traffic/traffic-error-utils.ts +++ b/packages/core/src/traffic/traffic-error-utils.ts @@ -96,28 +96,45 @@ 
export function extractRetryAfterMs(error: unknown, logger?: Logger): number | u } export function isTimeoutError(error: unknown, logger?: Logger): boolean { - const code = readObjectProperty(error, "code"); - const name = readObjectProperty(error, "name"); - const message = readObjectProperty(error, "message"); - const isTimeout = - String(code ?? "") - .toLowerCase() - .includes("timeout") || - String(name ?? "") - .toLowerCase() - .includes("timeout") || - String(message ?? "") - .toLowerCase() - .includes("timeout"); - - logger?.trace?.("Checked timeout error", { - isTimeout, - code, - name, - messagePreview: typeof message === "string" ? message.slice(0, 160) : message, - }); + const candidates: unknown[] = [error]; + + const cause = readObjectProperty(error, "cause"); + if (cause) { + candidates.push(cause); + const nestedCause = readObjectProperty(cause, "cause"); + if (nestedCause) candidates.push(nestedCause); + } + + for (const candidate of candidates) { + const code = readObjectProperty(candidate, "code"); + const name = readObjectProperty(candidate, "name"); + const message = readObjectProperty(candidate, "message"); + + const codeText = String(code ?? "").toLowerCase(); + const nameText = String(name ?? "").toLowerCase(); + const messageText = String(message ?? "").toLowerCase(); + + const isTimeout = + codeText.includes("timeout") || + codeText.includes("timedout") || + nameText.includes("timeout") || + nameText.includes("timedout") || + messageText.includes("timeout") || + messageText.includes("timedout") || + messageText.includes("timed out"); + + logger?.trace?.("Checked timeout error", { + isTimeout, + code, + name, + messagePreview: typeof message === "string" ? message.slice(0, 160) : message, + hasCause: candidate !== error, + }); + + if (isTimeout) return true; + } - return isTimeout; + return false; } export function isPromiseLike(value: unknown): value is PromiseLike { From bee794e5e1a5e09f750798dce77c725a8ee27b71 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Tue, 16 Dec 2025 01:48:22 +0530 Subject: [PATCH 14/41] feat: tenant based ckt breaker --- .../src/traffic/traffic-circuit-breaker.ts | 2 + .../traffic/traffic-concurrency-limiter.ts | 235 ++++++++++++++++++ .../traffic/traffic-controller-internal.ts | 3 + .../core/src/traffic/traffic-controller.ts | 122 +++++---- packages/core/src/traffic/traffic-types.ts | 12 + 5 files changed, 328 insertions(+), 46 deletions(-) create mode 100644 packages/core/src/traffic/traffic-concurrency-limiter.ts diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts index d44c2e332..812b29213 100644 --- a/packages/core/src/traffic/traffic-circuit-breaker.ts +++ b/packages/core/src/traffic/traffic-circuit-breaker.ts @@ -91,6 +91,8 @@ export class TrafficCircuitBreaker { next.request = fallbackRequest; next.attempt = 1; + next.tenantConcurrencyKey = undefined; + next.providerModelConcurrencyKey = undefined; next.rateLimitKey = undefined; next.etaMs = undefined; next.circuitKey = undefined; diff --git a/packages/core/src/traffic/traffic-concurrency-limiter.ts b/packages/core/src/traffic/traffic-concurrency-limiter.ts new file mode 100644 index 000000000..e15256127 --- /dev/null +++ b/packages/core/src/traffic/traffic-concurrency-limiter.ts @@ -0,0 +1,235 @@ +import type { Logger } from "../logger"; +import type { QueuedRequest } from "./traffic-controller-internal"; +import type { + ProviderModelConcurrencyLimit, + TenantConcurrencyLimit, + TrafficRequestMetadata, +} from 
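/* Editor's illustration (not part of the patch): the rewritten isTimeoutError also
   walks error.cause (and cause.cause), so wrapped network timeouts are caught.
   The undici-style shape below is assumed for the example:

   const err = new Error("fetch failed", {
     cause: Object.assign(new Error("connect timed out"), {
       code: "UND_ERR_CONNECT_TIMEOUT",
     }),
   });
   // isTimeoutError(err) === true: both the cause's code and its message match "timeout"/"timed out".
*/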
"./traffic-types"; + +export type ConcurrencyBlockReason = + | { + gate: "providerModel"; + key: string; + inFlight: number; + limit: number; + } + | { + gate: "tenant"; + key: string; + inFlight: number; + limit: number; + }; + +export type ConcurrencyDecision = + | { kind: "allow" } + | { kind: "wait"; reasons: ConcurrencyBlockReason[] }; + +function toNonNegativeIntegerLimit(raw: unknown): number | undefined { + if (raw === undefined || raw === null) return undefined; + const n = typeof raw === "number" ? raw : Number(raw); + if (!Number.isFinite(n)) return undefined; + if (n <= 0) return 0; + return Math.floor(n); +} + +function getInFlight(map: Map, key: string): number { + return map.get(key) ?? 0; +} + +function incrementInFlight(map: Map, key: string): void { + map.set(key, getInFlight(map, key) + 1); +} + +function decrementInFlight(map: Map, key: string): void { + const current = getInFlight(map, key); + if (current <= 1) { + map.delete(key); + return; + } + map.set(key, current - 1); +} + +export class TrafficConcurrencyLimiter { + private readonly inFlightByProviderModel = new Map(); + private readonly inFlightByTenant = new Map(); + + private readonly buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string; + private readonly providerModelLimit?: ProviderModelConcurrencyLimit; + private readonly tenantLimit?: TenantConcurrencyLimit; + private readonly providerModelEnabled: boolean; + private readonly tenantEnabled: boolean; + + constructor(options: { + buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string; + maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit; + maxConcurrentPerTenant?: TenantConcurrencyLimit; + }) { + this.buildProviderModelKey = options.buildProviderModelKey; + this.providerModelLimit = options.maxConcurrentPerProviderModel; + this.tenantLimit = options.maxConcurrentPerTenant; + this.providerModelEnabled = options.maxConcurrentPerProviderModel !== undefined; + this.tenantEnabled = options.maxConcurrentPerTenant !== undefined; + } + + resolve(next: QueuedRequest, logger?: Logger): ConcurrencyDecision { + if (!this.providerModelEnabled && !this.tenantEnabled) return { kind: "allow" }; + const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); + const reasons: ConcurrencyBlockReason[] = []; + + if (this.providerModelEnabled) { + const providerModelKey = this.buildProviderModelKey(next.request.metadata); + const providerModelLimit = this.resolveProviderModelLimit( + providerModelKey, + next.request.metadata, + concurrencyLogger, + ); + if (providerModelLimit !== undefined) { + const inFlight = getInFlight(this.inFlightByProviderModel, providerModelKey); + if (inFlight >= providerModelLimit) { + reasons.push({ + gate: "providerModel", + key: providerModelKey, + inFlight, + limit: providerModelLimit, + }); + } + } + } + + if (this.tenantEnabled) { + const tenantKey = next.tenantId; + const tenantLimit = this.resolveTenantLimit( + tenantKey, + next.request.metadata, + concurrencyLogger, + ); + if (tenantLimit !== undefined) { + const inFlight = getInFlight(this.inFlightByTenant, tenantKey); + if (inFlight >= tenantLimit) { + reasons.push({ + gate: "tenant", + key: tenantKey, + inFlight, + limit: tenantLimit, + }); + } + } + } + + if (reasons.length === 0) return { kind: "allow" }; + + concurrencyLogger?.trace?.("Concurrency gate blocked request", { + tenantId: next.tenantId, + reasons, + }); + return { kind: "wait", reasons }; + } + + acquire(next: QueuedRequest, logger?: Logger): void { + if 
(!this.providerModelEnabled && !this.tenantEnabled) return; + const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); + + let tenantKey: string | undefined; + if (this.tenantEnabled) { + tenantKey = next.tenantId; + next.tenantConcurrencyKey = tenantKey; + incrementInFlight(this.inFlightByTenant, tenantKey); + } + + let providerModelKey: string | undefined; + if (this.providerModelEnabled) { + providerModelKey = this.buildProviderModelKey(next.request.metadata); + next.providerModelConcurrencyKey = providerModelKey; + incrementInFlight(this.inFlightByProviderModel, providerModelKey); + } + + concurrencyLogger?.trace?.("Concurrency slots acquired", { + tenantId: tenantKey, + tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined, + providerModelKey, + providerModelInFlight: providerModelKey + ? getInFlight(this.inFlightByProviderModel, providerModelKey) + : undefined, + }); + } + + release(next: QueuedRequest, logger?: Logger): void { + const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); + const tenantKey = next.tenantConcurrencyKey; + const providerModelKey = next.providerModelConcurrencyKey; + + if (tenantKey) { + decrementInFlight(this.inFlightByTenant, tenantKey); + } + + if (providerModelKey) { + decrementInFlight(this.inFlightByProviderModel, providerModelKey); + } + + if (tenantKey || providerModelKey) { + concurrencyLogger?.trace?.("Concurrency slots released", { + tenantId: tenantKey, + tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined, + providerModelKey, + providerModelInFlight: providerModelKey + ? getInFlight(this.inFlightByProviderModel, providerModelKey) + : undefined, + }); + } + + next.tenantConcurrencyKey = undefined; + next.providerModelConcurrencyKey = undefined; + } + + private resolveTenantLimit( + tenantId: string, + metadata: TrafficRequestMetadata | undefined, + logger?: Logger, + ): number | undefined { + const policy = this.tenantLimit; + if (policy === undefined) return undefined; + + if (typeof policy === "number") return toNonNegativeIntegerLimit(policy); + if (typeof policy === "function") { + try { + return toNonNegativeIntegerLimit(policy(tenantId, metadata)); + } catch (error) { + logger?.warn?.("Tenant concurrency resolver threw; ignoring", { + tenantId, + errorName: (error as { name?: unknown } | null)?.name, + errorMessage: (error as { message?: unknown } | null)?.message, + }); + return undefined; + } + } + + return toNonNegativeIntegerLimit(policy[tenantId]); + } + + private resolveProviderModelLimit( + key: string, + metadata: TrafficRequestMetadata | undefined, + logger?: Logger, + ): number | undefined { + const policy = this.providerModelLimit; + if (policy === undefined) return undefined; + + if (typeof policy === "number") return toNonNegativeIntegerLimit(policy); + if (typeof policy === "function") { + try { + return toNonNegativeIntegerLimit(policy(metadata, key)); + } catch (error) { + logger?.warn?.("Provider/model concurrency resolver threw; ignoring", { + key, + provider: metadata?.provider, + model: metadata?.model, + errorName: (error as { name?: unknown } | null)?.name, + errorMessage: (error as { message?: unknown } | null)?.message, + }); + return undefined; + } + } + + return toNonNegativeIntegerLimit(policy[key]); + } +} diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts index cc71fe74a..065c92c1d 100644 --- 
a/packages/core/src/traffic/traffic-controller-internal.ts +++ b/packages/core/src/traffic/traffic-controller-internal.ts @@ -38,6 +38,9 @@ export interface QueuedRequest { priority: TrafficPriority; tenantId: string; + tenantConcurrencyKey?: string; + providerModelConcurrencyKey?: string; + rateLimitKey?: string; etaMs?: number; diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 8dcb9a350..421eea79b 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -1,14 +1,17 @@ import type { Logger } from "../logger"; import { LoggerProxy } from "../logger"; import { TrafficCircuitBreaker } from "./traffic-circuit-breaker"; +import { TrafficConcurrencyLimiter } from "./traffic-concurrency-limiter"; import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal"; import { CircuitBreakerOpenError, RateLimitedUpstreamError } from "./traffic-errors"; import { type RateLimitUpdateResult, TrafficRateLimiter } from "./traffic-rate-limiter"; import { type RetryReason, buildRetryPlan } from "./traffic-retry"; import type { + ProviderModelConcurrencyLimit, RateLimitConfig, RateLimitKey, RateLimitOptions, + TenantConcurrencyLimit, TenantUsage, TrafficControllerOptions, TrafficPriority, @@ -24,9 +27,11 @@ import { TrafficUsageTracker } from "./traffic-usage-tracker"; */ export type { + ProviderModelConcurrencyLimit, RateLimitConfig, RateLimitKey, RateLimitOptions, + TenantConcurrencyLimit, TenantUsage, TrafficControllerOptions, TrafficPriority, @@ -46,6 +51,7 @@ export class TrafficController { private readonly logger: Logger; private readonly trafficLogger: Logger; private readonly controllerLogger: Logger; + private readonly concurrencyLimiter: TrafficConcurrencyLimiter; private readonly queues: Record = { P0: [], @@ -77,10 +83,17 @@ export class TrafficController { fallbackChains: options.fallbackChains, buildRateLimitKey: (metadata) => this.buildRateLimitKey(metadata), }); + this.concurrencyLimiter = new TrafficConcurrencyLimiter({ + buildProviderModelKey: (metadata) => this.buildRateLimitKey(metadata), + maxConcurrentPerProviderModel: options.maxConcurrentPerProviderModel, + maxConcurrentPerTenant: options.maxConcurrentPerTenant, + }); this.controllerLogger.debug("Initialized TrafficController", { maxConcurrent: this.maxConcurrent, hasFallbackChains: !!options.fallbackChains, + hasProviderModelConcurrency: options.maxConcurrentPerProviderModel !== undefined, + hasTenantConcurrency: options.maxConcurrentPerTenant !== undefined, }); } @@ -230,60 +243,73 @@ export class TrafficController { let earliestWakeUpAt: number | undefined; - for (const priority of this.priorityOrder) { - const next = this.queues[priority][0]; - if (!next) continue; + const observeWakeUpAt = (candidate?: number): void => { + if (candidate === undefined) return; + earliestWakeUpAt = + earliestWakeUpAt === undefined ? 
candidate : Math.min(earliestWakeUpAt, candidate); + }; - this.controllerLogger.trace("Evaluate next queued request", { - priority, - type: next.type, - tenantId: next.tenantId, - attempt: next.attempt, - provider: next.request.metadata?.provider, - model: next.request.metadata?.model, - }); + for (const priority of this.priorityOrder) { + const queue = this.queues[priority]; + for (let index = 0; index < queue.length; index++) { + const next = queue[index]; + if (!next) continue; - const circuit = this.resolveCircuit(next); - if (circuit) { - this.controllerLogger.trace("Circuit resolution returned decision", { + this.controllerLogger.trace("Evaluate next queued request", { priority, - decision: circuit, - circuitKey: next.circuitKey, - circuitStatus: next.circuitStatus, + queueIndex: index, + queueLength: queue.length, + type: next.type, + tenantId: next.tenantId, + attempt: next.attempt, + provider: next.request.metadata?.provider, + model: next.request.metadata?.model, }); - if (circuit.kind === "skip") { - this.queues[priority].shift(); - return { kind: "skip" }; - } - if (circuit.kind === "wait") { - if (circuit.wakeUpAt !== undefined) { - earliestWakeUpAt = - earliestWakeUpAt === undefined - ? circuit.wakeUpAt - : Math.min(earliestWakeUpAt, circuit.wakeUpAt); + + const circuit = this.resolveCircuit(next); + if (circuit) { + this.controllerLogger.trace("Circuit resolution returned decision", { + priority, + decision: circuit, + circuitKey: next.circuitKey, + circuitStatus: next.circuitStatus, + }); + if (circuit.kind === "skip") { + queue.splice(index, 1); + return { kind: "skip" }; } + if (circuit.kind === "wait") { + observeWakeUpAt(circuit.wakeUpAt); + continue; + } + } + + const concurrency = this.concurrencyLimiter.resolve(next, this.trafficLogger); + if (concurrency.kind === "wait") { + this.controllerLogger.trace("Concurrency gate blocked request", { + priority, + tenantId: next.tenantId, + provider: next.request.metadata?.provider, + model: next.request.metadata?.model, + reasons: concurrency.reasons, + }); continue; } - } - const rateLimit = this.resolveRateLimit(next); - if (rateLimit) { - this.controllerLogger.trace("Rate limit resolution returned decision", { - priority, - decision: rateLimit, - rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), - }); - if (rateLimit.kind === "wait" && rateLimit.wakeUpAt !== undefined) { - earliestWakeUpAt = - earliestWakeUpAt === undefined - ? rateLimit.wakeUpAt - : Math.min(earliestWakeUpAt, rateLimit.wakeUpAt); + const rateLimit = this.resolveRateLimit(next); + if (rateLimit) { + this.controllerLogger.trace("Rate limit resolution returned decision", { + priority, + decision: rateLimit, + rateLimitKey: next.rateLimitKey ?? 
this.buildRateLimitKey(next.request.metadata), + }); + if (rateLimit.kind === "wait") observeWakeUpAt(rateLimit.wakeUpAt); + continue; } - continue; - } - this.startRequest(next); - return { kind: "dispatch" }; + this.startRequest(next, index); + return { kind: "dispatch" }; + } } return earliestWakeUpAt !== undefined @@ -291,7 +317,7 @@ export class TrafficController { : { kind: "wait" }; } - private startRequest(item: QueuedRequest): void { + private startRequest(item: QueuedRequest, queueIndex: number): void { this.controllerLogger.debug("Start request", { priority: item.priority, type: item.type, @@ -300,8 +326,9 @@ export class TrafficController { provider: item.request.metadata?.provider, model: item.request.metadata?.model, }); - this.queues[item.priority].shift(); + this.queues[item.priority].splice(queueIndex, 1); this.activeCount++; + this.concurrencyLimiter.acquire(item, this.trafficLogger); this.rateLimiter.notifyDispatch(item.rateLimitKey, this.trafficLogger); this.circuitBreaker.markTrial(item, this.trafficLogger); void this.executeRequest(item); @@ -375,6 +402,7 @@ export class TrafficController { } } finally { this.rateLimiter.releaseReservation(item.rateLimitKey, this.trafficLogger); + this.concurrencyLimiter.release(item, this.trafficLogger); this.activeCount = Math.max(0, this.activeCount - 1); this.controllerLogger.trace("Request finished; slot released", { tenantId: item.tenantId, @@ -411,6 +439,8 @@ export class TrafficController { this.queues[item.priority].push({ ...item, attempt: item.attempt + 1, + tenantConcurrencyKey: undefined, + providerModelConcurrencyKey: undefined, rateLimitKey: undefined, etaMs: undefined, circuitKey: undefined, diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts index ed3b4f316..9aaf5eb0d 100644 --- a/packages/core/src/traffic/traffic-types.ts +++ b/packages/core/src/traffic/traffic-types.ts @@ -22,6 +22,16 @@ export interface TrafficRequestMetadata { tenantId?: string; } +export type ProviderModelConcurrencyLimit = + | number + | Record + | ((metadata: TrafficRequestMetadata | undefined, key: string) => number | undefined); + +export type TenantConcurrencyLimit = + | number + | Record + | ((tenantId: string, metadata: TrafficRequestMetadata | undefined) => number | undefined); + export interface TrafficRequest { tenantId: string; metadata?: TrafficRequestMetadata; @@ -35,6 +45,8 @@ export interface TrafficRequest { export interface TrafficControllerOptions { maxConcurrent?: number; + maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit; + maxConcurrentPerTenant?: TenantConcurrencyLimit; rateLimits?: RateLimitConfig; logger?: Logger; fallbackChains?: Record; From 3b0f14dda8e8bff53a2c64812f12546b7bdf388a Mon Sep 17 00:00:00 2001 From: riturajFi Date: Tue, 16 Dec 2025 15:06:11 +0530 Subject: [PATCH 15/41] feat: token bucket rate limit strategy --- packages/core/src/agent/agent.ts | 27 +- .../default-rate-limit-strategy.ts | 243 +++++++++++++ .../openai-window-rate-limit-strategy.ts | 61 ++++ .../rate-limit-strategy.ts | 33 ++ .../rate-limit-strategies/rate-limit-utils.ts | 26 ++ .../token-bucket-rate-limit-strategy.ts | 208 ++++++++++++ .../core/src/traffic/traffic-controller.ts | 20 +- .../core/src/traffic/traffic-rate-limiter.ts | 320 ++---------------- 8 files changed, 635 insertions(+), 303 deletions(-) create mode 100644 packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts create mode 100644 
packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts create mode 100644 packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts create mode 100644 packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts create mode 100644 packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index adf6e92f9..ed2adc777 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -638,6 +638,7 @@ export class Agent { }); this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger); } catch (error) { + this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger); finalizeLLMSpan(SpanStatusCode.ERROR, { message: (error as Error).message }); throw error; } @@ -1012,6 +1013,7 @@ export class Agent { modelName: this.getModelName(), }); + this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger); finalizeLLMSpan(SpanStatusCode.ERROR, { message: (actualError as Error)?.message }); // History update removed - using OpenTelemetry only @@ -4059,15 +4061,26 @@ export class Agent { metadata: TrafficRequestMetadata | undefined, logger?: Logger, ): void { - if (!response || typeof response !== "object") { - logger?.debug?.("[Traffic] No response object available for rate limit update"); - return; - } + const readObjectProperty = (value: unknown, key: string): unknown => { + if (!value || typeof value !== "object") return undefined; + return (value as Record)[key]; + }; - const responseWithHeaders = response as { headers?: unknown } | null; - const headers = responseWithHeaders?.headers; + const headerCandidates: unknown[] = [ + readObjectProperty(response, "headers"), + readObjectProperty(readObjectProperty(response, "response"), "headers"), + readObjectProperty(readObjectProperty(response, "cause"), "headers"), + readObjectProperty( + readObjectProperty(readObjectProperty(response, "cause"), "response"), + "headers", + ), + ]; + + const headers = headerCandidates.find( + (candidate) => candidate !== undefined && candidate !== null, + ); if (!headers) { - logger?.debug?.("[Traffic] Response missing headers; skipping rate limit update"); + logger?.debug?.("[Traffic] No headers found for rate limit update"); return; } diff --git a/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts new file mode 100644 index 000000000..652b7e59a --- /dev/null +++ b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts @@ -0,0 +1,243 @@ +import type { Logger } from "../../logger"; +import { + RATE_LIMIT_EXHAUSTION_BUFFER, + RATE_LIMIT_MIN_PACE_INTERVAL_MS, + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS, + RATE_LIMIT_PROBE_DELAY_MS, +} from "../traffic-constants"; +import type { + DispatchDecision, + QueuedRequest, + RateLimitWindowState, +} from "../traffic-controller-internal"; +import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils"; +import type { TrafficRequestMetadata } from "../traffic-types"; +import type { + RateLimitHeaderSnapshot, + RateLimitStrategy, + RateLimitUpdateResult, +} from "./rate-limit-strategy"; +import { parseResetDurationToMs } from "./rate-limit-utils"; + +export class DefaultRateLimitStrategy implements RateLimitStrategy { + private state?: RateLimitWindowState; + private readonly key: string; + + 
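// Window-based pacing, as implemented below: updateFromHeaders() folds x-ratelimit-* and + // Retry-After headers into a per-key window state; resolve() reserves one slot per dispatch + // and paces requests across the remaining window (time left / requests left), waiting for a + // probe once the window is exhausted. +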
constructor(key: string) { + this.key = key; + } + + resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const state = this.state; + if (!state) { + rateLimitLogger?.trace?.("Rate limit state missing; allow request", { + rateLimitKey: this.key, + }); + return null; + } + + const now = Date.now(); + const effectiveRemaining = Math.max(0, state.remaining - state.reserved); + const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; + + if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { + if (now < probeAt) { + rateLimitLogger?.debug?.("Rate limit exhausted; waiting for probe", { + rateLimitKey: this.key, + remaining: state.remaining, + reserved: state.reserved, + effectiveRemaining, + resetAt: state.resetAt, + probeAt, + }); + return { kind: "wait", wakeUpAt: probeAt }; + } + if (state.reserved > 0) { + rateLimitLogger?.debug?.("Rate limit exhausted but in-flight reservations exist; waiting", { + rateLimitKey: this.key, + remaining: state.remaining, + reserved: state.reserved, + effectiveRemaining, + resetAt: state.resetAt, + }); + return { kind: "wait" }; + } + } + + if (now < state.nextAllowedAt) { + rateLimitLogger?.debug?.("Rate limit pacing; waiting until nextAllowedAt", { + rateLimitKey: this.key, + nextAllowedAt: state.nextAllowedAt, + resetAt: state.resetAt, + waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now, + }); + return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; + } + + state.reserved += 1; + next.rateLimitKey = this.key; + rateLimitLogger?.trace?.("Reserved rate limit token", { + rateLimitKey: this.key, + reserved: state.reserved, + remaining: state.remaining, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + }); + + const remainingWindowMs = Math.max(0, state.resetAt - now); + const intervalMs = Math.max( + RATE_LIMIT_MIN_PACE_INTERVAL_MS, + Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), + ); + + const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); + if ( + state.nextAllowedAt <= now || + candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS + ) { + state.nextAllowedAt = candidateNext; + rateLimitLogger?.trace?.("Updated pacing nextAllowedAt", { + rateLimitKey: this.key, + nextAllowedAt: state.nextAllowedAt, + intervalMs, + remainingWindowMs, + effectiveRemaining, + }); + } + + return null; + } + + onDispatch(_logger?: Logger): void {} + + onComplete(logger?: Logger): void { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const state = this.state; + if (!state || state.reserved <= 0) return; + state.reserved -= 1; + rateLimitLogger?.trace?.("Released rate limit reservation", { + rateLimitKey: this.key, + reserved: state.reserved, + remaining: state.remaining, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + }); + } + + updateFromHeaders( + _metadata: TrafficRequestMetadata | undefined, + headers: unknown, + logger?: Logger, + ): RateLimitUpdateResult | undefined { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const limitRequests = readHeaderValue(headers, "x-ratelimit-limit-requests"); + const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests"); + const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests"); + const retryAfter = readHeaderValue(headers, "retry-after"); + const retryAfterMs = retryAfter ? 
parseRetryAfterMs(retryAfter) : undefined; + + const now = Date.now(); + const existing = this.state; + let state: RateLimitWindowState | undefined; + let headerSnapshot: RateLimitHeaderSnapshot | undefined; + + if (limitRequests && remainingRequests && resetRequests) { + const limit = Number(limitRequests); + const remaining = Number(remainingRequests); + if (!Number.isFinite(limit) || !Number.isFinite(remaining)) { + rateLimitLogger?.debug?.("Invalid rate limit numeric headers; skipping", { + rateLimitKey: this.key, + limitRequests, + remainingRequests, + }); + return undefined; + } + + const resetRequestsMs = parseResetDurationToMs(resetRequests); + if (resetRequestsMs === undefined) { + rateLimitLogger?.debug?.("Unable to parse reset duration; skipping", { + rateLimitKey: this.key, + resetRequests, + }); + return undefined; + } + + const parsedResetAt = now + resetRequestsMs; + const isSameWindow = !!existing && now < existing.resetAt; + const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; + const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; + const reserved = Math.max(0, existing?.reserved ?? 0); + + state = { + limit, + remaining: isSameWindow ? Math.min(existing.remaining, remaining) : remaining, + resetAt, + reserved, + nextAllowedAt, + }; + headerSnapshot = { + limitRequests, + remainingRequests, + resetRequests, + resetRequestsMs, + }; + } else if (retryAfterMs === undefined) { + rateLimitLogger?.trace?.("Missing rate limit headers; skipping", { + rateLimitKey: this.key, + hasLimit: !!limitRequests, + hasRemaining: !!remainingRequests, + hasReset: !!resetRequests, + hasRetryAfter: !!retryAfter, + }); + return undefined; + } + + if (!state) { + if (retryAfterMs === undefined) { + rateLimitLogger?.trace?.("Retry-After missing or unparsable; skipping", { + rateLimitKey: this.key, + retryAfter, + }); + return undefined; + } + const targetAt = now + retryAfterMs; + const isSameWindow = !!existing && now < existing.resetAt; + state = { + limit: existing?.limit ?? 1, + remaining: 0, + resetAt: isSameWindow ? Math.max(existing.resetAt, targetAt) : targetAt, + reserved: Math.max(0, existing?.reserved ?? 0), + nextAllowedAt: Math.max(existing?.nextAllowedAt ?? now, targetAt), + }; + headerSnapshot = { retryAfter, retryAfterMs }; + } else if (retryAfterMs !== undefined) { + const targetAt = now + retryAfterMs; + state = { + ...state, + remaining: 0, + resetAt: Math.max(state.resetAt, targetAt), + nextAllowedAt: Math.max(state.nextAllowedAt, targetAt), + }; + headerSnapshot = { ...headerSnapshot, retryAfter, retryAfterMs }; + } + + this.state = state; + rateLimitLogger?.debug?.("Applied rate limit headers to state", { + rateLimitKey: this.key, + limit: state.limit, + remaining: state.remaining, + effectiveRemaining: Math.max(0, state.remaining - state.reserved), + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + resetRequestsMs: headerSnapshot?.resetRequestsMs, + retryAfterMs: headerSnapshot?.retryAfterMs, + }); + + return { + key: this.key, + headerSnapshot: headerSnapshot ?? 
{}, + state, + }; + } +} diff --git a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts new file mode 100644 index 000000000..f81e72696 --- /dev/null +++ b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts @@ -0,0 +1,61 @@ +import type { Logger } from "../../logger"; +import type { DispatchDecision, QueuedRequest } from "../traffic-controller-internal"; +import type { TrafficRequestMetadata } from "../traffic-types"; +import { DefaultRateLimitStrategy } from "./default-rate-limit-strategy"; +import type { RateLimitStrategy, RateLimitUpdateResult } from "./rate-limit-strategy"; + +export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { + private readonly window: DefaultRateLimitStrategy; + private readonly key: string; + private bootstrapReserved = 0; + + constructor(key: string) { + this.key = key; + this.window = new DefaultRateLimitStrategy(key); + } + + resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { + const decision = this.window.resolve(next, logger); + if (decision) return decision; + + if (next.rateLimitKey) { + return null; + } + + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + if (this.bootstrapReserved >= 1) { + rateLimitLogger?.debug?.("OpenAI rate limit bootstrap active; waiting", { + rateLimitKey: this.key, + bootstrapReserved: this.bootstrapReserved, + }); + return { kind: "wait" }; + } + + this.bootstrapReserved += 1; + next.rateLimitKey = this.key; + rateLimitLogger?.debug?.("OpenAI rate limit bootstrap reserved", { + rateLimitKey: this.key, + bootstrapReserved: this.bootstrapReserved, + }); + return null; + } + + onDispatch(logger?: Logger): void { + this.window.onDispatch(logger); + } + + onComplete(logger?: Logger): void { + if (this.bootstrapReserved > 0) { + this.bootstrapReserved -= 1; + } + this.window.onComplete(logger); + } + + updateFromHeaders( + metadata: TrafficRequestMetadata | undefined, + headers: unknown, + logger?: Logger, + ): RateLimitUpdateResult | undefined { + return this.window.updateFromHeaders(metadata, headers, logger); + } +} diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts new file mode 100644 index 000000000..4e5a06245 --- /dev/null +++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts @@ -0,0 +1,33 @@ +import type { Logger } from "../../logger"; +import type { + DispatchDecision, + QueuedRequest, + RateLimitWindowState, +} from "../traffic-controller-internal"; +import type { TrafficRequestMetadata } from "../traffic-types"; + +export type RateLimitHeaderSnapshot = { + limitRequests?: string; + remainingRequests?: string; + resetRequests?: string; + resetRequestsMs?: number; + retryAfter?: string; + retryAfterMs?: number; +}; + +export type RateLimitUpdateResult = { + key: string; + headerSnapshot: RateLimitHeaderSnapshot; + state: RateLimitWindowState; +}; + +export interface RateLimitStrategy { + resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null; + onDispatch(logger?: Logger): void; + onComplete(logger?: Logger): void; + updateFromHeaders( + metadata: TrafficRequestMetadata | undefined, + headers: unknown, + logger?: Logger, + ): RateLimitUpdateResult | undefined; +} diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts 
b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts new file mode 100644 index 000000000..310c9a7e6 --- /dev/null +++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts @@ -0,0 +1,26 @@ +export function parseResetDurationToMs(raw: string): number | undefined { + const value = raw.trim(); + if (!value) return undefined; + + let totalMs = 0; + const regex = /(\d+(?:\.\d+)?)(ms|s|m|h|d)/g; + let matched = false; + for (const match of value.matchAll(regex)) { + matched = true; + const amount = Number.parseFloat(match[1] ?? ""); + if (!Number.isFinite(amount)) continue; + const unit = match[2]; + if (unit === "ms") totalMs += amount; + else if (unit === "s") totalMs += amount * 1000; + else if (unit === "m") totalMs += amount * 60_000; + else if (unit === "h") totalMs += amount * 3_600_000; + else if (unit === "d") totalMs += amount * 86_400_000; + } + + if (matched) { + return Math.round(totalMs); + } + + const n = Number(value); + return Number.isFinite(n) ? Math.round(n) : undefined; +} diff --git a/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts new file mode 100644 index 000000000..314beac8e --- /dev/null +++ b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts @@ -0,0 +1,208 @@ +import type { Logger } from "../../logger"; +import type { + DispatchDecision, + QueuedRequest, + RateLimitWindowState, +} from "../traffic-controller-internal"; +import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils"; +import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types"; +import type { + RateLimitHeaderSnapshot, + RateLimitStrategy, + RateLimitUpdateResult, +} from "./rate-limit-strategy"; +import { parseResetDurationToMs } from "./rate-limit-utils"; + +type TokenBucketState = { + capacity: number; + refillPerSecond: number; + tokens: number; + updatedAt: number; +}; + +function normalizeTokenBucketOptions( + raw: RateLimitOptions | undefined, +): Omit { + const capacityRaw = raw?.capacity; + const refillRaw = raw?.refillPerSecond; + + const capacity = typeof capacityRaw === "number" ? capacityRaw : Number(capacityRaw); + const refillPerSecond = typeof refillRaw === "number" ? refillRaw : Number(refillRaw); + + const safeCapacity = Number.isFinite(capacity) ? capacity : 0; + const safeRefill = Number.isFinite(refillPerSecond) ? refillPerSecond : 0; + + return { + capacity: safeCapacity > 0 ? Math.max(1, safeCapacity) : 0, + refillPerSecond: safeRefill > 0 ? 
safeRefill : 0, + }; +} +function refillTokenBucket(bucket: TokenBucketState, now: number): void { + const elapsedMs = now - bucket.updatedAt; + if (elapsedMs <= 0) return; + bucket.updatedAt = now; + if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return; + + const refill = (elapsedMs / 1000) * bucket.refillPerSecond; + if (refill <= 0) return; + bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill); +} + +export class TokenBucketRateLimitStrategy implements RateLimitStrategy { + private readonly key: string; + private bucket?: TokenBucketState; + private cooldownUntil?: number; + + constructor(key: string, options?: RateLimitOptions) { + this.key = key; + if (!options) return; + const normalized = normalizeTokenBucketOptions(options); + const now = Date.now(); + this.bucket = { + ...normalized, + tokens: normalized.capacity, + updatedAt: now, + }; + } + + resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const now = Date.now(); + + if (this.cooldownUntil !== undefined && now < this.cooldownUntil) { + rateLimitLogger?.debug?.("Token bucket cooldown active; waiting", { + rateLimitKey: this.key, + cooldownUntil: this.cooldownUntil, + waitMs: this.cooldownUntil - now, + }); + return { kind: "wait", wakeUpAt: this.cooldownUntil }; + } + + const bucket = this.bucket; + if (!bucket) return null; + + refillTokenBucket(bucket, now); + + if (bucket.capacity <= 0) { + rateLimitLogger?.debug?.("Token bucket misconfigured; blocking", { + rateLimitKey: this.key, + capacity: bucket.capacity, + refillPerSecond: bucket.refillPerSecond, + }); + return { kind: "wait" }; + } + + if (bucket.tokens >= 1) { + bucket.tokens -= 1; + next.rateLimitKey = this.key; + rateLimitLogger?.trace?.("Consumed token bucket token", { + rateLimitKey: this.key, + tokens: bucket.tokens, + capacity: bucket.capacity, + refillPerSecond: bucket.refillPerSecond, + }); + return null; + } + + if (bucket.refillPerSecond <= 0) { + rateLimitLogger?.debug?.("Token bucket has no refill; blocking", { + rateLimitKey: this.key, + capacity: bucket.capacity, + refillPerSecond: bucket.refillPerSecond, + }); + return { kind: "wait" }; + } + + const requiredTokens = 1 - bucket.tokens; + const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000)); + const wakeUpAt = now + waitMs; + rateLimitLogger?.debug?.("Token bucket empty; waiting", { + rateLimitKey: this.key, + tokens: bucket.tokens, + capacity: bucket.capacity, + refillPerSecond: bucket.refillPerSecond, + wakeUpAt, + waitMs, + }); + return { kind: "wait", wakeUpAt }; + } + + onDispatch(_logger?: Logger): void {} + + onComplete(_logger?: Logger): void {} + + updateFromHeaders( + _metadata: TrafficRequestMetadata | undefined, + headers: unknown, + logger?: Logger, + ): RateLimitUpdateResult | undefined { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const now = Date.now(); + + const retryAfter = readHeaderValue(headers, "retry-after"); + const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter, now) : undefined; + + const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests"); + const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests"); + const resetRequestsMs = resetRequests ? 
parseResetDurationToMs(resetRequests) : undefined; + + let appliedUntil: number | undefined; + + if (retryAfterMs !== undefined) { + const targetAt = now + retryAfterMs; + this.cooldownUntil = + this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt); + appliedUntil = this.cooldownUntil; + } + + if (remainingRequests && resetRequestsMs !== undefined) { + const remaining = Number(remainingRequests); + if (Number.isFinite(remaining) && remaining <= 0) { + const targetAt = now + resetRequestsMs; + this.cooldownUntil = + this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt); + appliedUntil = this.cooldownUntil; + } + } + + if (appliedUntil === undefined) { + rateLimitLogger?.trace?.("No applicable cooldown headers; skipping", { + rateLimitKey: this.key, + hasRetryAfter: !!retryAfter, + hasRemainingRequests: !!remainingRequests, + hasResetRequests: !!resetRequests, + }); + return undefined; + } + + rateLimitLogger?.debug?.("Applied token bucket cooldown from headers", { + rateLimitKey: this.key, + cooldownUntil: appliedUntil, + inMs: Math.max(0, appliedUntil - now), + retryAfterMs, + resetRequestsMs, + }); + + const headerSnapshot: RateLimitHeaderSnapshot = { + remainingRequests, + resetRequests, + resetRequestsMs, + retryAfter, + retryAfterMs, + }; + + const state: RateLimitWindowState = { + limit: 1, + remaining: 0, + resetAt: appliedUntil, + reserved: 0, + nextAllowedAt: appliedUntil, + }; + + return { + key: this.key, + headerSnapshot, + state, + }; + } +} diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 421eea79b..627b4dbcc 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -4,7 +4,12 @@ import { TrafficCircuitBreaker } from "./traffic-circuit-breaker"; import { TrafficConcurrencyLimiter } from "./traffic-concurrency-limiter"; import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal"; import { CircuitBreakerOpenError, RateLimitedUpstreamError } from "./traffic-errors"; -import { type RateLimitUpdateResult, TrafficRateLimiter } from "./traffic-rate-limiter"; +import { + OpenAIWindowRateLimitStrategy, + type RateLimitUpdateResult, + TokenBucketRateLimitStrategy, + TrafficRateLimiter, +} from "./traffic-rate-limiter"; import { type RetryReason, buildRetryPlan } from "./traffic-retry"; import type { ProviderModelConcurrencyLimit, @@ -78,7 +83,17 @@ export class TrafficController { this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); this.trafficLogger = this.logger.child({ subsystem: "traffic" }); this.controllerLogger = this.trafficLogger.child({ module: "controller" }); - this.rateLimiter = new TrafficRateLimiter(() => this.scheduleDrain()); + const rateLimits = options.rateLimits; + this.rateLimiter = new TrafficRateLimiter( + () => this.scheduleDrain(), + (key) => { + const provider = key.split("::")[0] ?? 
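+ // Assumption: buildRateLimitKey() yields "<provider>::<model>", so the prefix before "::" picks the strategy (OpenAI-style window for openai providers, token bucket otherwise).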
""; + if (provider.startsWith("openai")) { + return new OpenAIWindowRateLimitStrategy(key); + } + return new TokenBucketRateLimitStrategy(key, rateLimits?.[key]); + }, + ); this.circuitBreaker = new TrafficCircuitBreaker({ fallbackChains: options.fallbackChains, buildRateLimitKey: (metadata) => this.buildRateLimitKey(metadata), @@ -94,6 +109,7 @@ export class TrafficController { hasFallbackChains: !!options.fallbackChains, hasProviderModelConcurrency: options.maxConcurrentPerProviderModel !== undefined, hasTenantConcurrency: options.maxConcurrentPerTenant !== undefined, + hasConfigRateLimits: options.rateLimits !== undefined, }); } diff --git a/packages/core/src/traffic/traffic-rate-limiter.ts b/packages/core/src/traffic/traffic-rate-limiter.ts index f05396f52..4c495e297 100644 --- a/packages/core/src/traffic/traffic-rate-limiter.ts +++ b/packages/core/src/traffic/traffic-rate-limiter.ts @@ -1,296 +1,23 @@ import type { Logger } from "../logger"; -import { - RATE_LIMIT_EXHAUSTION_BUFFER, - RATE_LIMIT_MIN_PACE_INTERVAL_MS, - RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS, - RATE_LIMIT_PROBE_DELAY_MS, -} from "./traffic-constants"; import type { - DispatchDecision, - QueuedRequest, - RateLimitWindowState, -} from "./traffic-controller-internal"; -import { parseRetryAfterMs, readHeaderValue } from "./traffic-error-utils"; + RateLimitStrategy, + RateLimitUpdateResult, +} from "./rate-limit-strategies/rate-limit-strategy"; +import { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy"; +import type { DispatchDecision, QueuedRequest } from "./traffic-controller-internal"; import type { TrafficRequestMetadata } from "./traffic-types"; -export type RateLimitHeaderSnapshot = { - limitRequests?: string; - remainingRequests?: string; - resetRequests?: string; - resetRequestsMs?: number; - retryAfter?: string; - retryAfterMs?: number; -}; - -export type RateLimitUpdateResult = { - key: string; - headerSnapshot: RateLimitHeaderSnapshot; - state: RateLimitWindowState; -}; +export type { + RateLimitHeaderSnapshot, + RateLimitStrategy, + RateLimitUpdateResult, +} from "./rate-limit-strategies/rate-limit-strategy"; +export { DefaultRateLimitStrategy } from "./rate-limit-strategies/default-rate-limit-strategy"; +export { OpenAIWindowRateLimitStrategy } from "./rate-limit-strategies/openai-window-rate-limit-strategy"; +export { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy"; type SchedulerCallback = () => void; -export interface RateLimitStrategy { - resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null; - onDispatch(logger?: Logger): void; - onComplete(logger?: Logger): void; - updateFromHeaders( - metadata: TrafficRequestMetadata | undefined, - headers: unknown, - logger?: Logger, - ): RateLimitUpdateResult | undefined; -} - -function parseResetDurationToMs(raw: string): number | undefined { - const value = raw.trim(); - if (!value) return undefined; - - let totalMs = 0; - const regex = /(\d+(?:\.\d+)?)(ms|s|m|h|d)/g; - let matched = false; - for (const match of value.matchAll(regex)) { - matched = true; - const amount = Number.parseFloat(match[1] ?? 
""); - if (!Number.isFinite(amount)) continue; - const unit = match[2]; - if (unit === "ms") totalMs += amount; - else if (unit === "s") totalMs += amount * 1000; - else if (unit === "m") totalMs += amount * 60_000; - else if (unit === "h") totalMs += amount * 3_600_000; - else if (unit === "d") totalMs += amount * 86_400_000; - } - - if (matched) { - return Math.round(totalMs); - } - - const n = Number(value); - return Number.isFinite(n) ? Math.round(n) : undefined; -} - -export class DefaultRateLimitStrategy implements RateLimitStrategy { - private state?: RateLimitWindowState; - private readonly key: string; - - constructor(key: string) { - this.key = key; - } - - resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { - const rateLimitLogger = logger?.child({ module: "rate-limiter" }); - const state = this.state; - if (!state) { - rateLimitLogger?.trace?.("Rate limit state missing; allow request", { - rateLimitKey: this.key, - }); - return null; - } - - const now = Date.now(); - const effectiveRemaining = Math.max(0, state.remaining - state.reserved); - const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; - - if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { - if (now < probeAt) { - rateLimitLogger?.debug?.("Rate limit exhausted; waiting for probe", { - rateLimitKey: this.key, - remaining: state.remaining, - reserved: state.reserved, - effectiveRemaining, - resetAt: state.resetAt, - probeAt, - }); - return { kind: "wait", wakeUpAt: probeAt }; - } - if (state.reserved > 0) { - rateLimitLogger?.debug?.("Rate limit exhausted but in-flight reservations exist; waiting", { - rateLimitKey: this.key, - remaining: state.remaining, - reserved: state.reserved, - effectiveRemaining, - resetAt: state.resetAt, - }); - return { kind: "wait" }; - } - } - - if (now < state.nextAllowedAt) { - rateLimitLogger?.debug?.("Rate limit pacing; waiting until nextAllowedAt", { - rateLimitKey: this.key, - nextAllowedAt: state.nextAllowedAt, - resetAt: state.resetAt, - waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now, - }); - return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; - } - - state.reserved += 1; - next.rateLimitKey = this.key; - rateLimitLogger?.trace?.("Reserved rate limit token", { - rateLimitKey: this.key, - reserved: state.reserved, - remaining: state.remaining, - resetAt: state.resetAt, - nextAllowedAt: state.nextAllowedAt, - }); - - const remainingWindowMs = Math.max(0, state.resetAt - now); - const intervalMs = Math.max( - RATE_LIMIT_MIN_PACE_INTERVAL_MS, - Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), - ); - - const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); - if ( - state.nextAllowedAt <= now || - candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS - ) { - state.nextAllowedAt = candidateNext; - rateLimitLogger?.trace?.("Updated pacing nextAllowedAt", { - rateLimitKey: this.key, - nextAllowedAt: state.nextAllowedAt, - intervalMs, - remainingWindowMs, - effectiveRemaining, - }); - } - - return null; - } - - onDispatch(_logger?: Logger): void {} - - onComplete(logger?: Logger): void { - const rateLimitLogger = logger?.child({ module: "rate-limiter" }); - const state = this.state; - if (!state || state.reserved <= 0) return; - state.reserved -= 1; - rateLimitLogger?.trace?.("Released rate limit reservation", { - rateLimitKey: this.key, - reserved: state.reserved, - remaining: state.remaining, - resetAt: state.resetAt, - nextAllowedAt: state.nextAllowedAt, - }); 
- } - - updateFromHeaders( - _metadata: TrafficRequestMetadata | undefined, - headers: unknown, - logger?: Logger, - ): RateLimitUpdateResult | undefined { - const rateLimitLogger = logger?.child({ module: "rate-limiter" }); - const limitRequests = readHeaderValue(headers, "x-ratelimit-limit-requests"); - const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests"); - const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests"); - const retryAfter = readHeaderValue(headers, "retry-after"); - const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined; - - const now = Date.now(); - const existing = this.state; - let state: RateLimitWindowState | undefined; - let headerSnapshot: RateLimitHeaderSnapshot | undefined; - - if (limitRequests && remainingRequests && resetRequests) { - const limit = Number(limitRequests); - const remaining = Number(remainingRequests); - if (!Number.isFinite(limit) || !Number.isFinite(remaining)) { - rateLimitLogger?.debug?.("Invalid rate limit numeric headers; skipping", { - rateLimitKey: this.key, - limitRequests, - remainingRequests, - }); - return undefined; - } - - const resetRequestsMs = parseResetDurationToMs(resetRequests); - if (resetRequestsMs === undefined) { - rateLimitLogger?.debug?.("Unable to parse reset duration; skipping", { - rateLimitKey: this.key, - resetRequests, - }); - return undefined; - } - - const parsedResetAt = now + resetRequestsMs; - const isSameWindow = !!existing && now < existing.resetAt; - const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; - const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; - const reserved = Math.max(0, existing?.reserved ?? 0); - - state = { - limit, - remaining: isSameWindow ? Math.min(existing.remaining, remaining) : remaining, - resetAt, - reserved, - nextAllowedAt, - }; - headerSnapshot = { - limitRequests, - remainingRequests, - resetRequests, - resetRequestsMs, - }; - } else if (retryAfterMs === undefined) { - rateLimitLogger?.trace?.("Missing rate limit headers; skipping", { - rateLimitKey: this.key, - hasLimit: !!limitRequests, - hasRemaining: !!remainingRequests, - hasReset: !!resetRequests, - hasRetryAfter: !!retryAfter, - }); - return undefined; - } - - if (!state) { - if (retryAfterMs === undefined) { - rateLimitLogger?.trace?.("Retry-After missing or unparsable; skipping", { - rateLimitKey: this.key, - retryAfter, - }); - return undefined; - } - const targetAt = now + retryAfterMs; - const isSameWindow = !!existing && now < existing.resetAt; - state = { - limit: existing?.limit ?? 1, - remaining: 0, - resetAt: isSameWindow ? Math.max(existing.resetAt, targetAt) : targetAt, - reserved: Math.max(0, existing?.reserved ?? 0), - nextAllowedAt: Math.max(existing?.nextAllowedAt ?? 
now, targetAt), - }; - headerSnapshot = { retryAfter, retryAfterMs }; - } else if (retryAfterMs !== undefined) { - const targetAt = now + retryAfterMs; - state = { - ...state, - remaining: 0, - resetAt: Math.max(state.resetAt, targetAt), - nextAllowedAt: Math.max(state.nextAllowedAt, targetAt), - }; - headerSnapshot = { ...headerSnapshot, retryAfter, retryAfterMs }; - } - - this.state = state; - rateLimitLogger?.debug?.("Applied rate limit headers to state", { - rateLimitKey: this.key, - limit: state.limit, - remaining: state.remaining, - effectiveRemaining: Math.max(0, state.remaining - state.reserved), - resetAt: state.resetAt, - nextAllowedAt: state.nextAllowedAt, - resetRequestsMs: headerSnapshot?.resetRequestsMs, - retryAfterMs: headerSnapshot?.retryAfterMs, - }); - - return { - key: this.key, - headerSnapshot: headerSnapshot ?? {}, - state, - }; - } -} - export type RateLimitStrategyFactory = (key: string) => RateLimitStrategy; export class TrafficRateLimiter { @@ -302,17 +29,11 @@ export class TrafficRateLimiter { constructor(onWakeUp: SchedulerCallback, strategyFactory?: RateLimitStrategyFactory) { this.onWakeUp = onWakeUp; - this.strategyFactory = strategyFactory ?? ((key) => new DefaultRateLimitStrategy(key)); + this.strategyFactory = strategyFactory ?? ((key) => new TokenBucketRateLimitStrategy(key)); } resolve(next: QueuedRequest, key: string, logger?: Logger): DispatchDecision | null { - const strategy = this.strategies.get(key); - if (!strategy) { - logger - ?.child({ module: "rate-limiter" }) - ?.trace?.("Rate limit state missing; allow request", { rateLimitKey: key }); - return null; - } + const strategy = this.strategies.get(key) ?? this.createStrategy(key, logger); return strategy.resolve(next, logger); } @@ -365,10 +86,21 @@ export class TrafficRateLimiter { ): RateLimitUpdateResult | undefined { const existing = this.strategies.get(key); if (existing) return existing.updateFromHeaders(metadata, headers, logger); + const created = this.strategyFactory(key); const update = created.updateFromHeaders(metadata, headers, logger); if (!update) return undefined; this.strategies.set(key, created); return update; } + + private createStrategy(key: string, logger?: Logger): RateLimitStrategy { + const created = this.strategyFactory(key); + this.strategies.set(key, created); + logger?.child({ module: "rate-limiter" })?.trace?.("Created rate limit strategy", { + rateLimitKey: key, + strategy: created.constructor.name, + }); + return created; + } } From 97c3c0d89b18cfff9e1d9546da549a99ab3a9edd Mon Sep 17 00:00:00 2001 From: riturajFi Date: Tue, 23 Dec 2025 00:47:40 +0530 Subject: [PATCH 16/41] fix: make sure the system updates rate limits and failures for the correct model/tenant/provider --- packages/core/src/agent/agent.ts | 97 +++++++++++++++++++------------- 1 file changed, 58 insertions(+), 39 deletions(-) diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index ed2adc777..8835efc99 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -462,15 +462,18 @@ export class Agent { ): Promise { const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics const tenantId = this.resolveTenantId(options); - const buildRequest = (modelOverride?: LanguageModel) => ({ - tenantId, - metadata: this.buildTrafficMetadata(modelOverride ?? 
options?.model, options), // Pass model/provider info for future rate limiting keys - execute: () => - this.executeGenerateText(input, this.mergeOptionsWithModel(options, modelOverride)), // Defer actual execution so controller can schedule it - extractUsage: (result: GenerateTextResultWithContext) => - this.extractUsageFromResponse(result), - createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), - }); + const buildRequest = (modelOverride?: LanguageModel) => { + const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); + const metadata = this.buildTrafficMetadata(mergedOptions?.model, mergedOptions); // Compute once per queued request (including per-call model overrides) + return { + tenantId, + metadata, + execute: () => this.executeGenerateText(input, mergedOptions, metadata), // Defer actual execution so controller can schedule it + extractUsage: (result: GenerateTextResultWithContext) => + this.extractUsageFromResponse(result), + createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), + }; + }; return controller.handleText(buildRequest(options?.model)); } @@ -824,14 +827,18 @@ export class Agent { ): Promise { const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent const tenantId = this.resolveTenantId(options); - const buildRequest = (modelOverride?: LanguageModel) => ({ - tenantId, - metadata: this.buildTrafficMetadata(modelOverride ?? options?.model, options), // Include identifiers to support per-provider/model policies later - execute: () => - this.executeStreamText(input, this.mergeOptionsWithModel(options, modelOverride)), // Actual streaming work happens after the controller dequeues us - extractUsage: (result: StreamTextResultWithContext) => this.extractUsageFromResponse(result), - createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), - }); + const buildRequest = (modelOverride?: LanguageModel) => { + const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); + const metadata = this.buildTrafficMetadata(mergedOptions?.model, mergedOptions); // Compute once per queued request (including per-call model overrides) + return { + tenantId, + metadata, + execute: () => this.executeStreamText(input, mergedOptions, metadata), // Actual streaming work happens after the controller dequeues us + extractUsage: (result: StreamTextResultWithContext) => + this.extractUsageFromResponse(result), + createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), + }; + }; return controller.handleStream(buildRequest(options?.model)); } @@ -1519,19 +1526,18 @@ export class Agent { ): Promise>> { const controller = getTrafficController({ logger: this.logger }); const tenantId = this.resolveTenantId(options); - const buildRequest = (modelOverride?: LanguageModel) => ({ - tenantId, - metadata: this.buildTrafficMetadata(modelOverride ?? 
options?.model, options), - execute: () => - this.executeGenerateObject( - input, - schema, - this.mergeOptionsWithModel(options, modelOverride), - ), - extractUsage: (result: GenerateObjectResultWithContext>) => - this.extractUsageFromResponse(result), - createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), - }); + const buildRequest = (modelOverride?: LanguageModel) => { + const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); + const metadata = this.buildTrafficMetadata(mergedOptions?.model, mergedOptions); // Compute once per queued request (including per-call model overrides) + return { + tenantId, + metadata, + execute: () => this.executeGenerateObject(input, schema, mergedOptions, metadata), + extractUsage: (result: GenerateObjectResultWithContext>) => + this.extractUsageFromResponse(result), + createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), + }; + }; return controller.handleText(buildRequest(options?.model)); } @@ -1540,6 +1546,7 @@ export class Agent { input: string | UIMessage[] | BaseMessage[], schema: T, options?: GenerateObjectOptions, + trafficMetadata?: TrafficRequestMetadata, ): Promise>> { const startTime = Date.now(); const oc = this.createOperationContext(input, options); @@ -1655,6 +1662,7 @@ export class Agent { warnings: result.warnings, rawResult: safeStringify(result), }); + this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger); const usageInfo = convertUsage(result.usage); const finalObject = await executeOutputGuardrails({ @@ -1763,6 +1771,7 @@ export class Agent { context: oc.context, }; } catch (error) { + this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger); await this.flushPendingMessagesOnError(oc).catch(() => {}); return this.handleError(error as Error, oc, options, startTime); } finally { @@ -1783,15 +1792,18 @@ export class Agent { ): Promise>> { const controller = getTrafficController({ logger: this.logger }); const tenantId = this.resolveTenantId(options); - const buildRequest = (modelOverride?: LanguageModel) => ({ - tenantId, - metadata: this.buildTrafficMetadata(modelOverride ?? 
options?.model, options), - execute: () => - this.executeStreamObject(input, schema, this.mergeOptionsWithModel(options, modelOverride)), - extractUsage: (result: StreamObjectResultWithContext>) => - this.extractUsageFromResponse(result), - createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), - }); + const buildRequest = (modelOverride?: LanguageModel) => { + const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); + const metadata = this.buildTrafficMetadata(mergedOptions?.model, mergedOptions); // Compute once per queued request (including per-call model overrides) + return { + tenantId, + metadata, + execute: () => this.executeStreamObject(input, schema, mergedOptions, metadata), + extractUsage: (result: StreamObjectResultWithContext>) => + this.extractUsageFromResponse(result), + createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), + }; + }; return controller.handleStream(buildRequest(options?.model)); } @@ -1800,6 +1812,7 @@ export class Agent { input: string | UIMessage[] | BaseMessage[], schema: T, options?: StreamObjectOptions, + trafficMetadata?: TrafficRequestMetadata, ): Promise>> { const startTime = Date.now(); const oc = this.createOperationContext(input, options); @@ -1926,6 +1939,7 @@ export class Agent { modelName: this.getModelName(model), schemaName: schemaName, }); + this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger); // History update removed - using OpenTelemetry only @@ -1957,6 +1971,11 @@ export class Agent { usage: finalResult.usage ? safeStringify(finalResult.usage) : undefined, rawResult: safeStringify(finalResult), }); + this.updateTrafficControllerRateLimits( + finalResult.response, + trafficMetadata, + methodLogger, + ); const usageInfo = convertUsage(finalResult.usage as any); let finalObject = finalResult.object as z.infer; if (guardrailSet.output.length > 0) { From df8833bd234ad41bf2c04130ea0f734544b1c884 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Tue, 23 Dec 2025 01:55:23 +0530 Subject: [PATCH 17/41] fix: handle stream failure --- packages/core/src/agent/agent.ts | 6 +++ .../src/traffic/traffic-controller.spec.ts | 44 +++++++++++++++++++ .../core/src/traffic/traffic-controller.ts | 34 +++++++++++++- 3 files changed, 83 insertions(+), 1 deletion(-) diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index 8835efc99..da900b404 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -971,6 +971,7 @@ export class Agent { }, }); const finalizeLLMSpan = this.createLLMSpanFinalizer(llmSpan); + const trafficController = getTrafficController({ logger: this.logger }); methodLogger.info("[AI SDK] Calling streamText", { messageCount: messages.length, @@ -1021,6 +1022,7 @@ export class Agent { }); this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger); + trafficController.reportStreamFailure(trafficMetadata, actualError); finalizeLLMSpan(SpanStatusCode.ERROR, { message: (actualError as Error)?.message }); // History update removed - using OpenTelemetry only @@ -1057,6 +1059,7 @@ export class Agent { trafficMetadata, methodLogger, ); + trafficController.reportStreamSuccess(trafficMetadata); const providerUsage = finalResult.usage ?
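// streaming results may expose usage as a promise; Promise.resolve() handles both shapes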
await Promise.resolve(finalResult.usage) : undefined; @@ -1906,6 +1909,7 @@ export class Agent { let guardrailObjectPromise!: Promise>; let resolveGuardrailObject: ((value: z.infer) => void) | undefined; let rejectGuardrailObject: ((reason: unknown) => void) | undefined; + const trafficController = getTrafficController({ logger: this.logger }); methodLogger.info("[AI SDK] Calling streamObject", { messageCount: messages.length, @@ -1940,6 +1944,7 @@ export class Agent { schemaName: schemaName, }); this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger); + trafficController.reportStreamFailure(trafficMetadata, actualError); // History update removed - using OpenTelemetry only @@ -1976,6 +1981,7 @@ export class Agent { trafficMetadata, methodLogger, ); + trafficController.reportStreamSuccess(trafficMetadata); const usageInfo = convertUsage(finalResult.usage as any); let finalObject = finalResult.object as z.infer; if (guardrailSet.output.length > 0) { diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts index 6640e0324..6b59a266e 100644 --- a/packages/core/src/traffic/traffic-controller.spec.ts +++ b/packages/core/src/traffic/traffic-controller.spec.ts @@ -1,4 +1,5 @@ import { describe, expect, it, vi } from "vitest"; +import { CIRCUIT_FAILURE_THRESHOLD } from "./traffic-constants"; import { TrafficController } from "./traffic-controller"; describe("TrafficController priority scheduling", () => { @@ -230,3 +231,46 @@ describe("TrafficController rate limit headers", () => { } }); }); + +describe("TrafficController stream reporting", () => { + it("treats post-start stream failures as circuit breaker failures", async () => { + const controller = new TrafficController({ + maxConcurrent: 1, + fallbackChains: { + primary: ["fallback"], + }, + }); + const tenantId = "tenant-1"; + const metadata = { provider: "p", model: "primary", priority: "P1" as const }; + + await controller.handleStream({ + tenantId, + metadata, + execute: async () => ({ ok: true }), + }); + + for (let i = 0; i < CIRCUIT_FAILURE_THRESHOLD; i += 1) { + controller.reportStreamFailure(metadata, new Error("stream-failure")); + } + + const order: string[] = []; + await controller.handleStream({ + tenantId, + metadata, + execute: async () => { + order.push("primary"); + return "primary"; + }, + createFallbackRequest: (modelId) => ({ + tenantId, + metadata: { provider: "p", model: modelId, priority: "P1" }, + execute: async () => { + order.push(modelId); + return modelId; + }, + }), + }); + + expect(order).toEqual(["fallback"]); + }); +}); diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 627b4dbcc..5d76221f8 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -138,6 +138,30 @@ export class TrafficController { return this.enqueue("stream", request); } + reportStreamSuccess(metadata?: TrafficRequestMetadata): void { + this.controllerLogger.debug("Stream reported success", { + provider: metadata?.provider, + model: metadata?.model, + tenantId: metadata?.tenantId, + priority: metadata?.priority, + }); + this.circuitBreaker.recordSuccess(metadata, this.trafficLogger); + } + + reportStreamFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void { + this.controllerLogger.warn("Stream reported failure", { + provider: metadata?.provider, + model: metadata?.model, + tenantId: metadata?.tenantId, + 
priority: metadata?.priority, + errorName: (error as { name?: unknown } | null)?.name, + errorMessage: (error as { message?: unknown } | null)?.message, + status: (error as { status?: unknown } | null)?.status, + statusCode: (error as { statusCode?: unknown } | null)?.statusCode, + }); + this.circuitBreaker.recordFailure(metadata, error, this.trafficLogger); + } + updateRateLimitFromHeaders( metadata: TrafficRequestMetadata | undefined, headers: unknown, @@ -378,7 +402,15 @@ export class TrafficController { model: item.request.metadata?.model, elapsedMs: Date.now() - startedAt, }); - this.circuitBreaker.recordSuccess(item.request.metadata, this.trafficLogger); + if (item.type === "stream") { + this.controllerLogger.trace("Stream started successfully", { + tenantId: item.tenantId, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + }); + } else { + this.circuitBreaker.recordSuccess(item.request.metadata, this.trafficLogger); + } this.usageTracker.recordUsage(item, result, this.trafficLogger); item.resolve(result); } catch (error) { From 9f4e169b14318946940b9c4b13e934222ff2afd8 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Tue, 23 Dec 2025 02:09:10 +0530 Subject: [PATCH 18/41] feat: search headers --- packages/core/src/agent/agent.ts | 32 ++++++------------- .../core/src/traffic/traffic-error-utils.ts | 24 ++++++++------ 2 files changed, 24 insertions(+), 32 deletions(-) diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index da900b404..562af7776 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -53,6 +53,7 @@ import { type TrafficRequestMetadata, getTrafficController, } from "../traffic/traffic-controller"; +import { findHeaders } from "../traffic/traffic-error-utils"; import { randomUUID } from "../utils/id"; import { convertModelMessagesToUIMessages } from "../utils/message-converter"; import { NodeType, createNodeId } from "../utils/node-utils"; @@ -4086,34 +4087,19 @@ export class Agent { metadata: TrafficRequestMetadata | undefined, logger?: Logger, ): void { - const readObjectProperty = (value: unknown, key: string): unknown => { - if (!value || typeof value !== "object") return undefined; - return (value as Record)[key]; - }; - - const headerCandidates: unknown[] = [ - readObjectProperty(response, "headers"), - readObjectProperty(readObjectProperty(response, "response"), "headers"), - readObjectProperty(readObjectProperty(response, "cause"), "headers"), - readObjectProperty( - readObjectProperty(readObjectProperty(response, "cause"), "response"), - "headers", - ), - ]; - - const headers = headerCandidates.find( - (candidate) => candidate !== undefined && candidate !== null, - ); - if (!headers) { + const headerCandidates = findHeaders(response); + if (headerCandidates.length === 0) { logger?.debug?.("[Traffic] No headers found for rate limit update"); return; } const controller = getTrafficController(); - const updateResult = controller.updateRateLimitFromHeaders( - metadata ?? this.buildTrafficMetadata(), - headers, - ); + const effectiveMetadata = metadata ?? 
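/* For reference, the nesting findHeaders() is expected to tolerate; both error
   shapes are illustrative, not taken from a specific provider SDK:

     findHeaders({ headers: { "x-ratelimit-remaining": "0" } });
     findHeaders({ cause: { response: { headers: { "retry-after": "2" } } } });

   Each call returns the non-null header candidates in order, which is why the
   loop just below can simply stop at the first candidate that yields an update.
*/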
this.buildTrafficMetadata(); + let updateResult: ReturnType | undefined; + for (const headers of headerCandidates) { + updateResult = controller.updateRateLimitFromHeaders(effectiveMetadata, headers); + if (updateResult) break; + } if (!updateResult) { logger?.debug?.("[Traffic] No rate limit headers applied from response"); diff --git a/packages/core/src/traffic/traffic-error-utils.ts b/packages/core/src/traffic/traffic-error-utils.ts index 946f1c0be..4cbb98b52 100644 --- a/packages/core/src/traffic/traffic-error-utils.ts +++ b/packages/core/src/traffic/traffic-error-utils.ts @@ -5,6 +5,20 @@ function readObjectProperty(value: unknown, key: string): unknown { return (value as Record)[key]; } +export function findHeaders(value: unknown): unknown[] { + const candidates: unknown[] = [ + readObjectProperty(value, "headers"), + readObjectProperty(readObjectProperty(value, "response"), "headers"), + readObjectProperty(readObjectProperty(value, "cause"), "headers"), + readObjectProperty( + readObjectProperty(readObjectProperty(value, "cause"), "response"), + "headers", + ), + ]; + + return candidates.filter((candidate) => candidate !== undefined && candidate !== null); +} + export function readHeaderValue(headers: unknown, name: string): string | undefined { if (!headers) return undefined; @@ -73,15 +87,7 @@ export function extractStatusCode(error: unknown, logger?: Logger): number | und export function extractRetryAfterMs(error: unknown, logger?: Logger): number | undefined { const retryAfterLogger = logger?.child({ module: "retry-after" }); - const candidates: unknown[] = [ - readObjectProperty(error, "headers"), - readObjectProperty(readObjectProperty(error, "response"), "headers"), - readObjectProperty(readObjectProperty(error, "cause"), "headers"), - readObjectProperty( - readObjectProperty(readObjectProperty(error, "cause"), "response"), - "headers", - ), - ]; + const candidates = findHeaders(error); for (const headers of candidates) { const raw = readHeaderValue(headers, "retry-after"); From 70458f647b6dc5172859b4142f6e109a48e0663c Mon Sep 17 00:00:00 2001 From: riturajFi Date: Tue, 23 Dec 2025 17:22:18 +0530 Subject: [PATCH 19/41] feat: tokens + requests per minute enforcement --- packages/core/src/agent/agent.ts | 32 ++- .../openai-window-rate-limit-strategy.ts | 214 +++++++++++++++++- .../rate-limit-strategy.ts | 8 + .../token-bucket-rate-limit-strategy.ts | 18 +- .../core/src/traffic/traffic-controller.ts | 45 +++- .../core/src/traffic/traffic-rate-limiter.ts | 169 +++++++++++++- packages/core/src/traffic/traffic-types.ts | 15 +- .../core/src/traffic/traffic-usage-tracker.ts | 30 ++- 8 files changed, 493 insertions(+), 38 deletions(-) diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index 562af7776..753f75b88 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -4152,13 +4152,41 @@ export class Agent { return undefined; } + const normalizeUsage = ( + usage: LanguageModelUsage | undefined, + ): LanguageModelUsage | undefined => { + if (!usage) return undefined; + const input = Number.isFinite(usage.inputTokens) ? (usage.inputTokens as number) : undefined; + const output = Number.isFinite(usage.outputTokens) + ? (usage.outputTokens as number) + : undefined; + const total = Number.isFinite(usage.totalTokens) ? (usage.totalTokens as number) : undefined; + + if (total === undefined && input === undefined && output === undefined) { + return undefined; + } + + const safeInput = input ?? 0; + const safeOutput = output ?? 
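/* Worked example for the normalization being built here (counter values are
   illustrative): { inputTokens: 120, outputTokens: NaN, totalTokens: undefined }
   yields input = 120 and output = undefined, so safeOutput falls back to 0 and
   safeTotal becomes 120 + 0 = 120; the result is
   { inputTokens: 120, outputTokens: 0, totalTokens: 120 }. A usage object with
   no finite counter at all normalizes to undefined instead. */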
0; + const safeTotal = total ?? safeInput + safeOutput; + + return { + ...usage, + inputTokens: safeInput, + outputTokens: safeOutput, + totalTokens: safeTotal, + }; + }; + if ( typeof (usageCandidate as PromiseLike).then === "function" ) { - return (usageCandidate as Promise).catch(() => undefined); + return (usageCandidate as Promise) + .then((usage) => normalizeUsage(usage)) + .catch(() => undefined); } - return usageCandidate as LanguageModelUsage; + return normalizeUsage(usageCandidate as LanguageModelUsage); } private resolveProvider( diff --git a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts index f81e72696..8e8b6f868 100644 --- a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts +++ b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts @@ -1,20 +1,52 @@ import type { Logger } from "../../logger"; -import type { DispatchDecision, QueuedRequest } from "../traffic-controller-internal"; -import type { TrafficRequestMetadata } from "../traffic-types"; +import { + RATE_LIMIT_EXHAUSTION_BUFFER, + RATE_LIMIT_MIN_PACE_INTERVAL_MS, + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS, + RATE_LIMIT_PROBE_DELAY_MS, +} from "../traffic-constants"; +import type { + DispatchDecision, + QueuedRequest, + RateLimitWindowState, +} from "../traffic-controller-internal"; +import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types"; import { DefaultRateLimitStrategy } from "./default-rate-limit-strategy"; -import type { RateLimitStrategy, RateLimitUpdateResult } from "./rate-limit-strategy"; +import type { + RateLimitStrategy, + RateLimitUpdateResult, + RateLimitUsage, +} from "./rate-limit-strategy"; export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { + readonly handlesTokenLimits: boolean; private readonly window: DefaultRateLimitStrategy; private readonly key: string; + private readonly requestsPerMinute?: number; + private readonly tokensPerMinute?: number; + private requestState?: RateLimitWindowState; + private tokenState?: RateLimitWindowState; private bootstrapReserved = 0; + private readonly windowMs = 60_000; - constructor(key: string) { + constructor(key: string, options?: RateLimitOptions) { this.key = key; this.window = new DefaultRateLimitStrategy(key); + // Window strategy enforces fixed 60s windows; burstSize is intentionally ignored here. 
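/* Construction sketch; the key and limits are illustrative, not provider
   defaults:

     const strategy = new OpenAIWindowRateLimitStrategy("openai::gpt-4o", {
       requestsPerMinute: 500,
       tokensPerMinute: 30_000,
     });

   With tokensPerMinute set, handlesTokenLimits is true and the surrounding
   TrafficRateLimiter skips its generic token pacing for this key. */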
+ this.requestsPerMinute = this.normalizeLimit(options?.requestsPerMinute); + this.tokensPerMinute = this.normalizeLimit(options?.tokensPerMinute); + this.handlesTokenLimits = this.tokensPerMinute !== undefined; } resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { + if (this.requestsPerMinute !== undefined) { + const requestDecision = this.resolveRequestWindow(next, logger); + if (requestDecision) return requestDecision; + const tokenDecision = this.resolveTokenWindow(logger); + if (tokenDecision) return tokenDecision; + return null; + } + const decision = this.window.resolve(next, logger); if (decision) return decision; @@ -41,21 +73,193 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { } onDispatch(logger?: Logger): void { - this.window.onDispatch(logger); + if (this.requestsPerMinute === undefined) { + this.window.onDispatch(logger); + } } onComplete(logger?: Logger): void { + if (this.requestsPerMinute !== undefined) { + const now = Date.now(); + const state = this.ensureRequestState(now); + if (state.reserved > 0) { + state.reserved -= 1; + } + state.remaining = Math.max(0, state.remaining - 1); + return; + } + if (this.bootstrapReserved > 0) { this.bootstrapReserved -= 1; } this.window.onComplete(logger); } + recordUsage(usage: RateLimitUsage, logger?: Logger): void { + if (this.tokensPerMinute === undefined) return; + const tokens = this.resolveTokenCount(usage); + if (tokens <= 0) return; + + const now = Date.now(); + const state = this.ensureTokenState(now); + state.remaining = Math.max(0, state.remaining - tokens); + logger?.child({ module: "rate-limiter" })?.trace?.("OpenAI token usage recorded", { + rateLimitKey: this.key, + tokens, + remaining: state.remaining, + resetAt: state.resetAt, + }); + } + updateFromHeaders( metadata: TrafficRequestMetadata | undefined, headers: unknown, logger?: Logger, ): RateLimitUpdateResult | undefined { + if (this.requestsPerMinute !== undefined) { + return undefined; + } return this.window.updateFromHeaders(metadata, headers, logger); } + + private resolveRequestWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const now = Date.now(); + const state = this.ensureRequestState(now); + const effectiveRemaining = Math.max(0, state.remaining - state.reserved); + const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; + + if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { + if (now < probeAt) { + rateLimitLogger?.debug?.("OpenAI request window exhausted; waiting for probe", { + rateLimitKey: this.key, + remaining: state.remaining, + reserved: state.reserved, + effectiveRemaining, + resetAt: state.resetAt, + probeAt, + }); + return { kind: "wait", wakeUpAt: probeAt }; + } + if (state.reserved > 0) { + rateLimitLogger?.debug?.( + "OpenAI request window exhausted but in-flight reservations exist; waiting", + { + rateLimitKey: this.key, + remaining: state.remaining, + reserved: state.reserved, + effectiveRemaining, + resetAt: state.resetAt, + }, + ); + return { kind: "wait" }; + } + } + + if (now < state.nextAllowedAt) { + rateLimitLogger?.debug?.("OpenAI request window pacing; waiting until nextAllowedAt", { + rateLimitKey: this.key, + nextAllowedAt: state.nextAllowedAt, + resetAt: state.resetAt, + waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now, + }); + return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; + } + + state.reserved += 1; + next.rateLimitKey = this.key; 
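/* Pacing math for the interval computed just below, with illustrative numbers:
   45_000 ms left in the window and 90 effective requests remaining gives
   ceil(45_000 / 90) = 500 ms between dispatches (never less than
   RATE_LIMIT_MIN_PACE_INTERVAL_MS), spreading the remaining budget across the
   window instead of letting an early burst exhaust it. */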
+ rateLimitLogger?.trace?.("Reserved OpenAI request window slot", { + rateLimitKey: this.key, + reserved: state.reserved, + remaining: state.remaining, + resetAt: state.resetAt, + nextAllowedAt: state.nextAllowedAt, + }); + + const remainingWindowMs = Math.max(0, state.resetAt - now); + const intervalMs = Math.max( + RATE_LIMIT_MIN_PACE_INTERVAL_MS, + Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), + ); + + const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); + if ( + state.nextAllowedAt <= now || + candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS + ) { + state.nextAllowedAt = candidateNext; + rateLimitLogger?.trace?.("Updated OpenAI request pacing nextAllowedAt", { + rateLimitKey: this.key, + nextAllowedAt: state.nextAllowedAt, + intervalMs, + remainingWindowMs, + effectiveRemaining, + }); + } + + return null; + } + + private resolveTokenWindow(logger?: Logger): DispatchDecision | null { + if (this.tokensPerMinute === undefined) return null; + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const now = Date.now(); + const state = this.ensureTokenState(now); + + if (state.remaining > 0) return null; + + const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; + rateLimitLogger?.debug?.("OpenAI token window exhausted; waiting", { + rateLimitKey: this.key, + remaining: state.remaining, + resetAt: state.resetAt, + probeAt, + }); + return { kind: "wait", wakeUpAt: probeAt }; + } + + private ensureRequestState(now: number): RateLimitWindowState { + const limit = this.requestsPerMinute ?? 0; + const state = this.requestState; + if (!state || now >= state.resetAt) { + this.requestState = { + limit, + remaining: limit, + resetAt: now + this.windowMs, + reserved: 0, + nextAllowedAt: now, + }; + return this.requestState; + } + return state; + } + + private ensureTokenState(now: number): RateLimitWindowState { + const limit = this.tokensPerMinute ?? 0; + const state = this.tokenState; + if (!state || now >= state.resetAt) { + this.tokenState = { + limit, + remaining: limit, + resetAt: now + this.windowMs, + reserved: 0, + nextAllowedAt: now, + }; + return this.tokenState; + } + return state; + } + + private normalizeLimit(value: number | undefined): number | undefined { + const numeric = typeof value === "number" ? value : Number(value); + return Number.isFinite(numeric) && numeric > 0 ? numeric : undefined; + } + + private resolveTokenCount(usage: RateLimitUsage): number { + const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined; + if (total !== undefined) return total; + const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0; + const output = Number.isFinite(usage.outputTokens) ? 
usage.outputTokens : 0; + return input + output; + } } diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts index 4e5a06245..6657c6b26 100644 --- a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts +++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts @@ -21,10 +21,18 @@ export type RateLimitUpdateResult = { state: RateLimitWindowState; }; +export type RateLimitUsage = { + inputTokens?: number; + outputTokens?: number; + totalTokens?: number; +}; + export interface RateLimitStrategy { + readonly handlesTokenLimits?: boolean; resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null; onDispatch(logger?: Logger): void; onComplete(logger?: Logger): void; + recordUsage?(usage: RateLimitUsage, logger?: Logger): void; updateFromHeaders( metadata: TrafficRequestMetadata | undefined, headers: unknown, diff --git a/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts index 314beac8e..2ae7b1892 100644 --- a/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts +++ b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts @@ -23,18 +23,20 @@ type TokenBucketState = { function normalizeTokenBucketOptions( raw: RateLimitOptions | undefined, ): Omit { - const capacityRaw = raw?.capacity; - const refillRaw = raw?.refillPerSecond; + const requestsPerMinuteRaw = raw?.requestsPerMinute; + const burstSizeRaw = raw?.burstSize; - const capacity = typeof capacityRaw === "number" ? capacityRaw : Number(capacityRaw); - const refillPerSecond = typeof refillRaw === "number" ? refillRaw : Number(refillRaw); + const requestsPerMinute = + typeof requestsPerMinuteRaw === "number" ? requestsPerMinuteRaw : Number(requestsPerMinuteRaw); + const burstSize = typeof burstSizeRaw === "number" ? burstSizeRaw : Number(burstSizeRaw); - const safeCapacity = Number.isFinite(capacity) ? capacity : 0; - const safeRefill = Number.isFinite(refillPerSecond) ? refillPerSecond : 0; + const safeRequestsPerMinute = Number.isFinite(requestsPerMinute) ? requestsPerMinute : 0; + const safeBurst = Number.isFinite(burstSize) ? burstSize : safeRequestsPerMinute; + const refillPerSecond = safeRequestsPerMinute > 0 ? safeRequestsPerMinute / 60 : 0; return { - capacity: safeCapacity > 0 ? Math.max(1, safeCapacity) : 0, - refillPerSecond: safeRefill > 0 ? safeRefill : 0, + capacity: safeBurst > 0 ? 
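/* How the new optional hook is meant to be driven; a sketch that assumes a
   strategy instance obtained elsewhere:

     const usage = { inputTokens: 900, outputTokens: 100, totalTokens: 1_000 };
     strategy.recordUsage?.(usage, logger); // optional: not every strategy tracks tokens

   Strategies advertise token handling via handlesTokenLimits so the limiter
   knows whether to layer its own token pacing on top. */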
Math.max(1, safeBurst) : 0, + refillPerSecond, }; } function refillTokenBucket(bucket: TokenBucketState, now: number): void { diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 5d76221f8..6d55a3290 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -15,7 +15,6 @@ import type { ProviderModelConcurrencyLimit, RateLimitConfig, RateLimitKey, - RateLimitOptions, TenantConcurrencyLimit, TenantUsage, TrafficControllerOptions, @@ -35,7 +34,6 @@ export type { ProviderModelConcurrencyLimit, RateLimitConfig, RateLimitKey, - RateLimitOptions, TenantConcurrencyLimit, TenantUsage, TrafficControllerOptions, @@ -53,6 +51,7 @@ export class TrafficController { private readonly scheduler: Scheduler; private readonly maxConcurrent: number; + private readonly rateLimitKeyBuilder: (metadata?: TrafficRequestMetadata) => string; private readonly logger: Logger; private readonly trafficLogger: Logger; private readonly controllerLogger: Logger; @@ -80,20 +79,21 @@ export class TrafficController { constructor(options: TrafficControllerOptions = {}) { this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; this.scheduler = this.createScheduler(); + this.rateLimitKeyBuilder = options.rateLimitKeyBuilder ?? buildRateLimitKeyFromMetadata; this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); this.trafficLogger = this.logger.child({ subsystem: "traffic" }); this.controllerLogger = this.trafficLogger.child({ module: "controller" }); const rateLimits = options.rateLimits; - this.rateLimiter = new TrafficRateLimiter( - () => this.scheduleDrain(), - (key) => { + this.rateLimiter = new TrafficRateLimiter(() => this.scheduleDrain(), { + rateLimits, + strategyFactory: (key) => { const provider = key.split("::")[0] ?? ""; if (provider.startsWith("openai")) { - return new OpenAIWindowRateLimitStrategy(key); + return new OpenAIWindowRateLimitStrategy(key, rateLimits?.[key]); } return new TokenBucketRateLimitStrategy(key, rateLimits?.[key]); }, - ); + }); this.circuitBreaker = new TrafficCircuitBreaker({ fallbackChains: options.fallbackChains, buildRateLimitKey: (metadata) => this.buildRateLimitKey(metadata), @@ -411,7 +411,9 @@ export class TrafficController { } else { this.circuitBreaker.recordSuccess(item.request.metadata, this.trafficLogger); } - this.usageTracker.recordUsage(item, result, this.trafficLogger); + const usage = this.usageTracker.recordUsage(item, result, this.trafficLogger); + const rateLimitKey = item.rateLimitKey ?? this.buildRateLimitKey(item.request.metadata); + this.rateLimiter.recordUsage(rateLimitKey, usage, this.trafficLogger); item.resolve(result); } catch (error) { this.controllerLogger.warn("Request failed", { @@ -531,7 +533,7 @@ export class TrafficController { } private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { - return `${metadata?.provider ?? "default-provider"}::${metadata?.model ?? "default-model"}`; + return this.rateLimitKeyBuilder(metadata); } } @@ -548,3 +550,28 @@ export function getTrafficController(options?: TrafficControllerOptions): Traffi } return singletonController; } + +function buildRateLimitKeyFromMetadata(metadata?: TrafficRequestMetadata): string { + const provider = metadata?.provider ?? "default-provider"; + const model = metadata?.model ?? "default-model"; + const parts = [provider, model]; + + // SOP: Add new metadata fields in one place with a stable label and ordering. 
+ // 1) Add the optional field to TrafficRequestMetadata. + // 2) Add it here with a stable label so keys stay predictable. + // Example: { label: "org", value: metadata?.orgId } + const optionalFields: Array<{ label: string; value?: string }> = [ + { label: "apiKey", value: metadata?.apiKeyId }, + { label: "region", value: metadata?.region }, + { label: "endpoint", value: metadata?.endpoint }, + { label: "tenantTier", value: metadata?.tenantTier }, + { label: "taskType", value: metadata?.taskType }, + ]; + + for (const field of optionalFields) { + if (!field.value) continue; + parts.push(`${field.label}=${encodeURIComponent(field.value)}`); + } + + return parts.join("::"); +} diff --git a/packages/core/src/traffic/traffic-rate-limiter.ts b/packages/core/src/traffic/traffic-rate-limiter.ts index 4c495e297..a77a0423d 100644 --- a/packages/core/src/traffic/traffic-rate-limiter.ts +++ b/packages/core/src/traffic/traffic-rate-limiter.ts @@ -5,7 +5,7 @@ import type { } from "./rate-limit-strategies/rate-limit-strategy"; import { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy"; import type { DispatchDecision, QueuedRequest } from "./traffic-controller-internal"; -import type { TrafficRequestMetadata } from "./traffic-types"; +import type { RateLimitConfig, TrafficRequestMetadata } from "./traffic-types"; export type { RateLimitHeaderSnapshot, @@ -20,21 +20,65 @@ type SchedulerCallback = () => void; export type RateLimitStrategyFactory = (key: string) => RateLimitStrategy; +type UsageCounters = { + inputTokens?: number; + outputTokens?: number; + totalTokens?: number; +}; + +type TokenRateState = { + capacity: number; + refillPerSecond: number; + tokens: number; + updatedAt: number; +}; + export class TrafficRateLimiter { private readonly strategies = new Map(); + private readonly tokenRates = new Map(); private wakeUpTimeout?: ReturnType; private wakeUpAt?: number; private readonly onWakeUp: SchedulerCallback; private readonly strategyFactory: RateLimitStrategyFactory; + private readonly rateLimits?: RateLimitConfig; - constructor(onWakeUp: SchedulerCallback, strategyFactory?: RateLimitStrategyFactory) { + constructor( + onWakeUp: SchedulerCallback, + options?: { strategyFactory?: RateLimitStrategyFactory; rateLimits?: RateLimitConfig }, + ) { this.onWakeUp = onWakeUp; - this.strategyFactory = strategyFactory ?? ((key) => new TokenBucketRateLimitStrategy(key)); + this.rateLimits = options?.rateLimits; + this.strategyFactory = + options?.strategyFactory ?? + ((key) => new TokenBucketRateLimitStrategy(key, this.rateLimits?.[key])); } resolve(next: QueuedRequest, key: string, logger?: Logger): DispatchDecision | null { const strategy = this.strategies.get(key) ?? this.createStrategy(key, logger); - return strategy.resolve(next, logger); + const requestDecision = strategy.resolve(next, logger); + if (requestDecision?.kind === "wait") { + const tokenDecision = strategy.handlesTokenLimits + ? null + : this.resolveTokenLimit(key, logger); + if (tokenDecision?.kind === "wait") { + const requestWakeUp = requestDecision.wakeUpAt; + const tokenWakeUp = tokenDecision.wakeUpAt; + if (tokenWakeUp !== undefined && requestWakeUp !== undefined) { + return { kind: "wait", wakeUpAt: Math.min(requestWakeUp, tokenWakeUp) }; + } + if (tokenWakeUp !== undefined && requestWakeUp === undefined) { + return tokenDecision; + } + } + return requestDecision; + } + + const tokenDecision = strategy.handlesTokenLimits ? 
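/* Example keys the builder above produces (metadata values are illustrative):

     buildRateLimitKeyFromMetadata({ provider: "openai", model: "gpt-4o" })
       // => "openai::gpt-4o"
     buildRateLimitKeyFromMetadata({
       provider: "openai", model: "gpt-4o", region: "eu-west-1", tenantTier: "pro",
     }) // => "openai::gpt-4o::region=eu-west-1::tenantTier=pro"

   Unset optional fields are skipped entirely, so keys stay stable as new
   metadata fields are added. */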
null : this.resolveTokenLimit(key, logger); + if (tokenDecision?.kind === "wait") { + return tokenDecision; + } + + return requestDecision; } notifyDispatch(key: string | undefined, logger?: Logger): void { @@ -78,6 +122,42 @@ export class TrafficRateLimiter { this.strategies.get(key)?.onComplete(logger); } + recordUsage( + key: string | undefined, + usage: UsageCounters | Promise | undefined, + logger?: Logger, + ): void { + if (!key || !usage) return; + if (typeof (usage as PromiseLike).then === "function") { + void (usage as Promise) + .then((resolved) => this.recordUsage(key, resolved, logger)) + .catch(() => {}); + return; + } + + const strategy = this.strategies.get(key); + if (strategy?.recordUsage) { + strategy.recordUsage(usage, logger); + return; + } + + const tokens = this.resolveTokenCount(usage); + if (tokens <= 0) return; + + const bucket = this.getTokenRateState(key, logger); + if (!bucket) return; + + const now = Date.now(); + this.refillTokenRate(bucket, now); + bucket.tokens = Math.min(bucket.capacity, bucket.tokens); + bucket.tokens -= tokens; + + if (bucket.tokens < 0 && bucket.refillPerSecond > 0) { + const waitMs = Math.max(1, Math.ceil((-bucket.tokens / bucket.refillPerSecond) * 1000)); + this.scheduleWakeUpAt(now + waitMs, logger); + } + } + updateFromHeaders( metadata: TrafficRequestMetadata | undefined, headers: unknown, @@ -103,4 +183,85 @@ export class TrafficRateLimiter { }); return created; } + + private resolveTokenLimit(key: string, logger?: Logger): DispatchDecision | null { + const bucket = this.getTokenRateState(key, logger); + if (!bucket) return null; + + const now = Date.now(); + this.refillTokenRate(bucket, now); + + if (bucket.capacity <= 0) { + logger?.child({ module: "rate-limiter" })?.debug?.("Token limit misconfigured; blocking", { + rateLimitKey: key, + capacity: bucket.capacity, + refillPerSecond: bucket.refillPerSecond, + }); + return { kind: "wait" }; + } + + if (bucket.tokens >= 0) return null; + + if (bucket.refillPerSecond <= 0) { + logger?.child({ module: "rate-limiter" })?.debug?.("Token limit has no refill; blocking", { + rateLimitKey: key, + capacity: bucket.capacity, + refillPerSecond: bucket.refillPerSecond, + }); + return { kind: "wait" }; + } + + const requiredTokens = -bucket.tokens; + const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000)); + return { kind: "wait", wakeUpAt: now + waitMs }; + } + + private getTokenRateState(key: string, logger?: Logger): TokenRateState | undefined { + const existing = this.tokenRates.get(key); + if (existing) return existing; + + const options = this.rateLimits?.[key]; + if (!options) return undefined; + + const tokensPerMinute = Number(options.tokensPerMinute); + if (!Number.isFinite(tokensPerMinute) || tokensPerMinute <= 0) { + return undefined; + } + + // Token pacing uses a 1-minute burst by default; request bursts are handled separately. 
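/* Worked example of the wait computed in recordUsage/resolveTokenLimit above:
   with tokensPerMinute = 1_500 the refill rate is 25 tokens/s; if recorded
   usage drives the bucket to -500 tokens, waitMs = ceil(500 / 25 * 1000) =
   20_000, so dispatch for this key sleeps roughly 20 s instead of polling. */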
+ const refillPerSecond = tokensPerMinute / 60; + const capacity = tokensPerMinute; + const now = Date.now(); + const created: TokenRateState = { + capacity, + refillPerSecond, + tokens: capacity, + updatedAt: now, + }; + this.tokenRates.set(key, created); + logger?.child({ module: "rate-limiter" })?.trace?.("Created token rate state", { + rateLimitKey: key, + capacity, + refillPerSecond, + }); + return created; + } + + private refillTokenRate(bucket: TokenRateState, now: number): void { + const elapsedMs = now - bucket.updatedAt; + if (elapsedMs <= 0) return; + bucket.updatedAt = now; + if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return; + const refill = (elapsedMs / 1000) * bucket.refillPerSecond; + if (refill <= 0) return; + bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill); + } + + private resolveTokenCount(usage: UsageCounters): number { + const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined; + if (total !== undefined) return total; + const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0; + const output = Number.isFinite(usage.outputTokens) ? usage.outputTokens : 0; + return input + output; + } } diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts index 9aaf5eb0d..6669c6f3a 100644 --- a/packages/core/src/traffic/traffic-types.ts +++ b/packages/core/src/traffic/traffic-types.ts @@ -20,6 +20,11 @@ export interface TrafficRequestMetadata { provider?: string; priority?: TrafficPriority; tenantId?: string; + apiKeyId?: string; + region?: string; + endpoint?: string; + tenantTier?: string; + taskType?: string; } export type ProviderModelConcurrencyLimit = @@ -48,13 +53,19 @@ export interface TrafficControllerOptions { maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit; maxConcurrentPerTenant?: TenantConcurrencyLimit; rateLimits?: RateLimitConfig; + /** + * Optional override for rate-limit key construction. + * Useful when you need to add new metadata fields without changing core logic. + */ + rateLimitKeyBuilder?: (metadata?: TrafficRequestMetadata) => string; logger?: Logger; fallbackChains?: Record; } export interface RateLimitOptions { - capacity: number; - refillPerSecond: number; + requestsPerMinute: number; + tokensPerMinute: number; + burstSize?: number; } export type RateLimitKey = string; diff --git a/packages/core/src/traffic/traffic-usage-tracker.ts b/packages/core/src/traffic/traffic-usage-tracker.ts index e875d21c2..c79b311ad 100644 --- a/packages/core/src/traffic/traffic-usage-tracker.ts +++ b/packages/core/src/traffic/traffic-usage-tracker.ts @@ -17,12 +17,16 @@ export class TrafficUsageTracker { return usage ? { ...usage } : undefined; } - recordUsage(item: QueuedRequest, result: TResponse, logger?: Logger): void { + recordUsage( + item: QueuedRequest, + result: TResponse, + logger?: Logger, + ): UsageCounters | Promise | undefined { const usageLogger = logger?.child({ module: "usage-tracker" }); const extractor = item.extractUsage ?? 
item.request.extractUsage; if (!extractor) { usageLogger?.trace?.("No usage extractor; skipping usage", { tenantId: item.tenantId }); - return; + return undefined; } const usage = extractor(result); @@ -30,7 +34,7 @@ export class TrafficUsageTracker { usageLogger?.trace?.("Usage extractor returned empty; skipping usage", { tenantId: item.tenantId, }); - return; + return undefined; } if (isPromiseLike(usage)) { @@ -38,9 +42,10 @@ export class TrafficUsageTracker { tenantId: item.tenantId, }); void usage.then((u) => u && this.incrementTenantUsage(item.tenantId, u, usageLogger)); - } else { - this.incrementTenantUsage(item.tenantId, usage, usageLogger); + return usage; } + this.incrementTenantUsage(item.tenantId, usage, usageLogger); + return usage; } private incrementTenantUsage(tenantId: string, usage: UsageCounters, logger?: Logger): void { @@ -50,9 +55,18 @@ export class TrafficUsageTracker { totalTokens: 0, }; - const input = usage.inputTokens ?? 0; - const output = usage.outputTokens ?? 0; - const total = usage.totalTokens ?? input + output; + const input = + typeof usage.inputTokens === "number" && Number.isFinite(usage.inputTokens) + ? usage.inputTokens + : 0; + const output = + typeof usage.outputTokens === "number" && Number.isFinite(usage.outputTokens) + ? usage.outputTokens + : 0; + const total = + typeof usage.totalTokens === "number" && Number.isFinite(usage.totalTokens) + ? usage.totalTokens + : input + output; this.tenantUsage.set(tenantId, { inputTokens: current.inputTokens + input, From 9db04813dde245735ff1c7f295500c788f3b184c Mon Sep 17 00:00:00 2001 From: riturajFi Date: Tue, 23 Dec 2025 18:03:35 +0530 Subject: [PATCH 20/41] fix: rate limit mapping config --- .../core/src/traffic/traffic-controller.ts | 23 +++++++++++++++++-- packages/core/src/traffic/traffic-types.ts | 13 +++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 6d55a3290..ba2812060 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -15,6 +15,8 @@ import type { ProviderModelConcurrencyLimit, RateLimitConfig, RateLimitKey, + RateLimitStrategyConfig, + RateLimitStrategyKind, TenantConcurrencyLimit, TenantUsage, TrafficControllerOptions, @@ -34,6 +36,8 @@ export type { ProviderModelConcurrencyLimit, RateLimitConfig, RateLimitKey, + RateLimitStrategyConfig, + RateLimitStrategyKind, TenantConcurrencyLimit, TenantUsage, TrafficControllerOptions, @@ -84,11 +88,12 @@ export class TrafficController { this.trafficLogger = this.logger.child({ subsystem: "traffic" }); this.controllerLogger = this.trafficLogger.child({ module: "controller" }); const rateLimits = options.rateLimits; + const rateLimitStrategy = options.rateLimitStrategy; this.rateLimiter = new TrafficRateLimiter(() => this.scheduleDrain(), { rateLimits, strategyFactory: (key) => { - const provider = key.split("::")[0] ?? 
""; - if (provider.startsWith("openai")) { + const strategyKind = this.resolveRateLimitStrategy(key, rateLimitStrategy); + if (strategyKind === "window") { return new OpenAIWindowRateLimitStrategy(key, rateLimits?.[key]); } return new TokenBucketRateLimitStrategy(key, rateLimits?.[key]); @@ -110,6 +115,7 @@ export class TrafficController { hasProviderModelConcurrency: options.maxConcurrentPerProviderModel !== undefined, hasTenantConcurrency: options.maxConcurrentPerTenant !== undefined, hasConfigRateLimits: options.rateLimits !== undefined, + hasStrategyOverrides: options.rateLimitStrategy !== undefined, }); } @@ -535,6 +541,19 @@ export class TrafficController { private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { return this.rateLimitKeyBuilder(metadata); } + + private resolveRateLimitStrategy( + key: string, + config?: RateLimitStrategyConfig, + ): RateLimitStrategyKind { + const modelOverride = config?.models?.[key]; + if (modelOverride) return modelOverride; + const provider = key.split("::")[0] ?? ""; + const providerOverride = config?.providers?.[provider]; + if (providerOverride) return providerOverride; + if (provider.startsWith("openai")) return "window"; + return "token-bucket"; + } } /* ============================================================ diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts index 6669c6f3a..c68d91ad2 100644 --- a/packages/core/src/traffic/traffic-types.ts +++ b/packages/core/src/traffic/traffic-types.ts @@ -58,10 +58,23 @@ export interface TrafficControllerOptions { * Useful when you need to add new metadata fields without changing core logic. */ rateLimitKeyBuilder?: (metadata?: TrafficRequestMetadata) => string; + /** + * Select a rate-limit strategy by provider/model. 
+ * Example: + * { providers: { openai: "window" }, models: { "openai::gpt-4o": "window" } } + */ + rateLimitStrategy?: RateLimitStrategyConfig; logger?: Logger; fallbackChains?: Record; } +export type RateLimitStrategyKind = "window" | "token-bucket"; + +export type RateLimitStrategyConfig = { + providers?: Record; + models?: Record; +}; + export interface RateLimitOptions { requestsPerMinute: number; tokensPerMinute: number; From f4cbdcb5370177f43b06bb99ed7e3d03a9886c6a Mon Sep 17 00:00:00 2001 From: riturajFi Date: Wed, 24 Dec 2025 00:22:39 +0530 Subject: [PATCH 21/41] fix: error policy --- .../core/src/traffic/traffic-controller.ts | 88 +++++++++++++---- packages/core/src/traffic/traffic-errors.ts | 95 ++++++++++++++++++- packages/core/src/traffic/traffic-retry.ts | 58 ++++++++++- packages/core/src/traffic/traffic-types.ts | 29 ++++++ 4 files changed, 246 insertions(+), 24 deletions(-) diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index ba2812060..387e273ee 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -3,20 +3,26 @@ import { LoggerProxy } from "../logger"; import { TrafficCircuitBreaker } from "./traffic-circuit-breaker"; import { TrafficConcurrencyLimiter } from "./traffic-concurrency-limiter"; import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal"; -import { CircuitBreakerOpenError, RateLimitedUpstreamError } from "./traffic-errors"; +import { + CircuitBreakerOpenError, + RateLimitedUpstreamError, + normalizeRateLimitError, +} from "./traffic-errors"; import { OpenAIWindowRateLimitStrategy, type RateLimitUpdateResult, TokenBucketRateLimitStrategy, TrafficRateLimiter, } from "./traffic-rate-limiter"; -import { type RetryReason, buildRetryPlan } from "./traffic-retry"; +import { buildRetryPlanWithPolicy } from "./traffic-retry"; import type { ProviderModelConcurrencyLimit, RateLimitConfig, RateLimitKey, RateLimitStrategyConfig, RateLimitStrategyKind, + RetryPlan, + RetryPolicyConfig, TenantConcurrencyLimit, TenantUsage, TrafficControllerOptions, @@ -56,6 +62,7 @@ export class TrafficController { private readonly scheduler: Scheduler; private readonly maxConcurrent: number; private readonly rateLimitKeyBuilder: (metadata?: TrafficRequestMetadata) => string; + private readonly retryPolicy?: RetryPolicyConfig; private readonly logger: Logger; private readonly trafficLogger: Logger; private readonly controllerLogger: Logger; @@ -84,6 +91,7 @@ export class TrafficController { this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; this.scheduler = this.createScheduler(); this.rateLimitKeyBuilder = options.rateLimitKeyBuilder ?? 
buildRateLimitKeyFromMetadata; + this.retryPolicy = options.retryPolicy; this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); this.trafficLogger = this.logger.child({ subsystem: "traffic" }); this.controllerLogger = this.trafficLogger.child({ module: "controller" }); @@ -116,6 +124,7 @@ export class TrafficController { hasTenantConcurrency: options.maxConcurrentPerTenant !== undefined, hasConfigRateLimits: options.rateLimits !== undefined, hasStrategyOverrides: options.rateLimitStrategy !== undefined, + hasRetryPolicy: options.retryPolicy !== undefined, }); } @@ -422,6 +431,16 @@ export class TrafficController { this.rateLimiter.recordUsage(rateLimitKey, usage, this.trafficLogger); item.resolve(result); } catch (error) { + const rateLimitKey = item.rateLimitKey ?? this.buildRateLimitKey(item.request.metadata); + const normalizedRateLimitError = normalizeRateLimitError({ + error, + metadata: item.request.metadata, + tenantId: item.tenantId, + key: rateLimitKey, + logger: this.trafficLogger, + }); + const errorForHandling = normalizedRateLimitError ?? error; + this.controllerLogger.warn("Request failed", { tenantId: item.tenantId, attempt: item.attempt, @@ -433,20 +452,45 @@ export class TrafficController { status: (error as { status?: unknown } | null)?.status, statusCode: (error as { statusCode?: unknown } | null)?.statusCode, }); - this.circuitBreaker.recordFailure(item.request.metadata, error, this.trafficLogger); - - const retry = buildRetryPlan(error, item.attempt, this.trafficLogger); - if (retry) { - this.controllerLogger.debug("Retrying request", { - tenantId: item.tenantId, + this.circuitBreaker.recordFailure( + item.request.metadata, + errorForHandling, + this.trafficLogger, + ); + + const retry = buildRetryPlanWithPolicy( + { + error: errorForHandling, attempt: item.attempt, - nextAttempt: item.attempt + 1, - reason: retry.reason, - delayMs: retry.delayMs, - provider: item.request.metadata?.provider, - model: item.request.metadata?.model, - }); - this.scheduleRetry(item, retry); + metadata: item.request.metadata, + key: rateLimitKey, + logger: this.trafficLogger, + }, + this.retryPolicy, + ); + if (retry) { + if (!this.canRetryWithinDeadline(item, retry.delayMs)) { + this.controllerLogger.debug("Retry skipped; deadline exceeded", { + tenantId: item.tenantId, + attempt: item.attempt, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + deadlineAt: item.request.deadlineAt, + delayMs: retry.delayMs, + }); + item.reject(errorForHandling); + } else { + this.controllerLogger.debug("Retrying request", { + tenantId: item.tenantId, + attempt: item.attempt, + nextAttempt: item.attempt + 1, + reason: retry.reason, + delayMs: retry.delayMs, + provider: item.request.metadata?.provider, + model: item.request.metadata?.model, + }); + this.scheduleRetry(item, retry); + } } else { this.controllerLogger.debug("No retry plan; rejecting request", { tenantId: item.tenantId, @@ -454,7 +498,7 @@ export class TrafficController { provider: item.request.metadata?.provider, model: item.request.metadata?.model, }); - item.reject(error); + item.reject(errorForHandling); } } finally { this.rateLimiter.releaseReservation(item.rateLimitKey, this.trafficLogger); @@ -474,10 +518,7 @@ export class TrafficController { * ============================================================ */ - private scheduleRetry( - item: QueuedRequest, - plan: { delayMs: number; reason: RetryReason }, - ): void { + private scheduleRetry(item: QueuedRequest, plan: RetryPlan): 
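/* Deadline interaction, sketched with illustrative numbers: a request queued
   with deadlineAt = Date.now() + 30_000 whose retry plan says delayMs = 45_000
   is rejected immediately ("Retry skipped; deadline exceeded"), while a plan
   with delayMs = 5_000 is rescheduled because the next attempt still lands
   inside the deadline. */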
void { this.controllerLogger.debug("Schedule retry", { tenantId: item.tenantId, priority: item.priority, @@ -506,6 +547,13 @@ export class TrafficController { }, plan.delayMs); } + private canRetryWithinDeadline(item: QueuedRequest, delayMs: number): boolean { + const deadlineAt = item.request.deadlineAt; + if (!deadlineAt) return true; + const nextAttemptAt = Date.now() + delayMs; + return nextAttemptAt <= deadlineAt; + } + /* ============================================================ * Rate limiting (verbatim logic) * ============================================================ diff --git a/packages/core/src/traffic/traffic-errors.ts b/packages/core/src/traffic/traffic-errors.ts index 2fd93890b..6cee702d6 100644 --- a/packages/core/src/traffic/traffic-errors.ts +++ b/packages/core/src/traffic/traffic-errors.ts @@ -1,5 +1,14 @@ +import type { Logger } from "../logger"; +import { extractRetryAfterMs, extractStatusCode } from "./traffic-error-utils"; import type { TrafficRequestMetadata } from "./traffic-types"; +export type RateLimitErrorOptions = { + metadata?: TrafficRequestMetadata; + retryAfterMs?: number; + tenantId?: string; + key?: string; +}; + export class CircuitBreakerOpenError extends Error { readonly retryAfterMs?: number; readonly metadata?: TrafficRequestMetadata; @@ -16,11 +25,93 @@ export class RateLimitedUpstreamError extends Error { readonly status = 429; readonly retryAfterMs?: number; readonly metadata?: TrafficRequestMetadata; + readonly provider?: string; + readonly model?: string; + readonly tenantId?: string; + readonly key?: string; - constructor(message: string, metadata?: TrafficRequestMetadata, retryAfterMs?: number) { + constructor( + message: string, + metadata?: TrafficRequestMetadata, + retryAfterMs?: number, + options?: { tenantId?: string; key?: string }, + ); + constructor(message: string, options?: RateLimitErrorOptions); + constructor( + message: string, + metadataOrOptions?: TrafficRequestMetadata | RateLimitErrorOptions, + retryAfterMs?: number, + legacyOptions?: { tenantId?: string; key?: string }, + ) { super(message); this.name = "RateLimitedUpstreamError"; + const isOptions = + metadataOrOptions && + (Object.prototype.hasOwnProperty.call(metadataOrOptions, "metadata") || + Object.prototype.hasOwnProperty.call(metadataOrOptions, "retryAfterMs") || + Object.prototype.hasOwnProperty.call(metadataOrOptions, "key")); + + const metadata = isOptions + ? (metadataOrOptions as RateLimitErrorOptions).metadata + : (metadataOrOptions as TrafficRequestMetadata | undefined); + const retryAfter = isOptions + ? (metadataOrOptions as RateLimitErrorOptions).retryAfterMs + : retryAfterMs; + const tenantId = isOptions + ? (metadataOrOptions as RateLimitErrorOptions).tenantId + : legacyOptions?.tenantId; + const key = isOptions ? (metadataOrOptions as RateLimitErrorOptions).key : legacyOptions?.key; + this.metadata = metadata; - this.retryAfterMs = retryAfterMs; + this.retryAfterMs = retryAfter; + this.provider = metadata?.provider; + this.model = metadata?.model; + this.tenantId = tenantId ?? metadata?.tenantId; + this.key = key; } } + +export function normalizeRateLimitError(options: { + error: unknown; + metadata?: TrafficRequestMetadata; + tenantId?: string; + key?: string; + logger?: Logger; +}): RateLimitedUpstreamError | undefined { + const { error, metadata, tenantId, key, logger } = options; + const retryAfterMs = + error instanceof RateLimitedUpstreamError + ? (error.retryAfterMs ?? 
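/* Both constructor shapes accepted above, side by side (all values are
   illustrative):

     new RateLimitedUpstreamError("429 from upstream", metadata, 2_000, { tenantId: "t1" });
     new RateLimitedUpstreamError("429 from upstream", {
       metadata,
       retryAfterMs: 2_000,
       tenantId: "t1",
       key: "openai::gpt-4o",
     });

   The options form is detected by probing for own `metadata`, `retryAfterMs`,
   or `key` properties, so a bare TrafficRequestMetadata argument still routes
   through the legacy positional path. */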
extractRetryAfterMs(error, logger)) + : extractRetryAfterMs(error, logger); + + if (error instanceof RateLimitedUpstreamError) { + const baseMetadata = metadata ?? error.metadata; + const baseTenant = tenantId ?? error.tenantId; + const baseKey = key ?? error.key; + if ( + error.metadata === baseMetadata && + error.retryAfterMs === retryAfterMs && + error.tenantId === baseTenant && + error.key === baseKey + ) { + return error; + } + return new RateLimitedUpstreamError(error.message, { + metadata: baseMetadata, + retryAfterMs, + tenantId: baseTenant, + key: baseKey, + }); + } + + const status = extractStatusCode(error, logger); + if (status !== 429) return undefined; + + const message = error instanceof Error ? error.message : "Rate limit exceeded"; + return new RateLimitedUpstreamError(message, { + metadata, + retryAfterMs, + tenantId, + key, + }); +} diff --git a/packages/core/src/traffic/traffic-retry.ts b/packages/core/src/traffic/traffic-retry.ts index a1cd363c1..9604dc53a 100644 --- a/packages/core/src/traffic/traffic-retry.ts +++ b/packages/core/src/traffic/traffic-retry.ts @@ -11,14 +11,27 @@ import { } from "./traffic-constants"; import { extractRetryAfterMs, extractStatusCode, isTimeoutError } from "./traffic-error-utils"; import { RateLimitedUpstreamError } from "./traffic-errors"; +import type { + RetryPlan, + RetryPolicy, + RetryPolicyConfig, + RetryPolicyContext, + RetryReason, +} from "./traffic-types"; -export type RetryReason = "rateLimit" | "serverError" | "timeout"; +export type { + RetryPlan, + RetryPolicy, + RetryPolicyConfig, + RetryPolicyContext, + RetryReason, +} from "./traffic-types"; export function buildRetryPlan( error: unknown, attempt: number, logger?: Logger, -): { delayMs: number; reason: RetryReason } | undefined { +): RetryPlan | undefined { const retryLogger = logger?.child({ module: "retry" }); const reason = getRetryReason(error, retryLogger); if (!reason) { @@ -61,6 +74,47 @@ export function buildRetryPlan( }; } +export function buildRetryPlanWithPolicy( + context: RetryPolicyContext, + policyConfig?: RetryPolicyConfig, +): RetryPlan | undefined { + const retryLogger = context.logger?.child({ module: "retry" }); + const policy = resolveRetryPolicy(context, policyConfig); + if (policy) { + const planned = policy(context); + if (planned) { + retryLogger?.debug?.("Retry policy returned a plan", { + attempt: context.attempt, + reason: planned.reason, + delayMs: planned.delayMs, + }); + return planned; + } + retryLogger?.debug?.("Retry policy declined to retry", { attempt: context.attempt }); + } + + return buildRetryPlan(context.error, context.attempt, context.logger); +} + +function resolveRetryPolicy( + context: RetryPolicyContext, + config?: RetryPolicyConfig, +): RetryPolicy | undefined { + if (!config) return undefined; + const modelPolicy = context.key ? config.models?.[context.key] : undefined; + if (modelPolicy) return modelPolicy; + const providerModelKey = + context.metadata?.provider && context.metadata?.model + ? `${context.metadata.provider}::${context.metadata.model}` + : undefined; + const providerModelPolicy = providerModelKey ? config.models?.[providerModelKey] : undefined; + if (providerModelPolicy) return providerModelPolicy; + const provider = context.metadata?.provider; + const providerPolicy = provider ? 
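/* A sketch of a policy override wired through TrafficControllerOptions.retryPolicy;
   the delays are illustrative:

     const retryPolicy: RetryPolicyConfig = {
       providers: {
         openai: ({ attempt }) =>
           attempt < 3 ? { delayMs: 1_000 * 2 ** attempt, reason: "rateLimit" } : undefined,
       },
     };

   Returning undefined declines the retry, and buildRetryPlanWithPolicy then
   falls back to the built-in buildRetryPlan heuristics. Lookup order: an exact
   models[rate-limit key] entry, then models["provider::model"], then
   providers[provider], then the default policy. */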
config.providers?.[provider] : undefined;
+  if (providerPolicy) return providerPolicy;
+  return config.default;
+}
+
 function getRetryReason(error: unknown, logger?: Logger): RetryReason | undefined {
   if (error instanceof RateLimitedUpstreamError) return "rateLimit";
   const status = extractStatusCode(error, logger);
diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts
index c68d91ad2..2605623f0 100644
--- a/packages/core/src/traffic/traffic-types.ts
+++ b/packages/core/src/traffic/traffic-types.ts
@@ -10,6 +10,29 @@ type UsageCounters = {
   totalTokens?: number;
 };

+export type RetryReason = "rateLimit" | "serverError" | "timeout";
+
+export type RetryPlan = {
+  delayMs: number;
+  reason: RetryReason;
+};
+
+export type RetryPolicyContext = {
+  error: unknown;
+  attempt: number;
+  metadata?: TrafficRequestMetadata;
+  key?: string;
+  logger?: Logger;
+};
+
+export type RetryPolicy = (context: RetryPolicyContext) => RetryPlan | undefined;
+
+export type RetryPolicyConfig = {
+  default?: RetryPolicy;
+  providers?: Record<string, RetryPolicy>;
+  models?: Record<string, RetryPolicy>;
+};
+
 export type TrafficRequestType = "text" | "stream";

 export type TrafficPriority = "P0" | "P1" | "P2";
@@ -41,6 +64,7 @@
   tenantId: string;
   metadata?: TrafficRequestMetadata;
   execute: () => Promise<TResponse>;
+  deadlineAt?: number;
   createFallbackRequest?: (modelId: string) => TrafficRequest<TResponse> | undefined;
   extractUsage?: BivariantFunction<
     [response: TResponse],
@@ -58,6 +82,11 @@
   rateLimitKeyBuilder?: (metadata?: TrafficRequestMetadata) => string;
+  /**
+   * Optional retry policy overrides by provider/model.
+   * Model keys can use the rate-limit key or provider::model.
+   */
+  retryPolicy?: RetryPolicyConfig;
   /**
    * Select a rate-limit strategy by provider/model.

From f8249f9113d17b31cdac55b62ba0bb0639bfcf30 Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Wed, 24 Dec 2025 12:26:36 +0530
Subject: =?UTF-8?q?fix:=20circuit=E2=80=91breaker=20probe,?=
 =?UTF-8?q?=20task=E2=80=91aware=20fallback=20policy=20selection,=20and=20?=
 =?UTF-8?q?cross=E2=80=91provider=20fallback=20target?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/core/src/agent/agent.ts              | 102 +++++++++++--
 packages/core/src/index.ts                    |   5 +
 .../src/traffic/traffic-circuit-breaker.ts    | 135 ++++++++++++++----
 .../core/src/traffic/traffic-constants.ts     |   1 +
 .../traffic/traffic-controller-internal.ts    |   1 +
 .../src/traffic/traffic-controller.spec.ts    |   9 +-
 .../core/src/traffic/traffic-controller.ts    |  12 ++
 packages/core/src/traffic/traffic-types.ts    |  31 +++++-
 8 files changed, 252 insertions(+), 44 deletions(-)

diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts
index 753f75b88..edb22398b 100644
--- a/packages/core/src/agent/agent.ts
+++ b/packages/core/src/agent/agent.ts
@@ -49,6 +49,7 @@ import type { Tool, Toolkit } from "../tool";
 import { createTool } from "../tool";
 import { ToolManager } from "../tool/manager";
 import {
+  type FallbackChainEntry,
   type TrafficPriority,
   type TrafficRequestMetadata,
   getTrafficController,
@@ -276,6 +277,14 @@ export interface BaseGenerationOptions extends Partial {
    * Defaults to agent-level priority when omitted.
    */
   trafficPriority?: TrafficPriority;
+  /**
+   * Optional task classification for circuit-breaker fallback policies.
+ */ + taskType?: string; + /** + * Optional explicit fallback policy id. + */ + fallbackPolicyId?: string; // Parent tracking parentAgentId?: string; @@ -463,16 +472,24 @@ export class Agent { ): Promise { const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics const tenantId = this.resolveTenantId(options); - const buildRequest = (modelOverride?: LanguageModel) => { + const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); - const metadata = this.buildTrafficMetadata(mergedOptions?.model, mergedOptions); // Compute once per queued request (including per-call model overrides) + const metadata = this.buildTrafficMetadata( + mergedOptions?.model, + mergedOptions, + providerOverride, + ); // Compute once per queued request (including per-call model overrides) return { tenantId, metadata, execute: () => this.executeGenerateText(input, mergedOptions, metadata), // Defer actual execution so controller can schedule it extractUsage: (result: GenerateTextResultWithContext) => this.extractUsageFromResponse(result), - createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), + createFallbackRequest: (fallbackTarget) => { + const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = + this.resolveFallbackTarget(fallbackTarget); + return buildRequest(fallbackModel, fallbackProvider); + }, }; }; @@ -583,12 +600,16 @@ export class Agent { hooks, maxSteps: userMaxSteps, tools: userTools, + taskType, + fallbackPolicyId, experimental_output, providerOptions, model: _model, // Exclude model so aiSDKOptions doesn't override resolved model ...aiSDKOptions } = options || {}; void _model; + void taskType; + void fallbackPolicyId; const llmSpan = this.createLLMSpan(oc, { operation: "generateText", @@ -828,16 +849,24 @@ export class Agent { ): Promise { const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent const tenantId = this.resolveTenantId(options); - const buildRequest = (modelOverride?: LanguageModel) => { + const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); - const metadata = this.buildTrafficMetadata(mergedOptions?.model, mergedOptions); // Compute once per queued request (including per-call model overrides) + const metadata = this.buildTrafficMetadata( + mergedOptions?.model, + mergedOptions, + providerOverride, + ); // Compute once per queued request (including per-call model overrides) return { tenantId, metadata, execute: () => this.executeStreamText(input, mergedOptions, metadata), // Actual streaming work happens after the controller dequeues us extractUsage: (result: StreamTextResultWithContext) => this.extractUsageFromResponse(result), - createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), + createFallbackRequest: (fallbackTarget) => { + const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = + this.resolveFallbackTarget(fallbackTarget); + return buildRequest(fallbackModel, fallbackProvider); + }, }; }; @@ -945,12 +974,16 @@ export class Agent { maxSteps: userMaxSteps, tools: userTools, onFinish: userOnFinish, + taskType, + fallbackPolicyId, experimental_output, providerOptions, model: _model, // Exclude model from aiSDKOptions to 
avoid overriding resolved model
       ...aiSDKOptions
     } = options || {};
     void _model;
+    void taskType;
+    void fallbackPolicyId;

     const guardrailStreamingEnabled = guardrailSet.output.length > 0;
@@ -1530,16 +1563,24 @@ export class Agent {
   ): Promise<GenerateObjectResultWithContext<z.infer<TSchema>>> {
     const controller = getTrafficController({ logger: this.logger });
     const tenantId = this.resolveTenantId(options);
-    const buildRequest = (modelOverride?: LanguageModel) => {
+    const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => {
       const mergedOptions = this.mergeOptionsWithModel(options, modelOverride);
-      const metadata = this.buildTrafficMetadata(mergedOptions?.model, mergedOptions); // Compute once per queued request (including per-call model overrides)
+      const metadata = this.buildTrafficMetadata(
+        mergedOptions?.model,
+        mergedOptions,
+        providerOverride,
+      ); // Compute once per queued request (including per-call model overrides)
       return {
         tenantId,
         metadata,
         execute: () => this.executeGenerateObject(input, schema, mergedOptions, metadata),
         extractUsage: (result: GenerateObjectResultWithContext<z.infer<TSchema>>) =>
           this.extractUsageFromResponse(result),
-        createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel),
+        createFallbackRequest: (fallbackTarget) => {
+          const { modelOverride: fallbackModel, providerOverride: fallbackProvider } =
+            this.resolveFallbackTarget(fallbackTarget);
+          return buildRequest(fallbackModel, fallbackProvider);
+        },
       };
     };
@@ -1633,11 +1674,15 @@ export class Agent {
       hooks,
       maxSteps: userMaxSteps,
       tools: userTools,
+      taskType,
+      fallbackPolicyId,
       providerOptions,
       model: _model, // Exclude model so spread does not override resolved model
       ...aiSDKOptions
     } = options || {};
     void _model;
+    void taskType;
+    void fallbackPolicyId;

     methodLogger.info("[AI SDK] Calling generateObject", {
       messageCount: messages.length,
@@ -1796,16 +1841,24 @@ export class Agent {
   ): Promise<StreamObjectResultWithContext<z.infer<TSchema>>> {
     const controller = getTrafficController({ logger: this.logger });
     const tenantId = this.resolveTenantId(options);
-    const buildRequest = (modelOverride?: LanguageModel) => {
+    const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => {
       const mergedOptions = this.mergeOptionsWithModel(options, modelOverride);
-      const metadata = this.buildTrafficMetadata(mergedOptions?.model, mergedOptions); // Compute once per queued request (including per-call model overrides)
+      const metadata = this.buildTrafficMetadata(
+        mergedOptions?.model,
+        mergedOptions,
+        providerOverride,
+      ); // Compute once per queued request (including per-call model overrides)
       return {
         tenantId,
         metadata,
         execute: () => this.executeStreamObject(input, schema, mergedOptions, metadata),
         extractUsage: (result: StreamObjectResultWithContext<z.infer<TSchema>>) =>
           this.extractUsageFromResponse(result),
-        createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel),
+        createFallbackRequest: (fallbackTarget) => {
+          const { modelOverride: fallbackModel, providerOverride: fallbackProvider } =
+            this.resolveFallbackTarget(fallbackTarget);
+          return buildRequest(fallbackModel, fallbackProvider);
+        },
       };
     };
@@ -1901,11 +1954,15 @@ export class Agent {
       maxSteps: userMaxSteps,
       tools: userTools,
       onFinish: userOnFinish,
+      taskType,
+      fallbackPolicyId,
       providerOptions,
       model: _model, // Exclude model so aiSDKOptions cannot override resolved model
       ...aiSDKOptions
     } = options || {};
     void _model;
+    void taskType;
+    void fallbackPolicyId;

     let guardrailObjectPromise!: Promise<z.infer<TSchema>>;
     let resolveGuardrailObject: ((value: z.infer<TSchema>) => void) | undefined;
@@ -4067,9 +4124,13 @@ export class Agent {
   private buildTrafficMetadata(
     modelOverride?: LanguageModel | DynamicValue,
     options?: BaseGenerationOptions,
+    providerOverride?: string,
   ): TrafficRequestMetadata {
     const provider =
-      this.resolveProvider(modelOverride) ?? this.resolveProvider(this.model) ?? undefined;
+      providerOverride ??
+      this.resolveProvider(modelOverride) ??
+      this.resolveProvider(this.model) ??
+      undefined;
     const priority = this.resolveTrafficPriority(options);

     return {
@@ -4079,6 +4140,21 @@ export class Agent {
       provider, // Allows per-provider throttling later
       priority,
       tenantId: this.resolveTenantId(options),
+      taskType: options?.taskType,
+      fallbackPolicyId: options?.fallbackPolicyId,
+    };
+  }
+
+  private resolveFallbackTarget(target: FallbackChainEntry): {
+    modelOverride?: LanguageModel;
+    providerOverride?: string;
+  } {
+    if (typeof target === "string") {
+      return { modelOverride: target };
+    }
+    return {
+      modelOverride: target.model,
+      providerOverride: target.provider,
     };
   }
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index f9dd9fef3..6a74d346e 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -27,6 +27,11 @@ export {
   CircuitBreakerOpenError,
   RateLimitedUpstreamError,
   getTrafficController,
+  type FallbackChainEntry,
+  type FallbackPolicy,
+  type FallbackPolicyConfig,
+  type FallbackPolicyMode,
+  type FallbackTarget,
   type RateLimitConfig,
   type RateLimitKey,
   type RateLimitOptions,
diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts
index 812b29213..5abd73539 100644
--- a/packages/core/src/traffic/traffic-circuit-breaker.ts
+++ b/packages/core/src/traffic/traffic-circuit-breaker.ts
@@ -3,6 +3,7 @@ import {
   CIRCUIT_COOLDOWN_MS,
   CIRCUIT_FAILURE_THRESHOLD,
   CIRCUIT_FAILURE_WINDOW_MS,
+  CIRCUIT_PROBE_INTERVAL_MS,
   CIRCUIT_TIMEOUT_THRESHOLD,
   CIRCUIT_TIMEOUT_WINDOW_MS,
   DEFAULT_FALLBACK_CHAINS,
@@ -15,38 +16,45 @@ import type {
 } from "./traffic-controller-internal";
 import { extractStatusCode, isTimeoutError } from "./traffic-error-utils";
 import { CircuitBreakerOpenError } from "./traffic-errors";
-import type { TrafficRequestMetadata } from "./traffic-types";
+import type {
+  FallbackChainEntry,
+  FallbackPolicy,
+  FallbackPolicyConfig,
+  FallbackTarget,
+  TrafficRequestMetadata,
+} from "./traffic-types";

 export class TrafficCircuitBreaker {
   private readonly circuitBreakers = new Map<string, CircuitState>();
-  private readonly fallbackChains: Map<string, string[]>;
+  private readonly fallbackChains: Map<string, FallbackChainEntry[]>;
+  private readonly fallbackPolicy?: FallbackPolicyConfig;
   private readonly buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string;

   constructor(options: {
-    fallbackChains?: Record<string, string[]>;
+    fallbackChains?: Record<string, FallbackChainEntry[]>;
+    fallbackPolicy?: FallbackPolicyConfig;
     buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string;
   }) {
     this.buildRateLimitKey = options.buildRateLimitKey;
     const chains = options.fallbackChains ?? DEFAULT_FALLBACK_CHAINS;
     this.fallbackChains = new Map(Object.entries(chains));
+    this.fallbackPolicy = options.fallbackPolicy;
   }

   resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
     const circuitLogger = logger?.child({ module: "circuit-breaker" });
-    const visited = new Set<string>();
+    const visitedKeys = new Set<string>();

     while (true) {
       const key = this.buildRateLimitKey(next.request.metadata);
       next.circuitKey = key;
+      visitedKeys.add(key);
       circuitLogger?.trace?.("Circuit resolve step", {
         circuitKey: key,
         provider: next.request.metadata?.provider,
         model: next.request.metadata?.model,
       });

-      const model = next.request.metadata?.model;
-      if (model) visited.add(model);
-
       const evaluation = this.evaluateCircuitState(key, circuitLogger);
       next.circuitStatus = evaluation.state;
       circuitLogger?.debug?.("Circuit evaluated", {
@@ -58,12 +66,25 @@ export class TrafficCircuitBreaker {

       if (evaluation.allowRequest) return null;

-      const fallback = this.findFallbackModel(next.request.metadata, visited, circuitLogger);
+      const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata);
+      if (policy.mode === "wait") {
+        const wakeUpAt =
+          evaluation.retryAfterMs !== undefined ? Date.now() + evaluation.retryAfterMs : undefined;
+        circuitLogger?.debug?.("Circuit open; waiting per fallback policy", {
+          circuitKey: key,
+          policyId,
+          retryAfterMs: evaluation.retryAfterMs,
+          wakeUpAt,
+        });
+        return { kind: "wait", wakeUpAt };
+      }
+
+      const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger);
       circuitLogger?.debug?.("Circuit open; attempting fallback", {
         circuitKey: key,
         currentModel: next.request.metadata?.model,
         fallback,
-        visitedModels: Array.from(visited),
+        visitedKeys: Array.from(visitedKeys),
       });
       if (!fallback || !next.request.createFallbackRequest) {
         next.reject(
@@ -192,6 +213,7 @@ export class TrafficCircuitBreaker {
       state.status = "open";
       state.openedAt = now;
       state.trialInFlight = false;
+      state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS;
       circuitLogger?.warn?.("Circuit opened", {
         circuitKey: key,
         openReasons,
@@ -234,18 +256,27 @@ export class TrafficCircuitBreaker {

     if (state.status === "open") {
       const elapsed = state.openedAt ? now - state.openedAt : 0;
-      if (elapsed >= CIRCUIT_COOLDOWN_MS) {
+      if (state.nextProbeAt === undefined) {
+        state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS;
+      }
+      const cooldownRemaining = Math.max(0, CIRCUIT_COOLDOWN_MS - elapsed);
+      const probeRemaining = Math.max(0, state.nextProbeAt - now);
+      if (probeRemaining === 0 || cooldownRemaining === 0) {
         state.status = "half-open";
         state.trialInFlight = false;
         state.failureTimestamps = [];
         state.timeoutTimestamps = [];
-        logger?.debug?.("Circuit transitioned to half-open", { circuitKey: key });
+        state.nextProbeAt = undefined;
+        logger?.debug?.("Circuit transitioned to half-open", {
+          circuitKey: key,
+          reason: cooldownRemaining === 0 ? "cooldown" : "probe",
+        });
         return { allowRequest: true, state: "half-open" };
       }
       return {
         allowRequest: false,
         state: "open",
-        retryAfterMs: CIRCUIT_COOLDOWN_MS - elapsed,
+        retryAfterMs: Math.min(cooldownRemaining, probeRemaining),
       };
     }

@@ -256,40 +287,65 @@ export class TrafficCircuitBreaker {
     return { allowRequest: true, state: state.status };
   }

-  private findFallbackModel(
+  private resolveFallbackPolicy(metadata: TrafficRequestMetadata | undefined): {
+    policy: FallbackPolicy;
+    policyId?: string;
+  } {
+    const policyId =
+      metadata?.fallbackPolicyId ??
+      (metadata?.taskType
+        ? this.fallbackPolicy?.taskTypePolicyIds?.[metadata.taskType]
+        : undefined) ??
+      this.fallbackPolicy?.defaultPolicyId;
+
+    const policy = policyId ? this.fallbackPolicy?.policies?.[policyId] : undefined;
+    return {
+      policy: policy ?? { mode: "fallback" },
+      policyId,
+    };
+  }
+
+  private findFallbackTarget(
     metadata: TrafficRequestMetadata | undefined,
-    visitedModels: Set<string>,
+    visitedKeys: Set<string>,
     logger?: Logger,
-  ): string | undefined {
+  ): FallbackChainEntry | undefined {
     const currentModel = metadata?.model;
     if (!currentModel) {
       logger?.trace?.("No current model; no fallback", {});
       return undefined;
     }

-    const chain = this.fallbackChains.get(currentModel);
+    const provider = metadata?.provider;
+    const chain = this.resolveFallbackChain(provider, currentModel);
     if (!chain) {
-      logger?.trace?.("No fallback chain for model", { currentModel });
+      logger?.trace?.("No fallback chain for model", {
+        currentModel,
+        provider,
+      });
       return undefined;
     }

-    const provider = metadata?.provider;
     for (const candidate of chain) {
-      if (visitedModels.has(candidate)) {
+      const target = this.normalizeFallbackTarget(candidate, provider);
+      const candidateMetadata: TrafficRequestMetadata = {
+        ...(metadata ?? {}),
+        provider: target.provider ?? provider,
+        model: target.model,
+      };
+      const candidateKey = this.buildRateLimitKey(candidateMetadata);
+      if (visitedKeys.has(candidateKey)) {
         continue;
       }

-      const candidateKey = this.buildRateLimitKey({
-        provider,
-        model: candidate,
-      });
-
       const evaluation = this.evaluateCircuitState(candidateKey, logger);
       if (evaluation.allowRequest) {
-        visitedModels.add(candidate);
-        logger?.debug?.("Selected fallback model", {
+        visitedKeys.add(candidateKey);
+        logger?.debug?.("Selected fallback target", {
           currentModel,
-          fallbackModel: candidate,
+          currentProvider: provider,
+          fallbackModel: target.model,
+          fallbackProvider: target.provider ?? provider,
           fallbackCircuitKey: candidateKey,
         });
         return candidate;
@@ -299,6 +355,31 @@ export class TrafficCircuitBreaker {
     return undefined;
   }

+  private resolveFallbackChain(
+    provider: string | undefined,
+    model: string,
+  ): FallbackChainEntry[] | undefined {
+    const providerKey = provider ? `${provider}::${model}` : undefined;
+    if (providerKey) {
+      const providerChain = this.fallbackChains.get(providerKey);
+      if (providerChain) return providerChain;
+    }
+    return this.fallbackChains.get(model);
+  }
+
+  private normalizeFallbackTarget(
+    candidate: FallbackChainEntry,
+    provider: string | undefined,
+  ): FallbackTarget {
+    if (typeof candidate === "string") {
+      return { provider, model: candidate };
+    }
+    return {
+      provider: candidate.provider ?? provider,
+      model: candidate.model,
+    };
+  }
+
   private isCircuitBreakerStatus(status?: number): boolean {
     return status === 429 || (status !== undefined && status >= 500);
   }
diff --git a/packages/core/src/traffic/traffic-constants.ts b/packages/core/src/traffic/traffic-constants.ts
index c21f8a17e..68d99df78 100644
--- a/packages/core/src/traffic/traffic-constants.ts
+++ b/packages/core/src/traffic/traffic-constants.ts
@@ -14,6 +14,7 @@ export const CIRCUIT_FAILURE_WINDOW_MS = 10_000;
 export const CIRCUIT_TIMEOUT_THRESHOLD = CIRCUIT_FAILURE_THRESHOLD;
 export const CIRCUIT_TIMEOUT_WINDOW_MS = CIRCUIT_FAILURE_WINDOW_MS;
 export const CIRCUIT_COOLDOWN_MS = 30_000;
+export const CIRCUIT_PROBE_INTERVAL_MS = 5_000;

 export const RATE_LIMIT_EXHAUSTION_BUFFER = 1;
 export const RATE_LIMIT_PROBE_DELAY_MS = 50;
diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts
index 065c92c1d..7221ccd0e 100644
--- a/packages/core/src/traffic/traffic-controller-internal.ts
+++ b/packages/core/src/traffic/traffic-controller-internal.ts
@@ -15,6 +15,7 @@ export interface CircuitState {
   timeoutTimestamps: number[];
   openedAt?: number;
   trialInFlight?: boolean;
+  nextProbeAt?: number;
 }

 export interface RateLimitWindowState {
diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
index 6b59a266e..b3f331b20 100644
--- a/packages/core/src/traffic/traffic-controller.spec.ts
+++ b/packages/core/src/traffic/traffic-controller.spec.ts
@@ -261,10 +261,15 @@ describe("TrafficController stream reporting", () => {
           order.push("primary");
           return "primary";
         },
-        createFallbackRequest: (modelId) => ({
+        createFallbackRequest: (target) => ({
           tenantId,
-          metadata: { provider: "p", model: modelId, priority: "P1" },
+          metadata: {
+            provider: "p",
+            model: typeof target === "string" ? target : target.model,
+            priority: "P1",
+          },
           execute: async () => {
+            const modelId = typeof target === "string" ? target : target.model;
             order.push(modelId);
             return modelId;
           },
diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts
index 387e273ee..c5e52f57e 100644
--- a/packages/core/src/traffic/traffic-controller.ts
+++ b/packages/core/src/traffic/traffic-controller.ts
@@ -16,6 +16,11 @@ import {
 } from "./traffic-rate-limiter";
 import { buildRetryPlanWithPolicy } from "./traffic-retry";
 import type {
+  FallbackChainEntry,
+  FallbackPolicy,
+  FallbackPolicyConfig,
+  FallbackPolicyMode,
+  FallbackTarget,
   ProviderModelConcurrencyLimit,
   RateLimitConfig,
   RateLimitKey,
@@ -39,6 +44,11 @@ import { TrafficUsageTracker } from "./traffic-usage-tracker";
 */

 export type {
+  FallbackChainEntry,
+  FallbackPolicy,
+  FallbackPolicyConfig,
+  FallbackPolicyMode,
+  FallbackTarget,
   ProviderModelConcurrencyLimit,
   RateLimitConfig,
   RateLimitKey,
@@ -109,6 +119,7 @@ export class TrafficController {
     });
     this.circuitBreaker = new TrafficCircuitBreaker({
       fallbackChains: options.fallbackChains,
+      fallbackPolicy: options.fallbackPolicy,
       buildRateLimitKey: (metadata) => this.buildRateLimitKey(metadata),
     });
     this.concurrencyLimiter = new TrafficConcurrencyLimiter({
@@ -120,6 +131,7 @@ export class TrafficController {
     this.controllerLogger.debug("Initialized TrafficController", {
       maxConcurrent: this.maxConcurrent,
       hasFallbackChains: !!options.fallbackChains,
+      hasFallbackPolicy: options.fallbackPolicy !== undefined,
       hasProviderModelConcurrency: options.maxConcurrentPerProviderModel !== undefined,
       hasTenantConcurrency: options.maxConcurrentPerTenant !== undefined,
       hasConfigRateLimits: options.rateLimits !== undefined,
diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts
index 2605623f0..e0be91792 100644
--- a/packages/core/src/traffic/traffic-types.ts
+++ b/packages/core/src/traffic/traffic-types.ts
@@ -48,8 +48,28 @@ export interface TrafficRequestMetadata {
   endpoint?: string;
   tenantTier?: string;
   taskType?: string;
+  fallbackPolicyId?: string;
 }

+export type FallbackTarget = {
+  provider?: string;
+  model: string;
+};
+
+export type FallbackChainEntry = string | FallbackTarget;
+
+export type FallbackPolicyMode = "fallback" | "wait";
+
+export type FallbackPolicy = {
+  mode: FallbackPolicyMode;
+};
+
+export type FallbackPolicyConfig = {
+  defaultPolicyId?: string;
+  policies?: Record<string, FallbackPolicy>;
+  taskTypePolicyIds?: Record<string, string>;
+};
+
 export type ProviderModelConcurrencyLimit =
   | number
   | Record<string, number>
@@ -65,7 +85,10 @@ export interface TrafficRequest {
   metadata?: TrafficRequestMetadata;
   execute: () => Promise<TResponse>;
   deadlineAt?: number;
-  createFallbackRequest?: (modelId: string) => TrafficRequest | undefined;
+  createFallbackRequest?: BivariantFunction<
+    [target: FallbackChainEntry],
+    TrafficRequest | undefined
+  >;
   extractUsage?: BivariantFunction<
     [response: TResponse],
     Promise<UsageCounters | undefined> | UsageCounters | undefined
@@ -87,6 +110,10 @@ export interface TrafficControllerOptions {
    * Model keys can use the rate-limit key or provider::model.
    */
   retryPolicy?: RetryPolicyConfig;
+  /**
+   * Optional fallback policy selection by task type or explicit policy id.
+   */
+  fallbackPolicy?: FallbackPolicyConfig;
   /**
    * Select a rate-limit strategy by provider/model.
    * Example:
@@ -94,7 +121,7 @@ export interface TrafficControllerOptions {
    */
   rateLimitStrategy?: RateLimitStrategyConfig;
   logger?: Logger;
-  fallbackChains?: Record<string, string[]>;
+  fallbackChains?: Record<string, FallbackChainEntry[]>;
 }

 export type RateLimitStrategyKind = "window" | "token-bucket";
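Taken together, the fallback surface this patch adds would be configured roughly like the sketch below. This is an illustration, not part of the patch: the model and provider ids are placeholders, and it assumes the exported TrafficController constructor accepts the TrafficControllerOptions shown in the types above.

    import { TrafficController } from "@voltagent/core";

    const controller = new TrafficController({
      // Chains may be keyed by bare model id or "provider::model";
      // resolveFallbackChain checks the provider-qualified key first.
      fallbackChains: {
        "openai::gpt-4o": [
          "gpt-4o-mini", // plain string keeps the current provider
          { provider: "anthropic", model: "claude-sonnet" }, // FallbackTarget can switch provider
        ],
      },
      fallbackPolicy: {
        defaultPolicyId: "failover",
        policies: {
          failover: { mode: "fallback" }, // walk the chain when a circuit opens
          hold: { mode: "wait" },         // wait for the circuit to half-open instead
        },
        // Requests whose metadata carries taskType "interactive-chat" resolve
        // to "hold" unless they set an explicit fallbackPolicyId.
        taskTypePolicyIds: { "interactive-chat": "hold" },
      },
    });

Because visited tracking now uses circuit keys rather than model names, the same model reached through two different providers counts as two distinct fallback candidates.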
From 21755175ed9fc1d90ee5624e9ccbd48ed1de212f Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Wed, 24 Dec 2025 15:29:58 +0530
Subject: [PATCH 23/41] fix: strict queue wait limits

---
 packages/core/src/agent/agent.ts                |  16 ++
 packages/core/src/index.ts                      |   1 +
 .../src/traffic/traffic-circuit-breaker.ts      |  80 +++++++-
 .../traffic/traffic-controller-internal.ts      |   1 +
 packages/core/src/traffic/traffic-controller.ts | 171 +++++++++++++++++-
 packages/core/src/traffic/traffic-errors.ts     |  24 +++
 packages/core/src/traffic/traffic-types.ts      |   1 +
 7 files changed, 283 insertions(+), 11 deletions(-)

diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts
index edb22398b..04e86bb9f 100644
--- a/packages/core/src/agent/agent.ts
+++ b/packages/core/src/agent/agent.ts
@@ -277,6 +277,10 @@ export interface BaseGenerationOptions extends Partial {
    * Defaults to agent-level priority when omitted.
    */
   trafficPriority?: TrafficPriority;
+  /**
+   * Optional maximum time to wait in the queue before timing out.
+   */
+  maxQueueWaitMs?: number;
   /**
    * Optional task classification for circuit-breaker fallback policies.
    */
@@ -482,6 +486,7 @@ export class Agent {
       return {
         tenantId,
         metadata,
+        maxQueueWaitMs: options?.maxQueueWaitMs,
         execute: () => this.executeGenerateText(input, mergedOptions, metadata), // Defer actual execution so controller can schedule it
         extractUsage: (result: GenerateTextResultWithContext) =>
           this.extractUsageFromResponse(result),
@@ -600,6 +605,7 @@ export class Agent {
       hooks,
       maxSteps: userMaxSteps,
       tools: userTools,
+      maxQueueWaitMs,
       taskType,
       fallbackPolicyId,
       experimental_output,
@@ -608,6 +614,7 @@ export class Agent {
       ...aiSDKOptions
     } = options || {};
     void _model;
+    void maxQueueWaitMs;
     void taskType;
     void fallbackPolicyId;
@@ -859,6 +866,7 @@ export class Agent {
       return {
         tenantId,
         metadata,
+        maxQueueWaitMs: options?.maxQueueWaitMs,
         execute: () => this.executeStreamText(input, mergedOptions, metadata), // Actual streaming work happens after the controller dequeues us
         extractUsage: (result: StreamTextResultWithContext) =>
           this.extractUsageFromResponse(result),
@@ -974,6 +982,7 @@ export class Agent {
       maxSteps: userMaxSteps,
       tools: userTools,
       onFinish: userOnFinish,
+      maxQueueWaitMs,
       taskType,
       fallbackPolicyId,
       experimental_output,
@@ -982,6 +991,7 @@ export class Agent {
       ...aiSDKOptions
     } = options || {};
     void _model;
+    void maxQueueWaitMs;
     void taskType;
     void fallbackPolicyId;
@@ -1573,6 +1583,7 @@ export class Agent {
       return {
         tenantId,
         metadata,
+        maxQueueWaitMs: options?.maxQueueWaitMs,
         execute: () => this.executeGenerateObject(input, schema, mergedOptions, metadata),
         extractUsage: (result: GenerateObjectResultWithContext<z.infer<TSchema>>) =>
           this.extractUsageFromResponse(result),
@@ -1676,6 +1687,7 @@ export class Agent {
       tools: userTools,
       taskType,
       fallbackPolicyId,
+      maxQueueWaitMs,
       providerOptions,
       model: _model, // Exclude model so spread does not override resolved model
       ...aiSDKOptions
@@ -1683,6 +1695,7 @@ export class Agent {
     void _model;
     void taskType;
     void fallbackPolicyId;
+    void maxQueueWaitMs;

     methodLogger.info("[AI SDK] Calling generateObject", {
       messageCount: messages.length,
@@ -1851,6 +1864,7 @@ export class Agent {
       return {
         tenantId,
         metadata,
+        maxQueueWaitMs: options?.maxQueueWaitMs,
         execute: () => this.executeStreamObject(input, schema, mergedOptions, metadata),
         extractUsage: (result: StreamObjectResultWithContext<z.infer<TSchema>>) =>
           this.extractUsageFromResponse(result),
@@ -1956,6 +1970,7 @@ export class Agent {
       onFinish: userOnFinish,
       taskType,
       fallbackPolicyId,
+      maxQueueWaitMs,
       providerOptions,
       model: _model, // Exclude model so aiSDKOptions cannot override resolved model
       ...aiSDKOptions
@@ -1963,6 +1978,7 @@ export class Agent {
     void _model;
     void taskType;
     void fallbackPolicyId;
+    void maxQueueWaitMs;

     let guardrailObjectPromise!: Promise<z.infer<TSchema>>;
     let resolveGuardrailObject: ((value: z.infer<TSchema>) => void) | undefined;
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 6a74d346e..70e37bf23 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -25,6 +25,7 @@ export {
   // Surface traffic controller so downstream consumers can route agent calls through the shared scheduler
   TrafficController,
   CircuitBreakerOpenError,
+  QueueWaitTimeoutError,
   RateLimitedUpstreamError,
   getTrafficController,
   type FallbackChainEntry,
diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts
index 5abd73539..00f6fd93c 100644
--- a/packages/core/src/traffic/traffic-circuit-breaker.ts
+++ b/packages/core/src/traffic/traffic-circuit-breaker.ts
@@ -110,21 +110,58 @@ export class TrafficCircuitBreaker {
         return { kind: "skip" };
       }

-      next.request = fallbackRequest;
-      next.attempt = 1;
-      next.tenantConcurrencyKey = undefined;
-      next.providerModelConcurrencyKey = undefined;
-      next.rateLimitKey = undefined;
-      next.etaMs = undefined;
-      next.circuitKey = undefined;
-      next.circuitStatus = undefined;
-      circuitLogger?.debug?.("Switched to fallback request", {
+      this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, {
         previousCircuitKey: key,
-        fallbackModel: fallback,
+        reason: "circuit-open",
       });
     }
   }

+  tryFallback(next: QueuedRequest, reason: "queue-timeout", logger?: Logger): boolean {
+    const circuitLogger = logger?.child({ module: "circuit-breaker" });
+    const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata);
+    if (policy.mode === "wait") {
+      circuitLogger?.debug?.("Fallback skipped by policy", {
+        policyId,
+        reason,
+        provider: next.request.metadata?.provider,
+        model: next.request.metadata?.model,
+      });
+      return false;
+    }
+
+    const visitedKeys = new Set<string>();
+    const key = this.buildRateLimitKey(next.request.metadata);
+    visitedKeys.add(key);
+
+    const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger);
+    if (!fallback || !next.request.createFallbackRequest) {
+      circuitLogger?.debug?.("Fallback unavailable for request", {
+        reason,
+        provider: next.request.metadata?.provider,
+        model: next.request.metadata?.model,
+        fallback,
+      });
+      return false;
+    }
+
+    const fallbackRequest = next.request.createFallbackRequest(fallback);
+    if (!fallbackRequest) {
+      circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", {
+        reason,
+        fallback,
+      });
+      return false;
+    }
+
+    this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, {
+      previousCircuitKey: key,
+      reason,
+      policyId,
+    });
+    return true;
+  }
+
   markTrial(item: QueuedRequest, logger?: Logger): void {
     const circuitLogger = logger?.child({ module: "circuit-breaker" });
     const key = item.circuitKey;
@@ -305,6 +342,29 @@ export class TrafficCircuitBreaker {
     };
   }

+  private applyFallbackRequest(
+    next: QueuedRequest,
+    fallbackRequest: QueuedRequest["request"],
+    fallback: FallbackChainEntry,
+    logger?: Logger,
+    context?: { previousCircuitKey?: string; reason?: string; policyId?: string },
+  ): void {
+    next.request = fallbackRequest;
+    next.attempt = 1;
+    next.tenantConcurrencyKey = undefined;
+    next.providerModelConcurrencyKey = undefined;
+    next.rateLimitKey = undefined;
+    next.etaMs = undefined;
+    next.circuitKey = undefined;
+    next.circuitStatus = undefined;
+    logger?.debug?.("Switched to fallback request", {
+      previousCircuitKey: context?.previousCircuitKey,
+      fallbackModel: fallback,
+      reason: context?.reason,
+      policyId: context?.policyId,
+    });
+  }
+
   private findFallbackTarget(
     metadata: TrafficRequestMetadata | undefined,
     visitedKeys: Set<string>,
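With tryFallback in place, the same chain-walking logic now serves two triggers: an open circuit and a queue-wait timeout, and both respect the resolved policy. A caller-side sketch of the precedence rules (the ids are placeholders; `agent` is assumed to be an Agent built on this branch):

    // Explicit fallbackPolicyId wins over the taskType mapping, which wins
    // over fallbackPolicy.defaultPolicyId (see resolveFallbackPolicy above).
    await agent.generateText("nightly digest", {
      taskType: "digest",        // would resolve via taskTypePolicyIds
      fallbackPolicyId: "hold",  // but this explicit id takes precedence
      maxQueueWaitMs: 30_000,    // with a "wait" policy, a queue timeout
                                 // rejects instead of switching models
    });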
diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts
index 7221ccd0e..cbcaab277 100644
--- a/packages/core/src/traffic/traffic-controller-internal.ts
+++ b/packages/core/src/traffic/traffic-controller-internal.ts
@@ -38,6 +38,7 @@ export interface QueuedRequest {
   attempt: number;
   priority: TrafficPriority;
   tenantId: string;
+  enqueuedAt: number;

   tenantConcurrencyKey?: string;
   providerModelConcurrencyKey?: string;
diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts
index c5e52f57e..6a1a5d26d 100644
--- a/packages/core/src/traffic/traffic-controller.ts
+++ b/packages/core/src/traffic/traffic-controller.ts
@@ -5,6 +5,7 @@ import { TrafficConcurrencyLimiter } from "./traffic-concurrency-limiter";
 import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal";
 import {
   CircuitBreakerOpenError,
+  QueueWaitTimeoutError,
   RateLimitedUpstreamError,
   normalizeRateLimitError,
 } from "./traffic-errors";
@@ -64,6 +65,7 @@ export type {
 };

 export { CircuitBreakerOpenError };
+export { QueueWaitTimeoutError };
 export { RateLimitedUpstreamError };

 export class TrafficController {
@@ -256,6 +258,7 @@ export class TrafficController {
         attempt: 1,
         priority,
         tenantId: request.tenantId,
+        enqueuedAt: Date.now(),
         extractUsage: request.extractUsage,
       });
       this.scheduleDrain();
@@ -321,6 +324,22 @@ export class TrafficController {
       for (let index = 0; index < queue.length; index++) {
         const next = queue[index];
         if (!next) continue;
+        const now = Date.now();
+        const queueTimeoutAt = this.resolveQueueTimeoutAt(next);
+        const queueTimeoutTriggered = this.handleQueueTimeout(
+          next,
+          queue,
+          index,
+          now,
+          queueTimeoutAt,
+        );
+        if (queueTimeoutTriggered === "rejected") {
+          return { kind: "skip" };
+        }
+        if (queueTimeoutAt !== undefined && now < queueTimeoutAt) {
+          observeWakeUpAt(queueTimeoutAt);
+        }
+        const queueTimeoutExpired = queueTimeoutTriggered === "expired";

         this.controllerLogger.trace("Evaluate next queued request", {
           priority,
@@ -346,6 +365,23 @@ export class TrafficController {
           return { kind: "skip" };
         }
         if (circuit.kind === "wait") {
+          if (
+            this.rejectIfQueueTimedOut(
+              queueTimeoutExpired,
+              next,
+              queue,
+              index,
+              now,
+              "circuit wait",
+            )
+          ) {
+            return { kind: "skip" };
+          }
+          if (circuit.wakeUpAt !== undefined) {
+            next.etaMs = Math.max(0, circuit.wakeUpAt - now);
+          } else {
+            next.etaMs = undefined;
+          }
           observeWakeUpAt(circuit.wakeUpAt);
           continue;
         }
@@ -360,6 +396,19 @@ export class TrafficController {
             model: next.request.metadata?.model,
             reasons: concurrency.reasons,
           });
+          if (
+            this.rejectIfQueueTimedOut(
+              queueTimeoutExpired,
+              next,
+              queue,
+              index,
+              now,
+              "concurrency wait",
+            )
+          ) {
+            return { kind: "skip" };
+          }
+          next.etaMs = undefined;
           continue;
         }

@@ -370,10 +419,45 @@ export class TrafficController {
             decision: rateLimit,
             rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata),
           });
-          if (rateLimit.kind === "wait") observeWakeUpAt(rateLimit.wakeUpAt);
+          if (rateLimit.kind === "wait") {
+            if (
+              this.rejectIfQueueTimedOut(
+                queueTimeoutExpired,
+                next,
+                queue,
+                index,
+                now,
+                "rate limit wait",
+              )
+            ) {
+              return { kind: "skip" };
+            }
+            if (rateLimit.wakeUpAt !== undefined) {
+              next.etaMs = Math.max(0, rateLimit.wakeUpAt - now);
+            } else {
+              next.etaMs = undefined;
+            }
+            observeWakeUpAt(rateLimit.wakeUpAt);
+          }
           continue;
         }

+        if (queueTimeoutExpired) {
+          const timeoutError = this.createQueueTimeoutError(next, now);
+          this.controllerLogger.warn("Queue wait timed out before dispatch", {
+            tenantId: next.tenantId,
+            waitedMs: timeoutError.waitedMs,
+            maxQueueWaitMs: timeoutError.maxQueueWaitMs,
+            deadlineAt: timeoutError.deadlineAt,
+            provider: next.request.metadata?.provider,
+            model: next.request.metadata?.model,
+            rateLimitKey: timeoutError.rateLimitKey,
+          });
+          queue.splice(index, 1);
+          next.reject(timeoutError);
+          return { kind: "skip" };
+        }
+
         this.startRequest(next, index);
         return { kind: "dispatch" };
       }
@@ -548,6 +632,7 @@ export class TrafficController {
     this.queues[item.priority].push({
       ...item,
       attempt: item.attempt + 1,
+      enqueuedAt: Date.now(),
       tenantConcurrencyKey: undefined,
       providerModelConcurrencyKey: undefined,
       rateLimitKey: undefined,
@@ -594,6 +679,90 @@ export class TrafficController {
    * ============================================================
    */

+  private resolveQueueTimeoutAt(next: QueuedRequest): number | undefined {
+    const maxQueueWaitMs = next.request.maxQueueWaitMs;
+    const normalizedMaxWait =
+      typeof maxQueueWaitMs === "number" && Number.isFinite(maxQueueWaitMs)
+        ? Math.max(0, maxQueueWaitMs)
+        : undefined;
+    const timeoutAt =
+      normalizedMaxWait !== undefined ? next.enqueuedAt + normalizedMaxWait : undefined;
+    const deadlineAt = next.request.deadlineAt;
+    if (timeoutAt === undefined) return deadlineAt;
+    if (deadlineAt === undefined) return timeoutAt;
+    return Math.min(timeoutAt, deadlineAt);
+  }
+
+  private handleQueueTimeout(
+    next: QueuedRequest,
+    queue: QueuedRequest[],
+    index: number,
+    now: number,
+    queueTimeoutAt?: number,
+  ): "none" | "expired" | "rejected" {
+    if (queueTimeoutAt === undefined) return "none";
+    if (now < queueTimeoutAt) return "none";
+
+    const fallbackApplied = this.circuitBreaker.tryFallback(
+      next,
+      "queue-timeout",
+      this.trafficLogger,
+    );
+    if (fallbackApplied) {
+      return "expired";
+    }
+
+    const timeoutError = this.createQueueTimeoutError(next, now);
+    this.controllerLogger.warn("Queue wait timed out; rejecting request", {
+      tenantId: next.tenantId,
+      waitedMs: timeoutError.waitedMs,
+      maxQueueWaitMs: timeoutError.maxQueueWaitMs,
+      deadlineAt: timeoutError.deadlineAt,
+      provider: next.request.metadata?.provider,
+      model: next.request.metadata?.model,
+      rateLimitKey: timeoutError.rateLimitKey,
+    });
+    queue.splice(index, 1);
+    next.reject(timeoutError);
+    return "rejected";
+  }
+
+  private rejectIfQueueTimedOut(
+    queueTimeoutExpired: boolean,
+    next: QueuedRequest,
+    queue: QueuedRequest[],
+    index: number,
+    now: number,
+    reason: string,
+  ): boolean {
+    if (!queueTimeoutExpired) return false;
+    const timeoutError = this.createQueueTimeoutError(next, now);
+    this.controllerLogger.warn("Queue wait timed out during gate wait", {
+      tenantId: next.tenantId,
+      waitedMs: timeoutError.waitedMs,
+      maxQueueWaitMs: timeoutError.maxQueueWaitMs,
+      deadlineAt: timeoutError.deadlineAt,
+      provider: next.request.metadata?.provider,
+      model: next.request.metadata?.model,
+      rateLimitKey: timeoutError.rateLimitKey,
+      reason,
+    });
+    queue.splice(index, 1);
+    next.reject(timeoutError);
+    return true;
+  }
+
+  private createQueueTimeoutError(next: QueuedRequest, now: number): QueueWaitTimeoutError {
+    const waitedMs = Math.max(0, now - next.enqueuedAt);
+    return new QueueWaitTimeoutError({
+      waitedMs,
+      maxQueueWaitMs: next.request.maxQueueWaitMs,
+      deadlineAt: next.request.deadlineAt,
+      metadata: next.request.metadata,
+      rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata),
+    });
+  }
+
   private resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority {
     return metadata?.priority ?? "P1";
   }
diff --git a/packages/core/src/traffic/traffic-errors.ts b/packages/core/src/traffic/traffic-errors.ts
index 6cee702d6..4943c89fd 100644
--- a/packages/core/src/traffic/traffic-errors.ts
+++ b/packages/core/src/traffic/traffic-errors.ts
@@ -21,6 +21,30 @@ export class CircuitBreakerOpenError extends Error {
   }
 }

+export class QueueWaitTimeoutError extends Error {
+  readonly waitedMs: number;
+  readonly maxQueueWaitMs?: number;
+  readonly deadlineAt?: number;
+  readonly metadata?: TrafficRequestMetadata;
+  readonly rateLimitKey?: string;
+
+  constructor(options: {
+    waitedMs: number;
+    maxQueueWaitMs?: number;
+    deadlineAt?: number;
+    metadata?: TrafficRequestMetadata;
+    rateLimitKey?: string;
+  }) {
+    super("Queue wait time exceeded");
+    this.name = "QueueWaitTimeoutError";
+    this.waitedMs = options.waitedMs;
+    this.maxQueueWaitMs = options.maxQueueWaitMs;
+    this.deadlineAt = options.deadlineAt;
+    this.metadata = options.metadata;
+    this.rateLimitKey = options.rateLimitKey;
+  }
+}
+
 export class RateLimitedUpstreamError extends Error {
   readonly status = 429;
   readonly retryAfterMs?: number;
diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts
index e0be91792..7108442cd 100644
--- a/packages/core/src/traffic/traffic-types.ts
+++ b/packages/core/src/traffic/traffic-types.ts
@@ -85,6 +85,7 @@ export interface TrafficRequest {
   metadata?: TrafficRequestMetadata;
   execute: () => Promise<TResponse>;
   deadlineAt?: number;
+  maxQueueWaitMs?: number;
   createFallbackRequest?: BivariantFunction<
     [target: FallbackChainEntry],
     TrafficRequest | undefined
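A caller-side sketch of the strict wait limit this patch introduces. Per resolveQueueTimeoutAt, the effective cutoff is min(enqueuedAt + maxQueueWaitMs, deadlineAt); on expiry the controller first attempts a circuit-breaker fallback and only rejects with QueueWaitTimeoutError when no fallback applies. The prompt and `agent` instance below are assumptions for illustration:

    import { QueueWaitTimeoutError } from "@voltagent/core";

    try {
      await agent.generateText("ping", { maxQueueWaitMs: 2_000 });
    } catch (error) {
      if (error instanceof QueueWaitTimeoutError) {
        // waitedMs / maxQueueWaitMs / rateLimitKey identify which gate starved the request
        console.warn("queued too long", error.waitedMs, error.rateLimitKey);
      } else {
        throw error;
      }
    }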
"P1"; } diff --git a/packages/core/src/traffic/traffic-errors.ts b/packages/core/src/traffic/traffic-errors.ts index 6cee702d6..4943c89fd 100644 --- a/packages/core/src/traffic/traffic-errors.ts +++ b/packages/core/src/traffic/traffic-errors.ts @@ -21,6 +21,30 @@ export class CircuitBreakerOpenError extends Error { } } +export class QueueWaitTimeoutError extends Error { + readonly waitedMs: number; + readonly maxQueueWaitMs?: number; + readonly deadlineAt?: number; + readonly metadata?: TrafficRequestMetadata; + readonly rateLimitKey?: string; + + constructor(options: { + waitedMs: number; + maxQueueWaitMs?: number; + deadlineAt?: number; + metadata?: TrafficRequestMetadata; + rateLimitKey?: string; + }) { + super("Queue wait time exceeded"); + this.name = "QueueWaitTimeoutError"; + this.waitedMs = options.waitedMs; + this.maxQueueWaitMs = options.maxQueueWaitMs; + this.deadlineAt = options.deadlineAt; + this.metadata = options.metadata; + this.rateLimitKey = options.rateLimitKey; + } +} + export class RateLimitedUpstreamError extends Error { readonly status = 429; readonly retryAfterMs?: number; diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts index e0be91792..7108442cd 100644 --- a/packages/core/src/traffic/traffic-types.ts +++ b/packages/core/src/traffic/traffic-types.ts @@ -85,6 +85,7 @@ export interface TrafficRequest { metadata?: TrafficRequestMetadata; execute: () => Promise; deadlineAt?: number; + maxQueueWaitMs?: number; createFallbackRequest?: BivariantFunction< [target: FallbackChainEntry], TrafficRequest | undefined From 04921a3c1499934084ad1f15819ca242fb33d1d9 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Wed, 24 Dec 2025 22:52:28 +0530 Subject: [PATCH 24/41] fix: strict queue wait limits --- packages/core/src/index.ts | 3 + .../src/traffic/traffic-circuit-breaker.ts | 20 +- .../traffic/traffic-controller-internal.ts | 1 + .../core/src/traffic/traffic-controller.ts | 524 ++++++++++++++++-- packages/core/src/traffic/traffic-types.ts | 25 + .../src/handlers/agent.handlers.ts | 96 +++- packages/server-core/src/index.ts | 1 + packages/server-core/src/types/responses.ts | 3 + packages/server-core/src/utils/traffic.ts | 35 ++ packages/server-hono/src/routes/index.ts | 11 +- packages/serverless-hono/src/routes.ts | 7 +- 11 files changed, 650 insertions(+), 76 deletions(-) create mode 100644 packages/server-core/src/utils/traffic.ts diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 70e37bf23..3850f0acf 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -36,8 +36,11 @@ export { type RateLimitConfig, type RateLimitKey, type RateLimitOptions, + type AdaptiveLimiterConfig, + type PriorityBurstLimits, type TrafficRequest, type TrafficRequestMetadata, + type TrafficResponseMetadata, type TrafficPriority, type TrafficRequestType, } from "./traffic/traffic-controller"; diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts index 00f6fd93c..f240ce405 100644 --- a/packages/core/src/traffic/traffic-circuit-breaker.ts +++ b/packages/core/src/traffic/traffic-circuit-breaker.ts @@ -22,6 +22,7 @@ import type { FallbackPolicyConfig, FallbackTarget, TrafficRequestMetadata, + TrafficResponseMetadata, } from "./traffic-types"; export class TrafficCircuitBreaker { @@ -87,13 +88,20 @@ export class TrafficCircuitBreaker { visitedKeys: Array.from(visitedKeys), }); if (!fallback || !next.request.createFallbackRequest) { - next.reject( - new 
CircuitBreakerOpenError( - `Circuit open for ${key}`, - next.request.metadata, - evaluation.retryAfterMs, - ), + const error = new CircuitBreakerOpenError( + `Circuit open for ${key}`, + next.request.metadata, + evaluation.retryAfterMs, ); + const traffic: TrafficResponseMetadata = { + rateLimitKey: key, + retryAfterMs: evaluation.retryAfterMs, + tenantId: next.request.metadata?.tenantId ?? next.tenantId, + priority: next.request.metadata?.priority, + taskType: next.request.metadata?.taskType, + }; + (error as Record).traffic = traffic; + next.reject(error); circuitLogger?.warn?.("No fallback available; rejecting request", { circuitKey: key, retryAfterMs: evaluation.retryAfterMs, diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts index cbcaab277..cf4358542 100644 --- a/packages/core/src/traffic/traffic-controller-internal.ts +++ b/packages/core/src/traffic/traffic-controller-internal.ts @@ -39,6 +39,7 @@ export interface QueuedRequest { priority: TrafficPriority; tenantId: string; enqueuedAt: number; + dispatchedAt?: number; tenantConcurrencyKey?: string; providerModelConcurrencyKey?: string; diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 6a1a5d26d..c26b914d8 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -17,11 +17,13 @@ import { } from "./traffic-rate-limiter"; import { buildRetryPlanWithPolicy } from "./traffic-retry"; import type { + AdaptiveLimiterConfig, FallbackChainEntry, FallbackPolicy, FallbackPolicyConfig, FallbackPolicyMode, FallbackTarget, + PriorityBurstLimits, ProviderModelConcurrencyLimit, RateLimitConfig, RateLimitKey, @@ -36,6 +38,7 @@ import type { TrafficRequest, TrafficRequestMetadata, TrafficRequestType, + TrafficResponseMetadata, } from "./traffic-types"; import { TrafficUsageTracker } from "./traffic-usage-tracker"; @@ -45,11 +48,13 @@ import { TrafficUsageTracker } from "./traffic-usage-tracker"; */ export type { + AdaptiveLimiterConfig, FallbackChainEntry, FallbackPolicy, FallbackPolicyConfig, FallbackPolicyMode, FallbackTarget, + PriorityBurstLimits, ProviderModelConcurrencyLimit, RateLimitConfig, RateLimitKey, @@ -61,6 +66,7 @@ export type { TrafficPriority, TrafficRequest, TrafficRequestMetadata, + TrafficResponseMetadata, TrafficRequestType, }; @@ -68,6 +74,42 @@ export { CircuitBreakerOpenError }; export { QueueWaitTimeoutError }; export { RateLimitedUpstreamError }; +type TenantQueueState = { + order: string[]; + index: number; + queues: Map; +}; + +type RateLimitSnapshot = { + limit?: number; + remaining?: number; + resetAt?: number; + nextAllowedAt?: number; + retryAfterMs?: number; +}; + +type AdaptiveLimiterState = { + recent429s: number[]; + penaltyMs: number; + cooldownUntil?: number; + last429At?: number; +}; + +const DEFAULT_PRIORITY_BURST_LIMITS: Record = { + P0: 5, + P1: 3, + P2: 2, +}; + +const DEFAULT_ADAPTIVE_LIMITER: Required = { + windowMs: 30_000, + threshold: 3, + minPenaltyMs: 500, + maxPenaltyMs: 10_000, + penaltyMultiplier: 2, + decayMs: 10_000, +}; + export class TrafficController { /* ---------- Core ---------- */ @@ -80,12 +122,18 @@ export class TrafficController { private readonly controllerLogger: Logger; private readonly concurrencyLimiter: TrafficConcurrencyLimiter; - private readonly queues: Record = { - P0: [], - P1: [], - P2: [], + private readonly queues: Record = { + P0: { order: [], index: 0, queues: new Map() 
}, + P1: { order: [], index: 0, queues: new Map() }, + P2: { order: [], index: 0, queues: new Map() }, }; private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"]; + private readonly priorityBurstLimits: Record; + private readonly priorityBurstCounts: Record = { + P0: 0, + P1: 0, + P2: 0, + }; private activeCount = 0; private drainScheduled = false; @@ -99,11 +147,26 @@ export class TrafficController { /* ---------- Usage ---------- */ private readonly usageTracker = new TrafficUsageTracker(); + /* ---------- Traffic metadata ---------- */ + private readonly rateLimitSnapshots = new Map(); + + /* ---------- Adaptive limiter ---------- */ + private readonly adaptiveLimiterConfig: Required; + private readonly adaptiveLimiterState = new Map(); + constructor(options: TrafficControllerOptions = {}) { this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; this.scheduler = this.createScheduler(); this.rateLimitKeyBuilder = options.rateLimitKeyBuilder ?? buildRateLimitKeyFromMetadata; this.retryPolicy = options.retryPolicy; + this.priorityBurstLimits = { + ...DEFAULT_PRIORITY_BURST_LIMITS, + ...(options.priorityBurstLimits ?? {}), + }; + this.adaptiveLimiterConfig = { + ...DEFAULT_ADAPTIVE_LIMITER, + ...(options.adaptiveLimiter ?? {}), + }; this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); this.trafficLogger = this.logger.child({ subsystem: "traffic" }); this.controllerLogger = this.trafficLogger.child({ module: "controller" }); @@ -139,6 +202,8 @@ export class TrafficController { hasConfigRateLimits: options.rateLimits !== undefined, hasStrategyOverrides: options.rateLimitStrategy !== undefined, hasRetryPolicy: options.retryPolicy !== undefined, + hasPriorityBurstLimits: options.priorityBurstLimits !== undefined, + hasAdaptiveLimiter: options.adaptiveLimiter !== undefined, }); } @@ -175,6 +240,13 @@ export class TrafficController { priority: metadata?.priority, }); this.circuitBreaker.recordSuccess(metadata, this.trafficLogger); + const rateLimitKey = this.buildRateLimitKey(metadata); + const adaptiveKey = this.buildAdaptiveKey( + metadata, + metadata?.tenantId ?? "default", + rateLimitKey, + ); + this.recordAdaptiveSuccess(adaptiveKey); } reportStreamFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void { @@ -189,6 +261,19 @@ export class TrafficController { statusCode: (error as { statusCode?: unknown } | null)?.statusCode, }); this.circuitBreaker.recordFailure(metadata, error, this.trafficLogger); + const rateLimitKey = this.buildRateLimitKey(metadata); + const adaptiveKey = this.buildAdaptiveKey( + metadata, + metadata?.tenantId ?? 
"default", + rateLimitKey, + ); + if (error instanceof RateLimitedUpstreamError) { + this.recordAdaptiveRateLimitHit(adaptiveKey, error.retryAfterMs); + } + this.attachTrafficMetadata( + error, + this.buildTrafficResponseMetadataFromMetadata(metadata, rateLimitKey, Date.now(), error), + ); } updateRateLimitFromHeaders( @@ -220,6 +305,14 @@ export class TrafficController { resetRequestsMs: update.headerSnapshot.resetRequestsMs, }); + this.rateLimitSnapshots.set(update.key, { + limit: update.state.limit, + remaining: update.state.remaining, + resetAt: update.state.resetAt, + nextAllowedAt: update.state.nextAllowedAt, + retryAfterMs: update.headerSnapshot.retryAfterMs, + }); + return update; } @@ -243,21 +336,22 @@ export class TrafficController { ): Promise { return new Promise((resolve, reject) => { const priority = this.resolvePriority(request.metadata); + const tenantId = this.resolveTenantId(request); this.controllerLogger.debug("Enqueue request", { type, - tenantId: request.tenantId, + tenantId, priority, provider: request.metadata?.provider, model: request.metadata?.model, }); - this.queues[priority].push({ + this.enqueueItem({ type, request, resolve, reject, attempt: 1, priority, - tenantId: request.tenantId, + tenantId, enqueuedAt: Date.now(), extractUsage: request.extractUsage, }); @@ -281,9 +375,9 @@ export class TrafficController { this.controllerLogger.trace("Drain start", { activeCount: this.activeCount, maxConcurrent: this.maxConcurrent, - queuedP0: this.queues.P0.length, - queuedP1: this.queues.P1.length, - queuedP2: this.queues.P2.length, + queuedP0: this.getQueuedCount("P0"), + queuedP1: this.getQueuedCount("P1"), + queuedP2: this.getQueuedCount("P2"), }); while (true) { const decision = this.tryDispatchNext(); @@ -319,21 +413,25 @@ export class TrafficController { earliestWakeUpAt === undefined ? 
candidate : Math.min(earliestWakeUpAt, candidate); }; - for (const priority of this.priorityOrder) { - const queue = this.queues[priority]; - for (let index = 0; index < queue.length; index++) { - const next = queue[index]; - if (!next) continue; + const priorities = this.getPriorityDispatchOrder(); + for (const priority of priorities) { + const state = this.queues[priority]; + if (state.order.length === 0) continue; + + let attempts = 0; + const maxAttempts = state.order.length; + + while (attempts < maxAttempts) { + const candidate = this.getNextTenantCandidate(priority); + if (!candidate) break; + attempts += 1; + + const { item: next, queue, tenantId } = candidate; const now = Date.now(); const queueTimeoutAt = this.resolveQueueTimeoutAt(next); - const queueTimeoutTriggered = this.handleQueueTimeout( - next, - queue, - index, - now, - queueTimeoutAt, - ); + const queueTimeoutTriggered = this.handleQueueTimeout(next, queue, 0, now, queueTimeoutAt); if (queueTimeoutTriggered === "rejected") { + this.cleanupTenantQueue(priority, tenantId, queue); return { kind: "skip" }; } if (queueTimeoutAt !== undefined && now < queueTimeoutAt) { @@ -343,13 +441,12 @@ export class TrafficController { this.controllerLogger.trace("Evaluate next queued request", { priority, - queueIndex: index, - queueLength: queue.length, - type: next.type, tenantId: next.tenantId, + type: next.type, attempt: next.attempt, provider: next.request.metadata?.provider, model: next.request.metadata?.model, + queueLength: queue.length, }); const circuit = this.resolveCircuit(next); @@ -361,27 +458,19 @@ export class TrafficController { circuitStatus: next.circuitStatus, }); if (circuit.kind === "skip") { - queue.splice(index, 1); + queue.shift(); + this.cleanupTenantQueue(priority, tenantId, queue); return { kind: "skip" }; } if (circuit.kind === "wait") { if ( - this.rejectIfQueueTimedOut( - queueTimeoutExpired, - next, - queue, - index, - now, - "circuit wait", - ) + this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "circuit wait") ) { + this.cleanupTenantQueue(priority, tenantId, queue); return { kind: "skip" }; } - if (circuit.wakeUpAt !== undefined) { - next.etaMs = Math.max(0, circuit.wakeUpAt - now); - } else { - next.etaMs = undefined; - } + next.etaMs = + circuit.wakeUpAt !== undefined ? Math.max(0, circuit.wakeUpAt - now) : undefined; observeWakeUpAt(circuit.wakeUpAt); continue; } @@ -397,21 +486,29 @@ export class TrafficController { reasons: concurrency.reasons, }); if ( - this.rejectIfQueueTimedOut( - queueTimeoutExpired, - next, - queue, - index, - now, - "concurrency wait", - ) + this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "concurrency wait") ) { + this.cleanupTenantQueue(priority, tenantId, queue); return { kind: "skip" }; } next.etaMs = undefined; continue; } + const adaptive = this.resolveAdaptiveLimit(next, now); + if (adaptive?.kind === "wait") { + if ( + this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "adaptive wait") + ) { + this.cleanupTenantQueue(priority, tenantId, queue); + return { kind: "skip" }; + } + next.etaMs = + adaptive.wakeUpAt !== undefined ? 
Math.max(0, adaptive.wakeUpAt - now) : undefined; + observeWakeUpAt(adaptive.wakeUpAt); + continue; + } + const rateLimit = this.resolveRateLimit(next); if (rateLimit) { this.controllerLogger.trace("Rate limit resolution returned decision", { @@ -425,18 +522,16 @@ export class TrafficController { queueTimeoutExpired, next, queue, - index, + 0, now, "rate limit wait", ) ) { + this.cleanupTenantQueue(priority, tenantId, queue); return { kind: "skip" }; } - if (rateLimit.wakeUpAt !== undefined) { - next.etaMs = Math.max(0, rateLimit.wakeUpAt - now); - } else { - next.etaMs = undefined; - } + next.etaMs = + rateLimit.wakeUpAt !== undefined ? Math.max(0, rateLimit.wakeUpAt - now) : undefined; observeWakeUpAt(rateLimit.wakeUpAt); } continue; @@ -444,6 +539,15 @@ export class TrafficController { if (queueTimeoutExpired) { const timeoutError = this.createQueueTimeoutError(next, now); + this.attachTrafficMetadata( + timeoutError, + this.buildTrafficResponseMetadata( + next, + timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), + now, + timeoutError, + ), + ); this.controllerLogger.warn("Queue wait timed out before dispatch", { tenantId: next.tenantId, waitedMs: timeoutError.waitedMs, @@ -453,12 +557,13 @@ export class TrafficController { model: next.request.metadata?.model, rateLimitKey: timeoutError.rateLimitKey, }); - queue.splice(index, 1); + queue.shift(); + this.cleanupTenantQueue(priority, tenantId, queue); next.reject(timeoutError); return { kind: "skip" }; } - this.startRequest(next, index); + this.startRequest(next, queue, tenantId); return { kind: "dispatch" }; } } @@ -468,7 +573,7 @@ export class TrafficController { : { kind: "wait" }; } - private startRequest(item: QueuedRequest, queueIndex: number): void { + private startRequest(item: QueuedRequest, queue: QueuedRequest[], tenantId: string): void { this.controllerLogger.debug("Start request", { priority: item.priority, type: item.type, @@ -477,7 +582,10 @@ export class TrafficController { provider: item.request.metadata?.provider, model: item.request.metadata?.model, }); - this.queues[item.priority].splice(queueIndex, 1); + item.dispatchedAt = Date.now(); + queue.shift(); + this.cleanupTenantQueue(item.priority, tenantId, queue); + this.recordPriorityDispatch(item.priority); this.activeCount++; this.concurrencyLimiter.acquire(item, this.trafficLogger); this.rateLimiter.notifyDispatch(item.rateLimitKey, this.trafficLogger); @@ -506,6 +614,8 @@ export class TrafficController { activeCount: this.activeCount, }); const result = await item.request.execute(); + const rateLimitKey = item.rateLimitKey ?? this.buildRateLimitKey(item.request.metadata); + const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey); this.controllerLogger.debug("Request succeeded", { tenantId: item.tenantId, attempt: item.attempt, @@ -523,8 +633,12 @@ export class TrafficController { this.circuitBreaker.recordSuccess(item.request.metadata, this.trafficLogger); } const usage = this.usageTracker.recordUsage(item, result, this.trafficLogger); - const rateLimitKey = item.rateLimitKey ?? this.buildRateLimitKey(item.request.metadata); this.rateLimiter.recordUsage(rateLimitKey, usage, this.trafficLogger); + this.recordAdaptiveSuccess(adaptiveKey); + this.attachTrafficMetadata( + result, + this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now()), + ); item.resolve(result); } catch (error) { const rateLimitKey = item.rateLimitKey ?? 
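The two new dispatch knobs compose as below — a sketch under assumed values, not part of the patch. Note the penalty arithmetic in recordAdaptiveRateLimitHit: the base penalty starts at minPenaltyMs, so the first trip (the third 429 inside the window) already cools down for max(500, round(500 * 2)) = 1000 ms, then 2000, 4000, 8000, capped at maxPenaltyMs, and an upstream retryAfterMs wins when it is larger.

    import { TrafficController } from "@voltagent/core";

    const controller = new TrafficController({
      // After N consecutive dispatches at one priority, yield once to lower
      // priorities if they have queued work (values here match the defaults).
      priorityBurstLimits: { P0: 5, P1: 3, P2: 2 },
      adaptiveLimiter: {
        windowMs: 30_000,      // count 429s over a 30s window
        threshold: 3,          // third 429 in the window trips a cooldown
        minPenaltyMs: 500,
        penaltyMultiplier: 2,  // successive trips: 1_000 -> 2_000 -> 4_000 ...
        maxPenaltyMs: 10_000,  // ... capped here
        decayMs: 10_000,       // 10s without a 429 halves the penalty
      },
    });

Tenant round-robin plus the per-tenant adaptive key means one tenant hammering a rate-limited provider::model pair no longer blocks its neighbours in the same priority band.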
this.buildRateLimitKey(item.request.metadata); @@ -536,6 +650,10 @@ export class TrafficController { logger: this.trafficLogger, }); const errorForHandling = normalizedRateLimitError ?? error; + const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey); + if (errorForHandling instanceof RateLimitedUpstreamError) { + this.recordAdaptiveRateLimitHit(adaptiveKey, errorForHandling.retryAfterMs); + } this.controllerLogger.warn("Request failed", { tenantId: item.tenantId, @@ -553,6 +671,10 @@ export class TrafficController { errorForHandling, this.trafficLogger, ); + this.attachTrafficMetadata( + errorForHandling, + this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now(), errorForHandling), + ); const retry = buildRetryPlanWithPolicy( { @@ -629,10 +751,11 @@ export class TrafficController { priority: item.priority, nextAttempt: item.attempt + 1, }); - this.queues[item.priority].push({ + this.enqueueItem({ ...item, attempt: item.attempt + 1, enqueuedAt: Date.now(), + dispatchedAt: undefined, tenantConcurrencyKey: undefined, providerModelConcurrencyKey: undefined, rateLimitKey: undefined, @@ -713,6 +836,15 @@ export class TrafficController { } const timeoutError = this.createQueueTimeoutError(next, now); + this.attachTrafficMetadata( + timeoutError, + this.buildTrafficResponseMetadata( + next, + timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), + now, + timeoutError, + ), + ); this.controllerLogger.warn("Queue wait timed out; rejecting request", { tenantId: next.tenantId, waitedMs: timeoutError.waitedMs, @@ -737,6 +869,15 @@ export class TrafficController { ): boolean { if (!queueTimeoutExpired) return false; const timeoutError = this.createQueueTimeoutError(next, now); + this.attachTrafficMetadata( + timeoutError, + this.buildTrafficResponseMetadata( + next, + timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), + now, + timeoutError, + ), + ); this.controllerLogger.warn("Queue wait timed out during gate wait", { tenantId: next.tenantId, waitedMs: timeoutError.waitedMs, @@ -763,6 +904,115 @@ export class TrafficController { }); } + private resolveTenantId(request: TrafficRequest): string { + return request.tenantId ?? request.metadata?.tenantId ?? 
"default"; + } + + private enqueueItem(item: QueuedRequest): void { + const state = this.queues[item.priority]; + const tenantId = item.tenantId; + let queue = state.queues.get(tenantId); + if (!queue) { + queue = []; + state.queues.set(tenantId, queue); + state.order.push(tenantId); + } + queue.push(item); + } + + private getQueuedCount(priority: TrafficPriority): number { + const state = this.queues[priority]; + let total = 0; + for (const queue of state.queues.values()) { + total += queue.length; + } + return total; + } + + private hasQueuedWorkBelow(priority: TrafficPriority): boolean { + const index = this.priorityOrder.indexOf(priority); + if (index < 0) return false; + for (let i = index + 1; i < this.priorityOrder.length; i += 1) { + if (this.getQueuedCount(this.priorityOrder[i]) > 0) { + return true; + } + } + return false; + } + + private canDispatchPriority(priority: TrafficPriority): boolean { + const limit = this.priorityBurstLimits[priority]; + if (!Number.isFinite(limit) || limit <= 0) return true; + if (this.priorityBurstCounts[priority] < limit) return true; + return !this.hasQueuedWorkBelow(priority); + } + + private recordPriorityDispatch(priority: TrafficPriority): void { + for (const key of this.priorityOrder) { + if (key !== priority) { + this.priorityBurstCounts[key] = 0; + } + } + this.priorityBurstCounts[priority] += 1; + } + + private getPriorityDispatchOrder(): TrafficPriority[] { + return this.priorityOrder.filter((priority) => this.canDispatchPriority(priority)); + } + + private getNextTenantCandidate( + priority: TrafficPriority, + ): { item: QueuedRequest; queue: QueuedRequest[]; tenantId: string } | undefined { + const state = this.queues[priority]; + if (state.order.length === 0) return undefined; + const maxAttempts = state.order.length; + let attempts = 0; + + while (attempts < maxAttempts && state.order.length > 0) { + const index = state.index % state.order.length; + const tenantId = state.order[index]; + const queue = state.queues.get(tenantId); + attempts += 1; + + if (!queue || queue.length === 0) { + this.removeTenantQueue(priority, tenantId); + continue; + } + + state.index = (index + 1) % state.order.length; + return { item: queue[0], queue, tenantId }; + } + + return undefined; + } + + private cleanupTenantQueue( + priority: TrafficPriority, + tenantId: string, + queue: QueuedRequest[], + ): void { + if (queue.length > 0) return; + this.removeTenantQueue(priority, tenantId); + } + + private removeTenantQueue(priority: TrafficPriority, tenantId: string): void { + const state = this.queues[priority]; + state.queues.delete(tenantId); + const index = state.order.indexOf(tenantId); + if (index === -1) return; + state.order.splice(index, 1); + if (state.order.length === 0) { + state.index = 0; + return; + } + if (state.index > index) { + state.index -= 1; + } + if (state.index >= state.order.length) { + state.index = 0; + } + } + private resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority { return metadata?.priority ?? "P1"; } @@ -771,6 +1021,161 @@ export class TrafficController { return this.rateLimitKeyBuilder(metadata); } + private resolveAdaptiveLimit(next: QueuedRequest, now: number): DispatchDecision | null { + const rateLimitKey = next.rateLimitKey ?? 
this.buildRateLimitKey(next.request.metadata); + const adaptiveKey = this.buildAdaptiveKey(next.request.metadata, next.tenantId, rateLimitKey); + const state = this.adaptiveLimiterState.get(adaptiveKey); + if (!state) return null; + + this.applyAdaptiveDecay(state, now); + if (state.cooldownUntil !== undefined && now < state.cooldownUntil) { + return { kind: "wait", wakeUpAt: state.cooldownUntil }; + } + + return null; + } + + private recordAdaptiveRateLimitHit(key: string, retryAfterMs?: number): void { + const state = this.getAdaptiveState(key); + const now = Date.now(); + const { windowMs, threshold, minPenaltyMs, maxPenaltyMs, penaltyMultiplier } = + this.adaptiveLimiterConfig; + + state.last429At = now; + state.recent429s = state.recent429s.filter((timestamp) => now - timestamp <= windowMs); + state.recent429s.push(now); + + if (state.recent429s.length < threshold) { + return; + } + + const basePenalty = state.penaltyMs > 0 ? state.penaltyMs : minPenaltyMs; + const nextPenalty = Math.min( + maxPenaltyMs, + Math.max(minPenaltyMs, Math.round(basePenalty * penaltyMultiplier)), + ); + state.penaltyMs = nextPenalty; + const retryPenalty = typeof retryAfterMs === "number" ? retryAfterMs : 0; + const cooldownMs = Math.max(nextPenalty, retryPenalty); + state.cooldownUntil = now + cooldownMs; + } + + private recordAdaptiveSuccess(key: string): void { + const state = this.adaptiveLimiterState.get(key); + if (!state) return; + + const now = Date.now(); + this.applyAdaptiveDecay(state, now); + if (state.penaltyMs === 0) { + state.cooldownUntil = undefined; + state.recent429s = []; + state.last429At = undefined; + } + } + + private applyAdaptiveDecay(state: AdaptiveLimiterState, now: number): void { + const { decayMs, penaltyMultiplier } = this.adaptiveLimiterConfig; + if (state.last429At && now - state.last429At < decayMs) { + return; + } + + if (state.penaltyMs > 0) { + state.penaltyMs = Math.max(0, Math.floor(state.penaltyMs / penaltyMultiplier)); + } + } + + private getAdaptiveState(key: string): AdaptiveLimiterState { + const existing = this.adaptiveLimiterState.get(key); + if (existing) return existing; + const created: AdaptiveLimiterState = { + recent429s: [], + penaltyMs: 0, + }; + this.adaptiveLimiterState.set(key, created); + return created; + } + + private buildAdaptiveKey( + metadata: TrafficRequestMetadata | undefined, + tenantId: string, + rateLimitKey: string, + ): string { + if (rateLimitKey.includes("tenant=")) { + return rateLimitKey; + } + const tenant = metadata?.tenantId ?? tenantId ?? "default"; + return `${rateLimitKey}::tenant=${encodeURIComponent(tenant)}`; + } + + private buildTrafficResponseMetadata( + item: QueuedRequest, + rateLimitKey: string, + now: number, + error?: unknown, + ): TrafficResponseMetadata { + const snapshot = this.rateLimitSnapshots.get(rateLimitKey); + const retryAfterMs = this.resolveRetryAfterMs(error, snapshot); + const queuedForMs = + item.dispatchedAt !== undefined ? item.dispatchedAt - item.enqueuedAt : now - item.enqueuedAt; + const queueEtaMs = item.etaMs ?? Math.max(0, queuedForMs); + + return { + rateLimitKey, + retryAfterMs, + rateLimitRemaining: snapshot?.remaining, + rateLimitResetAt: snapshot?.resetAt, + rateLimitResetInMs: + snapshot?.resetAt !== undefined ? 
Math.max(0, snapshot.resetAt - now) : undefined, + queueEtaMs, + tenantId: item.tenantId, + priority: item.request.metadata?.priority, + taskType: item.request.metadata?.taskType, + }; + } + + private buildTrafficResponseMetadataFromMetadata( + metadata: TrafficRequestMetadata | undefined, + rateLimitKey: string, + now: number, + error?: unknown, + ): TrafficResponseMetadata { + const snapshot = this.rateLimitSnapshots.get(rateLimitKey); + const retryAfterMs = this.resolveRetryAfterMs(error, snapshot); + + return { + rateLimitKey, + retryAfterMs, + rateLimitRemaining: snapshot?.remaining, + rateLimitResetAt: snapshot?.resetAt, + rateLimitResetInMs: + snapshot?.resetAt !== undefined ? Math.max(0, snapshot.resetAt - now) : undefined, + tenantId: metadata?.tenantId, + priority: metadata?.priority, + taskType: metadata?.taskType, + }; + } + + private attachTrafficMetadata(target: unknown, info: TrafficResponseMetadata): void { + if (!target || typeof target !== "object") return; + (target as Record).traffic = info; + } + + private resolveRetryAfterMs( + error: unknown | undefined, + snapshot?: RateLimitSnapshot, + ): number | undefined { + if (error && typeof error === "object" && "retryAfterMs" in error) { + const candidate = (error as { retryAfterMs?: unknown }).retryAfterMs; + if (typeof candidate === "number" && Number.isFinite(candidate)) { + return candidate; + } + } + if (snapshot?.retryAfterMs !== undefined) { + return snapshot.retryAfterMs; + } + return undefined; + } + private resolveRateLimitStrategy( key: string, config?: RateLimitStrategyConfig, @@ -812,6 +1217,7 @@ function buildRateLimitKeyFromMetadata(metadata?: TrafficRequestMetadata): strin { label: "apiKey", value: metadata?.apiKeyId }, { label: "region", value: metadata?.region }, { label: "endpoint", value: metadata?.endpoint }, + { label: "tenant", value: metadata?.tenantId }, { label: "tenantTier", value: metadata?.tenantTier }, { label: "taskType", value: metadata?.taskType }, ]; diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts index 7108442cd..f2ebbafbb 100644 --- a/packages/core/src/traffic/traffic-types.ts +++ b/packages/core/src/traffic/traffic-types.ts @@ -51,6 +51,18 @@ export interface TrafficRequestMetadata { fallbackPolicyId?: string; } +export type TrafficResponseMetadata = { + rateLimitKey?: string; + retryAfterMs?: number; + rateLimitRemaining?: number; + rateLimitResetAt?: number; + rateLimitResetInMs?: number; + queueEtaMs?: number; + tenantId?: string; + priority?: TrafficPriority; + taskType?: string; +}; + export type FallbackTarget = { provider?: string; model: string; @@ -80,6 +92,17 @@ export type TenantConcurrencyLimit = | Record | ((tenantId: string, metadata: TrafficRequestMetadata | undefined) => number | undefined); +export type PriorityBurstLimits = Partial>; + +export type AdaptiveLimiterConfig = { + windowMs?: number; + threshold?: number; + minPenaltyMs?: number; + maxPenaltyMs?: number; + penaltyMultiplier?: number; + decayMs?: number; +}; + export interface TrafficRequest { tenantId: string; metadata?: TrafficRequestMetadata; @@ -101,6 +124,8 @@ export interface TrafficControllerOptions { maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit; maxConcurrentPerTenant?: TenantConcurrencyLimit; rateLimits?: RateLimitConfig; + priorityBurstLimits?: PriorityBurstLimits; + adaptiveLimiter?: AdaptiveLimiterConfig; /** * Optional override for rate-limit key construction. 
* Useful when you need to add new metadata fields without changing core logic. diff --git a/packages/server-core/src/handlers/agent.handlers.ts b/packages/server-core/src/handlers/agent.handlers.ts index 00c0f2ee9..37fbeaf4e 100644 --- a/packages/server-core/src/handlers/agent.handlers.ts +++ b/packages/server-core/src/handlers/agent.handlers.ts @@ -1,11 +1,70 @@ -import { ClientHTTPError, type ServerProviderDeps } from "@voltagent/core"; -import { convertUsage } from "@voltagent/core"; +import { + ClientHTTPError, + type ServerProviderDeps, + type TrafficResponseMetadata, + convertUsage, +} from "@voltagent/core"; import { type Logger, safeStringify } from "@voltagent/internal"; import { z } from "zod"; import { convertJsonSchemaToZod } from "zod-from-json-schema"; import { convertJsonSchemaToZod as convertJsonSchemaToZodV3 } from "zod-from-json-schema-v3"; import type { ApiResponse } from "../types"; import { processAgentOptions } from "../utils/options"; +import { buildTrafficHeaders } from "../utils/traffic"; + +function extractTrafficMetadata(value: unknown): TrafficResponseMetadata | undefined { + if (!value || typeof value !== "object") return undefined; + const traffic = (value as { traffic?: unknown }).traffic; + if (!traffic || typeof traffic !== "object") return undefined; + return traffic as TrafficResponseMetadata; +} + +function wrapStreamWithTraffic( + baseResponse: Response, + traffic?: TrafficResponseMetadata, +): Response { + if (!traffic) return baseResponse; + const headers = new Headers(baseResponse.headers); + const trafficHeaders = buildTrafficHeaders(traffic); + for (const [key, value] of Object.entries(trafficHeaders)) { + headers.set(key, value); + } + const baseBody = baseResponse.body; + if (!baseBody) { + return new Response(baseBody, { + status: baseResponse.status, + headers, + }); + } + + const encoder = new TextEncoder(); + const stream = new ReadableStream({ + async start(controller) { + const trafficEvent = `data: ${safeStringify({ type: "traffic", traffic })}\n\n`; + controller.enqueue(encoder.encode(trafficEvent)); + const reader = baseBody.getReader(); + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + if (value !== undefined) { + controller.enqueue(value); + } + } + } catch (error) { + controller.error(error); + } finally { + reader.releaseLock(); + controller.close(); + } + }, + }); + + return new Response(stream, { + status: baseResponse.status, + headers, + }); +} /** * Handler for listing all agents @@ -79,6 +138,7 @@ export async function handleGenerateText( const options = processAgentOptions(body, signal); const result = await agent.generateText(input, options); + const traffic = extractTrafficMetadata(result); // Convert usage format if present const usage = result.usage ? convertUsage(result.usage) : undefined; @@ -102,9 +162,11 @@ export async function handleGenerateText( } })(), }, + traffic, }; } catch (error) { logger.error("Failed to generate text", { error }); + const traffic = extractTrafficMetadata(error); if (error instanceof ClientHTTPError) { return { success: false, @@ -112,11 +174,13 @@ export async function handleGenerateText( code: error.code, name: error.name, httpStatus: error.httpStatus, + traffic, }; } return { success: false, error: error instanceof Error ? 
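          // Non-Error throwables surface as a generic message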
error.message : "Unknown error", + traffic, }; } } @@ -153,6 +217,7 @@ export async function handleStreamText( const options = processAgentOptions(body, signal); const result = await agent.streamText(input, options); + const traffic = extractTrafficMetadata(result); // Access the fullStream property const { fullStream } = result; @@ -178,7 +243,7 @@ export async function handleStreamText( }, }); - return new Response(stream, { + const response = new Response(stream, { status: 200, headers: { "Content-Type": "text/event-stream", @@ -186,20 +251,25 @@ export async function handleStreamText( Connection: "keep-alive", }, }); + return wrapStreamWithTraffic(response, traffic); } catch (error) { logger.error("Failed to handle stream text request", { error }); const errorMessage = error instanceof Error ? error.message : "Unknown error"; + const traffic = extractTrafficMetadata(error); + const trafficHeaders = buildTrafficHeaders(traffic); return new Response( safeStringify({ error: errorMessage, message: errorMessage, + traffic, }), { status: 500, headers: { "Content-Type": "application/json", + ...trafficHeaders, }, }, ); @@ -238,26 +308,32 @@ export async function handleChatStream( const options = processAgentOptions(body, signal); const result = await agent.streamText(input, options); + const traffic = extractTrafficMetadata(result); // Use the built-in toUIMessageStreamResponse - it handles errors properly - return result.toUIMessageStreamResponse({ + const response = result.toUIMessageStreamResponse({ sendReasoning: true, sendSources: true, }); + return wrapStreamWithTraffic(response, traffic); } catch (error) { logger.error("Failed to handle chat stream request", { error }); const errorMessage = error instanceof Error ? error.message : "Unknown error"; + const traffic = extractTrafficMetadata(error); + const trafficHeaders = buildTrafficHeaders(traffic); return new Response( safeStringify({ error: errorMessage, message: errorMessage, + traffic, }), { status: 500, headers: { "Content-Type": "application/json", + ...trafficHeaders, }, }, ); @@ -293,16 +369,20 @@ export async function handleGenerateObject( ) as any; const result = await agent.generateObject(input, zodSchema, options); + const traffic = extractTrafficMetadata(result); return { success: true, data: result.object, + traffic, }; } catch (error) { logger.error("Failed to generate object", { error }); + const traffic = extractTrafficMetadata(error); return { success: false, error: error instanceof Error ? error.message : "Unknown error", + traffic, }; } } @@ -344,23 +424,29 @@ export async function handleStreamObject( ) as any; const result = await agent.streamObject(input, zodSchema, options); + const traffic = extractTrafficMetadata(result); // Use the built-in toTextStreamResponse - it handles errors properly - return result.toTextStreamResponse(); + const response = result.toTextStreamResponse(); + return wrapStreamWithTraffic(response, traffic); } catch (error) { logger.error("Failed to handle stream object request", { error }); const errorMessage = error instanceof Error ? 
error.message : "Unknown error"; + const traffic = extractTrafficMetadata(error); + const trafficHeaders = buildTrafficHeaders(traffic); return new Response( safeStringify({ error: errorMessage, message: errorMessage, + traffic, }), { status: 500, headers: { "Content-Type": "application/json", + ...trafficHeaders, }, }, ); diff --git a/packages/server-core/src/index.ts b/packages/server-core/src/index.ts index 1fe7e206a..2f7ed826a 100644 --- a/packages/server-core/src/index.ts +++ b/packages/server-core/src/index.ts @@ -40,6 +40,7 @@ export * from "./utils/server-utils"; export * from "./utils/ui-templates"; export * from "./utils/response-mappers"; export * from "./utils/sse"; +export * from "./utils/traffic"; export * from "./utils/announcements"; // Export WebSocket utilities diff --git a/packages/server-core/src/types/responses.ts b/packages/server-core/src/types/responses.ts index 2098c2f64..4935a535b 100644 --- a/packages/server-core/src/types/responses.ts +++ b/packages/server-core/src/types/responses.ts @@ -1,10 +1,12 @@ /** * Framework-agnostic response types for server handlers */ +import type { TrafficResponseMetadata } from "@voltagent/core"; export interface SuccessResponse { success: true; data: T; + traffic?: TrafficResponseMetadata; } export interface ErrorResponse { @@ -13,6 +15,7 @@ export interface ErrorResponse { httpStatus?: number; code?: string; name?: string; + traffic?: TrafficResponseMetadata; } export type ApiResponse = SuccessResponse | ErrorResponse; diff --git a/packages/server-core/src/utils/traffic.ts b/packages/server-core/src/utils/traffic.ts new file mode 100644 index 000000000..f9be1845a --- /dev/null +++ b/packages/server-core/src/utils/traffic.ts @@ -0,0 +1,35 @@ +import type { TrafficResponseMetadata } from "@voltagent/core"; + +export function buildTrafficHeaders(traffic?: TrafficResponseMetadata): Record { + if (!traffic) return {}; + + const headers: Record = {}; + + if (typeof traffic.retryAfterMs === "number" && Number.isFinite(traffic.retryAfterMs)) { + headers["Retry-After"] = String(Math.max(0, Math.ceil(traffic.retryAfterMs / 1000))); + } + + if (traffic.rateLimitRemaining !== undefined) { + headers["X-RateLimit-Remaining"] = String(traffic.rateLimitRemaining); + } + + if (typeof traffic.rateLimitResetAt === "number" && Number.isFinite(traffic.rateLimitResetAt)) { + headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(traffic.rateLimitResetAt / 1000))); + } else if ( + typeof traffic.rateLimitResetInMs === "number" && + Number.isFinite(traffic.rateLimitResetInMs) + ) { + const resetAt = Date.now() + Math.max(0, traffic.rateLimitResetInMs); + headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(resetAt / 1000))); + } + + if (traffic.queueEtaMs !== undefined) { + headers["X-Queue-ETA"] = String(traffic.queueEtaMs); + } + + if (traffic.rateLimitKey) { + headers["X-RateLimit-Key"] = traffic.rateLimitKey; + } + + return headers; +} diff --git a/packages/server-hono/src/routes/index.ts b/packages/server-hono/src/routes/index.ts index a5af82146..336a5bf47 100644 --- a/packages/server-hono/src/routes/index.ts +++ b/packages/server-hono/src/routes/index.ts @@ -2,6 +2,7 @@ import type { ServerProviderDeps } from "@voltagent/core"; import type { Logger } from "@voltagent/internal"; import { UPDATE_ROUTES, + buildTrafficHeaders, handleCancelWorkflow, handleChatStream, handleCheckUpdates, @@ -87,11 +88,12 @@ export function registerAgentRoutes( const signal = c.req.raw.signal; const response = await handleGenerateText(agentId, body, deps, 
logger, signal); + const trafficHeaders = buildTrafficHeaders(response.traffic); if (!response.success) { const { httpStatus, ...details } = response; - return c.json(details, httpStatus || 500); + return c.json(details, httpStatus || 500, trafficHeaders); } - return c.json(response, 200); + return c.json(response, 200, trafficHeaders); }); // POST /agents/:id/stream - Stream text (raw fullStream SSE) @@ -131,11 +133,12 @@ export function registerAgentRoutes( const body = await c.req.json(); const signal = c.req.raw.signal; const response = await handleGenerateObject(agentId, body, deps, logger, signal); + const trafficHeaders = buildTrafficHeaders(response.traffic); if (!response.success) { const { httpStatus, ...details } = response; - return c.json(details, httpStatus || 500); + return c.json(details, httpStatus || 500, trafficHeaders); } - return c.json(response, 200); + return c.json(response, 200, trafficHeaders); }); // POST /agents/:id/stream-object - Stream object diff --git a/packages/serverless-hono/src/routes.ts b/packages/serverless-hono/src/routes.ts index d377ce4b3..39eabcf76 100644 --- a/packages/serverless-hono/src/routes.ts +++ b/packages/serverless-hono/src/routes.ts @@ -28,6 +28,7 @@ import { type TriggerHttpRequestContext, UPDATE_ROUTES, WORKFLOW_ROUTES, + buildTrafficHeaders, executeA2ARequest, executeTriggerHandler, getConversationMessagesHandler, @@ -165,7 +166,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger: } const signal = c.req.raw.signal; const response = await handleGenerateText(agentId, body, deps, logger, signal); - return c.json(response, response.success ? 200 : 500); + const trafficHeaders = buildTrafficHeaders(response.traffic); + return c.json(response, response.success ? 200 : 500, trafficHeaders); }); app.post(AGENT_ROUTES.streamText.path, async (c) => { @@ -197,7 +199,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger: } const signal = c.req.raw.signal; const response = await handleGenerateObject(agentId, body, deps, logger, signal); - return c.json(response, response.success ? 200 : 500); + const trafficHeaders = buildTrafficHeaders(response.traffic); + return c.json(response, response.success ? 
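+      // Traffic headers (Retry-After, X-RateLimit-*) attach on success and failure alike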
200 : 500, trafficHeaders); }); app.post(AGENT_ROUTES.streamObject.path, async (c) => { From def0fa52e61e22c2b5d9838a9f2e954ea355bf2c Mon Sep 17 00:00:00 2001 From: riturajFi Date: Thu, 25 Dec 2025 15:10:57 +0530 Subject: [PATCH 25/41] fix: token caps --- diff.txt | 8093 ++++++++++++++--- .../src/traffic/traffic-controller.spec.ts | 40 + .../core/src/traffic/traffic-controller.ts | 8 +- 3 files changed, 6882 insertions(+), 1259 deletions(-) diff --git a/diff.txt b/diff.txt index 8e81b0333..d84dc9926 100644 --- a/diff.txt +++ b/diff.txt @@ -11,6 +11,9 @@ index 00000000..73fd43c5 +66d74dd2 +53f34370 \ No newline at end of file +diff --git a/diff.txt b/diff.txt +new file mode 100644 +index 00000000..e69de29b diff --git a/examples/with-client-side-tools/next-env.d.ts b/examples/with-client-side-tools/next-env.d.ts index 1b3be084..9edff1c7 100644 --- a/examples/with-client-side-tools/next-env.d.ts @@ -136,23 +139,41 @@ index 00000000..d1c5bf43 + }; + }, +}); +diff --git a/package.json b/package.json +index 7c80f7c5..7e3ef8ba 100644 +--- a/package.json ++++ b/package.json +@@ -32,9 +32,10 @@ + "publint": "^0.3.8", + "rimraf": "^5.0.5", + "syncpack": "^13.0.2", ++ "ts-node": "^10.9.2", + "tslib": "^2.3.0", + "tsup": "^8.5.0", +- "typescript": "^5.8.2", ++ "typescript": "^5.9.2", + "vite": "^7.2.7", + "vitest": "^3.2.4" + }, diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts -index 95a6a413..2486335c 100644 +index 291bdf7f..04e86bb9 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts -@@ -48,6 +48,11 @@ import type { BaseRetriever } from "../retriever/retriever"; +@@ -48,6 +48,13 @@ import type { BaseRetriever } from "../retriever/retriever"; import type { Tool, Toolkit } from "../tool"; import { createTool } from "../tool"; import { ToolManager } from "../tool/manager"; +import { ++ type FallbackChainEntry, + type TrafficPriority, + type TrafficRequestMetadata, + getTrafficController, +} from "../traffic/traffic-controller"; ++import { findHeaders } from "../traffic/traffic-error-utils"; import { randomUUID } from "../utils/id"; import { convertModelMessagesToUIMessages } from "../utils/message-converter"; import { NodeType, createNodeId } from "../utils/node-utils"; -@@ -262,8 +267,14 @@ export interface BaseGenerationOptions extends Partial { +@@ -262,8 +269,26 @@ export interface BaseGenerationOptions extends Partial { // Context userId?: string; conversationId?: string; @@ -164,10 +185,22 @@ index 95a6a413..2486335c 100644 + * Defaults to agent-level priority when omitted. + */ + trafficPriority?: TrafficPriority; ++ /** ++ * Optional maximum time to wait in the queue before timing out. ++ */ ++ maxQueueWaitMs?: number; ++ /** ++ * Optional task classification for circuit-breaker fallback policies. ++ */ ++ taskType?: string; ++ /** ++ * Optional explicit fallback policy id. 
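++     * @example
++     * // route this call through a specific configured policy (names here are illustrative)
++     * await agent.generateText(input, { fallbackPolicyId: "cheap-first" });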
++ */ ++ fallbackPolicyId?: string; // Parent tracking parentAgentId?: string; -@@ -303,6 +314,8 @@ export interface BaseGenerationOptions extends Partial { +@@ -303,6 +328,8 @@ export interface BaseGenerationOptions extends Partial { // Provider-specific options providerOptions?: ProviderOptions; @@ -176,7 +209,7 @@ index 95a6a413..2486335c 100644 // Experimental output (for structured generation) experimental_output?: ReturnType | ReturnType; -@@ -347,6 +360,7 @@ export class Agent { +@@ -347,6 +374,7 @@ export class Agent { readonly voice?: Voice; readonly retriever?: BaseRetriever; readonly supervisorConfig?: SupervisorConfig; @@ -184,7 +217,7 @@ index 95a6a413..2486335c 100644 private readonly context?: Map; private readonly logger: Logger; -@@ -372,6 +386,7 @@ export class Agent { +@@ -372,6 +400,7 @@ export class Agent { this.temperature = options.temperature; this.maxOutputTokens = options.maxOutputTokens; this.maxSteps = options.maxSteps || 5; @@ -192,22 +225,34 @@ index 95a6a413..2486335c 100644 this.stopWhen = options.stopWhen; this.markdown = options.markdown ?? false; this.voice = options.voice; -@@ -444,6 +459,26 @@ export class Agent { +@@ -444,6 +473,38 @@ export class Agent { async generateText( input: string | UIMessage[] | BaseMessage[], options?: GenerateTextOptions, + ): Promise { + const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics + const tenantId = this.resolveTenantId(options); -+ const buildRequest = (modelOverride?: LanguageModel) => ({ -+ tenantId, -+ metadata: this.buildTrafficMetadata(modelOverride ?? options?.model, options), // Pass model/provider info for future rate limiting keys -+ execute: () => -+ this.executeGenerateText(input, this.mergeOptionsWithModel(options, modelOverride)), // Defer actual execution so controller can schedule it -+ extractUsage: (result: GenerateTextResultWithContext) => -+ this.extractUsageFromResponse(result), -+ createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), -+ }); ++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { ++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); ++ const metadata = this.buildTrafficMetadata( ++ mergedOptions?.model, ++ mergedOptions, ++ providerOverride, ++ ); // Compute once per queued request (including per-call model overrides) ++ return { ++ tenantId, ++ metadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ execute: () => this.executeGenerateText(input, mergedOptions, metadata), // Defer actual execution so controller can schedule it ++ extractUsage: (result: GenerateTextResultWithContext) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackTarget) => { ++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = ++ this.resolveFallbackTarget(fallbackTarget); ++ return buildRequest(fallbackModel, fallbackProvider); ++ }, ++ }; ++ }; + + return controller.handleText(buildRequest(options?.model)); + } @@ -219,7 +264,7 @@ index 95a6a413..2486335c 100644 ): Promise { const startTime = Date.now(); const oc = this.createOperationContext(input, options); -@@ -471,7 +506,7 @@ export class Agent { +@@ -471,7 +532,7 @@ export class Agent { options, ); @@ -228,18 +273,26 @@ index 95a6a413..2486335c 100644 const contextLimit = options?.contextLimit; // Add model attributes and all options -@@ -546,8 +581,10 @@ export class Agent { +@@ -544,10 +605,18 @@ export class Agent { + 
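+      // Traffic options are split out below so they do not leak into aiSDKOptions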
hooks, + maxSteps: userMaxSteps, tools: userTools, ++ maxQueueWaitMs, ++ taskType, ++ fallbackPolicyId, experimental_output, providerOptions, + model: _model, // Exclude model so aiSDKOptions doesn't override resolved model ...aiSDKOptions } = options || {}; + void _model; ++ void maxQueueWaitMs; ++ void taskType; ++ void fallbackPolicyId; const llmSpan = this.createLLMSpan(oc, { operation: "generateText", -@@ -567,6 +604,11 @@ export class Agent { +@@ -567,6 +636,11 @@ export class Agent { let result!: GenerateTextResult; try { @@ -251,7 +304,7 @@ index 95a6a413..2486335c 100644 result = await oc.traceContext.withSpan(llmSpan, () => generateText({ model, -@@ -575,7 +617,7 @@ export class Agent { +@@ -575,7 +649,7 @@ export class Agent { // Default values temperature: this.temperature, maxOutputTokens: this.maxOutputTokens, @@ -260,7 +313,7 @@ index 95a6a413..2486335c 100644 stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps), // User overrides from AI SDK options ...aiSDKOptions, -@@ -588,6 +630,13 @@ export class Agent { +@@ -588,7 +662,15 @@ export class Agent { onStepFinish: this.createStepHandler(oc, options), }), ); @@ -272,23 +325,38 @@ index 95a6a413..2486335c 100644 + }); + this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger); } catch (error) { ++ this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger); finalizeLLMSpan(SpanStatusCode.ERROR, { message: (error as Error).message }); throw error; -@@ -771,6 +820,25 @@ export class Agent { + } +@@ -771,6 +853,38 @@ export class Agent { async streamText( input: string | UIMessage[] | BaseMessage[], options?: StreamTextOptions, + ): Promise { + const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent + const tenantId = this.resolveTenantId(options); -+ const buildRequest = (modelOverride?: LanguageModel) => ({ -+ tenantId, -+ metadata: this.buildTrafficMetadata(modelOverride ?? 
options?.model, options), // Include identifiers to support per-provider/model policies later -+ execute: () => -+ this.executeStreamText(input, this.mergeOptionsWithModel(options, modelOverride)), // Actual streaming work happens after the controller dequeues us -+ extractUsage: (result: StreamTextResultWithContext) => this.extractUsageFromResponse(result), -+ createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), -+ }); ++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { ++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); ++ const metadata = this.buildTrafficMetadata( ++ mergedOptions?.model, ++ mergedOptions, ++ providerOverride, ++ ); // Compute once per queued request (including per-call model overrides) ++ return { ++ tenantId, ++ metadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ execute: () => this.executeStreamText(input, mergedOptions, metadata), // Actual streaming work happens after the controller dequeues us ++ extractUsage: (result: StreamTextResultWithContext) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackTarget) => { ++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = ++ this.resolveFallbackTarget(fallbackTarget); ++ return buildRequest(fallbackModel, fallbackProvider); ++ }, ++ }; ++ }; + + return controller.handleStream(buildRequest(options?.model)); + } @@ -300,7 +368,7 @@ index 95a6a413..2486335c 100644 ): Promise { const startTime = Date.now(); const oc = this.createOperationContext(input, options); -@@ -800,7 +868,7 @@ export class Agent { +@@ -800,7 +914,7 @@ export class Agent { options, ); @@ -309,20 +377,30 @@ index 95a6a413..2486335c 100644 const contextLimit = options?.contextLimit; // Add model attributes to root span if TraceContext exists -@@ -870,8 +938,10 @@ export class Agent { +@@ -868,10 +982,18 @@ export class Agent { + maxSteps: userMaxSteps, + tools: userTools, onFinish: userOnFinish, ++ maxQueueWaitMs, ++ taskType, ++ fallbackPolicyId, experimental_output, providerOptions, + model: _model, // Exclude model from aiSDKOptions to avoid overriding resolved model ...aiSDKOptions } = options || {}; + void _model; ++ void maxQueueWaitMs; ++ void taskType; ++ void fallbackPolicyId; const guardrailStreamingEnabled = guardrailSet.output.length > 0; -@@ -894,6 +964,11 @@ export class Agent { +@@ -893,7 +1015,13 @@ export class Agent { + }, }); const finalizeLLMSpan = this.createLLMSpanFinalizer(llmSpan); ++ const trafficController = getTrafficController({ logger: this.logger }); + methodLogger.info("[AI SDK] Calling streamText", { + messageCount: messages.length, @@ -332,7 +410,7 @@ index 95a6a413..2486335c 100644 const result = streamText({ model, messages, -@@ -901,7 +976,7 @@ export class Agent { +@@ -901,7 +1029,7 @@ export class Agent { // Default values temperature: this.temperature, maxOutputTokens: this.maxOutputTokens, @@ -341,7 +419,16 @@ index 95a6a413..2486335c 100644 stopWhen: options?.stopWhen ?? this.stopWhen ?? 
stepCountIs(maxSteps), // User overrides from AI SDK options ...aiSDKOptions, -@@ -962,6 +1037,17 @@ export class Agent { +@@ -937,6 +1065,8 @@ export class Agent { + modelName: this.getModelName(), + }); + ++ this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger); ++ trafficController.reportStreamFailure(trafficMetadata, actualError); + finalizeLLMSpan(SpanStatusCode.ERROR, { message: (actualError as Error)?.message }); + + // History update removed - using OpenTelemetry only +@@ -962,6 +1092,18 @@ export class Agent { .catch(() => {}); }, onFinish: async (finalResult) => { @@ -356,29 +443,38 @@ index 95a6a413..2486335c 100644 + trafficMetadata, + methodLogger, + ); ++ trafficController.reportStreamSuccess(trafficMetadata); const providerUsage = finalResult.usage ? await Promise.resolve(finalResult.usage) : undefined; -@@ -1428,6 +1514,30 @@ export class Agent { +@@ -1428,6 +1570,39 @@ export class Agent { input: string | UIMessage[] | BaseMessage[], schema: T, options?: GenerateObjectOptions, + ): Promise>> { + const controller = getTrafficController({ logger: this.logger }); + const tenantId = this.resolveTenantId(options); -+ const buildRequest = (modelOverride?: LanguageModel) => ({ -+ tenantId, -+ metadata: this.buildTrafficMetadata(modelOverride ?? options?.model, options), -+ execute: () => -+ this.executeGenerateObject( -+ input, -+ schema, -+ this.mergeOptionsWithModel(options, modelOverride), -+ ), -+ extractUsage: (result: GenerateObjectResultWithContext>) => -+ this.extractUsageFromResponse(result), -+ createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), -+ }); ++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { ++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); ++ const metadata = this.buildTrafficMetadata( ++ mergedOptions?.model, ++ mergedOptions, ++ providerOverride, ++ ); // Compute once per queued request (including per-call model overrides) ++ return { ++ tenantId, ++ metadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ execute: () => this.executeGenerateObject(input, schema, mergedOptions, metadata), ++ extractUsage: (result: GenerateObjectResultWithContext>) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackTarget) => { ++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = ++ this.resolveFallbackTarget(fallbackTarget); ++ return buildRequest(fallbackModel, fallbackProvider); ++ }, ++ }; ++ }; + + return controller.handleText(buildRequest(options?.model)); + } @@ -387,10 +483,11 @@ index 95a6a413..2486335c 100644 + input: string | UIMessage[] | BaseMessage[], + schema: T, + options?: GenerateObjectOptions, ++ trafficMetadata?: TrafficRequestMetadata, ): Promise>> { const startTime = Date.now(); const oc = this.createOperationContext(input, options); -@@ -1452,7 +1562,7 @@ export class Agent { +@@ -1452,7 +1627,7 @@ export class Agent { options, ); @@ -399,14 +496,21 @@ index 95a6a413..2486335c 100644 const schemaName = schema.description || "unknown"; // Add model attributes and all options -@@ -1511,9 +1621,16 @@ export class Agent { +@@ -1510,10 +1685,23 @@ export class Agent { + hooks, maxSteps: userMaxSteps, tools: userTools, ++ taskType, ++ fallbackPolicyId, ++ maxQueueWaitMs, providerOptions, + model: _model, // Exclude model so spread does not override resolved model ...aiSDKOptions } = options || {}; + void _model; ++ void taskType; ++ void fallbackPolicyId; ++ void maxQueueWaitMs; + 
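+      // Record the outbound request shape before the provider call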
methodLogger.info("[AI SDK] Calling generateObject", { + messageCount: messages.length, @@ -416,7 +520,7 @@ index 95a6a413..2486335c 100644 const result = await generateObject({ model, messages, -@@ -1522,7 +1639,7 @@ export class Agent { +@@ -1522,7 +1710,7 @@ export class Agent { // Default values maxOutputTokens: this.maxOutputTokens, temperature: this.temperature, @@ -425,7 +529,7 @@ index 95a6a413..2486335c 100644 // User overrides from AI SDK options ...aiSDKOptions, // Provider-specific options -@@ -1530,6 +1647,12 @@ export class Agent { +@@ -1530,6 +1718,13 @@ export class Agent { // VoltAgent controlled abortSignal: oc.abortController.signal, }); @@ -435,25 +539,46 @@ index 95a6a413..2486335c 100644 + warnings: result.warnings, + rawResult: safeStringify(result), + }); ++ this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger); const usageInfo = convertUsage(result.usage); const finalObject = await executeOutputGuardrails({ -@@ -1655,6 +1778,26 @@ export class Agent { +@@ -1638,6 +1833,7 @@ export class Agent { + context: oc.context, + }; + } catch (error) { ++ this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger); + await this.flushPendingMessagesOnError(oc).catch(() => {}); + return this.handleError(error as Error, oc, options, startTime); + } finally { +@@ -1655,6 +1851,39 @@ export class Agent { input: string | UIMessage[] | BaseMessage[], schema: T, options?: StreamObjectOptions, + ): Promise>> { + const controller = getTrafficController({ logger: this.logger }); + const tenantId = this.resolveTenantId(options); -+ const buildRequest = (modelOverride?: LanguageModel) => ({ -+ tenantId, -+ metadata: this.buildTrafficMetadata(modelOverride ?? options?.model, options), -+ execute: () => -+ this.executeStreamObject(input, schema, this.mergeOptionsWithModel(options, modelOverride)), -+ extractUsage: (result: StreamObjectResultWithContext>) => -+ this.extractUsageFromResponse(result), -+ createFallbackRequest: (fallbackModel: string) => buildRequest(fallbackModel), -+ }); ++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { ++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); ++ const metadata = this.buildTrafficMetadata( ++ mergedOptions?.model, ++ mergedOptions, ++ providerOverride, ++ ); // Compute once per queued request (including per-call model overrides) ++ return { ++ tenantId, ++ metadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ execute: () => this.executeStreamObject(input, schema, mergedOptions, metadata), ++ extractUsage: (result: StreamObjectResultWithContext>) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackTarget) => { ++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = ++ this.resolveFallbackTarget(fallbackTarget); ++ return buildRequest(fallbackModel, fallbackProvider); ++ }, ++ }; ++ }; + + return controller.handleStream(buildRequest(options?.model)); + } @@ -462,10 +587,11 @@ index 95a6a413..2486335c 100644 + input: string | UIMessage[] | BaseMessage[], + schema: T, + options?: StreamObjectOptions, ++ trafficMetadata?: TrafficRequestMetadata, ): Promise>> { const startTime = Date.now(); const oc = this.createOperationContext(input, options); -@@ -1680,7 +1823,7 @@ export class Agent { +@@ -1680,7 +1909,7 @@ export class Agent { options, ); @@ -474,18 +600,26 @@ index 95a6a413..2486335c 100644 const schemaName = schema.description || "unknown"; // Add model attributes and all options 
-@@ -1740,13 +1883,20 @@ export class Agent { +@@ -1739,14 +1968,28 @@ export class Agent { + maxSteps: userMaxSteps, tools: userTools, onFinish: userOnFinish, ++ taskType, ++ fallbackPolicyId, ++ maxQueueWaitMs, providerOptions, + model: _model, // Exclude model so aiSDKOptions cannot override resolved model ...aiSDKOptions } = options || {}; + void _model; ++ void taskType; ++ void fallbackPolicyId; ++ void maxQueueWaitMs; let guardrailObjectPromise!: Promise>; let resolveGuardrailObject: ((value: z.infer) => void) | undefined; let rejectGuardrailObject: ((reason: unknown) => void) | undefined; ++ const trafficController = getTrafficController({ logger: this.logger }); + methodLogger.info("[AI SDK] Calling streamObject", { + messageCount: messages.length, @@ -495,7 +629,7 @@ index 95a6a413..2486335c 100644 const result = streamObject({ model, messages, -@@ -1755,7 +1905,7 @@ export class Agent { +@@ -1755,7 +1998,7 @@ export class Agent { // Default values maxOutputTokens: this.maxOutputTokens, temperature: this.temperature, @@ -504,7 +638,7 @@ index 95a6a413..2486335c 100644 // User overrides from AI SDK options ...aiSDKOptions, // Provider-specific options -@@ -1771,7 +1921,7 @@ export class Agent { +@@ -1771,9 +2014,11 @@ export class Agent { methodLogger.error("Stream object error occurred", { error: actualError, agentName: this.name, @@ -512,8 +646,12 @@ index 95a6a413..2486335c 100644 + modelName: this.getModelName(model), schemaName: schemaName, }); ++ this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger); ++ trafficController.reportStreamFailure(trafficMetadata, actualError); -@@ -1800,6 +1950,11 @@ export class Agent { + // History update removed - using OpenTelemetry only + +@@ -1800,6 +2045,17 @@ export class Agent { }, onFinish: async (finalResult: any) => { try { @@ -522,10 +660,16 @@ index 95a6a413..2486335c 100644 + usage: finalResult.usage ? safeStringify(finalResult.usage) : undefined, + rawResult: safeStringify(finalResult), + }); ++ this.updateTrafficControllerRateLimits( ++ finalResult.response, ++ trafficMetadata, ++ methodLogger, ++ ); ++ trafficController.reportStreamSuccess(trafficMetadata); const usageInfo = convertUsage(finalResult.usage as any); let finalObject = finalResult.object as z.infer; if (guardrailSet.output.length > 0) { -@@ -2021,8 +2176,9 @@ export class Agent { +@@ -2021,8 +2277,9 @@ export class Agent { // Calculate maxSteps (use provided option or calculate based on subagents) const maxSteps = options?.maxSteps ?? 
this.calculateMaxSteps(); @@ -537,7 +681,7 @@ index 95a6a413..2486335c 100644 const dynamicToolList = (await this.resolveValue(this.dynamicTools, oc)) || []; // Merge agent tools with option tools -@@ -2073,6 +2229,8 @@ export class Agent { +@@ -2073,6 +2330,8 @@ export class Agent { ): OperationContext { const operationId = randomUUID(); const startTimeDate = new Date(); @@ -546,7 +690,7 @@ index 95a6a413..2486335c 100644 // Prefer reusing an existing context instance to preserve reference across calls/subagents const runtimeContext = toContextMap(options?.context); -@@ -2123,6 +2281,7 @@ export class Agent { +@@ -2123,6 +2382,7 @@ export class Agent { operationId, userId: options?.userId, conversationId: options?.conversationId, @@ -554,7 +698,7 @@ index 95a6a413..2486335c 100644 executionId: operationId, }); -@@ -2137,6 +2296,9 @@ export class Agent { +@@ -2137,6 +2397,9 @@ export class Agent { parentAgentId: options?.parentAgentId, input, }); @@ -564,7 +708,7 @@ index 95a6a413..2486335c 100644 traceContext.getRootSpan().setAttribute("voltagent.operation_id", operationId); // Use parent's AbortController if available, otherwise create new one -@@ -2174,8 +2336,10 @@ export class Agent { +@@ -2174,8 +2437,10 @@ export class Agent { logger, conversationSteps: options?.parentOperationContext?.conversationSteps || [], abortController, @@ -575,7 +719,7 @@ index 95a6a413..2486335c 100644 parentAgentId: options?.parentAgentId, traceContext, startTime: startTimeDate, -@@ -3147,6 +3311,20 @@ export class Agent { +@@ -3170,6 +3435,20 @@ export class Agent { return value; } @@ -596,7 +740,7 @@ index 95a6a413..2486335c 100644 /** * Prepare tools with execution context */ -@@ -3799,17 +3977,159 @@ export class Agent { +@@ -3822,17 +4101,213 @@ export class Agent { return this.subAgentManager.calculateMaxSteps(this.maxSteps); } @@ -639,9 +783,13 @@ index 95a6a413..2486335c 100644 + private buildTrafficMetadata( + modelOverride?: LanguageModel | DynamicValue, + options?: BaseGenerationOptions, ++ providerOverride?: string, + ): TrafficRequestMetadata { + const provider = -+ this.resolveProvider(modelOverride) ?? this.resolveProvider(this.model) ?? undefined; ++ providerOverride ?? ++ this.resolveProvider(modelOverride) ?? ++ this.resolveProvider(this.model) ?? 
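++      // Explicit undefined marks the "provider unknown" outcome for readers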
++ undefined; + const priority = this.resolveTrafficPriority(options); + + return { @@ -651,6 +799,21 @@ index 95a6a413..2486335c 100644 + provider, // Allows per-provider throttling later + priority, + tenantId: this.resolveTenantId(options), ++ taskType: options?.taskType, ++ fallbackPolicyId: options?.fallbackPolicyId, ++ }; ++ } ++ ++ private resolveFallbackTarget(target: FallbackChainEntry): { ++ modelOverride?: LanguageModel; ++ providerOverride?: string; ++ } { ++ if (typeof target === "string") { ++ return { modelOverride: target }; ++ } ++ return { ++ modelOverride: target.model, ++ providerOverride: target.provider, + }; + } + @@ -659,35 +822,42 @@ index 95a6a413..2486335c 100644 + metadata: TrafficRequestMetadata | undefined, + logger?: Logger, + ): void { -+ if (!response || typeof response !== "object") { -+ logger?.debug?.("[Traffic] No response object available for rate limit update"); -+ return; -+ } -+ -+ const responseWithHeaders = response as { headers?: unknown } | null; -+ const headers = responseWithHeaders?.headers; -+ if (!headers) { -+ logger?.debug?.("[Traffic] Response missing headers; skipping rate limit update"); ++ const headerCandidates = findHeaders(response); ++ if (headerCandidates.length === 0) { ++ logger?.debug?.("[Traffic] No headers found for rate limit update"); + return; + } + + const controller = getTrafficController(); -+ const updateResult = controller.updateRateLimitFromHeaders( -+ metadata ?? this.buildTrafficMetadata(), -+ headers, -+ ); ++ const effectiveMetadata = metadata ?? this.buildTrafficMetadata(); ++ let updateResult: ReturnType | undefined; ++ for (const headers of headerCandidates) { ++ updateResult = controller.updateRateLimitFromHeaders(effectiveMetadata, headers); ++ if (updateResult) break; ++ } + + if (!updateResult) { + logger?.debug?.("[Traffic] No rate limit headers applied from response"); + return; + } + -+ const refillPerSecond = updateResult.normalized.refillPerMs * 1000; ++ const now = Date.now(); ++ const effectiveRemaining = Math.max( ++ 0, ++ updateResult.state.remaining - updateResult.state.reserved, ++ ); ++ const resetInMs = Math.max(0, updateResult.state.resetAt - now); ++ const nextAllowedInMs = Math.max(0, updateResult.state.nextAllowedAt - now); + logger?.info?.("[Traffic] Applied rate limit from response headers", { + rateLimitKey: updateResult.key, -+ capacity: updateResult.normalized.capacity, -+ refillPerSecond, -+ appliedTokens: updateResult.appliedTokens, ++ limit: updateResult.state.limit, ++ remaining: updateResult.state.remaining, ++ reserved: updateResult.state.reserved, ++ effectiveRemaining, ++ resetAt: updateResult.state.resetAt, ++ resetInMs, ++ nextAllowedAt: updateResult.state.nextAllowedAt, ++ nextAllowedInMs, + headers: { + limitRequests: updateResult.headerSnapshot.limitRequests, + remainingRequests: updateResult.headerSnapshot.remainingRequests, @@ -717,13 +887,41 @@ index 95a6a413..2486335c 100644 + return undefined; + } + ++ const normalizeUsage = ( ++ usage: LanguageModelUsage | undefined, ++ ): LanguageModelUsage | undefined => { ++ if (!usage) return undefined; ++ const input = Number.isFinite(usage.inputTokens) ? (usage.inputTokens as number) : undefined; ++ const output = Number.isFinite(usage.outputTokens) ++ ? (usage.outputTokens as number) ++ : undefined; ++ const total = Number.isFinite(usage.totalTokens) ? 
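++        // Number.isFinite gives no type narrowing, hence the explicit cast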
(usage.totalTokens as number) : undefined; ++ ++ if (total === undefined && input === undefined && output === undefined) { ++ return undefined; ++ } ++ ++ const safeInput = input ?? 0; ++ const safeOutput = output ?? 0; ++ const safeTotal = total ?? safeInput + safeOutput; ++ ++ return { ++ ...usage, ++ inputTokens: safeInput, ++ outputTokens: safeOutput, ++ totalTokens: safeTotal, ++ }; ++ }; ++ + if ( + typeof (usageCandidate as PromiseLike).then === "function" + ) { -+ return (usageCandidate as Promise).catch(() => undefined); ++ return (usageCandidate as Promise) ++ .then((usage) => normalizeUsage(usage)) ++ .catch(() => undefined); + } + -+ return usageCandidate as LanguageModelUsage; ++ return normalizeUsage(usageCandidate as LanguageModelUsage); + } + + private resolveProvider( @@ -826,10 +1024,10 @@ index dd5fb29d..add69edf 100644 abortController: AbortController; diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts -index 8753f039..0aef165a 100644 +index 8753f039..3850f0ac 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts -@@ -21,6 +21,19 @@ export type { +@@ -21,6 +21,29 @@ export type { WorkflowTimelineEvent, RegisteredWorkflow, } from "./workflow"; @@ -837,665 +1035,1145 @@ index 8753f039..0aef165a 100644 + // Surface traffic controller so downstream consumers can route agent calls through the shared scheduler + TrafficController, + CircuitBreakerOpenError, ++ QueueWaitTimeoutError, ++ RateLimitedUpstreamError, + getTrafficController, ++ type FallbackChainEntry, ++ type FallbackPolicy, ++ type FallbackPolicyConfig, ++ type FallbackPolicyMode, ++ type FallbackTarget, + type RateLimitConfig, + type RateLimitKey, + type RateLimitOptions, ++ type AdaptiveLimiterConfig, ++ type PriorityBurstLimits, + type TrafficRequest, + type TrafficRequestMetadata, ++ type TrafficResponseMetadata, + type TrafficPriority, + type TrafficRequestType, +} from "./traffic/traffic-controller"; // Export new Agent from agent.ts export { Agent, -diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts +diff --git a/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts new file mode 100644 -index 00000000..9b89d4b8 +index 00000000..652b7e59 --- /dev/null -+++ b/packages/core/src/traffic/traffic-controller.spec.ts -@@ -0,0 +1,87 @@ -+import { describe, expect, it, vi } from "vitest"; -+import { TrafficController } from "./traffic-controller"; ++++ b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts +@@ -0,0 +1,243 @@ ++import type { Logger } from "../../logger"; ++import { ++ RATE_LIMIT_EXHAUSTION_BUFFER, ++ RATE_LIMIT_MIN_PACE_INTERVAL_MS, ++ RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS, ++ RATE_LIMIT_PROBE_DELAY_MS, ++} from "../traffic-constants"; ++import type { ++ DispatchDecision, ++ QueuedRequest, ++ RateLimitWindowState, ++} from "../traffic-controller-internal"; ++import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils"; ++import type { TrafficRequestMetadata } from "../traffic-types"; ++import type { ++ RateLimitHeaderSnapshot, ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++} from "./rate-limit-strategy"; ++import { parseResetDurationToMs } from "./rate-limit-utils"; ++ ++export class DefaultRateLimitStrategy implements RateLimitStrategy { ++ private state?: RateLimitWindowState; ++ private readonly key: string; ++ ++ constructor(key: string) 
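++  // One instance per rate-limit key; the key also tags every log entry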
{ ++ this.key = key; ++ } + -+describe("TrafficController priority scheduling", () => { -+ it("prioritizes P0 over lower priorities when runnable", async () => { -+ const controller = new TrafficController({ maxConcurrent: 1 }); -+ const order: string[] = []; ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const state = this.state; ++ if (!state) { ++ rateLimitLogger?.trace?.("Rate limit state missing; allow request", { ++ rateLimitKey: this.key, ++ }); ++ return null; ++ } + -+ const p1 = controller.handleText({ -+ metadata: { provider: "p", model: "m1", priority: "P1" }, -+ execute: async () => { -+ order.push("P1"); -+ return "P1"; -+ }, -+ }); ++ const now = Date.now(); ++ const effectiveRemaining = Math.max(0, state.remaining - state.reserved); ++ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; ++ ++ if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { ++ if (now < probeAt) { ++ rateLimitLogger?.debug?.("Rate limit exhausted; waiting for probe", { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ reserved: state.reserved, ++ effectiveRemaining, ++ resetAt: state.resetAt, ++ probeAt, ++ }); ++ return { kind: "wait", wakeUpAt: probeAt }; ++ } ++ if (state.reserved > 0) { ++ rateLimitLogger?.debug?.("Rate limit exhausted but in-flight reservations exist; waiting", { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ reserved: state.reserved, ++ effectiveRemaining, ++ resetAt: state.resetAt, ++ }); ++ return { kind: "wait" }; ++ } ++ } + -+ const p2 = controller.handleText({ -+ metadata: { provider: "p", model: "m2", priority: "P2" }, -+ execute: async () => { -+ order.push("P2"); -+ return "P2"; -+ }, -+ }); ++ if (now < state.nextAllowedAt) { ++ rateLimitLogger?.debug?.("Rate limit pacing; waiting until nextAllowedAt", { ++ rateLimitKey: this.key, ++ nextAllowedAt: state.nextAllowedAt, ++ resetAt: state.resetAt, ++ waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now, ++ }); ++ return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; ++ } + -+ const p0 = controller.handleText({ -+ metadata: { provider: "p", model: "m0", priority: "P0" }, -+ execute: async () => { -+ order.push("P0"); -+ return "P0"; -+ }, ++ state.reserved += 1; ++ next.rateLimitKey = this.key; ++ rateLimitLogger?.trace?.("Reserved rate limit token", { ++ rateLimitKey: this.key, ++ reserved: state.reserved, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ nextAllowedAt: state.nextAllowedAt, + }); + -+ await Promise.all([p0, p1, p2]); ++ const remainingWindowMs = Math.max(0, state.resetAt - now); ++ const intervalMs = Math.max( ++ RATE_LIMIT_MIN_PACE_INTERVAL_MS, ++ Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), ++ ); + -+ expect(order[0]).toBe("P0"); -+ expect(order).toEqual(["P0", "P1", "P2"]); -+ }); ++ const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); ++ if ( ++ state.nextAllowedAt <= now || ++ candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS ++ ) { ++ state.nextAllowedAt = candidateNext; ++ rateLimitLogger?.trace?.("Updated pacing nextAllowedAt", { ++ rateLimitKey: this.key, ++ nextAllowedAt: state.nextAllowedAt, ++ intervalMs, ++ remainingWindowMs, ++ effectiveRemaining, ++ }); ++ } + -+ it("allows lower priorities to proceed when a higher priority request is rate limited", async () => { -+ vi.useFakeTimers(); ++ return null; ++ } + -+ try { -+ const controller = new 
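+      // Dedicated instance keeps queue and bucket state isolated from other tests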
TrafficController({ -+ maxConcurrent: 1, -+ rateLimits: { -+ "p0::m0": { capacity: 1, refillPerSecond: 1 }, -+ }, -+ }); ++ onDispatch(_logger?: Logger): void {} ++ ++ onComplete(logger?: Logger): void { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const state = this.state; ++ if (!state || state.reserved <= 0) return; ++ state.reserved -= 1; ++ rateLimitLogger?.trace?.("Released rate limit reservation", { ++ rateLimitKey: this.key, ++ reserved: state.reserved, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ nextAllowedAt: state.nextAllowedAt, ++ }); ++ } + -+ // Exhaust the bucket for the P0 key so it initially waits -+ const buckets = (controller as unknown as { rateLimitBuckets: Map }) -+ .rateLimitBuckets; -+ buckets.set("p0::m0", { -+ tokens: 0, -+ capacity: 1, -+ refillPerMs: 1 / 1000, -+ lastRefill: Date.now(), -+ }); ++ updateFromHeaders( ++ _metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const limitRequests = readHeaderValue(headers, "x-ratelimit-limit-requests"); ++ const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests"); ++ const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests"); ++ const retryAfter = readHeaderValue(headers, "retry-after"); ++ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined; + -+ const order: string[] = []; ++ const now = Date.now(); ++ const existing = this.state; ++ let state: RateLimitWindowState | undefined; ++ let headerSnapshot: RateLimitHeaderSnapshot | undefined; ++ ++ if (limitRequests && remainingRequests && resetRequests) { ++ const limit = Number(limitRequests); ++ const remaining = Number(remainingRequests); ++ if (!Number.isFinite(limit) || !Number.isFinite(remaining)) { ++ rateLimitLogger?.debug?.("Invalid rate limit numeric headers; skipping", { ++ rateLimitKey: this.key, ++ limitRequests, ++ remainingRequests, ++ }); ++ return undefined; ++ } + -+ const p0 = controller.handleText({ -+ metadata: { provider: "p0", model: "m0", priority: "P0" }, -+ execute: async () => { -+ order.push("P0"); -+ return "P0"; -+ }, -+ }); ++ const resetRequestsMs = parseResetDurationToMs(resetRequests); ++ if (resetRequestsMs === undefined) { ++ rateLimitLogger?.debug?.("Unable to parse reset duration; skipping", { ++ rateLimitKey: this.key, ++ resetRequests, ++ }); ++ return undefined; ++ } + -+ const p1 = controller.handleText({ -+ metadata: { provider: "p1", model: "m1", priority: "P1" }, -+ execute: async () => { -+ order.push("P1"); -+ return "P1"; -+ }, ++ const parsedResetAt = now + resetRequestsMs; ++ const isSameWindow = !!existing && now < existing.resetAt; ++ const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; ++ const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; ++ const reserved = Math.max(0, existing?.reserved ?? 0); ++ ++ state = { ++ limit, ++ remaining: isSameWindow ? 
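++        // Same window: keep the more conservative remaining count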
Math.min(existing.remaining, remaining) : remaining, ++ resetAt, ++ reserved, ++ nextAllowedAt, ++ }; ++ headerSnapshot = { ++ limitRequests, ++ remainingRequests, ++ resetRequests, ++ resetRequestsMs, ++ }; ++ } else if (retryAfterMs === undefined) { ++ rateLimitLogger?.trace?.("Missing rate limit headers; skipping", { ++ rateLimitKey: this.key, ++ hasLimit: !!limitRequests, ++ hasRemaining: !!remainingRequests, ++ hasReset: !!resetRequests, ++ hasRetryAfter: !!retryAfter, + }); ++ return undefined; ++ } + -+ await vi.runAllTimersAsync(); -+ await Promise.all([p0, p1]); -+ -+ expect(order[0]).toBe("P1"); -+ expect(order[1]).toBe("P0"); -+ } finally { -+ vi.useRealTimers(); ++ if (!state) { ++ if (retryAfterMs === undefined) { ++ rateLimitLogger?.trace?.("Retry-After missing or unparsable; skipping", { ++ rateLimitKey: this.key, ++ retryAfter, ++ }); ++ return undefined; ++ } ++ const targetAt = now + retryAfterMs; ++ const isSameWindow = !!existing && now < existing.resetAt; ++ state = { ++ limit: existing?.limit ?? 1, ++ remaining: 0, ++ resetAt: isSameWindow ? Math.max(existing.resetAt, targetAt) : targetAt, ++ reserved: Math.max(0, existing?.reserved ?? 0), ++ nextAllowedAt: Math.max(existing?.nextAllowedAt ?? now, targetAt), ++ }; ++ headerSnapshot = { retryAfter, retryAfterMs }; ++ } else if (retryAfterMs !== undefined) { ++ const targetAt = now + retryAfterMs; ++ state = { ++ ...state, ++ remaining: 0, ++ resetAt: Math.max(state.resetAt, targetAt), ++ nextAllowedAt: Math.max(state.nextAllowedAt, targetAt), ++ }; ++ headerSnapshot = { ...headerSnapshot, retryAfter, retryAfterMs }; + } -+ }); -+}); -diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts ++ ++ this.state = state; ++ rateLimitLogger?.debug?.("Applied rate limit headers to state", { ++ rateLimitKey: this.key, ++ limit: state.limit, ++ remaining: state.remaining, ++ effectiveRemaining: Math.max(0, state.remaining - state.reserved), ++ resetAt: state.resetAt, ++ nextAllowedAt: state.nextAllowedAt, ++ resetRequestsMs: headerSnapshot?.resetRequestsMs, ++ retryAfterMs: headerSnapshot?.retryAfterMs, ++ }); ++ ++ return { ++ key: this.key, ++ headerSnapshot: headerSnapshot ?? 
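++      // Never undefined, so callers can destructure the snapshot directly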
{}, ++ state, ++ }; ++ } ++} +diff --git a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts new file mode 100644 -index 00000000..8d82e8a5 +index 00000000..8e8b6f86 --- /dev/null -+++ b/packages/core/src/traffic/traffic-controller.ts -@@ -0,0 +1,1260 @@ -+import type { Logger } from "../logger"; -+import { LoggerProxy } from "../logger"; ++++ b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts +@@ -0,0 +1,265 @@ ++import type { Logger } from "../../logger"; ++import { ++ RATE_LIMIT_EXHAUSTION_BUFFER, ++ RATE_LIMIT_MIN_PACE_INTERVAL_MS, ++ RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS, ++ RATE_LIMIT_PROBE_DELAY_MS, ++} from "../traffic-constants"; ++import type { ++ DispatchDecision, ++ QueuedRequest, ++ RateLimitWindowState, ++} from "../traffic-controller-internal"; ++import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types"; ++import { DefaultRateLimitStrategy } from "./default-rate-limit-strategy"; ++import type { ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++ RateLimitUsage, ++} from "./rate-limit-strategy"; ++ ++export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { ++ readonly handlesTokenLimits: boolean; ++ private readonly window: DefaultRateLimitStrategy; ++ private readonly key: string; ++ private readonly requestsPerMinute?: number; ++ private readonly tokensPerMinute?: number; ++ private requestState?: RateLimitWindowState; ++ private tokenState?: RateLimitWindowState; ++ private bootstrapReserved = 0; ++ private readonly windowMs = 60_000; ++ ++ constructor(key: string, options?: RateLimitOptions) { ++ this.key = key; ++ this.window = new DefaultRateLimitStrategy(key); ++ // Window strategy enforces fixed 60s windows; burstSize is intentionally ignored here. 
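++    // Absent limits leave this strategy in header-driven mode via the default window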
++ this.requestsPerMinute = this.normalizeLimit(options?.requestsPerMinute); ++ this.tokensPerMinute = this.normalizeLimit(options?.tokensPerMinute); ++ this.handlesTokenLimits = this.tokensPerMinute !== undefined; ++ } + -+type Scheduler = (callback: () => void) => void; -+type BivariantHandler = { -+ bivarianceHack(...args: TArgs): void; -+}["bivarianceHack"]; -+type BivariantFunction = { -+ bivarianceHack(...args: TArgs): TReturn; -+}["bivarianceHack"]; ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ if (this.requestsPerMinute !== undefined) { ++ const requestDecision = this.resolveRequestWindow(next, logger); ++ if (requestDecision) return requestDecision; ++ const tokenDecision = this.resolveTokenWindow(logger); ++ if (tokenDecision) return tokenDecision; ++ return null; ++ } + -+type RetryReason = "rateLimit" | "serverError" | "timeout"; -+ -+const MAX_RETRY_ATTEMPTS = 3; -+const TIMEOUT_RETRY_ATTEMPTS = 2; -+const RATE_LIMIT_BASE_BACKOFF_MS = 500; -+const CIRCUIT_FAILURE_THRESHOLD = 5; -+const CIRCUIT_FAILURE_WINDOW_MS = 10_000; -+const CIRCUIT_COOLDOWN_MS = 30_000; -+const SERVER_ERROR_BASE_BACKOFF_MS = 1000; -+const TIMEOUT_BASE_BACKOFF_MS = 750; -+const RATE_LIMIT_JITTER_FACTOR = 0.35; -+const SERVER_ERROR_JITTER_FACTOR = 0.8; -+const TIMEOUT_JITTER_FACTOR = 0.5; -+const DEFAULT_FALLBACK_CHAINS: Record = { -+ "gpt-4o": ["gpt-4o-mini", "gpt-3.5"], -+}; ++ const decision = this.window.resolve(next, logger); ++ if (decision) return decision; + -+interface RateLimitBucket { -+ tokens: number; -+ capacity: number; -+ refillPerMs: number; -+ lastRefill: number; -+} ++ if (next.rateLimitKey) { ++ return null; ++ } + -+type NormalizedRateLimit = { -+ capacity: number; -+ refillPerMs: number; -+}; ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ if (this.bootstrapReserved >= 1) { ++ rateLimitLogger?.debug?.("OpenAI rate limit bootstrap active; waiting", { ++ rateLimitKey: this.key, ++ bootstrapReserved: this.bootstrapReserved, ++ }); ++ return { kind: "wait" }; ++ } + -+export interface RateLimitOptions { -+ capacity: number; -+ refillPerSecond: number; -+} ++ this.bootstrapReserved += 1; ++ next.rateLimitKey = this.key; ++ rateLimitLogger?.debug?.("OpenAI rate limit bootstrap reserved", { ++ rateLimitKey: this.key, ++ bootstrapReserved: this.bootstrapReserved, ++ }); ++ return null; ++ } + -+export type TenantUsage = { -+ inputTokens: number; -+ outputTokens: number; -+ totalTokens: number; -+}; ++ onDispatch(logger?: Logger): void { ++ if (this.requestsPerMinute === undefined) { ++ this.window.onDispatch(logger); ++ } ++ } + -+type UsageCounters = { -+ inputTokens?: number; -+ outputTokens?: number; -+ totalTokens?: number; -+}; ++ onComplete(logger?: Logger): void { ++ if (this.requestsPerMinute !== undefined) { ++ const now = Date.now(); ++ const state = this.ensureRequestState(now); ++ if (state.reserved > 0) { ++ state.reserved -= 1; ++ } ++ state.remaining = Math.max(0, state.remaining - 1); ++ return; ++ } + -+export type RateLimitKey = string; -+export type RateLimitConfig = Record; ++ if (this.bootstrapReserved > 0) { ++ this.bootstrapReserved -= 1; ++ } ++ this.window.onComplete(logger); ++ } + -+type RateLimitHeaderSnapshot = { -+ limitRequests: number; -+ remainingRequests?: number; -+ resetRequestsMs: number; -+}; ++ recordUsage(usage: RateLimitUsage, logger?: Logger): void { ++ if (this.tokensPerMinute === undefined) return; ++ const tokens = this.resolveTokenCount(usage); ++ if (tokens <= 0) return; + -+export type 
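+// Outcome of folding response headers into bucket state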
RateLimitUpdateResult = { -+ key: string; -+ headerSnapshot: RateLimitHeaderSnapshot; -+ normalized: NormalizedRateLimit; -+ appliedTokens: number; -+}; ++ const now = Date.now(); ++ const state = this.ensureTokenState(now); ++ state.remaining = Math.max(0, state.remaining - tokens); ++ logger?.child({ module: "rate-limiter" })?.trace?.("OpenAI token usage recorded", { ++ rateLimitKey: this.key, ++ tokens, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ }); ++ } + -+export type TrafficRequestType = "text" | "stream"; ++ updateFromHeaders( ++ metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined { ++ if (this.requestsPerMinute !== undefined) { ++ return undefined; ++ } ++ return this.window.updateFromHeaders(metadata, headers, logger); ++ } + -+export type TrafficPriority = "P0" | "P1" | "P2"; ++ private resolveRequestWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); ++ const state = this.ensureRequestState(now); ++ const effectiveRemaining = Math.max(0, state.remaining - state.reserved); ++ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; ++ ++ if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { ++ if (now < probeAt) { ++ rateLimitLogger?.debug?.("OpenAI request window exhausted; waiting for probe", { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ reserved: state.reserved, ++ effectiveRemaining, ++ resetAt: state.resetAt, ++ probeAt, ++ }); ++ return { kind: "wait", wakeUpAt: probeAt }; ++ } ++ if (state.reserved > 0) { ++ rateLimitLogger?.debug?.( ++ "OpenAI request window exhausted but in-flight reservations exist; waiting", ++ { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ reserved: state.reserved, ++ effectiveRemaining, ++ resetAt: state.resetAt, ++ }, ++ ); ++ return { kind: "wait" }; ++ } ++ } + -+export interface TrafficRequestMetadata { -+ agentId?: string; -+ agentName?: string; -+ model?: string; -+ provider?: string; -+ priority?: TrafficPriority; -+ tenantId?: string; -+} -+ -+export interface TrafficRequest { -+ tenantId: string; -+ metadata?: TrafficRequestMetadata; -+ execute: () => Promise; -+ createFallbackRequest?: (modelId: string) => TrafficRequest | undefined; -+ extractUsage?: BivariantFunction< -+ [response: TResponse], -+ Promise | UsageCounters | undefined -+ >; -+} -+ -+type CircuitStateStatus = "closed" | "open" | "half-open"; -+ -+interface CircuitState { -+ status: CircuitStateStatus; -+ failureTimestamps: number[]; -+ openedAt?: number; -+ trialInFlight?: boolean; -+} -+ -+interface QueuedRequest { -+ type: TrafficRequestType; -+ request: TrafficRequest; -+ resolve: BivariantHandler<[TResponse | PromiseLike]>; -+ reject: BivariantHandler<[reason?: unknown]>; -+ etaMs?: number; -+ rateLimitKey?: string; -+ attempt?: number; -+ circuitKey?: string; -+ circuitStatus?: CircuitStateStatus; -+ priority: TrafficPriority; -+ tenantId: string; -+ extractUsage?: BivariantFunction< -+ [response: TResponse], -+ Promise | UsageCounters | undefined -+ >; -+} -+ -+export interface TrafficControllerOptions { -+ maxConcurrent?: number; -+ rateLimits?: RateLimitConfig; -+ logger?: Logger; -+ fallbackChains?: Record; -+} ++ if (now < state.nextAllowedAt) { ++ rateLimitLogger?.debug?.("OpenAI request window pacing; waiting until nextAllowedAt", { ++ rateLimitKey: this.key, ++ nextAllowedAt: state.nextAllowedAt, ++ resetAt: 
state.resetAt, ++ waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now, ++ }); ++ return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; ++ } + -+type ProcessDecision = "process" | "skip" | "wait"; ++ state.reserved += 1; ++ next.rateLimitKey = this.key; ++ rateLimitLogger?.trace?.("Reserved OpenAI request window slot", { ++ rateLimitKey: this.key, ++ reserved: state.reserved, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ nextAllowedAt: state.nextAllowedAt, ++ }); + -+// Centralized traffic controller responsible for scheduling LLM calls. -+// Provides a FIFO queue with a non-blocking scheduler and entrypoints -+// for text and stream traffic. -+export class TrafficController { -+ private readonly scheduler: Scheduler; -+ private readonly maxConcurrent: number; -+ private rateLimits?: Map; -+ private readonly rateLimitBuckets = new Map(); -+ private readonly circuitBreakers = new Map(); -+ private readonly fallbackChains: Map; -+ private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"]; -+ private readonly queues: Record = { -+ P0: [], -+ P1: [], -+ P2: [], -+ }; -+ private activeCount = 0; -+ private drainScheduled = false; -+ private refillTimeout?: ReturnType; -+ private readonly tenantUsage = new Map(); -+ private readonly logger: Logger; ++ const remainingWindowMs = Math.max(0, state.resetAt - now); ++ const intervalMs = Math.max( ++ RATE_LIMIT_MIN_PACE_INTERVAL_MS, ++ Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), ++ ); + -+ private logDebug(message: string, details?: Record): void { -+ if (typeof console?.debug === "function") { -+ console.debug(message, details); ++ const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); ++ if ( ++ state.nextAllowedAt <= now || ++ candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS ++ ) { ++ state.nextAllowedAt = candidateNext; ++ rateLimitLogger?.trace?.("Updated OpenAI request pacing nextAllowedAt", { ++ rateLimitKey: this.key, ++ nextAllowedAt: state.nextAllowedAt, ++ intervalMs, ++ remainingWindowMs, ++ effectiveRemaining, ++ }); + } ++ ++ return null; + } + -+ constructor(options: TrafficControllerOptions = {}) { -+ this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; -+ this.rateLimits = this.normalizeRateLimits(options.rateLimits); -+ this.fallbackChains = this.normalizeFallbackChains(options.fallbackChains); -+ this.scheduler = this.createScheduler(); ++ private resolveTokenWindow(logger?: Logger): DispatchDecision | null { ++ if (this.tokensPerMinute === undefined) return null; ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); ++ const state = this.ensureTokenState(now); + -+ // NEW LOGGER (from c2 commit) -+ this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); ++ if (state.remaining > 0) return null; + -+ // INIT LOG (from HEAD) — rewritten to use the new logger -+ this.logger.debug("[TrafficController] init", { -+ maxConcurrent: this.maxConcurrent, -+ rateLimits: this.rateLimits ? 
Array.from(this.rateLimits.entries()) : undefined, ++ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; ++ rateLimitLogger?.debug?.("OpenAI token window exhausted; waiting", { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ probeAt, + }); ++ return { kind: "wait", wakeUpAt: probeAt }; + } + -+ handleText(request: TrafficRequest): Promise { -+ // Route text generation requests into the queue so all LLM calls share the same scheduler -+ return this.enqueue("text", request); ++ private ensureRequestState(now: number): RateLimitWindowState { ++ const limit = this.requestsPerMinute ?? 0; ++ const state = this.requestState; ++ if (!state || now >= state.resetAt) { ++ this.requestState = { ++ limit, ++ remaining: limit, ++ resetAt: now + this.windowMs, ++ reserved: 0, ++ nextAllowedAt: now, ++ }; ++ return this.requestState; ++ } ++ return state; + } + -+ handleStream(request: TrafficRequest): Promise { -+ // Route streaming requests through the same queue to preserve ordering/backpressure rules -+ return this.enqueue("stream", request); ++ private ensureTokenState(now: number): RateLimitWindowState { ++ const limit = this.tokensPerMinute ?? 0; ++ const state = this.tokenState; ++ if (!state || now >= state.resetAt) { ++ this.tokenState = { ++ limit, ++ remaining: limit, ++ resetAt: now + this.windowMs, ++ reserved: 0, ++ nextAllowedAt: now, ++ }; ++ return this.tokenState; ++ } ++ return state; + } + -+ getTenantUsage(tenantId: string): TenantUsage | undefined { -+ const usage = this.tenantUsage.get(tenantId); -+ return usage ? { ...usage } : undefined; ++ private normalizeLimit(value: number | undefined): number | undefined { ++ const numeric = typeof value === "number" ? value : Number(value); ++ return Number.isFinite(numeric) && numeric > 0 ? numeric : undefined; + } + -+ private createScheduler(): Scheduler { -+ // Prefer queueMicrotask to keep the drain loop snappy without starving the event loop -+ if (typeof queueMicrotask === "function") { -+ return queueMicrotask; -+ } -+ -+ return (callback: () => void) => setTimeout(callback, 0); ++ private resolveTokenCount(usage: RateLimitUsage): number { ++ const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined; ++ if (total !== undefined) return total; ++ const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0; ++ const output = Number.isFinite(usage.outputTokens) ? 
usage.outputTokens : 0; ++ return input + output; + } ++} +diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts +new file mode 100644 +index 00000000..6657c6b2 +--- /dev/null ++++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts +@@ -0,0 +1,41 @@ ++import type { Logger } from "../../logger"; ++import type { ++ DispatchDecision, ++ QueuedRequest, ++ RateLimitWindowState, ++} from "../traffic-controller-internal"; ++import type { TrafficRequestMetadata } from "../traffic-types"; ++ ++export type RateLimitHeaderSnapshot = { ++ limitRequests?: string; ++ remainingRequests?: string; ++ resetRequests?: string; ++ resetRequestsMs?: number; ++ retryAfter?: string; ++ retryAfterMs?: number; ++}; + -+ private enqueue( -+ type: TrafficRequestType, -+ request: TrafficRequest, -+ ): Promise { -+ // Each request gets a promise so callers can await their own result -+ return new Promise((resolve, reject) => { -+ const priority = this.resolvePriority(request.metadata); -+ this.logger.debug("Enqueuing LLM request", { -+ tenantId: request.tenantId, -+ type, -+ priority, -+ }); -+ // Collect the work item and metadata -+ this.getQueue(priority).push({ -+ type, -+ request, -+ resolve, -+ reject, -+ attempt: 1, -+ priority, -+ tenantId: request.tenantId, -+ extractUsage: request.extractUsage, -+ }); ++export type RateLimitUpdateResult = { ++ key: string; ++ headerSnapshot: RateLimitHeaderSnapshot; ++ state: RateLimitWindowState; ++}; + -+ this.logDebug("[TrafficController] enqueue", { -+ type, -+ queueSize: this.getQueueSize(), -+ metadata: request.metadata, -+ }); ++export type RateLimitUsage = { ++ inputTokens?: number; ++ outputTokens?: number; ++ totalTokens?: number; ++}; + -+ // Kick the drain loop to start handling work -+ this.scheduleDrain(); -+ }); ++export interface RateLimitStrategy { ++ readonly handlesTokenLimits?: boolean; ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null; ++ onDispatch(logger?: Logger): void; ++ onComplete(logger?: Logger): void; ++ recordUsage?(usage: RateLimitUsage, logger?: Logger): void; ++ updateFromHeaders( ++ metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined; ++} +diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts +new file mode 100644 +index 00000000..310c9a7e +--- /dev/null ++++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts +@@ -0,0 +1,26 @@ ++export function parseResetDurationToMs(raw: string): number | undefined { ++ const value = raw.trim(); ++ if (!value) return undefined; ++ ++ let totalMs = 0; ++ const regex = /(\d+(?:\.\d+)?)(ms|s|m|h|d)/g; ++ let matched = false; ++ for (const match of value.matchAll(regex)) { ++ matched = true; ++ const amount = Number.parseFloat(match[1] ?? 
""); ++ if (!Number.isFinite(amount)) continue; ++ const unit = match[2]; ++ if (unit === "ms") totalMs += amount; ++ else if (unit === "s") totalMs += amount * 1000; ++ else if (unit === "m") totalMs += amount * 60_000; ++ else if (unit === "h") totalMs += amount * 3_600_000; ++ else if (unit === "d") totalMs += amount * 86_400_000; + } + -+ private scheduleDrain(): void { -+ if (this.drainScheduled) { -+ return; -+ } -+ -+ this.drainScheduled = true; // Prevent redundant scheduling when many requests arrive at once -+ this.logDebug("[TrafficController] scheduleDrain", { queueSize: this.getQueueSize() }); -+ this.scheduler(() => { -+ this.drainScheduled = false; -+ this.logDebug("[TrafficController] drainLoopStart", { -+ queueSize: this.getQueueSize(), -+ active: this.activeCount, -+ }); -+ this.drainQueue(); // Drain asynchronously so we never block the caller's tick -+ }); ++ if (matched) { ++ return Math.round(totalMs); + } + -+ private drainQueue(): void { -+ // Pull as many items as we can until we hit capacity or rate limits -+ while (this.hasQueuedWork()) { -+ if (this.activeCount >= this.maxConcurrent) { -+ return; -+ } -+ -+ let selected: { item: QueuedRequest; priority: TrafficPriority } | undefined; -+ let skippedItem = false; ++ const n = Number(value); ++ return Number.isFinite(n) ? Math.round(n) : undefined; ++} +diff --git a/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts +new file mode 100644 +index 00000000..2ae7b189 +--- /dev/null ++++ b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts +@@ -0,0 +1,210 @@ ++import type { Logger } from "../../logger"; ++import type { ++ DispatchDecision, ++ QueuedRequest, ++ RateLimitWindowState, ++} from "../traffic-controller-internal"; ++import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils"; ++import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types"; ++import type { ++ RateLimitHeaderSnapshot, ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++} from "./rate-limit-strategy"; ++import { parseResetDurationToMs } from "./rate-limit-utils"; ++ ++type TokenBucketState = { ++ capacity: number; ++ refillPerSecond: number; ++ tokens: number; ++ updatedAt: number; ++}; + -+ for (const priority of this.priorityOrder) { -+ const queue = this.getQueue(priority); -+ if (queue.length === 0) { -+ continue; -+ } ++function normalizeTokenBucketOptions( ++ raw: RateLimitOptions | undefined, ++): Omit { ++ const requestsPerMinuteRaw = raw?.requestsPerMinute; ++ const burstSizeRaw = raw?.burstSize; + -+ const candidate = queue[0]; -+ const decision = this.getProcessDecision(candidate); -+ if (decision === "process") { -+ selected = { item: candidate, priority }; -+ break; -+ } ++ const requestsPerMinute = ++ typeof requestsPerMinuteRaw === "number" ? requestsPerMinuteRaw : Number(requestsPerMinuteRaw); ++ const burstSize = typeof burstSizeRaw === "number" ? burstSizeRaw : Number(burstSizeRaw); + -+ if (decision === "skip") { -+ queue.shift(); // Remove rejected item -+ skippedItem = true; -+ break; // Re-evaluate from highest priority after removing -+ } ++ const safeRequestsPerMinute = Number.isFinite(requestsPerMinute) ? requestsPerMinute : 0; ++ const safeBurst = Number.isFinite(burstSize) ? burstSize : safeRequestsPerMinute; ++ const refillPerSecond = safeRequestsPerMinute > 0 ? 
safeRequestsPerMinute / 60 : 0; + -+ // If wait, try lower priorities in the same drain cycle -+ } ++ return { ++ capacity: safeBurst > 0 ? Math.max(1, safeBurst) : 0, ++ refillPerSecond, ++ }; ++} ++function refillTokenBucket(bucket: TokenBucketState, now: number): void { ++ const elapsedMs = now - bucket.updatedAt; ++ if (elapsedMs <= 0) return; ++ bucket.updatedAt = now; ++ if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return; ++ ++ const refill = (elapsedMs / 1000) * bucket.refillPerSecond; ++ if (refill <= 0) return; ++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill); ++} + -+ if (selected) { -+ const { item, priority } = selected; -+ this.getQueue(priority).shift(); -+ this.activeCount++; // Track in-flight work to enforce concurrency guard -+ this.markCircuitTrial(item); // Reserve the half-open trial slot if needed ++export class TokenBucketRateLimitStrategy implements RateLimitStrategy { ++ private readonly key: string; ++ private bucket?: TokenBucketState; ++ private cooldownUntil?: number; + -+ void this.runRequest(item); // Fire off processing without blocking the loop -+ continue; -+ } ++ constructor(key: string, options?: RateLimitOptions) { ++ this.key = key; ++ if (!options) return; ++ const normalized = normalizeTokenBucketOptions(options); ++ const now = Date.now(); ++ this.bucket = { ++ ...normalized, ++ tokens: normalized.capacity, ++ updatedAt: now, ++ }; ++ } + -+ if (skippedItem) { -+ continue; // We removed a blocked item; re-evaluate queues -+ } ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); + -+ // No runnable work right now; exit until capacity/rate-limit changes -+ return; ++ if (this.cooldownUntil !== undefined && now < this.cooldownUntil) { ++ rateLimitLogger?.debug?.("Token bucket cooldown active; waiting", { ++ rateLimitKey: this.key, ++ cooldownUntil: this.cooldownUntil, ++ waitMs: this.cooldownUntil - now, ++ }); ++ return { kind: "wait", wakeUpAt: this.cooldownUntil }; + } -+ } + -+ private getProcessDecision(next: QueuedRequest): ProcessDecision { -+ const circuitDecision = this.evaluateCircuitBreaker(next); -+ if (circuitDecision !== "process") { -+ return circuitDecision; -+ } ++ const bucket = this.bucket; ++ if (!bucket) return null; + -+ if (this.activeCount >= this.maxConcurrent) { -+ this.logDebug("[TrafficController] throttle concurrency", { -+ active: this.activeCount, -+ maxConcurrent: this.maxConcurrent, ++ refillTokenBucket(bucket, now); ++ ++ if (bucket.capacity <= 0) { ++ rateLimitLogger?.debug?.("Token bucket misconfigured; blocking", { ++ rateLimitKey: this.key, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, + }); -+ return "wait"; ++ return { kind: "wait" }; + } + -+ const rateLimitConfig = this.getRateLimitConfig(next.request.metadata); -+ if (!rateLimitConfig) { -+ this.logDebug("[TrafficController] no rate limit match", { -+ metadata: next.request.metadata, ++ if (bucket.tokens >= 1) { ++ bucket.tokens -= 1; ++ next.rateLimitKey = this.key; ++ rateLimitLogger?.trace?.("Consumed token bucket token", { ++ rateLimitKey: this.key, ++ tokens: bucket.tokens, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, + }); -+ next.rateLimitKey = undefined; -+ next.etaMs = 0; -+ return "process"; // No rate limit configured for this key ++ return null; + } + -+ const queuedAhead = this.countQueuedAheadWithKey( -+ rateLimitConfig.key, -+ next, -+ 
/*logDetails*/ true, -+ ); -+ const bucket = this.getRateLimitBucket(rateLimitConfig.key, rateLimitConfig.limit); -+ if (bucket.tokens < 1) { -+ next.rateLimitKey = rateLimitConfig.key; -+ next.etaMs = this.computeEtaMs( -+ bucket, -+ rateLimitConfig.limit, -+ rateLimitConfig.key, -+ next, -+ queuedAhead, -+ ); -+ this.logDebug("[TrafficController] throttle rate", { -+ key: rateLimitConfig.key, -+ tokens: bucket.tokens, -+ etaMs: next.etaMs, -+ queuedAhead, ++ if (bucket.refillPerSecond <= 0) { ++ rateLimitLogger?.debug?.("Token bucket has no refill; blocking", { ++ rateLimitKey: this.key, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, + }); -+ this.scheduleRefill(rateLimitConfig.limit); // Ensure we retry as soon as tokens are replenished -+ return "wait"; ++ return { kind: "wait" }; + } + -+ bucket.tokens -= 1; // Consume a token for this dispatch -+ this.logDebug("[TrafficController] token consumed", { -+ key: rateLimitConfig.key, -+ remaining: bucket.tokens, ++ const requiredTokens = 1 - bucket.tokens; ++ const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000)); ++ const wakeUpAt = now + waitMs; ++ rateLimitLogger?.debug?.("Token bucket empty; waiting", { ++ rateLimitKey: this.key, ++ tokens: bucket.tokens, + capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ wakeUpAt, ++ waitMs, + }); -+ next.rateLimitKey = rateLimitConfig.key; -+ next.etaMs = 0; -+ return "process"; ++ return { kind: "wait", wakeUpAt }; + } + -+ private getRateLimitConfig( -+ metadata?: TrafficRequestMetadata, -+ ): { key: string; limit: NormalizedRateLimit } | undefined { -+ if (!this.rateLimits || this.rateLimits.size === 0) { -+ return undefined; -+ } ++ onDispatch(_logger?: Logger): void {} + -+ const key = this.buildRateLimitKey(metadata); -+ const limit = this.rateLimits.get(key); -+ if (!limit) { -+ return undefined; -+ } ++ onComplete(_logger?: Logger): void {} + -+ this.logDebug("[TrafficController] rateLimitConfig hit", { key }); -+ return { key, limit }; -+ } ++ updateFromHeaders( ++ _metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); + -+ private getRateLimitBucket(key: string, limit: NormalizedRateLimit): RateLimitBucket { -+ const now = Date.now(); // Snapshot time once to avoid drift within this method -+ let bucket = this.rateLimitBuckets.get(key); // Reuse the bucket if it already exists ++ const retryAfter = readHeaderValue(headers, "retry-after"); ++ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter, now) : undefined; + -+ if (!bucket) { -+ bucket = { -+ tokens: limit.capacity, -+ capacity: limit.capacity, -+ refillPerMs: limit.refillPerMs, -+ lastRefill: now, -+ }; -+ this.rateLimitBuckets.set(key, bucket); -+ this.logDebug("[TrafficController] bucket create", { -+ key, -+ capacity: bucket.capacity, -+ refillPerMs: bucket.refillPerMs, -+ }); -+ return bucket; ++ const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests"); ++ const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests"); ++ const resetRequestsMs = resetRequests ? parseResetDurationToMs(resetRequests) : undefined; ++ ++ let appliedUntil: number | undefined; ++ ++ if (retryAfterMs !== undefined) { ++ const targetAt = now + retryAfterMs; ++ this.cooldownUntil = ++ this.cooldownUntil === undefined ? 
targetAt : Math.max(this.cooldownUntil, targetAt); ++ appliedUntil = this.cooldownUntil; + } + -+ if ( -+ bucket.capacity !== limit.capacity || -+ Math.abs(bucket.refillPerMs - limit.refillPerMs) > Number.EPSILON -+ ) { -+ bucket.capacity = limit.capacity; -+ bucket.refillPerMs = limit.refillPerMs; -+ bucket.tokens = Math.min(bucket.tokens, bucket.capacity); -+ bucket.lastRefill = now; -+ this.logDebug("[TrafficController] bucket sync with new limit", { -+ key, -+ capacity: bucket.capacity, -+ refillPerMs: bucket.refillPerMs, -+ }); ++ if (remainingRequests && resetRequestsMs !== undefined) { ++ const remaining = Number(remainingRequests); ++ if (Number.isFinite(remaining) && remaining <= 0) { ++ const targetAt = now + resetRequestsMs; ++ this.cooldownUntil = ++ this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt); ++ appliedUntil = this.cooldownUntil; ++ } + } + -+ const elapsedMs = Math.max(0, now - bucket.lastRefill); -+ if (elapsedMs > 0 && bucket.tokens < bucket.capacity) { -+ const refilled = elapsedMs * bucket.refillPerMs; // Refill based on elapsed time -+ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refilled); // Cap at bucket capacity -+ bucket.lastRefill = now; // Mark refill time for the next calculation -+ this.logDebug("[TrafficController] bucket refill", { -+ key, -+ elapsedMs, -+ tokens: bucket.tokens, ++ if (appliedUntil === undefined) { ++ rateLimitLogger?.trace?.("No applicable cooldown headers; skipping", { ++ rateLimitKey: this.key, ++ hasRetryAfter: !!retryAfter, ++ hasRemainingRequests: !!remainingRequests, ++ hasResetRequests: !!resetRequests, + }); ++ return undefined; + } + -+ return bucket; -+ } -+ -+ private computeEtaMs( -+ bucket: RateLimitBucket, -+ limit: NormalizedRateLimit, -+ key: string, -+ current: QueuedRequest, -+ queuedAhead?: number, -+ ): number { -+ const missingTokens = Math.max(0, 1 - bucket.tokens); -+ const waitForToken = -+ missingTokens > 0 && limit.refillPerMs > 0 ? Math.ceil(missingTokens / limit.refillPerMs) : 0; -+ const aheadCount = -+ typeof queuedAhead === "number" -+ ? queuedAhead -+ : this.countQueuedAheadWithKey(key, current, /*logDetails*/ false); -+ const extraForQueue = -+ aheadCount > 0 && limit.refillPerMs > 0 ? 
Math.ceil(aheadCount / limit.refillPerMs) : 0; -+ this.logDebug("[TrafficController] computeEtaMs", { -+ key, -+ missingTokens, -+ waitForToken, -+ aheadCount, -+ extraForQueue, -+ eta: waitForToken + extraForQueue, ++ rateLimitLogger?.debug?.("Applied token bucket cooldown from headers", { ++ rateLimitKey: this.key, ++ cooldownUntil: appliedUntil, ++ inMs: Math.max(0, appliedUntil - now), ++ retryAfterMs, ++ resetRequestsMs, + }); -+ return waitForToken + extraForQueue; ++ ++ const headerSnapshot: RateLimitHeaderSnapshot = { ++ remainingRequests, ++ resetRequests, ++ resetRequestsMs, ++ retryAfter, ++ retryAfterMs, ++ }; ++ ++ const state: RateLimitWindowState = { ++ limit: 1, ++ remaining: 0, ++ resetAt: appliedUntil, ++ reserved: 0, ++ nextAllowedAt: appliedUntil, ++ }; ++ ++ return { ++ key: this.key, ++ headerSnapshot, ++ state, ++ }; ++ } ++} +diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts +new file mode 100644 +index 00000000..f240ce40 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-circuit-breaker.ts +@@ -0,0 +1,454 @@ ++import type { Logger } from "../logger"; ++import { ++ CIRCUIT_COOLDOWN_MS, ++ CIRCUIT_FAILURE_THRESHOLD, ++ CIRCUIT_FAILURE_WINDOW_MS, ++ CIRCUIT_PROBE_INTERVAL_MS, ++ CIRCUIT_TIMEOUT_THRESHOLD, ++ CIRCUIT_TIMEOUT_WINDOW_MS, ++ DEFAULT_FALLBACK_CHAINS, ++} from "./traffic-constants"; ++import type { ++ CircuitState, ++ CircuitStateStatus, ++ DispatchDecision, ++ QueuedRequest, ++} from "./traffic-controller-internal"; ++import { extractStatusCode, isTimeoutError } from "./traffic-error-utils"; ++import { CircuitBreakerOpenError } from "./traffic-errors"; ++import type { ++ FallbackChainEntry, ++ FallbackPolicy, ++ FallbackPolicyConfig, ++ FallbackTarget, ++ TrafficRequestMetadata, ++ TrafficResponseMetadata, ++} from "./traffic-types"; ++ ++export class TrafficCircuitBreaker { ++ private readonly circuitBreakers = new Map(); ++ private readonly fallbackChains: Map; ++ private readonly fallbackPolicy?: FallbackPolicyConfig; ++ private readonly buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string; ++ ++ constructor(options: { ++ fallbackChains?: Record; ++ fallbackPolicy?: FallbackPolicyConfig; ++ buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string; ++ }) { ++ this.buildRateLimitKey = options.buildRateLimitKey; ++ const chains = options.fallbackChains ?? 
DEFAULT_FALLBACK_CHAINS; ++ this.fallbackChains = new Map(Object.entries(chains)); ++ this.fallbackPolicy = options.fallbackPolicy; + } + -+ private countQueuedAheadWithKey(key: string, current: QueuedRequest, logDetails = false): number { -+ let count = 0; -+ for (const priority of this.priorityOrder) { -+ const queue = this.getQueue(priority); -+ for (const item of queue) { -+ if (item === current) { -+ return count; -+ } ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const visitedKeys = new Set(); + -+ const itemKey = this.buildRateLimitKey(item.request.metadata); -+ if (itemKey === key) { -+ count += 1; -+ } ++ while (true) { ++ const key = this.buildRateLimitKey(next.request.metadata); ++ next.circuitKey = key; ++ visitedKeys.add(key); ++ circuitLogger?.trace?.("Circuit resolve step", { ++ circuitKey: key, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ }); ++ ++ const evaluation = this.evaluateCircuitState(key, circuitLogger); ++ next.circuitStatus = evaluation.state; ++ circuitLogger?.debug?.("Circuit evaluated", { ++ circuitKey: key, ++ state: evaluation.state, ++ allowRequest: evaluation.allowRequest, ++ retryAfterMs: evaluation.retryAfterMs, ++ }); ++ ++ if (evaluation.allowRequest) return null; ++ ++ const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata); ++ if (policy.mode === "wait") { ++ const wakeUpAt = ++ evaluation.retryAfterMs !== undefined ? Date.now() + evaluation.retryAfterMs : undefined; ++ circuitLogger?.debug?.("Circuit open; waiting per fallback policy", { ++ circuitKey: key, ++ policyId, ++ retryAfterMs: evaluation.retryAfterMs, ++ wakeUpAt, ++ }); ++ return { kind: "wait", wakeUpAt }; + } -+ } -+ if (logDetails) { -+ this.logDebug("[TrafficController] countQueuedAheadWithKey", { -+ key, -+ count, -+ queueSize: this.getQueueSize(), ++ ++ const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger); ++ circuitLogger?.debug?.("Circuit open; attempting fallback", { ++ circuitKey: key, ++ currentModel: next.request.metadata?.model, ++ fallback, ++ visitedKeys: Array.from(visitedKeys), ++ }); ++ if (!fallback || !next.request.createFallbackRequest) { ++ const error = new CircuitBreakerOpenError( ++ `Circuit open for ${key}`, ++ next.request.metadata, ++ evaluation.retryAfterMs, ++ ); ++ const traffic: TrafficResponseMetadata = { ++ rateLimitKey: key, ++ retryAfterMs: evaluation.retryAfterMs, ++ tenantId: next.request.metadata?.tenantId ?? 
next.tenantId, ++ priority: next.request.metadata?.priority, ++ taskType: next.request.metadata?.taskType, ++ }; ++ (error as Record).traffic = traffic; ++ next.reject(error); ++ circuitLogger?.warn?.("No fallback available; rejecting request", { ++ circuitKey: key, ++ retryAfterMs: evaluation.retryAfterMs, ++ }); ++ return { kind: "skip" }; ++ } ++ ++ const fallbackRequest = next.request.createFallbackRequest(fallback); ++ if (!fallbackRequest) { ++ circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", { ++ circuitKey: key, ++ fallback, ++ }); ++ return { kind: "skip" }; ++ } ++ ++ this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, { ++ previousCircuitKey: key, ++ reason: "circuit-open", + }); + } -+ return count; + } + -+ private evaluateCircuitBreaker(next: QueuedRequest): ProcessDecision { -+ return this.evaluateCircuitBreakerForRequest(next, new Set()); -+ } ++ tryFallback(next: QueuedRequest, reason: "queue-timeout", logger?: Logger): boolean { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata); ++ if (policy.mode === "wait") { ++ circuitLogger?.debug?.("Fallback skipped by policy", { ++ policyId, ++ reason, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ }); ++ return false; ++ } + -+ private evaluateCircuitBreakerForRequest( -+ next: QueuedRequest, -+ visitedModels: Set, -+ ): ProcessDecision { ++ const visitedKeys = new Set(); + const key = this.buildRateLimitKey(next.request.metadata); -+ next.circuitKey = key; -+ -+ const currentModel = next.request.metadata?.model; -+ if (currentModel) { -+ visitedModels.add(currentModel); ++ visitedKeys.add(key); ++ ++ const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger); ++ if (!fallback || !next.request.createFallbackRequest) { ++ circuitLogger?.debug?.("Fallback unavailable for request", { ++ reason, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ fallback, ++ }); ++ return false; + } + -+ const evaluation = this.evaluateCircuitState(key); -+ next.circuitStatus = evaluation.state; -+ -+ if (evaluation.allowRequest) { -+ return "process"; ++ const fallbackRequest = next.request.createFallbackRequest(fallback); ++ if (!fallbackRequest) { ++ circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", { ++ reason, ++ fallback, ++ }); ++ return false; + } + -+ const fallbackModel = this.findFallbackModel(next.request.metadata, visitedModels); -+ if (fallbackModel && next.request.createFallbackRequest) { -+ const fallbackRequest = next.request.createFallbackRequest(fallbackModel); -+ if (fallbackRequest) { -+ this.logger.warn("Circuit open; attempting fallback model", { -+ fromModel: currentModel, -+ fallbackModel, -+ provider: next.request.metadata?.provider, -+ }); -+ next.request = fallbackRequest; -+ next.attempt = 1; -+ next.rateLimitKey = undefined; -+ next.etaMs = undefined; -+ next.circuitKey = undefined; -+ next.circuitStatus = undefined; -+ return this.evaluateCircuitBreakerForRequest(next, visitedModels); -+ } ++ this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, { ++ previousCircuitKey: key, ++ reason, ++ policyId, ++ }); ++ return true; ++ } ++ ++ markTrial(item: QueuedRequest, logger?: Logger): void { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const key = item.circuitKey; ++ if (!key) return; ++ const 
state = this.circuitBreakers.get(key); ++ if (state && state.status === "half-open" && !state.trialInFlight) { ++ state.trialInFlight = true; ++ circuitLogger?.debug?.("Marked half-open trial in flight", { circuitKey: key }); + } ++ } + -+ const retryAfterMs = evaluation.retryAfterMs ?? CIRCUIT_COOLDOWN_MS; -+ this.logger.warn("Circuit open; rejecting request", { ++ recordSuccess(metadata?: TrafficRequestMetadata, logger?: Logger): void { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const key = this.buildRateLimitKey(metadata); ++ this.circuitBreakers.delete(key); ++ circuitLogger?.debug?.("Circuit success; cleared circuit state", { + circuitKey: key, -+ retryAfterMs, -+ metadata: next.request.metadata, ++ provider: metadata?.provider, ++ model: metadata?.model, + }); -+ next.reject( -+ new CircuitBreakerOpenError( -+ `Circuit open for ${key}; retry after ${retryAfterMs}ms`, -+ next.request.metadata, -+ retryAfterMs, -+ ), ++ } ++ ++ recordFailure( ++ metadata: TrafficRequestMetadata | undefined, ++ error: unknown, ++ logger?: Logger, ++ ): void { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const key = this.buildRateLimitKey(metadata); ++ const status = extractStatusCode(error, logger); ++ const isTimeout = status === 408 || isTimeoutError(error, logger); ++ const isStatusEligible = this.isCircuitBreakerStatus(status); ++ const isTimeoutEligible = !isStatusEligible && isTimeout; ++ const isEligible = isStatusEligible || isTimeoutEligible; ++ ++ circuitLogger?.debug?.("Circuit failure observed", { ++ circuitKey: key, ++ status, ++ isTimeout, ++ eligible: isEligible, ++ provider: metadata?.provider, ++ model: metadata?.model, ++ }); ++ ++ if (!isEligible) { ++ this.circuitBreakers.delete(key); ++ circuitLogger?.debug?.("Failure not eligible for circuit breaker; cleared circuit state", { ++ circuitKey: key, ++ status, ++ isTimeout, ++ }); ++ return; ++ } ++ ++ const now = Date.now(); ++ const state = ++ this.circuitBreakers.get(key) ?? 
++ ({ status: "closed", failureTimestamps: [], timeoutTimestamps: [] } as CircuitState); ++ ++ state.failureTimestamps = state.failureTimestamps.filter( ++ (t) => now - t <= CIRCUIT_FAILURE_WINDOW_MS, ++ ); ++ state.timeoutTimestamps = state.timeoutTimestamps.filter( ++ (t) => now - t <= CIRCUIT_TIMEOUT_WINDOW_MS, + ); -+ return "skip"; ++ ++ state.failureTimestamps.push(now); ++ if (isTimeoutEligible) { ++ state.timeoutTimestamps.push(now); ++ } ++ ++ if ( ++ state.status === "half-open" || ++ state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD || ++ state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD ++ ) { ++ const openReasons: string[] = []; ++ if (state.status === "half-open") openReasons.push("half-open-failure"); ++ if (state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD) { ++ openReasons.push("failure-threshold"); ++ } ++ if (state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD) { ++ openReasons.push("timeout-threshold"); ++ } ++ ++ state.status = "open"; ++ state.openedAt = now; ++ state.trialInFlight = false; ++ state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS; ++ circuitLogger?.warn?.("Circuit opened", { ++ circuitKey: key, ++ openReasons, ++ status, ++ isTimeout, ++ failureCount: state.failureTimestamps.length, ++ failureThreshold: CIRCUIT_FAILURE_THRESHOLD, ++ timeoutCount: state.timeoutTimestamps.length, ++ timeoutThreshold: CIRCUIT_TIMEOUT_THRESHOLD, ++ openedAt: state.openedAt, ++ }); ++ } ++ ++ this.circuitBreakers.set(key, state); ++ circuitLogger?.trace?.("Circuit state updated", { ++ circuitKey: key, ++ status: state.status, ++ failureCount: state.failureTimestamps.length, ++ failureWindowMs: CIRCUIT_FAILURE_WINDOW_MS, ++ timeoutCount: state.timeoutTimestamps.length, ++ timeoutWindowMs: CIRCUIT_TIMEOUT_WINDOW_MS, ++ }); + } + -+ private evaluateCircuitState(key: string): { ++ private evaluateCircuitState( ++ key: string, ++ logger?: Logger, ++ ): { + allowRequest: boolean; + state: CircuitStateStatus; + retryAfterMs?: number; + } { + const state = this.circuitBreakers.get(key); + if (!state) { ++ logger?.trace?.("Circuit state missing; allow request", { circuitKey: key }); + return { allowRequest: true, state: "closed" }; + } + @@ -1503,54 +2181,121 @@ index 00000000..8d82e8a5 + + if (state.status === "open") { + const elapsed = state.openedAt ? now - state.openedAt : 0; -+ if (elapsed >= CIRCUIT_COOLDOWN_MS) { ++ if (state.nextProbeAt === undefined) { ++ state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS; ++ } ++ const cooldownRemaining = Math.max(0, CIRCUIT_COOLDOWN_MS - elapsed); ++ const probeRemaining = Math.max(0, state.nextProbeAt - now); ++ if (probeRemaining === 0 || cooldownRemaining === 0) { + state.status = "half-open"; + state.trialInFlight = false; + state.failureTimestamps = []; -+ this.circuitBreakers.set(key, state); -+ return { allowRequest: true, state: state.status }; ++ state.timeoutTimestamps = []; ++ state.nextProbeAt = undefined; ++ logger?.debug?.("Circuit transitioned to half-open", { ++ circuitKey: key, ++ reason: cooldownRemaining === 0 ? 
"cooldown" : "probe", ++ }); ++ return { allowRequest: true, state: "half-open" }; + } + return { + allowRequest: false, -+ state: state.status, -+ retryAfterMs: Math.max(0, CIRCUIT_COOLDOWN_MS - elapsed), ++ state: "open", ++ retryAfterMs: Math.min(cooldownRemaining, probeRemaining), + }; + } + -+ if (state.status === "half-open") { -+ if (state.trialInFlight) { -+ return { allowRequest: false, state: state.status }; -+ } -+ return { allowRequest: true, state: state.status }; ++ if (state.status === "half-open" && state.trialInFlight) { ++ return { allowRequest: false, state: "half-open" }; + } + + return { allowRequest: true, state: state.status }; + } + -+ private findFallbackModel( ++ private resolveFallbackPolicy(metadata: TrafficRequestMetadata | undefined): { ++ policy: FallbackPolicy; ++ policyId?: string; ++ } { ++ const policyId = ++ metadata?.fallbackPolicyId ?? ++ (metadata?.taskType ++ ? this.fallbackPolicy?.taskTypePolicyIds?.[metadata.taskType] ++ : undefined) ?? ++ this.fallbackPolicy?.defaultPolicyId; ++ ++ const policy = policyId ? this.fallbackPolicy?.policies?.[policyId] : undefined; ++ return { ++ policy: policy ?? { mode: "fallback" }, ++ policyId, ++ }; ++ } ++ ++ private applyFallbackRequest( ++ next: QueuedRequest, ++ fallbackRequest: QueuedRequest["request"], ++ fallback: FallbackChainEntry, ++ logger?: Logger, ++ context?: { previousCircuitKey?: string; reason?: string; policyId?: string }, ++ ): void { ++ next.request = fallbackRequest; ++ next.attempt = 1; ++ next.tenantConcurrencyKey = undefined; ++ next.providerModelConcurrencyKey = undefined; ++ next.rateLimitKey = undefined; ++ next.etaMs = undefined; ++ next.circuitKey = undefined; ++ next.circuitStatus = undefined; ++ logger?.debug?.("Switched to fallback request", { ++ previousCircuitKey: context?.previousCircuitKey, ++ fallbackModel: fallback, ++ reason: context?.reason, ++ policyId: context?.policyId, ++ }); ++ } ++ ++ private findFallbackTarget( + metadata: TrafficRequestMetadata | undefined, -+ visitedModels: Set, -+ ): string | undefined { ++ visitedKeys: Set, ++ logger?: Logger, ++ ): FallbackChainEntry | undefined { + const currentModel = metadata?.model; + if (!currentModel) { ++ logger?.trace?.("No current model; no fallback", {}); + return undefined; + } + -+ const chain = this.fallbackChains.get(currentModel); ++ const provider = metadata?.provider; ++ const chain = this.resolveFallbackChain(provider, currentModel); + if (!chain) { ++ logger?.trace?.("No fallback chain for model", { ++ currentModel, ++ provider, ++ }); + return undefined; + } + -+ const provider = metadata?.provider; + for (const candidate of chain) { -+ if (visitedModels.has(candidate)) { ++ const target = this.normalizeFallbackTarget(candidate, provider); ++ const candidateMetadata: TrafficRequestMetadata = { ++ ...(metadata ?? {}), ++ provider: target.provider ?? provider, ++ model: target.model, ++ }; ++ const candidateKey = this.buildRateLimitKey(candidateMetadata); ++ if (visitedKeys.has(candidateKey)) { + continue; + } + -+ const candidateKey = this.buildRateLimitKey({ provider, model: candidate }); -+ const evaluation = this.evaluateCircuitState(candidateKey); ++ const evaluation = this.evaluateCircuitState(candidateKey, logger); + if (evaluation.allowRequest) { -+ visitedModels.add(candidate); ++ visitedKeys.add(candidateKey); ++ logger?.debug?.("Selected fallback target", { ++ currentModel, ++ currentProvider: provider, ++ fallbackModel: target.model, ++ fallbackProvider: target.provider ?? 
provider, ++ fallbackCircuitKey: candidateKey, ++ }); + return candidate; + } + } @@ -1558,655 +2303,2934 @@ index 00000000..8d82e8a5 + return undefined; + } + -+ private markCircuitTrial(next: QueuedRequest): void { -+ const key = next.circuitKey; -+ if (!key) { -+ return; ++ private resolveFallbackChain( ++ provider: string | undefined, ++ model: string, ++ ): FallbackChainEntry[] | undefined { ++ const providerKey = provider ? `${provider}::${model}` : undefined; ++ if (providerKey) { ++ const providerChain = this.fallbackChains.get(providerKey); ++ if (providerChain) return providerChain; + } ++ return this.fallbackChains.get(model); ++ } + -+ const state = this.circuitBreakers.get(key); -+ if (state && state.status === "half-open" && !state.trialInFlight) { -+ state.trialInFlight = true; -+ this.circuitBreakers.set(key, state); ++ private normalizeFallbackTarget( ++ candidate: FallbackChainEntry, ++ provider: string | undefined, ++ ): FallbackTarget { ++ if (typeof candidate === "string") { ++ return { provider, model: candidate }; + } ++ return { ++ provider: candidate.provider ?? provider, ++ model: candidate.model, ++ }; + } + -+ private normalizeRateLimits( -+ rateLimits?: RateLimitConfig, -+ ): Map | undefined { -+ if (!rateLimits) { -+ return undefined; ++ private isCircuitBreakerStatus(status?: number): boolean { ++ return status === 429 || (status !== undefined && status >= 500); ++ } ++} +diff --git a/packages/core/src/traffic/traffic-concurrency-limiter.ts b/packages/core/src/traffic/traffic-concurrency-limiter.ts +new file mode 100644 +index 00000000..e1525612 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-concurrency-limiter.ts +@@ -0,0 +1,235 @@ ++import type { Logger } from "../logger"; ++import type { QueuedRequest } from "./traffic-controller-internal"; ++import type { ++ ProviderModelConcurrencyLimit, ++ TenantConcurrencyLimit, ++ TrafficRequestMetadata, ++} from "./traffic-types"; ++ ++export type ConcurrencyBlockReason = ++ | { ++ gate: "providerModel"; ++ key: string; ++ inFlight: number; ++ limit: number; + } ++ | { ++ gate: "tenant"; ++ key: string; ++ inFlight: number; ++ limit: number; ++ }; + -+ const normalized = new Map(); -+ for (const [key, config] of Object.entries(rateLimits)) { -+ if (config.capacity > 0 && config.refillPerSecond > 0) { -+ normalized.set(key, { -+ capacity: config.capacity, -+ refillPerMs: config.refillPerSecond / 1000, -+ }); -+ } -+ } ++export type ConcurrencyDecision = ++ | { kind: "allow" } ++ | { kind: "wait"; reasons: ConcurrencyBlockReason[] }; + -+ return normalized.size > 0 ? normalized : undefined; -+ } ++function toNonNegativeIntegerLimit(raw: unknown): number | undefined { ++ if (raw === undefined || raw === null) return undefined; ++ const n = typeof raw === "number" ? raw : Number(raw); ++ if (!Number.isFinite(n)) return undefined; ++ if (n <= 0) return 0; ++ return Math.floor(n); ++} + -+ private normalizeFallbackChains( -+ fallbackChains?: Record, -+ ): Map { -+ const configuredChains = fallbackChains ?? DEFAULT_FALLBACK_CHAINS; -+ const normalized = new Map(); ++function getInFlight(map: Map, key: string): number { ++ return map.get(key) ?? 
0; ++} + -+ for (const [model, chain] of Object.entries(configuredChains)) { -+ if (Array.isArray(chain) && chain.length > 0) { -+ normalized.set(model, [...chain]); -+ } -+ } ++function incrementInFlight(map: Map, key: string): void { ++ map.set(key, getInFlight(map, key) + 1); ++} + -+ return normalized; ++function decrementInFlight(map: Map, key: string): void { ++ const current = getInFlight(map, key); ++ if (current <= 1) { ++ map.delete(key); ++ return; + } ++ map.set(key, current - 1); ++} + -+ private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { -+ const provider = metadata?.provider ?? "default-provider"; -+ const model = metadata?.model ?? "default-model"; -+ return `${provider}::${model}`; ++export class TrafficConcurrencyLimiter { ++ private readonly inFlightByProviderModel = new Map(); ++ private readonly inFlightByTenant = new Map(); ++ ++ private readonly buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string; ++ private readonly providerModelLimit?: ProviderModelConcurrencyLimit; ++ private readonly tenantLimit?: TenantConcurrencyLimit; ++ private readonly providerModelEnabled: boolean; ++ private readonly tenantEnabled: boolean; ++ ++ constructor(options: { ++ buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string; ++ maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit; ++ maxConcurrentPerTenant?: TenantConcurrencyLimit; ++ }) { ++ this.buildProviderModelKey = options.buildProviderModelKey; ++ this.providerModelLimit = options.maxConcurrentPerProviderModel; ++ this.tenantLimit = options.maxConcurrentPerTenant; ++ this.providerModelEnabled = options.maxConcurrentPerProviderModel !== undefined; ++ this.tenantEnabled = options.maxConcurrentPerTenant !== undefined; + } + -+ /** -+ * Update (or bootstrap) rate limit buckets based on provider response headers. -+ * This lets the controller adopt server-issued limits without static config. 
-+ */ -+ updateRateLimitFromHeaders( -+ metadata: TrafficRequestMetadata | undefined, -+ headers: unknown, -+ ): RateLimitUpdateResult | undefined { -+ const headerInfo = this.extractRateLimitHeaders(headers); -+ if (!headerInfo) { -+ this.logDebug("[TrafficController] no rate limit headers found on response", { -+ metadata, -+ }); -+ return undefined; -+ } ++ resolve(next: QueuedRequest, logger?: Logger): ConcurrencyDecision { ++ if (!this.providerModelEnabled && !this.tenantEnabled) return { kind: "allow" }; ++ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); ++ const reasons: ConcurrencyBlockReason[] = []; + -+ const normalized = this.normalizeHeaderRateLimit(headerInfo); -+ if (!normalized) { -+ this.logDebug("[TrafficController] rate limit headers present but invalid", { -+ headerInfo, -+ }); -+ return undefined; ++ if (this.providerModelEnabled) { ++ const providerModelKey = this.buildProviderModelKey(next.request.metadata); ++ const providerModelLimit = this.resolveProviderModelLimit( ++ providerModelKey, ++ next.request.metadata, ++ concurrencyLogger, ++ ); ++ if (providerModelLimit !== undefined) { ++ const inFlight = getInFlight(this.inFlightByProviderModel, providerModelKey); ++ if (inFlight >= providerModelLimit) { ++ reasons.push({ ++ gate: "providerModel", ++ key: providerModelKey, ++ inFlight, ++ limit: providerModelLimit, ++ }); ++ } ++ } + } + -+ const key = this.buildRateLimitKey(metadata); -+ if (!this.rateLimits) { -+ this.rateLimits = new Map(); ++ if (this.tenantEnabled) { ++ const tenantKey = next.tenantId; ++ const tenantLimit = this.resolveTenantLimit( ++ tenantKey, ++ next.request.metadata, ++ concurrencyLogger, ++ ); ++ if (tenantLimit !== undefined) { ++ const inFlight = getInFlight(this.inFlightByTenant, tenantKey); ++ if (inFlight >= tenantLimit) { ++ reasons.push({ ++ gate: "tenant", ++ key: tenantKey, ++ inFlight, ++ limit: tenantLimit, ++ }); ++ } ++ } + } -+ this.rateLimits.set(key, normalized); + -+ const now = Date.now(); -+ const remainingTokens = this.coerceRemaining(headerInfo.remainingRequests, normalized.capacity); -+ const existingBucket = this.rateLimitBuckets.get(key); -+ const tokens = remainingTokens ?? existingBucket?.tokens ?? normalized.capacity; -+ -+ if (existingBucket) { -+ existingBucket.capacity = normalized.capacity; -+ existingBucket.refillPerMs = normalized.refillPerMs; -+ existingBucket.tokens = Math.min(tokens, normalized.capacity); -+ existingBucket.lastRefill = now; -+ } else { -+ this.rateLimitBuckets.set(key, { -+ tokens: Math.min(tokens, normalized.capacity), -+ capacity: normalized.capacity, -+ refillPerMs: normalized.refillPerMs, -+ lastRefill: now, -+ }); -+ } ++ if (reasons.length === 0) return { kind: "allow" }; + -+ this.logDebug("[TrafficController] rateLimit updated from headers", { -+ key, -+ capacity: normalized.capacity, -+ refillPerMs: normalized.refillPerMs, -+ remaining: remainingTokens, ++ concurrencyLogger?.trace?.("Concurrency gate blocked request", { ++ tenantId: next.tenantId, ++ reasons, + }); -+ -+ // If we just refilled tokens, try draining again. 
-+ this.scheduleDrain(); -+ -+ return { -+ key, -+ headerSnapshot: headerInfo, -+ normalized, -+ appliedTokens: Math.min(tokens, normalized.capacity), -+ }; ++ return { kind: "wait", reasons }; + } + -+ private extractRateLimitHeaders(headers: unknown): RateLimitHeaderSnapshot | undefined { -+ const getHeader = this.createHeaderLookup(headers); -+ if (!getHeader) { -+ return undefined; -+ } -+ -+ const limitRequests = this.parseNumberHeader(getHeader, "x-ratelimit-limit-requests"); -+ const resetRequestsMs = this.parseDurationHeaderToMs(getHeader, "x-ratelimit-reset-requests"); ++ acquire(next: QueuedRequest, logger?: Logger): void { ++ if (!this.providerModelEnabled && !this.tenantEnabled) return; ++ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); + -+ if ( -+ limitRequests === undefined || -+ limitRequests <= 0 || -+ resetRequestsMs === undefined || -+ resetRequestsMs <= 0 -+ ) { -+ return undefined; ++ let tenantKey: string | undefined; ++ if (this.tenantEnabled) { ++ tenantKey = next.tenantId; ++ next.tenantConcurrencyKey = tenantKey; ++ incrementInFlight(this.inFlightByTenant, tenantKey); + } + -+ const remainingRequests = this.parseNumberHeader(getHeader, "x-ratelimit-remaining-requests"); -+ -+ return { -+ limitRequests, -+ remainingRequests, -+ resetRequestsMs, -+ }; -+ } -+ -+ private normalizeHeaderRateLimit( -+ snapshot: RateLimitHeaderSnapshot, -+ ): NormalizedRateLimit | undefined { -+ if (snapshot.limitRequests <= 0 || snapshot.resetRequestsMs <= 0) { -+ return undefined; ++ let providerModelKey: string | undefined; ++ if (this.providerModelEnabled) { ++ providerModelKey = this.buildProviderModelKey(next.request.metadata); ++ next.providerModelConcurrencyKey = providerModelKey; ++ incrementInFlight(this.inFlightByProviderModel, providerModelKey); + } + -+ return { -+ capacity: snapshot.limitRequests, -+ refillPerMs: snapshot.limitRequests / snapshot.resetRequestsMs, -+ }; ++ concurrencyLogger?.trace?.("Concurrency slots acquired", { ++ tenantId: tenantKey, ++ tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined, ++ providerModelKey, ++ providerModelInFlight: providerModelKey ++ ? getInFlight(this.inFlightByProviderModel, providerModelKey) ++ : undefined, ++ }); + } + -+ private coerceRemaining(remaining: number | undefined, capacity: number): number | undefined { -+ if (remaining === undefined) { -+ return undefined; -+ } ++ release(next: QueuedRequest, logger?: Logger): void { ++ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); ++ const tenantKey = next.tenantConcurrencyKey; ++ const providerModelKey = next.providerModelConcurrencyKey; + -+ const parsed = Number(remaining); -+ if (!Number.isFinite(parsed)) { -+ return undefined; ++ if (tenantKey) { ++ decrementInFlight(this.inFlightByTenant, tenantKey); + } + -+ return Math.max(0, Math.min(capacity, Math.floor(parsed))); -+ } -+ -+ private createHeaderLookup(headers: unknown): ((name: string) => string | undefined) | undefined { -+ if (!headers) { -+ return undefined; ++ if (providerModelKey) { ++ decrementInFlight(this.inFlightByProviderModel, providerModelKey); + } + -+ const maybeHeaders = headers as { get?: (name: string) => unknown }; -+ if (typeof maybeHeaders?.get === "function") { -+ return (name: string) => { -+ const value = maybeHeaders.get?.(name); -+ return value === undefined || value === null ? 
undefined : String(value); -+ }; ++ if (tenantKey || providerModelKey) { ++ concurrencyLogger?.trace?.("Concurrency slots released", { ++ tenantId: tenantKey, ++ tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined, ++ providerModelKey, ++ providerModelInFlight: providerModelKey ++ ? getInFlight(this.inFlightByProviderModel, providerModelKey) ++ : undefined, ++ }); + } + -+ if (typeof headers === "object") { -+ const entries = Object.entries(headers as Record); -+ if (entries.length === 0) { -+ return undefined; -+ } ++ next.tenantConcurrencyKey = undefined; ++ next.providerModelConcurrencyKey = undefined; ++ } + -+ return (name: string) => { -+ const target = name.toLowerCase(); -+ for (const [key, value] of entries) { -+ if (typeof key === "string" && key.toLowerCase() === target) { -+ if (Array.isArray(value)) { -+ const first = value[0]; -+ return first === undefined || first === null ? undefined : String(first); -+ } -+ return value === undefined || value === null ? undefined : String(value); -+ } -+ } ++ private resolveTenantLimit( ++ tenantId: string, ++ metadata: TrafficRequestMetadata | undefined, ++ logger?: Logger, ++ ): number | undefined { ++ const policy = this.tenantLimit; ++ if (policy === undefined) return undefined; ++ ++ if (typeof policy === "number") return toNonNegativeIntegerLimit(policy); ++ if (typeof policy === "function") { ++ try { ++ return toNonNegativeIntegerLimit(policy(tenantId, metadata)); ++ } catch (error) { ++ logger?.warn?.("Tenant concurrency resolver threw; ignoring", { ++ tenantId, ++ errorName: (error as { name?: unknown } | null)?.name, ++ errorMessage: (error as { message?: unknown } | null)?.message, ++ }); + return undefined; -+ }; ++ } + } + -+ return undefined; ++ return toNonNegativeIntegerLimit(policy[tenantId]); + } + -+ private parseNumberHeader( -+ getHeader: (name: string) => string | undefined, -+ name: string, ++ private resolveProviderModelLimit( ++ key: string, ++ metadata: TrafficRequestMetadata | undefined, ++ logger?: Logger, + ): number | undefined { -+ const raw = getHeader(name); -+ if (raw === undefined) { -+ return undefined; ++ const policy = this.providerModelLimit; ++ if (policy === undefined) return undefined; ++ ++ if (typeof policy === "number") return toNonNegativeIntegerLimit(policy); ++ if (typeof policy === "function") { ++ try { ++ return toNonNegativeIntegerLimit(policy(metadata, key)); ++ } catch (error) { ++ logger?.warn?.("Provider/model concurrency resolver threw; ignoring", { ++ key, ++ provider: metadata?.provider, ++ model: metadata?.model, ++ errorName: (error as { name?: unknown } | null)?.name, ++ errorMessage: (error as { message?: unknown } | null)?.message, ++ }); ++ return undefined; ++ } + } + -+ const parsed = Number(raw); -+ return Number.isFinite(parsed) ? 
parsed : undefined; ++ return toNonNegativeIntegerLimit(policy[key]); + } ++} +diff --git a/packages/core/src/traffic/traffic-constants.ts b/packages/core/src/traffic/traffic-constants.ts +new file mode 100644 +index 00000000..68d99df7 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-constants.ts +@@ -0,0 +1,26 @@ ++export const MAX_RETRY_ATTEMPTS = 3; ++export const TIMEOUT_RETRY_ATTEMPTS = 2; ++ ++export const RATE_LIMIT_BASE_BACKOFF_MS = 500; ++export const SERVER_ERROR_BASE_BACKOFF_MS = 1000; ++export const TIMEOUT_BASE_BACKOFF_MS = 750; ++ ++export const RATE_LIMIT_JITTER_FACTOR = 0.35; ++export const SERVER_ERROR_JITTER_FACTOR = 0.8; ++export const TIMEOUT_JITTER_FACTOR = 0.5; ++ ++export const CIRCUIT_FAILURE_THRESHOLD = 5; ++export const CIRCUIT_FAILURE_WINDOW_MS = 10_000; ++export const CIRCUIT_TIMEOUT_THRESHOLD = CIRCUIT_FAILURE_THRESHOLD; ++export const CIRCUIT_TIMEOUT_WINDOW_MS = CIRCUIT_FAILURE_WINDOW_MS; ++export const CIRCUIT_COOLDOWN_MS = 30_000; ++export const CIRCUIT_PROBE_INTERVAL_MS = 5_000; ++ ++export const RATE_LIMIT_EXHAUSTION_BUFFER = 1; ++export const RATE_LIMIT_PROBE_DELAY_MS = 50; ++export const RATE_LIMIT_MIN_PACE_INTERVAL_MS = 10; ++export const RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS = 10; ++ ++export const DEFAULT_FALLBACK_CHAINS: Record = { ++ "gpt-4o": ["gpt-4o-mini", "gpt-3.5"], ++}; +diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts +new file mode 100644 +index 00000000..cf435854 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-controller-internal.ts +@@ -0,0 +1,54 @@ ++import type { TrafficPriority, TrafficRequest, TrafficRequestType } from "./traffic-types"; + -+ private parseDurationHeaderToMs( -+ getHeader: (name: string) => string | undefined, -+ name: string, -+ ): number | undefined { -+ const raw = getHeader(name); -+ if (!raw) { -+ return undefined; -+ } -+ -+ const trimmed = raw.trim(); -+ const match = trimmed.match(/^(-?\d+(?:\.\d+)?)(ms|s)?$/i); -+ if (!match) { -+ return undefined; -+ } ++export type Scheduler = (callback: () => void) => void; + -+ const value = Number(match[1]); -+ if (!Number.isFinite(value) || value <= 0) { -+ return undefined; -+ } ++export type DispatchDecision = ++ | { kind: "dispatch" } ++ | { kind: "skip" } ++ | { kind: "wait"; wakeUpAt?: number }; + -+ const unit = (match[2] || "s").toLowerCase(); -+ return unit === "ms" ? 
value : value * 1000;
-+ }
++
++export type CircuitStateStatus = "closed" | "open" | "half-open";
+
-+ private resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority {
-+ const candidate = metadata?.priority;
-+ if (candidate === "P0" || candidate === "P1" || candidate === "P2") {
-+ return candidate;
-+ }
++export interface CircuitState {
++ status: CircuitStateStatus;
++ failureTimestamps: number[];
++ timeoutTimestamps: number[];
++ openedAt?: number;
++ trialInFlight?: boolean;
++ nextProbeAt?: number;
++}
+
-+ return "P1";
-+ }
++export interface RateLimitWindowState {
++ limit: number;
++ remaining: number;
++ resetAt: number;
++ reserved: number;
++ nextAllowedAt: number;
++}
+
-+ private getQueue(priority: TrafficPriority): QueuedRequest[] {
-+ return this.queues[priority];
-+ }
++type BivariantHandler<TArgs extends unknown[]> = {
++ bivarianceHack(...args: TArgs): void;
++}["bivarianceHack"];
+
-+ private hasQueuedWork(): boolean {
-+ return this.priorityOrder.some((priority) => this.getQueue(priority).length > 0);
-+ }
++export interface QueuedRequest<TResponse> {
++ type: TrafficRequestType;
++ request: TrafficRequest<TResponse>;
++ resolve: BivariantHandler<[TResponse | PromiseLike<TResponse>]>;
++ reject: BivariantHandler<[reason?: unknown]>;
++ attempt: number;
++ priority: TrafficPriority;
++ tenantId: string;
++ enqueuedAt: number;
++ dispatchedAt?: number;
+
-+ private getQueueSize(): number {
-+ let size = 0;
-+ for (const priority of this.priorityOrder) {
-+ size += this.getQueue(priority).length;
-+ }
-+ return size;
-+ }
++ tenantConcurrencyKey?: string;
++ providerModelConcurrencyKey?: string;
+
-+ private scheduleRefill(limit: NormalizedRateLimit): void {
-+ if (this.refillTimeout) {
-+ return;
-+ }
++ rateLimitKey?: string;
++ etaMs?: number;
+
-+ const delayMs = Math.max(1, Math.ceil(1 / limit.refillPerMs)); // Wait long enough for at least one token
-+ this.logDebug("[TrafficController] scheduleRefill", { delayMs });
-+ this.refillTimeout = setTimeout(() => {
-+ this.refillTimeout = undefined; // Allow future refills to be scheduled
-+ this.logDebug("[TrafficController] refillTimeoutFired", {
-+ queueSize: this.getQueueSize(),
-+ active: this.activeCount,
-+ });
-+ this.scheduleDrain(); // Try draining again now that tokens should exist
-+ }, delayMs);
-+ }
++ circuitKey?: string;
++ circuitStatus?: CircuitStateStatus;
+
-+ private recordCircuitSuccess(metadata?: TrafficRequestMetadata): void {
-+ const key = this.buildRateLimitKey(metadata);
-+ if (this.circuitBreakers.has(key)) {
-+ this.circuitBreakers.delete(key);
-+ }
-+ }
++ extractUsage?: TrafficRequest<TResponse>["extractUsage"];
++}
+diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
+new file mode 100644
+index 00000000..b3f331b2
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-controller.spec.ts
+@@ -0,0 +1,281 @@
++import { describe, expect, it, vi } from "vitest";
++import { CIRCUIT_FAILURE_THRESHOLD } from "./traffic-constants";
++import { TrafficController } from "./traffic-controller";
+
-+ private recordCircuitFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void {
-+ const status = this.extractStatusCode(error);
-+ if (!this.isCircuitBreakerStatus(status)) {
-+ this.resetCircuitFailures(metadata);
-+ return;
-+ }
++describe("TrafficController priority scheduling", () => {
++ it("prioritizes P0 over lower priorities when runnable", async () => {
++ const controller = new TrafficController({ maxConcurrent: 1 });
++ const order: string[] = [];
+
-+ const key = 
this.buildRateLimitKey(metadata); -+ const now = Date.now(); -+ const state = -+ this.circuitBreakers.get(key) ?? -+ ({ -+ status: "closed", -+ failureTimestamps: [], -+ } as CircuitState); ++ const p1 = controller.handleText({ ++ metadata: { provider: "p", model: "m1", priority: "P1" }, ++ execute: async () => { ++ order.push("P1"); ++ return "P1"; ++ }, ++ }); + -+ const recentFailures = state.failureTimestamps.filter( -+ (timestamp) => now - timestamp <= CIRCUIT_FAILURE_WINDOW_MS, -+ ); -+ recentFailures.push(now); ++ const p2 = controller.handleText({ ++ metadata: { provider: "p", model: "m2", priority: "P2" }, ++ execute: async () => { ++ order.push("P2"); ++ return "P2"; ++ }, ++ }); + -+ if (state.status === "half-open") { -+ state.status = "open"; -+ state.openedAt = now; -+ state.trialInFlight = false; -+ state.failureTimestamps = [now]; -+ this.circuitBreakers.set(key, state); -+ this.logger.warn("Circuit reopened after half-open failure", { -+ circuitKey: key, -+ statusCode: status, -+ }); -+ return; -+ } ++ const p0 = controller.handleText({ ++ metadata: { provider: "p", model: "m0", priority: "P0" }, ++ execute: async () => { ++ order.push("P0"); ++ return "P0"; ++ }, ++ }); + -+ state.failureTimestamps = recentFailures; -+ if (state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD) { -+ state.status = "open"; -+ state.openedAt = now; -+ state.trialInFlight = false; -+ this.logger.warn("Circuit opened after consecutive failures", { -+ circuitKey: key, -+ failureCount: state.failureTimestamps.length, -+ statusCode: status, ++ await Promise.all([p0, p1, p2]); ++ ++ expect(order[0]).toBe("P0"); ++ expect(order).toEqual(["P0", "P1", "P2"]); ++ }); ++ ++ it("allows lower priorities to proceed when a higher priority request is rate limited", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ controller.updateRateLimitFromHeaders( ++ { provider: "p0", model: "m0" }, ++ { ++ "x-ratelimit-limit-requests": "1", ++ "x-ratelimit-remaining-requests": "0", ++ "x-ratelimit-reset-requests": "1s", ++ }, ++ ); ++ ++ const order: string[] = []; ++ ++ const p0 = controller.handleText({ ++ metadata: { provider: "p0", model: "m0", priority: "P0" }, ++ execute: async () => { ++ order.push("P0"); ++ return "P0"; ++ }, + }); -+ } + -+ this.circuitBreakers.set(key, state); -+ } ++ const p1 = controller.handleText({ ++ metadata: { provider: "p1", model: "m1", priority: "P1" }, ++ execute: async () => { ++ order.push("P1"); ++ return "P1"; ++ }, ++ }); + -+ private resetCircuitFailures(metadata?: TrafficRequestMetadata): void { -+ const key = this.buildRateLimitKey(metadata); -+ const state = this.circuitBreakers.get(key); -+ if (!state) { -+ return; -+ } ++ await vi.runAllTimersAsync(); ++ await Promise.all([p0, p1]); + -+ state.failureTimestamps = []; -+ if (state.status !== "open") { -+ state.status = "closed"; -+ state.trialInFlight = false; ++ expect(order[0]).toBe("P1"); ++ expect(order[1]).toBe("P0"); ++ } finally { ++ vi.useRealTimers(); + } ++ }); ++}); + -+ this.circuitBreakers.set(key, state); -+ } ++describe("TrafficController rate limit headers", () => { ++ it("parses OpenAI-style compound reset durations (e.g. 1m30.951s)", () => { ++ vi.useFakeTimers(); + -+ private recordUsageFromResult( -+ item: QueuedRequest, -+ result: TResponse, -+ ): void { -+ const extractor = item.extractUsage ?? 
item.request.extractUsage; -+ if (!extractor) { -+ return; ++ try { ++ vi.setSystemTime(new Date(1_000_000)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ const now = Date.now(); ++ ++ const result = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10000", ++ "x-ratelimit-remaining-requests": "9989", ++ "x-ratelimit-reset-requests": "1m30.951s", ++ }, ++ ); ++ ++ expect(result).toBeTruthy(); ++ expect(result?.headerSnapshot.resetRequestsMs).toBeCloseTo(90_951, 6); ++ expect(result?.state.limit).toBe(10000); ++ expect(result?.state.remaining).toBe(9989); ++ expect(result?.state.resetAt).toBe(now + 90_951); ++ expect(result?.state.reserved).toBe(0); ++ expect(result?.state.nextAllowedAt).toBe(now); ++ } finally { ++ vi.useRealTimers(); + } ++ }); ++ ++ it("keeps resetAt monotonic when headers shorten the reset duration", () => { ++ vi.useFakeTimers(); + + try { -+ const usageCandidate = extractor(result); -+ if (!usageCandidate) { -+ return; -+ } ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ ++ const first = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10000", ++ "x-ratelimit-remaining-requests": "9999", ++ "x-ratelimit-reset-requests": "60s", ++ }, ++ ); + -+ if (this.isPromiseLike(usageCandidate)) { -+ void Promise.resolve(usageCandidate) -+ .then((usage) => { -+ if (usage) { -+ this.incrementTenantUsage(item.tenantId, usage); -+ } -+ }) -+ .catch((error) => { -+ this.logger.debug("Failed to record tenant usage", { tenantId: item.tenantId, error }); -+ }); -+ return; -+ } ++ expect(first).toBeTruthy(); ++ expect(first?.state.resetAt).toBe(60_000); + -+ this.incrementTenantUsage(item.tenantId, usageCandidate as UsageCounters); -+ } catch (error) { -+ this.logger.debug("Failed to record tenant usage", { tenantId: item.tenantId, error }); ++ vi.setSystemTime(new Date(10_000)); ++ const second = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10000", ++ "x-ratelimit-remaining-requests": "9998", ++ "x-ratelimit-reset-requests": "5s", ++ }, ++ ); ++ ++ expect(second).toBeTruthy(); ++ expect(second?.state.resetAt).toBe(60_000); ++ } finally { ++ vi.useRealTimers(); + } -+ } ++ }); + -+ private incrementTenantUsage(tenantId: string, usage: UsageCounters): void { -+ const current = this.tenantUsage.get(tenantId) ?? { -+ inputTokens: 0, -+ outputTokens: 0, -+ totalTokens: 0, -+ }; -+ const inputTokens = usage.inputTokens ?? 0; -+ const outputTokens = usage.outputTokens ?? 0; -+ const totalTokens = usage.totalTokens ?? 
inputTokens + outputTokens; -+ const updated: TenantUsage = { -+ inputTokens: current.inputTokens + inputTokens, -+ outputTokens: current.outputTokens + outputTokens, -+ totalTokens: current.totalTokens + totalTokens, -+ }; -+ this.tenantUsage.set(tenantId, updated); -+ this.logger.debug("Recorded tenant usage", { tenantId, usage: updated }); -+ } ++ it("never increases remaining within the same window", () => { ++ vi.useFakeTimers(); + -+ private isPromiseLike(value: unknown): value is PromiseLike { -+ return ( -+ typeof value === "object" && -+ value !== null && -+ typeof (value as PromiseLike).then === "function" -+ ); -+ } ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ ++ const first = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10", ++ "x-ratelimit-remaining-requests": "9", ++ "x-ratelimit-reset-requests": "60s", ++ }, ++ ); + -+ private isCircuitBreakerStatus(status?: number): boolean { -+ if (status === 429) { -+ return true; -+ } ++ expect(first?.state.remaining).toBe(9); ++ expect(first?.state.resetAt).toBe(60_000); + -+ return status !== undefined && status >= 500 && status < 600; -+ } ++ vi.setSystemTime(new Date(10_000)); ++ const second = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10", ++ "x-ratelimit-remaining-requests": "8", ++ "x-ratelimit-reset-requests": "50s", ++ }, ++ ); + -+ private async runRequest(item: QueuedRequest): Promise { -+ const attempt = item.attempt ?? 1; ++ expect(second?.state.remaining).toBe(8); ++ expect(second?.state.resetAt).toBe(60_000); + -+ this.logDebug("[TrafficController] runRequest start", { -+ type: item.type, -+ rateLimitKey: item.rateLimitKey, -+ etaMs: item.etaMs, -+ active: this.activeCount, -+ queueSize: this.getQueueSize(), -+ }); ++ vi.setSystemTime(new Date(20_000)); ++ const third = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10", ++ "x-ratelimit-remaining-requests": "9", ++ "x-ratelimit-reset-requests": "40s", ++ }, ++ ); + -+ try { -+ const result = await item.request.execute(); // Execute the user's operation -+ this.recordCircuitSuccess(item.request.metadata); -+ this.recordUsageFromResult(item, result); -+ item.resolve(result); // Deliver successful result back to the waiting caller -+ } catch (error) { -+ this.recordCircuitFailure(item.request.metadata, error); -+ const retryPlan = this.buildRetryPlan(error, attempt); -+ if (retryPlan) { -+ this.scheduleRetry(item, attempt + 1, retryPlan.delayMs, retryPlan.reason); -+ } else { -+ item.reject(error); // Surface failures to the caller -+ } ++ expect(third?.state.remaining).toBe(8); ++ expect(third?.state.resetAt).toBe(60_000); + } finally { -+ this.activeCount = Math.max(0, this.activeCount - 1); // Ensure counter never underflows -+ this.logDebug("[TrafficController] runRequest complete", { -+ type: item.type, -+ active: this.activeCount, -+ queueSize: this.getQueueSize(), -+ }); -+ this.scheduleDrain(); // Immediately try to pull the next request ++ vi.useRealTimers(); + } -+ } ++ }); + -+ private buildRetryPlan( -+ error: unknown, -+ attempt: number, -+ ): { delayMs: number; reason: RetryReason } | undefined { -+ const reason = this.getRetryReason(error); -+ if (!reason) { -+ return undefined; -+ } ++ it("applies Retry-After even when 
x-ratelimit headers are missing", async () => { ++ vi.useFakeTimers(); + -+ const maxAttempts = reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS; -+ if (attempt >= maxAttempts) { -+ return undefined; -+ } ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ const order: string[] = []; + -+ return { -+ reason, -+ delayMs: this.computeBackoffDelay(reason, attempt), -+ }; -+ } ++ controller.updateRateLimitFromHeaders( ++ { provider: "p", model: "m" }, ++ { ++ "retry-after": "2", ++ }, ++ ); + -+ private getRetryReason(error: unknown): RetryReason | undefined { -+ const statusCode = this.extractStatusCode(error); -+ if (statusCode === 429) { -+ return "rateLimit"; -+ } ++ const p0 = controller.handleText({ ++ metadata: { provider: "p", model: "m", priority: "P0" }, ++ execute: async () => { ++ order.push("P0"); ++ return "P0"; ++ }, ++ }); + -+ if (statusCode !== undefined && statusCode >= 500 && statusCode < 600) { -+ return "serverError"; -+ } ++ await vi.advanceTimersByTimeAsync(1_999); ++ expect(order).toEqual([]); + -+ if (statusCode === 408 || this.isTimeoutError(error)) { -+ return "timeout"; ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await p0; ++ expect(order).toEqual(["P0"]); ++ } finally { ++ vi.useRealTimers(); + } ++ }); ++}); + -+ return undefined; -+ } ++describe("TrafficController stream reporting", () => { ++ it("treats post-start stream failures as circuit breaker failures", async () => { ++ const controller = new TrafficController({ ++ maxConcurrent: 1, ++ fallbackChains: { ++ primary: ["fallback"], ++ }, ++ }); ++ const tenantId = "tenant-1"; ++ const metadata = { provider: "p", model: "primary", priority: "P1" as const }; + -+ private extractStatusCode(error: unknown): number | undefined { -+ if (!error || typeof error !== "object") { -+ return undefined; -+ } ++ await controller.handleStream({ ++ tenantId, ++ metadata, ++ execute: async () => ({ ok: true }), ++ }); + -+ const candidate = error as { status?: unknown; statusCode?: unknown; httpStatus?: unknown }; -+ const directStatus = -+ this.coerceStatus(candidate.status) ?? -+ this.coerceStatus(candidate.statusCode) ?? -+ this.coerceStatus(candidate.httpStatus); -+ if (directStatus !== undefined) { -+ return directStatus; ++ for (let i = 0; i < CIRCUIT_FAILURE_THRESHOLD; i += 1) { ++ controller.reportStreamFailure(metadata, new Error("stream-failure")); + } + -+ const responseStatus = (error as { response?: { status?: unknown } }).response?.status; -+ const normalizedResponseStatus = this.coerceStatus(responseStatus); -+ if (normalizedResponseStatus !== undefined) { -+ return normalizedResponseStatus; -+ } ++ const order: string[] = []; ++ await controller.handleStream({ ++ tenantId, ++ metadata, ++ execute: async () => { ++ order.push("primary"); ++ return "primary"; ++ }, ++ createFallbackRequest: (target) => ({ ++ tenantId, ++ metadata: { ++ provider: "p", ++ model: typeof target === "string" ? target : target.model, ++ priority: "P1", ++ }, ++ execute: async () => { ++ const modelId = typeof target === "string" ? target : target.model; ++ order.push(modelId); ++ return modelId; ++ }, ++ }), ++ }); + -+ const causeStatus = (error as { cause?: { status?: unknown; statusCode?: unknown } }).cause; -+ if (causeStatus) { -+ const normalizedCauseStatus = -+ this.coerceStatus(causeStatus.status) ?? 
this.coerceStatus(causeStatus.statusCode); -+ if (normalizedCauseStatus !== undefined) { -+ return normalizedCauseStatus; -+ } -+ } ++ expect(order).toEqual(["fallback"]); ++ }); ++}); +diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts +new file mode 100644 +index 00000000..c26b914d +--- /dev/null ++++ b/packages/core/src/traffic/traffic-controller.ts +@@ -0,0 +1,1231 @@ ++import type { Logger } from "../logger"; ++import { LoggerProxy } from "../logger"; ++import { TrafficCircuitBreaker } from "./traffic-circuit-breaker"; ++import { TrafficConcurrencyLimiter } from "./traffic-concurrency-limiter"; ++import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal"; ++import { ++ CircuitBreakerOpenError, ++ QueueWaitTimeoutError, ++ RateLimitedUpstreamError, ++ normalizeRateLimitError, ++} from "./traffic-errors"; ++import { ++ OpenAIWindowRateLimitStrategy, ++ type RateLimitUpdateResult, ++ TokenBucketRateLimitStrategy, ++ TrafficRateLimiter, ++} from "./traffic-rate-limiter"; ++import { buildRetryPlanWithPolicy } from "./traffic-retry"; ++import type { ++ AdaptiveLimiterConfig, ++ FallbackChainEntry, ++ FallbackPolicy, ++ FallbackPolicyConfig, ++ FallbackPolicyMode, ++ FallbackTarget, ++ PriorityBurstLimits, ++ ProviderModelConcurrencyLimit, ++ RateLimitConfig, ++ RateLimitKey, ++ RateLimitStrategyConfig, ++ RateLimitStrategyKind, ++ RetryPlan, ++ RetryPolicyConfig, ++ TenantConcurrencyLimit, ++ TenantUsage, ++ TrafficControllerOptions, ++ TrafficPriority, ++ TrafficRequest, ++ TrafficRequestMetadata, ++ TrafficRequestType, ++ TrafficResponseMetadata, ++} from "./traffic-types"; ++import { TrafficUsageTracker } from "./traffic-usage-tracker"; ++ ++/* ============================================================ ++ * Traffic Controller ++ * ============================================================ ++ */ + -+ return undefined; -+ } ++export type { ++ AdaptiveLimiterConfig, ++ FallbackChainEntry, ++ FallbackPolicy, ++ FallbackPolicyConfig, ++ FallbackPolicyMode, ++ FallbackTarget, ++ PriorityBurstLimits, ++ ProviderModelConcurrencyLimit, ++ RateLimitConfig, ++ RateLimitKey, ++ RateLimitStrategyConfig, ++ RateLimitStrategyKind, ++ TenantConcurrencyLimit, ++ TenantUsage, ++ TrafficControllerOptions, ++ TrafficPriority, ++ TrafficRequest, ++ TrafficRequestMetadata, ++ TrafficResponseMetadata, ++ TrafficRequestType, ++}; + -+ private isTimeoutError(error: unknown): boolean { -+ const candidates = [error, (error as { cause?: unknown })?.cause]; ++export { CircuitBreakerOpenError }; ++export { QueueWaitTimeoutError }; ++export { RateLimitedUpstreamError }; + -+ for (const candidate of candidates) { -+ if (!candidate || typeof candidate !== "object") { -+ continue; -+ } ++type TenantQueueState = { ++ order: string[]; ++ index: number; ++ queues: Map; ++}; + -+ const timeoutCode = (candidate as { code?: unknown }).code; -+ if (typeof timeoutCode === "string" && timeoutCode.toLowerCase().includes("timeout")) { -+ return true; -+ } ++type RateLimitSnapshot = { ++ limit?: number; ++ remaining?: number; ++ resetAt?: number; ++ nextAllowedAt?: number; ++ retryAfterMs?: number; ++}; + -+ const name = (candidate as { name?: unknown }).name; -+ if (typeof name === "string" && name.toLowerCase().includes("timeout")) { -+ return true; -+ } ++type AdaptiveLimiterState = { ++ recent429s: number[]; ++ penaltyMs: number; ++ cooldownUntil?: number; ++ last429At?: number; ++}; + -+ const message = (candidate as { message?: 
unknown }).message; -+ if (typeof message === "string" && message.toLowerCase().includes("timeout")) { -+ return true; -+ } -+ } ++const DEFAULT_PRIORITY_BURST_LIMITS: Record = { ++ P0: 5, ++ P1: 3, ++ P2: 2, ++}; + -+ return false; -+ } ++const DEFAULT_ADAPTIVE_LIMITER: Required = { ++ windowMs: 30_000, ++ threshold: 3, ++ minPenaltyMs: 500, ++ maxPenaltyMs: 10_000, ++ penaltyMultiplier: 2, ++ decayMs: 10_000, ++}; + -+ private coerceStatus(value: unknown): number | undefined { -+ if (typeof value === "number" && Number.isFinite(value)) { -+ return value; -+ } ++export class TrafficController { ++ /* ---------- Core ---------- */ + -+ if (typeof value === "string") { -+ const parsed = Number(value); -+ if (Number.isFinite(parsed)) { -+ return parsed; -+ } -+ } ++ private readonly scheduler: Scheduler; ++ private readonly maxConcurrent: number; ++ private readonly rateLimitKeyBuilder: (metadata?: TrafficRequestMetadata) => string; ++ private readonly retryPolicy?: RetryPolicyConfig; ++ private readonly logger: Logger; ++ private readonly trafficLogger: Logger; ++ private readonly controllerLogger: Logger; ++ private readonly concurrencyLimiter: TrafficConcurrencyLimiter; ++ ++ private readonly queues: Record = { ++ P0: { order: [], index: 0, queues: new Map() }, ++ P1: { order: [], index: 0, queues: new Map() }, ++ P2: { order: [], index: 0, queues: new Map() }, ++ }; ++ private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"]; ++ private readonly priorityBurstLimits: Record; ++ private readonly priorityBurstCounts: Record = { ++ P0: 0, ++ P1: 0, ++ P2: 0, ++ }; + -+ return undefined; -+ } ++ private activeCount = 0; ++ private drainScheduled = false; + -+ private computeBackoffDelay(reason: RetryReason, attempt: number): number { -+ const base = -+ reason === "serverError" -+ ? SERVER_ERROR_BASE_BACKOFF_MS -+ : reason === "timeout" -+ ? TIMEOUT_BASE_BACKOFF_MS -+ : RATE_LIMIT_BASE_BACKOFF_MS; ++ /* ---------- Rate limits ---------- */ ++ private readonly rateLimiter: TrafficRateLimiter; + -+ const jitterFactor = -+ reason === "serverError" -+ ? SERVER_ERROR_JITTER_FACTOR -+ : reason === "timeout" -+ ? TIMEOUT_JITTER_FACTOR -+ : RATE_LIMIT_JITTER_FACTOR; ++ /* ---------- Circuit breakers ---------- */ ++ private readonly circuitBreaker: TrafficCircuitBreaker; + -+ const exponential = base * 2 ** Math.max(0, attempt - 1); -+ const jitter = exponential * jitterFactor * Math.random(); -+ return Math.max(1, Math.round(exponential + jitter)); -+ } ++ /* ---------- Usage ---------- */ ++ private readonly usageTracker = new TrafficUsageTracker(); + -+ private scheduleRetry( -+ item: QueuedRequest, -+ nextAttempt: number, -+ delayMs: number, -+ reason: RetryReason, -+ ): void { -+ this.logger.debug("Retrying request through controller", { -+ reason, -+ delayMs, -+ attempt: nextAttempt, -+ maxAttempts: reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS, -+ metadata: item.request.metadata, ++ /* ---------- Traffic metadata ---------- */ ++ private readonly rateLimitSnapshots = new Map(); ++ ++ /* ---------- Adaptive limiter ---------- */ ++ private readonly adaptiveLimiterConfig: Required; ++ private readonly adaptiveLimiterState = new Map(); ++ ++ constructor(options: TrafficControllerOptions = {}) { ++ this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; ++ this.scheduler = this.createScheduler(); ++ this.rateLimitKeyBuilder = options.rateLimitKeyBuilder ?? 
buildRateLimitKeyFromMetadata; ++ this.retryPolicy = options.retryPolicy; ++ this.priorityBurstLimits = { ++ ...DEFAULT_PRIORITY_BURST_LIMITS, ++ ...(options.priorityBurstLimits ?? {}), ++ }; ++ this.adaptiveLimiterConfig = { ++ ...DEFAULT_ADAPTIVE_LIMITER, ++ ...(options.adaptiveLimiter ?? {}), ++ }; ++ this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); ++ this.trafficLogger = this.logger.child({ subsystem: "traffic" }); ++ this.controllerLogger = this.trafficLogger.child({ module: "controller" }); ++ const rateLimits = options.rateLimits; ++ const rateLimitStrategy = options.rateLimitStrategy; ++ this.rateLimiter = new TrafficRateLimiter(() => this.scheduleDrain(), { ++ rateLimits, ++ strategyFactory: (key) => { ++ const strategyKind = this.resolveRateLimitStrategy(key, rateLimitStrategy); ++ if (strategyKind === "window") { ++ return new OpenAIWindowRateLimitStrategy(key, rateLimits?.[key]); ++ } ++ return new TokenBucketRateLimitStrategy(key, rateLimits?.[key]); ++ }, ++ }); ++ this.circuitBreaker = new TrafficCircuitBreaker({ ++ fallbackChains: options.fallbackChains, ++ fallbackPolicy: options.fallbackPolicy, ++ buildRateLimitKey: (metadata) => this.buildRateLimitKey(metadata), ++ }); ++ this.concurrencyLimiter = new TrafficConcurrencyLimiter({ ++ buildProviderModelKey: (metadata) => this.buildRateLimitKey(metadata), ++ maxConcurrentPerProviderModel: options.maxConcurrentPerProviderModel, ++ maxConcurrentPerTenant: options.maxConcurrentPerTenant, + }); + -+ setTimeout(() => { -+ const retryPriority = item.priority; -+ this.getQueue(retryPriority).push({ -+ ...item, -+ attempt: nextAttempt, -+ etaMs: undefined, -+ rateLimitKey: undefined, -+ circuitKey: undefined, -+ circuitStatus: undefined, -+ }); -+ this.scheduleDrain(); -+ }, delayMs); ++ this.controllerLogger.debug("Initialized TrafficController", { ++ maxConcurrent: this.maxConcurrent, ++ hasFallbackChains: !!options.fallbackChains, ++ hasFallbackPolicy: options.fallbackPolicy !== undefined, ++ hasProviderModelConcurrency: options.maxConcurrentPerProviderModel !== undefined, ++ hasTenantConcurrency: options.maxConcurrentPerTenant !== undefined, ++ hasConfigRateLimits: options.rateLimits !== undefined, ++ hasStrategyOverrides: options.rateLimitStrategy !== undefined, ++ hasRetryPolicy: options.retryPolicy !== undefined, ++ hasPriorityBurstLimits: options.priorityBurstLimits !== undefined, ++ hasAdaptiveLimiter: options.adaptiveLimiter !== undefined, ++ }); + } -+} + -+let singletonController: TrafficController | undefined; ++ /* ============================================================ ++ * Public API ++ * ============================================================ ++ */ + -+export class CircuitBreakerOpenError extends Error { -+ readonly retryAfterMs?: number; -+ readonly metadata?: TrafficRequestMetadata; ++ handleText(request: TrafficRequest): Promise { ++ this.controllerLogger.trace("handleText called", { ++ tenantId: request.tenantId, ++ provider: request.metadata?.provider, ++ model: request.metadata?.model, ++ priority: request.metadata?.priority, ++ }); ++ return this.enqueue("text", request); ++ } + -+ constructor(message: string, metadata?: TrafficRequestMetadata, retryAfterMs?: number) { -+ super(message); -+ this.name = "CircuitBreakerOpenError"; -+ this.metadata = metadata; ++ handleStream(request: TrafficRequest): Promise { ++ this.controllerLogger.trace("handleStream called", { ++ tenantId: request.tenantId, ++ provider: request.metadata?.provider, ++ model: 
request.metadata?.model, ++ priority: request.metadata?.priority, ++ }); ++ return this.enqueue("stream", request); ++ } ++ ++ reportStreamSuccess(metadata?: TrafficRequestMetadata): void { ++ this.controllerLogger.debug("Stream reported success", { ++ provider: metadata?.provider, ++ model: metadata?.model, ++ tenantId: metadata?.tenantId, ++ priority: metadata?.priority, ++ }); ++ this.circuitBreaker.recordSuccess(metadata, this.trafficLogger); ++ const rateLimitKey = this.buildRateLimitKey(metadata); ++ const adaptiveKey = this.buildAdaptiveKey( ++ metadata, ++ metadata?.tenantId ?? "default", ++ rateLimitKey, ++ ); ++ this.recordAdaptiveSuccess(adaptiveKey); ++ } ++ ++ reportStreamFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void { ++ this.controllerLogger.warn("Stream reported failure", { ++ provider: metadata?.provider, ++ model: metadata?.model, ++ tenantId: metadata?.tenantId, ++ priority: metadata?.priority, ++ errorName: (error as { name?: unknown } | null)?.name, ++ errorMessage: (error as { message?: unknown } | null)?.message, ++ status: (error as { status?: unknown } | null)?.status, ++ statusCode: (error as { statusCode?: unknown } | null)?.statusCode, ++ }); ++ this.circuitBreaker.recordFailure(metadata, error, this.trafficLogger); ++ const rateLimitKey = this.buildRateLimitKey(metadata); ++ const adaptiveKey = this.buildAdaptiveKey( ++ metadata, ++ metadata?.tenantId ?? "default", ++ rateLimitKey, ++ ); ++ if (error instanceof RateLimitedUpstreamError) { ++ this.recordAdaptiveRateLimitHit(adaptiveKey, error.retryAfterMs); ++ } ++ this.attachTrafficMetadata( ++ error, ++ this.buildTrafficResponseMetadataFromMetadata(metadata, rateLimitKey, Date.now(), error), ++ ); ++ } ++ ++ updateRateLimitFromHeaders( ++ metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ ): RateLimitUpdateResult | undefined { ++ const key = this.buildRateLimitKey(metadata); ++ this.controllerLogger.debug("updateRateLimitFromHeaders called", { ++ rateLimitKey: key, ++ provider: metadata?.provider, ++ model: metadata?.model, ++ }); ++ ++ const update = this.rateLimiter.updateFromHeaders(metadata, headers, key, this.trafficLogger); ++ if (!update) { ++ this.controllerLogger.debug("updateRateLimitFromHeaders skipped (no headers applied)", { ++ rateLimitKey: key, ++ }); ++ return undefined; ++ } ++ ++ this.controllerLogger.debug("Rate limit headers applied", { ++ rateLimitKey: update.key, ++ limit: update.state.limit, ++ remaining: update.state.remaining, ++ reserved: update.state.reserved, ++ resetAt: update.state.resetAt, ++ nextAllowedAt: update.state.nextAllowedAt, ++ resetRequestsMs: update.headerSnapshot.resetRequestsMs, ++ }); ++ ++ this.rateLimitSnapshots.set(update.key, { ++ limit: update.state.limit, ++ remaining: update.state.remaining, ++ resetAt: update.state.resetAt, ++ nextAllowedAt: update.state.nextAllowedAt, ++ retryAfterMs: update.headerSnapshot.retryAfterMs, ++ }); ++ ++ return update; ++ } ++ ++ getTenantUsage(tenantId: string): TenantUsage | undefined { ++ this.controllerLogger.trace("getTenantUsage called", { tenantId }); ++ return this.usageTracker.getTenantUsage(tenantId); ++ } ++ ++ /* ============================================================ ++ * Scheduler & Queue ++ * ============================================================ ++ */ ++ ++ private createScheduler(): Scheduler { ++ return typeof queueMicrotask === "function" ? 
queueMicrotask : (cb) => setTimeout(cb, 0); ++ } ++ ++ private enqueue( ++ type: TrafficRequestType, ++ request: TrafficRequest, ++ ): Promise { ++ return new Promise((resolve, reject) => { ++ const priority = this.resolvePriority(request.metadata); ++ const tenantId = this.resolveTenantId(request); ++ this.controllerLogger.debug("Enqueue request", { ++ type, ++ tenantId, ++ priority, ++ provider: request.metadata?.provider, ++ model: request.metadata?.model, ++ }); ++ this.enqueueItem({ ++ type, ++ request, ++ resolve, ++ reject, ++ attempt: 1, ++ priority, ++ tenantId, ++ enqueuedAt: Date.now(), ++ extractUsage: request.extractUsage, ++ }); ++ this.scheduleDrain(); ++ }); ++ } ++ ++ private scheduleDrain(): void { ++ if (this.drainScheduled) return; ++ this.drainScheduled = true; ++ ++ this.controllerLogger.trace("Drain scheduled"); ++ this.scheduler(() => { ++ this.drainScheduled = false; ++ this.controllerLogger.trace("Drain tick"); ++ this.drainQueue(); ++ }); ++ } ++ ++ private drainQueue(): void { ++ this.controllerLogger.trace("Drain start", { ++ activeCount: this.activeCount, ++ maxConcurrent: this.maxConcurrent, ++ queuedP0: this.getQueuedCount("P0"), ++ queuedP1: this.getQueuedCount("P1"), ++ queuedP2: this.getQueuedCount("P2"), ++ }); ++ while (true) { ++ const decision = this.tryDispatchNext(); ++ this.controllerLogger.trace("Dispatch decision", decision); ++ if (decision.kind === "dispatch" || decision.kind === "skip") continue; ++ if (decision.kind === "wait") { ++ if (decision.wakeUpAt) { ++ this.controllerLogger.debug("Rate limit wait; scheduling wakeup", { ++ wakeUpAt: decision.wakeUpAt, ++ inMs: Math.max(0, decision.wakeUpAt - Date.now()), ++ }); ++ this.scheduleRateLimitWakeUpAt(decision.wakeUpAt); ++ } ++ return; ++ } ++ return; ++ } ++ } ++ ++ /* ============================================================ ++ * Dispatch ++ * ============================================================ ++ */ ++ ++ private tryDispatchNext(): DispatchDecision { ++ if (this.activeCount >= this.maxConcurrent) return { kind: "wait" }; ++ ++ let earliestWakeUpAt: number | undefined; ++ ++ const observeWakeUpAt = (candidate?: number): void => { ++ if (candidate === undefined) return; ++ earliestWakeUpAt = ++ earliestWakeUpAt === undefined ? 
candidate : Math.min(earliestWakeUpAt, candidate); ++ }; ++ ++ const priorities = this.getPriorityDispatchOrder(); ++ for (const priority of priorities) { ++ const state = this.queues[priority]; ++ if (state.order.length === 0) continue; ++ ++ let attempts = 0; ++ const maxAttempts = state.order.length; ++ ++ while (attempts < maxAttempts) { ++ const candidate = this.getNextTenantCandidate(priority); ++ if (!candidate) break; ++ attempts += 1; ++ ++ const { item: next, queue, tenantId } = candidate; ++ const now = Date.now(); ++ const queueTimeoutAt = this.resolveQueueTimeoutAt(next); ++ const queueTimeoutTriggered = this.handleQueueTimeout(next, queue, 0, now, queueTimeoutAt); ++ if (queueTimeoutTriggered === "rejected") { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ if (queueTimeoutAt !== undefined && now < queueTimeoutAt) { ++ observeWakeUpAt(queueTimeoutAt); ++ } ++ const queueTimeoutExpired = queueTimeoutTriggered === "expired"; ++ ++ this.controllerLogger.trace("Evaluate next queued request", { ++ priority, ++ tenantId: next.tenantId, ++ type: next.type, ++ attempt: next.attempt, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ queueLength: queue.length, ++ }); ++ ++ const circuit = this.resolveCircuit(next); ++ if (circuit) { ++ this.controllerLogger.trace("Circuit resolution returned decision", { ++ priority, ++ decision: circuit, ++ circuitKey: next.circuitKey, ++ circuitStatus: next.circuitStatus, ++ }); ++ if (circuit.kind === "skip") { ++ queue.shift(); ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ if (circuit.kind === "wait") { ++ if ( ++ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "circuit wait") ++ ) { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ next.etaMs = ++ circuit.wakeUpAt !== undefined ? Math.max(0, circuit.wakeUpAt - now) : undefined; ++ observeWakeUpAt(circuit.wakeUpAt); ++ continue; ++ } ++ } ++ ++ const concurrency = this.concurrencyLimiter.resolve(next, this.trafficLogger); ++ if (concurrency.kind === "wait") { ++ this.controllerLogger.trace("Concurrency gate blocked request", { ++ priority, ++ tenantId: next.tenantId, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ reasons: concurrency.reasons, ++ }); ++ if ( ++ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "concurrency wait") ++ ) { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ next.etaMs = undefined; ++ continue; ++ } ++ ++ const adaptive = this.resolveAdaptiveLimit(next, now); ++ if (adaptive?.kind === "wait") { ++ if ( ++ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "adaptive wait") ++ ) { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ next.etaMs = ++ adaptive.wakeUpAt !== undefined ? Math.max(0, adaptive.wakeUpAt - now) : undefined; ++ observeWakeUpAt(adaptive.wakeUpAt); ++ continue; ++ } ++ ++ const rateLimit = this.resolveRateLimit(next); ++ if (rateLimit) { ++ this.controllerLogger.trace("Rate limit resolution returned decision", { ++ priority, ++ decision: rateLimit, ++ rateLimitKey: next.rateLimitKey ?? 
this.buildRateLimitKey(next.request.metadata), ++ }); ++ if (rateLimit.kind === "wait") { ++ if ( ++ this.rejectIfQueueTimedOut( ++ queueTimeoutExpired, ++ next, ++ queue, ++ 0, ++ now, ++ "rate limit wait", ++ ) ++ ) { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ next.etaMs = ++ rateLimit.wakeUpAt !== undefined ? Math.max(0, rateLimit.wakeUpAt - now) : undefined; ++ observeWakeUpAt(rateLimit.wakeUpAt); ++ } ++ continue; ++ } ++ ++ if (queueTimeoutExpired) { ++ const timeoutError = this.createQueueTimeoutError(next, now); ++ this.attachTrafficMetadata( ++ timeoutError, ++ this.buildTrafficResponseMetadata( ++ next, ++ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), ++ now, ++ timeoutError, ++ ), ++ ); ++ this.controllerLogger.warn("Queue wait timed out before dispatch", { ++ tenantId: next.tenantId, ++ waitedMs: timeoutError.waitedMs, ++ maxQueueWaitMs: timeoutError.maxQueueWaitMs, ++ deadlineAt: timeoutError.deadlineAt, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ rateLimitKey: timeoutError.rateLimitKey, ++ }); ++ queue.shift(); ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ next.reject(timeoutError); ++ return { kind: "skip" }; ++ } ++ ++ this.startRequest(next, queue, tenantId); ++ return { kind: "dispatch" }; ++ } ++ } ++ ++ return earliestWakeUpAt !== undefined ++ ? { kind: "wait", wakeUpAt: earliestWakeUpAt } ++ : { kind: "wait" }; ++ } ++ ++ private startRequest(item: QueuedRequest, queue: QueuedRequest[], tenantId: string): void { ++ this.controllerLogger.debug("Start request", { ++ priority: item.priority, ++ type: item.type, ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ }); ++ item.dispatchedAt = Date.now(); ++ queue.shift(); ++ this.cleanupTenantQueue(item.priority, tenantId, queue); ++ this.recordPriorityDispatch(item.priority); ++ this.activeCount++; ++ this.concurrencyLimiter.acquire(item, this.trafficLogger); ++ this.rateLimiter.notifyDispatch(item.rateLimitKey, this.trafficLogger); ++ this.circuitBreaker.markTrial(item, this.trafficLogger); ++ void this.executeRequest(item); ++ } ++ ++ /* ============================================================ ++ * Execution ++ * ============================================================ ++ */ ++ ++ private async executeRequest(item: QueuedRequest): Promise { ++ const startedAt = Date.now(); ++ try { ++ this.controllerLogger.debug("Execute request", { ++ priority: item.priority, ++ type: item.type, ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ rateLimitKey: item.rateLimitKey, ++ circuitKey: item.circuitKey, ++ circuitStatus: item.circuitStatus, ++ activeCount: this.activeCount, ++ }); ++ const result = await item.request.execute(); ++ const rateLimitKey = item.rateLimitKey ?? 
this.buildRateLimitKey(item.request.metadata); ++ const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey); ++ this.controllerLogger.debug("Request succeeded", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ elapsedMs: Date.now() - startedAt, ++ }); ++ if (item.type === "stream") { ++ this.controllerLogger.trace("Stream started successfully", { ++ tenantId: item.tenantId, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ }); ++ } else { ++ this.circuitBreaker.recordSuccess(item.request.metadata, this.trafficLogger); ++ } ++ const usage = this.usageTracker.recordUsage(item, result, this.trafficLogger); ++ this.rateLimiter.recordUsage(rateLimitKey, usage, this.trafficLogger); ++ this.recordAdaptiveSuccess(adaptiveKey); ++ this.attachTrafficMetadata( ++ result, ++ this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now()), ++ ); ++ item.resolve(result); ++ } catch (error) { ++ const rateLimitKey = item.rateLimitKey ?? this.buildRateLimitKey(item.request.metadata); ++ const normalizedRateLimitError = normalizeRateLimitError({ ++ error, ++ metadata: item.request.metadata, ++ tenantId: item.tenantId, ++ key: rateLimitKey, ++ logger: this.trafficLogger, ++ }); ++ const errorForHandling = normalizedRateLimitError ?? error; ++ const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey); ++ if (errorForHandling instanceof RateLimitedUpstreamError) { ++ this.recordAdaptiveRateLimitHit(adaptiveKey, errorForHandling.retryAfterMs); ++ } ++ ++ this.controllerLogger.warn("Request failed", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ elapsedMs: Date.now() - startedAt, ++ errorName: (error as { name?: unknown } | null)?.name, ++ errorMessage: (error as { message?: unknown } | null)?.message, ++ status: (error as { status?: unknown } | null)?.status, ++ statusCode: (error as { statusCode?: unknown } | null)?.statusCode, ++ }); ++ this.circuitBreaker.recordFailure( ++ item.request.metadata, ++ errorForHandling, ++ this.trafficLogger, ++ ); ++ this.attachTrafficMetadata( ++ errorForHandling, ++ this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now(), errorForHandling), ++ ); ++ ++ const retry = buildRetryPlanWithPolicy( ++ { ++ error: errorForHandling, ++ attempt: item.attempt, ++ metadata: item.request.metadata, ++ key: rateLimitKey, ++ logger: this.trafficLogger, ++ }, ++ this.retryPolicy, ++ ); ++ if (retry) { ++ if (!this.canRetryWithinDeadline(item, retry.delayMs)) { ++ this.controllerLogger.debug("Retry skipped; deadline exceeded", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ deadlineAt: item.request.deadlineAt, ++ delayMs: retry.delayMs, ++ }); ++ item.reject(errorForHandling); ++ } else { ++ this.controllerLogger.debug("Retrying request", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ nextAttempt: item.attempt + 1, ++ reason: retry.reason, ++ delayMs: retry.delayMs, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ }); ++ this.scheduleRetry(item, retry); ++ } ++ } else { ++ this.controllerLogger.debug("No retry plan; rejecting request", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, 
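++ // provider/model pin this rejection to the same circuit and rate-limit key used at dispatch
++ // (key format comes from buildRateLimitKeyFromMetadata at the bottom of this file)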
++ model: item.request.metadata?.model, ++ }); ++ item.reject(errorForHandling); ++ } ++ } finally { ++ this.rateLimiter.releaseReservation(item.rateLimitKey, this.trafficLogger); ++ this.concurrencyLimiter.release(item, this.trafficLogger); ++ this.activeCount = Math.max(0, this.activeCount - 1); ++ this.controllerLogger.trace("Request finished; slot released", { ++ tenantId: item.tenantId, ++ activeCount: this.activeCount, ++ maxConcurrent: this.maxConcurrent, ++ }); ++ this.scheduleDrain(); ++ } ++ } ++ ++ /* ============================================================ ++ * Retry logic ++ * ============================================================ ++ */ ++ ++ private scheduleRetry(item: QueuedRequest, plan: RetryPlan): void { ++ this.controllerLogger.debug("Schedule retry", { ++ tenantId: item.tenantId, ++ priority: item.priority, ++ currentAttempt: item.attempt, ++ nextAttempt: item.attempt + 1, ++ reason: plan.reason, ++ delayMs: plan.delayMs, ++ }); ++ setTimeout(() => { ++ this.controllerLogger.debug("Retry timer fired", { ++ tenantId: item.tenantId, ++ priority: item.priority, ++ nextAttempt: item.attempt + 1, ++ }); ++ this.enqueueItem({ ++ ...item, ++ attempt: item.attempt + 1, ++ enqueuedAt: Date.now(), ++ dispatchedAt: undefined, ++ tenantConcurrencyKey: undefined, ++ providerModelConcurrencyKey: undefined, ++ rateLimitKey: undefined, ++ etaMs: undefined, ++ circuitKey: undefined, ++ circuitStatus: undefined, ++ }); ++ this.scheduleDrain(); ++ }, plan.delayMs); ++ } ++ ++ private canRetryWithinDeadline(item: QueuedRequest, delayMs: number): boolean { ++ const deadlineAt = item.request.deadlineAt; ++ if (!deadlineAt) return true; ++ const nextAttemptAt = Date.now() + delayMs; ++ return nextAttemptAt <= deadlineAt; ++ } ++ ++ /* ============================================================ ++ * Rate limiting (verbatim logic) ++ * ============================================================ ++ */ ++ ++ private resolveRateLimit(next: QueuedRequest): DispatchDecision | null { ++ const key = this.buildRateLimitKey(next.request.metadata); ++ return this.rateLimiter.resolve(next, key, this.trafficLogger); ++ } ++ ++ private scheduleRateLimitWakeUpAt(wakeUpAt: number): void { ++ this.rateLimiter.scheduleWakeUpAt(wakeUpAt, this.trafficLogger); ++ } ++ ++ /* ============================================================ ++ * Circuit breakers (verbatim logic, linearized) ++ * ============================================================ ++ */ ++ ++ private resolveCircuit(next: QueuedRequest): DispatchDecision | null { ++ return this.circuitBreaker.resolve(next, this.trafficLogger); ++ } ++ ++ /* ============================================================ ++ * Utilities ++ * ============================================================ ++ */ ++ ++ private resolveQueueTimeoutAt(next: QueuedRequest): number | undefined { ++ const maxQueueWaitMs = next.request.maxQueueWaitMs; ++ const normalizedMaxWait = ++ typeof maxQueueWaitMs === "number" && Number.isFinite(maxQueueWaitMs) ++ ? Math.max(0, maxQueueWaitMs) ++ : undefined; ++ const timeoutAt = ++ normalizedMaxWait !== undefined ? 
next.enqueuedAt + normalizedMaxWait : undefined; ++ const deadlineAt = next.request.deadlineAt; ++ if (timeoutAt === undefined) return deadlineAt; ++ if (deadlineAt === undefined) return timeoutAt; ++ return Math.min(timeoutAt, deadlineAt); ++ } ++ ++ private handleQueueTimeout( ++ next: QueuedRequest, ++ queue: QueuedRequest[], ++ index: number, ++ now: number, ++ queueTimeoutAt?: number, ++ ): "none" | "expired" | "rejected" { ++ if (queueTimeoutAt === undefined) return "none"; ++ if (now < queueTimeoutAt) return "none"; ++ ++ const fallbackApplied = this.circuitBreaker.tryFallback( ++ next, ++ "queue-timeout", ++ this.trafficLogger, ++ ); ++ if (fallbackApplied) { ++ return "expired"; ++ } ++ ++ const timeoutError = this.createQueueTimeoutError(next, now); ++ this.attachTrafficMetadata( ++ timeoutError, ++ this.buildTrafficResponseMetadata( ++ next, ++ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), ++ now, ++ timeoutError, ++ ), ++ ); ++ this.controllerLogger.warn("Queue wait timed out; rejecting request", { ++ tenantId: next.tenantId, ++ waitedMs: timeoutError.waitedMs, ++ maxQueueWaitMs: timeoutError.maxQueueWaitMs, ++ deadlineAt: timeoutError.deadlineAt, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ rateLimitKey: timeoutError.rateLimitKey, ++ }); ++ queue.splice(index, 1); ++ next.reject(timeoutError); ++ return "rejected"; ++ } ++ ++ private rejectIfQueueTimedOut( ++ queueTimeoutExpired: boolean, ++ next: QueuedRequest, ++ queue: QueuedRequest[], ++ index: number, ++ now: number, ++ reason: string, ++ ): boolean { ++ if (!queueTimeoutExpired) return false; ++ const timeoutError = this.createQueueTimeoutError(next, now); ++ this.attachTrafficMetadata( ++ timeoutError, ++ this.buildTrafficResponseMetadata( ++ next, ++ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), ++ now, ++ timeoutError, ++ ), ++ ); ++ this.controllerLogger.warn("Queue wait timed out during gate wait", { ++ tenantId: next.tenantId, ++ waitedMs: timeoutError.waitedMs, ++ maxQueueWaitMs: timeoutError.maxQueueWaitMs, ++ deadlineAt: timeoutError.deadlineAt, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ rateLimitKey: timeoutError.rateLimitKey, ++ reason, ++ }); ++ queue.splice(index, 1); ++ next.reject(timeoutError); ++ return true; ++ } ++ ++ private createQueueTimeoutError(next: QueuedRequest, now: number): QueueWaitTimeoutError { ++ const waitedMs = Math.max(0, now - next.enqueuedAt); ++ return new QueueWaitTimeoutError({ ++ waitedMs, ++ maxQueueWaitMs: next.request.maxQueueWaitMs, ++ deadlineAt: next.request.deadlineAt, ++ metadata: next.request.metadata, ++ rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), ++ }); ++ } ++ ++ private resolveTenantId(request: TrafficRequest): string { ++ return request.tenantId ?? request.metadata?.tenantId ?? 
"default"; ++ } ++ ++ private enqueueItem(item: QueuedRequest): void { ++ const state = this.queues[item.priority]; ++ const tenantId = item.tenantId; ++ let queue = state.queues.get(tenantId); ++ if (!queue) { ++ queue = []; ++ state.queues.set(tenantId, queue); ++ state.order.push(tenantId); ++ } ++ queue.push(item); ++ } ++ ++ private getQueuedCount(priority: TrafficPriority): number { ++ const state = this.queues[priority]; ++ let total = 0; ++ for (const queue of state.queues.values()) { ++ total += queue.length; ++ } ++ return total; ++ } ++ ++ private hasQueuedWorkBelow(priority: TrafficPriority): boolean { ++ const index = this.priorityOrder.indexOf(priority); ++ if (index < 0) return false; ++ for (let i = index + 1; i < this.priorityOrder.length; i += 1) { ++ if (this.getQueuedCount(this.priorityOrder[i]) > 0) { ++ return true; ++ } ++ } ++ return false; ++ } ++ ++ private canDispatchPriority(priority: TrafficPriority): boolean { ++ const limit = this.priorityBurstLimits[priority]; ++ if (!Number.isFinite(limit) || limit <= 0) return true; ++ if (this.priorityBurstCounts[priority] < limit) return true; ++ return !this.hasQueuedWorkBelow(priority); ++ } ++ ++ private recordPriorityDispatch(priority: TrafficPriority): void { ++ for (const key of this.priorityOrder) { ++ if (key !== priority) { ++ this.priorityBurstCounts[key] = 0; ++ } ++ } ++ this.priorityBurstCounts[priority] += 1; ++ } ++ ++ private getPriorityDispatchOrder(): TrafficPriority[] { ++ return this.priorityOrder.filter((priority) => this.canDispatchPriority(priority)); ++ } ++ ++ private getNextTenantCandidate( ++ priority: TrafficPriority, ++ ): { item: QueuedRequest; queue: QueuedRequest[]; tenantId: string } | undefined { ++ const state = this.queues[priority]; ++ if (state.order.length === 0) return undefined; ++ const maxAttempts = state.order.length; ++ let attempts = 0; ++ ++ while (attempts < maxAttempts && state.order.length > 0) { ++ const index = state.index % state.order.length; ++ const tenantId = state.order[index]; ++ const queue = state.queues.get(tenantId); ++ attempts += 1; ++ ++ if (!queue || queue.length === 0) { ++ this.removeTenantQueue(priority, tenantId); ++ continue; ++ } ++ ++ state.index = (index + 1) % state.order.length; ++ return { item: queue[0], queue, tenantId }; ++ } ++ ++ return undefined; ++ } ++ ++ private cleanupTenantQueue( ++ priority: TrafficPriority, ++ tenantId: string, ++ queue: QueuedRequest[], ++ ): void { ++ if (queue.length > 0) return; ++ this.removeTenantQueue(priority, tenantId); ++ } ++ ++ private removeTenantQueue(priority: TrafficPriority, tenantId: string): void { ++ const state = this.queues[priority]; ++ state.queues.delete(tenantId); ++ const index = state.order.indexOf(tenantId); ++ if (index === -1) return; ++ state.order.splice(index, 1); ++ if (state.order.length === 0) { ++ state.index = 0; ++ return; ++ } ++ if (state.index > index) { ++ state.index -= 1; ++ } ++ if (state.index >= state.order.length) { ++ state.index = 0; ++ } ++ } ++ ++ private resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority { ++ return metadata?.priority ?? "P1"; ++ } ++ ++ private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { ++ return this.rateLimitKeyBuilder(metadata); ++ } ++ ++ private resolveAdaptiveLimit(next: QueuedRequest, now: number): DispatchDecision | null { ++ const rateLimitKey = next.rateLimitKey ?? 
this.buildRateLimitKey(next.request.metadata); ++ const adaptiveKey = this.buildAdaptiveKey(next.request.metadata, next.tenantId, rateLimitKey); ++ const state = this.adaptiveLimiterState.get(adaptiveKey); ++ if (!state) return null; ++ ++ this.applyAdaptiveDecay(state, now); ++ if (state.cooldownUntil !== undefined && now < state.cooldownUntil) { ++ return { kind: "wait", wakeUpAt: state.cooldownUntil }; ++ } ++ ++ return null; ++ } ++ ++ private recordAdaptiveRateLimitHit(key: string, retryAfterMs?: number): void { ++ const state = this.getAdaptiveState(key); ++ const now = Date.now(); ++ const { windowMs, threshold, minPenaltyMs, maxPenaltyMs, penaltyMultiplier } = ++ this.adaptiveLimiterConfig; ++ ++ state.last429At = now; ++ state.recent429s = state.recent429s.filter((timestamp) => now - timestamp <= windowMs); ++ state.recent429s.push(now); ++ ++ if (state.recent429s.length < threshold) { ++ return; ++ } ++ ++ const basePenalty = state.penaltyMs > 0 ? state.penaltyMs : minPenaltyMs; ++ const nextPenalty = Math.min( ++ maxPenaltyMs, ++ Math.max(minPenaltyMs, Math.round(basePenalty * penaltyMultiplier)), ++ ); ++ state.penaltyMs = nextPenalty; ++ const retryPenalty = typeof retryAfterMs === "number" ? retryAfterMs : 0; ++ const cooldownMs = Math.max(nextPenalty, retryPenalty); ++ state.cooldownUntil = now + cooldownMs; ++ } ++ ++ private recordAdaptiveSuccess(key: string): void { ++ const state = this.adaptiveLimiterState.get(key); ++ if (!state) return; ++ ++ const now = Date.now(); ++ this.applyAdaptiveDecay(state, now); ++ if (state.penaltyMs === 0) { ++ state.cooldownUntil = undefined; ++ state.recent429s = []; ++ state.last429At = undefined; ++ } ++ } ++ ++ private applyAdaptiveDecay(state: AdaptiveLimiterState, now: number): void { ++ const { decayMs, penaltyMultiplier } = this.adaptiveLimiterConfig; ++ if (state.last429At && now - state.last429At < decayMs) { ++ return; ++ } ++ ++ if (state.penaltyMs > 0) { ++ state.penaltyMs = Math.max(0, Math.floor(state.penaltyMs / penaltyMultiplier)); ++ } ++ } ++ ++ private getAdaptiveState(key: string): AdaptiveLimiterState { ++ const existing = this.adaptiveLimiterState.get(key); ++ if (existing) return existing; ++ const created: AdaptiveLimiterState = { ++ recent429s: [], ++ penaltyMs: 0, ++ }; ++ this.adaptiveLimiterState.set(key, created); ++ return created; ++ } ++ ++ private buildAdaptiveKey( ++ metadata: TrafficRequestMetadata | undefined, ++ tenantId: string, ++ rateLimitKey: string, ++ ): string { ++ if (rateLimitKey.includes("tenant=")) { ++ return rateLimitKey; ++ } ++ const tenant = metadata?.tenantId ?? tenantId ?? "default"; ++ return `${rateLimitKey}::tenant=${encodeURIComponent(tenant)}`; ++ } ++ ++ private buildTrafficResponseMetadata( ++ item: QueuedRequest, ++ rateLimitKey: string, ++ now: number, ++ error?: unknown, ++ ): TrafficResponseMetadata { ++ const snapshot = this.rateLimitSnapshots.get(rateLimitKey); ++ const retryAfterMs = this.resolveRetryAfterMs(error, snapshot); ++ const queuedForMs = ++ item.dispatchedAt !== undefined ? item.dispatchedAt - item.enqueuedAt : now - item.enqueuedAt; ++ const queueEtaMs = item.etaMs ?? Math.max(0, queuedForMs); ++ ++ return { ++ rateLimitKey, ++ retryAfterMs, ++ rateLimitRemaining: snapshot?.remaining, ++ rateLimitResetAt: snapshot?.resetAt, ++ rateLimitResetInMs: ++ snapshot?.resetAt !== undefined ? 
Math.max(0, snapshot.resetAt - now) : undefined, ++ queueEtaMs, ++ tenantId: item.tenantId, ++ priority: item.request.metadata?.priority, ++ taskType: item.request.metadata?.taskType, ++ }; ++ } ++ ++ private buildTrafficResponseMetadataFromMetadata( ++ metadata: TrafficRequestMetadata | undefined, ++ rateLimitKey: string, ++ now: number, ++ error?: unknown, ++ ): TrafficResponseMetadata { ++ const snapshot = this.rateLimitSnapshots.get(rateLimitKey); ++ const retryAfterMs = this.resolveRetryAfterMs(error, snapshot); ++ ++ return { ++ rateLimitKey, ++ retryAfterMs, ++ rateLimitRemaining: snapshot?.remaining, ++ rateLimitResetAt: snapshot?.resetAt, ++ rateLimitResetInMs: ++ snapshot?.resetAt !== undefined ? Math.max(0, snapshot.resetAt - now) : undefined, ++ tenantId: metadata?.tenantId, ++ priority: metadata?.priority, ++ taskType: metadata?.taskType, ++ }; ++ } ++ ++ private attachTrafficMetadata(target: unknown, info: TrafficResponseMetadata): void { ++ if (!target || typeof target !== "object") return; ++ (target as Record).traffic = info; ++ } ++ ++ private resolveRetryAfterMs( ++ error: unknown | undefined, ++ snapshot?: RateLimitSnapshot, ++ ): number | undefined { ++ if (error && typeof error === "object" && "retryAfterMs" in error) { ++ const candidate = (error as { retryAfterMs?: unknown }).retryAfterMs; ++ if (typeof candidate === "number" && Number.isFinite(candidate)) { ++ return candidate; ++ } ++ } ++ if (snapshot?.retryAfterMs !== undefined) { ++ return snapshot.retryAfterMs; ++ } ++ return undefined; ++ } ++ ++ private resolveRateLimitStrategy( ++ key: string, ++ config?: RateLimitStrategyConfig, ++ ): RateLimitStrategyKind { ++ const modelOverride = config?.models?.[key]; ++ if (modelOverride) return modelOverride; ++ const provider = key.split("::")[0] ?? ""; ++ const providerOverride = config?.providers?.[provider]; ++ if (providerOverride) return providerOverride; ++ if (provider.startsWith("openai")) return "window"; ++ return "token-bucket"; ++ } ++} ++ ++/* ============================================================ ++ * Error + Singleton ++ * ============================================================ ++ */ ++ ++let singletonController: TrafficController | undefined; ++ ++export function getTrafficController(options?: TrafficControllerOptions): TrafficController { ++ if (!singletonController) { ++ singletonController = new TrafficController(options); ++ } ++ return singletonController; ++} ++ ++function buildRateLimitKeyFromMetadata(metadata?: TrafficRequestMetadata): string { ++ const provider = metadata?.provider ?? "default-provider"; ++ const model = metadata?.model ?? "default-model"; ++ const parts = [provider, model]; ++ ++ // SOP: Add new metadata fields in one place with a stable label and ordering. ++ // 1) Add the optional field to TrafficRequestMetadata. ++ // 2) Add it here with a stable label so keys stay predictable. 
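++ //    e.g. metadata { provider: "openai", model: "gpt-4o-mini", tenantId: "acme" } (hypothetical
++ //    values) is keyed as "openai::gpt-4o-mini::tenant=acme".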
++ // Example: { label: "org", value: metadata?.orgId }
++ const optionalFields: Array<{ label: string; value?: string }> = [
++ { label: "apiKey", value: metadata?.apiKeyId },
++ { label: "region", value: metadata?.region },
++ { label: "endpoint", value: metadata?.endpoint },
++ { label: "tenant", value: metadata?.tenantId },
++ { label: "tenantTier", value: metadata?.tenantTier },
++ { label: "taskType", value: metadata?.taskType },
++ ];
++
++ for (const field of optionalFields) {
++ if (!field.value) continue;
++ parts.push(`${field.label}=${encodeURIComponent(field.value)}`);
++ }
++
++ return parts.join("::");
++}
+diff --git a/packages/core/src/traffic/traffic-error-utils.ts b/packages/core/src/traffic/traffic-error-utils.ts
+new file mode 100644
+index 00000000..4cbb98b5
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-error-utils.ts
+@@ -0,0 +1,148 @@
++import type { Logger } from "../logger";
++
++function readObjectProperty(value: unknown, key: string): unknown {
++ if (!value || typeof value !== "object") return undefined;
++ return (value as Record<string, unknown>)[key];
++}
++
++export function findHeaders(value: unknown): unknown[] {
++ const candidates: unknown[] = [
++ readObjectProperty(value, "headers"),
++ readObjectProperty(readObjectProperty(value, "response"), "headers"),
++ readObjectProperty(readObjectProperty(value, "cause"), "headers"),
++ readObjectProperty(
++ readObjectProperty(readObjectProperty(value, "cause"), "response"),
++ "headers",
++ ),
++ ];
++
++ return candidates.filter((candidate) => candidate !== undefined && candidate !== null);
++}
++
++export function readHeaderValue(headers: unknown, name: string): string | undefined {
++ if (!headers) return undefined;
++
++ if (typeof (headers as { get?: unknown }).get === "function") {
++ const v = (headers as { get: (name: string) => unknown }).get(name);
++ return v === null || v === undefined ? undefined : String(v);
++ }
++
++ if (typeof headers !== "object") return undefined;
++
++ const entries = Object.entries(headers as Record<string, unknown>);
++ const target = name.toLowerCase();
++ const match = entries.find(([k]) => String(k).toLowerCase() === target);
++ if (!match) return undefined;
++
++ const value = match[1];
++ if (Array.isArray(value)) {
++ const first = value[0];
++ return first === null || first === undefined ? undefined : String(first);
++ }
++ return value === null || value === undefined ? undefined : String(value);
++}
++
++export function parseRetryAfterMs(value: string, nowMs: number = Date.now()): number | undefined {
++ const raw = value.trim();
++ if (!raw) return undefined;
++
++ const seconds = Number(raw);
++ if (Number.isFinite(seconds)) {
++ return Math.max(0, Math.round(seconds * 1000));
++ }
++
++ const parsedAt = Date.parse(raw);
++ if (Number.isFinite(parsedAt)) {
++ return Math.max(0, parsedAt - nowMs);
++ }
++
++ return undefined;
++}
++
++export function coerceStatus(value: unknown): number | undefined {
++ const n = Number(value);
++ return Number.isFinite(n) ? n : undefined;
++}
++
++export function extractStatusCode(error: unknown, logger?: Logger): number | undefined {
++ const status =
++ coerceStatus(readObjectProperty(error, "status")) ??
++ coerceStatus(readObjectProperty(error, "statusCode")) ??
++ coerceStatus(readObjectProperty(error, "httpStatus")) ??
++ coerceStatus(readObjectProperty(readObjectProperty(error, "response"), "status")) ?? 
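++ // Checked last: some SDKs nest the failing response under `cause` (error shapes vary).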
++ coerceStatus(readObjectProperty(readObjectProperty(error, "cause"), "status")); ++ ++ logger?.trace?.("Extracted status code", { ++ status, ++ hasStatus: readObjectProperty(error, "status") !== undefined, ++ hasStatusCode: readObjectProperty(error, "statusCode") !== undefined, ++ hasHttpStatus: readObjectProperty(error, "httpStatus") !== undefined, ++ hasResponseStatus: ++ readObjectProperty(readObjectProperty(error, "response"), "status") !== undefined, ++ hasCauseStatus: readObjectProperty(readObjectProperty(error, "cause"), "status") !== undefined, ++ }); ++ ++ return status; ++} ++ ++export function extractRetryAfterMs(error: unknown, logger?: Logger): number | undefined { ++ const retryAfterLogger = logger?.child({ module: "retry-after" }); ++ const candidates = findHeaders(error); ++ ++ for (const headers of candidates) { ++ const raw = readHeaderValue(headers, "retry-after"); ++ if (!raw) continue; ++ const parsed = parseRetryAfterMs(raw); ++ retryAfterLogger?.trace?.("Parsed Retry-After header", { raw, parsedMs: parsed }); ++ if (parsed !== undefined) return parsed; ++ } ++ ++ retryAfterLogger?.trace?.("Retry-After header missing or unparsable"); ++ return undefined; ++} ++ ++export function isTimeoutError(error: unknown, logger?: Logger): boolean { ++ const candidates: unknown[] = [error]; ++ ++ const cause = readObjectProperty(error, "cause"); ++ if (cause) { ++ candidates.push(cause); ++ const nestedCause = readObjectProperty(cause, "cause"); ++ if (nestedCause) candidates.push(nestedCause); ++ } ++ ++ for (const candidate of candidates) { ++ const code = readObjectProperty(candidate, "code"); ++ const name = readObjectProperty(candidate, "name"); ++ const message = readObjectProperty(candidate, "message"); ++ ++ const codeText = String(code ?? "").toLowerCase(); ++ const nameText = String(name ?? "").toLowerCase(); ++ const messageText = String(message ?? "").toLowerCase(); ++ ++ const isTimeout = ++ codeText.includes("timeout") || ++ codeText.includes("timedout") || ++ nameText.includes("timeout") || ++ nameText.includes("timedout") || ++ messageText.includes("timeout") || ++ messageText.includes("timedout") || ++ messageText.includes("timed out"); ++ ++ logger?.trace?.("Checked timeout error", { ++ isTimeout, ++ code, ++ name, ++ messagePreview: typeof message === "string" ? 
message.slice(0, 160) : message,
++ hasCause: candidate !== error,
++ });
++
++ if (isTimeout) return true;
++ }
++
++ return false;
++}
++
++export function isPromiseLike<TValue = unknown>(value: unknown): value is PromiseLike<TValue> {
++ return !!value && typeof (value as { then?: unknown }).then === "function";
++}
+diff --git a/packages/core/src/traffic/traffic-errors.ts b/packages/core/src/traffic/traffic-errors.ts
+new file mode 100644
+index 00000000..4943c89f
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-errors.ts
+@@ -0,0 +1,141 @@
++import type { Logger } from "../logger";
++import { extractRetryAfterMs, extractStatusCode } from "./traffic-error-utils";
++import type { TrafficRequestMetadata } from "./traffic-types";
++
++export type RateLimitErrorOptions = {
++ metadata?: TrafficRequestMetadata;
++ retryAfterMs?: number;
++ tenantId?: string;
++ key?: string;
++};
++
++export class CircuitBreakerOpenError extends Error {
++ readonly retryAfterMs?: number;
++ readonly metadata?: TrafficRequestMetadata;
++
++ constructor(message: string, metadata?: TrafficRequestMetadata, retryAfterMs?: number) {
++ super(message);
++ this.name = "CircuitBreakerOpenError";
++ this.metadata = metadata;
++ this.retryAfterMs = retryAfterMs;
++ }
++}
++
++export class QueueWaitTimeoutError extends Error {
++ readonly waitedMs: number;
++ readonly maxQueueWaitMs?: number;
++ readonly deadlineAt?: number;
++ readonly metadata?: TrafficRequestMetadata;
++ readonly rateLimitKey?: string;
++
++ constructor(options: {
++ waitedMs: number;
++ maxQueueWaitMs?: number;
++ deadlineAt?: number;
++ metadata?: TrafficRequestMetadata;
++ rateLimitKey?: string;
++ }) {
++ super("Queue wait time exceeded");
++ this.name = "QueueWaitTimeoutError";
++ this.waitedMs = options.waitedMs;
++ this.maxQueueWaitMs = options.maxQueueWaitMs;
++ this.deadlineAt = options.deadlineAt;
++ this.metadata = options.metadata;
++ this.rateLimitKey = options.rateLimitKey;
++ }
++}
++
++export class RateLimitedUpstreamError extends Error {
++ readonly status = 429;
++ readonly retryAfterMs?: number;
++ readonly metadata?: TrafficRequestMetadata;
++ readonly provider?: string;
++ readonly model?: string;
++ readonly tenantId?: string;
++ readonly key?: string;
++
++ constructor(
++ message: string,
++ metadata?: TrafficRequestMetadata,
++ retryAfterMs?: number,
++ options?: { tenantId?: string; key?: string },
++ );
++ constructor(message: string, options?: RateLimitErrorOptions);
++ constructor(
++ message: string,
++ metadataOrOptions?: TrafficRequestMetadata | RateLimitErrorOptions,
++ retryAfterMs?: number,
++ legacyOptions?: { tenantId?: string; key?: string },
++ ) {
++ super(message);
++ this.name = "RateLimitedUpstreamError";
++ const isOptions =
++ metadataOrOptions &&
++ (Object.prototype.hasOwnProperty.call(metadataOrOptions, "metadata") ||
++ Object.prototype.hasOwnProperty.call(metadataOrOptions, "retryAfterMs") ||
++ Object.prototype.hasOwnProperty.call(metadataOrOptions, "key"));
++
++ const metadata = isOptions
++ ? (metadataOrOptions as RateLimitErrorOptions).metadata
++ : (metadataOrOptions as TrafficRequestMetadata | undefined);
++ const retryAfter = isOptions
++ ? 
(metadataOrOptions as RateLimitErrorOptions).retryAfterMs ++ : retryAfterMs; ++ const tenantId = isOptions ++ ? (metadataOrOptions as RateLimitErrorOptions).tenantId ++ : legacyOptions?.tenantId; ++ const key = isOptions ? (metadataOrOptions as RateLimitErrorOptions).key : legacyOptions?.key; ++ ++ this.metadata = metadata; ++ this.retryAfterMs = retryAfter; ++ this.provider = metadata?.provider; ++ this.model = metadata?.model; ++ this.tenantId = tenantId ?? metadata?.tenantId; ++ this.key = key; ++ } ++} ++ ++export function normalizeRateLimitError(options: { ++ error: unknown; ++ metadata?: TrafficRequestMetadata; ++ tenantId?: string; ++ key?: string; ++ logger?: Logger; ++}): RateLimitedUpstreamError | undefined { ++ const { error, metadata, tenantId, key, logger } = options; ++ const retryAfterMs = ++ error instanceof RateLimitedUpstreamError ++ ? (error.retryAfterMs ?? extractRetryAfterMs(error, logger)) ++ : extractRetryAfterMs(error, logger); ++ ++ if (error instanceof RateLimitedUpstreamError) { ++ const baseMetadata = metadata ?? error.metadata; ++ const baseTenant = tenantId ?? error.tenantId; ++ const baseKey = key ?? error.key; ++ if ( ++ error.metadata === baseMetadata && ++ error.retryAfterMs === retryAfterMs && ++ error.tenantId === baseTenant && ++ error.key === baseKey ++ ) { ++ return error; ++ } ++ return new RateLimitedUpstreamError(error.message, { ++ metadata: baseMetadata, ++ retryAfterMs, ++ tenantId: baseTenant, ++ key: baseKey, ++ }); ++ } ++ ++ const status = extractStatusCode(error, logger); ++ if (status !== 429) return undefined; ++ ++ const message = error instanceof Error ? error.message : "Rate limit exceeded"; ++ return new RateLimitedUpstreamError(message, { ++ metadata, ++ retryAfterMs, ++ tenantId, ++ key, ++ }); ++} +diff --git a/packages/core/src/traffic/traffic-rate-limiter.ts b/packages/core/src/traffic/traffic-rate-limiter.ts +new file mode 100644 +index 00000000..a77a0423 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-rate-limiter.ts +@@ -0,0 +1,267 @@ ++import type { Logger } from "../logger"; ++import type { ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++} from "./rate-limit-strategies/rate-limit-strategy"; ++import { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy"; ++import type { DispatchDecision, QueuedRequest } from "./traffic-controller-internal"; ++import type { RateLimitConfig, TrafficRequestMetadata } from "./traffic-types"; ++ ++export type { ++ RateLimitHeaderSnapshot, ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++} from "./rate-limit-strategies/rate-limit-strategy"; ++export { DefaultRateLimitStrategy } from "./rate-limit-strategies/default-rate-limit-strategy"; ++export { OpenAIWindowRateLimitStrategy } from "./rate-limit-strategies/openai-window-rate-limit-strategy"; ++export { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy"; ++ ++type SchedulerCallback = () => void; ++ ++export type RateLimitStrategyFactory = (key: string) => RateLimitStrategy; ++ ++type UsageCounters = { ++ inputTokens?: number; ++ outputTokens?: number; ++ totalTokens?: number; ++}; ++ ++type TokenRateState = { ++ capacity: number; ++ refillPerSecond: number; ++ tokens: number; ++ updatedAt: number; ++}; ++ ++export class TrafficRateLimiter { ++ private readonly strategies = new Map(); ++ private readonly tokenRates = new Map(); ++ private wakeUpTimeout?: ReturnType; ++ private wakeUpAt?: number; ++ private readonly onWakeUp: SchedulerCallback; ++ 
private readonly strategyFactory: RateLimitStrategyFactory; ++ private readonly rateLimits?: RateLimitConfig; ++ ++ constructor( ++ onWakeUp: SchedulerCallback, ++ options?: { strategyFactory?: RateLimitStrategyFactory; rateLimits?: RateLimitConfig }, ++ ) { ++ this.onWakeUp = onWakeUp; ++ this.rateLimits = options?.rateLimits; ++ this.strategyFactory = ++ options?.strategyFactory ?? ++ ((key) => new TokenBucketRateLimitStrategy(key, this.rateLimits?.[key])); ++ } ++ ++ resolve(next: QueuedRequest, key: string, logger?: Logger): DispatchDecision | null { ++ const strategy = this.strategies.get(key) ?? this.createStrategy(key, logger); ++ const requestDecision = strategy.resolve(next, logger); ++ if (requestDecision?.kind === "wait") { ++ const tokenDecision = strategy.handlesTokenLimits ++ ? null ++ : this.resolveTokenLimit(key, logger); ++ if (tokenDecision?.kind === "wait") { ++ const requestWakeUp = requestDecision.wakeUpAt; ++ const tokenWakeUp = tokenDecision.wakeUpAt; ++ if (tokenWakeUp !== undefined && requestWakeUp !== undefined) { ++ return { kind: "wait", wakeUpAt: Math.min(requestWakeUp, tokenWakeUp) }; ++ } ++ if (tokenWakeUp !== undefined && requestWakeUp === undefined) { ++ return tokenDecision; ++ } ++ } ++ return requestDecision; ++ } ++ ++ const tokenDecision = strategy.handlesTokenLimits ? null : this.resolveTokenLimit(key, logger); ++ if (tokenDecision?.kind === "wait") { ++ return tokenDecision; ++ } ++ ++ return requestDecision; ++ } ++ ++ notifyDispatch(key: string | undefined, logger?: Logger): void { ++ if (!key) return; ++ this.strategies.get(key)?.onDispatch(logger); ++ } ++ ++ scheduleWakeUpAt(wakeUpAt: number, logger?: Logger): void { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); ++ const target = Math.max(now, wakeUpAt); ++ ++ if (this.wakeUpTimeout && this.wakeUpAt !== undefined && this.wakeUpAt <= target) { ++ rateLimitLogger?.trace?.("Wakeup already scheduled earlier; skipping", { ++ currentWakeUpAt: this.wakeUpAt, ++ requestedWakeUpAt: target, ++ }); ++ return; ++ } ++ ++ if (this.wakeUpTimeout) clearTimeout(this.wakeUpTimeout); ++ ++ this.wakeUpAt = target; ++ rateLimitLogger?.debug?.("Scheduling rate limit wakeup", { ++ wakeUpAt: target, ++ inMs: Math.max(1, target - now), ++ }); ++ this.wakeUpTimeout = setTimeout( ++ () => { ++ this.wakeUpTimeout = undefined; ++ this.wakeUpAt = undefined; ++ rateLimitLogger?.debug?.("Rate limit wakeup fired"); ++ this.onWakeUp(); ++ }, ++ Math.max(1, target - now), ++ ); ++ } ++ ++ releaseReservation(key?: string, logger?: Logger): void { ++ if (!key) return; ++ this.strategies.get(key)?.onComplete(logger); ++ } ++ ++ recordUsage( ++ key: string | undefined, ++ usage: UsageCounters | Promise | undefined, ++ logger?: Logger, ++ ): void { ++ if (!key || !usage) return; ++ if (typeof (usage as PromiseLike).then === "function") { ++ void (usage as Promise) ++ .then((resolved) => this.recordUsage(key, resolved, logger)) ++ .catch(() => {}); ++ return; ++ } ++ ++ const strategy = this.strategies.get(key); ++ if (strategy?.recordUsage) { ++ strategy.recordUsage(usage, logger); ++ return; ++ } ++ ++ const tokens = this.resolveTokenCount(usage); ++ if (tokens <= 0) return; ++ ++ const bucket = this.getTokenRateState(key, logger); ++ if (!bucket) return; ++ ++ const now = Date.now(); ++ this.refillTokenRate(bucket, now); ++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens); ++ bucket.tokens -= tokens; ++ ++ if (bucket.tokens < 0 && bucket.refillPerSecond > 0) { ++ const 
waitMs = Math.max(1, Math.ceil((-bucket.tokens / bucket.refillPerSecond) * 1000)); ++ this.scheduleWakeUpAt(now + waitMs, logger); ++ } ++ } ++ ++ updateFromHeaders( ++ metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ key: string, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined { ++ const existing = this.strategies.get(key); ++ if (existing) return existing.updateFromHeaders(metadata, headers, logger); ++ ++ const created = this.strategyFactory(key); ++ const update = created.updateFromHeaders(metadata, headers, logger); ++ if (!update) return undefined; ++ this.strategies.set(key, created); ++ return update; ++ } ++ ++ private createStrategy(key: string, logger?: Logger): RateLimitStrategy { ++ const created = this.strategyFactory(key); ++ this.strategies.set(key, created); ++ logger?.child({ module: "rate-limiter" })?.trace?.("Created rate limit strategy", { ++ rateLimitKey: key, ++ strategy: created.constructor.name, ++ }); ++ return created; ++ } ++ ++ private resolveTokenLimit(key: string, logger?: Logger): DispatchDecision | null { ++ const bucket = this.getTokenRateState(key, logger); ++ if (!bucket) return null; ++ ++ const now = Date.now(); ++ this.refillTokenRate(bucket, now); ++ ++ if (bucket.capacity <= 0) { ++ logger?.child({ module: "rate-limiter" })?.debug?.("Token limit misconfigured; blocking", { ++ rateLimitKey: key, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ }); ++ return { kind: "wait" }; ++ } ++ ++ if (bucket.tokens >= 0) return null; ++ ++ if (bucket.refillPerSecond <= 0) { ++ logger?.child({ module: "rate-limiter" })?.debug?.("Token limit has no refill; blocking", { ++ rateLimitKey: key, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ }); ++ return { kind: "wait" }; ++ } ++ ++ const requiredTokens = -bucket.tokens; ++ const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000)); ++ return { kind: "wait", wakeUpAt: now + waitMs }; ++ } ++ ++ private getTokenRateState(key: string, logger?: Logger): TokenRateState | undefined { ++ const existing = this.tokenRates.get(key); ++ if (existing) return existing; ++ ++ const options = this.rateLimits?.[key]; ++ if (!options) return undefined; ++ ++ const tokensPerMinute = Number(options.tokensPerMinute); ++ if (!Number.isFinite(tokensPerMinute) || tokensPerMinute <= 0) { ++ return undefined; ++ } ++ ++ // Token pacing uses a 1-minute burst by default; request bursts are handled separately. ++ const refillPerSecond = tokensPerMinute / 60; ++ const capacity = tokensPerMinute; ++ const now = Date.now(); ++ const created: TokenRateState = { ++ capacity, ++ refillPerSecond, ++ tokens: capacity, ++ updatedAt: now, ++ }; ++ this.tokenRates.set(key, created); ++ logger?.child({ module: "rate-limiter" })?.trace?.("Created token rate state", { ++ rateLimitKey: key, ++ capacity, ++ refillPerSecond, ++ }); ++ return created; ++ } ++ ++ private refillTokenRate(bucket: TokenRateState, now: number): void { ++ const elapsedMs = now - bucket.updatedAt; ++ if (elapsedMs <= 0) return; ++ bucket.updatedAt = now; ++ if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return; ++ const refill = (elapsedMs / 1000) * bucket.refillPerSecond; ++ if (refill <= 0) return; ++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill); ++ } ++ ++ private resolveTokenCount(usage: UsageCounters): number { ++ const total = Number.isFinite(usage.totalTokens) ? 
usage.totalTokens : undefined; ++ if (total !== undefined) return total; ++ const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0; ++ const output = Number.isFinite(usage.outputTokens) ? usage.outputTokens : 0; ++ return input + output; ++ } ++} +diff --git a/packages/core/src/traffic/traffic-retry.spec.ts b/packages/core/src/traffic/traffic-retry.spec.ts +new file mode 100644 +index 00000000..2360ca10 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-retry.spec.ts +@@ -0,0 +1,45 @@ ++import { describe, expect, it, vi } from "vitest"; ++import { buildRetryPlan } from "./traffic-retry"; ++ ++describe("buildRetryPlan", () => { ++ it("respects Retry-After for 429s", () => { ++ const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0); ++ try { ++ const plan = buildRetryPlan( ++ { ++ status: 429, ++ response: { headers: { "retry-after": "2" } }, ++ }, ++ 1, ++ ); ++ ++ expect(plan).toBeTruthy(); ++ expect(plan?.reason).toBe("rateLimit"); ++ expect(plan?.delayMs).toBeGreaterThanOrEqual(2_000); ++ } finally { ++ randomSpy.mockRestore(); ++ } ++ }); ++ ++ it("parses HTTP-date Retry-After values", () => { ++ vi.useFakeTimers(); ++ const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0); ++ ++ try { ++ vi.setSystemTime(new Date("2020-01-01T00:00:00.000Z")); ++ const plan = buildRetryPlan( ++ { ++ statusCode: 429, ++ response: { headers: { "retry-after": "Wed, 01 Jan 2020 00:00:03 GMT" } }, ++ }, ++ 1, ++ ); ++ ++ expect(plan).toBeTruthy(); ++ expect(plan?.delayMs).toBeGreaterThanOrEqual(3_000); ++ } finally { ++ vi.useRealTimers(); ++ randomSpy.mockRestore(); ++ } ++ }); ++}); +diff --git a/packages/core/src/traffic/traffic-retry.ts b/packages/core/src/traffic/traffic-retry.ts +new file mode 100644 +index 00000000..9604dc53 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-retry.ts +@@ -0,0 +1,144 @@ ++import type { Logger } from "../logger"; ++import { ++ MAX_RETRY_ATTEMPTS, ++ RATE_LIMIT_BASE_BACKOFF_MS, ++ RATE_LIMIT_JITTER_FACTOR, ++ SERVER_ERROR_BASE_BACKOFF_MS, ++ SERVER_ERROR_JITTER_FACTOR, ++ TIMEOUT_BASE_BACKOFF_MS, ++ TIMEOUT_JITTER_FACTOR, ++ TIMEOUT_RETRY_ATTEMPTS, ++} from "./traffic-constants"; ++import { extractRetryAfterMs, extractStatusCode, isTimeoutError } from "./traffic-error-utils"; ++import { RateLimitedUpstreamError } from "./traffic-errors"; ++import type { ++ RetryPlan, ++ RetryPolicy, ++ RetryPolicyConfig, ++ RetryPolicyContext, ++ RetryReason, ++} from "./traffic-types"; ++ ++export type { ++ RetryPlan, ++ RetryPolicy, ++ RetryPolicyConfig, ++ RetryPolicyContext, ++ RetryReason, ++} from "./traffic-types"; ++ ++export function buildRetryPlan( ++ error: unknown, ++ attempt: number, ++ logger?: Logger, ++): RetryPlan | undefined { ++ const retryLogger = logger?.child({ module: "retry" }); ++ const reason = getRetryReason(error, retryLogger); ++ if (!reason) { ++ retryLogger?.debug?.("No retry reason detected; skipping retry", { attempt }); ++ return undefined; ++ } ++ ++ const max = reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS; ++ if (attempt >= max) { ++ retryLogger?.debug?.("Retry attempts exhausted; skipping retry", { ++ attempt, ++ max, ++ reason, ++ }); ++ return undefined; ++ } ++ ++ const computedDelayMs = computeBackoffDelay(reason, attempt); ++ const retryAfterMs = ++ reason === "rateLimit" ++ ? error instanceof RateLimitedUpstreamError ++ ? error.retryAfterMs ++ : extractRetryAfterMs(error, retryLogger) ++ : undefined; ++ const delayMs = ++ retryAfterMs === undefined ? 
computedDelayMs : Math.max(computedDelayMs, retryAfterMs); ++ ++ retryLogger?.debug?.("Retry plan built", { ++ attempt, ++ reason, ++ delayMs, ++ computedDelayMs, ++ retryAfterMs, ++ max, ++ }); ++ ++ return { ++ reason, ++ delayMs, ++ }; ++} ++ ++export function buildRetryPlanWithPolicy( ++ context: RetryPolicyContext, ++ policyConfig?: RetryPolicyConfig, ++): RetryPlan | undefined { ++ const retryLogger = context.logger?.child({ module: "retry" }); ++ const policy = resolveRetryPolicy(context, policyConfig); ++ if (policy) { ++ const planned = policy(context); ++ if (planned) { ++ retryLogger?.debug?.("Retry policy returned a plan", { ++ attempt: context.attempt, ++ reason: planned.reason, ++ delayMs: planned.delayMs, ++ }); ++ return planned; ++ } ++ retryLogger?.debug?.("Retry policy declined to retry", { attempt: context.attempt }); ++ } ++ ++ return buildRetryPlan(context.error, context.attempt, context.logger); ++} ++ ++function resolveRetryPolicy( ++ context: RetryPolicyContext, ++ config?: RetryPolicyConfig, ++): RetryPolicy | undefined { ++ if (!config) return undefined; ++ const modelPolicy = context.key ? config.models?.[context.key] : undefined; ++ if (modelPolicy) return modelPolicy; ++ const providerModelKey = ++ context.metadata?.provider && context.metadata?.model ++ ? `${context.metadata.provider}::${context.metadata.model}` ++ : undefined; ++ const providerModelPolicy = providerModelKey ? config.models?.[providerModelKey] : undefined; ++ if (providerModelPolicy) return providerModelPolicy; ++ const provider = context.metadata?.provider; ++ const providerPolicy = provider ? config.providers?.[provider] : undefined; ++ if (providerPolicy) return providerPolicy; ++ return config.default; ++} ++ ++function getRetryReason(error: unknown, logger?: Logger): RetryReason | undefined { ++ if (error instanceof RateLimitedUpstreamError) return "rateLimit"; ++ const status = extractStatusCode(error, logger); ++ if (status === 429) return "rateLimit"; ++ if (status && status >= 500) return "serverError"; ++ if (status === 408 || isTimeoutError(error, logger)) return "timeout"; ++ return undefined; ++} ++ ++function computeBackoffDelay(reason: RetryReason, attempt: number): number { ++ const base = ++ reason === "serverError" ++ ? SERVER_ERROR_BASE_BACKOFF_MS ++ : reason === "timeout" ++ ? TIMEOUT_BASE_BACKOFF_MS ++ : RATE_LIMIT_BASE_BACKOFF_MS; ++ ++ const jitter = ++ reason === "serverError" ++ ? SERVER_ERROR_JITTER_FACTOR ++ : reason === "timeout" ++ ? 
TIMEOUT_JITTER_FACTOR ++ : RATE_LIMIT_JITTER_FACTOR; ++ ++ const exp = base * 2 ** (attempt - 1); ++ return Math.round(exp + exp * jitter * Math.random()); ++} +diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts +new file mode 100644 +index 00000000..f2ebbafb +--- /dev/null ++++ b/packages/core/src/traffic/traffic-types.ts +@@ -0,0 +1,173 @@ ++import type { Logger } from "../logger"; ++ ++type BivariantFunction = { ++ bivarianceHack(...args: TArgs): TReturn; ++}["bivarianceHack"]; ++ ++type UsageCounters = { ++ inputTokens?: number; ++ outputTokens?: number; ++ totalTokens?: number; ++}; ++ ++export type RetryReason = "rateLimit" | "serverError" | "timeout"; ++ ++export type RetryPlan = { ++ delayMs: number; ++ reason: RetryReason; ++}; ++ ++export type RetryPolicyContext = { ++ error: unknown; ++ attempt: number; ++ metadata?: TrafficRequestMetadata; ++ key?: string; ++ logger?: Logger; ++}; ++ ++export type RetryPolicy = (context: RetryPolicyContext) => RetryPlan | undefined; ++ ++export type RetryPolicyConfig = { ++ default?: RetryPolicy; ++ providers?: Record; ++ models?: Record; ++}; ++ ++export type TrafficRequestType = "text" | "stream"; ++export type TrafficPriority = "P0" | "P1" | "P2"; ++ ++export interface TrafficRequestMetadata { ++ agentId?: string; ++ agentName?: string; ++ model?: string; ++ provider?: string; ++ priority?: TrafficPriority; ++ tenantId?: string; ++ apiKeyId?: string; ++ region?: string; ++ endpoint?: string; ++ tenantTier?: string; ++ taskType?: string; ++ fallbackPolicyId?: string; ++} ++ ++export type TrafficResponseMetadata = { ++ rateLimitKey?: string; ++ retryAfterMs?: number; ++ rateLimitRemaining?: number; ++ rateLimitResetAt?: number; ++ rateLimitResetInMs?: number; ++ queueEtaMs?: number; ++ tenantId?: string; ++ priority?: TrafficPriority; ++ taskType?: string; ++}; ++ ++export type FallbackTarget = { ++ provider?: string; ++ model: string; ++}; ++ ++export type FallbackChainEntry = string | FallbackTarget; ++ ++export type FallbackPolicyMode = "fallback" | "wait"; ++ ++export type FallbackPolicy = { ++ mode: FallbackPolicyMode; ++}; ++ ++export type FallbackPolicyConfig = { ++ defaultPolicyId?: string; ++ policies?: Record; ++ taskTypePolicyIds?: Record; ++}; ++ ++export type ProviderModelConcurrencyLimit = ++ | number ++ | Record ++ | ((metadata: TrafficRequestMetadata | undefined, key: string) => number | undefined); ++ ++export type TenantConcurrencyLimit = ++ | number ++ | Record ++ | ((tenantId: string, metadata: TrafficRequestMetadata | undefined) => number | undefined); ++ ++export type PriorityBurstLimits = Partial>; ++ ++export type AdaptiveLimiterConfig = { ++ windowMs?: number; ++ threshold?: number; ++ minPenaltyMs?: number; ++ maxPenaltyMs?: number; ++ penaltyMultiplier?: number; ++ decayMs?: number; ++}; ++ ++export interface TrafficRequest { ++ tenantId: string; ++ metadata?: TrafficRequestMetadata; ++ execute: () => Promise; ++ deadlineAt?: number; ++ maxQueueWaitMs?: number; ++ createFallbackRequest?: BivariantFunction< ++ [target: FallbackChainEntry], ++ TrafficRequest | undefined ++ >; ++ extractUsage?: BivariantFunction< ++ [response: TResponse], ++ Promise | UsageCounters | undefined ++ >; ++} ++ ++export interface TrafficControllerOptions { ++ maxConcurrent?: number; ++ maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit; ++ maxConcurrentPerTenant?: TenantConcurrencyLimit; ++ rateLimits?: RateLimitConfig; ++ priorityBurstLimits?: PriorityBurstLimits; ++ 
adaptiveLimiter?: AdaptiveLimiterConfig;
++ /**
++ * Optional override for rate-limit key construction.
++ * Useful when you need to add new metadata fields without changing core logic.
++ */
++ rateLimitKeyBuilder?: (metadata?: TrafficRequestMetadata) => string;
++ /**
++ * Optional retry policy overrides by provider/model.
++ * Model keys can use the rate-limit key or provider::model.
++ */
++ retryPolicy?: RetryPolicyConfig;
++ /**
++ * Optional fallback policy selection by task type or explicit policy id.
++ */
++ fallbackPolicy?: FallbackPolicyConfig;
++ /**
++ * Select a rate-limit strategy by provider/model.
++ * Example:
++ * { providers: { openai: "window" }, models: { "openai::gpt-4o": "window" } }
++ */
++ rateLimitStrategy?: RateLimitStrategyConfig;
++ logger?: Logger;
++ fallbackChains?: Record<string, FallbackChainEntry[]>;
++}
++
++export type RateLimitStrategyKind = "window" | "token-bucket";
++
++export type RateLimitStrategyConfig = {
++ providers?: Record<string, RateLimitStrategyKind>;
++ models?: Record<string, RateLimitStrategyKind>;
++};
++
++export interface RateLimitOptions {
++ requestsPerMinute: number;
++ tokensPerMinute: number;
++ burstSize?: number;
++}
++
++export type RateLimitKey = string;
++export type RateLimitConfig = Record<RateLimitKey, RateLimitOptions>;
++
++export type TenantUsage = {
++ inputTokens: number;
++ outputTokens: number;
++ totalTokens: number;
++};
+diff --git a/packages/core/src/traffic/traffic-usage-tracker.ts b/packages/core/src/traffic/traffic-usage-tracker.ts
+new file mode 100644
+index 00000000..c79b311a
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-usage-tracker.ts
+@@ -0,0 +1,83 @@
++import type { Logger } from "../logger";
++import type { QueuedRequest } from "./traffic-controller-internal";
++import { isPromiseLike } from "./traffic-error-utils";
++import type { TenantUsage } from "./traffic-types";
++
++type UsageCounters = {
++ inputTokens?: number;
++ outputTokens?: number;
++ totalTokens?: number;
++};
++
++export class TrafficUsageTracker {
++ private readonly tenantUsage = new Map<string, TenantUsage>();
++
++ getTenantUsage(tenantId: string): TenantUsage | undefined {
++ const usage = this.tenantUsage.get(tenantId);
++ return usage ? { ...usage } : undefined;
++ }
++
++ recordUsage<TResponse>(
++ item: QueuedRequest<TResponse>,
++ result: TResponse,
++ logger?: Logger,
++ ): UsageCounters | Promise<UsageCounters | undefined> | undefined {
++ const usageLogger = logger?.child({ module: "usage-tracker" });
++ const extractor = item.extractUsage ?? item.request.extractUsage;
++ if (!extractor) {
++ usageLogger?.trace?.("No usage extractor; skipping usage", { tenantId: item.tenantId });
++ return undefined;
++ }
++
++ const usage = extractor(result);
++ if (!usage) {
++ usageLogger?.trace?.("Usage extractor returned empty; skipping usage", {
++ tenantId: item.tenantId,
++ });
++ return undefined;
++ }
++
++ if (isPromiseLike(usage)) {
++ usageLogger?.trace?.("Usage extractor returned promise; awaiting", {
++ tenantId: item.tenantId,
++ });
++ void usage.then((u) => u && this.incrementTenantUsage(item.tenantId, u, usageLogger));
++ return usage;
++ }
++ this.incrementTenantUsage(item.tenantId, usage, usageLogger);
++ return usage;
++ }
++
++ private incrementTenantUsage(tenantId: string, usage: UsageCounters, logger?: Logger): void {
++ const current = this.tenantUsage.get(tenantId) ?? {
++ inputTokens: 0,
++ outputTokens: 0,
++ totalTokens: 0,
++ };
++
++ const input =
++ typeof usage.inputTokens === "number" && Number.isFinite(usage.inputTokens)
++ ? 
usage.inputTokens ++ : 0; ++ const output = ++ typeof usage.outputTokens === "number" && Number.isFinite(usage.outputTokens) ++ ? usage.outputTokens ++ : 0; ++ const total = ++ typeof usage.totalTokens === "number" && Number.isFinite(usage.totalTokens) ++ ? usage.totalTokens ++ : input + output; ++ ++ this.tenantUsage.set(tenantId, { ++ inputTokens: current.inputTokens + input, ++ outputTokens: current.outputTokens + output, ++ totalTokens: current.totalTokens + total, ++ }); ++ ++ logger?.debug?.("Tenant usage incremented", { ++ tenantId, ++ delta: { inputTokens: input, outputTokens: output, totalTokens: total }, ++ total: this.tenantUsage.get(tenantId), ++ }); ++ } +} diff --git a/packages/core/src/workflow/core.ts b/packages/core/src/workflow/core.ts index 3136511c..2b273d58 100644 @@ -2463,108 +5487,2661 @@ index e6e86510..2c6053fc 100644 + }); const statements = extractResponse.object.statements; - if (statements.length === 0) { -@@ -152,7 +157,9 @@ export function createContextRecallScorer< - contextText, - ).replace("{{statement}}", statement); + if (statements.length === 0) { +@@ -152,7 +157,9 @@ export function createContextRecallScorer< + contextText, + ).replace("{{statement}}", statement); + +- const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA); ++ const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA, { ++ tenantId, ++ }); + verdicts.push({ + statement, + verdict: verifyResponse.object.verdict, +diff --git a/packages/scorers/src/llm/context-relevancy.ts b/packages/scorers/src/llm/context-relevancy.ts +index ee882b5b..aca608b2 100644 +--- a/packages/scorers/src/llm/context-relevancy.ts ++++ b/packages/scorers/src/llm/context-relevancy.ts +@@ -7,6 +7,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const CONTEXT_RELEVANCY_PROMPT = `Analyze the provided context and identify which parts are relevant to answering the given question. For each context sentence or passage, determine its relevance level. + +@@ -144,6 +145,7 @@ export function createContextRelevancyScorer< + const agent = new Agent({ + name: "context-relevancy-evaluator", + model, ++ trafficPriority: "P2", + instructions: "You evaluate how relevant provided context is to answering questions", + }); + +@@ -151,13 +153,16 @@ export function createContextRelevancyScorer< + const contextText = Array.isArray(payload.context) + ? 
payload.context.join("\n") + : payload.context; ++ const tenantId = extractTenantId(context); + + const prompt = CONTEXT_RELEVANCY_PROMPT.replace("{{question}}", payload.input).replace( + "{{context}}", + contextText, + ); + +- const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA); ++ const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA, { ++ tenantId, ++ }); + const evaluations = response.object.evaluations; + + context.results.raw.contextRelevancyEvaluations = evaluations; +diff --git a/packages/scorers/src/llm/moderation.ts b/packages/scorers/src/llm/moderation.ts +index 03563bfe..1055927f 100644 +--- a/packages/scorers/src/llm/moderation.ts ++++ b/packages/scorers/src/llm/moderation.ts +@@ -7,6 +7,7 @@ import { + } from "@voltagent/core"; + import { safeStringify } from "@voltagent/internal/utils"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + export interface ModerationScorerOptions { + id?: string; +@@ -220,6 +221,7 @@ async function runModerationJudge(args: { + typeof context.results.prepare === "string" + ? context.results.prepare + : normalizeText(context.payload.output); ++ const tenantId = extractTenantId(context); + + const prompt = await buildPrompt({ + output: normalizedOutput, +@@ -232,12 +234,14 @@ async function runModerationJudge(args: { + const agent = new Agent({ + name: "moderation-judge", + model, ++ trafficPriority: "P2", + instructions: + "You are a safety classifier. Respond with JSON that matches the provided schema containing flagged, scores, and reason.", + }); + + const response = await agent.generateObject(prompt, MODERATION_SCHEMA, { + maxOutputTokens, ++ tenantId, + }); + + const parsed = mapModerationResponse(response.object, threshold); +diff --git a/packages/scorers/src/llm/utils.ts b/packages/scorers/src/llm/utils.ts +new file mode 100644 +index 00000000..75e886e3 +--- /dev/null ++++ b/packages/scorers/src/llm/utils.ts +@@ -0,0 +1,14 @@ ++import type { BuilderPrepareContext, BuilderScoreContext } from "@voltagent/core"; ++ ++type TenantAwareContext = BuilderScoreContext, Record> & ++ BuilderPrepareContext, Record>; ++ ++export function extractTenantId( ++ context: ++ | BuilderScoreContext, Record> ++ | BuilderPrepareContext, Record> ++ | TenantAwareContext, ++): string | undefined { ++ const candidate = (context.payload as { tenantId?: unknown })?.tenantId; ++ return typeof candidate === "string" ? 
candidate : undefined; ++} +diff --git a/packages/server-core/src/handlers/agent.handlers.ts b/packages/server-core/src/handlers/agent.handlers.ts +index 00c0f2ee..37fbeaf4 100644 +--- a/packages/server-core/src/handlers/agent.handlers.ts ++++ b/packages/server-core/src/handlers/agent.handlers.ts +@@ -1,11 +1,70 @@ +-import { ClientHTTPError, type ServerProviderDeps } from "@voltagent/core"; +-import { convertUsage } from "@voltagent/core"; ++import { ++ ClientHTTPError, ++ type ServerProviderDeps, ++ type TrafficResponseMetadata, ++ convertUsage, ++} from "@voltagent/core"; + import { type Logger, safeStringify } from "@voltagent/internal"; + import { z } from "zod"; + import { convertJsonSchemaToZod } from "zod-from-json-schema"; + import { convertJsonSchemaToZod as convertJsonSchemaToZodV3 } from "zod-from-json-schema-v3"; + import type { ApiResponse } from "../types"; + import { processAgentOptions } from "../utils/options"; ++import { buildTrafficHeaders } from "../utils/traffic"; ++ ++function extractTrafficMetadata(value: unknown): TrafficResponseMetadata | undefined { ++ if (!value || typeof value !== "object") return undefined; ++ const traffic = (value as { traffic?: unknown }).traffic; ++ if (!traffic || typeof traffic !== "object") return undefined; ++ return traffic as TrafficResponseMetadata; ++} ++ ++function wrapStreamWithTraffic( ++ baseResponse: Response, ++ traffic?: TrafficResponseMetadata, ++): Response { ++ if (!traffic) return baseResponse; ++ const headers = new Headers(baseResponse.headers); ++ const trafficHeaders = buildTrafficHeaders(traffic); ++ for (const [key, value] of Object.entries(trafficHeaders)) { ++ headers.set(key, value); ++ } ++ const baseBody = baseResponse.body; ++ if (!baseBody) { ++ return new Response(baseBody, { ++ status: baseResponse.status, ++ headers, ++ }); ++ } ++ ++ const encoder = new TextEncoder(); ++ const stream = new ReadableStream({ ++ async start(controller) { ++ const trafficEvent = `data: ${safeStringify({ type: "traffic", traffic })}\n\n`; ++ controller.enqueue(encoder.encode(trafficEvent)); ++ const reader = baseBody.getReader(); ++ try { ++ while (true) { ++ const { done, value } = await reader.read(); ++ if (done) break; ++ if (value !== undefined) { ++ controller.enqueue(value); ++ } ++ } ++ } catch (error) { ++ controller.error(error); ++ } finally { ++ reader.releaseLock(); ++ controller.close(); ++ } ++ }, ++ }); ++ ++ return new Response(stream, { ++ status: baseResponse.status, ++ headers, ++ }); ++} + + /** + * Handler for listing all agents +@@ -79,6 +138,7 @@ export async function handleGenerateText( + const options = processAgentOptions(body, signal); + + const result = await agent.generateText(input, options); ++ const traffic = extractTrafficMetadata(result); + + // Convert usage format if present + const usage = result.usage ? convertUsage(result.usage) : undefined; +@@ -102,9 +162,11 @@ export async function handleGenerateText( + } + })(), + }, ++ traffic, + }; + } catch (error) { + logger.error("Failed to generate text", { error }); ++ const traffic = extractTrafficMetadata(error); + if (error instanceof ClientHTTPError) { + return { + success: false, +@@ -112,11 +174,13 @@ export async function handleGenerateText( + code: error.code, + name: error.name, + httpStatus: error.httpStatus, ++ traffic, + }; + } + return { + success: false, + error: error instanceof Error ? 
error.message : "Unknown error", ++ traffic, + }; + } + } +@@ -153,6 +217,7 @@ export async function handleStreamText( + const options = processAgentOptions(body, signal); + + const result = await agent.streamText(input, options); ++ const traffic = extractTrafficMetadata(result); + + // Access the fullStream property + const { fullStream } = result; +@@ -178,7 +243,7 @@ export async function handleStreamText( + }, + }); + +- return new Response(stream, { ++ const response = new Response(stream, { + status: 200, + headers: { + "Content-Type": "text/event-stream", +@@ -186,20 +251,25 @@ export async function handleStreamText( + Connection: "keep-alive", + }, + }); ++ return wrapStreamWithTraffic(response, traffic); + } catch (error) { + logger.error("Failed to handle stream text request", { error }); + + const errorMessage = error instanceof Error ? error.message : "Unknown error"; ++ const traffic = extractTrafficMetadata(error); ++ const trafficHeaders = buildTrafficHeaders(traffic); + + return new Response( + safeStringify({ + error: errorMessage, + message: errorMessage, ++ traffic, + }), + { + status: 500, + headers: { + "Content-Type": "application/json", ++ ...trafficHeaders, + }, + }, + ); +@@ -238,26 +308,32 @@ export async function handleChatStream( + const options = processAgentOptions(body, signal); + + const result = await agent.streamText(input, options); ++ const traffic = extractTrafficMetadata(result); + + // Use the built-in toUIMessageStreamResponse - it handles errors properly +- return result.toUIMessageStreamResponse({ ++ const response = result.toUIMessageStreamResponse({ + sendReasoning: true, + sendSources: true, + }); ++ return wrapStreamWithTraffic(response, traffic); + } catch (error) { + logger.error("Failed to handle chat stream request", { error }); + + const errorMessage = error instanceof Error ? error.message : "Unknown error"; ++ const traffic = extractTrafficMetadata(error); ++ const trafficHeaders = buildTrafficHeaders(traffic); + + return new Response( + safeStringify({ + error: errorMessage, + message: errorMessage, ++ traffic, + }), + { + status: 500, + headers: { + "Content-Type": "application/json", ++ ...trafficHeaders, + }, + }, + ); +@@ -293,16 +369,20 @@ export async function handleGenerateObject( + ) as any; + + const result = await agent.generateObject(input, zodSchema, options); ++ const traffic = extractTrafficMetadata(result); + + return { + success: true, + data: result.object, ++ traffic, + }; + } catch (error) { + logger.error("Failed to generate object", { error }); ++ const traffic = extractTrafficMetadata(error); + return { + success: false, + error: error instanceof Error ? error.message : "Unknown error", ++ traffic, + }; + } + } +@@ -344,23 +424,29 @@ export async function handleStreamObject( + ) as any; + + const result = await agent.streamObject(input, zodSchema, options); ++ const traffic = extractTrafficMetadata(result); + + // Use the built-in toTextStreamResponse - it handles errors properly +- return result.toTextStreamResponse(); ++ const response = result.toTextStreamResponse(); ++ return wrapStreamWithTraffic(response, traffic); + } catch (error) { + logger.error("Failed to handle stream object request", { error }); + + const errorMessage = error instanceof Error ? 
error.message : "Unknown error"; ++ const traffic = extractTrafficMetadata(error); ++ const trafficHeaders = buildTrafficHeaders(traffic); + + return new Response( + safeStringify({ + error: errorMessage, + message: errorMessage, ++ traffic, + }), + { + status: 500, + headers: { + "Content-Type": "application/json", ++ ...trafficHeaders, + }, + }, + ); +diff --git a/packages/server-core/src/index.ts b/packages/server-core/src/index.ts +index 1fe7e206..2f7ed826 100644 +--- a/packages/server-core/src/index.ts ++++ b/packages/server-core/src/index.ts +@@ -40,6 +40,7 @@ export * from "./utils/server-utils"; + export * from "./utils/ui-templates"; + export * from "./utils/response-mappers"; + export * from "./utils/sse"; ++export * from "./utils/traffic"; + export * from "./utils/announcements"; + + // Export WebSocket utilities +diff --git a/packages/server-core/src/types/responses.ts b/packages/server-core/src/types/responses.ts +index 2098c2f6..4935a535 100644 +--- a/packages/server-core/src/types/responses.ts ++++ b/packages/server-core/src/types/responses.ts +@@ -1,10 +1,12 @@ + /** + * Framework-agnostic response types for server handlers + */ ++import type { TrafficResponseMetadata } from "@voltagent/core"; + + export interface SuccessResponse { + success: true; + data: T; ++ traffic?: TrafficResponseMetadata; + } + + export interface ErrorResponse { +@@ -13,6 +15,7 @@ export interface ErrorResponse { + httpStatus?: number; + code?: string; + name?: string; ++ traffic?: TrafficResponseMetadata; + } + + export type ApiResponse = SuccessResponse | ErrorResponse; +diff --git a/packages/server-core/src/utils/traffic.ts b/packages/server-core/src/utils/traffic.ts +new file mode 100644 +index 00000000..f9be1845 +--- /dev/null ++++ b/packages/server-core/src/utils/traffic.ts +@@ -0,0 +1,35 @@ ++import type { TrafficResponseMetadata } from "@voltagent/core"; ++ ++export function buildTrafficHeaders(traffic?: TrafficResponseMetadata): Record { ++ if (!traffic) return {}; ++ ++ const headers: Record = {}; ++ ++ if (typeof traffic.retryAfterMs === "number" && Number.isFinite(traffic.retryAfterMs)) { ++ headers["Retry-After"] = String(Math.max(0, Math.ceil(traffic.retryAfterMs / 1000))); ++ } ++ ++ if (traffic.rateLimitRemaining !== undefined) { ++ headers["X-RateLimit-Remaining"] = String(traffic.rateLimitRemaining); ++ } ++ ++ if (typeof traffic.rateLimitResetAt === "number" && Number.isFinite(traffic.rateLimitResetAt)) { ++ headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(traffic.rateLimitResetAt / 1000))); ++ } else if ( ++ typeof traffic.rateLimitResetInMs === "number" && ++ Number.isFinite(traffic.rateLimitResetInMs) ++ ) { ++ const resetAt = Date.now() + Math.max(0, traffic.rateLimitResetInMs); ++ headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(resetAt / 1000))); ++ } ++ ++ if (traffic.queueEtaMs !== undefined) { ++ headers["X-Queue-ETA"] = String(traffic.queueEtaMs); ++ } ++ ++ if (traffic.rateLimitKey) { ++ headers["X-RateLimit-Key"] = traffic.rateLimitKey; ++ } ++ ++ return headers; ++} +diff --git a/packages/server-hono/src/routes/index.ts b/packages/server-hono/src/routes/index.ts +index a5af8214..336a5bf4 100644 +--- a/packages/server-hono/src/routes/index.ts ++++ b/packages/server-hono/src/routes/index.ts +@@ -2,6 +2,7 @@ import type { ServerProviderDeps } from "@voltagent/core"; + import type { Logger } from "@voltagent/internal"; + import { + UPDATE_ROUTES, ++ buildTrafficHeaders, + handleCancelWorkflow, + handleChatStream, + handleCheckUpdates, +@@ -87,11 
+88,12 @@ export function registerAgentRoutes( + + const signal = c.req.raw.signal; + const response = await handleGenerateText(agentId, body, deps, logger, signal); ++ const trafficHeaders = buildTrafficHeaders(response.traffic); + if (!response.success) { + const { httpStatus, ...details } = response; +- return c.json(details, httpStatus || 500); ++ return c.json(details, httpStatus || 500, trafficHeaders); + } +- return c.json(response, 200); ++ return c.json(response, 200, trafficHeaders); + }); + + // POST /agents/:id/stream - Stream text (raw fullStream SSE) +@@ -131,11 +133,12 @@ export function registerAgentRoutes( + const body = await c.req.json(); + const signal = c.req.raw.signal; + const response = await handleGenerateObject(agentId, body, deps, logger, signal); ++ const trafficHeaders = buildTrafficHeaders(response.traffic); + if (!response.success) { + const { httpStatus, ...details } = response; +- return c.json(details, httpStatus || 500); ++ return c.json(details, httpStatus || 500, trafficHeaders); + } +- return c.json(response, 200); ++ return c.json(response, 200, trafficHeaders); + }); + + // POST /agents/:id/stream-object - Stream object +diff --git a/packages/serverless-hono/src/routes.ts b/packages/serverless-hono/src/routes.ts +index d377ce4b..39eabcf7 100644 +--- a/packages/serverless-hono/src/routes.ts ++++ b/packages/serverless-hono/src/routes.ts +@@ -28,6 +28,7 @@ import { + type TriggerHttpRequestContext, + UPDATE_ROUTES, + WORKFLOW_ROUTES, ++ buildTrafficHeaders, + executeA2ARequest, + executeTriggerHandler, + getConversationMessagesHandler, +@@ -165,7 +166,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger: + } + const signal = c.req.raw.signal; + const response = await handleGenerateText(agentId, body, deps, logger, signal); +- return c.json(response, response.success ? 200 : 500); ++ const trafficHeaders = buildTrafficHeaders(response.traffic); ++ return c.json(response, response.success ? 200 : 500, trafficHeaders); + }); + + app.post(AGENT_ROUTES.streamText.path, async (c) => { +@@ -197,7 +199,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger: + } + const signal = c.req.raw.signal; + const response = await handleGenerateObject(agentId, body, deps, logger, signal); +- return c.json(response, response.success ? 200 : 500); ++ const trafficHeaders = buildTrafficHeaders(response.traffic); ++ return c.json(response, response.success ? 
200 : 500, trafficHeaders); + }); + + app.post(AGENT_ROUTES.streamObject.path, async (c) => { +diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml +index 6675056e..244ce4d1 100644 +--- a/pnpm-lock.yaml ++++ b/pnpm-lock.yaml +@@ -37,7 +37,7 @@ importers: + version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) + '@nx/plugin': + specifier: 20.4.6 +- version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2) ++ version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2) + '@nx/vite': + specifier: 20.4.6 + version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2)(vite@7.2.7)(vitest@3.2.4) +@@ -92,6 +92,9 @@ importers: + syncpack: + specifier: ^13.0.2 + version: 13.0.4(typescript@5.9.2) ++ ts-node: ++ specifier: ^10.9.2 ++ version: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) + tslib: + specifier: ^2.3.0 + version: 2.8.1 +@@ -99,7 +102,7 @@ importers: + specifier: ^8.5.0 + version: 8.5.0(@swc/core@1.5.29)(typescript@5.9.2) + typescript: +- specifier: ^5.8.2 ++ specifier: ^5.9.2 + version: 5.9.2 + vite: + specifier: ^7.2.7 +@@ -2750,6 +2753,61 @@ importers: + specifier: ^0.5.3 + version: 0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7) -- const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA); -+ const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA, { -+ tenantId, -+ }); - verdicts.push({ - statement, - verdict: verifyResponse.object.verdict, -diff --git a/packages/scorers/src/llm/context-relevancy.ts b/packages/scorers/src/llm/context-relevancy.ts -index ee882b5b..aca608b2 100644 ---- a/packages/scorers/src/llm/context-relevancy.ts -+++ b/packages/scorers/src/llm/context-relevancy.ts -@@ -7,6 +7,7 @@ import { - import { safeStringify } from "@voltagent/internal/utils"; - import type { LanguageModel } from "ai"; - import { z } from "zod"; -+import { extractTenantId } from "./utils"; ++ examples/with-viteval/dist: ++ dependencies: ++ '@ai-sdk/openai': ++ specifier: ^2.0.52 ++ version: 2.0.85(zod@3.25.76) ++ '@voltagent/cli': ++ specifier: ^0.1.16 ++ version: link:../../../packages/cli ++ '@voltagent/core': ++ specifier: ^1.2.15 ++ version: link:../../../packages/core ++ '@voltagent/libsql': ++ specifier: ^1.0.13 ++ version: link:../../../packages/libsql ++ '@voltagent/logger': ++ specifier: ^1.0.4 ++ version: link:../../../packages/logger ++ '@voltagent/server-hono': ++ specifier: ^1.2.5 ++ version: link:../../../packages/server-hono ++ ai: ++ specifier: ^5.0.76 ++ version: 5.0.113(zod@3.25.76) ++ consola: ++ specifier: ^3.4.2 ++ version: 3.4.2 ++ envalid: ++ specifier: ^8.1.0 ++ version: 8.1.0 ++ yargs: ++ specifier: ^18.0.0 ++ version: 18.0.0 ++ zod: ++ specifier: ^3.25.76 ++ version: 3.25.76 ++ devDependencies: ++ '@tsconfig/node24': ++ specifier: ^24.0.1 ++ version: 24.0.1 ++ '@types/yargs': ++ specifier: ^17.0.33 ++ version: 17.0.33 ++ dotenv: ++ specifier: ^16.4.5 ++ version: 16.6.1 ++ tsx: ++ specifier: ^4.19.3 ++ version: 4.20.4 ++ typescript: ++ specifier: ^5.8.2 ++ version: 5.9.2 ++ viteval: ++ specifier: ^0.5.3 ++ version: 
0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7) ++ + examples/with-voice-elevenlabs: + dependencies: + '@ai-sdk/openai': +@@ -3509,7 +3567,7 @@ importers: + version: 3.2.4(vitest@3.2.4) + jest: + specifier: ^29.5.0 +- version: 29.7.0(@types/node@24.2.1) ++ version: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + ts-jest: + specifier: ^29.1.0 + version: 29.4.1(@babel/core@7.28.5)(esbuild@0.25.10)(jest@29.7.0)(typescript@5.9.2) +@@ -9966,7 +10024,7 @@ packages: + slash: 3.0.0 + dev: true - const CONTEXT_RELEVANCY_PROMPT = `Analyze the provided context and identify which parts are relevant to answering the given question. For each context sentence or passage, determine its relevance level. +- /@jest/core@29.7.0: ++ /@jest/core@29.7.0(ts-node@10.9.2): + resolution: {integrity: sha512-n7aeXWKMnGtDA48y8TLWJPJmLmmZ642Ceo78cYWEpiD7FzDgmNDV/GCVRorPABdXLJZ/9wzzgZAlHjXjxDHGsg==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: +@@ -9987,7 +10045,7 @@ packages: + exit: 0.1.2 + graceful-fs: 4.2.11 + jest-changed-files: 29.7.0 +- jest-config: 29.7.0(@types/node@24.6.2) ++ jest-config: 29.7.0(@types/node@24.6.2)(ts-node@10.9.2) + jest-haste-map: 29.7.0 + jest-message-util: 29.7.0 + jest-regex-util: 29.6.3 +@@ -12403,7 +12461,7 @@ packages: + - verdaccio + dev: true -@@ -144,6 +145,7 @@ export function createContextRelevancyScorer< - const agent = new Agent({ - name: "context-relevancy-evaluator", - model, -+ trafficPriority: "P2", - instructions: "You evaluate how relevant provided context is to answering questions", - }); +- /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2): ++ /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2): + resolution: {integrity: sha512-yZOZJOQFtpdY3Fu/WYNoDx81TwvF9yfwvalFpLD19bz+2YGl7B89l0S1ZrtSRXFfKXA/w7gb0gmKwthJtQhx9Q==} + dependencies: + '@jest/reporters': 29.7.0 +@@ -12412,7 +12470,7 @@ packages: + '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) + '@phenomnomnominal/tsquery': 5.0.1(typescript@5.9.2) + identity-obj-proxy: 3.0.0 +- jest-config: 29.7.0(@types/node@24.2.1) ++ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + jest-resolve: 29.7.0 + jest-util: 29.7.0 + minimatch: 9.0.3 +@@ -12807,12 +12865,12 @@ packages: + dev: true + optional: true -@@ -151,13 +153,16 @@ export function createContextRelevancyScorer< - const contextText = Array.isArray(payload.context) - ? 
payload.context.join("\n") - : payload.context; -+ const tenantId = extractTenantId(context); +- /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2): ++ /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2): + resolution: {integrity: sha512-7Jlv+BVqGoO0BolQN7P5Z87160phuE1i7H6C8xFwQnlQ3ZfwQCJzk2dkg1UyzxDkWl6lvVsqBjZPXD55gFQ3+w==} + dependencies: + '@nx/devkit': 20.4.6(nx@20.8.2) + '@nx/eslint': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2) +- '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) ++ '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2) + '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) + tslib: 2.8.1 + transitivePeerDependencies: +@@ -17770,8 +17828,8 @@ packages: + '@babel/plugin-syntax-jsx': 7.27.1(@babel/core@7.28.5) + '@babel/plugin-syntax-typescript': 7.27.1(@babel/core@7.28.5) + '@babel/template': 7.27.2 +- '@babel/traverse': 7.28.4 +- '@babel/types': 7.28.4 ++ '@babel/traverse': 7.28.5 ++ '@babel/types': 7.28.5 + '@tanstack/react-router': 1.131.44(react-dom@19.2.3)(react@19.2.3) + '@tanstack/router-core': 1.131.44 + '@tanstack/router-generator': 1.131.44 +@@ -22783,7 +22841,7 @@ packages: + crc-32: 1.2.2 + readable-stream: 4.7.0 - const prompt = CONTEXT_RELEVANCY_PROMPT.replace("{{question}}", payload.input).replace( - "{{context}}", - contextText, - ); +- /create-jest@29.7.0(@types/node@24.2.1): ++ /create-jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): + resolution: {integrity: sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + hasBin: true +@@ -22792,7 +22850,7 @@ packages: + chalk: 4.1.2 + exit: 0.1.2 + graceful-fs: 4.2.11 +- jest-config: 29.7.0(@types/node@24.2.1) ++ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + jest-util: 29.7.0 + prompts: 2.4.2 + transitivePeerDependencies: +@@ -27641,7 +27699,7 @@ packages: + - supports-color + dev: true -- const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA); -+ const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA, { +- /jest-cli@29.7.0(@types/node@24.2.1): ++ /jest-cli@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): + resolution: {integrity: sha512-OVVobw2IubN/GSYsxETi+gOe7Ka59EFMR/twOU3Jb2GnKKeMGJB5SGUUrEz3SFVmJASUdZUzy83sLNNQ2gZslg==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + hasBin: true +@@ -27651,14 +27709,14 @@ packages: + node-notifier: + optional: true + dependencies: +- '@jest/core': 29.7.0 ++ '@jest/core': 29.7.0(ts-node@10.9.2) + '@jest/test-result': 29.7.0 + '@jest/types': 29.6.3 + chalk: 4.1.2 +- create-jest: 29.7.0(@types/node@24.2.1) ++ create-jest: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + exit: 0.1.2 + import-local: 3.2.0 +- jest-config: 29.7.0(@types/node@24.2.1) ++ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + jest-util: 29.7.0 + jest-validate: 29.7.0 + yargs: 17.7.2 +@@ -27669,7 +27727,7 @@ packages: + - ts-node + dev: true + +- /jest-config@29.7.0(@types/node@24.2.1): ++ /jest-config@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): + resolution: {integrity: 
sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: +@@ -27704,12 +27762,13 @@ packages: + pretty-format: 29.7.0 + slash: 3.0.0 + strip-json-comments: 3.1.1 ++ ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) + transitivePeerDependencies: + - babel-plugin-macros + - supports-color + dev: true + +- /jest-config@29.7.0(@types/node@24.6.2): ++ /jest-config@29.7.0(@types/node@24.6.2)(ts-node@10.9.2): + resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: +@@ -27744,6 +27803,7 @@ packages: + pretty-format: 29.7.0 + slash: 3.0.0 + strip-json-comments: 3.1.1 ++ ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) + transitivePeerDependencies: + - babel-plugin-macros + - supports-color +@@ -28041,7 +28101,7 @@ packages: + supports-color: 8.1.1 + dev: true + +- /jest@29.7.0(@types/node@24.2.1): ++ /jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): + resolution: {integrity: sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + hasBin: true +@@ -28051,10 +28111,10 @@ packages: + node-notifier: + optional: true + dependencies: +- '@jest/core': 29.7.0 ++ '@jest/core': 29.7.0(ts-node@10.9.2) + '@jest/types': 29.6.3 + import-local: 3.2.0 +- jest-cli: 29.7.0(@types/node@24.2.1) ++ jest-cli: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + transitivePeerDependencies: + - '@types/node' + - babel-plugin-macros +@@ -36767,7 +36827,7 @@ packages: + esbuild: 0.25.10 + fast-json-stable-stringify: 2.1.0 + handlebars: 4.7.8 +- jest: 29.7.0(@types/node@24.2.1) ++ jest: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + json5: 2.2.3 + lodash.memoize: 4.1.2 + make-error: 1.3.6 +diff --git a/tmp/test/traffic-concurrency.ts b/tmp/test/traffic-concurrency.ts +new file mode 100644 +index 00000000..d12fc5c9 +--- /dev/null ++++ b/tmp/test/traffic-concurrency.ts +@@ -0,0 +1,91 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController maxConcurrent scheduling. ++ * ++ * What to look for: ++ * - `inFlight` should never exceed `maxConcurrent`. ++ * - Requests should start in bursts up to `maxConcurrent`. 
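++ *
++ * A minimal sketch of the scheduling contract under test (illustrative
++ * only; assumes nothing beyond the `getTrafficController`/`handleText`
++ * API introduced on this branch):
++ *
++ *   import { getTrafficController } from "@voltagent/core";
++ *
++ *   const controller = getTrafficController({ maxConcurrent: 2 });
++ *   // All three enqueue immediately, but at most two `execute` callbacks
++ *   // run at once; the third starts only when a slot frees up.
++ *   await Promise.all(
++ *     ["a", "b", "c"].map((id) =>
++ *       controller.handleText({
++ *         metadata: { agentName: "demo", model: "demo-model" },
++ *         execute: async () => id,
++ *       }),
++ *     ),
++ *   );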
++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-concurrency.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-concurrency.ts (enable controller debug logs) ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++const maxConcurrent = 3; ++const controller = getTrafficController({ maxConcurrent }); ++ ++let inFlight = 0; ++let maxObserved = 0; ++ ++function makeModel(id: string, durationMs: number) { ++ return { ++ specificationVersion: "v2", ++ provider: "sim", ++ modelId: `concurrency-${id}`, ++ doGenerate: async () => { ++ inFlight += 1; ++ maxObserved = Math.max(maxObserved, inFlight); ++ console.log(`[${now()}] start ${id} inFlight=${inFlight}`); ++ ++ try { ++ await sleep(durationMs); ++ return { ++ content: [{ type: "text", text: `ok:${id}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId: `concurrency-${id}`, headers: {} }, ++ }; ++ } finally { ++ inFlight -= 1; ++ console.log(`[${now()}] end ${id} inFlight=${inFlight}`); ++ } ++ }, ++ }; ++} ++ ++async function main() { ++ console.log(`\n=== TrafficController concurrency (maxConcurrent=${maxConcurrent}) ===`); ++ void controller; ++ ++ const agent = new Agent({ ++ name: "traffic-concurrency", ++ instructions: "echo", ++ model: makeModel("base", 0), ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ const ids = ["A", "B", "C", "D", "E"]; ++ const jobs = ids.map((id) => ++ agent.generateText(id, { ++ tenantId: "default", ++ trafficPriority: "P1", ++ model: makeModel(id, 700), ++ }), ++ ); ++ ++ const settled = await Promise.allSettled(jobs); ++ console.log(`\n[done] maxObserved=${maxObserved}`); ++ console.log( ++ `[done] results=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), ++ )}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-fallback-chain.ts b/tmp/test/traffic-fallback-chain.ts +new file mode 100644 +index 00000000..0cd77b2b +--- /dev/null ++++ b/tmp/test/traffic-fallback-chain.ts +@@ -0,0 +1,168 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController circuit breaker + fallback chains. ++ * ++ * Scenarios: ++ * - Test 1: Open primary circuit (via repeated 429s), then route to fallback1. ++ * - Test 2: Open fallback1 circuit, then route to fallback2 (success). ++ * - Test 3: No fallback configured → CircuitBreakerOpenError. 
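++ *
++ * Sketch of the wiring these scenarios rely on (illustrative; mirrors the
++ * `fallbackChains` option and `CircuitBreakerOpenError` exercised below):
++ *
++ *   const controller = getTrafficController({
++ *     maxConcurrent: 1,
++ *     // Once "primary"'s circuit opens, requests route to "fallback1",
++ *     // then "fallback2", in order.
++ *     fallbackChains: { primary: ["fallback1", "fallback2"] },
++ *   });
++ *
++ *   try {
++ *     await agent.generateText("hi");
++ *   } catch (err) {
++ *     if (err instanceof CircuitBreakerOpenError) {
++ *       console.log(err.retryAfterMs); // no fallback left; circuit still open
++ *     }
++ *   }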
++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-fallback-chain.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-fallback-chain.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { MockLanguageModelV2, MockProviderV2 } from "ai/test"; ++import { ++ Agent, ++ CircuitBreakerOpenError, ++ getTrafficController, ++} from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++type ModelId = "primary" | "fallback1" | "fallback2" | "no-fallback"; ++ ++const provider = "test-provider"; ++ ++const controller = getTrafficController({ ++ maxConcurrent: 1, ++ fallbackChains: { ++ primary: ["fallback1", "fallback2"], ++ fallback1: ["fallback2"], ++ }, ++}); ++ ++function makeAlways429Model(modelId: ModelId) { ++ let attempts = 0; ++ return new MockLanguageModelV2({ ++ provider, ++ modelId, ++ doGenerate: async () => { ++ attempts += 1; ++ console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> 429`); ++ await sleep(25); ++ const err: any = new Error(`forced 429 for model=${modelId} attempt=${attempts}`); ++ err.status = 429; ++ throw err; ++ }, ++ }); ++} ++ ++function makeAlwaysOkModel(modelId: ModelId) { ++ let attempts = 0; ++ return new MockLanguageModelV2({ ++ provider, ++ modelId, ++ doGenerate: async () => { ++ attempts += 1; ++ console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> ok`); ++ await sleep(25); ++ return { ++ content: [{ type: "text", text: `ok:${modelId}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }); ++} ++ ++const primaryModel = makeAlways429Model("primary"); ++const fallback1Model = makeAlways429Model("fallback1"); ++const fallback2Model = makeAlwaysOkModel("fallback2"); ++const noFallbackModel = makeAlways429Model("no-fallback"); ++ ++// Required so Agent fallbacks (string model IDs) resolve without network calls. ++(globalThis as any).AI_SDK_DEFAULT_PROVIDER = new MockProviderV2({ ++ languageModels: { ++ primary: primaryModel, ++ fallback1: fallback1Model, ++ fallback2: fallback2Model, ++ "no-fallback": noFallbackModel, ++ }, ++}); ++ ++const primaryAgent = new Agent({ ++ name: "traffic-fallback-primary", ++ instructions: "echo", ++ model: primaryModel, ++ temperature: 0, ++ maxOutputTokens: 32, ++}); ++ ++const noFallbackAgent = new Agent({ ++ name: "traffic-fallback-none", ++ instructions: "echo", ++ model: noFallbackModel, ++ temperature: 0, ++ maxOutputTokens: 32, ++}); ++ ++async function runOnce(label: string, agent: any) { ++ console.log(`\n--- ${label} ---`); ++ try { ++ const result = await agent.generateText(label, { ++ tenantId: "default", ++ trafficPriority: "P1", ++ }); ++ console.log( ++ `[${label}] success text=${result.text} responseModel=${result.response?.modelId ?? "n/a"}`, ++ ); ++ } catch (err: any) { ++ if (err instanceof CircuitBreakerOpenError) { ++ console.log( ++ `[${label}] CircuitBreakerOpenError retryAfterMs=${err.retryAfterMs} msg=${err.message}`, ++ ); ++ } else { ++ console.log( ++ `[${label}] failed name=${err?.name ?? "Error"} status=${err?.status ?? err?.statusCode ?? 
"n/a"} msg=${err?.message}`, ++ ); ++ } ++ } ++} ++ ++async function main() { ++ console.log("\n=== Circuit breaker + fallback chain ==="); ++ void controller; ++ ++ console.log("\n[Test 1] Open primary circuit, then route to fallback1"); ++ // Two calls * (up to 3 retries each) ≈ 6 failures → should open the circuit (threshold=5). ++ await runOnce("primary-warmup-1", primaryAgent); ++ await runOnce("primary-warmup-2", primaryAgent); ++ await runOnce("primary-after-open", primaryAgent); // should execute fallback1 (still closed) ++ ++ console.log("\n[Test 2] Open fallback1 circuit, then route to fallback2"); ++ // Build enough failures on fallback1 by routing multiple requests to it via primary circuit-open path. ++ await runOnce("fallback1-warmup-1-via-primary", primaryAgent); ++ await runOnce("fallback1-warmup-2-via-primary", primaryAgent); ++ await runOnce("primary-should-hit-fallback2", primaryAgent); // should execute fallback2 and succeed ++ ++ console.log("\n[Test 3] No fallback configured → CircuitBreakerOpenError"); ++ await runOnce("no-fallback-warmup-1", noFallbackAgent); ++ await runOnce("no-fallback-warmup-2", noFallbackAgent); ++ await runOnce("no-fallback-after-open", noFallbackAgent); ++ ++ console.log("\n[debug] model call counts:"); ++ console.log( ++ safeStringify({ ++ primary: primaryModel.doGenerateCalls?.length, ++ fallback1: fallback1Model.doGenerateCalls?.length, ++ fallback2: fallback2Model.doGenerateCalls?.length, ++ "no-fallback": noFallbackModel.doGenerateCalls?.length, ++ }), ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-priority-openai-real.ts b/tmp/test/traffic-priority-openai-real.ts +new file mode 100644 +index 00000000..223263ba +--- /dev/null ++++ b/tmp/test/traffic-priority-openai-real.ts +@@ -0,0 +1,117 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController + AI SDK with real OpenAI calls. ++ * ++ * What this exercises: ++ * - Priority scheduling (P0/P1/P2) with `maxConcurrent=1` ++ * - Rate limit header ingestion via `updateRateLimitFromHeaders()` (if headers are present) ++ * - Tenant usage aggregation via `extractUsage` + `getTenantUsage()` ++ * ++ * Prereqs: ++ * - Set `OPENAI_API_KEY` ++ * ++ * Run: ++ * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts ++ * - VERBOSE=1 OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts ++ * ++ * Notes: ++ * - This will make real network calls and may incur cost. ++ */ ++ ++import { openai } from "@ai-sdk/openai"; ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const apiKey = process.env.OPENAI_API_KEY; ++if (!apiKey) { ++ console.error("Missing OPENAI_API_KEY. Example:"); ++ console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts"); ++ process.exit(1); ++} ++ ++const _now = () => new Date().toISOString(); ++const preview = (value: unknown, max = 140) => { ++ if (typeof value !== "string") return String(value ?? ""); ++ return value.length > max ? `${value.slice(0, max)}…` : value; ++}; ++ ++const tenantId = process.env.TENANT_ID ?? "openai-real"; ++const defaultModelId = process.env.OPENAI_MODEL ?? 
"gpt-4o-mini"; ++ ++const controller = getTrafficController({ maxConcurrent: 1 }); ++ ++function getHeader(headers: any, name: string): string | undefined { ++ if (!headers) return undefined; ++ if (typeof headers.get === "function") { ++ const v = headers.get(name); ++ return v === null || v === undefined ? undefined : String(v); ++ } ++ const key = Object.keys(headers).find((k) => k.toLowerCase() === name.toLowerCase()); ++ if (!key) return undefined; ++ const v = headers[key]; ++ return v === null || v === undefined ? undefined : String(Array.isArray(v) ? v[0] : v); ++} ++ ++async function main() { ++ console.log( ++ `\n=== OpenAI real: priority scheduling (tenantId=${tenantId}, model=${defaultModelId}) ===`, ++ ); ++ void controller; ++ ++ const agent = new Agent({ ++ name: "openai-real-traffic", ++ instructions: "Reply exactly with the requested token.", ++ model: openai(defaultModelId), ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ // Enqueue in reverse priority order; controller should still execute P0 first. ++ const p2 = agent.generateText("Reply with only: P2", { tenantId, trafficPriority: "P2" }); ++ const p1 = agent.generateText("Reply with only: P1", { tenantId, trafficPriority: "P1" }); ++ const p0 = agent.generateText("Reply with only: P0", { tenantId, trafficPriority: "P0" }); ++ ++ const settled = await Promise.allSettled([p0, p1, p2]); ++ for (const result of settled) { ++ if (result.status !== "fulfilled") { ++ console.log(`[result] rejected=${result.reason?.message ?? String(result.reason)}`); ++ continue; ++ } ++ ++ const headers = result.value.response?.headers; ++ const limit = getHeader(headers, "x-ratelimit-limit-requests"); ++ const remaining = getHeader(headers, "x-ratelimit-remaining-requests"); ++ const reset = getHeader(headers, "x-ratelimit-reset-requests"); ++ ++ console.log( ++ `[result] text=${preview(result.value.text)} finishReason=${result.value.finishReason} usage=${safeStringify(result.value.usage)}`, ++ ); ++ console.log( ++ `[result] ratelimitHeaders=${safeStringify({ ++ limit, ++ remaining, ++ reset, ++ })}`, ++ ); ++ } ++ ++ console.log( ++ `\n[done] settled=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? preview(s.value.text) : s.reason?.message)), ++ )}`, ++ ); ++ ++ console.log( ++ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-priority-openai-sim.ts b/tmp/test/traffic-priority-openai-sim.ts +new file mode 100644 +index 00000000..9d36a7d1 +--- /dev/null ++++ b/tmp/test/traffic-priority-openai-sim.ts +@@ -0,0 +1,114 @@ ++// @ts-nocheck ++/** ++ * Manual test: Agent → TrafficController priority scheduling (OpenAI-like stub models). ++ * ++ * This keeps the Agent + AI SDK path, but avoids real network calls by using stub models ++ * that pretend to be `provider="openai"` with modelIds like `gpt-4o`/`gpt-4o-mini`. ++ * ++ * Scenarios: ++ * - Test 1: P0 runs before P1/P2 when all runnable. ++ * - Test 2: P0 request (gpt-4o) is rate-limited → P1 (gpt-4o-mini) proceeds. ++ * ++ * Note: ++ * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s. 
++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-priority-openai-sim.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++function makeOpenAIStubModel(modelId: string, delayMs: number) { ++ let calls = 0; ++ return { ++ specificationVersion: "v2", ++ provider: "openai", ++ modelId, ++ doGenerate: async () => { ++ calls += 1; ++ console.log(`[${now()}] [model] ${modelId} doGenerate call=${calls}`); ++ await sleep(delayMs); ++ return { ++ content: [{ type: "text", text: `ok:${modelId}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++const controller = getTrafficController({ maxConcurrent: 1 }); ++ ++const modelMini = makeOpenAIStubModel("gpt-4o-mini", 80); ++const modelBig = makeOpenAIStubModel("gpt-4o", 80); ++ ++const agent = new Agent({ ++ name: "priority-openai-sim", ++ instructions: "echo", ++ model: modelMini, ++ temperature: 0, ++ maxOutputTokens: 32, ++}); ++ ++async function test1_priorityOrder() { ++ console.log("\n=== Test 1: P0 ordering via Agent ==="); ++ ++ const p2 = agent.generateText("P2", { trafficPriority: "P2", tenantId: "sim" }); ++ const p1 = agent.generateText("P1", { trafficPriority: "P1", tenantId: "sim" }); ++ const p0 = agent.generateText("P0", { trafficPriority: "P0", tenantId: "sim" }); ++ ++ const results = await Promise.all([p0, p1, p2]); ++ console.log(`[Test 1] results=${safeStringify(results.map((r) => r.text))}`); ++} ++ ++async function test2_p1RunsWhenP0RateLimited() { ++ console.log("\n=== Test 2: P1 proceeds when P0 is rate-limited ==="); ++ ++ // Seed remaining=0 for openai::gpt-4o so the P0 head item initially waits. ++ const applied = controller.updateRateLimitFromHeaders( ++ { provider: "openai", model: "gpt-4o" }, ++ { ++ "x-ratelimit-limit-requests": "1", ++ "x-ratelimit-remaining-requests": "0", ++ "x-ratelimit-reset-requests": "1s", ++ }, ++ ); ++ console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`); ++ ++ const p0Blocked = agent.generateText("P0 (gpt-4o, rate-limited)", { ++ trafficPriority: "P0", ++ tenantId: "sim", ++ model: modelBig, // per-call model override (new in this branch) ++ }); ++ ++ const p1Free = agent.generateText("P1 (gpt-4o-mini)", { ++ trafficPriority: "P1", ++ tenantId: "sim", ++ model: modelMini, ++ }); ++ ++ const [r0, r1] = await Promise.all([p0Blocked, p1Free]); ++ console.log(`[Test 2] p0 text=${r0.text}`); ++ console.log(`[Test 2] p1 text=${r1.text}`); ++} ++ ++async function main() { ++ await test1_priorityOrder(); ++ await test2_p1RunsWhenP0RateLimited(); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-priority.ts b/tmp/test/traffic-priority.ts +new file mode 100644 +index 00000000..409e1078 +--- /dev/null ++++ b/tmp/test/traffic-priority.ts +@@ -0,0 +1,159 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController priority scheduling. ++ * ++ * Scenarios: ++ * - Test 1: P0 should run before P1/P2 when runnable. ++ * - Test 2: If a P0 request is rate-limited, a lower priority (P1) can proceed. 
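++ *
++ * The enqueue pattern Test 1 relies on (P0 is the highest priority;
++ * `trafficPriority` is the per-call option added on this branch):
++ *
++ *   // Enqueued in reverse order on a maxConcurrent=1 controller...
++ *   const p2 = agent.generateText("P2", { trafficPriority: "P2" });
++ *   const p1 = agent.generateText("P1", { trafficPriority: "P1" });
++ *   const p0 = agent.generateText("P0", { trafficPriority: "P0" });
++ *   // ...yet P0 should reach doGenerate first once the current slot frees.
++ *   await Promise.all([p0, p1, p2]);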
++ * ++ * Note: ++ * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s. ++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-priority.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-priority.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++const controller = getTrafficController({ maxConcurrent: 1 }); ++ ++function extractLabel(prompt: any): string { ++ if (!Array.isArray(prompt)) { ++ return "unknown"; ++ } ++ ++ for (let index = prompt.length - 1; index >= 0; index -= 1) { ++ const message = prompt[index]; ++ if (!message || message.role !== "user" || !Array.isArray(message.content)) { ++ continue; ++ } ++ ++ const textPart = message.content.find((part: any) => part?.type === "text"); ++ if (textPart?.text) { ++ return String(textPart.text); ++ } ++ } ++ ++ return "unknown"; ++} ++ ++function makeModel(provider: string, modelId: string, delayMs = 50) { ++ let calls = 0; ++ let lastStartAt = 0; ++ ++ return { ++ specificationVersion: "v2", ++ provider, ++ modelId, ++ doGenerate: async (options: any) => { ++ calls += 1; ++ const startAt = Date.now(); ++ const delta = lastStartAt ? startAt - lastStartAt : 0; ++ lastStartAt = startAt; ++ ++ const label = extractLabel(options?.prompt); ++ console.log( ++ `[${now()}] doGenerate start model=${provider}::${modelId} call=${calls} (+${delta}ms) input=${label}`, ++ ); ++ await sleep(delayMs); ++ console.log(`[${now()}] doGenerate end model=${provider}::${modelId} input=${label}`); ++ ++ return { ++ content: [{ type: "text", text: `ok:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++async function test1_priorityOrder() { ++ console.log("\n=== Test 1: priority order (P0 before P1/P2) ==="); ++ ++ const sharedModel = makeModel("p", "shared-model", 50); ++ const agent = new Agent({ ++ name: "traffic-priority", ++ instructions: "echo", ++ model: sharedModel, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ // Enqueue in reverse order; scheduler should still run P0 first. ++ const p2 = agent.generateText("P2", { tenantId: "default", trafficPriority: "P2" }); ++ const p1 = agent.generateText("P1", { tenantId: "default", trafficPriority: "P1" }); ++ const p0 = agent.generateText("P0", { tenantId: "default", trafficPriority: "P0" }); ++ ++ const settled = await Promise.allSettled([p0, p1, p2]); ++ console.log( ++ `[Test 1] results=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? 
s.value.text : s.reason?.message)), ++ )}`, ++ ); ++} ++ ++async function test2_lowerPriorityWhenP0RateLimited() { ++ console.log("\n=== Test 2: P1 proceeds when P0 rate-limited ==="); ++ ++ const applied = controller.updateRateLimitFromHeaders( ++ { provider: "p0", model: "m0" }, ++ { ++ "x-ratelimit-limit-requests": "1", ++ "x-ratelimit-remaining-requests": "0", ++ "x-ratelimit-reset-requests": "1s", ++ }, ++ ); ++ console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`); ++ ++ const modelP0 = makeModel("p0", "m0", 50); ++ const modelP1 = makeModel("p1", "m1", 50); ++ const agent = new Agent({ ++ name: "traffic-priority-rate-limit", ++ instructions: "echo", ++ model: modelP1, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ // Now the next P0 request is at the head of the queue but rate-limited, ++ // so a runnable P1 request should execute first. ++ const p0Blocked = agent.generateText("P0-blocked (rate limited)", { ++ tenantId: "default", ++ trafficPriority: "P0", ++ model: modelP0, ++ }); ++ const p1Free = agent.generateText("P1-free (should run first)", { ++ tenantId: "default", ++ trafficPriority: "P1", ++ model: modelP1, ++ }); ++ ++ const settled = await Promise.allSettled([p0Blocked, p1Free]); ++ console.log( ++ `[Test 2] results=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), ++ )}`, ++ ); ++} ++ ++async function main() { ++ await test1_priorityOrder(); ++ await test2_lowerPriorityWhenP0RateLimited(); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-rate-limit-from-headers.ts b/tmp/test/traffic-rate-limit-from-headers.ts +new file mode 100644 +index 00000000..d8262661 +--- /dev/null ++++ b/tmp/test/traffic-rate-limit-from-headers.ts +@@ -0,0 +1,158 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController dynamic rate limits from OpenAI response headers. ++ * ++ * This hits the real OpenAI model via Agent + AI SDK, and relies on the ++ * `x-ratelimit-*` response headers to seed/update the TrafficController. ++ * ++ * What to look for: ++ * - Each request prints the observed `x-ratelimit-*` headers (if present). ++ * - Agent should also log: "[Traffic] Applied rate limit from response headers". ++ * - With enough parallel requests, some requests may take longer due to controller throttling. ++ * ++ * Prereqs: ++ * - Set `OPENAI_API_KEY` ++ * ++ * Optional env: ++ * - `OPENAI_MODEL` (default: gpt-4o-mini) ++ * - `REQUESTS` (default: 10) ++ * - `MAX_CONCURRENT` (default: 50) ++ * - `TENANT_ID` (default: openai-rate-limit-headers) ++ * ++ * Run: ++ * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts ++ * - VERBOSE=1 OPENAI_API_KEY=... REQUESTS=30 pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts ++ */ ++ ++import { openai } from "@ai-sdk/openai"; ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const now = () => new Date().toISOString(); ++ ++const apiKey = process.env.OPENAI_API_KEY; ++if (!apiKey) { ++ console.error("Missing OPENAI_API_KEY. Example:"); ++ console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts"); ++ process.exit(1); ++} ++ ++const provider = "openai"; ++const modelId = process.env.OPENAI_MODEL ?? 
"gpt-4o-mini"; ++const tenantId = process.env.TENANT_ID ?? "openai-rate-limit-headers"; ++const requestCountRaw = Number(process.env.REQUESTS ?? "10"); ++const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50"); ++const requestCount = Number.isFinite(requestCountRaw) && requestCountRaw > 0 ? requestCountRaw : 10; ++const maxConcurrent = ++ Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? maxConcurrentRaw : 50; ++ ++const key = `${provider}::${modelId}`; ++const controller = getTrafficController({ maxConcurrent }); ++ ++function getHeader(headers: any, name: string): string | undefined { ++ if (!headers) return undefined; ++ if (typeof headers.get === "function") { ++ const v = headers.get(name); ++ return v === null || v === undefined ? undefined : String(v); ++ } ++ ++ const entries = Object.entries(headers as Record); ++ const target = name.toLowerCase(); ++ const match = entries.find(([k]) => String(k).toLowerCase() === target); ++ if (!match) return undefined; ++ ++ const value = match[1]; ++ if (Array.isArray(value)) { ++ const first = value[0]; ++ return first === null || first === undefined ? undefined : String(first); ++ } ++ ++ return value === null || value === undefined ? undefined : String(value); ++} ++ ++async function main() { ++ console.log( ++ `\n=== OpenAI rate limit headers → TrafficController (${key}, maxConcurrent=${maxConcurrent}, requests=${requestCount}) ===`, ++ ); ++ void controller; ++ ++ const agent = new Agent({ ++ name: "openai-rate-limit-from-headers", ++ instructions: "Reply with only the requested token.", ++ model: openai(modelId), ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ console.log("\n[seed] Making one request to capture headers..."); ++ const seedStartedAt = Date.now(); ++ const seed = await agent.generateText("Reply with only: seed", { ++ tenantId, ++ trafficPriority: "P1", ++ }); ++ const seedElapsedMs = Date.now() - seedStartedAt; ++ ++ const seedHeaders = seed.response?.headers; ++ console.log(`[seed] done in ${seedElapsedMs}ms text=${seed.text}`); ++ console.log( ++ `[seed] x-ratelimit-*=${safeStringify({ ++ limit: getHeader(seedHeaders, "x-ratelimit-limit-requests"), ++ remaining: getHeader(seedHeaders, "x-ratelimit-remaining-requests"), ++ reset: getHeader(seedHeaders, "x-ratelimit-reset-requests"), ++ })}`, ++ ); ++ ++ console.log(`\n[burst] Scheduling ${requestCount} parallel requests...`); ++ const jobs = Array.from({ length: requestCount }, (_, idx) => { ++ const label = `req-${idx + 1}`; ++ const enqueuedAt = Date.now(); ++ console.log(`[${now()}] enqueue ${label}`); ++ ++ return agent ++ .generateText(`Reply with only: ${label}`, { tenantId, trafficPriority: "P1" }) ++ .then((result) => { ++ const elapsedMs = Date.now() - enqueuedAt; ++ const headers = result.response?.headers; ++ console.log( ++ `[${now()}] done ${label} in ${elapsedMs}ms text=${result.text} x-ratelimit-remaining=${getHeader( ++ headers, ++ "x-ratelimit-remaining-requests", ++ )}`, ++ ); ++ return { ++ label, ++ elapsedMs, ++ text: result.text, ++ headers: { ++ limit: getHeader(headers, "x-ratelimit-limit-requests"), ++ remaining: getHeader(headers, "x-ratelimit-remaining-requests"), ++ reset: getHeader(headers, "x-ratelimit-reset-requests"), ++ }, ++ }; ++ }) ++ .catch((error) => { ++ const elapsedMs = Date.now() - enqueuedAt; ++ console.log( ++ `[${now()}] failed ${label} in ${elapsedMs}ms name=${error?.name ?? "Error"} status=${error?.status ?? error?.statusCode ?? 
"n/a"} msg=${error?.message}`, ++ ); ++ throw error; ++ }); ++ }); ++ ++ const settled = await Promise.allSettled(jobs); ++ ++ console.log(`\n[done] settled=${safeStringify(settled.map((s) => s.status))}`); ++ console.log( ++ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-rate-limit-openai-window-sim.ts b/tmp/test/traffic-rate-limit-openai-window-sim.ts +new file mode 100644 +index 00000000..35232faa +--- /dev/null ++++ b/tmp/test/traffic-rate-limit-openai-window-sim.ts +@@ -0,0 +1,247 @@ ++// @ts-nocheck ++/** ++ * Manual test (real network): Simulate OpenAI "window remaining + reset" semantics and watch ++ * TrafficController pace + probe behavior via logs. ++ * ++ * Why "simulate"? ++ * - Real OpenAI headers usually show very large remaining values, so pacing is hard to observe. ++ * - This script still hits the real OpenAI model, but it drives the controller state using ++ * synthetic `x-ratelimit-*` headers to force a small window (e.g. remaining=3, reset=30s). ++ * ++ * What this demonstrates (matches your Step 1–7): ++ * 1) We seed controller with remaining + reset window. ++ * 2) We enqueue many requests. ++ * 3) Controller subtracts `reserved` from `remaining` to avoid stampedes. ++ * 4) When `effectiveRemaining <= 1`, controller waits until `resetAt + probeDelay`. ++ * 5) When room exists, controller paces using `nextAllowedAt`. ++ * 6) When a request finishes, we release reservation (controller) and apply new headers (this script). ++ * 7) After reset, controller sends a probe even when remaining==0; probe "fetches" fresh headers and flow resumes. ++ * ++ * Prereqs: ++ * - Set `OPENAI_API_KEY` ++ * ++ * Suggested logging: ++ * - `VOLTAGENT_LOG_LEVEL=trace` (to see traffic controller internals) ++ * ++ * Run: ++ * - VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts ++ * ++ * Optional env: ++ * - OPENAI_MODEL (default: gpt-4o-mini) ++ * - WINDOW_SECONDS (default: 30) ++ * - REMAINING (default: 3) ++ * - REQUESTS (default: 10) ++ * - MAX_CONCURRENT (default: 50) ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { TrafficController } from "../../packages/core/dist/index.js"; ++ ++const apiKey = process.env.OPENAI_API_KEY; ++if (!apiKey) { ++ console.error("Missing OPENAI_API_KEY. Example:"); ++ console.error( ++ " VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts", ++ ); ++ process.exit(1); ++} ++ ++const now = () => new Date().toISOString(); ++ ++const modelId = process.env.OPENAI_MODEL ?? "gpt-4o-mini"; ++const windowSecondsRaw = Number(process.env.WINDOW_SECONDS ?? "30"); ++const remainingRaw = Number(process.env.REMAINING ?? "3"); ++const requestsRaw = Number(process.env.REQUESTS ?? "10"); ++const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50"); ++ ++const windowSeconds = ++ Number.isFinite(windowSecondsRaw) && windowSecondsRaw > 0 ? windowSecondsRaw : 30; ++const initialRemaining = ++ Number.isFinite(remainingRaw) && remainingRaw > 0 ? Math.floor(remainingRaw) : 3; ++const requestCount = Number.isFinite(requestsRaw) && requestsRaw > 0 ? Math.floor(requestsRaw) : 10; ++const maxConcurrent = ++ Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? 
Math.floor(maxConcurrentRaw) : 50;
++
++const provider = "openai";
++const tenantId = "openai-window-sim";
++const windowMs = Math.round(windowSeconds * 1000);
++
++async function callOpenAIResponses(label: string): Promise<{
++ status: number;
++ headers: Record<string, string | undefined>;
++ textPreview: string;
++}> {
++ const url = "https://api.openai.com/v1/responses";
++ const body = safeStringify({
++ model: modelId,
++ input: `Reply with only: ${label}`,
++ max_output_tokens: 16,
++ });
++
++ const startedAt = Date.now();
++ const res = await fetch(url, {
++ method: "POST",
++ headers: {
++ authorization: `Bearer ${apiKey}`,
++ "content-type": "application/json",
++ },
++ body,
++ });
++
++ const limit = res.headers.get("x-ratelimit-limit-requests") ?? undefined;
++ const remaining = res.headers.get("x-ratelimit-remaining-requests") ?? undefined;
++ const reset = res.headers.get("x-ratelimit-reset-requests") ?? undefined;
++
++ if (!res.ok) {
++ const text = await res.text().catch(() => "");
++ throw new Error(
++ `OpenAI error status=${res.status} elapsedMs=${Date.now() - startedAt} body=${text.slice(0, 280)}`,
++ );
++ }
++
++ const data: any = await res.json();
++ const outputText =
++ data?.output?.[0]?.content?.find?.((c: any) => c?.type === "output_text")?.text ??
++ data?.output_text ??
++ data?.output?.[0]?.content?.[0]?.text ??
++ "";
++
++ return {
++ status: res.status,
++ headers: {
++ "x-ratelimit-limit-requests": limit,
++ "x-ratelimit-remaining-requests": remaining,
++ "x-ratelimit-reset-requests": reset,
++ },
++ textPreview: String(outputText).slice(0, 80),
++ };
++}
++
++async function main() {
++ console.log(
++ `\n=== OpenAI real + synthetic window rate limit (provider=${provider}, model=${modelId}) ===`,
++ );
++ console.log(
++ `[config] maxConcurrent=${maxConcurrent} windowSeconds=${windowSeconds} initialRemaining=${initialRemaining} requests=${requestCount}`,
++ );
++ console.log(
++ "[hint] Set VOLTAGENT_LOG_LEVEL=trace to see TrafficController internals (reserved/effectiveRemaining/nextAllowedAt).",
++ );
++
++ const controller = new TrafficController({ maxConcurrent });
++
++ // --- Step 1: seed "remaining + reset window" into controller ---
++ let windowResetAt = Date.now() + windowMs;
++ let remainingInWindow = initialRemaining;
++
++ const applySyntheticHeaders = (source: string) => {
++ const resetMs = Math.max(1, windowResetAt - Date.now());
++ const applied = controller.updateRateLimitFromHeaders(
++ { provider, model: modelId, tenantId },
++ {
++ "x-ratelimit-limit-requests": String(initialRemaining),
++ "x-ratelimit-remaining-requests": String(Math.max(0, remainingInWindow)),
++ "x-ratelimit-reset-requests": `${resetMs}ms`,
++ },
++ );
++ console.log(
++ `[${now()}] [synthetic] source=${source} remaining=${remainingInWindow} resetInMs=${resetMs} applied=${safeStringify(
++ applied && {
++ key: applied.key,
++ state: {
++ remaining: applied.state.remaining,
++ reserved: applied.state.reserved,
++ resetAt: applied.state.resetAt,
++ nextAllowedAt: applied.state.nextAllowedAt,
++ },
++ },
++ )}`,
++ );
++ };
++
++ applySyntheticHeaders("seed");
++
++ console.log("\n[seed] Making one real request to confirm connectivity + show real headers...");
++ const seed = await callOpenAIResponses("seed");
++ console.log(
++ `[${now()}] [seed] ok status=${seed.status} text=${seed.textPreview} realHeaders=${safeStringify(
++ seed.headers,
++ )}`,
++ );
++
++ console.log(`\n[burst] Enqueueing ${requestCount} controller-managed requests...`);
++
++ const jobs = Array.from({ length: 
requestCount }, (_, index) => { ++ const label = `req-${index + 1}`; ++ const enqueuedAt = Date.now(); ++ console.log(`[${now()}] [enqueue] ${label}`); ++ ++ return controller ++ .handleText({ + tenantId, ++ metadata: { ++ tenantId, ++ provider, ++ model: modelId, ++ priority: "P1", ++ agentName: "openai-window-sim", ++ agentId: label, ++ }, ++ execute: async () => { ++ const startedAt = Date.now(); ++ console.log(`[${now()}] [execute-start] ${label}`); ++ ++ const result = await callOpenAIResponses(label); ++ ++ console.log( ++ `[${now()}] [execute-end] ${label} elapsedMs=${Date.now() - startedAt} realHeaders=${safeStringify( ++ result.headers, ++ )}`, ++ ); ++ ++ // --- Step 6: decrement remaining + apply new "headers" --- ++ const nowMs = Date.now(); ++ if (nowMs >= windowResetAt) { ++ // --- Step 7: reset happened; probe request fetched "fresh" headers for the next window --- ++ console.log( ++ `[${now()}] [reset] window elapsed; starting new synthetic window (windowSeconds=${windowSeconds})`, ++ ); ++ windowResetAt = nowMs + windowMs; ++ remainingInWindow = initialRemaining; ++ } ++ ++ remainingInWindow = Math.max(0, remainingInWindow - 1); ++ applySyntheticHeaders("response"); ++ ++ return result; ++ }, ++ }) ++ .then((r) => { ++ const totalElapsedMs = Date.now() - enqueuedAt; ++ console.log( ++ `[${now()}] [done] ${label} totalElapsedMs=${totalElapsedMs} text=${r.textPreview}`, ++ ); ++ return { label, totalElapsedMs, status: "fulfilled" as const }; ++ }) ++ .catch((error: any) => { ++ const totalElapsedMs = Date.now() - enqueuedAt; ++ console.log( ++ `[${now()}] [fail] ${label} totalElapsedMs=${totalElapsedMs} name=${error?.name ?? "Error"} msg=${ ++ error?.message ?? String(error) ++ }`, ++ ); ++ return { label, totalElapsedMs, status: "rejected" as const }; + }); - const evaluations = response.object.evaluations; - - context.results.raw.contextRelevancyEvaluations = evaluations; -diff --git a/packages/scorers/src/llm/moderation.ts b/packages/scorers/src/llm/moderation.ts -index 03563bfe..1055927f 100644 ---- a/packages/scorers/src/llm/moderation.ts -+++ b/packages/scorers/src/llm/moderation.ts -@@ -7,6 +7,7 @@ import { - } from "@voltagent/core"; - import { safeStringify } from "@voltagent/internal/utils"; - import { z } from "zod"; -+import { extractTenantId } from "./utils"; - - export interface ModerationScorerOptions { - id?: string; -@@ -220,6 +221,7 @@ async function runModerationJudge(args: { - typeof context.results.prepare === "string" - ? context.results.prepare - : normalizeText(context.payload.output); -+ const tenantId = extractTenantId(context); - - const prompt = await buildPrompt({ - output: normalizedOutput, -@@ -232,12 +234,14 @@ async function runModerationJudge(args: { - const agent = new Agent({ - name: "moderation-judge", - model, -+ trafficPriority: "P2", - instructions: - "You are a safety classifier. 
Respond with JSON that matches the provided schema containing flagged, scores, and reason.",
- });
- 
- const response = await agent.generateObject(prompt, MODERATION_SCHEMA, {
- maxOutputTokens,
-+ tenantId,
- });
- 
- const parsed = mapModerationResponse(response.object, threshold);
-diff --git a/packages/scorers/src/llm/utils.ts b/packages/scorers/src/llm/utils.ts
new file mode 100644
index 00000000..75e886e3
--- /dev/null
+++ b/packages/scorers/src/llm/utils.ts
@@ -0,0 +1,14 @@
+import type { BuilderPrepareContext, BuilderScoreContext } from "@voltagent/core";
+
+type TenantAwareContext = BuilderScoreContext<Record<string, unknown>, Record<string, unknown>> &
+ BuilderPrepareContext<Record<string, unknown>, Record<string, unknown>>;
+
+export function extractTenantId(
+ context:
+ | BuilderScoreContext<Record<string, unknown>, Record<string, unknown>>
+ | BuilderPrepareContext<Record<string, unknown>, Record<string, unknown>>
+ | TenantAwareContext,
+): string | undefined {
+ const candidate = (context.payload as { tenantId?: unknown })?.tenantId;
+ return typeof candidate === "string" ? candidate : undefined;
+}
+diff --git a/tmp/test/traffic-rate-limit-static.ts b/tmp/test/traffic-rate-limit-static.ts
new file mode 100644
index 00000000..3f91d5bb
--- /dev/null
+++ b/tmp/test/traffic-rate-limit-static.ts
@@ -0,0 +1,149 @@
++// @ts-nocheck
++/**
++ * Manual test: TrafficController window-based rate limiting (simulated OpenAI headers).
++ *
++ * What to look for:
++ * - Requests should be paced out across the window (no steady "refill" math).
++ * - If responses arrive out-of-order, remaining headers might "increase"; controller should
++ * keep remaining monotonic within the same window.
++ *
++ * Run:
++ * - pnpm ts-node tmp/test/traffic-rate-limit-static.ts
++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-rate-limit-static.ts
++ *
++ * Optional env:
++ * - LIMIT=6 WINDOW_MS=3000 pnpm ts-node tmp/test/traffic-rate-limit-static.ts
++ */
++
++import { safeStringify } from "@voltagent/internal";
++import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++ console.debug = () => {};
++}
++
++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
++const now = () => new Date().toISOString();
++
++const provider = "sim";
++const model = "rate-limited-model";
++const key = `${provider}::${model}`;
++
++const controller = getTrafficController({ maxConcurrent: 50 });
++
++const limit = Number(process.env.LIMIT ?? 6);
++const windowMs = Number(process.env.WINDOW_MS ?? 
3000); ++let windowStartAt = Date.now(); ++let windowResetAt = windowStartAt + windowMs; ++let usedInWindow = 0; ++ ++function extractLabel(prompt: any): string { ++ if (!Array.isArray(prompt)) { ++ return "unknown"; ++ } ++ ++ for (let index = prompt.length - 1; index >= 0; index -= 1) { ++ const message = prompt[index]; ++ if (!message || message.role !== "user" || !Array.isArray(message.content)) { ++ continue; ++ } ++ ++ const textPart = message.content.find((part: any) => part?.type === "text"); ++ if (textPart?.text) { ++ return String(textPart.text); ++ } ++ } ++ ++ return "unknown"; ++} ++ ++async function main() { ++ console.log( ++ `\n=== Window rate limit for ${key} (limit=${limit}, windowMs=${windowMs}, jobs=10) ===`, ++ ); ++ ++ const seeded = controller.updateRateLimitFromHeaders( ++ { provider, model }, ++ { ++ "x-ratelimit-limit-requests": String(limit), ++ "x-ratelimit-remaining-requests": String(limit), ++ "x-ratelimit-reset-requests": `${windowMs}ms`, ++ }, ++ ); ++ console.log(`[seed] updateRateLimitFromHeaders=${safeStringify(seeded)}`); ++ ++ let calls = 0; ++ let lastStartAt = 0; ++ const rateLimitedModel = { ++ specificationVersion: "v2", ++ provider, ++ modelId: model, ++ doGenerate: async (options: any) => { ++ const simulatedLatencyMs = 10 + Math.floor(Math.random() * 120); ++ const nowMs = Date.now(); ++ if (nowMs >= windowResetAt) { ++ windowStartAt = nowMs; ++ windowResetAt = windowStartAt + windowMs; ++ usedInWindow = 0; ++ } ++ ++ calls += 1; ++ usedInWindow += 1; ++ const startAt = Date.now(); ++ const delta = lastStartAt ? startAt - lastStartAt : 0; ++ lastStartAt = startAt; ++ ++ const label = extractLabel(options?.prompt); ++ console.log( ++ `[${now()}] doGenerate start call=${calls} (+${delta}ms) input=${label} latencyMs=${simulatedLatencyMs}`, ++ ); ++ await sleep(simulatedLatencyMs); ++ console.log(`[${now()}] doGenerate end input=${label}`); ++ ++ const remainingAfterThis = Math.max(0, limit - usedInWindow); ++ const resetMs = Math.max(1, windowResetAt - Date.now()); ++ return { ++ content: [{ type: "text", text: `ok:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { ++ modelId: model, ++ headers: { ++ "x-ratelimit-limit-requests": String(limit), ++ "x-ratelimit-remaining-requests": String(remainingAfterThis), ++ "x-ratelimit-reset-requests": `${resetMs}ms`, ++ }, ++ }, ++ }; ++ }, ++ }; ++ ++ const agent = new Agent({ ++ name: "traffic-rate-limit-static", ++ instructions: "echo", ++ model: rateLimitedModel, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ const jobs = Array.from({ length: 10 }, (_, idx) => ++ agent.generateText(`req-${idx + 1}`, { ++ tenantId: "default", ++ trafficPriority: "P1", ++ }), ++ ); ++ ++ const settled = await Promise.allSettled(jobs); ++ console.log( ++ `\n[done] results=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), ++ )}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-retry-after.ts b/tmp/test/traffic-retry-after.ts +new file mode 100644 +index 00000000..c0c213eb +--- /dev/null ++++ b/tmp/test/traffic-retry-after.ts +@@ -0,0 +1,245 @@ ++// @ts-nocheck ++/** ++ * Manual test: Retry-After handling (429 retry + 200 OK header ingestion). ++ * ++ * What this exercises: ++ * - Retry-After on 429 errors increases retry delay (TrafficController retry plan). 
++ * - Retry-After on successful responses throttles subsequent requests for the same provider::model. ++ * ++ * Run: ++ * - pnpm -C packages/core build ++ * - pnpm ts-node tmp/test/traffic-retry-after.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-retry-after.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { ++ Agent, ++ RateLimitedUpstreamError, ++ getTrafficController, ++} from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++function extractLabel(prompt: any): string { ++ if (!Array.isArray(prompt)) { ++ return "unknown"; ++ } ++ ++ for (let index = prompt.length - 1; index >= 0; index -= 1) { ++ const message = prompt[index]; ++ if (!message || message.role !== "user" || !Array.isArray(message.content)) { ++ continue; ++ } ++ ++ const textPart = message.content.find((part: any) => part?.type === "text"); ++ if (textPart?.text) { ++ return String(textPart.text); ++ } ++ } ++ ++ return "unknown"; ++} ++ ++function make429RetryAfterModel(args: { ++ provider: string; ++ modelId: string; ++ retryAfterSeconds: number; ++ mode: "headers" | "typedError"; ++}) { ++ const { provider, modelId, retryAfterSeconds, mode } = args; ++ let calls = 0; ++ const startedAt: number[] = []; ++ ++ return { ++ specificationVersion: "v2", ++ provider, ++ modelId, ++ startedAt, ++ doGenerate: async (options: any) => { ++ calls += 1; ++ const start = Date.now(); ++ startedAt.push(start); ++ ++ const label = extractLabel(options?.prompt); ++ console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`); ++ ++ if (calls === 1) { ++ const retryAfterValue = String(retryAfterSeconds); ++ ++ if (mode === "typedError") { ++ throw new RateLimitedUpstreamError( ++ `rate limited (typed) retry-after=${retryAfterValue}s`, ++ { provider, model: modelId }, ++ Math.round(retryAfterSeconds * 1000), ++ ); ++ } ++ ++ const err: any = new Error(`rate limited (headers) retry-after=${retryAfterValue}s`); ++ err.status = 429; ++ err.response = { ++ status: 429, ++ headers: { ++ "retry-after": retryAfterValue, ++ }, ++ }; ++ throw err; ++ } ++ ++ return { ++ content: [{ type: "text", text: `ok:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++function makeSuccessRetryAfterModel(args: { ++ provider: string; ++ modelId: string; ++ retryAfterSeconds: number; ++ latencyMs: number; ++}) { ++ const { provider, modelId, retryAfterSeconds, latencyMs } = args; ++ let calls = 0; ++ const startedAt: number[] = []; ++ const endedAt: number[] = []; ++ ++ return { ++ specificationVersion: "v2", ++ provider, ++ modelId, ++ startedAt, ++ endedAt, ++ doGenerate: async (options: any) => { ++ calls += 1; ++ const start = Date.now(); ++ startedAt.push(start); ++ ++ const label = extractLabel(options?.prompt); ++ console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`); ++ await sleep(latencyMs); ++ ++ const end = Date.now(); ++ endedAt.push(end); ++ console.log(`[${now()}] [model] ${provider}::${modelId} end call=${calls} input=${label}`); ++ ++ return { ++ content: [{ type: "text", text: `ok:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ 
warnings: [], ++ response: { ++ modelId, ++ headers: ++ calls === 1 ++ ? { ++ "retry-after": String(retryAfterSeconds), ++ } ++ : {}, ++ }, ++ }; ++ }, ++ }; ++} ++ ++async function test_retryAfterOn429(mode: "headers" | "typedError") { ++ const retryAfterSeconds = 1; ++ const provider = `retry-after-429-${mode}`; ++ const modelId = "ra-429"; ++ const tenantId = `ra-429-${mode}`; ++ ++ const model = make429RetryAfterModel({ provider, modelId, retryAfterSeconds, mode }); ++ const agent = new Agent({ ++ name: `ra-429-${mode}`, ++ instructions: "echo", ++ model, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ console.log(`\n=== Test: Retry-After on 429 (${mode}) ===`); ++ const result = await agent.generateText("hello", { tenantId, trafficPriority: "P1" }); ++ ++ const times = model.startedAt; ++ const deltaMs = times.length >= 2 ? times[1] - times[0] : undefined; ++ ++ console.log( ++ `[result] text=${result.text} calls=${times.length} startedAt=${safeStringify(times)} deltaMs=${deltaMs}`, ++ ); ++ ++ if (deltaMs === undefined || deltaMs < retryAfterSeconds * 1000) { ++ throw new Error( ++ `Expected retry delay >= ${retryAfterSeconds * 1000}ms, got ${deltaMs ?? "n/a"}ms`, ++ ); ++ } ++} ++ ++async function test_retryAfterOnSuccessResponse() { ++ const retryAfterSeconds = 0.3; ++ const provider = "retry-after-200"; ++ const modelId = "ra-200"; ++ const tenantId = "ra-200"; ++ ++ const model = makeSuccessRetryAfterModel({ ++ provider, ++ modelId, ++ retryAfterSeconds, ++ latencyMs: 20, ++ }); ++ ++ const agent = new Agent({ ++ name: "ra-200", ++ instructions: "echo", ++ model, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ console.log("\n=== Test: Retry-After on 200 response headers ==="); ++ const first = agent.generateText("first", { tenantId, trafficPriority: "P1" }); ++ const second = agent.generateText("second", { tenantId, trafficPriority: "P1" }); ++ ++ const [r1, r2] = await Promise.all([first, second]); ++ ++ const end1 = model.endedAt[0]; ++ const start2 = model.startedAt[1]; ++ const enforcedDelayMs = start2 && end1 ? start2 - end1 : undefined; ++ ++ console.log( ++ `[result] texts=${safeStringify([r1.text, r2.text])} startedAt=${safeStringify( ++ model.startedAt, ++ )} endedAt=${safeStringify(model.endedAt)} enforcedDelayMs=${enforcedDelayMs}`, ++ ); ++ ++ if (enforcedDelayMs === undefined || enforcedDelayMs < retryAfterSeconds * 1000) { ++ throw new Error( ++ `Expected rate-limit delay >= ${retryAfterSeconds * 1000}ms, got ${enforcedDelayMs ?? "n/a"}ms`, ++ ); ++ } ++} ++ ++async function main() { ++ // Create controller early so all Agent calls share the same singleton. ++ getTrafficController({ maxConcurrent: 1 }); ++ ++ await test_retryAfterOn429("headers"); ++ await test_retryAfterOn429("typedError"); ++ await test_retryAfterOnSuccessResponse(); ++ ++ console.log("\n[done] All Retry-After manual checks passed."); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-retry-behavior.ts b/tmp/test/traffic-retry-behavior.ts +new file mode 100644 +index 00000000..273af55a +--- /dev/null ++++ b/tmp/test/traffic-retry-behavior.ts +@@ -0,0 +1,169 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController retry behavior via Agent + AI SDK path (stub model). 
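++ *
++ * The stub models below force failures shaped like what the controller's
++ * retry detection inspects; a retriable 5xx, for instance, needs nothing
++ * more than:
++ *
++ *   const err: any = new Error("forced failure");
++ *   err.status = 500; // also detected: statusCode, response.status,
++ *   throw err;        // cause.status, and timeout-ish code/name/message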
++ *
++ * Scenarios included:
++ * - 5xx retries (up to 3 attempts)
++ * - 429 retries (up to 3 attempts)
++ * - timeout retries (up to 2 attempts)
++ * - non-retriable 4xx does not retry
++ *
++ * Run:
++ * - pnpm ts-node tmp/test/traffic-retry-behavior.ts
++ *
++ * Notes:
++ * - Uses a stub LanguageModel; no network calls.
++ * - Watch the `[model] attempt=...` logs to confirm retries.
++ */
++
++import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++ console.debug = () => {};
++}
++
++type Scenario =
++ | "server-error"
++ | "rate-limit"
++ | "timeout"
++ | "bad-request"
++ | "forbidden"
++ // Variations to hit different retry-detection branches.
++ | "server-error-status-string"
++ | "server-error-statusCode"
++ | "server-error-response-status"
++ | "server-error-cause-status"
++ | "rate-limit-statusCode"
++ | "timeout-code-only"
++ | "timeout-name-only"
++ | "timeout-message-only"
++ // Variations that should STOP retrying (hit max attempts).
++ | "server-error-exceed-max"
++ | "timeout-exceed-max";
++
++type RetryPlan = {
++ failCountBeforeSuccess: number;
++ status?: number | string;
++ statusCode?: number | string;
++ httpStatus?: number | string;
++ responseStatus?: number | string;
++ causeStatus?: number | string;
++ code?: string;
++ name?: string;
++ message?: string;
++};
++
++const plans: Record<Scenario, RetryPlan> = {
++ "server-error": { failCountBeforeSuccess: 2, status: 500 },
++ "rate-limit": { failCountBeforeSuccess: 2, status: 429 },
++ timeout: { failCountBeforeSuccess: 1, status: 408, code: "ETIMEDOUT", message: "timeout" },
++ "bad-request": { failCountBeforeSuccess: 10, status: 400 },
++ forbidden: { failCountBeforeSuccess: 10, status: 403 },
++ "server-error-status-string": { failCountBeforeSuccess: 2, status: "500" },
++ "server-error-statusCode": { failCountBeforeSuccess: 2, statusCode: 502 },
++ "server-error-response-status": { failCountBeforeSuccess: 2, responseStatus: 503 },
++ "server-error-cause-status": { failCountBeforeSuccess: 2, causeStatus: 500 },
++ "rate-limit-statusCode": { failCountBeforeSuccess: 2, statusCode: 429 },
++ "timeout-code-only": { failCountBeforeSuccess: 1, code: "timeout" },
++ "timeout-name-only": { failCountBeforeSuccess: 1, name: "TimeoutError" },
++ "timeout-message-only": { failCountBeforeSuccess: 1, message: "this is a TIMEOUT" },
++ "server-error-exceed-max": { failCountBeforeSuccess: 10, status: 500 },
++ "timeout-exceed-max": { failCountBeforeSuccess: 10, message: "timeout" },
++};
++
++function makeModel(modelId: string, plan: RetryPlan) {
++ let counter = 0;
++ let lastAttemptAt = 0;
++
++ return {
++ specificationVersion: "v2",
++ provider: "retry-provider",
++ modelId,
++ doGenerate: async () => {
++ counter += 1;
++ const now = Date.now();
++ const delta = lastAttemptAt ? now - lastAttemptAt : 0;
++ lastAttemptAt = now;
++
++ console.log(`[model] modelId=${modelId} attempt=${counter} (+${delta}ms)`);
++
++ if (counter <= plan.failCountBeforeSuccess) {
++ const err: any = new Error(plan.message ?? 
`forced failure ${counter} for ${modelId}`); ++ if (plan.status !== undefined) err.status = plan.status; ++ if (plan.statusCode !== undefined) err.statusCode = plan.statusCode; ++ if (plan.httpStatus !== undefined) err.httpStatus = plan.httpStatus; ++ if (plan.responseStatus !== undefined) err.response = { status: plan.responseStatus }; ++ if (plan.causeStatus !== undefined) err.cause = { status: plan.causeStatus }; ++ if (plan.code !== undefined) err.code = plan.code; ++ if (plan.name !== undefined) err.name = plan.name; ++ throw err; ++ } ++ ++ return { ++ content: [{ type: "text", text: "ok" }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++async function runScenario(name: Scenario) { ++ const plan = plans[name]; ++ const modelId = `retry-${name}`; ++ const model = makeModel(modelId, plan); ++ ++ const agent = new Agent({ ++ name: `RetryAgent-${name}`, ++ instructions: "echo", ++ model, ++ maxOutputTokens: 32, ++ temperature: 0, ++ }); ++ ++ console.log(`\n=== ${name} ===`); ++ try { ++ const result = await agent.generateText(name, { tenantId: "retry-test" }); ++ console.log(`[${name}] succeeded. text=${result.text}`); ++ } catch (err: any) { ++ console.log( ++ `[${name}] failed. status=${err?.status ?? err?.statusCode ?? err?.response?.status ?? "n/a"}`, ++ ); ++ } ++} ++ ++async function main() { ++ // Create controller early so all Agent calls share the same singleton. ++ getTrafficController({ maxConcurrent: 1 }); ++ ++ const runs: Scenario[] = [ ++ "server-error", ++ "rate-limit", ++ "timeout", ++ "bad-request", ++ "forbidden", ++ // Uncomment for additional coverage: ++ // "server-error-status-string", ++ // "server-error-statusCode", ++ // "server-error-response-status", ++ // "server-error-cause-status", ++ // "rate-limit-statusCode", ++ // "timeout-code-only", ++ // "timeout-name-only", ++ // "timeout-message-only", ++ // "server-error-exceed-max", ++ // "timeout-exceed-max", ++ ]; ++ ++ for (const name of runs) { ++ await runScenario(name); ++ } ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-tenant-usage.ts b/tmp/test/traffic-tenant-usage.ts +new file mode 100644 +index 00000000..801d7761 +--- /dev/null ++++ b/tmp/test/traffic-tenant-usage.ts +@@ -0,0 +1,71 @@ ++// @ts-nocheck ++/** ++ * Manual test: Tenant usage aggregation (via Agent → TrafficController). ++ * ++ * What to look for: ++ * - `getTenantUsage(tenantId)` should increase after each agent call. 
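++ *
++ * Readback sketch (illustrative; the aggregate's exact shape is defined by
++ * this branch's controller, so treat the trailing comment as an example):
++ *
++ *   await agent.generateText("hello", { tenantId: "tenant-a" });
++ *   const usage = controller.getTenantUsage("tenant-a");
++ *   // token/request totals accumulated for "tenant-a" across all calls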
++ *
++ * Run:
++ * - pnpm ts-node tmp/test/traffic-tenant-usage.ts
++ */
++
++import { safeStringify } from "@voltagent/internal";
++import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++  console.debug = () => {};
++}
++
++function makeModel(modelId: string) {
++  return {
++    specificationVersion: "v2",
++    provider: "usage-provider",
++    modelId,
++    doGenerate: async () => {
++      return {
++        content: [{ type: "text", text: `ok:${modelId}` }],
++        finishReason: "stop",
++        usage: { inputTokens: 2, outputTokens: 3, totalTokens: 5 },
++        warnings: [],
++        response: { modelId, headers: {} },
++      };
++    },
++  };
++}
++
++const controller = getTrafficController({ maxConcurrent: 10 });
++
++async function run(label: string, tenantId: string) {
++  const model = makeModel("tenant-usage-model");
++  const agent = new Agent({
++    name: `TenantUsageAgent-${label}`,
++    instructions: "echo",
++    model,
++    temperature: 0,
++    maxOutputTokens: 32,
++  });
++
++  console.log(`\n=== ${label} tenantId=${tenantId} ===`);
++  const result = await agent.generateText(`hello:${label}`, { tenantId });
++  console.log(`[${label}] text=${result.text}`);
++
++  const usage = controller.getTenantUsage(tenantId);
++  console.log(`[${label}] controller.getTenantUsage(${tenantId})=${safeStringify(usage)}`);
++}
++
++async function main() {
++  await run("A1", "tenant-a");
++  await run("A2", "tenant-a");
++  await run("B1", "tenant-b");
++
++  console.log("\n=== Final usage snapshot ===");
++  console.log(`tenant-a=${safeStringify(controller.getTenantUsage("tenant-a"))}`);
++  console.log(`tenant-b=${safeStringify(controller.getTenantUsage("tenant-b"))}`);
++  console.log(`default=${safeStringify(controller.getTenantUsage("default"))}`);
++}
++
++main().catch((error) => {
++  console.error("Fatal error:", error);
++  process.exit(1);
++});
+diff --git a/tmp/test/traffic-text-vs-stream.ts b/tmp/test/traffic-text-vs-stream.ts
+new file mode 100644
+index 00000000..41aa484d
+--- /dev/null
++++ b/tmp/test/traffic-text-vs-stream.ts
+@@ -0,0 +1,128 @@
++// @ts-nocheck
++/**
++ * Manual test: Text + stream traffic share the same TrafficController queue.
++ *
++ * What to look for:
++ * - Stream and text requests should respect the same maxConcurrent + priority rules.
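++ * - Expected dispatch order with maxConcurrent=1 (assumption, based on the
++ *   calls in main below): S1 runs first because it is enqueued first, then
++ *   T0 (P0 outranks P1), then T1.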
++ *
++ * Run:
++ * - pnpm ts-node tmp/test/traffic-text-vs-stream.ts
++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-text-vs-stream.ts
++ */
++
++import { ReadableStream } from "node:stream/web";
++import { safeStringify } from "@voltagent/internal";
++import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
++
++const verbose = process.env.VERBOSE === "1";
++if (!verbose) {
++  console.debug = () => {};
++}
++
++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
++const now = () => new Date().toISOString();
++
++const controller = getTrafficController({ maxConcurrent: 1 });
++
++function extractLabel(prompt: any): string {
++  if (!Array.isArray(prompt)) {
++    return "unknown";
++  }
++
++  for (let index = prompt.length - 1; index >= 0; index -= 1) {
++    const message = prompt[index];
++    if (!message || message.role !== "user" || !Array.isArray(message.content)) {
++      continue;
++    }
++
++    const textPart = message.content.find((part: any) => part?.type === "text");
++    if (textPart?.text) {
++      return String(textPart.text);
++    }
++  }
++
++  return "unknown";
++}
++
++async function main() {
++  console.log("\n=== Text vs Stream (shared scheduler) ===");
++  void controller;
++
++  const provider = "sim";
++  const modelId = "shared-queue";
++
++  const model = {
++    specificationVersion: "v2",
++    provider,
++    modelId,
++    doGenerate: async (options: any) => {
++      const label = extractLabel(options?.prompt);
++      console.log(`[${now()}] doGenerate start input=${label}`);
++      await sleep(50);
++      console.log(`[${now()}] doGenerate end input=${label}`);
++      return {
++        content: [{ type: "text", text: `text:${label}` }],
++        finishReason: "stop",
++        usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
++        warnings: [],
++        response: { modelId, headers: {} },
++      };
++    },
++    doStream: async (options: any) => {
++      const label = extractLabel(options?.prompt);
++      console.log(`[${now()}] doStream start input=${label}`);
++
++      // Hold the controller slot for a bit so ordering is visible.
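++      // The 400ms hold is deliberately much longer than doGenerate's 50ms so
++      // the queued text requests visibly wait for this stream in the logs.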
++      await sleep(400);
++
++      console.log(`[${now()}] doStream ready input=${label}`);
++      const streamId = `text-${label}`;
++      const text = `stream:${label}`;
++
++      const stream = new ReadableStream({
++        start(streamController) {
++          streamController.enqueue({ type: "stream-start", warnings: [] });
++          streamController.enqueue({ type: "text-start", id: streamId });
++          streamController.enqueue({ type: "text-delta", id: streamId, delta: text });
++          streamController.enqueue({ type: "text-end", id: streamId });
++          streamController.enqueue({
++            type: "finish",
++            usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
++            finishReason: "stop",
++          });
++          streamController.close();
++        },
++      });
++
++      return { stream, response: { headers: {} } };
++    },
++  };
++
++  const agent = new Agent({
++    name: "traffic-text-vs-stream",
++    instructions: "echo",
++    model,
++    temperature: 0,
++    maxOutputTokens: 32,
++  });
++
++  const streamP1 = agent.streamText("S1", { tenantId: "default", trafficPriority: "P1" });
++  const textP0 = agent.generateText("T0", { tenantId: "default", trafficPriority: "P0" });
++  const textP1 = agent.generateText("T1", { tenantId: "default", trafficPriority: "P1" });
++
++  const [streamResult, t0, t1] = await Promise.all([streamP1, textP0, textP1]);
++  const streamText = await streamResult.text;
++
++  console.log(
++    `\n[done] results=${safeStringify({
++      streamText,
++      textP0: t0.text,
++      textP1: t1.text,
++    })}`,
++  );
++}
++
++main().catch((error) => {
++  console.error("Fatal error:", error);
++  process.exit(1);
++});
diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
index b3f331b20..11b0069e3 100644
--- a/packages/core/src/traffic/traffic-controller.spec.ts
+++ b/packages/core/src/traffic/traffic-controller.spec.ts
@@ -81,6 +81,46 @@ describe("TrafficController priority scheduling", () => {
   });
 });
 
+describe("TrafficController concurrency limits", () => {
+  it("shares provider/model limits across tenants", async () => {
+    const controller = new TrafficController({
+      maxConcurrent: 2,
+      maxConcurrentPerProviderModel: 1,
+    });
+    const started: string[] = [];
+    let releaseFirst!: () => void;
+    const firstGate = new Promise<void>((resolve) => {
+      releaseFirst = resolve;
+    });
+
+    const first = controller.handleText({
+      tenantId: "tenant-a",
+      metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+      execute: async () => {
+        started.push("tenant-a");
+        await firstGate;
+        return "a";
+      },
+    });
+
+    const second = controller.handleText({
+      tenantId: "tenant-b",
+      metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+      execute: async () => {
+        started.push("tenant-b");
+        return "b";
+      },
+    });
+
+    await new Promise((resolve) => setTimeout(resolve, 0));
+    expect(started).toEqual(["tenant-a"]);
+
+    releaseFirst();
+    await Promise.all([first, second]);
+    expect(started).toEqual(["tenant-a", "tenant-b"]);
+  });
+});
+
 describe("TrafficController rate limit headers", () => {
   it("parses OpenAI-style compound reset durations (e.g. 1m30.951s)", () => {
     vi.useFakeTimers();
diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts
index c26b914d8..8de3abbe9 100644
--- a/packages/core/src/traffic/traffic-controller.ts
+++ b/packages/core/src/traffic/traffic-controller.ts
@@ -188,7 +188,7 @@ export class TrafficController {
       buildRateLimitKey: (metadata) => this.buildRateLimitKey(metadata),
     });
     this.concurrencyLimiter = new TrafficConcurrencyLimiter({
-      buildProviderModelKey: (metadata) => this.buildRateLimitKey(metadata),
+      buildProviderModelKey: (metadata) => buildProviderModelKeyFromMetadata(metadata),
       maxConcurrentPerProviderModel: options.maxConcurrentPerProviderModel,
       maxConcurrentPerTenant: options.maxConcurrentPerTenant,
     });
@@ -1229,3 +1229,9 @@ function buildRateLimitKeyFromMetadata(metadata?: TrafficRequestMetadata): strin
 
   return parts.join("::");
 }
+
+function buildProviderModelKeyFromMetadata(metadata?: TrafficRequestMetadata): string {
+  const provider = metadata?.provider ?? "default-provider";
+  const model = metadata?.model ?? "default-model";
+  return `${provider}::${model}`;
+}

From 87c284aabcea4868b123a8231d8b003638d5b8b7 Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Thu, 25 Dec 2025 17:08:37 +0530
Subject: [PATCH 26/41] fix: openai rate limiter

---
 .../openai-window-rate-limit-strategy.ts      | 44 +++++++--------
 .../src/traffic/traffic-controller.spec.ts    | 55 ++++++++++++++++++-
 2 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
index 8e8b6f868..f180db9cd 100644
--- a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
+++ b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
@@ -42,33 +42,31 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy {
     if (this.requestsPerMinute !== undefined) {
       const requestDecision = this.resolveRequestWindow(next, logger);
       if (requestDecision) return requestDecision;
-      const tokenDecision = this.resolveTokenWindow(logger);
-      if (tokenDecision) return tokenDecision;
-      return null;
-    }
-
-    const decision = this.window.resolve(next, logger);
-    if (decision) return decision;
+    } else {
+      const decision = this.window.resolve(next, logger);
+      if (decision) return decision;
 
-    if (next.rateLimitKey) {
-      return null;
-    }
+      if (!next.rateLimitKey && this.tokensPerMinute === undefined) {
+        const rateLimitLogger = logger?.child({ module: "rate-limiter" });
+        if (this.bootstrapReserved >= 1) {
+          rateLimitLogger?.debug?.("OpenAI rate limit bootstrap active; waiting", {
+            rateLimitKey: this.key,
+            bootstrapReserved: this.bootstrapReserved,
+          });
+          return { kind: "wait" };
+        }
 
-    const rateLimitLogger = logger?.child({ module: "rate-limiter" });
-    if (this.bootstrapReserved >= 1) {
-      rateLimitLogger?.debug?.("OpenAI rate limit bootstrap active; waiting", {
-        rateLimitKey: this.key,
-        bootstrapReserved: this.bootstrapReserved,
-      });
-      return { kind: "wait" };
-    }
+        this.bootstrapReserved += 1;
+        next.rateLimitKey = this.key;
+        rateLimitLogger?.debug?.("OpenAI rate limit bootstrap reserved", {
+          rateLimitKey: this.key,
+          bootstrapReserved: this.bootstrapReserved,
+        });
+      }
+    }
 
-    this.bootstrapReserved += 1;
-    next.rateLimitKey = this.key;
-    rateLimitLogger?.debug?.("OpenAI rate limit bootstrap reserved", {
-      rateLimitKey: this.key,
-      bootstrapReserved: this.bootstrapReserved,
-    });
+    const tokenDecision = this.resolveTokenWindow(logger);
+    if (tokenDecision) return tokenDecision;
     return null;
   }
diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
index 11b0069e3..3abbee2ba 100644
--- a/packages/core/src/traffic/traffic-controller.spec.ts
+++ b/packages/core/src/traffic/traffic-controller.spec.ts
@@ -1,5 +1,5 @@
 import { describe, expect, it, vi } from "vitest";
-import { CIRCUIT_FAILURE_THRESHOLD } from "./traffic-constants";
+import { CIRCUIT_FAILURE_THRESHOLD, RATE_LIMIT_PROBE_DELAY_MS } from "./traffic-constants";
 import { TrafficController } from "./traffic-controller";
 
 describe("TrafficController priority scheduling", () => {
@@ -272,6 +272,59 @@
   });
 });
 
+describe("TrafficController token limits", () => {
+  it("blocks OpenAI when the token window is exhausted even without RPM config", async () => {
+    vi.useFakeTimers();
+
+    try {
+      vi.setSystemTime(new Date(0));
+      const controller = new TrafficController({
+        maxConcurrent: 1,
+        rateLimits: {
+          "openai::gpt-4o": {
+            requestsPerMinute: 0,
+            tokensPerMinute: 2,
+          },
+        },
+      });
+      const order: string[] = [];
+
+      const first = controller.handleText({
+        tenantId: "tenant-a",
+        metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+        execute: async () => {
+          order.push("first");
+          return "first";
+        },
+        extractUsage: () => ({ totalTokens: 2 }),
+      });
+
+      const second = controller.handleText({
+        tenantId: "tenant-b",
+        metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+        execute: async () => {
+          order.push("second");
+          return "second";
+        },
+        extractUsage: () => ({ totalTokens: 1 }),
+      });
+
+      await first;
+      expect(order).toEqual(["first"]);
+
+      await vi.advanceTimersByTimeAsync(60_000 + RATE_LIMIT_PROBE_DELAY_MS - 1);
+      expect(order).toEqual(["first"]);
+
+      await vi.advanceTimersByTimeAsync(1);
+      await vi.runAllTimersAsync();
+      await second;
+      expect(order).toEqual(["first", "second"]);
+    } finally {
+      vi.useRealTimers();
+    }
+  });
+});
+
 describe("TrafficController stream reporting", () => {
   it("treats post-start stream failures as circuit breaker failures", async () => {
     const controller = new TrafficController({

From f41b8b7582401a5c110fc22de702fbe83d032c5d Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Thu, 25 Dec 2025 17:21:28 +0530
Subject: [PATCH 27/41] fix: stream failure handling

---
 .../src/traffic/traffic-controller.spec.ts    | 54 +++++++++++++++++++
 .../core/src/traffic/traffic-controller.ts    | 29 +++++++---
 2 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
index 3abbee2ba..881a51723 100644
--- a/packages/core/src/traffic/traffic-controller.spec.ts
+++ b/packages/core/src/traffic/traffic-controller.spec.ts
@@ -326,6 +326,60 @@
 });
 
 describe("TrafficController stream reporting", () => {
+  it("slows down after stream 429 errors", async () => {
+    vi.useFakeTimers();
+
+    try {
+      vi.setSystemTime(new Date(0));
+      const controller = new TrafficController({
+        maxConcurrent: 1,
+        adaptiveLimiter: {
+          windowMs: 1_000,
+          threshold: 1,
+          minPenaltyMs: 10,
+          maxPenaltyMs: 10,
+          penaltyMultiplier: 1,
+          decayMs: 1_000,
+        },
+      });
+      const metadata = {
+        provider: "p",
+        model: "m",
+        priority: "P1" as const,
+        tenantId: "tenant-a",
+      };
+
+      controller.reportStreamFailure(
+        metadata,
+        Object.assign(new Error("rate limit"), { status: 429 }),
+      );
+
+      const order: string[] = [];
+      const request = controller.handleText({
+        tenantId: "tenant-a",
+        metadata,
+        execute: async () => {
+          order.push("run");
+          return "ok";
+        },
+      });
+
+      await Promise.resolve();
+      expect(order).toEqual([]);
+
+      await vi.advanceTimersByTimeAsync(9);
+      await Promise.resolve();
+      expect(order).toEqual([]);
+
+      await vi.advanceTimersByTimeAsync(1);
+      await vi.runAllTimersAsync();
+      await request;
+      expect(order).toEqual(["run"]);
+    } finally {
+      vi.useRealTimers();
+    }
+  });
+
   it("treats post-start stream failures as circuit breaker failures", async () => {
     const controller = new TrafficController({
       maxConcurrent: 1,
diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts
index 8de3abbe9..ed43ba35e 100644
--- a/packages/core/src/traffic/traffic-controller.ts
+++ b/packages/core/src/traffic/traffic-controller.ts
@@ -250,6 +250,16 @@ export class TrafficController {
   }
 
   reportStreamFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void {
+    const rateLimitKey = this.buildRateLimitKey(metadata);
+    const normalizedRateLimitError = normalizeRateLimitError({
+      error,
+      metadata,
+      tenantId: metadata?.tenantId,
+      key: rateLimitKey,
+      logger: this.trafficLogger,
+    });
+    const errorForHandling = normalizedRateLimitError ?? error;
+
     this.controllerLogger.warn("Stream reported failure", {
       provider: metadata?.provider,
       model: metadata?.model,
@@ -260,20 +270,25 @@ export class TrafficController {
       status: (error as { status?: unknown } | null)?.status,
       statusCode: (error as { statusCode?: unknown } | null)?.statusCode,
     });
-    this.circuitBreaker.recordFailure(metadata, error, this.trafficLogger);
-    const rateLimitKey = this.buildRateLimitKey(metadata);
+    this.circuitBreaker.recordFailure(metadata, errorForHandling, this.trafficLogger);
     const adaptiveKey = this.buildAdaptiveKey(
       metadata,
       metadata?.tenantId ?? "default",
       rateLimitKey,
     );
-    if (error instanceof RateLimitedUpstreamError) {
-      this.recordAdaptiveRateLimitHit(adaptiveKey, error.retryAfterMs);
+    if (errorForHandling instanceof RateLimitedUpstreamError) {
+      this.recordAdaptiveRateLimitHit(adaptiveKey, errorForHandling.retryAfterMs);
     }
-    this.attachTrafficMetadata(
-      error,
-      this.buildTrafficResponseMetadataFromMetadata(metadata, rateLimitKey, Date.now(), error),
+    const traffic = this.buildTrafficResponseMetadataFromMetadata(
+      metadata,
+      rateLimitKey,
+      Date.now(),
+      errorForHandling,
     );
+    this.attachTrafficMetadata(errorForHandling, traffic);
+    if (errorForHandling !== error) {
+      this.attachTrafficMetadata(error, traffic);
+    }
   }
 
   updateRateLimitFromHeaders(

From 7f3c562beaea21895ee10dacce7fc9ee7e47dc85 Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Thu, 25 Dec 2025 18:00:39 +0530
Subject: [PATCH 28/41] fix: pre-detection of token count

---
 packages/core/src/agent/agent.ts              | 39 ++++++++++++
 .../openai-window-rate-limit-strategy.ts      | 25 ++++++--
 .../rate-limit-strategy.ts                    |  2 +-
 .../traffic/traffic-controller-internal.ts    |  2 +
 .../src/traffic/traffic-controller.spec.ts    | 62 +++++++++++++++++++
 .../core/src/traffic/traffic-controller.ts    | 12 +++-
 .../core/src/traffic/traffic-rate-limiter.ts  | 44 ++++++++++---
 packages/core/src/traffic/traffic-types.ts    |  1 +
 8 files changed, 172 insertions(+), 15 deletions(-)

diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts
index 04e86bb9f..6ad913d2c 100644
--- a/packages/core/src/agent/agent.ts
+++ b/packages/core/src/agent/agent.ts
@@ -487,6 +487,7 @@ export class Agent {
         tenantId,
         metadata,
         maxQueueWaitMs: options?.maxQueueWaitMs,
+        estimatedTokens: this.estimateTokens(input, mergedOptions),
         execute: () => this.executeGenerateText(input, mergedOptions, metadata), // Defer actual execution so controller can schedule it
         extractUsage: (result: GenerateTextResultWithContext) =>
           this.extractUsageFromResponse(result),
@@ -867,6 +868,7 @@ export class Agent {
         tenantId,
         metadata,
         maxQueueWaitMs: options?.maxQueueWaitMs,
+        estimatedTokens: this.estimateTokens(input, mergedOptions),
         execute: () => this.executeStreamText(input, mergedOptions, metadata), // Actual streaming work happens after the controller dequeues us
         extractUsage: (result: StreamTextResultWithContext) =>
           this.extractUsageFromResponse(result),
@@ -1584,6 +1586,7 @@ export class Agent {
         tenantId,
         metadata,
         maxQueueWaitMs: options?.maxQueueWaitMs,
+        estimatedTokens: this.estimateTokens(input, mergedOptions),
         execute: () => this.executeGenerateObject(input, schema, mergedOptions, metadata),
         extractUsage: (result: GenerateObjectResultWithContext<z.infer<T>>) =>
           this.extractUsageFromResponse(result),
@@ -1865,6 +1868,7 @@ export class Agent {
         tenantId,
         metadata,
         maxQueueWaitMs: options?.maxQueueWaitMs,
+        estimatedTokens: this.estimateTokens(input, mergedOptions),
         execute: () => this.executeStreamObject(input, schema, mergedOptions, metadata),
         extractUsage: (result: StreamObjectResultWithContext<z.infer<T>>) =>
           this.extractUsageFromResponse(result),
@@ -4161,6 +4165,41 @@ export class Agent {
     };
   }
 
+  private estimateTokens(
+    input: string | UIMessage[] | BaseMessage[],
+    options?: BaseGenerationOptions,
+  ): number | undefined {
+    let text = "";
+    if (typeof input === "string") {
+      text = input;
+    } else if (Array.isArray(input)) {
+      text = input
+        .map((message) => {
+          if (typeof message === "string") return message;
+          if (message && typeof message === "object") {
+            const content = (message as { content?: unknown }).content;
+            if (typeof content === "string") return content;
+            if (content !== undefined) return safeStringify(content);
+            return safeStringify(message);
+          }
+          return String(message ?? "");
+        })
+        .join(" ");
+    } else if (input) {
+      text = safeStringify(input);
+    }
+
+    const inputTokens = text ? Math.ceil(text.length / 4) : 0;
+    const outputTokensRaw =
+      typeof options?.maxOutputTokens === "number" ? options.maxOutputTokens : this.maxOutputTokens;
+    const outputTokens =
+      typeof outputTokensRaw === "number" && Number.isFinite(outputTokensRaw)
+        ? Math.max(0, Math.floor(outputTokensRaw))
+        : 0;
+    const total = inputTokens + outputTokens;
+    return total > 0 ? total : undefined;
+  }
+
   private resolveFallbackTarget(target: FallbackChainEntry): {
     modelOverride?: LanguageModel;
     providerOverride?: string;
diff --git a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
index f180db9cd..32ffc7e45 100644
--- a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
+++ b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
@@ -65,7 +65,7 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy {
       }
     }
 
-    const tokenDecision = this.resolveTokenWindow(logger);
+    const tokenDecision = this.resolveTokenWindow(next, logger);
     if (tokenDecision) return tokenDecision;
     return null;
   }
@@ -93,14 +93,20 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy {
     this.window.onComplete(logger);
   }
 
-  recordUsage(usage: RateLimitUsage, logger?: Logger): void {
+  recordUsage(usage: RateLimitUsage, logger?: Logger, reservedTokens?: number): void {
     if (this.tokensPerMinute === undefined) return;
     const tokens = this.resolveTokenCount(usage);
     if (tokens <= 0) return;
 
     const now = Date.now();
     const state = this.ensureTokenState(now);
-    state.remaining = Math.max(0, state.remaining - tokens);
+    const reserved = typeof reservedTokens === "number" ? reservedTokens : 0;
+    const delta = tokens - reserved;
+    if (delta > 0) {
+      state.remaining = Math.max(0, state.remaining - delta);
+    } else if (delta < 0) {
+      state.remaining = Math.min(state.limit, state.remaining + Math.abs(delta));
+    }
     logger?.child({ module: "rate-limiter" })?.trace?.("OpenAI token usage recorded", {
       rateLimitKey: this.key,
       tokens,
@@ -198,13 +204,22 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy {
     return null;
   }
 
-  private resolveTokenWindow(logger?: Logger): DispatchDecision | null {
+  private resolveTokenWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
     if (this.tokensPerMinute === undefined) return null;
 
     const rateLimitLogger = logger?.child({ module: "rate-limiter" });
     const now = Date.now();
     const state = this.ensureTokenState(now);
+    const estimatedTokens = next.estimatedTokens;
 
-    if (state.remaining > 0) return null;
+    if (typeof estimatedTokens === "number" && estimatedTokens > 0) {
+      if (state.remaining >= estimatedTokens) {
+        state.remaining = Math.max(0, state.remaining - estimatedTokens);
+        next.reservedTokens = estimatedTokens;
+        return null;
+      }
+    } else if (state.remaining > 0) {
+      return null;
+    }
 
     const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS;
     rateLimitLogger?.debug?.("OpenAI token window exhausted; waiting", {
diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts
index 6657c6b26..653fdaf2f 100644
--- a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts
+++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts
@@ -32,7 +32,7 @@ export interface RateLimitStrategy {
   resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null;
   onDispatch(logger?: Logger): void;
   onComplete(logger?: Logger): void;
-  recordUsage?(usage: RateLimitUsage, logger?: Logger): void;
+  recordUsage?(usage: RateLimitUsage, logger?: Logger, reservedTokens?: number): void;
   updateFromHeaders(
     metadata: TrafficRequestMetadata | undefined,
     headers: unknown,
diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts
index cf4358542..aa808b6da 100644
--- a/packages/core/src/traffic/traffic-controller-internal.ts
+++ b/packages/core/src/traffic/traffic-controller-internal.ts
@@ -40,6 +40,8 @@ export interface QueuedRequest {
   tenantId: string;
   enqueuedAt: number;
   dispatchedAt?: number;
+  estimatedTokens?: number;
+  reservedTokens?: number;
   tenantConcurrencyKey?: string;
   providerModelConcurrencyKey?: string;
diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
index 881a51723..4c2672788 100644
--- a/packages/core/src/traffic/traffic-controller.spec.ts
+++ b/packages/core/src/traffic/traffic-controller.spec.ts
@@ -323,6 +323,68 @@
       vi.useRealTimers();
     }
   });
+
+  it("reserves estimated tokens before dispatch", async () => {
+    vi.useFakeTimers();
+
+    try {
+      vi.setSystemTime(new Date(0));
+      const controller = new TrafficController({
+        maxConcurrent: 2,
+        rateLimits: {
+          "openai::gpt-4o": {
+            requestsPerMinute: 0,
+            tokensPerMinute: 2,
+          },
+        },
+      });
+      const order: string[] = [];
+      let releaseFirst!: () => void;
+      const firstGate = new Promise<void>((resolve) => {
+        releaseFirst = resolve;
+      });
+
+      const first = controller.handleText({
+        tenantId: "tenant-a",
+        metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+        estimatedTokens: 2,
+        execute: async () => {
+          order.push("first");
+          await firstGate;
+          return "first";
+        },
+        extractUsage: () => ({ totalTokens: 2 }),
+      });
+
+      const second = controller.handleText({
+        tenantId: "tenant-b",
+        metadata: { provider: "openai", model: "gpt-4o", priority: "P1" },
+        estimatedTokens: 1,
+        execute: async () => {
+          order.push("second");
+          return "second";
+        },
+        extractUsage: () => ({ totalTokens: 1 }),
+      });
+
+      await Promise.resolve();
+      expect(order).toEqual(["first"]);
+
+      await vi.advanceTimersByTimeAsync(60_000 + RATE_LIMIT_PROBE_DELAY_MS - 1);
+      await Promise.resolve();
+      expect(order).toEqual(["first"]);
+
+      await vi.advanceTimersByTimeAsync(1);
+      await vi.runAllTimersAsync();
+      await Promise.resolve();
+      expect(order).toEqual(["first", "second"]);
+
+      releaseFirst();
+      await Promise.all([first, second]);
+    } finally {
+      vi.useRealTimers();
+    }
+  });
 });
 
 describe("TrafficController stream reporting", () => {
diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts
index ed43ba35e..1a8c2f2ef 100644
--- a/packages/core/src/traffic/traffic-controller.ts
+++ b/packages/core/src/traffic/traffic-controller.ts
@@ -368,6 +368,7 @@ export class TrafficController {
       priority,
       tenantId,
       enqueuedAt: Date.now(),
+      estimatedTokens: request.estimatedTokens,
       extractUsage: request.extractUsage,
     });
     this.scheduleDrain();
@@ -648,7 +649,7 @@ export class TrafficController {
       this.circuitBreaker.recordSuccess(item.request.metadata, this.trafficLogger);
     }
     const usage = this.usageTracker.recordUsage(item, result, this.trafficLogger);
-    this.rateLimiter.recordUsage(rateLimitKey, usage, this.trafficLogger);
+    this.rateLimiter.recordUsage(rateLimitKey, usage, this.trafficLogger, item.reservedTokens);
     this.recordAdaptiveSuccess(adaptiveKey);
     this.attachTrafficMetadata(
       result,
@@ -666,6 +667,14 @@ export class TrafficController {
     });
     const errorForHandling = normalizedRateLimitError ?? error;
     const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey);
+    if (typeof item.reservedTokens === "number" && item.reservedTokens > 0) {
+      this.rateLimiter.recordUsage(
+        rateLimitKey,
+        { totalTokens: 0 },
+        this.trafficLogger,
+        item.reservedTokens,
+      );
+    }
     if (errorForHandling instanceof RateLimitedUpstreamError) {
       this.recordAdaptiveRateLimitHit(adaptiveKey, errorForHandling.retryAfterMs);
     }
@@ -771,6 +780,7 @@ export class TrafficController {
       attempt: item.attempt + 1,
       enqueuedAt: Date.now(),
       dispatchedAt: undefined,
+      reservedTokens: undefined,
       tenantConcurrencyKey: undefined,
       providerModelConcurrencyKey: undefined,
       rateLimitKey: undefined,
diff --git a/packages/core/src/traffic/traffic-rate-limiter.ts b/packages/core/src/traffic/traffic-rate-limiter.ts
index a77a0423d..3e5aefbed 100644
--- a/packages/core/src/traffic/traffic-rate-limiter.ts
+++ b/packages/core/src/traffic/traffic-rate-limiter.ts
@@ -59,7 +59,7 @@ export class TrafficRateLimiter {
     if (requestDecision?.kind === "wait") {
       const tokenDecision = strategy.handlesTokenLimits
         ? null
-        : this.resolveTokenLimit(key, logger);
+        : this.resolveTokenLimit(next, key, logger, false);
       if (tokenDecision?.kind === "wait") {
         const requestWakeUp = requestDecision.wakeUpAt;
         const tokenWakeUp = tokenDecision.wakeUpAt;
@@ -73,7 +73,9 @@
       return requestDecision;
     }
 
-    const tokenDecision = strategy.handlesTokenLimits ? null : this.resolveTokenLimit(key, logger);
+    const tokenDecision = strategy.handlesTokenLimits
+      ? null
+      : this.resolveTokenLimit(next, key, logger, true);
     if (tokenDecision?.kind === "wait") {
       return tokenDecision;
     }
@@ -126,18 +128,19 @@
     key: string | undefined,
     usage: UsageCounters | Promise<UsageCounters> | undefined,
     logger?: Logger,
+    reservedTokens?: number,
   ): void {
     if (!key || !usage) return;
     if (typeof (usage as PromiseLike<UsageCounters>).then === "function") {
       void (usage as Promise<UsageCounters>)
-        .then((resolved) => this.recordUsage(key, resolved, logger))
+        .then((resolved) => this.recordUsage(key, resolved, logger, reservedTokens))
         .catch(() => {});
       return;
     }
 
     const strategy = this.strategies.get(key);
     if (strategy?.recordUsage) {
-      strategy.recordUsage(usage, logger);
+      strategy.recordUsage(usage, logger, reservedTokens);
       return;
     }
 
@@ -150,7 +153,13 @@
     const now = Date.now();
     this.refillTokenRate(bucket, now);
     bucket.tokens = Math.min(bucket.capacity, bucket.tokens);
-    bucket.tokens -= tokens;
+    const reserved = typeof reservedTokens === "number" ? reservedTokens : 0;
+    const delta = tokens - reserved;
+    if (delta > 0) {
+      bucket.tokens -= delta;
+    } else if (delta < 0) {
+      bucket.tokens = Math.min(bucket.capacity, bucket.tokens + Math.abs(delta));
+    }
 
     if (bucket.tokens < 0 && bucket.refillPerSecond > 0) {
       const waitMs = Math.max(1, Math.ceil((-bucket.tokens / bucket.refillPerSecond) * 1000));
@@ -184,7 +193,12 @@
     return created;
   }
 
-  private resolveTokenLimit(key: string, logger?: Logger): DispatchDecision | null {
+  private resolveTokenLimit(
+    next: QueuedRequest,
+    key: string,
+    logger?: Logger,
+    reserveTokens = true,
+  ): DispatchDecision | null {
     const bucket = this.getTokenRateState(key, logger);
     if (!bucket) return null;
 
@@ -200,7 +214,18 @@
       return { kind: "wait" };
     }
 
-    if (bucket.tokens >= 0) return null;
+    const estimatedTokens = next.estimatedTokens;
+    if (typeof estimatedTokens === "number" && estimatedTokens > 0) {
+      if (bucket.tokens >= estimatedTokens) {
+        if (reserveTokens) {
+          bucket.tokens -= estimatedTokens;
+          next.reservedTokens = estimatedTokens;
+        }
+        return null;
+      }
+    } else if (bucket.tokens >= 0) {
+      return null;
+    }
 
     if (bucket.refillPerSecond <= 0) {
       logger?.child({ module: "rate-limiter" })?.debug?.("Token limit has no refill; blocking", {
@@ -211,7 +236,10 @@
       return { kind: "wait" };
     }
 
-    const requiredTokens = -bucket.tokens;
+    const requiredTokens =
+      typeof estimatedTokens === "number" && estimatedTokens > 0
+        ? Math.max(estimatedTokens - bucket.tokens, 1)
+        : -bucket.tokens;
     const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000));
     return { kind: "wait", wakeUpAt: now + waitMs };
   }
diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts
index f2ebbafbb..80cfc3724 100644
--- a/packages/core/src/traffic/traffic-types.ts
+++ b/packages/core/src/traffic/traffic-types.ts
@@ -109,6 +109,7 @@ export interface TrafficRequest {
   execute: () => Promise<TResponse>;
   deadlineAt?: number;
   maxQueueWaitMs?: number;
+  estimatedTokens?: number;
   createFallbackRequest?: BivariantFunction<
     [target: FallbackChainEntry],
     TrafficRequest | undefined

From 726487a33ee7440952f3c53be2052a525c0eb3ef Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Thu, 25 Dec 2025 20:43:04 +0530
Subject: =?UTF-8?q?fix:=20per=E2=80=91key=20traffic=20fields?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/core/src/agent/agent.ts              | 32 ++++++++
 packages/core/src/agent/types.ts              | 12 +++
 .../src/traffic/traffic-circuit-breaker.ts    |  7 ++
 .../src/traffic/traffic-controller.spec.ts    | 81 +++++++++++++++++++
 .../core/src/traffic/traffic-controller.ts    |  2 +-
 5 files changed, 133 insertions(+), 1 deletion(-)

diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts
index 6ad913d2c..f9f5cc1c4 100644
--- a/packages/core/src/agent/agent.ts
+++ b/packages/core/src/agent/agent.ts
@@ -270,6 +270,22 @@ export interface BaseGenerationOptions extends Partial {
   userId?: string;
   conversationId?: string;
   tenantId?: string;
+  /**
+   * Optional key metadata for per-key rate limits.
+   */
+  apiKeyId?: string;
+  /**
+   * Optional region metadata for per-region rate limits.
+   */
+  region?: string;
+  /**
+   * Optional endpoint metadata for per-endpoint rate limits.
+   */
+  endpoint?: string;
+  /**
+   * Optional tenant tier metadata for per-tier rate limits.
+   */
+  tenantTier?: string;
   context?: ContextInput;
   elicitation?: (request: unknown) => Promise<unknown>;
   /**
@@ -2336,6 +2352,10 @@ export class Agent {
     const startTimeDate = new Date();
     const priority = this.resolveTrafficPriority(options);
     const tenantId = this.resolveTenantId(options);
+    const apiKeyId = options?.apiKeyId ?? options?.parentOperationContext?.apiKeyId;
+    const region = options?.region ?? options?.parentOperationContext?.region;
+    const endpoint = options?.endpoint ?? options?.parentOperationContext?.endpoint;
+    const tenantTier = options?.tenantTier ?? options?.parentOperationContext?.tenantTier;
 
     // Prefer reusing an existing context instance to preserve reference across calls/subagents
     const runtimeContext = toContextMap(options?.context);
@@ -2445,6 +2465,10 @@
       userId: options?.userId,
       conversationId: options?.conversationId,
       tenantId,
+      apiKeyId,
+      region,
+      endpoint,
+      tenantTier,
       parentAgentId: options?.parentAgentId,
       traceContext,
       startTime: startTimeDate,
@@ -4152,6 +4176,10 @@
       this.resolveProvider(this.model) ??
       undefined;
     const priority = this.resolveTrafficPriority(options);
+    const apiKeyId = options?.apiKeyId ?? options?.parentOperationContext?.apiKeyId;
+    const region = options?.region ?? options?.parentOperationContext?.region;
+    const endpoint = options?.endpoint ?? options?.parentOperationContext?.endpoint;
+    const tenantTier = options?.tenantTier ?? options?.parentOperationContext?.tenantTier;
 
     return {
       agentId: this.id, // Identify which agent issued the request
@@ -4160,6 +4188,10 @@
       provider, // Allows per-provider throttling later
       priority,
       tenantId: this.resolveTenantId(options),
+      apiKeyId,
+      region,
+      endpoint,
+      tenantTier,
       taskType: options?.taskType,
       fallbackPolicyId: options?.fallbackPolicyId,
     };
diff --git a/packages/core/src/agent/types.ts b/packages/core/src/agent/types.ts
index add69edfd..c70bd478e 100644
--- a/packages/core/src/agent/types.ts
+++ b/packages/core/src/agent/types.ts
@@ -900,6 +900,18 @@ export type OperationContext = {
   /** Optional tenant identifier propagated across nested operations */
   tenantId?: string;
 
+  /** Optional key identifier for per-key traffic limits */
+  apiKeyId?: string;
+
+  /** Optional region identifier for per-region traffic limits */
+  region?: string;
+
+  /** Optional endpoint identifier for per-endpoint traffic limits */
+  endpoint?: string;
+
+  /** Optional tenant tier identifier for per-tier traffic limits */
+  tenantTier?: string;
+
   /** User-managed context map for this operation */
   readonly context: Map<string | symbol, unknown>;
diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts
index f240ce405..2936a5870 100644
--- a/packages/core/src/traffic/traffic-circuit-breaker.ts
+++ b/packages/core/src/traffic/traffic-circuit-breaker.ts
@@ -359,12 +359,19 @@
   ): void {
     next.request = fallbackRequest;
     next.attempt = 1;
+    next.estimatedTokens = fallbackRequest.estimatedTokens;
+    next.reservedTokens = undefined;
     next.tenantConcurrencyKey = undefined;
     next.providerModelConcurrencyKey = undefined;
     next.rateLimitKey = undefined;
     next.etaMs = undefined;
     next.circuitKey = undefined;
     next.circuitStatus = undefined;
+    next.extractUsage = fallbackRequest.extractUsage;
+    if (context?.reason === "queue-timeout") {
+      next.enqueuedAt = Date.now();
+      next.dispatchedAt = undefined;
+    }
     logger?.debug?.("Switched to fallback request", {
       previousCircuitKey: context?.previousCircuitKey,
       fallbackModel: fallback,
diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
index 4c2672788..51c08c9da 100644
--- a/packages/core/src/traffic/traffic-controller.spec.ts
+++ b/packages/core/src/traffic/traffic-controller.spec.ts
@@ -488,3 +488,84 @@
     expect(order).toEqual(["fallback"]);
   });
 });
+
+describe("TrafficController queue timeouts", () => {
+  it("lets fallback requests wait after queue timeout without rejecting", async () => {
+    vi.useFakeTimers();
+
+    try {
+      vi.setSystemTime(new Date(0));
+      const controller = new TrafficController({
+        maxConcurrent: 1,
+        fallbackChains: {
+          "p::m": ["m-fallback"],
+        },
+      });
+      const order: string[] = [];
+      let releaseFirst!: () => void;
+      const firstGate = new Promise<void>((resolve) => {
+        releaseFirst = resolve;
+      });
+
+      const first = controller.handleText({
+        tenantId: "tenant-a",
+        metadata: { provider: "p", model: "m", priority: "P1" },
+        execute: async () => {
+          order.push("first");
+          await firstGate;
+          return "first";
+        },
+      });
+
+      const second = controller.handleText({
+        tenantId: "tenant-a",
+        metadata: { provider: "p", model: "m", priority: "P1" },
+        maxQueueWaitMs: 1,
+        execute: async () => {
+          order.push("primary");
+          return "primary";
+        },
+        createFallbackRequest: (target) => ({
+          tenantId: "tenant-a",
+          metadata: {
+            provider: "p",
+            model: typeof target === "string" ? target : target.model,
+            priority: "P1",
+          },
+          maxQueueWaitMs: 1,
+          execute: async () => {
+            order.push("fallback");
+            return "fallback";
+          },
+        }),
+      });
+
+      await Promise.resolve();
+      expect(order).toEqual(["first"]);
+
+      await vi.advanceTimersByTimeAsync(2);
+
+      const third = controller.handleText({
+        tenantId: "tenant-a",
+        metadata: { provider: "p", model: "other", priority: "P1" },
+        execute: async () => {
+          order.push("third");
+          return "third";
+        },
+      });
+
+      await Promise.resolve();
+      expect(order).toEqual(["first"]);
+
+      releaseFirst();
+      await vi.runAllTimersAsync();
+
+      await expect(second).resolves.toBe("fallback");
+      await Promise.all([first, third]);
+
+      expect(order).toEqual(["first", "fallback", "third"]);
+    } finally {
+      vi.useRealTimers();
+    }
+  });
+});
diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts
index 1a8c2f2ef..3c704dbde 100644
--- a/packages/core/src/traffic/traffic-controller.ts
+++ b/packages/core/src/traffic/traffic-controller.ts
@@ -857,7 +857,7 @@ export class TrafficController {
       this.trafficLogger,
     );
     if (fallbackApplied) {
-      return "expired";
+      return "none";
     }
 
     const timeoutError = this.createQueueTimeoutError(next, now);

From 426960e7ece717582e404eb1c676c8bff7c09851 Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Thu, 25 Dec 2025 20:49:15 +0530
Subject: [PATCH 30/41] chore: sanity

---
 diff.txt | 8147 ------------------------------------------------------
 1 file changed, 8147 deletions(-)
 delete mode 100644 diff.txt

diff --git a/diff.txt b/diff.txt
deleted file mode 100644
index d84dc9926..000000000
--- a/diff.txt
+++ /dev/null
@@ -1,8147 +0,0 @@
-diff --git a/commits.txt b/commits.txt
-new file mode 100644
-index 00000000..73fd43c5
---- /dev/null
-+++ b/commits.txt
-@@ -0,0 +1,6 @@
-+e8443df2
-+9503a0a6
-+293fe825
-+a88ecd67
-+66d74dd2
-+53f34370
-\ No newline at end of file
-diff --git a/diff.txt b/diff.txt
-new file mode 100644
-index 00000000..e69de29b
-diff --git a/examples/with-client-side-tools/next-env.d.ts b/examples/with-client-side-tools/next-env.d.ts
-index 1b3be084..9edff1c7 100644
---- a/examples/with-client-side-tools/next-env.d.ts
-+++ b/examples/with-client-side-tools/next-env.d.ts
-@@ -1,5 +1,6 @@
- /// <reference types="next" />
- /// <reference types="next/image-types/global" />
-+import "./.next/types/routes.d.ts";
- 
- // NOTE: This file should not be edited
- // see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
-diff --git a/examples/with-client-side-tools/tsconfig.json b/examples/with-client-side-tools/tsconfig.json -index 3697fcb9..0fca67d3 100644 ---- a/examples/with-client-side-tools/tsconfig.json -+++ b/examples/with-client-side-tools/tsconfig.json -@@ -1,6 +1,10 @@ - { - "compilerOptions": { -- "lib": ["dom", "dom.iterable", "esnext"], -+ "lib": [ -+ "dom", -+ "dom.iterable", -+ "esnext" -+ ], - "allowJs": true, - "skipLibCheck": true, - "strict": true, -@@ -11,7 +15,7 @@ - "resolveJsonModule": true, - "isolatedModules": true, - "sourceMap": true, -- "jsx": "preserve", -+ "jsx": "react-jsx", - "incremental": true, - "plugins": [ - { -@@ -19,10 +23,20 @@ - } - ], - "paths": { -- "@/*": ["./*"] -+ "@/*": [ -+ "./*" -+ ] - }, - "target": "ES2017" - }, -- "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], -- "exclude": ["node_modules"] -+ "include": [ -+ "next-env.d.ts", -+ "**/*.ts", -+ "**/*.tsx", -+ ".next/types/**/*.ts", -+ ".next/dev/types/**/*.ts" -+ ], -+ "exclude": [ -+ "node_modules" -+ ] - } -diff --git a/examples/with-netlify-functions/netlify/functions/voltagent.js b/examples/with-netlify-functions/netlify/functions/voltagent.js -new file mode 100644 -index 00000000..0ec386b8 ---- /dev/null -+++ b/examples/with-netlify-functions/netlify/functions/voltagent.js -@@ -0,0 +1,4 @@ -+import { createNetlifyFunctionHandler } from "@voltagent/serverless-hono"; -+import { getVoltAgent } from "../../src/index"; -+const voltAgent = getVoltAgent(); -+export const handler = createNetlifyFunctionHandler(voltAgent); -diff --git a/examples/with-netlify-functions/src/index.js b/examples/with-netlify-functions/src/index.js -new file mode 100644 -index 00000000..af385b50 ---- /dev/null -+++ b/examples/with-netlify-functions/src/index.js -@@ -0,0 +1,17 @@ -+import { openai } from "@ai-sdk/openai"; -+import { Agent, VoltAgent } from "@voltagent/core"; -+import { serverlessHono } from "@voltagent/serverless-hono"; -+import { weatherTool } from "./tools"; -+const agent = new Agent({ -+ name: "netlify-function-agent", -+ instructions: "Help the user quickly and call tools when needed.", -+ model: openai("gpt-4o-mini"), -+ tools: [weatherTool], -+}); -+const voltAgent = new VoltAgent({ -+ agents: { agent }, -+ serverless: serverlessHono(), -+}); -+export function getVoltAgent() { -+ return voltAgent; -+} -diff --git a/examples/with-netlify-functions/src/tools/index.js b/examples/with-netlify-functions/src/tools/index.js -new file mode 100644 -index 00000000..d1c5bf43 ---- /dev/null -+++ b/examples/with-netlify-functions/src/tools/index.js -@@ -0,0 +1,26 @@ -+import { createTool } from "@voltagent/core"; -+import z from "zod"; -+export const weatherTool = createTool({ -+ id: "get-weather", -+ name: "getWeather", -+ description: "Return a mock weather report for the requested location", -+ parameters: z.object({ -+ location: z.string().describe("City or location to look up"), -+ }), -+ execute: async ({ location }, context) => { -+ context?.logger.info(`Fetching weather for ${location}`); -+ const mockWeatherData = { -+ location, -+ temperature: Math.floor(Math.random() * 30) + 5, -+ condition: ["Sunny", "Cloudy", "Rainy", "Snowy", "Partly Cloudy"][ -+ Math.floor(Math.random() * 5) -+ ], -+ humidity: Math.floor(Math.random() * 60) + 30, -+ windSpeed: Math.floor(Math.random() * 30), -+ }; -+ return { -+ weather: mockWeatherData, -+ message: `Current weather in ${location}: ${mockWeatherData.temperature}°C and ${mockWeatherData.condition.toLowerCase()} with ${mockWeatherData.humidity}% 
humidity and wind speed of ${mockWeatherData.windSpeed} km/h.`, -+ }; -+ }, -+}); -diff --git a/package.json b/package.json -index 7c80f7c5..7e3ef8ba 100644 ---- a/package.json -+++ b/package.json -@@ -32,9 +32,10 @@ - "publint": "^0.3.8", - "rimraf": "^5.0.5", - "syncpack": "^13.0.2", -+ "ts-node": "^10.9.2", - "tslib": "^2.3.0", - "tsup": "^8.5.0", -- "typescript": "^5.8.2", -+ "typescript": "^5.9.2", - "vite": "^7.2.7", - "vitest": "^3.2.4" - }, -diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts -index 291bdf7f..04e86bb9 100644 ---- a/packages/core/src/agent/agent.ts -+++ b/packages/core/src/agent/agent.ts -@@ -48,6 +48,13 @@ import type { BaseRetriever } from "../retriever/retriever"; - import type { Tool, Toolkit } from "../tool"; - import { createTool } from "../tool"; - import { ToolManager } from "../tool/manager"; -+import { -+ type FallbackChainEntry, -+ type TrafficPriority, -+ type TrafficRequestMetadata, -+ getTrafficController, -+} from "../traffic/traffic-controller"; -+import { findHeaders } from "../traffic/traffic-error-utils"; - import { randomUUID } from "../utils/id"; - import { convertModelMessagesToUIMessages } from "../utils/message-converter"; - import { NodeType, createNodeId } from "../utils/node-utils"; -@@ -262,8 +269,26 @@ export interface BaseGenerationOptions extends Partial { - // Context - userId?: string; - conversationId?: string; -+ tenantId?: string; - context?: ContextInput; - elicitation?: (request: unknown) => Promise; -+ /** -+ * Optional priority override for scheduling. -+ * Defaults to agent-level priority when omitted. -+ */ -+ trafficPriority?: TrafficPriority; -+ /** -+ * Optional maximum time to wait in the queue before timing out. -+ */ -+ maxQueueWaitMs?: number; -+ /** -+ * Optional task classification for circuit-breaker fallback policies. -+ */ -+ taskType?: string; -+ /** -+ * Optional explicit fallback policy id. -+ */ -+ fallbackPolicyId?: string; - - // Parent tracking - parentAgentId?: string; -@@ -303,6 +328,8 @@ export interface BaseGenerationOptions extends Partial { - - // Provider-specific options - providerOptions?: ProviderOptions; -+ // Optional per-call model override (used for fallbacks) -+ model?: LanguageModel; - - // Experimental output (for structured generation) - experimental_output?: ReturnType | ReturnType; -@@ -347,6 +374,7 @@ export class Agent { - readonly voice?: Voice; - readonly retriever?: BaseRetriever; - readonly supervisorConfig?: SupervisorConfig; -+ private readonly trafficPriority: TrafficPriority; - private readonly context?: Map; - - private readonly logger: Logger; -@@ -372,6 +400,7 @@ export class Agent { - this.temperature = options.temperature; - this.maxOutputTokens = options.maxOutputTokens; - this.maxSteps = options.maxSteps || 5; -+ this.trafficPriority = options.trafficPriority ?? "P1"; - this.stopWhen = options.stopWhen; - this.markdown = options.markdown ?? 
false; - this.voice = options.voice; -@@ -444,6 +473,38 @@ export class Agent { - async generateText( - input: string | UIMessage[] | BaseMessage[], - options?: GenerateTextOptions, -+ ): Promise { -+ const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics -+ const tenantId = this.resolveTenantId(options); -+ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { -+ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); -+ const metadata = this.buildTrafficMetadata( -+ mergedOptions?.model, -+ mergedOptions, -+ providerOverride, -+ ); // Compute once per queued request (including per-call model overrides) -+ return { -+ tenantId, -+ metadata, -+ maxQueueWaitMs: options?.maxQueueWaitMs, -+ execute: () => this.executeGenerateText(input, mergedOptions, metadata), // Defer actual execution so controller can schedule it -+ extractUsage: (result: GenerateTextResultWithContext) => -+ this.extractUsageFromResponse(result), -+ createFallbackRequest: (fallbackTarget) => { -+ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = -+ this.resolveFallbackTarget(fallbackTarget); -+ return buildRequest(fallbackModel, fallbackProvider); -+ }, -+ }; -+ }; -+ -+ return controller.handleText(buildRequest(options?.model)); -+ } -+ -+ private async executeGenerateText( -+ input: string | UIMessage[] | BaseMessage[], -+ options?: GenerateTextOptions, -+ trafficMetadata?: TrafficRequestMetadata, - ): Promise { - const startTime = Date.now(); - const oc = this.createOperationContext(input, options); -@@ -471,7 +532,7 @@ export class Agent { - options, - ); - -- const modelName = this.getModelName(); -+ const modelName = this.getModelName(model); - const contextLimit = options?.contextLimit; - - // Add model attributes and all options -@@ -544,10 +605,18 @@ export class Agent { - hooks, - maxSteps: userMaxSteps, - tools: userTools, -+ maxQueueWaitMs, -+ taskType, -+ fallbackPolicyId, - experimental_output, - providerOptions, -+ model: _model, // Exclude model so aiSDKOptions doesn't override resolved model - ...aiSDKOptions - } = options || {}; -+ void _model; -+ void maxQueueWaitMs; -+ void taskType; -+ void fallbackPolicyId; - - const llmSpan = this.createLLMSpan(oc, { - operation: "generateText", -@@ -567,6 +636,11 @@ export class Agent { - - let result!: GenerateTextResult; - try { -+ methodLogger.info("[AI SDK] Calling generateText", { -+ messageCount: messages.length, -+ modelName, -+ tools: tools ? Object.keys(tools) : [], -+ }); - result = await oc.traceContext.withSpan(llmSpan, () => - generateText({ - model, -@@ -575,7 +649,7 @@ export class Agent { - // Default values - temperature: this.temperature, - maxOutputTokens: this.maxOutputTokens, -- maxRetries: 3, -+ maxRetries: 0, - stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps), - // User overrides from AI SDK options - ...aiSDKOptions, -@@ -588,7 +662,15 @@ export class Agent { - onStepFinish: this.createStepHandler(oc, options), - }), - ); -+ methodLogger.info("[AI SDK] Received generateText result", { -+ finishReason: result.finishReason, -+ usage: result.usage ? safeStringify(result.usage) : undefined, -+ stepCount: result.steps?.length ?? 
0, -+ rawResult: safeStringify(result), -+ }); -+ this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger); - } catch (error) { -+ this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger); - finalizeLLMSpan(SpanStatusCode.ERROR, { message: (error as Error).message }); - throw error; - } -@@ -771,6 +853,38 @@ export class Agent { - async streamText( - input: string | UIMessage[] | BaseMessage[], - options?: StreamTextOptions, -+ ): Promise { -+ const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent -+ const tenantId = this.resolveTenantId(options); -+ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { -+ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); -+ const metadata = this.buildTrafficMetadata( -+ mergedOptions?.model, -+ mergedOptions, -+ providerOverride, -+ ); // Compute once per queued request (including per-call model overrides) -+ return { -+ tenantId, -+ metadata, -+ maxQueueWaitMs: options?.maxQueueWaitMs, -+ execute: () => this.executeStreamText(input, mergedOptions, metadata), // Actual streaming work happens after the controller dequeues us -+ extractUsage: (result: StreamTextResultWithContext) => -+ this.extractUsageFromResponse(result), -+ createFallbackRequest: (fallbackTarget) => { -+ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = -+ this.resolveFallbackTarget(fallbackTarget); -+ return buildRequest(fallbackModel, fallbackProvider); -+ }, -+ }; -+ }; -+ -+ return controller.handleStream(buildRequest(options?.model)); -+ } -+ -+ private async executeStreamText( -+ input: string | UIMessage[] | BaseMessage[], -+ options?: StreamTextOptions, -+ trafficMetadata?: TrafficRequestMetadata, - ): Promise { - const startTime = Date.now(); - const oc = this.createOperationContext(input, options); -@@ -800,7 +914,7 @@ export class Agent { - options, - ); - -- const modelName = this.getModelName(); -+ const modelName = this.getModelName(model); - const contextLimit = options?.contextLimit; - - // Add model attributes to root span if TraceContext exists -@@ -868,10 +982,18 @@ export class Agent { - maxSteps: userMaxSteps, - tools: userTools, - onFinish: userOnFinish, -+ maxQueueWaitMs, -+ taskType, -+ fallbackPolicyId, - experimental_output, - providerOptions, -+ model: _model, // Exclude model from aiSDKOptions to avoid overriding resolved model - ...aiSDKOptions - } = options || {}; -+ void _model; -+ void maxQueueWaitMs; -+ void taskType; -+ void fallbackPolicyId; - - const guardrailStreamingEnabled = guardrailSet.output.length > 0; - -@@ -893,7 +1015,13 @@ export class Agent { - }, - }); - const finalizeLLMSpan = this.createLLMSpanFinalizer(llmSpan); -+ const trafficController = getTrafficController({ logger: this.logger }); - -+ methodLogger.info("[AI SDK] Calling streamText", { -+ messageCount: messages.length, -+ modelName, -+ tools: tools ? Object.keys(tools) : [], -+ }); - const result = streamText({ - model, - messages, -@@ -901,7 +1029,7 @@ export class Agent { - // Default values - temperature: this.temperature, - maxOutputTokens: this.maxOutputTokens, -- maxRetries: 3, -+ maxRetries: 0, // Retry via traffic controller to avoid provider-level storms - stopWhen: options?.stopWhen ?? this.stopWhen ?? 
stepCountIs(maxSteps), - // User overrides from AI SDK options - ...aiSDKOptions, -@@ -937,6 +1065,8 @@ export class Agent { - modelName: this.getModelName(), - }); - -+ this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger); -+ trafficController.reportStreamFailure(trafficMetadata, actualError); - finalizeLLMSpan(SpanStatusCode.ERROR, { message: (actualError as Error)?.message }); - - // History update removed - using OpenTelemetry only -@@ -962,6 +1092,18 @@ export class Agent { - .catch(() => {}); - }, - onFinish: async (finalResult) => { -+ methodLogger.info("[AI SDK] streamText finished", { -+ finishReason: finalResult.finishReason, -+ usage: finalResult.totalUsage ? safeStringify(finalResult.totalUsage) : undefined, -+ stepCount: finalResult.steps?.length ?? 0, -+ rawResult: safeStringify(finalResult), -+ }); -+ this.updateTrafficControllerRateLimits( -+ finalResult.response, -+ trafficMetadata, -+ methodLogger, -+ ); -+ trafficController.reportStreamSuccess(trafficMetadata); - const providerUsage = finalResult.usage - ? await Promise.resolve(finalResult.usage) - : undefined; -@@ -1428,6 +1570,39 @@ export class Agent { - input: string | UIMessage[] | BaseMessage[], - schema: T, - options?: GenerateObjectOptions, -+ ): Promise>> { -+ const controller = getTrafficController({ logger: this.logger }); -+ const tenantId = this.resolveTenantId(options); -+ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { -+ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); -+ const metadata = this.buildTrafficMetadata( -+ mergedOptions?.model, -+ mergedOptions, -+ providerOverride, -+ ); // Compute once per queued request (including per-call model overrides) -+ return { -+ tenantId, -+ metadata, -+ maxQueueWaitMs: options?.maxQueueWaitMs, -+ execute: () => this.executeGenerateObject(input, schema, mergedOptions, metadata), -+ extractUsage: (result: GenerateObjectResultWithContext>) => -+ this.extractUsageFromResponse(result), -+ createFallbackRequest: (fallbackTarget) => { -+ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = -+ this.resolveFallbackTarget(fallbackTarget); -+ return buildRequest(fallbackModel, fallbackProvider); -+ }, -+ }; -+ }; -+ -+ return controller.handleText(buildRequest(options?.model)); -+ } -+ -+ private async executeGenerateObject( -+ input: string | UIMessage[] | BaseMessage[], -+ schema: T, -+ options?: GenerateObjectOptions, -+ trafficMetadata?: TrafficRequestMetadata, - ): Promise>> { - const startTime = Date.now(); - const oc = this.createOperationContext(input, options); -@@ -1452,7 +1627,7 @@ export class Agent { - options, - ); - -- const modelName = this.getModelName(); -+ const modelName = this.getModelName(model); - const schemaName = schema.description || "unknown"; - - // Add model attributes and all options -@@ -1510,10 +1685,23 @@ export class Agent { - hooks, - maxSteps: userMaxSteps, - tools: userTools, -+ taskType, -+ fallbackPolicyId, -+ maxQueueWaitMs, - providerOptions, -+ model: _model, // Exclude model so spread does not override resolved model - ...aiSDKOptions - } = options || {}; -+ void _model; -+ void taskType; -+ void fallbackPolicyId; -+ void maxQueueWaitMs; - -+ methodLogger.info("[AI SDK] Calling generateObject", { -+ messageCount: messages.length, -+ modelName, -+ schemaName, -+ }); - const result = await generateObject({ - model, - messages, -@@ -1522,7 +1710,7 @@ export class Agent { - // Default values - maxOutputTokens: 
-         const providerUsage = finalResult.usage
-           ? await Promise.resolve(finalResult.usage)
-           : undefined;
-@@ -1428,6 +1570,39 @@
-     input: string | UIMessage[] | BaseMessage[],
-     schema: T,
-     options?: GenerateObjectOptions,
-+  ): Promise>> {
-+    const controller = getTrafficController({ logger: this.logger });
-+    const tenantId = this.resolveTenantId(options);
-+    const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => {
-+      const mergedOptions = this.mergeOptionsWithModel(options, modelOverride);
-+      const metadata = this.buildTrafficMetadata(
-+        mergedOptions?.model,
-+        mergedOptions,
-+        providerOverride,
-+      ); // Compute once per queued request (including per-call model overrides)
-+      return {
-+        tenantId,
-+        metadata,
-+        maxQueueWaitMs: options?.maxQueueWaitMs,
-+        execute: () => this.executeGenerateObject(input, schema, mergedOptions, metadata),
-+        extractUsage: (result: GenerateObjectResultWithContext>) =>
-+          this.extractUsageFromResponse(result),
-+        createFallbackRequest: (fallbackTarget) => {
-+          const { modelOverride: fallbackModel, providerOverride: fallbackProvider } =
-+            this.resolveFallbackTarget(fallbackTarget);
-+          return buildRequest(fallbackModel, fallbackProvider);
-+        },
-+      };
-+    };
-+
-+    return controller.handleText(buildRequest(options?.model));
-+  }
-+
-+  private async executeGenerateObject(
-+    input: string | UIMessage[] | BaseMessage[],
-+    schema: T,
-+    options?: GenerateObjectOptions,
-+    trafficMetadata?: TrafficRequestMetadata,
-   ): Promise>> {
-     const startTime = Date.now();
-     const oc = this.createOperationContext(input, options);
-@@ -1452,7 +1627,7 @@
-       options,
-     );
-
--    const modelName = this.getModelName();
-+    const modelName = this.getModelName(model);
-     const schemaName = schema.description || "unknown";
-
-     // Add model attributes and all options
-@@ -1510,10 +1685,23 @@
-       hooks,
-       maxSteps: userMaxSteps,
-       tools: userTools,
-+      taskType,
-+      fallbackPolicyId,
-+      maxQueueWaitMs,
-       providerOptions,
-+      model: _model, // Exclude model so spread does not override resolved model
-       ...aiSDKOptions
-     } = options || {};
-+    void _model;
-+    void taskType;
-+    void fallbackPolicyId;
-+    void maxQueueWaitMs;
-
-+    methodLogger.info("[AI SDK] Calling generateObject", {
-+      messageCount: messages.length,
-+      modelName,
-+      schemaName,
-+    });
-     const result = await generateObject({
-       model,
-       messages,
-@@ -1522,7 +1710,7 @@
-       // Default values
-       maxOutputTokens: this.maxOutputTokens,
-       temperature: this.temperature,
--      maxRetries: 3,
-+      maxRetries: 0,
-       // User overrides from AI SDK options
-       ...aiSDKOptions,
-       // Provider-specific options
-@@ -1530,6 +1718,13 @@
-       // VoltAgent controlled
-       abortSignal: oc.abortController.signal,
-     });
-+    methodLogger.info("[AI SDK] Received generateObject result", {
-+      finishReason: result.finishReason,
-+      usage: result.usage ? safeStringify(result.usage) : undefined,
-+      warnings: result.warnings,
-+      rawResult: safeStringify(result),
-+    });
-+    this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger);
-
-     const usageInfo = convertUsage(result.usage);
-     const finalObject = await executeOutputGuardrails({
-@@ -1638,6 +1833,7 @@
-       context: oc.context,
-     };
-     } catch (error) {
-+      this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger);
-       await this.flushPendingMessagesOnError(oc).catch(() => {});
-       return this.handleError(error as Error, oc, options, startTime);
-     } finally {
-@@ -1655,6 +1851,39 @@
-     input: string | UIMessage[] | BaseMessage[],
-     schema: T,
-     options?: StreamObjectOptions,
-+  ): Promise>> {
-+    const controller = getTrafficController({ logger: this.logger });
-+    const tenantId = this.resolveTenantId(options);
-+    const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => {
-+      const mergedOptions = this.mergeOptionsWithModel(options, modelOverride);
-+      const metadata = this.buildTrafficMetadata(
-+        mergedOptions?.model,
-+        mergedOptions,
-+        providerOverride,
-+      ); // Compute once per queued request (including per-call model overrides)
-+      return {
-+        tenantId,
-+        metadata,
-+        maxQueueWaitMs: options?.maxQueueWaitMs,
-+        execute: () => this.executeStreamObject(input, schema, mergedOptions, metadata),
-+        extractUsage: (result: StreamObjectResultWithContext>) =>
-+          this.extractUsageFromResponse(result),
-+        createFallbackRequest: (fallbackTarget) => {
-+          const { modelOverride: fallbackModel, providerOverride: fallbackProvider } =
-+            this.resolveFallbackTarget(fallbackTarget);
-+          return buildRequest(fallbackModel, fallbackProvider);
-+        },
-+      };
-+    };
-+
-+    return controller.handleStream(buildRequest(options?.model));
-+  }
-+
-+  private async executeStreamObject(
-+    input: string | UIMessage[] | BaseMessage[],
-+    schema: T,
-+    options?: StreamObjectOptions,
-+    trafficMetadata?: TrafficRequestMetadata,
-   ): Promise>> {
-     const startTime = Date.now();
-     const oc = this.createOperationContext(input, options);
-@@ -1680,7 +1909,7 @@
-       options,
-     );
-
--    const modelName = this.getModelName();
-+    const modelName = this.getModelName(model);
-     const schemaName = schema.description || "unknown";
-
-     // Add model attributes and all options
-@@ -1739,14 +1968,28 @@
-       maxSteps: userMaxSteps,
-       tools: userTools,
-       onFinish: userOnFinish,
-+      taskType,
-+      fallbackPolicyId,
-+      maxQueueWaitMs,
-       providerOptions,
-+      model: _model, // Exclude model so aiSDKOptions cannot override resolved model
-       ...aiSDKOptions
-     } = options || {};
-+    void _model;
-+    void taskType;
-+    void fallbackPolicyId;
-+    void maxQueueWaitMs;
-
-     let guardrailObjectPromise!: Promise>;
-     let resolveGuardrailObject: ((value: z.infer) => void) | undefined;
-     let rejectGuardrailObject: ((reason: unknown) => void) | undefined;
-+    const trafficController = getTrafficController({ logger: this.logger });
-
-+    methodLogger.info("[AI SDK] Calling streamObject", {
-+      messageCount: messages.length,
-+      modelName,
-+      schemaName,
-+    });
-     const result = streamObject({
-       model,
-       messages,
-@@ -1755,7 +1998,7 @@
-       // Default values
-       maxOutputTokens: this.maxOutputTokens,
-       temperature: this.temperature,
--      maxRetries: 3,
-+      maxRetries: 0,
-       // User overrides from AI SDK options
-       ...aiSDKOptions,
-       // Provider-specific options
-@@ -1771,9 +2014,11 @@
-         methodLogger.error("Stream object error occurred", {
-           error: actualError,
-           agentName: this.name,
--          modelName: this.getModelName(),
-+          modelName: this.getModelName(model),
-           schemaName: schemaName,
-         });
-+        this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger);
-+        trafficController.reportStreamFailure(trafficMetadata, actualError);
-
-         // History update removed - using OpenTelemetry only
-
-@@ -1800,6 +2045,17 @@
-       },
-       onFinish: async (finalResult: any) => {
-         try {
-+          methodLogger.info("[AI SDK] streamObject finished", {
-+            finishReason: finalResult.finishReason,
-+            usage: finalResult.usage ? safeStringify(finalResult.usage) : undefined,
-+            rawResult: safeStringify(finalResult),
-+          });
-+          this.updateTrafficControllerRateLimits(
-+            finalResult.response,
-+            trafficMetadata,
-+            methodLogger,
-+          );
-+          trafficController.reportStreamSuccess(trafficMetadata);
-           const usageInfo = convertUsage(finalResult.usage as any);
-           let finalObject = finalResult.object as z.infer;
-           if (guardrailSet.output.length > 0) {
-@@ -2021,8 +2277,9 @@
-     // Calculate maxSteps (use provided option or calculate based on subagents)
-     const maxSteps = options?.maxSteps ?? this.calculateMaxSteps();
-
--    // Resolve dynamic values
--    const model = await this.resolveValue(this.model, oc);
-+    // Resolve dynamic values (allow per-call model override for fallbacks)
-+    const selectedModel = options?.model ?? this.model;
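-+    // options?.model is populated only by the traffic controller's fallback path (via
-+    // mergeOptionsWithModel below); without an override this resolves exactly as before.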
-+    const model = await this.resolveValue(selectedModel, oc);
-     const dynamicToolList = (await this.resolveValue(this.dynamicTools, oc)) || [];
-
-     // Merge agent tools with option tools
-@@ -2073,6 +2330,8 @@
-   ): OperationContext {
-     const operationId = randomUUID();
-     const startTimeDate = new Date();
-+    const priority = this.resolveTrafficPriority(options);
-+    const tenantId = this.resolveTenantId(options);
-
-     // Prefer reusing an existing context instance to preserve reference across calls/subagents
-     const runtimeContext = toContextMap(options?.context);
-@@ -2123,6 +2382,7 @@
-       operationId,
-       userId: options?.userId,
-       conversationId: options?.conversationId,
-+      tenantId,
-       executionId: operationId,
-     });
-
-@@ -2137,6 +2397,9 @@
-       parentAgentId: options?.parentAgentId,
-       input,
-     });
-+    if (tenantId) {
-+      traceContext.getRootSpan().setAttribute("tenant.id", tenantId);
-+    }
-     traceContext.getRootSpan().setAttribute("voltagent.operation_id", operationId);
-
-     // Use parent's AbortController if available, otherwise create new one
-@@ -2174,8 +2437,10 @@
-       logger,
-       conversationSteps: options?.parentOperationContext?.conversationSteps || [],
-       abortController,
-+      priority,
-       userId: options?.userId,
-       conversationId: options?.conversationId,
-+      tenantId,
-       parentAgentId: options?.parentAgentId,
-       traceContext,
-       startTime: startTimeDate,
-@@ -3170,6 +3435,20 @@
-     return value;
-   }
-
-+  private mergeOptionsWithModel(
-+    options: BaseGenerationOptions | undefined,
-+    modelOverride?: LanguageModel,
-+  ): BaseGenerationOptions | undefined {
-+    if (!options && modelOverride === undefined) {
-+      return undefined;
-+    }
-+
-+    return {
-+      ...(options ?? {}),
-+      ...(modelOverride !== undefined ? { model: modelOverride } : {}),
-+    };
-+  }
-+
-   /**
-    * Prepare tools with execution context
-    */
-@@ -3822,17 +4101,213 @@
-     return this.subAgentManager.calculateMaxSteps(this.maxSteps);
-   }
-
-+  private resolveTrafficPriority(options?: BaseGenerationOptions): TrafficPriority {
-+    const normalize = (value?: TrafficPriority): TrafficPriority | undefined => {
-+      if (value === "P0" || value === "P1" || value === "P2") {
-+        return value;
-+      }
-+      return undefined;
-+    };
-+
-+    const parentPriority = normalize(options?.parentOperationContext?.priority);
-+    const localPriority = normalize(options?.trafficPriority) ?? this.trafficPriority ?? "P1";
-+
-+    if (parentPriority) {
-+      return this.pickHigherPriority(parentPriority, localPriority);
-+    }
-+
-+    return localPriority;
-+  }
-+
-+  private resolveTenantId(options?: BaseGenerationOptions): string {
-+    const parentTenant = options?.parentOperationContext?.tenantId;
-+    if (parentTenant) {
-+      return parentTenant;
-+    }
-+
-+    if (options?.tenantId) {
-+      return options.tenantId;
-+    }
-+
-+    return "default";
-+  }
-+
-+  private pickHigherPriority(a: TrafficPriority, b: TrafficPriority): TrafficPriority {
-+    const rank: Record = { P0: 0, P1: 1, P2: 2 };
-+    return rank[a] <= rank[b] ? a : b;
-+  }
-+
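-+  // Lower rank wins: pickHigherPriority("P0", "P1") returns "P0", so an urgent parent
-+  // can only raise, never lower, a child call's priority.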
-+  private buildTrafficMetadata(
-+    modelOverride?: LanguageModel | DynamicValue,
-+    options?: BaseGenerationOptions,
-+    providerOverride?: string,
-+  ): TrafficRequestMetadata {
-+    const provider =
-+      providerOverride ??
-+      this.resolveProvider(modelOverride) ??
-+      this.resolveProvider(this.model) ??
-+      undefined;
-+    const priority = this.resolveTrafficPriority(options);
-+
-+    return {
-+      agentId: this.id, // Identify which agent issued the request
-+      agentName: this.name, // Human-readable label for logs/metrics
-+      model: this.getModelName(modelOverride), // Used for future capacity policies
-+      provider, // Allows per-provider throttling later
-+      priority,
-+      tenantId: this.resolveTenantId(options),
-+      taskType: options?.taskType,
-+      fallbackPolicyId: options?.fallbackPolicyId,
-+    };
-+  }
-+
-+  private resolveFallbackTarget(target: FallbackChainEntry): {
-+    modelOverride?: LanguageModel;
-+    providerOverride?: string;
-+  } {
-+    if (typeof target === "string") {
-+      return { modelOverride: target };
-+    }
-+    return {
-+      modelOverride: target.model,
-+      providerOverride: target.provider,
-+    };
-+  }
-+
-+  private updateTrafficControllerRateLimits(
-+    response: unknown,
-+    metadata: TrafficRequestMetadata | undefined,
-+    logger?: Logger,
-+  ): void {
-+    const headerCandidates = findHeaders(response);
-+    if (headerCandidates.length === 0) {
-+      logger?.debug?.("[Traffic] No headers found for rate limit update");
-+      return;
-+    }
-+
-+    const controller = getTrafficController();
-+    const effectiveMetadata = metadata ?? this.buildTrafficMetadata();
-+    let updateResult: ReturnType | undefined;
-+    for (const headers of headerCandidates) {
-+      updateResult = controller.updateRateLimitFromHeaders(effectiveMetadata, headers);
-+      if (updateResult) break;
-+    }
-+
-+    if (!updateResult) {
-+      logger?.debug?.("[Traffic] No rate limit headers applied from response");
-+      return;
-+    }
-+
-+    const now = Date.now();
-+    const effectiveRemaining = Math.max(
-+      0,
-+      updateResult.state.remaining - updateResult.state.reserved,
-+    );
-+    const resetInMs = Math.max(0, updateResult.state.resetAt - now);
-+    const nextAllowedInMs = Math.max(0, updateResult.state.nextAllowedAt - now);
-+    logger?.info?.("[Traffic] Applied rate limit from response headers", {
-+      rateLimitKey: updateResult.key,
-+      limit: updateResult.state.limit,
-+      remaining: updateResult.state.remaining,
-+      reserved: updateResult.state.reserved,
-+      effectiveRemaining,
-+      resetAt: updateResult.state.resetAt,
-+      resetInMs,
-+      nextAllowedAt: updateResult.state.nextAllowedAt,
-+      nextAllowedInMs,
-+      headers: {
-+        limitRequests: updateResult.headerSnapshot.limitRequests,
-+        remainingRequests: updateResult.headerSnapshot.remainingRequests,
-+        resetRequestsMs: updateResult.headerSnapshot.resetRequestsMs,
-+      },
-+    });
-+  }
-+
-+  private extractUsageFromResponse(
-+    result:
-+      | {
-+          usage?: LanguageModelUsage | Promise;
-+          totalUsage?: LanguageModelUsage | Promise;
-+        }
-+      | undefined,
-+  ): Promise | LanguageModelUsage | undefined {
-+    if (!result) {
-+      return undefined;
-+    }
-+
-+    const usageCandidate =
-+      (result as { totalUsage?: LanguageModelUsage | Promise })
-+        ?.totalUsage ??
-+      (result as { usage?: LanguageModelUsage | Promise })?.usage;
-+
-+    if (!usageCandidate) {
-+      return undefined;
-+    }
-+
-+    const normalizeUsage = (
-+      usage: LanguageModelUsage | undefined,
-+    ): LanguageModelUsage | undefined => {
-+      if (!usage) return undefined;
-+      const input = Number.isFinite(usage.inputTokens) ? (usage.inputTokens as number) : undefined;
-+      const output = Number.isFinite(usage.outputTokens)
-+        ? (usage.outputTokens as number)
-+        : undefined;
-+      const total = Number.isFinite(usage.totalTokens) ? (usage.totalTokens as number) : undefined;
-+
-+      if (total === undefined && input === undefined && output === undefined) {
-+        return undefined;
-+      }
-+
-+      const safeInput = input ?? 0;
-+      const safeOutput = output ?? 0;
-+      const safeTotal = total ?? safeInput + safeOutput;
-+
-+      return {
-+        ...usage,
-+        inputTokens: safeInput,
-+        outputTokens: safeOutput,
-+        totalTokens: safeTotal,
-+      };
-+    };
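-+    // e.g. { inputTokens: 10, outputTokens: 5, totalTokens: undefined } normalizes to
-+    // totalTokens 15, so token-based limits can always be charged a finite amount.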
-+
-+    if (
-+      typeof (usageCandidate as PromiseLike).then === "function"
-+    ) {
-+      return (usageCandidate as Promise)
-+        .then((usage) => normalizeUsage(usage))
-+        .catch(() => undefined);
-+    }
-+
-+    return normalizeUsage(usageCandidate as LanguageModelUsage);
-+  }
-+
-+  private resolveProvider(
-+    model: LanguageModel | DynamicValue | undefined,
-+  ): string | undefined {
-+    if (
-+      model &&
-+      typeof model === "object" &&
-+      "provider" in model &&
-+      typeof (model as any).provider === "string"
-+    ) {
-+      return (model as any).provider;
-+    }
-+
-+    return undefined;
-+  }
-+
-   /**
-    * Get the model name
-    */
--  public getModelName(): string {
--    if (typeof this.model === "function") {
-+  public getModelName(modelOverride?: LanguageModel | DynamicValue): string {
-+    const selectedModel = modelOverride ?? this.model;
-+    if (typeof selectedModel === "function") {
-       return "dynamic";
-     }
--    if (typeof this.model === "string") {
--      return this.model;
-+    if (typeof selectedModel === "string") {
-+      return selectedModel;
-     }
--    return this.model.modelId || "unknown";
-+    return selectedModel.modelId || "unknown";
-   }
-
-   /**
-diff --git a/packages/core/src/agent/eval.ts b/packages/core/src/agent/eval.ts
-index 9e4fe9f2..de712505 100644
---- a/packages/core/src/agent/eval.ts
-+++ b/packages/core/src/agent/eval.ts
-@@ -711,6 +711,7 @@ function buildEvalPayload(
-     rawOutput: output,
-     userId: oc.userId,
-     conversationId: oc.conversationId,
-+    tenantId: oc.tenantId,
-     traceId: spanContext.traceId,
-     spanId: spanContext.spanId,
-     metadata,
-diff --git a/packages/core/src/agent/types.ts b/packages/core/src/agent/types.ts
-index dd5fb29d..add69edf 100644
---- a/packages/core/src/agent/types.ts
-+++ b/packages/core/src/agent/types.ts
-@@ -29,6 +29,7 @@
- import type { Logger } from "@voltagent/internal";
- import type { LocalScorerDefinition, SamplingPolicy } from "../eval/runtime";
- import type { MemoryOptions, MemoryStorageMetadata, WorkingMemorySummary } from "../memory/types";
- import type { VoltAgentObservability } from "../observability";
-+import type { TrafficPriority } from "../traffic/traffic-controller";
- import type {
-   DynamicValue,
-   DynamicValueOptions,
-@@ -456,6 +457,11 @@
-   temperature?: number;
-   maxOutputTokens?: number;
-   maxSteps?: number;
-+  /**
-+   * Default scheduling priority for this agent's LLM calls.
-+   * Defaults to P1 when unspecified.
-+   */
-+  trafficPriority?: TrafficPriority;
-   /**
-    * Default stop condition for step execution (ai-sdk `stopWhen`).
-    * Per-call `stopWhen` in method options overrides this.
-@@ -493,6 +499,7 @@
-   rawOutput?: unknown;
-   userId?: string;
-   conversationId?: string;
-+  tenantId?: string;
-   traceId: string;
-   spanId: string;
-   metadata?: Record;
-@@ -890,6 +897,9 @@
-   /** Optional conversation identifier associated with this operation */
-   conversationId?: string;
-
-+  /** Optional tenant identifier propagated across nested operations */
-+  tenantId?: string;
-+
-   /** User-managed context map for this operation */
-   readonly context: Map;
-
-@@ -914,6 +924,9 @@
-   /** Conversation steps for building full message history including tool calls/results */
-   conversationSteps?: StepWithContent[];
-
-+  /** Scheduling priority propagated from parent calls */
-+  priority?: TrafficPriority;
-+
-   /** AbortController for cancelling the operation and accessing the signal */
-   abortController: AbortController;
-
-diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
-index 8753f039..3850f0ac 100644
---- a/packages/core/src/index.ts
-+++ b/packages/core/src/index.ts
-@@ -21,6 +21,29 @@ export type {
-   WorkflowTimelineEvent,
-   RegisteredWorkflow,
- } from "./workflow";
-+export {
-+  // Surface traffic controller so downstream consumers can route agent calls through the shared scheduler
-+  TrafficController,
-+  CircuitBreakerOpenError,
-+  QueueWaitTimeoutError,
-+  RateLimitedUpstreamError,
-+  getTrafficController,
-+  type FallbackChainEntry,
-+  type FallbackPolicy,
-+  type FallbackPolicyConfig,
-+  type FallbackPolicyMode,
-+  type FallbackTarget,
-+  type RateLimitConfig,
-+  type RateLimitKey,
-+  type RateLimitOptions,
-+  type AdaptiveLimiterConfig,
-+  type PriorityBurstLimits,
-+  type TrafficRequest,
-+  type TrafficRequestMetadata,
-+  type TrafficResponseMetadata,
-+  type TrafficPriority,
-+  type TrafficRequestType,
-+} from "./traffic/traffic-controller";
- // Export new Agent from agent.ts
- export {
-   Agent,
-diff --git a/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts
-new file mode 100644
-index 00000000..652b7e59
---- /dev/null
-+++ b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts
-@@ -0,0 +1,243 @@
-+import type { Logger } from "../../logger";
-+import {
-+  RATE_LIMIT_EXHAUSTION_BUFFER,
-+  RATE_LIMIT_MIN_PACE_INTERVAL_MS,
-+  RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS,
-+  RATE_LIMIT_PROBE_DELAY_MS,
-+} from "../traffic-constants";
-+import type {
-+  DispatchDecision,
-+  QueuedRequest,
-+  RateLimitWindowState,
-+} from "../traffic-controller-internal";
-+import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils";
-+import type { TrafficRequestMetadata } from "../traffic-types";
-+import type {
-+  RateLimitHeaderSnapshot,
-+  RateLimitStrategy,
-+  RateLimitUpdateResult,
-+} from "./rate-limit-strategy";
-+import { parseResetDurationToMs } from "./rate-limit-utils";
-+
-+export class DefaultRateLimitStrategy implements RateLimitStrategy {
-+  private state?: RateLimitWindowState;
-+  private readonly key: string;
-+
-+  constructor(key: string) {
-+    this.key = key;
-+  }
-+
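-+  // State is seeded lazily from response headers via updateFromHeaders below; until the
-+  // first response arrives, resolve() lets requests through unthrottled.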
-+  resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
-+    const rateLimitLogger = logger?.child({ module: "rate-limiter" });
-+    const state = this.state;
-+    if (!state) {
-+      rateLimitLogger?.trace?.("Rate limit state missing; allow request", {
-+        rateLimitKey: this.key,
-+      });
-+      return null;
-+    }
-+
-+    const now = Date.now();
-+    const effectiveRemaining = Math.max(0, state.remaining - state.reserved);
-+    const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS;
-+
-+    if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) {
-+      if (now < probeAt) {
-+        rateLimitLogger?.debug?.("Rate limit exhausted; waiting for probe", {
-+          rateLimitKey: this.key,
-+          remaining: state.remaining,
-+          reserved: state.reserved,
-+          effectiveRemaining,
-+          resetAt: state.resetAt,
-+          probeAt,
-+        });
-+        return { kind: "wait", wakeUpAt: probeAt };
-+      }
-+      if (state.reserved > 0) {
-+        rateLimitLogger?.debug?.("Rate limit exhausted but in-flight reservations exist; waiting", {
-+          rateLimitKey: this.key,
-+          remaining: state.remaining,
-+          reserved: state.reserved,
-+          effectiveRemaining,
-+          resetAt: state.resetAt,
-+        });
-+        return { kind: "wait" };
-+      }
-+    }
-+
-+    if (now < state.nextAllowedAt) {
-+      rateLimitLogger?.debug?.("Rate limit pacing; waiting until nextAllowedAt", {
-+        rateLimitKey: this.key,
-+        nextAllowedAt: state.nextAllowedAt,
-+        resetAt: state.resetAt,
-+        waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now,
-+      });
-+      return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) };
-+    }
-+
-+    state.reserved += 1;
-+    next.rateLimitKey = this.key;
-+    rateLimitLogger?.trace?.("Reserved rate limit token", {
-+      rateLimitKey: this.key,
-+      reserved: state.reserved,
-+      remaining: state.remaining,
-+      resetAt: state.resetAt,
-+      nextAllowedAt: state.nextAllowedAt,
-+    });
-+
-+    const remainingWindowMs = Math.max(0, state.resetAt - now);
-+    const intervalMs = Math.max(
-+      RATE_LIMIT_MIN_PACE_INTERVAL_MS,
-+      Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)),
-+    );
-+
-+    const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs);
-+    if (
-+      state.nextAllowedAt <= now ||
-+      candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS
-+    ) {
-+      state.nextAllowedAt = candidateNext;
-+      rateLimitLogger?.trace?.("Updated pacing nextAllowedAt", {
-+        rateLimitKey: this.key,
-+        nextAllowedAt: state.nextAllowedAt,
-+        intervalMs,
-+        remainingWindowMs,
-+        effectiveRemaining,
-+      });
-+    }
-+
-+    return null;
-+  }
-+
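-+  // Pacing example: 5_000ms left in the window and 10 effective slots spreads dispatches
-+  // roughly 500ms apart instead of letting them burst and exhaust the window early.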
-+  onDispatch(_logger?: Logger): void {}
-+
-+  onComplete(logger?: Logger): void {
-+    const rateLimitLogger = logger?.child({ module: "rate-limiter" });
-+    const state = this.state;
-+    if (!state || state.reserved <= 0) return;
-+    state.reserved -= 1;
-+    rateLimitLogger?.trace?.("Released rate limit reservation", {
-+      rateLimitKey: this.key,
-+      reserved: state.reserved,
-+      remaining: state.remaining,
-+      resetAt: state.resetAt,
-+      nextAllowedAt: state.nextAllowedAt,
-+    });
-+  }
-+
-+  updateFromHeaders(
-+    _metadata: TrafficRequestMetadata | undefined,
-+    headers: unknown,
-+    logger?: Logger,
-+  ): RateLimitUpdateResult | undefined {
-+    const rateLimitLogger = logger?.child({ module: "rate-limiter" });
-+    const limitRequests = readHeaderValue(headers, "x-ratelimit-limit-requests");
-+    const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests");
-+    const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests");
-+    const retryAfter = readHeaderValue(headers, "retry-after");
-+    const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined;
-+
-+    const now = Date.now();
-+    const existing = this.state;
-+    let state: RateLimitWindowState | undefined;
-+    let headerSnapshot: RateLimitHeaderSnapshot | undefined;
-+
-+    if (limitRequests && remainingRequests && resetRequests) {
-+      const limit = Number(limitRequests);
-+      const remaining = Number(remainingRequests);
-+      if (!Number.isFinite(limit) || !Number.isFinite(remaining)) {
-+        rateLimitLogger?.debug?.("Invalid rate limit numeric headers; skipping", {
-+          rateLimitKey: this.key,
-+          limitRequests,
-+          remainingRequests,
-+        });
-+        return undefined;
-+      }
-+
-+      const resetRequestsMs = parseResetDurationToMs(resetRequests);
-+      if (resetRequestsMs === undefined) {
-+        rateLimitLogger?.debug?.("Unable to parse reset duration; skipping", {
-+          rateLimitKey: this.key,
-+          resetRequests,
-+        });
-+        return undefined;
-+      }
-+
-+      const parsedResetAt = now + resetRequestsMs;
-+      const isSameWindow = !!existing && now < existing.resetAt;
-+      const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt;
-+      const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now;
-+      const reserved = Math.max(0, existing?.reserved ?? 0);
-+
-+      state = {
-+        limit,
-+        remaining: isSameWindow ? Math.min(existing.remaining, remaining) : remaining,
-+        resetAt,
-+        reserved,
-+        nextAllowedAt,
-+      };
-+      headerSnapshot = {
-+        limitRequests,
-+        remainingRequests,
-+        resetRequests,
-+        resetRequestsMs,
-+      };
-+    } else if (retryAfterMs === undefined) {
-+      rateLimitLogger?.trace?.("Missing rate limit headers; skipping", {
-+        rateLimitKey: this.key,
-+        hasLimit: !!limitRequests,
-+        hasRemaining: !!remainingRequests,
-+        hasReset: !!resetRequests,
-+        hasRetryAfter: !!retryAfter,
-+      });
-+      return undefined;
-+    }
-+
-+    if (!state) {
-+      if (retryAfterMs === undefined) {
-+        rateLimitLogger?.trace?.("Retry-After missing or unparsable; skipping", {
-+          rateLimitKey: this.key,
-+          retryAfter,
-+        });
-+        return undefined;
-+      }
-+      const targetAt = now + retryAfterMs;
-+      const isSameWindow = !!existing && now < existing.resetAt;
-+      state = {
-+        limit: existing?.limit ?? 1,
-+        remaining: 0,
-+        resetAt: isSameWindow ? Math.max(existing.resetAt, targetAt) : targetAt,
-+        reserved: Math.max(0, existing?.reserved ?? 0),
-+        nextAllowedAt: Math.max(existing?.nextAllowedAt ?? now, targetAt),
-+      };
-+      headerSnapshot = { retryAfter, retryAfterMs };
-+    } else if (retryAfterMs !== undefined) {
-+      const targetAt = now + retryAfterMs;
-+      state = {
-+        ...state,
-+        remaining: 0,
-+        resetAt: Math.max(state.resetAt, targetAt),
-+        nextAllowedAt: Math.max(state.nextAllowedAt, targetAt),
-+      };
-+      headerSnapshot = { ...headerSnapshot, retryAfter, retryAfterMs };
-+    }
-+
-+    this.state = state;
-+    rateLimitLogger?.debug?.("Applied rate limit headers to state", {
-+      rateLimitKey: this.key,
-+      limit: state.limit,
-+      remaining: state.remaining,
-+      effectiveRemaining: Math.max(0, state.remaining - state.reserved),
-+      resetAt: state.resetAt,
-+      nextAllowedAt: state.nextAllowedAt,
-+      resetRequestsMs: headerSnapshot?.resetRequestsMs,
-+      retryAfterMs: headerSnapshot?.retryAfterMs,
-+    });
-+
-+    return {
-+      key: this.key,
-+      headerSnapshot: headerSnapshot ?? {},
-+      state,
-+    };
-+  }
-+}
-diff --git a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
-new file mode 100644
-index 00000000..8e8b6f86
---- /dev/null
-+++ b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
-@@ -0,0 +1,265 @@
-+import type { Logger } from "../../logger";
-+import {
-+  RATE_LIMIT_EXHAUSTION_BUFFER,
-+  RATE_LIMIT_MIN_PACE_INTERVAL_MS,
-+  RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS,
-+  RATE_LIMIT_PROBE_DELAY_MS,
-+} from "../traffic-constants";
-+import type {
-+  DispatchDecision,
-+  QueuedRequest,
-+  RateLimitWindowState,
-+} from "../traffic-controller-internal";
-+import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types";
-+import { DefaultRateLimitStrategy } from "./default-rate-limit-strategy";
-+import type {
-+  RateLimitStrategy,
-+  RateLimitUpdateResult,
-+  RateLimitUsage,
-+} from "./rate-limit-strategy";
-+
-+export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy {
-+  readonly handlesTokenLimits: boolean;
-+  private readonly window: DefaultRateLimitStrategy;
-+  private readonly key: string;
-+  private readonly requestsPerMinute?: number;
-+  private readonly tokensPerMinute?: number;
-+  private requestState?: RateLimitWindowState;
-+  private tokenState?: RateLimitWindowState;
-+  private bootstrapReserved = 0;
-+  private readonly windowMs = 60_000;
-+
-+  constructor(key: string, options?: RateLimitOptions) {
-+    this.key = key;
-+    this.window = new DefaultRateLimitStrategy(key);
-+    // Window strategy enforces fixed 60s windows; burstSize is intentionally ignored here.
-+    this.requestsPerMinute = this.normalizeLimit(options?.requestsPerMinute);
-+    this.tokensPerMinute = this.normalizeLimit(options?.tokensPerMinute);
-+    this.handlesTokenLimits = this.tokensPerMinute !== undefined;
-+  }
-+
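-+  // e.g. requestsPerMinute: 60 admits at most 60 dispatches per fixed 60s window, paced
-+  // about one per second; with no explicit limits the strategy instead relies on
-+  // header-driven state plus a single bootstrap probe request.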
-+  resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
-+    if (this.requestsPerMinute !== undefined) {
-+      const requestDecision = this.resolveRequestWindow(next, logger);
-+      if (requestDecision) return requestDecision;
-+      const tokenDecision = this.resolveTokenWindow(logger);
-+      if (tokenDecision) return tokenDecision;
-+      return null;
-+    }
-+
-+    const decision = this.window.resolve(next, logger);
-+    if (decision) return decision;
-+
-+    if (next.rateLimitKey) {
-+      return null;
-+    }
-+
-+    const rateLimitLogger = logger?.child({ module: "rate-limiter" });
-+    if (this.bootstrapReserved >= 1) {
-+      rateLimitLogger?.debug?.("OpenAI rate limit bootstrap active; waiting", {
-+        rateLimitKey: this.key,
-+        bootstrapReserved: this.bootstrapReserved,
-+      });
-+      return { kind: "wait" };
-+    }
-+
-+    this.bootstrapReserved += 1;
-+    next.rateLimitKey = this.key;
-+    rateLimitLogger?.debug?.("OpenAI rate limit bootstrap reserved", {
-+      rateLimitKey: this.key,
-+      bootstrapReserved: this.bootstrapReserved,
-+    });
-+    return null;
-+  }
-+
-+  onDispatch(logger?: Logger): void {
-+    if (this.requestsPerMinute === undefined) {
-+      this.window.onDispatch(logger);
-+    }
-+  }
-+
-+  onComplete(logger?: Logger): void {
-+    if (this.requestsPerMinute !== undefined) {
-+      const now = Date.now();
-+      const state = this.ensureRequestState(now);
-+      if (state.reserved > 0) {
-+        state.reserved -= 1;
-+      }
-+      state.remaining = Math.max(0, state.remaining - 1);
-+      return;
-+    }
-+
-+    if (this.bootstrapReserved > 0) {
-+      this.bootstrapReserved -= 1;
-+    }
-+    this.window.onComplete(logger);
-+  }
-+
-+  recordUsage(usage: RateLimitUsage, logger?: Logger): void {
-+    if (this.tokensPerMinute === undefined) return;
-+    const tokens = this.resolveTokenCount(usage);
-+    if (tokens <= 0) return;
-+
-+    const now = Date.now();
-+    const state = this.ensureTokenState(now);
-+    state.remaining = Math.max(0, state.remaining - tokens);
-+    logger?.child({ module: "rate-limiter" })?.trace?.("OpenAI token usage recorded", {
-+      rateLimitKey: this.key,
-+      tokens,
-+      remaining: state.remaining,
-+      resetAt: state.resetAt,
-+    });
-+  }
-+
-+  updateFromHeaders(
-+    metadata: TrafficRequestMetadata | undefined,
-+    headers: unknown,
-+    logger?: Logger,
-+  ): RateLimitUpdateResult | undefined {
-+    if (this.requestsPerMinute !== undefined) {
-+      return undefined;
-+    }
-+    return this.window.updateFromHeaders(metadata, headers, logger);
-+  }
-+
-+  private resolveRequestWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
-+    const rateLimitLogger = logger?.child({ module: "rate-limiter" });
-+    const now = Date.now();
-+    const state = this.ensureRequestState(now);
-+    const effectiveRemaining = Math.max(0, state.remaining - state.reserved);
-+    const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS;
-+
-+    if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) {
-+      if (now < probeAt) {
-+        rateLimitLogger?.debug?.("OpenAI request window exhausted; waiting for probe", {
-+          rateLimitKey: this.key,
-+          remaining: state.remaining,
-+          reserved: state.reserved,
-+          effectiveRemaining,
-+          resetAt: state.resetAt,
-+          probeAt,
-+        });
-+        return { kind: "wait", wakeUpAt: probeAt };
-+      }
-+      if (state.reserved > 0) {
-+        rateLimitLogger?.debug?.(
-+          "OpenAI request window exhausted but in-flight reservations exist; waiting",
-+          {
-+            rateLimitKey: this.key,
-+            remaining: state.remaining,
-+            reserved: state.reserved,
-+            effectiveRemaining,
-+            resetAt: state.resetAt,
-+          },
-+        );
-+        return { kind: "wait" };
-+      }
-+    }
-+
-+    if (now < state.nextAllowedAt) {
-+      rateLimitLogger?.debug?.("OpenAI request window pacing; waiting until nextAllowedAt", {
-+        rateLimitKey: this.key,
-+        nextAllowedAt: state.nextAllowedAt,
-+        resetAt: state.resetAt,
-+        waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now,
-+      });
-+      return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) };
-+    }
-+
-+    state.reserved += 1;
-+    next.rateLimitKey = this.key;
-+    rateLimitLogger?.trace?.("Reserved OpenAI request window slot", {
-+      rateLimitKey: this.key,
-+      reserved: state.reserved,
-+      remaining: state.remaining,
-+      resetAt: state.resetAt,
-+      nextAllowedAt: state.nextAllowedAt,
-+    });
-+
-+    const remainingWindowMs = Math.max(0, state.resetAt - now);
-+    const intervalMs = Math.max(
-+      RATE_LIMIT_MIN_PACE_INTERVAL_MS,
-+      Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)),
-+    );
-+
-+    const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs);
-+    if (
-+      state.nextAllowedAt <= now ||
-+      candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS
-+    ) {
-+      state.nextAllowedAt = candidateNext;
-+      rateLimitLogger?.trace?.("Updated OpenAI request pacing nextAllowedAt", {
-+        rateLimitKey: this.key,
-+        nextAllowedAt: state.nextAllowedAt,
-+        intervalMs,
-+        remainingWindowMs,
-+        effectiveRemaining,
-+      });
-+    }
-+
-+    return null;
-+  }
-+
-+  private resolveTokenWindow(logger?: Logger): DispatchDecision | null {
-+    if (this.tokensPerMinute === undefined) return null;
-+    const rateLimitLogger = logger?.child({ module: "rate-limiter" });
-+    const now = Date.now();
-+    const state = this.ensureTokenState(now);
-+
-+    if (state.remaining > 0) return null;
-+
-+    const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS;
-+    rateLimitLogger?.debug?.("OpenAI token window exhausted; waiting", {
-+      rateLimitKey: this.key,
-+      remaining: state.remaining,
-+      resetAt: state.resetAt,
-+      probeAt,
-+    });
-+    return { kind: "wait", wakeUpAt: probeAt };
-+  }
-+
-+  private ensureRequestState(now: number): RateLimitWindowState {
-+    const limit = this.requestsPerMinute ?? 0;
-+    const state = this.requestState;
-+    if (!state || now >= state.resetAt) {
-+      this.requestState = {
-+        limit,
-+        remaining: limit,
-+        resetAt: now + this.windowMs,
-+        reserved: 0,
-+        nextAllowedAt: now,
-+      };
-+      return this.requestState;
-+    }
-+    return state;
-+  }
-+
-+  private ensureTokenState(now: number): RateLimitWindowState {
-+    const limit = this.tokensPerMinute ?? 0;
-+    const state = this.tokenState;
-+    if (!state || now >= state.resetAt) {
-+      this.tokenState = {
-+        limit,
-+        remaining: limit,
-+        resetAt: now + this.windowMs,
-+        reserved: 0,
-+        nextAllowedAt: now,
-+      };
-+      return this.tokenState;
-+    }
-+    return state;
-+  }
-+
-+  private normalizeLimit(value: number | undefined): number | undefined {
-+    const numeric = typeof value === "number" ? value : Number(value);
-+    return Number.isFinite(numeric) && numeric > 0 ? numeric : undefined;
-+  }
-+
-+  private resolveTokenCount(usage: RateLimitUsage): number {
-+    const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined;
-+    if (total !== undefined) return total;
-+    const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0;
-+    const output = Number.isFinite(usage.outputTokens) ? usage.outputTokens : 0;
-+    return input + output;
-+  }
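-+  // e.g. usage of { inputTokens: 1200, outputTokens: 300 } deducts 1500 tokens from the
-+  // window when the provider reports no totalTokens.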
-+}
-diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts
-new file mode 100644
-index 00000000..6657c6b2
---- /dev/null
-+++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts
-@@ -0,0 +1,41 @@
-+import type { Logger } from "../../logger";
-+import type {
-+  DispatchDecision,
-+  QueuedRequest,
-+  RateLimitWindowState,
-+} from "../traffic-controller-internal";
-+import type { TrafficRequestMetadata } from "../traffic-types";
-+
-+export type RateLimitHeaderSnapshot = {
-+  limitRequests?: string;
-+  remainingRequests?: string;
-+  resetRequests?: string;
-+  resetRequestsMs?: number;
-+  retryAfter?: string;
-+  retryAfterMs?: number;
-+};
-+
-+export type RateLimitUpdateResult = {
-+  key: string;
-+  headerSnapshot: RateLimitHeaderSnapshot;
-+  state: RateLimitWindowState;
-+};
-+
-+export type RateLimitUsage = {
-+  inputTokens?: number;
-+  outputTokens?: number;
-+  totalTokens?: number;
-+};
-+
-+export interface RateLimitStrategy {
-+  readonly handlesTokenLimits?: boolean;
-+  resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null;
-+  onDispatch(logger?: Logger): void;
-+  onComplete(logger?: Logger): void;
-+  recordUsage?(usage: RateLimitUsage, logger?: Logger): void;
-+  updateFromHeaders(
-+    metadata: TrafficRequestMetadata | undefined,
-+    headers: unknown,
-+    logger?: Logger,
-+  ): RateLimitUpdateResult | undefined;
-+}
-diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts
-new file mode 100644
-index 00000000..310c9a7e
---- /dev/null
-+++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts
-@@ -0,0 +1,26 @@
-+export function parseResetDurationToMs(raw: string): number | undefined {
-+  const value = raw.trim();
-+  if (!value) return undefined;
-+
-+  let totalMs = 0;
-+  const regex = /(\d+(?:\.\d+)?)(ms|s|m|h|d)/g;
-+  let matched = false;
-+  for (const match of value.matchAll(regex)) {
-+    matched = true;
-+    const amount = Number.parseFloat(match[1] ?? "");
-+    if (!Number.isFinite(amount)) continue;
-+    const unit = match[2];
-+    if (unit === "ms") totalMs += amount;
-+    else if (unit === "s") totalMs += amount * 1000;
-+    else if (unit === "m") totalMs += amount * 60_000;
-+    else if (unit === "h") totalMs += amount * 3_600_000;
-+    else if (unit === "d") totalMs += amount * 86_400_000;
-+  }
-+
-+  if (matched) {
-+    return Math.round(totalMs);
-+  }
-+
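-+  // e.g. "1m30s" -> 90_000 and "250ms" -> 250; a bare numeric string such as "30" skips
-+  // the unit regex and is treated as milliseconds by the Number() fallback below.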
-+  const n = Number(value);
-+  return Number.isFinite(n) ? Math.round(n) : undefined;
-+}
-diff --git a/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts
-new file mode 100644
-index 00000000..2ae7b189
---- /dev/null
-+++ b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts
-@@ -0,0 +1,210 @@
-+import type { Logger } from "../../logger";
-+import type {
-+  DispatchDecision,
-+  QueuedRequest,
-+  RateLimitWindowState,
-+} from "../traffic-controller-internal";
-+import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils";
-+import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types";
-+import type {
-+  RateLimitHeaderSnapshot,
-+  RateLimitStrategy,
-+  RateLimitUpdateResult,
-+} from "./rate-limit-strategy";
-+import { parseResetDurationToMs } from "./rate-limit-utils";
-+
-+type TokenBucketState = {
-+  capacity: number;
-+  refillPerSecond: number;
-+  tokens: number;
-+  updatedAt: number;
-+};
-+
-+function normalizeTokenBucketOptions(
-+  raw: RateLimitOptions | undefined,
-+): Omit {
-+  const requestsPerMinuteRaw = raw?.requestsPerMinute;
-+  const burstSizeRaw = raw?.burstSize;
-+
-+  const requestsPerMinute =
-+    typeof requestsPerMinuteRaw === "number" ? requestsPerMinuteRaw : Number(requestsPerMinuteRaw);
-+  const burstSize = typeof burstSizeRaw === "number" ? burstSizeRaw : Number(burstSizeRaw);
-+
-+  const safeRequestsPerMinute = Number.isFinite(requestsPerMinute) ? requestsPerMinute : 0;
-+  const safeBurst = Number.isFinite(burstSize) ? burstSize : safeRequestsPerMinute;
-+  const refillPerSecond = safeRequestsPerMinute > 0 ? safeRequestsPerMinute / 60 : 0;
-+
-+  return {
-+    capacity: safeBurst > 0 ? Math.max(1, safeBurst) : 0,
-+    refillPerSecond,
-+  };
-+}
-+function refillTokenBucket(bucket: TokenBucketState, now: number): void {
-+  const elapsedMs = now - bucket.updatedAt;
-+  if (elapsedMs <= 0) return;
-+  bucket.updatedAt = now;
-+  if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return;
-+
-+  const refill = (elapsedMs / 1000) * bucket.refillPerSecond;
-+  if (refill <= 0) return;
-+  bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill);
-+}
-+
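-+// e.g. requestsPerMinute: 120 yields refillPerSecond 2, so 4s of idle time restores 8
-+// tokens, capped at capacity (burstSize, defaulting to requestsPerMinute).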
-+export class TokenBucketRateLimitStrategy implements RateLimitStrategy {
-+  private readonly key: string;
-+  private bucket?: TokenBucketState;
-+  private cooldownUntil?: number;
-+
-+  constructor(key: string, options?: RateLimitOptions) {
-+    this.key = key;
-+    if (!options) return;
-+    const normalized = normalizeTokenBucketOptions(options);
-+    const now = Date.now();
-+    this.bucket = {
-+      ...normalized,
-+      tokens: normalized.capacity,
-+      updatedAt: now,
-+    };
-+  }
-+
-+  resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
-+    const rateLimitLogger = logger?.child({ module: "rate-limiter" });
-+    const now = Date.now();
-+
-+    if (this.cooldownUntil !== undefined && now < this.cooldownUntil) {
-+      rateLimitLogger?.debug?.("Token bucket cooldown active; waiting", {
-+        rateLimitKey: this.key,
-+        cooldownUntil: this.cooldownUntil,
-+        waitMs: this.cooldownUntil - now,
-+      });
-+      return { kind: "wait", wakeUpAt: this.cooldownUntil };
-+    }
-+
-+    const bucket = this.bucket;
-+    if (!bucket) return null;
-+
-+    refillTokenBucket(bucket, now);
-+
-+    if (bucket.capacity <= 0) {
-+      rateLimitLogger?.debug?.("Token bucket misconfigured; blocking", {
-+        rateLimitKey: this.key,
-+        capacity: bucket.capacity,
-+        refillPerSecond: bucket.refillPerSecond,
-+      });
-+      return { kind: "wait" };
-+    }
-+
-+    if (bucket.tokens >= 1) {
-+      bucket.tokens -= 1;
-+      next.rateLimitKey = this.key;
-+      rateLimitLogger?.trace?.("Consumed token bucket token", {
-+        rateLimitKey: this.key,
-+        tokens: bucket.tokens,
-+        capacity: bucket.capacity,
-+        refillPerSecond: bucket.refillPerSecond,
-+      });
-+      return null;
-+    }
-+
-+    if (bucket.refillPerSecond <= 0) {
-+      rateLimitLogger?.debug?.("Token bucket has no refill; blocking", {
-+        rateLimitKey: this.key,
-+        capacity: bucket.capacity,
-+        refillPerSecond: bucket.refillPerSecond,
-+      });
-+      return { kind: "wait" };
-+    }
-+
-+    const requiredTokens = 1 - bucket.tokens;
-+    const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000));
-+    const wakeUpAt = now + waitMs;
-+    rateLimitLogger?.debug?.("Token bucket empty; waiting", {
-+      rateLimitKey: this.key,
-+      tokens: bucket.tokens,
-+      capacity: bucket.capacity,
-+      refillPerSecond: bucket.refillPerSecond,
-+      wakeUpAt,
-+      waitMs,
-+    });
-+    return { kind: "wait", wakeUpAt };
-+  }
-+
-+  onDispatch(_logger?: Logger): void {}
-+
-+  onComplete(_logger?: Logger): void {}
-+
-+  updateFromHeaders(
-+    _metadata: TrafficRequestMetadata | undefined,
-+    headers: unknown,
-+    logger?: Logger,
-+  ): RateLimitUpdateResult | undefined {
-+    const rateLimitLogger = logger?.child({ module: "rate-limiter" });
-+    const now = Date.now();
-+
-+    const retryAfter = readHeaderValue(headers, "retry-after");
-+    const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter, now) : undefined;
-+
-+    const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests");
-+    const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests");
-+    const resetRequestsMs = resetRequests ? parseResetDurationToMs(resetRequests) : undefined;
-+
-+    let appliedUntil: number | undefined;
-+
-+    if (retryAfterMs !== undefined) {
-+      const targetAt = now + retryAfterMs;
-+      this.cooldownUntil =
-+        this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt);
-+      appliedUntil = this.cooldownUntil;
-+    }
-+
-+    if (remainingRequests && resetRequestsMs !== undefined) {
-+      const remaining = Number(remainingRequests);
-+      if (Number.isFinite(remaining) && remaining <= 0) {
-+        const targetAt = now + resetRequestsMs;
-+        this.cooldownUntil =
-+          this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt);
-+        appliedUntil = this.cooldownUntil;
-+      }
-+    }
-+
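-+    // e.g. "Retry-After: 2" parks the bucket until now + 2_000ms, and a remaining count
-+    // of 0 with a 30s reset extends that to 30s; Math.max means cooldowns only grow.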
-+    if (appliedUntil === undefined) {
-+      rateLimitLogger?.trace?.("No applicable cooldown headers; skipping", {
-+        rateLimitKey: this.key,
-+        hasRetryAfter: !!retryAfter,
-+        hasRemainingRequests: !!remainingRequests,
-+        hasResetRequests: !!resetRequests,
-+      });
-+      return undefined;
-+    }
-+
-+    rateLimitLogger?.debug?.("Applied token bucket cooldown from headers", {
-+      rateLimitKey: this.key,
-+      cooldownUntil: appliedUntil,
-+      inMs: Math.max(0, appliedUntil - now),
-+      retryAfterMs,
-+      resetRequestsMs,
-+    });
-+
-+    const headerSnapshot: RateLimitHeaderSnapshot = {
-+      remainingRequests,
-+      resetRequests,
-+      resetRequestsMs,
-+      retryAfter,
-+      retryAfterMs,
-+    };
-+
-+    const state: RateLimitWindowState = {
-+      limit: 1,
-+      remaining: 0,
-+      resetAt: appliedUntil,
-+      reserved: 0,
-+      nextAllowedAt: appliedUntil,
-+    };
-+
-+    return {
-+      key: this.key,
-+      headerSnapshot,
-+      state,
-+    };
-+  }
-+}
-diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts
-new file mode 100644
-index 00000000..f240ce40
---- /dev/null
-+++ b/packages/core/src/traffic/traffic-circuit-breaker.ts
-@@ -0,0 +1,454 @@
-+import type { Logger } from "../logger";
-+import {
-+  CIRCUIT_COOLDOWN_MS,
-+  CIRCUIT_FAILURE_THRESHOLD,
-+  CIRCUIT_FAILURE_WINDOW_MS,
-+  CIRCUIT_PROBE_INTERVAL_MS,
-+  CIRCUIT_TIMEOUT_THRESHOLD,
-+  CIRCUIT_TIMEOUT_WINDOW_MS,
-+  DEFAULT_FALLBACK_CHAINS,
-+} from "./traffic-constants";
-+import type {
-+  CircuitState,
-+  CircuitStateStatus,
-+  DispatchDecision,
-+  QueuedRequest,
-+} from "./traffic-controller-internal";
-+import { extractStatusCode, isTimeoutError } from "./traffic-error-utils";
-+import { CircuitBreakerOpenError } from "./traffic-errors";
-+import type {
-+  FallbackChainEntry,
-+  FallbackPolicy,
-+  FallbackPolicyConfig,
-+  FallbackTarget,
-+  TrafficRequestMetadata,
-+  TrafficResponseMetadata,
-+} from "./traffic-types";
-+
-+export class TrafficCircuitBreaker {
-+  private readonly circuitBreakers = new Map();
-+  private readonly fallbackChains: Map;
-+  private readonly fallbackPolicy?: FallbackPolicyConfig;
-+  private readonly buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string;
-+
-+  constructor(options: {
-+    fallbackChains?: Record;
-+    fallbackPolicy?: FallbackPolicyConfig;
-+    buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string;
-+  }) {
-+    this.buildRateLimitKey = options.buildRateLimitKey;
-+    const chains = options.fallbackChains ?? DEFAULT_FALLBACK_CHAINS;
-+    this.fallbackChains = new Map(Object.entries(chains));
-+    this.fallbackPolicy = options.fallbackPolicy;
-+  }
-+
-+  resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null {
-+    const circuitLogger = logger?.child({ module: "circuit-breaker" });
-+    const visitedKeys = new Set();
-+
-+    while (true) {
-+      const key = this.buildRateLimitKey(next.request.metadata);
-+      next.circuitKey = key;
-+      visitedKeys.add(key);
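-+      // visitedKeys breaks fallback cycles: a chain such as a -> b -> a stops as soon as
-+      // a circuit key repeats instead of looping forever.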
-+      circuitLogger?.trace?.("Circuit resolve step", {
-+        circuitKey: key,
-+        provider: next.request.metadata?.provider,
-+        model: next.request.metadata?.model,
-+      });
-+
-+      const evaluation = this.evaluateCircuitState(key, circuitLogger);
-+      next.circuitStatus = evaluation.state;
-+      circuitLogger?.debug?.("Circuit evaluated", {
-+        circuitKey: key,
-+        state: evaluation.state,
-+        allowRequest: evaluation.allowRequest,
-+        retryAfterMs: evaluation.retryAfterMs,
-+      });
-+
-+      if (evaluation.allowRequest) return null;
-+
-+      const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata);
-+      if (policy.mode === "wait") {
-+        const wakeUpAt =
-+          evaluation.retryAfterMs !== undefined ? Date.now() + evaluation.retryAfterMs : undefined;
-+        circuitLogger?.debug?.("Circuit open; waiting per fallback policy", {
-+          circuitKey: key,
-+          policyId,
-+          retryAfterMs: evaluation.retryAfterMs,
-+          wakeUpAt,
-+        });
-+        return { kind: "wait", wakeUpAt };
-+      }
-+
-+      const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger);
-+      circuitLogger?.debug?.("Circuit open; attempting fallback", {
-+        circuitKey: key,
-+        currentModel: next.request.metadata?.model,
-+        fallback,
-+        visitedKeys: Array.from(visitedKeys),
-+      });
-+      if (!fallback || !next.request.createFallbackRequest) {
-+        const error = new CircuitBreakerOpenError(
-+          `Circuit open for ${key}`,
-+          next.request.metadata,
-+          evaluation.retryAfterMs,
-+        );
-+        const traffic: TrafficResponseMetadata = {
-+          rateLimitKey: key,
-+          retryAfterMs: evaluation.retryAfterMs,
-+          tenantId: next.request.metadata?.tenantId ?? next.tenantId,
-+          priority: next.request.metadata?.priority,
-+          taskType: next.request.metadata?.taskType,
-+        };
-+        (error as Record).traffic = traffic;
-+        next.reject(error);
-+        circuitLogger?.warn?.("No fallback available; rejecting request", {
-+          circuitKey: key,
-+          retryAfterMs: evaluation.retryAfterMs,
-+        });
-+        return { kind: "skip" };
-+      }
-+
-+      const fallbackRequest = next.request.createFallbackRequest(fallback);
-+      if (!fallbackRequest) {
-+        circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", {
-+          circuitKey: key,
-+          fallback,
-+        });
-+        return { kind: "skip" };
-+      }
-+
-+      this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, {
-+        previousCircuitKey: key,
-+        reason: "circuit-open",
-+      });
-+    }
-+  }
-+
-+  tryFallback(next: QueuedRequest, reason: "queue-timeout", logger?: Logger): boolean {
-+    const circuitLogger = logger?.child({ module: "circuit-breaker" });
-+    const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata);
-+    if (policy.mode === "wait") {
-+      circuitLogger?.debug?.("Fallback skipped by policy", {
-+        policyId,
-+        reason,
-+        provider: next.request.metadata?.provider,
-+        model: next.request.metadata?.model,
-+      });
-+      return false;
-+    }
-+
-+    const visitedKeys = new Set();
-+    const key = this.buildRateLimitKey(next.request.metadata);
-+    visitedKeys.add(key);
-+
-+    const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger);
-+    if (!fallback || !next.request.createFallbackRequest) {
-+      circuitLogger?.debug?.("Fallback unavailable for request", {
-+        reason,
-+        provider: next.request.metadata?.provider,
-+        model: next.request.metadata?.model,
-+        fallback,
-+      });
-+      return false;
-+    }
-+
-+    const fallbackRequest = next.request.createFallbackRequest(fallback);
-+    if (!fallbackRequest) {
-+      circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", {
-+        reason,
-+        fallback,
-+      });
-+      return false;
-+    }
-+
-+    this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, {
-+      previousCircuitKey: key,
-+      reason,
-+      policyId,
-+    });
-+    return true;
-+  }
-+
-+  markTrial(item: QueuedRequest, logger?: Logger): void {
-+    const circuitLogger = logger?.child({ module: "circuit-breaker" });
-+    const key = item.circuitKey;
-+    if (!key) return;
-+    const state = this.circuitBreakers.get(key);
-+    if (state && state.status === "half-open" && !state.trialInFlight) {
-+      state.trialInFlight = true;
-+      circuitLogger?.debug?.("Marked half-open trial in flight", { circuitKey: key });
-+    }
-+  }
-+
-+  recordSuccess(metadata?: TrafficRequestMetadata, logger?: Logger): void {
-+    const circuitLogger = logger?.child({ module: "circuit-breaker" });
-+    const key = this.buildRateLimitKey(metadata);
-+    this.circuitBreakers.delete(key);
-+    circuitLogger?.debug?.("Circuit success; cleared circuit state", {
-+      circuitKey: key,
-+      provider: metadata?.provider,
-+      model: metadata?.model,
-+    });
-+  }
-+
-+  recordFailure(
-+    metadata: TrafficRequestMetadata | undefined,
-+    error: unknown,
-+    logger?: Logger,
-+  ): void {
-+    const circuitLogger = logger?.child({ module: "circuit-breaker" });
-+    const key = this.buildRateLimitKey(metadata);
-+    const status = extractStatusCode(error, logger);
-+    const isTimeout = status === 408 || isTimeoutError(error, logger);
-+    const isStatusEligible = this.isCircuitBreakerStatus(status);
-+    const isTimeoutEligible = !isStatusEligible && isTimeout;
-+    const isEligible = isStatusEligible || isTimeoutEligible;
-+
-+    circuitLogger?.debug?.("Circuit failure observed", {
-+      circuitKey: key,
-+      status,
-+      isTimeout,
-+      eligible: isEligible,
-+      provider: metadata?.provider,
-+      model: metadata?.model,
-+    });
-+
-+    if (!isEligible) {
-+      this.circuitBreakers.delete(key);
-+      circuitLogger?.debug?.("Failure not eligible for circuit breaker; cleared circuit state", {
-+        circuitKey: key,
-+        status,
-+        isTimeout,
-+      });
-+      return;
-+    }
-+
-+    const now = Date.now();
-+    const state =
-+      this.circuitBreakers.get(key) ??
-+      ({ status: "closed", failureTimestamps: [], timeoutTimestamps: [] } as CircuitState);
-+
-+    state.failureTimestamps = state.failureTimestamps.filter(
-+      (t) => now - t <= CIRCUIT_FAILURE_WINDOW_MS,
-+    );
-+    state.timeoutTimestamps = state.timeoutTimestamps.filter(
-+      (t) => now - t <= CIRCUIT_TIMEOUT_WINDOW_MS,
-+    );
-+
-+    state.failureTimestamps.push(now);
-+    if (isTimeoutEligible) {
-+      state.timeoutTimestamps.push(now);
-+    }
-+
-+    if (
-+      state.status === "half-open" ||
-+      state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD ||
-+      state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD
-+    ) {
-+      const openReasons: string[] = [];
-+      if (state.status === "half-open") openReasons.push("half-open-failure");
-+      if (state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD) {
-+        openReasons.push("failure-threshold");
-+      }
-+      if (state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD) {
-+        openReasons.push("timeout-threshold");
-+      }
-+
-+      state.status = "open";
-+      state.openedAt = now;
-+      state.trialInFlight = false;
-+      state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS;
-+      circuitLogger?.warn?.("Circuit opened", {
-+        circuitKey: key,
-+        openReasons,
-+        status,
-+        isTimeout,
-+        failureCount: state.failureTimestamps.length,
-+        failureThreshold: CIRCUIT_FAILURE_THRESHOLD,
-+        timeoutCount: state.timeoutTimestamps.length,
-+        timeoutThreshold: CIRCUIT_TIMEOUT_THRESHOLD,
-+        openedAt: state.openedAt,
-+      });
-+    }
-+
-+    this.circuitBreakers.set(key, state);
-+    circuitLogger?.trace?.("Circuit state updated", {
-+      circuitKey: key,
-+      status: state.status,
-+      failureCount: state.failureTimestamps.length,
-+      failureWindowMs: CIRCUIT_FAILURE_WINDOW_MS,
-+      timeoutCount: state.timeoutTimestamps.length,
-+      timeoutWindowMs: CIRCUIT_TIMEOUT_WINDOW_MS,
-+    });
-+  }
-+
"cooldown" : "probe", -+ }); -+ return { allowRequest: true, state: "half-open" }; -+ } -+ return { -+ allowRequest: false, -+ state: "open", -+ retryAfterMs: Math.min(cooldownRemaining, probeRemaining), -+ }; -+ } -+ -+ if (state.status === "half-open" && state.trialInFlight) { -+ return { allowRequest: false, state: "half-open" }; -+ } -+ -+ return { allowRequest: true, state: state.status }; -+ } -+ -+ private resolveFallbackPolicy(metadata: TrafficRequestMetadata | undefined): { -+ policy: FallbackPolicy; -+ policyId?: string; -+ } { -+ const policyId = -+ metadata?.fallbackPolicyId ?? -+ (metadata?.taskType -+ ? this.fallbackPolicy?.taskTypePolicyIds?.[metadata.taskType] -+ : undefined) ?? -+ this.fallbackPolicy?.defaultPolicyId; -+ -+ const policy = policyId ? this.fallbackPolicy?.policies?.[policyId] : undefined; -+ return { -+ policy: policy ?? { mode: "fallback" }, -+ policyId, -+ }; -+ } -+ -+ private applyFallbackRequest( -+ next: QueuedRequest, -+ fallbackRequest: QueuedRequest["request"], -+ fallback: FallbackChainEntry, -+ logger?: Logger, -+ context?: { previousCircuitKey?: string; reason?: string; policyId?: string }, -+ ): void { -+ next.request = fallbackRequest; -+ next.attempt = 1; -+ next.tenantConcurrencyKey = undefined; -+ next.providerModelConcurrencyKey = undefined; -+ next.rateLimitKey = undefined; -+ next.etaMs = undefined; -+ next.circuitKey = undefined; -+ next.circuitStatus = undefined; -+ logger?.debug?.("Switched to fallback request", { -+ previousCircuitKey: context?.previousCircuitKey, -+ fallbackModel: fallback, -+ reason: context?.reason, -+ policyId: context?.policyId, -+ }); -+ } -+ -+ private findFallbackTarget( -+ metadata: TrafficRequestMetadata | undefined, -+ visitedKeys: Set, -+ logger?: Logger, -+ ): FallbackChainEntry | undefined { -+ const currentModel = metadata?.model; -+ if (!currentModel) { -+ logger?.trace?.("No current model; no fallback", {}); -+ return undefined; -+ } -+ -+ const provider = metadata?.provider; -+ const chain = this.resolveFallbackChain(provider, currentModel); -+ if (!chain) { -+ logger?.trace?.("No fallback chain for model", { -+ currentModel, -+ provider, -+ }); -+ return undefined; -+ } -+ -+ for (const candidate of chain) { -+ const target = this.normalizeFallbackTarget(candidate, provider); -+ const candidateMetadata: TrafficRequestMetadata = { -+ ...(metadata ?? {}), -+ provider: target.provider ?? provider, -+ model: target.model, -+ }; -+ const candidateKey = this.buildRateLimitKey(candidateMetadata); -+ if (visitedKeys.has(candidateKey)) { -+ continue; -+ } -+ -+ const evaluation = this.evaluateCircuitState(candidateKey, logger); -+ if (evaluation.allowRequest) { -+ visitedKeys.add(candidateKey); -+ logger?.debug?.("Selected fallback target", { -+ currentModel, -+ currentProvider: provider, -+ fallbackModel: target.model, -+ fallbackProvider: target.provider ?? provider, -+ fallbackCircuitKey: candidateKey, -+ }); -+ return candidate; -+ } -+ } -+ -+ return undefined; -+ } -+ -+ private resolveFallbackChain( -+ provider: string | undefined, -+ model: string, -+ ): FallbackChainEntry[] | undefined { -+ const providerKey = provider ? 
`${provider}::${model}` : undefined;
-+ if (providerKey) {
-+ const providerChain = this.fallbackChains.get(providerKey);
-+ if (providerChain) return providerChain;
-+ }
-+ return this.fallbackChains.get(model);
-+ }
-+
-+ private normalizeFallbackTarget(
-+ candidate: FallbackChainEntry,
-+ provider: string | undefined,
-+ ): FallbackTarget {
-+ if (typeof candidate === "string") {
-+ return { provider, model: candidate };
-+ }
-+ return {
-+ provider: candidate.provider ?? provider,
-+ model: candidate.model,
-+ };
-+ }
-+
-+ private isCircuitBreakerStatus(status?: number): boolean {
-+ return status === 429 || (status !== undefined && status >= 500);
-+ }
-+}
-diff --git a/packages/core/src/traffic/traffic-concurrency-limiter.ts b/packages/core/src/traffic/traffic-concurrency-limiter.ts
-new file mode 100644
-index 00000000..e1525612
---- /dev/null
-+++ b/packages/core/src/traffic/traffic-concurrency-limiter.ts
-@@ -0,0 +1,235 @@
-+import type { Logger } from "../logger";
-+import type { QueuedRequest } from "./traffic-controller-internal";
-+import type {
-+ ProviderModelConcurrencyLimit,
-+ TenantConcurrencyLimit,
-+ TrafficRequestMetadata,
-+} from "./traffic-types";
-+
-+export type ConcurrencyBlockReason =
-+ | {
-+ gate: "providerModel";
-+ key: string;
-+ inFlight: number;
-+ limit: number;
-+ }
-+ | {
-+ gate: "tenant";
-+ key: string;
-+ inFlight: number;
-+ limit: number;
-+ };
-+
-+export type ConcurrencyDecision =
-+ | { kind: "allow" }
-+ | { kind: "wait"; reasons: ConcurrencyBlockReason[] };
-+
-+function toNonNegativeIntegerLimit(raw: unknown): number | undefined {
-+ if (raw === undefined || raw === null) return undefined;
-+ const n = typeof raw === "number" ? raw : Number(raw);
-+ if (!Number.isFinite(n)) return undefined;
-+ if (n <= 0) return 0;
-+ return Math.floor(n);
-+}
-+
-+function getInFlight(map: Map<string, number>, key: string): number {
-+ return map.get(key) ?? 0;
-+}
-+
-+function incrementInFlight(map: Map<string, number>, key: string): void {
-+ map.set(key, getInFlight(map, key) + 1);
-+}
-+
-+function decrementInFlight(map: Map<string, number>, key: string): void {
-+ const current = getInFlight(map, key);
-+ if (current <= 1) {
-+ map.delete(key);
-+ return;
-+ }
-+ map.set(key, current - 1);
-+}
-+
-+export class TrafficConcurrencyLimiter {
-+ private readonly inFlightByProviderModel = new Map<string, number>();
-+ private readonly inFlightByTenant = new Map<string, number>();
-+
-+ private readonly buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string;
-+ private readonly providerModelLimit?: ProviderModelConcurrencyLimit;
-+ private readonly tenantLimit?: TenantConcurrencyLimit;
-+ private readonly providerModelEnabled: boolean;
-+ private readonly tenantEnabled: boolean;
-+
-+ constructor(options: {
-+ buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string;
-+ maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit;
-+ maxConcurrentPerTenant?: TenantConcurrencyLimit;
-+ }) {
-+ this.buildProviderModelKey = options.buildProviderModelKey;
-+ this.providerModelLimit = options.maxConcurrentPerProviderModel;
-+ this.tenantLimit = options.maxConcurrentPerTenant;
-+ this.providerModelEnabled = options.maxConcurrentPerProviderModel !== undefined;
-+ this.tenantEnabled = options.maxConcurrentPerTenant !== undefined;
-+ }
-+
-+ resolve(next: QueuedRequest<unknown>, logger?: Logger): ConcurrencyDecision {
-+ if (!this.providerModelEnabled && !this.tenantEnabled) return { kind: "allow" };
-+ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" });
-+ const reasons: ConcurrencyBlockReason[] = [];
-+
-+ if (this.providerModelEnabled) {
-+ const providerModelKey = this.buildProviderModelKey(next.request.metadata);
-+ const providerModelLimit = this.resolveProviderModelLimit(
-+ providerModelKey,
-+ next.request.metadata,
-+ concurrencyLogger,
-+ );
-+ if (providerModelLimit !== undefined) {
-+ const inFlight = getInFlight(this.inFlightByProviderModel, providerModelKey);
-+ if (inFlight >= providerModelLimit) {
-+ reasons.push({
-+ gate: "providerModel",
-+ key: providerModelKey,
-+ inFlight,
-+ limit: providerModelLimit,
-+ });
-+ }
-+ }
-+ }
-+
-+ if (this.tenantEnabled) {
-+ const tenantKey = next.tenantId;
-+ const tenantLimit = this.resolveTenantLimit(
-+ tenantKey,
-+ next.request.metadata,
-+ concurrencyLogger,
-+ );
-+ if (tenantLimit !== undefined) {
-+ const inFlight = getInFlight(this.inFlightByTenant, tenantKey);
-+ if (inFlight >= tenantLimit) {
-+ reasons.push({
-+ gate: "tenant",
-+ key: tenantKey,
-+ inFlight,
-+ limit: tenantLimit,
-+ });
-+ }
-+ }
-+ }
-+
-+ if (reasons.length === 0) return { kind: "allow" };
-+
-+ concurrencyLogger?.trace?.("Concurrency gate blocked request", {
-+ tenantId: next.tenantId,
-+ reasons,
-+ });
-+ return { kind: "wait", reasons };
-+ }
-+
-+ acquire(next: QueuedRequest<unknown>, logger?: Logger): void {
-+ if (!this.providerModelEnabled && !this.tenantEnabled) return;
-+ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" });
-+
-+ let tenantKey: string | undefined;
-+ if (this.tenantEnabled) {
-+ tenantKey = next.tenantId;
-+ next.tenantConcurrencyKey = tenantKey;
-+ incrementInFlight(this.inFlightByTenant, tenantKey);
-+ }
-+
-+ let providerModelKey: string | undefined;
-+ if (this.providerModelEnabled) {
-+ providerModelKey = this.buildProviderModelKey(next.request.metadata);
-+ next.providerModelConcurrencyKey = providerModelKey;
-+ incrementInFlight(this.inFlightByProviderModel, providerModelKey);
-+ }
-+ 
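-+ // acquire() runs only after resolve() has returned "allow"; the keys it stamps on
-+ // the request (tenantConcurrencyKey / providerModelConcurrencyKey) are exactly what
-+ // release() decrements later, so a retried item that rebuilds its key cannot
-+ // decrement the wrong bucket.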
-+ concurrencyLogger?.trace?.("Concurrency slots acquired", { -+ tenantId: tenantKey, -+ tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined, -+ providerModelKey, -+ providerModelInFlight: providerModelKey -+ ? getInFlight(this.inFlightByProviderModel, providerModelKey) -+ : undefined, -+ }); -+ } -+ -+ release(next: QueuedRequest, logger?: Logger): void { -+ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); -+ const tenantKey = next.tenantConcurrencyKey; -+ const providerModelKey = next.providerModelConcurrencyKey; -+ -+ if (tenantKey) { -+ decrementInFlight(this.inFlightByTenant, tenantKey); -+ } -+ -+ if (providerModelKey) { -+ decrementInFlight(this.inFlightByProviderModel, providerModelKey); -+ } -+ -+ if (tenantKey || providerModelKey) { -+ concurrencyLogger?.trace?.("Concurrency slots released", { -+ tenantId: tenantKey, -+ tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined, -+ providerModelKey, -+ providerModelInFlight: providerModelKey -+ ? getInFlight(this.inFlightByProviderModel, providerModelKey) -+ : undefined, -+ }); -+ } -+ -+ next.tenantConcurrencyKey = undefined; -+ next.providerModelConcurrencyKey = undefined; -+ } -+ -+ private resolveTenantLimit( -+ tenantId: string, -+ metadata: TrafficRequestMetadata | undefined, -+ logger?: Logger, -+ ): number | undefined { -+ const policy = this.tenantLimit; -+ if (policy === undefined) return undefined; -+ -+ if (typeof policy === "number") return toNonNegativeIntegerLimit(policy); -+ if (typeof policy === "function") { -+ try { -+ return toNonNegativeIntegerLimit(policy(tenantId, metadata)); -+ } catch (error) { -+ logger?.warn?.("Tenant concurrency resolver threw; ignoring", { -+ tenantId, -+ errorName: (error as { name?: unknown } | null)?.name, -+ errorMessage: (error as { message?: unknown } | null)?.message, -+ }); -+ return undefined; -+ } -+ } -+ -+ return toNonNegativeIntegerLimit(policy[tenantId]); -+ } -+ -+ private resolveProviderModelLimit( -+ key: string, -+ metadata: TrafficRequestMetadata | undefined, -+ logger?: Logger, -+ ): number | undefined { -+ const policy = this.providerModelLimit; -+ if (policy === undefined) return undefined; -+ -+ if (typeof policy === "number") return toNonNegativeIntegerLimit(policy); -+ if (typeof policy === "function") { -+ try { -+ return toNonNegativeIntegerLimit(policy(metadata, key)); -+ } catch (error) { -+ logger?.warn?.("Provider/model concurrency resolver threw; ignoring", { -+ key, -+ provider: metadata?.provider, -+ model: metadata?.model, -+ errorName: (error as { name?: unknown } | null)?.name, -+ errorMessage: (error as { message?: unknown } | null)?.message, -+ }); -+ return undefined; -+ } -+ } -+ -+ return toNonNegativeIntegerLimit(policy[key]); -+ } -+} -diff --git a/packages/core/src/traffic/traffic-constants.ts b/packages/core/src/traffic/traffic-constants.ts -new file mode 100644 -index 00000000..68d99df7 ---- /dev/null -+++ b/packages/core/src/traffic/traffic-constants.ts -@@ -0,0 +1,26 @@ -+export const MAX_RETRY_ATTEMPTS = 3; -+export const TIMEOUT_RETRY_ATTEMPTS = 2; -+ -+export const RATE_LIMIT_BASE_BACKOFF_MS = 500; -+export const SERVER_ERROR_BASE_BACKOFF_MS = 1000; -+export const TIMEOUT_BASE_BACKOFF_MS = 750; -+ -+export const RATE_LIMIT_JITTER_FACTOR = 0.35; -+export const SERVER_ERROR_JITTER_FACTOR = 0.8; -+export const TIMEOUT_JITTER_FACTOR = 0.5; -+ -+export const CIRCUIT_FAILURE_THRESHOLD = 5; -+export const CIRCUIT_FAILURE_WINDOW_MS = 10_000; -+export 
const CIRCUIT_TIMEOUT_THRESHOLD = CIRCUIT_FAILURE_THRESHOLD;
-+export const CIRCUIT_TIMEOUT_WINDOW_MS = CIRCUIT_FAILURE_WINDOW_MS;
-+export const CIRCUIT_COOLDOWN_MS = 30_000;
-+export const CIRCUIT_PROBE_INTERVAL_MS = 5_000;
-+
-+export const RATE_LIMIT_EXHAUSTION_BUFFER = 1;
-+export const RATE_LIMIT_PROBE_DELAY_MS = 50;
-+export const RATE_LIMIT_MIN_PACE_INTERVAL_MS = 10;
-+export const RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS = 10;
-+
-+export const DEFAULT_FALLBACK_CHAINS: Record<string, string[]> = {
-+ "gpt-4o": ["gpt-4o-mini", "gpt-3.5"],
-+};
-diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts
-new file mode 100644
-index 00000000..cf435854
---- /dev/null
-+++ b/packages/core/src/traffic/traffic-controller-internal.ts
-@@ -0,0 +1,54 @@
-+import type { TrafficPriority, TrafficRequest, TrafficRequestType } from "./traffic-types";
-+
-+export type Scheduler = (callback: () => void) => void;
-+
-+export type DispatchDecision =
-+ | { kind: "dispatch" }
-+ | { kind: "skip" }
-+ | { kind: "wait"; wakeUpAt?: number };
-+
-+export type CircuitStateStatus = "closed" | "open" | "half-open";
-+
-+export interface CircuitState {
-+ status: CircuitStateStatus;
-+ failureTimestamps: number[];
-+ timeoutTimestamps: number[];
-+ openedAt?: number;
-+ trialInFlight?: boolean;
-+ nextProbeAt?: number;
-+}
-+
-+export interface RateLimitWindowState {
-+ limit: number;
-+ remaining: number;
-+ resetAt: number;
-+ reserved: number;
-+ nextAllowedAt: number;
-+}
-+
-+type BivariantHandler<TArgs extends unknown[]> = {
-+ bivarianceHack(...args: TArgs): void;
-+}["bivarianceHack"];
-+
-+export interface QueuedRequest<TResponse = unknown> {
-+ type: TrafficRequestType;
-+ request: TrafficRequest<TResponse>;
-+ resolve: BivariantHandler<[TResponse | PromiseLike<TResponse>]>;
-+ reject: BivariantHandler<[reason?: unknown]>;
-+ attempt: number;
-+ priority: TrafficPriority;
-+ tenantId: string;
-+ enqueuedAt: number;
-+ dispatchedAt?: number;
-+
-+ tenantConcurrencyKey?: string;
-+ providerModelConcurrencyKey?: string;
-+
-+ rateLimitKey?: string;
-+ etaMs?: number;
-+
-+ circuitKey?: string;
-+ circuitStatus?: CircuitStateStatus;
-+
-+ extractUsage?: TrafficRequest<TResponse>["extractUsage"];
-+}
-diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
-new file mode 100644
-index 00000000..b3f331b2
---- /dev/null
-+++ b/packages/core/src/traffic/traffic-controller.spec.ts
-@@ -0,0 +1,281 @@
-+import { describe, expect, it, vi } from "vitest";
-+import { CIRCUIT_FAILURE_THRESHOLD } from "./traffic-constants";
-+import { TrafficController } from "./traffic-controller";
-+
-+describe("TrafficController priority scheduling", () => {
-+ it("prioritizes P0 over lower priorities when runnable", async () => {
-+ const controller = new TrafficController({ maxConcurrent: 1 });
-+ const order: string[] = [];
-+
-+ const p1 = controller.handleText({
-+ metadata: { provider: "p", model: "m1", priority: "P1" },
-+ execute: async () => {
-+ order.push("P1");
-+ return "P1";
-+ },
-+ });
-+
-+ const p2 = controller.handleText({
-+ metadata: { provider: "p", model: "m2", priority: "P2" },
-+ execute: async () => {
-+ order.push("P2");
-+ return "P2";
-+ },
-+ });
-+
-+ const p0 = controller.handleText({
-+ metadata: { provider: "p", model: "m0", priority: "P0" },
-+ execute: async () => {
-+ order.push("P0");
-+ return "P0";
-+ },
-+ });
-+
-+ await Promise.all([p0, p1, p2]);
-+
-+ expect(order[0]).toBe("P0");
-+ expect(order).toEqual(["P0", "P1", "P2"]);
-+ });
-+
-+ 
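// Example wiring (hypothetical names, sketched against the public API exercised below):
-+ //   const controller = new TrafficController({ maxConcurrent: 4 });
-+ //   controller.updateRateLimitFromHeaders({ provider: "openai", model: "gpt-4o" }, response.headers);
-+ //   const result = await controller.handleText({ metadata: { provider: "openai", model: "gpt-4o" }, execute: () => callModel() });
-+
-+ 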
it("allows lower priorities to proceed when a higher priority request is rate limited", async () => { -+ vi.useFakeTimers(); -+ -+ try { -+ vi.setSystemTime(new Date(0)); -+ const controller = new TrafficController({ maxConcurrent: 1 }); -+ controller.updateRateLimitFromHeaders( -+ { provider: "p0", model: "m0" }, -+ { -+ "x-ratelimit-limit-requests": "1", -+ "x-ratelimit-remaining-requests": "0", -+ "x-ratelimit-reset-requests": "1s", -+ }, -+ ); -+ -+ const order: string[] = []; -+ -+ const p0 = controller.handleText({ -+ metadata: { provider: "p0", model: "m0", priority: "P0" }, -+ execute: async () => { -+ order.push("P0"); -+ return "P0"; -+ }, -+ }); -+ -+ const p1 = controller.handleText({ -+ metadata: { provider: "p1", model: "m1", priority: "P1" }, -+ execute: async () => { -+ order.push("P1"); -+ return "P1"; -+ }, -+ }); -+ -+ await vi.runAllTimersAsync(); -+ await Promise.all([p0, p1]); -+ -+ expect(order[0]).toBe("P1"); -+ expect(order[1]).toBe("P0"); -+ } finally { -+ vi.useRealTimers(); -+ } -+ }); -+}); -+ -+describe("TrafficController rate limit headers", () => { -+ it("parses OpenAI-style compound reset durations (e.g. 1m30.951s)", () => { -+ vi.useFakeTimers(); -+ -+ try { -+ vi.setSystemTime(new Date(1_000_000)); -+ const controller = new TrafficController({ maxConcurrent: 1 }); -+ const now = Date.now(); -+ -+ const result = controller.updateRateLimitFromHeaders( -+ { provider: "openai.responses", model: "gpt-4o-mini" }, -+ { -+ "x-ratelimit-limit-requests": "10000", -+ "x-ratelimit-remaining-requests": "9989", -+ "x-ratelimit-reset-requests": "1m30.951s", -+ }, -+ ); -+ -+ expect(result).toBeTruthy(); -+ expect(result?.headerSnapshot.resetRequestsMs).toBeCloseTo(90_951, 6); -+ expect(result?.state.limit).toBe(10000); -+ expect(result?.state.remaining).toBe(9989); -+ expect(result?.state.resetAt).toBe(now + 90_951); -+ expect(result?.state.reserved).toBe(0); -+ expect(result?.state.nextAllowedAt).toBe(now); -+ } finally { -+ vi.useRealTimers(); -+ } -+ }); -+ -+ it("keeps resetAt monotonic when headers shorten the reset duration", () => { -+ vi.useFakeTimers(); -+ -+ try { -+ vi.setSystemTime(new Date(0)); -+ const controller = new TrafficController({ maxConcurrent: 1 }); -+ -+ const first = controller.updateRateLimitFromHeaders( -+ { provider: "openai.responses", model: "gpt-4o-mini" }, -+ { -+ "x-ratelimit-limit-requests": "10000", -+ "x-ratelimit-remaining-requests": "9999", -+ "x-ratelimit-reset-requests": "60s", -+ }, -+ ); -+ -+ expect(first).toBeTruthy(); -+ expect(first?.state.resetAt).toBe(60_000); -+ -+ vi.setSystemTime(new Date(10_000)); -+ const second = controller.updateRateLimitFromHeaders( -+ { provider: "openai.responses", model: "gpt-4o-mini" }, -+ { -+ "x-ratelimit-limit-requests": "10000", -+ "x-ratelimit-remaining-requests": "9998", -+ "x-ratelimit-reset-requests": "5s", -+ }, -+ ); -+ -+ expect(second).toBeTruthy(); -+ expect(second?.state.resetAt).toBe(60_000); -+ } finally { -+ vi.useRealTimers(); -+ } -+ }); -+ -+ it("never increases remaining within the same window", () => { -+ vi.useFakeTimers(); -+ -+ try { -+ vi.setSystemTime(new Date(0)); -+ const controller = new TrafficController({ maxConcurrent: 1 }); -+ -+ const first = controller.updateRateLimitFromHeaders( -+ { provider: "openai.responses", model: "gpt-4o-mini" }, -+ { -+ "x-ratelimit-limit-requests": "10", -+ "x-ratelimit-remaining-requests": "9", -+ "x-ratelimit-reset-requests": "60s", -+ }, -+ ); -+ -+ expect(first?.state.remaining).toBe(9); -+ 
expect(first?.state.resetAt).toBe(60_000); -+ -+ vi.setSystemTime(new Date(10_000)); -+ const second = controller.updateRateLimitFromHeaders( -+ { provider: "openai.responses", model: "gpt-4o-mini" }, -+ { -+ "x-ratelimit-limit-requests": "10", -+ "x-ratelimit-remaining-requests": "8", -+ "x-ratelimit-reset-requests": "50s", -+ }, -+ ); -+ -+ expect(second?.state.remaining).toBe(8); -+ expect(second?.state.resetAt).toBe(60_000); -+ -+ vi.setSystemTime(new Date(20_000)); -+ const third = controller.updateRateLimitFromHeaders( -+ { provider: "openai.responses", model: "gpt-4o-mini" }, -+ { -+ "x-ratelimit-limit-requests": "10", -+ "x-ratelimit-remaining-requests": "9", -+ "x-ratelimit-reset-requests": "40s", -+ }, -+ ); -+ -+ expect(third?.state.remaining).toBe(8); -+ expect(third?.state.resetAt).toBe(60_000); -+ } finally { -+ vi.useRealTimers(); -+ } -+ }); -+ -+ it("applies Retry-After even when x-ratelimit headers are missing", async () => { -+ vi.useFakeTimers(); -+ -+ try { -+ vi.setSystemTime(new Date(0)); -+ const controller = new TrafficController({ maxConcurrent: 1 }); -+ const order: string[] = []; -+ -+ controller.updateRateLimitFromHeaders( -+ { provider: "p", model: "m" }, -+ { -+ "retry-after": "2", -+ }, -+ ); -+ -+ const p0 = controller.handleText({ -+ metadata: { provider: "p", model: "m", priority: "P0" }, -+ execute: async () => { -+ order.push("P0"); -+ return "P0"; -+ }, -+ }); -+ -+ await vi.advanceTimersByTimeAsync(1_999); -+ expect(order).toEqual([]); -+ -+ await vi.advanceTimersByTimeAsync(1); -+ await vi.runAllTimersAsync(); -+ await p0; -+ expect(order).toEqual(["P0"]); -+ } finally { -+ vi.useRealTimers(); -+ } -+ }); -+}); -+ -+describe("TrafficController stream reporting", () => { -+ it("treats post-start stream failures as circuit breaker failures", async () => { -+ const controller = new TrafficController({ -+ maxConcurrent: 1, -+ fallbackChains: { -+ primary: ["fallback"], -+ }, -+ }); -+ const tenantId = "tenant-1"; -+ const metadata = { provider: "p", model: "primary", priority: "P1" as const }; -+ -+ await controller.handleStream({ -+ tenantId, -+ metadata, -+ execute: async () => ({ ok: true }), -+ }); -+ -+ for (let i = 0; i < CIRCUIT_FAILURE_THRESHOLD; i += 1) { -+ controller.reportStreamFailure(metadata, new Error("stream-failure")); -+ } -+ -+ const order: string[] = []; -+ await controller.handleStream({ -+ tenantId, -+ metadata, -+ execute: async () => { -+ order.push("primary"); -+ return "primary"; -+ }, -+ createFallbackRequest: (target) => ({ -+ tenantId, -+ metadata: { -+ provider: "p", -+ model: typeof target === "string" ? target : target.model, -+ priority: "P1", -+ }, -+ execute: async () => { -+ const modelId = typeof target === "string" ? 
target : target.model; -+ order.push(modelId); -+ return modelId; -+ }, -+ }), -+ }); -+ -+ expect(order).toEqual(["fallback"]); -+ }); -+}); -diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts -new file mode 100644 -index 00000000..c26b914d ---- /dev/null -+++ b/packages/core/src/traffic/traffic-controller.ts -@@ -0,0 +1,1231 @@ -+import type { Logger } from "../logger"; -+import { LoggerProxy } from "../logger"; -+import { TrafficCircuitBreaker } from "./traffic-circuit-breaker"; -+import { TrafficConcurrencyLimiter } from "./traffic-concurrency-limiter"; -+import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal"; -+import { -+ CircuitBreakerOpenError, -+ QueueWaitTimeoutError, -+ RateLimitedUpstreamError, -+ normalizeRateLimitError, -+} from "./traffic-errors"; -+import { -+ OpenAIWindowRateLimitStrategy, -+ type RateLimitUpdateResult, -+ TokenBucketRateLimitStrategy, -+ TrafficRateLimiter, -+} from "./traffic-rate-limiter"; -+import { buildRetryPlanWithPolicy } from "./traffic-retry"; -+import type { -+ AdaptiveLimiterConfig, -+ FallbackChainEntry, -+ FallbackPolicy, -+ FallbackPolicyConfig, -+ FallbackPolicyMode, -+ FallbackTarget, -+ PriorityBurstLimits, -+ ProviderModelConcurrencyLimit, -+ RateLimitConfig, -+ RateLimitKey, -+ RateLimitStrategyConfig, -+ RateLimitStrategyKind, -+ RetryPlan, -+ RetryPolicyConfig, -+ TenantConcurrencyLimit, -+ TenantUsage, -+ TrafficControllerOptions, -+ TrafficPriority, -+ TrafficRequest, -+ TrafficRequestMetadata, -+ TrafficRequestType, -+ TrafficResponseMetadata, -+} from "./traffic-types"; -+import { TrafficUsageTracker } from "./traffic-usage-tracker"; -+ -+/* ============================================================ -+ * Traffic Controller -+ * ============================================================ -+ */ -+ -+export type { -+ AdaptiveLimiterConfig, -+ FallbackChainEntry, -+ FallbackPolicy, -+ FallbackPolicyConfig, -+ FallbackPolicyMode, -+ FallbackTarget, -+ PriorityBurstLimits, -+ ProviderModelConcurrencyLimit, -+ RateLimitConfig, -+ RateLimitKey, -+ RateLimitStrategyConfig, -+ RateLimitStrategyKind, -+ TenantConcurrencyLimit, -+ TenantUsage, -+ TrafficControllerOptions, -+ TrafficPriority, -+ TrafficRequest, -+ TrafficRequestMetadata, -+ TrafficResponseMetadata, -+ TrafficRequestType, -+}; -+ -+export { CircuitBreakerOpenError }; -+export { QueueWaitTimeoutError }; -+export { RateLimitedUpstreamError }; -+ -+type TenantQueueState = { -+ order: string[]; -+ index: number; -+ queues: Map; -+}; -+ -+type RateLimitSnapshot = { -+ limit?: number; -+ remaining?: number; -+ resetAt?: number; -+ nextAllowedAt?: number; -+ retryAfterMs?: number; -+}; -+ -+type AdaptiveLimiterState = { -+ recent429s: number[]; -+ penaltyMs: number; -+ cooldownUntil?: number; -+ last429At?: number; -+}; -+ -+const DEFAULT_PRIORITY_BURST_LIMITS: Record = { -+ P0: 5, -+ P1: 3, -+ P2: 2, -+}; -+ -+const DEFAULT_ADAPTIVE_LIMITER: Required = { -+ windowMs: 30_000, -+ threshold: 3, -+ minPenaltyMs: 500, -+ maxPenaltyMs: 10_000, -+ penaltyMultiplier: 2, -+ decayMs: 10_000, -+}; -+ -+export class TrafficController { -+ /* ---------- Core ---------- */ -+ -+ private readonly scheduler: Scheduler; -+ private readonly maxConcurrent: number; -+ private readonly rateLimitKeyBuilder: (metadata?: TrafficRequestMetadata) => string; -+ private readonly retryPolicy?: RetryPolicyConfig; -+ private readonly logger: Logger; -+ private readonly trafficLogger: Logger; -+ private readonly 
controllerLogger: Logger;
-+ private readonly concurrencyLimiter: TrafficConcurrencyLimiter;
-+
-+ private readonly queues: Record<TrafficPriority, TenantQueueState> = {
-+ P0: { order: [], index: 0, queues: new Map() },
-+ P1: { order: [], index: 0, queues: new Map() },
-+ P2: { order: [], index: 0, queues: new Map() },
-+ };
-+ private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"];
-+ private readonly priorityBurstLimits: Record<TrafficPriority, number>;
-+ private readonly priorityBurstCounts: Record<TrafficPriority, number> = {
-+ P0: 0,
-+ P1: 0,
-+ P2: 0,
-+ };
-+
-+ private activeCount = 0;
-+ private drainScheduled = false;
-+
-+ /* ---------- Rate limits ---------- */
-+ private readonly rateLimiter: TrafficRateLimiter;
-+
-+ /* ---------- Circuit breakers ---------- */
-+ private readonly circuitBreaker: TrafficCircuitBreaker;
-+
-+ /* ---------- Usage ---------- */
-+ private readonly usageTracker = new TrafficUsageTracker();
-+
-+ /* ---------- Traffic metadata ---------- */
-+ private readonly rateLimitSnapshots = new Map<string, RateLimitSnapshot>();
-+
-+ /* ---------- Adaptive limiter ---------- */
-+ private readonly adaptiveLimiterConfig: Required<AdaptiveLimiterConfig>;
-+ private readonly adaptiveLimiterState = new Map<string, AdaptiveLimiterState>();
-+
-+ constructor(options: TrafficControllerOptions = {}) {
-+ this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY;
-+ this.scheduler = this.createScheduler();
-+ this.rateLimitKeyBuilder = options.rateLimitKeyBuilder ?? buildRateLimitKeyFromMetadata;
-+ this.retryPolicy = options.retryPolicy;
-+ this.priorityBurstLimits = {
-+ ...DEFAULT_PRIORITY_BURST_LIMITS,
-+ ...(options.priorityBurstLimits ?? {}),
-+ };
-+ this.adaptiveLimiterConfig = {
-+ ...DEFAULT_ADAPTIVE_LIMITER,
-+ ...(options.adaptiveLimiter ?? {}),
-+ };
-+ this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger);
-+ this.trafficLogger = this.logger.child({ subsystem: "traffic" });
-+ this.controllerLogger = this.trafficLogger.child({ module: "controller" });
-+ const rateLimits = options.rateLimits;
-+ const rateLimitStrategy = options.rateLimitStrategy;
-+ this.rateLimiter = new TrafficRateLimiter(() => this.scheduleDrain(), {
-+ rateLimits,
-+ strategyFactory: (key) => {
-+ const strategyKind = this.resolveRateLimitStrategy(key, rateLimitStrategy);
-+ if (strategyKind === "window") {
-+ return new OpenAIWindowRateLimitStrategy(key, rateLimits?.[key]);
-+ }
-+ return new TokenBucketRateLimitStrategy(key, rateLimits?.[key]);
-+ },
-+ });
-+ this.circuitBreaker = new TrafficCircuitBreaker({
-+ fallbackChains: options.fallbackChains,
-+ fallbackPolicy: options.fallbackPolicy,
-+ buildRateLimitKey: (metadata) => this.buildRateLimitKey(metadata),
-+ });
-+ this.concurrencyLimiter = new TrafficConcurrencyLimiter({
-+ buildProviderModelKey: (metadata) => this.buildRateLimitKey(metadata),
-+ maxConcurrentPerProviderModel: options.maxConcurrentPerProviderModel,
-+ maxConcurrentPerTenant: options.maxConcurrentPerTenant,
-+ });
-+
-+ this.controllerLogger.debug("Initialized TrafficController", {
-+ maxConcurrent: this.maxConcurrent,
-+ hasFallbackChains: !!options.fallbackChains,
-+ hasFallbackPolicy: options.fallbackPolicy !== undefined,
-+ hasProviderModelConcurrency: options.maxConcurrentPerProviderModel !== undefined,
-+ hasTenantConcurrency: options.maxConcurrentPerTenant !== undefined,
-+ hasConfigRateLimits: options.rateLimits !== undefined,
-+ hasStrategyOverrides: options.rateLimitStrategy !== undefined,
-+ hasRetryPolicy: options.retryPolicy !== undefined,
-+ hasPriorityBurstLimits: options.priorityBurstLimits !== undefined,
-+ hasAdaptiveLimiter: 
options.adaptiveLimiter !== undefined, -+ }); -+ } -+ -+ /* ============================================================ -+ * Public API -+ * ============================================================ -+ */ -+ -+ handleText(request: TrafficRequest): Promise { -+ this.controllerLogger.trace("handleText called", { -+ tenantId: request.tenantId, -+ provider: request.metadata?.provider, -+ model: request.metadata?.model, -+ priority: request.metadata?.priority, -+ }); -+ return this.enqueue("text", request); -+ } -+ -+ handleStream(request: TrafficRequest): Promise { -+ this.controllerLogger.trace("handleStream called", { -+ tenantId: request.tenantId, -+ provider: request.metadata?.provider, -+ model: request.metadata?.model, -+ priority: request.metadata?.priority, -+ }); -+ return this.enqueue("stream", request); -+ } -+ -+ reportStreamSuccess(metadata?: TrafficRequestMetadata): void { -+ this.controllerLogger.debug("Stream reported success", { -+ provider: metadata?.provider, -+ model: metadata?.model, -+ tenantId: metadata?.tenantId, -+ priority: metadata?.priority, -+ }); -+ this.circuitBreaker.recordSuccess(metadata, this.trafficLogger); -+ const rateLimitKey = this.buildRateLimitKey(metadata); -+ const adaptiveKey = this.buildAdaptiveKey( -+ metadata, -+ metadata?.tenantId ?? "default", -+ rateLimitKey, -+ ); -+ this.recordAdaptiveSuccess(adaptiveKey); -+ } -+ -+ reportStreamFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void { -+ this.controllerLogger.warn("Stream reported failure", { -+ provider: metadata?.provider, -+ model: metadata?.model, -+ tenantId: metadata?.tenantId, -+ priority: metadata?.priority, -+ errorName: (error as { name?: unknown } | null)?.name, -+ errorMessage: (error as { message?: unknown } | null)?.message, -+ status: (error as { status?: unknown } | null)?.status, -+ statusCode: (error as { statusCode?: unknown } | null)?.statusCode, -+ }); -+ this.circuitBreaker.recordFailure(metadata, error, this.trafficLogger); -+ const rateLimitKey = this.buildRateLimitKey(metadata); -+ const adaptiveKey = this.buildAdaptiveKey( -+ metadata, -+ metadata?.tenantId ?? 
"default", -+ rateLimitKey, -+ ); -+ if (error instanceof RateLimitedUpstreamError) { -+ this.recordAdaptiveRateLimitHit(adaptiveKey, error.retryAfterMs); -+ } -+ this.attachTrafficMetadata( -+ error, -+ this.buildTrafficResponseMetadataFromMetadata(metadata, rateLimitKey, Date.now(), error), -+ ); -+ } -+ -+ updateRateLimitFromHeaders( -+ metadata: TrafficRequestMetadata | undefined, -+ headers: unknown, -+ ): RateLimitUpdateResult | undefined { -+ const key = this.buildRateLimitKey(metadata); -+ this.controllerLogger.debug("updateRateLimitFromHeaders called", { -+ rateLimitKey: key, -+ provider: metadata?.provider, -+ model: metadata?.model, -+ }); -+ -+ const update = this.rateLimiter.updateFromHeaders(metadata, headers, key, this.trafficLogger); -+ if (!update) { -+ this.controllerLogger.debug("updateRateLimitFromHeaders skipped (no headers applied)", { -+ rateLimitKey: key, -+ }); -+ return undefined; -+ } -+ -+ this.controllerLogger.debug("Rate limit headers applied", { -+ rateLimitKey: update.key, -+ limit: update.state.limit, -+ remaining: update.state.remaining, -+ reserved: update.state.reserved, -+ resetAt: update.state.resetAt, -+ nextAllowedAt: update.state.nextAllowedAt, -+ resetRequestsMs: update.headerSnapshot.resetRequestsMs, -+ }); -+ -+ this.rateLimitSnapshots.set(update.key, { -+ limit: update.state.limit, -+ remaining: update.state.remaining, -+ resetAt: update.state.resetAt, -+ nextAllowedAt: update.state.nextAllowedAt, -+ retryAfterMs: update.headerSnapshot.retryAfterMs, -+ }); -+ -+ return update; -+ } -+ -+ getTenantUsage(tenantId: string): TenantUsage | undefined { -+ this.controllerLogger.trace("getTenantUsage called", { tenantId }); -+ return this.usageTracker.getTenantUsage(tenantId); -+ } -+ -+ /* ============================================================ -+ * Scheduler & Queue -+ * ============================================================ -+ */ -+ -+ private createScheduler(): Scheduler { -+ return typeof queueMicrotask === "function" ? 
queueMicrotask : (cb) => setTimeout(cb, 0); -+ } -+ -+ private enqueue( -+ type: TrafficRequestType, -+ request: TrafficRequest, -+ ): Promise { -+ return new Promise((resolve, reject) => { -+ const priority = this.resolvePriority(request.metadata); -+ const tenantId = this.resolveTenantId(request); -+ this.controllerLogger.debug("Enqueue request", { -+ type, -+ tenantId, -+ priority, -+ provider: request.metadata?.provider, -+ model: request.metadata?.model, -+ }); -+ this.enqueueItem({ -+ type, -+ request, -+ resolve, -+ reject, -+ attempt: 1, -+ priority, -+ tenantId, -+ enqueuedAt: Date.now(), -+ extractUsage: request.extractUsage, -+ }); -+ this.scheduleDrain(); -+ }); -+ } -+ -+ private scheduleDrain(): void { -+ if (this.drainScheduled) return; -+ this.drainScheduled = true; -+ -+ this.controllerLogger.trace("Drain scheduled"); -+ this.scheduler(() => { -+ this.drainScheduled = false; -+ this.controllerLogger.trace("Drain tick"); -+ this.drainQueue(); -+ }); -+ } -+ -+ private drainQueue(): void { -+ this.controllerLogger.trace("Drain start", { -+ activeCount: this.activeCount, -+ maxConcurrent: this.maxConcurrent, -+ queuedP0: this.getQueuedCount("P0"), -+ queuedP1: this.getQueuedCount("P1"), -+ queuedP2: this.getQueuedCount("P2"), -+ }); -+ while (true) { -+ const decision = this.tryDispatchNext(); -+ this.controllerLogger.trace("Dispatch decision", decision); -+ if (decision.kind === "dispatch" || decision.kind === "skip") continue; -+ if (decision.kind === "wait") { -+ if (decision.wakeUpAt) { -+ this.controllerLogger.debug("Rate limit wait; scheduling wakeup", { -+ wakeUpAt: decision.wakeUpAt, -+ inMs: Math.max(0, decision.wakeUpAt - Date.now()), -+ }); -+ this.scheduleRateLimitWakeUpAt(decision.wakeUpAt); -+ } -+ return; -+ } -+ return; -+ } -+ } -+ -+ /* ============================================================ -+ * Dispatch -+ * ============================================================ -+ */ -+ -+ private tryDispatchNext(): DispatchDecision { -+ if (this.activeCount >= this.maxConcurrent) return { kind: "wait" }; -+ -+ let earliestWakeUpAt: number | undefined; -+ -+ const observeWakeUpAt = (candidate?: number): void => { -+ if (candidate === undefined) return; -+ earliestWakeUpAt = -+ earliestWakeUpAt === undefined ? 
candidate : Math.min(earliestWakeUpAt, candidate); -+ }; -+ -+ const priorities = this.getPriorityDispatchOrder(); -+ for (const priority of priorities) { -+ const state = this.queues[priority]; -+ if (state.order.length === 0) continue; -+ -+ let attempts = 0; -+ const maxAttempts = state.order.length; -+ -+ while (attempts < maxAttempts) { -+ const candidate = this.getNextTenantCandidate(priority); -+ if (!candidate) break; -+ attempts += 1; -+ -+ const { item: next, queue, tenantId } = candidate; -+ const now = Date.now(); -+ const queueTimeoutAt = this.resolveQueueTimeoutAt(next); -+ const queueTimeoutTriggered = this.handleQueueTimeout(next, queue, 0, now, queueTimeoutAt); -+ if (queueTimeoutTriggered === "rejected") { -+ this.cleanupTenantQueue(priority, tenantId, queue); -+ return { kind: "skip" }; -+ } -+ if (queueTimeoutAt !== undefined && now < queueTimeoutAt) { -+ observeWakeUpAt(queueTimeoutAt); -+ } -+ const queueTimeoutExpired = queueTimeoutTriggered === "expired"; -+ -+ this.controllerLogger.trace("Evaluate next queued request", { -+ priority, -+ tenantId: next.tenantId, -+ type: next.type, -+ attempt: next.attempt, -+ provider: next.request.metadata?.provider, -+ model: next.request.metadata?.model, -+ queueLength: queue.length, -+ }); -+ -+ const circuit = this.resolveCircuit(next); -+ if (circuit) { -+ this.controllerLogger.trace("Circuit resolution returned decision", { -+ priority, -+ decision: circuit, -+ circuitKey: next.circuitKey, -+ circuitStatus: next.circuitStatus, -+ }); -+ if (circuit.kind === "skip") { -+ queue.shift(); -+ this.cleanupTenantQueue(priority, tenantId, queue); -+ return { kind: "skip" }; -+ } -+ if (circuit.kind === "wait") { -+ if ( -+ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "circuit wait") -+ ) { -+ this.cleanupTenantQueue(priority, tenantId, queue); -+ return { kind: "skip" }; -+ } -+ next.etaMs = -+ circuit.wakeUpAt !== undefined ? Math.max(0, circuit.wakeUpAt - now) : undefined; -+ observeWakeUpAt(circuit.wakeUpAt); -+ continue; -+ } -+ } -+ -+ const concurrency = this.concurrencyLimiter.resolve(next, this.trafficLogger); -+ if (concurrency.kind === "wait") { -+ this.controllerLogger.trace("Concurrency gate blocked request", { -+ priority, -+ tenantId: next.tenantId, -+ provider: next.request.metadata?.provider, -+ model: next.request.metadata?.model, -+ reasons: concurrency.reasons, -+ }); -+ if ( -+ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "concurrency wait") -+ ) { -+ this.cleanupTenantQueue(priority, tenantId, queue); -+ return { kind: "skip" }; -+ } -+ next.etaMs = undefined; -+ continue; -+ } -+ -+ const adaptive = this.resolveAdaptiveLimit(next, now); -+ if (adaptive?.kind === "wait") { -+ if ( -+ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "adaptive wait") -+ ) { -+ this.cleanupTenantQueue(priority, tenantId, queue); -+ return { kind: "skip" }; -+ } -+ next.etaMs = -+ adaptive.wakeUpAt !== undefined ? Math.max(0, adaptive.wakeUpAt - now) : undefined; -+ observeWakeUpAt(adaptive.wakeUpAt); -+ continue; -+ } -+ -+ const rateLimit = this.resolveRateLimit(next); -+ if (rateLimit) { -+ this.controllerLogger.trace("Rate limit resolution returned decision", { -+ priority, -+ decision: rateLimit, -+ rateLimitKey: next.rateLimitKey ?? 
this.buildRateLimitKey(next.request.metadata), -+ }); -+ if (rateLimit.kind === "wait") { -+ if ( -+ this.rejectIfQueueTimedOut( -+ queueTimeoutExpired, -+ next, -+ queue, -+ 0, -+ now, -+ "rate limit wait", -+ ) -+ ) { -+ this.cleanupTenantQueue(priority, tenantId, queue); -+ return { kind: "skip" }; -+ } -+ next.etaMs = -+ rateLimit.wakeUpAt !== undefined ? Math.max(0, rateLimit.wakeUpAt - now) : undefined; -+ observeWakeUpAt(rateLimit.wakeUpAt); -+ } -+ continue; -+ } -+ -+ if (queueTimeoutExpired) { -+ const timeoutError = this.createQueueTimeoutError(next, now); -+ this.attachTrafficMetadata( -+ timeoutError, -+ this.buildTrafficResponseMetadata( -+ next, -+ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), -+ now, -+ timeoutError, -+ ), -+ ); -+ this.controllerLogger.warn("Queue wait timed out before dispatch", { -+ tenantId: next.tenantId, -+ waitedMs: timeoutError.waitedMs, -+ maxQueueWaitMs: timeoutError.maxQueueWaitMs, -+ deadlineAt: timeoutError.deadlineAt, -+ provider: next.request.metadata?.provider, -+ model: next.request.metadata?.model, -+ rateLimitKey: timeoutError.rateLimitKey, -+ }); -+ queue.shift(); -+ this.cleanupTenantQueue(priority, tenantId, queue); -+ next.reject(timeoutError); -+ return { kind: "skip" }; -+ } -+ -+ this.startRequest(next, queue, tenantId); -+ return { kind: "dispatch" }; -+ } -+ } -+ -+ return earliestWakeUpAt !== undefined -+ ? { kind: "wait", wakeUpAt: earliestWakeUpAt } -+ : { kind: "wait" }; -+ } -+ -+ private startRequest(item: QueuedRequest, queue: QueuedRequest[], tenantId: string): void { -+ this.controllerLogger.debug("Start request", { -+ priority: item.priority, -+ type: item.type, -+ tenantId: item.tenantId, -+ attempt: item.attempt, -+ provider: item.request.metadata?.provider, -+ model: item.request.metadata?.model, -+ }); -+ item.dispatchedAt = Date.now(); -+ queue.shift(); -+ this.cleanupTenantQueue(item.priority, tenantId, queue); -+ this.recordPriorityDispatch(item.priority); -+ this.activeCount++; -+ this.concurrencyLimiter.acquire(item, this.trafficLogger); -+ this.rateLimiter.notifyDispatch(item.rateLimitKey, this.trafficLogger); -+ this.circuitBreaker.markTrial(item, this.trafficLogger); -+ void this.executeRequest(item); -+ } -+ -+ /* ============================================================ -+ * Execution -+ * ============================================================ -+ */ -+ -+ private async executeRequest(item: QueuedRequest): Promise { -+ const startedAt = Date.now(); -+ try { -+ this.controllerLogger.debug("Execute request", { -+ priority: item.priority, -+ type: item.type, -+ tenantId: item.tenantId, -+ attempt: item.attempt, -+ provider: item.request.metadata?.provider, -+ model: item.request.metadata?.model, -+ rateLimitKey: item.rateLimitKey, -+ circuitKey: item.circuitKey, -+ circuitStatus: item.circuitStatus, -+ activeCount: this.activeCount, -+ }); -+ const result = await item.request.execute(); -+ const rateLimitKey = item.rateLimitKey ?? 
this.buildRateLimitKey(item.request.metadata); -+ const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey); -+ this.controllerLogger.debug("Request succeeded", { -+ tenantId: item.tenantId, -+ attempt: item.attempt, -+ provider: item.request.metadata?.provider, -+ model: item.request.metadata?.model, -+ elapsedMs: Date.now() - startedAt, -+ }); -+ if (item.type === "stream") { -+ this.controllerLogger.trace("Stream started successfully", { -+ tenantId: item.tenantId, -+ provider: item.request.metadata?.provider, -+ model: item.request.metadata?.model, -+ }); -+ } else { -+ this.circuitBreaker.recordSuccess(item.request.metadata, this.trafficLogger); -+ } -+ const usage = this.usageTracker.recordUsage(item, result, this.trafficLogger); -+ this.rateLimiter.recordUsage(rateLimitKey, usage, this.trafficLogger); -+ this.recordAdaptiveSuccess(adaptiveKey); -+ this.attachTrafficMetadata( -+ result, -+ this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now()), -+ ); -+ item.resolve(result); -+ } catch (error) { -+ const rateLimitKey = item.rateLimitKey ?? this.buildRateLimitKey(item.request.metadata); -+ const normalizedRateLimitError = normalizeRateLimitError({ -+ error, -+ metadata: item.request.metadata, -+ tenantId: item.tenantId, -+ key: rateLimitKey, -+ logger: this.trafficLogger, -+ }); -+ const errorForHandling = normalizedRateLimitError ?? error; -+ const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey); -+ if (errorForHandling instanceof RateLimitedUpstreamError) { -+ this.recordAdaptiveRateLimitHit(adaptiveKey, errorForHandling.retryAfterMs); -+ } -+ -+ this.controllerLogger.warn("Request failed", { -+ tenantId: item.tenantId, -+ attempt: item.attempt, -+ provider: item.request.metadata?.provider, -+ model: item.request.metadata?.model, -+ elapsedMs: Date.now() - startedAt, -+ errorName: (error as { name?: unknown } | null)?.name, -+ errorMessage: (error as { message?: unknown } | null)?.message, -+ status: (error as { status?: unknown } | null)?.status, -+ statusCode: (error as { statusCode?: unknown } | null)?.statusCode, -+ }); -+ this.circuitBreaker.recordFailure( -+ item.request.metadata, -+ errorForHandling, -+ this.trafficLogger, -+ ); -+ this.attachTrafficMetadata( -+ errorForHandling, -+ this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now(), errorForHandling), -+ ); -+ -+ const retry = buildRetryPlanWithPolicy( -+ { -+ error: errorForHandling, -+ attempt: item.attempt, -+ metadata: item.request.metadata, -+ key: rateLimitKey, -+ logger: this.trafficLogger, -+ }, -+ this.retryPolicy, -+ ); -+ if (retry) { -+ if (!this.canRetryWithinDeadline(item, retry.delayMs)) { -+ this.controllerLogger.debug("Retry skipped; deadline exceeded", { -+ tenantId: item.tenantId, -+ attempt: item.attempt, -+ provider: item.request.metadata?.provider, -+ model: item.request.metadata?.model, -+ deadlineAt: item.request.deadlineAt, -+ delayMs: retry.delayMs, -+ }); -+ item.reject(errorForHandling); -+ } else { -+ this.controllerLogger.debug("Retrying request", { -+ tenantId: item.tenantId, -+ attempt: item.attempt, -+ nextAttempt: item.attempt + 1, -+ reason: retry.reason, -+ delayMs: retry.delayMs, -+ provider: item.request.metadata?.provider, -+ model: item.request.metadata?.model, -+ }); -+ this.scheduleRetry(item, retry); -+ } -+ } else { -+ this.controllerLogger.debug("No retry plan; rejecting request", { -+ tenantId: item.tenantId, -+ attempt: item.attempt, -+ provider: item.request.metadata?.provider, 
-+ model: item.request.metadata?.model, -+ }); -+ item.reject(errorForHandling); -+ } -+ } finally { -+ this.rateLimiter.releaseReservation(item.rateLimitKey, this.trafficLogger); -+ this.concurrencyLimiter.release(item, this.trafficLogger); -+ this.activeCount = Math.max(0, this.activeCount - 1); -+ this.controllerLogger.trace("Request finished; slot released", { -+ tenantId: item.tenantId, -+ activeCount: this.activeCount, -+ maxConcurrent: this.maxConcurrent, -+ }); -+ this.scheduleDrain(); -+ } -+ } -+ -+ /* ============================================================ -+ * Retry logic -+ * ============================================================ -+ */ -+ -+ private scheduleRetry(item: QueuedRequest, plan: RetryPlan): void { -+ this.controllerLogger.debug("Schedule retry", { -+ tenantId: item.tenantId, -+ priority: item.priority, -+ currentAttempt: item.attempt, -+ nextAttempt: item.attempt + 1, -+ reason: plan.reason, -+ delayMs: plan.delayMs, -+ }); -+ setTimeout(() => { -+ this.controllerLogger.debug("Retry timer fired", { -+ tenantId: item.tenantId, -+ priority: item.priority, -+ nextAttempt: item.attempt + 1, -+ }); -+ this.enqueueItem({ -+ ...item, -+ attempt: item.attempt + 1, -+ enqueuedAt: Date.now(), -+ dispatchedAt: undefined, -+ tenantConcurrencyKey: undefined, -+ providerModelConcurrencyKey: undefined, -+ rateLimitKey: undefined, -+ etaMs: undefined, -+ circuitKey: undefined, -+ circuitStatus: undefined, -+ }); -+ this.scheduleDrain(); -+ }, plan.delayMs); -+ } -+ -+ private canRetryWithinDeadline(item: QueuedRequest, delayMs: number): boolean { -+ const deadlineAt = item.request.deadlineAt; -+ if (!deadlineAt) return true; -+ const nextAttemptAt = Date.now() + delayMs; -+ return nextAttemptAt <= deadlineAt; -+ } -+ -+ /* ============================================================ -+ * Rate limiting (verbatim logic) -+ * ============================================================ -+ */ -+ -+ private resolveRateLimit(next: QueuedRequest): DispatchDecision | null { -+ const key = this.buildRateLimitKey(next.request.metadata); -+ return this.rateLimiter.resolve(next, key, this.trafficLogger); -+ } -+ -+ private scheduleRateLimitWakeUpAt(wakeUpAt: number): void { -+ this.rateLimiter.scheduleWakeUpAt(wakeUpAt, this.trafficLogger); -+ } -+ -+ /* ============================================================ -+ * Circuit breakers (verbatim logic, linearized) -+ * ============================================================ -+ */ -+ -+ private resolveCircuit(next: QueuedRequest): DispatchDecision | null { -+ return this.circuitBreaker.resolve(next, this.trafficLogger); -+ } -+ -+ /* ============================================================ -+ * Utilities -+ * ============================================================ -+ */ -+ -+ private resolveQueueTimeoutAt(next: QueuedRequest): number | undefined { -+ const maxQueueWaitMs = next.request.maxQueueWaitMs; -+ const normalizedMaxWait = -+ typeof maxQueueWaitMs === "number" && Number.isFinite(maxQueueWaitMs) -+ ? Math.max(0, maxQueueWaitMs) -+ : undefined; -+ const timeoutAt = -+ normalizedMaxWait !== undefined ? 
next.enqueuedAt + normalizedMaxWait : undefined; -+ const deadlineAt = next.request.deadlineAt; -+ if (timeoutAt === undefined) return deadlineAt; -+ if (deadlineAt === undefined) return timeoutAt; -+ return Math.min(timeoutAt, deadlineAt); -+ } -+ -+ private handleQueueTimeout( -+ next: QueuedRequest, -+ queue: QueuedRequest[], -+ index: number, -+ now: number, -+ queueTimeoutAt?: number, -+ ): "none" | "expired" | "rejected" { -+ if (queueTimeoutAt === undefined) return "none"; -+ if (now < queueTimeoutAt) return "none"; -+ -+ const fallbackApplied = this.circuitBreaker.tryFallback( -+ next, -+ "queue-timeout", -+ this.trafficLogger, -+ ); -+ if (fallbackApplied) { -+ return "expired"; -+ } -+ -+ const timeoutError = this.createQueueTimeoutError(next, now); -+ this.attachTrafficMetadata( -+ timeoutError, -+ this.buildTrafficResponseMetadata( -+ next, -+ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), -+ now, -+ timeoutError, -+ ), -+ ); -+ this.controllerLogger.warn("Queue wait timed out; rejecting request", { -+ tenantId: next.tenantId, -+ waitedMs: timeoutError.waitedMs, -+ maxQueueWaitMs: timeoutError.maxQueueWaitMs, -+ deadlineAt: timeoutError.deadlineAt, -+ provider: next.request.metadata?.provider, -+ model: next.request.metadata?.model, -+ rateLimitKey: timeoutError.rateLimitKey, -+ }); -+ queue.splice(index, 1); -+ next.reject(timeoutError); -+ return "rejected"; -+ } -+ -+ private rejectIfQueueTimedOut( -+ queueTimeoutExpired: boolean, -+ next: QueuedRequest, -+ queue: QueuedRequest[], -+ index: number, -+ now: number, -+ reason: string, -+ ): boolean { -+ if (!queueTimeoutExpired) return false; -+ const timeoutError = this.createQueueTimeoutError(next, now); -+ this.attachTrafficMetadata( -+ timeoutError, -+ this.buildTrafficResponseMetadata( -+ next, -+ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), -+ now, -+ timeoutError, -+ ), -+ ); -+ this.controllerLogger.warn("Queue wait timed out during gate wait", { -+ tenantId: next.tenantId, -+ waitedMs: timeoutError.waitedMs, -+ maxQueueWaitMs: timeoutError.maxQueueWaitMs, -+ deadlineAt: timeoutError.deadlineAt, -+ provider: next.request.metadata?.provider, -+ model: next.request.metadata?.model, -+ rateLimitKey: timeoutError.rateLimitKey, -+ reason, -+ }); -+ queue.splice(index, 1); -+ next.reject(timeoutError); -+ return true; -+ } -+ -+ private createQueueTimeoutError(next: QueuedRequest, now: number): QueueWaitTimeoutError { -+ const waitedMs = Math.max(0, now - next.enqueuedAt); -+ return new QueueWaitTimeoutError({ -+ waitedMs, -+ maxQueueWaitMs: next.request.maxQueueWaitMs, -+ deadlineAt: next.request.deadlineAt, -+ metadata: next.request.metadata, -+ rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), -+ }); -+ } -+ -+ private resolveTenantId(request: TrafficRequest): string { -+ return request.tenantId ?? request.metadata?.tenantId ?? 
"default"; -+ } -+ -+ private enqueueItem(item: QueuedRequest): void { -+ const state = this.queues[item.priority]; -+ const tenantId = item.tenantId; -+ let queue = state.queues.get(tenantId); -+ if (!queue) { -+ queue = []; -+ state.queues.set(tenantId, queue); -+ state.order.push(tenantId); -+ } -+ queue.push(item); -+ } -+ -+ private getQueuedCount(priority: TrafficPriority): number { -+ const state = this.queues[priority]; -+ let total = 0; -+ for (const queue of state.queues.values()) { -+ total += queue.length; -+ } -+ return total; -+ } -+ -+ private hasQueuedWorkBelow(priority: TrafficPriority): boolean { -+ const index = this.priorityOrder.indexOf(priority); -+ if (index < 0) return false; -+ for (let i = index + 1; i < this.priorityOrder.length; i += 1) { -+ if (this.getQueuedCount(this.priorityOrder[i]) > 0) { -+ return true; -+ } -+ } -+ return false; -+ } -+ -+ private canDispatchPriority(priority: TrafficPriority): boolean { -+ const limit = this.priorityBurstLimits[priority]; -+ if (!Number.isFinite(limit) || limit <= 0) return true; -+ if (this.priorityBurstCounts[priority] < limit) return true; -+ return !this.hasQueuedWorkBelow(priority); -+ } -+ -+ private recordPriorityDispatch(priority: TrafficPriority): void { -+ for (const key of this.priorityOrder) { -+ if (key !== priority) { -+ this.priorityBurstCounts[key] = 0; -+ } -+ } -+ this.priorityBurstCounts[priority] += 1; -+ } -+ -+ private getPriorityDispatchOrder(): TrafficPriority[] { -+ return this.priorityOrder.filter((priority) => this.canDispatchPriority(priority)); -+ } -+ -+ private getNextTenantCandidate( -+ priority: TrafficPriority, -+ ): { item: QueuedRequest; queue: QueuedRequest[]; tenantId: string } | undefined { -+ const state = this.queues[priority]; -+ if (state.order.length === 0) return undefined; -+ const maxAttempts = state.order.length; -+ let attempts = 0; -+ -+ while (attempts < maxAttempts && state.order.length > 0) { -+ const index = state.index % state.order.length; -+ const tenantId = state.order[index]; -+ const queue = state.queues.get(tenantId); -+ attempts += 1; -+ -+ if (!queue || queue.length === 0) { -+ this.removeTenantQueue(priority, tenantId); -+ continue; -+ } -+ -+ state.index = (index + 1) % state.order.length; -+ return { item: queue[0], queue, tenantId }; -+ } -+ -+ return undefined; -+ } -+ -+ private cleanupTenantQueue( -+ priority: TrafficPriority, -+ tenantId: string, -+ queue: QueuedRequest[], -+ ): void { -+ if (queue.length > 0) return; -+ this.removeTenantQueue(priority, tenantId); -+ } -+ -+ private removeTenantQueue(priority: TrafficPriority, tenantId: string): void { -+ const state = this.queues[priority]; -+ state.queues.delete(tenantId); -+ const index = state.order.indexOf(tenantId); -+ if (index === -1) return; -+ state.order.splice(index, 1); -+ if (state.order.length === 0) { -+ state.index = 0; -+ return; -+ } -+ if (state.index > index) { -+ state.index -= 1; -+ } -+ if (state.index >= state.order.length) { -+ state.index = 0; -+ } -+ } -+ -+ private resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority { -+ return metadata?.priority ?? "P1"; -+ } -+ -+ private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { -+ return this.rateLimitKeyBuilder(metadata); -+ } -+ -+ private resolveAdaptiveLimit(next: QueuedRequest, now: number): DispatchDecision | null { -+ const rateLimitKey = next.rateLimitKey ?? 
this.buildRateLimitKey(next.request.metadata); -+ const adaptiveKey = this.buildAdaptiveKey(next.request.metadata, next.tenantId, rateLimitKey); -+ const state = this.adaptiveLimiterState.get(adaptiveKey); -+ if (!state) return null; -+ -+ this.applyAdaptiveDecay(state, now); -+ if (state.cooldownUntil !== undefined && now < state.cooldownUntil) { -+ return { kind: "wait", wakeUpAt: state.cooldownUntil }; -+ } -+ -+ return null; -+ } -+ -+ private recordAdaptiveRateLimitHit(key: string, retryAfterMs?: number): void { -+ const state = this.getAdaptiveState(key); -+ const now = Date.now(); -+ const { windowMs, threshold, minPenaltyMs, maxPenaltyMs, penaltyMultiplier } = -+ this.adaptiveLimiterConfig; -+ -+ state.last429At = now; -+ state.recent429s = state.recent429s.filter((timestamp) => now - timestamp <= windowMs); -+ state.recent429s.push(now); -+ -+ if (state.recent429s.length < threshold) { -+ return; -+ } -+ -+ const basePenalty = state.penaltyMs > 0 ? state.penaltyMs : minPenaltyMs; -+ const nextPenalty = Math.min( -+ maxPenaltyMs, -+ Math.max(minPenaltyMs, Math.round(basePenalty * penaltyMultiplier)), -+ ); -+ state.penaltyMs = nextPenalty; -+ const retryPenalty = typeof retryAfterMs === "number" ? retryAfterMs : 0; -+ const cooldownMs = Math.max(nextPenalty, retryPenalty); -+ state.cooldownUntil = now + cooldownMs; -+ } -+ -+ private recordAdaptiveSuccess(key: string): void { -+ const state = this.adaptiveLimiterState.get(key); -+ if (!state) return; -+ -+ const now = Date.now(); -+ this.applyAdaptiveDecay(state, now); -+ if (state.penaltyMs === 0) { -+ state.cooldownUntil = undefined; -+ state.recent429s = []; -+ state.last429At = undefined; -+ } -+ } -+ -+ private applyAdaptiveDecay(state: AdaptiveLimiterState, now: number): void { -+ const { decayMs, penaltyMultiplier } = this.adaptiveLimiterConfig; -+ if (state.last429At && now - state.last429At < decayMs) { -+ return; -+ } -+ -+ if (state.penaltyMs > 0) { -+ state.penaltyMs = Math.max(0, Math.floor(state.penaltyMs / penaltyMultiplier)); -+ } -+ } -+ -+ private getAdaptiveState(key: string): AdaptiveLimiterState { -+ const existing = this.adaptiveLimiterState.get(key); -+ if (existing) return existing; -+ const created: AdaptiveLimiterState = { -+ recent429s: [], -+ penaltyMs: 0, -+ }; -+ this.adaptiveLimiterState.set(key, created); -+ return created; -+ } -+ -+ private buildAdaptiveKey( -+ metadata: TrafficRequestMetadata | undefined, -+ tenantId: string, -+ rateLimitKey: string, -+ ): string { -+ if (rateLimitKey.includes("tenant=")) { -+ return rateLimitKey; -+ } -+ const tenant = metadata?.tenantId ?? tenantId ?? "default"; -+ return `${rateLimitKey}::tenant=${encodeURIComponent(tenant)}`; -+ } -+ -+ private buildTrafficResponseMetadata( -+ item: QueuedRequest, -+ rateLimitKey: string, -+ now: number, -+ error?: unknown, -+ ): TrafficResponseMetadata { -+ const snapshot = this.rateLimitSnapshots.get(rateLimitKey); -+ const retryAfterMs = this.resolveRetryAfterMs(error, snapshot); -+ const queuedForMs = -+ item.dispatchedAt !== undefined ? item.dispatchedAt - item.enqueuedAt : now - item.enqueuedAt; -+ const queueEtaMs = item.etaMs ?? Math.max(0, queuedForMs); -+ -+ return { -+ rateLimitKey, -+ retryAfterMs, -+ rateLimitRemaining: snapshot?.remaining, -+ rateLimitResetAt: snapshot?.resetAt, -+ rateLimitResetInMs: -+ snapshot?.resetAt !== undefined ? 
Math.max(0, snapshot.resetAt - now) : undefined, -+ queueEtaMs, -+ tenantId: item.tenantId, -+ priority: item.request.metadata?.priority, -+ taskType: item.request.metadata?.taskType, -+ }; -+ } -+ -+ private buildTrafficResponseMetadataFromMetadata( -+ metadata: TrafficRequestMetadata | undefined, -+ rateLimitKey: string, -+ now: number, -+ error?: unknown, -+ ): TrafficResponseMetadata { -+ const snapshot = this.rateLimitSnapshots.get(rateLimitKey); -+ const retryAfterMs = this.resolveRetryAfterMs(error, snapshot); -+ -+ return { -+ rateLimitKey, -+ retryAfterMs, -+ rateLimitRemaining: snapshot?.remaining, -+ rateLimitResetAt: snapshot?.resetAt, -+ rateLimitResetInMs: -+ snapshot?.resetAt !== undefined ? Math.max(0, snapshot.resetAt - now) : undefined, -+ tenantId: metadata?.tenantId, -+ priority: metadata?.priority, -+ taskType: metadata?.taskType, -+ }; -+ } -+ -+ private attachTrafficMetadata(target: unknown, info: TrafficResponseMetadata): void { -+ if (!target || typeof target !== "object") return; -+ (target as Record).traffic = info; -+ } -+ -+ private resolveRetryAfterMs( -+ error: unknown | undefined, -+ snapshot?: RateLimitSnapshot, -+ ): number | undefined { -+ if (error && typeof error === "object" && "retryAfterMs" in error) { -+ const candidate = (error as { retryAfterMs?: unknown }).retryAfterMs; -+ if (typeof candidate === "number" && Number.isFinite(candidate)) { -+ return candidate; -+ } -+ } -+ if (snapshot?.retryAfterMs !== undefined) { -+ return snapshot.retryAfterMs; -+ } -+ return undefined; -+ } -+ -+ private resolveRateLimitStrategy( -+ key: string, -+ config?: RateLimitStrategyConfig, -+ ): RateLimitStrategyKind { -+ const modelOverride = config?.models?.[key]; -+ if (modelOverride) return modelOverride; -+ const provider = key.split("::")[0] ?? ""; -+ const providerOverride = config?.providers?.[provider]; -+ if (providerOverride) return providerOverride; -+ if (provider.startsWith("openai")) return "window"; -+ return "token-bucket"; -+ } -+} -+ -+/* ============================================================ -+ * Error + Singleton -+ * ============================================================ -+ */ -+ -+let singletonController: TrafficController | undefined; -+ -+export function getTrafficController(options?: TrafficControllerOptions): TrafficController { -+ if (!singletonController) { -+ singletonController = new TrafficController(options); -+ } -+ return singletonController; -+} -+ -+function buildRateLimitKeyFromMetadata(metadata?: TrafficRequestMetadata): string { -+ const provider = metadata?.provider ?? "default-provider"; -+ const model = metadata?.model ?? "default-model"; -+ const parts = [provider, model]; -+ -+ // SOP: Add new metadata fields in one place with a stable label and ordering. -+ // 1) Add the optional field to TrafficRequestMetadata. -+ // 2) Add it here with a stable label so keys stay predictable. 
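-+ //    Keys are joined with "::" in declaration order, so identical metadata always
-+ //    yields an identical key, e.g. "openai::gpt-4o::tenant=acme::taskType=chat".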
-+  // Example: { label: "org", value: metadata?.orgId }
-+  const optionalFields: Array<{ label: string; value?: string }> = [
-+    { label: "apiKey", value: metadata?.apiKeyId },
-+    { label: "region", value: metadata?.region },
-+    { label: "endpoint", value: metadata?.endpoint },
-+    { label: "tenant", value: metadata?.tenantId },
-+    { label: "tenantTier", value: metadata?.tenantTier },
-+    { label: "taskType", value: metadata?.taskType },
-+  ];
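-+
-+  // For instance (illustrative values): provider "openai", model "gpt-4o" and tenantId "acme"
-+  // yield the key "openai::gpt-4o::tenant=acme"; unset fields are skipped, so keys stay stable.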
-+
-+  for (const field of optionalFields) {
-+    if (!field.value) continue;
-+    parts.push(`${field.label}=${encodeURIComponent(field.value)}`);
-+  }
-+
-+  return parts.join("::");
-+}
-diff --git a/packages/core/src/traffic/traffic-error-utils.ts b/packages/core/src/traffic/traffic-error-utils.ts
-new file mode 100644
-index 00000000..4cbb98b5
---- /dev/null
-+++ b/packages/core/src/traffic/traffic-error-utils.ts
-@@ -0,0 +1,148 @@
-+import type { Logger } from "../logger";
-+
-+function readObjectProperty(value: unknown, key: string): unknown {
-+  if (!value || typeof value !== "object") return undefined;
-+  return (value as Record<string, unknown>)[key];
-+}
-+
-+export function findHeaders(value: unknown): unknown[] {
-+  const candidates: unknown[] = [
-+    readObjectProperty(value, "headers"),
-+    readObjectProperty(readObjectProperty(value, "response"), "headers"),
-+    readObjectProperty(readObjectProperty(value, "cause"), "headers"),
-+    readObjectProperty(
-+      readObjectProperty(readObjectProperty(value, "cause"), "response"),
-+      "headers",
-+    ),
-+  ];
-+
-+  return candidates.filter((candidate) => candidate !== undefined && candidate !== null);
-+}
-+
-+export function readHeaderValue(headers: unknown, name: string): string | undefined {
-+  if (!headers) return undefined;
-+
-+  if (typeof (headers as { get?: unknown }).get === "function") {
-+    const v = (headers as { get: (name: string) => unknown }).get(name);
-+    return v === null || v === undefined ? undefined : String(v);
-+  }
-+
-+  if (typeof headers !== "object") return undefined;
-+
-+  const entries = Object.entries(headers as Record<string, unknown>);
-+  const target = name.toLowerCase();
-+  const match = entries.find(([k]) => String(k).toLowerCase() === target);
-+  if (!match) return undefined;
-+
-+  const value = match[1];
-+  if (Array.isArray(value)) {
-+    const first = value[0];
-+    return first === null || first === undefined ? undefined : String(first);
-+  }
-+  return value === null || value === undefined ? undefined : String(value);
-+}
-+
-+export function parseRetryAfterMs(value: string, nowMs: number = Date.now()): number | undefined {
-+  const raw = value.trim();
-+  if (!raw) return undefined;
-+
-+  const seconds = Number(raw);
-+  if (Number.isFinite(seconds)) {
-+    return Math.max(0, Math.round(seconds * 1000));
-+  }
-+
-+  const parsedAt = Date.parse(raw);
-+  if (Number.isFinite(parsedAt)) {
-+    return Math.max(0, parsedAt - nowMs);
-+  }
-+
-+  return undefined;
-+}
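-+
-+// Both Retry-After shapes are covered: parseRetryAfterMs("2") returns 2000, while an HTTP-date
-+// such as "Wed, 01 Jan 2020 00:00:03 GMT" returns the milliseconds remaining until that instant.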
-+
-+export function coerceStatus(value: unknown): number | undefined {
-+  const n = Number(value);
-+  return Number.isFinite(n) ? n : undefined;
-+}
-+
-+export function extractStatusCode(error: unknown, logger?: Logger): number | undefined {
-+  const status =
-+    coerceStatus(readObjectProperty(error, "status")) ??
-+    coerceStatus(readObjectProperty(error, "statusCode")) ??
-+    coerceStatus(readObjectProperty(error, "httpStatus")) ??
-+    coerceStatus(readObjectProperty(readObjectProperty(error, "response"), "status")) ??
-+    coerceStatus(readObjectProperty(readObjectProperty(error, "cause"), "status"));
-+
-+  logger?.trace?.("Extracted status code", {
-+    status,
-+    hasStatus: readObjectProperty(error, "status") !== undefined,
-+    hasStatusCode: readObjectProperty(error, "statusCode") !== undefined,
-+    hasHttpStatus: readObjectProperty(error, "httpStatus") !== undefined,
-+    hasResponseStatus:
-+      readObjectProperty(readObjectProperty(error, "response"), "status") !== undefined,
-+    hasCauseStatus: readObjectProperty(readObjectProperty(error, "cause"), "status") !== undefined,
-+  });
-+
-+  return status;
-+}
-+
-+export function extractRetryAfterMs(error: unknown, logger?: Logger): number | undefined {
-+  const retryAfterLogger = logger?.child({ module: "retry-after" });
-+  const candidates = findHeaders(error);
-+
-+  for (const headers of candidates) {
-+    const raw = readHeaderValue(headers, "retry-after");
-+    if (!raw) continue;
-+    const parsed = parseRetryAfterMs(raw);
-+    retryAfterLogger?.trace?.("Parsed Retry-After header", { raw, parsedMs: parsed });
-+    if (parsed !== undefined) return parsed;
-+  }
-+
-+  retryAfterLogger?.trace?.("Retry-After header missing or unparsable");
-+  return undefined;
-+}
-+
-+export function isTimeoutError(error: unknown, logger?: Logger): boolean {
-+  const candidates: unknown[] = [error];
-+
-+  const cause = readObjectProperty(error, "cause");
-+  if (cause) {
-+    candidates.push(cause);
-+    const nestedCause = readObjectProperty(cause, "cause");
-+    if (nestedCause) candidates.push(nestedCause);
-+  }
-+
-+  for (const candidate of candidates) {
-+    const code = readObjectProperty(candidate, "code");
-+    const name = readObjectProperty(candidate, "name");
-+    const message = readObjectProperty(candidate, "message");
-+
-+    const codeText = String(code ?? "").toLowerCase();
-+    const nameText = String(name ?? "").toLowerCase();
-+    const messageText = String(message ?? "").toLowerCase();
-+
-+    const isTimeout =
-+      codeText.includes("timeout") ||
-+      codeText.includes("timedout") ||
-+      nameText.includes("timeout") ||
-+      nameText.includes("timedout") ||
-+      messageText.includes("timeout") ||
-+      messageText.includes("timedout") ||
-+      messageText.includes("timed out");
-+
-+    logger?.trace?.("Checked timeout error", {
-+      isTimeout,
-+      code,
-+      name,
-+      messagePreview: typeof message === "string" ? message.slice(0, 160) : message,
-+      hasCause: candidate !== error,
-+    });
-+
-+    if (isTimeout) return true;
-+  }
-+
-+  return false;
-+}
-+
-+export function isPromiseLike<T = unknown>(value: unknown): value is PromiseLike<T> {
-+  return !!value && typeof (value as { then?: unknown }).then === "function";
-+}
-diff --git a/packages/core/src/traffic/traffic-errors.ts b/packages/core/src/traffic/traffic-errors.ts
-new file mode 100644
-index 00000000..4943c89f
---- /dev/null
-+++ b/packages/core/src/traffic/traffic-errors.ts
-@@ -0,0 +1,141 @@
-+import type { Logger } from "../logger";
-+import { extractRetryAfterMs, extractStatusCode } from "./traffic-error-utils";
-+import type { TrafficRequestMetadata } from "./traffic-types";
-+
-+export type RateLimitErrorOptions = {
-+  metadata?: TrafficRequestMetadata;
-+  retryAfterMs?: number;
-+  tenantId?: string;
-+  key?: string;
-+};
-+
-+export class CircuitBreakerOpenError extends Error {
-+  readonly retryAfterMs?: number;
-+  readonly metadata?: TrafficRequestMetadata;
-+
-+  constructor(message: string, metadata?: TrafficRequestMetadata, retryAfterMs?: number) {
-+    super(message);
-+    this.name = "CircuitBreakerOpenError";
-+    this.metadata = metadata;
-+    this.retryAfterMs = retryAfterMs;
-+  }
-+}
-+
-+export class QueueWaitTimeoutError extends Error {
-+  readonly waitedMs: number;
-+  readonly maxQueueWaitMs?: number;
-+  readonly deadlineAt?: number;
-+  readonly metadata?: TrafficRequestMetadata;
-+  readonly rateLimitKey?: string;
-+
-+  constructor(options: {
-+    waitedMs: number;
-+    maxQueueWaitMs?: number;
-+    deadlineAt?: number;
-+    metadata?: TrafficRequestMetadata;
-+    rateLimitKey?: string;
-+  }) {
-+    super("Queue wait time exceeded");
-+    this.name = "QueueWaitTimeoutError";
-+    this.waitedMs = options.waitedMs;
-+    this.maxQueueWaitMs = options.maxQueueWaitMs;
-+    this.deadlineAt = options.deadlineAt;
-+    this.metadata = options.metadata;
-+    this.rateLimitKey = options.rateLimitKey;
-+  }
-+}
-+
-+export class RateLimitedUpstreamError extends Error {
-+  readonly status = 429;
-+  readonly retryAfterMs?: number;
-+  readonly metadata?: TrafficRequestMetadata;
-+  readonly provider?: string;
-+  readonly model?: string;
-+  readonly tenantId?: string;
-+  readonly key?: string;
-+
-+  constructor(
-+    message: string,
-+    metadata?: TrafficRequestMetadata,
-+    retryAfterMs?: number,
-+    options?: { tenantId?: string; key?: string },
-+  );
-+  constructor(message: string, options?: RateLimitErrorOptions);
-+  constructor(
-+    message: string,
-+    metadataOrOptions?: TrafficRequestMetadata | RateLimitErrorOptions,
-+    retryAfterMs?: number,
-+    legacyOptions?: { tenantId?: string; key?: string },
-+  ) {
-+    super(message);
-+    this.name = "RateLimitedUpstreamError";
-+    const isOptions =
-+      metadataOrOptions &&
-+      (Object.prototype.hasOwnProperty.call(metadataOrOptions, "metadata") ||
-+        Object.prototype.hasOwnProperty.call(metadataOrOptions, "retryAfterMs") ||
-+        Object.prototype.hasOwnProperty.call(metadataOrOptions, "key"));
-+
-+    const metadata = isOptions
-+      ? (metadataOrOptions as RateLimitErrorOptions).metadata
-+      : (metadataOrOptions as TrafficRequestMetadata | undefined);
-+    const retryAfter = isOptions
-+      ? (metadataOrOptions as RateLimitErrorOptions).retryAfterMs
-+      : retryAfterMs;
-+    const tenantId = isOptions
-+      ? (metadataOrOptions as RateLimitErrorOptions).tenantId
-+      : legacyOptions?.tenantId;
-+    const key = isOptions ? (metadataOrOptions as RateLimitErrorOptions).key : legacyOptions?.key;
-+
-+    this.metadata = metadata;
-+    this.retryAfterMs = retryAfter;
-+    this.provider = metadata?.provider;
-+    this.model = metadata?.model;
-+    this.tenantId = tenantId ?? metadata?.tenantId;
-+    this.key = key;
-+  }
-+}
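-+
-+// Both constructor shapes are accepted (illustrative):
-+//   new RateLimitedUpstreamError("429 from upstream", metadata, 2000)
-+//   new RateLimitedUpstreamError("429 from upstream", { metadata, retryAfterMs: 2000 })
-+// The options form is detected by the presence of a metadata, retryAfterMs, or key property.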
-+
-+export function normalizeRateLimitError(options: {
-+  error: unknown;
-+  metadata?: TrafficRequestMetadata;
-+  tenantId?: string;
-+  key?: string;
-+  logger?: Logger;
-+}): RateLimitedUpstreamError | undefined {
-+  const { error, metadata, tenantId, key, logger } = options;
-+  const retryAfterMs =
-+    error instanceof RateLimitedUpstreamError
-+      ? (error.retryAfterMs ?? extractRetryAfterMs(error, logger))
-+      : extractRetryAfterMs(error, logger);
-+
-+  if (error instanceof RateLimitedUpstreamError) {
-+    const baseMetadata = metadata ?? error.metadata;
-+    const baseTenant = tenantId ?? error.tenantId;
-+    const baseKey = key ?? error.key;
-+    if (
-+      error.metadata === baseMetadata &&
-+      error.retryAfterMs === retryAfterMs &&
-+      error.tenantId === baseTenant &&
-+      error.key === baseKey
-+    ) {
-+      return error;
-+    }
-+    return new RateLimitedUpstreamError(error.message, {
-+      metadata: baseMetadata,
-+      retryAfterMs,
-+      tenantId: baseTenant,
-+      key: baseKey,
-+    });
-+  }
-+
-+  const status = extractStatusCode(error, logger);
-+  if (status !== 429) return undefined;
-+
-+  const message = error instanceof Error ? error.message : "Rate limit exceeded";
-+  return new RateLimitedUpstreamError(message, {
-+    metadata,
-+    retryAfterMs,
-+    tenantId,
-+    key,
-+  });
-+}
-diff --git a/packages/core/src/traffic/traffic-rate-limiter.ts b/packages/core/src/traffic/traffic-rate-limiter.ts
-new file mode 100644
-index 00000000..a77a0423
---- /dev/null
-+++ b/packages/core/src/traffic/traffic-rate-limiter.ts
-@@ -0,0 +1,267 @@
-+import type { Logger } from "../logger";
-+import type {
-+  RateLimitStrategy,
-+  RateLimitUpdateResult,
-+} from "./rate-limit-strategies/rate-limit-strategy";
-+import { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy";
-+import type { DispatchDecision, QueuedRequest } from "./traffic-controller-internal";
-+import type { RateLimitConfig, TrafficRequestMetadata } from "./traffic-types";
-+
-+export type {
-+  RateLimitHeaderSnapshot,
-+  RateLimitStrategy,
-+  RateLimitUpdateResult,
-+} from "./rate-limit-strategies/rate-limit-strategy";
-+export { DefaultRateLimitStrategy } from "./rate-limit-strategies/default-rate-limit-strategy";
-+export { OpenAIWindowRateLimitStrategy } from "./rate-limit-strategies/openai-window-rate-limit-strategy";
-+export { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy";
-+
-+type SchedulerCallback = () => void;
-+
-+export type RateLimitStrategyFactory = (key: string) => RateLimitStrategy;
-+
-+type UsageCounters = {
-+  inputTokens?: number;
-+  outputTokens?: number;
-+  totalTokens?: number;
-+};
-+
-+type TokenRateState = {
-+  capacity: number;
-+  refillPerSecond: number;
-+  tokens: number;
-+  updatedAt: number;
-+};
-+
-+export class TrafficRateLimiter {
-+  private readonly strategies = new Map<string, RateLimitStrategy>();
-+  private readonly tokenRates = new Map<string, TokenRateState>();
-+  private wakeUpTimeout?: ReturnType<typeof setTimeout>;
-+  private wakeUpAt?: number;
-+  private readonly onWakeUp: SchedulerCallback;
-+  private readonly strategyFactory: RateLimitStrategyFactory;
-+  private readonly rateLimits?: RateLimitConfig;
-+
-+  constructor(
-+    onWakeUp: SchedulerCallback,
-+    options?: { strategyFactory?: RateLimitStrategyFactory; rateLimits?: RateLimitConfig },
-+  ) {
-+    this.onWakeUp = onWakeUp;
-+    this.rateLimits = options?.rateLimits;
-+    this.strategyFactory =
-+      options?.strategyFactory ??
-+      ((key) => new TokenBucketRateLimitStrategy(key, this.rateLimits?.[key]));
-+  }
-+
-+  resolve(next: QueuedRequest<unknown>, key: string, logger?: Logger): DispatchDecision | null {
-+    const strategy = this.strategies.get(key) ?? this.createStrategy(key, logger);
-+    const requestDecision = strategy.resolve(next, logger);
-+    if (requestDecision?.kind === "wait") {
-+      const tokenDecision = strategy.handlesTokenLimits
-+        ? null
-+        : this.resolveTokenLimit(key, logger);
-+      if (tokenDecision?.kind === "wait") {
-+        const requestWakeUp = requestDecision.wakeUpAt;
-+        const tokenWakeUp = tokenDecision.wakeUpAt;
-+        if (tokenWakeUp !== undefined && requestWakeUp !== undefined) {
-+          return { kind: "wait", wakeUpAt: Math.min(requestWakeUp, tokenWakeUp) };
-+        }
-+        if (tokenWakeUp !== undefined && requestWakeUp === undefined) {
-+          return tokenDecision;
-+        }
-+      }
-+      return requestDecision;
-+    }
-+
-+    const tokenDecision = strategy.handlesTokenLimits ? null : this.resolveTokenLimit(key, logger);
-+    if (tokenDecision?.kind === "wait") {
-+      return tokenDecision;
-+    }
-+
-+    return requestDecision;
-+  }
-+
-+  notifyDispatch(key: string | undefined, logger?: Logger): void {
-+    if (!key) return;
-+    this.strategies.get(key)?.onDispatch(logger);
-+  }
-+
-+  scheduleWakeUpAt(wakeUpAt: number, logger?: Logger): void {
-+    const rateLimitLogger = logger?.child({ module: "rate-limiter" });
-+    const now = Date.now();
-+    const target = Math.max(now, wakeUpAt);
-+
-+    if (this.wakeUpTimeout && this.wakeUpAt !== undefined && this.wakeUpAt <= target) {
-+      rateLimitLogger?.trace?.("Wakeup already scheduled earlier; skipping", {
-+        currentWakeUpAt: this.wakeUpAt,
-+        requestedWakeUpAt: target,
-+      });
-+      return;
-+    }
-+
-+    if (this.wakeUpTimeout) clearTimeout(this.wakeUpTimeout);
-+
-+    this.wakeUpAt = target;
-+    rateLimitLogger?.debug?.("Scheduling rate limit wakeup", {
-+      wakeUpAt: target,
-+      inMs: Math.max(1, target - now),
-+    });
-+    this.wakeUpTimeout = setTimeout(
-+      () => {
-+        this.wakeUpTimeout = undefined;
-+        this.wakeUpAt = undefined;
-+        rateLimitLogger?.debug?.("Rate limit wakeup fired");
-+        this.onWakeUp();
-+      },
-+      Math.max(1, target - now),
-+    );
-+  }
-+
-+  releaseReservation(key?: string, logger?: Logger): void {
-+    if (!key) return;
-+    this.strategies.get(key)?.onComplete(logger);
-+  }
-+
-+  recordUsage(
-+    key: string | undefined,
-+    usage: UsageCounters | Promise<UsageCounters> | undefined,
-+    logger?: Logger,
-+  ): void {
-+    if (!key || !usage) return;
-+    if (typeof (usage as PromiseLike<UsageCounters>).then === "function") {
-+      void (usage as Promise<UsageCounters>)
-+        .then((resolved) => this.recordUsage(key, resolved, logger))
-+        .catch(() => {});
-+      return;
-+    }
-+
-+    const strategy = this.strategies.get(key);
-+    if (strategy?.recordUsage) {
-+      strategy.recordUsage(usage, logger);
-+      return;
-+    }
-+
-+    const tokens = this.resolveTokenCount(usage);
-+    if (tokens <= 0) return;
-+
-+    const bucket = this.getTokenRateState(key, logger);
-+    if (!bucket) return;
-+
-+    const now = Date.now();
-+    this.refillTokenRate(bucket, now);
-+    bucket.tokens = Math.min(bucket.capacity, bucket.tokens);
-+    bucket.tokens -= tokens;
-+
-+    if (bucket.tokens < 0 && bucket.refillPerSecond > 0) {
-+      const waitMs = Math.max(1, Math.ceil((-bucket.tokens / bucket.refillPerSecond) * 1000));
-+      this.scheduleWakeUpAt(now + waitMs, logger);
-+    }
-+  }
-+
-+  updateFromHeaders(
-+    metadata: TrafficRequestMetadata | undefined,
-+    headers: unknown,
-+    key: string,
-+    logger?: Logger,
-+  ): RateLimitUpdateResult | undefined {
-+    const existing = this.strategies.get(key);
-+    if (existing) return existing.updateFromHeaders(metadata, headers, logger);
-+
-+    const created = this.strategyFactory(key);
-+    const update = created.updateFromHeaders(metadata, headers, logger);
-+    if (!update) return undefined;
-+    this.strategies.set(key, created);
-+    return update;
-+  }
-+
-+  private createStrategy(key: string, logger?: Logger): RateLimitStrategy {
-+    const created = this.strategyFactory(key);
-+    this.strategies.set(key, created);
-+    logger?.child({ module: "rate-limiter" })?.trace?.("Created rate limit strategy", {
-+      rateLimitKey: key,
-+      strategy: created.constructor.name,
-+    });
-+    return created;
-+  }
-+
-+  private resolveTokenLimit(key: string, logger?: Logger): DispatchDecision | null {
-+    const bucket = this.getTokenRateState(key, logger);
-+    if (!bucket) return null;
-+
-+    const now = Date.now();
-+    this.refillTokenRate(bucket, now);
-+
-+    if (bucket.capacity <= 0) {
-+      logger?.child({ module: "rate-limiter" })?.debug?.("Token limit misconfigured; blocking", {
-+        rateLimitKey: key,
-+        capacity: bucket.capacity,
-+        refillPerSecond: bucket.refillPerSecond,
-+      });
-+      return { kind: "wait" };
-+    }
-+
-+    if (bucket.tokens >= 0) return null;
-+
-+    if (bucket.refillPerSecond <= 0) {
-+      logger?.child({ module: "rate-limiter" })?.debug?.("Token limit has no refill; blocking", {
-+        rateLimitKey: key,
-+        capacity: bucket.capacity,
-+        refillPerSecond: bucket.refillPerSecond,
-+      });
-+      return { kind: "wait" };
-+    }
-+
-+    const requiredTokens = -bucket.tokens;
-+    const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000));
-+    return { kind: "wait", wakeUpAt: now + waitMs };
-+  }
-+
-+  private getTokenRateState(key: string, logger?: Logger): TokenRateState | undefined {
-+    const existing = this.tokenRates.get(key);
-+    if (existing) return existing;
-+
-+    const options = this.rateLimits?.[key];
-+    if (!options) return undefined;
-+
-+    const tokensPerMinute = Number(options.tokensPerMinute);
-+    if (!Number.isFinite(tokensPerMinute) || tokensPerMinute <= 0) {
-+      return undefined;
-+    }
-+
-+    // Token pacing uses a 1-minute burst by default; request bursts are handled separately.
-+    const refillPerSecond = tokensPerMinute / 60;
-+    const capacity = tokensPerMinute;
-+    const now = Date.now();
-+    const created: TokenRateState = {
-+      capacity,
-+      refillPerSecond,
-+      tokens: capacity,
-+      updatedAt: now,
-+    };
-+    this.tokenRates.set(key, created);
-+    logger?.child({ module: "rate-limiter" })?.trace?.("Created token rate state", {
-+      rateLimitKey: key,
-+      capacity,
-+      refillPerSecond,
-+    });
-+    return created;
-+  }
-+
-+  private refillTokenRate(bucket: TokenRateState, now: number): void {
-+    const elapsedMs = now - bucket.updatedAt;
-+    if (elapsedMs <= 0) return;
-+    bucket.updatedAt = now;
-+    if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return;
-+    const refill = (elapsedMs / 1000) * bucket.refillPerSecond;
-+    if (refill <= 0) return;
-+    bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill);
-+  }
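-+
-+  // Worked example (illustrative): tokensPerMinute=60000 yields refillPerSecond=1000 with a
-+  // one-minute burst capacity of 60000 tokens. A response that overdraws the bucket by 2000
-+  // tokens leaves tokens=-2000, so the next dispatch waits ceil(2000 / 1000) = 2 seconds.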
-+
-+  private resolveTokenCount(usage: UsageCounters): number {
-+    const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined;
-+    if (total !== undefined) return total;
-+    const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0;
-+    const output = Number.isFinite(usage.outputTokens) ? usage.outputTokens : 0;
-+    return input + output;
-+  }
-+}
-diff --git a/packages/core/src/traffic/traffic-retry.spec.ts b/packages/core/src/traffic/traffic-retry.spec.ts
-new file mode 100644
-index 00000000..2360ca10
---- /dev/null
-+++ b/packages/core/src/traffic/traffic-retry.spec.ts
-@@ -0,0 +1,45 @@
-+import { describe, expect, it, vi } from "vitest";
-+import { buildRetryPlan } from "./traffic-retry";
-+
-+describe("buildRetryPlan", () => {
-+  it("respects Retry-After for 429s", () => {
-+    const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0);
-+    try {
-+      const plan = buildRetryPlan(
-+        {
-+          status: 429,
-+          response: { headers: { "retry-after": "2" } },
-+        },
-+        1,
-+      );
-+
-+      expect(plan).toBeTruthy();
-+      expect(plan?.reason).toBe("rateLimit");
-+      expect(plan?.delayMs).toBeGreaterThanOrEqual(2_000);
-+    } finally {
-+      randomSpy.mockRestore();
-+    }
-+  });
-+
-+  it("parses HTTP-date Retry-After values", () => {
-+    vi.useFakeTimers();
-+    const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0);
-+
-+    try {
-+      vi.setSystemTime(new Date("2020-01-01T00:00:00.000Z"));
-+      const plan = buildRetryPlan(
-+        {
-+          statusCode: 429,
-+          response: { headers: { "retry-after": "Wed, 01 Jan 2020 00:00:03 GMT" } },
-+        },
-+        1,
-+      );
-+
-+      expect(plan).toBeTruthy();
-+      expect(plan?.delayMs).toBeGreaterThanOrEqual(3_000);
-+    } finally {
-+      vi.useRealTimers();
-+      randomSpy.mockRestore();
-+    }
-+  });
-+});
-diff --git a/packages/core/src/traffic/traffic-retry.ts b/packages/core/src/traffic/traffic-retry.ts
-new file mode 100644
-index 00000000..9604dc53
---- /dev/null
-+++ b/packages/core/src/traffic/traffic-retry.ts
-@@ -0,0 +1,144 @@
-+import type { Logger } from "../logger";
-+import {
-+  MAX_RETRY_ATTEMPTS,
-+  RATE_LIMIT_BASE_BACKOFF_MS,
-+  RATE_LIMIT_JITTER_FACTOR,
-+  SERVER_ERROR_BASE_BACKOFF_MS,
-+  SERVER_ERROR_JITTER_FACTOR,
-+  TIMEOUT_BASE_BACKOFF_MS,
-+  TIMEOUT_JITTER_FACTOR,
-+  TIMEOUT_RETRY_ATTEMPTS,
-+} from "./traffic-constants";
-+import { extractRetryAfterMs, extractStatusCode, isTimeoutError } from "./traffic-error-utils";
-+import { RateLimitedUpstreamError } from "./traffic-errors";
-+import type {
-+  RetryPlan,
-+  RetryPolicy,
-+  RetryPolicyConfig,
-+  RetryPolicyContext,
-+  RetryReason,
-+} from "./traffic-types";
-+
-+export type {
-+  RetryPlan,
-+  RetryPolicy,
-+  RetryPolicyConfig,
-+  RetryPolicyContext,
-+  RetryReason,
-+} from "./traffic-types";
-+
-+export function buildRetryPlan(
-+  error: unknown,
-+  attempt: number,
-+  logger?: Logger,
-+): RetryPlan | undefined {
-+  const retryLogger = logger?.child({ module: "retry" });
-+  const reason = getRetryReason(error, retryLogger);
-+  if (!reason) {
-+    retryLogger?.debug?.("No retry reason detected; skipping retry", { attempt });
-+    return undefined;
-+  }
-+
-+  const max = reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS;
-+  if (attempt >= max) {
-+    retryLogger?.debug?.("Retry attempts exhausted; skipping retry", {
-+      attempt,
-+      max,
-+      reason,
-+    });
-+    return undefined;
-+  }
-+
-+  const computedDelayMs = computeBackoffDelay(reason, attempt);
-+  const retryAfterMs =
-+    reason === "rateLimit"
-+      ? error instanceof RateLimitedUpstreamError
-+        ? error.retryAfterMs
-+        : extractRetryAfterMs(error, retryLogger)
-+      : undefined;
-+  const delayMs =
-+    retryAfterMs === undefined ? computedDelayMs : Math.max(computedDelayMs, retryAfterMs);
-+
-+  retryLogger?.debug?.("Retry plan built", {
-+    attempt,
-+    reason,
-+    delayMs,
-+    computedDelayMs,
-+    retryAfterMs,
-+    max,
-+  });
-+
-+  return {
-+    reason,
-+    delayMs,
-+  };
-+}
-+
-+export function buildRetryPlanWithPolicy(
-+  context: RetryPolicyContext,
-+  policyConfig?: RetryPolicyConfig,
-+): RetryPlan | undefined {
-+  const retryLogger = context.logger?.child({ module: "retry" });
-+  const policy = resolveRetryPolicy(context, policyConfig);
-+  if (policy) {
-+    const planned = policy(context);
-+    if (planned) {
-+      retryLogger?.debug?.("Retry policy returned a plan", {
-+        attempt: context.attempt,
-+        reason: planned.reason,
-+        delayMs: planned.delayMs,
-+      });
-+      return planned;
-+    }
-+    retryLogger?.debug?.("Retry policy declined to retry", { attempt: context.attempt });
-+  }
-+
-+  return buildRetryPlan(context.error, context.attempt, context.logger);
-+}
-+
-+function resolveRetryPolicy(
-+  context: RetryPolicyContext,
-+  config?: RetryPolicyConfig,
-+): RetryPolicy | undefined {
-+  if (!config) return undefined;
-+  const modelPolicy = context.key ? config.models?.[context.key] : undefined;
-+  if (modelPolicy) return modelPolicy;
-+  const providerModelKey =
-+    context.metadata?.provider && context.metadata?.model
-+      ? `${context.metadata.provider}::${context.metadata.model}`
-+      : undefined;
-+  const providerModelPolicy = providerModelKey ? config.models?.[providerModelKey] : undefined;
-+  if (providerModelPolicy) return providerModelPolicy;
-+  const provider = context.metadata?.provider;
-+  const providerPolicy = provider ? config.providers?.[provider] : undefined;
-+  if (providerPolicy) return providerPolicy;
-+  return config.default;
-+}
-+
-+function getRetryReason(error: unknown, logger?: Logger): RetryReason | undefined {
-+  if (error instanceof RateLimitedUpstreamError) return "rateLimit";
-+  const status = extractStatusCode(error, logger);
-+  if (status === 429) return "rateLimit";
-+  if (status && status >= 500) return "serverError";
-+  if (status === 408 || isTimeoutError(error, logger)) return "timeout";
-+  return undefined;
-+}
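-+
-+// Backoff shape, with illustrative constants (the real values live in traffic-constants.ts):
-+// for base=1000ms, attempt 3 gives exp = 1000 * 2^(3-1) = 4000ms plus up to exp * jitterFactor
-+// of random jitter; for 429s the final delay is additionally floored at the Retry-After hint.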
-+
-+function computeBackoffDelay(reason: RetryReason, attempt: number): number {
-+  const base =
-+    reason === "serverError"
-+      ? SERVER_ERROR_BASE_BACKOFF_MS
-+      : reason === "timeout"
-+        ? TIMEOUT_BASE_BACKOFF_MS
-+        : RATE_LIMIT_BASE_BACKOFF_MS;
-+
-+  const jitter =
-+    reason === "serverError"
-+      ? SERVER_ERROR_JITTER_FACTOR
-+      : reason === "timeout"
-+        ? TIMEOUT_JITTER_FACTOR
-+        : RATE_LIMIT_JITTER_FACTOR;
-+
-+  const exp = base * 2 ** (attempt - 1);
-+  return Math.round(exp + exp * jitter * Math.random());
-+}
-diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts
-new file mode 100644
-index 00000000..f2ebbafb
---- /dev/null
-+++ b/packages/core/src/traffic/traffic-types.ts
-@@ -0,0 +1,173 @@
-+import type { Logger } from "../logger";
-+
-+type BivariantFunction<TArgs extends unknown[], TReturn> = {
-+  bivarianceHack(...args: TArgs): TReturn;
-+}["bivarianceHack"];
-+
-+type UsageCounters = {
-+  inputTokens?: number;
-+  outputTokens?: number;
-+  totalTokens?: number;
-+};
-+
-+export type RetryReason = "rateLimit" | "serverError" | "timeout";
-+
-+export type RetryPlan = {
-+  delayMs: number;
-+  reason: RetryReason;
-+};
-+
-+export type RetryPolicyContext = {
-+  error: unknown;
-+  attempt: number;
-+  metadata?: TrafficRequestMetadata;
-+  key?: string;
-+  logger?: Logger;
-+};
-+
-+export type RetryPolicy = (context: RetryPolicyContext) => RetryPlan | undefined;
-+
-+export type RetryPolicyConfig = {
-+  default?: RetryPolicy;
-+  providers?: Record<string, RetryPolicy>;
-+  models?: Record<string, RetryPolicy>;
-+};
-+
-+export type TrafficRequestType = "text" | "stream";
-+export type TrafficPriority = "P0" | "P1" | "P2";
-+
-+export interface TrafficRequestMetadata {
-+  agentId?: string;
-+  agentName?: string;
-+  model?: string;
-+  provider?: string;
-+  priority?: TrafficPriority;
-+  tenantId?: string;
-+  apiKeyId?: string;
-+  region?: string;
-+  endpoint?: string;
-+  tenantTier?: string;
-+  taskType?: string;
-+  fallbackPolicyId?: string;
-+}
-+
-+export type TrafficResponseMetadata = {
-+  rateLimitKey?: string;
-+  retryAfterMs?: number;
-+  rateLimitRemaining?: number;
-+  rateLimitResetAt?: number;
-+  rateLimitResetInMs?: number;
-+  queueEtaMs?: number;
-+  tenantId?: string;
-+  priority?: TrafficPriority;
-+  taskType?: string;
-+};
-+
-+export type FallbackTarget = {
-+  provider?: string;
-+  model: string;
-+};
-+
-+export type FallbackChainEntry = string | FallbackTarget;
-+
-+export type FallbackPolicyMode = "fallback" | "wait";
-+
-+export type FallbackPolicy = {
-+  mode: FallbackPolicyMode;
-+};
-+
-+export type FallbackPolicyConfig = {
-+  defaultPolicyId?: string;
-+  policies?: Record<string, FallbackPolicy>;
-+  taskTypePolicyIds?: Record<string, string>;
-+};
-+
-+export type ProviderModelConcurrencyLimit =
-+  | number
-+  | Record<string, number>
-+  | ((metadata: TrafficRequestMetadata | undefined, key: string) => number | undefined);
-+
-+export type TenantConcurrencyLimit =
-+  | number
-+  | Record<string, number>
-+  | ((tenantId: string, metadata: TrafficRequestMetadata | undefined) => number | undefined);
-+
-+export type PriorityBurstLimits = Partial<Record<TrafficPriority, number>>;
-+
-+export type AdaptiveLimiterConfig = {
-+  windowMs?: number;
-+  threshold?: number;
-+  minPenaltyMs?: number;
-+  maxPenaltyMs?: number;
-+  penaltyMultiplier?: number;
-+  decayMs?: number;
-+};
-+
-+export interface TrafficRequest<TResponse = unknown> {
-+  tenantId: string;
-+  metadata?: TrafficRequestMetadata;
-+  execute: () => Promise<TResponse>;
-+  deadlineAt?: number;
-+  maxQueueWaitMs?: number;
-+  createFallbackRequest?: BivariantFunction<
-+    [target: FallbackChainEntry],
-+    TrafficRequest<TResponse> | undefined
-+  >;
-+  extractUsage?: BivariantFunction<
-+    [response: TResponse],
-+    Promise<UsageCounters> | UsageCounters | undefined
-+  >;
-+}
-+
-+export interface TrafficControllerOptions {
-+  maxConcurrent?: number;
-+  maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit;
-+  maxConcurrentPerTenant?: TenantConcurrencyLimit;
-+  rateLimits?: RateLimitConfig;
-+  priorityBurstLimits?: PriorityBurstLimits;
-+  adaptiveLimiter?: AdaptiveLimiterConfig;
-+  /**
-+   * Optional override for rate-limit key construction.
-+   * Useful when you need to add new metadata fields without changing core logic.
-+   */
-+  rateLimitKeyBuilder?: (metadata?: TrafficRequestMetadata) => string;
-+  /**
-+   * Optional retry policy overrides by provider/model.
-+   * Model keys can use the rate-limit key or provider::model.
-+   */
-+  retryPolicy?: RetryPolicyConfig;
-+  /**
-+   * Optional fallback policy selection by task type or explicit policy id.
-+   */
-+  fallbackPolicy?: FallbackPolicyConfig;
-+  /**
-+   * Select a rate-limit strategy by provider/model.
-+   * Example:
-+   *   { providers: { openai: "window" }, models: { "openai::gpt-4o": "window" } }
-+   */
-+  rateLimitStrategy?: RateLimitStrategyConfig;
-+  logger?: Logger;
-+  fallbackChains?: Record<string, FallbackChainEntry[]>;
-+}
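-+
-+// Minimal configuration sketch (illustrative values only):
-+//   getTrafficController({
-+//     maxConcurrent: 8,
-+//     rateLimits: { "openai::gpt-4o": { requestsPerMinute: 60, tokensPerMinute: 60_000 } },
-+//     rateLimitStrategy: { providers: { openai: "window" } },
-+//   });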
-+
-+export type RateLimitStrategyKind = "window" | "token-bucket";
-+
-+export type RateLimitStrategyConfig = {
-+  providers?: Record<string, RateLimitStrategyKind>;
-+  models?: Record<string, RateLimitStrategyKind>;
-+};
-+
-+export interface RateLimitOptions {
-+  requestsPerMinute: number;
-+  tokensPerMinute: number;
-+  burstSize?: number;
-+}
-+
-+export type RateLimitKey = string;
-+export type RateLimitConfig = Record<RateLimitKey, RateLimitOptions>;
-+
-+export type TenantUsage = {
-+  inputTokens: number;
-+  outputTokens: number;
-+  totalTokens: number;
-+};
-diff --git a/packages/core/src/traffic/traffic-usage-tracker.ts b/packages/core/src/traffic/traffic-usage-tracker.ts
-new file mode 100644
-index 00000000..c79b311a
---- /dev/null
-+++ b/packages/core/src/traffic/traffic-usage-tracker.ts
-@@ -0,0 +1,83 @@
-+import type { Logger } from "../logger";
-+import type { QueuedRequest } from "./traffic-controller-internal";
-+import { isPromiseLike } from "./traffic-error-utils";
-+import type { TenantUsage } from "./traffic-types";
-+
-+type UsageCounters = {
-+  inputTokens?: number;
-+  outputTokens?: number;
-+  totalTokens?: number;
-+};
-+
-+export class TrafficUsageTracker {
-+  private readonly tenantUsage = new Map<string, TenantUsage>();
-+
-+  getTenantUsage(tenantId: string): TenantUsage | undefined {
-+    const usage = this.tenantUsage.get(tenantId);
-+    return usage ? { ...usage } : undefined;
-+  }
-+
-+  recordUsage<TResponse>(
-+    item: QueuedRequest<TResponse>,
-+    result: TResponse,
-+    logger?: Logger,
-+  ): UsageCounters | Promise<UsageCounters> | undefined {
-+    const usageLogger = logger?.child({ module: "usage-tracker" });
-+    const extractor = item.extractUsage ?? item.request.extractUsage;
-+    if (!extractor) {
-+      usageLogger?.trace?.("No usage extractor; skipping usage", { tenantId: item.tenantId });
-+      return undefined;
-+    }
-+
-+    const usage = extractor(result);
-+    if (!usage) {
-+      usageLogger?.trace?.("Usage extractor returned empty; skipping usage", {
-+        tenantId: item.tenantId,
-+      });
-+      return undefined;
-+    }
-+
-+    if (isPromiseLike(usage)) {
-+      usageLogger?.trace?.("Usage extractor returned promise; awaiting", {
-+        tenantId: item.tenantId,
-+      });
-+      void usage.then((u) => u && this.incrementTenantUsage(item.tenantId, u, usageLogger));
-+      return usage;
-+    }
-+    this.incrementTenantUsage(item.tenantId, usage, usageLogger);
-+    return usage;
-+  }
-+
-+  private incrementTenantUsage(tenantId: string, usage: UsageCounters, logger?: Logger): void {
-+    const current = this.tenantUsage.get(tenantId) ?? {
-+      inputTokens: 0,
-+      outputTokens: 0,
-+      totalTokens: 0,
-+    };
-+
-+    const input =
-+      typeof usage.inputTokens === "number" && Number.isFinite(usage.inputTokens)
-+        ? usage.inputTokens
-+        : 0;
-+    const output =
-+      typeof usage.outputTokens === "number" && Number.isFinite(usage.outputTokens)
-+        ? usage.outputTokens
-+        : 0;
-+    const total =
-+      typeof usage.totalTokens === "number" && Number.isFinite(usage.totalTokens)
-+        ? usage.totalTokens
-+        : input + output;
-+
-+    this.tenantUsage.set(tenantId, {
-+      inputTokens: current.inputTokens + input,
-+      outputTokens: current.outputTokens + output,
-+      totalTokens: current.totalTokens + total,
-+    });
-+
-+    logger?.debug?.("Tenant usage incremented", {
-+      tenantId,
-+      delta: { inputTokens: input, outputTokens: output, totalTokens: total },
-+      total: this.tenantUsage.get(tenantId),
-+    });
-+  }
-+}
-diff --git a/packages/core/src/workflow/core.ts b/packages/core/src/workflow/core.ts
-index 3136511c..2b273d58 100644
---- a/packages/core/src/workflow/core.ts
-+++ b/packages/core/src/workflow/core.ts
-@@ -827,6 +827,9 @@ export function createWorkflow<
- 
-   // Wrap entire execution in root span
-   const rootSpan = traceContext.getRootSpan();
-+  if (options?.tenantId) {
-+    rootSpan.setAttribute("tenant.id", options.tenantId);
-+  }
- 
-   // Add workflow state snapshot for remote observability
-   const workflowState = {
-@@ -848,6 +851,7 @@
-     executionId,
-     userId: options?.userId,
-     conversationId: options?.conversationId,
-+    tenantId: options?.tenantId,
-     traceId: rootSpan.spanContext().traceId,
-     spanId: rootSpan.spanContext().spanId,
-   });
-diff --git a/packages/core/src/workflow/internal/state.ts b/packages/core/src/workflow/internal/state.ts
-index 71fa602d..2de12528 100644
---- a/packages/core/src/workflow/internal/state.ts
-+++ b/packages/core/src/workflow/internal/state.ts
-@@ -23,6 +23,7 @@ export type WorkflowState = {
-   executionId: string;
-   conversationId?: string;
-   userId?: string;
-+  tenantId?: string;
-   context?: UserContext;
-   active: number;
-   startAt: Date;
-@@ -132,6 +133,7 @@ class WorkflowStateManagerInternal implements WorkflowStateManager
-       active: config?.active ?? 0,
-       userId: config?.userId,
-       conversationId: config?.conversationId,
-+      tenantId: config?.tenantId,
-       context: config?.context,
-       startAt: new Date(),
-       endAt: null,
-diff --git a/packages/core/src/workflow/internal/utils.ts b/packages/core/src/workflow/internal/utils.ts
-index fc39530b..42250d82 100644
---- a/packages/core/src/workflow/internal/utils.ts
-+++ b/packages/core/src/workflow/internal/utils.ts
-@@ -32,6 +32,7 @@ export function convertWorkflowStateToParam(
-     executionId: state.executionId,
-     conversationId: state.conversationId,
-     userId: state.userId,
-+    tenantId: state.tenantId,
-     context: state.context,
-     active: state.active,
-     startAt: state.startAt,
-diff --git a/packages/core/src/workflow/steps/and-agent.ts b/packages/core/src/workflow/steps/and-agent.ts
-index bc46c148..14af9b8f 100644
---- a/packages/core/src/workflow/steps/and-agent.ts
-+++ b/packages/core/src/workflow/steps/and-agent.ts
-@@ -66,6 +66,7 @@ export function andAgent(
-           context: restConfig.context ?? state.context,
-           conversationId: restConfig.conversationId ?? state.conversationId,
-           userId: restConfig.userId ?? state.userId,
-+          tenantId: restConfig.tenantId ?? state.tenantId,
-           // No parentSpan when there's no workflow context
-         });
-         // Accumulate usage if available (no workflow context)
-@@ -92,6 +93,7 @@ export function andAgent(
-         context: restConfig.context ?? state.context,
-         conversationId: restConfig.conversationId ?? state.conversationId,
-         userId: restConfig.userId ?? state.userId,
-+        tenantId: restConfig.tenantId ?? state.tenantId,
-         // Pass the current step span as parent for proper span hierarchy
-         parentSpan: state.workflowContext?.currentStepSpan,
-       });
-diff --git a/packages/core/src/workflow/types.ts b/packages/core/src/workflow/types.ts
-index f7eed282..49bfd8cb 100644
---- a/packages/core/src/workflow/types.ts
-+++ b/packages/core/src/workflow/types.ts
-@@ -214,6 +214,10 @@ export interface WorkflowRunOptions {
-    * The conversation ID, this can be used to track the current conversation in a workflow
-    */
-   conversationId?: string;
-+  /**
-+   * Tenant identifier propagated to agent steps and subcalls
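-+   * (illustrative: passing { tenantId: "acme" } in WorkflowRunOptions tags every agent step)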
For example, "I don't know" or "I'm not sure" are noncommittal answers - -@@ -119,9 +120,11 @@ export function createAnswerRelevancyScorer< - const agent = new Agent({ - name: "question-generator", - model, -+ trafficPriority: "P2", - instructions: "You generate questions from answers to evaluate relevancy", - }); - -+ const tenantId = extractTenantId(context); - const payload = resolvePayload(context, buildPayload); - const questions: GeneratedQuestion[] = []; - -@@ -131,7 +134,7 @@ export function createAnswerRelevancyScorer< - payload.context, - ); - -- const response = await agent.generateObject(prompt, QUESTION_SCHEMA); -+ const response = await agent.generateObject(prompt, QUESTION_SCHEMA, { tenantId }); - questions.push({ - question: response.object.question, - noncommittal: response.object.noncommittal === 1, -diff --git a/packages/scorers/src/llm/classifiers.ts b/packages/scorers/src/llm/classifiers.ts -index 1bca4239..a327e20d 100644 ---- a/packages/scorers/src/llm/classifiers.ts -+++ b/packages/scorers/src/llm/classifiers.ts -@@ -7,6 +7,7 @@ import { - } from "@voltagent/core"; - import { safeStringify } from "@voltagent/internal/utils"; - import { z } from "zod"; -+import { extractTenantId } from "./utils"; - - type ChoiceId = string; - -@@ -93,11 +94,14 @@ async function evaluateChoice(args: EvaluateChoiceArgs): Promise - const agent = new Agent({ - name: `${scorerId}-judge`, - model, -+ trafficPriority: "P2", - instructions: judgeInstructions ?? buildDefaultChoiceInstructions(Object.keys(choices)), - }); - -+ const tenantId = extractTenantId(context); - const response = await agent.generateObject(prompt, CHOICE_RESPONSE_SCHEMA, { - maxOutputTokens, -+ tenantId, - }); - - const { choice, reason } = extractChoiceFromResponse(response.object, choices, scorerId); -diff --git a/packages/scorers/src/llm/context-precision.ts b/packages/scorers/src/llm/context-precision.ts -index d31b5b85..ba680f56 100644 ---- a/packages/scorers/src/llm/context-precision.ts -+++ b/packages/scorers/src/llm/context-precision.ts -@@ -7,6 +7,7 @@ import { - import { safeStringify } from "@voltagent/internal/utils"; - import type { LanguageModel } from "ai"; - import { z } from "zod"; -+import { extractTenantId } from "./utils"; - - const CONTEXT_PRECISION_PROMPT = `Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output. - -@@ -109,6 +110,7 @@ export function createContextPrecisionScorer< - const agent = new Agent({ - name: "context-precision-evaluator", - model, -+ trafficPriority: "P2", - instructions: "You evaluate if context was useful for arriving at the answer", - }); - -@@ -116,12 +118,15 @@ export function createContextPrecisionScorer< - const contextText = Array.isArray(payload.context) - ? 
payload.context.join("\n") - : payload.context; -+ const tenantId = extractTenantId(context); - - const prompt = CONTEXT_PRECISION_PROMPT.replace("{{question}}", payload.input) - .replace("{{context}}", contextText) - .replace("{{answer}}", payload.output); - -- const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA); -+ const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA, { -+ tenantId, -+ }); - - context.results.raw.contextPrecisionVerdict = response.object; - -diff --git a/packages/scorers/src/llm/context-recall.ts b/packages/scorers/src/llm/context-recall.ts -index e6e86510..2c6053fc 100644 ---- a/packages/scorers/src/llm/context-recall.ts -+++ b/packages/scorers/src/llm/context-recall.ts -@@ -7,6 +7,7 @@ import { - import { safeStringify } from "@voltagent/internal/utils"; - import type { LanguageModel } from "ai"; - import { z } from "zod"; -+import { extractTenantId } from "./utils"; - - const CONTEXT_RECALL_EXTRACT_PROMPT = `Given the context and ground truth (expected output), extract all factual statements from the ground truth. - -@@ -120,6 +121,7 @@ export function createContextRecallScorer< - const agent = new Agent({ - name: "context-recall-evaluator", - model, -+ trafficPriority: "P2", - instructions: "You evaluate how well provided context supports factual statements", - }); - -@@ -127,6 +129,7 @@ export function createContextRecallScorer< - const contextText = Array.isArray(payload.context) - ? payload.context.join("\n") - : payload.context; -+ const tenantId = extractTenantId(context); - - // Extract statements from expected output - const extractPrompt = CONTEXT_RECALL_EXTRACT_PROMPT.replace( -@@ -134,7 +137,9 @@ export function createContextRecallScorer< - contextText, - ).replace("{{expected}}", payload.expected); - -- const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA); -+ const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA, { -+ tenantId, -+ }); - const statements = extractResponse.object.statements; - - if (statements.length === 0) { -@@ -152,7 +157,9 @@ export function createContextRecallScorer< - contextText, - ).replace("{{statement}}", statement); - -- const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA); -+ const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA, { -+ tenantId, -+ }); - verdicts.push({ - statement, - verdict: verifyResponse.object.verdict, -diff --git a/packages/scorers/src/llm/context-relevancy.ts b/packages/scorers/src/llm/context-relevancy.ts -index ee882b5b..aca608b2 100644 ---- a/packages/scorers/src/llm/context-relevancy.ts -+++ b/packages/scorers/src/llm/context-relevancy.ts -@@ -7,6 +7,7 @@ import { - import { safeStringify } from "@voltagent/internal/utils"; - import type { LanguageModel } from "ai"; - import { z } from "zod"; -+import { extractTenantId } from "./utils"; - - const CONTEXT_RELEVANCY_PROMPT = `Analyze the provided context and identify which parts are relevant to answering the given question. For each context sentence or passage, determine its relevance level. - -@@ -144,6 +145,7 @@ export function createContextRelevancyScorer< - const agent = new Agent({ - name: "context-relevancy-evaluator", - model, -+ trafficPriority: "P2", - instructions: "You evaluate how relevant provided context is to answering questions", - }); - -@@ -151,13 +153,16 @@ export function createContextRelevancyScorer< - const contextText = Array.isArray(payload.context) - ? 
payload.context.join("\n") - : payload.context; -+ const tenantId = extractTenantId(context); - - const prompt = CONTEXT_RELEVANCY_PROMPT.replace("{{question}}", payload.input).replace( - "{{context}}", - contextText, - ); - -- const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA); -+ const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA, { -+ tenantId, -+ }); - const evaluations = response.object.evaluations; - - context.results.raw.contextRelevancyEvaluations = evaluations; -diff --git a/packages/scorers/src/llm/moderation.ts b/packages/scorers/src/llm/moderation.ts -index 03563bfe..1055927f 100644 ---- a/packages/scorers/src/llm/moderation.ts -+++ b/packages/scorers/src/llm/moderation.ts -@@ -7,6 +7,7 @@ import { - } from "@voltagent/core"; - import { safeStringify } from "@voltagent/internal/utils"; - import { z } from "zod"; -+import { extractTenantId } from "./utils"; - - export interface ModerationScorerOptions { - id?: string; -@@ -220,6 +221,7 @@ async function runModerationJudge(args: { - typeof context.results.prepare === "string" - ? context.results.prepare - : normalizeText(context.payload.output); -+ const tenantId = extractTenantId(context); - - const prompt = await buildPrompt({ - output: normalizedOutput, -@@ -232,12 +234,14 @@ async function runModerationJudge(args: { - const agent = new Agent({ - name: "moderation-judge", - model, -+ trafficPriority: "P2", - instructions: - "You are a safety classifier. Respond with JSON that matches the provided schema containing flagged, scores, and reason.", - }); - - const response = await agent.generateObject(prompt, MODERATION_SCHEMA, { - maxOutputTokens, -+ tenantId, - }); - - const parsed = mapModerationResponse(response.object, threshold); -diff --git a/packages/scorers/src/llm/utils.ts b/packages/scorers/src/llm/utils.ts -new file mode 100644 -index 00000000..75e886e3 ---- /dev/null -+++ b/packages/scorers/src/llm/utils.ts -@@ -0,0 +1,14 @@ -+import type { BuilderPrepareContext, BuilderScoreContext } from "@voltagent/core"; -+ -+type TenantAwareContext = BuilderScoreContext, Record> & -+ BuilderPrepareContext, Record>; -+ -+export function extractTenantId( -+ context: -+ | BuilderScoreContext, Record> -+ | BuilderPrepareContext, Record> -+ | TenantAwareContext, -+): string | undefined { -+ const candidate = (context.payload as { tenantId?: unknown })?.tenantId; -+ return typeof candidate === "string" ? 
-+  return typeof candidate === "string" ? candidate : undefined;
-+}
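-+// e.g. a scorer payload of { tenantId: "acme", input, output } (illustrative) attributes the
-+// judge call to tenant "acme"; missing or non-string ids resolve to undefined.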
error.message : "Unknown error", -+ traffic, - }; - } - } -@@ -153,6 +217,7 @@ export async function handleStreamText( - const options = processAgentOptions(body, signal); - - const result = await agent.streamText(input, options); -+ const traffic = extractTrafficMetadata(result); - - // Access the fullStream property - const { fullStream } = result; -@@ -178,7 +243,7 @@ export async function handleStreamText( - }, - }); - -- return new Response(stream, { -+ const response = new Response(stream, { - status: 200, - headers: { - "Content-Type": "text/event-stream", -@@ -186,20 +251,25 @@ export async function handleStreamText( - Connection: "keep-alive", - }, - }); -+ return wrapStreamWithTraffic(response, traffic); - } catch (error) { - logger.error("Failed to handle stream text request", { error }); - - const errorMessage = error instanceof Error ? error.message : "Unknown error"; -+ const traffic = extractTrafficMetadata(error); -+ const trafficHeaders = buildTrafficHeaders(traffic); - - return new Response( - safeStringify({ - error: errorMessage, - message: errorMessage, -+ traffic, - }), - { - status: 500, - headers: { - "Content-Type": "application/json", -+ ...trafficHeaders, - }, - }, - ); -@@ -238,26 +308,32 @@ export async function handleChatStream( - const options = processAgentOptions(body, signal); - - const result = await agent.streamText(input, options); -+ const traffic = extractTrafficMetadata(result); - - // Use the built-in toUIMessageStreamResponse - it handles errors properly -- return result.toUIMessageStreamResponse({ -+ const response = result.toUIMessageStreamResponse({ - sendReasoning: true, - sendSources: true, - }); -+ return wrapStreamWithTraffic(response, traffic); - } catch (error) { - logger.error("Failed to handle chat stream request", { error }); - - const errorMessage = error instanceof Error ? error.message : "Unknown error"; -+ const traffic = extractTrafficMetadata(error); -+ const trafficHeaders = buildTrafficHeaders(traffic); - - return new Response( - safeStringify({ - error: errorMessage, - message: errorMessage, -+ traffic, - }), - { - status: 500, - headers: { - "Content-Type": "application/json", -+ ...trafficHeaders, - }, - }, - ); -@@ -293,16 +369,20 @@ export async function handleGenerateObject( - ) as any; - - const result = await agent.generateObject(input, zodSchema, options); -+ const traffic = extractTrafficMetadata(result); - - return { - success: true, - data: result.object, -+ traffic, - }; - } catch (error) { - logger.error("Failed to generate object", { error }); -+ const traffic = extractTrafficMetadata(error); - return { - success: false, - error: error instanceof Error ? error.message : "Unknown error", -+ traffic, - }; - } - } -@@ -344,23 +424,29 @@ export async function handleStreamObject( - ) as any; - - const result = await agent.streamObject(input, zodSchema, options); -+ const traffic = extractTrafficMetadata(result); - - // Use the built-in toTextStreamResponse - it handles errors properly -- return result.toTextStreamResponse(); -+ const response = result.toTextStreamResponse(); -+ return wrapStreamWithTraffic(response, traffic); - } catch (error) { - logger.error("Failed to handle stream object request", { error }); - - const errorMessage = error instanceof Error ? 
error.message : "Unknown error"; -+ const traffic = extractTrafficMetadata(error); -+ const trafficHeaders = buildTrafficHeaders(traffic); - - return new Response( - safeStringify({ - error: errorMessage, - message: errorMessage, -+ traffic, - }), - { - status: 500, - headers: { - "Content-Type": "application/json", -+ ...trafficHeaders, - }, - }, - ); -diff --git a/packages/server-core/src/index.ts b/packages/server-core/src/index.ts -index 1fe7e206..2f7ed826 100644 ---- a/packages/server-core/src/index.ts -+++ b/packages/server-core/src/index.ts -@@ -40,6 +40,7 @@ export * from "./utils/server-utils"; - export * from "./utils/ui-templates"; - export * from "./utils/response-mappers"; - export * from "./utils/sse"; -+export * from "./utils/traffic"; - export * from "./utils/announcements"; - - // Export WebSocket utilities -diff --git a/packages/server-core/src/types/responses.ts b/packages/server-core/src/types/responses.ts -index 2098c2f6..4935a535 100644 ---- a/packages/server-core/src/types/responses.ts -+++ b/packages/server-core/src/types/responses.ts -@@ -1,10 +1,12 @@ - /** - * Framework-agnostic response types for server handlers - */ -+import type { TrafficResponseMetadata } from "@voltagent/core"; - - export interface SuccessResponse { - success: true; - data: T; -+ traffic?: TrafficResponseMetadata; - } - - export interface ErrorResponse { -@@ -13,6 +15,7 @@ export interface ErrorResponse { - httpStatus?: number; - code?: string; - name?: string; -+ traffic?: TrafficResponseMetadata; - } - - export type ApiResponse = SuccessResponse | ErrorResponse; -diff --git a/packages/server-core/src/utils/traffic.ts b/packages/server-core/src/utils/traffic.ts -new file mode 100644 -index 00000000..f9be1845 ---- /dev/null -+++ b/packages/server-core/src/utils/traffic.ts -@@ -0,0 +1,35 @@ -+import type { TrafficResponseMetadata } from "@voltagent/core"; -+ -+export function buildTrafficHeaders(traffic?: TrafficResponseMetadata): Record { -+ if (!traffic) return {}; -+ -+ const headers: Record = {}; -+ -+ if (typeof traffic.retryAfterMs === "number" && Number.isFinite(traffic.retryAfterMs)) { -+ headers["Retry-After"] = String(Math.max(0, Math.ceil(traffic.retryAfterMs / 1000))); -+ } -+ -+ if (traffic.rateLimitRemaining !== undefined) { -+ headers["X-RateLimit-Remaining"] = String(traffic.rateLimitRemaining); -+ } -+ -+ if (typeof traffic.rateLimitResetAt === "number" && Number.isFinite(traffic.rateLimitResetAt)) { -+ headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(traffic.rateLimitResetAt / 1000))); -+ } else if ( -+ typeof traffic.rateLimitResetInMs === "number" && -+ Number.isFinite(traffic.rateLimitResetInMs) -+ ) { -+ const resetAt = Date.now() + Math.max(0, traffic.rateLimitResetInMs); -+ headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(resetAt / 1000))); -+ } -+ -+ if (traffic.queueEtaMs !== undefined) { -+ headers["X-Queue-ETA"] = String(traffic.queueEtaMs); -+ } -+ -+ if (traffic.rateLimitKey) { -+ headers["X-RateLimit-Key"] = traffic.rateLimitKey; -+ } -+ -+ return headers; -+} -diff --git a/packages/server-hono/src/routes/index.ts b/packages/server-hono/src/routes/index.ts -index a5af8214..336a5bf4 100644 ---- a/packages/server-hono/src/routes/index.ts -+++ b/packages/server-hono/src/routes/index.ts -@@ -2,6 +2,7 @@ import type { ServerProviderDeps } from "@voltagent/core"; - import type { Logger } from "@voltagent/internal"; - import { - UPDATE_ROUTES, -+ buildTrafficHeaders, - handleCancelWorkflow, - handleChatStream, - handleCheckUpdates, -@@ -87,11 
+88,12 @@ export function registerAgentRoutes( - - const signal = c.req.raw.signal; - const response = await handleGenerateText(agentId, body, deps, logger, signal); -+ const trafficHeaders = buildTrafficHeaders(response.traffic); - if (!response.success) { - const { httpStatus, ...details } = response; -- return c.json(details, httpStatus || 500); -+ return c.json(details, httpStatus || 500, trafficHeaders); - } -- return c.json(response, 200); -+ return c.json(response, 200, trafficHeaders); - }); - - // POST /agents/:id/stream - Stream text (raw fullStream SSE) -@@ -131,11 +133,12 @@ export function registerAgentRoutes( - const body = await c.req.json(); - const signal = c.req.raw.signal; - const response = await handleGenerateObject(agentId, body, deps, logger, signal); -+ const trafficHeaders = buildTrafficHeaders(response.traffic); - if (!response.success) { - const { httpStatus, ...details } = response; -- return c.json(details, httpStatus || 500); -+ return c.json(details, httpStatus || 500, trafficHeaders); - } -- return c.json(response, 200); -+ return c.json(response, 200, trafficHeaders); - }); - - // POST /agents/:id/stream-object - Stream object -diff --git a/packages/serverless-hono/src/routes.ts b/packages/serverless-hono/src/routes.ts -index d377ce4b..39eabcf7 100644 ---- a/packages/serverless-hono/src/routes.ts -+++ b/packages/serverless-hono/src/routes.ts -@@ -28,6 +28,7 @@ import { - type TriggerHttpRequestContext, - UPDATE_ROUTES, - WORKFLOW_ROUTES, -+ buildTrafficHeaders, - executeA2ARequest, - executeTriggerHandler, - getConversationMessagesHandler, -@@ -165,7 +166,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger: - } - const signal = c.req.raw.signal; - const response = await handleGenerateText(agentId, body, deps, logger, signal); -- return c.json(response, response.success ? 200 : 500); -+ const trafficHeaders = buildTrafficHeaders(response.traffic); -+ return c.json(response, response.success ? 200 : 500, trafficHeaders); - }); - - app.post(AGENT_ROUTES.streamText.path, async (c) => { -@@ -197,7 +199,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger: - } - const signal = c.req.raw.signal; - const response = await handleGenerateObject(agentId, body, deps, logger, signal); -- return c.json(response, response.success ? 200 : 500); -+ const trafficHeaders = buildTrafficHeaders(response.traffic); -+ return c.json(response, response.success ? 
200 : 500, trafficHeaders); - }); - - app.post(AGENT_ROUTES.streamObject.path, async (c) => { -diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml -index 6675056e..244ce4d1 100644 ---- a/pnpm-lock.yaml -+++ b/pnpm-lock.yaml -@@ -37,7 +37,7 @@ importers: - version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) - '@nx/plugin': - specifier: 20.4.6 -- version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2) -+ version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2) - '@nx/vite': - specifier: 20.4.6 - version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2)(vite@7.2.7)(vitest@3.2.4) -@@ -92,6 +92,9 @@ importers: - syncpack: - specifier: ^13.0.2 - version: 13.0.4(typescript@5.9.2) -+ ts-node: -+ specifier: ^10.9.2 -+ version: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) - tslib: - specifier: ^2.3.0 - version: 2.8.1 -@@ -99,7 +102,7 @@ importers: - specifier: ^8.5.0 - version: 8.5.0(@swc/core@1.5.29)(typescript@5.9.2) - typescript: -- specifier: ^5.8.2 -+ specifier: ^5.9.2 - version: 5.9.2 - vite: - specifier: ^7.2.7 -@@ -2750,6 +2753,61 @@ importers: - specifier: ^0.5.3 - version: 0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7) - -+ examples/with-viteval/dist: -+ dependencies: -+ '@ai-sdk/openai': -+ specifier: ^2.0.52 -+ version: 2.0.85(zod@3.25.76) -+ '@voltagent/cli': -+ specifier: ^0.1.16 -+ version: link:../../../packages/cli -+ '@voltagent/core': -+ specifier: ^1.2.15 -+ version: link:../../../packages/core -+ '@voltagent/libsql': -+ specifier: ^1.0.13 -+ version: link:../../../packages/libsql -+ '@voltagent/logger': -+ specifier: ^1.0.4 -+ version: link:../../../packages/logger -+ '@voltagent/server-hono': -+ specifier: ^1.2.5 -+ version: link:../../../packages/server-hono -+ ai: -+ specifier: ^5.0.76 -+ version: 5.0.113(zod@3.25.76) -+ consola: -+ specifier: ^3.4.2 -+ version: 3.4.2 -+ envalid: -+ specifier: ^8.1.0 -+ version: 8.1.0 -+ yargs: -+ specifier: ^18.0.0 -+ version: 18.0.0 -+ zod: -+ specifier: ^3.25.76 -+ version: 3.25.76 -+ devDependencies: -+ '@tsconfig/node24': -+ specifier: ^24.0.1 -+ version: 24.0.1 -+ '@types/yargs': -+ specifier: ^17.0.33 -+ version: 17.0.33 -+ dotenv: -+ specifier: ^16.4.5 -+ version: 16.6.1 -+ tsx: -+ specifier: ^4.19.3 -+ version: 4.20.4 -+ typescript: -+ specifier: ^5.8.2 -+ version: 5.9.2 -+ viteval: -+ specifier: ^0.5.3 -+ version: 0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7) -+ - examples/with-voice-elevenlabs: - dependencies: - '@ai-sdk/openai': -@@ -3509,7 +3567,7 @@ importers: - version: 3.2.4(vitest@3.2.4) - jest: - specifier: ^29.5.0 -- version: 29.7.0(@types/node@24.2.1) -+ version: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) - ts-jest: - specifier: ^29.1.0 - version: 29.4.1(@babel/core@7.28.5)(esbuild@0.25.10)(jest@29.7.0)(typescript@5.9.2) -@@ -9966,7 +10024,7 @@ packages: - slash: 3.0.0 - dev: true - -- /@jest/core@29.7.0: -+ /@jest/core@29.7.0(ts-node@10.9.2): - resolution: {integrity: sha512-n7aeXWKMnGtDA48y8TLWJPJmLmmZ642Ceo78cYWEpiD7FzDgmNDV/GCVRorPABdXLJZ/9wzzgZAlHjXjxDHGsg==} - engines: 
{node: ^14.15.0 || ^16.10.0 || >=18.0.0} - peerDependencies: -@@ -9987,7 +10045,7 @@ packages: - exit: 0.1.2 - graceful-fs: 4.2.11 - jest-changed-files: 29.7.0 -- jest-config: 29.7.0(@types/node@24.6.2) -+ jest-config: 29.7.0(@types/node@24.6.2)(ts-node@10.9.2) - jest-haste-map: 29.7.0 - jest-message-util: 29.7.0 - jest-regex-util: 29.6.3 -@@ -12403,7 +12461,7 @@ packages: - - verdaccio - dev: true - -- /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2): -+ /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2): - resolution: {integrity: sha512-yZOZJOQFtpdY3Fu/WYNoDx81TwvF9yfwvalFpLD19bz+2YGl7B89l0S1ZrtSRXFfKXA/w7gb0gmKwthJtQhx9Q==} - dependencies: - '@jest/reporters': 29.7.0 -@@ -12412,7 +12470,7 @@ packages: - '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) - '@phenomnomnominal/tsquery': 5.0.1(typescript@5.9.2) - identity-obj-proxy: 3.0.0 -- jest-config: 29.7.0(@types/node@24.2.1) -+ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) - jest-resolve: 29.7.0 - jest-util: 29.7.0 - minimatch: 9.0.3 -@@ -12807,12 +12865,12 @@ packages: - dev: true - optional: true - -- /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2): -+ /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2): - resolution: {integrity: sha512-7Jlv+BVqGoO0BolQN7P5Z87160phuE1i7H6C8xFwQnlQ3ZfwQCJzk2dkg1UyzxDkWl6lvVsqBjZPXD55gFQ3+w==} - dependencies: - '@nx/devkit': 20.4.6(nx@20.8.2) - '@nx/eslint': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2) -- '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) -+ '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2) - '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) - tslib: 2.8.1 - transitivePeerDependencies: -@@ -17770,8 +17828,8 @@ packages: - '@babel/plugin-syntax-jsx': 7.27.1(@babel/core@7.28.5) - '@babel/plugin-syntax-typescript': 7.27.1(@babel/core@7.28.5) - '@babel/template': 7.27.2 -- '@babel/traverse': 7.28.4 -- '@babel/types': 7.28.4 -+ '@babel/traverse': 7.28.5 -+ '@babel/types': 7.28.5 - '@tanstack/react-router': 1.131.44(react-dom@19.2.3)(react@19.2.3) - '@tanstack/router-core': 1.131.44 - '@tanstack/router-generator': 1.131.44 -@@ -22783,7 +22841,7 @@ packages: - crc-32: 1.2.2 - readable-stream: 4.7.0 - -- /create-jest@29.7.0(@types/node@24.2.1): -+ /create-jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): - resolution: {integrity: sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==} - engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} - hasBin: true -@@ -22792,7 +22850,7 @@ packages: - chalk: 4.1.2 - exit: 0.1.2 - graceful-fs: 4.2.11 -- jest-config: 29.7.0(@types/node@24.2.1) -+ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) - jest-util: 29.7.0 - prompts: 2.4.2 - transitivePeerDependencies: -@@ -27641,7 +27699,7 @@ packages: - - supports-color - dev: true - -- /jest-cli@29.7.0(@types/node@24.2.1): -+ /jest-cli@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): - resolution: {integrity: 
sha512-OVVobw2IubN/GSYsxETi+gOe7Ka59EFMR/twOU3Jb2GnKKeMGJB5SGUUrEz3SFVmJASUdZUzy83sLNNQ2gZslg==} - engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} - hasBin: true -@@ -27651,14 +27709,14 @@ packages: - node-notifier: - optional: true - dependencies: -- '@jest/core': 29.7.0 -+ '@jest/core': 29.7.0(ts-node@10.9.2) - '@jest/test-result': 29.7.0 - '@jest/types': 29.6.3 - chalk: 4.1.2 -- create-jest: 29.7.0(@types/node@24.2.1) -+ create-jest: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) - exit: 0.1.2 - import-local: 3.2.0 -- jest-config: 29.7.0(@types/node@24.2.1) -+ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) - jest-util: 29.7.0 - jest-validate: 29.7.0 - yargs: 17.7.2 -@@ -27669,7 +27727,7 @@ packages: - - ts-node - dev: true - -- /jest-config@29.7.0(@types/node@24.2.1): -+ /jest-config@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): - resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} - engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} - peerDependencies: -@@ -27704,12 +27762,13 @@ packages: - pretty-format: 29.7.0 - slash: 3.0.0 - strip-json-comments: 3.1.1 -+ ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) - transitivePeerDependencies: - - babel-plugin-macros - - supports-color - dev: true - -- /jest-config@29.7.0(@types/node@24.6.2): -+ /jest-config@29.7.0(@types/node@24.6.2)(ts-node@10.9.2): - resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} - engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} - peerDependencies: -@@ -27744,6 +27803,7 @@ packages: - pretty-format: 29.7.0 - slash: 3.0.0 - strip-json-comments: 3.1.1 -+ ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) - transitivePeerDependencies: - - babel-plugin-macros - - supports-color -@@ -28041,7 +28101,7 @@ packages: - supports-color: 8.1.1 - dev: true - -- /jest@29.7.0(@types/node@24.2.1): -+ /jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): - resolution: {integrity: sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==} - engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} - hasBin: true -@@ -28051,10 +28111,10 @@ packages: - node-notifier: - optional: true - dependencies: -- '@jest/core': 29.7.0 -+ '@jest/core': 29.7.0(ts-node@10.9.2) - '@jest/types': 29.6.3 - import-local: 3.2.0 -- jest-cli: 29.7.0(@types/node@24.2.1) -+ jest-cli: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) - transitivePeerDependencies: - - '@types/node' - - babel-plugin-macros -@@ -36767,7 +36827,7 @@ packages: - esbuild: 0.25.10 - fast-json-stable-stringify: 2.1.0 - handlebars: 4.7.8 -- jest: 29.7.0(@types/node@24.2.1) -+ jest: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) - json5: 2.2.3 - lodash.memoize: 4.1.2 - make-error: 1.3.6 -diff --git a/tmp/test/traffic-concurrency.ts b/tmp/test/traffic-concurrency.ts -new file mode 100644 -index 00000000..d12fc5c9 ---- /dev/null -+++ b/tmp/test/traffic-concurrency.ts -@@ -0,0 +1,91 @@ -+// @ts-nocheck -+/** -+ * Manual test: TrafficController maxConcurrent scheduling. -+ * -+ * What to look for: -+ * - `inFlight` should never exceed `maxConcurrent`. -+ * - Requests should start in bursts up to `maxConcurrent`. 
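// Illustrative sketch (assumed names, not the TrafficController source): a
// minimal concurrency gate enforcing the invariant this test asserts, namely
// that `inFlight` never exceeds `maxConcurrent` while queued work starts in
// bursts as slots free up.
class ConcurrencyGate {
  private active = 0;
  private waiters: Array<() => void> = [];

  constructor(private readonly maxConcurrent: number) {}

  async run<T>(task: () => Promise<T>): Promise<T> {
    while (this.active >= this.maxConcurrent) {
      // Park until a running task releases a slot; re-check on wake.
      await new Promise<void>((resolve) => this.waiters.push(resolve));
    }
    this.active += 1;
    try {
      return await task();
    } finally {
      this.active -= 1;
      this.waiters.shift()?.(); // wake the next parked task, if any
    }
  }
}

// Usage: const gate = new ConcurrencyGate(3);
// await Promise.allSettled(jobs.map((job) => gate.run(job)));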
-+ * -+ * Run: -+ * - pnpm ts-node tmp/test/traffic-concurrency.ts -+ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-concurrency.ts (enable controller debug logs) -+ */ -+ -+import { safeStringify } from "@voltagent/internal"; -+import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; -+ -+const verbose = process.env.VERBOSE === "1"; -+if (!verbose) { -+ console.debug = () => {}; -+} -+ -+const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); -+const now = () => new Date().toISOString(); -+ -+const maxConcurrent = 3; -+const controller = getTrafficController({ maxConcurrent }); -+ -+let inFlight = 0; -+let maxObserved = 0; -+ -+function makeModel(id: string, durationMs: number) { -+ return { -+ specificationVersion: "v2", -+ provider: "sim", -+ modelId: `concurrency-${id}`, -+ doGenerate: async () => { -+ inFlight += 1; -+ maxObserved = Math.max(maxObserved, inFlight); -+ console.log(`[${now()}] start ${id} inFlight=${inFlight}`); -+ -+ try { -+ await sleep(durationMs); -+ return { -+ content: [{ type: "text", text: `ok:${id}` }], -+ finishReason: "stop", -+ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, -+ warnings: [], -+ response: { modelId: `concurrency-${id}`, headers: {} }, -+ }; -+ } finally { -+ inFlight -= 1; -+ console.log(`[${now()}] end ${id} inFlight=${inFlight}`); -+ } -+ }, -+ }; -+} -+ -+async function main() { -+ console.log(`\n=== TrafficController concurrency (maxConcurrent=${maxConcurrent}) ===`); -+ void controller; -+ -+ const agent = new Agent({ -+ name: "traffic-concurrency", -+ instructions: "echo", -+ model: makeModel("base", 0), -+ temperature: 0, -+ maxOutputTokens: 32, -+ }); -+ -+ const ids = ["A", "B", "C", "D", "E"]; -+ const jobs = ids.map((id) => -+ agent.generateText(id, { -+ tenantId: "default", -+ trafficPriority: "P1", -+ model: makeModel(id, 700), -+ }), -+ ); -+ -+ const settled = await Promise.allSettled(jobs); -+ console.log(`\n[done] maxObserved=${maxObserved}`); -+ console.log( -+ `[done] results=${safeStringify( -+ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), -+ )}`, -+ ); -+} -+ -+main().catch((error) => { -+ console.error("Fatal error:", error); -+ process.exit(1); -+}); -diff --git a/tmp/test/traffic-fallback-chain.ts b/tmp/test/traffic-fallback-chain.ts -new file mode 100644 -index 00000000..0cd77b2b ---- /dev/null -+++ b/tmp/test/traffic-fallback-chain.ts -@@ -0,0 +1,168 @@ -+// @ts-nocheck -+/** -+ * Manual test: TrafficController circuit breaker + fallback chains. -+ * -+ * Scenarios: -+ * - Test 1: Open primary circuit (via repeated 429s), then route to fallback1. -+ * - Test 2: Open fallback1 circuit, then route to fallback2 (success). -+ * - Test 3: No fallback configured → CircuitBreakerOpenError. 
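// Illustrative sketch of the breaker-plus-fallback routing these scenarios
// exercise. The cooldown and field names are hypothetical; only the observable
// behavior (repeated failures open a circuit at threshold=5, routing walks the
// chain, an exhausted chain surfaces CircuitBreakerOpenError) comes from the test.
type BreakerState = { failures: number; openUntil: number };

class FallbackRouter {
  private breakers = new Map<string, BreakerState>();

  constructor(
    private readonly chains: Record<string, string[]>,
    private readonly threshold = 5,
    private readonly cooldownMs = 30_000,
  ) {}

  recordFailure(model: string): void {
    const state = this.breakers.get(model) ?? { failures: 0, openUntil: 0 };
    state.failures += 1;
    if (state.failures >= this.threshold) {
      state.openUntil = Date.now() + this.cooldownMs; // open the circuit
    }
    this.breakers.set(model, state);
  }

  recordSuccess(model: string): void {
    this.breakers.set(model, { failures: 0, openUntil: 0 }); // reset/close
  }

  private isOpen(model: string): boolean {
    return (this.breakers.get(model)?.openUntil ?? 0) > Date.now();
  }

  // Walk primary -> fallbacks; return the first model whose circuit is closed,
  // or undefined when every candidate is open (the Test 3 error case).
  route(primary: string): string | undefined {
    return [primary, ...(this.chains[primary] ?? [])].find((m) => !this.isOpen(m));
  }
}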
-+ * -+ * Run: -+ * - pnpm ts-node tmp/test/traffic-fallback-chain.ts -+ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-fallback-chain.ts -+ */ -+ -+import { safeStringify } from "@voltagent/internal"; -+import { MockLanguageModelV2, MockProviderV2 } from "ai/test"; -+import { -+ Agent, -+ CircuitBreakerOpenError, -+ getTrafficController, -+} from "../../packages/core/dist/index.js"; -+ -+const verbose = process.env.VERBOSE === "1"; -+if (!verbose) { -+ console.debug = () => {}; -+} -+ -+const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); -+const now = () => new Date().toISOString(); -+ -+type ModelId = "primary" | "fallback1" | "fallback2" | "no-fallback"; -+ -+const provider = "test-provider"; -+ -+const controller = getTrafficController({ -+ maxConcurrent: 1, -+ fallbackChains: { -+ primary: ["fallback1", "fallback2"], -+ fallback1: ["fallback2"], -+ }, -+}); -+ -+function makeAlways429Model(modelId: ModelId) { -+ let attempts = 0; -+ return new MockLanguageModelV2({ -+ provider, -+ modelId, -+ doGenerate: async () => { -+ attempts += 1; -+ console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> 429`); -+ await sleep(25); -+ const err: any = new Error(`forced 429 for model=${modelId} attempt=${attempts}`); -+ err.status = 429; -+ throw err; -+ }, -+ }); -+} -+ -+function makeAlwaysOkModel(modelId: ModelId) { -+ let attempts = 0; -+ return new MockLanguageModelV2({ -+ provider, -+ modelId, -+ doGenerate: async () => { -+ attempts += 1; -+ console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> ok`); -+ await sleep(25); -+ return { -+ content: [{ type: "text", text: `ok:${modelId}` }], -+ finishReason: "stop", -+ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, -+ warnings: [], -+ response: { modelId, headers: {} }, -+ }; -+ }, -+ }); -+} -+ -+const primaryModel = makeAlways429Model("primary"); -+const fallback1Model = makeAlways429Model("fallback1"); -+const fallback2Model = makeAlwaysOkModel("fallback2"); -+const noFallbackModel = makeAlways429Model("no-fallback"); -+ -+// Required so Agent fallbacks (string model IDs) resolve without network calls. -+(globalThis as any).AI_SDK_DEFAULT_PROVIDER = new MockProviderV2({ -+ languageModels: { -+ primary: primaryModel, -+ fallback1: fallback1Model, -+ fallback2: fallback2Model, -+ "no-fallback": noFallbackModel, -+ }, -+}); -+ -+const primaryAgent = new Agent({ -+ name: "traffic-fallback-primary", -+ instructions: "echo", -+ model: primaryModel, -+ temperature: 0, -+ maxOutputTokens: 32, -+}); -+ -+const noFallbackAgent = new Agent({ -+ name: "traffic-fallback-none", -+ instructions: "echo", -+ model: noFallbackModel, -+ temperature: 0, -+ maxOutputTokens: 32, -+}); -+ -+async function runOnce(label: string, agent: any) { -+ console.log(`\n--- ${label} ---`); -+ try { -+ const result = await agent.generateText(label, { -+ tenantId: "default", -+ trafficPriority: "P1", -+ }); -+ console.log( -+ `[${label}] success text=${result.text} responseModel=${result.response?.modelId ?? "n/a"}`, -+ ); -+ } catch (err: any) { -+ if (err instanceof CircuitBreakerOpenError) { -+ console.log( -+ `[${label}] CircuitBreakerOpenError retryAfterMs=${err.retryAfterMs} msg=${err.message}`, -+ ); -+ } else { -+ console.log( -+ `[${label}] failed name=${err?.name ?? "Error"} status=${err?.status ?? err?.statusCode ?? 
"n/a"} msg=${err?.message}`, -+ ); -+ } -+ } -+} -+ -+async function main() { -+ console.log("\n=== Circuit breaker + fallback chain ==="); -+ void controller; -+ -+ console.log("\n[Test 1] Open primary circuit, then route to fallback1"); -+ // Two calls * (up to 3 retries each) ≈ 6 failures → should open the circuit (threshold=5). -+ await runOnce("primary-warmup-1", primaryAgent); -+ await runOnce("primary-warmup-2", primaryAgent); -+ await runOnce("primary-after-open", primaryAgent); // should execute fallback1 (still closed) -+ -+ console.log("\n[Test 2] Open fallback1 circuit, then route to fallback2"); -+ // Build enough failures on fallback1 by routing multiple requests to it via primary circuit-open path. -+ await runOnce("fallback1-warmup-1-via-primary", primaryAgent); -+ await runOnce("fallback1-warmup-2-via-primary", primaryAgent); -+ await runOnce("primary-should-hit-fallback2", primaryAgent); // should execute fallback2 and succeed -+ -+ console.log("\n[Test 3] No fallback configured → CircuitBreakerOpenError"); -+ await runOnce("no-fallback-warmup-1", noFallbackAgent); -+ await runOnce("no-fallback-warmup-2", noFallbackAgent); -+ await runOnce("no-fallback-after-open", noFallbackAgent); -+ -+ console.log("\n[debug] model call counts:"); -+ console.log( -+ safeStringify({ -+ primary: primaryModel.doGenerateCalls?.length, -+ fallback1: fallback1Model.doGenerateCalls?.length, -+ fallback2: fallback2Model.doGenerateCalls?.length, -+ "no-fallback": noFallbackModel.doGenerateCalls?.length, -+ }), -+ ); -+} -+ -+main().catch((error) => { -+ console.error("Fatal error:", error); -+ process.exit(1); -+}); -diff --git a/tmp/test/traffic-priority-openai-real.ts b/tmp/test/traffic-priority-openai-real.ts -new file mode 100644 -index 00000000..223263ba ---- /dev/null -+++ b/tmp/test/traffic-priority-openai-real.ts -@@ -0,0 +1,117 @@ -+// @ts-nocheck -+/** -+ * Manual test: TrafficController + AI SDK with real OpenAI calls. -+ * -+ * What this exercises: -+ * - Priority scheduling (P0/P1/P2) with `maxConcurrent=1` -+ * - Rate limit header ingestion via `updateRateLimitFromHeaders()` (if headers are present) -+ * - Tenant usage aggregation via `extractUsage` + `getTenantUsage()` -+ * -+ * Prereqs: -+ * - Set `OPENAI_API_KEY` -+ * -+ * Run: -+ * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts -+ * - VERBOSE=1 OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts -+ * -+ * Notes: -+ * - This will make real network calls and may incur cost. -+ */ -+ -+import { openai } from "@ai-sdk/openai"; -+import { safeStringify } from "@voltagent/internal"; -+import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; -+ -+const verbose = process.env.VERBOSE === "1"; -+if (!verbose) { -+ console.debug = () => {}; -+} -+ -+const apiKey = process.env.OPENAI_API_KEY; -+if (!apiKey) { -+ console.error("Missing OPENAI_API_KEY. Example:"); -+ console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts"); -+ process.exit(1); -+} -+ -+const _now = () => new Date().toISOString(); -+const preview = (value: unknown, max = 140) => { -+ if (typeof value !== "string") return String(value ?? ""); -+ return value.length > max ? `${value.slice(0, max)}…` : value; -+}; -+ -+const tenantId = process.env.TENANT_ID ?? "openai-real"; -+const defaultModelId = process.env.OPENAI_MODEL ?? 
"gpt-4o-mini"; -+ -+const controller = getTrafficController({ maxConcurrent: 1 }); -+ -+function getHeader(headers: any, name: string): string | undefined { -+ if (!headers) return undefined; -+ if (typeof headers.get === "function") { -+ const v = headers.get(name); -+ return v === null || v === undefined ? undefined : String(v); -+ } -+ const key = Object.keys(headers).find((k) => k.toLowerCase() === name.toLowerCase()); -+ if (!key) return undefined; -+ const v = headers[key]; -+ return v === null || v === undefined ? undefined : String(Array.isArray(v) ? v[0] : v); -+} -+ -+async function main() { -+ console.log( -+ `\n=== OpenAI real: priority scheduling (tenantId=${tenantId}, model=${defaultModelId}) ===`, -+ ); -+ void controller; -+ -+ const agent = new Agent({ -+ name: "openai-real-traffic", -+ instructions: "Reply exactly with the requested token.", -+ model: openai(defaultModelId), -+ temperature: 0, -+ maxOutputTokens: 32, -+ }); -+ -+ // Enqueue in reverse priority order; controller should still execute P0 first. -+ const p2 = agent.generateText("Reply with only: P2", { tenantId, trafficPriority: "P2" }); -+ const p1 = agent.generateText("Reply with only: P1", { tenantId, trafficPriority: "P1" }); -+ const p0 = agent.generateText("Reply with only: P0", { tenantId, trafficPriority: "P0" }); -+ -+ const settled = await Promise.allSettled([p0, p1, p2]); -+ for (const result of settled) { -+ if (result.status !== "fulfilled") { -+ console.log(`[result] rejected=${result.reason?.message ?? String(result.reason)}`); -+ continue; -+ } -+ -+ const headers = result.value.response?.headers; -+ const limit = getHeader(headers, "x-ratelimit-limit-requests"); -+ const remaining = getHeader(headers, "x-ratelimit-remaining-requests"); -+ const reset = getHeader(headers, "x-ratelimit-reset-requests"); -+ -+ console.log( -+ `[result] text=${preview(result.value.text)} finishReason=${result.value.finishReason} usage=${safeStringify(result.value.usage)}`, -+ ); -+ console.log( -+ `[result] ratelimitHeaders=${safeStringify({ -+ limit, -+ remaining, -+ reset, -+ })}`, -+ ); -+ } -+ -+ console.log( -+ `\n[done] settled=${safeStringify( -+ settled.map((s) => (s.status === "fulfilled" ? preview(s.value.text) : s.reason?.message)), -+ )}`, -+ ); -+ -+ console.log( -+ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, -+ ); -+} -+ -+main().catch((error) => { -+ console.error("Fatal error:", error); -+ process.exit(1); -+}); -diff --git a/tmp/test/traffic-priority-openai-sim.ts b/tmp/test/traffic-priority-openai-sim.ts -new file mode 100644 -index 00000000..9d36a7d1 ---- /dev/null -+++ b/tmp/test/traffic-priority-openai-sim.ts -@@ -0,0 +1,114 @@ -+// @ts-nocheck -+/** -+ * Manual test: Agent → TrafficController priority scheduling (OpenAI-like stub models). -+ * -+ * This keeps the Agent + AI SDK path, but avoids real network calls by using stub models -+ * that pretend to be `provider="openai"` with modelIds like `gpt-4o`/`gpt-4o-mini`. -+ * -+ * Scenarios: -+ * - Test 1: P0 runs before P1/P2 when all runnable. -+ * - Test 2: P0 request (gpt-4o) is rate-limited → P1 (gpt-4o-mini) proceeds. -+ * -+ * Note: -+ * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s. 
-+ * -+ * Run: -+ * - pnpm ts-node tmp/test/traffic-priority-openai-sim.ts -+ */ -+ -+import { safeStringify } from "@voltagent/internal"; -+import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; -+ -+const verbose = process.env.VERBOSE === "1"; -+if (!verbose) { -+ console.debug = () => {}; -+} -+ -+const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); -+const now = () => new Date().toISOString(); -+ -+function makeOpenAIStubModel(modelId: string, delayMs: number) { -+ let calls = 0; -+ return { -+ specificationVersion: "v2", -+ provider: "openai", -+ modelId, -+ doGenerate: async () => { -+ calls += 1; -+ console.log(`[${now()}] [model] ${modelId} doGenerate call=${calls}`); -+ await sleep(delayMs); -+ return { -+ content: [{ type: "text", text: `ok:${modelId}` }], -+ finishReason: "stop", -+ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, -+ warnings: [], -+ response: { modelId, headers: {} }, -+ }; -+ }, -+ }; -+} -+ -+const controller = getTrafficController({ maxConcurrent: 1 }); -+ -+const modelMini = makeOpenAIStubModel("gpt-4o-mini", 80); -+const modelBig = makeOpenAIStubModel("gpt-4o", 80); -+ -+const agent = new Agent({ -+ name: "priority-openai-sim", -+ instructions: "echo", -+ model: modelMini, -+ temperature: 0, -+ maxOutputTokens: 32, -+}); -+ -+async function test1_priorityOrder() { -+ console.log("\n=== Test 1: P0 ordering via Agent ==="); -+ -+ const p2 = agent.generateText("P2", { trafficPriority: "P2", tenantId: "sim" }); -+ const p1 = agent.generateText("P1", { trafficPriority: "P1", tenantId: "sim" }); -+ const p0 = agent.generateText("P0", { trafficPriority: "P0", tenantId: "sim" }); -+ -+ const results = await Promise.all([p0, p1, p2]); -+ console.log(`[Test 1] results=${safeStringify(results.map((r) => r.text))}`); -+} -+ -+async function test2_p1RunsWhenP0RateLimited() { -+ console.log("\n=== Test 2: P1 proceeds when P0 is rate-limited ==="); -+ -+ // Seed remaining=0 for openai::gpt-4o so the P0 head item initially waits. -+ const applied = controller.updateRateLimitFromHeaders( -+ { provider: "openai", model: "gpt-4o" }, -+ { -+ "x-ratelimit-limit-requests": "1", -+ "x-ratelimit-remaining-requests": "0", -+ "x-ratelimit-reset-requests": "1s", -+ }, -+ ); -+ console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`); -+ -+ const p0Blocked = agent.generateText("P0 (gpt-4o, rate-limited)", { -+ trafficPriority: "P0", -+ tenantId: "sim", -+ model: modelBig, // per-call model override (new in this branch) -+ }); -+ -+ const p1Free = agent.generateText("P1 (gpt-4o-mini)", { -+ trafficPriority: "P1", -+ tenantId: "sim", -+ model: modelMini, -+ }); -+ -+ const [r0, r1] = await Promise.all([p0Blocked, p1Free]); -+ console.log(`[Test 2] p0 text=${r0.text}`); -+ console.log(`[Test 2] p1 text=${r1.text}`); -+} -+ -+async function main() { -+ await test1_priorityOrder(); -+ await test2_p1RunsWhenP0RateLimited(); -+} -+ -+main().catch((error) => { -+ console.error("Fatal error:", error); -+ process.exit(1); -+}); -diff --git a/tmp/test/traffic-priority.ts b/tmp/test/traffic-priority.ts -new file mode 100644 -index 00000000..409e1078 ---- /dev/null -+++ b/tmp/test/traffic-priority.ts -@@ -0,0 +1,159 @@ -+// @ts-nocheck -+/** -+ * Manual test: TrafficController priority scheduling. -+ * -+ * Scenarios: -+ * - Test 1: P0 should run before P1/P2 when runnable. -+ * - Test 2: If a P0 request is rate-limited, a lower priority (P1) can proceed. 
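// Sketch of the selection rule Test 2 relies on (illustrative shape, not the
// controller's internal queue type): order by priority, but skip entries whose
// provider::model key is currently rate-limited so a blocked P0 head cannot
// starve runnable P1/P2 work.
type Pending = { priority: "P0" | "P1" | "P2"; key: string };

function pickNext(
  queue: Pending[],
  isRateLimited: (key: string) => boolean,
): Pending | undefined {
  const order = { P0: 0, P1: 1, P2: 2 } as const;
  const byPriority = [...queue].sort((a, b) => order[a.priority] - order[b.priority]);
  // First runnable item wins; a rate-limited head stays queued for later.
  return byPriority.find((item) => !isRateLimited(item.key));
}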
-+ * -+ * Note: -+ * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s. -+ * -+ * Run: -+ * - pnpm ts-node tmp/test/traffic-priority.ts -+ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-priority.ts -+ */ -+ -+import { safeStringify } from "@voltagent/internal"; -+import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; -+ -+const verbose = process.env.VERBOSE === "1"; -+if (!verbose) { -+ console.debug = () => {}; -+} -+ -+const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); -+const now = () => new Date().toISOString(); -+ -+const controller = getTrafficController({ maxConcurrent: 1 }); -+ -+function extractLabel(prompt: any): string { -+ if (!Array.isArray(prompt)) { -+ return "unknown"; -+ } -+ -+ for (let index = prompt.length - 1; index >= 0; index -= 1) { -+ const message = prompt[index]; -+ if (!message || message.role !== "user" || !Array.isArray(message.content)) { -+ continue; -+ } -+ -+ const textPart = message.content.find((part: any) => part?.type === "text"); -+ if (textPart?.text) { -+ return String(textPart.text); -+ } -+ } -+ -+ return "unknown"; -+} -+ -+function makeModel(provider: string, modelId: string, delayMs = 50) { -+ let calls = 0; -+ let lastStartAt = 0; -+ -+ return { -+ specificationVersion: "v2", -+ provider, -+ modelId, -+ doGenerate: async (options: any) => { -+ calls += 1; -+ const startAt = Date.now(); -+ const delta = lastStartAt ? startAt - lastStartAt : 0; -+ lastStartAt = startAt; -+ -+ const label = extractLabel(options?.prompt); -+ console.log( -+ `[${now()}] doGenerate start model=${provider}::${modelId} call=${calls} (+${delta}ms) input=${label}`, -+ ); -+ await sleep(delayMs); -+ console.log(`[${now()}] doGenerate end model=${provider}::${modelId} input=${label}`); -+ -+ return { -+ content: [{ type: "text", text: `ok:${label}` }], -+ finishReason: "stop", -+ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, -+ warnings: [], -+ response: { modelId, headers: {} }, -+ }; -+ }, -+ }; -+} -+ -+async function test1_priorityOrder() { -+ console.log("\n=== Test 1: priority order (P0 before P1/P2) ==="); -+ -+ const sharedModel = makeModel("p", "shared-model", 50); -+ const agent = new Agent({ -+ name: "traffic-priority", -+ instructions: "echo", -+ model: sharedModel, -+ temperature: 0, -+ maxOutputTokens: 32, -+ }); -+ -+ // Enqueue in reverse order; scheduler should still run P0 first. -+ const p2 = agent.generateText("P2", { tenantId: "default", trafficPriority: "P2" }); -+ const p1 = agent.generateText("P1", { tenantId: "default", trafficPriority: "P1" }); -+ const p0 = agent.generateText("P0", { tenantId: "default", trafficPriority: "P0" }); -+ -+ const settled = await Promise.allSettled([p0, p1, p2]); -+ console.log( -+ `[Test 1] results=${safeStringify( -+ settled.map((s) => (s.status === "fulfilled" ? 
s.value.text : s.reason?.message)), -+ )}`, -+ ); -+} -+ -+async function test2_lowerPriorityWhenP0RateLimited() { -+ console.log("\n=== Test 2: P1 proceeds when P0 rate-limited ==="); -+ -+ const applied = controller.updateRateLimitFromHeaders( -+ { provider: "p0", model: "m0" }, -+ { -+ "x-ratelimit-limit-requests": "1", -+ "x-ratelimit-remaining-requests": "0", -+ "x-ratelimit-reset-requests": "1s", -+ }, -+ ); -+ console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`); -+ -+ const modelP0 = makeModel("p0", "m0", 50); -+ const modelP1 = makeModel("p1", "m1", 50); -+ const agent = new Agent({ -+ name: "traffic-priority-rate-limit", -+ instructions: "echo", -+ model: modelP1, -+ temperature: 0, -+ maxOutputTokens: 32, -+ }); -+ -+ // Now the next P0 request is at the head of the queue but rate-limited, -+ // so a runnable P1 request should execute first. -+ const p0Blocked = agent.generateText("P0-blocked (rate limited)", { -+ tenantId: "default", -+ trafficPriority: "P0", -+ model: modelP0, -+ }); -+ const p1Free = agent.generateText("P1-free (should run first)", { -+ tenantId: "default", -+ trafficPriority: "P1", -+ model: modelP1, -+ }); -+ -+ const settled = await Promise.allSettled([p0Blocked, p1Free]); -+ console.log( -+ `[Test 2] results=${safeStringify( -+ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), -+ )}`, -+ ); -+} -+ -+async function main() { -+ await test1_priorityOrder(); -+ await test2_lowerPriorityWhenP0RateLimited(); -+} -+ -+main().catch((error) => { -+ console.error("Fatal error:", error); -+ process.exit(1); -+}); -diff --git a/tmp/test/traffic-rate-limit-from-headers.ts b/tmp/test/traffic-rate-limit-from-headers.ts -new file mode 100644 -index 00000000..d8262661 ---- /dev/null -+++ b/tmp/test/traffic-rate-limit-from-headers.ts -@@ -0,0 +1,158 @@ -+// @ts-nocheck -+/** -+ * Manual test: TrafficController dynamic rate limits from OpenAI response headers. -+ * -+ * This hits the real OpenAI model via Agent + AI SDK, and relies on the -+ * `x-ratelimit-*` response headers to seed/update the TrafficController. -+ * -+ * What to look for: -+ * - Each request prints the observed `x-ratelimit-*` headers (if present). -+ * - Agent should also log: "[Traffic] Applied rate limit from response headers". -+ * - With enough parallel requests, some requests may take longer due to controller throttling. -+ * -+ * Prereqs: -+ * - Set `OPENAI_API_KEY` -+ * -+ * Optional env: -+ * - `OPENAI_MODEL` (default: gpt-4o-mini) -+ * - `REQUESTS` (default: 10) -+ * - `MAX_CONCURRENT` (default: 50) -+ * - `TENANT_ID` (default: openai-rate-limit-headers) -+ * -+ * Run: -+ * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts -+ * - VERBOSE=1 OPENAI_API_KEY=... REQUESTS=30 pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts -+ */ -+ -+import { openai } from "@ai-sdk/openai"; -+import { safeStringify } from "@voltagent/internal"; -+import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; -+ -+const verbose = process.env.VERBOSE === "1"; -+if (!verbose) { -+ console.debug = () => {}; -+} -+ -+const now = () => new Date().toISOString(); -+ -+const apiKey = process.env.OPENAI_API_KEY; -+if (!apiKey) { -+ console.error("Missing OPENAI_API_KEY. Example:"); -+ console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts"); -+ process.exit(1); -+} -+ -+const provider = "openai"; -+const modelId = process.env.OPENAI_MODEL ?? 
"gpt-4o-mini"; -+const tenantId = process.env.TENANT_ID ?? "openai-rate-limit-headers"; -+const requestCountRaw = Number(process.env.REQUESTS ?? "10"); -+const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50"); -+const requestCount = Number.isFinite(requestCountRaw) && requestCountRaw > 0 ? requestCountRaw : 10; -+const maxConcurrent = -+ Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? maxConcurrentRaw : 50; -+ -+const key = `${provider}::${modelId}`; -+const controller = getTrafficController({ maxConcurrent }); -+ -+function getHeader(headers: any, name: string): string | undefined { -+ if (!headers) return undefined; -+ if (typeof headers.get === "function") { -+ const v = headers.get(name); -+ return v === null || v === undefined ? undefined : String(v); -+ } -+ -+ const entries = Object.entries(headers as Record); -+ const target = name.toLowerCase(); -+ const match = entries.find(([k]) => String(k).toLowerCase() === target); -+ if (!match) return undefined; -+ -+ const value = match[1]; -+ if (Array.isArray(value)) { -+ const first = value[0]; -+ return first === null || first === undefined ? undefined : String(first); -+ } -+ -+ return value === null || value === undefined ? undefined : String(value); -+} -+ -+async function main() { -+ console.log( -+ `\n=== OpenAI rate limit headers → TrafficController (${key}, maxConcurrent=${maxConcurrent}, requests=${requestCount}) ===`, -+ ); -+ void controller; -+ -+ const agent = new Agent({ -+ name: "openai-rate-limit-from-headers", -+ instructions: "Reply with only the requested token.", -+ model: openai(modelId), -+ temperature: 0, -+ maxOutputTokens: 32, -+ }); -+ -+ console.log("\n[seed] Making one request to capture headers..."); -+ const seedStartedAt = Date.now(); -+ const seed = await agent.generateText("Reply with only: seed", { -+ tenantId, -+ trafficPriority: "P1", -+ }); -+ const seedElapsedMs = Date.now() - seedStartedAt; -+ -+ const seedHeaders = seed.response?.headers; -+ console.log(`[seed] done in ${seedElapsedMs}ms text=${seed.text}`); -+ console.log( -+ `[seed] x-ratelimit-*=${safeStringify({ -+ limit: getHeader(seedHeaders, "x-ratelimit-limit-requests"), -+ remaining: getHeader(seedHeaders, "x-ratelimit-remaining-requests"), -+ reset: getHeader(seedHeaders, "x-ratelimit-reset-requests"), -+ })}`, -+ ); -+ -+ console.log(`\n[burst] Scheduling ${requestCount} parallel requests...`); -+ const jobs = Array.from({ length: requestCount }, (_, idx) => { -+ const label = `req-${idx + 1}`; -+ const enqueuedAt = Date.now(); -+ console.log(`[${now()}] enqueue ${label}`); -+ -+ return agent -+ .generateText(`Reply with only: ${label}`, { tenantId, trafficPriority: "P1" }) -+ .then((result) => { -+ const elapsedMs = Date.now() - enqueuedAt; -+ const headers = result.response?.headers; -+ console.log( -+ `[${now()}] done ${label} in ${elapsedMs}ms text=${result.text} x-ratelimit-remaining=${getHeader( -+ headers, -+ "x-ratelimit-remaining-requests", -+ )}`, -+ ); -+ return { -+ label, -+ elapsedMs, -+ text: result.text, -+ headers: { -+ limit: getHeader(headers, "x-ratelimit-limit-requests"), -+ remaining: getHeader(headers, "x-ratelimit-remaining-requests"), -+ reset: getHeader(headers, "x-ratelimit-reset-requests"), -+ }, -+ }; -+ }) -+ .catch((error) => { -+ const elapsedMs = Date.now() - enqueuedAt; -+ console.log( -+ `[${now()}] failed ${label} in ${elapsedMs}ms name=${error?.name ?? "Error"} status=${error?.status ?? error?.statusCode ?? 
"n/a"} msg=${error?.message}`, -+ ); -+ throw error; -+ }); -+ }); -+ -+ const settled = await Promise.allSettled(jobs); -+ -+ console.log(`\n[done] settled=${safeStringify(settled.map((s) => s.status))}`); -+ console.log( -+ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, -+ ); -+} -+ -+main().catch((error) => { -+ console.error("Fatal error:", error); -+ process.exit(1); -+}); -diff --git a/tmp/test/traffic-rate-limit-openai-window-sim.ts b/tmp/test/traffic-rate-limit-openai-window-sim.ts -new file mode 100644 -index 00000000..35232faa ---- /dev/null -+++ b/tmp/test/traffic-rate-limit-openai-window-sim.ts -@@ -0,0 +1,247 @@ -+// @ts-nocheck -+/** -+ * Manual test (real network): Simulate OpenAI "window remaining + reset" semantics and watch -+ * TrafficController pace + probe behavior via logs. -+ * -+ * Why "simulate"? -+ * - Real OpenAI headers usually show very large remaining values, so pacing is hard to observe. -+ * - This script still hits the real OpenAI model, but it drives the controller state using -+ * synthetic `x-ratelimit-*` headers to force a small window (e.g. remaining=3, reset=30s). -+ * -+ * What this demonstrates (matches your Step 1–7): -+ * 1) We seed controller with remaining + reset window. -+ * 2) We enqueue many requests. -+ * 3) Controller subtracts `reserved` from `remaining` to avoid stampedes. -+ * 4) When `effectiveRemaining <= 1`, controller waits until `resetAt + probeDelay`. -+ * 5) When room exists, controller paces using `nextAllowedAt`. -+ * 6) When a request finishes, we release reservation (controller) and apply new headers (this script). -+ * 7) After reset, controller sends a probe even when remaining==0; probe "fetches" fresh headers and flow resumes. -+ * -+ * Prereqs: -+ * - Set `OPENAI_API_KEY` -+ * -+ * Suggested logging: -+ * - `VOLTAGENT_LOG_LEVEL=trace` (to see traffic controller internals) -+ * -+ * Run: -+ * - VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts -+ * -+ * Optional env: -+ * - OPENAI_MODEL (default: gpt-4o-mini) -+ * - WINDOW_SECONDS (default: 30) -+ * - REMAINING (default: 3) -+ * - REQUESTS (default: 10) -+ * - MAX_CONCURRENT (default: 50) -+ */ -+ -+import { safeStringify } from "@voltagent/internal"; -+import { TrafficController } from "../../packages/core/dist/index.js"; -+ -+const apiKey = process.env.OPENAI_API_KEY; -+if (!apiKey) { -+ console.error("Missing OPENAI_API_KEY. Example:"); -+ console.error( -+ " VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts", -+ ); -+ process.exit(1); -+} -+ -+const now = () => new Date().toISOString(); -+ -+const modelId = process.env.OPENAI_MODEL ?? "gpt-4o-mini"; -+const windowSecondsRaw = Number(process.env.WINDOW_SECONDS ?? "30"); -+const remainingRaw = Number(process.env.REMAINING ?? "3"); -+const requestsRaw = Number(process.env.REQUESTS ?? "10"); -+const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50"); -+ -+const windowSeconds = -+ Number.isFinite(windowSecondsRaw) && windowSecondsRaw > 0 ? windowSecondsRaw : 30; -+const initialRemaining = -+ Number.isFinite(remainingRaw) && remainingRaw > 0 ? Math.floor(remainingRaw) : 3; -+const requestCount = Number.isFinite(requestsRaw) && requestsRaw > 0 ? Math.floor(requestsRaw) : 10; -+const maxConcurrent = -+ Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? 
Math.floor(maxConcurrentRaw) : 50; -+ -+const provider = "openai"; -+const tenantId = "openai-window-sim"; -+const windowMs = Math.round(windowSeconds * 1000); -+ -+async function callOpenAIResponses(label: string): Promise<{ -+ status: number; -+ headers: Record; -+ textPreview: string; -+}> { -+ const url = "https://api.openai.com/v1/responses"; -+ const body = safeStringify({ -+ model: modelId, -+ input: `Reply with only: ${label}`, -+ max_output_tokens: 16, -+ }); -+ -+ const startedAt = Date.now(); -+ const res = await fetch(url, { -+ method: "POST", -+ headers: { -+ authorization: `Bearer ${apiKey}`, -+ "content-type": "application/json", -+ }, -+ body, -+ }); -+ -+ const limit = res.headers.get("x-ratelimit-limit-requests") ?? undefined; -+ const remaining = res.headers.get("x-ratelimit-remaining-requests") ?? undefined; -+ const reset = res.headers.get("x-ratelimit-reset-requests") ?? undefined; -+ -+ if (!res.ok) { -+ const text = await res.text().catch(() => ""); -+ throw new Error( -+ `OpenAI error status=${res.status} elapsedMs=${Date.now() - startedAt} body=${text.slice(0, 280)}`, -+ ); -+ } -+ -+ const data: any = await res.json(); -+ const outputText = -+ data?.output?.[0]?.content?.find?.((c: any) => c?.type === "output_text")?.text ?? -+ data?.output_text ?? -+ data?.output?.[0]?.content?.[0]?.text ?? -+ ""; -+ -+ return { -+ status: res.status, -+ headers: { -+ "x-ratelimit-limit-requests": limit, -+ "x-ratelimit-remaining-requests": remaining, -+ "x-ratelimit-reset-requests": reset, -+ }, -+ textPreview: String(outputText).slice(0, 80), -+ }; -+} -+ -+async function main() { -+ console.log( -+ `\n=== OpenAI real + synthetic window rate limit (provider=${provider}, model=${modelId}) ===`, -+ ); -+ console.log( -+ `[config] maxConcurrent=${maxConcurrent} windowSeconds=${windowSeconds} initialRemaining=${initialRemaining} requests=${requestCount}`, -+ ); -+ console.log( -+ "[hint] Set VOLTAGENT_LOG_LEVEL=trace to see TrafficController internals (reserved/effectiveRemaining/nextAllowedAt).", -+ ); -+ -+ const controller = new TrafficController({ maxConcurrent }); -+ -+ // --- Step 1: seed "remaining + reset window" into controller --- -+ let windowResetAt = Date.now() + windowMs; -+ let remainingInWindow = initialRemaining; -+ -+ const applySyntheticHeaders = (source: string) => { -+ const resetMs = Math.max(1, windowResetAt - Date.now()); -+ const applied = controller.updateRateLimitFromHeaders( -+ { provider, model: modelId, tenantId }, -+ { -+ "x-ratelimit-limit-requests": String(initialRemaining), -+ "x-ratelimit-remaining-requests": String(Math.max(0, remainingInWindow)), -+ "x-ratelimit-reset-requests": `${resetMs}ms`, -+ }, -+ ); -+ console.log( -+ `[${now()}] [synthetic] source=${source} remaining=${remainingInWindow} resetInMs=${resetMs} applied=${safeStringify( -+ applied && { -+ key: applied.key, -+ state: { -+ remaining: applied.state.remaining, -+ reserved: applied.state.reserved, -+ resetAt: applied.state.resetAt, -+ nextAllowedAt: applied.state.nextAllowedAt, -+ }, -+ }, -+ )}`, -+ ); -+ }; -+ -+ applySyntheticHeaders("seed"); -+ -+ console.log("\n[seed] Making one real request to confirm connectivity + show real headers..."); -+ const seed = await callOpenAIResponses("seed"); -+ console.log( -+ `[${now()}] [seed] ok status=${seed.status} text=${seed.textPreview} realHeaders=${safeStringify( -+ seed.headers, -+ )}`, -+ ); -+ -+ console.log(`\n[burst] Enqueueing ${requestCount} controller-managed requests...`); -+ -+ const jobs = Array.from({ length: 
requestCount }, (_, index) => { -+ const label = `req-${index + 1}`; -+ const enqueuedAt = Date.now(); -+ console.log(`[${now()}] [enqueue] ${label}`); -+ -+ return controller -+ .handleText({ -+ tenantId, -+ metadata: { -+ tenantId, -+ provider, -+ model: modelId, -+ priority: "P1", -+ agentName: "openai-window-sim", -+ agentId: label, -+ }, -+ execute: async () => { -+ const startedAt = Date.now(); -+ console.log(`[${now()}] [execute-start] ${label}`); -+ -+ const result = await callOpenAIResponses(label); -+ -+ console.log( -+ `[${now()}] [execute-end] ${label} elapsedMs=${Date.now() - startedAt} realHeaders=${safeStringify( -+ result.headers, -+ )}`, -+ ); -+ -+ // --- Step 6: decrement remaining + apply new "headers" --- -+ const nowMs = Date.now(); -+ if (nowMs >= windowResetAt) { -+ // --- Step 7: reset happened; probe request fetched "fresh" headers for the next window --- -+ console.log( -+ `[${now()}] [reset] window elapsed; starting new synthetic window (windowSeconds=${windowSeconds})`, -+ ); -+ windowResetAt = nowMs + windowMs; -+ remainingInWindow = initialRemaining; -+ } -+ -+ remainingInWindow = Math.max(0, remainingInWindow - 1); -+ applySyntheticHeaders("response"); -+ -+ return result; -+ }, -+ }) -+ .then((r) => { -+ const totalElapsedMs = Date.now() - enqueuedAt; -+ console.log( -+ `[${now()}] [done] ${label} totalElapsedMs=${totalElapsedMs} text=${r.textPreview}`, -+ ); -+ return { label, totalElapsedMs, status: "fulfilled" as const }; -+ }) -+ .catch((error: any) => { -+ const totalElapsedMs = Date.now() - enqueuedAt; -+ console.log( -+ `[${now()}] [fail] ${label} totalElapsedMs=${totalElapsedMs} name=${error?.name ?? "Error"} msg=${ -+ error?.message ?? String(error) -+ }`, -+ ); -+ return { label, totalElapsedMs, status: "rejected" as const }; -+ }); -+ }); -+ -+ const settled = await Promise.all(jobs); -+ console.log(`\n[done] settled=${safeStringify(settled.map((s) => s.status))}`); -+ console.log( -+ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, -+ ); -+} -+ -+main().catch((error) => { -+ console.error("Fatal error:", error); -+ process.exit(1); -+}); -diff --git a/tmp/test/traffic-rate-limit-static.ts b/tmp/test/traffic-rate-limit-static.ts -new file mode 100644 -index 00000000..3f91d5bb ---- /dev/null -+++ b/tmp/test/traffic-rate-limit-static.ts -@@ -0,0 +1,149 @@ -+// @ts-nocheck -+/** -+ * Manual test: TrafficController window-based rate limiting (simulated OpenAI headers). -+ * -+ * What to look for: -+ * - Requests should be paced out across the window (no steady "refill" math). -+ * - If responses arrive out-of-order, remaining headers might "increase"; controller should -+ * keep remaining monotonic within the same window. 
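// Sketch of the window-pacing arithmetic these window tests describe: subtract
// in-flight reservations from the advertised remaining budget, hold everything
// until shortly after resetAt when the budget is exhausted (the probe), and
// otherwise space requests evenly across what is left of the window. The state
// shape and probe delay value are assumptions, not the controller's source.
type WindowState = { remaining: number; reserved: number; resetAt: number };

function computeNextAllowedAt(state: WindowState, now: number, probeDelayMs = 250): number {
  const effectiveRemaining = state.remaining - state.reserved;
  if (effectiveRemaining <= 1) {
    // Out of budget: wait past the reset, then send a single probe request
    // whose response headers re-seed the window.
    return state.resetAt + probeDelayMs;
  }
  // Budget left: pace so the remaining budget spans the rest of the window.
  const spacingMs = Math.max(0, state.resetAt - now) / effectiveRemaining;
  return now + spacingMs;
}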
-+ * -+ * Run: -+ * - pnpm ts-node tmp/test/traffic-rate-limit-static.ts -+ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-rate-limit-static.ts -+ * -+ * Optional env: -+ * - LIMIT=6 WINDOW_MS=3000 pnpm ts-node tmp/test/traffic-rate-limit-static.ts -+ */ -+ -+import { safeStringify } from "@voltagent/internal"; -+import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; -+ -+const verbose = process.env.VERBOSE === "1"; -+if (!verbose) { -+ console.debug = () => {}; -+} -+ -+const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); -+const now = () => new Date().toISOString(); -+ -+const provider = "sim"; -+const model = "rate-limited-model"; -+const key = `${provider}::${model}`; -+ -+const controller = getTrafficController({ maxConcurrent: 50 }); -+ -+const limit = Number(process.env.LIMIT ?? 6); -+const windowMs = Number(process.env.WINDOW_MS ?? 3000); -+let windowStartAt = Date.now(); -+let windowResetAt = windowStartAt + windowMs; -+let usedInWindow = 0; -+ -+function extractLabel(prompt: any): string { -+ if (!Array.isArray(prompt)) { -+ return "unknown"; -+ } -+ -+ for (let index = prompt.length - 1; index >= 0; index -= 1) { -+ const message = prompt[index]; -+ if (!message || message.role !== "user" || !Array.isArray(message.content)) { -+ continue; -+ } -+ -+ const textPart = message.content.find((part: any) => part?.type === "text"); -+ if (textPart?.text) { -+ return String(textPart.text); -+ } -+ } -+ -+ return "unknown"; -+} -+ -+async function main() { -+ console.log( -+ `\n=== Window rate limit for ${key} (limit=${limit}, windowMs=${windowMs}, jobs=10) ===`, -+ ); -+ -+ const seeded = controller.updateRateLimitFromHeaders( -+ { provider, model }, -+ { -+ "x-ratelimit-limit-requests": String(limit), -+ "x-ratelimit-remaining-requests": String(limit), -+ "x-ratelimit-reset-requests": `${windowMs}ms`, -+ }, -+ ); -+ console.log(`[seed] updateRateLimitFromHeaders=${safeStringify(seeded)}`); -+ -+ let calls = 0; -+ let lastStartAt = 0; -+ const rateLimitedModel = { -+ specificationVersion: "v2", -+ provider, -+ modelId: model, -+ doGenerate: async (options: any) => { -+ const simulatedLatencyMs = 10 + Math.floor(Math.random() * 120); -+ const nowMs = Date.now(); -+ if (nowMs >= windowResetAt) { -+ windowStartAt = nowMs; -+ windowResetAt = windowStartAt + windowMs; -+ usedInWindow = 0; -+ } -+ -+ calls += 1; -+ usedInWindow += 1; -+ const startAt = Date.now(); -+ const delta = lastStartAt ? 
startAt - lastStartAt : 0; -+ lastStartAt = startAt; -+ -+ const label = extractLabel(options?.prompt); -+ console.log( -+ `[${now()}] doGenerate start call=${calls} (+${delta}ms) input=${label} latencyMs=${simulatedLatencyMs}`, -+ ); -+ await sleep(simulatedLatencyMs); -+ console.log(`[${now()}] doGenerate end input=${label}`); -+ -+ const remainingAfterThis = Math.max(0, limit - usedInWindow); -+ const resetMs = Math.max(1, windowResetAt - Date.now()); -+ return { -+ content: [{ type: "text", text: `ok:${label}` }], -+ finishReason: "stop", -+ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, -+ warnings: [], -+ response: { -+ modelId: model, -+ headers: { -+ "x-ratelimit-limit-requests": String(limit), -+ "x-ratelimit-remaining-requests": String(remainingAfterThis), -+ "x-ratelimit-reset-requests": `${resetMs}ms`, -+ }, -+ }, -+ }; -+ }, -+ }; -+ -+ const agent = new Agent({ -+ name: "traffic-rate-limit-static", -+ instructions: "echo", -+ model: rateLimitedModel, -+ temperature: 0, -+ maxOutputTokens: 32, -+ }); -+ -+ const jobs = Array.from({ length: 10 }, (_, idx) => -+ agent.generateText(`req-${idx + 1}`, { -+ tenantId: "default", -+ trafficPriority: "P1", -+ }), -+ ); -+ -+ const settled = await Promise.allSettled(jobs); -+ console.log( -+ `\n[done] results=${safeStringify( -+ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), -+ )}`, -+ ); -+} -+ -+main().catch((error) => { -+ console.error("Fatal error:", error); -+ process.exit(1); -+}); -diff --git a/tmp/test/traffic-retry-after.ts b/tmp/test/traffic-retry-after.ts -new file mode 100644 -index 00000000..c0c213eb ---- /dev/null -+++ b/tmp/test/traffic-retry-after.ts -@@ -0,0 +1,245 @@ -+// @ts-nocheck -+/** -+ * Manual test: Retry-After handling (429 retry + 200 OK header ingestion). -+ * -+ * What this exercises: -+ * - Retry-After on 429 errors increases retry delay (TrafficController retry plan). -+ * - Retry-After on successful responses throttles subsequent requests for the same provider::model. 
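// Sketch of turning a Retry-After value into a delay. The tests here only
// exercise the numeric-seconds form; the HTTP-date branch is an assumption
// about general Retry-After semantics, not something this diff shows.
function retryAfterToMs(headerValue: string): number | undefined {
  const seconds = Number(headerValue);
  if (Number.isFinite(seconds)) {
    return Math.max(0, Math.round(seconds * 1000));
  }
  const dateMs = Date.parse(headerValue); // e.g. "Wed, 21 Oct 2015 07:28:00 GMT"
  return Number.isNaN(dateMs) ? undefined : Math.max(0, dateMs - Date.now());
}

// A 429 carrying "retry-after: 1" should push the next attempt out by at least
// 1000ms, which is exactly what test_retryAfterOn429 asserts via deltaMs.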
-+ * -+ * Run: -+ * - pnpm -C packages/core build -+ * - pnpm ts-node tmp/test/traffic-retry-after.ts -+ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-retry-after.ts -+ */ -+ -+import { safeStringify } from "@voltagent/internal"; -+import { -+ Agent, -+ RateLimitedUpstreamError, -+ getTrafficController, -+} from "../../packages/core/dist/index.js"; -+ -+const verbose = process.env.VERBOSE === "1"; -+if (!verbose) { -+ console.debug = () => {}; -+} -+ -+const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); -+const now = () => new Date().toISOString(); -+ -+function extractLabel(prompt: any): string { -+ if (!Array.isArray(prompt)) { -+ return "unknown"; -+ } -+ -+ for (let index = prompt.length - 1; index >= 0; index -= 1) { -+ const message = prompt[index]; -+ if (!message || message.role !== "user" || !Array.isArray(message.content)) { -+ continue; -+ } -+ -+ const textPart = message.content.find((part: any) => part?.type === "text"); -+ if (textPart?.text) { -+ return String(textPart.text); -+ } -+ } -+ -+ return "unknown"; -+} -+ -+function make429RetryAfterModel(args: { -+ provider: string; -+ modelId: string; -+ retryAfterSeconds: number; -+ mode: "headers" | "typedError"; -+}) { -+ const { provider, modelId, retryAfterSeconds, mode } = args; -+ let calls = 0; -+ const startedAt: number[] = []; -+ -+ return { -+ specificationVersion: "v2", -+ provider, -+ modelId, -+ startedAt, -+ doGenerate: async (options: any) => { -+ calls += 1; -+ const start = Date.now(); -+ startedAt.push(start); -+ -+ const label = extractLabel(options?.prompt); -+ console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`); -+ -+ if (calls === 1) { -+ const retryAfterValue = String(retryAfterSeconds); -+ -+ if (mode === "typedError") { -+ throw new RateLimitedUpstreamError( -+ `rate limited (typed) retry-after=${retryAfterValue}s`, -+ { provider, model: modelId }, -+ Math.round(retryAfterSeconds * 1000), -+ ); -+ } -+ -+ const err: any = new Error(`rate limited (headers) retry-after=${retryAfterValue}s`); -+ err.status = 429; -+ err.response = { -+ status: 429, -+ headers: { -+ "retry-after": retryAfterValue, -+ }, -+ }; -+ throw err; -+ } -+ -+ return { -+ content: [{ type: "text", text: `ok:${label}` }], -+ finishReason: "stop", -+ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, -+ warnings: [], -+ response: { modelId, headers: {} }, -+ }; -+ }, -+ }; -+} -+ -+function makeSuccessRetryAfterModel(args: { -+ provider: string; -+ modelId: string; -+ retryAfterSeconds: number; -+ latencyMs: number; -+}) { -+ const { provider, modelId, retryAfterSeconds, latencyMs } = args; -+ let calls = 0; -+ const startedAt: number[] = []; -+ const endedAt: number[] = []; -+ -+ return { -+ specificationVersion: "v2", -+ provider, -+ modelId, -+ startedAt, -+ endedAt, -+ doGenerate: async (options: any) => { -+ calls += 1; -+ const start = Date.now(); -+ startedAt.push(start); -+ -+ const label = extractLabel(options?.prompt); -+ console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`); -+ await sleep(latencyMs); -+ -+ const end = Date.now(); -+ endedAt.push(end); -+ console.log(`[${now()}] [model] ${provider}::${modelId} end call=${calls} input=${label}`); -+ -+ return { -+ content: [{ type: "text", text: `ok:${label}` }], -+ finishReason: "stop", -+ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, -+ warnings: [], -+ response: { -+ modelId, -+ headers: -+ calls === 1 -+ ? 
{ -+ "retry-after": String(retryAfterSeconds), -+ } -+ : {}, -+ }, -+ }; -+ }, -+ }; -+} -+ -+async function test_retryAfterOn429(mode: "headers" | "typedError") { -+ const retryAfterSeconds = 1; -+ const provider = `retry-after-429-${mode}`; -+ const modelId = "ra-429"; -+ const tenantId = `ra-429-${mode}`; -+ -+ const model = make429RetryAfterModel({ provider, modelId, retryAfterSeconds, mode }); -+ const agent = new Agent({ -+ name: `ra-429-${mode}`, -+ instructions: "echo", -+ model, -+ temperature: 0, -+ maxOutputTokens: 32, -+ }); -+ -+ console.log(`\n=== Test: Retry-After on 429 (${mode}) ===`); -+ const result = await agent.generateText("hello", { tenantId, trafficPriority: "P1" }); -+ -+ const times = model.startedAt; -+ const deltaMs = times.length >= 2 ? times[1] - times[0] : undefined; -+ -+ console.log( -+ `[result] text=${result.text} calls=${times.length} startedAt=${safeStringify(times)} deltaMs=${deltaMs}`, -+ ); -+ -+ if (deltaMs === undefined || deltaMs < retryAfterSeconds * 1000) { -+ throw new Error( -+ `Expected retry delay >= ${retryAfterSeconds * 1000}ms, got ${deltaMs ?? "n/a"}ms`, -+ ); -+ } -+} -+ -+async function test_retryAfterOnSuccessResponse() { -+ const retryAfterSeconds = 0.3; -+ const provider = "retry-after-200"; -+ const modelId = "ra-200"; -+ const tenantId = "ra-200"; -+ -+ const model = makeSuccessRetryAfterModel({ -+ provider, -+ modelId, -+ retryAfterSeconds, -+ latencyMs: 20, -+ }); -+ -+ const agent = new Agent({ -+ name: "ra-200", -+ instructions: "echo", -+ model, -+ temperature: 0, -+ maxOutputTokens: 32, -+ }); -+ -+ console.log("\n=== Test: Retry-After on 200 response headers ==="); -+ const first = agent.generateText("first", { tenantId, trafficPriority: "P1" }); -+ const second = agent.generateText("second", { tenantId, trafficPriority: "P1" }); -+ -+ const [r1, r2] = await Promise.all([first, second]); -+ -+ const end1 = model.endedAt[0]; -+ const start2 = model.startedAt[1]; -+ const enforcedDelayMs = start2 && end1 ? start2 - end1 : undefined; -+ -+ console.log( -+ `[result] texts=${safeStringify([r1.text, r2.text])} startedAt=${safeStringify( -+ model.startedAt, -+ )} endedAt=${safeStringify(model.endedAt)} enforcedDelayMs=${enforcedDelayMs}`, -+ ); -+ -+ if (enforcedDelayMs === undefined || enforcedDelayMs < retryAfterSeconds * 1000) { -+ throw new Error( -+ `Expected rate-limit delay >= ${retryAfterSeconds * 1000}ms, got ${enforcedDelayMs ?? "n/a"}ms`, -+ ); -+ } -+} -+ -+async function main() { -+ // Create controller early so all Agent calls share the same singleton. -+ getTrafficController({ maxConcurrent: 1 }); -+ -+ await test_retryAfterOn429("headers"); -+ await test_retryAfterOn429("typedError"); -+ await test_retryAfterOnSuccessResponse(); -+ -+ console.log("\n[done] All Retry-After manual checks passed."); -+} -+ -+main().catch((error) => { -+ console.error("Fatal error:", error); -+ process.exit(1); -+}); -diff --git a/tmp/test/traffic-retry-behavior.ts b/tmp/test/traffic-retry-behavior.ts -new file mode 100644 -index 00000000..273af55a ---- /dev/null -+++ b/tmp/test/traffic-retry-behavior.ts -@@ -0,0 +1,169 @@ -+// @ts-nocheck -+/** -+ * Manual test: TrafficController retry behavior via Agent + AI SDK path (stub model). 
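// Sketch of the retry classification the scenario matrix below probes: pull a
// status code out of the several places SDK errors stash it (including string
// values), then treat 429, 5xx, and timeout-shaped errors as retriable. This
// mirrors the scenarios, not the controller's actual detection code.
function extractStatus(err: any): number | undefined {
  const raw =
    err?.status ?? err?.statusCode ?? err?.httpStatus ?? err?.response?.status ?? err?.cause?.status;
  const n = Number(raw);
  return Number.isFinite(n) ? n : undefined; // normalizes "500" (string) too
}

function isRetriable(err: any): boolean {
  const status = extractStatus(err);
  if (status === 429 || (status !== undefined && status >= 500)) return true;
  // Timeout detection falls back to code/name/message, covering ETIMEDOUT,
  // TimeoutError, and plain "timeout" messages.
  const text = `${err?.code ?? ""} ${err?.name ?? ""} ${err?.message ?? ""}`.toLowerCase();
  return text.includes("timeout") || text.includes("etimedout");
}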
-+ *
-+ * Scenarios included:
-+ * - 5xx retries (up to 3 attempts)
-+ * - 429 retries (up to 3 attempts)
-+ * - timeout retries (up to 2 attempts)
-+ * - non-retriable 4xx does not retry
-+ *
-+ * Run:
-+ * - pnpm ts-node tmp/test/traffic-retry-behavior.ts
-+ *
-+ * Notes:
-+ * - Uses a stub LanguageModel; no network calls.
-+ * - Watch the `[model] attempt=...` logs to confirm retries.
-+ */
-+
-+import { Agent, getTrafficController } from "../../packages/core/dist/index.js";
-+
-+const verbose = process.env.VERBOSE === "1";
-+if (!verbose) {
-+  console.debug = () => {};
-+}
-+
-+type Scenario =
-+  | "server-error"
-+  | "rate-limit"
-+  | "timeout"
-+  | "bad-request"
-+  | "forbidden"
-+  // Variations to hit different retry-detection branches.
-+  | "server-error-status-string"
-+  | "server-error-statusCode"
-+  | "server-error-response-status"
-+  | "server-error-cause-status"
-+  | "rate-limit-statusCode"
-+  | "timeout-code-only"
-+  | "timeout-name-only"
-+  | "timeout-message-only"
-+  // Variations that should STOP retrying (hit max attempts).
-+  | "server-error-exceed-max"
-+  | "timeout-exceed-max";
-+
-+type RetryPlan = {
-+  failCountBeforeSuccess: number;
-+  status?: number | string;
-+  statusCode?: number | string;
-+  httpStatus?: number | string;
-+  responseStatus?: number | string;
-+  causeStatus?: number | string;
-+  code?: string;
-+  name?: string;
-+  message?: string;
-+};
-+
-+const plans: Record<Scenario, RetryPlan> = {
-+  "server-error": { failCountBeforeSuccess: 2, status: 500 },
-+  "rate-limit": { failCountBeforeSuccess: 2, status: 429 },
-+  timeout: { failCountBeforeSuccess: 1, status: 408, code: "ETIMEDOUT", message: "timeout" },
-+  "bad-request": { failCountBeforeSuccess: 10, status: 400 },
-+  forbidden: { failCountBeforeSuccess: 10, status: 403 },
-+  "server-error-status-string": { failCountBeforeSuccess: 2, status: "500" },
-+  "server-error-statusCode": { failCountBeforeSuccess: 2, statusCode: 502 },
-+  "server-error-response-status": { failCountBeforeSuccess: 2, responseStatus: 503 },
-+  "server-error-cause-status": { failCountBeforeSuccess: 2, causeStatus: 500 },
-+  "rate-limit-statusCode": { failCountBeforeSuccess: 2, statusCode: 429 },
-+  "timeout-code-only": { failCountBeforeSuccess: 1, code: "timeout" },
-+  "timeout-name-only": { failCountBeforeSuccess: 1, name: "TimeoutError" },
-+  "timeout-message-only": { failCountBeforeSuccess: 1, message: "this is a TIMEOUT" },
-+  "server-error-exceed-max": { failCountBeforeSuccess: 10, status: 500 },
-+  "timeout-exceed-max": { failCountBeforeSuccess: 10, message: "timeout" },
-+};
-+
-+function makeModel(modelId: string, plan: RetryPlan) {
-+  let counter = 0;
-+  let lastAttemptAt = 0;
-+
-+  return {
-+    specificationVersion: "v2",
-+    provider: "retry-provider",
-+    modelId,
-+    doGenerate: async () => {
-+      counter += 1;
-+      const now = Date.now();
-+      const delta = lastAttemptAt ? now - lastAttemptAt : 0;
-+      lastAttemptAt = now;
-+
-+      console.log(`[model] modelId=${modelId} attempt=${counter} (+${delta}ms)`);
-+
-+      if (counter <= plan.failCountBeforeSuccess) {
-+        const err: any = new Error(plan.message ??
`forced failure ${counter} for ${modelId}`); -+ if (plan.status !== undefined) err.status = plan.status; -+ if (plan.statusCode !== undefined) err.statusCode = plan.statusCode; -+ if (plan.httpStatus !== undefined) err.httpStatus = plan.httpStatus; -+ if (plan.responseStatus !== undefined) err.response = { status: plan.responseStatus }; -+ if (plan.causeStatus !== undefined) err.cause = { status: plan.causeStatus }; -+ if (plan.code !== undefined) err.code = plan.code; -+ if (plan.name !== undefined) err.name = plan.name; -+ throw err; -+ } -+ -+ return { -+ content: [{ type: "text", text: "ok" }], -+ finishReason: "stop", -+ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, -+ warnings: [], -+ response: { modelId, headers: {} }, -+ }; -+ }, -+ }; -+} -+ -+async function runScenario(name: Scenario) { -+ const plan = plans[name]; -+ const modelId = `retry-${name}`; -+ const model = makeModel(modelId, plan); -+ -+ const agent = new Agent({ -+ name: `RetryAgent-${name}`, -+ instructions: "echo", -+ model, -+ maxOutputTokens: 32, -+ temperature: 0, -+ }); -+ -+ console.log(`\n=== ${name} ===`); -+ try { -+ const result = await agent.generateText(name, { tenantId: "retry-test" }); -+ console.log(`[${name}] succeeded. text=${result.text}`); -+ } catch (err: any) { -+ console.log( -+ `[${name}] failed. status=${err?.status ?? err?.statusCode ?? err?.response?.status ?? "n/a"}`, -+ ); -+ } -+} -+ -+async function main() { -+ // Create controller early so all Agent calls share the same singleton. -+ getTrafficController({ maxConcurrent: 1 }); -+ -+ const runs: Scenario[] = [ -+ "server-error", -+ "rate-limit", -+ "timeout", -+ "bad-request", -+ "forbidden", -+ // Uncomment for additional coverage: -+ // "server-error-status-string", -+ // "server-error-statusCode", -+ // "server-error-response-status", -+ // "server-error-cause-status", -+ // "rate-limit-statusCode", -+ // "timeout-code-only", -+ // "timeout-name-only", -+ // "timeout-message-only", -+ // "server-error-exceed-max", -+ // "timeout-exceed-max", -+ ]; -+ -+ for (const name of runs) { -+ await runScenario(name); -+ } -+} -+ -+main().catch((error) => { -+ console.error("Fatal error:", error); -+ process.exit(1); -+}); -diff --git a/tmp/test/traffic-tenant-usage.ts b/tmp/test/traffic-tenant-usage.ts -new file mode 100644 -index 00000000..801d7761 ---- /dev/null -+++ b/tmp/test/traffic-tenant-usage.ts -@@ -0,0 +1,71 @@ -+// @ts-nocheck -+/** -+ * Manual test: Tenant usage aggregation (via Agent → TrafficController). -+ * -+ * What to look for: -+ * - `getTenantUsage(tenantId)` should increase after each agent call. 
-+ * -+ * Run: -+ * - pnpm ts-node tmp/test/traffic-tenant-usage.ts -+ */ -+ -+import { safeStringify } from "@voltagent/internal"; -+import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; -+ -+const verbose = process.env.VERBOSE === "1"; -+if (!verbose) { -+ console.debug = () => {}; -+} -+ -+function makeModel(modelId: string) { -+ return { -+ specificationVersion: "v2", -+ provider: "usage-provider", -+ modelId, -+ doGenerate: async () => { -+ return { -+ content: [{ type: "text", text: `ok:${modelId}` }], -+ finishReason: "stop", -+ usage: { inputTokens: 2, outputTokens: 3, totalTokens: 5 }, -+ warnings: [], -+ response: { modelId, headers: {} }, -+ }; -+ }, -+ }; -+} -+ -+const controller = getTrafficController({ maxConcurrent: 10 }); -+ -+async function run(label: string, tenantId: string) { -+ const model = makeModel("tenant-usage-model"); -+ const agent = new Agent({ -+ name: `TenantUsageAgent-${label}`, -+ instructions: "echo", -+ model, -+ temperature: 0, -+ maxOutputTokens: 32, -+ }); -+ -+ console.log(`\n=== ${label} tenantId=${tenantId} ===`); -+ const result = await agent.generateText(`hello:${label}`, { tenantId }); -+ console.log(`[${label}] text=${result.text}`); -+ -+ const usage = controller.getTenantUsage(tenantId); -+ console.log(`[${label}] controller.getTenantUsage(${tenantId})=${safeStringify(usage)}`); -+} -+ -+async function main() { -+ await run("A1", "tenant-a"); -+ await run("A2", "tenant-a"); -+ await run("B1", "tenant-b"); -+ -+ console.log("\n=== Final usage snapshot ==="); -+ console.log(`tenant-a=${safeStringify(controller.getTenantUsage("tenant-a"))}`); -+ console.log(`tenant-b=${safeStringify(controller.getTenantUsage("tenant-b"))}`); -+ console.log(`default=${safeStringify(controller.getTenantUsage("default"))}`); -+} -+ -+main().catch((error) => { -+ console.error("Fatal error:", error); -+ process.exit(1); -+}); -diff --git a/tmp/test/traffic-text-vs-stream.ts b/tmp/test/traffic-text-vs-stream.ts -new file mode 100644 -index 00000000..41aa484d ---- /dev/null -+++ b/tmp/test/traffic-text-vs-stream.ts -@@ -0,0 +1,128 @@ -+// @ts-nocheck -+/** -+ * Manual test: Text + stream traffic share the same TrafficController queue. -+ * -+ * What to look for: -+ * - Stream and text requests should respect the same maxConcurrent + priority rules. 
-+ * -+ * Run: -+ * - pnpm ts-node tmp/test/traffic-text-vs-stream.ts -+ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-text-vs-stream.ts -+ */ -+ -+import { ReadableStream } from "node:stream/web"; -+import { safeStringify } from "@voltagent/internal"; -+import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; -+ -+const verbose = process.env.VERBOSE === "1"; -+if (!verbose) { -+ console.debug = () => {}; -+} -+ -+const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); -+const now = () => new Date().toISOString(); -+ -+const controller = getTrafficController({ maxConcurrent: 1 }); -+ -+function extractLabel(prompt: any): string { -+ if (!Array.isArray(prompt)) { -+ return "unknown"; -+ } -+ -+ for (let index = prompt.length - 1; index >= 0; index -= 1) { -+ const message = prompt[index]; -+ if (!message || message.role !== "user" || !Array.isArray(message.content)) { -+ continue; -+ } -+ -+ const textPart = message.content.find((part: any) => part?.type === "text"); -+ if (textPart?.text) { -+ return String(textPart.text); -+ } -+ } -+ -+ return "unknown"; -+} -+ -+async function main() { -+ console.log("\n=== Text vs Stream (shared scheduler) ==="); -+ void controller; -+ -+ const provider = "sim"; -+ const modelId = "shared-queue"; -+ -+ const model = { -+ specificationVersion: "v2", -+ provider, -+ modelId, -+ doGenerate: async (options: any) => { -+ const label = extractLabel(options?.prompt); -+ console.log(`[${now()}] doGenerate start input=${label}`); -+ await sleep(50); -+ console.log(`[${now()}] doGenerate end input=${label}`); -+ return { -+ content: [{ type: "text", text: `text:${label}` }], -+ finishReason: "stop", -+ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, -+ warnings: [], -+ response: { modelId, headers: {} }, -+ }; -+ }, -+ doStream: async (options: any) => { -+ const label = extractLabel(options?.prompt); -+ console.log(`[${now()}] doStream start input=${label}`); -+ -+ // Hold the controller slot for a bit so ordering is visible. 
-+ await sleep(400); -+ -+ console.log(`[${now()}] doStream ready input=${label}`); -+ const streamId = `text-${label}`; -+ const text = `stream:${label}`; -+ -+ const stream = new ReadableStream({ -+ start(streamController) { -+ streamController.enqueue({ type: "stream-start", warnings: [] }); -+ streamController.enqueue({ type: "text-start", id: streamId }); -+ streamController.enqueue({ type: "text-delta", id: streamId, delta: text }); -+ streamController.enqueue({ type: "text-end", id: streamId }); -+ streamController.enqueue({ -+ type: "finish", -+ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, -+ finishReason: "stop", -+ }); -+ streamController.close(); -+ }, -+ }); -+ -+ return { stream, response: { headers: {} } }; -+ }, -+ }; -+ -+ const agent = new Agent({ -+ name: "traffic-text-vs-stream", -+ instructions: "echo", -+ model, -+ temperature: 0, -+ maxOutputTokens: 32, -+ }); -+ -+ const streamP1 = agent.streamText("S1", { tenantId: "default", trafficPriority: "P1" }); -+ const textP0 = agent.generateText("T0", { tenantId: "default", trafficPriority: "P0" }); -+ const textP1 = agent.generateText("T1", { tenantId: "default", trafficPriority: "P1" }); -+ -+ const [streamResult, t0, t1] = await Promise.all([streamP1, textP0, textP1]); -+ const streamText = await streamResult.text; -+ -+ console.log( -+ `\n[done] results=${safeStringify({ -+ streamText, -+ textP0: t0.text, -+ textP1: t1.text, -+ })}`, -+ ); -+} -+ -+main().catch((error) => { -+ console.error("Fatal error:", error); -+ process.exit(1); -+}); From 6182cf038876a565ae5d8aeac44a0c1345112231 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Thu, 25 Dec 2025 21:16:38 +0530 Subject: [PATCH 31/41] fix: per tenant tpm/rpm --- .../src/traffic/traffic-controller.spec.ts | 135 ++++++++++++++++++ .../core/src/traffic/traffic-controller.ts | 3 +- 2 files changed, 137 insertions(+), 1 deletion(-) diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts index 51c08c9da..8f0a2c47c 100644 --- a/packages/core/src/traffic/traffic-controller.spec.ts +++ b/packages/core/src/traffic/traffic-controller.spec.ts @@ -270,6 +270,45 @@ describe("TrafficController rate limit headers", () => { vi.useRealTimers(); } }); + + it("shares rate limits across tenants for the same provider/model", async () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ maxConcurrent: 1 }); + controller.updateRateLimitFromHeaders( + { provider: "openai", model: "gpt-4o", tenantId: "tenant-a" }, + { + "x-ratelimit-limit-requests": "1", + "x-ratelimit-remaining-requests": "0", + "x-ratelimit-reset-requests": "1s", + }, + ); + + const order: string[] = []; + const request = controller.handleText({ + tenantId: "tenant-b", + metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, + execute: async () => { + order.push("tenant-b"); + return "ok"; + }, + }); + + await vi.advanceTimersByTimeAsync(999); + await Promise.resolve(); + expect(order).toEqual([]); + + await vi.advanceTimersByTimeAsync(1); + await vi.runAllTimersAsync(); + await request; + + expect(order).toEqual(["tenant-b"]); + } finally { + vi.useRealTimers(); + } + }); }); describe("TrafficController token limits", () => { @@ -385,6 +424,102 @@ describe("TrafficController token limits", () => { vi.useRealTimers(); } }); + + it("allows token-only configs on non-OpenAI providers", async () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const 
controller = new TrafficController({ + maxConcurrent: 2, + rateLimits: { + "p::m": { + requestsPerMinute: 0, + tokensPerMinute: 2, + }, + }, + }); + const order: string[] = []; + + const first = controller.handleText({ + tenantId: "tenant-a", + metadata: { provider: "p", model: "m", priority: "P1" }, + estimatedTokens: 2, + execute: async () => { + order.push("first"); + return "first"; + }, + extractUsage: () => ({ totalTokens: 2 }), + }); + + const second = controller.handleText({ + tenantId: "tenant-b", + metadata: { provider: "p", model: "m", priority: "P1" }, + estimatedTokens: 1, + execute: async () => { + order.push("second"); + return "second"; + }, + extractUsage: () => ({ totalTokens: 1 }), + }); + + await first; + expect(order).toEqual(["first"]); + + await vi.advanceTimersByTimeAsync(29_999); + await Promise.resolve(); + expect(order).toEqual(["first"]); + + await vi.advanceTimersByTimeAsync(1); + await vi.runAllTimersAsync(); + await second; + expect(order).toEqual(["first", "second"]); + } finally { + vi.useRealTimers(); + } + }); + + it("honors OpenAI token headers even without token config", async () => { + vi.useFakeTimers(); + + try { + vi.setSystemTime(new Date(0)); + const controller = new TrafficController({ maxConcurrent: 1 }); + controller.updateRateLimitFromHeaders( + { provider: "openai", model: "gpt-4o" }, + { + "x-ratelimit-limit-tokens": "2", + "x-ratelimit-remaining-tokens": "0", + "x-ratelimit-reset-tokens": "1s", + }, + ); + + const order: string[] = []; + const request = controller.handleText({ + tenantId: "tenant-a", + metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, + estimatedTokens: 1, + execute: async () => { + order.push("run"); + return "ok"; + }, + }); + + await Promise.resolve(); + expect(order).toEqual([]); + + await vi.advanceTimersByTimeAsync(1_000 + RATE_LIMIT_PROBE_DELAY_MS - 1); + await Promise.resolve(); + expect(order).toEqual([]); + + await vi.advanceTimersByTimeAsync(1); + await vi.runAllTimersAsync(); + await request; + expect(order).toEqual(["run"]); + } finally { + vi.useRealTimers(); + } + }); }); describe("TrafficController stream reporting", () => { diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 3c704dbde..e6f2c5399 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -1242,7 +1242,8 @@ function buildRateLimitKeyFromMetadata(metadata?: TrafficRequestMetadata): strin { label: "apiKey", value: metadata?.apiKeyId }, { label: "region", value: metadata?.region }, { label: "endpoint", value: metadata?.endpoint }, - { label: "tenant", value: metadata?.tenantId }, + // Intentionally exclude tenantId to enforce provider/model limits across tenants. + // Use rateLimitKeyBuilder to include tenant for per-tenant rate limits. 
{ label: "tenantTier", value: metadata?.tenantTier }, { label: "taskType", value: metadata?.taskType }, ]; From 6a962e43be873ea69b145a523804b536a41439cf Mon Sep 17 00:00:00 2001 From: riturajFi Date: Thu, 25 Dec 2025 21:17:09 +0530 Subject: [PATCH 32/41] fix: token throttling incomplete --- .../openai-window-rate-limit-strategy.ts | 97 ++++++++++++++++--- .../token-bucket-rate-limit-strategy.ts | 10 +- 2 files changed, 95 insertions(+), 12 deletions(-) diff --git a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts index 32ffc7e45..fdb1c7a83 100644 --- a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts +++ b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts @@ -10,6 +10,7 @@ import type { QueuedRequest, RateLimitWindowState, } from "../traffic-controller-internal"; +import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils"; import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types"; import { DefaultRateLimitStrategy } from "./default-rate-limit-strategy"; import type { @@ -17,9 +18,10 @@ import type { RateLimitUpdateResult, RateLimitUsage, } from "./rate-limit-strategy"; +import { parseResetDurationToMs } from "./rate-limit-utils"; export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { - readonly handlesTokenLimits: boolean; + readonly handlesTokenLimits = true; private readonly window: DefaultRateLimitStrategy; private readonly key: string; private readonly requestsPerMinute?: number; @@ -35,7 +37,6 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { // Window strategy enforces fixed 60s windows; burstSize is intentionally ignored here. this.requestsPerMinute = this.normalizeLimit(options?.requestsPerMinute); this.tokensPerMinute = this.normalizeLimit(options?.tokensPerMinute); - this.handlesTokenLimits = this.tokensPerMinute !== undefined; } resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { @@ -94,12 +95,12 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { } recordUsage(usage: RateLimitUsage, logger?: Logger, reservedTokens?: number): void { - if (this.tokensPerMinute === undefined) return; const tokens = this.resolveTokenCount(usage); if (tokens <= 0) return; const now = Date.now(); const state = this.ensureTokenState(now); + if (!state) return; const reserved = typeof reservedTokens === "number" ? reservedTokens : 0; const delta = tokens - reserved; if (delta > 0) { @@ -120,10 +121,12 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { headers: unknown, logger?: Logger, ): RateLimitUpdateResult | undefined { - if (this.requestsPerMinute !== undefined) { - return undefined; - } - return this.window.updateFromHeaders(metadata, headers, logger); + const update = + this.requestsPerMinute !== undefined + ? 
undefined + : this.window.updateFromHeaders(metadata, headers, logger); + this.applyTokenHeaderUpdates(headers, logger); + return update; } private resolveRequestWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null { @@ -205,10 +208,10 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { } private resolveTokenWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null { - if (this.tokensPerMinute === undefined) return null; const rateLimitLogger = logger?.child({ module: "rate-limiter" }); const now = Date.now(); const state = this.ensureTokenState(now); + if (!state) return null; const estimatedTokens = next.estimatedTokens; if (typeof estimatedTokens === "number" && estimatedTokens > 0) { @@ -247,10 +250,23 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { return state; } - private ensureTokenState(now: number): RateLimitWindowState { - const limit = this.tokensPerMinute ?? 0; + private ensureTokenState(now: number): RateLimitWindowState | undefined { + const configuredLimit = this.tokensPerMinute; const state = this.tokenState; - if (!state || now >= state.resetAt) { + if (!state) { + if (configuredLimit === undefined) return undefined; + this.tokenState = { + limit: configuredLimit, + remaining: configuredLimit, + resetAt: now + this.windowMs, + reserved: 0, + nextAllowedAt: now, + }; + return this.tokenState; + } + + if (now >= state.resetAt) { + const limit = configuredLimit ?? state.limit; this.tokenState = { limit, remaining: limit, @@ -260,6 +276,12 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { }; return this.tokenState; } + + if (configuredLimit !== undefined && configuredLimit !== state.limit) { + state.limit = configuredLimit; + state.remaining = Math.min(state.remaining, configuredLimit); + } + return state; } @@ -268,6 +290,59 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { return Number.isFinite(numeric) && numeric > 0 ? numeric : undefined; } + private applyTokenHeaderUpdates(headers: unknown, logger?: Logger): void { + const rateLimitLogger = logger?.child({ module: "rate-limiter" }); + const limitTokens = readHeaderValue(headers, "x-ratelimit-limit-tokens"); + const remainingTokens = readHeaderValue(headers, "x-ratelimit-remaining-tokens"); + const resetTokens = readHeaderValue(headers, "x-ratelimit-reset-tokens"); + const retryAfter = readHeaderValue(headers, "retry-after"); + + const limit = Number(limitTokens); + const remaining = Number(remainingTokens); + const resetTokensMs = resetTokens ? parseResetDurationToMs(resetTokens) : undefined; + const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined; + + if (!Number.isFinite(limit) || !Number.isFinite(remaining) || resetTokensMs === undefined) { + rateLimitLogger?.trace?.("OpenAI token headers missing or invalid; skipping", { + rateLimitKey: this.key, + hasLimit: !!limitTokens, + hasRemaining: !!remainingTokens, + hasReset: !!resetTokens, + }); + return; + } + + const now = Date.now(); + const configuredLimit = this.tokensPerMinute; + const effectiveLimit = configuredLimit === undefined ? limit : Math.min(configuredLimit, limit); + const clampedRemaining = Math.max(0, Math.min(remaining, effectiveLimit)); + const parsedResetAt = now + resetTokensMs; + const existing = this.tokenState; + const isSameWindow = !!existing && now < existing.resetAt; + const resetAt = isSameWindow ? 
Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; + const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; + const reserved = Math.max(0, existing?.reserved ?? 0); + const effectiveRemaining = isSameWindow + ? Math.min(existing.remaining, clampedRemaining) + : clampedRemaining; + + this.tokenState = { + limit: effectiveLimit, + remaining: effectiveRemaining, + resetAt, + reserved, + nextAllowedAt, + }; + + rateLimitLogger?.debug?.("OpenAI token headers applied", { + rateLimitKey: this.key, + limit: effectiveLimit, + remaining: effectiveRemaining, + resetAt, + retryAfterMs, + }); + } + private resolveTokenCount(usage: RateLimitUsage): number { const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined; if (total !== undefined) return total; diff --git a/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts index 2ae7b1892..ee269ecd2 100644 --- a/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts +++ b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts @@ -22,15 +22,22 @@ type TokenBucketState = { function normalizeTokenBucketOptions( raw: RateLimitOptions | undefined, -): Omit { +): Omit | undefined { const requestsPerMinuteRaw = raw?.requestsPerMinute; + const tokensPerMinuteRaw = raw?.tokensPerMinute; const burstSizeRaw = raw?.burstSize; const requestsPerMinute = typeof requestsPerMinuteRaw === "number" ? requestsPerMinuteRaw : Number(requestsPerMinuteRaw); + const tokensPerMinute = + typeof tokensPerMinuteRaw === "number" ? tokensPerMinuteRaw : Number(tokensPerMinuteRaw); const burstSize = typeof burstSizeRaw === "number" ? burstSizeRaw : Number(burstSizeRaw); const safeRequestsPerMinute = Number.isFinite(requestsPerMinute) ? requestsPerMinute : 0; + const hasTokenLimit = Number.isFinite(tokensPerMinute) && tokensPerMinute > 0; + if (safeRequestsPerMinute <= 0 && hasTokenLimit) { + return undefined; + } const safeBurst = Number.isFinite(burstSize) ? burstSize : safeRequestsPerMinute; const refillPerSecond = safeRequestsPerMinute > 0 ? 
safeRequestsPerMinute / 60 : 0;
@@ -59,6 +66,7 @@ export class TokenBucketRateLimitStrategy implements RateLimitStrategy {
     this.key = key;
     if (!options) return;
     const normalized = normalizeTokenBucketOptions(options);
+    if (!normalized) return;
     const now = Date.now();
     this.bucket = {
       ...normalized,

From 1879d42e974dbdb96b1f37cb849a99edfaa9fdde Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Thu, 25 Dec 2025 21:27:06 +0530
Subject: [PATCH 33/41] fix: can't sneak retry

---
 packages/core/src/agent/agent.ts | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts
index f9f5cc1c4..ca42da8a1 100644
--- a/packages/core/src/agent/agent.ts
+++ b/packages/core/src/agent/agent.ts
@@ -627,10 +627,12 @@ export class Agent {
       fallbackPolicyId,
       experimental_output,
       providerOptions,
+      maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries)
       model: _model, // Exclude model so aiSDKOptions doesn't override resolved model
       ...aiSDKOptions
     } = options || {};
     void _model;
+    void _maxRetries;
     void maxQueueWaitMs;
     void taskType;
     void fallbackPolicyId;
@@ -1005,10 +1007,12 @@ export class Agent {
       fallbackPolicyId,
       experimental_output,
       providerOptions,
+      maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries)
       model: _model, // Exclude model from aiSDKOptions to avoid overriding resolved model
       ...aiSDKOptions
     } = options || {};
     void _model;
+    void _maxRetries;
     void maxQueueWaitMs;
     void taskType;
     void fallbackPolicyId;
@@ -1708,10 +1712,12 @@ export class Agent {
       fallbackPolicyId,
       maxQueueWaitMs,
       providerOptions,
+      maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries)
       model: _model, // Exclude model so spread does not override resolved model
       ...aiSDKOptions
     } = options || {};
     void _model;
+    void _maxRetries;
     void taskType;
     void fallbackPolicyId;
     void maxQueueWaitMs;
@@ -1992,10 +1998,12 @@ export class Agent {
       fallbackPolicyId,
       maxQueueWaitMs,
       providerOptions,
+      maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries)
       model: _model, // Exclude model so aiSDKOptions cannot override resolved model
       ...aiSDKOptions
     } = options || {};
     void _model;
+    void _maxRetries;
     void taskType;
     void fallbackPolicyId;
     void maxQueueWaitMs;

From c315693a53f373b87facd84d901f119a279b4f07 Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Thu, 25 Dec 2025 21:47:04 +0530
Subject: [PATCH 34/41] fix: queue timeout fallback

---
 packages/core/src/traffic/traffic-circuit-breaker.ts     | 3 +--
 packages/core/src/traffic/traffic-controller-internal.ts | 1 +
 packages/core/src/traffic/traffic-controller.ts          | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts
index 2936a5870..72caf70ac 100644
--- a/packages/core/src/traffic/traffic-circuit-breaker.ts
+++ b/packages/core/src/traffic/traffic-circuit-breaker.ts
@@ -369,8 +369,7 @@ export class TrafficCircuitBreaker {
     next.circuitStatus = undefined;
     next.extractUsage = fallbackRequest.extractUsage;
     if (context?.reason === "queue-timeout") {
-      next.enqueuedAt = Date.now();
-      next.dispatchedAt = undefined;
+      next.queueTimeoutDisabled = true;
     }
     logger?.debug?.("Switched to fallback request", {
       previousCircuitKey: context?.previousCircuitKey,
diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts
index aa808b6da..fd2012cf5 100644 --- a/packages/core/src/traffic/traffic-controller-internal.ts +++ b/packages/core/src/traffic/traffic-controller-internal.ts @@ -42,6 +42,7 @@ export interface QueuedRequest { dispatchedAt?: number; estimatedTokens?: number; reservedTokens?: number; + queueTimeoutDisabled?: boolean; tenantConcurrencyKey?: string; providerModelConcurrencyKey?: string; diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index e6f2c5399..5afe82f8b 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -828,6 +828,9 @@ export class TrafficController { */ private resolveQueueTimeoutAt(next: QueuedRequest): number | undefined { + if (next.queueTimeoutDisabled) { + return next.request.deadlineAt; + } const maxQueueWaitMs = next.request.maxQueueWaitMs; const normalizedMaxWait = typeof maxQueueWaitMs === "number" && Number.isFinite(maxQueueWaitMs) From c315693a53f373b87facd84d901f119a279b4f07 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Thu, 25 Dec 2025 21:48:40 +0530 Subject: [PATCH 35/41] fix: missing traffic metadata --- .../server-core/src/schemas/agent.schemas.ts | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/packages/server-core/src/schemas/agent.schemas.ts b/packages/server-core/src/schemas/agent.schemas.ts index 52e80b832..41181e00f 100644 --- a/packages/server-core/src/schemas/agent.schemas.ts +++ b/packages/server-core/src/schemas/agent.schemas.ts @@ -77,6 +77,18 @@ export const GenerateOptionsSchema = z .object({ userId: z.string().optional().describe("Optional user ID for context tracking"), conversationId: z.string().optional().describe("Optional conversation ID for context tracking"), + tenantId: z.string().optional().describe("Optional tenant ID for traffic limits"), + trafficPriority: z + .enum(["P0", "P1", "P2"]) + .optional() + .describe("Optional traffic priority for scheduling (P0, P1, P2)"), + apiKeyId: z.string().optional().describe("Optional API key identifier for traffic limits"), + region: z.string().optional().describe("Optional region identifier for traffic limits"), + endpoint: z.string().optional().describe("Optional endpoint identifier for traffic limits"), + tenantTier: z + .string() + .optional() + .describe("Optional tenant tier identifier for traffic limits"), context: z .record(z.string(), z.unknown()) .nullish() @@ -94,6 +106,14 @@ export const GenerateOptionsSchema = z .positive() .optional() .describe("Maximum number of steps for this request"), + maxQueueWaitMs: z + .number() + .int() + .nonnegative() + .optional() + .describe("Maximum time to wait in the queue before timing out (ms)"), + taskType: z.string().optional().describe("Optional task classification for fallback policy"), + fallbackPolicyId: z.string().optional().describe("Optional explicit fallback policy id"), temperature: z .number() .min(0) From 50329dae6105ba57b8a6e39be7734a616282ffa6 Mon Sep 17 00:00:00 2001 From: riturajFi Date: Thu, 25 Dec 2025 22:34:41 +0530 Subject: [PATCH 36/41] =?UTF-8?q?fix:=20short=E2=80=91response=20fallback?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/core/src/agent/agent.ts | 401 ++++++++++++++++++ packages/core/src/index.ts | 1 + .../src/traffic/traffic-circuit-breaker.ts | 18 + .../core/src/traffic/traffic-controller.ts | 66 +-- packages/core/src/traffic/traffic-types.ts | 9 +- 5 files changed, 462 insertions(+), 33 deletions(-) diff --git 
a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index ca42da8a1..84343c041 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -51,6 +51,7 @@ import { ToolManager } from "../tool/manager"; import { type FallbackChainEntry, type TrafficPriority, + type TrafficRequest, type TrafficRequestMetadata, getTrafficController, } from "../traffic/traffic-controller"; @@ -508,6 +509,14 @@ export class Agent { extractUsage: (result: GenerateTextResultWithContext) => this.extractUsageFromResponse(result), createFallbackRequest: (fallbackTarget) => { + if (this.isShortResponseFallback(fallbackTarget)) { + return this.buildShortTextFallbackRequest( + tenantId, + metadata, + mergedOptions, + fallbackTarget.text, + ); + } const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = this.resolveFallbackTarget(fallbackTarget); return buildRequest(fallbackModel, fallbackProvider); @@ -891,6 +900,14 @@ export class Agent { extractUsage: (result: StreamTextResultWithContext) => this.extractUsageFromResponse(result), createFallbackRequest: (fallbackTarget) => { + if (this.isShortResponseFallback(fallbackTarget)) { + return this.buildShortStreamTextFallbackRequest( + tenantId, + metadata, + mergedOptions, + fallbackTarget.text, + ); + } const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = this.resolveFallbackTarget(fallbackTarget); return buildRequest(fallbackModel, fallbackProvider); @@ -1611,6 +1628,15 @@ export class Agent { extractUsage: (result: GenerateObjectResultWithContext>) => this.extractUsageFromResponse(result), createFallbackRequest: (fallbackTarget) => { + if (this.isShortResponseFallback(fallbackTarget)) { + return this.buildShortObjectFallbackRequest( + tenantId, + metadata, + schema, + mergedOptions, + fallbackTarget.text, + ); + } const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = this.resolveFallbackTarget(fallbackTarget); return buildRequest(fallbackModel, fallbackProvider); @@ -1895,6 +1921,15 @@ export class Agent { extractUsage: (result: StreamObjectResultWithContext>) => this.extractUsageFromResponse(result), createFallbackRequest: (fallbackTarget) => { + if (this.isShortResponseFallback(fallbackTarget)) { + return this.buildShortStreamObjectFallbackRequest( + tenantId, + metadata, + schema, + mergedOptions, + fallbackTarget.text, + ); + } const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = this.resolveFallbackTarget(fallbackTarget); return buildRequest(fallbackModel, fallbackProvider); @@ -4253,6 +4288,372 @@ export class Agent { }; } + private isShortResponseFallback( + target: FallbackChainEntry, + ): target is { kind: "short-response"; text: string } { + return ( + typeof target === "object" && + target !== null && + "kind" in target && + (target as { kind?: string }).kind === "short-response" + ); + } + + private buildShortResponseMetadata( + baseMetadata: TrafficRequestMetadata | undefined, + ): TrafficRequestMetadata { + const metadata = baseMetadata ?? 
this.buildTrafficMetadata(); + return { + ...metadata, + provider: "short-response", + model: "short-response", + }; + } + + private createZeroUsage(): LanguageModelUsage { + return { inputTokens: 0, outputTokens: 0, totalTokens: 0 }; + } + + private createShortTextStream(text: string): AsyncIterableStream { + return createAsyncIterableReadable((controller) => { + controller.enqueue(text); + controller.close(); + }); + } + + private createShortFullStream(text: string): AsyncIterableStream { + const usage = this.createZeroUsage(); + const id = `short-response-${randomUUID()}`; + return createAsyncIterableReadable((controller) => { + controller.enqueue({ + type: "text-delta", + id, + delta: text, + text, + } as VoltAgentTextStreamPart); + controller.enqueue({ + type: "finish", + finishReason: "stop", + usage, + totalUsage: usage, + } as VoltAgentTextStreamPart); + controller.close(); + }); + } + + private createShortTextResult( + text: string, + options?: GenerateTextOptions, + ): GenerateTextResultWithContext { + const usage = this.createZeroUsage(); + const context = toContextMap(options?.context) ?? new Map(); + const createTextStream = (): AsyncIterableStream => this.createShortTextStream(text); + + return { + text, + content: [], + reasoning: [], + reasoningText: "", + files: [], + sources: [], + toolCalls: [], + staticToolCalls: [], + dynamicToolCalls: [], + toolResults: [], + staticToolResults: [], + dynamicToolResults: [], + usage, + totalUsage: usage, + warnings: [], + finishReason: "stop", + steps: [], + experimental_output: undefined, + response: { + id: "short-response", + modelId: "short-response", + timestamp: new Date(), + messages: [], + }, + context, + request: { + body: {}, + }, + providerMetadata: undefined, + experimental_providerMetadata: undefined, + pipeTextStreamToResponse: (response, init) => { + pipeTextStreamToResponse({ + response, + textStream: createTextStream(), + ...(init ?? {}), + }); + }, + toTextStreamResponse: (init) => { + return createTextStreamResponse({ + textStream: createTextStream(), + ...(init ?? {}), + }); + }, + toDataStream: () => createTextStream(), + toDataStreamResponse: (init) => { + return createTextStreamResponse({ + textStream: createTextStream(), + ...(init ?? {}), + }); + }, + pipeDataStreamToResponse: (response, init) => { + pipeTextStreamToResponse({ + response, + textStream: createTextStream(), + ...(init ?? {}), + }); + }, + } as GenerateTextResultWithContext; + } + + private createShortStreamTextResult( + text: string, + options?: StreamTextOptions, + ): StreamTextResultWithContext { + const usage = this.createZeroUsage(); + const context = toContextMap(options?.context) ?? new Map(); + const createTextStream = (): AsyncIterableStream => this.createShortTextStream(text); + const createFullStream = (): AsyncIterableStream => + this.createShortFullStream(text); + + const toUIMessageStream = (_options?: unknown) => + createUIMessageStream({ + execute: async ({ writer }) => { + writer.write({ type: "text", text } as any); + }, + onError: (error) => String(error), + }); + + const toUIMessageStreamResponse = (options?: ResponseInit) => { + const stream = toUIMessageStream(options); + const responseInit = options ? { ...options } : {}; + return createUIMessageStreamResponse({ + stream, + ...responseInit, + }); + }; + + const pipeUIMessageStreamToResponse = (response: any, init?: ResponseInit) => { + const stream = toUIMessageStream(init); + const initOptions = init ? 
{ ...init } : {}; + pipeUIMessageStreamToResponse({ + response, + stream, + ...initOptions, + }); + }; + + return { + text: Promise.resolve(text), + get textStream() { + return createTextStream(); + }, + get fullStream() { + return createFullStream(); + }, + usage: Promise.resolve(usage), + finishReason: Promise.resolve("stop"), + experimental_partialOutputStream: undefined, + toUIMessageStream: toUIMessageStream as StreamTextResultWithContext["toUIMessageStream"], + toUIMessageStreamResponse: + toUIMessageStreamResponse as StreamTextResultWithContext["toUIMessageStreamResponse"], + pipeUIMessageStreamToResponse: + pipeUIMessageStreamToResponse as StreamTextResultWithContext["pipeUIMessageStreamToResponse"], + pipeTextStreamToResponse: (response, init) => { + pipeTextStreamToResponse({ + response, + textStream: createTextStream(), + ...(init ?? {}), + }); + }, + toTextStreamResponse: (init) => { + return createTextStreamResponse({ + textStream: createTextStream(), + ...(init ?? {}), + }); + }, + context, + }; + } + + private resolveShortResponseObject(schema: T, text: string): z.infer { + const candidates: unknown[] = []; + if (text.length > 0) { + try { + candidates.push(JSON.parse(text)); + } catch {} + } + candidates.push(text); + candidates.push({ text }); + for (const candidate of candidates) { + const parsed = schema.safeParse(candidate); + if (parsed.success) { + return parsed.data; + } + } + return (candidates[0] ?? text) as z.infer; + } + + private createShortObjectResult( + schema: T, + text: string, + options?: GenerateObjectOptions, + ): GenerateObjectResultWithContext> { + const object = this.resolveShortResponseObject(schema, text); + const usage = this.createZeroUsage(); + const context = toContextMap(options?.context) ?? new Map(); + + return { + object, + usage, + warnings: [], + finishReason: "stop", + response: { + id: "short-response", + modelId: "short-response", + timestamp: new Date(), + messages: [], + }, + context, + request: { + body: {}, + }, + reasoning: "", + providerMetadata: undefined, + toJsonResponse: (init?: ResponseInit) => { + const responseInit = init ? { ...init } : {}; + const headers = { + "content-type": "application/json", + ...(responseInit.headers ?? {}), + }; + return new Response(safeStringify(object), { + ...responseInit, + headers, + }); + }, + } as GenerateObjectResultWithContext>; + } + + private createShortStreamObjectResult( + schema: T, + text: string, + options?: StreamObjectOptions, + ): StreamObjectResultWithContext> { + const object = this.resolveShortResponseObject(schema, text); + const usage = this.createZeroUsage(); + const context = toContextMap(options?.context) ?? new Map(); + const textPayload = safeStringify(object); + const createTextStream = (): AsyncIterableStream => + this.createShortTextStream(textPayload); + + const partialObjectStream = new ReadableStream>>({ + start(controller) { + controller.enqueue(object); + controller.close(); + }, + }); + + return { + object: Promise.resolve(object), + partialObjectStream, + textStream: createTextStream(), + warnings: Promise.resolve(undefined), + usage: Promise.resolve(usage), + finishReason: Promise.resolve("stop"), + pipeTextStreamToResponse: (response, init) => { + pipeTextStreamToResponse({ + response, + textStream: createTextStream(), + ...(init ?? {}), + }); + }, + toTextStreamResponse: (init) => { + return createTextStreamResponse({ + textStream: createTextStream(), + ...(init ?? 
{}), + }); + }, + context, + }; + } + + private buildShortTextFallbackRequest( + tenantId: string, + metadata: TrafficRequestMetadata | undefined, + options: GenerateTextOptions | undefined, + text: string, + ): TrafficRequest { + const shortMetadata = this.buildShortResponseMetadata(metadata); + return { + tenantId, + metadata: shortMetadata, + maxQueueWaitMs: options?.maxQueueWaitMs, + estimatedTokens: 0, + execute: async () => this.createShortTextResult(text, options), + extractUsage: (result: GenerateTextResultWithContext) => + this.extractUsageFromResponse(result), + }; + } + + private buildShortStreamTextFallbackRequest( + tenantId: string, + metadata: TrafficRequestMetadata | undefined, + options: StreamTextOptions | undefined, + text: string, + ): TrafficRequest { + const shortMetadata = this.buildShortResponseMetadata(metadata); + return { + tenantId, + metadata: shortMetadata, + maxQueueWaitMs: options?.maxQueueWaitMs, + estimatedTokens: 0, + execute: async () => this.createShortStreamTextResult(text, options), + extractUsage: (result: StreamTextResultWithContext) => this.extractUsageFromResponse(result), + }; + } + + private buildShortObjectFallbackRequest( + tenantId: string, + metadata: TrafficRequestMetadata | undefined, + schema: T, + options: GenerateObjectOptions | undefined, + text: string, + ): TrafficRequest>> { + const shortMetadata = this.buildShortResponseMetadata(metadata); + return { + tenantId, + metadata: shortMetadata, + maxQueueWaitMs: options?.maxQueueWaitMs, + estimatedTokens: 0, + execute: async () => this.createShortObjectResult(schema, text, options), + extractUsage: (result: GenerateObjectResultWithContext>) => + this.extractUsageFromResponse(result), + }; + } + + private buildShortStreamObjectFallbackRequest( + tenantId: string, + metadata: TrafficRequestMetadata | undefined, + schema: T, + options: StreamObjectOptions | undefined, + text: string, + ): TrafficRequest>> { + const shortMetadata = this.buildShortResponseMetadata(metadata); + return { + tenantId, + metadata: shortMetadata, + maxQueueWaitMs: options?.maxQueueWaitMs, + estimatedTokens: 0, + execute: async () => this.createShortStreamObjectResult(schema, text, options), + extractUsage: (result: StreamObjectResultWithContext>) => + this.extractUsageFromResponse(result), + }; + } + private updateTrafficControllerRateLimits( response: unknown, metadata: TrafficRequestMetadata | undefined, diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 3850f0acf..9dee43331 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -37,6 +37,7 @@ export { type RateLimitKey, type RateLimitOptions, type AdaptiveLimiterConfig, + type PriorityWeights, type PriorityBurstLimits, type TrafficRequest, type TrafficRequestMetadata, diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts index 72caf70ac..20d166ca2 100644 --- a/packages/core/src/traffic/traffic-circuit-breaker.ts +++ b/packages/core/src/traffic/traffic-circuit-breaker.ts @@ -379,6 +379,17 @@ export class TrafficCircuitBreaker { }); } + private isShortResponseFallback( + candidate: FallbackChainEntry, + ): candidate is { kind: "short-response"; text: string } { + return ( + typeof candidate === "object" && + candidate !== null && + "kind" in candidate && + (candidate as { kind?: string }).kind === "short-response" + ); + } + private findFallbackTarget( metadata: TrafficRequestMetadata | undefined, visitedKeys: Set, @@ -401,6 +412,13 @@ export class 
TrafficCircuitBreaker { } for (const candidate of chain) { + if (this.isShortResponseFallback(candidate)) { + logger?.debug?.("Selected short-response fallback", { + currentModel, + currentProvider: provider, + }); + return candidate; + } const target = this.normalizeFallbackTarget(candidate, provider); const candidateMetadata: TrafficRequestMetadata = { ...(metadata ?? {}), diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 5afe82f8b..269304d9c 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -24,6 +24,7 @@ import type { FallbackPolicyMode, FallbackTarget, PriorityBurstLimits, + PriorityWeights, ProviderModelConcurrencyLimit, RateLimitConfig, RateLimitKey, @@ -55,6 +56,7 @@ export type { FallbackPolicyMode, FallbackTarget, PriorityBurstLimits, + PriorityWeights, ProviderModelConcurrencyLimit, RateLimitConfig, RateLimitKey, @@ -95,7 +97,7 @@ type AdaptiveLimiterState = { last429At?: number; }; -const DEFAULT_PRIORITY_BURST_LIMITS: Record = { +const DEFAULT_PRIORITY_WEIGHTS: Record = { P0: 5, P1: 3, P2: 2, @@ -128,12 +130,8 @@ export class TrafficController { P2: { order: [], index: 0, queues: new Map() }, }; private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"]; - private readonly priorityBurstLimits: Record; - private readonly priorityBurstCounts: Record = { - P0: 0, - P1: 0, - P2: 0, - }; + private readonly priorityWeights: Record; + private readonly priorityCredits: Record; private activeCount = 0; private drainScheduled = false; @@ -159,10 +157,17 @@ export class TrafficController { this.scheduler = this.createScheduler(); this.rateLimitKeyBuilder = options.rateLimitKeyBuilder ?? buildRateLimitKeyFromMetadata; this.retryPolicy = options.retryPolicy; - this.priorityBurstLimits = { - ...DEFAULT_PRIORITY_BURST_LIMITS, - ...(options.priorityBurstLimits ?? {}), + const priorityOverrides = options.priorityWeights ?? options.priorityBurstLimits; + const priorityWeights = { + ...DEFAULT_PRIORITY_WEIGHTS, + ...(priorityOverrides ?? {}), + }; + this.priorityWeights = { + P0: Math.max(0, Math.floor(priorityWeights.P0)), + P1: Math.max(0, Math.floor(priorityWeights.P1)), + P2: Math.max(0, Math.floor(priorityWeights.P2)), }; + this.priorityCredits = { ...this.priorityWeights }; this.adaptiveLimiterConfig = { ...DEFAULT_ADAPTIVE_LIMITER, ...(options.adaptiveLimiter ?? 
{}), @@ -203,6 +208,7 @@ export class TrafficController { hasStrategyOverrides: options.rateLimitStrategy !== undefined, hasRetryPolicy: options.retryPolicy !== undefined, hasPriorityBurstLimits: options.priorityBurstLimits !== undefined, + hasPriorityWeights: options.priorityWeights !== undefined, hasAdaptiveLimiter: options.adaptiveLimiter !== undefined, }); } @@ -957,35 +963,31 @@ export class TrafficController { return total; } - private hasQueuedWorkBelow(priority: TrafficPriority): boolean { - const index = this.priorityOrder.indexOf(priority); - if (index < 0) return false; - for (let i = index + 1; i < this.priorityOrder.length; i += 1) { - if (this.getQueuedCount(this.priorityOrder[i]) > 0) { - return true; - } - } - return false; - } - - private canDispatchPriority(priority: TrafficPriority): boolean { - const limit = this.priorityBurstLimits[priority]; - if (!Number.isFinite(limit) || limit <= 0) return true; - if (this.priorityBurstCounts[priority] < limit) return true; - return !this.hasQueuedWorkBelow(priority); + private refillPriorityCredits(): void { + this.priorityCredits.P0 = this.priorityWeights.P0; + this.priorityCredits.P1 = this.priorityWeights.P1; + this.priorityCredits.P2 = this.priorityWeights.P2; } private recordPriorityDispatch(priority: TrafficPriority): void { - for (const key of this.priorityOrder) { - if (key !== priority) { - this.priorityBurstCounts[key] = 0; - } + if (this.priorityCredits[priority] > 0) { + this.priorityCredits[priority] -= 1; } - this.priorityBurstCounts[priority] += 1; } private getPriorityDispatchOrder(): TrafficPriority[] { - return this.priorityOrder.filter((priority) => this.canDispatchPriority(priority)); + const prioritiesWithWork = this.priorityOrder.filter( + (priority) => this.getQueuedCount(priority) > 0, + ); + if (prioritiesWithWork.length === 0) return []; + + let available = prioritiesWithWork.filter((priority) => this.priorityCredits[priority] > 0); + if (available.length === 0) { + this.refillPriorityCredits(); + available = prioritiesWithWork.filter((priority) => this.priorityCredits[priority] > 0); + } + + return available.length === 0 ? prioritiesWithWork : available; } private getNextTenantCandidate( diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts index 80cfc3724..1d847e252 100644 --- a/packages/core/src/traffic/traffic-types.ts +++ b/packages/core/src/traffic/traffic-types.ts @@ -68,7 +68,12 @@ export type FallbackTarget = { model: string; }; -export type FallbackChainEntry = string | FallbackTarget; +export type ShortResponseFallbackTarget = { + kind: "short-response"; + text: string; +}; + +export type FallbackChainEntry = string | FallbackTarget | ShortResponseFallbackTarget; export type FallbackPolicyMode = "fallback" | "wait"; @@ -93,6 +98,7 @@ export type TenantConcurrencyLimit = | ((tenantId: string, metadata: TrafficRequestMetadata | undefined) => number | undefined); export type PriorityBurstLimits = Partial>; +export type PriorityWeights = Partial>; export type AdaptiveLimiterConfig = { windowMs?: number; @@ -126,6 +132,7 @@ export interface TrafficControllerOptions { maxConcurrentPerTenant?: TenantConcurrencyLimit; rateLimits?: RateLimitConfig; priorityBurstLimits?: PriorityBurstLimits; + priorityWeights?: PriorityWeights; adaptiveLimiter?: AdaptiveLimiterConfig; /** * Optional override for rate-limit key construction. 
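Taken together, the hunks above move priority scheduling from burst limits to weighted credits (priorityWeights, defaulting to P0=5, P1=3, P2=2) and keep the default rate-limit key tenant-agnostic, leaving rateLimitKeyBuilder as the opt-in hook for per-tenant limits. A minimal consumer-side sketch of that wiring follows; it assumes the "provider::model" key shape seen in the "p::m" spec fixture and infers the builder signature from buildRateLimitKeyFromMetadata, so treat the names and shapes as illustrative rather than documented API:

import { getTrafficController, type TrafficRequestMetadata } from "@voltagent/core";

// Sketch only: option names come from TrafficControllerOptions in this patch series.
const controller = getTrafficController({
  maxConcurrent: 8,
  // Weighted credits: per refill cycle P0 may dispatch 5 times, P1 three times,
  // and P2 twice before credits are replenished.
  priorityWeights: { P0: 5, P1: 3, P2: 2 },
  // Static per provider/model limits, keyed "provider::model" like the spec's "p::m".
  rateLimits: {
    "openai::gpt-4o": { requestsPerMinute: 60, tokensPerMinute: 90_000 },
  },
  // The default key intentionally excludes tenantId; fold it back in here when
  // per-tenant throttling is desired (builder signature inferred, not documented).
  rateLimitKeyBuilder: (metadata?: TrafficRequestMetadata) =>
    [
      metadata?.provider ?? "unknown",
      metadata?.model ?? "unknown",
      metadata?.tenantId ?? "default",
    ].join("::"),
});

// getTrafficController returns a shared singleton, so creating it early (as the
// manual tests above do) ensures every Agent call flows through the same scheduler.
void controller;
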
From 7bf13483f7a590f7f92a401b9832afbaa24ff893 Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Fri, 26 Dec 2025 13:06:56 +0530
Subject: [PATCH 37/41] fix: stream concurrency fix

---
 diff.txt                                       | 9297 +++++++++++++++++
 .../src/traffic/traffic-controller.spec.ts     |   46 +
 .../core/src/traffic/traffic-controller.ts     |  404 +-
 packages/core/src/traffic/traffic-types.ts     |    1 +
 4 files changed, 9594 insertions(+), 154 deletions(-)
 create mode 100644 diff.txt

diff --git a/diff.txt b/diff.txt
new file mode 100644
index 000000000..b393df88f
--- /dev/null
+++ b/diff.txt
@@ -0,0 +1,9297 @@
+diff --git a/commits.txt b/commits.txt
+new file mode 100644
+index 00000000..73fd43c5
+--- /dev/null
++++ b/commits.txt
+@@ -0,0 +1,6 @@
++e8443df2
++9503a0a6
++293fe825
++a88ecd67
++66d74dd2
++53f34370
+\ No newline at end of file
+diff --git a/examples/with-client-side-tools/next-env.d.ts b/examples/with-client-side-tools/next-env.d.ts
+index 1b3be084..9edff1c7 100644
+--- a/examples/with-client-side-tools/next-env.d.ts
++++ b/examples/with-client-side-tools/next-env.d.ts
+@@ -1,5 +1,6 @@
+ /// <reference types="next" />
+ /// <reference types="next/image-types/global" />
++import "./.next/types/routes.d.ts";
+
+ // NOTE: This file should not be edited
+ // see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
+diff --git a/examples/with-client-side-tools/tsconfig.json b/examples/with-client-side-tools/tsconfig.json
+index 3697fcb9..0fca67d3 100644
+--- a/examples/with-client-side-tools/tsconfig.json
++++ b/examples/with-client-side-tools/tsconfig.json
+@@ -1,6 +1,10 @@
+ {
+   "compilerOptions": {
+-    "lib": ["dom", "dom.iterable", "esnext"],
++    "lib": [
++      "dom",
++      "dom.iterable",
++      "esnext"
++    ],
+     "allowJs": true,
+     "skipLibCheck": true,
+     "strict": true,
+@@ -11,7 +15,7 @@
+     "resolveJsonModule": true,
+     "isolatedModules": true,
+     "sourceMap": true,
+-    "jsx": "preserve",
++    "jsx": "react-jsx",
+     "incremental": true,
+     "plugins": [
+       {
+@@ -19,10 +23,20 @@
+       }
+     ],
+     "paths": {
+-      "@/*": ["./*"]
++      "@/*": [
++        "./*"
++      ]
+     },
+     "target": "ES2017"
+   },
+-  "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
+-  "exclude": ["node_modules"]
++  "include": [
++    "next-env.d.ts",
++    "**/*.ts",
++    "**/*.tsx",
++    ".next/types/**/*.ts",
++    ".next/dev/types/**/*.ts"
++  ],
++  "exclude": [
++    "node_modules"
++  ]
+ }
+diff --git a/examples/with-netlify-functions/netlify/functions/voltagent.js b/examples/with-netlify-functions/netlify/functions/voltagent.js
+new file mode 100644
+index 00000000..0ec386b8
+--- /dev/null
++++ b/examples/with-netlify-functions/netlify/functions/voltagent.js
+@@ -0,0 +1,4 @@
++import { createNetlifyFunctionHandler } from "@voltagent/serverless-hono";
++import { getVoltAgent } from "../../src/index";
++const voltAgent = getVoltAgent();
++export const handler = createNetlifyFunctionHandler(voltAgent);
+diff --git a/examples/with-netlify-functions/src/index.js b/examples/with-netlify-functions/src/index.js
+new file mode 100644
+index 00000000..af385b50
+--- /dev/null
++++ b/examples/with-netlify-functions/src/index.js
+@@ -0,0 +1,17 @@
++import { openai } from "@ai-sdk/openai";
++import { Agent, VoltAgent } from "@voltagent/core";
++import { serverlessHono } from "@voltagent/serverless-hono";
++import { weatherTool } from "./tools";
++const agent = new Agent({
++  name: "netlify-function-agent",
++  instructions: "Help the user quickly and call tools when needed.",
++  model: openai("gpt-4o-mini"),
++  tools: [weatherTool],
++});
++const voltAgent = new VoltAgent({
++  agents: { agent },
++  serverless: serverlessHono(),
++});
++export function getVoltAgent() { ++ return voltAgent; ++} +diff --git a/examples/with-netlify-functions/src/tools/index.js b/examples/with-netlify-functions/src/tools/index.js +new file mode 100644 +index 00000000..d1c5bf43 +--- /dev/null ++++ b/examples/with-netlify-functions/src/tools/index.js +@@ -0,0 +1,26 @@ ++import { createTool } from "@voltagent/core"; ++import z from "zod"; ++export const weatherTool = createTool({ ++ id: "get-weather", ++ name: "getWeather", ++ description: "Return a mock weather report for the requested location", ++ parameters: z.object({ ++ location: z.string().describe("City or location to look up"), ++ }), ++ execute: async ({ location }, context) => { ++ context?.logger.info(`Fetching weather for ${location}`); ++ const mockWeatherData = { ++ location, ++ temperature: Math.floor(Math.random() * 30) + 5, ++ condition: ["Sunny", "Cloudy", "Rainy", "Snowy", "Partly Cloudy"][ ++ Math.floor(Math.random() * 5) ++ ], ++ humidity: Math.floor(Math.random() * 60) + 30, ++ windSpeed: Math.floor(Math.random() * 30), ++ }; ++ return { ++ weather: mockWeatherData, ++ message: `Current weather in ${location}: ${mockWeatherData.temperature}°C and ${mockWeatherData.condition.toLowerCase()} with ${mockWeatherData.humidity}% humidity and wind speed of ${mockWeatherData.windSpeed} km/h.`, ++ }; ++ }, ++}); +diff --git a/package.json b/package.json +index 7c80f7c5..7e3ef8ba 100644 +--- a/package.json ++++ b/package.json +@@ -32,9 +32,10 @@ + "publint": "^0.3.8", + "rimraf": "^5.0.5", + "syncpack": "^13.0.2", ++ "ts-node": "^10.9.2", + "tslib": "^2.3.0", + "tsup": "^8.5.0", +- "typescript": "^5.8.2", ++ "typescript": "^5.9.2", + "vite": "^7.2.7", + "vitest": "^3.2.4" + }, +diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts +index 291bdf7f..84343c04 100644 +--- a/packages/core/src/agent/agent.ts ++++ b/packages/core/src/agent/agent.ts +@@ -48,6 +48,14 @@ import type { BaseRetriever } from "../retriever/retriever"; + import type { Tool, Toolkit } from "../tool"; + import { createTool } from "../tool"; + import { ToolManager } from "../tool/manager"; ++import { ++ type FallbackChainEntry, ++ type TrafficPriority, ++ type TrafficRequest, ++ type TrafficRequestMetadata, ++ getTrafficController, ++} from "../traffic/traffic-controller"; ++import { findHeaders } from "../traffic/traffic-error-utils"; + import { randomUUID } from "../utils/id"; + import { convertModelMessagesToUIMessages } from "../utils/message-converter"; + import { NodeType, createNodeId } from "../utils/node-utils"; +@@ -262,8 +270,42 @@ export interface BaseGenerationOptions extends Partial { + // Context + userId?: string; + conversationId?: string; ++ tenantId?: string; ++ /** ++ * Optional key metadata for per-key rate limits. ++ */ ++ apiKeyId?: string; ++ /** ++ * Optional region metadata for per-region rate limits. ++ */ ++ region?: string; ++ /** ++ * Optional endpoint metadata for per-endpoint rate limits. ++ */ ++ endpoint?: string; ++ /** ++ * Optional tenant tier metadata for per-tier rate limits. ++ */ ++ tenantTier?: string; + context?: ContextInput; + elicitation?: (request: unknown) => Promise; ++ /** ++ * Optional priority override for scheduling. ++ * Defaults to agent-level priority when omitted. ++ */ ++ trafficPriority?: TrafficPriority; ++ /** ++ * Optional maximum time to wait in the queue before timing out. ++ */ ++ maxQueueWaitMs?: number; ++ /** ++ * Optional task classification for circuit-breaker fallback policies. 
++ */ ++ taskType?: string; ++ /** ++ * Optional explicit fallback policy id. ++ */ ++ fallbackPolicyId?: string; + + // Parent tracking + parentAgentId?: string; +@@ -303,6 +345,8 @@ export interface BaseGenerationOptions extends Partial { + + // Provider-specific options + providerOptions?: ProviderOptions; ++ // Optional per-call model override (used for fallbacks) ++ model?: LanguageModel; + + // Experimental output (for structured generation) + experimental_output?: ReturnType | ReturnType; +@@ -347,6 +391,7 @@ export class Agent { + readonly voice?: Voice; + readonly retriever?: BaseRetriever; + readonly supervisorConfig?: SupervisorConfig; ++ private readonly trafficPriority: TrafficPriority; + private readonly context?: Map; + + private readonly logger: Logger; +@@ -372,6 +417,7 @@ export class Agent { + this.temperature = options.temperature; + this.maxOutputTokens = options.maxOutputTokens; + this.maxSteps = options.maxSteps || 5; ++ this.trafficPriority = options.trafficPriority ?? "P1"; + this.stopWhen = options.stopWhen; + this.markdown = options.markdown ?? false; + this.voice = options.voice; +@@ -444,6 +490,47 @@ export class Agent { + async generateText( + input: string | UIMessage[] | BaseMessage[], + options?: GenerateTextOptions, ++ ): Promise { ++ const controller = getTrafficController({ logger: this.logger }); // Use shared controller so all agent calls flow through central queue/metrics ++ const tenantId = this.resolveTenantId(options); ++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { ++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); ++ const metadata = this.buildTrafficMetadata( ++ mergedOptions?.model, ++ mergedOptions, ++ providerOverride, ++ ); // Compute once per queued request (including per-call model overrides) ++ return { ++ tenantId, ++ metadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ estimatedTokens: this.estimateTokens(input, mergedOptions), ++ execute: () => this.executeGenerateText(input, mergedOptions, metadata), // Defer actual execution so controller can schedule it ++ extractUsage: (result: GenerateTextResultWithContext) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackTarget) => { ++ if (this.isShortResponseFallback(fallbackTarget)) { ++ return this.buildShortTextFallbackRequest( ++ tenantId, ++ metadata, ++ mergedOptions, ++ fallbackTarget.text, ++ ); ++ } ++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = ++ this.resolveFallbackTarget(fallbackTarget); ++ return buildRequest(fallbackModel, fallbackProvider); ++ }, ++ }; ++ }; ++ ++ return controller.handleText(buildRequest(options?.model)); ++ } ++ ++ private async executeGenerateText( ++ input: string | UIMessage[] | BaseMessage[], ++ options?: GenerateTextOptions, ++ trafficMetadata?: TrafficRequestMetadata, + ): Promise { + const startTime = Date.now(); + const oc = this.createOperationContext(input, options); +@@ -471,7 +558,7 @@ export class Agent { + options, + ); + +- const modelName = this.getModelName(); ++ const modelName = this.getModelName(model); + const contextLimit = options?.contextLimit; + + // Add model attributes and all options +@@ -544,10 +631,20 @@ export class Agent { + hooks, + maxSteps: userMaxSteps, + tools: userTools, ++ maxQueueWaitMs, ++ taskType, ++ fallbackPolicyId, + experimental_output, + providerOptions, ++ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries) ++ model: _model, // Exclude 
model so aiSDKOptions doesn't override resolved model + ...aiSDKOptions + } = options || {}; ++ void _model; ++ void _maxRetries; ++ void maxQueueWaitMs; ++ void taskType; ++ void fallbackPolicyId; + + const llmSpan = this.createLLMSpan(oc, { + operation: "generateText", +@@ -567,6 +664,11 @@ export class Agent { + + let result!: GenerateTextResult; + try { ++ methodLogger.info("[AI SDK] Calling generateText", { ++ messageCount: messages.length, ++ modelName, ++ tools: tools ? Object.keys(tools) : [], ++ }); + result = await oc.traceContext.withSpan(llmSpan, () => + generateText({ + model, +@@ -575,7 +677,7 @@ export class Agent { + // Default values + temperature: this.temperature, + maxOutputTokens: this.maxOutputTokens, +- maxRetries: 3, ++ maxRetries: 0, + stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps), + // User overrides from AI SDK options + ...aiSDKOptions, +@@ -588,7 +690,15 @@ export class Agent { + onStepFinish: this.createStepHandler(oc, options), + }), + ); ++ methodLogger.info("[AI SDK] Received generateText result", { ++ finishReason: result.finishReason, ++ usage: result.usage ? safeStringify(result.usage) : undefined, ++ stepCount: result.steps?.length ?? 0, ++ rawResult: safeStringify(result), ++ }); ++ this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger); + } catch (error) { ++ this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger); + finalizeLLMSpan(SpanStatusCode.ERROR, { message: (error as Error).message }); + throw error; + } +@@ -771,6 +881,47 @@ export class Agent { + async streamText( + input: string | UIMessage[] | BaseMessage[], + options?: StreamTextOptions, ++ ): Promise { ++ const controller = getTrafficController({ logger: this.logger }); // Same controller handles streaming to keep ordering/backpressure consistent ++ const tenantId = this.resolveTenantId(options); ++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { ++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); ++ const metadata = this.buildTrafficMetadata( ++ mergedOptions?.model, ++ mergedOptions, ++ providerOverride, ++ ); // Compute once per queued request (including per-call model overrides) ++ return { ++ tenantId, ++ metadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ estimatedTokens: this.estimateTokens(input, mergedOptions), ++ execute: () => this.executeStreamText(input, mergedOptions, metadata), // Actual streaming work happens after the controller dequeues us ++ extractUsage: (result: StreamTextResultWithContext) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackTarget) => { ++ if (this.isShortResponseFallback(fallbackTarget)) { ++ return this.buildShortStreamTextFallbackRequest( ++ tenantId, ++ metadata, ++ mergedOptions, ++ fallbackTarget.text, ++ ); ++ } ++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = ++ this.resolveFallbackTarget(fallbackTarget); ++ return buildRequest(fallbackModel, fallbackProvider); ++ }, ++ }; ++ }; ++ ++ return controller.handleStream(buildRequest(options?.model)); ++ } ++ ++ private async executeStreamText( ++ input: string | UIMessage[] | BaseMessage[], ++ options?: StreamTextOptions, ++ trafficMetadata?: TrafficRequestMetadata, + ): Promise { + const startTime = Date.now(); + const oc = this.createOperationContext(input, options); +@@ -800,7 +951,7 @@ export class Agent { + options, + ); + +- const modelName = this.getModelName(); ++ const modelName = 
this.getModelName(model); + const contextLimit = options?.contextLimit; + + // Add model attributes to root span if TraceContext exists +@@ -868,10 +1019,20 @@ export class Agent { + maxSteps: userMaxSteps, + tools: userTools, + onFinish: userOnFinish, ++ maxQueueWaitMs, ++ taskType, ++ fallbackPolicyId, + experimental_output, + providerOptions, ++ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries) ++ model: _model, // Exclude model from aiSDKOptions to avoid overriding resolved model + ...aiSDKOptions + } = options || {}; ++ void _model; ++ void _maxRetries; ++ void maxQueueWaitMs; ++ void taskType; ++ void fallbackPolicyId; + + const guardrailStreamingEnabled = guardrailSet.output.length > 0; + +@@ -893,7 +1054,13 @@ export class Agent { + }, + }); + const finalizeLLMSpan = this.createLLMSpanFinalizer(llmSpan); ++ const trafficController = getTrafficController({ logger: this.logger }); + ++ methodLogger.info("[AI SDK] Calling streamText", { ++ messageCount: messages.length, ++ modelName, ++ tools: tools ? Object.keys(tools) : [], ++ }); + const result = streamText({ + model, + messages, +@@ -901,7 +1068,7 @@ export class Agent { + // Default values + temperature: this.temperature, + maxOutputTokens: this.maxOutputTokens, +- maxRetries: 3, ++ maxRetries: 0, // Retry via traffic controller to avoid provider-level storms + stopWhen: options?.stopWhen ?? this.stopWhen ?? stepCountIs(maxSteps), + // User overrides from AI SDK options + ...aiSDKOptions, +@@ -937,6 +1104,8 @@ export class Agent { + modelName: this.getModelName(), + }); + ++ this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger); ++ trafficController.reportStreamFailure(trafficMetadata, actualError); + finalizeLLMSpan(SpanStatusCode.ERROR, { message: (actualError as Error)?.message }); + + // History update removed - using OpenTelemetry only +@@ -962,6 +1131,18 @@ export class Agent { + .catch(() => {}); + }, + onFinish: async (finalResult) => { ++ methodLogger.info("[AI SDK] streamText finished", { ++ finishReason: finalResult.finishReason, ++ usage: finalResult.totalUsage ? safeStringify(finalResult.totalUsage) : undefined, ++ stepCount: finalResult.steps?.length ?? 0, ++ rawResult: safeStringify(finalResult), ++ }); ++ this.updateTrafficControllerRateLimits( ++ finalResult.response, ++ trafficMetadata, ++ methodLogger, ++ ); ++ trafficController.reportStreamSuccess(trafficMetadata); + const providerUsage = finalResult.usage + ? 
await Promise.resolve(finalResult.usage) + : undefined; +@@ -1428,6 +1609,49 @@ export class Agent { + input: string | UIMessage[] | BaseMessage[], + schema: T, + options?: GenerateObjectOptions, ++ ): Promise>> { ++ const controller = getTrafficController({ logger: this.logger }); ++ const tenantId = this.resolveTenantId(options); ++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { ++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); ++ const metadata = this.buildTrafficMetadata( ++ mergedOptions?.model, ++ mergedOptions, ++ providerOverride, ++ ); // Compute once per queued request (including per-call model overrides) ++ return { ++ tenantId, ++ metadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ estimatedTokens: this.estimateTokens(input, mergedOptions), ++ execute: () => this.executeGenerateObject(input, schema, mergedOptions, metadata), ++ extractUsage: (result: GenerateObjectResultWithContext>) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackTarget) => { ++ if (this.isShortResponseFallback(fallbackTarget)) { ++ return this.buildShortObjectFallbackRequest( ++ tenantId, ++ metadata, ++ schema, ++ mergedOptions, ++ fallbackTarget.text, ++ ); ++ } ++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = ++ this.resolveFallbackTarget(fallbackTarget); ++ return buildRequest(fallbackModel, fallbackProvider); ++ }, ++ }; ++ }; ++ ++ return controller.handleText(buildRequest(options?.model)); ++ } ++ ++ private async executeGenerateObject( ++ input: string | UIMessage[] | BaseMessage[], ++ schema: T, ++ options?: GenerateObjectOptions, ++ trafficMetadata?: TrafficRequestMetadata, + ): Promise>> { + const startTime = Date.now(); + const oc = this.createOperationContext(input, options); +@@ -1452,7 +1676,7 @@ export class Agent { + options, + ); + +- const modelName = this.getModelName(); ++ const modelName = this.getModelName(model); + const schemaName = schema.description || "unknown"; + + // Add model attributes and all options +@@ -1510,10 +1734,25 @@ export class Agent { + hooks, + maxSteps: userMaxSteps, + tools: userTools, ++ taskType, ++ fallbackPolicyId, ++ maxQueueWaitMs, + providerOptions, ++ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries) ++ model: _model, // Exclude model so spread does not override resolved model + ...aiSDKOptions + } = options || {}; +- ++ void _model; ++ void _maxRetries; ++ void taskType; ++ void fallbackPolicyId; ++ void maxQueueWaitMs; ++ ++ methodLogger.info("[AI SDK] Calling generateObject", { ++ messageCount: messages.length, ++ modelName, ++ schemaName, ++ }); + const result = await generateObject({ + model, + messages, +@@ -1522,7 +1761,7 @@ export class Agent { + // Default values + maxOutputTokens: this.maxOutputTokens, + temperature: this.temperature, +- maxRetries: 3, ++ maxRetries: 0, + // User overrides from AI SDK options + ...aiSDKOptions, + // Provider-specific options +@@ -1530,6 +1769,13 @@ export class Agent { + // VoltAgent controlled + abortSignal: oc.abortController.signal, + }); ++ methodLogger.info("[AI SDK] Received generateObject result", { ++ finishReason: result.finishReason, ++ usage: result.usage ? 
safeStringify(result.usage) : undefined, ++ warnings: result.warnings, ++ rawResult: safeStringify(result), ++ }); ++ this.updateTrafficControllerRateLimits(result.response, trafficMetadata, methodLogger); + + const usageInfo = convertUsage(result.usage); + const finalObject = await executeOutputGuardrails({ +@@ -1638,6 +1884,7 @@ export class Agent { + context: oc.context, + }; + } catch (error) { ++ this.updateTrafficControllerRateLimits(error, trafficMetadata, methodLogger); + await this.flushPendingMessagesOnError(oc).catch(() => {}); + return this.handleError(error as Error, oc, options, startTime); + } finally { +@@ -1655,6 +1902,49 @@ export class Agent { + input: string | UIMessage[] | BaseMessage[], + schema: T, + options?: StreamObjectOptions, ++ ): Promise>> { ++ const controller = getTrafficController({ logger: this.logger }); ++ const tenantId = this.resolveTenantId(options); ++ const buildRequest = (modelOverride?: LanguageModel, providerOverride?: string) => { ++ const mergedOptions = this.mergeOptionsWithModel(options, modelOverride); ++ const metadata = this.buildTrafficMetadata( ++ mergedOptions?.model, ++ mergedOptions, ++ providerOverride, ++ ); // Compute once per queued request (including per-call model overrides) ++ return { ++ tenantId, ++ metadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ estimatedTokens: this.estimateTokens(input, mergedOptions), ++ execute: () => this.executeStreamObject(input, schema, mergedOptions, metadata), ++ extractUsage: (result: StreamObjectResultWithContext>) => ++ this.extractUsageFromResponse(result), ++ createFallbackRequest: (fallbackTarget) => { ++ if (this.isShortResponseFallback(fallbackTarget)) { ++ return this.buildShortStreamObjectFallbackRequest( ++ tenantId, ++ metadata, ++ schema, ++ mergedOptions, ++ fallbackTarget.text, ++ ); ++ } ++ const { modelOverride: fallbackModel, providerOverride: fallbackProvider } = ++ this.resolveFallbackTarget(fallbackTarget); ++ return buildRequest(fallbackModel, fallbackProvider); ++ }, ++ }; ++ }; ++ ++ return controller.handleStream(buildRequest(options?.model)); ++ } ++ ++ private async executeStreamObject( ++ input: string | UIMessage[] | BaseMessage[], ++ schema: T, ++ options?: StreamObjectOptions, ++ trafficMetadata?: TrafficRequestMetadata, + ): Promise>> { + const startTime = Date.now(); + const oc = this.createOperationContext(input, options); +@@ -1680,7 +1970,7 @@ export class Agent { + options, + ); + +- const modelName = this.getModelName(); ++ const modelName = this.getModelName(model); + const schemaName = schema.description || "unknown"; + + // Add model attributes and all options +@@ -1739,14 +2029,30 @@ export class Agent { + maxSteps: userMaxSteps, + tools: userTools, + onFinish: userOnFinish, ++ taskType, ++ fallbackPolicyId, ++ maxQueueWaitMs, + providerOptions, ++ maxRetries: _maxRetries, // Always disable provider retries (TrafficController handles retries) ++ model: _model, // Exclude model so aiSDKOptions cannot override resolved model + ...aiSDKOptions + } = options || {}; ++ void _model; ++ void _maxRetries; ++ void taskType; ++ void fallbackPolicyId; ++ void maxQueueWaitMs; + + let guardrailObjectPromise!: Promise>; + let resolveGuardrailObject: ((value: z.infer) => void) | undefined; + let rejectGuardrailObject: ((reason: unknown) => void) | undefined; ++ const trafficController = getTrafficController({ logger: this.logger }); + ++ methodLogger.info("[AI SDK] Calling streamObject", { ++ messageCount: messages.length, ++ modelName, ++ schemaName, ++ }); + 
const result = streamObject({ + model, + messages, +@@ -1755,7 +2061,7 @@ export class Agent { + // Default values + maxOutputTokens: this.maxOutputTokens, + temperature: this.temperature, +- maxRetries: 3, ++ maxRetries: 0, + // User overrides from AI SDK options + ...aiSDKOptions, + // Provider-specific options +@@ -1771,9 +2077,11 @@ export class Agent { + methodLogger.error("Stream object error occurred", { + error: actualError, + agentName: this.name, +- modelName: this.getModelName(), ++ modelName: this.getModelName(model), + schemaName: schemaName, + }); ++ this.updateTrafficControllerRateLimits(actualError, trafficMetadata, methodLogger); ++ trafficController.reportStreamFailure(trafficMetadata, actualError); + + // History update removed - using OpenTelemetry only + +@@ -1800,6 +2108,17 @@ export class Agent { + }, + onFinish: async (finalResult: any) => { + try { ++ methodLogger.info("[AI SDK] streamObject finished", { ++ finishReason: finalResult.finishReason, ++ usage: finalResult.usage ? safeStringify(finalResult.usage) : undefined, ++ rawResult: safeStringify(finalResult), ++ }); ++ this.updateTrafficControllerRateLimits( ++ finalResult.response, ++ trafficMetadata, ++ methodLogger, ++ ); ++ trafficController.reportStreamSuccess(trafficMetadata); + const usageInfo = convertUsage(finalResult.usage as any); + let finalObject = finalResult.object as z.infer; + if (guardrailSet.output.length > 0) { +@@ -2021,8 +2340,9 @@ export class Agent { + // Calculate maxSteps (use provided option or calculate based on subagents) + const maxSteps = options?.maxSteps ?? this.calculateMaxSteps(); + +- // Resolve dynamic values +- const model = await this.resolveValue(this.model, oc); ++ // Resolve dynamic values (allow per-call model override for fallbacks) ++ const selectedModel = options?.model ?? this.model; ++ const model = await this.resolveValue(selectedModel, oc); + const dynamicToolList = (await this.resolveValue(this.dynamicTools, oc)) || []; + + // Merge agent tools with option tools +@@ -2073,6 +2393,12 @@ export class Agent { + ): OperationContext { + const operationId = randomUUID(); + const startTimeDate = new Date(); ++ const priority = this.resolveTrafficPriority(options); ++ const tenantId = this.resolveTenantId(options); ++ const apiKeyId = options?.apiKeyId ?? options?.parentOperationContext?.apiKeyId; ++ const region = options?.region ?? options?.parentOperationContext?.region; ++ const endpoint = options?.endpoint ?? options?.parentOperationContext?.endpoint; ++ const tenantTier = options?.tenantTier ?? 
options?.parentOperationContext?.tenantTier; + + // Prefer reusing an existing context instance to preserve reference across calls/subagents + const runtimeContext = toContextMap(options?.context); +@@ -2123,6 +2449,7 @@ export class Agent { + operationId, + userId: options?.userId, + conversationId: options?.conversationId, ++ tenantId, + executionId: operationId, + }); + +@@ -2137,6 +2464,9 @@ export class Agent { + parentAgentId: options?.parentAgentId, + input, + }); ++ if (tenantId) { ++ traceContext.getRootSpan().setAttribute("tenant.id", tenantId); ++ } + traceContext.getRootSpan().setAttribute("voltagent.operation_id", operationId); + + // Use parent's AbortController if available, otherwise create new one +@@ -2174,8 +2504,14 @@ export class Agent { + logger, + conversationSteps: options?.parentOperationContext?.conversationSteps || [], + abortController, ++ priority, + userId: options?.userId, + conversationId: options?.conversationId, ++ tenantId, ++ apiKeyId, ++ region, ++ endpoint, ++ tenantTier, + parentAgentId: options?.parentAgentId, + traceContext, + startTime: startTimeDate, +@@ -3170,6 +3506,20 @@ export class Agent { + return value; + } + ++ private mergeOptionsWithModel( ++ options: BaseGenerationOptions | undefined, ++ modelOverride?: LanguageModel, ++ ): BaseGenerationOptions | undefined { ++ if (!options && modelOverride === undefined) { ++ return undefined; ++ } ++ ++ return { ++ ...(options ?? {}), ++ ...(modelOverride !== undefined ? { model: modelOverride } : {}), ++ }; ++ } ++ + /** + * Prepare tools with execution context + */ +@@ -3822,17 +4172,622 @@ export class Agent { + return this.subAgentManager.calculateMaxSteps(this.maxSteps); + } + ++ private resolveTrafficPriority(options?: BaseGenerationOptions): TrafficPriority { ++ const normalize = (value?: TrafficPriority): TrafficPriority | undefined => { ++ if (value === "P0" || value === "P1" || value === "P2") { ++ return value; ++ } ++ return undefined; ++ }; ++ ++ const parentPriority = normalize(options?.parentOperationContext?.priority); ++ const localPriority = normalize(options?.trafficPriority) ?? this.trafficPriority ?? "P1"; ++ ++ if (parentPriority) { ++ return this.pickHigherPriority(parentPriority, localPriority); ++ } ++ ++ return localPriority; ++ } ++ ++ private resolveTenantId(options?: BaseGenerationOptions): string { ++ const parentTenant = options?.parentOperationContext?.tenantId; ++ if (parentTenant) { ++ return parentTenant; ++ } ++ ++ if (options?.tenantId) { ++ return options.tenantId; ++ } ++ ++ return "default"; ++ } ++ ++ private pickHigherPriority(a: TrafficPriority, b: TrafficPriority): TrafficPriority { ++ const rank: Record = { P0: 0, P1: 1, P2: 2 }; ++ return rank[a] <= rank[b] ? a : b; ++ } ++ ++ private buildTrafficMetadata( ++ modelOverride?: LanguageModel | DynamicValue, ++ options?: BaseGenerationOptions, ++ providerOverride?: string, ++ ): TrafficRequestMetadata { ++ const provider = ++ providerOverride ?? ++ this.resolveProvider(modelOverride) ?? ++ this.resolveProvider(this.model) ?? ++ undefined; ++ const priority = this.resolveTrafficPriority(options); ++ const apiKeyId = options?.apiKeyId ?? options?.parentOperationContext?.apiKeyId; ++ const region = options?.region ?? options?.parentOperationContext?.region; ++ const endpoint = options?.endpoint ?? options?.parentOperationContext?.endpoint; ++ const tenantTier = options?.tenantTier ?? 
options?.parentOperationContext?.tenantTier; ++ ++ return { ++ agentId: this.id, // Identify which agent issued the request ++ agentName: this.name, // Human-readable label for logs/metrics ++ model: this.getModelName(modelOverride), // Used for future capacity policies ++ provider, // Allows per-provider throttling later ++ priority, ++ tenantId: this.resolveTenantId(options), ++ apiKeyId, ++ region, ++ endpoint, ++ tenantTier, ++ taskType: options?.taskType, ++ fallbackPolicyId: options?.fallbackPolicyId, ++ }; ++ } ++ ++ private estimateTokens( ++ input: string | UIMessage[] | BaseMessage[], ++ options?: BaseGenerationOptions, ++ ): number | undefined { ++ let text = ""; ++ if (typeof input === "string") { ++ text = input; ++ } else if (Array.isArray(input)) { ++ text = input ++ .map((message) => { ++ if (typeof message === "string") return message; ++ if (message && typeof message === "object") { ++ const content = (message as { content?: unknown }).content; ++ if (typeof content === "string") return content; ++ if (content !== undefined) return safeStringify(content); ++ return safeStringify(message); ++ } ++ return String(message ?? ""); ++ }) ++ .join(" "); ++ } else if (input) { ++ text = safeStringify(input); ++ } ++ ++ const inputTokens = text ? Math.ceil(text.length / 4) : 0; ++ const outputTokensRaw = ++ typeof options?.maxOutputTokens === "number" ? options.maxOutputTokens : this.maxOutputTokens; ++ const outputTokens = ++ typeof outputTokensRaw === "number" && Number.isFinite(outputTokensRaw) ++ ? Math.max(0, Math.floor(outputTokensRaw)) ++ : 0; ++ const total = inputTokens + outputTokens; ++ return total > 0 ? total : undefined; ++ } ++ ++ private resolveFallbackTarget(target: FallbackChainEntry): { ++ modelOverride?: LanguageModel; ++ providerOverride?: string; ++ } { ++ if (typeof target === "string") { ++ return { modelOverride: target }; ++ } ++ return { ++ modelOverride: target.model, ++ providerOverride: target.provider, ++ }; ++ } ++ ++ private isShortResponseFallback( ++ target: FallbackChainEntry, ++ ): target is { kind: "short-response"; text: string } { ++ return ( ++ typeof target === "object" && ++ target !== null && ++ "kind" in target && ++ (target as { kind?: string }).kind === "short-response" ++ ); ++ } ++ ++ private buildShortResponseMetadata( ++ baseMetadata: TrafficRequestMetadata | undefined, ++ ): TrafficRequestMetadata { ++ const metadata = baseMetadata ?? 
this.buildTrafficMetadata(); ++ return { ++ ...metadata, ++ provider: "short-response", ++ model: "short-response", ++ }; ++ } ++ ++ private createZeroUsage(): LanguageModelUsage { ++ return { inputTokens: 0, outputTokens: 0, totalTokens: 0 }; ++ } ++ ++ private createShortTextStream(text: string): AsyncIterableStream { ++ return createAsyncIterableReadable((controller) => { ++ controller.enqueue(text); ++ controller.close(); ++ }); ++ } ++ ++ private createShortFullStream(text: string): AsyncIterableStream { ++ const usage = this.createZeroUsage(); ++ const id = `short-response-${randomUUID()}`; ++ return createAsyncIterableReadable((controller) => { ++ controller.enqueue({ ++ type: "text-delta", ++ id, ++ delta: text, ++ text, ++ } as VoltAgentTextStreamPart); ++ controller.enqueue({ ++ type: "finish", ++ finishReason: "stop", ++ usage, ++ totalUsage: usage, ++ } as VoltAgentTextStreamPart); ++ controller.close(); ++ }); ++ } ++ ++ private createShortTextResult( ++ text: string, ++ options?: GenerateTextOptions, ++ ): GenerateTextResultWithContext { ++ const usage = this.createZeroUsage(); ++ const context = toContextMap(options?.context) ?? new Map(); ++ const createTextStream = (): AsyncIterableStream => this.createShortTextStream(text); ++ ++ return { ++ text, ++ content: [], ++ reasoning: [], ++ reasoningText: "", ++ files: [], ++ sources: [], ++ toolCalls: [], ++ staticToolCalls: [], ++ dynamicToolCalls: [], ++ toolResults: [], ++ staticToolResults: [], ++ dynamicToolResults: [], ++ usage, ++ totalUsage: usage, ++ warnings: [], ++ finishReason: "stop", ++ steps: [], ++ experimental_output: undefined, ++ response: { ++ id: "short-response", ++ modelId: "short-response", ++ timestamp: new Date(), ++ messages: [], ++ }, ++ context, ++ request: { ++ body: {}, ++ }, ++ providerMetadata: undefined, ++ experimental_providerMetadata: undefined, ++ pipeTextStreamToResponse: (response, init) => { ++ pipeTextStreamToResponse({ ++ response, ++ textStream: createTextStream(), ++ ...(init ?? {}), ++ }); ++ }, ++ toTextStreamResponse: (init) => { ++ return createTextStreamResponse({ ++ textStream: createTextStream(), ++ ...(init ?? {}), ++ }); ++ }, ++ toDataStream: () => createTextStream(), ++ toDataStreamResponse: (init) => { ++ return createTextStreamResponse({ ++ textStream: createTextStream(), ++ ...(init ?? {}), ++ }); ++ }, ++ pipeDataStreamToResponse: (response, init) => { ++ pipeTextStreamToResponse({ ++ response, ++ textStream: createTextStream(), ++ ...(init ?? {}), ++ }); ++ }, ++ } as GenerateTextResultWithContext; ++ } ++ ++ private createShortStreamTextResult( ++ text: string, ++ options?: StreamTextOptions, ++ ): StreamTextResultWithContext { ++ const usage = this.createZeroUsage(); ++ const context = toContextMap(options?.context) ?? new Map(); ++ const createTextStream = (): AsyncIterableStream => this.createShortTextStream(text); ++ const createFullStream = (): AsyncIterableStream => ++ this.createShortFullStream(text); ++ ++ const toUIMessageStream = (_options?: unknown) => ++ createUIMessageStream({ ++ execute: async ({ writer }) => { ++ writer.write({ type: "text", text } as any); ++ }, ++ onError: (error) => String(error), ++ }); ++ ++ const toUIMessageStreamResponse = (options?: ResponseInit) => { ++ const stream = toUIMessageStream(options); ++ const responseInit = options ? 
{ ...options } : {}; ++ return createUIMessageStreamResponse({ ++ stream, ++ ...responseInit, ++ }); ++ }; ++ ++ const pipeUIMessageStreamToResponse = (response: any, init?: ResponseInit) => { ++ const stream = toUIMessageStream(init); ++ const initOptions = init ? { ...init } : {}; ++ pipeUIMessageStreamToResponse({ ++ response, ++ stream, ++ ...initOptions, ++ }); ++ }; ++ ++ return { ++ text: Promise.resolve(text), ++ get textStream() { ++ return createTextStream(); ++ }, ++ get fullStream() { ++ return createFullStream(); ++ }, ++ usage: Promise.resolve(usage), ++ finishReason: Promise.resolve("stop"), ++ experimental_partialOutputStream: undefined, ++ toUIMessageStream: toUIMessageStream as StreamTextResultWithContext["toUIMessageStream"], ++ toUIMessageStreamResponse: ++ toUIMessageStreamResponse as StreamTextResultWithContext["toUIMessageStreamResponse"], ++ pipeUIMessageStreamToResponse: ++ pipeUIMessageStreamToResponse as StreamTextResultWithContext["pipeUIMessageStreamToResponse"], ++ pipeTextStreamToResponse: (response, init) => { ++ pipeTextStreamToResponse({ ++ response, ++ textStream: createTextStream(), ++ ...(init ?? {}), ++ }); ++ }, ++ toTextStreamResponse: (init) => { ++ return createTextStreamResponse({ ++ textStream: createTextStream(), ++ ...(init ?? {}), ++ }); ++ }, ++ context, ++ }; ++ } ++ ++ private resolveShortResponseObject(schema: T, text: string): z.infer { ++ const candidates: unknown[] = []; ++ if (text.length > 0) { ++ try { ++ candidates.push(JSON.parse(text)); ++ } catch {} ++ } ++ candidates.push(text); ++ candidates.push({ text }); ++ for (const candidate of candidates) { ++ const parsed = schema.safeParse(candidate); ++ if (parsed.success) { ++ return parsed.data; ++ } ++ } ++ return (candidates[0] ?? text) as z.infer; ++ } ++ ++ private createShortObjectResult( ++ schema: T, ++ text: string, ++ options?: GenerateObjectOptions, ++ ): GenerateObjectResultWithContext> { ++ const object = this.resolveShortResponseObject(schema, text); ++ const usage = this.createZeroUsage(); ++ const context = toContextMap(options?.context) ?? new Map(); ++ ++ return { ++ object, ++ usage, ++ warnings: [], ++ finishReason: "stop", ++ response: { ++ id: "short-response", ++ modelId: "short-response", ++ timestamp: new Date(), ++ messages: [], ++ }, ++ context, ++ request: { ++ body: {}, ++ }, ++ reasoning: "", ++ providerMetadata: undefined, ++ toJsonResponse: (init?: ResponseInit) => { ++ const responseInit = init ? { ...init } : {}; ++ const headers = { ++ "content-type": "application/json", ++ ...(responseInit.headers ?? {}), ++ }; ++ return new Response(safeStringify(object), { ++ ...responseInit, ++ headers, ++ }); ++ }, ++ } as GenerateObjectResultWithContext>; ++ } ++ ++ private createShortStreamObjectResult( ++ schema: T, ++ text: string, ++ options?: StreamObjectOptions, ++ ): StreamObjectResultWithContext> { ++ const object = this.resolveShortResponseObject(schema, text); ++ const usage = this.createZeroUsage(); ++ const context = toContextMap(options?.context) ?? 
new Map(); ++ const textPayload = safeStringify(object); ++ const createTextStream = (): AsyncIterableStream => ++ this.createShortTextStream(textPayload); ++ ++ const partialObjectStream = new ReadableStream>>({ ++ start(controller) { ++ controller.enqueue(object); ++ controller.close(); ++ }, ++ }); ++ ++ return { ++ object: Promise.resolve(object), ++ partialObjectStream, ++ textStream: createTextStream(), ++ warnings: Promise.resolve(undefined), ++ usage: Promise.resolve(usage), ++ finishReason: Promise.resolve("stop"), ++ pipeTextStreamToResponse: (response, init) => { ++ pipeTextStreamToResponse({ ++ response, ++ textStream: createTextStream(), ++ ...(init ?? {}), ++ }); ++ }, ++ toTextStreamResponse: (init) => { ++ return createTextStreamResponse({ ++ textStream: createTextStream(), ++ ...(init ?? {}), ++ }); ++ }, ++ context, ++ }; ++ } ++ ++ private buildShortTextFallbackRequest( ++ tenantId: string, ++ metadata: TrafficRequestMetadata | undefined, ++ options: GenerateTextOptions | undefined, ++ text: string, ++ ): TrafficRequest { ++ const shortMetadata = this.buildShortResponseMetadata(metadata); ++ return { ++ tenantId, ++ metadata: shortMetadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ estimatedTokens: 0, ++ execute: async () => this.createShortTextResult(text, options), ++ extractUsage: (result: GenerateTextResultWithContext) => ++ this.extractUsageFromResponse(result), ++ }; ++ } ++ ++ private buildShortStreamTextFallbackRequest( ++ tenantId: string, ++ metadata: TrafficRequestMetadata | undefined, ++ options: StreamTextOptions | undefined, ++ text: string, ++ ): TrafficRequest { ++ const shortMetadata = this.buildShortResponseMetadata(metadata); ++ return { ++ tenantId, ++ metadata: shortMetadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ estimatedTokens: 0, ++ execute: async () => this.createShortStreamTextResult(text, options), ++ extractUsage: (result: StreamTextResultWithContext) => this.extractUsageFromResponse(result), ++ }; ++ } ++ ++ private buildShortObjectFallbackRequest( ++ tenantId: string, ++ metadata: TrafficRequestMetadata | undefined, ++ schema: T, ++ options: GenerateObjectOptions | undefined, ++ text: string, ++ ): TrafficRequest>> { ++ const shortMetadata = this.buildShortResponseMetadata(metadata); ++ return { ++ tenantId, ++ metadata: shortMetadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ estimatedTokens: 0, ++ execute: async () => this.createShortObjectResult(schema, text, options), ++ extractUsage: (result: GenerateObjectResultWithContext>) => ++ this.extractUsageFromResponse(result), ++ }; ++ } ++ ++ private buildShortStreamObjectFallbackRequest( ++ tenantId: string, ++ metadata: TrafficRequestMetadata | undefined, ++ schema: T, ++ options: StreamObjectOptions | undefined, ++ text: string, ++ ): TrafficRequest>> { ++ const shortMetadata = this.buildShortResponseMetadata(metadata); ++ return { ++ tenantId, ++ metadata: shortMetadata, ++ maxQueueWaitMs: options?.maxQueueWaitMs, ++ estimatedTokens: 0, ++ execute: async () => this.createShortStreamObjectResult(schema, text, options), ++ extractUsage: (result: StreamObjectResultWithContext>) => ++ this.extractUsageFromResponse(result), ++ }; ++ } ++ ++ private updateTrafficControllerRateLimits( ++ response: unknown, ++ metadata: TrafficRequestMetadata | undefined, ++ logger?: Logger, ++ ): void { ++ const headerCandidates = findHeaders(response); ++ if (headerCandidates.length === 0) { ++ logger?.debug?.("[Traffic] No headers found for rate limit update"); ++ return; ++ } ++ ++ const 
controller = getTrafficController(); ++ const effectiveMetadata = metadata ?? this.buildTrafficMetadata(); ++ let updateResult: ReturnType | undefined; ++ for (const headers of headerCandidates) { ++ updateResult = controller.updateRateLimitFromHeaders(effectiveMetadata, headers); ++ if (updateResult) break; ++ } ++ ++ if (!updateResult) { ++ logger?.debug?.("[Traffic] No rate limit headers applied from response"); ++ return; ++ } ++ ++ const now = Date.now(); ++ const effectiveRemaining = Math.max( ++ 0, ++ updateResult.state.remaining - updateResult.state.reserved, ++ ); ++ const resetInMs = Math.max(0, updateResult.state.resetAt - now); ++ const nextAllowedInMs = Math.max(0, updateResult.state.nextAllowedAt - now); ++ logger?.info?.("[Traffic] Applied rate limit from response headers", { ++ rateLimitKey: updateResult.key, ++ limit: updateResult.state.limit, ++ remaining: updateResult.state.remaining, ++ reserved: updateResult.state.reserved, ++ effectiveRemaining, ++ resetAt: updateResult.state.resetAt, ++ resetInMs, ++ nextAllowedAt: updateResult.state.nextAllowedAt, ++ nextAllowedInMs, ++ headers: { ++ limitRequests: updateResult.headerSnapshot.limitRequests, ++ remainingRequests: updateResult.headerSnapshot.remainingRequests, ++ resetRequestsMs: updateResult.headerSnapshot.resetRequestsMs, ++ }, ++ }); ++ } ++ ++ private extractUsageFromResponse( ++ result: ++ | { ++ usage?: LanguageModelUsage | Promise; ++ totalUsage?: LanguageModelUsage | Promise; ++ } ++ | undefined, ++ ): Promise | LanguageModelUsage | undefined { ++ if (!result) { ++ return undefined; ++ } ++ ++ const usageCandidate = ++ (result as { totalUsage?: LanguageModelUsage | Promise }) ++ ?.totalUsage ?? ++ (result as { usage?: LanguageModelUsage | Promise })?.usage; ++ ++ if (!usageCandidate) { ++ return undefined; ++ } ++ ++ const normalizeUsage = ( ++ usage: LanguageModelUsage | undefined, ++ ): LanguageModelUsage | undefined => { ++ if (!usage) return undefined; ++ const input = Number.isFinite(usage.inputTokens) ? (usage.inputTokens as number) : undefined; ++ const output = Number.isFinite(usage.outputTokens) ++ ? (usage.outputTokens as number) ++ : undefined; ++ const total = Number.isFinite(usage.totalTokens) ? (usage.totalTokens as number) : undefined; ++ ++ if (total === undefined && input === undefined && output === undefined) { ++ return undefined; ++ } ++ ++ const safeInput = input ?? 0; ++ const safeOutput = output ?? 0; ++ const safeTotal = total ?? safeInput + safeOutput; ++ ++ return { ++ ...usage, ++ inputTokens: safeInput, ++ outputTokens: safeOutput, ++ totalTokens: safeTotal, ++ }; ++ }; ++ ++ if ( ++ typeof (usageCandidate as PromiseLike).then === "function" ++ ) { ++ return (usageCandidate as Promise) ++ .then((usage) => normalizeUsage(usage)) ++ .catch(() => undefined); ++ } ++ ++ return normalizeUsage(usageCandidate as LanguageModelUsage); ++ } ++ ++ private resolveProvider( ++ model: LanguageModel | DynamicValue | undefined, ++ ): string | undefined { ++ if ( ++ model && ++ typeof model === "object" && ++ "provider" in model && ++ typeof (model as any).provider === "string" ++ ) { ++ return (model as any).provider; ++ } ++ ++ return undefined; ++ } ++ + /** + * Get the model name + */ +- public getModelName(): string { +- if (typeof this.model === "function") { ++ public getModelName(modelOverride?: LanguageModel | DynamicValue): string { ++ const selectedModel = modelOverride ?? 
this.model; ++ if (typeof selectedModel === "function") { + return "dynamic"; + } +- if (typeof this.model === "string") { +- return this.model; ++ if (typeof selectedModel === "string") { ++ return selectedModel; + } +- return this.model.modelId || "unknown"; ++ return selectedModel.modelId || "unknown"; + } + + /** +diff --git a/packages/core/src/agent/eval.ts b/packages/core/src/agent/eval.ts +index 9e4fe9f2..de712505 100644 +--- a/packages/core/src/agent/eval.ts ++++ b/packages/core/src/agent/eval.ts +@@ -711,6 +711,7 @@ function buildEvalPayload( + rawOutput: output, + userId: oc.userId, + conversationId: oc.conversationId, ++ tenantId: oc.tenantId, + traceId: spanContext.traceId, + spanId: spanContext.spanId, + metadata, +diff --git a/packages/core/src/agent/types.ts b/packages/core/src/agent/types.ts +index dd5fb29d..c70bd478 100644 +--- a/packages/core/src/agent/types.ts ++++ b/packages/core/src/agent/types.ts +@@ -29,6 +29,7 @@ import type { Logger } from "@voltagent/internal"; + import type { LocalScorerDefinition, SamplingPolicy } from "../eval/runtime"; + import type { MemoryOptions, MemoryStorageMetadata, WorkingMemorySummary } from "../memory/types"; + import type { VoltAgentObservability } from "../observability"; ++import type { TrafficPriority } from "../traffic/traffic-controller"; + import type { + DynamicValue, + DynamicValueOptions, +@@ -456,6 +457,11 @@ export type AgentOptions = { + temperature?: number; + maxOutputTokens?: number; + maxSteps?: number; ++ /** ++ * Default scheduling priority for this agent's LLM calls. ++ * Defaults to P1 when unspecified. ++ */ ++ trafficPriority?: TrafficPriority; + /** + * Default stop condition for step execution (ai-sdk `stopWhen`). + * Per-call `stopWhen` in method options overrides this. 
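
To see the new knob from the caller's side: a minimal usage sketch, assuming only the `trafficPriority` and `maxQueueWaitMs` options added in this patch; the agent name, instructions, and model id are illustrative, and `openai` comes from `@ai-sdk/openai` as in the examples above.

import { openai } from "@ai-sdk/openai";
import { Agent } from "@voltagent/core";

// Background agent whose LLM calls default to low-priority scheduling
// ("P1" is the fallback when trafficPriority is omitted).
const digestAgent = new Agent({
  name: "digest-agent",
  instructions: "Summarize support tickets in bulk.",
  model: openai("gpt-4o-mini"),
  trafficPriority: "P2",
});

// An urgent call may override the agent default per call; nested subagent calls
// inherit the higher of the parent and local priorities (pickHigherPriority).
await digestAgent.generateText("Summarize the open incident tickets now.", {
  trafficPriority: "P0",
  maxQueueWaitMs: 5_000, // surface a queue-wait timeout instead of waiting indefinitely
});
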
+@@ -493,6 +499,7 @@ export interface AgentEvalPayload { + rawOutput?: unknown; + userId?: string; + conversationId?: string; ++ tenantId?: string; + traceId: string; + spanId: string; + metadata?: Record; +@@ -890,6 +897,21 @@ export type OperationContext = { + /** Optional conversation identifier associated with this operation */ + conversationId?: string; + ++ /** Optional tenant identifier propagated across nested operations */ ++ tenantId?: string; ++ ++ /** Optional key identifier for per-key traffic limits */ ++ apiKeyId?: string; ++ ++ /** Optional region identifier for per-region traffic limits */ ++ region?: string; ++ ++ /** Optional endpoint identifier for per-endpoint traffic limits */ ++ endpoint?: string; ++ ++ /** Optional tenant tier identifier for per-tier traffic limits */ ++ tenantTier?: string; ++ + /** User-managed context map for this operation */ + readonly context: Map; + +@@ -914,6 +936,9 @@ export type OperationContext = { + /** Conversation steps for building full message history including tool calls/results */ + conversationSteps?: StepWithContent[]; + ++ /** Scheduling priority propagated from parent calls */ ++ priority?: TrafficPriority; ++ + /** AbortController for cancelling the operation and accessing the signal */ + abortController: AbortController; + +diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts +index 8753f039..9dee4333 100644 +--- a/packages/core/src/index.ts ++++ b/packages/core/src/index.ts +@@ -21,6 +21,30 @@ export type { + WorkflowTimelineEvent, + RegisteredWorkflow, + } from "./workflow"; ++export { ++ // Surface traffic controller so downstream consumers can route agent calls through the shared scheduler ++ TrafficController, ++ CircuitBreakerOpenError, ++ QueueWaitTimeoutError, ++ RateLimitedUpstreamError, ++ getTrafficController, ++ type FallbackChainEntry, ++ type FallbackPolicy, ++ type FallbackPolicyConfig, ++ type FallbackPolicyMode, ++ type FallbackTarget, ++ type RateLimitConfig, ++ type RateLimitKey, ++ type RateLimitOptions, ++ type AdaptiveLimiterConfig, ++ type PriorityWeights, ++ type PriorityBurstLimits, ++ type TrafficRequest, ++ type TrafficRequestMetadata, ++ type TrafficResponseMetadata, ++ type TrafficPriority, ++ type TrafficRequestType, ++} from "./traffic/traffic-controller"; + // Export new Agent from agent.ts + export { + Agent, +diff --git a/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts +new file mode 100644 +index 00000000..652b7e59 +--- /dev/null ++++ b/packages/core/src/traffic/rate-limit-strategies/default-rate-limit-strategy.ts +@@ -0,0 +1,243 @@ ++import type { Logger } from "../../logger"; ++import { ++ RATE_LIMIT_EXHAUSTION_BUFFER, ++ RATE_LIMIT_MIN_PACE_INTERVAL_MS, ++ RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS, ++ RATE_LIMIT_PROBE_DELAY_MS, ++} from "../traffic-constants"; ++import type { ++ DispatchDecision, ++ QueuedRequest, ++ RateLimitWindowState, ++} from "../traffic-controller-internal"; ++import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils"; ++import type { TrafficRequestMetadata } from "../traffic-types"; ++import type { ++ RateLimitHeaderSnapshot, ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++} from "./rate-limit-strategy"; ++import { parseResetDurationToMs } from "./rate-limit-utils"; ++ ++export class DefaultRateLimitStrategy implements RateLimitStrategy { ++ private state?: RateLimitWindowState; ++ private readonly key: string; ++ ++ 
constructor(key: string) { ++ this.key = key; ++ } ++ ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const state = this.state; ++ if (!state) { ++ rateLimitLogger?.trace?.("Rate limit state missing; allow request", { ++ rateLimitKey: this.key, ++ }); ++ return null; ++ } ++ ++ const now = Date.now(); ++ const effectiveRemaining = Math.max(0, state.remaining - state.reserved); ++ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; ++ ++ if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { ++ if (now < probeAt) { ++ rateLimitLogger?.debug?.("Rate limit exhausted; waiting for probe", { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ reserved: state.reserved, ++ effectiveRemaining, ++ resetAt: state.resetAt, ++ probeAt, ++ }); ++ return { kind: "wait", wakeUpAt: probeAt }; ++ } ++ if (state.reserved > 0) { ++ rateLimitLogger?.debug?.("Rate limit exhausted but in-flight reservations exist; waiting", { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ reserved: state.reserved, ++ effectiveRemaining, ++ resetAt: state.resetAt, ++ }); ++ return { kind: "wait" }; ++ } ++ } ++ ++ if (now < state.nextAllowedAt) { ++ rateLimitLogger?.debug?.("Rate limit pacing; waiting until nextAllowedAt", { ++ rateLimitKey: this.key, ++ nextAllowedAt: state.nextAllowedAt, ++ resetAt: state.resetAt, ++ waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now, ++ }); ++ return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; ++ } ++ ++ state.reserved += 1; ++ next.rateLimitKey = this.key; ++ rateLimitLogger?.trace?.("Reserved rate limit token", { ++ rateLimitKey: this.key, ++ reserved: state.reserved, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ nextAllowedAt: state.nextAllowedAt, ++ }); ++ ++ const remainingWindowMs = Math.max(0, state.resetAt - now); ++ const intervalMs = Math.max( ++ RATE_LIMIT_MIN_PACE_INTERVAL_MS, ++ Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), ++ ); ++ ++ const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); ++ if ( ++ state.nextAllowedAt <= now || ++ candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS ++ ) { ++ state.nextAllowedAt = candidateNext; ++ rateLimitLogger?.trace?.("Updated pacing nextAllowedAt", { ++ rateLimitKey: this.key, ++ nextAllowedAt: state.nextAllowedAt, ++ intervalMs, ++ remainingWindowMs, ++ effectiveRemaining, ++ }); ++ } ++ ++ return null; ++ } ++ ++ onDispatch(_logger?: Logger): void {} ++ ++ onComplete(logger?: Logger): void { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const state = this.state; ++ if (!state || state.reserved <= 0) return; ++ state.reserved -= 1; ++ rateLimitLogger?.trace?.("Released rate limit reservation", { ++ rateLimitKey: this.key, ++ reserved: state.reserved, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ nextAllowedAt: state.nextAllowedAt, ++ }); ++ } ++ ++ updateFromHeaders( ++ _metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const limitRequests = readHeaderValue(headers, "x-ratelimit-limit-requests"); ++ const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests"); ++ const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests"); ++ const retryAfter = 
readHeaderValue(headers, "retry-after"); ++ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined; ++ ++ const now = Date.now(); ++ const existing = this.state; ++ let state: RateLimitWindowState | undefined; ++ let headerSnapshot: RateLimitHeaderSnapshot | undefined; ++ ++ if (limitRequests && remainingRequests && resetRequests) { ++ const limit = Number(limitRequests); ++ const remaining = Number(remainingRequests); ++ if (!Number.isFinite(limit) || !Number.isFinite(remaining)) { ++ rateLimitLogger?.debug?.("Invalid rate limit numeric headers; skipping", { ++ rateLimitKey: this.key, ++ limitRequests, ++ remainingRequests, ++ }); ++ return undefined; ++ } ++ ++ const resetRequestsMs = parseResetDurationToMs(resetRequests); ++ if (resetRequestsMs === undefined) { ++ rateLimitLogger?.debug?.("Unable to parse reset duration; skipping", { ++ rateLimitKey: this.key, ++ resetRequests, ++ }); ++ return undefined; ++ } ++ ++ const parsedResetAt = now + resetRequestsMs; ++ const isSameWindow = !!existing && now < existing.resetAt; ++ const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; ++ const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; ++ const reserved = Math.max(0, existing?.reserved ?? 0); ++ ++ state = { ++ limit, ++ remaining: isSameWindow ? Math.min(existing.remaining, remaining) : remaining, ++ resetAt, ++ reserved, ++ nextAllowedAt, ++ }; ++ headerSnapshot = { ++ limitRequests, ++ remainingRequests, ++ resetRequests, ++ resetRequestsMs, ++ }; ++ } else if (retryAfterMs === undefined) { ++ rateLimitLogger?.trace?.("Missing rate limit headers; skipping", { ++ rateLimitKey: this.key, ++ hasLimit: !!limitRequests, ++ hasRemaining: !!remainingRequests, ++ hasReset: !!resetRequests, ++ hasRetryAfter: !!retryAfter, ++ }); ++ return undefined; ++ } ++ ++ if (!state) { ++ if (retryAfterMs === undefined) { ++ rateLimitLogger?.trace?.("Retry-After missing or unparsable; skipping", { ++ rateLimitKey: this.key, ++ retryAfter, ++ }); ++ return undefined; ++ } ++ const targetAt = now + retryAfterMs; ++ const isSameWindow = !!existing && now < existing.resetAt; ++ state = { ++ limit: existing?.limit ?? 1, ++ remaining: 0, ++ resetAt: isSameWindow ? Math.max(existing.resetAt, targetAt) : targetAt, ++ reserved: Math.max(0, existing?.reserved ?? 0), ++ nextAllowedAt: Math.max(existing?.nextAllowedAt ?? now, targetAt), ++ }; ++ headerSnapshot = { retryAfter, retryAfterMs }; ++ } else if (retryAfterMs !== undefined) { ++ const targetAt = now + retryAfterMs; ++ state = { ++ ...state, ++ remaining: 0, ++ resetAt: Math.max(state.resetAt, targetAt), ++ nextAllowedAt: Math.max(state.nextAllowedAt, targetAt), ++ }; ++ headerSnapshot = { ...headerSnapshot, retryAfter, retryAfterMs }; ++ } ++ ++ this.state = state; ++ rateLimitLogger?.debug?.("Applied rate limit headers to state", { ++ rateLimitKey: this.key, ++ limit: state.limit, ++ remaining: state.remaining, ++ effectiveRemaining: Math.max(0, state.remaining - state.reserved), ++ resetAt: state.resetAt, ++ nextAllowedAt: state.nextAllowedAt, ++ resetRequestsMs: headerSnapshot?.resetRequestsMs, ++ retryAfterMs: headerSnapshot?.retryAfterMs, ++ }); ++ ++ return { ++ key: this.key, ++ headerSnapshot: headerSnapshot ?? 
{}, ++ state, ++ }; ++ } ++} +diff --git a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts +new file mode 100644 +index 00000000..fdb1c7a8 +--- /dev/null ++++ b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts +@@ -0,0 +1,353 @@ ++import type { Logger } from "../../logger"; ++import { ++ RATE_LIMIT_EXHAUSTION_BUFFER, ++ RATE_LIMIT_MIN_PACE_INTERVAL_MS, ++ RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS, ++ RATE_LIMIT_PROBE_DELAY_MS, ++} from "../traffic-constants"; ++import type { ++ DispatchDecision, ++ QueuedRequest, ++ RateLimitWindowState, ++} from "../traffic-controller-internal"; ++import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils"; ++import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types"; ++import { DefaultRateLimitStrategy } from "./default-rate-limit-strategy"; ++import type { ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++ RateLimitUsage, ++} from "./rate-limit-strategy"; ++import { parseResetDurationToMs } from "./rate-limit-utils"; ++ ++export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy { ++ readonly handlesTokenLimits = true; ++ private readonly window: DefaultRateLimitStrategy; ++ private readonly key: string; ++ private readonly requestsPerMinute?: number; ++ private readonly tokensPerMinute?: number; ++ private requestState?: RateLimitWindowState; ++ private tokenState?: RateLimitWindowState; ++ private bootstrapReserved = 0; ++ private readonly windowMs = 60_000; ++ ++ constructor(key: string, options?: RateLimitOptions) { ++ this.key = key; ++ this.window = new DefaultRateLimitStrategy(key); ++ // Window strategy enforces fixed 60s windows; burstSize is intentionally ignored here. 
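++    // normalizeLimit returns undefined for zero, negative, or non-finite values, so a
++    // misconfigured limit simply disables the fixed window and defers to header-driven pacing.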
++ this.requestsPerMinute = this.normalizeLimit(options?.requestsPerMinute); ++ this.tokensPerMinute = this.normalizeLimit(options?.tokensPerMinute); ++ } ++ ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ if (this.requestsPerMinute !== undefined) { ++ const requestDecision = this.resolveRequestWindow(next, logger); ++ if (requestDecision) return requestDecision; ++ } else { ++ const decision = this.window.resolve(next, logger); ++ if (decision) return decision; ++ ++ if (!next.rateLimitKey && this.tokensPerMinute === undefined) { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ if (this.bootstrapReserved >= 1) { ++ rateLimitLogger?.debug?.("OpenAI rate limit bootstrap active; waiting", { ++ rateLimitKey: this.key, ++ bootstrapReserved: this.bootstrapReserved, ++ }); ++ return { kind: "wait" }; ++ } ++ ++ this.bootstrapReserved += 1; ++ next.rateLimitKey = this.key; ++ rateLimitLogger?.debug?.("OpenAI rate limit bootstrap reserved", { ++ rateLimitKey: this.key, ++ bootstrapReserved: this.bootstrapReserved, ++ }); ++ } ++ } ++ ++ const tokenDecision = this.resolveTokenWindow(next, logger); ++ if (tokenDecision) return tokenDecision; ++ return null; ++ } ++ ++ onDispatch(logger?: Logger): void { ++ if (this.requestsPerMinute === undefined) { ++ this.window.onDispatch(logger); ++ } ++ } ++ ++ onComplete(logger?: Logger): void { ++ if (this.requestsPerMinute !== undefined) { ++ const now = Date.now(); ++ const state = this.ensureRequestState(now); ++ if (state.reserved > 0) { ++ state.reserved -= 1; ++ } ++ state.remaining = Math.max(0, state.remaining - 1); ++ return; ++ } ++ ++ if (this.bootstrapReserved > 0) { ++ this.bootstrapReserved -= 1; ++ } ++ this.window.onComplete(logger); ++ } ++ ++ recordUsage(usage: RateLimitUsage, logger?: Logger, reservedTokens?: number): void { ++ const tokens = this.resolveTokenCount(usage); ++ if (tokens <= 0) return; ++ ++ const now = Date.now(); ++ const state = this.ensureTokenState(now); ++ if (!state) return; ++ const reserved = typeof reservedTokens === "number" ? reservedTokens : 0; ++ const delta = tokens - reserved; ++ if (delta > 0) { ++ state.remaining = Math.max(0, state.remaining - delta); ++ } else if (delta < 0) { ++ state.remaining = Math.min(state.limit, state.remaining + Math.abs(delta)); ++ } ++ logger?.child({ module: "rate-limiter" })?.trace?.("OpenAI token usage recorded", { ++ rateLimitKey: this.key, ++ tokens, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ }); ++ } ++ ++ updateFromHeaders( ++ metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined { ++ const update = ++ this.requestsPerMinute !== undefined ++ ? 
undefined ++ : this.window.updateFromHeaders(metadata, headers, logger); ++ this.applyTokenHeaderUpdates(headers, logger); ++ return update; ++ } ++ ++ private resolveRequestWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); ++ const state = this.ensureRequestState(now); ++ const effectiveRemaining = Math.max(0, state.remaining - state.reserved); ++ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; ++ ++ if (effectiveRemaining <= RATE_LIMIT_EXHAUSTION_BUFFER) { ++ if (now < probeAt) { ++ rateLimitLogger?.debug?.("OpenAI request window exhausted; waiting for probe", { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ reserved: state.reserved, ++ effectiveRemaining, ++ resetAt: state.resetAt, ++ probeAt, ++ }); ++ return { kind: "wait", wakeUpAt: probeAt }; ++ } ++ if (state.reserved > 0) { ++ rateLimitLogger?.debug?.( ++ "OpenAI request window exhausted but in-flight reservations exist; waiting", ++ { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ reserved: state.reserved, ++ effectiveRemaining, ++ resetAt: state.resetAt, ++ }, ++ ); ++ return { kind: "wait" }; ++ } ++ } ++ ++ if (now < state.nextAllowedAt) { ++ rateLimitLogger?.debug?.("OpenAI request window pacing; waiting until nextAllowedAt", { ++ rateLimitKey: this.key, ++ nextAllowedAt: state.nextAllowedAt, ++ resetAt: state.resetAt, ++ waitMs: Math.min(state.resetAt, state.nextAllowedAt) - now, ++ }); ++ return { kind: "wait", wakeUpAt: Math.min(state.resetAt, state.nextAllowedAt) }; ++ } ++ ++ state.reserved += 1; ++ next.rateLimitKey = this.key; ++ rateLimitLogger?.trace?.("Reserved OpenAI request window slot", { ++ rateLimitKey: this.key, ++ reserved: state.reserved, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ nextAllowedAt: state.nextAllowedAt, ++ }); ++ ++ const remainingWindowMs = Math.max(0, state.resetAt - now); ++ const intervalMs = Math.max( ++ RATE_LIMIT_MIN_PACE_INTERVAL_MS, ++ Math.ceil(remainingWindowMs / Math.max(effectiveRemaining, 1)), ++ ); ++ ++ const candidateNext = Math.max(state.nextAllowedAt, now + intervalMs); ++ if ( ++ state.nextAllowedAt <= now || ++ candidateNext >= state.nextAllowedAt + RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS ++ ) { ++ state.nextAllowedAt = candidateNext; ++ rateLimitLogger?.trace?.("Updated OpenAI request pacing nextAllowedAt", { ++ rateLimitKey: this.key, ++ nextAllowedAt: state.nextAllowedAt, ++ intervalMs, ++ remainingWindowMs, ++ effectiveRemaining, ++ }); ++ } ++ ++ return null; ++ } ++ ++ private resolveTokenWindow(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); ++ const state = this.ensureTokenState(now); ++ if (!state) return null; ++ const estimatedTokens = next.estimatedTokens; ++ ++ if (typeof estimatedTokens === "number" && estimatedTokens > 0) { ++ if (state.remaining >= estimatedTokens) { ++ state.remaining = Math.max(0, state.remaining - estimatedTokens); ++ next.reservedTokens = estimatedTokens; ++ return null; ++ } ++ } else if (state.remaining > 0) { ++ return null; ++ } ++ ++ const probeAt = state.resetAt + RATE_LIMIT_PROBE_DELAY_MS; ++ rateLimitLogger?.debug?.("OpenAI token window exhausted; waiting", { ++ rateLimitKey: this.key, ++ remaining: state.remaining, ++ resetAt: state.resetAt, ++ probeAt, ++ }); ++ return { kind: "wait", wakeUpAt: probeAt }; ++ } ++ ++ private 
ensureRequestState(now: number): RateLimitWindowState { ++ const limit = this.requestsPerMinute ?? 0; ++ const state = this.requestState; ++ if (!state || now >= state.resetAt) { ++ this.requestState = { ++ limit, ++ remaining: limit, ++ resetAt: now + this.windowMs, ++ reserved: 0, ++ nextAllowedAt: now, ++ }; ++ return this.requestState; ++ } ++ return state; ++ } ++ ++ private ensureTokenState(now: number): RateLimitWindowState | undefined { ++ const configuredLimit = this.tokensPerMinute; ++ const state = this.tokenState; ++ if (!state) { ++ if (configuredLimit === undefined) return undefined; ++ this.tokenState = { ++ limit: configuredLimit, ++ remaining: configuredLimit, ++ resetAt: now + this.windowMs, ++ reserved: 0, ++ nextAllowedAt: now, ++ }; ++ return this.tokenState; ++ } ++ ++ if (now >= state.resetAt) { ++ const limit = configuredLimit ?? state.limit; ++ this.tokenState = { ++ limit, ++ remaining: limit, ++ resetAt: now + this.windowMs, ++ reserved: 0, ++ nextAllowedAt: now, ++ }; ++ return this.tokenState; ++ } ++ ++ if (configuredLimit !== undefined && configuredLimit !== state.limit) { ++ state.limit = configuredLimit; ++ state.remaining = Math.min(state.remaining, configuredLimit); ++ } ++ ++ return state; ++ } ++ ++ private normalizeLimit(value: number | undefined): number | undefined { ++ const numeric = typeof value === "number" ? value : Number(value); ++ return Number.isFinite(numeric) && numeric > 0 ? numeric : undefined; ++ } ++ ++ private applyTokenHeaderUpdates(headers: unknown, logger?: Logger): void { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const limitTokens = readHeaderValue(headers, "x-ratelimit-limit-tokens"); ++ const remainingTokens = readHeaderValue(headers, "x-ratelimit-remaining-tokens"); ++ const resetTokens = readHeaderValue(headers, "x-ratelimit-reset-tokens"); ++ const retryAfter = readHeaderValue(headers, "retry-after"); ++ ++ const limit = Number(limitTokens); ++ const remaining = Number(remainingTokens); ++ const resetTokensMs = resetTokens ? parseResetDurationToMs(resetTokens) : undefined; ++ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter) : undefined; ++ ++ if (!Number.isFinite(limit) || !Number.isFinite(remaining) || resetTokensMs === undefined) { ++ rateLimitLogger?.trace?.("OpenAI token headers missing or invalid; skipping", { ++ rateLimitKey: this.key, ++ hasLimit: !!limitTokens, ++ hasRemaining: !!remainingTokens, ++ hasReset: !!resetTokens, ++ }); ++ return; ++ } ++ ++ const now = Date.now(); ++ const configuredLimit = this.tokensPerMinute; ++ const effectiveLimit = configuredLimit === undefined ? limit : Math.min(configuredLimit, limit); ++ const clampedRemaining = Math.max(0, Math.min(remaining, effectiveLimit)); ++ const parsedResetAt = now + resetTokensMs; ++ const existing = this.tokenState; ++ const isSameWindow = !!existing && now < existing.resetAt; ++ const resetAt = isSameWindow ? Math.max(existing.resetAt, parsedResetAt) : parsedResetAt; ++ const nextAllowedAt = isSameWindow ? Math.max(existing.nextAllowedAt, now) : now; ++ const reserved = Math.max(0, existing?.reserved ?? 0); ++ const effectiveRemaining = isSameWindow ++ ? 
Math.min(existing.remaining, clampedRemaining) ++ : clampedRemaining; ++ ++ this.tokenState = { ++ limit: effectiveLimit, ++ remaining: effectiveRemaining, ++ resetAt, ++ reserved, ++ nextAllowedAt, ++ }; ++ ++ rateLimitLogger?.debug?.("OpenAI token headers applied", { ++ rateLimitKey: this.key, ++ limit: effectiveLimit, ++ remaining: effectiveRemaining, ++ resetAt, ++ retryAfterMs, ++ }); ++ } ++ ++ private resolveTokenCount(usage: RateLimitUsage): number { ++ const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined; ++ if (total !== undefined) return total; ++ const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0; ++ const output = Number.isFinite(usage.outputTokens) ? usage.outputTokens : 0; ++ return input + output; ++ } ++} +diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts +new file mode 100644 +index 00000000..653fdaf2 +--- /dev/null ++++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts +@@ -0,0 +1,41 @@ ++import type { Logger } from "../../logger"; ++import type { ++ DispatchDecision, ++ QueuedRequest, ++ RateLimitWindowState, ++} from "../traffic-controller-internal"; ++import type { TrafficRequestMetadata } from "../traffic-types"; ++ ++export type RateLimitHeaderSnapshot = { ++ limitRequests?: string; ++ remainingRequests?: string; ++ resetRequests?: string; ++ resetRequestsMs?: number; ++ retryAfter?: string; ++ retryAfterMs?: number; ++}; ++ ++export type RateLimitUpdateResult = { ++ key: string; ++ headerSnapshot: RateLimitHeaderSnapshot; ++ state: RateLimitWindowState; ++}; ++ ++export type RateLimitUsage = { ++ inputTokens?: number; ++ outputTokens?: number; ++ totalTokens?: number; ++}; ++ ++export interface RateLimitStrategy { ++ readonly handlesTokenLimits?: boolean; ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null; ++ onDispatch(logger?: Logger): void; ++ onComplete(logger?: Logger): void; ++ recordUsage?(usage: RateLimitUsage, logger?: Logger, reservedTokens?: number): void; ++ updateFromHeaders( ++ metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined; ++} +diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts +new file mode 100644 +index 00000000..310c9a7e +--- /dev/null ++++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-utils.ts +@@ -0,0 +1,26 @@ ++export function parseResetDurationToMs(raw: string): number | undefined { ++ const value = raw.trim(); ++ if (!value) return undefined; ++ ++ let totalMs = 0; ++ const regex = /(\d+(?:\.\d+)?)(ms|s|m|h|d)/g; ++ let matched = false; ++ for (const match of value.matchAll(regex)) { ++ matched = true; ++ const amount = Number.parseFloat(match[1] ?? ""); ++ if (!Number.isFinite(amount)) continue; ++ const unit = match[2]; ++ if (unit === "ms") totalMs += amount; ++ else if (unit === "s") totalMs += amount * 1000; ++ else if (unit === "m") totalMs += amount * 60_000; ++ else if (unit === "h") totalMs += amount * 3_600_000; ++ else if (unit === "d") totalMs += amount * 86_400_000; ++ } ++ ++ if (matched) { ++ return Math.round(totalMs); ++ } ++ ++ const n = Number(value); ++ return Number.isFinite(n) ? 
Math.round(n) : undefined;
++}
+diff --git a/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts
+new file mode 100644
+index 00000000..ee269ecd
+--- /dev/null
++++ b/packages/core/src/traffic/rate-limit-strategies/token-bucket-rate-limit-strategy.ts
+@@ -0,0 +1,218 @@
++import type { Logger } from "../../logger";
++import type {
++  DispatchDecision,
++  QueuedRequest,
++  RateLimitWindowState,
++} from "../traffic-controller-internal";
++import { parseRetryAfterMs, readHeaderValue } from "../traffic-error-utils";
++import type { RateLimitOptions, TrafficRequestMetadata } from "../traffic-types";
++import type {
++  RateLimitHeaderSnapshot,
++  RateLimitStrategy,
++  RateLimitUpdateResult,
++} from "./rate-limit-strategy";
++import { parseResetDurationToMs } from "./rate-limit-utils";
++
++type TokenBucketState = {
++  capacity: number;
++  refillPerSecond: number;
++  tokens: number;
++  updatedAt: number;
++};
++
++function normalizeTokenBucketOptions(
++  raw: RateLimitOptions | undefined,
++): Omit<TokenBucketState, "tokens" | "updatedAt"> | undefined {
++  const requestsPerMinuteRaw = raw?.requestsPerMinute;
++  const tokensPerMinuteRaw = raw?.tokensPerMinute;
++  const burstSizeRaw = raw?.burstSize;
++
++  const requestsPerMinute =
++    typeof requestsPerMinuteRaw === "number" ? requestsPerMinuteRaw : Number(requestsPerMinuteRaw);
++  const tokensPerMinute =
++    typeof tokensPerMinuteRaw === "number" ? tokensPerMinuteRaw : Number(tokensPerMinuteRaw);
++  const burstSize = typeof burstSizeRaw === "number" ? burstSizeRaw : Number(burstSizeRaw);
++
++  const safeRequestsPerMinute = Number.isFinite(requestsPerMinute) ? requestsPerMinute : 0;
++  const hasTokenLimit = Number.isFinite(tokensPerMinute) && tokensPerMinute > 0;
++  if (safeRequestsPerMinute <= 0 && hasTokenLimit) {
++    return undefined;
++  }
++  const safeBurst = Number.isFinite(burstSize) ? burstSize : safeRequestsPerMinute;
++  const refillPerSecond = safeRequestsPerMinute > 0 ? safeRequestsPerMinute / 60 : 0;
++
++  return {
++    capacity: safeBurst > 0 ?
Math.max(1, safeBurst) : 0, ++ refillPerSecond, ++ }; ++} ++function refillTokenBucket(bucket: TokenBucketState, now: number): void { ++ const elapsedMs = now - bucket.updatedAt; ++ if (elapsedMs <= 0) return; ++ bucket.updatedAt = now; ++ if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return; ++ ++ const refill = (elapsedMs / 1000) * bucket.refillPerSecond; ++ if (refill <= 0) return; ++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill); ++} ++ ++export class TokenBucketRateLimitStrategy implements RateLimitStrategy { ++ private readonly key: string; ++ private bucket?: TokenBucketState; ++ private cooldownUntil?: number; ++ ++ constructor(key: string, options?: RateLimitOptions) { ++ this.key = key; ++ if (!options) return; ++ const normalized = normalizeTokenBucketOptions(options); ++ if (!normalized) return; ++ const now = Date.now(); ++ this.bucket = { ++ ...normalized, ++ tokens: normalized.capacity, ++ updatedAt: now, ++ }; ++ } ++ ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); ++ ++ if (this.cooldownUntil !== undefined && now < this.cooldownUntil) { ++ rateLimitLogger?.debug?.("Token bucket cooldown active; waiting", { ++ rateLimitKey: this.key, ++ cooldownUntil: this.cooldownUntil, ++ waitMs: this.cooldownUntil - now, ++ }); ++ return { kind: "wait", wakeUpAt: this.cooldownUntil }; ++ } ++ ++ const bucket = this.bucket; ++ if (!bucket) return null; ++ ++ refillTokenBucket(bucket, now); ++ ++ if (bucket.capacity <= 0) { ++ rateLimitLogger?.debug?.("Token bucket misconfigured; blocking", { ++ rateLimitKey: this.key, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ }); ++ return { kind: "wait" }; ++ } ++ ++ if (bucket.tokens >= 1) { ++ bucket.tokens -= 1; ++ next.rateLimitKey = this.key; ++ rateLimitLogger?.trace?.("Consumed token bucket token", { ++ rateLimitKey: this.key, ++ tokens: bucket.tokens, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ }); ++ return null; ++ } ++ ++ if (bucket.refillPerSecond <= 0) { ++ rateLimitLogger?.debug?.("Token bucket has no refill; blocking", { ++ rateLimitKey: this.key, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ }); ++ return { kind: "wait" }; ++ } ++ ++ const requiredTokens = 1 - bucket.tokens; ++ const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000)); ++ const wakeUpAt = now + waitMs; ++ rateLimitLogger?.debug?.("Token bucket empty; waiting", { ++ rateLimitKey: this.key, ++ tokens: bucket.tokens, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ wakeUpAt, ++ waitMs, ++ }); ++ return { kind: "wait", wakeUpAt }; ++ } ++ ++ onDispatch(_logger?: Logger): void {} ++ ++ onComplete(_logger?: Logger): void {} ++ ++ updateFromHeaders( ++ _metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); ++ ++ const retryAfter = readHeaderValue(headers, "retry-after"); ++ const retryAfterMs = retryAfter ? parseRetryAfterMs(retryAfter, now) : undefined; ++ ++ const remainingRequests = readHeaderValue(headers, "x-ratelimit-remaining-requests"); ++ const resetRequests = readHeaderValue(headers, "x-ratelimit-reset-requests"); ++ const resetRequestsMs = resetRequests ? 
parseResetDurationToMs(resetRequests) : undefined; ++ ++ let appliedUntil: number | undefined; ++ ++ if (retryAfterMs !== undefined) { ++ const targetAt = now + retryAfterMs; ++ this.cooldownUntil = ++ this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt); ++ appliedUntil = this.cooldownUntil; ++ } ++ ++ if (remainingRequests && resetRequestsMs !== undefined) { ++ const remaining = Number(remainingRequests); ++ if (Number.isFinite(remaining) && remaining <= 0) { ++ const targetAt = now + resetRequestsMs; ++ this.cooldownUntil = ++ this.cooldownUntil === undefined ? targetAt : Math.max(this.cooldownUntil, targetAt); ++ appliedUntil = this.cooldownUntil; ++ } ++ } ++ ++ if (appliedUntil === undefined) { ++ rateLimitLogger?.trace?.("No applicable cooldown headers; skipping", { ++ rateLimitKey: this.key, ++ hasRetryAfter: !!retryAfter, ++ hasRemainingRequests: !!remainingRequests, ++ hasResetRequests: !!resetRequests, ++ }); ++ return undefined; ++ } ++ ++ rateLimitLogger?.debug?.("Applied token bucket cooldown from headers", { ++ rateLimitKey: this.key, ++ cooldownUntil: appliedUntil, ++ inMs: Math.max(0, appliedUntil - now), ++ retryAfterMs, ++ resetRequestsMs, ++ }); ++ ++ const headerSnapshot: RateLimitHeaderSnapshot = { ++ remainingRequests, ++ resetRequests, ++ resetRequestsMs, ++ retryAfter, ++ retryAfterMs, ++ }; ++ ++ const state: RateLimitWindowState = { ++ limit: 1, ++ remaining: 0, ++ resetAt: appliedUntil, ++ reserved: 0, ++ nextAllowedAt: appliedUntil, ++ }; ++ ++ return { ++ key: this.key, ++ headerSnapshot, ++ state, ++ }; ++ } ++} +diff --git a/packages/core/src/traffic/traffic-circuit-breaker.ts b/packages/core/src/traffic/traffic-circuit-breaker.ts +new file mode 100644 +index 00000000..20d166ca +--- /dev/null ++++ b/packages/core/src/traffic/traffic-circuit-breaker.ts +@@ -0,0 +1,478 @@ ++import type { Logger } from "../logger"; ++import { ++ CIRCUIT_COOLDOWN_MS, ++ CIRCUIT_FAILURE_THRESHOLD, ++ CIRCUIT_FAILURE_WINDOW_MS, ++ CIRCUIT_PROBE_INTERVAL_MS, ++ CIRCUIT_TIMEOUT_THRESHOLD, ++ CIRCUIT_TIMEOUT_WINDOW_MS, ++ DEFAULT_FALLBACK_CHAINS, ++} from "./traffic-constants"; ++import type { ++ CircuitState, ++ CircuitStateStatus, ++ DispatchDecision, ++ QueuedRequest, ++} from "./traffic-controller-internal"; ++import { extractStatusCode, isTimeoutError } from "./traffic-error-utils"; ++import { CircuitBreakerOpenError } from "./traffic-errors"; ++import type { ++ FallbackChainEntry, ++ FallbackPolicy, ++ FallbackPolicyConfig, ++ FallbackTarget, ++ TrafficRequestMetadata, ++ TrafficResponseMetadata, ++} from "./traffic-types"; ++ ++export class TrafficCircuitBreaker { ++ private readonly circuitBreakers = new Map(); ++ private readonly fallbackChains: Map; ++ private readonly fallbackPolicy?: FallbackPolicyConfig; ++ private readonly buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string; ++ ++ constructor(options: { ++ fallbackChains?: Record; ++ fallbackPolicy?: FallbackPolicyConfig; ++ buildRateLimitKey: (metadata?: TrafficRequestMetadata) => string; ++ }) { ++ this.buildRateLimitKey = options.buildRateLimitKey; ++ const chains = options.fallbackChains ?? 
DEFAULT_FALLBACK_CHAINS; ++ this.fallbackChains = new Map(Object.entries(chains)); ++ this.fallbackPolicy = options.fallbackPolicy; ++ } ++ ++ resolve(next: QueuedRequest, logger?: Logger): DispatchDecision | null { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const visitedKeys = new Set(); ++ ++ while (true) { ++ const key = this.buildRateLimitKey(next.request.metadata); ++ next.circuitKey = key; ++ visitedKeys.add(key); ++ circuitLogger?.trace?.("Circuit resolve step", { ++ circuitKey: key, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ }); ++ ++ const evaluation = this.evaluateCircuitState(key, circuitLogger); ++ next.circuitStatus = evaluation.state; ++ circuitLogger?.debug?.("Circuit evaluated", { ++ circuitKey: key, ++ state: evaluation.state, ++ allowRequest: evaluation.allowRequest, ++ retryAfterMs: evaluation.retryAfterMs, ++ }); ++ ++ if (evaluation.allowRequest) return null; ++ ++ const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata); ++ if (policy.mode === "wait") { ++ const wakeUpAt = ++ evaluation.retryAfterMs !== undefined ? Date.now() + evaluation.retryAfterMs : undefined; ++ circuitLogger?.debug?.("Circuit open; waiting per fallback policy", { ++ circuitKey: key, ++ policyId, ++ retryAfterMs: evaluation.retryAfterMs, ++ wakeUpAt, ++ }); ++ return { kind: "wait", wakeUpAt }; ++ } ++ ++ const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger); ++ circuitLogger?.debug?.("Circuit open; attempting fallback", { ++ circuitKey: key, ++ currentModel: next.request.metadata?.model, ++ fallback, ++ visitedKeys: Array.from(visitedKeys), ++ }); ++ if (!fallback || !next.request.createFallbackRequest) { ++ const error = new CircuitBreakerOpenError( ++ `Circuit open for ${key}`, ++ next.request.metadata, ++ evaluation.retryAfterMs, ++ ); ++ const traffic: TrafficResponseMetadata = { ++ rateLimitKey: key, ++ retryAfterMs: evaluation.retryAfterMs, ++ tenantId: next.request.metadata?.tenantId ?? 
next.tenantId, ++ priority: next.request.metadata?.priority, ++ taskType: next.request.metadata?.taskType, ++ }; ++ (error as Record).traffic = traffic; ++ next.reject(error); ++ circuitLogger?.warn?.("No fallback available; rejecting request", { ++ circuitKey: key, ++ retryAfterMs: evaluation.retryAfterMs, ++ }); ++ return { kind: "skip" }; ++ } ++ ++ const fallbackRequest = next.request.createFallbackRequest(fallback); ++ if (!fallbackRequest) { ++ circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", { ++ circuitKey: key, ++ fallback, ++ }); ++ return { kind: "skip" }; ++ } ++ ++ this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, { ++ previousCircuitKey: key, ++ reason: "circuit-open", ++ }); ++ } ++ } ++ ++ tryFallback(next: QueuedRequest, reason: "queue-timeout", logger?: Logger): boolean { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const { policy, policyId } = this.resolveFallbackPolicy(next.request.metadata); ++ if (policy.mode === "wait") { ++ circuitLogger?.debug?.("Fallback skipped by policy", { ++ policyId, ++ reason, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ }); ++ return false; ++ } ++ ++ const visitedKeys = new Set(); ++ const key = this.buildRateLimitKey(next.request.metadata); ++ visitedKeys.add(key); ++ ++ const fallback = this.findFallbackTarget(next.request.metadata, visitedKeys, circuitLogger); ++ if (!fallback || !next.request.createFallbackRequest) { ++ circuitLogger?.debug?.("Fallback unavailable for request", { ++ reason, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ fallback, ++ }); ++ return false; ++ } ++ ++ const fallbackRequest = next.request.createFallbackRequest(fallback); ++ if (!fallbackRequest) { ++ circuitLogger?.warn?.("createFallbackRequest returned undefined; skipping", { ++ reason, ++ fallback, ++ }); ++ return false; ++ } ++ ++ this.applyFallbackRequest(next, fallbackRequest, fallback, circuitLogger, { ++ previousCircuitKey: key, ++ reason, ++ policyId, ++ }); ++ return true; ++ } ++ ++ markTrial(item: QueuedRequest, logger?: Logger): void { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const key = item.circuitKey; ++ if (!key) return; ++ const state = this.circuitBreakers.get(key); ++ if (state && state.status === "half-open" && !state.trialInFlight) { ++ state.trialInFlight = true; ++ circuitLogger?.debug?.("Marked half-open trial in flight", { circuitKey: key }); ++ } ++ } ++ ++ recordSuccess(metadata?: TrafficRequestMetadata, logger?: Logger): void { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const key = this.buildRateLimitKey(metadata); ++ this.circuitBreakers.delete(key); ++ circuitLogger?.debug?.("Circuit success; cleared circuit state", { ++ circuitKey: key, ++ provider: metadata?.provider, ++ model: metadata?.model, ++ }); ++ } ++ ++ recordFailure( ++ metadata: TrafficRequestMetadata | undefined, ++ error: unknown, ++ logger?: Logger, ++ ): void { ++ const circuitLogger = logger?.child({ module: "circuit-breaker" }); ++ const key = this.buildRateLimitKey(metadata); ++ const status = extractStatusCode(error, logger); ++ const isTimeout = status === 408 || isTimeoutError(error, logger); ++ const isStatusEligible = this.isCircuitBreakerStatus(status); ++ const isTimeoutEligible = !isStatusEligible && isTimeout; ++ const isEligible = isStatusEligible || isTimeoutEligible; ++ ++ circuitLogger?.debug?.("Circuit failure 
observed", { ++ circuitKey: key, ++ status, ++ isTimeout, ++ eligible: isEligible, ++ provider: metadata?.provider, ++ model: metadata?.model, ++ }); ++ ++ if (!isEligible) { ++ this.circuitBreakers.delete(key); ++ circuitLogger?.debug?.("Failure not eligible for circuit breaker; cleared circuit state", { ++ circuitKey: key, ++ status, ++ isTimeout, ++ }); ++ return; ++ } ++ ++ const now = Date.now(); ++ const state = ++ this.circuitBreakers.get(key) ?? ++ ({ status: "closed", failureTimestamps: [], timeoutTimestamps: [] } as CircuitState); ++ ++ state.failureTimestamps = state.failureTimestamps.filter( ++ (t) => now - t <= CIRCUIT_FAILURE_WINDOW_MS, ++ ); ++ state.timeoutTimestamps = state.timeoutTimestamps.filter( ++ (t) => now - t <= CIRCUIT_TIMEOUT_WINDOW_MS, ++ ); ++ ++ state.failureTimestamps.push(now); ++ if (isTimeoutEligible) { ++ state.timeoutTimestamps.push(now); ++ } ++ ++ if ( ++ state.status === "half-open" || ++ state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD || ++ state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD ++ ) { ++ const openReasons: string[] = []; ++ if (state.status === "half-open") openReasons.push("half-open-failure"); ++ if (state.failureTimestamps.length >= CIRCUIT_FAILURE_THRESHOLD) { ++ openReasons.push("failure-threshold"); ++ } ++ if (state.timeoutTimestamps.length >= CIRCUIT_TIMEOUT_THRESHOLD) { ++ openReasons.push("timeout-threshold"); ++ } ++ ++ state.status = "open"; ++ state.openedAt = now; ++ state.trialInFlight = false; ++ state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS; ++ circuitLogger?.warn?.("Circuit opened", { ++ circuitKey: key, ++ openReasons, ++ status, ++ isTimeout, ++ failureCount: state.failureTimestamps.length, ++ failureThreshold: CIRCUIT_FAILURE_THRESHOLD, ++ timeoutCount: state.timeoutTimestamps.length, ++ timeoutThreshold: CIRCUIT_TIMEOUT_THRESHOLD, ++ openedAt: state.openedAt, ++ }); ++ } ++ ++ this.circuitBreakers.set(key, state); ++ circuitLogger?.trace?.("Circuit state updated", { ++ circuitKey: key, ++ status: state.status, ++ failureCount: state.failureTimestamps.length, ++ failureWindowMs: CIRCUIT_FAILURE_WINDOW_MS, ++ timeoutCount: state.timeoutTimestamps.length, ++ timeoutWindowMs: CIRCUIT_TIMEOUT_WINDOW_MS, ++ }); ++ } ++ ++ private evaluateCircuitState( ++ key: string, ++ logger?: Logger, ++ ): { ++ allowRequest: boolean; ++ state: CircuitStateStatus; ++ retryAfterMs?: number; ++ } { ++ const state = this.circuitBreakers.get(key); ++ if (!state) { ++ logger?.trace?.("Circuit state missing; allow request", { circuitKey: key }); ++ return { allowRequest: true, state: "closed" }; ++ } ++ ++ const now = Date.now(); ++ ++ if (state.status === "open") { ++ const elapsed = state.openedAt ? now - state.openedAt : 0; ++ if (state.nextProbeAt === undefined) { ++ state.nextProbeAt = now + CIRCUIT_PROBE_INTERVAL_MS; ++ } ++ const cooldownRemaining = Math.max(0, CIRCUIT_COOLDOWN_MS - elapsed); ++ const probeRemaining = Math.max(0, state.nextProbeAt - now); ++ if (probeRemaining === 0 || cooldownRemaining === 0) { ++ state.status = "half-open"; ++ state.trialInFlight = false; ++ state.failureTimestamps = []; ++ state.timeoutTimestamps = []; ++ state.nextProbeAt = undefined; ++ logger?.debug?.("Circuit transitioned to half-open", { ++ circuitKey: key, ++ reason: cooldownRemaining === 0 ? 
"cooldown" : "probe", ++ }); ++ return { allowRequest: true, state: "half-open" }; ++ } ++ return { ++ allowRequest: false, ++ state: "open", ++ retryAfterMs: Math.min(cooldownRemaining, probeRemaining), ++ }; ++ } ++ ++ if (state.status === "half-open" && state.trialInFlight) { ++ return { allowRequest: false, state: "half-open" }; ++ } ++ ++ return { allowRequest: true, state: state.status }; ++ } ++ ++ private resolveFallbackPolicy(metadata: TrafficRequestMetadata | undefined): { ++ policy: FallbackPolicy; ++ policyId?: string; ++ } { ++ const policyId = ++ metadata?.fallbackPolicyId ?? ++ (metadata?.taskType ++ ? this.fallbackPolicy?.taskTypePolicyIds?.[metadata.taskType] ++ : undefined) ?? ++ this.fallbackPolicy?.defaultPolicyId; ++ ++ const policy = policyId ? this.fallbackPolicy?.policies?.[policyId] : undefined; ++ return { ++ policy: policy ?? { mode: "fallback" }, ++ policyId, ++ }; ++ } ++ ++ private applyFallbackRequest( ++ next: QueuedRequest, ++ fallbackRequest: QueuedRequest["request"], ++ fallback: FallbackChainEntry, ++ logger?: Logger, ++ context?: { previousCircuitKey?: string; reason?: string; policyId?: string }, ++ ): void { ++ next.request = fallbackRequest; ++ next.attempt = 1; ++ next.estimatedTokens = fallbackRequest.estimatedTokens; ++ next.reservedTokens = undefined; ++ next.tenantConcurrencyKey = undefined; ++ next.providerModelConcurrencyKey = undefined; ++ next.rateLimitKey = undefined; ++ next.etaMs = undefined; ++ next.circuitKey = undefined; ++ next.circuitStatus = undefined; ++ next.extractUsage = fallbackRequest.extractUsage; ++ if (context?.reason === "queue-timeout") { ++ next.queueTimeoutDisabled = true; ++ } ++ logger?.debug?.("Switched to fallback request", { ++ previousCircuitKey: context?.previousCircuitKey, ++ fallbackModel: fallback, ++ reason: context?.reason, ++ policyId: context?.policyId, ++ }); ++ } ++ ++ private isShortResponseFallback( ++ candidate: FallbackChainEntry, ++ ): candidate is { kind: "short-response"; text: string } { ++ return ( ++ typeof candidate === "object" && ++ candidate !== null && ++ "kind" in candidate && ++ (candidate as { kind?: string }).kind === "short-response" ++ ); ++ } ++ ++ private findFallbackTarget( ++ metadata: TrafficRequestMetadata | undefined, ++ visitedKeys: Set, ++ logger?: Logger, ++ ): FallbackChainEntry | undefined { ++ const currentModel = metadata?.model; ++ if (!currentModel) { ++ logger?.trace?.("No current model; no fallback", {}); ++ return undefined; ++ } ++ ++ const provider = metadata?.provider; ++ const chain = this.resolveFallbackChain(provider, currentModel); ++ if (!chain) { ++ logger?.trace?.("No fallback chain for model", { ++ currentModel, ++ provider, ++ }); ++ return undefined; ++ } ++ ++ for (const candidate of chain) { ++ if (this.isShortResponseFallback(candidate)) { ++ logger?.debug?.("Selected short-response fallback", { ++ currentModel, ++ currentProvider: provider, ++ }); ++ return candidate; ++ } ++ const target = this.normalizeFallbackTarget(candidate, provider); ++ const candidateMetadata: TrafficRequestMetadata = { ++ ...(metadata ?? {}), ++ provider: target.provider ?? 
provider,
++        model: target.model,
++      };
++      const candidateKey = this.buildRateLimitKey(candidateMetadata);
++      if (visitedKeys.has(candidateKey)) {
++        continue;
++      }
++
++      const evaluation = this.evaluateCircuitState(candidateKey, logger);
++      if (evaluation.allowRequest) {
++        visitedKeys.add(candidateKey);
++        logger?.debug?.("Selected fallback target", {
++          currentModel,
++          currentProvider: provider,
++          fallbackModel: target.model,
++          fallbackProvider: target.provider ?? provider,
++          fallbackCircuitKey: candidateKey,
++        });
++        return candidate;
++      }
++    }
++
++    return undefined;
++  }
++
++  private resolveFallbackChain(
++    provider: string | undefined,
++    model: string,
++  ): FallbackChainEntry[] | undefined {
++    const providerKey = provider ? `${provider}::${model}` : undefined;
++    if (providerKey) {
++      const providerChain = this.fallbackChains.get(providerKey);
++      if (providerChain) return providerChain;
++    }
++    return this.fallbackChains.get(model);
++  }
++
++  private normalizeFallbackTarget(
++    candidate: FallbackChainEntry,
++    provider: string | undefined,
++  ): FallbackTarget {
++    if (typeof candidate === "string") {
++      return { provider, model: candidate };
++    }
++    return {
++      provider: candidate.provider ?? provider,
++      model: candidate.model,
++    };
++  }
++
++  private isCircuitBreakerStatus(status?: number): boolean {
++    return status === 429 || (status !== undefined && status >= 500);
++  }
++}
+diff --git a/packages/core/src/traffic/traffic-concurrency-limiter.ts b/packages/core/src/traffic/traffic-concurrency-limiter.ts
+new file mode 100644
+index 00000000..e1525612
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-concurrency-limiter.ts
+@@ -0,0 +1,235 @@
++import type { Logger } from "../logger";
++import type { QueuedRequest } from "./traffic-controller-internal";
++import type {
++  ProviderModelConcurrencyLimit,
++  TenantConcurrencyLimit,
++  TrafficRequestMetadata,
++} from "./traffic-types";
++
++export type ConcurrencyBlockReason =
++  | {
++      gate: "providerModel";
++      key: string;
++      inFlight: number;
++      limit: number;
++    }
++  | {
++      gate: "tenant";
++      key: string;
++      inFlight: number;
++      limit: number;
++    };
++
++export type ConcurrencyDecision =
++  | { kind: "allow" }
++  | { kind: "wait"; reasons: ConcurrencyBlockReason[] };
++
++function toNonNegativeIntegerLimit(raw: unknown): number | undefined {
++  if (raw === undefined || raw === null) return undefined;
++  const n = typeof raw === "number" ? raw : Number(raw);
++  if (!Number.isFinite(n)) return undefined;
++  if (n <= 0) return 0;
++  return Math.floor(n);
++}
++
++function getInFlight(map: Map<string, number>, key: string): number {
++  return map.get(key) ??
0;
++}
++
++function incrementInFlight(map: Map<string, number>, key: string): void {
++  map.set(key, getInFlight(map, key) + 1);
++}
++
++function decrementInFlight(map: Map<string, number>, key: string): void {
++  const current = getInFlight(map, key);
++  if (current <= 1) {
++    map.delete(key);
++    return;
++  }
++  map.set(key, current - 1);
++}
++
++export class TrafficConcurrencyLimiter {
++  private readonly inFlightByProviderModel = new Map<string, number>();
++  private readonly inFlightByTenant = new Map<string, number>();
++
++  private readonly buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string;
++  private readonly providerModelLimit?: ProviderModelConcurrencyLimit;
++  private readonly tenantLimit?: TenantConcurrencyLimit;
++  private readonly providerModelEnabled: boolean;
++  private readonly tenantEnabled: boolean;
++
++  constructor(options: {
++    buildProviderModelKey: (metadata?: TrafficRequestMetadata) => string;
++    maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit;
++    maxConcurrentPerTenant?: TenantConcurrencyLimit;
++  }) {
++    this.buildProviderModelKey = options.buildProviderModelKey;
++    this.providerModelLimit = options.maxConcurrentPerProviderModel;
++    this.tenantLimit = options.maxConcurrentPerTenant;
++    this.providerModelEnabled = options.maxConcurrentPerProviderModel !== undefined;
++    this.tenantEnabled = options.maxConcurrentPerTenant !== undefined;
++  }
++
++  resolve(next: QueuedRequest, logger?: Logger): ConcurrencyDecision {
++    if (!this.providerModelEnabled && !this.tenantEnabled) return { kind: "allow" };
++    const concurrencyLogger = logger?.child({ module: "concurrency-limiter" });
++    const reasons: ConcurrencyBlockReason[] = [];
++
++    if (this.providerModelEnabled) {
++      const providerModelKey = this.buildProviderModelKey(next.request.metadata);
++      const providerModelLimit = this.resolveProviderModelLimit(
++        providerModelKey,
++        next.request.metadata,
++        concurrencyLogger,
++      );
++      if (providerModelLimit !== undefined) {
++        const inFlight = getInFlight(this.inFlightByProviderModel, providerModelKey);
++        if (inFlight >= providerModelLimit) {
++          reasons.push({
++            gate: "providerModel",
++            key: providerModelKey,
++            inFlight,
++            limit: providerModelLimit,
++          });
++        }
++      }
++    }
++
++    if (this.tenantEnabled) {
++      const tenantKey = next.tenantId;
++      const tenantLimit = this.resolveTenantLimit(
++        tenantKey,
++        next.request.metadata,
++        concurrencyLogger,
++      );
++      if (tenantLimit !== undefined) {
++        const inFlight = getInFlight(this.inFlightByTenant, tenantKey);
++        if (inFlight >= tenantLimit) {
++          reasons.push({
++            gate: "tenant",
++            key: tenantKey,
++            inFlight,
++            limit: tenantLimit,
++          });
++        }
++      }
++    }
++
++    if (reasons.length === 0) return { kind: "allow" };
++
++    concurrencyLogger?.trace?.("Concurrency gate blocked request", {
++      tenantId: next.tenantId,
++      reasons,
++    });
++    return { kind: "wait", reasons };
++  }
++
++  acquire(next: QueuedRequest, logger?: Logger): void {
++    if (!this.providerModelEnabled && !this.tenantEnabled) return;
++    const concurrencyLogger = logger?.child({ module: "concurrency-limiter" });
++
++    let tenantKey: string | undefined;
++    if (this.tenantEnabled) {
++      tenantKey = next.tenantId;
++      next.tenantConcurrencyKey = tenantKey;
++      incrementInFlight(this.inFlightByTenant, tenantKey);
++    }
++
++    let providerModelKey: string | undefined;
++    if (this.providerModelEnabled) {
++      providerModelKey = this.buildProviderModelKey(next.request.metadata);
++      next.providerModelConcurrencyKey = providerModelKey;
++      incrementInFlight(this.inFlightByProviderModel, providerModelKey);
++    }
++
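++    // The acquired keys are stored on the request itself so release() can decrement the
++    // exact counters incremented here, even if a fallback swap later rewrites the metadata.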
++ concurrencyLogger?.trace?.("Concurrency slots acquired", { ++ tenantId: tenantKey, ++ tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined, ++ providerModelKey, ++ providerModelInFlight: providerModelKey ++ ? getInFlight(this.inFlightByProviderModel, providerModelKey) ++ : undefined, ++ }); ++ } ++ ++ release(next: QueuedRequest, logger?: Logger): void { ++ const concurrencyLogger = logger?.child({ module: "concurrency-limiter" }); ++ const tenantKey = next.tenantConcurrencyKey; ++ const providerModelKey = next.providerModelConcurrencyKey; ++ ++ if (tenantKey) { ++ decrementInFlight(this.inFlightByTenant, tenantKey); ++ } ++ ++ if (providerModelKey) { ++ decrementInFlight(this.inFlightByProviderModel, providerModelKey); ++ } ++ ++ if (tenantKey || providerModelKey) { ++ concurrencyLogger?.trace?.("Concurrency slots released", { ++ tenantId: tenantKey, ++ tenantInFlight: tenantKey ? getInFlight(this.inFlightByTenant, tenantKey) : undefined, ++ providerModelKey, ++ providerModelInFlight: providerModelKey ++ ? getInFlight(this.inFlightByProviderModel, providerModelKey) ++ : undefined, ++ }); ++ } ++ ++ next.tenantConcurrencyKey = undefined; ++ next.providerModelConcurrencyKey = undefined; ++ } ++ ++ private resolveTenantLimit( ++ tenantId: string, ++ metadata: TrafficRequestMetadata | undefined, ++ logger?: Logger, ++ ): number | undefined { ++ const policy = this.tenantLimit; ++ if (policy === undefined) return undefined; ++ ++ if (typeof policy === "number") return toNonNegativeIntegerLimit(policy); ++ if (typeof policy === "function") { ++ try { ++ return toNonNegativeIntegerLimit(policy(tenantId, metadata)); ++ } catch (error) { ++ logger?.warn?.("Tenant concurrency resolver threw; ignoring", { ++ tenantId, ++ errorName: (error as { name?: unknown } | null)?.name, ++ errorMessage: (error as { message?: unknown } | null)?.message, ++ }); ++ return undefined; ++ } ++ } ++ ++ return toNonNegativeIntegerLimit(policy[tenantId]); ++ } ++ ++ private resolveProviderModelLimit( ++ key: string, ++ metadata: TrafficRequestMetadata | undefined, ++ logger?: Logger, ++ ): number | undefined { ++ const policy = this.providerModelLimit; ++ if (policy === undefined) return undefined; ++ ++ if (typeof policy === "number") return toNonNegativeIntegerLimit(policy); ++ if (typeof policy === "function") { ++ try { ++ return toNonNegativeIntegerLimit(policy(metadata, key)); ++ } catch (error) { ++ logger?.warn?.("Provider/model concurrency resolver threw; ignoring", { ++ key, ++ provider: metadata?.provider, ++ model: metadata?.model, ++ errorName: (error as { name?: unknown } | null)?.name, ++ errorMessage: (error as { message?: unknown } | null)?.message, ++ }); ++ return undefined; ++ } ++ } ++ ++ return toNonNegativeIntegerLimit(policy[key]); ++ } ++} +diff --git a/packages/core/src/traffic/traffic-constants.ts b/packages/core/src/traffic/traffic-constants.ts +new file mode 100644 +index 00000000..68d99df7 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-constants.ts +@@ -0,0 +1,26 @@ ++export const MAX_RETRY_ATTEMPTS = 3; ++export const TIMEOUT_RETRY_ATTEMPTS = 2; ++ ++export const RATE_LIMIT_BASE_BACKOFF_MS = 500; ++export const SERVER_ERROR_BASE_BACKOFF_MS = 1000; ++export const TIMEOUT_BASE_BACKOFF_MS = 750; ++ ++export const RATE_LIMIT_JITTER_FACTOR = 0.35; ++export const SERVER_ERROR_JITTER_FACTOR = 0.8; ++export const TIMEOUT_JITTER_FACTOR = 0.5; ++ ++export const CIRCUIT_FAILURE_THRESHOLD = 5; ++export const CIRCUIT_FAILURE_WINDOW_MS = 10_000; ++export 
const CIRCUIT_TIMEOUT_THRESHOLD = CIRCUIT_FAILURE_THRESHOLD;
++export const CIRCUIT_TIMEOUT_WINDOW_MS = CIRCUIT_FAILURE_WINDOW_MS;
++export const CIRCUIT_COOLDOWN_MS = 30_000;
++export const CIRCUIT_PROBE_INTERVAL_MS = 5_000;
++
++export const RATE_LIMIT_EXHAUSTION_BUFFER = 1;
++export const RATE_LIMIT_PROBE_DELAY_MS = 50;
++export const RATE_LIMIT_MIN_PACE_INTERVAL_MS = 10;
++export const RATE_LIMIT_NEXT_ALLOWED_UPDATE_THRESHOLD_MS = 10;
++
++export const DEFAULT_FALLBACK_CHAINS: Record<string, string[]> = {
++  "gpt-4o": ["gpt-4o-mini", "gpt-3.5"],
++};
+diff --git a/packages/core/src/traffic/traffic-controller-internal.ts b/packages/core/src/traffic/traffic-controller-internal.ts
+new file mode 100644
+index 00000000..fd2012cf
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-controller-internal.ts
+@@ -0,0 +1,57 @@
++import type { TrafficPriority, TrafficRequest, TrafficRequestType } from "./traffic-types";
++
++export type Scheduler = (callback: () => void) => void;
++
++export type DispatchDecision =
++  | { kind: "dispatch" }
++  | { kind: "skip" }
++  | { kind: "wait"; wakeUpAt?: number };
++
++export type CircuitStateStatus = "closed" | "open" | "half-open";
++
++export interface CircuitState {
++  status: CircuitStateStatus;
++  failureTimestamps: number[];
++  timeoutTimestamps: number[];
++  openedAt?: number;
++  trialInFlight?: boolean;
++  nextProbeAt?: number;
++}
++
++export interface RateLimitWindowState {
++  limit: number;
++  remaining: number;
++  resetAt: number;
++  reserved: number;
++  nextAllowedAt: number;
++}
++
++// Method syntax makes the parameters bivariant, so differently-typed resolve/reject
++// handlers stay mutually assignable when requests are stored in a shared queue.
++type BivariantHandler<TArgs extends unknown[]> = {
++  bivarianceHack(...args: TArgs): void;
++}["bivarianceHack"];
++
++export interface QueuedRequest<TResponse = unknown> {
++  type: TrafficRequestType;
++  request: TrafficRequest<TResponse>;
++  resolve: BivariantHandler<[TResponse | PromiseLike<TResponse>]>;
++  reject: BivariantHandler<[reason?: unknown]>;
++  attempt: number;
++  priority: TrafficPriority;
++  tenantId: string;
++  enqueuedAt: number;
++  dispatchedAt?: number;
++  estimatedTokens?: number;
++  reservedTokens?: number;
++  queueTimeoutDisabled?: boolean;
++
++  tenantConcurrencyKey?: string;
++  providerModelConcurrencyKey?: string;
++
++  rateLimitKey?: string;
++  etaMs?: number;
++
++  circuitKey?: string;
++  circuitStatus?: CircuitStateStatus;
++
++  extractUsage?: TrafficRequest["extractUsage"];
++}
+diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
+new file mode 100644
+index 00000000..8f0a2c47
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-controller.spec.ts
+@@ -0,0 +1,706 @@
++import { describe, expect, it, vi } from "vitest";
++import { CIRCUIT_FAILURE_THRESHOLD, RATE_LIMIT_PROBE_DELAY_MS } from "./traffic-constants";
++import { TrafficController } from "./traffic-controller";
++
++describe("TrafficController priority scheduling", () => {
++  it("prioritizes P0 over lower priorities when runnable", async () => {
++    const controller = new TrafficController({ maxConcurrent: 1 });
++    const order: string[] = [];
++
++    const p1 = controller.handleText({
++      metadata: { provider: "p", model: "m1", priority: "P1" },
++      execute: async () => {
++        order.push("P1");
++        return "P1";
++      },
++    });
++
++    const p2 = controller.handleText({
++      metadata: { provider: "p", model: "m2", priority: "P2" },
++      execute: async () => {
++        order.push("P2");
++        return "P2";
++      },
++    });
++
++    const p0 = controller.handleText({
++      metadata: { provider: "p", model: "m0", priority: "P0" },
++      execute: async () => {
++        order.push("P0");
++        return "P0";
++      },
++    });
++
++    await
Promise.all([p0, p1, p2]); ++ ++ expect(order[0]).toBe("P0"); ++ expect(order).toEqual(["P0", "P1", "P2"]); ++ }); ++ ++ it("allows lower priorities to proceed when a higher priority request is rate limited", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ controller.updateRateLimitFromHeaders( ++ { provider: "p0", model: "m0" }, ++ { ++ "x-ratelimit-limit-requests": "1", ++ "x-ratelimit-remaining-requests": "0", ++ "x-ratelimit-reset-requests": "1s", ++ }, ++ ); ++ ++ const order: string[] = []; ++ ++ const p0 = controller.handleText({ ++ metadata: { provider: "p0", model: "m0", priority: "P0" }, ++ execute: async () => { ++ order.push("P0"); ++ return "P0"; ++ }, ++ }); ++ ++ const p1 = controller.handleText({ ++ metadata: { provider: "p1", model: "m1", priority: "P1" }, ++ execute: async () => { ++ order.push("P1"); ++ return "P1"; ++ }, ++ }); ++ ++ await vi.runAllTimersAsync(); ++ await Promise.all([p0, p1]); ++ ++ expect(order[0]).toBe("P1"); ++ expect(order[1]).toBe("P0"); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++}); ++ ++describe("TrafficController concurrency limits", () => { ++ it("shares provider/model limits across tenants", async () => { ++ const controller = new TrafficController({ ++ maxConcurrent: 2, ++ maxConcurrentPerProviderModel: 1, ++ }); ++ const started: string[] = []; ++ let releaseFirst!: () => void; ++ const firstGate = new Promise((resolve) => { ++ releaseFirst = resolve; ++ }); ++ ++ const first = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ execute: async () => { ++ started.push("tenant-a"); ++ await firstGate; ++ return "a"; ++ }, ++ }); ++ ++ const second = controller.handleText({ ++ tenantId: "tenant-b", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ execute: async () => { ++ started.push("tenant-b"); ++ return "b"; ++ }, ++ }); ++ ++ await new Promise((resolve) => setTimeout(resolve, 0)); ++ expect(started).toEqual(["tenant-a"]); ++ ++ releaseFirst(); ++ await Promise.all([first, second]); ++ expect(started).toEqual(["tenant-a", "tenant-b"]); ++ }); ++}); ++ ++describe("TrafficController rate limit headers", () => { ++ it("parses OpenAI-style compound reset durations (e.g. 
1m30.951s)", () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(1_000_000)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ const now = Date.now(); ++ ++ const result = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10000", ++ "x-ratelimit-remaining-requests": "9989", ++ "x-ratelimit-reset-requests": "1m30.951s", ++ }, ++ ); ++ ++ expect(result).toBeTruthy(); ++ expect(result?.headerSnapshot.resetRequestsMs).toBeCloseTo(90_951, 6); ++ expect(result?.state.limit).toBe(10000); ++ expect(result?.state.remaining).toBe(9989); ++ expect(result?.state.resetAt).toBe(now + 90_951); ++ expect(result?.state.reserved).toBe(0); ++ expect(result?.state.nextAllowedAt).toBe(now); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("keeps resetAt monotonic when headers shorten the reset duration", () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ ++ const first = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10000", ++ "x-ratelimit-remaining-requests": "9999", ++ "x-ratelimit-reset-requests": "60s", ++ }, ++ ); ++ ++ expect(first).toBeTruthy(); ++ expect(first?.state.resetAt).toBe(60_000); ++ ++ vi.setSystemTime(new Date(10_000)); ++ const second = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10000", ++ "x-ratelimit-remaining-requests": "9998", ++ "x-ratelimit-reset-requests": "5s", ++ }, ++ ); ++ ++ expect(second).toBeTruthy(); ++ expect(second?.state.resetAt).toBe(60_000); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("never increases remaining within the same window", () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ ++ const first = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10", ++ "x-ratelimit-remaining-requests": "9", ++ "x-ratelimit-reset-requests": "60s", ++ }, ++ ); ++ ++ expect(first?.state.remaining).toBe(9); ++ expect(first?.state.resetAt).toBe(60_000); ++ ++ vi.setSystemTime(new Date(10_000)); ++ const second = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10", ++ "x-ratelimit-remaining-requests": "8", ++ "x-ratelimit-reset-requests": "50s", ++ }, ++ ); ++ ++ expect(second?.state.remaining).toBe(8); ++ expect(second?.state.resetAt).toBe(60_000); ++ ++ vi.setSystemTime(new Date(20_000)); ++ const third = controller.updateRateLimitFromHeaders( ++ { provider: "openai.responses", model: "gpt-4o-mini" }, ++ { ++ "x-ratelimit-limit-requests": "10", ++ "x-ratelimit-remaining-requests": "9", ++ "x-ratelimit-reset-requests": "40s", ++ }, ++ ); ++ ++ expect(third?.state.remaining).toBe(8); ++ expect(third?.state.resetAt).toBe(60_000); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("applies Retry-After even when x-ratelimit headers are missing", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ const order: string[] = []; ++ ++ controller.updateRateLimitFromHeaders( ++ { provider: "p", 
model: "m" }, ++ { ++ "retry-after": "2", ++ }, ++ ); ++ ++ const p0 = controller.handleText({ ++ metadata: { provider: "p", model: "m", priority: "P0" }, ++ execute: async () => { ++ order.push("P0"); ++ return "P0"; ++ }, ++ }); ++ ++ await vi.advanceTimersByTimeAsync(1_999); ++ expect(order).toEqual([]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await p0; ++ expect(order).toEqual(["P0"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("shares rate limits across tenants for the same provider/model", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ controller.updateRateLimitFromHeaders( ++ { provider: "openai", model: "gpt-4o", tenantId: "tenant-a" }, ++ { ++ "x-ratelimit-limit-requests": "1", ++ "x-ratelimit-remaining-requests": "0", ++ "x-ratelimit-reset-requests": "1s", ++ }, ++ ); ++ ++ const order: string[] = []; ++ const request = controller.handleText({ ++ tenantId: "tenant-b", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ execute: async () => { ++ order.push("tenant-b"); ++ return "ok"; ++ }, ++ }); ++ ++ await vi.advanceTimersByTimeAsync(999); ++ await Promise.resolve(); ++ expect(order).toEqual([]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await request; ++ ++ expect(order).toEqual(["tenant-b"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++}); ++ ++describe("TrafficController token limits", () => { ++ it("blocks OpenAI when the token window is exhausted even without RPM config", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ ++ maxConcurrent: 1, ++ rateLimits: { ++ "openai::gpt-4o": { ++ requestsPerMinute: 0, ++ tokensPerMinute: 2, ++ }, ++ }, ++ }); ++ const order: string[] = []; ++ ++ const first = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ execute: async () => { ++ order.push("first"); ++ return "first"; ++ }, ++ extractUsage: () => ({ totalTokens: 2 }), ++ }); ++ ++ const second = controller.handleText({ ++ tenantId: "tenant-b", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ execute: async () => { ++ order.push("second"); ++ return "second"; ++ }, ++ extractUsage: () => ({ totalTokens: 1 }), ++ }); ++ ++ await first; ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(60_000 + RATE_LIMIT_PROBE_DELAY_MS - 1); ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await second; ++ expect(order).toEqual(["first", "second"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("reserves estimated tokens before dispatch", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ ++ maxConcurrent: 2, ++ rateLimits: { ++ "openai::gpt-4o": { ++ requestsPerMinute: 0, ++ tokensPerMinute: 2, ++ }, ++ }, ++ }); ++ const order: string[] = []; ++ let releaseFirst!: () => void; ++ const firstGate = new Promise((resolve) => { ++ releaseFirst = resolve; ++ }); ++ ++ const first = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ estimatedTokens: 2, ++ execute: async () => { ++ order.push("first"); ++ await firstGate; ++ return "first"; ++ }, ++ 
extractUsage: () => ({ totalTokens: 2 }), ++ }); ++ ++ const second = controller.handleText({ ++ tenantId: "tenant-b", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ estimatedTokens: 1, ++ execute: async () => { ++ order.push("second"); ++ return "second"; ++ }, ++ extractUsage: () => ({ totalTokens: 1 }), ++ }); ++ ++ await Promise.resolve(); ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(60_000 + RATE_LIMIT_PROBE_DELAY_MS - 1); ++ await Promise.resolve(); ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await Promise.resolve(); ++ expect(order).toEqual(["first", "second"]); ++ ++ releaseFirst(); ++ await Promise.all([first, second]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("allows token-only configs on non-OpenAI providers", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ ++ maxConcurrent: 2, ++ rateLimits: { ++ "p::m": { ++ requestsPerMinute: 0, ++ tokensPerMinute: 2, ++ }, ++ }, ++ }); ++ const order: string[] = []; ++ ++ const first = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "p", model: "m", priority: "P1" }, ++ estimatedTokens: 2, ++ execute: async () => { ++ order.push("first"); ++ return "first"; ++ }, ++ extractUsage: () => ({ totalTokens: 2 }), ++ }); ++ ++ const second = controller.handleText({ ++ tenantId: "tenant-b", ++ metadata: { provider: "p", model: "m", priority: "P1" }, ++ estimatedTokens: 1, ++ execute: async () => { ++ order.push("second"); ++ return "second"; ++ }, ++ extractUsage: () => ({ totalTokens: 1 }), ++ }); ++ ++ await first; ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(29_999); ++ await Promise.resolve(); ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await second; ++ expect(order).toEqual(["first", "second"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("honors OpenAI token headers even without token config", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ maxConcurrent: 1 }); ++ controller.updateRateLimitFromHeaders( ++ { provider: "openai", model: "gpt-4o" }, ++ { ++ "x-ratelimit-limit-tokens": "2", ++ "x-ratelimit-remaining-tokens": "0", ++ "x-ratelimit-reset-tokens": "1s", ++ }, ++ ); ++ ++ const order: string[] = []; ++ const request = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "openai", model: "gpt-4o", priority: "P1" }, ++ estimatedTokens: 1, ++ execute: async () => { ++ order.push("run"); ++ return "ok"; ++ }, ++ }); ++ ++ await Promise.resolve(); ++ expect(order).toEqual([]); ++ ++ await vi.advanceTimersByTimeAsync(1_000 + RATE_LIMIT_PROBE_DELAY_MS - 1); ++ await Promise.resolve(); ++ expect(order).toEqual([]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await request; ++ expect(order).toEqual(["run"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++}); ++ ++describe("TrafficController stream reporting", () => { ++ it("slows down after stream 429 errors", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ ++ maxConcurrent: 1, ++ adaptiveLimiter: { ++ windowMs: 1_000, ++ threshold: 1, ++ minPenaltyMs: 10, ++ maxPenaltyMs: 10, ++ penaltyMultiplier: 1, ++ 
decayMs: 1_000, ++ }, ++ }); ++ const metadata = { ++ provider: "p", ++ model: "m", ++ priority: "P1" as const, ++ tenantId: "tenant-a", ++ }; ++ ++ controller.reportStreamFailure( ++ metadata, ++ Object.assign(new Error("rate limit"), { status: 429 }), ++ ); ++ ++ const order: string[] = []; ++ const request = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata, ++ execute: async () => { ++ order.push("run"); ++ return "ok"; ++ }, ++ }); ++ ++ await Promise.resolve(); ++ expect(order).toEqual([]); ++ ++ await vi.advanceTimersByTimeAsync(9); ++ await Promise.resolve(); ++ expect(order).toEqual([]); ++ ++ await vi.advanceTimersByTimeAsync(1); ++ await vi.runAllTimersAsync(); ++ await request; ++ expect(order).toEqual(["run"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++ ++ it("treats post-start stream failures as circuit breaker failures", async () => { ++ const controller = new TrafficController({ ++ maxConcurrent: 1, ++ fallbackChains: { ++ primary: ["fallback"], ++ }, ++ }); ++ const tenantId = "tenant-1"; ++ const metadata = { provider: "p", model: "primary", priority: "P1" as const }; ++ ++ await controller.handleStream({ ++ tenantId, ++ metadata, ++ execute: async () => ({ ok: true }), ++ }); ++ ++ for (let i = 0; i < CIRCUIT_FAILURE_THRESHOLD; i += 1) { ++ controller.reportStreamFailure(metadata, new Error("stream-failure")); ++ } ++ ++ const order: string[] = []; ++ await controller.handleStream({ ++ tenantId, ++ metadata, ++ execute: async () => { ++ order.push("primary"); ++ return "primary"; ++ }, ++ createFallbackRequest: (target) => ({ ++ tenantId, ++ metadata: { ++ provider: "p", ++ model: typeof target === "string" ? target : target.model, ++ priority: "P1", ++ }, ++ execute: async () => { ++ const modelId = typeof target === "string" ? target : target.model; ++ order.push(modelId); ++ return modelId; ++ }, ++ }), ++ }); ++ ++ expect(order).toEqual(["fallback"]); ++ }); ++}); ++ ++describe("TrafficController queue timeouts", () => { ++ it("lets fallback requests wait after queue timeout without rejecting", async () => { ++ vi.useFakeTimers(); ++ ++ try { ++ vi.setSystemTime(new Date(0)); ++ const controller = new TrafficController({ ++ maxConcurrent: 1, ++ fallbackChains: { ++ "p::m": ["m-fallback"], ++ }, ++ }); ++ const order: string[] = []; ++ let releaseFirst!: () => void; ++ const firstGate = new Promise((resolve) => { ++ releaseFirst = resolve; ++ }); ++ ++ const first = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "p", model: "m", priority: "P1" }, ++ execute: async () => { ++ order.push("first"); ++ await firstGate; ++ return "first"; ++ }, ++ }); ++ ++ const second = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "p", model: "m", priority: "P1" }, ++ maxQueueWaitMs: 1, ++ execute: async () => { ++ order.push("primary"); ++ return "primary"; ++ }, ++ createFallbackRequest: (target) => ({ ++ tenantId: "tenant-a", ++ metadata: { ++ provider: "p", ++ model: typeof target === "string" ? 
target : target.model, ++ priority: "P1", ++ }, ++ maxQueueWaitMs: 1, ++ execute: async () => { ++ order.push("fallback"); ++ return "fallback"; ++ }, ++ }), ++ }); ++ ++ await Promise.resolve(); ++ expect(order).toEqual(["first"]); ++ ++ await vi.advanceTimersByTimeAsync(2); ++ ++ const third = controller.handleText({ ++ tenantId: "tenant-a", ++ metadata: { provider: "p", model: "other", priority: "P1" }, ++ execute: async () => { ++ order.push("third"); ++ return "third"; ++ }, ++ }); ++ ++ await Promise.resolve(); ++ expect(order).toEqual(["first"]); ++ ++ releaseFirst(); ++ await vi.runAllTimersAsync(); ++ ++ await expect(second).resolves.toBe("fallback"); ++ await Promise.all([first, third]); ++ ++ expect(order).toEqual(["first", "fallback", "third"]); ++ } finally { ++ vi.useRealTimers(); ++ } ++ }); ++}); +diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts +new file mode 100644 +index 00000000..269304d9 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-controller.ts +@@ -0,0 +1,1268 @@ ++import type { Logger } from "../logger"; ++import { LoggerProxy } from "../logger"; ++import { TrafficCircuitBreaker } from "./traffic-circuit-breaker"; ++import { TrafficConcurrencyLimiter } from "./traffic-concurrency-limiter"; ++import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal"; ++import { ++ CircuitBreakerOpenError, ++ QueueWaitTimeoutError, ++ RateLimitedUpstreamError, ++ normalizeRateLimitError, ++} from "./traffic-errors"; ++import { ++ OpenAIWindowRateLimitStrategy, ++ type RateLimitUpdateResult, ++ TokenBucketRateLimitStrategy, ++ TrafficRateLimiter, ++} from "./traffic-rate-limiter"; ++import { buildRetryPlanWithPolicy } from "./traffic-retry"; ++import type { ++ AdaptiveLimiterConfig, ++ FallbackChainEntry, ++ FallbackPolicy, ++ FallbackPolicyConfig, ++ FallbackPolicyMode, ++ FallbackTarget, ++ PriorityBurstLimits, ++ PriorityWeights, ++ ProviderModelConcurrencyLimit, ++ RateLimitConfig, ++ RateLimitKey, ++ RateLimitStrategyConfig, ++ RateLimitStrategyKind, ++ RetryPlan, ++ RetryPolicyConfig, ++ TenantConcurrencyLimit, ++ TenantUsage, ++ TrafficControllerOptions, ++ TrafficPriority, ++ TrafficRequest, ++ TrafficRequestMetadata, ++ TrafficRequestType, ++ TrafficResponseMetadata, ++} from "./traffic-types"; ++import { TrafficUsageTracker } from "./traffic-usage-tracker"; ++ ++/* ============================================================ ++ * Traffic Controller ++ * ============================================================ ++ */ ++ ++export type { ++ AdaptiveLimiterConfig, ++ FallbackChainEntry, ++ FallbackPolicy, ++ FallbackPolicyConfig, ++ FallbackPolicyMode, ++ FallbackTarget, ++ PriorityBurstLimits, ++ PriorityWeights, ++ ProviderModelConcurrencyLimit, ++ RateLimitConfig, ++ RateLimitKey, ++ RateLimitStrategyConfig, ++ RateLimitStrategyKind, ++ TenantConcurrencyLimit, ++ TenantUsage, ++ TrafficControllerOptions, ++ TrafficPriority, ++ TrafficRequest, ++ TrafficRequestMetadata, ++ TrafficResponseMetadata, ++ TrafficRequestType, ++}; ++ ++export { CircuitBreakerOpenError }; ++export { QueueWaitTimeoutError }; ++export { RateLimitedUpstreamError }; ++ ++type TenantQueueState = { ++ order: string[]; ++ index: number; ++ queues: Map; ++}; ++ ++type RateLimitSnapshot = { ++ limit?: number; ++ remaining?: number; ++ resetAt?: number; ++ nextAllowedAt?: number; ++ retryAfterMs?: number; ++}; ++ ++type AdaptiveLimiterState = { ++ recent429s: number[]; ++ penaltyMs: number; ++ 
cooldownUntil?: number; ++ last429At?: number; ++}; ++ ++const DEFAULT_PRIORITY_WEIGHTS: Record = { ++ P0: 5, ++ P1: 3, ++ P2: 2, ++}; ++ ++const DEFAULT_ADAPTIVE_LIMITER: Required = { ++ windowMs: 30_000, ++ threshold: 3, ++ minPenaltyMs: 500, ++ maxPenaltyMs: 10_000, ++ penaltyMultiplier: 2, ++ decayMs: 10_000, ++}; ++ ++export class TrafficController { ++ /* ---------- Core ---------- */ ++ ++ private readonly scheduler: Scheduler; ++ private readonly maxConcurrent: number; ++ private readonly rateLimitKeyBuilder: (metadata?: TrafficRequestMetadata) => string; ++ private readonly retryPolicy?: RetryPolicyConfig; ++ private readonly logger: Logger; ++ private readonly trafficLogger: Logger; ++ private readonly controllerLogger: Logger; ++ private readonly concurrencyLimiter: TrafficConcurrencyLimiter; ++ ++ private readonly queues: Record = { ++ P0: { order: [], index: 0, queues: new Map() }, ++ P1: { order: [], index: 0, queues: new Map() }, ++ P2: { order: [], index: 0, queues: new Map() }, ++ }; ++ private readonly priorityOrder: TrafficPriority[] = ["P0", "P1", "P2"]; ++ private readonly priorityWeights: Record; ++ private readonly priorityCredits: Record; ++ ++ private activeCount = 0; ++ private drainScheduled = false; ++ ++ /* ---------- Rate limits ---------- */ ++ private readonly rateLimiter: TrafficRateLimiter; ++ ++ /* ---------- Circuit breakers ---------- */ ++ private readonly circuitBreaker: TrafficCircuitBreaker; ++ ++ /* ---------- Usage ---------- */ ++ private readonly usageTracker = new TrafficUsageTracker(); ++ ++ /* ---------- Traffic metadata ---------- */ ++ private readonly rateLimitSnapshots = new Map(); ++ ++ /* ---------- Adaptive limiter ---------- */ ++ private readonly adaptiveLimiterConfig: Required; ++ private readonly adaptiveLimiterState = new Map(); ++ ++ constructor(options: TrafficControllerOptions = {}) { ++ this.maxConcurrent = options.maxConcurrent ?? Number.POSITIVE_INFINITY; ++ this.scheduler = this.createScheduler(); ++ this.rateLimitKeyBuilder = options.rateLimitKeyBuilder ?? buildRateLimitKeyFromMetadata; ++ this.retryPolicy = options.retryPolicy; ++ const priorityOverrides = options.priorityWeights ?? options.priorityBurstLimits; ++ const priorityWeights = { ++ ...DEFAULT_PRIORITY_WEIGHTS, ++ ...(priorityOverrides ?? {}), ++ }; ++ this.priorityWeights = { ++ P0: Math.max(0, Math.floor(priorityWeights.P0)), ++ P1: Math.max(0, Math.floor(priorityWeights.P1)), ++ P2: Math.max(0, Math.floor(priorityWeights.P2)), ++ }; ++ this.priorityCredits = { ...this.priorityWeights }; ++ this.adaptiveLimiterConfig = { ++ ...DEFAULT_ADAPTIVE_LIMITER, ++ ...(options.adaptiveLimiter ?? 
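++ /* Caller overrides win field-by-field over DEFAULT_ADAPTIVE_LIMITER. Under the
++ defaults that means: the third 429 inside a 30s window starts a cooldown, the
++ penalty begins at minPenaltyMs * penaltyMultiplier (1s), doubles on each further
++ trip up to maxPenaltyMs (10s), and halves again once no 429 has been seen for
++ decayMs (see recordAdaptiveRateLimitHit/applyAdaptiveDecay below). */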
{}), ++ }; ++ this.logger = new LoggerProxy({ component: "traffic-controller" }, options.logger); ++ this.trafficLogger = this.logger.child({ subsystem: "traffic" }); ++ this.controllerLogger = this.trafficLogger.child({ module: "controller" }); ++ const rateLimits = options.rateLimits; ++ const rateLimitStrategy = options.rateLimitStrategy; ++ this.rateLimiter = new TrafficRateLimiter(() => this.scheduleDrain(), { ++ rateLimits, ++ strategyFactory: (key) => { ++ const strategyKind = this.resolveRateLimitStrategy(key, rateLimitStrategy); ++ if (strategyKind === "window") { ++ return new OpenAIWindowRateLimitStrategy(key, rateLimits?.[key]); ++ } ++ return new TokenBucketRateLimitStrategy(key, rateLimits?.[key]); ++ }, ++ }); ++ this.circuitBreaker = new TrafficCircuitBreaker({ ++ fallbackChains: options.fallbackChains, ++ fallbackPolicy: options.fallbackPolicy, ++ buildRateLimitKey: (metadata) => this.buildRateLimitKey(metadata), ++ }); ++ this.concurrencyLimiter = new TrafficConcurrencyLimiter({ ++ buildProviderModelKey: (metadata) => buildProviderModelKeyFromMetadata(metadata), ++ maxConcurrentPerProviderModel: options.maxConcurrentPerProviderModel, ++ maxConcurrentPerTenant: options.maxConcurrentPerTenant, ++ }); ++ ++ this.controllerLogger.debug("Initialized TrafficController", { ++ maxConcurrent: this.maxConcurrent, ++ hasFallbackChains: !!options.fallbackChains, ++ hasFallbackPolicy: options.fallbackPolicy !== undefined, ++ hasProviderModelConcurrency: options.maxConcurrentPerProviderModel !== undefined, ++ hasTenantConcurrency: options.maxConcurrentPerTenant !== undefined, ++ hasConfigRateLimits: options.rateLimits !== undefined, ++ hasStrategyOverrides: options.rateLimitStrategy !== undefined, ++ hasRetryPolicy: options.retryPolicy !== undefined, ++ hasPriorityBurstLimits: options.priorityBurstLimits !== undefined, ++ hasPriorityWeights: options.priorityWeights !== undefined, ++ hasAdaptiveLimiter: options.adaptiveLimiter !== undefined, ++ }); ++ } ++ ++ /* ============================================================ ++ * Public API ++ * ============================================================ ++ */ ++ ++ handleText(request: TrafficRequest): Promise { ++ this.controllerLogger.trace("handleText called", { ++ tenantId: request.tenantId, ++ provider: request.metadata?.provider, ++ model: request.metadata?.model, ++ priority: request.metadata?.priority, ++ }); ++ return this.enqueue("text", request); ++ } ++ ++ handleStream(request: TrafficRequest): Promise { ++ this.controllerLogger.trace("handleStream called", { ++ tenantId: request.tenantId, ++ provider: request.metadata?.provider, ++ model: request.metadata?.model, ++ priority: request.metadata?.priority, ++ }); ++ return this.enqueue("stream", request); ++ } ++ ++ reportStreamSuccess(metadata?: TrafficRequestMetadata): void { ++ this.controllerLogger.debug("Stream reported success", { ++ provider: metadata?.provider, ++ model: metadata?.model, ++ tenantId: metadata?.tenantId, ++ priority: metadata?.priority, ++ }); ++ this.circuitBreaker.recordSuccess(metadata, this.trafficLogger); ++ const rateLimitKey = this.buildRateLimitKey(metadata); ++ const adaptiveKey = this.buildAdaptiveKey( ++ metadata, ++ metadata?.tenantId ?? 
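++ /* Streams may report success without tenant metadata; they share the "default" adaptive bucket. */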
"default", ++ rateLimitKey, ++ ); ++ this.recordAdaptiveSuccess(adaptiveKey); ++ } ++ ++ reportStreamFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void { ++ const rateLimitKey = this.buildRateLimitKey(metadata); ++ const normalizedRateLimitError = normalizeRateLimitError({ ++ error, ++ metadata, ++ tenantId: metadata?.tenantId, ++ key: rateLimitKey, ++ logger: this.trafficLogger, ++ }); ++ const errorForHandling = normalizedRateLimitError ?? error; ++ ++ this.controllerLogger.warn("Stream reported failure", { ++ provider: metadata?.provider, ++ model: metadata?.model, ++ tenantId: metadata?.tenantId, ++ priority: metadata?.priority, ++ errorName: (error as { name?: unknown } | null)?.name, ++ errorMessage: (error as { message?: unknown } | null)?.message, ++ status: (error as { status?: unknown } | null)?.status, ++ statusCode: (error as { statusCode?: unknown } | null)?.statusCode, ++ }); ++ this.circuitBreaker.recordFailure(metadata, errorForHandling, this.trafficLogger); ++ const adaptiveKey = this.buildAdaptiveKey( ++ metadata, ++ metadata?.tenantId ?? "default", ++ rateLimitKey, ++ ); ++ if (errorForHandling instanceof RateLimitedUpstreamError) { ++ this.recordAdaptiveRateLimitHit(adaptiveKey, errorForHandling.retryAfterMs); ++ } ++ const traffic = this.buildTrafficResponseMetadataFromMetadata( ++ metadata, ++ rateLimitKey, ++ Date.now(), ++ errorForHandling, ++ ); ++ this.attachTrafficMetadata(errorForHandling, traffic); ++ if (errorForHandling !== error) { ++ this.attachTrafficMetadata(error, traffic); ++ } ++ } ++ ++ updateRateLimitFromHeaders( ++ metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ ): RateLimitUpdateResult | undefined { ++ const key = this.buildRateLimitKey(metadata); ++ this.controllerLogger.debug("updateRateLimitFromHeaders called", { ++ rateLimitKey: key, ++ provider: metadata?.provider, ++ model: metadata?.model, ++ }); ++ ++ const update = this.rateLimiter.updateFromHeaders(metadata, headers, key, this.trafficLogger); ++ if (!update) { ++ this.controllerLogger.debug("updateRateLimitFromHeaders skipped (no headers applied)", { ++ rateLimitKey: key, ++ }); ++ return undefined; ++ } ++ ++ this.controllerLogger.debug("Rate limit headers applied", { ++ rateLimitKey: update.key, ++ limit: update.state.limit, ++ remaining: update.state.remaining, ++ reserved: update.state.reserved, ++ resetAt: update.state.resetAt, ++ nextAllowedAt: update.state.nextAllowedAt, ++ resetRequestsMs: update.headerSnapshot.resetRequestsMs, ++ }); ++ ++ this.rateLimitSnapshots.set(update.key, { ++ limit: update.state.limit, ++ remaining: update.state.remaining, ++ resetAt: update.state.resetAt, ++ nextAllowedAt: update.state.nextAllowedAt, ++ retryAfterMs: update.headerSnapshot.retryAfterMs, ++ }); ++ ++ return update; ++ } ++ ++ getTenantUsage(tenantId: string): TenantUsage | undefined { ++ this.controllerLogger.trace("getTenantUsage called", { tenantId }); ++ return this.usageTracker.getTenantUsage(tenantId); ++ } ++ ++ /* ============================================================ ++ * Scheduler & Queue ++ * ============================================================ ++ */ ++ ++ private createScheduler(): Scheduler { ++ return typeof queueMicrotask === "function" ? 
queueMicrotask : (cb) => setTimeout(cb, 0); ++ } ++ ++ private enqueue( ++ type: TrafficRequestType, ++ request: TrafficRequest, ++ ): Promise { ++ return new Promise((resolve, reject) => { ++ const priority = this.resolvePriority(request.metadata); ++ const tenantId = this.resolveTenantId(request); ++ this.controllerLogger.debug("Enqueue request", { ++ type, ++ tenantId, ++ priority, ++ provider: request.metadata?.provider, ++ model: request.metadata?.model, ++ }); ++ this.enqueueItem({ ++ type, ++ request, ++ resolve, ++ reject, ++ attempt: 1, ++ priority, ++ tenantId, ++ enqueuedAt: Date.now(), ++ estimatedTokens: request.estimatedTokens, ++ extractUsage: request.extractUsage, ++ }); ++ this.scheduleDrain(); ++ }); ++ } ++ ++ private scheduleDrain(): void { ++ if (this.drainScheduled) return; ++ this.drainScheduled = true; ++ ++ this.controllerLogger.trace("Drain scheduled"); ++ this.scheduler(() => { ++ this.drainScheduled = false; ++ this.controllerLogger.trace("Drain tick"); ++ this.drainQueue(); ++ }); ++ } ++ ++ private drainQueue(): void { ++ this.controllerLogger.trace("Drain start", { ++ activeCount: this.activeCount, ++ maxConcurrent: this.maxConcurrent, ++ queuedP0: this.getQueuedCount("P0"), ++ queuedP1: this.getQueuedCount("P1"), ++ queuedP2: this.getQueuedCount("P2"), ++ }); ++ while (true) { ++ const decision = this.tryDispatchNext(); ++ this.controllerLogger.trace("Dispatch decision", decision); ++ if (decision.kind === "dispatch" || decision.kind === "skip") continue; ++ if (decision.kind === "wait") { ++ if (decision.wakeUpAt) { ++ this.controllerLogger.debug("Rate limit wait; scheduling wakeup", { ++ wakeUpAt: decision.wakeUpAt, ++ inMs: Math.max(0, decision.wakeUpAt - Date.now()), ++ }); ++ this.scheduleRateLimitWakeUpAt(decision.wakeUpAt); ++ } ++ return; ++ } ++ return; ++ } ++ } ++ ++ /* ============================================================ ++ * Dispatch ++ * ============================================================ ++ */ ++ ++ private tryDispatchNext(): DispatchDecision { ++ if (this.activeCount >= this.maxConcurrent) return { kind: "wait" }; ++ ++ let earliestWakeUpAt: number | undefined; ++ ++ const observeWakeUpAt = (candidate?: number): void => { ++ if (candidate === undefined) return; ++ earliestWakeUpAt = ++ earliestWakeUpAt === undefined ? 
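++ /* Keep the soonest wakeup across all blocked gates so the drain timer fires as early as any queued request could proceed. */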
candidate : Math.min(earliestWakeUpAt, candidate); ++ }; ++ ++ const priorities = this.getPriorityDispatchOrder(); ++ for (const priority of priorities) { ++ const state = this.queues[priority]; ++ if (state.order.length === 0) continue; ++ ++ let attempts = 0; ++ const maxAttempts = state.order.length; ++ ++ while (attempts < maxAttempts) { ++ const candidate = this.getNextTenantCandidate(priority); ++ if (!candidate) break; ++ attempts += 1; ++ ++ const { item: next, queue, tenantId } = candidate; ++ const now = Date.now(); ++ const queueTimeoutAt = this.resolveQueueTimeoutAt(next); ++ const queueTimeoutTriggered = this.handleQueueTimeout(next, queue, 0, now, queueTimeoutAt); ++ if (queueTimeoutTriggered === "rejected") { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ if (queueTimeoutAt !== undefined && now < queueTimeoutAt) { ++ observeWakeUpAt(queueTimeoutAt); ++ } ++ const queueTimeoutExpired = queueTimeoutTriggered === "expired"; ++ ++ this.controllerLogger.trace("Evaluate next queued request", { ++ priority, ++ tenantId: next.tenantId, ++ type: next.type, ++ attempt: next.attempt, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ queueLength: queue.length, ++ }); ++ ++ const circuit = this.resolveCircuit(next); ++ if (circuit) { ++ this.controllerLogger.trace("Circuit resolution returned decision", { ++ priority, ++ decision: circuit, ++ circuitKey: next.circuitKey, ++ circuitStatus: next.circuitStatus, ++ }); ++ if (circuit.kind === "skip") { ++ queue.shift(); ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ if (circuit.kind === "wait") { ++ if ( ++ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "circuit wait") ++ ) { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ next.etaMs = ++ circuit.wakeUpAt !== undefined ? Math.max(0, circuit.wakeUpAt - now) : undefined; ++ observeWakeUpAt(circuit.wakeUpAt); ++ continue; ++ } ++ } ++ ++ const concurrency = this.concurrencyLimiter.resolve(next, this.trafficLogger); ++ if (concurrency.kind === "wait") { ++ this.controllerLogger.trace("Concurrency gate blocked request", { ++ priority, ++ tenantId: next.tenantId, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ reasons: concurrency.reasons, ++ }); ++ if ( ++ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "concurrency wait") ++ ) { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ next.etaMs = undefined; ++ continue; ++ } ++ ++ const adaptive = this.resolveAdaptiveLimit(next, now); ++ if (adaptive?.kind === "wait") { ++ if ( ++ this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "adaptive wait") ++ ) { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ next.etaMs = ++ adaptive.wakeUpAt !== undefined ? Math.max(0, adaptive.wakeUpAt - now) : undefined; ++ observeWakeUpAt(adaptive.wakeUpAt); ++ continue; ++ } ++ ++ const rateLimit = this.resolveRateLimit(next); ++ if (rateLimit) { ++ this.controllerLogger.trace("Rate limit resolution returned decision", { ++ priority, ++ decision: rateLimit, ++ rateLimitKey: next.rateLimitKey ?? 
this.buildRateLimitKey(next.request.metadata), ++ }); ++ if (rateLimit.kind === "wait") { ++ if ( ++ this.rejectIfQueueTimedOut( ++ queueTimeoutExpired, ++ next, ++ queue, ++ 0, ++ now, ++ "rate limit wait", ++ ) ++ ) { ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ return { kind: "skip" }; ++ } ++ next.etaMs = ++ rateLimit.wakeUpAt !== undefined ? Math.max(0, rateLimit.wakeUpAt - now) : undefined; ++ observeWakeUpAt(rateLimit.wakeUpAt); ++ } ++ continue; ++ } ++ ++ if (queueTimeoutExpired) { ++ const timeoutError = this.createQueueTimeoutError(next, now); ++ this.attachTrafficMetadata( ++ timeoutError, ++ this.buildTrafficResponseMetadata( ++ next, ++ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), ++ now, ++ timeoutError, ++ ), ++ ); ++ this.controllerLogger.warn("Queue wait timed out before dispatch", { ++ tenantId: next.tenantId, ++ waitedMs: timeoutError.waitedMs, ++ maxQueueWaitMs: timeoutError.maxQueueWaitMs, ++ deadlineAt: timeoutError.deadlineAt, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ rateLimitKey: timeoutError.rateLimitKey, ++ }); ++ queue.shift(); ++ this.cleanupTenantQueue(priority, tenantId, queue); ++ next.reject(timeoutError); ++ return { kind: "skip" }; ++ } ++ ++ this.startRequest(next, queue, tenantId); ++ return { kind: "dispatch" }; ++ } ++ } ++ ++ return earliestWakeUpAt !== undefined ++ ? { kind: "wait", wakeUpAt: earliestWakeUpAt } ++ : { kind: "wait" }; ++ } ++ ++ private startRequest(item: QueuedRequest, queue: QueuedRequest[], tenantId: string): void { ++ this.controllerLogger.debug("Start request", { ++ priority: item.priority, ++ type: item.type, ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ }); ++ item.dispatchedAt = Date.now(); ++ queue.shift(); ++ this.cleanupTenantQueue(item.priority, tenantId, queue); ++ this.recordPriorityDispatch(item.priority); ++ this.activeCount++; ++ this.concurrencyLimiter.acquire(item, this.trafficLogger); ++ this.rateLimiter.notifyDispatch(item.rateLimitKey, this.trafficLogger); ++ this.circuitBreaker.markTrial(item, this.trafficLogger); ++ void this.executeRequest(item); ++ } ++ ++ /* ============================================================ ++ * Execution ++ * ============================================================ ++ */ ++ ++ private async executeRequest(item: QueuedRequest): Promise { ++ const startedAt = Date.now(); ++ try { ++ this.controllerLogger.debug("Execute request", { ++ priority: item.priority, ++ type: item.type, ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ rateLimitKey: item.rateLimitKey, ++ circuitKey: item.circuitKey, ++ circuitStatus: item.circuitStatus, ++ activeCount: this.activeCount, ++ }); ++ const result = await item.request.execute(); ++ const rateLimitKey = item.rateLimitKey ?? 
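++ /* Prefer the key stamped at dispatch time; recompute from metadata only if this request never reached the rate-limit gate. */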
this.buildRateLimitKey(item.request.metadata); ++ const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey); ++ this.controllerLogger.debug("Request succeeded", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ elapsedMs: Date.now() - startedAt, ++ }); ++ if (item.type === "stream") { ++ this.controllerLogger.trace("Stream started successfully", { ++ tenantId: item.tenantId, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ }); ++ } else { ++ this.circuitBreaker.recordSuccess(item.request.metadata, this.trafficLogger); ++ } ++ const usage = this.usageTracker.recordUsage(item, result, this.trafficLogger); ++ this.rateLimiter.recordUsage(rateLimitKey, usage, this.trafficLogger, item.reservedTokens); ++ this.recordAdaptiveSuccess(adaptiveKey); ++ this.attachTrafficMetadata( ++ result, ++ this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now()), ++ ); ++ item.resolve(result); ++ } catch (error) { ++ const rateLimitKey = item.rateLimitKey ?? this.buildRateLimitKey(item.request.metadata); ++ const normalizedRateLimitError = normalizeRateLimitError({ ++ error, ++ metadata: item.request.metadata, ++ tenantId: item.tenantId, ++ key: rateLimitKey, ++ logger: this.trafficLogger, ++ }); ++ const errorForHandling = normalizedRateLimitError ?? error; ++ const adaptiveKey = this.buildAdaptiveKey(item.request.metadata, item.tenantId, rateLimitKey); ++ if (typeof item.reservedTokens === "number" && item.reservedTokens > 0) { ++ this.rateLimiter.recordUsage( ++ rateLimitKey, ++ { totalTokens: 0 }, ++ this.trafficLogger, ++ item.reservedTokens, ++ ); ++ } ++ if (errorForHandling instanceof RateLimitedUpstreamError) { ++ this.recordAdaptiveRateLimitHit(adaptiveKey, errorForHandling.retryAfterMs); ++ } ++ ++ this.controllerLogger.warn("Request failed", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ elapsedMs: Date.now() - startedAt, ++ errorName: (error as { name?: unknown } | null)?.name, ++ errorMessage: (error as { message?: unknown } | null)?.message, ++ status: (error as { status?: unknown } | null)?.status, ++ statusCode: (error as { statusCode?: unknown } | null)?.statusCode, ++ }); ++ this.circuitBreaker.recordFailure( ++ item.request.metadata, ++ errorForHandling, ++ this.trafficLogger, ++ ); ++ this.attachTrafficMetadata( ++ errorForHandling, ++ this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now(), errorForHandling), ++ ); ++ ++ const retry = buildRetryPlanWithPolicy( ++ { ++ error: errorForHandling, ++ attempt: item.attempt, ++ metadata: item.request.metadata, ++ key: rateLimitKey, ++ logger: this.trafficLogger, ++ }, ++ this.retryPolicy, ++ ); ++ if (retry) { ++ if (!this.canRetryWithinDeadline(item, retry.delayMs)) { ++ this.controllerLogger.debug("Retry skipped; deadline exceeded", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ deadlineAt: item.request.deadlineAt, ++ delayMs: retry.delayMs, ++ }); ++ item.reject(errorForHandling); ++ } else { ++ this.controllerLogger.debug("Retrying request", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ nextAttempt: item.attempt + 1, ++ reason: retry.reason, ++ delayMs: retry.delayMs, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, 
++ }); ++ this.scheduleRetry(item, retry); ++ } ++ } else { ++ this.controllerLogger.debug("No retry plan; rejecting request", { ++ tenantId: item.tenantId, ++ attempt: item.attempt, ++ provider: item.request.metadata?.provider, ++ model: item.request.metadata?.model, ++ }); ++ item.reject(errorForHandling); ++ } ++ } finally { ++ this.rateLimiter.releaseReservation(item.rateLimitKey, this.trafficLogger); ++ this.concurrencyLimiter.release(item, this.trafficLogger); ++ this.activeCount = Math.max(0, this.activeCount - 1); ++ this.controllerLogger.trace("Request finished; slot released", { ++ tenantId: item.tenantId, ++ activeCount: this.activeCount, ++ maxConcurrent: this.maxConcurrent, ++ }); ++ this.scheduleDrain(); ++ } ++ } ++ ++ /* ============================================================ ++ * Retry logic ++ * ============================================================ ++ */ ++ ++ private scheduleRetry(item: QueuedRequest, plan: RetryPlan): void { ++ this.controllerLogger.debug("Schedule retry", { ++ tenantId: item.tenantId, ++ priority: item.priority, ++ currentAttempt: item.attempt, ++ nextAttempt: item.attempt + 1, ++ reason: plan.reason, ++ delayMs: plan.delayMs, ++ }); ++ setTimeout(() => { ++ this.controllerLogger.debug("Retry timer fired", { ++ tenantId: item.tenantId, ++ priority: item.priority, ++ nextAttempt: item.attempt + 1, ++ }); ++ this.enqueueItem({ ++ ...item, ++ attempt: item.attempt + 1, ++ enqueuedAt: Date.now(), ++ dispatchedAt: undefined, ++ reservedTokens: undefined, ++ tenantConcurrencyKey: undefined, ++ providerModelConcurrencyKey: undefined, ++ rateLimitKey: undefined, ++ etaMs: undefined, ++ circuitKey: undefined, ++ circuitStatus: undefined, ++ }); ++ this.scheduleDrain(); ++ }, plan.delayMs); ++ } ++ ++ private canRetryWithinDeadline(item: QueuedRequest, delayMs: number): boolean { ++ const deadlineAt = item.request.deadlineAt; ++ if (!deadlineAt) return true; ++ const nextAttemptAt = Date.now() + delayMs; ++ return nextAttemptAt <= deadlineAt; ++ } ++ ++ /* ============================================================ ++ * Rate limiting (verbatim logic) ++ * ============================================================ ++ */ ++ ++ private resolveRateLimit(next: QueuedRequest): DispatchDecision | null { ++ const key = this.buildRateLimitKey(next.request.metadata); ++ return this.rateLimiter.resolve(next, key, this.trafficLogger); ++ } ++ ++ private scheduleRateLimitWakeUpAt(wakeUpAt: number): void { ++ this.rateLimiter.scheduleWakeUpAt(wakeUpAt, this.trafficLogger); ++ } ++ ++ /* ============================================================ ++ * Circuit breakers (verbatim logic, linearized) ++ * ============================================================ ++ */ ++ ++ private resolveCircuit(next: QueuedRequest): DispatchDecision | null { ++ return this.circuitBreaker.resolve(next, this.trafficLogger); ++ } ++ ++ /* ============================================================ ++ * Utilities ++ * ============================================================ ++ */ ++ ++ private resolveQueueTimeoutAt(next: QueuedRequest): number | undefined { ++ if (next.queueTimeoutDisabled) { ++ return next.request.deadlineAt; ++ } ++ const maxQueueWaitMs = next.request.maxQueueWaitMs; ++ const normalizedMaxWait = ++ typeof maxQueueWaitMs === "number" && Number.isFinite(maxQueueWaitMs) ++ ? Math.max(0, maxQueueWaitMs) ++ : undefined; ++ const timeoutAt = ++ normalizedMaxWait !== undefined ? 
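++ /* The queue budget is relative to enqueue time; an absolute request deadline, when set, caps it further below. */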
next.enqueuedAt + normalizedMaxWait : undefined; ++ const deadlineAt = next.request.deadlineAt; ++ if (timeoutAt === undefined) return deadlineAt; ++ if (deadlineAt === undefined) return timeoutAt; ++ return Math.min(timeoutAt, deadlineAt); ++ } ++ ++ private handleQueueTimeout( ++ next: QueuedRequest, ++ queue: QueuedRequest[], ++ index: number, ++ now: number, ++ queueTimeoutAt?: number, ++ ): "none" | "expired" | "rejected" { ++ if (queueTimeoutAt === undefined) return "none"; ++ if (now < queueTimeoutAt) return "none"; ++ ++ const fallbackApplied = this.circuitBreaker.tryFallback( ++ next, ++ "queue-timeout", ++ this.trafficLogger, ++ ); ++ if (fallbackApplied) { ++ return "none"; ++ } ++ ++ const timeoutError = this.createQueueTimeoutError(next, now); ++ this.attachTrafficMetadata( ++ timeoutError, ++ this.buildTrafficResponseMetadata( ++ next, ++ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), ++ now, ++ timeoutError, ++ ), ++ ); ++ this.controllerLogger.warn("Queue wait timed out; rejecting request", { ++ tenantId: next.tenantId, ++ waitedMs: timeoutError.waitedMs, ++ maxQueueWaitMs: timeoutError.maxQueueWaitMs, ++ deadlineAt: timeoutError.deadlineAt, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ rateLimitKey: timeoutError.rateLimitKey, ++ }); ++ queue.splice(index, 1); ++ next.reject(timeoutError); ++ return "rejected"; ++ } ++ ++ private rejectIfQueueTimedOut( ++ queueTimeoutExpired: boolean, ++ next: QueuedRequest, ++ queue: QueuedRequest[], ++ index: number, ++ now: number, ++ reason: string, ++ ): boolean { ++ if (!queueTimeoutExpired) return false; ++ const timeoutError = this.createQueueTimeoutError(next, now); ++ this.attachTrafficMetadata( ++ timeoutError, ++ this.buildTrafficResponseMetadata( ++ next, ++ timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), ++ now, ++ timeoutError, ++ ), ++ ); ++ this.controllerLogger.warn("Queue wait timed out during gate wait", { ++ tenantId: next.tenantId, ++ waitedMs: timeoutError.waitedMs, ++ maxQueueWaitMs: timeoutError.maxQueueWaitMs, ++ deadlineAt: timeoutError.deadlineAt, ++ provider: next.request.metadata?.provider, ++ model: next.request.metadata?.model, ++ rateLimitKey: timeoutError.rateLimitKey, ++ reason, ++ }); ++ queue.splice(index, 1); ++ next.reject(timeoutError); ++ return true; ++ } ++ ++ private createQueueTimeoutError(next: QueuedRequest, now: number): QueueWaitTimeoutError { ++ const waitedMs = Math.max(0, now - next.enqueuedAt); ++ return new QueueWaitTimeoutError({ ++ waitedMs, ++ maxQueueWaitMs: next.request.maxQueueWaitMs, ++ deadlineAt: next.request.deadlineAt, ++ metadata: next.request.metadata, ++ rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata), ++ }); ++ } ++ ++ private resolveTenantId(request: TrafficRequest): string { ++ return request.tenantId ?? request.metadata?.tenantId ?? 
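++ /* Explicit request.tenantId wins over metadata; untagged traffic shares one "default" fairness queue. */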
"default"; ++ } ++ ++ private enqueueItem(item: QueuedRequest): void { ++ const state = this.queues[item.priority]; ++ const tenantId = item.tenantId; ++ let queue = state.queues.get(tenantId); ++ if (!queue) { ++ queue = []; ++ state.queues.set(tenantId, queue); ++ state.order.push(tenantId); ++ } ++ queue.push(item); ++ } ++ ++ private getQueuedCount(priority: TrafficPriority): number { ++ const state = this.queues[priority]; ++ let total = 0; ++ for (const queue of state.queues.values()) { ++ total += queue.length; ++ } ++ return total; ++ } ++ ++ private refillPriorityCredits(): void { ++ this.priorityCredits.P0 = this.priorityWeights.P0; ++ this.priorityCredits.P1 = this.priorityWeights.P1; ++ this.priorityCredits.P2 = this.priorityWeights.P2; ++ } ++ ++ private recordPriorityDispatch(priority: TrafficPriority): void { ++ if (this.priorityCredits[priority] > 0) { ++ this.priorityCredits[priority] -= 1; ++ } ++ } ++ ++ private getPriorityDispatchOrder(): TrafficPriority[] { ++ const prioritiesWithWork = this.priorityOrder.filter( ++ (priority) => this.getQueuedCount(priority) > 0, ++ ); ++ if (prioritiesWithWork.length === 0) return []; ++ ++ let available = prioritiesWithWork.filter((priority) => this.priorityCredits[priority] > 0); ++ if (available.length === 0) { ++ this.refillPriorityCredits(); ++ available = prioritiesWithWork.filter((priority) => this.priorityCredits[priority] > 0); ++ } ++ ++ return available.length === 0 ? prioritiesWithWork : available; ++ } ++ ++ private getNextTenantCandidate( ++ priority: TrafficPriority, ++ ): { item: QueuedRequest; queue: QueuedRequest[]; tenantId: string } | undefined { ++ const state = this.queues[priority]; ++ if (state.order.length === 0) return undefined; ++ const maxAttempts = state.order.length; ++ let attempts = 0; ++ ++ while (attempts < maxAttempts && state.order.length > 0) { ++ const index = state.index % state.order.length; ++ const tenantId = state.order[index]; ++ const queue = state.queues.get(tenantId); ++ attempts += 1; ++ ++ if (!queue || queue.length === 0) { ++ this.removeTenantQueue(priority, tenantId); ++ continue; ++ } ++ ++ state.index = (index + 1) % state.order.length; ++ return { item: queue[0], queue, tenantId }; ++ } ++ ++ return undefined; ++ } ++ ++ private cleanupTenantQueue( ++ priority: TrafficPriority, ++ tenantId: string, ++ queue: QueuedRequest[], ++ ): void { ++ if (queue.length > 0) return; ++ this.removeTenantQueue(priority, tenantId); ++ } ++ ++ private removeTenantQueue(priority: TrafficPriority, tenantId: string): void { ++ const state = this.queues[priority]; ++ state.queues.delete(tenantId); ++ const index = state.order.indexOf(tenantId); ++ if (index === -1) return; ++ state.order.splice(index, 1); ++ if (state.order.length === 0) { ++ state.index = 0; ++ return; ++ } ++ if (state.index > index) { ++ state.index -= 1; ++ } ++ if (state.index >= state.order.length) { ++ state.index = 0; ++ } ++ } ++ ++ private resolvePriority(metadata?: TrafficRequestMetadata): TrafficPriority { ++ return metadata?.priority ?? "P1"; ++ } ++ ++ private buildRateLimitKey(metadata?: TrafficRequestMetadata): string { ++ return this.rateLimitKeyBuilder(metadata); ++ } ++ ++ private resolveAdaptiveLimit(next: QueuedRequest, now: number): DispatchDecision | null { ++ const rateLimitKey = next.rateLimitKey ?? 
this.buildRateLimitKey(next.request.metadata); ++ const adaptiveKey = this.buildAdaptiveKey(next.request.metadata, next.tenantId, rateLimitKey); ++ const state = this.adaptiveLimiterState.get(adaptiveKey); ++ if (!state) return null; ++ ++ this.applyAdaptiveDecay(state, now); ++ if (state.cooldownUntil !== undefined && now < state.cooldownUntil) { ++ return { kind: "wait", wakeUpAt: state.cooldownUntil }; ++ } ++ ++ return null; ++ } ++ ++ private recordAdaptiveRateLimitHit(key: string, retryAfterMs?: number): void { ++ const state = this.getAdaptiveState(key); ++ const now = Date.now(); ++ const { windowMs, threshold, minPenaltyMs, maxPenaltyMs, penaltyMultiplier } = ++ this.adaptiveLimiterConfig; ++ ++ state.last429At = now; ++ state.recent429s = state.recent429s.filter((timestamp) => now - timestamp <= windowMs); ++ state.recent429s.push(now); ++ ++ if (state.recent429s.length < threshold) { ++ return; ++ } ++ ++ const basePenalty = state.penaltyMs > 0 ? state.penaltyMs : minPenaltyMs; ++ const nextPenalty = Math.min( ++ maxPenaltyMs, ++ Math.max(minPenaltyMs, Math.round(basePenalty * penaltyMultiplier)), ++ ); ++ state.penaltyMs = nextPenalty; ++ const retryPenalty = typeof retryAfterMs === "number" ? retryAfterMs : 0; ++ const cooldownMs = Math.max(nextPenalty, retryPenalty); ++ state.cooldownUntil = now + cooldownMs; ++ } ++ ++ private recordAdaptiveSuccess(key: string): void { ++ const state = this.adaptiveLimiterState.get(key); ++ if (!state) return; ++ ++ const now = Date.now(); ++ this.applyAdaptiveDecay(state, now); ++ if (state.penaltyMs === 0) { ++ state.cooldownUntil = undefined; ++ state.recent429s = []; ++ state.last429At = undefined; ++ } ++ } ++ ++ private applyAdaptiveDecay(state: AdaptiveLimiterState, now: number): void { ++ const { decayMs, penaltyMultiplier } = this.adaptiveLimiterConfig; ++ if (state.last429At && now - state.last429At < decayMs) { ++ return; ++ } ++ ++ if (state.penaltyMs > 0) { ++ state.penaltyMs = Math.max(0, Math.floor(state.penaltyMs / penaltyMultiplier)); ++ } ++ } ++ ++ private getAdaptiveState(key: string): AdaptiveLimiterState { ++ const existing = this.adaptiveLimiterState.get(key); ++ if (existing) return existing; ++ const created: AdaptiveLimiterState = { ++ recent429s: [], ++ penaltyMs: 0, ++ }; ++ this.adaptiveLimiterState.set(key, created); ++ return created; ++ } ++ ++ private buildAdaptiveKey( ++ metadata: TrafficRequestMetadata | undefined, ++ tenantId: string, ++ rateLimitKey: string, ++ ): string { ++ if (rateLimitKey.includes("tenant=")) { ++ return rateLimitKey; ++ } ++ const tenant = metadata?.tenantId ?? tenantId ?? "default"; ++ return `${rateLimitKey}::tenant=${encodeURIComponent(tenant)}`; ++ } ++ ++ private buildTrafficResponseMetadata( ++ item: QueuedRequest, ++ rateLimitKey: string, ++ now: number, ++ error?: unknown, ++ ): TrafficResponseMetadata { ++ const snapshot = this.rateLimitSnapshots.get(rateLimitKey); ++ const retryAfterMs = this.resolveRetryAfterMs(error, snapshot); ++ const queuedForMs = ++ item.dispatchedAt !== undefined ? item.dispatchedAt - item.enqueuedAt : now - item.enqueuedAt; ++ const queueEtaMs = item.etaMs ?? Math.max(0, queuedForMs); ++ ++ return { ++ rateLimitKey, ++ retryAfterMs, ++ rateLimitRemaining: snapshot?.remaining, ++ rateLimitResetAt: snapshot?.resetAt, ++ rateLimitResetInMs: ++ snapshot?.resetAt !== undefined ? 
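++ /* Derive a relative countdown from the absolute resetAt, clamping at zero once the window has already elapsed. */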
Math.max(0, snapshot.resetAt - now) : undefined, ++ queueEtaMs, ++ tenantId: item.tenantId, ++ priority: item.request.metadata?.priority, ++ taskType: item.request.metadata?.taskType, ++ }; ++ } ++ ++ private buildTrafficResponseMetadataFromMetadata( ++ metadata: TrafficRequestMetadata | undefined, ++ rateLimitKey: string, ++ now: number, ++ error?: unknown, ++ ): TrafficResponseMetadata { ++ const snapshot = this.rateLimitSnapshots.get(rateLimitKey); ++ const retryAfterMs = this.resolveRetryAfterMs(error, snapshot); ++ ++ return { ++ rateLimitKey, ++ retryAfterMs, ++ rateLimitRemaining: snapshot?.remaining, ++ rateLimitResetAt: snapshot?.resetAt, ++ rateLimitResetInMs: ++ snapshot?.resetAt !== undefined ? Math.max(0, snapshot.resetAt - now) : undefined, ++ tenantId: metadata?.tenantId, ++ priority: metadata?.priority, ++ taskType: metadata?.taskType, ++ }; ++ } ++ ++ private attachTrafficMetadata(target: unknown, info: TrafficResponseMetadata): void { ++ if (!target || typeof target !== "object") return; ++ (target as Record).traffic = info; ++ } ++ ++ private resolveRetryAfterMs( ++ error: unknown | undefined, ++ snapshot?: RateLimitSnapshot, ++ ): number | undefined { ++ if (error && typeof error === "object" && "retryAfterMs" in error) { ++ const candidate = (error as { retryAfterMs?: unknown }).retryAfterMs; ++ if (typeof candidate === "number" && Number.isFinite(candidate)) { ++ return candidate; ++ } ++ } ++ if (snapshot?.retryAfterMs !== undefined) { ++ return snapshot.retryAfterMs; ++ } ++ return undefined; ++ } ++ ++ private resolveRateLimitStrategy( ++ key: string, ++ config?: RateLimitStrategyConfig, ++ ): RateLimitStrategyKind { ++ const modelOverride = config?.models?.[key]; ++ if (modelOverride) return modelOverride; ++ const provider = key.split("::")[0] ?? ""; ++ const providerOverride = config?.providers?.[provider]; ++ if (providerOverride) return providerOverride; ++ if (provider.startsWith("openai")) return "window"; ++ return "token-bucket"; ++ } ++} ++ ++/* ============================================================ ++ * Error + Singleton ++ * ============================================================ ++ */ ++ ++let singletonController: TrafficController | undefined; ++ ++export function getTrafficController(options?: TrafficControllerOptions): TrafficController { ++ if (!singletonController) { ++ singletonController = new TrafficController(options); ++ } ++ return singletonController; ++} ++ ++function buildRateLimitKeyFromMetadata(metadata?: TrafficRequestMetadata): string { ++ const provider = metadata?.provider ?? "default-provider"; ++ const model = metadata?.model ?? "default-model"; ++ const parts = [provider, model]; ++ ++ // SOP: Add new metadata fields in one place with a stable label and ordering. ++ // 1) Add the optional field to TrafficRequestMetadata. ++ // 2) Add it here with a stable label so keys stay predictable. ++ // Example: { label: "org", value: metadata?.orgId } ++ const optionalFields: Array<{ label: string; value?: string }> = [ ++ { label: "apiKey", value: metadata?.apiKeyId }, ++ { label: "region", value: metadata?.region }, ++ { label: "endpoint", value: metadata?.endpoint }, ++ // Intentionally exclude tenantId to enforce provider/model limits across tenants. ++ // Use rateLimitKeyBuilder to include tenant for per-tenant rate limits. 
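++ // Illustrative sketch (reviewer note, not part of this patch): a tenant-aware
++ // builder could be supplied via TrafficControllerOptions.rateLimitKeyBuilder,
++ // e.g. (m) => `${m?.provider ?? "default-provider"}::${m?.model ??
++ // "default-model"}::tenant=${encodeURIComponent(m?.tenantId ?? "default")}`.
++ // buildAdaptiveKey above already recognizes the "tenant=" label in such keys.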
++ { label: "tenantTier", value: metadata?.tenantTier }, ++ { label: "taskType", value: metadata?.taskType }, ++ ]; ++ ++ for (const field of optionalFields) { ++ if (!field.value) continue; ++ parts.push(`${field.label}=${encodeURIComponent(field.value)}`); ++ } ++ ++ return parts.join("::"); ++} ++ ++function buildProviderModelKeyFromMetadata(metadata?: TrafficRequestMetadata): string { ++ const provider = metadata?.provider ?? "default-provider"; ++ const model = metadata?.model ?? "default-model"; ++ return `${provider}::${model}`; ++} +diff --git a/packages/core/src/traffic/traffic-error-utils.ts b/packages/core/src/traffic/traffic-error-utils.ts +new file mode 100644 +index 00000000..4cbb98b5 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-error-utils.ts +@@ -0,0 +1,148 @@ ++import type { Logger } from "../logger"; ++ ++function readObjectProperty(value: unknown, key: string): unknown { ++ if (!value || typeof value !== "object") return undefined; ++ return (value as Record)[key]; ++} ++ ++export function findHeaders(value: unknown): unknown[] { ++ const candidates: unknown[] = [ ++ readObjectProperty(value, "headers"), ++ readObjectProperty(readObjectProperty(value, "response"), "headers"), ++ readObjectProperty(readObjectProperty(value, "cause"), "headers"), ++ readObjectProperty( ++ readObjectProperty(readObjectProperty(value, "cause"), "response"), ++ "headers", ++ ), ++ ]; ++ ++ return candidates.filter((candidate) => candidate !== undefined && candidate !== null); ++} ++ ++export function readHeaderValue(headers: unknown, name: string): string | undefined { ++ if (!headers) return undefined; ++ ++ if (typeof (headers as { get?: unknown }).get === "function") { ++ const v = (headers as { get: (name: string) => unknown }).get(name); ++ return v === null || v === undefined ? undefined : String(v); ++ } ++ ++ if (typeof headers !== "object") return undefined; ++ ++ const entries = Object.entries(headers as Record); ++ const target = name.toLowerCase(); ++ const match = entries.find(([k]) => String(k).toLowerCase() === target); ++ if (!match) return undefined; ++ ++ const value = match[1]; ++ if (Array.isArray(value)) { ++ const first = value[0]; ++ return first === null || first === undefined ? undefined : String(first); ++ } ++ return value === null || value === undefined ? undefined : String(value); ++} ++ ++export function parseRetryAfterMs(value: string, nowMs: number = Date.now()): number | undefined { ++ const raw = value.trim(); ++ if (!raw) return undefined; ++ ++ const seconds = Number(raw); ++ if (Number.isFinite(seconds)) { ++ return Math.max(0, Math.round(seconds * 1000)); ++ } ++ ++ const parsedAt = Date.parse(raw); ++ if (Number.isFinite(parsedAt)) { ++ return Math.max(0, parsedAt - nowMs); ++ } ++ ++ return undefined; ++} ++ ++export function coerceStatus(value: unknown): number | undefined { ++ const n = Number(value); ++ return Number.isFinite(n) ? n : undefined; ++} ++ ++export function extractStatusCode(error: unknown, logger?: Logger): number | undefined { ++ const status = ++ coerceStatus(readObjectProperty(error, "status")) ?? ++ coerceStatus(readObjectProperty(error, "statusCode")) ?? ++ coerceStatus(readObjectProperty(error, "httpStatus")) ?? ++ coerceStatus(readObjectProperty(readObjectProperty(error, "response"), "status")) ?? 
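++ /* Probe the shapes common SDK errors use: top-level status/statusCode/httpStatus, then response.status, then cause.status. */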
++ coerceStatus(readObjectProperty(readObjectProperty(error, "cause"), "status")); ++ ++ logger?.trace?.("Extracted status code", { ++ status, ++ hasStatus: readObjectProperty(error, "status") !== undefined, ++ hasStatusCode: readObjectProperty(error, "statusCode") !== undefined, ++ hasHttpStatus: readObjectProperty(error, "httpStatus") !== undefined, ++ hasResponseStatus: ++ readObjectProperty(readObjectProperty(error, "response"), "status") !== undefined, ++ hasCauseStatus: readObjectProperty(readObjectProperty(error, "cause"), "status") !== undefined, ++ }); ++ ++ return status; ++} ++ ++export function extractRetryAfterMs(error: unknown, logger?: Logger): number | undefined { ++ const retryAfterLogger = logger?.child({ module: "retry-after" }); ++ const candidates = findHeaders(error); ++ ++ for (const headers of candidates) { ++ const raw = readHeaderValue(headers, "retry-after"); ++ if (!raw) continue; ++ const parsed = parseRetryAfterMs(raw); ++ retryAfterLogger?.trace?.("Parsed Retry-After header", { raw, parsedMs: parsed }); ++ if (parsed !== undefined) return parsed; ++ } ++ ++ retryAfterLogger?.trace?.("Retry-After header missing or unparsable"); ++ return undefined; ++} ++ ++export function isTimeoutError(error: unknown, logger?: Logger): boolean { ++ const candidates: unknown[] = [error]; ++ ++ const cause = readObjectProperty(error, "cause"); ++ if (cause) { ++ candidates.push(cause); ++ const nestedCause = readObjectProperty(cause, "cause"); ++ if (nestedCause) candidates.push(nestedCause); ++ } ++ ++ for (const candidate of candidates) { ++ const code = readObjectProperty(candidate, "code"); ++ const name = readObjectProperty(candidate, "name"); ++ const message = readObjectProperty(candidate, "message"); ++ ++ const codeText = String(code ?? "").toLowerCase(); ++ const nameText = String(name ?? "").toLowerCase(); ++ const messageText = String(message ?? "").toLowerCase(); ++ ++ const isTimeout = ++ codeText.includes("timeout") || ++ codeText.includes("timedout") || ++ nameText.includes("timeout") || ++ nameText.includes("timedout") || ++ messageText.includes("timeout") || ++ messageText.includes("timedout") || ++ messageText.includes("timed out"); ++ ++ logger?.trace?.("Checked timeout error", { ++ isTimeout, ++ code, ++ name, ++ messagePreview: typeof message === "string" ? 
message.slice(0, 160) : message, ++ hasCause: candidate !== error, ++ }); ++ ++ if (isTimeout) return true; ++ } ++ ++ return false; ++} ++ ++export function isPromiseLike(value: unknown): value is PromiseLike { ++ return !!value && typeof (value as { then?: unknown }).then === "function"; ++} +diff --git a/packages/core/src/traffic/traffic-errors.ts b/packages/core/src/traffic/traffic-errors.ts +new file mode 100644 +index 00000000..4943c89f +--- /dev/null ++++ b/packages/core/src/traffic/traffic-errors.ts +@@ -0,0 +1,141 @@ ++import type { Logger } from "../logger"; ++import { extractRetryAfterMs, extractStatusCode } from "./traffic-error-utils"; ++import type { TrafficRequestMetadata } from "./traffic-types"; ++ ++export type RateLimitErrorOptions = { ++ metadata?: TrafficRequestMetadata; ++ retryAfterMs?: number; ++ tenantId?: string; ++ key?: string; ++}; ++ ++export class CircuitBreakerOpenError extends Error { ++ readonly retryAfterMs?: number; ++ readonly metadata?: TrafficRequestMetadata; ++ ++ constructor(message: string, metadata?: TrafficRequestMetadata, retryAfterMs?: number) { ++ super(message); ++ this.name = "CircuitBreakerOpenError"; ++ this.metadata = metadata; ++ this.retryAfterMs = retryAfterMs; ++ } ++} ++ ++export class QueueWaitTimeoutError extends Error { ++ readonly waitedMs: number; ++ readonly maxQueueWaitMs?: number; ++ readonly deadlineAt?: number; ++ readonly metadata?: TrafficRequestMetadata; ++ readonly rateLimitKey?: string; ++ ++ constructor(options: { ++ waitedMs: number; ++ maxQueueWaitMs?: number; ++ deadlineAt?: number; ++ metadata?: TrafficRequestMetadata; ++ rateLimitKey?: string; ++ }) { ++ super("Queue wait time exceeded"); ++ this.name = "QueueWaitTimeoutError"; ++ this.waitedMs = options.waitedMs; ++ this.maxQueueWaitMs = options.maxQueueWaitMs; ++ this.deadlineAt = options.deadlineAt; ++ this.metadata = options.metadata; ++ this.rateLimitKey = options.rateLimitKey; ++ } ++} ++ ++export class RateLimitedUpstreamError extends Error { ++ readonly status = 429; ++ readonly retryAfterMs?: number; ++ readonly metadata?: TrafficRequestMetadata; ++ readonly provider?: string; ++ readonly model?: string; ++ readonly tenantId?: string; ++ readonly key?: string; ++ ++ constructor( ++ message: string, ++ metadata?: TrafficRequestMetadata, ++ retryAfterMs?: number, ++ options?: { tenantId?: string; key?: string }, ++ ); ++ constructor(message: string, options?: RateLimitErrorOptions); ++ constructor( ++ message: string, ++ metadataOrOptions?: TrafficRequestMetadata | RateLimitErrorOptions, ++ retryAfterMs?: number, ++ legacyOptions?: { tenantId?: string; key?: string }, ++ ) { ++ super(message); ++ this.name = "RateLimitedUpstreamError"; ++ const isOptions = ++ metadataOrOptions && ++ (Object.prototype.hasOwnProperty.call(metadataOrOptions, "metadata") || ++ Object.prototype.hasOwnProperty.call(metadataOrOptions, "retryAfterMs") || ++ Object.prototype.hasOwnProperty.call(metadataOrOptions, "key")); ++ ++ const metadata = isOptions ++ ? (metadataOrOptions as RateLimitErrorOptions).metadata ++ : (metadataOrOptions as TrafficRequestMetadata | undefined); ++ const retryAfter = isOptions ++ ? (metadataOrOptions as RateLimitErrorOptions).retryAfterMs ++ : retryAfterMs; ++ const tenantId = isOptions ++ ? (metadataOrOptions as RateLimitErrorOptions).tenantId ++ : legacyOptions?.tenantId; ++ const key = isOptions ? 
(metadataOrOptions as RateLimitErrorOptions).key : legacyOptions?.key; ++ ++ this.metadata = metadata; ++ this.retryAfterMs = retryAfter; ++ this.provider = metadata?.provider; ++ this.model = metadata?.model; ++ this.tenantId = tenantId ?? metadata?.tenantId; ++ this.key = key; ++ } ++} ++ ++export function normalizeRateLimitError(options: { ++ error: unknown; ++ metadata?: TrafficRequestMetadata; ++ tenantId?: string; ++ key?: string; ++ logger?: Logger; ++}): RateLimitedUpstreamError | undefined { ++ const { error, metadata, tenantId, key, logger } = options; ++ const retryAfterMs = ++ error instanceof RateLimitedUpstreamError ++ ? (error.retryAfterMs ?? extractRetryAfterMs(error, logger)) ++ : extractRetryAfterMs(error, logger); ++ ++ if (error instanceof RateLimitedUpstreamError) { ++ const baseMetadata = metadata ?? error.metadata; ++ const baseTenant = tenantId ?? error.tenantId; ++ const baseKey = key ?? error.key; ++ if ( ++ error.metadata === baseMetadata && ++ error.retryAfterMs === retryAfterMs && ++ error.tenantId === baseTenant && ++ error.key === baseKey ++ ) { ++ return error; ++ } ++ return new RateLimitedUpstreamError(error.message, { ++ metadata: baseMetadata, ++ retryAfterMs, ++ tenantId: baseTenant, ++ key: baseKey, ++ }); ++ } ++ ++ const status = extractStatusCode(error, logger); ++ if (status !== 429) return undefined; ++ ++ const message = error instanceof Error ? error.message : "Rate limit exceeded"; ++ return new RateLimitedUpstreamError(message, { ++ metadata, ++ retryAfterMs, ++ tenantId, ++ key, ++ }); ++} +diff --git a/packages/core/src/traffic/traffic-rate-limiter.ts b/packages/core/src/traffic/traffic-rate-limiter.ts +new file mode 100644 +index 00000000..3e5aefbe +--- /dev/null ++++ b/packages/core/src/traffic/traffic-rate-limiter.ts +@@ -0,0 +1,295 @@ ++import type { Logger } from "../logger"; ++import type { ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++} from "./rate-limit-strategies/rate-limit-strategy"; ++import { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy"; ++import type { DispatchDecision, QueuedRequest } from "./traffic-controller-internal"; ++import type { RateLimitConfig, TrafficRequestMetadata } from "./traffic-types"; ++ ++export type { ++ RateLimitHeaderSnapshot, ++ RateLimitStrategy, ++ RateLimitUpdateResult, ++} from "./rate-limit-strategies/rate-limit-strategy"; ++export { DefaultRateLimitStrategy } from "./rate-limit-strategies/default-rate-limit-strategy"; ++export { OpenAIWindowRateLimitStrategy } from "./rate-limit-strategies/openai-window-rate-limit-strategy"; ++export { TokenBucketRateLimitStrategy } from "./rate-limit-strategies/token-bucket-rate-limit-strategy"; ++ ++type SchedulerCallback = () => void; ++ ++export type RateLimitStrategyFactory = (key: string) => RateLimitStrategy; ++ ++type UsageCounters = { ++ inputTokens?: number; ++ outputTokens?: number; ++ totalTokens?: number; ++}; ++ ++type TokenRateState = { ++ capacity: number; ++ refillPerSecond: number; ++ tokens: number; ++ updatedAt: number; ++}; ++ ++export class TrafficRateLimiter { ++ private readonly strategies = new Map(); ++ private readonly tokenRates = new Map(); ++ private wakeUpTimeout?: ReturnType; ++ private wakeUpAt?: number; ++ private readonly onWakeUp: SchedulerCallback; ++ private readonly strategyFactory: RateLimitStrategyFactory; ++ private readonly rateLimits?: RateLimitConfig; ++ ++ constructor( ++ onWakeUp: SchedulerCallback, ++ options?: { strategyFactory?: RateLimitStrategyFactory; 
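++ /* strategyFactory picks per-key pacing (window vs. token bucket); rateLimits supplies the static per-key budgets. */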
rateLimits?: RateLimitConfig }, ++ ) { ++ this.onWakeUp = onWakeUp; ++ this.rateLimits = options?.rateLimits; ++ this.strategyFactory = ++ options?.strategyFactory ?? ++ ((key) => new TokenBucketRateLimitStrategy(key, this.rateLimits?.[key])); ++ } ++ ++ resolve(next: QueuedRequest, key: string, logger?: Logger): DispatchDecision | null { ++ const strategy = this.strategies.get(key) ?? this.createStrategy(key, logger); ++ const requestDecision = strategy.resolve(next, logger); ++ if (requestDecision?.kind === "wait") { ++ const tokenDecision = strategy.handlesTokenLimits ++ ? null ++ : this.resolveTokenLimit(next, key, logger, false); ++ if (tokenDecision?.kind === "wait") { ++ const requestWakeUp = requestDecision.wakeUpAt; ++ const tokenWakeUp = tokenDecision.wakeUpAt; ++ if (tokenWakeUp !== undefined && requestWakeUp !== undefined) { ++ return { kind: "wait", wakeUpAt: Math.min(requestWakeUp, tokenWakeUp) }; ++ } ++ if (tokenWakeUp !== undefined && requestWakeUp === undefined) { ++ return tokenDecision; ++ } ++ } ++ return requestDecision; ++ } ++ ++ const tokenDecision = strategy.handlesTokenLimits ++ ? null ++ : this.resolveTokenLimit(next, key, logger, true); ++ if (tokenDecision?.kind === "wait") { ++ return tokenDecision; ++ } ++ ++ return requestDecision; ++ } ++ ++ notifyDispatch(key: string | undefined, logger?: Logger): void { ++ if (!key) return; ++ this.strategies.get(key)?.onDispatch(logger); ++ } ++ ++ scheduleWakeUpAt(wakeUpAt: number, logger?: Logger): void { ++ const rateLimitLogger = logger?.child({ module: "rate-limiter" }); ++ const now = Date.now(); ++ const target = Math.max(now, wakeUpAt); ++ ++ if (this.wakeUpTimeout && this.wakeUpAt !== undefined && this.wakeUpAt <= target) { ++ rateLimitLogger?.trace?.("Wakeup already scheduled earlier; skipping", { ++ currentWakeUpAt: this.wakeUpAt, ++ requestedWakeUpAt: target, ++ }); ++ return; ++ } ++ ++ if (this.wakeUpTimeout) clearTimeout(this.wakeUpTimeout); ++ ++ this.wakeUpAt = target; ++ rateLimitLogger?.debug?.("Scheduling rate limit wakeup", { ++ wakeUpAt: target, ++ inMs: Math.max(1, target - now), ++ }); ++ this.wakeUpTimeout = setTimeout( ++ () => { ++ this.wakeUpTimeout = undefined; ++ this.wakeUpAt = undefined; ++ rateLimitLogger?.debug?.("Rate limit wakeup fired"); ++ this.onWakeUp(); ++ }, ++ Math.max(1, target - now), ++ ); ++ } ++ ++ releaseReservation(key?: string, logger?: Logger): void { ++ if (!key) return; ++ this.strategies.get(key)?.onComplete(logger); ++ } ++ ++ recordUsage( ++ key: string | undefined, ++ usage: UsageCounters | Promise | undefined, ++ logger?: Logger, ++ reservedTokens?: number, ++ ): void { ++ if (!key || !usage) return; ++ if (typeof (usage as PromiseLike).then === "function") { ++ void (usage as Promise) ++ .then((resolved) => this.recordUsage(key, resolved, logger, reservedTokens)) ++ .catch(() => {}); ++ return; ++ } ++ ++ const strategy = this.strategies.get(key); ++ if (strategy?.recordUsage) { ++ strategy.recordUsage(usage, logger, reservedTokens); ++ return; ++ } ++ ++ const tokens = this.resolveTokenCount(usage); ++ if (tokens <= 0) return; ++ ++ const bucket = this.getTokenRateState(key, logger); ++ if (!bucket) return; ++ ++ const now = Date.now(); ++ this.refillTokenRate(bucket, now); ++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens); ++ const reserved = typeof reservedTokens === "number" ? 
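++ /* Settle actual usage against tokens reserved at dispatch: only the difference moves the bucket, in either direction. */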
reservedTokens : 0; ++ const delta = tokens - reserved; ++ if (delta > 0) { ++ bucket.tokens -= delta; ++ } else if (delta < 0) { ++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + Math.abs(delta)); ++ } ++ ++ if (bucket.tokens < 0 && bucket.refillPerSecond > 0) { ++ const waitMs = Math.max(1, Math.ceil((-bucket.tokens / bucket.refillPerSecond) * 1000)); ++ this.scheduleWakeUpAt(now + waitMs, logger); ++ } ++ } ++ ++ updateFromHeaders( ++ metadata: TrafficRequestMetadata | undefined, ++ headers: unknown, ++ key: string, ++ logger?: Logger, ++ ): RateLimitUpdateResult | undefined { ++ const existing = this.strategies.get(key); ++ if (existing) return existing.updateFromHeaders(metadata, headers, logger); ++ ++ const created = this.strategyFactory(key); ++ const update = created.updateFromHeaders(metadata, headers, logger); ++ if (!update) return undefined; ++ this.strategies.set(key, created); ++ return update; ++ } ++ ++ private createStrategy(key: string, logger?: Logger): RateLimitStrategy { ++ const created = this.strategyFactory(key); ++ this.strategies.set(key, created); ++ logger?.child({ module: "rate-limiter" })?.trace?.("Created rate limit strategy", { ++ rateLimitKey: key, ++ strategy: created.constructor.name, ++ }); ++ return created; ++ } ++ ++ private resolveTokenLimit( ++ next: QueuedRequest, ++ key: string, ++ logger?: Logger, ++ reserveTokens = true, ++ ): DispatchDecision | null { ++ const bucket = this.getTokenRateState(key, logger); ++ if (!bucket) return null; ++ ++ const now = Date.now(); ++ this.refillTokenRate(bucket, now); ++ ++ if (bucket.capacity <= 0) { ++ logger?.child({ module: "rate-limiter" })?.debug?.("Token limit misconfigured; blocking", { ++ rateLimitKey: key, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ }); ++ return { kind: "wait" }; ++ } ++ ++ const estimatedTokens = next.estimatedTokens; ++ if (typeof estimatedTokens === "number" && estimatedTokens > 0) { ++ if (bucket.tokens >= estimatedTokens) { ++ if (reserveTokens) { ++ bucket.tokens -= estimatedTokens; ++ next.reservedTokens = estimatedTokens; ++ } ++ return null; ++ } ++ } else if (bucket.tokens >= 0) { ++ return null; ++ } ++ ++ if (bucket.refillPerSecond <= 0) { ++ logger?.child({ module: "rate-limiter" })?.debug?.("Token limit has no refill; blocking", { ++ rateLimitKey: key, ++ capacity: bucket.capacity, ++ refillPerSecond: bucket.refillPerSecond, ++ }); ++ return { kind: "wait" }; ++ } ++ ++ const requiredTokens = ++ typeof estimatedTokens === "number" && estimatedTokens > 0 ++ ? Math.max(estimatedTokens - bucket.tokens, 1) ++ : -bucket.tokens; ++ const waitMs = Math.max(1, Math.ceil((requiredTokens / bucket.refillPerSecond) * 1000)); ++ return { kind: "wait", wakeUpAt: now + waitMs }; ++ } ++ ++ private getTokenRateState(key: string, logger?: Logger): TokenRateState | undefined { ++ const existing = this.tokenRates.get(key); ++ if (existing) return existing; ++ ++ const options = this.rateLimits?.[key]; ++ if (!options) return undefined; ++ ++ const tokensPerMinute = Number(options.tokensPerMinute); ++ if (!Number.isFinite(tokensPerMinute) || tokensPerMinute <= 0) { ++ return undefined; ++ } ++ ++ // Token pacing uses a 1-minute burst by default; request bursts are handled separately. 
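++ // Worked example (illustrative values): tokensPerMinute = 60_000 gives
++ // capacity 60_000 and refillPerSecond 1_000, so a request short 5_000 tokens
++ // waits ceil(5_000 / 1_000) = 5s before the scheduled wakeup fires.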
++ const refillPerSecond = tokensPerMinute / 60; ++ const capacity = tokensPerMinute; ++ const now = Date.now(); ++ const created: TokenRateState = { ++ capacity, ++ refillPerSecond, ++ tokens: capacity, ++ updatedAt: now, ++ }; ++ this.tokenRates.set(key, created); ++ logger?.child({ module: "rate-limiter" })?.trace?.("Created token rate state", { ++ rateLimitKey: key, ++ capacity, ++ refillPerSecond, ++ }); ++ return created; ++ } ++ ++ private refillTokenRate(bucket: TokenRateState, now: number): void { ++ const elapsedMs = now - bucket.updatedAt; ++ if (elapsedMs <= 0) return; ++ bucket.updatedAt = now; ++ if (bucket.capacity <= 0 || bucket.refillPerSecond <= 0) return; ++ const refill = (elapsedMs / 1000) * bucket.refillPerSecond; ++ if (refill <= 0) return; ++ bucket.tokens = Math.min(bucket.capacity, bucket.tokens + refill); ++ } ++ ++ private resolveTokenCount(usage: UsageCounters): number { ++ const total = Number.isFinite(usage.totalTokens) ? usage.totalTokens : undefined; ++ if (total !== undefined) return total; ++ const input = Number.isFinite(usage.inputTokens) ? usage.inputTokens : 0; ++ const output = Number.isFinite(usage.outputTokens) ? usage.outputTokens : 0; ++ return input + output; ++ } ++} +diff --git a/packages/core/src/traffic/traffic-retry.spec.ts b/packages/core/src/traffic/traffic-retry.spec.ts +new file mode 100644 +index 00000000..2360ca10 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-retry.spec.ts +@@ -0,0 +1,45 @@ ++import { describe, expect, it, vi } from "vitest"; ++import { buildRetryPlan } from "./traffic-retry"; ++ ++describe("buildRetryPlan", () => { ++ it("respects Retry-After for 429s", () => { ++ const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0); ++ try { ++ const plan = buildRetryPlan( ++ { ++ status: 429, ++ response: { headers: { "retry-after": "2" } }, ++ }, ++ 1, ++ ); ++ ++ expect(plan).toBeTruthy(); ++ expect(plan?.reason).toBe("rateLimit"); ++ expect(plan?.delayMs).toBeGreaterThanOrEqual(2_000); ++ } finally { ++ randomSpy.mockRestore(); ++ } ++ }); ++ ++ it("parses HTTP-date Retry-After values", () => { ++ vi.useFakeTimers(); ++ const randomSpy = vi.spyOn(Math, "random").mockReturnValue(0); ++ ++ try { ++ vi.setSystemTime(new Date("2020-01-01T00:00:00.000Z")); ++ const plan = buildRetryPlan( ++ { ++ statusCode: 429, ++ response: { headers: { "retry-after": "Wed, 01 Jan 2020 00:00:03 GMT" } }, ++ }, ++ 1, ++ ); ++ ++ expect(plan).toBeTruthy(); ++ expect(plan?.delayMs).toBeGreaterThanOrEqual(3_000); ++ } finally { ++ vi.useRealTimers(); ++ randomSpy.mockRestore(); ++ } ++ }); ++}); +diff --git a/packages/core/src/traffic/traffic-retry.ts b/packages/core/src/traffic/traffic-retry.ts +new file mode 100644 +index 00000000..9604dc53 +--- /dev/null ++++ b/packages/core/src/traffic/traffic-retry.ts +@@ -0,0 +1,144 @@ ++import type { Logger } from "../logger"; ++import { ++ MAX_RETRY_ATTEMPTS, ++ RATE_LIMIT_BASE_BACKOFF_MS, ++ RATE_LIMIT_JITTER_FACTOR, ++ SERVER_ERROR_BASE_BACKOFF_MS, ++ SERVER_ERROR_JITTER_FACTOR, ++ TIMEOUT_BASE_BACKOFF_MS, ++ TIMEOUT_JITTER_FACTOR, ++ TIMEOUT_RETRY_ATTEMPTS, ++} from "./traffic-constants"; ++import { extractRetryAfterMs, extractStatusCode, isTimeoutError } from "./traffic-error-utils"; ++import { RateLimitedUpstreamError } from "./traffic-errors"; ++import type { ++ RetryPlan, ++ RetryPolicy, ++ RetryPolicyConfig, ++ RetryPolicyContext, ++ RetryReason, ++} from "./traffic-types"; ++ ++export type { ++ RetryPlan, ++ RetryPolicy, ++ RetryPolicyConfig, ++ RetryPolicyContext, ++ RetryReason, ++} 
from "./traffic-types"; ++ ++export function buildRetryPlan( ++ error: unknown, ++ attempt: number, ++ logger?: Logger, ++): RetryPlan | undefined { ++ const retryLogger = logger?.child({ module: "retry" }); ++ const reason = getRetryReason(error, retryLogger); ++ if (!reason) { ++ retryLogger?.debug?.("No retry reason detected; skipping retry", { attempt }); ++ return undefined; ++ } ++ ++ const max = reason === "timeout" ? TIMEOUT_RETRY_ATTEMPTS : MAX_RETRY_ATTEMPTS; ++ if (attempt >= max) { ++ retryLogger?.debug?.("Retry attempts exhausted; skipping retry", { ++ attempt, ++ max, ++ reason, ++ }); ++ return undefined; ++ } ++ ++ const computedDelayMs = computeBackoffDelay(reason, attempt); ++ const retryAfterMs = ++ reason === "rateLimit" ++ ? error instanceof RateLimitedUpstreamError ++ ? error.retryAfterMs ++ : extractRetryAfterMs(error, retryLogger) ++ : undefined; ++ const delayMs = ++ retryAfterMs === undefined ? computedDelayMs : Math.max(computedDelayMs, retryAfterMs); ++ ++ retryLogger?.debug?.("Retry plan built", { ++ attempt, ++ reason, ++ delayMs, ++ computedDelayMs, ++ retryAfterMs, ++ max, ++ }); ++ ++ return { ++ reason, ++ delayMs, ++ }; ++} ++ ++export function buildRetryPlanWithPolicy( ++ context: RetryPolicyContext, ++ policyConfig?: RetryPolicyConfig, ++): RetryPlan | undefined { ++ const retryLogger = context.logger?.child({ module: "retry" }); ++ const policy = resolveRetryPolicy(context, policyConfig); ++ if (policy) { ++ const planned = policy(context); ++ if (planned) { ++ retryLogger?.debug?.("Retry policy returned a plan", { ++ attempt: context.attempt, ++ reason: planned.reason, ++ delayMs: planned.delayMs, ++ }); ++ return planned; ++ } ++ retryLogger?.debug?.("Retry policy declined to retry", { attempt: context.attempt }); ++ } ++ ++ return buildRetryPlan(context.error, context.attempt, context.logger); ++} ++ ++function resolveRetryPolicy( ++ context: RetryPolicyContext, ++ config?: RetryPolicyConfig, ++): RetryPolicy | undefined { ++ if (!config) return undefined; ++ const modelPolicy = context.key ? config.models?.[context.key] : undefined; ++ if (modelPolicy) return modelPolicy; ++ const providerModelKey = ++ context.metadata?.provider && context.metadata?.model ++ ? `${context.metadata.provider}::${context.metadata.model}` ++ : undefined; ++ const providerModelPolicy = providerModelKey ? config.models?.[providerModelKey] : undefined; ++ if (providerModelPolicy) return providerModelPolicy; ++ const provider = context.metadata?.provider; ++ const providerPolicy = provider ? config.providers?.[provider] : undefined; ++ if (providerPolicy) return providerPolicy; ++ return config.default; ++} ++ ++function getRetryReason(error: unknown, logger?: Logger): RetryReason | undefined { ++ if (error instanceof RateLimitedUpstreamError) return "rateLimit"; ++ const status = extractStatusCode(error, logger); ++ if (status === 429) return "rateLimit"; ++ if (status && status >= 500) return "serverError"; ++ if (status === 408 || isTimeoutError(error, logger)) return "timeout"; ++ return undefined; ++} ++ ++function computeBackoffDelay(reason: RetryReason, attempt: number): number { ++ const base = ++ reason === "serverError" ++ ? SERVER_ERROR_BASE_BACKOFF_MS ++ : reason === "timeout" ++ ? TIMEOUT_BASE_BACKOFF_MS ++ : RATE_LIMIT_BASE_BACKOFF_MS; ++ ++ const jitter = ++ reason === "serverError" ++ ? SERVER_ERROR_JITTER_FACTOR ++ : reason === "timeout" ++ ? 
TIMEOUT_JITTER_FACTOR
++        : RATE_LIMIT_JITTER_FACTOR;
++
++  const exp = base * 2 ** (attempt - 1);
++  return Math.round(exp + exp * jitter * Math.random());
++}
+diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts
+new file mode 100644
+index 00000000..1d847e25
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-types.ts
+@@ -0,0 +1,181 @@
++import type { Logger } from "../logger";
++
++type BivariantFunction<TArgs extends unknown[], TReturn> = {
++  bivarianceHack(...args: TArgs): TReturn;
++}["bivarianceHack"];
++
++type UsageCounters = {
++  inputTokens?: number;
++  outputTokens?: number;
++  totalTokens?: number;
++};
++
++export type RetryReason = "rateLimit" | "serverError" | "timeout";
++
++export type RetryPlan = {
++  delayMs: number;
++  reason: RetryReason;
++};
++
++export type RetryPolicyContext = {
++  error: unknown;
++  attempt: number;
++  metadata?: TrafficRequestMetadata;
++  key?: string;
++  logger?: Logger;
++};
++
++export type RetryPolicy = (context: RetryPolicyContext) => RetryPlan | undefined;
++
++export type RetryPolicyConfig = {
++  default?: RetryPolicy;
++  providers?: Record<string, RetryPolicy>;
++  models?: Record<string, RetryPolicy>;
++};
++
++export type TrafficRequestType = "text" | "stream";
++export type TrafficPriority = "P0" | "P1" | "P2";
++
++export interface TrafficRequestMetadata {
++  agentId?: string;
++  agentName?: string;
++  model?: string;
++  provider?: string;
++  priority?: TrafficPriority;
++  tenantId?: string;
++  apiKeyId?: string;
++  region?: string;
++  endpoint?: string;
++  tenantTier?: string;
++  taskType?: string;
++  fallbackPolicyId?: string;
++}
++
++export type TrafficResponseMetadata = {
++  rateLimitKey?: string;
++  retryAfterMs?: number;
++  rateLimitRemaining?: number;
++  rateLimitResetAt?: number;
++  rateLimitResetInMs?: number;
++  queueEtaMs?: number;
++  tenantId?: string;
++  priority?: TrafficPriority;
++  taskType?: string;
++};
++
++export type FallbackTarget = {
++  provider?: string;
++  model: string;
++};
++
++export type ShortResponseFallbackTarget = {
++  kind: "short-response";
++  text: string;
++};
++
++export type FallbackChainEntry = string | FallbackTarget | ShortResponseFallbackTarget;
++
++export type FallbackPolicyMode = "fallback" | "wait";
++
++export type FallbackPolicy = {
++  mode: FallbackPolicyMode;
++};
++
++export type FallbackPolicyConfig = {
++  defaultPolicyId?: string;
++  policies?: Record<string, FallbackPolicy>;
++  taskTypePolicyIds?: Record<string, string>;
++};
++
++export type ProviderModelConcurrencyLimit =
++  | number
++  | Record<string, number>
++  | ((metadata: TrafficRequestMetadata | undefined, key: string) => number | undefined);
++
++export type TenantConcurrencyLimit =
++  | number
++  | Record<string, number>
++  | ((tenantId: string, metadata: TrafficRequestMetadata | undefined) => number | undefined);
++
++export type PriorityBurstLimits = Partial<Record<TrafficPriority, number>>;
++export type PriorityWeights = Partial<Record<TrafficPriority, number>>;
++
++export type AdaptiveLimiterConfig = {
++  windowMs?: number;
++  threshold?: number;
++  minPenaltyMs?: number;
++  maxPenaltyMs?: number;
++  penaltyMultiplier?: number;
++  decayMs?: number;
++};
++
++export interface TrafficRequest<TResponse = unknown> {
++  tenantId: string;
++  metadata?: TrafficRequestMetadata;
++  execute: () => Promise<TResponse>;
++  deadlineAt?: number;
++  maxQueueWaitMs?: number;
++  estimatedTokens?: number;
++  createFallbackRequest?: BivariantFunction<
++    [target: FallbackChainEntry],
++    TrafficRequest<TResponse> | undefined
++  >;
++  extractUsage?: BivariantFunction<
++    [response: TResponse],
++    Promise<UsageCounters> | UsageCounters | undefined
++  >;
++}
++
++export interface TrafficControllerOptions {
++  maxConcurrent?: number;
++  maxConcurrentPerProviderModel?: ProviderModelConcurrencyLimit;
++  maxConcurrentPerTenant?: TenantConcurrencyLimit;
++  rateLimits?: RateLimitConfig;
++  priorityBurstLimits?: PriorityBurstLimits;
++  priorityWeights?: PriorityWeights;
++  adaptiveLimiter?: AdaptiveLimiterConfig;
++  /**
++   * Optional override for rate-limit key construction.
++   * Useful when you need to add new metadata fields without changing core logic.
++   */
++  rateLimitKeyBuilder?: (metadata?: TrafficRequestMetadata) => string;
++  /**
++   * Optional retry policy overrides by provider/model.
++   * Model keys can use the rate-limit key or provider::model.
++   */
++  retryPolicy?: RetryPolicyConfig;
++  /**
++   * Optional fallback policy selection by task type or explicit policy id.
++   */
++  fallbackPolicy?: FallbackPolicyConfig;
++  /**
++   * Select a rate-limit strategy by provider/model.
++   * Example:
++   *   { providers: { openai: "window" }, models: { "openai::gpt-4o": "window" } }
++   */
++  rateLimitStrategy?: RateLimitStrategyConfig;
++  logger?: Logger;
++  fallbackChains?: Record<string, FallbackChainEntry[]>;
++}
++
++export type RateLimitStrategyKind = "window" | "token-bucket";
++
++export type RateLimitStrategyConfig = {
++  providers?: Record<string, RateLimitStrategyKind>;
++  models?: Record<string, RateLimitStrategyKind>;
++};
++
++export interface RateLimitOptions {
++  requestsPerMinute: number;
++  tokensPerMinute: number;
++  burstSize?: number;
++}
++
++export type RateLimitKey = string;
++export type RateLimitConfig = Record<RateLimitKey, RateLimitOptions>;
++
++export type TenantUsage = {
++  inputTokens: number;
++  outputTokens: number;
++  totalTokens: number;
++};
+diff --git a/packages/core/src/traffic/traffic-usage-tracker.ts b/packages/core/src/traffic/traffic-usage-tracker.ts
+new file mode 100644
+index 00000000..c79b311a
+--- /dev/null
++++ b/packages/core/src/traffic/traffic-usage-tracker.ts
+@@ -0,0 +1,83 @@
++import type { Logger } from "../logger";
++import type { QueuedRequest } from "./traffic-controller-internal";
++import { isPromiseLike } from "./traffic-error-utils";
++import type { TenantUsage } from "./traffic-types";
++
++type UsageCounters = {
++  inputTokens?: number;
++  outputTokens?: number;
++  totalTokens?: number;
++};
++
++export class TrafficUsageTracker {
++  private readonly tenantUsage = new Map<string, TenantUsage>();
++
++  getTenantUsage(tenantId: string): TenantUsage | undefined {
++    const usage = this.tenantUsage.get(tenantId);
++    return usage ? { ...usage } : undefined;
++  }
++
++  recordUsage<TResponse>(
++    item: QueuedRequest<TResponse>,
++    result: TResponse,
++    logger?: Logger,
++  ): UsageCounters | Promise<UsageCounters> | undefined {
++    const usageLogger = logger?.child({ module: "usage-tracker" });
++    const extractor = item.extractUsage ?? item.request.extractUsage;
++    if (!extractor) {
++      usageLogger?.trace?.("No usage extractor; skipping usage", { tenantId: item.tenantId });
++      return undefined;
++    }
++
++    const usage = extractor(result);
++    if (!usage) {
++      usageLogger?.trace?.("Usage extractor returned empty; skipping usage", {
++        tenantId: item.tenantId,
++      });
++      return undefined;
++    }
++
++    if (isPromiseLike(usage)) {
++      usageLogger?.trace?.("Usage extractor returned promise; awaiting", {
++        tenantId: item.tenantId,
++      });
++      void usage.then((u) => u && this.incrementTenantUsage(item.tenantId, u, usageLogger));
++      return usage;
++    }
++    this.incrementTenantUsage(item.tenantId, usage, usageLogger);
++    return usage;
++  }
++
++  private incrementTenantUsage(tenantId: string, usage: UsageCounters, logger?: Logger): void {
++    const current = this.tenantUsage.get(tenantId) ??
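++      // First usage recorded for a tenant starts from zeroed counters.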
{ ++ inputTokens: 0, ++ outputTokens: 0, ++ totalTokens: 0, ++ }; ++ ++ const input = ++ typeof usage.inputTokens === "number" && Number.isFinite(usage.inputTokens) ++ ? usage.inputTokens ++ : 0; ++ const output = ++ typeof usage.outputTokens === "number" && Number.isFinite(usage.outputTokens) ++ ? usage.outputTokens ++ : 0; ++ const total = ++ typeof usage.totalTokens === "number" && Number.isFinite(usage.totalTokens) ++ ? usage.totalTokens ++ : input + output; ++ ++ this.tenantUsage.set(tenantId, { ++ inputTokens: current.inputTokens + input, ++ outputTokens: current.outputTokens + output, ++ totalTokens: current.totalTokens + total, ++ }); ++ ++ logger?.debug?.("Tenant usage incremented", { ++ tenantId, ++ delta: { inputTokens: input, outputTokens: output, totalTokens: total }, ++ total: this.tenantUsage.get(tenantId), ++ }); ++ } ++} +diff --git a/packages/core/src/workflow/core.ts b/packages/core/src/workflow/core.ts +index 3136511c..2b273d58 100644 +--- a/packages/core/src/workflow/core.ts ++++ b/packages/core/src/workflow/core.ts +@@ -827,6 +827,9 @@ export function createWorkflow< + + // Wrap entire execution in root span + const rootSpan = traceContext.getRootSpan(); ++ if (options?.tenantId) { ++ rootSpan.setAttribute("tenant.id", options.tenantId); ++ } + + // Add workflow state snapshot for remote observability + const workflowState = { +@@ -848,6 +851,7 @@ export function createWorkflow< + executionId, + userId: options?.userId, + conversationId: options?.conversationId, ++ tenantId: options?.tenantId, + traceId: rootSpan.spanContext().traceId, + spanId: rootSpan.spanContext().spanId, + }); +diff --git a/packages/core/src/workflow/internal/state.ts b/packages/core/src/workflow/internal/state.ts +index 71fa602d..2de12528 100644 +--- a/packages/core/src/workflow/internal/state.ts ++++ b/packages/core/src/workflow/internal/state.ts +@@ -23,6 +23,7 @@ export type WorkflowState = { + executionId: string; + conversationId?: string; + userId?: string; ++ tenantId?: string; + context?: UserContext; + active: number; + startAt: Date; +@@ -132,6 +133,7 @@ class WorkflowStateManagerInternal implements WorkflowStateManager + active: config?.active ?? 0, + userId: config?.userId, + conversationId: config?.conversationId, ++ tenantId: config?.tenantId, + context: config?.context, + startAt: new Date(), + endAt: null, +diff --git a/packages/core/src/workflow/internal/utils.ts b/packages/core/src/workflow/internal/utils.ts +index fc39530b..42250d82 100644 +--- a/packages/core/src/workflow/internal/utils.ts ++++ b/packages/core/src/workflow/internal/utils.ts +@@ -32,6 +32,7 @@ export function convertWorkflowStateToParam( + executionId: state.executionId, + conversationId: state.conversationId, + userId: state.userId, ++ tenantId: state.tenantId, + context: state.context, + active: state.active, + startAt: state.startAt, +diff --git a/packages/core/src/workflow/steps/and-agent.ts b/packages/core/src/workflow/steps/and-agent.ts +index bc46c148..14af9b8f 100644 +--- a/packages/core/src/workflow/steps/and-agent.ts ++++ b/packages/core/src/workflow/steps/and-agent.ts +@@ -66,6 +66,7 @@ export function andAgent( + context: restConfig.context ?? state.context, + conversationId: restConfig.conversationId ?? state.conversationId, + userId: restConfig.userId ?? state.userId, ++ tenantId: restConfig.tenantId ?? 
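++        // Step-level tenantId takes precedence; otherwise inherit the workflow run's tenant.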
state.tenantId, + // No parentSpan when there's no workflow context + }); + // Accumulate usage if available (no workflow context) +@@ -92,6 +93,7 @@ export function andAgent( + context: restConfig.context ?? state.context, + conversationId: restConfig.conversationId ?? state.conversationId, + userId: restConfig.userId ?? state.userId, ++ tenantId: restConfig.tenantId ?? state.tenantId, + // Pass the current step span as parent for proper span hierarchy + parentSpan: state.workflowContext?.currentStepSpan, + }); +diff --git a/packages/core/src/workflow/types.ts b/packages/core/src/workflow/types.ts +index f7eed282..49bfd8cb 100644 +--- a/packages/core/src/workflow/types.ts ++++ b/packages/core/src/workflow/types.ts +@@ -214,6 +214,10 @@ export interface WorkflowRunOptions { + * The conversation ID, this can be used to track the current conversation in a workflow + */ + conversationId?: string; ++ /** ++ * Tenant identifier propagated to agent steps and subcalls ++ */ ++ tenantId?: string; + /** + * The user ID, this can be used to track the current user in a workflow + */ +diff --git a/packages/scorers/src/llm/answer-correctness.ts b/packages/scorers/src/llm/answer-correctness.ts +index 2111fa31..d66cc007 100644 +--- a/packages/scorers/src/llm/answer-correctness.ts ++++ b/packages/scorers/src/llm/answer-correctness.ts +@@ -7,6 +7,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const ANSWER_CORRECTNESS_PROMPT = `Given a ground truth and an answer, analyze each statement in the answer and classify them in one of the following categories: + +@@ -84,15 +85,17 @@ export function createAnswerCorrectnessScorer< + const agent = new Agent({ + name: "answer-correctness-classifier", + model, ++ trafficPriority: "P2", + instructions: "You classify statements for answer correctness evaluation", + }); + ++ const tenantId = extractTenantId(context); + const payload = resolvePayload(context, buildPayload); + const prompt = ANSWER_CORRECTNESS_PROMPT.replace("{{question}}", payload.input) + .replace("{{answer}}", payload.output) + .replace("{{ground_truth}}", payload.expected); + +- const response = await agent.generateObject(prompt, CLASSIFICATION_SCHEMA); ++ const response = await agent.generateObject(prompt, CLASSIFICATION_SCHEMA, { tenantId }); + const normalized = normalizeClassification(response.object); + + return { +diff --git a/packages/scorers/src/llm/answer-relevancy.ts b/packages/scorers/src/llm/answer-relevancy.ts +index a3de2237..d9bda1c9 100644 +--- a/packages/scorers/src/llm/answer-relevancy.ts ++++ b/packages/scorers/src/llm/answer-relevancy.ts +@@ -8,6 +8,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const QUESTION_GEN_PROMPT = `Generate a question for the given answer and Identify if answer is noncommittal. Give noncommittal as 1 if the answer is noncommittal and 0 if the answer is committal. A noncommittal answer is one that is evasive, vague, or ambiguous. 
For example, "I don't know" or "I'm not sure" are noncommittal answers + +@@ -119,9 +120,11 @@ export function createAnswerRelevancyScorer< + const agent = new Agent({ + name: "question-generator", + model, ++ trafficPriority: "P2", + instructions: "You generate questions from answers to evaluate relevancy", + }); + ++ const tenantId = extractTenantId(context); + const payload = resolvePayload(context, buildPayload); + const questions: GeneratedQuestion[] = []; + +@@ -131,7 +134,7 @@ export function createAnswerRelevancyScorer< + payload.context, + ); + +- const response = await agent.generateObject(prompt, QUESTION_SCHEMA); ++ const response = await agent.generateObject(prompt, QUESTION_SCHEMA, { tenantId }); + questions.push({ + question: response.object.question, + noncommittal: response.object.noncommittal === 1, +diff --git a/packages/scorers/src/llm/classifiers.ts b/packages/scorers/src/llm/classifiers.ts +index 1bca4239..a327e20d 100644 +--- a/packages/scorers/src/llm/classifiers.ts ++++ b/packages/scorers/src/llm/classifiers.ts +@@ -7,6 +7,7 @@ import { + } from "@voltagent/core"; + import { safeStringify } from "@voltagent/internal/utils"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + type ChoiceId = string; + +@@ -93,11 +94,14 @@ async function evaluateChoice(args: EvaluateChoiceArgs): Promise + const agent = new Agent({ + name: `${scorerId}-judge`, + model, ++ trafficPriority: "P2", + instructions: judgeInstructions ?? buildDefaultChoiceInstructions(Object.keys(choices)), + }); + ++ const tenantId = extractTenantId(context); + const response = await agent.generateObject(prompt, CHOICE_RESPONSE_SCHEMA, { + maxOutputTokens, ++ tenantId, + }); + + const { choice, reason } = extractChoiceFromResponse(response.object, choices, scorerId); +diff --git a/packages/scorers/src/llm/context-precision.ts b/packages/scorers/src/llm/context-precision.ts +index d31b5b85..ba680f56 100644 +--- a/packages/scorers/src/llm/context-precision.ts ++++ b/packages/scorers/src/llm/context-precision.ts +@@ -7,6 +7,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const CONTEXT_PRECISION_PROMPT = `Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output. + +@@ -109,6 +110,7 @@ export function createContextPrecisionScorer< + const agent = new Agent({ + name: "context-precision-evaluator", + model, ++ trafficPriority: "P2", + instructions: "You evaluate if context was useful for arriving at the answer", + }); + +@@ -116,12 +118,15 @@ export function createContextPrecisionScorer< + const contextText = Array.isArray(payload.context) + ? 
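++      // Retrieval context may arrive as an array of chunks; join them into one block for the prompt.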
payload.context.join("\n") + : payload.context; ++ const tenantId = extractTenantId(context); + + const prompt = CONTEXT_PRECISION_PROMPT.replace("{{question}}", payload.input) + .replace("{{context}}", contextText) + .replace("{{answer}}", payload.output); + +- const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA); ++ const response = await agent.generateObject(prompt, CONTEXT_PRECISION_SCHEMA, { ++ tenantId, ++ }); + + context.results.raw.contextPrecisionVerdict = response.object; + +diff --git a/packages/scorers/src/llm/context-recall.ts b/packages/scorers/src/llm/context-recall.ts +index e6e86510..2c6053fc 100644 +--- a/packages/scorers/src/llm/context-recall.ts ++++ b/packages/scorers/src/llm/context-recall.ts +@@ -7,6 +7,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const CONTEXT_RECALL_EXTRACT_PROMPT = `Given the context and ground truth (expected output), extract all factual statements from the ground truth. + +@@ -120,6 +121,7 @@ export function createContextRecallScorer< + const agent = new Agent({ + name: "context-recall-evaluator", + model, ++ trafficPriority: "P2", + instructions: "You evaluate how well provided context supports factual statements", + }); + +@@ -127,6 +129,7 @@ export function createContextRecallScorer< + const contextText = Array.isArray(payload.context) + ? payload.context.join("\n") + : payload.context; ++ const tenantId = extractTenantId(context); + + // Extract statements from expected output + const extractPrompt = CONTEXT_RECALL_EXTRACT_PROMPT.replace( +@@ -134,7 +137,9 @@ export function createContextRecallScorer< + contextText, + ).replace("{{expected}}", payload.expected); + +- const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA); ++ const extractResponse = await agent.generateObject(extractPrompt, EXTRACT_SCHEMA, { ++ tenantId, ++ }); + const statements = extractResponse.object.statements; + + if (statements.length === 0) { +@@ -152,7 +157,9 @@ export function createContextRecallScorer< + contextText, + ).replace("{{statement}}", statement); + +- const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA); ++ const verifyResponse = await agent.generateObject(verifyPrompt, VERIFY_SCHEMA, { ++ tenantId, ++ }); + verdicts.push({ + statement, + verdict: verifyResponse.object.verdict, +diff --git a/packages/scorers/src/llm/context-relevancy.ts b/packages/scorers/src/llm/context-relevancy.ts +index ee882b5b..aca608b2 100644 +--- a/packages/scorers/src/llm/context-relevancy.ts ++++ b/packages/scorers/src/llm/context-relevancy.ts +@@ -7,6 +7,7 @@ import { + import { safeStringify } from "@voltagent/internal/utils"; + import type { LanguageModel } from "ai"; + import { z } from "zod"; ++import { extractTenantId } from "./utils"; + + const CONTEXT_RELEVANCY_PROMPT = `Analyze the provided context and identify which parts are relevant to answering the given question. For each context sentence or passage, determine its relevance level. + +@@ -144,6 +145,7 @@ export function createContextRelevancyScorer< + const agent = new Agent({ + name: "context-relevancy-evaluator", + model, ++ trafficPriority: "P2", + instructions: "You evaluate how relevant provided context is to answering questions", + }); + +@@ -151,13 +153,16 @@ export function createContextRelevancyScorer< + const contextText = Array.isArray(payload.context) + ? 
payload.context.join("\n")
+       : payload.context;
++    const tenantId = extractTenantId(context);
+ 
+     const prompt = CONTEXT_RELEVANCY_PROMPT.replace("{{question}}", payload.input).replace(
+       "{{context}}",
+       contextText,
+     );
+ 
+-    const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA);
++    const response = await agent.generateObject(prompt, CONTEXT_RELEVANCY_SCHEMA, {
++      tenantId,
++    });
+     const evaluations = response.object.evaluations;
+ 
+     context.results.raw.contextRelevancyEvaluations = evaluations;
+diff --git a/packages/scorers/src/llm/moderation.ts b/packages/scorers/src/llm/moderation.ts
+index 03563bfe..1055927f 100644
+--- a/packages/scorers/src/llm/moderation.ts
++++ b/packages/scorers/src/llm/moderation.ts
+@@ -7,6 +7,7 @@ import {
+ } from "@voltagent/core";
+ import { safeStringify } from "@voltagent/internal/utils";
+ import { z } from "zod";
++import { extractTenantId } from "./utils";
+ 
+ export interface ModerationScorerOptions {
+   id?: string;
+@@ -220,6 +221,7 @@ async function runModerationJudge(args: {
+     typeof context.results.prepare === "string"
+       ? context.results.prepare
+       : normalizeText(context.payload.output);
++  const tenantId = extractTenantId(context);
+ 
+   const prompt = await buildPrompt({
+     output: normalizedOutput,
+@@ -232,12 +234,14 @@ async function runModerationJudge(args: {
+   const agent = new Agent({
+     name: "moderation-judge",
+     model,
++    trafficPriority: "P2",
+     instructions:
+       "You are a safety classifier. Respond with JSON that matches the provided schema containing flagged, scores, and reason.",
+   });
+ 
+   const response = await agent.generateObject(prompt, MODERATION_SCHEMA, {
+     maxOutputTokens,
++    tenantId,
+   });
+ 
+   const parsed = mapModerationResponse(response.object, threshold);
+diff --git a/packages/scorers/src/llm/utils.ts b/packages/scorers/src/llm/utils.ts
+new file mode 100644
+index 00000000..75e886e3
+--- /dev/null
++++ b/packages/scorers/src/llm/utils.ts
+@@ -0,0 +1,14 @@
++import type { BuilderPrepareContext, BuilderScoreContext } from "@voltagent/core";
++
++type TenantAwareContext = BuilderScoreContext<Record<string, unknown>, Record<string, unknown>> &
++  BuilderPrepareContext<Record<string, unknown>, Record<string, unknown>>;
++
++export function extractTenantId(
++  context:
++    | BuilderScoreContext<Record<string, unknown>, Record<string, unknown>>
++    | BuilderPrepareContext<Record<string, unknown>, Record<string, unknown>>
++    | TenantAwareContext,
++): string | undefined {
++  const candidate = (context.payload as { tenantId?: unknown })?.tenantId;
++  return typeof candidate === "string" ?
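++    // Non-string values are ignored so a malformed payload cannot inject a tenant id.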
++    candidate : undefined;
++}
+diff --git a/packages/server-core/src/handlers/agent.handlers.ts b/packages/server-core/src/handlers/agent.handlers.ts
+index 00c0f2ee..37fbeaf4 100644
+--- a/packages/server-core/src/handlers/agent.handlers.ts
++++ b/packages/server-core/src/handlers/agent.handlers.ts
+@@ -1,11 +1,70 @@
+-import { ClientHTTPError, type ServerProviderDeps } from "@voltagent/core";
+-import { convertUsage } from "@voltagent/core";
++import {
++  ClientHTTPError,
++  type ServerProviderDeps,
++  type TrafficResponseMetadata,
++  convertUsage,
++} from "@voltagent/core";
+ import { type Logger, safeStringify } from "@voltagent/internal";
+ import { z } from "zod";
+ import { convertJsonSchemaToZod } from "zod-from-json-schema";
+ import { convertJsonSchemaToZod as convertJsonSchemaToZodV3 } from "zod-from-json-schema-v3";
+ import type { ApiResponse } from "../types";
+ import { processAgentOptions } from "../utils/options";
++import { buildTrafficHeaders } from "../utils/traffic";
++
++function extractTrafficMetadata(value: unknown): TrafficResponseMetadata | undefined {
++  if (!value || typeof value !== "object") return undefined;
++  const traffic = (value as { traffic?: unknown }).traffic;
++  if (!traffic || typeof traffic !== "object") return undefined;
++  return traffic as TrafficResponseMetadata;
++}
++
++function wrapStreamWithTraffic(
++  baseResponse: Response,
++  traffic?: TrafficResponseMetadata,
++): Response {
++  if (!traffic) return baseResponse;
++  const headers = new Headers(baseResponse.headers);
++  const trafficHeaders = buildTrafficHeaders(traffic);
++  for (const [key, value] of Object.entries(trafficHeaders)) {
++    headers.set(key, value);
++  }
++  const baseBody = baseResponse.body;
++  if (!baseBody) {
++    return new Response(baseBody, {
++      status: baseResponse.status,
++      headers,
++    });
++  }
++
++  const encoder = new TextEncoder();
++  const stream = new ReadableStream({
++    async start(controller) {
++      const trafficEvent = `data: ${safeStringify({ type: "traffic", traffic })}\n\n`;
++      controller.enqueue(encoder.encode(trafficEvent));
++      const reader = baseBody.getReader();
++      try {
++        while (true) {
++          const { done, value } = await reader.read();
++          if (done) break;
++          if (value !== undefined) {
++            controller.enqueue(value);
++          }
++        }
++        // Close only on the success path; closing an already-errored controller throws.
++        controller.close();
++      } catch (error) {
++        controller.error(error);
++      } finally {
++        reader.releaseLock();
++      }
++    },
++  });
++
++  return new Response(stream, {
++    status: baseResponse.status,
++    headers,
++  });
++}
+ 
+ /**
+  * Handler for listing all agents
+@@ -79,6 +138,7 @@ export async function handleGenerateText(
+     const options = processAgentOptions(body, signal);
+ 
+     const result = await agent.generateText(input, options);
++    const traffic = extractTrafficMetadata(result);
+ 
+     // Convert usage format if present
+     const usage = result.usage ? convertUsage(result.usage) : undefined;
+@@ -102,9 +162,11 @@
+           }
+         })(),
+       },
++      traffic,
+     };
+   } catch (error) {
+     logger.error("Failed to generate text", { error });
++    const traffic = extractTrafficMetadata(error);
+     if (error instanceof ClientHTTPError) {
+       return {
+         success: false,
+@@ -112,11 +174,13 @@
+         code: error.code,
+         name: error.name,
+         httpStatus: error.httpStatus,
++        traffic,
+       };
+     }
+     return {
+       success: false,
+       error: error instanceof Error ?
error.message : "Unknown error", ++ traffic, + }; + } + } +@@ -153,6 +217,7 @@ export async function handleStreamText( + const options = processAgentOptions(body, signal); + + const result = await agent.streamText(input, options); ++ const traffic = extractTrafficMetadata(result); + + // Access the fullStream property + const { fullStream } = result; +@@ -178,7 +243,7 @@ export async function handleStreamText( + }, + }); + +- return new Response(stream, { ++ const response = new Response(stream, { + status: 200, + headers: { + "Content-Type": "text/event-stream", +@@ -186,20 +251,25 @@ export async function handleStreamText( + Connection: "keep-alive", + }, + }); ++ return wrapStreamWithTraffic(response, traffic); + } catch (error) { + logger.error("Failed to handle stream text request", { error }); + + const errorMessage = error instanceof Error ? error.message : "Unknown error"; ++ const traffic = extractTrafficMetadata(error); ++ const trafficHeaders = buildTrafficHeaders(traffic); + + return new Response( + safeStringify({ + error: errorMessage, + message: errorMessage, ++ traffic, + }), + { + status: 500, + headers: { + "Content-Type": "application/json", ++ ...trafficHeaders, + }, + }, + ); +@@ -238,26 +308,32 @@ export async function handleChatStream( + const options = processAgentOptions(body, signal); + + const result = await agent.streamText(input, options); ++ const traffic = extractTrafficMetadata(result); + + // Use the built-in toUIMessageStreamResponse - it handles errors properly +- return result.toUIMessageStreamResponse({ ++ const response = result.toUIMessageStreamResponse({ + sendReasoning: true, + sendSources: true, + }); ++ return wrapStreamWithTraffic(response, traffic); + } catch (error) { + logger.error("Failed to handle chat stream request", { error }); + + const errorMessage = error instanceof Error ? error.message : "Unknown error"; ++ const traffic = extractTrafficMetadata(error); ++ const trafficHeaders = buildTrafficHeaders(traffic); + + return new Response( + safeStringify({ + error: errorMessage, + message: errorMessage, ++ traffic, + }), + { + status: 500, + headers: { + "Content-Type": "application/json", ++ ...trafficHeaders, + }, + }, + ); +@@ -293,16 +369,20 @@ export async function handleGenerateObject( + ) as any; + + const result = await agent.generateObject(input, zodSchema, options); ++ const traffic = extractTrafficMetadata(result); + + return { + success: true, + data: result.object, ++ traffic, + }; + } catch (error) { + logger.error("Failed to generate object", { error }); ++ const traffic = extractTrafficMetadata(error); + return { + success: false, + error: error instanceof Error ? error.message : "Unknown error", ++ traffic, + }; + } + } +@@ -344,23 +424,29 @@ export async function handleStreamObject( + ) as any; + + const result = await agent.streamObject(input, zodSchema, options); ++ const traffic = extractTrafficMetadata(result); + + // Use the built-in toTextStreamResponse - it handles errors properly +- return result.toTextStreamResponse(); ++ const response = result.toTextStreamResponse(); ++ return wrapStreamWithTraffic(response, traffic); + } catch (error) { + logger.error("Failed to handle stream object request", { error }); + + const errorMessage = error instanceof Error ? 
error.message : "Unknown error";
++    const traffic = extractTrafficMetadata(error);
++    const trafficHeaders = buildTrafficHeaders(traffic);
+ 
+     return new Response(
+       safeStringify({
+         error: errorMessage,
+         message: errorMessage,
++        traffic,
+       }),
+       {
+         status: 500,
+         headers: {
+           "Content-Type": "application/json",
++          ...trafficHeaders,
+         },
+       },
+     );
+diff --git a/packages/server-core/src/index.ts b/packages/server-core/src/index.ts
+index 1fe7e206..2f7ed826 100644
+--- a/packages/server-core/src/index.ts
++++ b/packages/server-core/src/index.ts
+@@ -40,6 +40,7 @@ export * from "./utils/server-utils";
+ export * from "./utils/ui-templates";
+ export * from "./utils/response-mappers";
+ export * from "./utils/sse";
++export * from "./utils/traffic";
+ export * from "./utils/announcements";
+ 
+ // Export WebSocket utilities
+diff --git a/packages/server-core/src/schemas/agent.schemas.ts b/packages/server-core/src/schemas/agent.schemas.ts
+index 52e80b83..41181e00 100644
+--- a/packages/server-core/src/schemas/agent.schemas.ts
++++ b/packages/server-core/src/schemas/agent.schemas.ts
+@@ -77,6 +77,18 @@ export const GenerateOptionsSchema = z
+   .object({
+     userId: z.string().optional().describe("Optional user ID for context tracking"),
+     conversationId: z.string().optional().describe("Optional conversation ID for context tracking"),
++    tenantId: z.string().optional().describe("Optional tenant ID for traffic limits"),
++    trafficPriority: z
++      .enum(["P0", "P1", "P2"])
++      .optional()
++      .describe("Optional traffic priority for scheduling (P0, P1, P2)"),
++    apiKeyId: z.string().optional().describe("Optional API key identifier for traffic limits"),
++    region: z.string().optional().describe("Optional region identifier for traffic limits"),
++    endpoint: z.string().optional().describe("Optional endpoint identifier for traffic limits"),
++    tenantTier: z
++      .string()
++      .optional()
++      .describe("Optional tenant tier identifier for traffic limits"),
+     context: z
+       .record(z.string(), z.unknown())
+       .nullish()
+@@ -94,6 +106,14 @@ export const GenerateOptionsSchema = z
+       .positive()
+       .optional()
+       .describe("Maximum number of steps for this request"),
++    maxQueueWaitMs: z
++      .number()
++      .int()
++      .nonnegative()
++      .optional()
++      .describe("Maximum time to wait in the queue before timing out (ms)"),
++    taskType: z.string().optional().describe("Optional task classification for fallback policy"),
++    fallbackPolicyId: z.string().optional().describe("Optional explicit fallback policy id"),
+     temperature: z
+       .number()
+       .min(0)
+diff --git a/packages/server-core/src/types/responses.ts b/packages/server-core/src/types/responses.ts
+index 2098c2f6..4935a535 100644
+--- a/packages/server-core/src/types/responses.ts
++++ b/packages/server-core/src/types/responses.ts
+@@ -1,10 +1,12 @@
+ /**
+  * Framework-agnostic response types for server handlers
+  */
++import type { TrafficResponseMetadata } from "@voltagent/core";
+ 
+ export interface SuccessResponse<T> {
+   success: true;
+   data: T;
++  traffic?: TrafficResponseMetadata;
+ }
+ 
+ export interface ErrorResponse {
+@@ -13,6 +15,7 @@ export interface ErrorResponse {
+   httpStatus?: number;
+   code?: string;
+   name?: string;
++  traffic?: TrafficResponseMetadata;
+ }
+ 
+ export type ApiResponse<T> = SuccessResponse<T> | ErrorResponse;
+diff --git a/packages/server-core/src/utils/traffic.ts b/packages/server-core/src/utils/traffic.ts
+new file mode 100644
+index 00000000..f9be1845
+--- /dev/null
++++ b/packages/server-core/src/utils/traffic.ts
+@@ -0,0 +1,35 @@
++import type
{ TrafficResponseMetadata } from "@voltagent/core"; ++ ++export function buildTrafficHeaders(traffic?: TrafficResponseMetadata): Record { ++ if (!traffic) return {}; ++ ++ const headers: Record = {}; ++ ++ if (typeof traffic.retryAfterMs === "number" && Number.isFinite(traffic.retryAfterMs)) { ++ headers["Retry-After"] = String(Math.max(0, Math.ceil(traffic.retryAfterMs / 1000))); ++ } ++ ++ if (traffic.rateLimitRemaining !== undefined) { ++ headers["X-RateLimit-Remaining"] = String(traffic.rateLimitRemaining); ++ } ++ ++ if (typeof traffic.rateLimitResetAt === "number" && Number.isFinite(traffic.rateLimitResetAt)) { ++ headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(traffic.rateLimitResetAt / 1000))); ++ } else if ( ++ typeof traffic.rateLimitResetInMs === "number" && ++ Number.isFinite(traffic.rateLimitResetInMs) ++ ) { ++ const resetAt = Date.now() + Math.max(0, traffic.rateLimitResetInMs); ++ headers["X-RateLimit-Reset"] = String(Math.max(0, Math.ceil(resetAt / 1000))); ++ } ++ ++ if (traffic.queueEtaMs !== undefined) { ++ headers["X-Queue-ETA"] = String(traffic.queueEtaMs); ++ } ++ ++ if (traffic.rateLimitKey) { ++ headers["X-RateLimit-Key"] = traffic.rateLimitKey; ++ } ++ ++ return headers; ++} +diff --git a/packages/server-hono/src/routes/index.ts b/packages/server-hono/src/routes/index.ts +index a5af8214..336a5bf4 100644 +--- a/packages/server-hono/src/routes/index.ts ++++ b/packages/server-hono/src/routes/index.ts +@@ -2,6 +2,7 @@ import type { ServerProviderDeps } from "@voltagent/core"; + import type { Logger } from "@voltagent/internal"; + import { + UPDATE_ROUTES, ++ buildTrafficHeaders, + handleCancelWorkflow, + handleChatStream, + handleCheckUpdates, +@@ -87,11 +88,12 @@ export function registerAgentRoutes( + + const signal = c.req.raw.signal; + const response = await handleGenerateText(agentId, body, deps, logger, signal); ++ const trafficHeaders = buildTrafficHeaders(response.traffic); + if (!response.success) { + const { httpStatus, ...details } = response; +- return c.json(details, httpStatus || 500); ++ return c.json(details, httpStatus || 500, trafficHeaders); + } +- return c.json(response, 200); ++ return c.json(response, 200, trafficHeaders); + }); + + // POST /agents/:id/stream - Stream text (raw fullStream SSE) +@@ -131,11 +133,12 @@ export function registerAgentRoutes( + const body = await c.req.json(); + const signal = c.req.raw.signal; + const response = await handleGenerateObject(agentId, body, deps, logger, signal); ++ const trafficHeaders = buildTrafficHeaders(response.traffic); + if (!response.success) { + const { httpStatus, ...details } = response; +- return c.json(details, httpStatus || 500); ++ return c.json(details, httpStatus || 500, trafficHeaders); + } +- return c.json(response, 200); ++ return c.json(response, 200, trafficHeaders); + }); + + // POST /agents/:id/stream-object - Stream object +diff --git a/packages/serverless-hono/src/routes.ts b/packages/serverless-hono/src/routes.ts +index d377ce4b..39eabcf7 100644 +--- a/packages/serverless-hono/src/routes.ts ++++ b/packages/serverless-hono/src/routes.ts +@@ -28,6 +28,7 @@ import { + type TriggerHttpRequestContext, + UPDATE_ROUTES, + WORKFLOW_ROUTES, ++ buildTrafficHeaders, + executeA2ARequest, + executeTriggerHandler, + getConversationMessagesHandler, +@@ -165,7 +166,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger: + } + const signal = c.req.raw.signal; + const response = await handleGenerateText(agentId, body, deps, logger, signal); +- return 
c.json(response, response.success ? 200 : 500); ++ const trafficHeaders = buildTrafficHeaders(response.traffic); ++ return c.json(response, response.success ? 200 : 500, trafficHeaders); + }); + + app.post(AGENT_ROUTES.streamText.path, async (c) => { +@@ -197,7 +199,8 @@ export function registerAgentRoutes(app: Hono, deps: ServerProviderDeps, logger: + } + const signal = c.req.raw.signal; + const response = await handleGenerateObject(agentId, body, deps, logger, signal); +- return c.json(response, response.success ? 200 : 500); ++ const trafficHeaders = buildTrafficHeaders(response.traffic); ++ return c.json(response, response.success ? 200 : 500, trafficHeaders); + }); + + app.post(AGENT_ROUTES.streamObject.path, async (c) => { +diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml +index 20029de4..6671d8c1 100644 +--- a/pnpm-lock.yaml ++++ b/pnpm-lock.yaml +@@ -37,7 +37,7 @@ importers: + version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) + '@nx/plugin': + specifier: 20.4.6 +- version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2) ++ version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2) + '@nx/vite': + specifier: 20.4.6 + version: 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2)(vite@7.2.7)(vitest@3.2.4) +@@ -92,6 +92,9 @@ importers: + syncpack: + specifier: ^13.0.2 + version: 13.0.4(typescript@5.9.2) ++ ts-node: ++ specifier: ^10.9.2 ++ version: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) + tslib: + specifier: ^2.3.0 + version: 2.8.1 +@@ -99,7 +102,7 @@ importers: + specifier: ^8.5.0 + version: 8.5.0(@swc/core@1.5.29)(typescript@5.9.2) + typescript: +- specifier: ^5.8.2 ++ specifier: ^5.9.2 + version: 5.9.2 + vite: + specifier: ^7.2.7 +@@ -2750,6 +2753,61 @@ importers: + specifier: ^0.5.3 + version: 0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7) + ++ examples/with-viteval/dist: ++ dependencies: ++ '@ai-sdk/openai': ++ specifier: ^2.0.52 ++ version: 2.0.85(zod@3.25.76) ++ '@voltagent/cli': ++ specifier: ^0.1.16 ++ version: link:../../../packages/cli ++ '@voltagent/core': ++ specifier: ^1.2.15 ++ version: link:../../../packages/core ++ '@voltagent/libsql': ++ specifier: ^1.0.13 ++ version: link:../../../packages/libsql ++ '@voltagent/logger': ++ specifier: ^1.0.4 ++ version: link:../../../packages/logger ++ '@voltagent/server-hono': ++ specifier: ^1.2.5 ++ version: link:../../../packages/server-hono ++ ai: ++ specifier: ^5.0.76 ++ version: 5.0.113(zod@3.25.76) ++ consola: ++ specifier: ^3.4.2 ++ version: 3.4.2 ++ envalid: ++ specifier: ^8.1.0 ++ version: 8.1.0 ++ yargs: ++ specifier: ^18.0.0 ++ version: 18.0.0 ++ zod: ++ specifier: ^3.25.76 ++ version: 3.25.76 ++ devDependencies: ++ '@tsconfig/node24': ++ specifier: ^24.0.1 ++ version: 24.0.1 ++ '@types/yargs': ++ specifier: ^17.0.33 ++ version: 17.0.33 ++ dotenv: ++ specifier: ^16.4.5 ++ version: 16.6.1 ++ tsx: ++ specifier: ^4.19.3 ++ version: 4.20.4 ++ typescript: ++ specifier: ^5.8.2 ++ version: 5.9.2 ++ viteval: ++ specifier: ^0.5.3 ++ version: 
0.5.3(@tanstack/query-core@5.89.0)(@tanstack/react-query@5.89.0)(@tanstack/router-core@1.131.44)(@types/node@24.2.1)(@types/react@19.2.7)(@vitejs/plugin-react@5.1.2)(tsx@4.20.4)(vite@7.2.7) ++ + examples/with-voice-elevenlabs: + dependencies: + '@ai-sdk/openai': +@@ -3509,7 +3567,7 @@ importers: + version: 3.2.4(vitest@3.2.4) + jest: + specifier: ^29.5.0 +- version: 29.7.0(@types/node@24.2.1) ++ version: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + ts-jest: + specifier: ^29.1.0 + version: 29.4.1(@babel/core@7.28.5)(esbuild@0.25.10)(jest@29.7.0)(typescript@5.9.2) +@@ -9966,7 +10024,7 @@ packages: + slash: 3.0.0 + dev: true + +- /@jest/core@29.7.0: ++ /@jest/core@29.7.0(ts-node@10.9.2): + resolution: {integrity: sha512-n7aeXWKMnGtDA48y8TLWJPJmLmmZ642Ceo78cYWEpiD7FzDgmNDV/GCVRorPABdXLJZ/9wzzgZAlHjXjxDHGsg==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: +@@ -9987,7 +10045,7 @@ packages: + exit: 0.1.2 + graceful-fs: 4.2.11 + jest-changed-files: 29.7.0 +- jest-config: 29.7.0(@types/node@24.6.2) ++ jest-config: 29.7.0(@types/node@24.6.2)(ts-node@10.9.2) + jest-haste-map: 29.7.0 + jest-message-util: 29.7.0 + jest-regex-util: 29.6.3 +@@ -12403,7 +12461,7 @@ packages: + - verdaccio + dev: true + +- /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2): ++ /@nx/jest@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2): + resolution: {integrity: sha512-yZOZJOQFtpdY3Fu/WYNoDx81TwvF9yfwvalFpLD19bz+2YGl7B89l0S1ZrtSRXFfKXA/w7gb0gmKwthJtQhx9Q==} + dependencies: + '@jest/reporters': 29.7.0 +@@ -12412,7 +12470,7 @@ packages: + '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) + '@phenomnomnominal/tsquery': 5.0.1(typescript@5.9.2) + identity-obj-proxy: 3.0.0 +- jest-config: 29.7.0(@types/node@24.2.1) ++ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + jest-resolve: 29.7.0 + jest-util: 29.7.0 + minimatch: 9.0.3 +@@ -12807,12 +12865,12 @@ packages: + dev: true + optional: true + +- /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(typescript@5.9.2): ++ /@nx/plugin@20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2): + resolution: {integrity: sha512-7Jlv+BVqGoO0BolQN7P5Z87160phuE1i7H6C8xFwQnlQ3ZfwQCJzk2dkg1UyzxDkWl6lvVsqBjZPXD55gFQ3+w==} + dependencies: + '@nx/devkit': 20.4.6(nx@20.8.2) + '@nx/eslint': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(eslint@9.33.0)(nx@20.8.2) +- '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) ++ '@nx/jest': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(ts-node@10.9.2)(typescript@5.9.2) + '@nx/js': 20.4.6(@swc-node/register@1.9.2)(@swc/core@1.5.29)(@types/node@24.2.1)(nx@20.8.2)(typescript@5.9.2) + tslib: 2.8.1 + transitivePeerDependencies: +@@ -17770,8 +17828,8 @@ packages: + '@babel/plugin-syntax-jsx': 7.27.1(@babel/core@7.28.5) + '@babel/plugin-syntax-typescript': 7.27.1(@babel/core@7.28.5) + '@babel/template': 7.27.2 +- '@babel/traverse': 7.28.4 +- '@babel/types': 7.28.4 ++ '@babel/traverse': 7.28.5 ++ '@babel/types': 7.28.5 + '@tanstack/react-router': 1.131.44(react-dom@19.2.3)(react@19.2.3) + '@tanstack/router-core': 1.131.44 + '@tanstack/router-generator': 1.131.44 +@@ -22783,7 +22841,7 @@ packages: + 
crc-32: 1.2.2 + readable-stream: 4.7.0 + +- /create-jest@29.7.0(@types/node@24.2.1): ++ /create-jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): + resolution: {integrity: sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + hasBin: true +@@ -22792,7 +22850,7 @@ packages: + chalk: 4.1.2 + exit: 0.1.2 + graceful-fs: 4.2.11 +- jest-config: 29.7.0(@types/node@24.2.1) ++ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + jest-util: 29.7.0 + prompts: 2.4.2 + transitivePeerDependencies: +@@ -27641,7 +27699,7 @@ packages: + - supports-color + dev: true + +- /jest-cli@29.7.0(@types/node@24.2.1): ++ /jest-cli@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): + resolution: {integrity: sha512-OVVobw2IubN/GSYsxETi+gOe7Ka59EFMR/twOU3Jb2GnKKeMGJB5SGUUrEz3SFVmJASUdZUzy83sLNNQ2gZslg==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + hasBin: true +@@ -27651,14 +27709,14 @@ packages: + node-notifier: + optional: true + dependencies: +- '@jest/core': 29.7.0 ++ '@jest/core': 29.7.0(ts-node@10.9.2) + '@jest/test-result': 29.7.0 + '@jest/types': 29.6.3 + chalk: 4.1.2 +- create-jest: 29.7.0(@types/node@24.2.1) ++ create-jest: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + exit: 0.1.2 + import-local: 3.2.0 +- jest-config: 29.7.0(@types/node@24.2.1) ++ jest-config: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + jest-util: 29.7.0 + jest-validate: 29.7.0 + yargs: 17.7.2 +@@ -27669,7 +27727,7 @@ packages: + - ts-node + dev: true + +- /jest-config@29.7.0(@types/node@24.2.1): ++ /jest-config@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): + resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: +@@ -27704,12 +27762,13 @@ packages: + pretty-format: 29.7.0 + slash: 3.0.0 + strip-json-comments: 3.1.1 ++ ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) + transitivePeerDependencies: + - babel-plugin-macros + - supports-color + dev: true + +- /jest-config@29.7.0(@types/node@24.6.2): ++ /jest-config@29.7.0(@types/node@24.6.2)(ts-node@10.9.2): + resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: +@@ -27744,6 +27803,7 @@ packages: + pretty-format: 29.7.0 + slash: 3.0.0 + strip-json-comments: 3.1.1 ++ ts-node: 10.9.2(@swc/core@1.5.29)(@types/node@24.2.1)(typescript@5.9.2) + transitivePeerDependencies: + - babel-plugin-macros + - supports-color +@@ -28041,7 +28101,7 @@ packages: + supports-color: 8.1.1 + dev: true + +- /jest@29.7.0(@types/node@24.2.1): ++ /jest@29.7.0(@types/node@24.2.1)(ts-node@10.9.2): + resolution: {integrity: sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + hasBin: true +@@ -28051,10 +28111,10 @@ packages: + node-notifier: + optional: true + dependencies: +- '@jest/core': 29.7.0 ++ '@jest/core': 29.7.0(ts-node@10.9.2) + '@jest/types': 29.6.3 + import-local: 3.2.0 +- jest-cli: 29.7.0(@types/node@24.2.1) ++ jest-cli: 29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + transitivePeerDependencies: + - '@types/node' + - babel-plugin-macros +@@ -36767,7 +36827,7 @@ packages: + esbuild: 0.25.10 + fast-json-stable-stringify: 2.1.0 + handlebars: 4.7.8 +- jest: 29.7.0(@types/node@24.2.1) ++ jest: 
29.7.0(@types/node@24.2.1)(ts-node@10.9.2) + json5: 2.2.3 + lodash.memoize: 4.1.2 + make-error: 1.3.6 +diff --git a/tmp/test/traffic-concurrency.ts b/tmp/test/traffic-concurrency.ts +new file mode 100644 +index 00000000..d12fc5c9 +--- /dev/null ++++ b/tmp/test/traffic-concurrency.ts +@@ -0,0 +1,91 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController maxConcurrent scheduling. ++ * ++ * What to look for: ++ * - `inFlight` should never exceed `maxConcurrent`. ++ * - Requests should start in bursts up to `maxConcurrent`. ++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-concurrency.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-concurrency.ts (enable controller debug logs) ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++const maxConcurrent = 3; ++const controller = getTrafficController({ maxConcurrent }); ++ ++let inFlight = 0; ++let maxObserved = 0; ++ ++function makeModel(id: string, durationMs: number) { ++ return { ++ specificationVersion: "v2", ++ provider: "sim", ++ modelId: `concurrency-${id}`, ++ doGenerate: async () => { ++ inFlight += 1; ++ maxObserved = Math.max(maxObserved, inFlight); ++ console.log(`[${now()}] start ${id} inFlight=${inFlight}`); ++ ++ try { ++ await sleep(durationMs); ++ return { ++ content: [{ type: "text", text: `ok:${id}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId: `concurrency-${id}`, headers: {} }, ++ }; ++ } finally { ++ inFlight -= 1; ++ console.log(`[${now()}] end ${id} inFlight=${inFlight}`); ++ } ++ }, ++ }; ++} ++ ++async function main() { ++ console.log(`\n=== TrafficController concurrency (maxConcurrent=${maxConcurrent}) ===`); ++ void controller; ++ ++ const agent = new Agent({ ++ name: "traffic-concurrency", ++ instructions: "echo", ++ model: makeModel("base", 0), ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ const ids = ["A", "B", "C", "D", "E"]; ++ const jobs = ids.map((id) => ++ agent.generateText(id, { ++ tenantId: "default", ++ trafficPriority: "P1", ++ model: makeModel(id, 700), ++ }), ++ ); ++ ++ const settled = await Promise.allSettled(jobs); ++ console.log(`\n[done] maxObserved=${maxObserved}`); ++ console.log( ++ `[done] results=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), ++ )}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-fallback-chain.ts b/tmp/test/traffic-fallback-chain.ts +new file mode 100644 +index 00000000..0cd77b2b +--- /dev/null ++++ b/tmp/test/traffic-fallback-chain.ts +@@ -0,0 +1,168 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController circuit breaker + fallback chains. ++ * ++ * Scenarios: ++ * - Test 1: Open primary circuit (via repeated 429s), then route to fallback1. ++ * - Test 2: Open fallback1 circuit, then route to fallback2 (success). ++ * - Test 3: No fallback configured → CircuitBreakerOpenError. 
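++ * - Breaker threshold: 5 consecutive failures are assumed to open a circuit (see the notes in main()).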
++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-fallback-chain.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-fallback-chain.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { MockLanguageModelV2, MockProviderV2 } from "ai/test"; ++import { ++ Agent, ++ CircuitBreakerOpenError, ++ getTrafficController, ++} from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++type ModelId = "primary" | "fallback1" | "fallback2" | "no-fallback"; ++ ++const provider = "test-provider"; ++ ++const controller = getTrafficController({ ++ maxConcurrent: 1, ++ fallbackChains: { ++ primary: ["fallback1", "fallback2"], ++ fallback1: ["fallback2"], ++ }, ++}); ++ ++function makeAlways429Model(modelId: ModelId) { ++ let attempts = 0; ++ return new MockLanguageModelV2({ ++ provider, ++ modelId, ++ doGenerate: async () => { ++ attempts += 1; ++ console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> 429`); ++ await sleep(25); ++ const err: any = new Error(`forced 429 for model=${modelId} attempt=${attempts}`); ++ err.status = 429; ++ throw err; ++ }, ++ }); ++} ++ ++function makeAlwaysOkModel(modelId: ModelId) { ++ let attempts = 0; ++ return new MockLanguageModelV2({ ++ provider, ++ modelId, ++ doGenerate: async () => { ++ attempts += 1; ++ console.log(`[${now()}] doGenerate model=${modelId} attempt=${attempts} -> ok`); ++ await sleep(25); ++ return { ++ content: [{ type: "text", text: `ok:${modelId}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }); ++} ++ ++const primaryModel = makeAlways429Model("primary"); ++const fallback1Model = makeAlways429Model("fallback1"); ++const fallback2Model = makeAlwaysOkModel("fallback2"); ++const noFallbackModel = makeAlways429Model("no-fallback"); ++ ++// Required so Agent fallbacks (string model IDs) resolve without network calls. ++(globalThis as any).AI_SDK_DEFAULT_PROVIDER = new MockProviderV2({ ++ languageModels: { ++ primary: primaryModel, ++ fallback1: fallback1Model, ++ fallback2: fallback2Model, ++ "no-fallback": noFallbackModel, ++ }, ++}); ++ ++const primaryAgent = new Agent({ ++ name: "traffic-fallback-primary", ++ instructions: "echo", ++ model: primaryModel, ++ temperature: 0, ++ maxOutputTokens: 32, ++}); ++ ++const noFallbackAgent = new Agent({ ++ name: "traffic-fallback-none", ++ instructions: "echo", ++ model: noFallbackModel, ++ temperature: 0, ++ maxOutputTokens: 32, ++}); ++ ++async function runOnce(label: string, agent: any) { ++ console.log(`\n--- ${label} ---`); ++ try { ++ const result = await agent.generateText(label, { ++ tenantId: "default", ++ trafficPriority: "P1", ++ }); ++ console.log( ++ `[${label}] success text=${result.text} responseModel=${result.response?.modelId ?? "n/a"}`, ++ ); ++ } catch (err: any) { ++ if (err instanceof CircuitBreakerOpenError) { ++ console.log( ++ `[${label}] CircuitBreakerOpenError retryAfterMs=${err.retryAfterMs} msg=${err.message}`, ++ ); ++ } else { ++ console.log( ++ `[${label}] failed name=${err?.name ?? "Error"} status=${err?.status ?? err?.statusCode ?? 
"n/a"} msg=${err?.message}`, ++ ); ++ } ++ } ++} ++ ++async function main() { ++ console.log("\n=== Circuit breaker + fallback chain ==="); ++ void controller; ++ ++ console.log("\n[Test 1] Open primary circuit, then route to fallback1"); ++ // Two calls * (up to 3 retries each) ≈ 6 failures → should open the circuit (threshold=5). ++ await runOnce("primary-warmup-1", primaryAgent); ++ await runOnce("primary-warmup-2", primaryAgent); ++ await runOnce("primary-after-open", primaryAgent); // should execute fallback1 (still closed) ++ ++ console.log("\n[Test 2] Open fallback1 circuit, then route to fallback2"); ++ // Build enough failures on fallback1 by routing multiple requests to it via primary circuit-open path. ++ await runOnce("fallback1-warmup-1-via-primary", primaryAgent); ++ await runOnce("fallback1-warmup-2-via-primary", primaryAgent); ++ await runOnce("primary-should-hit-fallback2", primaryAgent); // should execute fallback2 and succeed ++ ++ console.log("\n[Test 3] No fallback configured → CircuitBreakerOpenError"); ++ await runOnce("no-fallback-warmup-1", noFallbackAgent); ++ await runOnce("no-fallback-warmup-2", noFallbackAgent); ++ await runOnce("no-fallback-after-open", noFallbackAgent); ++ ++ console.log("\n[debug] model call counts:"); ++ console.log( ++ safeStringify({ ++ primary: primaryModel.doGenerateCalls?.length, ++ fallback1: fallback1Model.doGenerateCalls?.length, ++ fallback2: fallback2Model.doGenerateCalls?.length, ++ "no-fallback": noFallbackModel.doGenerateCalls?.length, ++ }), ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-priority-openai-real.ts b/tmp/test/traffic-priority-openai-real.ts +new file mode 100644 +index 00000000..223263ba +--- /dev/null ++++ b/tmp/test/traffic-priority-openai-real.ts +@@ -0,0 +1,117 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController + AI SDK with real OpenAI calls. ++ * ++ * What this exercises: ++ * - Priority scheduling (P0/P1/P2) with `maxConcurrent=1` ++ * - Rate limit header ingestion via `updateRateLimitFromHeaders()` (if headers are present) ++ * - Tenant usage aggregation via `extractUsage` + `getTenantUsage()` ++ * ++ * Prereqs: ++ * - Set `OPENAI_API_KEY` ++ * ++ * Run: ++ * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts ++ * - VERBOSE=1 OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts ++ * ++ * Notes: ++ * - This will make real network calls and may incur cost. ++ */ ++ ++import { openai } from "@ai-sdk/openai"; ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const apiKey = process.env.OPENAI_API_KEY; ++if (!apiKey) { ++ console.error("Missing OPENAI_API_KEY. Example:"); ++ console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-priority-openai-real.ts"); ++ process.exit(1); ++} ++ ++const _now = () => new Date().toISOString(); ++const preview = (value: unknown, max = 140) => { ++ if (typeof value !== "string") return String(value ?? ""); ++ return value.length > max ? `${value.slice(0, max)}…` : value; ++}; ++ ++const tenantId = process.env.TENANT_ID ?? "openai-real"; ++const defaultModelId = process.env.OPENAI_MODEL ?? 
"gpt-4o-mini"; ++ ++const controller = getTrafficController({ maxConcurrent: 1 }); ++ ++function getHeader(headers: any, name: string): string | undefined { ++ if (!headers) return undefined; ++ if (typeof headers.get === "function") { ++ const v = headers.get(name); ++ return v === null || v === undefined ? undefined : String(v); ++ } ++ const key = Object.keys(headers).find((k) => k.toLowerCase() === name.toLowerCase()); ++ if (!key) return undefined; ++ const v = headers[key]; ++ return v === null || v === undefined ? undefined : String(Array.isArray(v) ? v[0] : v); ++} ++ ++async function main() { ++ console.log( ++ `\n=== OpenAI real: priority scheduling (tenantId=${tenantId}, model=${defaultModelId}) ===`, ++ ); ++ void controller; ++ ++ const agent = new Agent({ ++ name: "openai-real-traffic", ++ instructions: "Reply exactly with the requested token.", ++ model: openai(defaultModelId), ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ // Enqueue in reverse priority order; controller should still execute P0 first. ++ const p2 = agent.generateText("Reply with only: P2", { tenantId, trafficPriority: "P2" }); ++ const p1 = agent.generateText("Reply with only: P1", { tenantId, trafficPriority: "P1" }); ++ const p0 = agent.generateText("Reply with only: P0", { tenantId, trafficPriority: "P0" }); ++ ++ const settled = await Promise.allSettled([p0, p1, p2]); ++ for (const result of settled) { ++ if (result.status !== "fulfilled") { ++ console.log(`[result] rejected=${result.reason?.message ?? String(result.reason)}`); ++ continue; ++ } ++ ++ const headers = result.value.response?.headers; ++ const limit = getHeader(headers, "x-ratelimit-limit-requests"); ++ const remaining = getHeader(headers, "x-ratelimit-remaining-requests"); ++ const reset = getHeader(headers, "x-ratelimit-reset-requests"); ++ ++ console.log( ++ `[result] text=${preview(result.value.text)} finishReason=${result.value.finishReason} usage=${safeStringify(result.value.usage)}`, ++ ); ++ console.log( ++ `[result] ratelimitHeaders=${safeStringify({ ++ limit, ++ remaining, ++ reset, ++ })}`, ++ ); ++ } ++ ++ console.log( ++ `\n[done] settled=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? preview(s.value.text) : s.reason?.message)), ++ )}`, ++ ); ++ ++ console.log( ++ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-priority-openai-sim.ts b/tmp/test/traffic-priority-openai-sim.ts +new file mode 100644 +index 00000000..9d36a7d1 +--- /dev/null ++++ b/tmp/test/traffic-priority-openai-sim.ts +@@ -0,0 +1,114 @@ ++// @ts-nocheck ++/** ++ * Manual test: Agent → TrafficController priority scheduling (OpenAI-like stub models). ++ * ++ * This keeps the Agent + AI SDK path, but avoids real network calls by using stub models ++ * that pretend to be `provider="openai"` with modelIds like `gpt-4o`/`gpt-4o-mini`. ++ * ++ * Scenarios: ++ * - Test 1: P0 runs before P1/P2 when all runnable. ++ * - Test 2: P0 request (gpt-4o) is rate-limited → P1 (gpt-4o-mini) proceeds. ++ * ++ * Note: ++ * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s. 
++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-priority-openai-sim.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++function makeOpenAIStubModel(modelId: string, delayMs: number) { ++ let calls = 0; ++ return { ++ specificationVersion: "v2", ++ provider: "openai", ++ modelId, ++ doGenerate: async () => { ++ calls += 1; ++ console.log(`[${now()}] [model] ${modelId} doGenerate call=${calls}`); ++ await sleep(delayMs); ++ return { ++ content: [{ type: "text", text: `ok:${modelId}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++const controller = getTrafficController({ maxConcurrent: 1 }); ++ ++const modelMini = makeOpenAIStubModel("gpt-4o-mini", 80); ++const modelBig = makeOpenAIStubModel("gpt-4o", 80); ++ ++const agent = new Agent({ ++ name: "priority-openai-sim", ++ instructions: "echo", ++ model: modelMini, ++ temperature: 0, ++ maxOutputTokens: 32, ++}); ++ ++async function test1_priorityOrder() { ++ console.log("\n=== Test 1: P0 ordering via Agent ==="); ++ ++ const p2 = agent.generateText("P2", { trafficPriority: "P2", tenantId: "sim" }); ++ const p1 = agent.generateText("P1", { trafficPriority: "P1", tenantId: "sim" }); ++ const p0 = agent.generateText("P0", { trafficPriority: "P0", tenantId: "sim" }); ++ ++ const results = await Promise.all([p0, p1, p2]); ++ console.log(`[Test 1] results=${safeStringify(results.map((r) => r.text))}`); ++} ++ ++async function test2_p1RunsWhenP0RateLimited() { ++ console.log("\n=== Test 2: P1 proceeds when P0 is rate-limited ==="); ++ ++ // Seed remaining=0 for openai::gpt-4o so the P0 head item initially waits. ++ const applied = controller.updateRateLimitFromHeaders( ++ { provider: "openai", model: "gpt-4o" }, ++ { ++ "x-ratelimit-limit-requests": "1", ++ "x-ratelimit-remaining-requests": "0", ++ "x-ratelimit-reset-requests": "1s", ++ }, ++ ); ++ console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`); ++ ++ const p0Blocked = agent.generateText("P0 (gpt-4o, rate-limited)", { ++ trafficPriority: "P0", ++ tenantId: "sim", ++ model: modelBig, // per-call model override (new in this branch) ++ }); ++ ++ const p1Free = agent.generateText("P1 (gpt-4o-mini)", { ++ trafficPriority: "P1", ++ tenantId: "sim", ++ model: modelMini, ++ }); ++ ++ const [r0, r1] = await Promise.all([p0Blocked, p1Free]); ++ console.log(`[Test 2] p0 text=${r0.text}`); ++ console.log(`[Test 2] p1 text=${r1.text}`); ++} ++ ++async function main() { ++ await test1_priorityOrder(); ++ await test2_p1RunsWhenP0RateLimited(); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-priority.ts b/tmp/test/traffic-priority.ts +new file mode 100644 +index 00000000..409e1078 +--- /dev/null ++++ b/tmp/test/traffic-priority.ts +@@ -0,0 +1,159 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController priority scheduling. ++ * ++ * Scenarios: ++ * - Test 1: P0 should run before P1/P2 when runnable. ++ * - Test 2: If a P0 request is rate-limited, a lower priority (P1) can proceed. 
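Both scenarios follow from one scheduling rule: scan priorities in order, but skip a head item that is not currently runnable, so a blocked P0 never starves runnable lower-priority work. A condensed sketch, where the runnable predicate stands in for the controller's circuit, concurrency, and rate-limit gates:

type Priority = "P0" | "P1" | "P2";

interface Item {
  label: string;
  priority: Priority;
  runnable: () => boolean; // stand-in for rate-limit / circuit / concurrency checks
}

const ORDER: Priority[] = ["P0", "P1", "P2"];

// Pick the highest-priority runnable item; a blocked P0 head does not starve P1/P2.
function pickNext(queues: Map<Priority, Item[]>): Item | undefined {
  for (const priority of ORDER) {
    const queue = queues.get(priority) ?? [];
    const index = queue.findIndex((item) => item.runnable());
    if (index >= 0) return queue.splice(index, 1)[0];
  }
  return undefined;
}

const queues = new Map<Priority, Item[]>([
  ["P0", [{ label: "P0-blocked", priority: "P0", runnable: () => false }]],
  ["P1", [{ label: "P1-free", priority: "P1", runnable: () => true }]],
]);
console.log(pickNext(queues)?.label); // "P1-free" (matches Test 2)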
++ * ++ * Note: ++ * - Rate-limit wakeups include a small probe delay; a "1s" reset may unblock slightly after 1s. ++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-priority.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-priority.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++const controller = getTrafficController({ maxConcurrent: 1 }); ++ ++function extractLabel(prompt: any): string { ++ if (!Array.isArray(prompt)) { ++ return "unknown"; ++ } ++ ++ for (let index = prompt.length - 1; index >= 0; index -= 1) { ++ const message = prompt[index]; ++ if (!message || message.role !== "user" || !Array.isArray(message.content)) { ++ continue; ++ } ++ ++ const textPart = message.content.find((part: any) => part?.type === "text"); ++ if (textPart?.text) { ++ return String(textPart.text); ++ } ++ } ++ ++ return "unknown"; ++} ++ ++function makeModel(provider: string, modelId: string, delayMs = 50) { ++ let calls = 0; ++ let lastStartAt = 0; ++ ++ return { ++ specificationVersion: "v2", ++ provider, ++ modelId, ++ doGenerate: async (options: any) => { ++ calls += 1; ++ const startAt = Date.now(); ++ const delta = lastStartAt ? startAt - lastStartAt : 0; ++ lastStartAt = startAt; ++ ++ const label = extractLabel(options?.prompt); ++ console.log( ++ `[${now()}] doGenerate start model=${provider}::${modelId} call=${calls} (+${delta}ms) input=${label}`, ++ ); ++ await sleep(delayMs); ++ console.log(`[${now()}] doGenerate end model=${provider}::${modelId} input=${label}`); ++ ++ return { ++ content: [{ type: "text", text: `ok:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++async function test1_priorityOrder() { ++ console.log("\n=== Test 1: priority order (P0 before P1/P2) ==="); ++ ++ const sharedModel = makeModel("p", "shared-model", 50); ++ const agent = new Agent({ ++ name: "traffic-priority", ++ instructions: "echo", ++ model: sharedModel, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ // Enqueue in reverse order; scheduler should still run P0 first. ++ const p2 = agent.generateText("P2", { tenantId: "default", trafficPriority: "P2" }); ++ const p1 = agent.generateText("P1", { tenantId: "default", trafficPriority: "P1" }); ++ const p0 = agent.generateText("P0", { tenantId: "default", trafficPriority: "P0" }); ++ ++ const settled = await Promise.allSettled([p0, p1, p2]); ++ console.log( ++ `[Test 1] results=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? 
s.value.text : s.reason?.message)), ++ )}`, ++ ); ++} ++ ++async function test2_lowerPriorityWhenP0RateLimited() { ++ console.log("\n=== Test 2: P1 proceeds when P0 rate-limited ==="); ++ ++ const applied = controller.updateRateLimitFromHeaders( ++ { provider: "p0", model: "m0" }, ++ { ++ "x-ratelimit-limit-requests": "1", ++ "x-ratelimit-remaining-requests": "0", ++ "x-ratelimit-reset-requests": "1s", ++ }, ++ ); ++ console.log(`[Test 2] updateRateLimitFromHeaders=${safeStringify(applied)}`); ++ ++ const modelP0 = makeModel("p0", "m0", 50); ++ const modelP1 = makeModel("p1", "m1", 50); ++ const agent = new Agent({ ++ name: "traffic-priority-rate-limit", ++ instructions: "echo", ++ model: modelP1, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ // Now the next P0 request is at the head of the queue but rate-limited, ++ // so a runnable P1 request should execute first. ++ const p0Blocked = agent.generateText("P0-blocked (rate limited)", { ++ tenantId: "default", ++ trafficPriority: "P0", ++ model: modelP0, ++ }); ++ const p1Free = agent.generateText("P1-free (should run first)", { ++ tenantId: "default", ++ trafficPriority: "P1", ++ model: modelP1, ++ }); ++ ++ const settled = await Promise.allSettled([p0Blocked, p1Free]); ++ console.log( ++ `[Test 2] results=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), ++ )}`, ++ ); ++} ++ ++async function main() { ++ await test1_priorityOrder(); ++ await test2_lowerPriorityWhenP0RateLimited(); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-rate-limit-from-headers.ts b/tmp/test/traffic-rate-limit-from-headers.ts +new file mode 100644 +index 00000000..d8262661 +--- /dev/null ++++ b/tmp/test/traffic-rate-limit-from-headers.ts +@@ -0,0 +1,158 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController dynamic rate limits from OpenAI response headers. ++ * ++ * This hits the real OpenAI model via Agent + AI SDK, and relies on the ++ * `x-ratelimit-*` response headers to seed/update the TrafficController. ++ * ++ * What to look for: ++ * - Each request prints the observed `x-ratelimit-*` headers (if present). ++ * - Agent should also log: "[Traffic] Applied rate limit from response headers". ++ * - With enough parallel requests, some requests may take longer due to controller throttling. ++ * ++ * Prereqs: ++ * - Set `OPENAI_API_KEY` ++ * ++ * Optional env: ++ * - `OPENAI_MODEL` (default: gpt-4o-mini) ++ * - `REQUESTS` (default: 10) ++ * - `MAX_CONCURRENT` (default: 50) ++ * - `TENANT_ID` (default: openai-rate-limit-headers) ++ * ++ * Run: ++ * - OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts ++ * - VERBOSE=1 OPENAI_API_KEY=... REQUESTS=30 pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts ++ */ ++ ++import { openai } from "@ai-sdk/openai"; ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const now = () => new Date().toISOString(); ++ ++const apiKey = process.env.OPENAI_API_KEY; ++if (!apiKey) { ++ console.error("Missing OPENAI_API_KEY. Example:"); ++ console.error(" OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-from-headers.ts"); ++ process.exit(1); ++} ++ ++const provider = "openai"; ++const modelId = process.env.OPENAI_MODEL ?? 
"gpt-4o-mini"; ++const tenantId = process.env.TENANT_ID ?? "openai-rate-limit-headers"; ++const requestCountRaw = Number(process.env.REQUESTS ?? "10"); ++const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50"); ++const requestCount = Number.isFinite(requestCountRaw) && requestCountRaw > 0 ? requestCountRaw : 10; ++const maxConcurrent = ++ Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? maxConcurrentRaw : 50; ++ ++const key = `${provider}::${modelId}`; ++const controller = getTrafficController({ maxConcurrent }); ++ ++function getHeader(headers: any, name: string): string | undefined { ++ if (!headers) return undefined; ++ if (typeof headers.get === "function") { ++ const v = headers.get(name); ++ return v === null || v === undefined ? undefined : String(v); ++ } ++ ++ const entries = Object.entries(headers as Record); ++ const target = name.toLowerCase(); ++ const match = entries.find(([k]) => String(k).toLowerCase() === target); ++ if (!match) return undefined; ++ ++ const value = match[1]; ++ if (Array.isArray(value)) { ++ const first = value[0]; ++ return first === null || first === undefined ? undefined : String(first); ++ } ++ ++ return value === null || value === undefined ? undefined : String(value); ++} ++ ++async function main() { ++ console.log( ++ `\n=== OpenAI rate limit headers → TrafficController (${key}, maxConcurrent=${maxConcurrent}, requests=${requestCount}) ===`, ++ ); ++ void controller; ++ ++ const agent = new Agent({ ++ name: "openai-rate-limit-from-headers", ++ instructions: "Reply with only the requested token.", ++ model: openai(modelId), ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ console.log("\n[seed] Making one request to capture headers..."); ++ const seedStartedAt = Date.now(); ++ const seed = await agent.generateText("Reply with only: seed", { ++ tenantId, ++ trafficPriority: "P1", ++ }); ++ const seedElapsedMs = Date.now() - seedStartedAt; ++ ++ const seedHeaders = seed.response?.headers; ++ console.log(`[seed] done in ${seedElapsedMs}ms text=${seed.text}`); ++ console.log( ++ `[seed] x-ratelimit-*=${safeStringify({ ++ limit: getHeader(seedHeaders, "x-ratelimit-limit-requests"), ++ remaining: getHeader(seedHeaders, "x-ratelimit-remaining-requests"), ++ reset: getHeader(seedHeaders, "x-ratelimit-reset-requests"), ++ })}`, ++ ); ++ ++ console.log(`\n[burst] Scheduling ${requestCount} parallel requests...`); ++ const jobs = Array.from({ length: requestCount }, (_, idx) => { ++ const label = `req-${idx + 1}`; ++ const enqueuedAt = Date.now(); ++ console.log(`[${now()}] enqueue ${label}`); ++ ++ return agent ++ .generateText(`Reply with only: ${label}`, { tenantId, trafficPriority: "P1" }) ++ .then((result) => { ++ const elapsedMs = Date.now() - enqueuedAt; ++ const headers = result.response?.headers; ++ console.log( ++ `[${now()}] done ${label} in ${elapsedMs}ms text=${result.text} x-ratelimit-remaining=${getHeader( ++ headers, ++ "x-ratelimit-remaining-requests", ++ )}`, ++ ); ++ return { ++ label, ++ elapsedMs, ++ text: result.text, ++ headers: { ++ limit: getHeader(headers, "x-ratelimit-limit-requests"), ++ remaining: getHeader(headers, "x-ratelimit-remaining-requests"), ++ reset: getHeader(headers, "x-ratelimit-reset-requests"), ++ }, ++ }; ++ }) ++ .catch((error) => { ++ const elapsedMs = Date.now() - enqueuedAt; ++ console.log( ++ `[${now()}] failed ${label} in ${elapsedMs}ms name=${error?.name ?? "Error"} status=${error?.status ?? error?.statusCode ?? 
"n/a"} msg=${error?.message}`, ++ ); ++ throw error; ++ }); ++ }); ++ ++ const settled = await Promise.allSettled(jobs); ++ ++ console.log(`\n[done] settled=${safeStringify(settled.map((s) => s.status))}`); ++ console.log( ++ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-rate-limit-openai-window-sim.ts b/tmp/test/traffic-rate-limit-openai-window-sim.ts +new file mode 100644 +index 00000000..35232faa +--- /dev/null ++++ b/tmp/test/traffic-rate-limit-openai-window-sim.ts +@@ -0,0 +1,247 @@ ++// @ts-nocheck ++/** ++ * Manual test (real network): Simulate OpenAI "window remaining + reset" semantics and watch ++ * TrafficController pace + probe behavior via logs. ++ * ++ * Why "simulate"? ++ * - Real OpenAI headers usually show very large remaining values, so pacing is hard to observe. ++ * - This script still hits the real OpenAI model, but it drives the controller state using ++ * synthetic `x-ratelimit-*` headers to force a small window (e.g. remaining=3, reset=30s). ++ * ++ * What this demonstrates (matches your Step 1–7): ++ * 1) We seed controller with remaining + reset window. ++ * 2) We enqueue many requests. ++ * 3) Controller subtracts `reserved` from `remaining` to avoid stampedes. ++ * 4) When `effectiveRemaining <= 1`, controller waits until `resetAt + probeDelay`. ++ * 5) When room exists, controller paces using `nextAllowedAt`. ++ * 6) When a request finishes, we release reservation (controller) and apply new headers (this script). ++ * 7) After reset, controller sends a probe even when remaining==0; probe "fetches" fresh headers and flow resumes. ++ * ++ * Prereqs: ++ * - Set `OPENAI_API_KEY` ++ * ++ * Suggested logging: ++ * - `VOLTAGENT_LOG_LEVEL=trace` (to see traffic controller internals) ++ * ++ * Run: ++ * - VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts ++ * ++ * Optional env: ++ * - OPENAI_MODEL (default: gpt-4o-mini) ++ * - WINDOW_SECONDS (default: 30) ++ * - REMAINING (default: 3) ++ * - REQUESTS (default: 10) ++ * - MAX_CONCURRENT (default: 50) ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { TrafficController } from "../../packages/core/dist/index.js"; ++ ++const apiKey = process.env.OPENAI_API_KEY; ++if (!apiKey) { ++ console.error("Missing OPENAI_API_KEY. Example:"); ++ console.error( ++ " VOLTAGENT_LOG_LEVEL=trace OPENAI_API_KEY=... pnpm ts-node tmp/test/traffic-rate-limit-openai-window-sim.ts", ++ ); ++ process.exit(1); ++} ++ ++const now = () => new Date().toISOString(); ++ ++const modelId = process.env.OPENAI_MODEL ?? "gpt-4o-mini"; ++const windowSecondsRaw = Number(process.env.WINDOW_SECONDS ?? "30"); ++const remainingRaw = Number(process.env.REMAINING ?? "3"); ++const requestsRaw = Number(process.env.REQUESTS ?? "10"); ++const maxConcurrentRaw = Number(process.env.MAX_CONCURRENT ?? "50"); ++ ++const windowSeconds = ++ Number.isFinite(windowSecondsRaw) && windowSecondsRaw > 0 ? windowSecondsRaw : 30; ++const initialRemaining = ++ Number.isFinite(remainingRaw) && remainingRaw > 0 ? Math.floor(remainingRaw) : 3; ++const requestCount = Number.isFinite(requestsRaw) && requestsRaw > 0 ? Math.floor(requestsRaw) : 10; ++const maxConcurrent = ++ Number.isFinite(maxConcurrentRaw) && maxConcurrentRaw > 0 ? 
Math.floor(maxConcurrentRaw) : 50; ++ ++const provider = "openai"; ++const tenantId = "openai-window-sim"; ++const windowMs = Math.round(windowSeconds * 1000); ++ ++async function callOpenAIResponses(label: string): Promise<{ ++ status: number; ++ headers: Record; ++ textPreview: string; ++}> { ++ const url = "https://api.openai.com/v1/responses"; ++ const body = safeStringify({ ++ model: modelId, ++ input: `Reply with only: ${label}`, ++ max_output_tokens: 16, ++ }); ++ ++ const startedAt = Date.now(); ++ const res = await fetch(url, { ++ method: "POST", ++ headers: { ++ authorization: `Bearer ${apiKey}`, ++ "content-type": "application/json", ++ }, ++ body, ++ }); ++ ++ const limit = res.headers.get("x-ratelimit-limit-requests") ?? undefined; ++ const remaining = res.headers.get("x-ratelimit-remaining-requests") ?? undefined; ++ const reset = res.headers.get("x-ratelimit-reset-requests") ?? undefined; ++ ++ if (!res.ok) { ++ const text = await res.text().catch(() => ""); ++ throw new Error( ++ `OpenAI error status=${res.status} elapsedMs=${Date.now() - startedAt} body=${text.slice(0, 280)}`, ++ ); ++ } ++ ++ const data: any = await res.json(); ++ const outputText = ++ data?.output?.[0]?.content?.find?.((c: any) => c?.type === "output_text")?.text ?? ++ data?.output_text ?? ++ data?.output?.[0]?.content?.[0]?.text ?? ++ ""; ++ ++ return { ++ status: res.status, ++ headers: { ++ "x-ratelimit-limit-requests": limit, ++ "x-ratelimit-remaining-requests": remaining, ++ "x-ratelimit-reset-requests": reset, ++ }, ++ textPreview: String(outputText).slice(0, 80), ++ }; ++} ++ ++async function main() { ++ console.log( ++ `\n=== OpenAI real + synthetic window rate limit (provider=${provider}, model=${modelId}) ===`, ++ ); ++ console.log( ++ `[config] maxConcurrent=${maxConcurrent} windowSeconds=${windowSeconds} initialRemaining=${initialRemaining} requests=${requestCount}`, ++ ); ++ console.log( ++ "[hint] Set VOLTAGENT_LOG_LEVEL=trace to see TrafficController internals (reserved/effectiveRemaining/nextAllowedAt).", ++ ); ++ ++ const controller = new TrafficController({ maxConcurrent }); ++ ++ // --- Step 1: seed "remaining + reset window" into controller --- ++ let windowResetAt = Date.now() + windowMs; ++ let remainingInWindow = initialRemaining; ++ ++ const applySyntheticHeaders = (source: string) => { ++ const resetMs = Math.max(1, windowResetAt - Date.now()); ++ const applied = controller.updateRateLimitFromHeaders( ++ { provider, model: modelId, tenantId }, ++ { ++ "x-ratelimit-limit-requests": String(initialRemaining), ++ "x-ratelimit-remaining-requests": String(Math.max(0, remainingInWindow)), ++ "x-ratelimit-reset-requests": `${resetMs}ms`, ++ }, ++ ); ++ console.log( ++ `[${now()}] [synthetic] source=${source} remaining=${remainingInWindow} resetInMs=${resetMs} applied=${safeStringify( ++ applied && { ++ key: applied.key, ++ state: { ++ remaining: applied.state.remaining, ++ reserved: applied.state.reserved, ++ resetAt: applied.state.resetAt, ++ nextAllowedAt: applied.state.nextAllowedAt, ++ }, ++ }, ++ )}`, ++ ); ++ }; ++ ++ applySyntheticHeaders("seed"); ++ ++ console.log("\n[seed] Making one real request to confirm connectivity + show real headers..."); ++ const seed = await callOpenAIResponses("seed"); ++ console.log( ++ `[${now()}] [seed] ok status=${seed.status} text=${seed.textPreview} realHeaders=${safeStringify( ++ seed.headers, ++ )}`, ++ ); ++ ++ console.log(`\n[burst] Enqueueing ${requestCount} controller-managed requests...`); ++ ++ const jobs = Array.from({ length: 
requestCount }, (_, index) => { ++ const label = `req-${index + 1}`; ++ const enqueuedAt = Date.now(); ++ console.log(`[${now()}] [enqueue] ${label}`); ++ ++ return controller ++ .handleText({ ++ tenantId, ++ metadata: { ++ tenantId, ++ provider, ++ model: modelId, ++ priority: "P1", ++ agentName: "openai-window-sim", ++ agentId: label, ++ }, ++ execute: async () => { ++ const startedAt = Date.now(); ++ console.log(`[${now()}] [execute-start] ${label}`); ++ ++ const result = await callOpenAIResponses(label); ++ ++ console.log( ++ `[${now()}] [execute-end] ${label} elapsedMs=${Date.now() - startedAt} realHeaders=${safeStringify( ++ result.headers, ++ )}`, ++ ); ++ ++ // --- Step 6: decrement remaining + apply new "headers" --- ++ const nowMs = Date.now(); ++ if (nowMs >= windowResetAt) { ++ // --- Step 7: reset happened; probe request fetched "fresh" headers for the next window --- ++ console.log( ++ `[${now()}] [reset] window elapsed; starting new synthetic window (windowSeconds=${windowSeconds})`, ++ ); ++ windowResetAt = nowMs + windowMs; ++ remainingInWindow = initialRemaining; ++ } ++ ++ remainingInWindow = Math.max(0, remainingInWindow - 1); ++ applySyntheticHeaders("response"); ++ ++ return result; ++ }, ++ }) ++ .then((r) => { ++ const totalElapsedMs = Date.now() - enqueuedAt; ++ console.log( ++ `[${now()}] [done] ${label} totalElapsedMs=${totalElapsedMs} text=${r.textPreview}`, ++ ); ++ return { label, totalElapsedMs, status: "fulfilled" as const }; ++ }) ++ .catch((error: any) => { ++ const totalElapsedMs = Date.now() - enqueuedAt; ++ console.log( ++ `[${now()}] [fail] ${label} totalElapsedMs=${totalElapsedMs} name=${error?.name ?? "Error"} msg=${ ++ error?.message ?? String(error) ++ }`, ++ ); ++ return { label, totalElapsedMs, status: "rejected" as const }; ++ }); ++ }); ++ ++ const settled = await Promise.all(jobs); ++ console.log(`\n[done] settled=${safeStringify(settled.map((s) => s.status))}`); ++ console.log( ++ `[done] tenantUsage(${tenantId})=${safeStringify(controller.getTenantUsage(tenantId))}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-rate-limit-static.ts b/tmp/test/traffic-rate-limit-static.ts +new file mode 100644 +index 00000000..3f91d5bb +--- /dev/null ++++ b/tmp/test/traffic-rate-limit-static.ts +@@ -0,0 +1,149 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController window-based rate limiting (simulated OpenAI headers). ++ * ++ * What to look for: ++ * - Requests should be paced out across the window (no steady "refill" math). ++ * - If responses arrive out-of-order, remaining headers might "increase"; controller should ++ * keep remaining monotonic within the same window. 
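Both expectations can be stated in a few lines: spread the remaining budget across the time left in the window, and clamp header updates so remaining never increases within the same window. A sketch under those assumptions (field names are illustrative, and the 50ms same-window tolerance is invented, not taken from the controller):

interface WindowState {
  remaining: number;
  resetAt: number; // epoch ms when the window ends
}

// Pace: with N requests left and T ms until reset, allow roughly one start per T/N ms.
function nextAllowedAt(state: WindowState, now: number): number {
  if (state.remaining <= 0) return state.resetAt; // wait for the window to roll over
  return now + Math.max(0, state.resetAt - now) / state.remaining;
}

// Ingest headers: within the same window, only let `remaining` go down
// (late responses from earlier requests may otherwise report a higher value).
function applyHeaders(state: WindowState, remaining: number, resetAt: number): WindowState {
  const sameWindow = Math.abs(resetAt - state.resetAt) < 50; // tolerance for clock jitter
  return {
    remaining: sameWindow ? Math.min(state.remaining, remaining) : remaining,
    resetAt,
  };
}

let state: WindowState = { remaining: 6, resetAt: Date.now() + 3000 };
state = applyHeaders(state, 4, state.resetAt); // normal decrease → 4
state = applyHeaders(state, 5, state.resetAt); // out-of-order "increase" → clamped to 4
console.log(state.remaining); // 4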
++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-rate-limit-static.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-rate-limit-static.ts ++ * ++ * Optional env: ++ * - LIMIT=6 WINDOW_MS=3000 pnpm ts-node tmp/test/traffic-rate-limit-static.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++const provider = "sim"; ++const model = "rate-limited-model"; ++const key = `${provider}::${model}`; ++ ++const controller = getTrafficController({ maxConcurrent: 50 }); ++ ++const limit = Number(process.env.LIMIT ?? 6); ++const windowMs = Number(process.env.WINDOW_MS ?? 3000); ++let windowStartAt = Date.now(); ++let windowResetAt = windowStartAt + windowMs; ++let usedInWindow = 0; ++ ++function extractLabel(prompt: any): string { ++ if (!Array.isArray(prompt)) { ++ return "unknown"; ++ } ++ ++ for (let index = prompt.length - 1; index >= 0; index -= 1) { ++ const message = prompt[index]; ++ if (!message || message.role !== "user" || !Array.isArray(message.content)) { ++ continue; ++ } ++ ++ const textPart = message.content.find((part: any) => part?.type === "text"); ++ if (textPart?.text) { ++ return String(textPart.text); ++ } ++ } ++ ++ return "unknown"; ++} ++ ++async function main() { ++ console.log( ++ `\n=== Window rate limit for ${key} (limit=${limit}, windowMs=${windowMs}, jobs=10) ===`, ++ ); ++ ++ const seeded = controller.updateRateLimitFromHeaders( ++ { provider, model }, ++ { ++ "x-ratelimit-limit-requests": String(limit), ++ "x-ratelimit-remaining-requests": String(limit), ++ "x-ratelimit-reset-requests": `${windowMs}ms`, ++ }, ++ ); ++ console.log(`[seed] updateRateLimitFromHeaders=${safeStringify(seeded)}`); ++ ++ let calls = 0; ++ let lastStartAt = 0; ++ const rateLimitedModel = { ++ specificationVersion: "v2", ++ provider, ++ modelId: model, ++ doGenerate: async (options: any) => { ++ const simulatedLatencyMs = 10 + Math.floor(Math.random() * 120); ++ const nowMs = Date.now(); ++ if (nowMs >= windowResetAt) { ++ windowStartAt = nowMs; ++ windowResetAt = windowStartAt + windowMs; ++ usedInWindow = 0; ++ } ++ ++ calls += 1; ++ usedInWindow += 1; ++ const startAt = Date.now(); ++ const delta = lastStartAt ? 
startAt - lastStartAt : 0; ++ lastStartAt = startAt; ++ ++ const label = extractLabel(options?.prompt); ++ console.log( ++ `[${now()}] doGenerate start call=${calls} (+${delta}ms) input=${label} latencyMs=${simulatedLatencyMs}`, ++ ); ++ await sleep(simulatedLatencyMs); ++ console.log(`[${now()}] doGenerate end input=${label}`); ++ ++ const remainingAfterThis = Math.max(0, limit - usedInWindow); ++ const resetMs = Math.max(1, windowResetAt - Date.now()); ++ return { ++ content: [{ type: "text", text: `ok:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { ++ modelId: model, ++ headers: { ++ "x-ratelimit-limit-requests": String(limit), ++ "x-ratelimit-remaining-requests": String(remainingAfterThis), ++ "x-ratelimit-reset-requests": `${resetMs}ms`, ++ }, ++ }, ++ }; ++ }, ++ }; ++ ++ const agent = new Agent({ ++ name: "traffic-rate-limit-static", ++ instructions: "echo", ++ model: rateLimitedModel, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ const jobs = Array.from({ length: 10 }, (_, idx) => ++ agent.generateText(`req-${idx + 1}`, { ++ tenantId: "default", ++ trafficPriority: "P1", ++ }), ++ ); ++ ++ const settled = await Promise.allSettled(jobs); ++ console.log( ++ `\n[done] results=${safeStringify( ++ settled.map((s) => (s.status === "fulfilled" ? s.value.text : s.reason?.message)), ++ )}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-retry-after.ts b/tmp/test/traffic-retry-after.ts +new file mode 100644 +index 00000000..c0c213eb +--- /dev/null ++++ b/tmp/test/traffic-retry-after.ts +@@ -0,0 +1,245 @@ ++// @ts-nocheck ++/** ++ * Manual test: Retry-After handling (429 retry + 200 OK header ingestion). ++ * ++ * What this exercises: ++ * - Retry-After on 429 errors increases retry delay (TrafficController retry plan). ++ * - Retry-After on successful responses throttles subsequent requests for the same provider::model. 
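Both behaviors rest on normalizing a Retry-After value into a delay. A self-contained sketch handling the two RFC 9110 forms, delta-seconds and HTTP-date (retryAfterMs is a hypothetical helper; the controller can also receive the delay directly via RateLimitedUpstreamError, as the typedError mode below shows):

// Retry-After is either delta-seconds ("1") or an HTTP date
// ("Wed, 21 Oct 2015 07:28:00 GMT"); normalize both to milliseconds.
function retryAfterMs(header: string, now = Date.now()): number | undefined {
  const seconds = Number(header);
  if (Number.isFinite(seconds)) return Math.max(0, Math.round(seconds * 1000));
  const at = Date.parse(header);
  return Number.isNaN(at) ? undefined : Math.max(0, at - now);
}

console.log(retryAfterMs("1")); // 1000, which drives the >= 1000ms retry delay asserted below
console.log(retryAfterMs(new Date(Date.now() + 300).toUTCString())); // typically 0, since toUTCString drops milliseconds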
++ * ++ * Run: ++ * - pnpm -C packages/core build ++ * - pnpm ts-node tmp/test/traffic-retry-after.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-retry-after.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { ++ Agent, ++ RateLimitedUpstreamError, ++ getTrafficController, ++} from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++function extractLabel(prompt: any): string { ++ if (!Array.isArray(prompt)) { ++ return "unknown"; ++ } ++ ++ for (let index = prompt.length - 1; index >= 0; index -= 1) { ++ const message = prompt[index]; ++ if (!message || message.role !== "user" || !Array.isArray(message.content)) { ++ continue; ++ } ++ ++ const textPart = message.content.find((part: any) => part?.type === "text"); ++ if (textPart?.text) { ++ return String(textPart.text); ++ } ++ } ++ ++ return "unknown"; ++} ++ ++function make429RetryAfterModel(args: { ++ provider: string; ++ modelId: string; ++ retryAfterSeconds: number; ++ mode: "headers" | "typedError"; ++}) { ++ const { provider, modelId, retryAfterSeconds, mode } = args; ++ let calls = 0; ++ const startedAt: number[] = []; ++ ++ return { ++ specificationVersion: "v2", ++ provider, ++ modelId, ++ startedAt, ++ doGenerate: async (options: any) => { ++ calls += 1; ++ const start = Date.now(); ++ startedAt.push(start); ++ ++ const label = extractLabel(options?.prompt); ++ console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`); ++ ++ if (calls === 1) { ++ const retryAfterValue = String(retryAfterSeconds); ++ ++ if (mode === "typedError") { ++ throw new RateLimitedUpstreamError( ++ `rate limited (typed) retry-after=${retryAfterValue}s`, ++ { provider, model: modelId }, ++ Math.round(retryAfterSeconds * 1000), ++ ); ++ } ++ ++ const err: any = new Error(`rate limited (headers) retry-after=${retryAfterValue}s`); ++ err.status = 429; ++ err.response = { ++ status: 429, ++ headers: { ++ "retry-after": retryAfterValue, ++ }, ++ }; ++ throw err; ++ } ++ ++ return { ++ content: [{ type: "text", text: `ok:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++function makeSuccessRetryAfterModel(args: { ++ provider: string; ++ modelId: string; ++ retryAfterSeconds: number; ++ latencyMs: number; ++}) { ++ const { provider, modelId, retryAfterSeconds, latencyMs } = args; ++ let calls = 0; ++ const startedAt: number[] = []; ++ const endedAt: number[] = []; ++ ++ return { ++ specificationVersion: "v2", ++ provider, ++ modelId, ++ startedAt, ++ endedAt, ++ doGenerate: async (options: any) => { ++ calls += 1; ++ const start = Date.now(); ++ startedAt.push(start); ++ ++ const label = extractLabel(options?.prompt); ++ console.log(`[${now()}] [model] ${provider}::${modelId} start call=${calls} input=${label}`); ++ await sleep(latencyMs); ++ ++ const end = Date.now(); ++ endedAt.push(end); ++ console.log(`[${now()}] [model] ${provider}::${modelId} end call=${calls} input=${label}`); ++ ++ return { ++ content: [{ type: "text", text: `ok:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { ++ modelId, ++ headers: ++ calls === 1 ++ ? 
{ ++ "retry-after": String(retryAfterSeconds), ++ } ++ : {}, ++ }, ++ }; ++ }, ++ }; ++} ++ ++async function test_retryAfterOn429(mode: "headers" | "typedError") { ++ const retryAfterSeconds = 1; ++ const provider = `retry-after-429-${mode}`; ++ const modelId = "ra-429"; ++ const tenantId = `ra-429-${mode}`; ++ ++ const model = make429RetryAfterModel({ provider, modelId, retryAfterSeconds, mode }); ++ const agent = new Agent({ ++ name: `ra-429-${mode}`, ++ instructions: "echo", ++ model, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ console.log(`\n=== Test: Retry-After on 429 (${mode}) ===`); ++ const result = await agent.generateText("hello", { tenantId, trafficPriority: "P1" }); ++ ++ const times = model.startedAt; ++ const deltaMs = times.length >= 2 ? times[1] - times[0] : undefined; ++ ++ console.log( ++ `[result] text=${result.text} calls=${times.length} startedAt=${safeStringify(times)} deltaMs=${deltaMs}`, ++ ); ++ ++ if (deltaMs === undefined || deltaMs < retryAfterSeconds * 1000) { ++ throw new Error( ++ `Expected retry delay >= ${retryAfterSeconds * 1000}ms, got ${deltaMs ?? "n/a"}ms`, ++ ); ++ } ++} ++ ++async function test_retryAfterOnSuccessResponse() { ++ const retryAfterSeconds = 0.3; ++ const provider = "retry-after-200"; ++ const modelId = "ra-200"; ++ const tenantId = "ra-200"; ++ ++ const model = makeSuccessRetryAfterModel({ ++ provider, ++ modelId, ++ retryAfterSeconds, ++ latencyMs: 20, ++ }); ++ ++ const agent = new Agent({ ++ name: "ra-200", ++ instructions: "echo", ++ model, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ console.log("\n=== Test: Retry-After on 200 response headers ==="); ++ const first = agent.generateText("first", { tenantId, trafficPriority: "P1" }); ++ const second = agent.generateText("second", { tenantId, trafficPriority: "P1" }); ++ ++ const [r1, r2] = await Promise.all([first, second]); ++ ++ const end1 = model.endedAt[0]; ++ const start2 = model.startedAt[1]; ++ const enforcedDelayMs = start2 && end1 ? start2 - end1 : undefined; ++ ++ console.log( ++ `[result] texts=${safeStringify([r1.text, r2.text])} startedAt=${safeStringify( ++ model.startedAt, ++ )} endedAt=${safeStringify(model.endedAt)} enforcedDelayMs=${enforcedDelayMs}`, ++ ); ++ ++ if (enforcedDelayMs === undefined || enforcedDelayMs < retryAfterSeconds * 1000) { ++ throw new Error( ++ `Expected rate-limit delay >= ${retryAfterSeconds * 1000}ms, got ${enforcedDelayMs ?? "n/a"}ms`, ++ ); ++ } ++} ++ ++async function main() { ++ // Create controller early so all Agent calls share the same singleton. ++ getTrafficController({ maxConcurrent: 1 }); ++ ++ await test_retryAfterOn429("headers"); ++ await test_retryAfterOn429("typedError"); ++ await test_retryAfterOnSuccessResponse(); ++ ++ console.log("\n[done] All Retry-After manual checks passed."); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-retry-behavior.ts b/tmp/test/traffic-retry-behavior.ts +new file mode 100644 +index 00000000..273af55a +--- /dev/null ++++ b/tmp/test/traffic-retry-behavior.ts +@@ -0,0 +1,169 @@ ++// @ts-nocheck ++/** ++ * Manual test: TrafficController retry behavior via Agent + AI SDK path (stub model). 
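The scenario list below pins down how errors are classified for retry (5xx and 429 up to 3 attempts, timeouts up to 2, other 4xx never). A condensed sketch of such a classifier; classify and shouldRetry are hypothetical names, and the real detection also reads statusCode, response.status, cause.status, code, name, and message, as the variation scenarios suggest:

type RetryClass = "server-error" | "rate-limit" | "timeout" | "none";

// Hypothetical classifier mirroring the documented limits.
function classify(err: { status?: number | string; message?: string }): RetryClass {
  const status = Number(err.status);
  if (status === 429) return "rate-limit";
  if (status >= 500 && status <= 599) return "server-error";
  if (status === 408 || /timeout/i.test(err.message ?? "")) return "timeout";
  return "none";
}

const MAX_ATTEMPTS: Record<RetryClass, number> = {
  "server-error": 3,
  "rate-limit": 3,
  timeout: 2,
  none: 1, // non-retriable 4xx: the first attempt is the only attempt
};

function shouldRetry(
  err: { status?: number | string; message?: string },
  attempt: number,
): boolean {
  return attempt < MAX_ATTEMPTS[classify(err)];
}

console.log(shouldRetry({ status: 500 }, 1)); // true (attempt 2 allowed)
console.log(shouldRetry({ status: 400 }, 1)); // false (non-retriable)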
++ * ++ * Scenarios included: ++ * - 5xx retries (up to 3 attempts) ++ * - 429 retries (up to 3 attempts) ++ * - timeout retries (up to 2 attempts) ++ * - non-retriable 4xx does not retry ++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-retry-behavior.ts ++ * ++ * Notes: ++ * - Uses a stub LanguageModel; no network calls. ++ * - Watch the `[model] attempt=...` logs to confirm retries. ++ */ ++ ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++type Scenario = ++ | "server-error" ++ | "rate-limit" ++ | "timeout" ++ | "bad-request" ++ | "forbidden" ++ // Variations to hit different retry-detection branches. ++ | "server-error-status-string" ++ | "server-error-statusCode" ++ | "server-error-response-status" ++ | "server-error-cause-status" ++ | "rate-limit-statusCode" ++ | "timeout-code-only" ++ | "timeout-name-only" ++ | "timeout-message-only" ++ // Variations that should STOP retrying (hit max attempts). ++ | "server-error-exceed-max" ++ | "timeout-exceed-max"; ++ ++type RetryPlan = { ++ failCountBeforeSuccess: number; ++ status?: number | string; ++ statusCode?: number | string; ++ httpStatus?: number | string; ++ responseStatus?: number | string; ++ causeStatus?: number | string; ++ code?: string; ++ name?: string; ++ message?: string; ++}; ++ ++const plans: Record = { ++ "server-error": { failCountBeforeSuccess: 2, status: 500 }, ++ "rate-limit": { failCountBeforeSuccess: 2, status: 429 }, ++ timeout: { failCountBeforeSuccess: 1, status: 408, code: "ETIMEDOUT", message: "timeout" }, ++ "bad-request": { failCountBeforeSuccess: 10, status: 400 }, ++ forbidden: { failCountBeforeSuccess: 10, status: 403 }, ++ "server-error-status-string": { failCountBeforeSuccess: 2, status: "500" }, ++ "server-error-statusCode": { failCountBeforeSuccess: 2, statusCode: 502 }, ++ "server-error-response-status": { failCountBeforeSuccess: 2, responseStatus: 503 }, ++ "server-error-cause-status": { failCountBeforeSuccess: 2, causeStatus: 500 }, ++ "rate-limit-statusCode": { failCountBeforeSuccess: 2, statusCode: 429 }, ++ "timeout-code-only": { failCountBeforeSuccess: 1, code: "timeout" }, ++ "timeout-name-only": { failCountBeforeSuccess: 1, name: "TimeoutError" }, ++ "timeout-message-only": { failCountBeforeSuccess: 1, message: "this is a TIMEOUT" }, ++ "server-error-exceed-max": { failCountBeforeSuccess: 10, status: 500 }, ++ "timeout-exceed-max": { failCountBeforeSuccess: 10, message: "timeout" }, ++}; ++ ++function makeModel(modelId: string, plan: RetryPlan) { ++ let counter = 0; ++ let lastAttemptAt = 0; ++ ++ return { ++ specificationVersion: "v2", ++ provider: "retry-provider", ++ modelId, ++ doGenerate: async () => { ++ counter += 1; ++ const now = Date.now(); ++ const delta = lastAttemptAt ? now - lastAttemptAt : 0; ++ lastAttemptAt = now; ++ ++ console.log(`[model] modelId=${modelId} attempt=${counter} (+${delta}ms)`); ++ ++ if (counter <= plan.failCountBeforeSuccess) { ++ const err: any = new Error(plan.message ?? 
`forced failure ${counter} for ${modelId}`); ++ if (plan.status !== undefined) err.status = plan.status; ++ if (plan.statusCode !== undefined) err.statusCode = plan.statusCode; ++ if (plan.httpStatus !== undefined) err.httpStatus = plan.httpStatus; ++ if (plan.responseStatus !== undefined) err.response = { status: plan.responseStatus }; ++ if (plan.causeStatus !== undefined) err.cause = { status: plan.causeStatus }; ++ if (plan.code !== undefined) err.code = plan.code; ++ if (plan.name !== undefined) err.name = plan.name; ++ throw err; ++ } ++ ++ return { ++ content: [{ type: "text", text: "ok" }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++async function runScenario(name: Scenario) { ++ const plan = plans[name]; ++ const modelId = `retry-${name}`; ++ const model = makeModel(modelId, plan); ++ ++ const agent = new Agent({ ++ name: `RetryAgent-${name}`, ++ instructions: "echo", ++ model, ++ maxOutputTokens: 32, ++ temperature: 0, ++ }); ++ ++ console.log(`\n=== ${name} ===`); ++ try { ++ const result = await agent.generateText(name, { tenantId: "retry-test" }); ++ console.log(`[${name}] succeeded. text=${result.text}`); ++ } catch (err: any) { ++ console.log( ++ `[${name}] failed. status=${err?.status ?? err?.statusCode ?? err?.response?.status ?? "n/a"}`, ++ ); ++ } ++} ++ ++async function main() { ++ // Create controller early so all Agent calls share the same singleton. ++ getTrafficController({ maxConcurrent: 1 }); ++ ++ const runs: Scenario[] = [ ++ "server-error", ++ "rate-limit", ++ "timeout", ++ "bad-request", ++ "forbidden", ++ // Uncomment for additional coverage: ++ // "server-error-status-string", ++ // "server-error-statusCode", ++ // "server-error-response-status", ++ // "server-error-cause-status", ++ // "rate-limit-statusCode", ++ // "timeout-code-only", ++ // "timeout-name-only", ++ // "timeout-message-only", ++ // "server-error-exceed-max", ++ // "timeout-exceed-max", ++ ]; ++ ++ for (const name of runs) { ++ await runScenario(name); ++ } ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-tenant-usage.ts b/tmp/test/traffic-tenant-usage.ts +new file mode 100644 +index 00000000..801d7761 +--- /dev/null ++++ b/tmp/test/traffic-tenant-usage.ts +@@ -0,0 +1,71 @@ ++// @ts-nocheck ++/** ++ * Manual test: Tenant usage aggregation (via Agent → TrafficController). ++ * ++ * What to look for: ++ * - `getTenantUsage(tenantId)` should increase after each agent call. 
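A minimal sketch of the aggregation being checked: usage from each completed call folds into a per-tenant counter. The record shape follows the stub model's usage payload; the requests field and the class name are assumptions, not the controller's internals:

interface Usage {
  inputTokens: number;
  outputTokens: number;
  totalTokens: number;
}

// Per-tenant accumulator; calls without a tenantId would fall back to "default".
class TenantUsageTracker {
  private readonly usage = new Map<string, Usage & { requests: number }>();

  record(tenantId: string, u: Usage): void {
    const current = this.usage.get(tenantId) ?? {
      inputTokens: 0,
      outputTokens: 0,
      totalTokens: 0,
      requests: 0,
    };
    this.usage.set(tenantId, {
      inputTokens: current.inputTokens + u.inputTokens,
      outputTokens: current.outputTokens + u.outputTokens,
      totalTokens: current.totalTokens + u.totalTokens,
      requests: current.requests + 1,
    });
  }

  get(tenantId: string) {
    return this.usage.get(tenantId);
  }
}

const tracker = new TenantUsageTracker();
tracker.record("tenant-a", { inputTokens: 2, outputTokens: 3, totalTokens: 5 });
tracker.record("tenant-a", { inputTokens: 2, outputTokens: 3, totalTokens: 5 });
console.log(tracker.get("tenant-a")); // { inputTokens: 4, outputTokens: 6, totalTokens: 10, requests: 2 }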
++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-tenant-usage.ts ++ */ ++ ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++function makeModel(modelId: string) { ++ return { ++ specificationVersion: "v2", ++ provider: "usage-provider", ++ modelId, ++ doGenerate: async () => { ++ return { ++ content: [{ type: "text", text: `ok:${modelId}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 2, outputTokens: 3, totalTokens: 5 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ }; ++} ++ ++const controller = getTrafficController({ maxConcurrent: 10 }); ++ ++async function run(label: string, tenantId: string) { ++ const model = makeModel("tenant-usage-model"); ++ const agent = new Agent({ ++ name: `TenantUsageAgent-${label}`, ++ instructions: "echo", ++ model, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ console.log(`\n=== ${label} tenantId=${tenantId} ===`); ++ const result = await agent.generateText(`hello:${label}`, { tenantId }); ++ console.log(`[${label}] text=${result.text}`); ++ ++ const usage = controller.getTenantUsage(tenantId); ++ console.log(`[${label}] controller.getTenantUsage(${tenantId})=${safeStringify(usage)}`); ++} ++ ++async function main() { ++ await run("A1", "tenant-a"); ++ await run("A2", "tenant-a"); ++ await run("B1", "tenant-b"); ++ ++ console.log("\n=== Final usage snapshot ==="); ++ console.log(`tenant-a=${safeStringify(controller.getTenantUsage("tenant-a"))}`); ++ console.log(`tenant-b=${safeStringify(controller.getTenantUsage("tenant-b"))}`); ++ console.log(`default=${safeStringify(controller.getTenantUsage("default"))}`); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); +diff --git a/tmp/test/traffic-text-vs-stream.ts b/tmp/test/traffic-text-vs-stream.ts +new file mode 100644 +index 00000000..41aa484d +--- /dev/null ++++ b/tmp/test/traffic-text-vs-stream.ts +@@ -0,0 +1,128 @@ ++// @ts-nocheck ++/** ++ * Manual test: Text + stream traffic share the same TrafficController queue. ++ * ++ * What to look for: ++ * - Stream and text requests should respect the same maxConcurrent + priority rules. 
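That expectation holds because handleText and handleStream feed one scheduler. A simplified stand-in for that shared-queue shape (not the controller's real internals), showing why a single maxConcurrent budget covers both kinds of traffic:

type Kind = "text" | "stream";

interface Job {
  kind: Kind;
  run: () => Promise<unknown>;
}

// One queue + one inFlight counter for both kinds, so maxConcurrent and
// ordering rules apply uniformly to text and stream traffic.
class SharedQueue {
  private readonly queue: Job[] = [];
  private inFlight = 0;

  constructor(private readonly maxConcurrent: number) {}

  submit<T>(kind: Kind, run: () => Promise<T>): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      this.queue.push({ kind, run: () => run().then(resolve, reject) });
      this.drain();
    });
  }

  private drain(): void {
    while (this.inFlight < this.maxConcurrent && this.queue.length > 0) {
      const job = this.queue.shift()!;
      this.inFlight += 1;
      void job.run().finally(() => {
        this.inFlight -= 1;
        this.drain();
      });
    }
  }
}

// Usage: text and stream share capacity; with maxConcurrent=1 they serialize.
const q = new SharedQueue(1);
void q.submit("stream", async () => "S1");
void q.submit("text", async () => "T0");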
++ * ++ * Run: ++ * - pnpm ts-node tmp/test/traffic-text-vs-stream.ts ++ * - VERBOSE=1 pnpm ts-node tmp/test/traffic-text-vs-stream.ts ++ */ ++ ++import { ReadableStream } from "node:stream/web"; ++import { safeStringify } from "@voltagent/internal"; ++import { Agent, getTrafficController } from "../../packages/core/dist/index.js"; ++ ++const verbose = process.env.VERBOSE === "1"; ++if (!verbose) { ++ console.debug = () => {}; ++} ++ ++const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); ++const now = () => new Date().toISOString(); ++ ++const controller = getTrafficController({ maxConcurrent: 1 }); ++ ++function extractLabel(prompt: any): string { ++ if (!Array.isArray(prompt)) { ++ return "unknown"; ++ } ++ ++ for (let index = prompt.length - 1; index >= 0; index -= 1) { ++ const message = prompt[index]; ++ if (!message || message.role !== "user" || !Array.isArray(message.content)) { ++ continue; ++ } ++ ++ const textPart = message.content.find((part: any) => part?.type === "text"); ++ if (textPart?.text) { ++ return String(textPart.text); ++ } ++ } ++ ++ return "unknown"; ++} ++ ++async function main() { ++ console.log("\n=== Text vs Stream (shared scheduler) ==="); ++ void controller; ++ ++ const provider = "sim"; ++ const modelId = "shared-queue"; ++ ++ const model = { ++ specificationVersion: "v2", ++ provider, ++ modelId, ++ doGenerate: async (options: any) => { ++ const label = extractLabel(options?.prompt); ++ console.log(`[${now()}] doGenerate start input=${label}`); ++ await sleep(50); ++ console.log(`[${now()}] doGenerate end input=${label}`); ++ return { ++ content: [{ type: "text", text: `text:${label}` }], ++ finishReason: "stop", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ warnings: [], ++ response: { modelId, headers: {} }, ++ }; ++ }, ++ doStream: async (options: any) => { ++ const label = extractLabel(options?.prompt); ++ console.log(`[${now()}] doStream start input=${label}`); ++ ++ // Hold the controller slot for a bit so ordering is visible. 
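++ // (Covered by the "holds concurrency slots for streams until completion"
++ // spec added further down: with maxConcurrent=1 this stream keeps its slot
++ // until reportStreamSuccess/reportStreamFailure releases it, so the text
++ // requests enqueued behind S1 stay queued through the pause below.)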
++ await sleep(400); ++ ++ console.log(`[${now()}] doStream ready input=${label}`); ++ const streamId = `text-${label}`; ++ const text = `stream:${label}`; ++ ++ const stream = new ReadableStream({ ++ start(streamController) { ++ streamController.enqueue({ type: "stream-start", warnings: [] }); ++ streamController.enqueue({ type: "text-start", id: streamId }); ++ streamController.enqueue({ type: "text-delta", id: streamId, delta: text }); ++ streamController.enqueue({ type: "text-end", id: streamId }); ++ streamController.enqueue({ ++ type: "finish", ++ usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, ++ finishReason: "stop", ++ }); ++ streamController.close(); ++ }, ++ }); ++ ++ return { stream, response: { headers: {} } }; ++ }, ++ }; ++ ++ const agent = new Agent({ ++ name: "traffic-text-vs-stream", ++ instructions: "echo", ++ model, ++ temperature: 0, ++ maxOutputTokens: 32, ++ }); ++ ++ const streamP1 = agent.streamText("S1", { tenantId: "default", trafficPriority: "P1" }); ++ const textP0 = agent.generateText("T0", { tenantId: "default", trafficPriority: "P0" }); ++ const textP1 = agent.generateText("T1", { tenantId: "default", trafficPriority: "P1" }); ++ ++ const [streamResult, t0, t1] = await Promise.all([streamP1, textP0, textP1]); ++ const streamText = await streamResult.text; ++ ++ console.log( ++ `\n[done] results=${safeStringify({ ++ streamText, ++ textP0: t0.text, ++ textP1: t1.text, ++ })}`, ++ ); ++} ++ ++main().catch((error) => { ++ console.error("Fatal error:", error); ++ process.exit(1); ++}); diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts index 8f0a2c47c..0d40426a1 100644 --- a/packages/core/src/traffic/traffic-controller.spec.ts +++ b/packages/core/src/traffic/traffic-controller.spec.ts @@ -523,6 +523,52 @@ describe("TrafficController token limits", () => { }); describe("TrafficController stream reporting", () => { + it("holds concurrency slots for streams until completion", async () => { + const controller = new TrafficController({ maxConcurrent: 1 }); + const order: string[] = []; + const firstMetadata = { + provider: "p", + model: "m", + priority: "P1" as const, + tenantId: "tenant-a", + }; + const secondMetadata = { + provider: "p", + model: "m", + priority: "P1" as const, + tenantId: "tenant-a", + }; + + const first = controller.handleStream({ + tenantId: "tenant-a", + metadata: firstMetadata, + execute: async () => { + order.push("first"); + return "first"; + }, + }); + + const second = controller.handleStream({ + tenantId: "tenant-a", + metadata: secondMetadata, + execute: async () => { + order.push("second"); + return "second"; + }, + }); + + await first; + await Promise.resolve(); + expect(order).toEqual(["first"]); + + controller.reportStreamSuccess(firstMetadata); + await Promise.resolve(); + expect(order).toEqual(["first", "second"]); + + controller.reportStreamSuccess(secondMetadata); + await Promise.all([first, second]); + }); + it("slows down after stream 429 errors", async () => { vi.useFakeTimers(); diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts index 269304d9c..9b7221280 100644 --- a/packages/core/src/traffic/traffic-controller.ts +++ b/packages/core/src/traffic/traffic-controller.ts @@ -1,5 +1,6 @@ import type { Logger } from "../logger"; import { LoggerProxy } from "../logger"; +import { randomUUID } from "../utils/id"; import { TrafficCircuitBreaker } from "./traffic-circuit-breaker"; import { 
 import { TrafficConcurrencyLimiter } from "./traffic-concurrency-limiter";
 import type { DispatchDecision, QueuedRequest, Scheduler } from "./traffic-controller-internal";
@@ -135,6 +136,7 @@ export class TrafficController {
   private activeCount = 0;
   private drainScheduled = false;
+  private readonly inFlightStreams = new Map();
 
   /* ---------- Rate limits ---------- */
   private readonly rateLimiter: TrafficRateLimiter;
@@ -253,6 +255,7 @@ export class TrafficController {
       rateLimitKey,
     );
     this.recordAdaptiveSuccess(adaptiveKey);
+    this.releaseStreamSlot(metadata, "success");
   }
 
   reportStreamFailure(metadata: TrafficRequestMetadata | undefined, error: unknown): void {
@@ -295,6 +298,7 @@ export class TrafficController {
     if (errorForHandling !== error) {
       this.attachTrafficMetadata(error, traffic);
     }
+    this.releaseStreamSlot(metadata, "failure");
   }
 
   updateRateLimitFromHeaders(
@@ -356,26 +360,27 @@ export class TrafficController {
     request: TrafficRequest,
   ): Promise {
     return new Promise((resolve, reject) => {
-      const priority = this.resolvePriority(request.metadata);
-      const tenantId = this.resolveTenantId(request);
+      const normalizedRequest = this.ensureStreamRequestId(type, request);
+      const priority = this.resolvePriority(normalizedRequest.metadata);
+      const tenantId = this.resolveTenantId(normalizedRequest);
 
       this.controllerLogger.debug("Enqueue request", {
         type,
         tenantId,
         priority,
-        provider: request.metadata?.provider,
-        model: request.metadata?.model,
+        provider: normalizedRequest.metadata?.provider,
+        model: normalizedRequest.metadata?.model,
       });
 
       this.enqueueItem({
         type,
-        request,
+        request: normalizedRequest,
         resolve,
         reject,
         attempt: 1,
         priority,
         tenantId,
         enqueuedAt: Date.now(),
-        estimatedTokens: request.estimatedTokens,
-        extractUsage: request.extractUsage,
+        estimatedTokens: normalizedRequest.estimatedTokens,
+        extractUsage: normalizedRequest.extractUsage,
       });
       this.scheduleDrain();
     });
@@ -448,145 +453,11 @@ export class TrafficController {
       if (!candidate) break;
       attempts += 1;
 
-      const { item: next, queue, tenantId } = candidate;
       const now = Date.now();
-      const queueTimeoutAt = this.resolveQueueTimeoutAt(next);
-      const queueTimeoutTriggered = this.handleQueueTimeout(next, queue, 0, now, queueTimeoutAt);
-      if (queueTimeoutTriggered === "rejected") {
-        this.cleanupTenantQueue(priority, tenantId, queue);
-        return { kind: "skip" };
-      }
-      if (queueTimeoutAt !== undefined && now < queueTimeoutAt) {
-        observeWakeUpAt(queueTimeoutAt);
-      }
-      const queueTimeoutExpired = queueTimeoutTriggered === "expired";
-
-      this.controllerLogger.trace("Evaluate next queued request", {
-        priority,
-        tenantId: next.tenantId,
-        type: next.type,
-        attempt: next.attempt,
-        provider: next.request.metadata?.provider,
-        model: next.request.metadata?.model,
-        queueLength: queue.length,
-      });
-
-      const circuit = this.resolveCircuit(next);
-      if (circuit) {
-        this.controllerLogger.trace("Circuit resolution returned decision", {
-          priority,
-          decision: circuit,
-          circuitKey: next.circuitKey,
-          circuitStatus: next.circuitStatus,
-        });
-        if (circuit.kind === "skip") {
-          queue.shift();
-          this.cleanupTenantQueue(priority, tenantId, queue);
-          return { kind: "skip" };
-        }
-        if (circuit.kind === "wait") {
-          if (
-            this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "circuit wait")
-          ) {
-            this.cleanupTenantQueue(priority, tenantId, queue);
-            return { kind: "skip" };
-          }
-          next.etaMs =
-            circuit.wakeUpAt !== undefined ? Math.max(0, circuit.wakeUpAt - now) : undefined;
-          observeWakeUpAt(circuit.wakeUpAt);
-          continue;
-        }
-      }
-
-      const concurrency = this.concurrencyLimiter.resolve(next, this.trafficLogger);
-      if (concurrency.kind === "wait") {
-        this.controllerLogger.trace("Concurrency gate blocked request", {
-          priority,
-          tenantId: next.tenantId,
-          provider: next.request.metadata?.provider,
-          model: next.request.metadata?.model,
-          reasons: concurrency.reasons,
-        });
-        if (
-          this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "concurrency wait")
-        ) {
-          this.cleanupTenantQueue(priority, tenantId, queue);
-          return { kind: "skip" };
-        }
-        next.etaMs = undefined;
-        continue;
-      }
-
-      const adaptive = this.resolveAdaptiveLimit(next, now);
-      if (adaptive?.kind === "wait") {
-        if (
-          this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "adaptive wait")
-        ) {
-          this.cleanupTenantQueue(priority, tenantId, queue);
-          return { kind: "skip" };
-        }
-        next.etaMs =
-          adaptive.wakeUpAt !== undefined ? Math.max(0, adaptive.wakeUpAt - now) : undefined;
-        observeWakeUpAt(adaptive.wakeUpAt);
-        continue;
-      }
-
-      const rateLimit = this.resolveRateLimit(next);
-      if (rateLimit) {
-        this.controllerLogger.trace("Rate limit resolution returned decision", {
-          priority,
-          decision: rateLimit,
-          rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata),
-        });
-        if (rateLimit.kind === "wait") {
-          if (
-            this.rejectIfQueueTimedOut(
-              queueTimeoutExpired,
-              next,
-              queue,
-              0,
-              now,
-              "rate limit wait",
-            )
-          ) {
-            this.cleanupTenantQueue(priority, tenantId, queue);
-            return { kind: "skip" };
-          }
-          next.etaMs =
-            rateLimit.wakeUpAt !== undefined ? Math.max(0, rateLimit.wakeUpAt - now) : undefined;
-          observeWakeUpAt(rateLimit.wakeUpAt);
-        }
-        continue;
-      }
-
-      if (queueTimeoutExpired) {
-        const timeoutError = this.createQueueTimeoutError(next, now);
-        this.attachTrafficMetadata(
-          timeoutError,
-          this.buildTrafficResponseMetadata(
-            next,
-            timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata),
-            now,
-            timeoutError,
-          ),
-        );
-        this.controllerLogger.warn("Queue wait timed out before dispatch", {
-          tenantId: next.tenantId,
-          waitedMs: timeoutError.waitedMs,
-          maxQueueWaitMs: timeoutError.maxQueueWaitMs,
-          deadlineAt: timeoutError.deadlineAt,
-          provider: next.request.metadata?.provider,
-          model: next.request.metadata?.model,
-          rateLimitKey: timeoutError.rateLimitKey,
-        });
-        queue.shift();
-        this.cleanupTenantQueue(priority, tenantId, queue);
-        next.reject(timeoutError);
-        return { kind: "skip" };
-      }
-
-      this.startRequest(next, queue, tenantId);
-      return { kind: "dispatch" };
+      const result = this.processQueuedCandidate(priority, candidate, now);
+      observeWakeUpAt(result.wakeUpAt);
+      if (result.action === "dispatch") return { kind: "dispatch" };
+      if (result.action === "skip") return { kind: "skip" };
     }
   }
 
@@ -622,6 +493,7 @@ export class TrafficController {
   private async executeRequest(item: QueuedRequest): Promise {
     const startedAt = Date.now();
+    let streamHeld = false;
     try {
       this.controllerLogger.debug("Execute request", {
         priority: item.priority,
@@ -661,6 +533,25 @@ export class TrafficController {
         result,
         this.buildTrafficResponseMetadata(item, rateLimitKey, Date.now()),
       );
+      if (item.type === "stream") {
+        const requestId = item.request.metadata?.requestId;
+        if (!requestId) {
+          this.controllerLogger.warn("Stream missing requestId; releasing slot immediately", {
+            tenantId: item.tenantId,
+            provider: item.request.metadata?.provider,
+            model: item.request.metadata?.model,
+          });
+        } else {
+          this.inFlightStreams.set(requestId, item);
+          streamHeld = true;
+          this.controllerLogger.debug("Stream registered; holding slot", {
+            requestId,
+            tenantId: item.tenantId,
+            provider: item.request.metadata?.provider,
+            model: item.request.metadata?.model,
+          });
+        }
+      }
       item.resolve(result);
     } catch (error) {
       const rateLimitKey = item.rateLimitKey ?? this.buildRateLimitKey(item.request.metadata);
@@ -749,15 +640,9 @@ export class TrafficController {
         item.reject(errorForHandling);
       }
     } finally {
-      this.rateLimiter.releaseReservation(item.rateLimitKey, this.trafficLogger);
-      this.concurrencyLimiter.release(item, this.trafficLogger);
-      this.activeCount = Math.max(0, this.activeCount - 1);
-      this.controllerLogger.trace("Request finished; slot released", {
-        tenantId: item.tenantId,
-        activeCount: this.activeCount,
-        maxConcurrent: this.maxConcurrent,
-      });
-      this.scheduleDrain();
+      if (!(item.type === "stream" && streamHeld)) {
+        this.releaseActiveSlot(item, "completed");
+      }
     }
   }
 
@@ -1051,6 +936,217 @@ export class TrafficController {
     return this.rateLimitKeyBuilder(metadata);
   }
 
+  private processQueuedCandidate(
+    priority: TrafficPriority,
+    candidate: { item: QueuedRequest; queue: QueuedRequest[]; tenantId: string },
+    now: number,
+  ): { action: "dispatch" | "skip" | "continue"; wakeUpAt?: number } {
+    const { item: next, queue, tenantId } = candidate;
+    let wakeUpAt: number | undefined;
+    const queueTimeoutAt = this.resolveQueueTimeoutAt(next);
+    const queueTimeoutTriggered = this.handleQueueTimeout(next, queue, 0, now, queueTimeoutAt);
+    if (queueTimeoutTriggered === "rejected") {
+      this.cleanupTenantQueue(priority, tenantId, queue);
+      return { action: "skip" };
+    }
+    if (queueTimeoutAt !== undefined && now < queueTimeoutAt) {
+      wakeUpAt = queueTimeoutAt;
+    }
+    const queueTimeoutExpired = queueTimeoutTriggered === "expired";
+
+    this.controllerLogger.trace("Evaluate next queued request", {
+      priority,
+      tenantId: next.tenantId,
+      type: next.type,
+      attempt: next.attempt,
+      provider: next.request.metadata?.provider,
+      model: next.request.metadata?.model,
+      queueLength: queue.length,
+    });
+
+    const circuit = this.resolveCircuit(next);
+    if (circuit) {
+      this.controllerLogger.trace("Circuit resolution returned decision", {
+        priority,
+        decision: circuit,
+        circuitKey: next.circuitKey,
+        circuitStatus: next.circuitStatus,
+      });
+      if (circuit.kind === "skip") {
+        queue.shift();
+        this.cleanupTenantQueue(priority, tenantId, queue);
+        return { action: "skip", wakeUpAt };
+      }
+      if (circuit.kind === "wait") {
+        if (this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "circuit wait")) {
+          this.cleanupTenantQueue(priority, tenantId, queue);
+          return { action: "skip", wakeUpAt };
+        }
+        next.etaMs =
+          circuit.wakeUpAt !== undefined ? Math.max(0, circuit.wakeUpAt - now) : undefined;
+        return { action: "continue", wakeUpAt: this.pickEarlierWakeUp(wakeUpAt, circuit.wakeUpAt) };
+      }
+    }
+
+    const concurrency = this.concurrencyLimiter.resolve(next, this.trafficLogger);
+    if (concurrency.kind === "wait") {
+      this.controllerLogger.trace("Concurrency gate blocked request", {
+        priority,
+        tenantId: next.tenantId,
+        provider: next.request.metadata?.provider,
+        model: next.request.metadata?.model,
+        reasons: concurrency.reasons,
+      });
+      if (
+        this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "concurrency wait")
+      ) {
+        this.cleanupTenantQueue(priority, tenantId, queue);
+        return { action: "skip", wakeUpAt };
+      }
+      next.etaMs = undefined;
+      return { action: "continue", wakeUpAt };
+    }
+
+    const adaptive = this.resolveAdaptiveLimit(next, now);
+    if (adaptive?.kind === "wait") {
+      if (this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "adaptive wait")) {
+        this.cleanupTenantQueue(priority, tenantId, queue);
+        return { action: "skip", wakeUpAt };
+      }
+      next.etaMs =
+        adaptive.wakeUpAt !== undefined ? Math.max(0, adaptive.wakeUpAt - now) : undefined;
+      return { action: "continue", wakeUpAt: this.pickEarlierWakeUp(wakeUpAt, adaptive.wakeUpAt) };
+    }
+
+    const rateLimit = this.resolveRateLimit(next);
+    if (rateLimit) {
+      this.controllerLogger.trace("Rate limit resolution returned decision", {
+        priority,
+        decision: rateLimit,
+        rateLimitKey: next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata),
+      });
+      if (rateLimit.kind === "wait") {
+        if (
+          this.rejectIfQueueTimedOut(queueTimeoutExpired, next, queue, 0, now, "rate limit wait")
+        ) {
+          this.cleanupTenantQueue(priority, tenantId, queue);
+          return { action: "skip", wakeUpAt };
+        }
+        next.etaMs =
+          rateLimit.wakeUpAt !== undefined ? Math.max(0, rateLimit.wakeUpAt - now) : undefined;
+        return {
+          action: "continue",
+          wakeUpAt: this.pickEarlierWakeUp(wakeUpAt, rateLimit.wakeUpAt),
+        };
+      }
+      return { action: "continue", wakeUpAt };
+    }
+
+    if (queueTimeoutExpired) {
+      const timeoutError = this.createQueueTimeoutError(next, now);
+      this.attachTrafficMetadata(
+        timeoutError,
+        this.buildTrafficResponseMetadata(
+          next,
+          timeoutError.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata),
+          now,
+          timeoutError,
+        ),
+      );
+      this.controllerLogger.warn("Queue wait timed out before dispatch", {
+        tenantId: next.tenantId,
+        waitedMs: timeoutError.waitedMs,
+        maxQueueWaitMs: timeoutError.maxQueueWaitMs,
+        deadlineAt: timeoutError.deadlineAt,
+        provider: next.request.metadata?.provider,
+        model: next.request.metadata?.model,
+        rateLimitKey: timeoutError.rateLimitKey,
+      });
+      queue.shift();
+      this.cleanupTenantQueue(priority, tenantId, queue);
+      next.reject(timeoutError);
+      return { action: "skip", wakeUpAt };
+    }
+
+    this.startRequest(next, queue, tenantId);
+    return { action: "dispatch", wakeUpAt };
+  }
+
+  private pickEarlierWakeUp(
+    current: number | undefined,
+    candidate: number | undefined,
+  ): number | undefined {
+    if (candidate === undefined) return current;
+    if (current === undefined) return candidate;
+    return Math.min(current, candidate);
+  }
+
+  private ensureStreamRequestId(
+    type: TrafficRequestType,
+    request: TrafficRequest,
+  ): TrafficRequest {
+    if (type !== "stream") return request;
+    const metadata = request.metadata;
+    if (metadata?.requestId) return request;
+
+    const requestId = randomUUID();
+    if (metadata && typeof metadata === "object") {
+      (metadata as TrafficRequestMetadata).requestId = requestId;
+      return request;
+    }
+
+    return {
+      ...request,
+      metadata: {
+        ...(metadata ?? {}),
+        requestId,
+      },
+    };
+  }
+
+  private releaseStreamSlot(
+    metadata: TrafficRequestMetadata | undefined,
+    outcome: "success" | "failure",
+  ): void {
+    const requestId = metadata?.requestId;
+    if (!requestId) {
+      this.controllerLogger.debug("Stream completion missing requestId; slot not released", {
+        outcome,
+      });
+      return;
+    }
+    const item = this.inFlightStreams.get(requestId);
+    if (!item) {
+      this.controllerLogger.debug("Stream completion missing in-flight entry", {
+        requestId,
+        outcome,
+      });
+      return;
+    }
+    this.inFlightStreams.delete(requestId);
+    this.controllerLogger.debug("Stream completed; releasing slot", {
+      requestId,
+      tenantId: item.tenantId,
+      provider: item.request.metadata?.provider,
+      model: item.request.metadata?.model,
+      outcome,
+    });
+    this.releaseActiveSlot(item, `stream-${outcome}`);
+  }
+
+  private releaseActiveSlot(item: QueuedRequest, reason: string): void {
+    this.rateLimiter.releaseReservation(item.rateLimitKey, this.trafficLogger);
+    this.concurrencyLimiter.release(item, this.trafficLogger);
+    this.activeCount = Math.max(0, this.activeCount - 1);
+    this.controllerLogger.trace("Request finished; slot released", {
+      tenantId: item.tenantId,
+      activeCount: this.activeCount,
+      maxConcurrent: this.maxConcurrent,
+      reason,
+    });
+    this.scheduleDrain();
+  }
+
   private resolveAdaptiveLimit(next: QueuedRequest, now: number): DispatchDecision | null {
     const rateLimitKey = next.rateLimitKey ?? this.buildRateLimitKey(next.request.metadata);
     const adaptiveKey = this.buildAdaptiveKey(next.request.metadata, next.tenantId, rateLimitKey);
diff --git a/packages/core/src/traffic/traffic-types.ts b/packages/core/src/traffic/traffic-types.ts
index 1d847e252..396fbf07c 100644
--- a/packages/core/src/traffic/traffic-types.ts
+++ b/packages/core/src/traffic/traffic-types.ts
@@ -41,6 +41,7 @@ export interface TrafficRequestMetadata {
   agentName?: string;
   model?: string;
   provider?: string;
+  requestId?: string;
   priority?: TrafficPriority;
   tenantId?: string;
   apiKeyId?: string;
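
Note: the slot-holding contract in this patch is easiest to see from the caller's side. Below is a minimal usage sketch, assuming only the handleStream/reportStreamSuccess/reportStreamFailure surface exercised by the spec above; the "@voltagent/core" import path, openProviderStream, and consumeProviderStream are illustrative placeholders rather than anything defined in the patch.

import { getTrafficController, type TrafficRequestMetadata } from "@voltagent/core";

// Hypothetical caller. handleStream resolves once the stream has *started*,
// but the concurrency slot stays reserved (keyed by the requestId that
// ensureStreamRequestId stamps onto this metadata object) until one of the
// report* calls below runs.
async function relayWithTraffic(tenantId: string): Promise<void> {
  const controller = getTrafficController({ maxConcurrent: 2 });
  const metadata: TrafficRequestMetadata = { provider: "p", model: "m", tenantId };

  const stream = await controller.handleStream({
    tenantId,
    metadata,
    execute: () => openProviderStream(), // placeholder for the real provider call
  });

  try {
    await consumeProviderStream(stream); // drain chunks to the client
    controller.reportStreamSuccess(metadata); // releases the held slot
  } catch (error) {
    controller.reportStreamFailure(metadata, error); // releases it on failure paths, too
    throw error;
  }
}

// Tiny stand-ins so the sketch runs on its own; real code streams from a provider.
async function openProviderStream(): Promise<AsyncIterable<string>> {
  async function* chunks() {
    yield "hello ";
    yield "world";
  }
  return chunks();
}

async function consumeProviderStream(stream: AsyncIterable<string>): Promise<void> {
  for await (const chunk of stream) process.stdout.write(chunk);
}

Because ensureStreamRequestId mutates the caller-supplied metadata object in place when it can, the same object reference used to enqueue the stream is the one that later identifies the in-flight entry — which is exactly what the spec test relies on when it passes firstMetadata back into reportStreamSuccess.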
From b75304f13629b826032e3d00be556f98dfc4ec0a Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Fri, 26 Dec 2025 13:19:33 +0530
Subject: [PATCH 38/41] fix: token-only headers weren’t persisted
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../openai-window-rate-limit-strategy.ts      | 34 ++++++++++++++++---
 .../rate-limit-strategy.ts                    |  4 +++
 .../core/src/traffic/traffic-controller.ts    |  1 +
 3 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
index fdb1c7a83..7cca0d260 100644
--- a/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
+++ b/packages/core/src/traffic/rate-limit-strategies/openai-window-rate-limit-strategy.ts
@@ -125,7 +125,16 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy {
       this.requestsPerMinute !== undefined
         ? undefined
         : this.window.updateFromHeaders(metadata, headers, logger);
-    this.applyTokenHeaderUpdates(headers, logger);
+    const tokenUpdate = this.applyTokenHeaderUpdates(headers, logger);
+    if (!update) {
+      return tokenUpdate;
+    }
+    if (tokenUpdate?.headerSnapshot) {
+      return {
+        ...update,
+        headerSnapshot: { ...update.headerSnapshot, ...tokenUpdate.headerSnapshot },
+      };
+    }
     return update;
   }
@@ -290,7 +299,10 @@ export class OpenAIWindowRateLimitStrategy implements RateLimitStrategy {
     return Number.isFinite(numeric) && numeric > 0 ? numeric : undefined;
   }
 
-  private applyTokenHeaderUpdates(headers: unknown, logger?: Logger): void {
+  private applyTokenHeaderUpdates(
+    headers: unknown,
+    logger?: Logger,
+  ): RateLimitUpdateResult | undefined {
     const rateLimitLogger = logger?.child({ module: "rate-limiter" });
     const limitTokens = readHeaderValue(headers, "x-ratelimit-limit-tokens");
     const remainingTokens = readHeaderValue(headers, "x-ratelimit-remaining-tokens");
@@ -309,7 +321,7 @@
         hasRemaining: !!remainingTokens,
         hasReset: !!resetTokens,
       });
-      return;
+      return undefined;
     }
 
     const now = Date.now();
@@ -326,13 +338,14 @@
           ? Math.min(existing.remaining, clampedRemaining)
           : clampedRemaining;
 
-    this.tokenState = {
+    const state: RateLimitWindowState = {
       limit: effectiveLimit,
       remaining: effectiveRemaining,
       resetAt,
       reserved,
       nextAllowedAt,
     };
+    this.tokenState = state;
 
     rateLimitLogger?.debug?.("OpenAI token headers applied", {
       rateLimitKey: this.key,
@@ -341,6 +354,19 @@
       resetAt,
       retryAfterMs,
     });
+
+    return {
+      key: this.key,
+      headerSnapshot: {
+        limitTokens,
+        remainingTokens,
+        resetTokens,
+        resetTokensMs,
+        retryAfter,
+        retryAfterMs,
+      },
+      state,
+    };
   }
 
   private resolveTokenCount(usage: RateLimitUsage): number {
diff --git a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts
index 653fdaf2f..af398b25f 100644
--- a/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts
+++ b/packages/core/src/traffic/rate-limit-strategies/rate-limit-strategy.ts
@@ -11,6 +11,10 @@ export type RateLimitHeaderSnapshot = {
   remainingRequests?: string;
   resetRequests?: string;
   resetRequestsMs?: number;
+  limitTokens?: string;
+  remainingTokens?: string;
+  resetTokens?: string;
+  resetTokensMs?: number;
   retryAfter?: string;
   retryAfterMs?: number;
 };
diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts
index 9b7221280..bd46114cd 100644
--- a/packages/core/src/traffic/traffic-controller.ts
+++ b/packages/core/src/traffic/traffic-controller.ts
@@ -328,6 +328,7 @@
       resetAt: update.state.resetAt,
       nextAllowedAt: update.state.nextAllowedAt,
       resetRequestsMs: update.headerSnapshot.resetRequestsMs,
+      resetTokensMs: update.headerSnapshot.resetTokensMs,
     });
 
     this.rateLimitSnapshots.set(update.key, {
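
Note: the shape of this fix is a small merge rule — the token-header path now produces its own update instead of only mutating this.tokenState, and the two snapshots are combined. A reduced, self-contained sketch of just that rule, with simplified types standing in for RateLimitUpdateResult:

// Simplified stand-ins for RateLimitUpdateResult / RateLimitHeaderSnapshot.
type Snapshot = Record<string, string | number | undefined>;
type Update = { key: string; headerSnapshot: Snapshot };

// Before the fix, a token-only response (no request-window headers) produced
// `update === undefined` and the token snapshot was dropped on the floor.
// Afterwards the token update is returned on its own, or layered on top of the
// request-window snapshot when both are present.
function mergeHeaderUpdates(update?: Update, tokenUpdate?: Update): Update | undefined {
  if (!update) return tokenUpdate; // token-only headers now persist
  if (tokenUpdate?.headerSnapshot) {
    return {
      ...update,
      headerSnapshot: { ...update.headerSnapshot, ...tokenUpdate.headerSnapshot },
    };
  }
  return update;
}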
From 3852e126cd9911102ecb0cdc262b63f1eb985d6c Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Fri, 26 Dec 2025 15:14:08 +0530
Subject: [PATCH 39/41] fix: unhandled rejection risk

---
 .../core/src/traffic/traffic-usage-tracker.ts | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/packages/core/src/traffic/traffic-usage-tracker.ts b/packages/core/src/traffic/traffic-usage-tracker.ts
index c79b311ad..b75f602a1 100644
--- a/packages/core/src/traffic/traffic-usage-tracker.ts
+++ b/packages/core/src/traffic/traffic-usage-tracker.ts
@@ -29,7 +29,17 @@ export class TrafficUsageTracker {
       return undefined;
     }
 
-    const usage = extractor(result);
+    let usage: UsageCounters | Promise | undefined;
+    try {
+      usage = extractor(result);
+    } catch (error) {
+      usageLogger?.warn?.("Usage extractor threw; skipping usage", {
+        tenantId: item.tenantId,
+        errorName: (error as { name?: unknown } | null)?.name,
+        errorMessage: (error as { message?: unknown } | null)?.message,
+      });
+      return undefined;
+    }
     if (!usage) {
       usageLogger?.trace?.("Usage extractor returned empty; skipping usage", {
         tenantId: item.tenantId,
@@ -41,7 +51,15 @@ export class TrafficUsageTracker {
       usageLogger?.trace?.("Usage extractor returned promise; awaiting", {
         tenantId: item.tenantId,
       });
-      void usage.then((u) => u && this.incrementTenantUsage(item.tenantId, u, usageLogger));
+      void usage
+        .then((u) => u && this.incrementTenantUsage(item.tenantId, u, usageLogger))
+        .catch((error) => {
+          usageLogger?.warn?.("Usage extractor promise rejected; skipping usage", {
+            tenantId: item.tenantId,
+            errorName: (error as { name?: unknown } | null)?.name,
+            errorMessage: (error as { message?: unknown } | null)?.message,
+          });
+        });
       return usage;
     }
     this.incrementTenantUsage(item.tenantId, usage, usageLogger);
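
Note: the pattern behind this fix generalizes to any caller-supplied callback: a synchronous throw and an asynchronous rejection are separate failure paths and need separate guards, because try/catch never observes the rejection of a promise that is not awaited. A generic, self-contained sketch of the pattern, detached from the tracker's types:

// Guard both failure modes of a caller-provided extractor.
function safeExtract<T, U>(
  extractor: (value: T) => U | Promise<U | undefined> | undefined,
  value: T,
  onResult: (result: U) => void,
  onError: (error: unknown) => void,
): void {
  let out: U | Promise<U | undefined> | undefined;
  try {
    out = extractor(value); // synchronous throw lands here
  } catch (error) {
    onError(error);
    return;
  }
  if (out === undefined) return;
  if (out instanceof Promise) {
    // Asynchronous rejection: only a .catch attached to the promise sees it.
    void out
      .then((u) => {
        if (u !== undefined) onResult(u);
      })
      .catch(onError);
    return;
  }
  onResult(out);
}

// Example: a buggy extractor that rejects no longer crashes the process.
safeExtract(
  async () => {
    throw new Error("boom");
  },
  null,
  () => {},
  (error) => console.warn("extractor failed:", error),
);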
From cad68999dd29dd41ca071f2ffdfb5d26d983f5e1 Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Fri, 26 Dec 2025 15:26:35 +0530
Subject: [PATCH 40/41] fix: dispatcher to enforce queue timeouts

---
 .../src/traffic/traffic-controller.spec.ts    | 52 +++++++++++++++++++
 .../core/src/traffic/traffic-controller.ts    | 45 +++++++++++++++-
 2 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/packages/core/src/traffic/traffic-controller.spec.ts b/packages/core/src/traffic/traffic-controller.spec.ts
index 0d40426a1..dee0719f8 100644
--- a/packages/core/src/traffic/traffic-controller.spec.ts
+++ b/packages/core/src/traffic/traffic-controller.spec.ts
@@ -671,6 +671,58 @@ describe("TrafficController stream reporting", () => {
 });
 
 describe("TrafficController queue timeouts", () => {
+  it("times out queued requests even when max concurrency is saturated", async () => {
+    vi.useFakeTimers();
+
+    try {
+      vi.setSystemTime(new Date(0));
+      const controller = new TrafficController({ maxConcurrent: 1 });
+      const order: string[] = [];
+      let releaseFirst!: () => void;
+      const firstGate = new Promise((resolve) => {
+        releaseFirst = resolve;
+      });
+
+      const first = controller.handleText({
+        tenantId: "tenant-a",
+        metadata: { provider: "p", model: "m", priority: "P1" },
+        execute: async () => {
+          order.push("first");
+          await firstGate;
+          return "first";
+        },
+      });
+
+      const second = controller.handleText({
+        tenantId: "tenant-a",
+        metadata: { provider: "p", model: "m", priority: "P1" },
+        maxQueueWaitMs: 1,
+        execute: async () => {
+          order.push("second");
+          return "second";
+        },
+      });
+      const secondExpectation = expect(second).rejects.toHaveProperty(
+        "name",
+        "QueueWaitTimeoutError",
+      );
+
+      await Promise.resolve();
+      expect(order).toEqual(["first"]);
+
+      await vi.advanceTimersByTimeAsync(2);
+      await vi.runAllTimersAsync();
+      await secondExpectation;
+      expect(order).toEqual(["first"]);
+
+      releaseFirst();
+      await vi.runAllTimersAsync();
+      await first;
+    } finally {
+      vi.useRealTimers();
+    }
+  });
+
   it("lets fallback requests wait after queue timeout without rejecting", async () => {
     vi.useFakeTimers();
 
diff --git a/packages/core/src/traffic/traffic-controller.ts b/packages/core/src/traffic/traffic-controller.ts
index bd46114cd..90d56037c 100644
--- a/packages/core/src/traffic/traffic-controller.ts
+++ b/packages/core/src/traffic/traffic-controller.ts
@@ -431,7 +431,13 @@ export class TrafficController {
    */
   private tryDispatchNext(): DispatchDecision {
-    if (this.activeCount >= this.maxConcurrent) return { kind: "wait" };
+    if (this.activeCount >= this.maxConcurrent) {
+      const timeoutSweep = this.processQueueTimeoutsOnly(Date.now());
+      if (timeoutSweep.evicted) return { kind: "skip" };
+      return timeoutSweep.wakeUpAt !== undefined
+        ? { kind: "wait", wakeUpAt: timeoutSweep.wakeUpAt }
+        : { kind: "wait" };
+    }
 
     let earliestWakeUpAt: number | undefined;
 
@@ -937,6 +943,43 @@ export class TrafficController {
     return this.rateLimitKeyBuilder(metadata);
   }
 
+  private processQueueTimeoutsOnly(now: number): { evicted: boolean; wakeUpAt?: number } {
+    let evicted = false;
+    let wakeUpAt: number | undefined;
+
+    const observeWakeUpAt = (candidate?: number): void => {
+      if (candidate === undefined) return;
+      wakeUpAt = wakeUpAt === undefined ? candidate : Math.min(wakeUpAt, candidate);
+    };
+
+    for (const priority of this.priorityOrder) {
+      const state = this.queues[priority];
+      if (state.order.length === 0) continue;
+
+      for (const tenantId of [...state.order]) {
+        const queue = state.queues.get(tenantId);
+        if (!queue || queue.length === 0) {
+          this.removeTenantQueue(priority, tenantId);
+          continue;
+        }
+
+        const next = queue[0];
+        const queueTimeoutAt = this.resolveQueueTimeoutAt(next);
+        if (queueTimeoutAt !== undefined && now < queueTimeoutAt) {
+          observeWakeUpAt(queueTimeoutAt);
+        }
+
+        const queueTimeoutTriggered = this.handleQueueTimeout(next, queue, 0, now, queueTimeoutAt);
+        if (queueTimeoutTriggered === "rejected") {
+          evicted = true;
+          this.cleanupTenantQueue(priority, tenantId, queue);
+        }
+      }
+    }
+
+    return { evicted, wakeUpAt };
+  }
+
   private processQueuedCandidate(
     priority: TrafficPriority,
     candidate: { item: QueuedRequest; queue: QueuedRequest[]; tenantId: string },

From f102264f54109e7a51dad9d9da8dfc509cfc64e4 Mon Sep 17 00:00:00 2001
From: riturajFi
Date: Fri, 26 Dec 2025 15:29:19 +0530
Subject: [PATCH 41/41] fix: avoid closing an already-errored SSE stream
 controller

---
 packages/server-core/src/handlers/agent.handlers.ts | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/packages/server-core/src/handlers/agent.handlers.ts b/packages/server-core/src/handlers/agent.handlers.ts
index 37fbeaf4e..74d479962 100644
--- a/packages/server-core/src/handlers/agent.handlers.ts
+++ b/packages/server-core/src/handlers/agent.handlers.ts
@@ -43,6 +43,7 @@ function wrapStreamWithTraffic(
       const trafficEvent = `data: ${safeStringify({ type: "traffic", traffic })}\n\n`;
       controller.enqueue(encoder.encode(trafficEvent));
       const reader = baseBody.getReader();
+      let didError = false;
       try {
         while (true) {
           const { done, value } = await reader.read();
@@ -52,10 +53,13 @@
           }
         }
       } catch (error) {
+        didError = true;
         controller.error(error);
       } finally {
         reader.releaseLock();
-        controller.close();
+        if (!didError) {
+          controller.close();
+        }
       }
     },
   });
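
Note on PATCH 40: the bug it fixes is subtle — once activeCount saturates, the dispatcher used to return "wait" without ever inspecting the queues, so maxQueueWaitMs could not fire until a slot freed. A simplified, self-contained model of the sweep (the real controller consults resolveQueueTimeoutAt/handleQueueTimeout and per-priority tenant rings; these types are stand-ins):

type Queued = { deadlineAt?: number; reject: (error: Error) => void };

// When every slot is busy we cannot dispatch, but expired waiters must still
// be rejected promptly, and the dispatcher needs the earliest future deadline
// so it can schedule a wake-up instead of sleeping indefinitely.
function sweepTimeouts(
  queues: Map<string, Queued[]>,
  now: number,
): { evicted: boolean; wakeUpAt?: number } {
  let evicted = false;
  let wakeUpAt: number | undefined;

  for (const [tenantId, queue] of queues) {
    const head = queue[0];
    if (!head?.deadlineAt) continue;
    if (now >= head.deadlineAt) {
      queue.shift();
      head.reject(new Error(`queue wait timed out for ${tenantId}`));
      evicted = true;
    } else {
      wakeUpAt = wakeUpAt === undefined ? head.deadlineAt : Math.min(wakeUpAt, head.deadlineAt);
    }
  }
  return { evicted, wakeUpAt };
}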
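
Note on PATCH 41: it guards a ReadableStream rule that is easy to trip over — once controller.error(...) has been called, the stream is in the errored state and a later controller.close() throws a TypeError. The didError flag is the standard way out. A reduced, runnable reproduction of the fixed relay loop (the surrounding handler plumbing is omitted):

import { ReadableStream } from "node:stream/web";

// Relay one stream into another; close() must be skipped after error().
function relay(source: ReadableStream<Uint8Array>): ReadableStream<Uint8Array> {
  return new ReadableStream<Uint8Array>({
    async start(controller) {
      const reader = source.getReader();
      let didError = false;
      try {
        while (true) {
          const { done, value } = await reader.read();
          if (done) break;
          if (value) controller.enqueue(value);
        }
      } catch (error) {
        didError = true;
        controller.error(error); // stream is now errored; close() would throw
      } finally {
        reader.releaseLock();
        if (!didError) controller.close();
      }
    },
  });
}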