diff --git a/config.example.yaml b/config.example.yaml index f46158c33..63ded611c 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -105,7 +105,7 @@ ws-auth: false # excluded-models: # - "claude-opus-4-5-20251101" # exclude specific models (exact match) # - "claude-3-*" # wildcard matching prefix (e.g. claude-3-7-sonnet-20250219) -# - "*-think" # wildcard matching suffix (e.g. claude-opus-4-5-thinking) +# - "*-thinking" # wildcard matching suffix (e.g. claude-opus-4-5-thinking) # - "*haiku*" # wildcard matching substring (e.g. claude-3-5-haiku-20241022) # Kiro (AWS CodeWhisperer) configuration diff --git a/internal/registry/model_definitions.go b/internal/registry/model_definitions.go index 82944609b..59881c60b 100644 --- a/internal/registry/model_definitions.go +++ b/internal/registry/model_definitions.go @@ -16,6 +16,7 @@ func GetClaudeModels() []*ModelInfo { DisplayName: "Claude 4.5 Haiku", ContextLength: 200000, MaxCompletionTokens: 64000, + // Thinking: not supported for Haiku models }, { ID: "claude-sonnet-4-5-20250929", @@ -49,6 +50,7 @@ func GetClaudeModels() []*ModelInfo { DisplayName: "Claude 4.1 Opus", ContextLength: 200000, MaxCompletionTokens: 32000, + Thinking: &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true}, }, { ID: "claude-opus-4-20250514", @@ -59,6 +61,7 @@ func GetClaudeModels() []*ModelInfo { DisplayName: "Claude 4 Opus", ContextLength: 200000, MaxCompletionTokens: 32000, + Thinking: &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true}, }, { ID: "claude-sonnet-4-20250514", @@ -69,6 +72,7 @@ func GetClaudeModels() []*ModelInfo { DisplayName: "Claude 4 Sonnet", ContextLength: 200000, MaxCompletionTokens: 64000, + Thinking: &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true}, }, { ID: "claude-3-7-sonnet-20250219", @@ -79,6 +83,7 @@ func GetClaudeModels() []*ModelInfo { DisplayName: "Claude 3.7 Sonnet", ContextLength: 128000, MaxCompletionTokens: 8192, + Thinking: &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true}, }, { ID: "claude-3-5-haiku-20241022", @@ -89,6 +94,7 @@ func GetClaudeModels() []*ModelInfo { DisplayName: "Claude 3.5 Haiku", ContextLength: 128000, MaxCompletionTokens: 8192, + // Thinking: not supported for Haiku models }, } } @@ -476,6 +482,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"minimal", "low", "medium", "high"}}, }, { ID: "gpt-5-codex", @@ -489,6 +496,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}, }, { ID: "gpt-5-codex-mini", @@ -502,6 +510,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}, }, { ID: "gpt-5.1", @@ -515,6 +524,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"none", "low", "medium", "high"}}, }, { ID: "gpt-5.1-codex", @@ -528,6 +538,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"low", 
"medium", "high"}}, }, { ID: "gpt-5.1-codex-mini", @@ -541,6 +552,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}, }, { ID: "gpt-5.1-codex-max", @@ -554,6 +566,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high", "xhigh"}}, }, } } @@ -610,6 +623,7 @@ func GetIFlowModels() []*ModelInfo { DisplayName string Description string Created int64 + Thinking *ThinkingSupport }{ {ID: "tstars2.0", DisplayName: "TStars-2.0", Description: "iFlow TStars-2.0 multimodal assistant", Created: 1746489600}, {ID: "qwen3-coder-plus", DisplayName: "Qwen3-Coder-Plus", Description: "Qwen3 Coder Plus code generation", Created: 1753228800}, @@ -619,17 +633,17 @@ func GetIFlowModels() []*ModelInfo { {ID: "kimi-k2-0905", DisplayName: "Kimi-K2-Instruct-0905", Description: "Moonshot Kimi K2 instruct 0905", Created: 1757030400}, {ID: "glm-4.6", DisplayName: "GLM-4.6", Description: "Zhipu GLM 4.6 general model", Created: 1759190400}, {ID: "kimi-k2", DisplayName: "Kimi-K2", Description: "Moonshot Kimi K2 general model", Created: 1752192000}, - {ID: "kimi-k2-thinking", DisplayName: "Kimi-K2-Thinking", Description: "Moonshot Kimi K2 general model", Created: 1762387200}, + {ID: "kimi-k2-thinking", DisplayName: "Kimi-K2-Thinking", Description: "Moonshot Kimi K2 thinking model", Created: 1762387200, Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}}, {ID: "deepseek-v3.2-chat", DisplayName: "DeepSeek-V3.2", Description: "DeepSeek V3.2", Created: 1764576000}, {ID: "deepseek-v3.2", DisplayName: "DeepSeek-V3.2-Exp", Description: "DeepSeek V3.2 experimental", Created: 1759104000}, {ID: "deepseek-v3.1", DisplayName: "DeepSeek-V3.1-Terminus", Description: "DeepSeek V3.1 Terminus", Created: 1756339200}, - {ID: "deepseek-r1", DisplayName: "DeepSeek-R1", Description: "DeepSeek reasoning model R1", Created: 1737331200}, + {ID: "deepseek-r1", DisplayName: "DeepSeek-R1", Description: "DeepSeek reasoning model R1", Created: 1737331200, Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}}, {ID: "deepseek-v3", DisplayName: "DeepSeek-V3-671B", Description: "DeepSeek V3 671B", Created: 1734307200}, {ID: "qwen3-32b", DisplayName: "Qwen3-32B", Description: "Qwen3 32B", Created: 1747094400}, - {ID: "qwen3-235b-a22b-thinking-2507", DisplayName: "Qwen3-235B-A22B-Thinking", Description: "Qwen3 235B A22B Thinking (2507)", Created: 1753401600}, + {ID: "qwen3-235b-a22b-thinking-2507", DisplayName: "Qwen3-235B-A22B-Thinking", Description: "Qwen3 235B A22B Thinking (2507)", Created: 1753401600, Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}}, {ID: "qwen3-235b-a22b-instruct", DisplayName: "Qwen3-235B-A22B-Instruct", Description: "Qwen3 235B A22B Instruct", Created: 1753401600}, {ID: "qwen3-235b", DisplayName: "Qwen3-235B-A22B", Description: "Qwen3 235B A22B", Created: 1753401600}, - {ID: "minimax-m2", DisplayName: "MiniMax-M2", Description: "MiniMax M2", Created: 1758672000}, + {ID: "minimax-m2", DisplayName: "MiniMax-M2", Description: "MiniMax M2", Created: 1758672000, Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}}, } models := make([]*ModelInfo, 0, len(entries)) for _, entry := range entries { @@ -641,6 +655,7 @@ func GetIFlowModels() []*ModelInfo { Type: "iflow", 
DisplayName: entry.DisplayName, Description: entry.Description, + Thinking: entry.Thinking, }) } return models diff --git a/internal/registry/model_registry.go b/internal/registry/model_registry.go index 5ef9007f9..f3517bde8 100644 --- a/internal/registry/model_registry.go +++ b/internal/registry/model_registry.go @@ -63,6 +63,9 @@ type ThinkingSupport struct { ZeroAllowed bool `json:"zero_allowed,omitempty"` // DynamicAllowed indicates whether -1 is a valid value (dynamic thinking budget). DynamicAllowed bool `json:"dynamic_allowed,omitempty"` + // Levels defines discrete reasoning effort levels (e.g., "low", "medium", "high"). + // When set, the model uses level-based reasoning instead of token budgets. + Levels []string `json:"levels,omitempty"` } // ModelRegistration tracks a model's availability diff --git a/internal/runtime/executor/antigravity_executor.go b/internal/runtime/executor/antigravity_executor.go index a7289c64c..e9ae3dc09 100644 --- a/internal/runtime/executor/antigravity_executor.go +++ b/internal/runtime/executor/antigravity_executor.go @@ -39,7 +39,7 @@ const ( defaultAntigravityAgent = "antigravity/1.11.5 windows/amd64" antigravityAuthType = "antigravity" refreshSkew = 3000 * time.Second - streamScannerBuffer int = 20_971_520 + streamScannerBuffer int = 52_428_800 // 50MB ) var ( diff --git a/internal/runtime/executor/claude_executor.go b/internal/runtime/executor/claude_executor.go index c7470954e..67671026f 100644 --- a/internal/runtime/executor/claude_executor.go +++ b/internal/runtime/executor/claude_executor.go @@ -254,7 +254,7 @@ func (e *ClaudeExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A // If from == to (Claude → Claude), directly forward the SSE stream without translation if from == to { scanner := bufio.NewScanner(decodedBody) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB for scanner.Scan() { line := scanner.Bytes() appendAPIResponseChunk(ctx, e.cfg, line) @@ -277,7 +277,7 @@ func (e *ClaudeExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A // For other formats, use translation scanner := bufio.NewScanner(decodedBody) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() @@ -450,59 +450,15 @@ func extractAndRemoveBetas(body []byte) ([]string, []byte) { return betas, body } -// injectThinkingConfig adds thinking configuration based on metadata or legacy suffixes. +// injectThinkingConfig adds thinking configuration based on metadata using the unified flow. +// It uses util.ResolveClaudeThinkingConfig which internally calls ResolveThinkingConfigFromMetadata +// and NormalizeThinkingBudget, ensuring consistency with other executors like Gemini. 
func (e *ClaudeExecutor) injectThinkingConfig(modelName string, metadata map[string]any, body []byte) []byte { - // Only inject if thinking config is not already present - if gjson.GetBytes(body, "thinking").Exists() { + budget, ok := util.ResolveClaudeThinkingConfig(modelName, metadata) + if !ok { return body } - - budgetTokens, ok := resolveClaudeThinkingBudget(modelName, metadata) - if !ok || budgetTokens <= 0 { - return body - } - - body, _ = sjson.SetBytes(body, "thinking.type", "enabled") - body, _ = sjson.SetBytes(body, "thinking.budget_tokens", budgetTokens) - return body -} - -func resolveClaudeThinkingBudget(modelName string, metadata map[string]any) (int, bool) { - budget, include, effort, matched := util.ThinkingFromMetadata(metadata) - if matched { - if include != nil && !*include { - return 0, false - } - if budget != nil { - normalized := util.NormalizeThinkingBudget(modelName, *budget) - if normalized > 0 { - return normalized, true - } - return 0, false - } - if effort != nil { - if derived, ok := util.ThinkingEffortToBudget(modelName, *effort); ok && derived > 0 { - return derived, true - } - } - } - return claudeBudgetFromSuffix(modelName) -} - -func claudeBudgetFromSuffix(modelName string) (int, bool) { - lower := strings.ToLower(strings.TrimSpace(modelName)) - switch { - case strings.HasSuffix(lower, "-thinking-low"): - return 1024, true - case strings.HasSuffix(lower, "-thinking-medium"): - return 8192, true - case strings.HasSuffix(lower, "-thinking-high"): - return 24576, true - case strings.HasSuffix(lower, "-thinking"): - return 8192, true - default: - return 0, false - } + return util.ApplyClaudeThinkingConfig(body, budget) } // ensureMaxTokensForThinking ensures max_tokens > thinking.budget_tokens when thinking is enabled. 
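Reviewer sketch (not part of the patch): the unified flow the Claude executor now delegates to. NormalizeThinkingModel, ResolveClaudeThinkingConfig, and ApplyClaudeThinkingConfig are the util helpers introduced below; the model name and budget are illustrative, and this assumes the model registers a token-budget thinking range:

  // model == "claude-sonnet-4-5-20250929"; metadata carries the 16384 budget
  model, metadata := util.NormalizeThinkingModel("claude-sonnet-4-5-20250929(16384)")
  body := []byte(`{"model":"claude-sonnet-4-5-20250929","max_tokens":32000}`)
  if budget, ok := util.ResolveClaudeThinkingConfig(model, metadata); ok {
      // Injects {"thinking":{"type":"enabled","budget_tokens":16384}} unless
      // the caller already supplied a thinking block.
      body = util.ApplyClaudeThinkingConfig(body, budget)
  }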
diff --git a/internal/runtime/executor/codex_executor.go b/internal/runtime/executor/codex_executor.go index 511b4f45d..558c06479 100644 --- a/internal/runtime/executor/codex_executor.go +++ b/internal/runtime/executor/codex_executor.go @@ -54,7 +54,11 @@ func (e *CodexExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, re from := opts.SourceFormat to := sdktranslator.FromString("codex") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false) - body = applyReasoningEffortMetadata(body, req.Metadata, req.Model) + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning.effort") + body = normalizeThinkingConfig(body, upstreamModel) + if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { + return resp, errValidate + } body = applyPayloadConfig(e.cfg, req.Model, body) body, _ = sjson.SetBytes(body, "model", upstreamModel) body, _ = sjson.SetBytes(body, "stream", true) @@ -148,7 +152,11 @@ func (e *CodexExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au to := sdktranslator.FromString("codex") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true) - body = applyReasoningEffortMetadata(body, req.Metadata, req.Model) + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning.effort") + body = normalizeThinkingConfig(body, upstreamModel) + if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { + return nil, errValidate + } body = applyPayloadConfig(e.cfg, req.Model, body) body, _ = sjson.DeleteBytes(body, "previous_response_id") body, _ = sjson.SetBytes(body, "model", upstreamModel) @@ -208,7 +216,7 @@ func (e *CodexExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au } }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() @@ -246,7 +254,7 @@ func (e *CodexExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Auth modelForCounting := req.Model - body = applyReasoningEffortMetadata(body, req.Metadata, req.Model) + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning.effort") body, _ = sjson.SetBytes(body, "model", upstreamModel) body, _ = sjson.DeleteBytes(body, "previous_response_id") body, _ = sjson.SetBytes(body, "stream", false) diff --git a/internal/runtime/executor/gemini_cli_executor.go b/internal/runtime/executor/gemini_cli_executor.go index a2e0ececd..2c4f3f881 100644 --- a/internal/runtime/executor/gemini_cli_executor.go +++ b/internal/runtime/executor/gemini_cli_executor.go @@ -309,7 +309,7 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut }() if opts.Alt == "" { scanner := bufio.NewScanner(resp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git a/internal/runtime/executor/gemini_executor.go b/internal/runtime/executor/gemini_executor.go index bd214b109..7b94b1455 100644 --- a/internal/runtime/executor/gemini_executor.go +++ b/internal/runtime/executor/gemini_executor.go @@ -249,7 +249,7 @@ func (e *GeminiExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A } }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git 
a/internal/runtime/executor/gemini_vertex_executor.go b/internal/runtime/executor/gemini_vertex_executor.go index cb41df48f..51a6118c2 100644 --- a/internal/runtime/executor/gemini_vertex_executor.go +++ b/internal/runtime/executor/gemini_vertex_executor.go @@ -579,7 +579,7 @@ func (e *GeminiVertexExecutor) executeStreamWithServiceAccount(ctx context.Conte } }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() @@ -696,7 +696,7 @@ func (e *GeminiVertexExecutor) executeStreamWithAPIKey(ctx context.Context, auth } }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git a/internal/runtime/executor/iflow_executor.go b/internal/runtime/executor/iflow_executor.go index c68a64315..60aa539f7 100644 --- a/internal/runtime/executor/iflow_executor.go +++ b/internal/runtime/executor/iflow_executor.go @@ -57,10 +57,15 @@ func (e *IFlowExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, re from := opts.SourceFormat to := sdktranslator.FromString("openai") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false) - body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model) - if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort") + upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) + if upstreamModel != "" { body, _ = sjson.SetBytes(body, "model", upstreamModel) } + body = normalizeThinkingConfig(body, upstreamModel) + if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { + return resp, errValidate + } body = applyPayloadConfig(e.cfg, req.Model, body) endpoint := strings.TrimSuffix(baseURL, "/") + iflowDefaultEndpoint @@ -143,10 +148,15 @@ func (e *IFlowExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au to := sdktranslator.FromString("openai") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true) - body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model) - if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort") + upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) + if upstreamModel != "" { body, _ = sjson.SetBytes(body, "model", upstreamModel) } + body = normalizeThinkingConfig(body, upstreamModel) + if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { + return nil, errValidate + } // Ensure tools array exists to avoid provider quirks similar to Qwen's behaviour. 
toolsResult := gjson.GetBytes(body, "tools") if toolsResult.Exists() && toolsResult.IsArray() && len(toolsResult.Array()) == 0 { @@ -209,7 +219,7 @@ func (e *IFlowExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git a/internal/runtime/executor/openai_compat_executor.go b/internal/runtime/executor/openai_compat_executor.go index 93122c202..ac932c0be 100644 --- a/internal/runtime/executor/openai_compat_executor.go +++ b/internal/runtime/executor/openai_compat_executor.go @@ -58,10 +58,15 @@ func (e *OpenAICompatExecutor) Execute(ctx context.Context, auth *cliproxyauth.A translated = e.overrideModel(translated, modelOverride) } translated = applyPayloadConfigWithRoot(e.cfg, req.Model, to.String(), "", translated) - translated = applyReasoningEffortMetadataChatCompletions(translated, req.Metadata, req.Model) - if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { + translated = applyReasoningEffortMetadata(translated, req.Metadata, req.Model, "reasoning_effort") + upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) + if upstreamModel != "" { translated, _ = sjson.SetBytes(translated, "model", upstreamModel) } + translated = normalizeThinkingConfig(translated, upstreamModel) + if errValidate := validateThinkingConfig(translated, upstreamModel); errValidate != nil { + return resp, errValidate + } url := strings.TrimSuffix(baseURL, "/") + "/chat/completions" httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(translated)) @@ -147,10 +152,15 @@ func (e *OpenAICompatExecutor) ExecuteStream(ctx context.Context, auth *cliproxy translated = e.overrideModel(translated, modelOverride) } translated = applyPayloadConfigWithRoot(e.cfg, req.Model, to.String(), "", translated) - translated = applyReasoningEffortMetadataChatCompletions(translated, req.Metadata, req.Model) - if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { + translated = applyReasoningEffortMetadata(translated, req.Metadata, req.Model, "reasoning_effort") + upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) + if upstreamModel != "" { translated, _ = sjson.SetBytes(translated, "model", upstreamModel) } + translated = normalizeThinkingConfig(translated, upstreamModel) + if errValidate := validateThinkingConfig(translated, upstreamModel); errValidate != nil { + return nil, errValidate + } url := strings.TrimSuffix(baseURL, "/") + "/chat/completions" httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(translated)) @@ -214,7 +224,7 @@ func (e *OpenAICompatExecutor) ExecuteStream(ctx context.Context, auth *cliproxy } }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git a/internal/runtime/executor/payload_helpers.go b/internal/runtime/executor/payload_helpers.go index 37e3141a3..9bc82f1f3 100644 --- a/internal/runtime/executor/payload_helpers.go +++ b/internal/runtime/executor/payload_helpers.go @@ -1,6 +1,8 @@ package executor import ( + "fmt" + "net/http" "strings" "github.com/router-for-me/CLIProxyAPI/v6/internal/config" @@ -9,7 +11,7 @@ import ( "github.com/tidwall/sjson" ) -// applyThinkingMetadata applies thinking config 
from model suffix metadata (e.g., -reasoning, -thinking-N) +// applyThinkingMetadata applies thinking config from model suffix metadata (e.g., (high), (8192)) // for standard Gemini format payloads. It normalizes the budget when the model supports thinking. func applyThinkingMetadata(payload []byte, metadata map[string]any, model string) []byte { budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(model, metadata) @@ -26,7 +28,7 @@ func applyThinkingMetadata(payload []byte, metadata map[string]any, model string return util.ApplyGeminiThinkingConfig(payload, budgetOverride, includeOverride) } -// applyThinkingMetadataCLI applies thinking config from model suffix metadata (e.g., -reasoning, -thinking-N) +// applyThinkingMetadataCLI applies thinking config from model suffix metadata (e.g., (high), (8192)) // for Gemini CLI format payloads (nested under "request"). It normalizes the budget when the model supports thinking. func applyThinkingMetadataCLI(payload []byte, metadata map[string]any, model string) []byte { budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(model, metadata) @@ -43,40 +45,21 @@ func applyThinkingMetadataCLI(payload []byte, metadata map[string]any, model str return util.ApplyGeminiCLIThinkingConfig(payload, budgetOverride, includeOverride) } -// applyReasoningEffortMetadata applies reasoning effort overrides (reasoning.effort) when present in metadata. -// It avoids overwriting an existing reasoning.effort field and only applies to models that support thinking. -func applyReasoningEffortMetadata(payload []byte, metadata map[string]any, model string) []byte { +// applyReasoningEffortMetadata applies reasoning effort overrides from metadata to the given JSON path. +// Metadata values take precedence over any existing field when the model supports thinking, intentionally +// overwriting caller-provided values to honor suffix/default metadata priority. +func applyReasoningEffortMetadata(payload []byte, metadata map[string]any, model, field string) []byte { if len(metadata) == 0 { return payload } if !util.ModelSupportsThinking(model) { return payload } - if gjson.GetBytes(payload, "reasoning.effort").Exists() { + if field == "" { return payload } if effort, ok := util.ReasoningEffortFromMetadata(metadata); ok && effort != "" { - if updated, err := sjson.SetBytes(payload, "reasoning.effort", effort); err == nil { - return updated - } - } - return payload -} - -// applyReasoningEffortMetadataChatCompletions applies reasoning_effort (OpenAI chat completions field) -// when present in metadata. It avoids overwriting an existing reasoning_effort field. -func applyReasoningEffortMetadataChatCompletions(payload []byte, metadata map[string]any, model string) []byte { - if len(metadata) == 0 { - return payload - } - if !util.ModelSupportsThinking(model) { - return payload - } - if gjson.GetBytes(payload, "reasoning_effort").Exists() { - return payload - } - if effort, ok := util.ReasoningEffortFromMetadata(metadata); ok && effort != "" { - if updated, err := sjson.SetBytes(payload, "reasoning_effort", effort); err == nil { + if updated, err := sjson.SetBytes(payload, field, effort); err == nil { return updated } } @@ -232,3 +215,93 @@ func matchModelPattern(pattern, model string) bool { } return pi == len(pattern) } + +// normalizeThinkingConfig normalizes thinking-related fields in the payload +// based on model capabilities. For models without thinking support, it strips +// reasoning fields. 
For models with level-based thinking, it validates and +// normalizes the reasoning effort level. +func normalizeThinkingConfig(payload []byte, model string) []byte { + if len(payload) == 0 || model == "" { + return payload + } + + if !util.ModelSupportsThinking(model) { + return stripThinkingFields(payload) + } + + if util.ModelUsesThinkingLevels(model) { + return normalizeReasoningEffortLevel(payload, model) + } + + return payload +} + +// stripThinkingFields removes thinking-related fields from the payload for +// models that do not support thinking. +func stripThinkingFields(payload []byte) []byte { + fieldsToRemove := []string{ + "reasoning", + "reasoning_effort", + "reasoning.effort", + } + out := payload + for _, field := range fieldsToRemove { + if gjson.GetBytes(out, field).Exists() { + out, _ = sjson.DeleteBytes(out, field) + } + } + return out +} + +// normalizeReasoningEffortLevel validates and normalizes the reasoning_effort +// or reasoning.effort field for level-based thinking models. +func normalizeReasoningEffortLevel(payload []byte, model string) []byte { + out := payload + + if effort := gjson.GetBytes(out, "reasoning_effort"); effort.Exists() { + if normalized, ok := util.NormalizeReasoningEffortLevel(model, effort.String()); ok { + out, _ = sjson.SetBytes(out, "reasoning_effort", normalized) + } + } + + if effort := gjson.GetBytes(out, "reasoning.effort"); effort.Exists() { + if normalized, ok := util.NormalizeReasoningEffortLevel(model, effort.String()); ok { + out, _ = sjson.SetBytes(out, "reasoning.effort", normalized) + } + } + + return out +} + +// validateThinkingConfig checks for unsupported reasoning levels on level-based models. +// Returns a statusErr with 400 when an unsupported level is supplied to avoid silently +// downgrading requests. 
+func validateThinkingConfig(payload []byte, model string) error { + if len(payload) == 0 || model == "" { + return nil + } + if !util.ModelSupportsThinking(model) || !util.ModelUsesThinkingLevels(model) { + return nil + } + + levels := util.GetModelThinkingLevels(model) + checkField := func(path string) error { + if effort := gjson.GetBytes(payload, path); effort.Exists() { + if _, ok := util.NormalizeReasoningEffortLevel(model, effort.String()); !ok { + return statusErr{ + code: http.StatusBadRequest, + msg: fmt.Sprintf("unsupported reasoning effort level %q for model %s (supported: %s)", effort.String(), model, strings.Join(levels, ", ")), + } + } + } + return nil + } + + if err := checkField("reasoning_effort"); err != nil { + return err + } + if err := checkField("reasoning.effort"); err != nil { + return err + } + return nil +} diff --git a/internal/runtime/executor/qwen_executor.go b/internal/runtime/executor/qwen_executor.go index f060cb61e..c8e2974a7 100644 --- a/internal/runtime/executor/qwen_executor.go +++ b/internal/runtime/executor/qwen_executor.go @@ -51,10 +51,15 @@ func (e *QwenExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req from := opts.SourceFormat to := sdktranslator.FromString("openai") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false) - body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model) - if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort") + upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) + if upstreamModel != "" { body, _ = sjson.SetBytes(body, "model", upstreamModel) } + body = normalizeThinkingConfig(body, upstreamModel) + if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { + return resp, errValidate + } body = applyPayloadConfig(e.cfg, req.Model, body) url := strings.TrimSuffix(baseURL, "/") + "/chat/completions" @@ -126,10 +131,15 @@ func (e *QwenExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Aut to := sdktranslator.FromString("openai") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true) - body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model) - if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort") + upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) + if upstreamModel != "" { body, _ = sjson.SetBytes(body, "model", upstreamModel) } + body = normalizeThinkingConfig(body, upstreamModel) + if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { + return nil, errValidate + } toolsResult := gjson.GetBytes(body, "tools") // I'm addressing the Qwen3 "poisoning" issue, which is caused by the model needing a tool to be defined. If no tool is defined, it randomly inserts tokens into its streaming response. // This will have no real consequences. It's just to scare Qwen3. 
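Reviewer sketch (not part of the patch): the normalize-then-validate pair wired into each OpenAI-compatible executor, shown here as it would run inside package executor. Assuming "gpt-5.1" registers the levels none/low/medium/high as in model_definitions.go above, case differences are canonicalized and unknown levels fail fast with a 400 instead of being silently downgraded:

  body := []byte(`{"model":"gpt-5.1","reasoning_effort":"High"}`)
  body = normalizeThinkingConfig(body, "gpt-5.1") // canonicalizes to "high"
  if err := validateThinkingConfig(body, "gpt-5.1"); err != nil {
      return nil, err // e.g. "xhigh" here would yield a statusErr with code 400
  }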
@@ -190,7 +200,7 @@ func (e *QwenExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Aut } }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git a/internal/translator/antigravity/claude/antigravity_claude_response.go b/internal/translator/antigravity/claude/antigravity_claude_response.go index 42265e80e..28785a8fb 100644 --- a/internal/translator/antigravity/claude/antigravity_claude_response.go +++ b/internal/translator/antigravity/claude/antigravity_claude_response.go @@ -35,6 +35,7 @@ type Params struct { TotalTokenCount int64 // Cached total token count from usage metadata HasSentFinalEvents bool // Indicates if final content/message events have been sent HasToolUse bool // Indicates if tool use was observed in the stream + HasContent bool // Tracks whether any content (text, thinking, or tool use) has been output } // toolUseIDCounter provides a process-wide unique counter for tool use identifiers. @@ -69,11 +70,14 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq if bytes.Equal(rawJSON, []byte("[DONE]")) { output := "" - appendFinalEvents(params, &output, true) - - return []string{ - output + "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n", + // Only send final events if we have actually output content + if params.HasContent { + appendFinalEvents(params, &output, true) + return []string{ + output + "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n", + } } + return []string{} } output := "" @@ -119,10 +123,12 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq output = output + "event: content_block_delta\n" data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"signature_delta","signature":""}}`, params.ResponseIndex), "delta.signature", thoughtSignature.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) + params.HasContent = true } else if params.ResponseType == 2 { // Continue existing thinking block if already in thinking state output = output + "event: content_block_delta\n" data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, params.ResponseIndex), "delta.thinking", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) + params.HasContent = true } else { // Transition from another state to thinking // First, close any existing content block @@ -146,6 +152,7 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, params.ResponseIndex), "delta.thinking", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) params.ResponseType = 2 // Set state to thinking + params.HasContent = true } } else { finishReasonResult := gjson.GetBytes(rawJSON, "response.candidates.0.finishReason") @@ -156,6 +163,7 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq output = output + "event: content_block_delta\n" data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, params.ResponseIndex), "delta.text", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) + params.HasContent = true } else { // Transition from another state 
to text content // First, close any existing content block @@ -179,6 +187,7 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, params.ResponseIndex), "delta.text", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) params.ResponseType = 1 // Set state to content + params.HasContent = true } } } @@ -230,6 +239,7 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq output = output + fmt.Sprintf("data: %s\n\n\n", data) } params.ResponseType = 3 + params.HasContent = true } } } @@ -269,6 +279,11 @@ func appendFinalEvents(params *Params, output *string, force bool) { return } + // Only send final events if we have actually output content + if !params.HasContent { + return + } + if params.ResponseType != 0 { *output = *output + "event: content_block_stop\n" *output = *output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, params.ResponseIndex) diff --git a/internal/translator/claude/gemini/claude_gemini_response.go b/internal/translator/claude/gemini/claude_gemini_response.go index 72e1820ce..c77cca4bc 100644 --- a/internal/translator/claude/gemini/claude_gemini_response.go +++ b/internal/translator/claude/gemini/claude_gemini_response.go @@ -331,9 +331,8 @@ func ConvertClaudeResponseToGeminiNonStream(_ context.Context, modelName string, streamingEvents := make([][]byte, 0) scanner := bufio.NewScanner(bytes.NewReader(rawJSON)) - // Use a smaller initial buffer (64KB) that can grow up to 20MB if needed - // This prevents allocating 20MB for every request regardless of size - scanner.Buffer(make([]byte, 64*1024), 20_971_520) + buffer := make([]byte, 52_428_800) // 50MB + scanner.Buffer(buffer, 52_428_800) for scanner.Scan() { line := scanner.Bytes() // log.Debug(string(line)) diff --git a/internal/translator/claude/openai/responses/claude_openai-responses_response.go b/internal/translator/claude/openai/responses/claude_openai-responses_response.go index 77507f97b..252967f14 100644 --- a/internal/translator/claude/openai/responses/claude_openai-responses_response.go +++ b/internal/translator/claude/openai/responses/claude_openai-responses_response.go @@ -445,8 +445,8 @@ func ConvertClaudeResponseToOpenAIResponsesNonStream(_ context.Context, _ string // Use a simple scanner to iterate through raw bytes // Note: extremely large responses may require increasing the buffer scanner := bufio.NewScanner(bytes.NewReader(rawJSON)) - buf := make([]byte, 20_971_520) - scanner.Buffer(buf, 20_971_520) + buf := make([]byte, 52_428_800) // 50MB + scanner.Buffer(buf, 52_428_800) for scanner.Scan() { line := scanner.Bytes() if !bytes.HasPrefix(line, dataTag) { diff --git a/internal/translator/gemini-cli/claude/gemini-cli_claude_response.go b/internal/translator/gemini-cli/claude/gemini-cli_claude_response.go index ba9f68018..920610865 100644 --- a/internal/translator/gemini-cli/claude/gemini-cli_claude_response.go +++ b/internal/translator/gemini-cli/claude/gemini-cli_claude_response.go @@ -26,6 +26,7 @@ type Params struct { HasFirstResponse bool // Indicates if the initial message_start event has been sent ResponseType int // Current response type: 0=none, 1=content, 2=thinking, 3=function ResponseIndex int // Index counter for content blocks in the streaming response + HasContent bool // Tracks whether any content (text, thinking, or tool use) has been output } // toolUseIDCounter provides a 
process-wide unique counter for tool use identifiers. @@ -57,9 +58,13 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque } if bytes.Equal(rawJSON, []byte("[DONE]")) { - return []string{ - "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n", + // Only send message_stop if we have actually output content + if (*param).(*Params).HasContent { + return []string{ + "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n", + } } + return []string{} } // Track whether tools are being used in this response chunk @@ -107,7 +112,8 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque if (*param).(*Params).ResponseType == 2 { sb.WriteString("event: content_block_delta\n") data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, (*param).(*Params).ResponseIndex), "delta.thinking", partTextResult.String()) sb.WriteString(fmt.Sprintf("data: %s\n\n\n", data)) + (*param).(*Params).HasContent = true } else { // Transition from another state to thinking // First, close any existing content block @@ -126,6 +132,7 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, (*param).(*Params).ResponseIndex), "delta.thinking", partTextResult.String()) sb.WriteString(fmt.Sprintf("data: %s\n\n\n", data)) (*param).(*Params).ResponseType = 2 // Set state to thinking + (*param).(*Params).HasContent = true } } else { // Process regular text content (user-visible output) @@ -133,7 +140,8 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque if (*param).(*Params).ResponseType == 1 { sb.WriteString("event: content_block_delta\n") data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, (*param).(*Params).ResponseIndex), "delta.text", partTextResult.String()) sb.WriteString(fmt.Sprintf("data: %s\n\n\n", data)) + (*param).(*Params).HasContent = true } else { // Transition from another state to text content // First, close any existing content block @@ -152,6 +160,7 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, (*param).(*Params).ResponseIndex), "delta.text", partTextResult.String()) sb.WriteString(fmt.Sprintf("data: %s\n\n\n", data)) (*param).(*Params).ResponseType = 1 // Set state to content + (*param).(*Params).HasContent = true } } } else if functionCallResult.Exists() { @@ -194,6 +203,7 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque sb.WriteString(fmt.Sprintf("data: %s\n\n\n", data)) } (*param).(*Params).ResponseType = 3 + (*param).(*Params).HasContent = true } } } @@ -202,28 +212,31 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque // Process usage metadata and finish reason when present in the response if usageResult.Exists() && bytes.Contains(rawJSON, []byte(`"finishReason"`)) { if candidatesTokenCountResult := usageResult.Get("candidatesTokenCount"); candidatesTokenCountResult.Exists() { - // Close the final content block - sb.WriteString("event: content_block_stop\n") - 
sb.WriteString(fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex)) - sb.WriteString("\n\n\n") - - // Send the final message delta with usage information and stop reason - sb.WriteString("event: message_delta\n") - sb.WriteString(`data: `) - - // Create the message delta template with appropriate stop reason - template := `{"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` - // Set tool_use stop reason if tools were used in this response - if usedTool { - template = `{"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` - } + // Only send final events if we have actually output content + if (*param).(*Params).HasContent { + // Close the final content block + sb.WriteString("event: content_block_stop\n") + sb.WriteString(fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex)) + sb.WriteString("\n\n\n") + + // Send the final message delta with usage information and stop reason + sb.WriteString("event: message_delta\n") + sb.WriteString(`data: `) + + // Create the message delta template with appropriate stop reason + template := `{"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` + // Set tool_use stop reason if tools were used in this response + if usedTool { + template = `{"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` + } - // Include thinking tokens in output token count if present - thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int() - template, _ = sjson.Set(template, "usage.output_tokens", candidatesTokenCountResult.Int()+thoughtsTokenCount) - template, _ = sjson.Set(template, "usage.input_tokens", usageResult.Get("promptTokenCount").Int()) + // Include thinking tokens in output token count if present + thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int() + template, _ = sjson.Set(template, "usage.output_tokens", candidatesTokenCountResult.Int()+thoughtsTokenCount) + template, _ = sjson.Set(template, "usage.input_tokens", usageResult.Get("promptTokenCount").Int()) - sb.WriteString(template + "\n\n\n") + sb.WriteString(template + "\n\n\n") + } } } diff --git a/internal/translator/gemini/claude/gemini_claude_response.go b/internal/translator/gemini/claude/gemini_claude_response.go index 8fd566df3..7767c3652 100644 --- a/internal/translator/gemini/claude/gemini_claude_response.go +++ b/internal/translator/gemini/claude/gemini_claude_response.go @@ -25,6 +25,7 @@ type Params struct { HasFirstResponse bool ResponseType int ResponseIndex int + HasContent bool // Tracks whether any content (text, thinking, or tool use) has been output } // toolUseIDCounter provides a process-wide unique counter for tool use identifiers.
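Reviewer sketch (not part of the patch): the HasContent gate added to all three Claude-facing translators reduces to the same shape on stream end, with params being the per-stream state:

  if bytes.Equal(rawJSON, []byte("[DONE]")) {
      if !params.HasContent {
          // Nothing was streamed, so emit no tail events at all; previously a
          // bare message_stop was sent even for empty upstream streams.
          return []string{}
      }
      return []string{"event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n"}
  }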
@@ -57,9 +58,13 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR } if bytes.Equal(rawJSON, []byte("[DONE]")) { - return []string{ - "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n", + // Only send message_stop if we have actually output content + if (*param).(*Params).HasContent { + return []string{ + "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n", + } } + return []string{} } // Track whether tools are being used in this response chunk @@ -108,6 +113,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR output = output + "event: content_block_delta\n" data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, (*param).(*Params).ResponseIndex), "delta.thinking", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) + (*param).(*Params).HasContent = true } else { // Transition from another state to thinking // First, close any existing content block @@ -131,6 +137,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, (*param).(*Params).ResponseIndex), "delta.thinking", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) (*param).(*Params).ResponseType = 2 // Set state to thinking + (*param).(*Params).HasContent = true } } else { // Process regular text content (user-visible output) @@ -139,6 +146,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR output = output + "event: content_block_delta\n" data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, (*param).(*Params).ResponseIndex), "delta.text", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) + (*param).(*Params).HasContent = true } else { // Transition from another state to text content // First, close any existing content block @@ -162,6 +170,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, (*param).(*Params).ResponseIndex), "delta.text", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) (*param).(*Params).ResponseType = 1 // Set state to content + (*param).(*Params).HasContent = true } } } else if functionCallResult.Exists() { @@ -211,6 +220,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR output = output + fmt.Sprintf("data: %s\n\n\n", data) } (*param).(*Params).ResponseType = 3 + (*param).(*Params).HasContent = true } } } @@ -218,23 +228,26 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR usageResult := gjson.GetBytes(rawJSON, "usageMetadata") if usageResult.Exists() && bytes.Contains(rawJSON, []byte(`"finishReason"`)) { if candidatesTokenCountResult := usageResult.Get("candidatesTokenCount"); candidatesTokenCountResult.Exists() { - output = output + "event: content_block_stop\n" - output = output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex) - output = output + "\n\n\n" + // Only send final events if we have actually output content + if (*param).(*Params).HasContent { + output = output + "event: content_block_stop\n" + output = output + 
fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex) + output = output + "\n\n\n" + + output = output + "event: message_delta\n" + output = output + `data: ` + + template := `{"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` + if usedTool { + template = `{"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` + } - output = output + "event: message_delta\n" - output = output + `data: ` + thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int() + template, _ = sjson.Set(template, "usage.output_tokens", candidatesTokenCountResult.Int()+thoughtsTokenCount) + template, _ = sjson.Set(template, "usage.input_tokens", usageResult.Get("promptTokenCount").Int()) - template := `{"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` - if usedTool { - template = `{"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` + output = output + template + "\n\n\n" } - - thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int() - template, _ = sjson.Set(template, "usage.output_tokens", candidatesTokenCountResult.Int()+thoughtsTokenCount) - template, _ = sjson.Set(template, "usage.input_tokens", usageResult.Get("promptTokenCount").Int()) - - output = output + template + "\n\n\n" } } diff --git a/internal/util/claude_thinking.go b/internal/util/claude_thinking.go new file mode 100644 index 000000000..b0c5a0a2f --- /dev/null +++ b/internal/util/claude_thinking.go @@ -0,0 +1,46 @@ +package util + +import ( + "github.com/tidwall/gjson" + "github.com/tidwall/sjson" +) + +// ApplyClaudeThinkingConfig applies thinking configuration to a Claude API request payload. +// It sets the thinking.type to "enabled" and thinking.budget_tokens to the specified budget. +// If budget is nil or the payload already has thinking config, it returns the payload unchanged. +func ApplyClaudeThinkingConfig(body []byte, budget *int) []byte { + if budget == nil { + return body + } + if gjson.GetBytes(body, "thinking").Exists() { + return body + } + if *budget <= 0 { + return body + } + updated := body + updated, _ = sjson.SetBytes(updated, "thinking.type", "enabled") + updated, _ = sjson.SetBytes(updated, "thinking.budget_tokens", *budget) + return updated +} + +// ResolveClaudeThinkingConfig resolves thinking configuration from metadata for Claude models. +// It uses the unified ResolveThinkingConfigFromMetadata and normalizes the budget. +// Returns the normalized budget (nil if thinking should not be enabled) and whether it matched. 
+func ResolveClaudeThinkingConfig(modelName string, metadata map[string]any) (*int, bool) { + budget, include, matched := ResolveThinkingConfigFromMetadata(modelName, metadata) + if !matched { + return nil, false + } + if include != nil && !*include { + return nil, true + } + if budget == nil { + return nil, true + } + normalized := NormalizeThinkingBudget(modelName, *budget) + if normalized <= 0 { + return nil, true + } + return &normalized, true +} diff --git a/internal/util/thinking.go b/internal/util/thinking.go index c16b91cd1..9671f20b5 100644 --- a/internal/util/thinking.go +++ b/internal/util/thinking.go @@ -1,6 +1,8 @@ package util import ( + "strings" + "github.com/router-for-me/CLIProxyAPI/v6/internal/registry" ) @@ -67,3 +69,39 @@ func thinkingRangeFromRegistry(model string) (found bool, min int, max int, zero } return true, info.Thinking.Min, info.Thinking.Max, info.Thinking.ZeroAllowed, info.Thinking.DynamicAllowed } + +// GetModelThinkingLevels returns the discrete reasoning effort levels for the model. +// Returns nil if the model has no thinking support or no levels defined. +func GetModelThinkingLevels(model string) []string { + if model == "" { + return nil + } + info := registry.GetGlobalRegistry().GetModelInfo(model) + if info == nil || info.Thinking == nil { + return nil + } + return info.Thinking.Levels +} + +// ModelUsesThinkingLevels reports whether the model uses discrete reasoning +// effort levels instead of numeric budgets. +func ModelUsesThinkingLevels(model string) bool { + levels := GetModelThinkingLevels(model) + return len(levels) > 0 +} + +// NormalizeReasoningEffortLevel validates and normalizes a reasoning effort +// level for the given model. Returns false when the level is not supported. +func NormalizeReasoningEffortLevel(model, effort string) (string, bool) { + levels := GetModelThinkingLevels(model) + if len(levels) == 0 { + return "", false + } + loweredEffort := strings.ToLower(strings.TrimSpace(effort)) + for _, lvl := range levels { + if strings.ToLower(lvl) == loweredEffort { + return lvl, true + } + } + return "", false +} diff --git a/internal/util/thinking_suffix.go b/internal/util/thinking_suffix.go index e3fd91366..7851c580f 100644 --- a/internal/util/thinking_suffix.go +++ b/internal/util/thinking_suffix.go @@ -14,61 +14,59 @@ const ( ) // NormalizeThinkingModel parses dynamic thinking suffixes on model names and returns -// the normalized base model with extracted metadata. Supported patterns: -// - "-thinking-<number>" extracts a numeric budget -// - "-thinking-<level>" extracts a reasoning effort level (minimal/low/medium/high/xhigh/auto/none) -// - "-thinking" maps to a default reasoning effort of "medium" -// - "-reasoning" maps to dynamic budget (-1) and include_thoughts=true -// - "-nothinking" maps to budget=0 and include_thoughts=false +// the normalized base model with extracted metadata. Supported pattern: +// - "<model>(<value>)" where value can be: +// - A numeric budget (e.g., "(8192)", "(16384)") +// - A reasoning effort level (e.g., "(high)", "(medium)", "(low)") +// +// Examples: +// - "claude-sonnet-4-5-20250929(16384)" → budget=16384 +// - "gpt-5.1(high)" → reasoning_effort="high" +// - "gemini-2.5-pro(32768)" → budget=32768 +// +// Note: Empty parentheses "()" are not supported and will be ignored.
func NormalizeThinkingModel(modelName string) (string, map[string]any) { if modelName == "" { return modelName, nil } - lower := strings.ToLower(modelName) baseModel := modelName var ( budgetOverride *int - includeThoughts *bool reasoningEffort *string matched bool ) - switch { - case strings.HasSuffix(lower, "-nothinking"): - baseModel = modelName[:len(modelName)-len("-nothinking")] - budget := 0 - include := false - budgetOverride = &budget - includeThoughts = &include - matched = true - case strings.HasSuffix(lower, "-reasoning"): - baseModel = modelName[:len(modelName)-len("-reasoning")] - budget := -1 - include := true - budgetOverride = &budget - includeThoughts = &include - matched = true - default: - if idx := strings.LastIndex(lower, "-thinking-"); idx != -1 { - value := modelName[idx+len("-thinking-"):] - if value != "" { - if parsed, ok := parseIntPrefix(value); ok { - baseModel = modelName[:idx] - budgetOverride = &parsed - matched = true - } else if effort, okEffort := normalizeReasoningEffort(value); okEffort { - baseModel = modelName[:idx] - reasoningEffort = &effort - matched = true - } - } - } else if strings.HasSuffix(lower, "-thinking") { - baseModel = modelName[:len(modelName)-len("-thinking")] - effort := "medium" - reasoningEffort = &effort + // Match "(<value>)" pattern at the end of the model name + if idx := strings.LastIndex(modelName, "("); idx != -1 { + if !strings.HasSuffix(modelName, ")") { + // Incomplete parenthesis, ignore + return baseModel, nil + } + + value := modelName[idx+1 : len(modelName)-1] // Extract content between ( and ) + if value == "" { + // Empty parentheses not supported + return baseModel, nil + } + + candidateBase := modelName[:idx] + + // Auto-detect: pure numeric → budget, string → reasoning effort level + if parsed, ok := parseIntPrefix(value); ok { + // Numeric value: treat as thinking budget + baseModel = candidateBase + budgetOverride = &parsed matched = true + } else { + // String value: treat as reasoning effort level + baseModel = candidateBase + raw := strings.ToLower(strings.TrimSpace(value)) + if raw != "" { + reasoningEffort = &raw + matched = true + } } } @@ -82,9 +80,6 @@ func NormalizeThinkingModel(modelName string) (string, map[string]any) { if budgetOverride != nil { metadata[ThinkingBudgetMetadataKey] = *budgetOverride } - if includeThoughts != nil { - metadata[ThinkingIncludeThoughtsMetadataKey] = *includeThoughts - } if reasoningEffort != nil { metadata[ReasoningEffortMetadataKey] = *reasoningEffort } @@ -185,7 +180,7 @@ func ReasoningEffortFromMetadata(metadata map[string]any) (string, bool) { return "", false } if effort != nil && *effort != "" { - return *effort, true + return strings.ToLower(strings.TrimSpace(*effort)), true } if budget != nil { switch *budget { @@ -207,7 +202,11 @@ func ThinkingEffortToBudget(model, effort string) (int, bool) { if effort == "" { return 0, false } - switch strings.ToLower(effort) { + normalized, ok := NormalizeReasoningEffortLevel(model, effort) + if !ok { + normalized = strings.ToLower(strings.TrimSpace(effort)) + } + switch normalized { case "none": return 0, true case "auto": @@ -312,16 +311,3 @@ func parseNumberToInt(raw any) (int, bool) { } return 0, false } - -func normalizeReasoningEffort(value string) (string, bool) { - if value == "" { - return "", false - } - effort := strings.ToLower(strings.TrimSpace(value)) - switch effort { - case "minimal", "low", "medium", "high", "xhigh", "auto", "none": - return effort, true - default: - return "", false - } -} diff --git 
a/sdk/api/handlers/claude/code_handlers.go b/sdk/api/handlers/claude/code_handlers.go index 63ea6065e..8a57a0cc6 100644 --- a/sdk/api/handlers/claude/code_handlers.go +++ b/sdk/api/handlers/claude/code_handlers.go @@ -271,6 +271,11 @@ func (h *ClaudeCodeAPIHandler) forwardClaudeStream(c *gin.Context, flusher http. continue } if errMsg != nil { + status := http.StatusInternalServerError + if errMsg.StatusCode > 0 { + status = errMsg.StatusCode + } + c.Status(status) // An error occurred: emit as a proper SSE error event errorBytes, _ := json.Marshal(h.toClaudeError(errMsg)) _, _ = writer.WriteString("event: error\n") @@ -278,6 +283,7 @@ func (h *ClaudeCodeAPIHandler) forwardClaudeStream(c *gin.Context, flusher http. _, _ = writer.Write(errorBytes) _, _ = writer.WriteString("\n\n") _ = writer.Flush() + flusher.Flush() } var execErr error if errMsg != nil {
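One note on the recurring scanner.Buffer(nil, 52_428_800) change throughout this patch: passing a nil initial buffer lets bufio start from its default allocation and grow on demand toward the 50MB ceiling, so typical responses never pay for the full buffer up front. A minimal standalone illustration (not part of the patch):

  package main

  import (
      "bufio"
      "fmt"
      "strings"
  )

  func main() {
      sc := bufio.NewScanner(strings.NewReader("data: {\"ok\":true}\n"))
      // nil initial buffer: bufio allocates lazily and doubles as needed,
      // up to the 50MB ceiling used by the executors in this patch.
      sc.Buffer(nil, 52_428_800)
      for sc.Scan() {
          fmt.Println(sc.Text())
      }
  }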