From a89514951f5676feed3594b1f806920e32507ebf Mon Sep 17 00:00:00 2001 From: hkfires <10558748+hkfires@users.noreply.github.com> Date: Wed, 10 Dec 2025 22:19:55 +0800 Subject: [PATCH 01/15] fix(claude): prevent final events when no content streamed --- .../claude/antigravity_claude_response.go | 23 ++++++-- .../claude/gemini-cli_claude_response.go | 57 ++++++++++++------- .../gemini/claude/gemini_claude_response.go | 45 +++++++++------ 3 files changed, 83 insertions(+), 42 deletions(-) diff --git a/internal/translator/antigravity/claude/antigravity_claude_response.go b/internal/translator/antigravity/claude/antigravity_claude_response.go index 42265e80e..28785a8fb 100644 --- a/internal/translator/antigravity/claude/antigravity_claude_response.go +++ b/internal/translator/antigravity/claude/antigravity_claude_response.go @@ -35,6 +35,7 @@ type Params struct { TotalTokenCount int64 // Cached total token count from usage metadata HasSentFinalEvents bool // Indicates if final content/message events have been sent HasToolUse bool // Indicates if tool use was observed in the stream + HasContent bool // Tracks whether any content (text, thinking, or tool use) has been output } // toolUseIDCounter provides a process-wide unique counter for tool use identifiers. @@ -69,11 +70,14 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq if bytes.Equal(rawJSON, []byte("[DONE]")) { output := "" - appendFinalEvents(params, &output, true) - - return []string{ - output + "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n", + // Only send final events if we have actually output content + if params.HasContent { + appendFinalEvents(params, &output, true) + return []string{ + output + "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n", + } } + return []string{} } output := "" @@ -119,10 +123,12 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq output = output + "event: content_block_delta\n" data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"signature_delta","signature":""}}`, params.ResponseIndex), "delta.signature", thoughtSignature.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) + params.HasContent = true } else if params.ResponseType == 2 { // Continue existing thinking block if already in thinking state output = output + "event: content_block_delta\n" data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, params.ResponseIndex), "delta.thinking", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) + params.HasContent = true } else { // Transition from another state to thinking // First, close any existing content block @@ -146,6 +152,7 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, params.ResponseIndex), "delta.thinking", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) params.ResponseType = 2 // Set state to thinking + params.HasContent = true } } else { finishReasonResult := gjson.GetBytes(rawJSON, "response.candidates.0.finishReason") @@ -156,6 +163,7 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq output = output + "event: content_block_delta\n" data, _ := 
sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, params.ResponseIndex), "delta.text", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) + params.HasContent = true } else { // Transition from another state to text content // First, close any existing content block @@ -179,6 +187,7 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, params.ResponseIndex), "delta.text", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) params.ResponseType = 1 // Set state to content + params.HasContent = true } } } @@ -230,6 +239,7 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq output = output + fmt.Sprintf("data: %s\n\n\n", data) } params.ResponseType = 3 + params.HasContent = true } } } @@ -269,6 +279,11 @@ func appendFinalEvents(params *Params, output *string, force bool) { return } + // Only send final events if we have actually output content + if !params.HasContent { + return + } + if params.ResponseType != 0 { *output = *output + "event: content_block_stop\n" *output = *output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, params.ResponseIndex) diff --git a/internal/translator/gemini-cli/claude/gemini-cli_claude_response.go b/internal/translator/gemini-cli/claude/gemini-cli_claude_response.go index 9b37c52b6..ca905f9e3 100644 --- a/internal/translator/gemini-cli/claude/gemini-cli_claude_response.go +++ b/internal/translator/gemini-cli/claude/gemini-cli_claude_response.go @@ -26,6 +26,7 @@ type Params struct { HasFirstResponse bool // Indicates if the initial message_start event has been sent ResponseType int // Current response type: 0=none, 1=content, 2=thinking, 3=function ResponseIndex int // Index counter for content blocks in the streaming response + HasContent bool // Tracks whether any content (text, thinking, or tool use) has been output } // toolUseIDCounter provides a process-wide unique counter for tool use identifiers. 
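The same HasContent gate is threaded through all three translators touched by this patch. A minimal sketch of the rule, assuming a Params receiver shaped like the struct above (an illustration, not code from the repo):

func finalEvents(p *Params) []string {
	if !p.HasContent {
		// Nothing was ever streamed; emitting message_stop here would
		// produce an empty, malformed Claude message.
		return nil
	}
	return []string{"event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n"}
}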
@@ -57,9 +58,13 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque } if bytes.Equal(rawJSON, []byte("[DONE]")) { - return []string{ - "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n", + // Only send message_stop if we have actually output content + if (*param).(*Params).HasContent { + return []string{ + "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n", + } } + return []string{} } // Track whether tools are being used in this response chunk @@ -108,6 +113,7 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque output = output + "event: content_block_delta\n" data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, (*param).(*Params).ResponseIndex), "delta.thinking", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) + (*param).(*Params).HasContent = true } else { // Transition from another state to thinking // First, close any existing content block @@ -131,6 +137,7 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, (*param).(*Params).ResponseIndex), "delta.thinking", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) (*param).(*Params).ResponseType = 2 // Set state to thinking + (*param).(*Params).HasContent = true } } else { // Process regular text content (user-visible output) @@ -139,6 +146,7 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque output = output + "event: content_block_delta\n" data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, (*param).(*Params).ResponseIndex), "delta.text", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) + (*param).(*Params).HasContent = true } else { // Transition from another state to text content // First, close any existing content block @@ -162,6 +170,7 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, (*param).(*Params).ResponseIndex), "delta.text", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) (*param).(*Params).ResponseType = 1 // Set state to content + (*param).(*Params).HasContent = true } } } else if functionCallResult.Exists() { @@ -211,6 +220,7 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque output = output + fmt.Sprintf("data: %s\n\n\n", data) } (*param).(*Params).ResponseType = 3 + (*param).(*Params).HasContent = true } } } @@ -219,28 +229,31 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque // Process usage metadata and finish reason when present in the response if usageResult.Exists() && bytes.Contains(rawJSON, []byte(`"finishReason"`)) { if candidatesTokenCountResult := usageResult.Get("candidatesTokenCount"); candidatesTokenCountResult.Exists() { - // Close the final content block - output = output + "event: content_block_stop\n" - output = output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex) - output = output + "\n\n\n" - - // Send the final message delta with usage information and stop reason - output = output + "event: message_delta\n" 
- output = output + `data: ` - - // Create the message delta template with appropriate stop reason - template := `{"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` - // Set tool_use stop reason if tools were used in this response - if usedTool { - template = `{"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` - } + // Only send final events if we have actually output content + if (*param).(*Params).HasContent { + // Close the final content block + output = output + "event: content_block_stop\n" + output = output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex) + output = output + "\n\n\n" + + // Send the final message delta with usage information and stop reason + output = output + "event: message_delta\n" + output = output + `data: ` + + // Create the message delta template with appropriate stop reason + template := `{"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` + // Set tool_use stop reason if tools were used in this response + if usedTool { + template = `{"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` + } - // Include thinking tokens in output token count if present - thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int() - template, _ = sjson.Set(template, "usage.output_tokens", candidatesTokenCountResult.Int()+thoughtsTokenCount) - template, _ = sjson.Set(template, "usage.input_tokens", usageResult.Get("promptTokenCount").Int()) + // Include thinking tokens in output token count if present + thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int() + template, _ = sjson.Set(template, "usage.output_tokens", candidatesTokenCountResult.Int()+thoughtsTokenCount) + template, _ = sjson.Set(template, "usage.input_tokens", usageResult.Get("promptTokenCount").Int()) - output = output + template + "\n\n\n" + output = output + template + "\n\n\n" + } } } diff --git a/internal/translator/gemini/claude/gemini_claude_response.go b/internal/translator/gemini/claude/gemini_claude_response.go index 8fd566df3..7767c3652 100644 --- a/internal/translator/gemini/claude/gemini_claude_response.go +++ b/internal/translator/gemini/claude/gemini_claude_response.go @@ -25,6 +25,7 @@ type Params struct { HasFirstResponse bool ResponseType int ResponseIndex int + HasContent bool // Tracks whether any content (text, thinking, or tool use) has been output } // toolUseIDCounter provides a process-wide unique counter for tool use identifiers. 
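For context, a well-formed Claude SSE stream closes with content_block_stop, then message_delta (carrying usage and stop_reason), then message_stop. Before this patch the translators emitted that tail unconditionally on "[DONE]" or a finishReason, even when no content block had been opened; gating the whole tail on HasContent keeps the sequence intact. The expected tail, using the same literals the code above builds:

event: content_block_stop
data: {"type":"content_block_stop","index":0}

event: message_delta
data: {"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}

event: message_stop
data: {"type":"message_stop"}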
@@ -57,9 +58,13 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR } if bytes.Equal(rawJSON, []byte("[DONE]")) { - return []string{ - "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n", + // Only send message_stop if we have actually output content + if (*param).(*Params).HasContent { + return []string{ + "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n", + } } + return []string{} } // Track whether tools are being used in this response chunk @@ -108,6 +113,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR output = output + "event: content_block_delta\n" data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, (*param).(*Params).ResponseIndex), "delta.thinking", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) + (*param).(*Params).HasContent = true } else { // Transition from another state to thinking // First, close any existing content block @@ -131,6 +137,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, (*param).(*Params).ResponseIndex), "delta.thinking", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) (*param).(*Params).ResponseType = 2 // Set state to thinking + (*param).(*Params).HasContent = true } } else { // Process regular text content (user-visible output) @@ -139,6 +146,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR output = output + "event: content_block_delta\n" data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, (*param).(*Params).ResponseIndex), "delta.text", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) + (*param).(*Params).HasContent = true } else { // Transition from another state to text content // First, close any existing content block @@ -162,6 +170,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, (*param).(*Params).ResponseIndex), "delta.text", partTextResult.String()) output = output + fmt.Sprintf("data: %s\n\n\n", data) (*param).(*Params).ResponseType = 1 // Set state to content + (*param).(*Params).HasContent = true } } } else if functionCallResult.Exists() { @@ -211,6 +220,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR output = output + fmt.Sprintf("data: %s\n\n\n", data) } (*param).(*Params).ResponseType = 3 + (*param).(*Params).HasContent = true } } } @@ -218,23 +228,26 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR usageResult := gjson.GetBytes(rawJSON, "usageMetadata") if usageResult.Exists() && bytes.Contains(rawJSON, []byte(`"finishReason"`)) { if candidatesTokenCountResult := usageResult.Get("candidatesTokenCount"); candidatesTokenCountResult.Exists() { - output = output + "event: content_block_stop\n" - output = output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex) - output = output + "\n\n\n" + // Only send final events if we have actually output content + if (*param).(*Params).HasContent { + output = output + "event: content_block_stop\n" + output = output + 
fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex) + output = output + "\n\n\n" + + output = output + "event: message_delta\n" + output = output + `data: ` + + template := `{"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` + if usedTool { + template = `{"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` + } - output = output + "event: message_delta\n" - output = output + `data: ` + thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int() + template, _ = sjson.Set(template, "usage.output_tokens", candidatesTokenCountResult.Int()+thoughtsTokenCount) + template, _ = sjson.Set(template, "usage.input_tokens", usageResult.Get("promptTokenCount").Int()) - template := `{"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` - if usedTool { - template = `{"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}` + output = output + template + "\n\n\n" } - - thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int() - template, _ = sjson.Set(template, "usage.output_tokens", candidatesTokenCountResult.Int()+thoughtsTokenCount) - template, _ = sjson.Set(template, "usage.input_tokens", usageResult.Get("promptTokenCount").Int()) - - output = output + template + "\n\n\n" } } From 76c563d161d7687519edfca710d71a01f00caee6 Mon Sep 17 00:00:00 2001 From: sususu Date: Wed, 10 Dec 2025 23:20:04 +0800 Subject: [PATCH 02/15] fix(executor): increase buffer size for stream scanners to 50MB across multiple executors --- internal/runtime/executor/antigravity_executor.go | 2 +- internal/runtime/executor/claude_executor.go | 4 ++-- internal/runtime/executor/codex_executor.go | 2 +- internal/runtime/executor/gemini_cli_executor.go | 2 +- internal/runtime/executor/gemini_executor.go | 2 +- internal/runtime/executor/gemini_vertex_executor.go | 4 ++-- internal/runtime/executor/iflow_executor.go | 2 +- internal/runtime/executor/openai_compat_executor.go | 2 +- internal/runtime/executor/qwen_executor.go | 2 +- internal/translator/claude/gemini/claude_gemini_response.go | 4 ++-- .../openai/responses/claude_openai-responses_response.go | 4 ++-- 11 files changed, 15 insertions(+), 15 deletions(-) diff --git a/internal/runtime/executor/antigravity_executor.go b/internal/runtime/executor/antigravity_executor.go index a32e66ec0..f5b5ef067 100644 --- a/internal/runtime/executor/antigravity_executor.go +++ b/internal/runtime/executor/antigravity_executor.go @@ -38,7 +38,7 @@ const ( defaultAntigravityAgent = "antigravity/1.11.5 windows/amd64" antigravityAuthType = "antigravity" refreshSkew = 3000 * time.Second - streamScannerBuffer int = 20_971_520 + streamScannerBuffer int = 52_428_800 // 50MB ) var randSource = rand.New(rand.NewSource(time.Now().UnixNano())) diff --git a/internal/runtime/executor/claude_executor.go b/internal/runtime/executor/claude_executor.go index 1a18c46a5..37e6e6c86 100644 --- a/internal/runtime/executor/claude_executor.go +++ b/internal/runtime/executor/claude_executor.go @@ -238,7 +238,7 @@ func (e *ClaudeExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A // If from == to (Claude → Claude), directly forward the SSE stream without translation if from == to { scanner := bufio.NewScanner(decodedBody) - scanner.Buffer(nil, 20_971_520) + 
scanner.Buffer(nil, 52_428_800) // 50MB for scanner.Scan() { line := scanner.Bytes() appendAPIResponseChunk(ctx, e.cfg, line) @@ -261,7 +261,7 @@ func (e *ClaudeExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A // For other formats, use translation scanner := bufio.NewScanner(decodedBody) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git a/internal/runtime/executor/codex_executor.go b/internal/runtime/executor/codex_executor.go index 1c4291f64..e9cc8b769 100644 --- a/internal/runtime/executor/codex_executor.go +++ b/internal/runtime/executor/codex_executor.go @@ -205,7 +205,7 @@ func (e *CodexExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au } }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git a/internal/runtime/executor/gemini_cli_executor.go b/internal/runtime/executor/gemini_cli_executor.go index a2e0ececd..2c4f3f881 100644 --- a/internal/runtime/executor/gemini_cli_executor.go +++ b/internal/runtime/executor/gemini_cli_executor.go @@ -309,7 +309,7 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut }() if opts.Alt == "" { scanner := bufio.NewScanner(resp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git a/internal/runtime/executor/gemini_executor.go b/internal/runtime/executor/gemini_executor.go index 8879a4f19..0c52b2030 100644 --- a/internal/runtime/executor/gemini_executor.go +++ b/internal/runtime/executor/gemini_executor.go @@ -243,7 +243,7 @@ func (e *GeminiExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A } }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git a/internal/runtime/executor/gemini_vertex_executor.go b/internal/runtime/executor/gemini_vertex_executor.go index c7d10a670..0759d10ac 100644 --- a/internal/runtime/executor/gemini_vertex_executor.go +++ b/internal/runtime/executor/gemini_vertex_executor.go @@ -564,7 +564,7 @@ func (e *GeminiVertexExecutor) executeStreamWithServiceAccount(ctx context.Conte } }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() @@ -678,7 +678,7 @@ func (e *GeminiVertexExecutor) executeStreamWithAPIKey(ctx context.Context, auth } }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git a/internal/runtime/executor/iflow_executor.go b/internal/runtime/executor/iflow_executor.go index 3589e9226..ee30521a0 100644 --- a/internal/runtime/executor/iflow_executor.go +++ b/internal/runtime/executor/iflow_executor.go @@ -201,7 +201,7 @@ func (e *IFlowExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git a/internal/runtime/executor/openai_compat_executor.go b/internal/runtime/executor/openai_compat_executor.go index 55ec6dc9f..bb27b58c4 
100644 --- a/internal/runtime/executor/openai_compat_executor.go +++ b/internal/runtime/executor/openai_compat_executor.go @@ -206,7 +206,7 @@ func (e *OpenAICompatExecutor) ExecuteStream(ctx context.Context, auth *cliproxy } }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git a/internal/runtime/executor/qwen_executor.go b/internal/runtime/executor/qwen_executor.go index 0c3e6b56f..63c54bee3 100644 --- a/internal/runtime/executor/qwen_executor.go +++ b/internal/runtime/executor/qwen_executor.go @@ -181,7 +181,7 @@ func (e *QwenExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Aut } }() scanner := bufio.NewScanner(httpResp.Body) - scanner.Buffer(nil, 20_971_520) + scanner.Buffer(nil, 52_428_800) // 50MB var param any for scanner.Scan() { line := scanner.Bytes() diff --git a/internal/translator/claude/gemini/claude_gemini_response.go b/internal/translator/claude/gemini/claude_gemini_response.go index 0c90398ec..c77cca4bc 100644 --- a/internal/translator/claude/gemini/claude_gemini_response.go +++ b/internal/translator/claude/gemini/claude_gemini_response.go @@ -331,8 +331,8 @@ func ConvertClaudeResponseToGeminiNonStream(_ context.Context, modelName string, streamingEvents := make([][]byte, 0) scanner := bufio.NewScanner(bytes.NewReader(rawJSON)) - buffer := make([]byte, 20_971_520) - scanner.Buffer(buffer, 20_971_520) + buffer := make([]byte, 52_428_800) // 50MB + scanner.Buffer(buffer, 52_428_800) for scanner.Scan() { line := scanner.Bytes() // log.Debug(string(line)) diff --git a/internal/translator/claude/openai/responses/claude_openai-responses_response.go b/internal/translator/claude/openai/responses/claude_openai-responses_response.go index 77507f97b..252967f14 100644 --- a/internal/translator/claude/openai/responses/claude_openai-responses_response.go +++ b/internal/translator/claude/openai/responses/claude_openai-responses_response.go @@ -445,8 +445,8 @@ func ConvertClaudeResponseToOpenAIResponsesNonStream(_ context.Context, _ string // Use a simple scanner to iterate through raw bytes // Note: extremely large responses may require increasing the buffer scanner := bufio.NewScanner(bytes.NewReader(rawJSON)) - buf := make([]byte, 20_971_520) - scanner.Buffer(buf, 20_971_520) + buf := make([]byte, 52_428_800) // 50MB + scanner.Buffer(buf, 52_428_800) for scanner.Scan() { line := scanner.Bytes() if !bytes.HasPrefix(line, dataTag) { From a03d514095c4f76d7d5bf986bd1e109854e2868f Mon Sep 17 00:00:00 2001 From: hkfires <10558748+hkfires@users.noreply.github.com> Date: Thu, 11 Dec 2025 11:28:44 +0800 Subject: [PATCH 03/15] feat(registry): add thinking metadata for models --- internal/registry/model_definitions.go | 13 +++++++++++++ internal/registry/model_registry.go | 3 +++ 2 files changed, 16 insertions(+) diff --git a/internal/registry/model_definitions.go b/internal/registry/model_definitions.go index 2f87f195b..9956d964f 100644 --- a/internal/registry/model_definitions.go +++ b/internal/registry/model_definitions.go @@ -16,6 +16,7 @@ func GetClaudeModels() []*ModelInfo { DisplayName: "Claude 4.5 Haiku", ContextLength: 200000, MaxCompletionTokens: 64000, + // Thinking: not supported for Haiku models }, { ID: "claude-sonnet-4-5-20250929", @@ -49,6 +50,7 @@ func GetClaudeModels() []*ModelInfo { DisplayName: "Claude 4.1 Opus", ContextLength: 200000, MaxCompletionTokens: 32000, + Thinking: &ThinkingSupport{Min: 1024, Max: 100000, 
ZeroAllowed: false, DynamicAllowed: true}, }, { ID: "claude-opus-4-20250514", @@ -59,6 +61,7 @@ func GetClaudeModels() []*ModelInfo { DisplayName: "Claude 4 Opus", ContextLength: 200000, MaxCompletionTokens: 32000, + Thinking: &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true}, }, { ID: "claude-sonnet-4-20250514", @@ -69,6 +72,7 @@ func GetClaudeModels() []*ModelInfo { DisplayName: "Claude 4 Sonnet", ContextLength: 200000, MaxCompletionTokens: 64000, + Thinking: &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true}, }, { ID: "claude-3-7-sonnet-20250219", @@ -79,6 +83,7 @@ func GetClaudeModels() []*ModelInfo { DisplayName: "Claude 3.7 Sonnet", ContextLength: 128000, MaxCompletionTokens: 8192, + Thinking: &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true}, }, { ID: "claude-3-5-haiku-20241022", @@ -89,6 +94,7 @@ func GetClaudeModels() []*ModelInfo { DisplayName: "Claude 3.5 Haiku", ContextLength: 128000, MaxCompletionTokens: 8192, + // Thinking: not supported for Haiku models }, } } @@ -476,6 +482,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"minimal", "low", "medium", "high"}}, }, { ID: "gpt-5-codex", @@ -489,6 +496,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}, }, { ID: "gpt-5-codex-mini", @@ -502,6 +510,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}, }, { ID: "gpt-5.1", @@ -515,6 +524,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"none", "low", "medium", "high"}}, }, { ID: "gpt-5.1-codex", @@ -528,6 +538,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}, }, { ID: "gpt-5.1-codex-mini", @@ -541,6 +552,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}, }, { ID: "gpt-5.1-codex-max", @@ -554,6 +566,7 @@ func GetOpenAIModels() []*ModelInfo { ContextLength: 400000, MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, + Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high", "xhigh"}}, }, } } diff --git a/internal/registry/model_registry.go b/internal/registry/model_registry.go index 5ef9007f9..f3517bde8 100644 --- a/internal/registry/model_registry.go +++ b/internal/registry/model_registry.go @@ -63,6 +63,9 @@ type ThinkingSupport struct { ZeroAllowed bool `json:"zero_allowed,omitempty"` // DynamicAllowed indicates whether -1 is a valid value (dynamic thinking budget). DynamicAllowed bool `json:"dynamic_allowed,omitempty"` + // Levels defines discrete reasoning effort levels (e.g., "low", "medium", "high"). + // When set, the model uses level-based reasoning instead of token budgets. 
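+ // An empty Levels slice means the model takes numeric thinking budgets (Min/Max) instead.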
+ Levels []string `json:"levels,omitempty"` } // ModelRegistration tracks a model's availability From 3ffd120ae9e9ce2bf34cc87c9994150ec4474ff6 Mon Sep 17 00:00:00 2001 From: hkfires <10558748+hkfires@users.noreply.github.com> Date: Thu, 11 Dec 2025 11:51:33 +0800 Subject: [PATCH 04/15] feat(runtime): add thinking config normalization --- internal/runtime/executor/codex_executor.go | 2 + .../executor/openai_compat_executor.go | 8 ++- internal/runtime/executor/payload_helpers.go | 57 +++++++++++++++++++ internal/util/thinking.go | 46 +++++++++++++++ 4 files changed, 111 insertions(+), 2 deletions(-) diff --git a/internal/runtime/executor/codex_executor.go b/internal/runtime/executor/codex_executor.go index 46a301773..3fe5ed6e0 100644 --- a/internal/runtime/executor/codex_executor.go +++ b/internal/runtime/executor/codex_executor.go @@ -55,6 +55,7 @@ func (e *CodexExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, re to := sdktranslator.FromString("codex") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false) body = applyReasoningEffortMetadata(body, req.Metadata, req.Model) + body = normalizeThinkingConfig(body, upstreamModel) body = applyPayloadConfig(e.cfg, req.Model, body) body, _ = sjson.SetBytes(body, "model", upstreamModel) body, _ = sjson.SetBytes(body, "stream", true) @@ -149,6 +150,7 @@ func (e *CodexExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true) body = applyReasoningEffortMetadata(body, req.Metadata, req.Model) + body = normalizeThinkingConfig(body, upstreamModel) body = applyPayloadConfig(e.cfg, req.Model, body) body, _ = sjson.DeleteBytes(body, "previous_response_id") body, _ = sjson.SetBytes(body, "model", upstreamModel) diff --git a/internal/runtime/executor/openai_compat_executor.go b/internal/runtime/executor/openai_compat_executor.go index 93122c202..ba47750e5 100644 --- a/internal/runtime/executor/openai_compat_executor.go +++ b/internal/runtime/executor/openai_compat_executor.go @@ -59,9 +59,11 @@ func (e *OpenAICompatExecutor) Execute(ctx context.Context, auth *cliproxyauth.A } translated = applyPayloadConfigWithRoot(e.cfg, req.Model, to.String(), "", translated) translated = applyReasoningEffortMetadataChatCompletions(translated, req.Metadata, req.Model) - if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { + upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) + if upstreamModel != "" { translated, _ = sjson.SetBytes(translated, "model", upstreamModel) } + translated = normalizeThinkingConfig(translated, upstreamModel) url := strings.TrimSuffix(baseURL, "/") + "/chat/completions" httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(translated)) @@ -148,9 +150,11 @@ func (e *OpenAICompatExecutor) ExecuteStream(ctx context.Context, auth *cliproxy } translated = applyPayloadConfigWithRoot(e.cfg, req.Model, to.String(), "", translated) translated = applyReasoningEffortMetadataChatCompletions(translated, req.Metadata, req.Model) - if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { + upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) + if upstreamModel != "" { translated, _ = sjson.SetBytes(translated, "model", upstreamModel) } + translated = normalizeThinkingConfig(translated, upstreamModel) url := strings.TrimSuffix(baseURL, "/") + "/chat/completions" httpReq, err 
:= http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(translated)) diff --git a/internal/runtime/executor/payload_helpers.go b/internal/runtime/executor/payload_helpers.go index 37e3141a3..9d431f114 100644 --- a/internal/runtime/executor/payload_helpers.go +++ b/internal/runtime/executor/payload_helpers.go @@ -232,3 +232,60 @@ func matchModelPattern(pattern, model string) bool { } return pi == len(pattern) } + +// normalizeThinkingConfig normalizes thinking-related fields in the payload +// based on model capabilities. For models without thinking support, it strips +// reasoning fields. For models with level-based thinking, it validates and +// normalizes the reasoning effort level. +func normalizeThinkingConfig(payload []byte, model string) []byte { + if len(payload) == 0 || model == "" { + return payload + } + + if !util.ModelSupportsThinking(model) { + return stripThinkingFields(payload) + } + + if util.ModelUsesThinkingLevels(model) { + return normalizeReasoningEffortLevel(payload, model) + } + + return payload +} + +// stripThinkingFields removes thinking-related fields from the payload for +// models that do not support thinking. +func stripThinkingFields(payload []byte) []byte { + fieldsToRemove := []string{ + "reasoning", + "reasoning_effort", + "reasoning.effort", + } + out := payload + for _, field := range fieldsToRemove { + if gjson.GetBytes(out, field).Exists() { + out, _ = sjson.DeleteBytes(out, field) + } + } + return out +} + +// normalizeReasoningEffortLevel validates and normalizes the reasoning_effort +// or reasoning.effort field for level-based thinking models. +func normalizeReasoningEffortLevel(payload []byte, model string) []byte { + out := payload + + if effort := gjson.GetBytes(out, "reasoning_effort"); effort.Exists() { + if normalized, ok := util.NormalizeReasoningEffortLevel(model, effort.String()); ok { + out, _ = sjson.SetBytes(out, "reasoning_effort", normalized) + } + } + + if effort := gjson.GetBytes(out, "reasoning.effort"); effort.Exists() { + if normalized, ok := util.NormalizeReasoningEffortLevel(model, effort.String()); ok { + out, _ = sjson.SetBytes(out, "reasoning.effort", normalized) + } + } + + return out +} diff --git a/internal/util/thinking.go b/internal/util/thinking.go index c16b91cd1..37200980c 100644 --- a/internal/util/thinking.go +++ b/internal/util/thinking.go @@ -1,6 +1,8 @@ package util import ( + "strings" + "github.com/router-for-me/CLIProxyAPI/v6/internal/registry" ) @@ -67,3 +69,47 @@ func thinkingRangeFromRegistry(model string) (found bool, min int, max int, zero } return true, info.Thinking.Min, info.Thinking.Max, info.Thinking.ZeroAllowed, info.Thinking.DynamicAllowed } + +// GetModelThinkingLevels returns the discrete reasoning effort levels for the model. +// Returns nil if the model has no thinking support or no levels defined. +func GetModelThinkingLevels(model string) []string { + if model == "" { + return nil + } + info := registry.GetGlobalRegistry().GetModelInfo(model) + if info == nil || info.Thinking == nil { + return nil + } + return info.Thinking.Levels +} + +// ModelUsesThinkingLevels reports whether the model uses discrete reasoning +// effort levels instead of numeric budgets. +func ModelUsesThinkingLevels(model string) bool { + levels := GetModelThinkingLevels(model) + return len(levels) > 0 +} + +// NormalizeReasoningEffortLevel validates and normalizes a reasoning effort +// level for the given model. 
If the level is not supported, it returns the +// first (lowest) level from the model's supported levels. +func NormalizeReasoningEffortLevel(model, effort string) (string, bool) { + levels := GetModelThinkingLevels(model) + if len(levels) == 0 { + return "", false + } + loweredEffort := strings.ToLower(strings.TrimSpace(effort)) + for _, lvl := range levels { + if strings.ToLower(lvl) == loweredEffort { + return lvl, true + } + } + return defaultReasoningLevel(levels), true +} + +func defaultReasoningLevel(levels []string) string { + if len(levels) > 0 { + return levels[0] + } + return "" +} From d06d0eab2f12af290453c17d8cb24e595792751a Mon Sep 17 00:00:00 2001 From: hkfires <10558748+hkfires@users.noreply.github.com> Date: Thu, 11 Dec 2025 12:14:51 +0800 Subject: [PATCH 05/15] fix(util): centralize reasoning effort normalization --- internal/util/thinking.go | 42 ++++++++++++++++++++++++++++++++ internal/util/thinking_suffix.go | 26 ++++++++------------ 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/internal/util/thinking.go b/internal/util/thinking.go index 37200980c..bcf92c5b3 100644 --- a/internal/util/thinking.go +++ b/internal/util/thinking.go @@ -113,3 +113,45 @@ func defaultReasoningLevel(levels []string) string { } return "" } + +// standardReasoningEfforts defines the canonical set of reasoning effort levels. +// This serves as the single source of truth for valid effort values. +var standardReasoningEfforts = []string{"none", "auto", "minimal", "low", "medium", "high", "xhigh"} + +// IsValidReasoningEffort checks if the given effort string is a valid reasoning effort level. +// This is a registry-independent check against the standard effort levels. +func IsValidReasoningEffort(effort string) bool { + if effort == "" { + return false + } + lowered := strings.ToLower(strings.TrimSpace(effort)) + for _, e := range standardReasoningEfforts { + if e == lowered { + return true + } + } + return false +} + +// NormalizeReasoningEffort normalizes a reasoning effort string to its canonical form. +// It first tries registry-based normalization if a model is provided, then falls back +// to the standard effort levels. Returns empty string and false if invalid. 
+func NormalizeReasoningEffort(model, effort string) (string, bool) { + if effort == "" { + return "", false + } + lowered := strings.ToLower(strings.TrimSpace(effort)) + + if model != "" { + if normalized, ok := NormalizeReasoningEffortLevel(model, effort); ok { + return normalized, true + } + } + + for _, e := range standardReasoningEfforts { + if e == lowered { + return e, true + } + } + return "", false +} diff --git a/internal/util/thinking_suffix.go b/internal/util/thinking_suffix.go index e3fd91366..1a1a8715f 100644 --- a/internal/util/thinking_suffix.go +++ b/internal/util/thinking_suffix.go @@ -58,8 +58,9 @@ func NormalizeThinkingModel(modelName string) (string, map[string]any) { baseModel = modelName[:idx] budgetOverride = &parsed matched = true - } else if effort, okEffort := normalizeReasoningEffort(value); okEffort { + } else if IsValidReasoningEffort(value) { baseModel = modelName[:idx] + effort := strings.ToLower(strings.TrimSpace(value)) reasoningEffort = &effort matched = true } @@ -185,7 +186,9 @@ func ReasoningEffortFromMetadata(metadata map[string]any) (string, bool) { return "", false } if effort != nil && *effort != "" { - return *effort, true + if IsValidReasoningEffort(*effort) { + return strings.ToLower(strings.TrimSpace(*effort)), true + } } if budget != nil { switch *budget { @@ -207,7 +210,11 @@ func ThinkingEffortToBudget(model, effort string) (int, bool) { if effort == "" { return 0, false } - switch strings.ToLower(effort) { + normalized, ok := NormalizeReasoningEffort(model, effort) + if !ok { + return 0, false + } + switch normalized { case "none": return 0, true case "auto": @@ -312,16 +319,3 @@ func parseNumberToInt(raw any) (int, bool) { } return 0, false } - -func normalizeReasoningEffort(value string) (string, bool) { - if value == "" { - return "", false - } - effort := strings.ToLower(strings.TrimSpace(value)) - switch effort { - case "minimal", "low", "medium", "high", "xhigh", "auto", "none": - return effort, true - default: - return "", false - } -} From 169f4295d041b0c2e1089d02073740c36f83e8bf Mon Sep 17 00:00:00 2001 From: hkfires <10558748+hkfires@users.noreply.github.com> Date: Thu, 11 Dec 2025 12:20:12 +0800 Subject: [PATCH 06/15] fix(util): align reasoning effort handling with registry --- internal/util/thinking.go | 42 -------------------------------- internal/util/thinking_suffix.go | 19 ++++++++------- 2 files changed, 10 insertions(+), 51 deletions(-) diff --git a/internal/util/thinking.go b/internal/util/thinking.go index bcf92c5b3..37200980c 100644 --- a/internal/util/thinking.go +++ b/internal/util/thinking.go @@ -113,45 +113,3 @@ func defaultReasoningLevel(levels []string) string { } return "" } - -// standardReasoningEfforts defines the canonical set of reasoning effort levels. -// This serves as the single source of truth for valid effort values. -var standardReasoningEfforts = []string{"none", "auto", "minimal", "low", "medium", "high", "xhigh"} - -// IsValidReasoningEffort checks if the given effort string is a valid reasoning effort level. -// This is a registry-independent check against the standard effort levels. -func IsValidReasoningEffort(effort string) bool { - if effort == "" { - return false - } - lowered := strings.ToLower(strings.TrimSpace(effort)) - for _, e := range standardReasoningEfforts { - if e == lowered { - return true - } - } - return false -} - -// NormalizeReasoningEffort normalizes a reasoning effort string to its canonical form. 
-// It first tries registry-based normalization if a model is provided, then falls back -// to the standard effort levels. Returns empty string and false if invalid. -func NormalizeReasoningEffort(model, effort string) (string, bool) { - if effort == "" { - return "", false - } - lowered := strings.ToLower(strings.TrimSpace(effort)) - - if model != "" { - if normalized, ok := NormalizeReasoningEffortLevel(model, effort); ok { - return normalized, true - } - } - - for _, e := range standardReasoningEfforts { - if e == lowered { - return e, true - } - } - return "", false -} diff --git a/internal/util/thinking_suffix.go b/internal/util/thinking_suffix.go index 1a1a8715f..c2d806ad9 100644 --- a/internal/util/thinking_suffix.go +++ b/internal/util/thinking_suffix.go @@ -58,11 +58,14 @@ func NormalizeThinkingModel(modelName string) (string, map[string]any) { baseModel = modelName[:idx] budgetOverride = &parsed matched = true - } else if IsValidReasoningEffort(value) { + } else { baseModel = modelName[:idx] - effort := strings.ToLower(strings.TrimSpace(value)) - reasoningEffort = &effort - matched = true + if normalized, ok := NormalizeReasoningEffortLevel(baseModel, value); ok { + reasoningEffort = &normalized + matched = true + } else { + baseModel = modelName + } } } } else if strings.HasSuffix(lower, "-thinking") { @@ -186,9 +189,7 @@ func ReasoningEffortFromMetadata(metadata map[string]any) (string, bool) { return "", false } if effort != nil && *effort != "" { - if IsValidReasoningEffort(*effort) { - return strings.ToLower(strings.TrimSpace(*effort)), true - } + return strings.ToLower(strings.TrimSpace(*effort)), true } if budget != nil { switch *budget { @@ -210,9 +211,9 @@ func ThinkingEffortToBudget(model, effort string) (int, bool) { if effort == "" { return 0, false } - normalized, ok := NormalizeReasoningEffort(model, effort) + normalized, ok := NormalizeReasoningEffortLevel(model, effort) if !ok { - return 0, false + normalized = strings.ToLower(strings.TrimSpace(effort)) } switch normalized { case "none": From 519da2e04222641a412fb5c17a0bc2cf20428800 Mon Sep 17 00:00:00 2001 From: hkfires <10558748+hkfires@users.noreply.github.com> Date: Thu, 11 Dec 2025 12:36:54 +0800 Subject: [PATCH 07/15] fix(runtime): validate reasoning effort levels --- internal/runtime/executor/codex_executor.go | 6 ++++ .../executor/openai_compat_executor.go | 6 ++++ internal/runtime/executor/payload_helpers.go | 35 +++++++++++++++++++ internal/util/thinking.go | 12 ++----- 4 files changed, 49 insertions(+), 10 deletions(-) diff --git a/internal/runtime/executor/codex_executor.go b/internal/runtime/executor/codex_executor.go index 3fe5ed6e0..7003373f7 100644 --- a/internal/runtime/executor/codex_executor.go +++ b/internal/runtime/executor/codex_executor.go @@ -56,6 +56,9 @@ func (e *CodexExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, re body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false) body = applyReasoningEffortMetadata(body, req.Metadata, req.Model) body = normalizeThinkingConfig(body, upstreamModel) + if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { + return resp, errValidate + } body = applyPayloadConfig(e.cfg, req.Model, body) body, _ = sjson.SetBytes(body, "model", upstreamModel) body, _ = sjson.SetBytes(body, "stream", true) @@ -151,6 +154,9 @@ func (e *CodexExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au body = applyReasoningEffortMetadata(body, req.Metadata, req.Model) body = 
normalizeThinkingConfig(body, upstreamModel) + if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { + return nil, errValidate + } body = applyPayloadConfig(e.cfg, req.Model, body) body, _ = sjson.DeleteBytes(body, "previous_response_id") body, _ = sjson.SetBytes(body, "model", upstreamModel) diff --git a/internal/runtime/executor/openai_compat_executor.go b/internal/runtime/executor/openai_compat_executor.go index ba47750e5..507b0fd90 100644 --- a/internal/runtime/executor/openai_compat_executor.go +++ b/internal/runtime/executor/openai_compat_executor.go @@ -64,6 +64,9 @@ func (e *OpenAICompatExecutor) Execute(ctx context.Context, auth *cliproxyauth.A translated, _ = sjson.SetBytes(translated, "model", upstreamModel) } translated = normalizeThinkingConfig(translated, upstreamModel) + if errValidate := validateThinkingConfig(translated, upstreamModel); errValidate != nil { + return resp, errValidate + } url := strings.TrimSuffix(baseURL, "/") + "/chat/completions" httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(translated)) @@ -155,6 +158,9 @@ func (e *OpenAICompatExecutor) ExecuteStream(ctx context.Context, auth *cliproxy translated, _ = sjson.SetBytes(translated, "model", upstreamModel) } translated = normalizeThinkingConfig(translated, upstreamModel) + if errValidate := validateThinkingConfig(translated, upstreamModel); errValidate != nil { + return nil, errValidate + } url := strings.TrimSuffix(baseURL, "/") + "/chat/completions" httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(translated)) diff --git a/internal/runtime/executor/payload_helpers.go b/internal/runtime/executor/payload_helpers.go index 9d431f114..5711bbbdd 100644 --- a/internal/runtime/executor/payload_helpers.go +++ b/internal/runtime/executor/payload_helpers.go @@ -1,6 +1,8 @@ package executor import ( + "fmt" + "net/http" "strings" "github.com/router-for-me/CLIProxyAPI/v6/internal/config" @@ -289,3 +291,36 @@ func normalizeReasoningEffortLevel(payload []byte, model string) []byte { return out } + +// validateThinkingConfig checks for unsupported reasoning levels on level-based models. +// Returns a statusErr with 400 when an unsupported level is supplied to avoid silently +// downgrading requests. +func validateThinkingConfig(payload []byte, model string) error { + if len(payload) == 0 || model == "" { + return nil + } + if !util.ModelSupportsThinking(model) || !util.ModelUsesThinkingLevels(model) { + return nil + } + + levels := util.GetModelThinkingLevels(model) + checkField := func(path string) error { + if effort := gjson.GetBytes(payload, path); effort.Exists() { + if _, ok := util.NormalizeReasoningEffortLevel(model, effort.String()); !ok { + return statusErr{ + code: http.StatusBadRequest, + msg: fmt.Sprintf("unsupported reasoning effort level %q for model %s (supported: %s)", effort.String(), model, strings.Join(levels, ", ")), + } + } + } + return nil + } + + if err := checkField("reasoning_effort"); err != nil { + return err + } + if err := checkField("reasoning.effort"); err != nil { + return err + } + return nil +} diff --git a/internal/util/thinking.go b/internal/util/thinking.go index 37200980c..9671f20b5 100644 --- a/internal/util/thinking.go +++ b/internal/util/thinking.go @@ -91,8 +91,7 @@ func ModelUsesThinkingLevels(model string) bool { } // NormalizeReasoningEffortLevel validates and normalizes a reasoning effort -// level for the given model. 
If the level is not supported, it returns the -// first (lowest) level from the model's supported levels. +// level for the given model. Returns false when the level is not supported. func NormalizeReasoningEffortLevel(model, effort string) (string, bool) { levels := GetModelThinkingLevels(model) if len(levels) == 0 { @@ -104,12 +103,5 @@ func NormalizeReasoningEffortLevel(model, effort string) (string, bool) { return lvl, true } } - return defaultReasoningLevel(levels), true -} - -func defaultReasoningLevel(levels []string) string { - if len(levels) > 0 { - return levels[0] - } - return "" + return "", false } From 3a81ab22fdb6c9b993fac1deef94785f8a8f5dbf Mon Sep 17 00:00:00 2001 From: hkfires <10558748+hkfires@users.noreply.github.com> Date: Thu, 11 Dec 2025 14:35:05 +0800 Subject: [PATCH 08/15] fix(runtime): unify reasoning effort metadata overrides --- internal/runtime/executor/codex_executor.go | 6 ++-- internal/runtime/executor/iflow_executor.go | 4 +-- .../executor/openai_compat_executor.go | 4 +-- internal/runtime/executor/payload_helpers.go | 30 +++------------- internal/runtime/executor/qwen_executor.go | 4 +-- internal/util/thinking_suffix.go | 34 ++++++++++++++++--- 6 files changed, 44 insertions(+), 38 deletions(-) diff --git a/internal/runtime/executor/codex_executor.go b/internal/runtime/executor/codex_executor.go index 7003373f7..b9470b3c3 100644 --- a/internal/runtime/executor/codex_executor.go +++ b/internal/runtime/executor/codex_executor.go @@ -54,7 +54,7 @@ func (e *CodexExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, re from := opts.SourceFormat to := sdktranslator.FromString("codex") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false) - body = applyReasoningEffortMetadata(body, req.Metadata, req.Model) + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning.effort") body = normalizeThinkingConfig(body, upstreamModel) if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { return resp, errValidate @@ -152,7 +152,7 @@ func (e *CodexExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au to := sdktranslator.FromString("codex") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true) - body = applyReasoningEffortMetadata(body, req.Metadata, req.Model) + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning.effort") body = normalizeThinkingConfig(body, upstreamModel) if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { return nil, errValidate @@ -254,7 +254,7 @@ func (e *CodexExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Auth modelForCounting := req.Model - body = applyReasoningEffortMetadata(body, req.Metadata, req.Model) + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning.effort") body, _ = sjson.SetBytes(body, "model", upstreamModel) body, _ = sjson.DeleteBytes(body, "previous_response_id") body, _ = sjson.SetBytes(body, "stream", false) diff --git a/internal/runtime/executor/iflow_executor.go b/internal/runtime/executor/iflow_executor.go index c68a64315..a445e47da 100644 --- a/internal/runtime/executor/iflow_executor.go +++ b/internal/runtime/executor/iflow_executor.go @@ -57,7 +57,7 @@ func (e *IFlowExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, re from := opts.SourceFormat to := sdktranslator.FromString("openai") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), 
false) - body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model) + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort") if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { body, _ = sjson.SetBytes(body, "model", upstreamModel) } @@ -143,7 +143,7 @@ func (e *IFlowExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au to := sdktranslator.FromString("openai") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true) - body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model) + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort") if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { body, _ = sjson.SetBytes(body, "model", upstreamModel) } diff --git a/internal/runtime/executor/openai_compat_executor.go b/internal/runtime/executor/openai_compat_executor.go index 507b0fd90..68b2963a6 100644 --- a/internal/runtime/executor/openai_compat_executor.go +++ b/internal/runtime/executor/openai_compat_executor.go @@ -58,7 +58,7 @@ func (e *OpenAICompatExecutor) Execute(ctx context.Context, auth *cliproxyauth.A translated = e.overrideModel(translated, modelOverride) } translated = applyPayloadConfigWithRoot(e.cfg, req.Model, to.String(), "", translated) - translated = applyReasoningEffortMetadataChatCompletions(translated, req.Metadata, req.Model) + translated = applyReasoningEffortMetadata(translated, req.Metadata, req.Model, "reasoning_effort") upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) if upstreamModel != "" { translated, _ = sjson.SetBytes(translated, "model", upstreamModel) @@ -152,7 +152,7 @@ func (e *OpenAICompatExecutor) ExecuteStream(ctx context.Context, auth *cliproxy translated = e.overrideModel(translated, modelOverride) } translated = applyPayloadConfigWithRoot(e.cfg, req.Model, to.String(), "", translated) - translated = applyReasoningEffortMetadataChatCompletions(translated, req.Metadata, req.Model) + translated = applyReasoningEffortMetadata(translated, req.Metadata, req.Model, "reasoning_effort") upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) if upstreamModel != "" { translated, _ = sjson.SetBytes(translated, "model", upstreamModel) diff --git a/internal/runtime/executor/payload_helpers.go b/internal/runtime/executor/payload_helpers.go index 5711bbbdd..61486d62c 100644 --- a/internal/runtime/executor/payload_helpers.go +++ b/internal/runtime/executor/payload_helpers.go @@ -45,40 +45,20 @@ func applyThinkingMetadataCLI(payload []byte, metadata map[string]any, model str return util.ApplyGeminiCLIThinkingConfig(payload, budgetOverride, includeOverride) } -// applyReasoningEffortMetadata applies reasoning effort overrides (reasoning.effort) when present in metadata. -// It avoids overwriting an existing reasoning.effort field and only applies to models that support thinking. -func applyReasoningEffortMetadata(payload []byte, metadata map[string]any, model string) []byte { +// applyReasoningEffortMetadata applies reasoning effort overrides from metadata to the given JSON path. +// Metadata values take precedence over any existing field when the model supports thinking. 
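+// The Codex executor passes "reasoning.effort"; chat-completions executors pass "reasoning_effort".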
+func applyReasoningEffortMetadata(payload []byte, metadata map[string]any, model, field string) []byte { if len(metadata) == 0 { return payload } if !util.ModelSupportsThinking(model) { return payload } - if gjson.GetBytes(payload, "reasoning.effort").Exists() { + if field == "" { return payload } if effort, ok := util.ReasoningEffortFromMetadata(metadata); ok && effort != "" { - if updated, err := sjson.SetBytes(payload, "reasoning.effort", effort); err == nil { - return updated - } - } - return payload -} - -// applyReasoningEffortMetadataChatCompletions applies reasoning_effort (OpenAI chat completions field) -// when present in metadata. It avoids overwriting an existing reasoning_effort field. -func applyReasoningEffortMetadataChatCompletions(payload []byte, metadata map[string]any, model string) []byte { - if len(metadata) == 0 { - return payload - } - if !util.ModelSupportsThinking(model) { - return payload - } - if gjson.GetBytes(payload, "reasoning_effort").Exists() { - return payload - } - if effort, ok := util.ReasoningEffortFromMetadata(metadata); ok && effort != "" { - if updated, err := sjson.SetBytes(payload, "reasoning_effort", effort); err == nil { + if updated, err := sjson.SetBytes(payload, field, effort); err == nil { return updated } } diff --git a/internal/runtime/executor/qwen_executor.go b/internal/runtime/executor/qwen_executor.go index f060cb61e..d25ed5da7 100644 --- a/internal/runtime/executor/qwen_executor.go +++ b/internal/runtime/executor/qwen_executor.go @@ -51,7 +51,7 @@ func (e *QwenExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req from := opts.SourceFormat to := sdktranslator.FromString("openai") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false) - body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model) + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort") if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { body, _ = sjson.SetBytes(body, "model", upstreamModel) } @@ -126,7 +126,7 @@ func (e *QwenExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Aut to := sdktranslator.FromString("openai") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true) - body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model) + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort") if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { body, _ = sjson.SetBytes(body, "model", upstreamModel) } diff --git a/internal/util/thinking_suffix.go b/internal/util/thinking_suffix.go index c2d806ad9..47ce42f75 100644 --- a/internal/util/thinking_suffix.go +++ b/internal/util/thinking_suffix.go @@ -55,16 +55,42 @@ func NormalizeThinkingModel(modelName string) (string, map[string]any) { value := modelName[idx+len("-thinking-"):] if value != "" { if parsed, ok := parseIntPrefix(value); ok { - baseModel = modelName[:idx] - budgetOverride = &parsed - matched = true + candidateBase := modelName[:idx] + if ModelUsesThinkingLevels(candidateBase) { + baseModel = candidateBase + // Numeric suffix on level-aware models should still surface as reasoning effort metadata. 
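+ // e.g. "gpt-5.1-thinking-128" surfaces "128" as the effort string rather than a token budget.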
+ raw := strings.ToLower(strings.TrimSpace(value)) + if raw != "" { + reasoningEffort = &raw + } + matched = true + } else { + baseModel = candidateBase + budgetOverride = &parsed + matched = true + } } else { baseModel = modelName[:idx] if normalized, ok := NormalizeReasoningEffortLevel(baseModel, value); ok { reasoningEffort = &normalized matched = true + } else if !ModelUsesThinkingLevels(baseModel) { + // Keep unknown effort tokens so callers can honor user intent even without normalization. + raw := strings.ToLower(strings.TrimSpace(value)) + if raw != "" { + reasoningEffort = &raw + matched = true + } else { + baseModel = modelName + } } else { - baseModel = modelName + raw := strings.ToLower(strings.TrimSpace(value)) + if raw != "" { + reasoningEffort = &raw + matched = true + } else { + baseModel = modelName + } } } } From 007572b58e2e6577f3c9a9a83d946e3b9c757437 Mon Sep 17 00:00:00 2001 From: hkfires <10558748+hkfires@users.noreply.github.com> Date: Thu, 11 Dec 2025 15:52:14 +0800 Subject: [PATCH 09/15] fix(util): do not strip thinking suffix on registered models NormalizeThinkingModel now checks ModelSupportsThinking before removing "-thinking" or "-thinking-", avoiding accidental parsing of model names where the suffix is part of the official id (e.g., kimi-k2-thinking, qwen3-235b-a22b-thinking-2507). The registry adds ThinkingSupport metadata for several models and propagates it via ModelInfo (e.g., kimi-k2-thinking, deepseek-r1, qwen3-235b-a22b-thinking-2507, minimax-m2), enabling accurate detection of thinking-capable models and correcting base model inference. --- internal/registry/model_definitions.go | 10 ++++++---- internal/util/thinking_suffix.go | 19 +++++++++++++++---- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/internal/registry/model_definitions.go b/internal/registry/model_definitions.go index 9956d964f..adaff867c 100644 --- a/internal/registry/model_definitions.go +++ b/internal/registry/model_definitions.go @@ -623,6 +623,7 @@ func GetIFlowModels() []*ModelInfo { DisplayName string Description string Created int64 + Thinking *ThinkingSupport }{ {ID: "tstars2.0", DisplayName: "TStars-2.0", Description: "iFlow TStars-2.0 multimodal assistant", Created: 1746489600}, {ID: "qwen3-coder-plus", DisplayName: "Qwen3-Coder-Plus", Description: "Qwen3 Coder Plus code generation", Created: 1753228800}, @@ -632,17 +633,17 @@ func GetIFlowModels() []*ModelInfo { {ID: "kimi-k2-0905", DisplayName: "Kimi-K2-Instruct-0905", Description: "Moonshot Kimi K2 instruct 0905", Created: 1757030400}, {ID: "glm-4.6", DisplayName: "GLM-4.6", Description: "Zhipu GLM 4.6 general model", Created: 1759190400}, {ID: "kimi-k2", DisplayName: "Kimi-K2", Description: "Moonshot Kimi K2 general model", Created: 1752192000}, - {ID: "kimi-k2-thinking", DisplayName: "Kimi-K2-Thinking", Description: "Moonshot Kimi K2 general model", Created: 1762387200}, + {ID: "kimi-k2-thinking", DisplayName: "Kimi-K2-Thinking", Description: "Moonshot Kimi K2 thinking model", Created: 1762387200, Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}}, {ID: "deepseek-v3.2-chat", DisplayName: "DeepSeek-V3.2", Description: "DeepSeek V3.2", Created: 1764576000}, {ID: "deepseek-v3.2", DisplayName: "DeepSeek-V3.2-Exp", Description: "DeepSeek V3.2 experimental", Created: 1759104000}, {ID: "deepseek-v3.1", DisplayName: "DeepSeek-V3.1-Terminus", Description: "DeepSeek V3.1 Terminus", Created: 1756339200}, - {ID: "deepseek-r1", DisplayName: "DeepSeek-R1", Description: "DeepSeek reasoning model R1", 
Created: 1737331200}, + {ID: "deepseek-r1", DisplayName: "DeepSeek-R1", Description: "DeepSeek reasoning model R1", Created: 1737331200, Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}}, {ID: "deepseek-v3", DisplayName: "DeepSeek-V3-671B", Description: "DeepSeek V3 671B", Created: 1734307200}, {ID: "qwen3-32b", DisplayName: "Qwen3-32B", Description: "Qwen3 32B", Created: 1747094400}, - {ID: "qwen3-235b-a22b-thinking-2507", DisplayName: "Qwen3-235B-A22B-Thinking", Description: "Qwen3 235B A22B Thinking (2507)", Created: 1753401600}, + {ID: "qwen3-235b-a22b-thinking-2507", DisplayName: "Qwen3-235B-A22B-Thinking", Description: "Qwen3 235B A22B Thinking (2507)", Created: 1753401600, Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}}, {ID: "qwen3-235b-a22b-instruct", DisplayName: "Qwen3-235B-A22B-Instruct", Description: "Qwen3 235B A22B Instruct", Created: 1753401600}, {ID: "qwen3-235b", DisplayName: "Qwen3-235B-A22B", Description: "Qwen3 235B A22B", Created: 1753401600}, - {ID: "minimax-m2", DisplayName: "MiniMax-M2", Description: "MiniMax M2", Created: 1758672000}, + {ID: "minimax-m2", DisplayName: "MiniMax-M2", Description: "MiniMax M2", Created: 1758672000, Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}}, } models := make([]*ModelInfo, 0, len(entries)) for _, entry := range entries { @@ -654,6 +655,7 @@ func GetIFlowModels() []*ModelInfo { Type: "iflow", DisplayName: entry.DisplayName, Description: entry.Description, + Thinking: entry.Thinking, }) } return models diff --git a/internal/util/thinking_suffix.go b/internal/util/thinking_suffix.go index 47ce42f75..ef8302b0e 100644 --- a/internal/util/thinking_suffix.go +++ b/internal/util/thinking_suffix.go @@ -52,6 +52,11 @@ func NormalizeThinkingModel(modelName string) (string, map[string]any) { matched = true default: if idx := strings.LastIndex(lower, "-thinking-"); idx != -1 { + // Skip stripping if the original model is a registered thinking model. + // This prevents "-thinking-2507" in "qwen3-235b-a22b-thinking-2507" from being parsed. + if ModelSupportsThinking(modelName) { + break + } value := modelName[idx+len("-thinking-"):] if value != "" { if parsed, ok := parseIntPrefix(value); ok { @@ -95,10 +100,16 @@ func NormalizeThinkingModel(modelName string) (string, map[string]any) { } } } else if strings.HasSuffix(lower, "-thinking") { - baseModel = modelName[:len(modelName)-len("-thinking")] - effort := "medium" - reasoningEffort = &effort - matched = true + candidateBase := modelName[:len(modelName)-len("-thinking")] + // Only strip the suffix if the original model is NOT a registered thinking model. + // This prevents stripping "-thinking" from models like "kimi-k2-thinking" where + // the suffix is part of the model's actual name. 
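+		// e.g., "kimi-k2-thinking" is returned unchanged, while a hypothetical unregistered
+		// "foo-thinking" still normalizes to base "foo" with a default effort of "medium".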
+ if !ModelSupportsThinking(modelName) { + baseModel = candidateBase + effort := "medium" + reasoningEffort = &effort + matched = true + } } } From f6300c72b790c6017a08ceacc425f9863907493d Mon Sep 17 00:00:00 2001 From: hkfires <10558748+hkfires@users.noreply.github.com> Date: Thu, 11 Dec 2025 16:21:50 +0800 Subject: [PATCH 10/15] fix(runtime): validate thinking config in iflow and qwen --- internal/runtime/executor/iflow_executor.go | 14 ++++++++++++-- internal/runtime/executor/qwen_executor.go | 14 ++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/internal/runtime/executor/iflow_executor.go b/internal/runtime/executor/iflow_executor.go index a445e47da..d1a69812d 100644 --- a/internal/runtime/executor/iflow_executor.go +++ b/internal/runtime/executor/iflow_executor.go @@ -58,9 +58,14 @@ func (e *IFlowExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, re to := sdktranslator.FromString("openai") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false) body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort") - if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { + upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) + if upstreamModel != "" { body, _ = sjson.SetBytes(body, "model", upstreamModel) } + body = normalizeThinkingConfig(body, upstreamModel) + if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { + return resp, errValidate + } body = applyPayloadConfig(e.cfg, req.Model, body) endpoint := strings.TrimSuffix(baseURL, "/") + iflowDefaultEndpoint @@ -144,9 +149,14 @@ func (e *IFlowExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true) body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort") - if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { + upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) + if upstreamModel != "" { body, _ = sjson.SetBytes(body, "model", upstreamModel) } + body = normalizeThinkingConfig(body, upstreamModel) + if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { + return nil, errValidate + } // Ensure tools array exists to avoid provider quirks similar to Qwen's behaviour. 
toolsResult := gjson.GetBytes(body, "tools") if toolsResult.Exists() && toolsResult.IsArray() && len(toolsResult.Array()) == 0 { diff --git a/internal/runtime/executor/qwen_executor.go b/internal/runtime/executor/qwen_executor.go index d25ed5da7..2b8d0e502 100644 --- a/internal/runtime/executor/qwen_executor.go +++ b/internal/runtime/executor/qwen_executor.go @@ -52,9 +52,14 @@ func (e *QwenExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req to := sdktranslator.FromString("openai") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false) body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort") - if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { + upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) + if upstreamModel != "" { body, _ = sjson.SetBytes(body, "model", upstreamModel) } + body = normalizeThinkingConfig(body, upstreamModel) + if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { + return resp, errValidate + } body = applyPayloadConfig(e.cfg, req.Model, body) url := strings.TrimSuffix(baseURL, "/") + "/chat/completions" @@ -127,9 +132,14 @@ func (e *QwenExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Aut body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true) body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort") - if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" { + upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) + if upstreamModel != "" { body, _ = sjson.SetBytes(body, "model", upstreamModel) } + body = normalizeThinkingConfig(body, upstreamModel) + if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil { + return nil, errValidate + } toolsResult := gjson.GetBytes(body, "tools") // I'm addressing the Qwen3 "poisoning" issue, which is caused by the model needing a tool to be defined. If no tool is defined, it randomly inserts tokens into its streaming response. // This will have no real consequences. It's just to scare Qwen3. From 21bbceca0ce75e651f9dd0a29a681f2c580c661f Mon Sep 17 00:00:00 2001 From: hkfires <10558748+hkfires@users.noreply.github.com> Date: Thu, 11 Dec 2025 16:35:36 +0800 Subject: [PATCH 11/15] docs(runtime): document reasoning effort precedence --- internal/runtime/executor/payload_helpers.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/runtime/executor/payload_helpers.go b/internal/runtime/executor/payload_helpers.go index 61486d62c..9c45681ac 100644 --- a/internal/runtime/executor/payload_helpers.go +++ b/internal/runtime/executor/payload_helpers.go @@ -46,7 +46,8 @@ func applyThinkingMetadataCLI(payload []byte, metadata map[string]any, model str } // applyReasoningEffortMetadata applies reasoning effort overrides from metadata to the given JSON path. -// Metadata values take precedence over any existing field when the model supports thinking. +// Metadata values take precedence over any existing field when the model supports thinking, intentionally +// overwriting caller-provided values to honor suffix/default metadata priority. 
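+// For example, a payload that already contains {"reasoning_effort":"low"} ends up with "high"
+// when the metadata derived from the requested model name specifies "high".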
func applyReasoningEffortMetadata(payload []byte, metadata map[string]any, model, field string) []byte { if len(metadata) == 0 { return payload From 6285459c08e9f6f5996374085053892d2d5b91fa Mon Sep 17 00:00:00 2001 From: hkfires <10558748+hkfires@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:20:44 +0800 Subject: [PATCH 12/15] fix(runtime): unify claude thinking config resolution --- internal/runtime/executor/claude_executor.go | 56 +++----------------- internal/util/claude_thinking.go | 46 ++++++++++++++++ 2 files changed, 52 insertions(+), 50 deletions(-) create mode 100644 internal/util/claude_thinking.go diff --git a/internal/runtime/executor/claude_executor.go b/internal/runtime/executor/claude_executor.go index c7470954e..6af086080 100644 --- a/internal/runtime/executor/claude_executor.go +++ b/internal/runtime/executor/claude_executor.go @@ -450,59 +450,15 @@ func extractAndRemoveBetas(body []byte) ([]string, []byte) { return betas, body } -// injectThinkingConfig adds thinking configuration based on metadata or legacy suffixes. +// injectThinkingConfig adds thinking configuration based on metadata using the unified flow. +// It uses util.ResolveClaudeThinkingConfig which internally calls ResolveThinkingConfigFromMetadata +// and NormalizeThinkingBudget, ensuring consistency with other executors like Gemini. func (e *ClaudeExecutor) injectThinkingConfig(modelName string, metadata map[string]any, body []byte) []byte { - // Only inject if thinking config is not already present - if gjson.GetBytes(body, "thinking").Exists() { + budget, ok := util.ResolveClaudeThinkingConfig(modelName, metadata) + if !ok { return body } - - budgetTokens, ok := resolveClaudeThinkingBudget(modelName, metadata) - if !ok || budgetTokens <= 0 { - return body - } - - body, _ = sjson.SetBytes(body, "thinking.type", "enabled") - body, _ = sjson.SetBytes(body, "thinking.budget_tokens", budgetTokens) - return body -} - -func resolveClaudeThinkingBudget(modelName string, metadata map[string]any) (int, bool) { - budget, include, effort, matched := util.ThinkingFromMetadata(metadata) - if matched { - if include != nil && !*include { - return 0, false - } - if budget != nil { - normalized := util.NormalizeThinkingBudget(modelName, *budget) - if normalized > 0 { - return normalized, true - } - return 0, false - } - if effort != nil { - if derived, ok := util.ThinkingEffortToBudget(modelName, *effort); ok && derived > 0 { - return derived, true - } - } - } - return claudeBudgetFromSuffix(modelName) -} - -func claudeBudgetFromSuffix(modelName string) (int, bool) { - lower := strings.ToLower(strings.TrimSpace(modelName)) - switch { - case strings.HasSuffix(lower, "-thinking-low"): - return 1024, true - case strings.HasSuffix(lower, "-thinking-medium"): - return 8192, true - case strings.HasSuffix(lower, "-thinking-high"): - return 24576, true - case strings.HasSuffix(lower, "-thinking"): - return 8192, true - default: - return 0, false - } + return util.ApplyClaudeThinkingConfig(body, budget) } // ensureMaxTokensForThinking ensures max_tokens > thinking.budget_tokens when thinking is enabled. diff --git a/internal/util/claude_thinking.go b/internal/util/claude_thinking.go new file mode 100644 index 000000000..b0c5a0a2f --- /dev/null +++ b/internal/util/claude_thinking.go @@ -0,0 +1,46 @@ +package util + +import ( + "github.com/tidwall/gjson" + "github.com/tidwall/sjson" +) + +// ApplyClaudeThinkingConfig applies thinking configuration to a Claude API request payload. 
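+// The body is expected to be a JSON-encoded Claude Messages API request.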
+// It sets the thinking.type to "enabled" and thinking.budget_tokens to the specified budget.
+// If budget is nil or non-positive, or the payload already has thinking config, it returns the
+// payload unchanged.
+func ApplyClaudeThinkingConfig(body []byte, budget *int) []byte {
+	if budget == nil {
+		return body
+	}
+	if gjson.GetBytes(body, "thinking").Exists() {
+		return body
+	}
+	if *budget <= 0 {
+		return body
+	}
+	updated := body
+	updated, _ = sjson.SetBytes(updated, "thinking.type", "enabled")
+	updated, _ = sjson.SetBytes(updated, "thinking.budget_tokens", *budget)
+	return updated
+}
+
+// ResolveClaudeThinkingConfig resolves thinking configuration from metadata for Claude models.
+// It uses the unified ResolveThinkingConfigFromMetadata and normalizes the budget.
+// Returns the normalized budget (nil if thinking should not be enabled) and whether it matched.
+func ResolveClaudeThinkingConfig(modelName string, metadata map[string]any) (*int, bool) {
+	budget, include, matched := ResolveThinkingConfigFromMetadata(modelName, metadata)
+	if !matched {
+		return nil, false
+	}
+	if include != nil && !*include {
+		return nil, true
+	}
+	if budget == nil {
+		return nil, true
+	}
+	normalized := NormalizeThinkingBudget(modelName, *budget)
+	if normalized <= 0 {
+		return nil, true
+	}
+	return &normalized, true
+}
From facfe7c518cb528426dcb82c7f927e4f151bea33 Mon Sep 17 00:00:00 2001
From: hkfires <10558748+hkfires@users.noreply.github.com>
Date: Thu, 11 Dec 2025 18:17:28 +0800
Subject: [PATCH 13/15] refactor(thinking): use bracket tags for thinking meta

Align thinking suffix handling on a single bracket-style marker.
NormalizeThinkingModel strips a terminal `[value]` segment from model
identifiers and turns it into either a thinking budget (for numeric
values) or a reasoning effort hint (for strings). Emission of
`ThinkingIncludeThoughtsMetadataKey` is removed.

Executor helpers and the example config are updated so their comments
reference the new `[value]` suffix format instead of the legacy dash
variants.

BREAKING CHANGE: dash-based thinking suffixes (`-thinking`,
`-thinking-N`, `-reasoning`, `-nothinking`) are no longer parsed for
thinking metadata; only `[value]` annotations are recognized.
---
 config.example.yaml                          |   2 +-
 internal/runtime/executor/payload_helpers.go |   4 +-
 internal/util/thinking_suffix.go             | 122 ++++++-------------
 3 files changed, 41 insertions(+), 87 deletions(-)

diff --git a/config.example.yaml b/config.example.yaml
index dfd7454bd..31f169737 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -100,7 +100,7 @@ ws-auth: false
 # excluded-models:
 #   - "claude-opus-4-5-20251101" # exclude specific models (exact match)
 #   - "claude-3-*" # wildcard matching prefix (e.g. claude-3-7-sonnet-20250219)
-#   - "*-think" # wildcard matching suffix (e.g. claude-opus-4-5-thinking)
+#   - "*-thinking" # wildcard matching suffix (e.g. claude-opus-4-5-thinking)
 #   - "*haiku*" # wildcard matching substring (e.g.
claude-3-5-haiku-20241022)
 
 # OpenAI compatibility providers
diff --git a/internal/runtime/executor/payload_helpers.go b/internal/runtime/executor/payload_helpers.go
index 9c45681ac..be2498685 100644
--- a/internal/runtime/executor/payload_helpers.go
+++ b/internal/runtime/executor/payload_helpers.go
@@ -11,7 +11,7 @@ import (
 	"github.com/tidwall/sjson"
 )
 
-// applyThinkingMetadata applies thinking config from model suffix metadata (e.g., -reasoning, -thinking-N)
+// applyThinkingMetadata applies thinking config from model suffix metadata (e.g., [high], [8192])
 // for standard Gemini format payloads. It normalizes the budget when the model supports thinking.
 func applyThinkingMetadata(payload []byte, metadata map[string]any, model string) []byte {
 	budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(model, metadata)
@@ -28,7 +28,7 @@ func applyThinkingMetadata(payload []byte, metadata map[string]any, model string
 	return util.ApplyGeminiThinkingConfig(payload, budgetOverride, includeOverride)
 }
 
-// applyThinkingMetadataCLI applies thinking config from model suffix metadata (e.g., -reasoning, -thinking-N)
+// applyThinkingMetadataCLI applies thinking config from model suffix metadata (e.g., [high], [8192])
 // for Gemini CLI format payloads (nested under "request"). It normalizes the budget when the model supports thinking.
 func applyThinkingMetadataCLI(payload []byte, metadata map[string]any, model string) []byte {
 	budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(model, metadata)
diff --git a/internal/util/thinking_suffix.go b/internal/util/thinking_suffix.go
index ef8302b0e..c9a68534e 100644
--- a/internal/util/thinking_suffix.go
+++ b/internal/util/thinking_suffix.go
@@ -14,100 +14,57 @@ const (
 )
 
 // NormalizeThinkingModel parses dynamic thinking suffixes on model names and returns
-// the normalized base model with extracted metadata. Supported patterns:
-// - "-thinking-<number>" extracts a numeric budget
-// - "-thinking-<level>" extracts a reasoning effort level (minimal/low/medium/high/xhigh/auto/none)
-// - "-thinking" maps to a default reasoning effort of "medium"
-// - "-reasoning" maps to dynamic budget (-1) and include_thoughts=true
-// - "-nothinking" maps to budget=0 and include_thoughts=false
+// the normalized base model with extracted metadata. Supported pattern:
+// - "[<value>]" where value can be:
+//   - A numeric budget (e.g., "[8192]", "[16384]")
+//   - A reasoning effort level (e.g., "[high]", "[medium]", "[low]")
+//
+// Examples:
+//   - "claude-sonnet-4-5-20250929[16384]" → budget=16384
+//   - "gpt-5.1[high]" → reasoning_effort="high"
+//   - "gemini-2.5-pro[32768]" → budget=32768
+//
+// Note: Empty brackets "[]" are not supported and will be ignored.
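+//
+// Effort values are lowercased and trimmed before being stored as metadata; numeric values
+// are parsed into a thinking budget via parseIntPrefix.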
func NormalizeThinkingModel(modelName string) (string, map[string]any) { if modelName == "" { return modelName, nil } - lower := strings.ToLower(modelName) baseModel := modelName var ( budgetOverride *int - includeThoughts *bool reasoningEffort *string matched bool ) - switch { - case strings.HasSuffix(lower, "-nothinking"): - baseModel = modelName[:len(modelName)-len("-nothinking")] - budget := 0 - include := false - budgetOverride = &budget - includeThoughts = &include - matched = true - case strings.HasSuffix(lower, "-reasoning"): - baseModel = modelName[:len(modelName)-len("-reasoning")] - budget := -1 - include := true - budgetOverride = &budget - includeThoughts = &include - matched = true - default: - if idx := strings.LastIndex(lower, "-thinking-"); idx != -1 { - // Skip stripping if the original model is a registered thinking model. - // This prevents "-thinking-2507" in "qwen3-235b-a22b-thinking-2507" from being parsed. - if ModelSupportsThinking(modelName) { - break - } - value := modelName[idx+len("-thinking-"):] - if value != "" { - if parsed, ok := parseIntPrefix(value); ok { - candidateBase := modelName[:idx] - if ModelUsesThinkingLevels(candidateBase) { - baseModel = candidateBase - // Numeric suffix on level-aware models should still surface as reasoning effort metadata. - raw := strings.ToLower(strings.TrimSpace(value)) - if raw != "" { - reasoningEffort = &raw - } - matched = true - } else { - baseModel = candidateBase - budgetOverride = &parsed - matched = true - } - } else { - baseModel = modelName[:idx] - if normalized, ok := NormalizeReasoningEffortLevel(baseModel, value); ok { - reasoningEffort = &normalized - matched = true - } else if !ModelUsesThinkingLevels(baseModel) { - // Keep unknown effort tokens so callers can honor user intent even without normalization. - raw := strings.ToLower(strings.TrimSpace(value)) - if raw != "" { - reasoningEffort = &raw - matched = true - } else { - baseModel = modelName - } - } else { - raw := strings.ToLower(strings.TrimSpace(value)) - if raw != "" { - reasoningEffort = &raw - matched = true - } else { - baseModel = modelName - } - } - } - } - } else if strings.HasSuffix(lower, "-thinking") { - candidateBase := modelName[:len(modelName)-len("-thinking")] - // Only strip the suffix if the original model is NOT a registered thinking model. - // This prevents stripping "-thinking" from models like "kimi-k2-thinking" where - // the suffix is part of the model's actual name. 
- if !ModelSupportsThinking(modelName) { - baseModel = candidateBase - effort := "medium" - reasoningEffort = &effort + // Match "[value]" pattern at the end of the model name + if idx := strings.LastIndex(modelName, "["); idx != -1 { + if !strings.HasSuffix(modelName, "]") { + // Incomplete bracket, ignore + return baseModel, nil + } + + value := modelName[idx+1 : len(modelName)-1] // Extract content between [ and ] + if value == "" { + // Empty brackets not supported + return baseModel, nil + } + + candidateBase := modelName[:idx] + + // Auto-detect: pure numeric → budget, string → reasoning effort level + if parsed, ok := parseIntPrefix(value); ok { + // Numeric value: treat as thinking budget + baseModel = candidateBase + budgetOverride = &parsed + matched = true + } else { + // String value: treat as reasoning effort level + baseModel = candidateBase + raw := strings.ToLower(strings.TrimSpace(value)) + if raw != "" { + reasoningEffort = &raw matched = true } } @@ -123,9 +80,6 @@ func NormalizeThinkingModel(modelName string) (string, map[string]any) { if budgetOverride != nil { metadata[ThinkingBudgetMetadataKey] = *budgetOverride } - if includeThoughts != nil { - metadata[ThinkingIncludeThoughtsMetadataKey] = *includeThoughts - } if reasoningEffort != nil { metadata[ReasoningEffortMetadataKey] = *reasoningEffort } From e79f65fd8efbac89e02e7d39d8bde89878178239 Mon Sep 17 00:00:00 2001 From: hkfires <10558748+hkfires@users.noreply.github.com> Date: Thu, 11 Dec 2025 18:39:07 +0800 Subject: [PATCH 14/15] refactor(thinking): use parentheses for metadata suffix --- internal/runtime/executor/payload_helpers.go | 4 +-- internal/util/thinking_suffix.go | 26 ++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/internal/runtime/executor/payload_helpers.go b/internal/runtime/executor/payload_helpers.go index be2498685..9bc82f1f3 100644 --- a/internal/runtime/executor/payload_helpers.go +++ b/internal/runtime/executor/payload_helpers.go @@ -11,7 +11,7 @@ import ( "github.com/tidwall/sjson" ) -// applyThinkingMetadata applies thinking config from model suffix metadata (e.g., [high], [8192]) +// applyThinkingMetadata applies thinking config from model suffix metadata (e.g., (high), (8192)) // for standard Gemini format payloads. It normalizes the budget when the model supports thinking. func applyThinkingMetadata(payload []byte, metadata map[string]any, model string) []byte { budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(model, metadata) @@ -28,7 +28,7 @@ func applyThinkingMetadata(payload []byte, metadata map[string]any, model string return util.ApplyGeminiThinkingConfig(payload, budgetOverride, includeOverride) } -// applyThinkingMetadataCLI applies thinking config from model suffix metadata (e.g., [high], [8192]) +// applyThinkingMetadataCLI applies thinking config from model suffix metadata (e.g., (high), (8192)) // for Gemini CLI format payloads (nested under "request"). It normalizes the budget when the model supports thinking. 
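+// For example, metadata derived from "gemini-2.5-pro(32768)" applies a thinking budget of 32768
+// under the nested "request" object.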
 func applyThinkingMetadataCLI(payload []byte, metadata map[string]any, model string) []byte {
 	budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(model, metadata)
diff --git a/internal/util/thinking_suffix.go b/internal/util/thinking_suffix.go
index c9a68534e..7851c580f 100644
--- a/internal/util/thinking_suffix.go
+++ b/internal/util/thinking_suffix.go
@@ -15,16 +15,16 @@ const (
 
 // NormalizeThinkingModel parses dynamic thinking suffixes on model names and returns
 // the normalized base model with extracted metadata. Supported pattern:
-// - "[<value>]" where value can be:
-//   - A numeric budget (e.g., "[8192]", "[16384]")
-//   - A reasoning effort level (e.g., "[high]", "[medium]", "[low]")
+// - "(<value>)" where value can be:
+//   - A numeric budget (e.g., "(8192)", "(16384)")
+//   - A reasoning effort level (e.g., "(high)", "(medium)", "(low)")
 //
 // Examples:
-//   - "claude-sonnet-4-5-20250929[16384]" → budget=16384
-//   - "gpt-5.1[high]" → reasoning_effort="high"
-//   - "gemini-2.5-pro[32768]" → budget=32768
+//   - "claude-sonnet-4-5-20250929(16384)" → budget=16384
+//   - "gpt-5.1(high)" → reasoning_effort="high"
+//   - "gemini-2.5-pro(32768)" → budget=32768
 //
-// Note: Empty brackets "[]" are not supported and will be ignored.
+// Note: Empty parentheses "()" are not supported and will be ignored.
 func NormalizeThinkingModel(modelName string) (string, map[string]any) {
 	if modelName == "" {
 		return modelName, nil
@@ -38,16 +38,16 @@ func NormalizeThinkingModel(modelName string) (string, map[string]any) {
 		matched         bool
 	)
 
-	// Match "[value]" pattern at the end of the model name
-	if idx := strings.LastIndex(modelName, "["); idx != -1 {
-		if !strings.HasSuffix(modelName, "]") {
-			// Incomplete bracket, ignore
+	// Match "(value)" pattern at the end of the model name
+	if idx := strings.LastIndex(modelName, "("); idx != -1 {
+		if !strings.HasSuffix(modelName, ")") {
+			// Incomplete parenthesis, ignore
 			return baseModel, nil
 		}
 
-		value := modelName[idx+1 : len(modelName)-1] // Extract content between [ and ]
+		value := modelName[idx+1 : len(modelName)-1] // Extract content between ( and )
 		if value == "" {
-			// Empty brackets not supported
+			// Empty parentheses not supported
 			return baseModel, nil
 		}
 
From 88bdd25f06b773763a48f9f83f358d84eea57846 Mon Sep 17 00:00:00 2001
From: hkfires <10558748+hkfires@users.noreply.github.com>
Date: Thu, 11 Dec 2025 20:12:06 +0800
Subject: [PATCH 15/15] fix(amp): set status on claude stream errors

---
 sdk/api/handlers/claude/code_handlers.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/sdk/api/handlers/claude/code_handlers.go b/sdk/api/handlers/claude/code_handlers.go
index 63ea6065e..8a57a0cc6 100644
--- a/sdk/api/handlers/claude/code_handlers.go
+++ b/sdk/api/handlers/claude/code_handlers.go
@@ -271,6 +271,11 @@ func (h *ClaudeCodeAPIHandler) forwardClaudeStream(c *gin.Context, flusher http.
 			continue
 		}
 		if errMsg != nil {
+			status := http.StatusInternalServerError
+			if errMsg.StatusCode > 0 {
+				status = errMsg.StatusCode
+			}
+			c.Status(status)
 			// An error occurred: emit as a proper SSE error event
 			errorBytes, _ := json.Marshal(h.toClaudeError(errMsg))
 			_, _ = writer.WriteString("event: error\n")
@@ -278,6 +283,7 @@ func (h *ClaudeCodeAPIHandler) forwardClaudeStream(c *gin.Context, flusher http.
 			_, _ = writer.Write(errorBytes)
 			_, _ = writer.WriteString("\n\n")
 			_ = writer.Flush()
+			flusher.Flush()
 		}
 		var execErr error
 		if errMsg != nil {