router-for-me · luispater · Dec 16, 2025 · Dec 15, 2025 · Dec 15, 2025 · Dec 15, 2025
diff --git a/internal/runtime/executor/kiro_executor.go b/internal/runtime/executor/kiro_executor.go
@@ -166,16 +166,17 @@ type KiroExecutor struct {
 // This is critical because OpenAI and Claude formats have different tool structures:
 // - OpenAI: tools[].function.name, tools[].function.description
 // - Claude: tools[].name, tools[].description
+// headers parameter allows checking Anthropic-Beta header for thinking mode detection.
 // Returns the serialized JSON payload and a boolean indicating whether thinking mode was injected.
-func buildKiroPayloadForFormat(body []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool, sourceFormat sdktranslator.Format) ([]byte, bool) {
+func buildKiroPayloadForFormat(body []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool, sourceFormat sdktranslator.Format, headers http.Header) ([]byte, bool) {
 	switch sourceFormat.String() {
 	case "openai":
 		log.Debugf("kiro: using OpenAI payload builder for source format: %s", sourceFormat.String())
-		return kiroopenai.BuildKiroPayloadFromOpenAI(body, modelID, profileArn, origin, isAgentic, isChatOnly)
+		return kiroopenai.BuildKiroPayloadFromOpenAI(body, modelID, profileArn, origin, isAgentic, isChatOnly, headers, nil)
 	default:
 		// Default to Claude format (also handles "claude", "kiro", etc.)
 		log.Debugf("kiro: using Claude payload builder for source format: %s", sourceFormat.String())
-		return kiroclaude.BuildKiroPayload(body, modelID, profileArn, origin, isAgentic, isChatOnly)
+		return kiroclaude.BuildKiroPayload(body, modelID, profileArn, origin, isAgentic, isChatOnly, headers, nil)
 	}
 }
 
@@ -249,7 +250,7 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.
 
 		// Rebuild payload with the correct origin for this endpoint
 		// Each endpoint requires its matching Origin value in the request body
-		kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
+		kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from, opts.Headers)
 
 		log.Debugf("kiro: trying endpoint %d/%d: %s (Name: %s, Origin: %s)",
 			endpointIdx+1, len(endpointConfigs), url, endpointConfig.Name, currentOrigin)
@@ -359,7 +360,7 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.
 						auth = refreshedAuth
 						accessToken, profileArn = kiroCredentials(auth)
 						// Rebuild payload with new profile ARN if changed
-						kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
+						kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from, opts.Headers)
 						log.Infof("kiro: token refreshed successfully, retrying request")
 						continue
 					}
@@ -416,7 +417,7 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.
 					if refreshedAuth != nil {
 						auth = refreshedAuth
 						accessToken, profileArn = kiroCredentials(auth)
-						kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
+						kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from, opts.Headers)
 						log.Infof("kiro: token refreshed for 403, retrying request")
 						continue
 					}
@@ -555,10 +556,7 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox
 
 		// Rebuild payload with the correct origin for this endpoint
 		// Each endpoint requires its matching Origin value in the request body
-		kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
-		// Kiro API always returns <thinking> tags regardless of whether thinking mode was requested
-		// So we always enable thinking parsing for Kiro responses
-		thinkingEnabled := true
+		kiroPayload, thinkingEnabled := buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from, opts.Headers)
 
 		log.Debugf("kiro: stream trying endpoint %d/%d: %s (Name: %s, Origin: %s)",
 			endpointIdx+1, len(endpointConfigs), url, endpointConfig.Name, currentOrigin)
@@ -681,7 +679,7 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox
 						auth = refreshedAuth
 						accessToken, profileArn = kiroCredentials(auth)
 						// Rebuild payload with new profile ARN if changed
-						kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
+						kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from, opts.Headers)
 						log.Infof("kiro: token refreshed successfully, retrying stream request")
 						continue
 					}
@@ -738,7 +736,7 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox
 					if refreshedAuth != nil {
 						auth = refreshedAuth
 						accessToken, profileArn = kiroCredentials(auth)
-						kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
+						kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from, opts.Headers)
 						log.Infof("kiro: token refreshed for 403, retrying stream request")
 						continue
 					}
@@ -1702,6 +1700,7 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 	pendingEndTagChars := 0      // Number of chars that might be start of </thinking>
 	isThinkingBlockOpen := false // Track if thinking content block is open
 	thinkingBlockIndex := -1     // Index of the thinking content block
+	var accumulatedThinkingContent strings.Builder // Accumulate thinking content for signature generation
 
 	// Code block state tracking for heuristic thinking tag parsing
 	// When inside a markdown code block, <thinking> tags should NOT be parsed
@@ -1847,6 +1846,8 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 							out <- cliproxyexecutor.StreamChunk{Payload: []byte(chunk + "\n\n")}
 						}
 					}
+					// Accumulate thinking content for signature generation
+					accumulatedThinkingContent.WriteString(pendingText)
 				} else {
 					// Output as regular text
 					if !isTextBlockOpen {
@@ -2390,21 +2391,24 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 										out <- cliproxyexecutor.StreamChunk{Payload: []byte(chunk + "\n\n")}
 									}
 								}
+								// Accumulate thinking content for signature generation
+								accumulatedThinkingContent.WriteString(thinkContent)
 							}
 
 							// Note: Partial tag handling is done via pendingEndTagChars
 							// When the next chunk arrives, the partial tag will be reconstructed
 
 							// Close thinking block
 							if isThinkingBlockOpen {
-								blockStop := kiroclaude.BuildClaudeContentBlockStopEvent(thinkingBlockIndex)
+								blockStop := kiroclaude.BuildClaudeThinkingBlockStopEvent(thinkingBlockIndex)
 								sseData := sdktranslator.TranslateStream(ctx, sdktranslator.FromString("kiro"), targetFormat, model, originalReq, claudeBody, blockStop, &translatorParam)
 								for _, chunk := range sseData {
 									if chunk != "" {
 										out <- cliproxyexecutor.StreamChunk{Payload: []byte(chunk + "\n\n")}
 									}
 								}
 								isThinkingBlockOpen = false
+								accumulatedThinkingContent.Reset() // Reset for potential next thinking block
 							}
 
 							inThinkBlock = false
@@ -2450,6 +2454,8 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 										out <- cliproxyexecutor.StreamChunk{Payload: []byte(chunk + "\n\n")}
 									}
 								}
+								// Accumulate thinking content for signature generation
+								accumulatedThinkingContent.WriteString(contentToEmit)
 							}
 
 							remaining = ""
@@ -2592,6 +2598,7 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 			// Handle tool uses in response (with deduplication)
 			for _, tu := range toolUses {
 				toolUseID := kirocommon.GetString(tu, "toolUseId")
+				toolName := kirocommon.GetString(tu, "name")
 
 				// Check for duplicate
 				if processedIDs[toolUseID] {
@@ -2615,7 +2622,6 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 
 				// Emit tool_use content block
 				contentBlockIndex++
-				toolName := kirocommon.GetString(tu, "name")
 
 				blockStart := kiroclaude.BuildClaudeContentBlockStartEvent(contentBlockIndex, "tool_use", toolUseID, toolName)
 				sseData := sdktranslator.TranslateStream(ctx, sdktranslator.FromString("kiro"), targetFormat, model, originalReq, claudeBody, blockStart, &translatorParam)
@@ -2888,7 +2894,7 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 		if calculatedInputTokens > 0 {
 			localEstimate := totalUsage.InputTokens
 			totalUsage.InputTokens = calculatedInputTokens
-			log.Infof("kiro: using contextUsagePercentage (%.2f%%) to calculate input tokens: %d (local estimate was: %d)",
+			log.Debugf("kiro: using contextUsagePercentage (%.2f%%) to calculate input tokens: %d (local estimate was: %d)",
 				upstreamContextPercentage, calculatedInputTokens, localEstimate)
 		}
 	}
@@ -2897,7 +2903,7 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 
 	// Log upstream usage information if received
 	if hasUpstreamUsage {
-		log.Infof("kiro: upstream usage - credits: %.4f, context: %.2f%%, final tokens - input: %d, output: %d, total: %d",
+		log.Debugf("kiro: upstream usage - credits: %.4f, context: %.2f%%, final tokens - input: %d, output: %d, total: %d",
 			upstreamCreditUsage, upstreamContextPercentage,
 			totalUsage.InputTokens, totalUsage.OutputTokens, totalUsage.TotalTokens)
 	}

diff --git a/internal/translator/kiro/claude/kiro_claude_request.go b/internal/translator/kiro/claude/kiro_claude_request.go
@@ -6,6 +6,7 @@ package claude
 import (
 	"encoding/json"
 	"fmt"
+	"net/http"
 	"strings"
 	"time"
 	"unicode/utf8"
@@ -33,6 +34,7 @@ type KiroInferenceConfig struct {
 	TopP        float64 `json:"topP,omitempty"`
 }
 
+
 // KiroConversationState holds the conversation context
 type KiroConversationState struct {
 	ChatTriggerType string               `json:"chatTriggerType"` // Required: "MANUAL" - must be first field
@@ -134,9 +136,11 @@ func ConvertClaudeRequestToKiro(modelName string, inputRawJSON []byte, stream bo
 // origin parameter determines which quota to use: "CLI" for Amazon Q, "AI_EDITOR" for Kiro IDE.
 // isAgentic parameter enables chunked write optimization prompt for -agentic model variants.
 // isChatOnly parameter disables tool calling for -chat model variants (pure conversation mode).
-// Supports thinking mode - when Claude API thinking parameter is present, injects thinkingHint.
+// headers parameter allows checking Anthropic-Beta header for thinking mode detection.
+// metadata parameter is kept for API compatibility but no longer used for thinking configuration.
+// Supports thinking mode - when enabled, injects thinking tags into system prompt.
 // Returns the payload and a boolean indicating whether thinking mode was injected.
-func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool) ([]byte, bool) {
+func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool, headers http.Header, metadata map[string]any) ([]byte, bool) {
 	// Extract max_tokens for potential use in inferenceConfig
 	// Handle -1 as "use maximum" (Kiro max output is ~32000 tokens)
 	const kiroMaxOutputTokens = 32000
@@ -181,26 +185,9 @@ func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isA
 	// Extract system prompt
 	systemPrompt := extractSystemPrompt(claudeBody)
 
-	// Check for thinking mode using the comprehensive IsThinkingEnabled function
-	// This supports Claude API format, OpenAI reasoning_effort, and AMP/Cursor format
-	thinkingEnabled := IsThinkingEnabled(claudeBody)
-	_, budgetTokens := checkThinkingMode(claudeBody) // Get budget tokens from Claude format if available
-	if budgetTokens <= 0 {
-		// Calculate budgetTokens based on max_tokens if available
-		// Use 50% of max_tokens for thinking, with min 8000 and max 24000
-		if maxTokens > 0 {
-			budgetTokens = maxTokens / 2
-			if budgetTokens < 8000 {
-				budgetTokens = 8000
-			}
-			if budgetTokens > 24000 {
-				budgetTokens = 24000
-			}
-			log.Debugf("kiro: budgetTokens calculated from max_tokens: %d (max_tokens=%d)", budgetTokens, maxTokens)
-		} else {
-			budgetTokens = 16000 // Default budget tokens
-		}
-	}
+	// Check for thinking mode using the comprehensive IsThinkingEnabledWithHeaders function
+	// This supports Claude API format, OpenAI reasoning_effort, AMP/Cursor format, and Anthropic-Beta header
+	thinkingEnabled := IsThinkingEnabledWithHeaders(claudeBody, headers)
 
 	// Inject timestamp context
 	timestamp := time.Now().Format("2006-01-02 15:04:05 MST")
@@ -231,19 +218,26 @@ func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isA
 		log.Debugf("kiro: injected tool_choice hint into system prompt")
 	}
 
-	// Inject thinking hint when thinking mode is enabled
+	// Convert Claude tools to Kiro format
+	kiroTools := convertClaudeToolsToKiro(tools)
+
+	// Thinking mode implementation:
+	// Kiro API doesn't accept max_tokens for thinking. Instead, thinking mode is enabled
+	// by injecting <thinking_mode> and <max_thinking_length> tags into the system prompt.
+	// We use a fixed max_thinking_length value since Kiro handles the actual budget internally.
 	if thinkingEnabled {
-	if thinkingEnabled {
+if thinkingEnabled && !hasThinkingTagInBody(claudeBody) {
-	if thinkingEnabled {
+if thinkingEnabled && !hasThinkingTagInBody(claudeBody) {
+		thinkingHint := `<thinking_mode>interleaved</thinking_mode>
+<max_thinking_length>200000</max_thinking_length>
+
+IMPORTANT: You MUST use <thinking>...</thinking> tags to show your reasoning process before providing your final response. Think step by step inside the thinking tags.`
 		if systemPrompt != "" {
-			systemPrompt += "\n"
+			systemPrompt = thinkingHint + "\n\n" + systemPrompt
+		} else {
+			systemPrompt = thinkingHint
 		}
-		dynamicThinkingHint := fmt.Sprintf("<thinking_mode>interleaved</thinking_mode><max_thinking_length>%d</max_thinking_length>", budgetTokens)
-		systemPrompt += dynamicThinkingHint
-		log.Debugf("kiro: injected dynamic thinking hint into system prompt, max_thinking_length: %d", budgetTokens)
+		log.Infof("kiro: injected thinking prompt, has_tools: %v", len(kiroTools) > 0)
 	}
 
-	// Convert Claude tools to Kiro format
-	kiroTools := convertClaudeToolsToKiro(tools)
-
 	// Process messages and build history
 	history, currentUserMsg, currentToolResults := processMessages(messages, modelID, origin)
 
@@ -280,6 +274,7 @@ func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isA
 	}
 
 	// Build inferenceConfig if we have any inference parameters
+	// Note: Kiro API doesn't actually use max_tokens for thinking budget
 	var inferenceConfig *KiroInferenceConfig
 	if maxTokens > 0 || hasTemperature || hasTopP {
 		inferenceConfig = &KiroInferenceConfig{}
@@ -350,7 +345,7 @@ func extractSystemPrompt(claudeBody []byte) string {
 // checkThinkingMode checks if thinking mode is enabled in the Claude request
 func checkThinkingMode(claudeBody []byte) (bool, int64) {
 	thinkingEnabled := false
-	var budgetTokens int64 = 16000
+	var budgetTokens int64 = 24000
 
 	thinkingField := gjson.GetBytes(claudeBody, "thinking")
 	if thinkingField.Exists() {
@@ -373,6 +368,32 @@ func checkThinkingMode(claudeBody []byte) (bool, int64) {
 	return thinkingEnabled, budgetTokens
 }
 
+// hasThinkingTagInBody checks if the request body already contains thinking configuration tags.
+// This is used to prevent duplicate injection when client (e.g., AMP/Cursor) already includes thinking config.
+func hasThinkingTagInBody(body []byte) bool {
+	bodyStr := string(body)
+	return strings.Contains(bodyStr, "<thinking_mode>") || strings.Contains(bodyStr, "<max_thinking_length>")
+}
+
+
+// IsThinkingEnabledFromHeader checks if thinking mode is enabled via Anthropic-Beta header.
+// Claude CLI uses "Anthropic-Beta: interleaved-thinking-2025-05-14" to enable thinking.
+func IsThinkingEnabledFromHeader(headers http.Header) bool {
+	if headers == nil {
+		return false
+	}
+	betaHeader := headers.Get("Anthropic-Beta")
+	if betaHeader == "" {
+		return false
+	}
+	// Check for interleaved-thinking beta feature
+	if strings.Contains(betaHeader, "interleaved-thinking") {
+		log.Debugf("kiro: thinking mode enabled via Anthropic-Beta header: %s", betaHeader)
+		return true
+	}
+	return false
+}
+
 // IsThinkingEnabled is a public wrapper to check if thinking mode is enabled.
 // This is used by the executor to determine whether to parse <thinking> tags in responses.
 // When thinking is NOT enabled in the request, <thinking> tags in responses should be
@@ -383,6 +404,21 @@ func checkThinkingMode(claudeBody []byte) (bool, int64) {
 // - OpenAI format: reasoning_effort parameter
 // - AMP/Cursor format: <thinking_mode>interleaved</thinking_mode> in system prompt
 func IsThinkingEnabled(body []byte) bool {
+	return IsThinkingEnabledWithHeaders(body, nil)
+}
+
+// IsThinkingEnabledWithHeaders checks if thinking mode is enabled from body or headers.
+// This is the comprehensive check that supports all thinking detection methods:
+// - Claude API format: thinking.type = "enabled"
+// - OpenAI format: reasoning_effort parameter
+// - AMP/Cursor format: <thinking_mode>interleaved</thinking_mode> in system prompt
+// - Anthropic-Beta header: interleaved-thinking-2025-05-14
+func IsThinkingEnabledWithHeaders(body []byte, headers http.Header) bool {
+	// Check Anthropic-Beta header first (Claude Code uses this)
+	if IsThinkingEnabledFromHeader(headers) {
+		return true
+	}
+
 	// Check Claude API format first (thinking.type = "enabled")
 	enabled, _ := checkThinkingMode(body)
 	if enabled {
@@ -771,4 +807,4 @@ func BuildAssistantMessageStruct(msg gjson.Result) KiroAssistantResponseMessage
 		Content:  contentBuilder.String(),
 		ToolUses: toolUses,
 	}
-}
+}
diff --git a/internal/translator/kiro/claude/kiro_claude_response.go b/internal/translator/kiro/claude/kiro_claude_response.go
@@ -4,6 +4,8 @@
 package claude
 
 import (
+	"crypto/sha256"
+	"encoding/base64"
 	"encoding/json"
 	"strings"
 
@@ -14,6 +16,18 @@ import (
 	kirocommon "github.com/router-for-me/CLIProxyAPI/v6/internal/translator/kiro/common"
 )
 
+// generateThinkingSignature generates a signature for thinking content.
+// This is required by Claude API for thinking blocks in non-streaming responses.
+// The signature is a base64-encoded hash of the thinking content.
+func generateThinkingSignature(thinkingContent string) string {
+	if thinkingContent == "" {
+		return ""
+	}
+	// Generate a deterministic signature based on content hash
+	hash := sha256.Sum256([]byte(thinkingContent))
+	return base64.StdEncoding.EncodeToString(hash[:])
+}
+
 // Local references to kirocommon constants for thinking block parsing
 var (
 	thinkingStartTag = kirocommon.ThinkingStartTag
@@ -149,9 +163,12 @@ func ExtractThinkingFromContent(content string) []map[string]interface{} {
 		if endIdx == -1 {
 			// No closing tag found, treat rest as thinking content (incomplete response)
 			if strings.TrimSpace(remaining) != "" {
+				// Generate signature for thinking content (required by Claude API)
+				signature := generateThinkingSignature(remaining)
 				blocks = append(blocks, map[string]interface{}{
-					"type":     "thinking",
-					"thinking": remaining,
+					"type":      "thinking",
+					"thinking":  remaining,
+					"signature": signature,
 				})
 				log.Warnf("kiro: extractThinkingFromContent - missing closing </thinking> tag")
 			}
@@ -161,9 +178,12 @@ func ExtractThinkingFromContent(content string) []map[string]interface{} {
 		// Extract thinking content between tags
 		thinkContent := remaining[:endIdx]
 		if strings.TrimSpace(thinkContent) != "" {
+			// Generate signature for thinking content (required by Claude API)
+			signature := generateThinkingSignature(thinkContent)
 			blocks = append(blocks, map[string]interface{}{
-				"type":     "thinking",
-				"thinking": thinkContent,
+				"type":      "thinking",
+				"thinking":  thinkContent,
+				"signature": signature,
 			})
 			log.Debugf("kiro: extractThinkingFromContent - extracted thinking block (len: %d)", len(thinkContent))
 		}