
Commit e592a57
Merge branch 'router-for-me:main' into main
2 parents: a862984 + 66769ec

17 files changed: +884 / -245 lines

config.example.yaml

Lines changed: 5 additions & 0 deletions
@@ -78,6 +78,11 @@ routing:
 # When true, enable authentication for the WebSocket API (/v1/ws).
 ws-auth: false
 
+# Streaming behavior (SSE keep-alives + safe bootstrap retries).
+# streaming:
+#   keepalive-seconds: 15 # Default: 0 (disabled). <= 0 disables keep-alives.
+#   bootstrap-retries: 1 # Default: 0 (disabled). Retries before first byte is sent.
+
 # Gemini API keys
 # gemini-api-key:
 #   - api-key: "AIzaSy...01"

internal/config/sdk_config.go

Lines changed: 15 additions & 0 deletions
@@ -22,6 +22,21 @@ type SDKConfig struct {
 
 	// Access holds request authentication provider configuration.
 	Access AccessConfig `yaml:"auth,omitempty" json:"auth,omitempty"`
+
+	// Streaming configures server-side streaming behavior (keep-alives and safe bootstrap retries).
+	Streaming StreamingConfig `yaml:"streaming" json:"streaming"`
+}
+
+// StreamingConfig holds server streaming behavior configuration.
+type StreamingConfig struct {
+	// KeepAliveSeconds controls how often the server emits SSE heartbeats (": keep-alive\n\n").
+	// nil means default (15 seconds). <= 0 disables keep-alives.
+	KeepAliveSeconds *int `yaml:"keepalive-seconds,omitempty" json:"keepalive-seconds,omitempty"`
+
+	// BootstrapRetries controls how many times the server may retry a streaming request before any bytes are sent,
+	// to allow auth rotation / transient recovery.
+	// nil means default (2). 0 disables bootstrap retries.
+	BootstrapRetries *int `yaml:"bootstrap-retries,omitempty" json:"bootstrap-retries,omitempty"`
 }
 
 // AccessConfig groups request authentication providers.
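As a rough illustration of the nil-versus-zero semantics documented above (and mirrored by the commented-out streaming: block in config.example.yaml), a caller might resolve the effective values as follows. These helper names are hypothetical, not part of this commit; the defaults come from the struct's doc comments:

// Hypothetical resolution helpers; defaults follow the doc comments above.
func (s StreamingConfig) EffectiveKeepAliveSeconds() int {
	if s.KeepAliveSeconds == nil {
		return 15 // unset in YAML: fall back to the documented default
	}
	if *s.KeepAliveSeconds <= 0 {
		return 0 // explicitly disabled
	}
	return *s.KeepAliveSeconds
}

func (s StreamingConfig) EffectiveBootstrapRetries() int {
	if s.BootstrapRetries == nil {
		return 2 // unset in YAML: fall back to the documented default
	}
	if *s.BootstrapRetries <= 0 {
		return 0 // explicitly disabled
	}
	return *s.BootstrapRetries
}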

internal/runtime/executor/antigravity_executor.go

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ const (
 	antigravityModelsPath   = "/v1internal:fetchAvailableModels"
 	antigravityClientID     = "1071006060591-tmhssin2h21lcre235vtolojh4g403ep.apps.googleusercontent.com"
 	antigravityClientSecret = "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"
-	defaultAntigravityAgent = "antigravity/1.11.5 windows/amd64"
+	defaultAntigravityAgent = "antigravity/1.104.0 darwin/arm64"
 	antigravityAuthType     = "antigravity"
 	refreshSkew             = 3000 * time.Second
 )

internal/translator/antigravity/claude/antigravity_claude_response.go

Lines changed: 19 additions & 0 deletions
@@ -35,6 +35,7 @@ type Params struct {
 	CandidatesTokenCount int64 // Cached candidate token count from usage metadata
 	ThoughtsTokenCount   int64 // Cached thinking token count from usage metadata
 	TotalTokenCount      int64 // Cached total token count from usage metadata
+	CachedTokenCount     int64 // Cached content token count (indicates prompt caching)
 	HasSentFinalEvents   bool  // Indicates if final content/message events have been sent
 	HasToolUse           bool  // Indicates if tool use was observed in the stream
 	HasContent           bool  // Tracks whether any content (text, thinking, or tool use) has been output

@@ -274,6 +275,7 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq
 	params.CandidatesTokenCount = usageResult.Get("candidatesTokenCount").Int()
 	params.ThoughtsTokenCount = usageResult.Get("thoughtsTokenCount").Int()
 	params.TotalTokenCount = usageResult.Get("totalTokenCount").Int()
+	params.CachedTokenCount = usageResult.Get("cachedContentTokenCount").Int()
 	if params.CandidatesTokenCount == 0 && params.TotalTokenCount > 0 {
 		params.CandidatesTokenCount = params.TotalTokenCount - params.PromptTokenCount - params.ThoughtsTokenCount
 		if params.CandidatesTokenCount < 0 {

@@ -322,6 +324,14 @@ func appendFinalEvents(params *Params, output *string, force bool) {
 	*output = *output + "event: message_delta\n"
 	*output = *output + "data: "
 	delta := fmt.Sprintf(`{"type":"message_delta","delta":{"stop_reason":"%s","stop_sequence":null},"usage":{"input_tokens":%d,"output_tokens":%d}}`, stopReason, params.PromptTokenCount, usageOutputTokens)
+	// Add cache_read_input_tokens if cached tokens are present (indicates prompt caching is working)
+	if params.CachedTokenCount > 0 {
+		var err error
+		delta, err = sjson.Set(delta, "usage.cache_read_input_tokens", params.CachedTokenCount)
+		if err != nil {
+			log.Warnf("antigravity claude response: failed to set cache_read_input_tokens: %v", err)
+		}
+	}
 	*output = *output + delta + "\n\n\n"
 
 	params.HasSentFinalEvents = true

@@ -361,6 +371,7 @@ func ConvertAntigravityResponseToClaudeNonStream(_ context.Context, _ string, or
 	candidateTokens := root.Get("response.usageMetadata.candidatesTokenCount").Int()
 	thoughtTokens := root.Get("response.usageMetadata.thoughtsTokenCount").Int()
 	totalTokens := root.Get("response.usageMetadata.totalTokenCount").Int()
+	cachedTokens := root.Get("response.usageMetadata.cachedContentTokenCount").Int()
 	outputTokens := candidateTokens + thoughtTokens
 	if outputTokens == 0 && totalTokens > 0 {
 		outputTokens = totalTokens - promptTokens

@@ -374,6 +385,14 @@
 	responseJSON, _ = sjson.Set(responseJSON, "model", root.Get("response.modelVersion").String())
 	responseJSON, _ = sjson.Set(responseJSON, "usage.input_tokens", promptTokens)
 	responseJSON, _ = sjson.Set(responseJSON, "usage.output_tokens", outputTokens)
+	// Add cache_read_input_tokens if cached tokens are present (indicates prompt caching is working)
+	if cachedTokens > 0 {
+		var err error
+		responseJSON, err = sjson.Set(responseJSON, "usage.cache_read_input_tokens", cachedTokens)
+		if err != nil {
+			log.Warnf("antigravity claude response: failed to set cache_read_input_tokens: %v", err)
+		}
+	}
 
 	contentArrayInitialized := false
 	ensureContentArray := func() {
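The sjson pattern above is easy to exercise in isolation. In this sketch the token counts are invented, and the printed delta shows cache_read_input_tokens appearing inside usage the same way the streaming path emits it:

package main

import (
	"fmt"

	"github.com/tidwall/sjson"
)

func main() {
	// A message_delta payload shaped like the one appendFinalEvents builds (example counts).
	delta := `{"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":1200,"output_tokens":85}}`
	cached := int64(1024) // stands in for usageMetadata.cachedContentTokenCount
	if cached > 0 {
		if out, err := sjson.Set(delta, "usage.cache_read_input_tokens", cached); err == nil {
			delta = out
		}
	}
	fmt.Println(delta)
	// ...,"usage":{"input_tokens":1200,"output_tokens":85,"cache_read_input_tokens":1024}}
}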

internal/translator/antigravity/openai/chat-completions/antigravity_openai_response.go

Lines changed: 11 additions & 0 deletions
@@ -13,6 +13,8 @@ import (
 	"sync/atomic"
 	"time"
 
+	log "github.com/sirupsen/logrus"
+
 	. "github.com/router-for-me/CLIProxyAPI/v6/internal/translator/gemini/openai/chat-completions"
 	"github.com/tidwall/gjson"
 	"github.com/tidwall/sjson"

@@ -93,10 +95,19 @@ func ConvertAntigravityResponseToOpenAI(_ context.Context, _ string, originalReq
 	}
 	promptTokenCount := usageResult.Get("promptTokenCount").Int()
 	thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int()
+	cachedTokenCount := usageResult.Get("cachedContentTokenCount").Int()
 	template, _ = sjson.Set(template, "usage.prompt_tokens", promptTokenCount+thoughtsTokenCount)
 	if thoughtsTokenCount > 0 {
 		template, _ = sjson.Set(template, "usage.completion_tokens_details.reasoning_tokens", thoughtsTokenCount)
 	}
+	// Include cached token count if present (indicates prompt caching is working)
+	if cachedTokenCount > 0 {
+		var err error
+		template, err = sjson.Set(template, "usage.prompt_tokens_details.cached_tokens", cachedTokenCount)
+		if err != nil {
+			log.Warnf("antigravity openai response: failed to set cached_tokens: %v", err)
+		}
+	}
 }
 
 // Process the main content part of the response.
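Taken together, the translator now maps a Gemini-style usageMetadata block onto the OpenAI usage object roughly as follows. The counts are illustrative; prompt_tokens is promptTokenCount + thoughtsTokenCount per the code above, and cached_tokens / reasoning_tokens appear only when their source counts are positive:

usageMetadata in: {"promptTokenCount": 900, "thoughtsTokenCount": 40, "cachedContentTokenCount": 512}
usage out: {"prompt_tokens": 940, "completion_tokens_details": {"reasoning_tokens": 40}, "prompt_tokens_details": {"cached_tokens": 512}}

The same mapping is mirrored in the Gemini OpenAI translator further down.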

internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go

Lines changed: 1 addition & 1 deletion
@@ -244,7 +244,7 @@ func ConvertOpenAIRequestToGeminiCLI(modelName string, inputRawJSON []byte, _ bo
 	out, _ = sjson.SetRawBytes(out, "request.contents.-1", node)
 
 	// Append a single tool content combining name + response per function
-	toolNode := []byte(`{"role":"tool","parts":[]}`)
+	toolNode := []byte(`{"role":"user","parts":[]}`)
 	pp := 0
 	for _, fid := range fIDs {
 		if name, ok := tcID2Name[fid]; ok {

internal/translator/gemini/openai/chat-completions/gemini_openai_request.go

Lines changed: 1 addition & 1 deletion
@@ -286,7 +286,7 @@ func ConvertOpenAIRequestToGemini(modelName string, inputRawJSON []byte, _ bool)
 	out, _ = sjson.SetRawBytes(out, "contents.-1", node)
 
 	// Append a single tool content combining name + response per function
-	toolNode := []byte(`{"role":"tool","parts":[]}`)
+	toolNode := []byte(`{"role":"user","parts":[]}`)
 	pp := 0
 	for _, fid := range fIDs {
 		if name, ok := tcID2Name[fid]; ok {
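The same one-line fix lands in both translators because Gemini's generateContent API documents only "user" and "model" as valid content roles, so a "role":"tool" turn risks being rejected upstream; after this change the combined function results travel in a user turn whose parts carry functionResponse objects. Illustratively, with a hypothetical function name and payload, the appended node ends up shaped like:

{"role":"user","parts":[{"functionResponse":{"name":"get_weather","response":{"result":"sunny"}}}]}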

internal/translator/gemini/openai/chat-completions/gemini_openai_response.go

Lines changed: 19 additions & 0 deletions
@@ -13,6 +13,7 @@ import (
 	"sync/atomic"
 	"time"
 
+	log "github.com/sirupsen/logrus"
 	"github.com/tidwall/gjson"
 	"github.com/tidwall/sjson"
 )

@@ -96,10 +97,19 @@ func ConvertGeminiResponseToOpenAI(_ context.Context, _ string, originalRequestR
 	}
 	promptTokenCount := usageResult.Get("promptTokenCount").Int()
 	thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int()
+	cachedTokenCount := usageResult.Get("cachedContentTokenCount").Int()
 	template, _ = sjson.Set(template, "usage.prompt_tokens", promptTokenCount+thoughtsTokenCount)
 	if thoughtsTokenCount > 0 {
 		template, _ = sjson.Set(template, "usage.completion_tokens_details.reasoning_tokens", thoughtsTokenCount)
 	}
+	// Include cached token count if present (indicates prompt caching is working)
+	if cachedTokenCount > 0 {
+		var err error
+		template, err = sjson.Set(template, "usage.prompt_tokens_details.cached_tokens", cachedTokenCount)
+		if err != nil {
+			log.Warnf("gemini openai response: failed to set cached_tokens in streaming: %v", err)
+		}
+	}
 }
 
 // Process the main content part of the response.

@@ -240,10 +250,19 @@ func ConvertGeminiResponseToOpenAINonStream(_ context.Context, _ string, origina
 	}
 	promptTokenCount := usageResult.Get("promptTokenCount").Int()
 	thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int()
+	cachedTokenCount := usageResult.Get("cachedContentTokenCount").Int()
 	template, _ = sjson.Set(template, "usage.prompt_tokens", promptTokenCount+thoughtsTokenCount)
 	if thoughtsTokenCount > 0 {
 		template, _ = sjson.Set(template, "usage.completion_tokens_details.reasoning_tokens", thoughtsTokenCount)
 	}
+	// Include cached token count if present (indicates prompt caching is working)
+	if cachedTokenCount > 0 {
+		var err error
+		template, err = sjson.Set(template, "usage.prompt_tokens_details.cached_tokens", cachedTokenCount)
+		if err != nil {
+			log.Warnf("gemini openai response: failed to set cached_tokens in non-streaming: %v", err)
+		}
+	}
 }
 
 // Process the main content part of the response.

sdk/api/handlers/claude/code_handlers.go

Lines changed: 57 additions & 42 deletions
@@ -14,7 +14,6 @@ import (
 	"fmt"
 	"io"
 	"net/http"
-	"time"
 
 	"github.com/gin-gonic/gin"
 	. "github.com/router-for-me/CLIProxyAPI/v6/internal/constant"

@@ -185,14 +184,6 @@ func (h *ClaudeCodeAPIHandler) handleNonStreamingResponse(c *gin.Context, rawJSO
 // - c: The Gin context for the request.
 // - rawJSON: The raw JSON request body.
 func (h *ClaudeCodeAPIHandler) handleStreamingResponse(c *gin.Context, rawJSON []byte) {
-	// Set up Server-Sent Events (SSE) headers for streaming response
-	// These headers are essential for maintaining a persistent connection
-	// and enabling real-time streaming of chat completions
-	c.Header("Content-Type", "text/event-stream")
-	c.Header("Cache-Control", "no-cache")
-	c.Header("Connection", "keep-alive")
-	c.Header("Access-Control-Allow-Origin", "*")
-
 	// Get the http.Flusher interface to manually flush the response.
 	// This is crucial for streaming as it allows immediate sending of data chunks
 	flusher, ok := c.Writer.(http.Flusher)

@@ -213,58 +204,82 @@ func (h *ClaudeCodeAPIHandler) handleStreamingResponse(c *gin.Context, rawJSON [
 	cliCtx, cliCancel := h.GetContextWithCancel(h, c, context.Background())
 
 	dataChan, errChan := h.ExecuteStreamWithAuthManager(cliCtx, h.HandlerType(), modelName, rawJSON, "")
-	h.forwardClaudeStream(c, flusher, func(err error) { cliCancel(err) }, dataChan, errChan)
-	return
-}
+	setSSEHeaders := func() {
+		c.Header("Content-Type", "text/event-stream")
+		c.Header("Cache-Control", "no-cache")
+		c.Header("Connection", "keep-alive")
+		c.Header("Access-Control-Allow-Origin", "*")
+	}
 
-func (h *ClaudeCodeAPIHandler) forwardClaudeStream(c *gin.Context, flusher http.Flusher, cancel func(error), data <-chan []byte, errs <-chan *interfaces.ErrorMessage) {
-	// OpenAI-style stream forwarding: write each SSE chunk and flush immediately.
-	// This guarantees clients see incremental output even for small responses.
+	// Peek at the first chunk to determine success or failure before setting headers
 	for {
 		select {
 		case <-c.Request.Context().Done():
-			cancel(c.Request.Context().Err())
+			cliCancel(c.Request.Context().Err())
 			return
-
-		case chunk, ok := <-data:
+		case errMsg, ok := <-errChan:
+			if !ok {
+				// Err channel closed cleanly; wait for data channel.
+				errChan = nil
+				continue
+			}
+			// Upstream failed immediately. Return proper error status and JSON.
+			h.WriteErrorResponse(c, errMsg)
+			if errMsg != nil {
+				cliCancel(errMsg.Error)
+			} else {
+				cliCancel(nil)
+			}
+			return
+		case chunk, ok := <-dataChan:
 			if !ok {
+				// Stream closed without data? Send DONE or just headers.
+				setSSEHeaders()
 				flusher.Flush()
-				cancel(nil)
+				cliCancel(nil)
 				return
 			}
+
+			// Success! Set headers now.
+			setSSEHeaders()
+
+			// Write the first chunk
 			if len(chunk) > 0 {
 				_, _ = c.Writer.Write(chunk)
 				flusher.Flush()
 			}
 
-		case errMsg, ok := <-errs:
-			if !ok {
-				continue
-			}
-			if errMsg != nil {
-				status := http.StatusInternalServerError
-				if errMsg.StatusCode > 0 {
-					status = errMsg.StatusCode
-				}
-				c.Status(status)
-
-				// An error occurred: emit as a proper SSE error event
-				errorBytes, _ := json.Marshal(h.toClaudeError(errMsg))
-				_, _ = fmt.Fprintf(c.Writer, "event: error\ndata: %s\n\n", errorBytes)
-				flusher.Flush()
-			}
-
-			var execErr error
-			if errMsg != nil {
-				execErr = errMsg.Error
-			}
-			cancel(execErr)
+			// Continue streaming the rest
+			h.forwardClaudeStream(c, flusher, func(err error) { cliCancel(err) }, dataChan, errChan)
 			return
-		case <-time.After(500 * time.Millisecond):
 		}
 	}
 }
 
+func (h *ClaudeCodeAPIHandler) forwardClaudeStream(c *gin.Context, flusher http.Flusher, cancel func(error), data <-chan []byte, errs <-chan *interfaces.ErrorMessage) {
+	h.ForwardStream(c, flusher, cancel, data, errs, handlers.StreamForwardOptions{
+		WriteChunk: func(chunk []byte) {
+			if len(chunk) == 0 {
+				return
+			}
+			_, _ = c.Writer.Write(chunk)
+		},
+		WriteTerminalError: func(errMsg *interfaces.ErrorMessage) {
+			if errMsg == nil {
+				return
+			}
+			status := http.StatusInternalServerError
+			if errMsg.StatusCode > 0 {
+				status = errMsg.StatusCode
+			}
+			c.Status(status)
+
+			errorBytes, _ := json.Marshal(h.toClaudeError(errMsg))
+			_, _ = fmt.Fprintf(c.Writer, "event: error\ndata: %s\n\n", errorBytes)
+		},
+	})
+}
+
 type claudeErrorDetail struct {
 	Type    string `json:"type"`
 	Message string `json:"message"`