AGENTS.md (46 additions, 0 deletions)

@@ -77,3 +77,49 @@ When fixing compilation errors after upstream changes:
- HTTP server uses `server_routes` with HTTP handlers
- Both use the same `server_context` and task queue infrastructure
- gRPC methods: `LoadModel`, `Predict`, `PredictStream`, `Embedding`, `Rerank`, `TokenizeString`, `GetMetrics`, `Health`

## Tool Call Parsing Maintenance

When working on JSON/XML tool call parsing functionality, always check llama.cpp for the reference implementation and upstream updates:

### Checking for XML Parsing Changes

1. **Review XML Format Definitions**: Check `llama.cpp/common/chat-parser-xml-toolcall.h` for `xml_tool_call_format` struct changes
2. **Review Parsing Logic**: Check `llama.cpp/common/chat-parser-xml-toolcall.cpp` for parsing algorithm updates
3. **Review Format Presets**: Check `llama.cpp/common/chat-parser.cpp` for new XML format presets (search for `xml_tool_call_format form`)
4. **Review Model Lists**: Check `llama.cpp/common/chat.h` for `COMMON_CHAT_FORMAT_*` enum values that use XML parsing:
- `COMMON_CHAT_FORMAT_GLM_4_5`
- `COMMON_CHAT_FORMAT_MINIMAX_M2`
- `COMMON_CHAT_FORMAT_KIMI_K2`
- `COMMON_CHAT_FORMAT_QWEN3_CODER_XML`
- `COMMON_CHAT_FORMAT_APRIEL_1_5`
- `COMMON_CHAT_FORMAT_XIAOMI_MIMO`
- Any new formats added (see the preset-lookup sketch below)
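
As a rough illustration, a preset lookup on the LocalAI side might look like the sketch below. The `XMLToolCallFormat` and `GetXMLFormatPreset` names come from this PR's `chat.go` changes; the struct fields and preset keys here are assumptions loosely modeled on llama.cpp's `xml_tool_call_format`, not the actual definitions.

```go
// Sketch only: field names and preset keys are assumptions; only the
// XMLToolCallFormat and GetXMLFormatPreset identifiers appear in this PR.
package functions

// XMLToolCallFormat describes how a model wraps tool calls in XML.
type XMLToolCallFormat struct {
	CallOpen  string // e.g. "<tool_call>"
	CallClose string // e.g. "</tool_call>"
}

// GetXMLFormatPreset resolves a preset name (mirroring the
// COMMON_CHAT_FORMAT_* values above) to a format definition.
func GetXMLFormatPreset(name string) *XMLToolCallFormat {
	presets := map[string]XMLToolCallFormat{
		"qwen3_coder_xml": {CallOpen: "<tool_call>", CallClose: "</tool_call>"},
		"glm_4_5":         {CallOpen: "<tool_call>", CallClose: "</tool_call>"},
	}
	if f, ok := presets[name]; ok {
		return &f
	}
	return nil // unknown preset: caller falls back to JSON parsing
}
```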

### Model Configuration Options

Always check `llama.cpp` for new model configuration options that should be supported in LocalAI:

1. **Check Server Context**: Review `llama.cpp/tools/server/server-context.cpp` for new parameters
2. **Check Chat Params**: Review `llama.cpp/common/chat.h` for `common_chat_params` struct changes
3. **Check Server Options**: Review `llama.cpp/tools/server/server.cpp` for command-line argument changes
4. **Examples of options to check**:
- `ctx_shift` - Context shifting support
- `parallel_tool_calls` - Parallel tool calling
- `reasoning_format` - Reasoning format options
- Any new flags or parameters (see the config sketch below)
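
When a new option lands upstream, it typically needs a matching field in LocalAI's model configuration. A minimal sketch, assuming YAML-tagged config structs; the struct, field, and tag names below are illustrative, not LocalAI's actual schema:

```go
// Sketch only: illustrative fields for surfacing new llama.cpp options in
// a YAML model config; names and types are assumptions.
package config

type LlamaCppOptions struct {
	// Pointer types distinguish "unset" from an explicit false value.
	CtxShift          *bool  `yaml:"ctx_shift,omitempty"`           // context shifting
	ParallelToolCalls *bool  `yaml:"parallel_tool_calls,omitempty"` // parallel tool calling
	ReasoningFormat   string `yaml:"reasoning_format,omitempty"`    // reasoning format selection
}
```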

### Implementation Guidelines

1. **Feature Parity**: Always aim for feature parity with llama.cpp's implementation
2. **Test Coverage**: Add tests for new features matching llama.cpp's behavior (see the test sketch after this list)
3. **Documentation**: Update relevant documentation when adding new formats or options
4. **Backward Compatibility**: Ensure changes don't break existing functionality
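
For example, a new format preset could get a test along these lines. It assumes the `GetXMLFormatPreset` and `ParseXMLIterative` helpers used in this PR's `chat.go`; the import path, preset name, and sample payload are hypothetical:

```go
// Sketch only: import path, preset name, and payload are hypothetical;
// GetXMLFormatPreset and ParseXMLIterative are the helpers used in chat.go.
package functions_test

import (
	"testing"

	"github.com/mudler/LocalAI/pkg/functions"
)

func TestQwen3CoderPresetParsesToolCall(t *testing.T) {
	format := functions.GetXMLFormatPreset("qwen3_coder_xml")
	if format == nil {
		t.Fatal("expected preset to be registered")
	}
	// Complete (non-partial) parse of a minimal tool call payload
	payload := `<tool_call><function=get_weather><parameter=city>Rome</parameter></function></tool_call>`
	calls, err := functions.ParseXMLIterative(payload, format, false)
	if err != nil {
		t.Fatalf("parse failed: %v", err)
	}
	if len(calls) != 1 || calls[0].Name != "get_weather" {
		t.Fatalf("unexpected result: %+v", calls)
	}
}
```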

### Files to Monitor

- `llama.cpp/common/chat-parser-xml-toolcall.h` - Format definitions
- `llama.cpp/common/chat-parser-xml-toolcall.cpp` - Parsing logic
- `llama.cpp/common/chat-parser.cpp` - Format presets and model-specific handlers
- `llama.cpp/common/chat.h` - Format enums and parameter structures
- `llama.cpp/tools/server/server-context.cpp` - Server configuration options
core/http/endpoints/openai/chat.go (103 additions, 2 deletions)

@@ -66,10 +66,111 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
	}
	processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.ModelConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool) error {
		result := ""
		lastEmittedCount := 0
		_, tokenUsage, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
			result += s
			// TODO: Change generated BNF grammar to be compliant with the schema so we can
			// stream the result token by token here.
			// Try incremental XML parsing for streaming support using the iterative parser.
			// This allows emitting partial tool calls as they're being generated.
			cleanedResult := functions.CleanupLLMResult(result, config.FunctionsConfig)

			// Determine the XML format from the config
			var xmlFormat *functions.XMLToolCallFormat
			if config.FunctionsConfig.XMLFormat != nil {
				xmlFormat = config.FunctionsConfig.XMLFormat
			} else if config.FunctionsConfig.XMLFormatPreset != "" {
				xmlFormat = functions.GetXMLFormatPreset(config.FunctionsConfig.XMLFormatPreset)
			}

			// Use the iterative parser for streaming (partial parsing enabled).
			// Try XML parsing first.
			partialResults, parseErr := functions.ParseXMLIterative(cleanedResult, xmlFormat, true)
			if parseErr == nil && len(partialResults) > 0 {
				// Emit new XML tool calls that weren't emitted before
				if len(partialResults) > lastEmittedCount {
					for i := lastEmittedCount; i < len(partialResults); i++ {
						toolCall := partialResults[i]
						initialMessage := schema.OpenAIResponse{
							ID:      id,
							Created: created,
							Model:   req.Model,
							Choices: []schema.Choice{{
								Delta: &schema.Message{
									Role: "assistant",
									ToolCalls: []schema.ToolCall{
										{
											Index: i,
											ID:    id,
											Type:  "function",
											FunctionCall: schema.FunctionCall{
												Name: toolCall.Name,
											},
										},
									},
								},
								Index:        0,
								FinishReason: nil,
							}},
							Object: "chat.completion.chunk",
						}
						select {
						case responses <- initialMessage:
						default:
						}
					}
					lastEmittedCount = len(partialResults)
				}
			} else {
				// Try JSON tool call parsing for streaming.
				// Check if the result looks like JSON tool calls.
				jsonResults, jsonErr := functions.ParseJSONIterative(cleanedResult, true)
				if jsonErr == nil && len(jsonResults) > lastEmittedCount {
					// Emit only the JSON tool calls that weren't emitted before;
					// without this guard every callback would re-emit the whole list
					for i := lastEmittedCount; i < len(jsonResults); i++ {
						jsonObj := jsonResults[i]
						// Tool calls have a "name" and optionally "arguments"
						name, ok := jsonObj["name"].(string)
						if !ok || name == "" {
							continue
						}
						args := "{}"
						if argsVal, ok := jsonObj["arguments"]; ok {
							if argsStr, ok := argsVal.(string); ok {
								args = argsStr
							} else {
								argsBytes, _ := json.Marshal(argsVal)
								args = string(argsBytes)
							}
						}
						// Emit the tool call
						initialMessage := schema.OpenAIResponse{
							ID:      id,
							Created: created,
							Model:   req.Model,
							Choices: []schema.Choice{{
								Delta: &schema.Message{
									Role: "assistant",
									ToolCalls: []schema.ToolCall{
										{
											Index: i,
											ID:    id,
											Type:  "function",
											FunctionCall: schema.FunctionCall{
												Name:      name,
												Arguments: args,
											},
										},
									},
								},
								Index:        0,
								FinishReason: nil,
							}},
							Object: "chat.completion.chunk",
						}
						select {
						case responses <- initialMessage:
						default:
						}
					}
					lastEmittedCount = len(jsonResults)
				}
			}
			return true
		})
		if err != nil {