AGENTS.md (46 additions, 0 deletions)

@@ -77,3 +77,49 @@ When fixing compilation errors after upstream changes:
- HTTP server uses `server_routes` with HTTP handlers
- Both use the same `server_context` and task queue infrastructure
- gRPC methods: `LoadModel`, `Predict`, `PredictStream`, `Embedding`, `Rerank`, `TokenizeString`, `GetMetrics`, `Health`

## Tool Call Parsing Maintenance

When working on JSON/XML tool call parsing functionality, always check llama.cpp for the reference implementation and upstream updates:

### Checking for XML Parsing Changes

1. **Review XML Format Definitions**: Check `llama.cpp/common/chat-parser-xml-toolcall.h` for `xml_tool_call_format` struct changes
2. **Review Parsing Logic**: Check `llama.cpp/common/chat-parser-xml-toolcall.cpp` for parsing algorithm updates
3. **Review Format Presets**: Check `llama.cpp/common/chat-parser.cpp` for new XML format presets (search for `xml_tool_call_format form`)
4. **Review Model Lists**: Check `llama.cpp/common/chat.h` for `COMMON_CHAT_FORMAT_*` enum values that use XML parsing:
- `COMMON_CHAT_FORMAT_GLM_4_5`
- `COMMON_CHAT_FORMAT_MINIMAX_M2`
- `COMMON_CHAT_FORMAT_KIMI_K2`
- `COMMON_CHAT_FORMAT_QWEN3_CODER_XML`
- `COMMON_CHAT_FORMAT_APRIEL_1_5`
- `COMMON_CHAT_FORMAT_XIAOMI_MIMO`
- Any new formats added (see the preset-lookup sketch below)
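
As a rough illustration, a preset lookup on the LocalAI side might look like the sketch below. The `XMLToolCallFormat` and `GetXMLFormatPreset` names come from this PR's `chat.go` changes; the struct fields and preset keys here are assumptions loosely modeled on llama.cpp's `xml_tool_call_format`, not the actual definitions.

```go
// Sketch only: field names and preset keys are assumptions; only the
// XMLToolCallFormat and GetXMLFormatPreset identifiers appear in this PR.
package functions

// XMLToolCallFormat describes how a model wraps tool calls in XML.
type XMLToolCallFormat struct {
	CallOpen  string // e.g. "<tool_call>"
	CallClose string // e.g. "</tool_call>"
}

// GetXMLFormatPreset resolves a preset name (mirroring the
// COMMON_CHAT_FORMAT_* values above) to a format definition.
func GetXMLFormatPreset(name string) *XMLToolCallFormat {
	presets := map[string]XMLToolCallFormat{
		"qwen3_coder_xml": {CallOpen: "<tool_call>", CallClose: "</tool_call>"},
		"glm_4_5":         {CallOpen: "<tool_call>", CallClose: "</tool_call>"},
	}
	if f, ok := presets[name]; ok {
		return &f
	}
	return nil // unknown preset: caller falls back to JSON parsing
}
```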

### Model Configuration Options

Always check `llama.cpp` for new model configuration options that should be supported in LocalAI:

1. **Check Server Context**: Review `llama.cpp/tools/server/server-context.cpp` for new parameters
2. **Check Chat Params**: Review `llama.cpp/common/chat.h` for `common_chat_params` struct changes
3. **Check Server Options**: Review `llama.cpp/tools/server/server.cpp` for command-line argument changes
4. **Examples of options to check**:
- `ctx_shift` - Context shifting support
- `parallel_tool_calls` - Parallel tool calling
- `reasoning_format` - Reasoning format options
- Any new flags or parameters (see the config sketch below)
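
When a new option lands upstream, it typically needs a matching field in LocalAI's model configuration. A minimal sketch, assuming YAML-tagged config structs; the struct, field, and tag names below are illustrative, not LocalAI's actual schema:

```go
// Sketch only: illustrative fields for surfacing new llama.cpp options in
// a YAML model config; names and types are assumptions.
package config

type LlamaCppOptions struct {
	// Pointer types distinguish "unset" from an explicit false value.
	CtxShift          *bool  `yaml:"ctx_shift,omitempty"`           // context shifting
	ParallelToolCalls *bool  `yaml:"parallel_tool_calls,omitempty"` // parallel tool calling
	ReasoningFormat   string `yaml:"reasoning_format,omitempty"`    // reasoning format selection
}
```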

### Implementation Guidelines

1. **Feature Parity**: Always aim for feature parity with llama.cpp's implementation
2. **Test Coverage**: Add tests for new features matching llama.cpp's behavior (see the test sketch after this list)
3. **Documentation**: Update relevant documentation when adding new formats or options
4. **Backward Compatibility**: Ensure changes don't break existing functionality
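
For example, a new format preset could get a test along these lines. It assumes the `GetXMLFormatPreset` and `ParseXMLIterative` helpers used in this PR's `chat.go`; the import path, preset name, and sample payload are hypothetical:

```go
// Sketch only: import path, preset name, and payload are hypothetical;
// GetXMLFormatPreset and ParseXMLIterative are the helpers used in chat.go.
package functions_test

import (
	"testing"

	"github.com/mudler/LocalAI/pkg/functions"
)

func TestQwen3CoderPresetParsesToolCall(t *testing.T) {
	format := functions.GetXMLFormatPreset("qwen3_coder_xml")
	if format == nil {
		t.Fatal("expected preset to be registered")
	}
	// Complete (non-partial) parse of a minimal tool call payload
	payload := `<tool_call><function=get_weather><parameter=city>Rome</parameter></function></tool_call>`
	calls, err := functions.ParseXMLIterative(payload, format, false)
	if err != nil {
		t.Fatalf("parse failed: %v", err)
	}
	if len(calls) != 1 || calls[0].Name != "get_weather" {
		t.Fatalf("unexpected result: %+v", calls)
	}
}
```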

### Files to Monitor

- `llama.cpp/common/chat-parser-xml-toolcall.h` - Format definitions
- `llama.cpp/common/chat-parser-xml-toolcall.cpp` - Parsing logic
- `llama.cpp/common/chat-parser.cpp` - Format presets and model-specific handlers
- `llama.cpp/common/chat.h` - Format enums and parameter structures
- `llama.cpp/tools/server/server-context.cpp` - Server configuration options
core/http/endpoints/openai/chat.go (103 additions, 2 deletions)

@@ -66,10 +66,111 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
	}
	processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.ModelConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool) error {
		result := ""
		lastEmittedCount := 0
		_, tokenUsage, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
			result += s
			// TODO: Change generated BNF grammar to be compliant with the schema so we can
			// stream the result token by token here.
			// Try incremental XML parsing for streaming support using the iterative parser.
			// This allows emitting partial tool calls as they're being generated.
			cleanedResult := functions.CleanupLLMResult(result, config.FunctionsConfig)

			// Determine the XML format from the config
			var xmlFormat *functions.XMLToolCallFormat
			if config.FunctionsConfig.XMLFormat != nil {
				xmlFormat = config.FunctionsConfig.XMLFormat
			} else if config.FunctionsConfig.XMLFormatPreset != "" {
				xmlFormat = functions.GetXMLFormatPreset(config.FunctionsConfig.XMLFormatPreset)
			}

			// Use the iterative parser for streaming (partial parsing enabled).
			// Try XML parsing first.
			partialResults, parseErr := functions.ParseXMLIterative(cleanedResult, xmlFormat, true)
			if parseErr == nil && len(partialResults) > 0 {
				// Emit new XML tool calls that weren't emitted before
				if len(partialResults) > lastEmittedCount {
					for i := lastEmittedCount; i < len(partialResults); i++ {
						toolCall := partialResults[i]
						initialMessage := schema.OpenAIResponse{
							ID:      id,
							Created: created,
							Model:   req.Model,
							Choices: []schema.Choice{{
								Delta: &schema.Message{
									Role: "assistant",
									ToolCalls: []schema.ToolCall{
										{
											Index: i,
											ID:    id,
											Type:  "function",
											FunctionCall: schema.FunctionCall{
												Name: toolCall.Name,
											},
										},
									},
								},
								Index:        0,
								FinishReason: nil,
							}},
							Object: "chat.completion.chunk",
						}
						select {
						case responses <- initialMessage:
						default:
						}
					}
					lastEmittedCount = len(partialResults)
				}
			} else {
				// Try JSON tool call parsing for streaming.
				// Check if the result looks like JSON tool calls.
				jsonResults, jsonErr := functions.ParseJSONIterative(cleanedResult, true)
				if jsonErr == nil && len(jsonResults) > lastEmittedCount {
					// Emit only the JSON tool calls that weren't emitted before;
					// without this guard every callback would re-emit the whole list
					for i := lastEmittedCount; i < len(jsonResults); i++ {
						jsonObj := jsonResults[i]
						// Tool calls have a "name" and optionally "arguments"
						name, ok := jsonObj["name"].(string)
						if !ok || name == "" {
							continue
						}
						args := "{}"
						if argsVal, ok := jsonObj["arguments"]; ok {
							if argsStr, ok := argsVal.(string); ok {
								args = argsStr
							} else {
								argsBytes, _ := json.Marshal(argsVal)
								args = string(argsBytes)
							}
						}
						// Emit the tool call
						initialMessage := schema.OpenAIResponse{
							ID:      id,
							Created: created,
							Model:   req.Model,
							Choices: []schema.Choice{{
								Delta: &schema.Message{
									Role: "assistant",
									ToolCalls: []schema.ToolCall{
										{
											Index: i,
											ID:    id,
											Type:  "function",
											FunctionCall: schema.FunctionCall{
												Name:      name,
												Arguments: args,
											},
										},
									},
								},
								Index:        0,
								FinishReason: nil,
							}},
							Object: "chat.completion.chunk",
						}
						select {
						case responses <- initialMessage:
						default:
						}
					}
					lastEmittedCount = len(jsonResults)
				}
			}
			return true
		})
		if err != nil {