-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathggml-engine.h
More file actions
200 lines (167 loc) · 7.82 KB
/
ggml-engine.h
File metadata and controls
200 lines (167 loc) · 7.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#pragma once
/**
* GGMLEngine - Simplified LLM inference API for Android
*
* Wraps llama.cpp internals into a clean C API suitable for JNI binding.
* Supports model loading via file path or Android SAF file descriptor.
*/
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
// Opaque engine handle. The struct is only forward-declared in this header,
// so callers can hold and pass pointers to it but cannot access its fields.
typedef struct ggml_engine ggml_engine_t;
// Status codes used as the return type of the load and generate calls.
// GGML_ENGINE_OK is 0; all remaining enumerators are named as errors and have
// distinct non-zero values, so `status != GGML_ENGINE_OK` detects any failure.
// NOTE(review): the exact condition behind each error code is not stated in
// this header - confirm against the implementation before relying on it.
typedef enum {
GGML_ENGINE_OK = 0,
GGML_ENGINE_ERROR_LOAD_FAILED = 1,
GGML_ENGINE_ERROR_CONTEXT_FAIL = 2,
GGML_ENGINE_ERROR_NO_MODEL = 3,
GGML_ENGINE_ERROR_TOKENIZE = 4,
GGML_ENGINE_ERROR_DECODE = 5,
GGML_ENGINE_ERROR_CANCELLED = 6,
GGML_ENGINE_ERROR_OUT_OF_MEM = 7,
GGML_ENGINE_ERROR_VLM_ENCODE = 8,
GGML_ENGINE_ERROR_VLM_NO_PROJ = 9,
} ggml_engine_status;
// Engine configuration, passed by value to ggml_engine_create().
// ggml_engine_default_params() returns a struct of this type to start from.
// Several fields use 0 as a sentinel meaning "use the default" (see per-field
// comments).
typedef struct {
int32_t n_ctx; // context size (0 = model default)
int32_t n_batch; // batch size for prompt processing
int32_t n_threads; // number of threads (0 = auto-detect)
int32_t n_threads_batch; // threads for batch processing (0 = same as n_threads)
bool use_mmap; // memory-map model file
bool use_mlock; // lock model in memory
int32_t n_gpu_layers; // always 0 for CPU-only
float rope_freq_base; // RoPE base frequency (0 = model default)
float rope_freq_scale; // RoPE frequency scale (0 = model default)
bool flash_attn; // use flash attention if available
} ggml_engine_params;
// Capacity of ggml_engine_sampling.stop_sequences, exposed as a named constant
// so callers can bound their loops instead of hard-coding the array size.
enum { GGML_ENGINE_MAX_STOP_SEQUENCES = 8 };

// Sampling parameters, passed by value to the generate calls.
// ggml_engine_default_sampling() returns a struct of this type to start from.
typedef struct {
    float temperature;        // 0.0 = greedy
    int32_t top_k;            // top-k sampling (0 = disabled)
    float top_p;              // top-p (nucleus) sampling (1.0 = disabled)
    float min_p;              // min-p sampling (0.0 = disabled)
    float repeat_penalty;     // repetition penalty (1.0 = disabled)
    int32_t repeat_last_n;    // last n tokens to penalize
    float frequency_penalty;  // frequency penalty
    float presence_penalty;   // presence penalty
    uint32_t seed;            // random seed (0xFFFFFFFF = random)
    int32_t n_predict;        // max tokens to generate (-1 = unlimited)
    // Up to GGML_ENGINE_MAX_STOP_SEQUENCES stop sequences.
    // NOTE(review): the previous comment called this a "NULL terminated array",
    // but a full array leaves no room for a terminator and a separate count is
    // provided below. Presumably stop_sequence_count is authoritative and unused
    // slots should be NULL - confirm against the implementation.
    const char * stop_sequences[GGML_ENGINE_MAX_STOP_SEQUENCES];
    // Presumably the number of valid entries in stop_sequences
    // (0..GGML_ENGINE_MAX_STOP_SEQUENCES) - TODO confirm.
    int32_t stop_sequence_count;
} ggml_engine_sampling;
// Token callback supplied to the generate calls: return false to stop
// generation, true to continue.
//   token_text - text associated with the reported token.
//                NOTE(review): lifetime is not stated here; presumably only
//                valid for the duration of the call - copy it if it must be
//                kept, and confirm against the implementation.
//   user_data  - presumably the user_data pointer given to the generate call,
//                forwarded unchanged - TODO confirm.
typedef bool (*ggml_engine_token_callback)(const char * token_text, void * user_data);
// Get default parameters. Both functions take no arguments and return the
// struct by value, so the caller owns the copy and may modify it before
// passing it on to ggml_engine_create() / the generate calls.
ggml_engine_params ggml_engine_default_params(void);
ggml_engine_sampling ggml_engine_default_sampling(void);
// Lifecycle
// ggml_engine_create: build an engine from `params` (taken by value) and
//   return its opaque handle.
//   NOTE(review): failure reporting is not documented here; presumably NULL
//   is returned on failure - confirm.
// ggml_engine_free: release a handle obtained from ggml_engine_create.
//   NOTE(review): whether NULL is accepted is not stated - confirm.
ggml_engine_t * ggml_engine_create(ggml_engine_params params);
void ggml_engine_free(ggml_engine_t * engine);
// Model loading
// A model can be loaded either from a file path or from an already-open file
// descriptor (the file header mentions Android SAF descriptors as the use case).
// Both loaders report their outcome as a ggml_engine_status.
// NOTE(review): ownership of `fd` (whether the engine closes or duplicates it)
// is not stated in this header - confirm before closing it in the caller.
ggml_engine_status ggml_engine_load_model(ggml_engine_t * engine, const char * path);
ggml_engine_status ggml_engine_load_model_from_fd(ggml_engine_t * engine, int fd);
// Unload the model held by `engine`.
void ggml_engine_unload_model(ggml_engine_t * engine);
// Query whether `engine` has a model loaded (does not modify the engine: const).
bool ggml_engine_is_loaded(const ggml_engine_t * engine);
// Model info - returns a JSON string describing the model. The returned
// buffer is owned by the caller and must be released with
// ggml_engine_free_string.
// NOTE(review): the JSON schema and the return value when no model is loaded
// are not documented here - confirm.
char * ggml_engine_model_info_json(const ggml_engine_t * engine);
// Release a string returned by this API (see ggml_engine_model_info_json).
void ggml_engine_free_string(char * str);
// Text generation
//   engine    - engine handle
//   prompt    - input prompt text
//   sampling  - sampling parameters, taken by value
//   callback  - token callback; returning false from it stops generation
//   user_data - opaque pointer accompanying the callback (presumably handed
//               back to it on each invocation - TODO confirm)
// Returns a ggml_engine_status.
// NOTE(review): whether `callback` may be NULL, and which status is returned
// when the callback stops generation, is not stated here - confirm.
ggml_engine_status ggml_engine_generate(
ggml_engine_t * engine,
const char * prompt,
ggml_engine_sampling sampling,
ggml_engine_token_callback callback,
void * user_data
);
// Cancel ongoing generation (documented as thread-safe, i.e. intended to be
// callable from a thread other than the one running the generate call).
// NOTE(review): presumably the interrupted generate call then returns
// GGML_ENGINE_ERROR_CANCELLED - confirm against the implementation.
void ggml_engine_cancel(ggml_engine_t * engine);
// Get full generated text from last generation. The returned buffer is owned
// by the caller and must be freed.
// NOTE(review): the deallocator is not named here; presumably
// ggml_engine_free_string, as for ggml_engine_model_info_json - confirm.
char * ggml_engine_get_response(const ggml_engine_t * engine);
// Context management
// ggml_engine_clear_context: clear the engine's context.
// ggml_engine_context_used / ggml_engine_context_size: read-only queries
//   (const engine) for how much of the context is in use and its capacity.
// NOTE(review): units are presumably tokens, matching n_ctx in
// ggml_engine_params - confirm.
void ggml_engine_clear_context(ggml_engine_t * engine);
int32_t ggml_engine_context_used(const ggml_engine_t * engine);
int32_t ggml_engine_context_size(const ggml_engine_t * engine);
// Tokenization utilities
// ggml_engine_tokenize: tokenize `text` into the caller-provided `tokens`
//   buffer, whose capacity in elements is `max_tokens`.
//   NOTE(review): the return value is presumably the number of tokens written,
//   with some negative value on error or overflow - not stated here, confirm.
// ggml_engine_detokenize: convert `n_tokens` token ids back into text.
//   NOTE(review): the returned char* is presumably heap-allocated and to be
//   released with ggml_engine_free_string - confirm.
int32_t ggml_engine_tokenize(const ggml_engine_t * engine,
const char * text, int32_t * tokens, int32_t max_tokens);
char * ggml_engine_detokenize(const ggml_engine_t * engine,
const int32_t * tokens, int32_t n_tokens);
// Performance info: timings (milliseconds) and token counts for the prompt
// and generation phases, plus the corresponding tokens-per-second rates.
// NOTE(review): presumably describes the most recent generation - confirm.
typedef struct {
double prompt_eval_ms; // time to process prompt
double generation_ms; // time to generate tokens
int32_t prompt_tokens; // number of prompt tokens
int32_t generated_tokens; // number of generated tokens
double prompt_tokens_per_sec;
double generation_tokens_per_sec;
} ggml_engine_perf;
// Return the engine's performance counters by value (engine is not modified).
ggml_engine_perf ggml_engine_get_perf(const ggml_engine_t * engine);
// =====================================================
// VLM (Vision Language Model) API
// =====================================================
// Opaque VLM handle. Only forward-declared here; obtained from the
// ggml_engine_vlm_load* functions and released with ggml_engine_vlm_free.
typedef struct ggml_engine_vlm ggml_engine_vlm_t;
// VLM configuration, passed by value to the ggml_engine_vlm_load* functions.
// ggml_engine_vlm_default_params() returns a struct of this type to start from.
typedef struct {
int32_t n_threads; // threads for vision encode (0 = same as engine)
int32_t image_min_tokens; // -1 = model default
int32_t image_max_tokens; // -1 = model default
} ggml_engine_vlm_params;
// Image input, in one of two modes selected by width/height:
//   - file mode (width == 0 and height == 0): `data` holds the bytes of an
//     encoded file whose format is auto-detected (JPEG/PNG/etc). The original
//     comment notes this mode is also used for audio file bytes.
//   - pixel mode (width and height non-zero): `data` holds raw RGB pixels.
//     NOTE(review): presumably tightly packed 8-bit RGB, so
//     size == width * height * 3 - confirm against the implementation.
// `data` is const and not owned by this struct as far as this header shows.
typedef struct {
const unsigned char * data; // file bytes or RGB pixels
size_t size; // byte count
uint32_t width; // 0 = file mode (auto-detect JPEG/PNG/etc)
uint32_t height; // 0 = file mode
} ggml_engine_image;
// Audio data: encoded file bytes (WAV/MP3/FLAC/etc. supported by miniaudio).
//   data - pointer to the encoded file contents (const; not modified)
//   size - length of `data` in bytes
typedef struct {
const unsigned char * data;
size_t size;
} ggml_engine_audio;
// Get default VLM parameters. Returned by value; the caller may adjust the
// copy before passing it to ggml_engine_vlm_load / ggml_engine_vlm_load_from_fd.
ggml_engine_vlm_params ggml_engine_vlm_default_params(void);
// Load vision projector (mmproj GGUF). Call after loading text model.
// The projector can be given as a file path or as an already-open file
// descriptor; both variants return an opaque VLM handle.
// NOTE(review): failure reporting is not documented; presumably NULL is
// returned on failure. Ownership of `fd` is likewise not stated - confirm.
ggml_engine_vlm_t * ggml_engine_vlm_load(
ggml_engine_t * engine, const char * mmproj_path,
ggml_engine_vlm_params params);
ggml_engine_vlm_t * ggml_engine_vlm_load_from_fd(
ggml_engine_t * engine, int fd,
ggml_engine_vlm_params params);
// Release a VLM handle returned by one of the load functions above.
void ggml_engine_vlm_free(ggml_engine_vlm_t * vlm);
// Query whether `vlm` has a projector loaded (const: does not modify it).
bool ggml_engine_vlm_is_loaded(const ggml_engine_vlm_t * vlm);
// Generate from text + images. Use "<__media__>" markers in prompt for image positions.
//   engine, vlm - text engine handle and VLM handle
//   prompt      - prompt text containing the image markers
//   images      - array of `n_images` image descriptors (const; not modified)
//   sampling    - sampling parameters, taken by value
//   callback    - token callback; returning false from it stops generation
//   user_data   - opaque pointer accompanying the callback
// Returns a ggml_engine_status.
// NOTE(review): presumably the number of markers in `prompt` must equal
// n_images, and the marker text matches ggml_engine_vlm_default_marker() -
// confirm against the implementation.
ggml_engine_status ggml_engine_vlm_generate(
ggml_engine_t * engine, ggml_engine_vlm_t * vlm,
const char * prompt,
const ggml_engine_image * images, int32_t n_images,
ggml_engine_sampling sampling,
ggml_engine_token_callback callback, void * user_data);
// Encode image only, without running text generation.
// Returns the token count for the image, or -1 on error.
int32_t ggml_engine_vlm_encode_image(
ggml_engine_vlm_t * vlm, const ggml_engine_image * image);
// Convenience audio helpers. Audio inputs are passed as encoded file bytes.
// ggml_engine_vlm_generate_audio mirrors ggml_engine_vlm_generate but takes
// an array of `n_audio` ggml_engine_audio descriptors instead of images.
// NOTE(review): presumably `prompt` uses the same media markers for audio
// positions as for images - confirm.
ggml_engine_status ggml_engine_vlm_generate_audio(
ggml_engine_t * engine, ggml_engine_vlm_t * vlm,
const char * prompt,
const ggml_engine_audio * audio, int32_t n_audio,
ggml_engine_sampling sampling,
ggml_engine_token_callback callback, void * user_data);
// Encode a single audio input only.
// NOTE(review): return value presumably follows ggml_engine_vlm_encode_image
// (token count, -1 on error) - not stated here, confirm.
int32_t ggml_engine_vlm_encode_audio(
ggml_engine_vlm_t * vlm, const ggml_engine_audio * audio);
// VLM info - returns a JSON string describing the VLM. The returned buffer is
// owned by the caller and must be released with ggml_engine_free_string.
// NOTE(review): the JSON schema is not documented in this header.
char * ggml_engine_vlm_info_json(const ggml_engine_vlm_t * vlm);
// Default image marker used in prompts.
// The result is a const char * and takes no handle.
// NOTE(review): presumably a static string ("<__media__>") that must not be
// freed or modified by the caller - confirm against the implementation.
const char * ggml_engine_vlm_default_marker(void);
// Capability queries (all take a const handle and do not modify it).
// supports_vision / supports_audio report whether the loaded projector can
// handle image / audio inputs respectively.
bool ggml_engine_vlm_supports_vision(const ggml_engine_vlm_t * vlm);
bool ggml_engine_vlm_supports_audio(const ggml_engine_vlm_t * vlm);
// NOTE(review): units and the value returned when audio is unsupported are not
// stated here; despite the name this is presumably the expected audio sample
// rate in Hz - confirm against the implementation.
int32_t ggml_engine_vlm_audio_bitrate(const ggml_engine_vlm_t * vlm);
#ifdef __cplusplus
}
#endif