Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 92 additions & 22 deletions engine/ggml-engine-vlm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,15 @@ bool ggml_engine_vlm_is_loaded(const ggml_engine_vlm_t * vlm) {
return vlm && vlm->mtmd_ctx;
}

// Wrap an encoded-audio byte buffer in the shared media struct.
// width/height of 0 select "file mode", i.e. auto-detected file bytes.
static ggml_engine_image ggml_engine_image_from_audio(const ggml_engine_audio & audio) {
    ggml_engine_image media{};
    media.data   = audio.data;
    media.size   = audio.size;
    media.width  = 0; // 0 = file mode
    media.height = 0; // 0 = file mode
    return media;
}

ggml_engine_status ggml_engine_vlm_generate(
ggml_engine_t * engine, ggml_engine_vlm_t * vlm,
const char * prompt,
Expand Down Expand Up @@ -172,21 +181,23 @@ ggml_engine_status ggml_engine_vlm_generate(
return ggml_engine_generate_loop(engine, sampling, callback, user_data);
}

int32_t ggml_engine_vlm_encode_image(
ggml_engine_vlm_t * vlm, const ggml_engine_image * image)
{
if (!vlm || !vlm->mtmd_ctx || !image) return -1;

// create bitmap
mtmd_bitmap * bmp = nullptr;
if (image->width == 0 || image->height == 0) {
bmp = mtmd_helper_bitmap_init_from_buf(vlm->mtmd_ctx, image->data, image->size);
} else {
bmp = mtmd_bitmap_init(image->width, image->height, image->data);
// Sum the token counts of all chunks in `chunks` whose type matches
// `chunk_type` (e.g. image or audio chunks produced by mtmd tokenization).
// Fix: removed a stray `if (!bmp) return -1;` line left over from the old
// ggml_engine_vlm_encode_image implementation — `bmp` is not in scope here.
static int32_t count_media_tokens(mtmd_input_chunks * chunks, mtmd_input_chunk_type chunk_type) {
    int32_t n_tokens = 0;
    const size_t n_chunks = mtmd_input_chunks_size(chunks);
    for (size_t i = 0; i < n_chunks; i++) {
        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
        if (mtmd_input_chunk_get_type(chunk) == chunk_type) {
            n_tokens += (int32_t) mtmd_input_chunk_get_n_tokens(chunk);
        }
    }
    return n_tokens;
}

static int32_t tokenize_media_tokens(
ggml_engine_vlm_t * vlm, mtmd_bitmap * bmp, mtmd_input_chunk_type chunk_type)
{
if (!vlm || !vlm->mtmd_ctx || !bmp) return -1;

// tokenize with a simple marker prompt to get token count
mtmd_input_chunks * chunks = mtmd_input_chunks_init();
mtmd_input_text input_text;
const char * marker = mtmd_default_marker();
Expand All @@ -205,18 +216,72 @@ int32_t ggml_engine_vlm_encode_image(
return -1;
}

// count image tokens from chunks
int32_t n_image_tokens = 0;
size_t n_chunks = mtmd_input_chunks_size(chunks);
for (size_t i = 0; i < n_chunks; i++) {
const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
if (mtmd_input_chunk_get_type(chunk) == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
n_image_tokens += (int32_t)mtmd_input_chunk_get_n_tokens(chunk);
const int32_t n_tokens = count_media_tokens(chunks, chunk_type);
mtmd_input_chunks_free(chunks);
return n_tokens;
}

// True when `audio` points at a non-empty encoded byte buffer.
static bool is_valid_audio_buffer(const ggml_engine_audio * audio) {
    if (audio == nullptr) {
        return false;
    }
    return audio->data != nullptr && audio->size > 0;
}

// Encode a single image and return the number of tokens it occupies after
// mtmd tokenization, or -1 on any failure.
// Fix: validate image->data (and, in file mode, a non-zero size) up front,
// mirroring the audio path's is_valid_audio_buffer check, instead of passing
// a null/empty buffer straight to the mtmd helpers.
int32_t ggml_engine_vlm_encode_image(
    ggml_engine_vlm_t * vlm, const ggml_engine_image * image)
{
    if (!vlm || !vlm->mtmd_ctx || !image || !image->data) return -1;

    // width==0 or height==0 means `data` holds encoded file bytes; an empty
    // buffer can never decode, so reject it before creating a bitmap.
    const bool file_mode = image->width == 0 || image->height == 0;
    if (file_mode && image->size == 0) return -1;

    mtmd_bitmap * bmp = file_mode
        ? mtmd_helper_bitmap_init_from_buf(vlm->mtmd_ctx, image->data, image->size)
        : mtmd_bitmap_init(image->width, image->height, image->data);
    if (!bmp) return -1;

    return tokenize_media_tokens(vlm, bmp, MTMD_INPUT_CHUNK_TYPE_IMAGE);
}

// Generate text from a prompt plus `n_audio` encoded audio clips by
// repackaging each clip as a "file mode" media struct and delegating to the
// existing image/media generation path.
// Fix: removed two stale lines (`mtmd_input_chunks_free(chunks);` /
// `return n_image_tokens;`) interleaved from the replaced encode_image
// implementation — neither identifier is in scope here.
ggml_engine_status ggml_engine_vlm_generate_audio(
    ggml_engine_t * engine, ggml_engine_vlm_t * vlm,
    const char * prompt,
    const ggml_engine_audio * audio, int32_t n_audio,
    ggml_engine_sampling sampling,
    ggml_engine_token_callback callback, void * user_data)
{
    // Reject a negative count or a null array with a positive count.
    if (n_audio < 0 || (!audio && n_audio > 0)) {
        return GGML_ENGINE_ERROR_VLM_ENCODE;
    }

    std::vector<ggml_engine_image> media;
    media.reserve(static_cast<size_t>(n_audio)); // n_audio >= 0 here
    for (int32_t i = 0; i < n_audio; ++i) {
        // Each clip must carry a non-empty byte buffer.
        if (!is_valid_audio_buffer(&audio[i])) {
            return GGML_ENGINE_ERROR_VLM_ENCODE;
        }
        media.push_back(ggml_engine_image_from_audio(audio[i]));
    }

    return ggml_engine_vlm_generate(
        engine,
        vlm,
        prompt,
        media.empty() ? nullptr : media.data(),
        n_audio,
        sampling,
        callback,
        user_data);
}

// Count how many tokens an encoded audio clip occupies after mtmd
// tokenization. Returns the token count, or -1 on any failure.
int32_t ggml_engine_vlm_encode_audio(
    ggml_engine_vlm_t * vlm, const ggml_engine_audio * audio)
{
    const bool ready = vlm && vlm->mtmd_ctx && is_valid_audio_buffer(audio);
    if (!ready) {
        return -1;
    }

    mtmd_bitmap * bmp =
        mtmd_helper_bitmap_init_from_buf(vlm->mtmd_ctx, audio->data, audio->size);
    if (bmp == nullptr) {
        return -1;
    }

    return tokenize_media_tokens(vlm, bmp, MTMD_INPUT_CHUNK_TYPE_AUDIO);
}

char * ggml_engine_vlm_info_json(const ggml_engine_vlm_t * vlm) {
Expand Down Expand Up @@ -248,3 +313,8 @@ bool ggml_engine_vlm_supports_audio(const ggml_engine_vlm_t * vlm) {
if (!vlm || !vlm->mtmd_ctx) return false;
return mtmd_support_audio(vlm->mtmd_ctx);
}

// Query the audio bitrate reported by the loaded multimodal projector.
// Returns -1 when no multimodal context is loaded.
int32_t ggml_engine_vlm_audio_bitrate(const ggml_engine_vlm_t * vlm) {
    if (vlm == nullptr || vlm->mtmd_ctx == nullptr) {
        return -1;
    }
    return mtmd_get_audio_bitrate(vlm->mtmd_ctx);
}
21 changes: 19 additions & 2 deletions engine/ggml-engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,14 +135,20 @@ typedef struct {
int32_t image_max_tokens; // -1 = model default
} ggml_engine_vlm_params;

// Media data: file bytes (image/audio, width=0 or height=0) or raw RGB pixels.
// Fix: removed a duplicated comment line and a duplicated `width` member that
// were interleaved from the pre-change version of this struct — a struct
// cannot declare the same member twice.
typedef struct {
    const unsigned char * data; // file bytes or RGB pixels
    size_t size;                // byte count
    uint32_t width;             // 0 = file mode (auto-detect media bytes)
    uint32_t height;            // 0 = file mode
} ggml_engine_image;

// Audio data: encoded file bytes (WAV/MP3/FLAC/etc. supported by miniaudio)
typedef struct {
const unsigned char * data; // encoded audio file bytes (WAV/MP3/FLAC per miniaudio)
size_t size; // size of `data` in bytes
} ggml_engine_audio;

// Get default VLM parameters
ggml_engine_vlm_params ggml_engine_vlm_default_params(void);

Expand All @@ -168,6 +174,16 @@ ggml_engine_status ggml_engine_vlm_generate(
int32_t ggml_engine_vlm_encode_image(
ggml_engine_vlm_t * vlm, const ggml_engine_image * image);

// Convenience audio helpers. Audio inputs are passed as encoded file bytes.
ggml_engine_status ggml_engine_vlm_generate_audio(
ggml_engine_t * engine, ggml_engine_vlm_t * vlm,
const char * prompt,
const ggml_engine_audio * audio, int32_t n_audio,
ggml_engine_sampling sampling,
ggml_engine_token_callback callback, void * user_data);
int32_t ggml_engine_vlm_encode_audio(
ggml_engine_vlm_t * vlm, const ggml_engine_audio * audio);

// VLM info - returns JSON string (caller must free with ggml_engine_free_string)
char * ggml_engine_vlm_info_json(const ggml_engine_vlm_t * vlm);

Expand All @@ -177,6 +193,7 @@ const char * ggml_engine_vlm_default_marker(void);
// Capability queries
bool ggml_engine_vlm_supports_vision(const ggml_engine_vlm_t * vlm);
bool ggml_engine_vlm_supports_audio(const ggml_engine_vlm_t * vlm);
int32_t ggml_engine_vlm_audio_bitrate(const ggml_engine_vlm_t * vlm);

#ifdef __cplusplus
}
Expand Down
90 changes: 90 additions & 0 deletions engine/llama-test-cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -412,13 +412,15 @@ static void test_vlm_info(const char * model_path, const char * mmproj_path) {
char * info = ggml_engine_vlm_info_json(vlm);
TEST_ASSERT(info != nullptr, "vlm info not null", "returned null");
TEST_ASSERT(strstr(info, "supports_vision") != nullptr, "vlm info has supports_vision", "missing field");
TEST_ASSERT(strstr(info, "supports_audio") != nullptr, "vlm info has supports_audio", "missing field");
print_info("VLM info: %s", info);
ggml_engine_free_string(info);

// default marker
const char * marker = ggml_engine_vlm_default_marker();
TEST_ASSERT(marker != nullptr && strlen(marker) > 0, "vlm default marker", "empty marker");
print_info("Default marker: %s", marker);
print_info("Audio bitrate: %d", ggml_engine_vlm_audio_bitrate(vlm));

ggml_engine_vlm_free(vlm);
ggml_engine_free(engine);
Expand Down Expand Up @@ -505,6 +507,81 @@ static void test_vlm_generation(const char * model_path, const char * mmproj_pat
ggml_engine_free(engine);
}

// ---- Test: VLM Audio Encode ----
// Loads a model + mmproj, reads an audio file from disk, and checks that
// ggml_engine_vlm_encode_audio reports a positive token count for it.
static void test_vlm_audio_encode(const char * model_path, const char * mmproj_path, const char * audio_path) {
    print_header("VLM Audio Encode");

    auto engine_params = ggml_engine_default_params();
    engine_params.n_ctx = 4096;
    auto * engine = ggml_engine_create(engine_params);
    ggml_engine_load_model(engine, model_path);

    auto * vlm = ggml_engine_vlm_load(engine, mmproj_path, ggml_engine_vlm_default_params());

    const auto bytes = load_file_bytes(audio_path);
    TEST_ASSERT(!bytes.empty(), "vlm: audio file loaded", "failed to read audio file");

    ggml_engine_audio audio{};
    audio.data = bytes.data();
    audio.size = bytes.size();

    const int32_t n_tokens = ggml_engine_vlm_encode_audio(vlm, &audio);
    TEST_ASSERT(n_tokens > 0, "vlm: audio encode returns positive tokens", "expected > 0");
    print_info("Audio encoded to %d tokens", n_tokens);

    ggml_engine_vlm_free(vlm);
    ggml_engine_free(engine);
}

// ---- Test: VLM Audio Generation ----
// End-to-end check: feed one encoded audio clip plus a transcription prompt
// through ggml_engine_vlm_generate_audio, and verify that output tokens are
// produced and engine perf counters are populated.
static void test_vlm_audio_generation(const char * model_path, const char * mmproj_path, const char * audio_path) {
print_header("VLM Audio Generation");

auto params = ggml_engine_default_params();
params.n_ctx = 4096;
params.n_threads = 4;
auto * engine = ggml_engine_create(params);
ggml_engine_load_model(engine, model_path);

auto vlm_params = ggml_engine_vlm_default_params();
auto * vlm = ggml_engine_vlm_load(engine, mmproj_path, vlm_params);

// Audio is passed as encoded file bytes; the vector must stay alive for the
// duration of the generate call since ggml_engine_audio does not copy.
auto audio_bytes = load_file_bytes(audio_path);
TEST_ASSERT(!audio_bytes.empty(), "vlm: audio file loaded", "failed to read audio file");

ggml_engine_audio audio;
audio.data = audio_bytes.data();
audio.size = audio_bytes.size();

// The default marker in the prompt tells the engine where to splice in the
// media tokens.
const char * marker = ggml_engine_vlm_default_marker();
std::string prompt = std::string(marker) + "\nTranscribe this audio.";

auto sampling = ggml_engine_default_sampling();
sampling.n_predict = 64;
sampling.temperature = 0.1f; // near-greedy for more reproducible output

std::string output;
printf(" VLM audio output: ");
auto status = ggml_engine_vlm_generate_audio(engine, vlm, prompt.c_str(),
&audio, 1, sampling, token_callback, &output);
printf("\n");

TEST_ASSERT(status == GGML_ENGINE_OK, "vlm audio generation status OK", "generation failed");
TEST_ASSERT(!output.empty(), "vlm audio output not empty", "no output generated");
print_info("Generated %zu chars", output.length());

// Perf counters should reflect both the audio prompt and the generation.
auto perf = ggml_engine_get_perf(engine);
TEST_ASSERT(perf.prompt_tokens > 0, "vlm audio perf prompt tokens > 0", "expected > 0");
print_info("Prompt: %d tokens, %.1f ms (%.1f t/s)",
perf.prompt_tokens, perf.prompt_eval_ms, perf.prompt_tokens_per_sec);
print_info("Generation: %d tokens, %.1f ms (%.1f t/s)",
perf.generated_tokens, perf.generation_ms, perf.generation_tokens_per_sec);

ggml_engine_vlm_free(vlm);
ggml_engine_free(engine);
}

// ---- Test: VLM Error Cases ----
static void test_vlm_errors(const char * model_path) {
print_header("VLM Error Cases");
Expand All @@ -528,6 +605,8 @@ static void test_vlm_errors(const char * model_path) {

// is_loaded on null
TEST_ASSERT(!ggml_engine_vlm_is_loaded(nullptr), "vlm: null is not loaded", "should be false");
TEST_ASSERT(ggml_engine_vlm_encode_audio(nullptr, nullptr) == -1, "vlm: null audio encode returns -1", "wrong error code");
TEST_ASSERT(ggml_engine_vlm_audio_bitrate(nullptr) == -1, "vlm: null audio bitrate == -1", "wrong bitrate");

ggml_engine_free(engine);
}
Expand Down Expand Up @@ -791,6 +870,7 @@ static void print_usage(const char * prog) {
printf(" -m <path> Path to GGUF model file (required for model tests)\n");
printf(" --mmproj <path> Path to mmproj GGUF file (required for VLM tests)\n");
printf(" --image <path> Path to test image file (required for VLM encode/gen)\n");
printf(" --audio <path> Path to test audio file (optional for VLM audio encode/gen)\n");
printf(" --embed-model <path> Path to embedding model GGUF (for RAG tests)\n");
printf(" --rag-text <path> Path to text file for RAG large file test\n");
printf(" --all Run all tests (default)\n");
Expand All @@ -803,6 +883,7 @@ int main(int argc, char ** argv) {
const char * model_path = nullptr;
const char * mmproj_path = nullptr;
const char * image_path = nullptr;
const char * audio_path = nullptr;
const char * embed_model_path = nullptr;
const char * rag_text_path = nullptr;
bool run_model_tests = true;
Expand All @@ -815,6 +896,8 @@ int main(int argc, char ** argv) {
mmproj_path = argv[++i];
} else if (strcmp(argv[i], "--image") == 0 && i + 1 < argc) {
image_path = argv[++i];
} else if (strcmp(argv[i], "--audio") == 0 && i + 1 < argc) {
audio_path = argv[++i];
} else if (strcmp(argv[i], "--embed-model") == 0 && i + 1 < argc) {
embed_model_path = argv[++i];
} else if (strcmp(argv[i], "--rag-text") == 0 && i + 1 < argc) {
Expand Down Expand Up @@ -865,6 +948,13 @@ int main(int argc, char ** argv) {
} else {
print_info("Skipping VLM encode/generation tests (no --image provided)");
}

if (audio_path) {
test_vlm_audio_encode(model_path, mmproj_path, audio_path);
test_vlm_audio_generation(model_path, mmproj_path, audio_path);
} else {
print_info("Skipping VLM audio tests (no --audio provided)");
}
} else {
// still run error case tests (only needs text model)
if (!quick_mode) {
Expand Down