Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 92 additions & 22 deletions engine/ggml-engine-vlm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,15 @@ bool ggml_engine_vlm_is_loaded(const ggml_engine_vlm_t * vlm) {
return vlm && vlm->mtmd_ctx;
}

// Wrap an encoded-audio byte buffer in the shared media struct.
// width/height of 0 select "file mode", i.e. auto-detected file bytes.
static ggml_engine_image ggml_engine_image_from_audio(const ggml_engine_audio & audio) {
    ggml_engine_image media{};
    media.data   = audio.data;
    media.size   = audio.size;
    media.width  = 0; // 0 = file mode
    media.height = 0; // 0 = file mode
    return media;
}

ggml_engine_status ggml_engine_vlm_generate(
ggml_engine_t * engine, ggml_engine_vlm_t * vlm,
const char * prompt,
Expand Down Expand Up @@ -172,21 +181,23 @@ ggml_engine_status ggml_engine_vlm_generate(
return ggml_engine_generate_loop(engine, sampling, callback, user_data);
}

int32_t ggml_engine_vlm_encode_image(
ggml_engine_vlm_t * vlm, const ggml_engine_image * image)
{
if (!vlm || !vlm->mtmd_ctx || !image) return -1;

// create bitmap
mtmd_bitmap * bmp = nullptr;
if (image->width == 0 || image->height == 0) {
bmp = mtmd_helper_bitmap_init_from_buf(vlm->mtmd_ctx, image->data, image->size);
} else {
bmp = mtmd_bitmap_init(image->width, image->height, image->data);
// Sum the token counts of all chunks in `chunks` whose type matches
// `chunk_type` (e.g. image or audio chunks produced by mtmd tokenization).
// Fix: removed a stray `if (!bmp) return -1;` line left over from the old
// ggml_engine_vlm_encode_image implementation — `bmp` is not in scope here.
static int32_t count_media_tokens(mtmd_input_chunks * chunks, mtmd_input_chunk_type chunk_type) {
    int32_t n_tokens = 0;
    const size_t n_chunks = mtmd_input_chunks_size(chunks);
    for (size_t i = 0; i < n_chunks; i++) {
        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
        if (mtmd_input_chunk_get_type(chunk) == chunk_type) {
            n_tokens += (int32_t) mtmd_input_chunk_get_n_tokens(chunk);
        }
    }
    return n_tokens;
}

static int32_t tokenize_media_tokens(
ggml_engine_vlm_t * vlm, mtmd_bitmap * bmp, mtmd_input_chunk_type chunk_type)
{
if (!vlm || !vlm->mtmd_ctx || !bmp) return -1;

// tokenize with a simple marker prompt to get token count
mtmd_input_chunks * chunks = mtmd_input_chunks_init();
mtmd_input_text input_text;
const char * marker = mtmd_default_marker();
Expand All @@ -205,18 +216,72 @@ int32_t ggml_engine_vlm_encode_image(
return -1;
}

// count image tokens from chunks
int32_t n_image_tokens = 0;
size_t n_chunks = mtmd_input_chunks_size(chunks);
for (size_t i = 0; i < n_chunks; i++) {
const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
if (mtmd_input_chunk_get_type(chunk) == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
n_image_tokens += (int32_t)mtmd_input_chunk_get_n_tokens(chunk);
const int32_t n_tokens = count_media_tokens(chunks, chunk_type);
mtmd_input_chunks_free(chunks);
return n_tokens;
}

// True when `audio` points at a non-empty encoded byte buffer.
static bool is_valid_audio_buffer(const ggml_engine_audio * audio) {
    if (audio == nullptr) {
        return false;
    }
    return audio->data != nullptr && audio->size > 0;
}

// Encode a single image and return the number of tokens it occupies after
// mtmd tokenization, or -1 on any failure.
// Fix: validate image->data (and, in file mode, a non-zero size) up front,
// mirroring the audio path's is_valid_audio_buffer check, instead of passing
// a null/empty buffer straight to the mtmd helpers.
int32_t ggml_engine_vlm_encode_image(
    ggml_engine_vlm_t * vlm, const ggml_engine_image * image)
{
    if (!vlm || !vlm->mtmd_ctx || !image || !image->data) return -1;

    // width==0 or height==0 means `data` holds encoded file bytes; an empty
    // buffer can never decode, so reject it before creating a bitmap.
    const bool file_mode = image->width == 0 || image->height == 0;
    if (file_mode && image->size == 0) return -1;

    mtmd_bitmap * bmp = file_mode
        ? mtmd_helper_bitmap_init_from_buf(vlm->mtmd_ctx, image->data, image->size)
        : mtmd_bitmap_init(image->width, image->height, image->data);
    if (!bmp) return -1;

    return tokenize_media_tokens(vlm, bmp, MTMD_INPUT_CHUNK_TYPE_IMAGE);
}

// Generate text from a prompt plus `n_audio` encoded audio clips by
// repackaging each clip as a "file mode" media struct and delegating to the
// existing image/media generation path.
// Fix: removed two stale lines (`mtmd_input_chunks_free(chunks);` /
// `return n_image_tokens;`) interleaved from the replaced encode_image
// implementation — neither identifier is in scope here.
ggml_engine_status ggml_engine_vlm_generate_audio(
    ggml_engine_t * engine, ggml_engine_vlm_t * vlm,
    const char * prompt,
    const ggml_engine_audio * audio, int32_t n_audio,
    ggml_engine_sampling sampling,
    ggml_engine_token_callback callback, void * user_data)
{
    // Reject a negative count or a null array with a positive count.
    if (n_audio < 0 || (!audio && n_audio > 0)) {
        return GGML_ENGINE_ERROR_VLM_ENCODE;
    }

    std::vector<ggml_engine_image> media;
    media.reserve(static_cast<size_t>(n_audio)); // n_audio >= 0 here
    for (int32_t i = 0; i < n_audio; ++i) {
        // Each clip must carry a non-empty byte buffer.
        if (!is_valid_audio_buffer(&audio[i])) {
            return GGML_ENGINE_ERROR_VLM_ENCODE;
        }
        media.push_back(ggml_engine_image_from_audio(audio[i]));
    }

    return ggml_engine_vlm_generate(
        engine,
        vlm,
        prompt,
        media.empty() ? nullptr : media.data(),
        n_audio,
        sampling,
        callback,
        user_data);
}

// Count how many tokens an encoded audio clip occupies after mtmd
// tokenization. Returns the token count, or -1 on any failure.
int32_t ggml_engine_vlm_encode_audio(
    ggml_engine_vlm_t * vlm, const ggml_engine_audio * audio)
{
    const bool ready = vlm && vlm->mtmd_ctx && is_valid_audio_buffer(audio);
    if (!ready) {
        return -1;
    }

    mtmd_bitmap * bmp =
        mtmd_helper_bitmap_init_from_buf(vlm->mtmd_ctx, audio->data, audio->size);
    if (bmp == nullptr) {
        return -1;
    }

    return tokenize_media_tokens(vlm, bmp, MTMD_INPUT_CHUNK_TYPE_AUDIO);
}

char * ggml_engine_vlm_info_json(const ggml_engine_vlm_t * vlm) {
Expand Down Expand Up @@ -248,3 +313,8 @@ bool ggml_engine_vlm_supports_audio(const ggml_engine_vlm_t * vlm) {
if (!vlm || !vlm->mtmd_ctx) return false;
return mtmd_support_audio(vlm->mtmd_ctx);
}

// Query the audio bitrate reported by the loaded multimodal projector.
// Returns -1 when no multimodal context is loaded.
int32_t ggml_engine_vlm_audio_bitrate(const ggml_engine_vlm_t * vlm) {
    if (vlm == nullptr || vlm->mtmd_ctx == nullptr) {
        return -1;
    }
    return mtmd_get_audio_bitrate(vlm->mtmd_ctx);
}
21 changes: 19 additions & 2 deletions engine/ggml-engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,14 +135,20 @@ typedef struct {
int32_t image_max_tokens; // -1 = model default
} ggml_engine_vlm_params;

// Media data: file bytes (image/audio, width=0 or height=0) or raw RGB pixels.
// Fix: removed a duplicated comment line and a duplicated `width` member that
// were interleaved from the pre-change version of this struct — a struct
// cannot declare the same member twice.
typedef struct {
    const unsigned char * data; // file bytes or RGB pixels
    size_t size;                // byte count
    uint32_t width;             // 0 = file mode (auto-detect media bytes)
    uint32_t height;            // 0 = file mode
} ggml_engine_image;

// Audio data: encoded file bytes (WAV/MP3/FLAC/etc. supported by miniaudio)
typedef struct {
const unsigned char * data; // encoded audio file bytes (WAV/MP3/FLAC per miniaudio)
size_t size; // size of `data` in bytes
} ggml_engine_audio;

// Get default VLM parameters
ggml_engine_vlm_params ggml_engine_vlm_default_params(void);

Expand All @@ -168,6 +174,16 @@ ggml_engine_status ggml_engine_vlm_generate(
int32_t ggml_engine_vlm_encode_image(
ggml_engine_vlm_t * vlm, const ggml_engine_image * image);

// Convenience audio helpers. Audio inputs are passed as encoded file bytes.
ggml_engine_status ggml_engine_vlm_generate_audio(
ggml_engine_t * engine, ggml_engine_vlm_t * vlm,
const char * prompt,
const ggml_engine_audio * audio, int32_t n_audio,
ggml_engine_sampling sampling,
ggml_engine_token_callback callback, void * user_data);
int32_t ggml_engine_vlm_encode_audio(
ggml_engine_vlm_t * vlm, const ggml_engine_audio * audio);

// VLM info - returns JSON string (caller must free with ggml_engine_free_string)
char * ggml_engine_vlm_info_json(const ggml_engine_vlm_t * vlm);

Expand All @@ -177,6 +193,7 @@ const char * ggml_engine_vlm_default_marker(void);
// Capability queries
bool ggml_engine_vlm_supports_vision(const ggml_engine_vlm_t * vlm);
bool ggml_engine_vlm_supports_audio(const ggml_engine_vlm_t * vlm);
int32_t ggml_engine_vlm_audio_bitrate(const ggml_engine_vlm_t * vlm);

#ifdef __cplusplus
}
Expand Down
90 changes: 90 additions & 0 deletions engine/llama-test-cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -412,13 +412,15 @@ static void test_vlm_info(const char * model_path, const char * mmproj_path) {
char * info = ggml_engine_vlm_info_json(vlm);
TEST_ASSERT(info != nullptr, "vlm info not null", "returned null");
TEST_ASSERT(strstr(info, "supports_vision") != nullptr, "vlm info has supports_vision", "missing field");
TEST_ASSERT(strstr(info, "supports_audio") != nullptr, "vlm info has supports_audio", "missing field");
print_info("VLM info: %s", info);
ggml_engine_free_string(info);

// default marker
const char * marker = ggml_engine_vlm_default_marker();
TEST_ASSERT(marker != nullptr && strlen(marker) > 0, "vlm default marker", "empty marker");
print_info("Default marker: %s", marker);
print_info("Audio bitrate: %d", ggml_engine_vlm_audio_bitrate(vlm));

ggml_engine_vlm_free(vlm);
ggml_engine_free(engine);
Expand Down Expand Up @@ -505,6 +507,81 @@ static void test_vlm_generation(const char * model_path, const char * mmproj_pat
ggml_engine_free(engine);
}

// ---- Test: VLM Audio Encode ----
// Loads a model + mmproj, reads an audio file from disk, and checks that
// ggml_engine_vlm_encode_audio reports a positive token count for it.
static void test_vlm_audio_encode(const char * model_path, const char * mmproj_path, const char * audio_path) {
    print_header("VLM Audio Encode");

    auto engine_params = ggml_engine_default_params();
    engine_params.n_ctx = 4096;
    auto * engine = ggml_engine_create(engine_params);
    ggml_engine_load_model(engine, model_path);

    auto * vlm = ggml_engine_vlm_load(engine, mmproj_path, ggml_engine_vlm_default_params());

    const auto bytes = load_file_bytes(audio_path);
    TEST_ASSERT(!bytes.empty(), "vlm: audio file loaded", "failed to read audio file");

    ggml_engine_audio audio{};
    audio.data = bytes.data();
    audio.size = bytes.size();

    const int32_t n_tokens = ggml_engine_vlm_encode_audio(vlm, &audio);
    TEST_ASSERT(n_tokens > 0, "vlm: audio encode returns positive tokens", "expected > 0");
    print_info("Audio encoded to %d tokens", n_tokens);

    ggml_engine_vlm_free(vlm);
    ggml_engine_free(engine);
}

// ---- Test: VLM Audio Generation ----
// End-to-end check: feed one encoded audio clip plus a transcription prompt
// through ggml_engine_vlm_generate_audio, and verify that output tokens are
// produced and engine perf counters are populated.
static void test_vlm_audio_generation(const char * model_path, const char * mmproj_path, const char * audio_path) {
print_header("VLM Audio Generation");

auto params = ggml_engine_default_params();
params.n_ctx = 4096;
params.n_threads = 4;
auto * engine = ggml_engine_create(params);
ggml_engine_load_model(engine, model_path);

auto vlm_params = ggml_engine_vlm_default_params();
auto * vlm = ggml_engine_vlm_load(engine, mmproj_path, vlm_params);

// Audio is passed as encoded file bytes; the vector must stay alive for the
// duration of the generate call since ggml_engine_audio does not copy.
auto audio_bytes = load_file_bytes(audio_path);
TEST_ASSERT(!audio_bytes.empty(), "vlm: audio file loaded", "failed to read audio file");

ggml_engine_audio audio;
audio.data = audio_bytes.data();
audio.size = audio_bytes.size();

// The default marker in the prompt tells the engine where to splice in the
// media tokens.
const char * marker = ggml_engine_vlm_default_marker();
std::string prompt = std::string(marker) + "\nTranscribe this audio.";

auto sampling = ggml_engine_default_sampling();
sampling.n_predict = 64;
sampling.temperature = 0.1f; // near-greedy for more reproducible output

std::string output;
printf(" VLM audio output: ");
auto status = ggml_engine_vlm_generate_audio(engine, vlm, prompt.c_str(),
&audio, 1, sampling, token_callback, &output);
printf("\n");

TEST_ASSERT(status == GGML_ENGINE_OK, "vlm audio generation status OK", "generation failed");
TEST_ASSERT(!output.empty(), "vlm audio output not empty", "no output generated");
print_info("Generated %zu chars", output.length());

// Perf counters should reflect both the audio prompt and the generation.
auto perf = ggml_engine_get_perf(engine);
TEST_ASSERT(perf.prompt_tokens > 0, "vlm audio perf prompt tokens > 0", "expected > 0");
print_info("Prompt: %d tokens, %.1f ms (%.1f t/s)",
perf.prompt_tokens, perf.prompt_eval_ms, perf.prompt_tokens_per_sec);
print_info("Generation: %d tokens, %.1f ms (%.1f t/s)",
perf.generated_tokens, perf.generation_ms, perf.generation_tokens_per_sec);

ggml_engine_vlm_free(vlm);
ggml_engine_free(engine);
}

// ---- Test: VLM Error Cases ----
static void test_vlm_errors(const char * model_path) {
print_header("VLM Error Cases");
Expand All @@ -528,6 +605,8 @@ static void test_vlm_errors(const char * model_path) {

// is_loaded on null
TEST_ASSERT(!ggml_engine_vlm_is_loaded(nullptr), "vlm: null is not loaded", "should be false");
TEST_ASSERT(ggml_engine_vlm_encode_audio(nullptr, nullptr) == -1, "vlm: null audio encode returns -1", "wrong error code");
TEST_ASSERT(ggml_engine_vlm_audio_bitrate(nullptr) == -1, "vlm: null audio bitrate == -1", "wrong bitrate");

ggml_engine_free(engine);
}
Expand Down Expand Up @@ -791,6 +870,7 @@ static void print_usage(const char * prog) {
printf(" -m <path> Path to GGUF model file (required for model tests)\n");
printf(" --mmproj <path> Path to mmproj GGUF file (required for VLM tests)\n");
printf(" --image <path> Path to test image file (required for VLM encode/gen)\n");
printf(" --audio <path> Path to test audio file (optional for VLM audio encode/gen)\n");
printf(" --embed-model <path> Path to embedding model GGUF (for RAG tests)\n");
printf(" --rag-text <path> Path to text file for RAG large file test\n");
printf(" --all Run all tests (default)\n");
Expand All @@ -803,6 +883,7 @@ int main(int argc, char ** argv) {
const char * model_path = nullptr;
const char * mmproj_path = nullptr;
const char * image_path = nullptr;
const char * audio_path = nullptr;
const char * embed_model_path = nullptr;
const char * rag_text_path = nullptr;
bool run_model_tests = true;
Expand All @@ -815,6 +896,8 @@ int main(int argc, char ** argv) {
mmproj_path = argv[++i];
} else if (strcmp(argv[i], "--image") == 0 && i + 1 < argc) {
image_path = argv[++i];
} else if (strcmp(argv[i], "--audio") == 0 && i + 1 < argc) {
audio_path = argv[++i];
} else if (strcmp(argv[i], "--embed-model") == 0 && i + 1 < argc) {
embed_model_path = argv[++i];
} else if (strcmp(argv[i], "--rag-text") == 0 && i + 1 < argc) {
Expand Down Expand Up @@ -865,6 +948,13 @@ int main(int argc, char ** argv) {
} else {
print_info("Skipping VLM encode/generation tests (no --image provided)");
}

if (audio_path) {
test_vlm_audio_encode(model_path, mmproj_path, audio_path);
test_vlm_audio_generation(model_path, mmproj_path, audio_path);
} else {
print_info("Skipping VLM audio tests (no --audio provided)");
}
} else {
// still run error case tests (only needs text model)
if (!quick_mode) {
Expand Down