diff --git a/engine/ggml-engine-vlm.cpp b/engine/ggml-engine-vlm.cpp index fd58caf5..86a1ddcc 100644 --- a/engine/ggml-engine-vlm.cpp +++ b/engine/ggml-engine-vlm.cpp @@ -80,6 +80,15 @@ bool ggml_engine_vlm_is_loaded(const ggml_engine_vlm_t * vlm) { return vlm && vlm->mtmd_ctx; } +static ggml_engine_image ggml_engine_image_from_audio(const ggml_engine_audio & audio) { + ggml_engine_image image{}; + image.data = audio.data; + image.size = audio.size; + image.width = 0; + image.height = 0; + return image; +} + ggml_engine_status ggml_engine_vlm_generate( ggml_engine_t * engine, ggml_engine_vlm_t * vlm, const char * prompt, @@ -172,21 +181,23 @@ ggml_engine_status ggml_engine_vlm_generate( return ggml_engine_generate_loop(engine, sampling, callback, user_data); } -int32_t ggml_engine_vlm_encode_image( - ggml_engine_vlm_t * vlm, const ggml_engine_image * image) -{ - if (!vlm || !vlm->mtmd_ctx || !image) return -1; - - // create bitmap - mtmd_bitmap * bmp = nullptr; - if (image->width == 0 || image->height == 0) { - bmp = mtmd_helper_bitmap_init_from_buf(vlm->mtmd_ctx, image->data, image->size); - } else { - bmp = mtmd_bitmap_init(image->width, image->height, image->data); +static int32_t count_media_tokens(mtmd_input_chunks * chunks, mtmd_input_chunk_type chunk_type) { + int32_t n_tokens = 0; + const size_t n_chunks = mtmd_input_chunks_size(chunks); + for (size_t i = 0; i < n_chunks; i++) { + const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i); + if (mtmd_input_chunk_get_type(chunk) == chunk_type) { + n_tokens += (int32_t) mtmd_input_chunk_get_n_tokens(chunk); + } } - if (!bmp) return -1; + return n_tokens; +} + +static int32_t tokenize_media_tokens( + ggml_engine_vlm_t * vlm, mtmd_bitmap * bmp, mtmd_input_chunk_type chunk_type) +{ + if (!vlm || !vlm->mtmd_ctx || !bmp) return -1; - // tokenize with a simple marker prompt to get token count mtmd_input_chunks * chunks = mtmd_input_chunks_init(); mtmd_input_text input_text; const char * marker = 
mtmd_default_marker(); @@ -205,18 +216,72 @@ int32_t ggml_engine_vlm_encode_image( return -1; } - // count image tokens from chunks - int32_t n_image_tokens = 0; - size_t n_chunks = mtmd_input_chunks_size(chunks); - for (size_t i = 0; i < n_chunks; i++) { - const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i); - if (mtmd_input_chunk_get_type(chunk) == MTMD_INPUT_CHUNK_TYPE_IMAGE) { - n_image_tokens += (int32_t)mtmd_input_chunk_get_n_tokens(chunk); + const int32_t n_tokens = count_media_tokens(chunks, chunk_type); + mtmd_input_chunks_free(chunks); + return n_tokens; +} + +static bool is_valid_audio_buffer(const ggml_engine_audio * audio) { + return audio && audio->data != nullptr && audio->size > 0; +} + +int32_t ggml_engine_vlm_encode_image( + ggml_engine_vlm_t * vlm, const ggml_engine_image * image) +{ + if (!vlm || !vlm->mtmd_ctx || !image) return -1; + + // create bitmap + mtmd_bitmap * bmp = nullptr; + if (image->width == 0 || image->height == 0) { + bmp = mtmd_helper_bitmap_init_from_buf(vlm->mtmd_ctx, image->data, image->size); + } else { + bmp = mtmd_bitmap_init(image->width, image->height, image->data); + } + if (!bmp) return -1; + + return tokenize_media_tokens(vlm, bmp, MTMD_INPUT_CHUNK_TYPE_IMAGE); +} + +ggml_engine_status ggml_engine_vlm_generate_audio( + ggml_engine_t * engine, ggml_engine_vlm_t * vlm, + const char * prompt, + const ggml_engine_audio * audio, int32_t n_audio, + ggml_engine_sampling sampling, + ggml_engine_token_callback callback, void * user_data) +{ + if (n_audio < 0 || (!audio && n_audio > 0)) { + return GGML_ENGINE_ERROR_VLM_ENCODE; + } + + std::vector<ggml_engine_image> media; + media.reserve(n_audio > 0 ? 
static_cast<size_t>(n_audio) : 0U); + for (int32_t i = 0; i < n_audio; ++i) { + if (!is_valid_audio_buffer(&audio[i])) { + return GGML_ENGINE_ERROR_VLM_ENCODE; } + media.push_back(ggml_engine_image_from_audio(audio[i])); } - mtmd_input_chunks_free(chunks); - return n_image_tokens; + return ggml_engine_vlm_generate( + engine, + vlm, + prompt, + media.empty() ? nullptr : media.data(), + n_audio, + sampling, + callback, + user_data); +} + +int32_t ggml_engine_vlm_encode_audio( + ggml_engine_vlm_t * vlm, const ggml_engine_audio * audio) +{ + if (!vlm || !vlm->mtmd_ctx || !is_valid_audio_buffer(audio)) return -1; + + mtmd_bitmap * bmp = mtmd_helper_bitmap_init_from_buf(vlm->mtmd_ctx, audio->data, audio->size); + if (!bmp) return -1; + + return tokenize_media_tokens(vlm, bmp, MTMD_INPUT_CHUNK_TYPE_AUDIO); } char * ggml_engine_vlm_info_json(const ggml_engine_vlm_t * vlm) { @@ -248,3 +313,8 @@ bool ggml_engine_vlm_supports_audio(const ggml_engine_vlm_t * vlm) { if (!vlm || !vlm->mtmd_ctx) return false; return mtmd_support_audio(vlm->mtmd_ctx); } + +int32_t ggml_engine_vlm_audio_bitrate(const ggml_engine_vlm_t * vlm) { + if (!vlm || !vlm->mtmd_ctx) return -1; + return mtmd_get_audio_bitrate(vlm->mtmd_ctx); +} diff --git a/engine/ggml-engine.h b/engine/ggml-engine.h index 0f8625f5..16271465 100644 --- a/engine/ggml-engine.h +++ b/engine/ggml-engine.h @@ -135,14 +135,20 @@ typedef struct { int32_t image_max_tokens; // -1 = model default } ggml_engine_vlm_params; -// Image data: either file bytes (width=0, height=0) or raw RGB pixels +// Media data: file bytes (image/audio, width=0 or height=0) or raw RGB pixels typedef struct { const unsigned char * data; // file bytes or RGB pixels size_t size; // byte count - uint32_t width; // 0 = file mode (auto-detect JPEG/PNG/etc) + uint32_t width; // 0 = file mode (auto-detect media bytes) uint32_t height; // 0 = file mode } ggml_engine_image; +// Audio data: encoded file bytes (WAV/MP3/FLAC/etc. 
supported by miniaudio) +typedef struct { + const unsigned char * data; + size_t size; +} ggml_engine_audio; + // Get default VLM parameters ggml_engine_vlm_params ggml_engine_vlm_default_params(void); @@ -168,6 +174,16 @@ ggml_engine_status ggml_engine_vlm_generate( int32_t ggml_engine_vlm_encode_image( ggml_engine_vlm_t * vlm, const ggml_engine_image * image); +// Convenience audio helpers. Audio inputs are passed as encoded file bytes. +ggml_engine_status ggml_engine_vlm_generate_audio( + ggml_engine_t * engine, ggml_engine_vlm_t * vlm, + const char * prompt, + const ggml_engine_audio * audio, int32_t n_audio, + ggml_engine_sampling sampling, + ggml_engine_token_callback callback, void * user_data); +int32_t ggml_engine_vlm_encode_audio( + ggml_engine_vlm_t * vlm, const ggml_engine_audio * audio); + // VLM info - returns JSON string (caller must free with ggml_engine_free_string) char * ggml_engine_vlm_info_json(const ggml_engine_vlm_t * vlm); @@ -177,6 +193,7 @@ const char * ggml_engine_vlm_default_marker(void); // Capability queries bool ggml_engine_vlm_supports_vision(const ggml_engine_vlm_t * vlm); bool ggml_engine_vlm_supports_audio(const ggml_engine_vlm_t * vlm); +int32_t ggml_engine_vlm_audio_bitrate(const ggml_engine_vlm_t * vlm); #ifdef __cplusplus } diff --git a/engine/llama-test-cli.cpp b/engine/llama-test-cli.cpp index 58dbfb7f..4aeb2a5c 100644 --- a/engine/llama-test-cli.cpp +++ b/engine/llama-test-cli.cpp @@ -412,6 +412,7 @@ static void test_vlm_info(const char * model_path, const char * mmproj_path) { char * info = ggml_engine_vlm_info_json(vlm); TEST_ASSERT(info != nullptr, "vlm info not null", "returned null"); TEST_ASSERT(strstr(info, "supports_vision") != nullptr, "vlm info has supports_vision", "missing field"); + TEST_ASSERT(strstr(info, "supports_audio") != nullptr, "vlm info has supports_audio", "missing field"); print_info("VLM info: %s", info); ggml_engine_free_string(info); @@ -419,6 +420,7 @@ static void test_vlm_info(const char * 
model_path, const char * mmproj_path) { const char * marker = ggml_engine_vlm_default_marker(); TEST_ASSERT(marker != nullptr && strlen(marker) > 0, "vlm default marker", "empty marker"); print_info("Default marker: %s", marker); + print_info("Audio bitrate: %d", ggml_engine_vlm_audio_bitrate(vlm)); ggml_engine_vlm_free(vlm); ggml_engine_free(engine); @@ -505,6 +507,81 @@ static void test_vlm_generation(const char * model_path, const char * mmproj_pat ggml_engine_free(engine); } +// ---- Test: VLM Audio Encode ---- +static void test_vlm_audio_encode(const char * model_path, const char * mmproj_path, const char * audio_path) { + print_header("VLM Audio Encode"); + + auto params = ggml_engine_default_params(); + params.n_ctx = 4096; + auto * engine = ggml_engine_create(params); + ggml_engine_load_model(engine, model_path); + + auto vlm_params = ggml_engine_vlm_default_params(); + auto * vlm = ggml_engine_vlm_load(engine, mmproj_path, vlm_params); + + auto audio_bytes = load_file_bytes(audio_path); + TEST_ASSERT(!audio_bytes.empty(), "vlm: audio file loaded", "failed to read audio file"); + + ggml_engine_audio audio; + audio.data = audio_bytes.data(); + audio.size = audio_bytes.size(); + + int32_t n_tokens = ggml_engine_vlm_encode_audio(vlm, &audio); + TEST_ASSERT(n_tokens > 0, "vlm: audio encode returns positive tokens", "expected > 0"); + print_info("Audio encoded to %d tokens", n_tokens); + + ggml_engine_vlm_free(vlm); + ggml_engine_free(engine); +} + +// ---- Test: VLM Audio Generation ---- +static void test_vlm_audio_generation(const char * model_path, const char * mmproj_path, const char * audio_path) { + print_header("VLM Audio Generation"); + + auto params = ggml_engine_default_params(); + params.n_ctx = 4096; + params.n_threads = 4; + auto * engine = ggml_engine_create(params); + ggml_engine_load_model(engine, model_path); + + auto vlm_params = ggml_engine_vlm_default_params(); + auto * vlm = ggml_engine_vlm_load(engine, mmproj_path, vlm_params); + + auto 
audio_bytes = load_file_bytes(audio_path); + TEST_ASSERT(!audio_bytes.empty(), "vlm: audio file loaded", "failed to read audio file"); + + ggml_engine_audio audio; + audio.data = audio_bytes.data(); + audio.size = audio_bytes.size(); + + const char * marker = ggml_engine_vlm_default_marker(); + std::string prompt = std::string(marker) + "\nTranscribe this audio."; + + auto sampling = ggml_engine_default_sampling(); + sampling.n_predict = 64; + sampling.temperature = 0.1f; + + std::string output; + printf(" VLM audio output: "); + auto status = ggml_engine_vlm_generate_audio(engine, vlm, prompt.c_str(), + &audio, 1, sampling, token_callback, &output); + printf("\n"); + + TEST_ASSERT(status == GGML_ENGINE_OK, "vlm audio generation status OK", "generation failed"); + TEST_ASSERT(!output.empty(), "vlm audio output not empty", "no output generated"); + print_info("Generated %zu chars", output.length()); + + auto perf = ggml_engine_get_perf(engine); + TEST_ASSERT(perf.prompt_tokens > 0, "vlm audio perf prompt tokens > 0", "expected > 0"); + print_info("Prompt: %d tokens, %.1f ms (%.1f t/s)", + perf.prompt_tokens, perf.prompt_eval_ms, perf.prompt_tokens_per_sec); + print_info("Generation: %d tokens, %.1f ms (%.1f t/s)", + perf.generated_tokens, perf.generation_ms, perf.generation_tokens_per_sec); + + ggml_engine_vlm_free(vlm); + ggml_engine_free(engine); +} + // ---- Test: VLM Error Cases ---- static void test_vlm_errors(const char * model_path) { print_header("VLM Error Cases"); @@ -528,6 +605,8 @@ static void test_vlm_errors(const char * model_path) { // is_loaded on null TEST_ASSERT(!ggml_engine_vlm_is_loaded(nullptr), "vlm: null is not loaded", "should be false"); + TEST_ASSERT(ggml_engine_vlm_encode_audio(nullptr, nullptr) == -1, "vlm: null audio encode returns -1", "wrong error code"); + TEST_ASSERT(ggml_engine_vlm_audio_bitrate(nullptr) == -1, "vlm: null audio bitrate == -1", "wrong bitrate"); ggml_engine_free(engine); } @@ -791,6 +870,7 @@ static void 
print_usage(const char * prog) { printf(" -m Path to GGUF model file (required for model tests)\n"); printf(" --mmproj Path to mmproj GGUF file (required for VLM tests)\n"); printf(" --image Path to test image file (required for VLM encode/gen)\n"); + printf(" --audio Path to test audio file (optional for VLM audio encode/gen)\n"); printf(" --embed-model Path to embedding model GGUF (for RAG tests)\n"); printf(" --rag-text Path to text file for RAG large file test\n"); printf(" --all Run all tests (default)\n"); @@ -803,6 +883,7 @@ int main(int argc, char ** argv) { const char * model_path = nullptr; const char * mmproj_path = nullptr; const char * image_path = nullptr; + const char * audio_path = nullptr; const char * embed_model_path = nullptr; const char * rag_text_path = nullptr; bool run_model_tests = true; @@ -815,6 +896,8 @@ int main(int argc, char ** argv) { mmproj_path = argv[++i]; } else if (strcmp(argv[i], "--image") == 0 && i + 1 < argc) { image_path = argv[++i]; + } else if (strcmp(argv[i], "--audio") == 0 && i + 1 < argc) { + audio_path = argv[++i]; } else if (strcmp(argv[i], "--embed-model") == 0 && i + 1 < argc) { embed_model_path = argv[++i]; } else if (strcmp(argv[i], "--rag-text") == 0 && i + 1 < argc) { @@ -865,6 +948,13 @@ int main(int argc, char ** argv) { } else { print_info("Skipping VLM encode/generation tests (no --image provided)"); } + + if (audio_path) { + test_vlm_audio_encode(model_path, mmproj_path, audio_path); + test_vlm_audio_generation(model_path, mmproj_path, audio_path); + } else { + print_info("Skipping VLM audio tests (no --audio provided)"); + } } else { // still run error case tests (only needs text model) if (!quick_mode) {