Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions engine/ggml-engine-vlm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,15 @@ bool ggml_engine_vlm_is_loaded(const ggml_engine_vlm_t * vlm) {
return vlm && vlm->mtmd_ctx;
}

// Adapt an encoded-audio payload to the generic media struct.
// width == 0 and height == 0 mark the bytes as an encoded file
// rather than raw RGB pixels, matching the image "file mode".
static ggml_engine_image ggml_engine_image_from_audio(const ggml_engine_audio & audio) {
    return ggml_engine_image{audio.data, audio.size, 0U, 0U};
}

ggml_engine_status ggml_engine_vlm_generate(
ggml_engine_t * engine, ggml_engine_vlm_t * vlm,
const char * prompt,
Expand Down Expand Up @@ -219,6 +228,42 @@ int32_t ggml_engine_vlm_encode_image(
return n_image_tokens;
}

// Generate text conditioned on one or more encoded audio files.
// Audio bytes are wrapped as file-mode media (width/height = 0) and
// forwarded to the shared ggml_engine_vlm_generate path.
//
// Returns GGML_ENGINE_ERROR_VLM_ENCODE when a positive count is given
// with a null audio pointer; otherwise whatever the generate call returns.
ggml_engine_status ggml_engine_vlm_generate_audio(
    ggml_engine_t * engine, ggml_engine_vlm_t * vlm,
    const char * prompt,
    const ggml_engine_audio * audio, int32_t n_audio,
    ggml_engine_sampling sampling,
    ggml_engine_token_callback callback, void * user_data)
{
    // A positive count with no data pointer is a caller error.
    if (!audio && n_audio > 0) {
        return GGML_ENGINE_ERROR_VLM_ENCODE;
    }

    // Clamp a negative count to zero so a bogus n_audio is never
    // forwarded to ggml_engine_vlm_generate as-is.
    const int32_t count = n_audio > 0 ? n_audio : 0;

    std::vector<ggml_engine_image> media;
    media.reserve(static_cast<size_t>(count));
    for (int32_t i = 0; i < count; ++i) {
        media.push_back(ggml_engine_image_from_audio(audio[i]));
    }

    return ggml_engine_vlm_generate(
        engine,
        vlm,
        prompt,
        media.empty() ? nullptr : media.data(),
        count,
        sampling,
        callback,
        user_data);
}

// Encode a single audio file through the shared image-encode path.
// Returns the token count from ggml_engine_vlm_encode_image, or -1
// when the input pointer is null.
int32_t ggml_engine_vlm_encode_audio(
    ggml_engine_vlm_t * vlm, const ggml_engine_audio * audio)
{
    if (audio == nullptr) {
        return -1;
    }
    const ggml_engine_image as_image = ggml_engine_image_from_audio(*audio);
    return ggml_engine_vlm_encode_image(vlm, &as_image);
}

char * ggml_engine_vlm_info_json(const ggml_engine_vlm_t * vlm) {
if (!vlm || !vlm->mtmd_ctx) return strdup_alloc("{}");

Expand Down Expand Up @@ -248,3 +293,8 @@ bool ggml_engine_vlm_supports_audio(const ggml_engine_vlm_t * vlm) {
if (!vlm || !vlm->mtmd_ctx) return false;
return mtmd_support_audio(vlm->mtmd_ctx);
}

// Query the audio bitrate reported by the loaded multimodal projector.
// Returns -1 when no VLM / mtmd context is available.
int32_t ggml_engine_vlm_audio_bitrate(const ggml_engine_vlm_t * vlm) {
    if (vlm == nullptr || !vlm->mtmd_ctx) {
        return -1;
    }
    return mtmd_get_audio_bitrate(vlm->mtmd_ctx);
}
19 changes: 18 additions & 1 deletion engine/ggml-engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,14 +135,20 @@ typedef struct {
int32_t image_max_tokens; // -1 = model default
} ggml_engine_vlm_params;

// Image data: either file bytes (width=0, height=0) or raw RGB pixels
// Media data: file bytes (image/audio, width=0 + height=0) or raw RGB pixels.
// Audio inputs reuse this struct in file mode (see ggml_engine_audio helpers).
typedef struct {
const unsigned char * data; // file bytes or RGB pixels; not copied here — presumably caller keeps it alive for the call, TODO confirm
size_t size; // byte count of `data`
uint32_t width; // 0 = file mode (auto-detect JPEG/PNG/etc); nonzero = raw pixel width
uint32_t height; // 0 = file mode; nonzero = raw pixel height
} ggml_engine_image;

// Audio data: encoded file bytes (WAV/MP3/FLAC/etc. supported by miniaudio).
// Decoding happens downstream; only the raw file bytes are passed here.
typedef struct {
const unsigned char * data; // encoded audio file bytes; not copied — presumably caller keeps it alive for the call, TODO confirm
size_t size; // byte count of `data`
} ggml_engine_audio;

// Get default VLM parameters
ggml_engine_vlm_params ggml_engine_vlm_default_params(void);

Expand All @@ -168,6 +174,16 @@ ggml_engine_status ggml_engine_vlm_generate(
int32_t ggml_engine_vlm_encode_image(
ggml_engine_vlm_t * vlm, const ggml_engine_image * image);

// Convenience audio helpers. Audio inputs are passed as encoded file bytes.
ggml_engine_status ggml_engine_vlm_generate_audio(
ggml_engine_t * engine, ggml_engine_vlm_t * vlm,
const char * prompt,
const ggml_engine_audio * audio, int32_t n_audio,
ggml_engine_sampling sampling,
ggml_engine_token_callback callback, void * user_data);
int32_t ggml_engine_vlm_encode_audio(
ggml_engine_vlm_t * vlm, const ggml_engine_audio * audio);

// VLM info - returns JSON string (caller must free with ggml_engine_free_string)
char * ggml_engine_vlm_info_json(const ggml_engine_vlm_t * vlm);

Expand All @@ -177,6 +193,7 @@ const char * ggml_engine_vlm_default_marker(void);
// Capability queries
bool ggml_engine_vlm_supports_vision(const ggml_engine_vlm_t * vlm);
bool ggml_engine_vlm_supports_audio(const ggml_engine_vlm_t * vlm);
int32_t ggml_engine_vlm_audio_bitrate(const ggml_engine_vlm_t * vlm);

#ifdef __cplusplus
}
Expand Down
89 changes: 89 additions & 0 deletions engine/llama-test-cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -412,13 +412,15 @@ static void test_vlm_info(const char * model_path, const char * mmproj_path) {
char * info = ggml_engine_vlm_info_json(vlm);
TEST_ASSERT(info != nullptr, "vlm info not null", "returned null");
TEST_ASSERT(strstr(info, "supports_vision") != nullptr, "vlm info has supports_vision", "missing field");
TEST_ASSERT(strstr(info, "supports_audio") != nullptr, "vlm info has supports_audio", "missing field");
print_info("VLM info: %s", info);
ggml_engine_free_string(info);

// default marker
const char * marker = ggml_engine_vlm_default_marker();
TEST_ASSERT(marker != nullptr && strlen(marker) > 0, "vlm default marker", "empty marker");
print_info("Default marker: %s", marker);
print_info("Audio bitrate: %d", ggml_engine_vlm_audio_bitrate(vlm));

ggml_engine_vlm_free(vlm);
ggml_engine_free(engine);
Expand Down Expand Up @@ -505,6 +507,80 @@ static void test_vlm_generation(const char * model_path, const char * mmproj_pat
ggml_engine_free(engine);
}

// ---- Test: VLM Audio Encode ----
static void test_vlm_audio_encode(const char * model_path, const char * mmproj_path, const char * audio_path) {
print_header("VLM Audio Encode");

auto params = ggml_engine_default_params();
params.n_ctx = 4096;
auto * engine = ggml_engine_create(params);
ggml_engine_load_model(engine, model_path);

auto vlm_params = ggml_engine_vlm_default_params();
auto * vlm = ggml_engine_vlm_load(engine, mmproj_path, vlm_params);

auto audio_bytes = load_file_bytes(audio_path);
TEST_ASSERT(!audio_bytes.empty(), "vlm: audio file loaded", "failed to read audio file");

ggml_engine_audio audio;
audio.data = audio_bytes.data();
audio.size = audio_bytes.size();

int32_t n_tokens = ggml_engine_vlm_encode_audio(vlm, &audio);
TEST_ASSERT(n_tokens > 0, "vlm: audio encode returns positive tokens", "expected > 0");
print_info("Audio encoded to %d tokens", n_tokens);

ggml_engine_vlm_free(vlm);
ggml_engine_free(engine);
}

// ---- Test: VLM Audio Generation ----
static void test_vlm_audio_generation(const char * model_path, const char * mmproj_path, const char * audio_path) {
print_header("VLM Audio Generation");

auto params = ggml_engine_default_params();
params.n_ctx = 4096;
params.n_threads = 4;
auto * engine = ggml_engine_create(params);
ggml_engine_load_model(engine, model_path);

auto vlm_params = ggml_engine_vlm_default_params();
auto * vlm = ggml_engine_vlm_load(engine, mmproj_path, vlm_params);

auto audio_bytes = load_file_bytes(audio_path);

ggml_engine_audio audio;
audio.data = audio_bytes.data();
audio.size = audio_bytes.size();

const char * marker = ggml_engine_vlm_default_marker();
std::string prompt = std::string(marker) + "\nTranscribe this audio.";

auto sampling = ggml_engine_default_sampling();
sampling.n_predict = 64;
sampling.temperature = 0.1f;

std::string output;
printf(" VLM audio output: ");
auto status = ggml_engine_vlm_generate_audio(engine, vlm, prompt.c_str(),
&audio, 1, sampling, token_callback, &output);
printf("\n");

TEST_ASSERT(status == GGML_ENGINE_OK, "vlm audio generation status OK", "generation failed");
TEST_ASSERT(!output.empty(), "vlm audio output not empty", "no output generated");
print_info("Generated %zu chars", output.length());

auto perf = ggml_engine_get_perf(engine);
TEST_ASSERT(perf.prompt_tokens > 0, "vlm audio perf prompt tokens > 0", "expected > 0");
print_info("Prompt: %d tokens, %.1f ms (%.1f t/s)",
perf.prompt_tokens, perf.prompt_eval_ms, perf.prompt_tokens_per_sec);
print_info("Generation: %d tokens, %.1f ms (%.1f t/s)",
perf.generated_tokens, perf.generation_ms, perf.generation_tokens_per_sec);

ggml_engine_vlm_free(vlm);
ggml_engine_free(engine);
}

// ---- Test: VLM Error Cases ----
static void test_vlm_errors(const char * model_path) {
print_header("VLM Error Cases");
Expand All @@ -528,6 +604,8 @@ static void test_vlm_errors(const char * model_path) {

// is_loaded on null
TEST_ASSERT(!ggml_engine_vlm_is_loaded(nullptr), "vlm: null is not loaded", "should be false");
TEST_ASSERT(ggml_engine_vlm_encode_audio(nullptr, nullptr) == -1, "vlm: null audio encode returns -1", "wrong error code");
TEST_ASSERT(ggml_engine_vlm_audio_bitrate(nullptr) == -1, "vlm: null audio bitrate == -1", "wrong bitrate");

ggml_engine_free(engine);
}
Expand Down Expand Up @@ -791,6 +869,7 @@ static void print_usage(const char * prog) {
printf(" -m <path> Path to GGUF model file (required for model tests)\n");
printf(" --mmproj <path> Path to mmproj GGUF file (required for VLM tests)\n");
printf(" --image <path> Path to test image file (required for VLM encode/gen)\n");
printf(" --audio <path> Path to test audio file (optional for VLM audio encode/gen)\n");
printf(" --embed-model <path> Path to embedding model GGUF (for RAG tests)\n");
printf(" --rag-text <path> Path to text file for RAG large file test\n");
printf(" --all Run all tests (default)\n");
Expand All @@ -803,6 +882,7 @@ int main(int argc, char ** argv) {
const char * model_path = nullptr;
const char * mmproj_path = nullptr;
const char * image_path = nullptr;
const char * audio_path = nullptr;
const char * embed_model_path = nullptr;
const char * rag_text_path = nullptr;
bool run_model_tests = true;
Expand All @@ -815,6 +895,8 @@ int main(int argc, char ** argv) {
mmproj_path = argv[++i];
} else if (strcmp(argv[i], "--image") == 0 && i + 1 < argc) {
image_path = argv[++i];
} else if (strcmp(argv[i], "--audio") == 0 && i + 1 < argc) {
audio_path = argv[++i];
} else if (strcmp(argv[i], "--embed-model") == 0 && i + 1 < argc) {
embed_model_path = argv[++i];
} else if (strcmp(argv[i], "--rag-text") == 0 && i + 1 < argc) {
Expand Down Expand Up @@ -865,6 +947,13 @@ int main(int argc, char ** argv) {
} else {
print_info("Skipping VLM encode/generation tests (no --image provided)");
}

if (audio_path) {
test_vlm_audio_encode(model_path, mmproj_path, audio_path);
test_vlm_audio_generation(model_path, mmproj_path, audio_path);
} else {
print_info("Skipping VLM audio tests (no --audio provided)");
}
} else {
// still run error case tests (only needs text model)
if (!quick_mode) {
Expand Down
Loading