From eb3b96c96ac3569755047b876dcf61b65256dea9 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 1 Oct 2024 11:23:42 -0700 Subject: [PATCH 01/13] initial pass --- .gitignore | 2 + Makefile | 5 + README.md | 8 +- bench/batch.sh | 1 + debug.sql | 46 ++++++ sqlite-lembed.c | 424 ++++++++++++++++++++++++++++++++++++++++++++++-- test.sql | 95 ++++++++++- tmp.py | 16 ++ tmp.sql | 49 ++++++ 9 files changed, 631 insertions(+), 15 deletions(-) create mode 100644 bench/batch.sh create mode 100644 debug.sql create mode 100644 tmp.py create mode 100644 tmp.sql diff --git a/.gitignore b/.gitignore index cc72133..e1e2d23 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ ggml-metal.metal News*.json sqlite-lembed.h dist/ + +*.db diff --git a/Makefile b/Makefile index b11db35..3608c39 100644 --- a/Makefile +++ b/Makefile @@ -85,6 +85,11 @@ $(TARGET_LOADABLE): sqlite-lembed.c sqlite-lembed.h $(BUILD_DIR) $(prefix) ls $(BUILD_DIR) cp $(BUILT_LOADABLE_PATH) $@ +$(TARGET_STATIC): sqlite-lembed.c sqlite-lembed.h $(BUILD_DIR) $(prefix) + cmake --build $(BUILD_DIR) -t sqlite_lembed_static $(EXTRA_CMAKE_BUILD) + ls $(BUILD_DIR) + cp $(BUILT_LOADABLE_PATH) $@ + sqlite-lembed.h: sqlite-lembed.h.tmpl VERSION VERSION=$(shell cat VERSION) \ diff --git a/README.md b/README.md index 2d856ef..7fc141f 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,8 @@ To load it into `sqlite-lembed`, register it with the `temp.lembed_models` table ```sql .load ./lembed0 -INSERT INTO temp.lembed_models(name, model) - select 'all-MiniLM-L6-v2', lembed_model_from_file('all-MiniLM-L6-v2.e4ce9877.q8_0.gguf'); +insert into temp.lembed_models(name, model) + values ('default', lembed_model_from_file('all-MiniLM-L6-v2.e4ce9877.q8_0.gguf')); select lembed( 'all-MiniLM-L6-v2', @@ -54,7 +54,7 @@ create virtual table vec_articles using vec0( ); insert into vec_articles(rowid, headline_embeddings) - select rowid, lembed('all-MiniLM-L6-v2', headline) + select rowid, lembed( headline) from articles; ``` @@ -71,7 +71,7 @@ with matches as ( rowid, distance from vec_articles - where headline_embeddings match lembed('all-MiniLM-L6-v2', :query) + where headline_embeddings match lembed(:query) order by distance limit 3 ) diff --git a/bench/batch.sh b/bench/batch.sh new file mode 100644 index 0000000..a9bf588 --- /dev/null +++ b/bench/batch.sh @@ -0,0 +1 @@ +#!/bin/bash diff --git a/debug.sql b/debug.sql new file mode 100644 index 0000000..8c8051e --- /dev/null +++ b/debug.sql @@ -0,0 +1,46 @@ +.load ./dist/lembed0 +.load ../sqlite-vec/dist/vec0 + +select lembed_version(), lembed_debug(); + +insert into temp.lembed_models(name, model) + select 'default', lembed_model_from_file(''); + +select vec_to_json(vec_slice(lembed('Shohei Ohtani''s ex-interpreter pleads guilty to charges related to gambling and theft'), 0, 8)); +select vec_to_json(vec_slice(lembed('The jury has been selected in Hunter Biden''s gun trial'), 0, 8)); + + +.mode box +.header on + +select + rowid, + contents, + --length(embedding), + vec_to_json(vec_slice(embedding, 0, 8)), + vec_to_json(vec_slice(lembed(contents), 0, 8)) + +from lembed_batch( + ( + '[ + {"contents": "Shohei Ohtani''s ex-interpreter pleads guilty to charges related to gambling and theft"} + ]' + ) +); + + +select + rowid, + contents, + --length(embedding), + vec_to_json(vec_slice(embedding, 0, 8)), + vec_to_json(vec_slice(lembed(contents), 0, 8)) + +from lembed_batch( + ( + '[ + {"contents": "Shohei Ohtani''s ex-interpreter pleads guilty to charges related to gambling and theft"}, + {"contents": "The jury has 
been selected in Hunter Biden''s gun trial"} + ]' + ) +); diff --git a/sqlite-lembed.c b/sqlite-lembed.c index 479a554..d833967 100644 --- a/sqlite-lembed.c +++ b/sqlite-lembed.c @@ -16,7 +16,7 @@ SQLITE_EXTENSION_INIT1 void dummy_log(enum ggml_log_level level, const char *text, void *user_data) {} static void normalize(float *vec, float *out, int n) { - float norm = 0; + double norm = 0; for (int i = 0; i < n; i++) { norm += vec[i] * vec[i]; } @@ -52,16 +52,24 @@ int tokenize(struct llama_model *model, const char *input, size_t input_length, return SQLITE_OK; } -int embed_single(struct llama_model *model, struct llama_context *context, +int embed_single(struct llama_context *context, const char *input, size_t input_length, /** Output float embedding */ float **out_embedding, /** Output embedding length (n dimensions) */ int *out_dimensions) { + struct llama_model * model = (struct llama_model *) llama_get_model(context); + int n_batch = 512; int n_ctx_train = llama_n_ctx_train(model); int n_ctx = llama_n_ctx(context); + int dimensions = llama_n_embd(model); + float *output_embedding = sqlite3_malloc(sizeof(float) * dimensions); + if(!output_embedding) { + return SQLITE_NOMEM; + } + llama_token *tokens; int token_count; int rc = tokenize(model, input, input_length, &token_count, &tokens); @@ -85,13 +93,6 @@ int embed_single(struct llama_model *model, struct llama_context *context, batch.n_tokens++; } - int dimensions = llama_n_embd(model); - float *output_embedding = sqlite3_malloc(sizeof(float) * dimensions); - if(!output_embedding) { - llama_batch_free(batch); - return SQLITE_NOMEM; - } - llama_kv_cache_clear(context); // KV not needed for embeddings? rc = llama_decode(context, batch); if(rc != 0) { @@ -302,7 +303,7 @@ static void lembed(sqlite3_context *context, int argc, sqlite3_value **argv) { int dimensions; float *embedding; - rc = embed_single(model, ctx, input, input_len, &embedding, &dimensions); + rc = embed_single(ctx, input, input_len, &embedding, &dimensions); if(rc != SQLITE_OK) { sqlite3_result_error(context, "Error generating embedding", -1); return; @@ -478,6 +479,7 @@ static int lembed_modelsUpdate(sqlite3_vtab *pVTab, int argc, struct llama_context *ctx; struct llama_context_params cparams = llama_context_default_params(); cparams.embeddings = 1; + cparams.n_ubatch = cparams.n_batch = 512; if (contextOptions) { if (contextOptions->defined[0]) { cparams.seed = contextOptions->seed; @@ -833,6 +835,407 @@ static sqlite3_module lembed_chunksModule = { /* xShadowName */ 0}; #pragma endregion +#pragma region lembed_batch + + +struct Array { + size_t element_size; + size_t length; + size_t capacity; + void *z; +}; + +/** + * @brief Initial an array with the given element size and capacity. + * + * @param array + * @param element_size + * @param init_capacity + * @return SQLITE_OK on success, error code on failure. 
Only error is + * SQLITE_NOMEM + */ +int array_init(struct Array *array, size_t element_size, size_t init_capacity) { + int sz = element_size * init_capacity; + void *z = sqlite3_malloc(sz); + if (!z) { + return SQLITE_NOMEM; + } + memset(z, 0, sz); + + array->element_size = element_size; + array->length = 0; + array->capacity = init_capacity; + array->z = z; + return SQLITE_OK; +} + +int array_append(struct Array *array, const void *element) { + if (array->length == array->capacity) { + size_t new_capacity = array->capacity * 2 + 100; + void *z = sqlite3_realloc64(array->z, array->element_size * new_capacity); + if (z) { + array->capacity = new_capacity; + array->z = z; + } else { + return SQLITE_NOMEM; + } + } + memcpy(&((unsigned char *)array->z)[array->length * array->element_size], + element, array->element_size); + array->length++; + return SQLITE_OK; +} + +void array_cleanup(struct Array *array) { + if (!array) + return; + array->element_size = 0; + array->length = 0; + array->capacity = 0; + sqlite3_free(array->z); + array->z = NULL; +} + +typedef struct lembed_batch_vtab lembed_batch_vtab; +struct lembed_batch_vtab { + sqlite3_vtab base; + sqlite3 * db; + struct Api * api; +}; + +typedef struct lembed_batch_cursor lembed_batch_cursor; +struct lembed_batch_cursor { + sqlite3_vtab_cursor base; + struct Api * api; + struct llama_context *lctx; + sqlite3_int64 iRowid; + sqlite3_stmt * stmt; + int dimensions; + int eof; + int stmtRc; + + + int batchIdx; + int batchSize; + struct Array contentsArray; + struct Array contentLengthsArray; + float * embeddings; +}; + + +static int lembed_batchConnect( + sqlite3 *db, + void *pAux, + int argc, const char *const*argv, + sqlite3_vtab **ppVtab, + char **pzErr +){ + lembed_batch_vtab *pNew; + int rc; + + rc = sqlite3_declare_vtab(db, + "CREATE TABLE x(contents,embedding, model hidden, input hidden)" + ); +#define LEMBED_BATCH_CONTENTS 0 +#define LEMBED_BATCH_EMBEDDING 1 +#define LEMBED_BATCH_MODEL 2 +#define LEMBED_BATCH_INPUT 3 + if( rc==SQLITE_OK ){ + pNew = sqlite3_malloc( sizeof(*pNew) ); + *ppVtab = (sqlite3_vtab*)pNew; + if( pNew==0 ) return SQLITE_NOMEM; + memset(pNew, 0, sizeof(*pNew)); + } + rc = sqlite3_open(":memory:", &pNew->db); + pNew->api = pAux; + return rc; +} + +static int lembed_batchDisconnect(sqlite3_vtab *pVtab){ + lembed_batch_vtab *p = (lembed_batch_vtab*)pVtab; + sqlite3_close(p->db); + sqlite3_free(p); + return SQLITE_OK; +} + +static int lembed_batchOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor){ + lembed_batch_cursor *pCur; + pCur = sqlite3_malloc( sizeof(*pCur) ); + if( pCur==0 ) return SQLITE_NOMEM; + memset(pCur, 0, sizeof(*pCur)); + *ppCursor = &pCur->base; + pCur->api = ( (lembed_batch_vtab *) p)->api; + int rc = sqlite3_prepare_v2( + ( (lembed_batch_vtab *) p)->db, + "select json_extract(value, '$.contents') from json_each(?)", + -1, + &pCur->stmt, + NULL + ); + assert(rc == SQLITE_OK); + return rc; +} + +static int lembed_batchClose(sqlite3_vtab_cursor *cur){ + lembed_batch_cursor *pCur = (lembed_batch_cursor*)cur; + sqlite3_finalize(pCur->stmt); + sqlite3_free(pCur); + return SQLITE_OK; +} + +static int lembed_batchBestIndex( + sqlite3_vtab *pVTab, + sqlite3_index_info *pIdxInfo +){ + + int hasSource = 0; + for (int i = 0; i < pIdxInfo->nConstraint; i++) { + const struct sqlite3_index_constraint *pCons = &pIdxInfo->aConstraint[i]; + switch (pCons->iColumn) { + case LEMBED_BATCH_MODEL: { + if (!hasSource && !pCons->usable || + pCons->op != SQLITE_INDEX_CONSTRAINT_EQ) + return SQLITE_CONSTRAINT; + 
hasSource = 1; + pIdxInfo->aConstraintUsage[i].argvIndex = 1; + pIdxInfo->aConstraintUsage[i].omit = 1; + break; + } + } + } + if (!hasSource) { + pVTab->zErrMsg = sqlite3_mprintf("source argument is required"); + return SQLITE_ERROR; + } + + pIdxInfo->estimatedCost = (double)10; + pIdxInfo->estimatedRows = 10; + return SQLITE_OK; +} + +// SQLITE_ROW: embed some, stmt has more +// SQLITE_DONE: done after this chunk +// else: error +int embed_batch( + lembed_batch_cursor *pCur + ) { + int32_t n_batch = 512; + struct llama_batch batch = llama_batch_init(n_batch, 0, 1); + int nprocessed = 0; + int rc; + + while(1) { + if(pCur->stmtRc == SQLITE_DONE) { + pCur->eof = 1; + break; + } + assert(pCur->stmtRc == SQLITE_ROW); + + char * s = (char *) sqlite3_column_text(pCur->stmt, 0); + int len = sqlite3_column_bytes(pCur->stmt, 0); + + int input_token_count_estimate = llama_tokenize(llama_get_model(pCur->lctx), s, len, NULL, 0, true, true); + assert(input_token_count_estimate < 0); + llama_token *tokens = sqlite3_malloc(sizeof(llama_token) * abs(input_token_count_estimate)); + assert(tokens); + + int input_token_count = llama_tokenize(llama_get_model(pCur->lctx), s, len, tokens, abs(input_token_count_estimate), true, true); + assert(input_token_count == abs(input_token_count_estimate)); + + if (batch.n_tokens + input_token_count > n_batch) { + assert(nprocessed>0); + sqlite3_free(tokens); + break; + } + + for (size_t i = 0; i < input_token_count; i++) { + batch.token [batch.n_tokens] = tokens[i]; + batch.pos [batch.n_tokens] = i; + batch.n_seq_id[batch.n_tokens] = 1; + batch.seq_id[batch.n_tokens][0] = nprocessed; + batch.logits [batch.n_tokens] = i == (input_token_count - 1); + batch.n_tokens++; + } + sqlite3_free(tokens); + nprocessed += 1; + char * zCopy = sqlite3_mprintf("%.*s", len, s); + assert(zCopy); + assert(array_append(&pCur->contentsArray, &zCopy) == SQLITE_OK); + assert(array_append(&pCur->contentLengthsArray, &len) == SQLITE_OK); + pCur->stmtRc = sqlite3_step(pCur->stmt); + } + if(nprocessed==0) { + pCur->batchSize = 0; + pCur->batchIdx = 0; + return SQLITE_DONE; + } + printf("nprocessed=%d\n", nprocessed); + + float * embeddings = sqlite3_malloc(pCur->dimensions * sizeof(float) * nprocessed); + assert(embeddings); + memset(embeddings, 0, pCur->dimensions * sizeof(float) * nprocessed); + + llama_kv_cache_clear(pCur->lctx); + rc = llama_decode(pCur->lctx, batch); + assert(rc >= 0 ); + for (int i = 0; i < batch.n_tokens; i++) { + if (!batch.logits[i]) { + continue; + } + + float * embd = llama_get_embeddings_seq(pCur->lctx, batch.seq_id[i][0]); + assert(embd); + float * out = embeddings + batch.seq_id[i][0] * pCur->dimensions; + normalize(embd, out, pCur->dimensions); + } + + llama_batch_free(batch); + pCur->embeddings = embeddings; + pCur->batchSize = nprocessed; + pCur->batchIdx = 0; + return SQLITE_ROW; +} +static int lembed_batchFilter( + sqlite3_vtab_cursor *pVtabCursor, + int idxNum, const char *idxStr, + int argc, sqlite3_value **argv +){ + int rc; + lembed_batch_cursor *pCur = (lembed_batch_cursor *)pVtabCursor; + sqlite3_reset(pCur->stmt); + sqlite3_clear_bindings(pCur->stmt); + sqlite3_bind_text(pCur->stmt, 1, sqlite3_value_text(argv[0]), sqlite3_value_bytes(argv[0]), SQLITE_TRANSIENT); + pCur->stmtRc = sqlite3_step(pCur->stmt); + assert(pCur->stmtRc == SQLITE_ROW || pCur->stmtRc == SQLITE_DONE); + + struct llama_model *model; + rc = api_model_from_name(pCur->api, "default", strlen("default"), &model, &pCur->lctx); + if(rc != SQLITE_OK) { + return SQLITE_ERROR; + } + 
pCur->dimensions = llama_n_embd(model); + for(int i = 0; i < pCur->batchSize; i++) { + sqlite3_free(((char **)pCur->contentsArray.z)[i]); + } + array_cleanup(&pCur->contentsArray); + array_cleanup(&pCur->contentLengthsArray); + if(pCur->embeddings) { + sqlite3_free(pCur->embeddings); + pCur->embeddings = NULL; + } + rc = array_init(&pCur->contentsArray, sizeof(char *), 32); + assert(rc == SQLITE_OK); + rc = array_init(&pCur->contentLengthsArray, sizeof(int), 32); + assert(rc == SQLITE_OK); + pCur->iRowid = 0; + pCur->eof = 0; + + rc = embed_batch(pCur); + assert(rc == SQLITE_ROW || rc == SQLITE_DONE); + return SQLITE_OK; +} + +static int lembed_batchEof(sqlite3_vtab_cursor *cur){ + lembed_batch_cursor *pCur = (lembed_batch_cursor*)cur; + return (pCur->batchIdx >= pCur->batchSize) && pCur->eof; +} + + +static int lembed_batchNext(sqlite3_vtab_cursor *cur){ + lembed_batch_cursor *pCur = (lembed_batch_cursor*)cur; + pCur->iRowid++; + pCur->batchIdx++; + if(pCur->batchIdx >= pCur->batchSize) { + int rc; + for(int i = 0; i < pCur->batchSize; i++) { + sqlite3_free(((char **)pCur->contentsArray.z)[i]); + } + array_cleanup(&pCur->contentsArray); + array_cleanup(&pCur->contentLengthsArray); + if(pCur->embeddings) { + sqlite3_free(pCur->embeddings); + pCur->embeddings = NULL; + } + rc = array_init(&pCur->contentsArray, sizeof(char *), 32); + assert(rc == SQLITE_OK); + rc = array_init(&pCur->contentLengthsArray, sizeof(int), 32); + assert(rc == SQLITE_OK); + rc = embed_batch(pCur); + assert(rc == SQLITE_ROW || rc == SQLITE_DONE); + } + return SQLITE_OK; +} + +static int lembed_batchRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){ + lembed_batch_cursor *pCur = (lembed_batch_cursor*)cur; + *pRowid = pCur->iRowid; + return SQLITE_OK; +} + + +static int lembed_batchColumn( + sqlite3_vtab_cursor *cur, + sqlite3_context *context, + int i +){ + lembed_batch_cursor *pCur = (lembed_batch_cursor*)cur; + switch( i ){ + case LEMBED_BATCH_CONTENTS: + sqlite3_result_text( + context, + ((char **)pCur->contentsArray.z)[pCur->batchIdx], + ((int *) pCur->contentLengthsArray.z)[pCur->batchIdx], + SQLITE_TRANSIENT + ); + break; + case LEMBED_BATCH_EMBEDDING: + sqlite3_result_blob( + context, + pCur->embeddings + (pCur->dimensions * pCur->batchIdx), + sizeof(float) * pCur->dimensions, + SQLITE_TRANSIENT + ); + sqlite3_result_subtype(context, 223); // TODO define + break; + default: + sqlite3_result_null(context); + } + return SQLITE_OK; +} + +/* +** This following structure defines all the methods for the +** virtual table. 
+*/ +static sqlite3_module lembed_batchModule = { + /* iVersion */ 3, + /* xCreate */ 0, + /* xConnect */ lembed_batchConnect, + /* xBestIndex */ lembed_batchBestIndex, + /* xDisconnect */ lembed_batchDisconnect, + /* xDestroy */ 0, + /* xOpen */ lembed_batchOpen, + /* xClose */ lembed_batchClose, + /* xFilter */ lembed_batchFilter, + /* xNext */ lembed_batchNext, + /* xEof */ lembed_batchEof, + /* xColumn */ lembed_batchColumn, + /* xRowid */ lembed_batchRowid, + /* xUpdate */ 0, + /* xBegin */ 0, + /* xSync */ 0, + /* xCommit */ 0, + /* xRollback */ 0, + /* xFindMethod */ 0, + /* xRename */ 0, + /* xSavepoint */ 0, + /* xRelease */ 0, + /* xRollbackTo */ 0, + /* xShadowName */ 0, + /* xIntegrity */ 0 +}; +#pragma endregion + #ifndef SQLITE_SUBTYPE #define SQLITE_SUBTYPE 0x000100000 #endif @@ -917,5 +1320,6 @@ __declspec(dllexport) sqlite3_create_module_v2(db, "lembed_chunks", &lembed_chunksModule, a, NULL); sqlite3_create_module_v2(db, "lembed_models", &lembed_modelsModule, a, NULL); + sqlite3_create_module_v2(db, "lembed_batch", &lembed_batchModule, a, NULL); return SQLITE_OK; } diff --git a/test.sql b/test.sql index 20dc3a2..9554775 100644 --- a/test.sql +++ b/test.sql @@ -13,7 +13,100 @@ select lembed_version(), lembed_debug(); INSERT INTO temp.lembed_models(name, model) - select 'all-MiniLM-L6-v2', lembed_model_from_file('models/all-MiniLM-L6-v2-44eb4044.gguf'); + select 'default', lembed_model_from_file('/Users/alex/projects/llama.cpp/all-MiniLM-L6-v2.F16.gguf'); + + +create table articles as + select column1 as headline + from (VALUES + ('Shohei Ohtani''s ex-interpreter pleads guilty to charges related to gambling and theft'), + ('The jury has been selected in Hunter Biden''s gun trial'), + ('Larry Allen, a Super Bowl champion and famed Dallas Cowboy, has died at age 52'), + ('After saying Charlotte, a lone stingray, was pregnant, aquarium now says she''s sick'), + ('An Epoch Times executive is facing money laundering charge'), + ('Hassan Nasrallah’s killing transforms an already deadly regional conflict'), + ('Who was Hassan Nasrallah, the Hezbollah leader killed by Israel?'), + ('What is Hezbollah, the militia fighting Israel in Lebanon?'), + ('Netanyahu defies calls for a cease-fire at the U.N., as Israel strikes Lebanon'), + ('Death toll from Hurricane Helene mounts as aftermath assessment begins'), + ('5 things to know from this week’s big report on cannabis'), + ('VP debates may alter a close race’s dynamic even when they don''t predict the winner'), + ('SpaceX launches ISS-bound crew that hopes to bring home 2 stuck astronauts'), + ('Why the price of eggs is on the rise again'), + ('A guide to your weekend viewing and reading'), + ('At the border in Arizona, Harris lays out a plan to get tough on fentanyl'), + ('A new kind of drug for schizophrenia promises fewer side effects'), + ('Meet the astronauts preparing to travel farther from Earth than any human before'), + ('‘SNL’ has always taken on politics. 
Here’s what works — and why'), + ('Golden-age rappers make a digital-age leap — and survive'), + ('Why Russia''s broadcaster RT turned to covertly funding American pro-Trump influencers'), + ('Read the indictment: NYC Mayor Eric Adams charged with bribery, fraud, foreign donations'), + ('Justice Department sues Alabama, claiming it purged voters too close to the election'), + ('Exactly 66 years ago, another Hurricane Helene rocked the Carolinas'), + ('A meteorologist in Atlanta rescued a woman from Helene floodwaters on camera') + ); + +select * from articles; + +.timer on +select headline, length(lembed( headline)) from articles; + +select + rowid, + contents, + --length(embedding), + vec_to_json(vec_slice(embedding, 0, 8)) +from lembed_batch( + ( + select json_group_array( + json_object( + 'id', rowid, + 'contents', headline + ) + ) from articles + ) +); + +select + rowid, + headline, + vec_to_json(vec_slice(lembed(headline), 0, 8)) +from articles; + +.exit + +select + rowid, + contents, + --length(embedding), + vec_to_json(vec_slice(embedding, 0, 8)), + vec_to_json(vec_slice(lembed(contents), 0, 8)) + +from lembed_batch( + ( + '[ + {"contents": "Shohei Ohtani''s ex-interpreter pleads guilty to charges related to gambling and theft"} + ]' + ) +); +select + rowid, + contents, + --length(embedding), + vec_to_json(vec_slice(embedding, 0, 8)), + vec_to_json(vec_slice(lembed(contents), 0, 8)) + +from lembed_batch( + ( + '[ + {"contents": "Shohei Ohtani''s ex-interpreter pleads guilty to charges related to gambling and theft"}, + {"contents": "The jury has been selected in Hunter Biden''s gun trial"} + ]' + ) +); + + +.exit select vec_length(lembed('all-MiniLM-L6-v2', 'hello')) as embedding; diff --git a/tmp.py b/tmp.py new file mode 100644 index 0000000..2367811 --- /dev/null +++ b/tmp.py @@ -0,0 +1,16 @@ +import sqlite3 +import time +from sentence_transformers import SentenceTransformer + +db = sqlite3.connect("bench/headlines-2024.db") + +sentences = [ + row[0] for row in db.execute("select headline from articles limit 1000").fetchall() +] + +model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") +t0 = time.time() +print(t0) +embeddings = model.encode(sentences) +print(time.time() - t0) +print(embeddings[0][0:8]) diff --git a/tmp.sql b/tmp.sql new file mode 100644 index 0000000..70b5c84 --- /dev/null +++ b/tmp.sql @@ -0,0 +1,49 @@ +.bail on + +.load ./dist/lembed0 +.load ../sqlite-vec/dist/vec0 + +select lembed_version(), lembed_debug(); + +insert into temp.lembed_models(name, model) + --select 'default', lembed_model_from_file('/Users/alex/projects/llama.cpp/all-MiniLM-L6-v2.F16.gguf'); + --select 'default', lembed_model_from_file('./all-MiniLM-L6-v2.e4ce9877.f32.gguf'); + --select 'default', lembed_model_from_file('./all-MiniLM-L6-v2.F32.gguf'); + select 'default', lembed_model_from_file('all-MiniLM-L6-v2.Q6_K.gguf'); + +--select length(lembed('asdf')); +.mode box +.header on +.timer on + +select + rowid, + --contents, + typeof(embedding), + quote(substr(embedding, 0, 8)) + --vec_to_json(vec_slice(embedding, 0, 4)) +from lembed_batch( + ( + select json_group_array( + json_object('contents', headline) + ) + from (select * from articles limit 1000) + ) +); + +select sum(length(lembed(headline))) from (select * from articles limit 1000); + +select + rowid, + --contents, + typeof(embedding), + quote(substr(embedding, 0, 8)) + --vec_to_json(vec_slice(embedding, 0, 4)) +from lembed_batch( + ( + select json_group_array( + json_object('contents', headline) + ) + from (select * from 
articles limit 1000) + ) +); From e33ff77cffc9bc2f1200fb24585bc6f8ebea76ac Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Thu, 21 Nov 2024 17:13:44 -0800 Subject: [PATCH 02/13] rm lembed_chunks, error handling, new columns --- Makefile | 14 +- sqlite-lembed.c | 418 ++++++++++++++++------------------------- test.sql | 17 +- tests/test-loadable.py | 111 ++++++++++- 4 files changed, 297 insertions(+), 263 deletions(-) diff --git a/Makefile b/Makefile index 3608c39..94bcff9 100644 --- a/Makefile +++ b/Makefile @@ -105,8 +105,18 @@ $(MODELS_DIR): $(BUILD_DIR) $(MODELS_DIR)/all-MiniLM-L6-v2.e4ce9877.q8_0.gguf: $(MODELS_DIR) curl -L -o $@ https://huggingface.co/asg017/sqlite-lembed-model-examples/resolve/main/all-MiniLM-L6-v2/all-MiniLM-L6-v2.e4ce9877.q8_0.gguf -test-loadable: $(TARGET_LOADABLE) $(MODELS_DIR)/all-MiniLM-L6-v2.e4ce9877.q8_0.gguf - $(PYTHON) -m pytest tests/test-loadable.py +$(MODELS_DIR)/mxbai-embed-xsmall-v1-q8_0.gguf: $(MODELS_DIR) + curl -L -o $@ https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1/resolve/main/gguf/mxbai-embed-xsmall-v1-q8_0.gguf + +$(MODELS_DIR)/nomic-embed-text-v1.5.Q2_K.gguf: $(MODELS_DIR) + curl -L -o $@ https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.Q2_K.gguf + +models: $(MODELS_DIR)/all-MiniLM-L6-v2.e4ce9877.q8_0.gguf $(MODELS_DIR)/mxbai-embed-xsmall-v1-q8_0.gguf $(MODELS_DIR)/nomic-embed-text-v1.5.Q2_K.gguf + +test-loadable: $(TARGET_LOADABLE) models + $(PYTHON) -m pytest tests/test-loadable.py -s -x -vv +test-loadable-watch: + watchexec -w sqlite-lembed.c -w tests/test-loadable.py -w Makefile --clear -- make test-loadable FORMAT_FILES=sqlite-lembed.c diff --git a/sqlite-lembed.c b/sqlite-lembed.c index d833967..85de08c 100644 --- a/sqlite-lembed.c +++ b/sqlite-lembed.c @@ -52,15 +52,16 @@ int tokenize(struct llama_model *model, const char *input, size_t input_length, return SQLITE_OK; } + int embed_single(struct llama_context *context, const char *input, size_t input_length, /** Output float embedding */ float **out_embedding, /** Output embedding length (n dimensions) */ - int *out_dimensions) { + int *out_dimensions, + char ** errmsg) { struct llama_model * model = (struct llama_model *) llama_get_model(context); - int n_batch = 512; int n_ctx_train = llama_n_ctx_train(model); int n_ctx = llama_n_ctx(context); @@ -75,10 +76,16 @@ int embed_single(struct llama_context *context, int rc = tokenize(model, input, input_length, &token_count, &tokens); if(rc != SQLITE_OK) { // TODO error message + *errmsg = sqlite3_mprintf("Could not tokenize input."); return rc; } - struct llama_batch batch = llama_batch_init(n_batch, 0, 1); + if(token_count > n_ctx) { + *errmsg = sqlite3_mprintf("Input too long, provided %lld tokens, but model has context size of %lld", (int64_t) token_count, (int64_t) n_ctx); + return SQLITE_ERROR; + } + + struct llama_batch batch = llama_batch_init(n_ctx, 0, 1); int seq_id = 0; // llama_batch_add(batch, tokens, 0, ) @@ -98,6 +105,7 @@ int embed_single(struct llama_context *context, if(rc != 0) { sqlite3_free(output_embedding); llama_batch_free(batch); + *errmsg = sqlite3_mprintf("Could not decode batch"); return SQLITE_ERROR; } @@ -111,6 +119,7 @@ int embed_single(struct llama_context *context, if(!source_embedding) { sqlite3_free(output_embedding); llama_batch_free(batch); + *errmsg = sqlite3_mprintf("Could not find embedding"); return SQLITE_ERROR; } @@ -161,22 +170,38 @@ static void lembed_model_size(sqlite3_context *context, int argc, static void 
lembed_model_options_(sqlite3_context *context, int argc, sqlite3_value **argv) { - assert(argc >= 0); - assert(argc % 2 == 0); + + if(argc % 2 == 0) { + sqlite3_result_error(context, "an even number of arguments are required in lembed_model_options, key-value pairs", -1); + return; + } lembed_model_options *o = sqlite3_malloc(sizeof(lembed_model_options)); - assert(o); + if(!o) { + sqlite3_result_error_nomem(context); + return; + } memset(o, 0, sizeof(*o)); for (int i = 0; i < argc; i += 2) { sqlite3_value *key = argv[i]; sqlite3_value *value = argv[i + 1]; - assert(sqlite3_value_type(key) == SQLITE_TEXT); + if(sqlite3_value_type(key) != SQLITE_TEXT) { + char * errmsg = sqlite3_mprintf("Expected string key at index %d", i); + sqlite3_result_error(context, errmsg, -1); + sqlite3_free(errmsg); + sqlite3_free(o); + return; + } const char *k = (const char *)sqlite3_value_text(key); if (sqlite3_stricmp(k, "n_gpu_layers") == 0) { o->n_gpu_layers = sqlite3_value_int(value); o->defined[0] = 1; } else { - abort(); + char * errmsg = sqlite3_mprintf("Unknown model option '%s'", k); + sqlite3_result_error(context, errmsg, -1); + sqlite3_free(errmsg); + sqlite3_free(o); + return; } } sqlite3_result_pointer(context, o, POINTER_NAME_MODEL_OPTIONS, sqlite3_free); @@ -195,25 +220,43 @@ static char *POINTER_NAME_CONTEXT_OPTIONS = "lembed_context_options"; static void lembed_context_options_(sqlite3_context *context, int argc, sqlite3_value **argv) { - assert(argc >= 0); - assert(argc % 2 == 0); + if(argc % 2 == 0) { + sqlite3_result_error(context, "an even number of arguments are required in lembed_context_options, key-value pairs", -1); + return; + } lembed_context_options *o = sqlite3_malloc(sizeof(lembed_context_options)); - assert(o); + if(!o) { + sqlite3_result_error_nomem(context); + return; + } memset(o, 0, sizeof(*o)); for (int i = 0; i < argc; i += 2) { sqlite3_value *key = argv[i]; sqlite3_value *value = argv[i + 1]; - assert(sqlite3_value_type(key) == SQLITE_TEXT); + if(sqlite3_value_type(key) != SQLITE_TEXT) { + char * errmsg = sqlite3_mprintf("Expected string value at index %d", i+1); + sqlite3_result_error(context, errmsg, -1); + sqlite3_free(errmsg); + return; + } const char *k = (const char *)sqlite3_value_text(key); if (sqlite3_stricmp("seed", k) == 0) { sqlite3_int64 v = sqlite3_value_int64(value); - assert(v > 0); + if(v < 0) { + sqlite3_result_error(context, "Expected positive value for seed", -1); + sqlite3_free(o); + return; + } o->seed = v; o->defined[0] = 1; } else if (sqlite3_stricmp("n_ctx", k) == 0) { sqlite3_int64 v = sqlite3_value_int64(value); - assert(v > 0); + if(v < 0) { + sqlite3_result_error(context, "Expected positive value for n_ctx", -1); + sqlite3_free(o); + return; + } o->n_ctx = v; o->defined[1] = 1; } else if (sqlite3_stricmp("rope_scaling_type", k) == 0) { @@ -303,9 +346,10 @@ static void lembed(sqlite3_context *context, int argc, sqlite3_value **argv) { int dimensions; float *embedding; - rc = embed_single(ctx, input, input_len, &embedding, &dimensions); + char * errmsg; + rc = embed_single(ctx, input, input_len, &embedding, &dimensions, &errmsg); if(rc != SQLITE_OK) { - sqlite3_result_error(context, "Error generating embedding", -1); + sqlite3_result_error(context, sqlite3_mprintf("Error generating embedding: %z", errmsg), -1); return; } sqlite3_result_blob(context, embedding, sizeof(float) * dimensions, sqlite3_free); @@ -314,16 +358,42 @@ static void lembed(sqlite3_context *context, int argc, sqlite3_value **argv) { static void 
lembed_tokenize_json(sqlite3_context *context, int argc, sqlite3_value **argv) { + int rc; struct llama_model *model; - int rc = api_model_from_name((struct Api *)sqlite3_user_data(context), + struct llama_context *ctx; + const char *input; + sqlite3_int64 input_len; + + if(argc == 1) { + input = (const char *)sqlite3_value_text(argv[0]); + input_len = sqlite3_value_bytes(argv[0]); + rc = api_model_from_name((struct Api *)sqlite3_user_data(context), "default", strlen("default"), &model, &ctx); + if(rc != SQLITE_OK) { + sqlite3_result_error(context, "No default model has been registered yet with lembed_models", -1); + return; + } + }else { + input = (const char *)sqlite3_value_text(argv[1]); + input_len = sqlite3_value_bytes(argv[1]); + rc = api_model_from_name((struct Api *)sqlite3_user_data(context), (const char *)sqlite3_value_text(argv[0]), - sqlite3_value_bytes(argv[0]), &model, NULL); - const char *input = (const char *)sqlite3_value_text(argv[1]); - sqlite3_int64 input_len = sqlite3_value_bytes(argv[1]); + sqlite3_value_bytes(argv[0]), &model, &ctx); + + if(rc != SQLITE_OK) { + char * zSql = sqlite3_mprintf("Unknown model name '%s'. Was it registered with lembed_models?", sqlite3_value_text(argv[0])); + sqlite3_result_error(context, zSql, -1); + sqlite3_free(zSql); + return; + } + } + int token_count; llama_token *tokens; rc = tokenize(model, input, input_len, &token_count, &tokens); - assert(rc == SQLITE_OK); + if(rc != SQLITE_OK) { + sqlite3_result_error(context, "Failed to tokenize input", -1); + return; + } sqlite3_str *s = sqlite3_str_new(NULL); sqlite3_str_appendchar(s, 1, '['); @@ -335,8 +405,11 @@ static void lembed_tokenize_json(sqlite3_context *context, int argc, } sqlite3_str_appendchar(s, 1, ']'); char *result = sqlite3_str_finish(s); - assert(result); - sqlite3_result_text(context, result, -1, sqlite3_free); + if(!result) { + sqlite3_result_error_nomem(context); + }else { + sqlite3_result_text(context, result, -1, sqlite3_free); + } } static void lembed_token_score(sqlite3_context *context, int argc, @@ -375,6 +448,15 @@ static void ggml_test(sqlite3_context *context, int argc, sqlite3_result_int64(context, ggml_cpu_has_avx()); } + +void vtab_set_error(sqlite3_vtab *pVTab, const char *zFormat, ...) 
{ + va_list args; + sqlite3_free(pVTab->zErrMsg); + va_start(args, zFormat); + pVTab->zErrMsg = sqlite3_vmprintf(zFormat, args); + va_end(args); +} + #pragma region lembed_models() table function typedef struct lembed_models_vtab lembed_models_vtab; @@ -399,9 +481,12 @@ static int lembed_modelsConnect(sqlite3 *db, void *pAux, int argc, } #define LEMBED_MODELS_NAME 0 #define LEMBED_MODELS_MODEL 1 -#define LEMBED_MODELS_MODEL_OPTIONS 2 -#define LEMBED_MODELS_CONTEXT_OPTIONS 3 - rc = sqlite3_declare_vtab(db, "CREATE TABLE x(name, model, model_options " +#define LEMBED_MODELS_DIMENSIONS 2 +#define LEMBED_MODELS_N_CTX 3 +#define LEMBED_MODELS_POOLING_TYPE 4 +#define LEMBED_MODELS_MODEL_OPTIONS 5 +#define LEMBED_MODELS_CONTEXT_OPTIONS 6 + rc = sqlite3_declare_vtab(db, "CREATE TABLE x(name, model, dimensions, n_ctx, pooling_type, model_options " "hidden, context_options hidden)"); if (rc == SQLITE_OK) { pNew = sqlite3_malloc(sizeof(*pNew)); @@ -432,8 +517,13 @@ static int lembed_modelsUpdate(sqlite3_vtab *pVTab, int argc, // INSERT operation else if (argc > 1 && sqlite3_value_type(argv[0]) == SQLITE_NULL) { sqlite3_value **columnValues = &argv[2]; - const char *key = - (const char *)sqlite3_value_text(columnValues[LEMBED_MODELS_NAME]); + const char *key; + if(sqlite3_value_type(columnValues[LEMBED_MODELS_NAME]) == SQLITE_NULL) { + key = "default"; + }else { + key = (const char *)sqlite3_value_text(columnValues[LEMBED_MODELS_NAME]); + } + int idx = -1; for (int i = 0; i < MAX_MODELS; i++) { if (!p->api->models[i].name) { @@ -445,9 +535,18 @@ static int lembed_modelsUpdate(sqlite3_vtab *pVTab, int argc, if (idx < 0) abort(); - const char *modelPath = sqlite3_value_pointer( - columnValues[LEMBED_MODELS_MODEL], POINTER_NAME_MODEL_PATH); - assert(modelPath); + + const char *modelPath; + if(sqlite3_value_subtype(columnValues[LEMBED_MODELS_MODEL]) == POINTER_SUBTYPE) { + modelPath = sqlite3_value_pointer(columnValues[LEMBED_MODELS_MODEL], POINTER_NAME_MODEL_PATH); + } + else if (sqlite3_value_type(columnValues[LEMBED_MODELS_MODEL]) == SQLITE_TEXT) { + modelPath = sqlite3_value_text(columnValues[LEMBED_MODELS_MODEL]); + } + if(!modelPath) { + vtab_set_error(pVTab, "Could not resolve model path"); + return SQLITE_ERROR; + } lembed_model_options *modelOptions = NULL; if (sqlite3_value_subtype(columnValues[LEMBED_MODELS_MODEL_OPTIONS]) == @@ -479,7 +578,7 @@ static int lembed_modelsUpdate(sqlite3_vtab *pVTab, int argc, struct llama_context *ctx; struct llama_context_params cparams = llama_context_default_params(); cparams.embeddings = 1; - cparams.n_ubatch = cparams.n_batch = 512; + //cparams.n_ubatch = cparams.n_batch = 4096; if (contextOptions) { if (contextOptions->defined[0]) { cparams.seed = contextOptions->seed; @@ -585,6 +684,34 @@ static int lembed_modelsColumn(sqlite3_vtab_cursor *cur, sqlite3_result_text(context, p->api->models[pCur->iRowid].name, -1, SQLITE_TRANSIENT); break; + case LEMBED_MODELS_DIMENSIONS: + sqlite3_result_int64(context, llama_n_embd(p->api->models[pCur->iRowid].model)); + break; + case LEMBED_MODELS_N_CTX: + sqlite3_result_int64(context, llama_n_ctx(p->api->models[pCur->iRowid].context)); + break; + case LEMBED_MODELS_POOLING_TYPE: { + switch(llama_pooling_type(p->api->models[pCur->iRowid].context)) { + case LLAMA_POOLING_TYPE_NONE: { + sqlite3_result_text(context, "none", -1, SQLITE_STATIC); + break; + } + case LLAMA_POOLING_TYPE_MEAN: { + sqlite3_result_text(context, "mean", -1, SQLITE_STATIC); + break; + } + case LLAMA_POOLING_TYPE_CLS: { + sqlite3_result_text(context, 
"cls", -1, SQLITE_STATIC); + break; + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: { + sqlite3_result_text(context, "unspecified", -1, SQLITE_STATIC); + break; + } + } + break; + } + case LEMBED_MODELS_MODEL: sqlite3_result_pointer(context, p->api->models[pCur->iRowid].model, POINTER_NAME_MODEL, NULL); @@ -620,221 +747,6 @@ static sqlite3_module lembed_modelsModule = { /* xShadowName */ 0}; #pragma endregion -#pragma region lembed_chunks() table function - -typedef struct lembed_chunks_vtab lembed_chunks_vtab; -struct lembed_chunks_vtab { - sqlite3_vtab base; - struct Api *api; -}; - -typedef struct lembed_chunks_cursor lembed_chunks_cursor; -struct lembed_chunks_cursor { - sqlite3_vtab_cursor base; - sqlite3_int64 iRowid; - int32_t chunks_count; - char **chunks; -}; - -static int lembed_chunksConnect(sqlite3 *db, void *pAux, int argc, - const char *const *argv, sqlite3_vtab **ppVtab, - char **pzErr) { - lembed_chunks_vtab *pNew; - int rc; -#define lembed_chunks_CONTENTS 0 -#define lembed_chunks_TOKEN_COUNT 1 -#define lembed_chunks_SOURCE 2 -#define lembed_chunks_CHUNK_SIZE 3 - rc = sqlite3_declare_vtab(db, "CREATE TABLE x(contents, token_count, source " - "hidden, chunk_size hidden)"); - if (rc == SQLITE_OK) { - pNew = sqlite3_malloc(sizeof(*pNew)); - *ppVtab = (sqlite3_vtab *)pNew; - if (pNew == 0) - return SQLITE_NOMEM; - memset(pNew, 0, sizeof(*pNew)); - pNew->api = pAux; - } - return rc; -} - -static int lembed_chunksDisconnect(sqlite3_vtab *pVtab) { - lembed_chunks_vtab *p = (lembed_chunks_vtab *)pVtab; - sqlite3_free(p); - return SQLITE_OK; -} - -static int lembed_chunksOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor) { - lembed_chunks_cursor *pCur; - pCur = sqlite3_malloc(sizeof(*pCur)); - if (pCur == 0) - return SQLITE_NOMEM; - memset(pCur, 0, sizeof(*pCur)); - *ppCursor = &pCur->base; - return SQLITE_OK; -} - -static int lembed_chunksClose(sqlite3_vtab_cursor *cur) { - lembed_chunks_cursor *pCur = (lembed_chunks_cursor *)cur; - sqlite3_free(pCur); - return SQLITE_OK; -} - -static int lembed_chunksBestIndex(sqlite3_vtab *pVTab, - sqlite3_index_info *pIdxInfo) { - int hasSource = 0; - int idxChunkSize = -1; - for (int i = 0; i < pIdxInfo->nConstraint; i++) { - const struct sqlite3_index_constraint *pCons = &pIdxInfo->aConstraint[i]; - switch (pCons->iColumn) { - case lembed_chunks_SOURCE: { - if (!hasSource && !pCons->usable || - pCons->op != SQLITE_INDEX_CONSTRAINT_EQ) - return SQLITE_CONSTRAINT; - hasSource = 1; - pIdxInfo->aConstraintUsage[i].argvIndex = 1; - pIdxInfo->aConstraintUsage[i].omit = 1; - break; - } - case lembed_chunks_CHUNK_SIZE: { - } - } - } - if (!hasSource) { - pVTab->zErrMsg = sqlite3_mprintf("source argument is required"); - return SQLITE_ERROR; - } - - pIdxInfo->idxNum = 1; - pIdxInfo->estimatedCost = (double)10; - pIdxInfo->estimatedRows = 10; - return SQLITE_OK; -} - -static int lembed_chunksFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum, - const char *idxStr, int argc, - sqlite3_value **argv) { - lembed_chunks_cursor *pCur = (lembed_chunks_cursor *)pVtabCursor; - struct Api *api = ((lembed_chunks_vtab *)pVtabCursor->pVtab)->api; - struct llama_model *model; - int rc = api_model_from_name(api, (const char *)sqlite3_value_text(argv[0]), - sqlite3_value_bytes(argv[0]), &model, NULL); - pCur->iRowid = 0; - - char *input = (char *)sqlite3_value_text(argv[1]); - sqlite3_int64 input_len = sqlite3_value_bytes(argv[1]); - int32_t chunk_size = 5; // sqlite3_value_int(argv[1]); - int32_t overlap = 0; // argc > 2 ? 
sqlite3_value_int(argv[2]) : 0; - - int token_count; - llama_token *tokens; - rc = tokenize(model, input, input_len, &token_count, &tokens); - assert(rc == SQLITE_OK); - - char *ptr = input; - int nchunks = ceil(1.0 * token_count / chunk_size); - pCur->chunks_count = nchunks; - pCur->chunks = sqlite3_malloc(sizeof(char *) * nchunks); - assert(pCur->chunks); - - for (int i = 0; i < nchunks; i++) { - sqlite3_str *str_chunk = sqlite3_str_new(NULL); - assert(str_chunk); - - for (int j = 0; j < chunk_size; j++) { - int32_t token = tokens[i * chunk_size + j]; - int32_t piece_len_neg = - llama_token_to_piece(model, token, NULL, 0, false); - // printf("%d\n", piece_len_neg); - // assert(piece_len_neg < 0); - int32_t piece_len = abs(piece_len_neg); - // include prefix space? - // assert(piece_len > 1); - if (!piece_len) - continue; - - char *piece = sqlite3_malloc(piece_len); - assert(piece); - llama_token_to_piece(model, token, piece, piece_len, false); - // printf("'%.*s' %d ", piece_len, piece, tokens[i*chunk_size + j]); - - char *begin = ptr; - while (*ptr != piece[piece_len > 1 ? 1 : 0]) { - ptr++; - } - sqlite3_str_append(str_chunk, begin, ptr - begin + piece_len); - ptr += piece_len; - - sqlite3_free(piece); - } - - char *chunk = sqlite3_str_finish(str_chunk); - assert(chunk); - pCur->chunks[i] = chunk; - } - - return SQLITE_OK; -} - -static int lembed_chunksRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid) { - lembed_chunks_cursor *pCur = (lembed_chunks_cursor *)cur; - *pRowid = pCur->iRowid; - return SQLITE_OK; -} - -static int lembed_chunksNext(sqlite3_vtab_cursor *cur) { - lembed_chunks_cursor *pCur = (lembed_chunks_cursor *)cur; - pCur->iRowid++; - return SQLITE_OK; -} - -static int lembed_chunksEof(sqlite3_vtab_cursor *cur) { - lembed_chunks_cursor *pCur = (lembed_chunks_cursor *)cur; - return pCur->iRowid >= pCur->chunks_count; -} - -static int lembed_chunksColumn(sqlite3_vtab_cursor *cur, - sqlite3_context *context, int i) { - lembed_chunks_cursor *pCur = (lembed_chunks_cursor *)cur; - switch (i) { - case lembed_chunks_CONTENTS: - sqlite3_result_text(context, pCur->chunks[pCur->iRowid], -1, SQLITE_STATIC); - break; - case lembed_chunks_SOURCE: - // TODO - sqlite3_result_null(context); - break; - } - return SQLITE_OK; -} - -static sqlite3_module lembed_chunksModule = { - /* iVersion */ 0, - /* xCreate */ 0, - /* xConnect */ lembed_chunksConnect, - /* xBestIndex */ lembed_chunksBestIndex, - /* xDisconnect */ lembed_chunksDisconnect, - /* xDestroy */ 0, - /* xOpen */ lembed_chunksOpen, - /* xClose */ lembed_chunksClose, - /* xFilter */ lembed_chunksFilter, - /* xNext */ lembed_chunksNext, - /* xEof */ lembed_chunksEof, - /* xColumn */ lembed_chunksColumn, - /* xRowid */ lembed_chunksRowid, - /* xUpdate */ 0, - /* xBegin */ 0, - /* xSync */ 0, - /* xCommit */ 0, - /* xRollback */ 0, - /* xFindMethod */ 0, - /* xRename */ 0, - /* xSavepoint */ 0, - /* xRelease */ 0, - /* xRollbackTo */ 0, - /* xShadowName */ 0}; -#pragma endregion - #pragma region lembed_batch @@ -1019,7 +931,7 @@ static int lembed_batchBestIndex( int embed_batch( lembed_batch_cursor *pCur ) { - int32_t n_batch = 512; + uint32_t n_batch = llama_n_ctx(pCur->lctx); struct llama_batch batch = llama_batch_init(n_batch, 0, 1); int nprocessed = 0; int rc; @@ -1060,8 +972,8 @@ int embed_batch( nprocessed += 1; char * zCopy = sqlite3_mprintf("%.*s", len, s); assert(zCopy); - assert(array_append(&pCur->contentsArray, &zCopy) == SQLITE_OK); - assert(array_append(&pCur->contentLengthsArray, &len) == SQLITE_OK); + 
array_append(&pCur->contentsArray, &zCopy) == SQLITE_OK;//assert(); + array_append(&pCur->contentLengthsArray, &len) == SQLITE_OK;//assert(); pCur->stmtRc = sqlite3_step(pCur->stmt); } if(nprocessed==0) { @@ -1263,7 +1175,9 @@ __declspec(dllexport) llama_log_set(dummy_log, NULL); struct Api *a = sqlite3_malloc(sizeof(struct Api)); - assert(a); + if(!a) { + return SQLITE_NOMEM; + } memset(a, 0, sizeof(*a)); int rc = SQLITE_OK; @@ -1298,6 +1212,7 @@ __declspec(dllexport) // clang-format off {"lembed", lembed, 1}, {"lembed", lembed, 2}, + {"lembed_tokenize_json", lembed_tokenize_json, 1}, {"lembed_tokenize_json", lembed_tokenize_json, 2}, {"lembed_token_score", lembed_token_score, 2}, {"lembed_token_to_piece", lembed_token_to_piece_, 2}, @@ -1318,7 +1233,6 @@ __declspec(dllexport) sqlite3_create_function_v2(db, "_lembed_api", 0, 0, a, _noop, NULL, NULL, api_free); - sqlite3_create_module_v2(db, "lembed_chunks", &lembed_chunksModule, a, NULL); sqlite3_create_module_v2(db, "lembed_models", &lembed_modelsModule, a, NULL); sqlite3_create_module_v2(db, "lembed_batch", &lembed_batchModule, a, NULL); return SQLITE_OK; diff --git a/test.sql b/test.sql index 9554775..88c7669 100644 --- a/test.sql +++ b/test.sql @@ -15,7 +15,6 @@ select lembed_version(), lembed_debug(); INSERT INTO temp.lembed_models(name, model) select 'default', lembed_model_from_file('/Users/alex/projects/llama.cpp/all-MiniLM-L6-v2.F16.gguf'); - create table articles as select column1 as headline from (VALUES @@ -46,6 +45,22 @@ create table articles as ('A meteorologist in Atlanta rescued a woman from Helene floodwaters on camera') ); +select + contents, + vec_to_json(vec_slice(embedding, 0, 8)) +from lembed_batch( + ( + select json_group_array( + json_object( + 'id', rowid, + 'contents', headline + ) + ) from articles + ) +); + + +.exit select * from articles; .timer on diff --git a/tests/test-loadable.py b/tests/test-loadable.py index bf1166c..b47ddbe 100644 --- a/tests/test-loadable.py +++ b/tests/test-loadable.py @@ -8,6 +8,8 @@ EXT_PATH = "./dist/lembed0" MODEL1_PATH = "./dist/.models/all-MiniLM-L6-v2.e4ce9877.q8_0.gguf" +MODEL2_PATH = "./dist/.models/mxbai-embed-xsmall-v1-q8_0.gguf" +MODEL3_PATH = "./dist/.models/nomic-embed-text-v1.5.Q2_K.gguf" def connect(ext, path=":memory:", extra_entrypoint=None): @@ -71,10 +73,11 @@ def _raises(message, error=sqlite3.OperationalError): "lembed_token_score", "lembed_token_to_piece", "lembed_tokenize_json", + "lembed_tokenize_json", "lembed_version", ] MODULES = [ - "lembed_chunks", + "lembed_batch", "lembed_models", ] @@ -110,13 +113,14 @@ def test_lembed_debug(): def test_lembed(): + lembed = lambda *args: db.execute( + "select lembed({})".format(spread_args(args)), args + ).fetchone()[0] + db.execute( "insert into temp.lembed_models(name, model) values (?, lembed_model_from_file(?))", ["aaa", MODEL1_PATH], ) - lembed = lambda *args: db.execute( - "select lembed({})".format(spread_args(args)), args - ).fetchone()[0] a = lembed("aaa", "alex garcia") assert len(a) == (384 * 4) assert struct.unpack("1f", a[0:4])[0] == pytest.approx( @@ -128,6 +132,36 @@ def test_lembed(): ): lembed("aaaaaaaaa", "alex garcia") +def test_lembed_multiple(): + db = connect(EXT_PATH) + lembed = lambda *args: db.execute( + "select lembed({})".format(spread_args(args)), args + ).fetchone()[0] + + db.execute( + "insert into temp.lembed_models(name, model) values (?, ?), (?, ?), (?, ?)", + ["aaa", MODEL1_PATH, "bbb", MODEL2_PATH, "ccc", MODEL3_PATH], + ) + a = lembed("aaa", "alex garcia") + b = lembed("bbb", "alex 
garcia") + c = lembed("ccc", "alex garcia") + assert len(a) == (384 * 4) + assert len(b) == (384 * 4) + assert len(c) == (768 * 4) + + assert execute_all(db, "select * from lembed_models") == [ + {"name": "aaa", "model": None, "dimensions": 384, "n_ctx": 512, "pooling_type": "none"}, + {"name": "bbb", "model": None, "dimensions": 384, "n_ctx": 512, "pooling_type": "mean"}, + {"name": "ccc", "model": None, "dimensions": 768, "n_ctx": 512, "pooling_type": "mean"}, + ] + + +def test_lembed_default(): + db = connect(EXT_PATH) + lembed = lambda *args: db.execute( + "select lembed({})".format(spread_args(args)), args + ).fetchone()[0] + with _raises("No default model has been registered yet with lembed_models"): lembed("alex garcia") @@ -141,6 +175,68 @@ def test_lembed(): -0.09205757826566696, rel=1e-2 ) + # test 2: try with NULL name + db = connect(EXT_PATH) + lembed = lambda *args: db.execute( + "select lembed({})".format(spread_args(args)), args + ).fetchone()[0] + + with _raises("No default model has been registered yet with lembed_models"): + lembed("alex garcia") + + db.execute( + "insert into temp.lembed_models(model) values (lembed_model_from_file(?))", + [MODEL1_PATH], + ) + a = lembed("alex garcia") + assert len(a) == (384 * 4) + assert struct.unpack("1f", a[0:4])[0] == pytest.approx( + -0.09205757826566696, rel=1e-2 + ) + + # test 3: try text path to model + db = connect(EXT_PATH) + lembed = lambda *args: db.execute( + "select lembed({})".format(spread_args(args)), args + ).fetchone()[0] + + with _raises("No default model has been registered yet with lembed_models"): + lembed("alex garcia") + + db.execute( + "insert into temp.lembed_models(model) values (?)", + [MODEL1_PATH], + ) + a = lembed("alex garcia") + assert len(a) == (384 * 4) + assert struct.unpack("1f", a[0:4])[0] == pytest.approx( + -0.09205757826566696, rel=1e-2 + ) + +def test_stress_mxbai_xsmall(): + db = connect(EXT_PATH) + lembed = lambda *args: db.execute( + "select lembed({})".format(spread_args(args)), args + ).fetchone()[0] + + with _raises("No default model has been registered yet with lembed_models"): + lembed("alex garcia") + + db.execute( + "insert into temp.lembed_models(name, model) values (?, lembed_model_from_file(?))", + ["default", MODEL1_PATH], + ) + assert len(lembed("a " * 256)) == 384*4 + #print(db.execute('select lembed_tokenize_json(\'a a a a\') as x').fetchone()["x"]) + + # including start and end token, this is 512 tokens, max ctx size for all-mini + lembed("a " * (510)) + + with _raises("Error generating embedding: Input too long, provided 513 tokens, but model has context size of 512"): + lembed("a " * (511)) + + with _raises("Error generating embedding: Input too long, provided 4098 tokens, but model has context size of 512"): + lembed("a " * (4096)) @pytest.mark.skip(reason="TODO") def test__lembed_api(): @@ -203,11 +299,10 @@ def test_lembed_token_to_piece(): ).fetchone()[0] pass - @pytest.mark.skip(reason="TODO") -def test_lembed_chunks(): - lembed_chunks = lambda *args: db.execute( - "select * from lembed_chunks()", args +def test_lembed_batch(): + lembed_batch = lambda *args: db.execute( + "select * from lembed_batch()", args ).fetchone()[0] pass From 62ddd9a518d44f582adb7a5c99a543abf27c9d3e Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Thu, 21 Nov 2024 17:29:21 -0800 Subject: [PATCH 03/13] cleanup --- bench/batch.sh | 1 - debug.sql | 46 ---------------------------------------------- tmp.py | 16 ---------------- tmp.sql | 49 ------------------------------------------------- 4 files 
changed, 112 deletions(-) delete mode 100644 bench/batch.sh delete mode 100644 debug.sql delete mode 100644 tmp.py delete mode 100644 tmp.sql diff --git a/bench/batch.sh b/bench/batch.sh deleted file mode 100644 index a9bf588..0000000 --- a/bench/batch.sh +++ /dev/null @@ -1 +0,0 @@ -#!/bin/bash diff --git a/debug.sql b/debug.sql deleted file mode 100644 index 8c8051e..0000000 --- a/debug.sql +++ /dev/null @@ -1,46 +0,0 @@ -.load ./dist/lembed0 -.load ../sqlite-vec/dist/vec0 - -select lembed_version(), lembed_debug(); - -insert into temp.lembed_models(name, model) - select 'default', lembed_model_from_file(''); - -select vec_to_json(vec_slice(lembed('Shohei Ohtani''s ex-interpreter pleads guilty to charges related to gambling and theft'), 0, 8)); -select vec_to_json(vec_slice(lembed('The jury has been selected in Hunter Biden''s gun trial'), 0, 8)); - - -.mode box -.header on - -select - rowid, - contents, - --length(embedding), - vec_to_json(vec_slice(embedding, 0, 8)), - vec_to_json(vec_slice(lembed(contents), 0, 8)) - -from lembed_batch( - ( - '[ - {"contents": "Shohei Ohtani''s ex-interpreter pleads guilty to charges related to gambling and theft"} - ]' - ) -); - - -select - rowid, - contents, - --length(embedding), - vec_to_json(vec_slice(embedding, 0, 8)), - vec_to_json(vec_slice(lembed(contents), 0, 8)) - -from lembed_batch( - ( - '[ - {"contents": "Shohei Ohtani''s ex-interpreter pleads guilty to charges related to gambling and theft"}, - {"contents": "The jury has been selected in Hunter Biden''s gun trial"} - ]' - ) -); diff --git a/tmp.py b/tmp.py deleted file mode 100644 index 2367811..0000000 --- a/tmp.py +++ /dev/null @@ -1,16 +0,0 @@ -import sqlite3 -import time -from sentence_transformers import SentenceTransformer - -db = sqlite3.connect("bench/headlines-2024.db") - -sentences = [ - row[0] for row in db.execute("select headline from articles limit 1000").fetchall() -] - -model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") -t0 = time.time() -print(t0) -embeddings = model.encode(sentences) -print(time.time() - t0) -print(embeddings[0][0:8]) diff --git a/tmp.sql b/tmp.sql deleted file mode 100644 index 70b5c84..0000000 --- a/tmp.sql +++ /dev/null @@ -1,49 +0,0 @@ -.bail on - -.load ./dist/lembed0 -.load ../sqlite-vec/dist/vec0 - -select lembed_version(), lembed_debug(); - -insert into temp.lembed_models(name, model) - --select 'default', lembed_model_from_file('/Users/alex/projects/llama.cpp/all-MiniLM-L6-v2.F16.gguf'); - --select 'default', lembed_model_from_file('./all-MiniLM-L6-v2.e4ce9877.f32.gguf'); - --select 'default', lembed_model_from_file('./all-MiniLM-L6-v2.F32.gguf'); - select 'default', lembed_model_from_file('all-MiniLM-L6-v2.Q6_K.gguf'); - ---select length(lembed('asdf')); -.mode box -.header on -.timer on - -select - rowid, - --contents, - typeof(embedding), - quote(substr(embedding, 0, 8)) - --vec_to_json(vec_slice(embedding, 0, 4)) -from lembed_batch( - ( - select json_group_array( - json_object('contents', headline) - ) - from (select * from articles limit 1000) - ) -); - -select sum(length(lembed(headline))) from (select * from articles limit 1000); - -select - rowid, - --contents, - typeof(embedding), - quote(substr(embedding, 0, 8)) - --vec_to_json(vec_slice(embedding, 0, 4)) -from lembed_batch( - ( - select json_group_array( - json_object('contents', headline) - ) - from (select * from articles limit 1000) - ) -); From 880d938c02920ebc24a37d58469789b636a8377a Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 22 Nov 2024 12:16:54 
-0800 Subject: [PATCH 04/13] improve cmakelist, sqlite3 cli target --- CMakeLists.txt | 70 +++++++++++++++++++++++++++++++++++-------------- Makefile | 10 +++++-- core_init.c | 12 +++++++++ sqlite-lembed.c | 32 +++++++++++----------- test.sql | 10 ++++--- 5 files changed, 93 insertions(+), 41 deletions(-) create mode 100644 core_init.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 8fdef01..ad76302 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,42 +4,72 @@ project(SqliteEmbed C CXX) set(CMAKE_C_STANDARD 99) set(CMAKE_C_STANDARD_REQUIRED ON) -set(LLAMA_METAL OFF) -set(LLAMA_STATIC ON) -set(LLAMA_OPENMP OFF) - -set(LLAMA_CPP_DIR "${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp") -set(SQLITE_AMALGAMATION_DIR "${CMAKE_CURRENT_SOURCE_DIR}/vendor/sqlite") +include(FetchContent) -include(ExternalProject) -set(SQLITE_VERSION 3450300) +# sqlite amalgamation, for up-to-date headers and sqlite3 CLI +set(SQLITE_VERSION 3470000) set(SQLITE_YEAR 2024) -set(SQLITE_URL https://www.sqlite.org/${SQLITE_YEAR}/sqlite-amalgamation-${SQLITE_VERSION}.zip) -ExternalProject_Add(sqlite_amalgamation - URL ${SQLITE_URL} - DOWNLOAD_DIR ${CMAKE_BINARY_DIR}/downloads - SOURCE_DIR ${SQLITE_AMALGAMATION_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" -) +set(SQLITE_URL) +FetchContent_Declare( + sqlite_amalgamation + URL https://www.sqlite.org/${SQLITE_YEAR}/sqlite-amalgamation-${SQLITE_VERSION}.zip + ) +FetchContent_MakeAvailable(sqlite_amalgamation) +# llama.cpp +set(LLAMA_METAL OFF) +set(LLAMA_STATIC ON) +set(LLAMA_OPENMP OFF) +set(LLAMA_CPP_DIR "${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp") add_subdirectory(${LLAMA_CPP_DIR} ${CMAKE_BINARY_DIR}/llama.cpp) -include_directories(${SQLITE_AMALGAMATION_DIR}) +# sqlite-lembed loadable add_library(sqlite_lembed SHARED sqlite-lembed.c) -add_dependencies(sqlite_lembed sqlite_amalgamation) target_link_libraries(sqlite_lembed ggml_static llama) target_include_directories(sqlite_lembed PRIVATE ${LLAMA_CPP_DIR}) +target_include_directories(sqlite_lembed PRIVATE ${sqlite_amalgamation_SOURCE_DIR}) set_target_properties(sqlite_lembed PROPERTIES PREFIX "") set_target_properties(sqlite_lembed PROPERTIES OUTPUT_NAME "lembed0") +# sqlite-lembed static add_library(sqlite_lembed_static STATIC sqlite-lembed.c) -add_dependencies(sqlite_lembed_static sqlite_amalgamation) target_link_libraries(sqlite_lembed_static ggml_static llama) target_include_directories(sqlite_lembed_static PRIVATE ${LLAMA_CPP_DIR}) +target_include_directories(sqlite_lembed_static PRIVATE ${sqlite_amalgamation_SOURCE_DIR}) target_compile_definitions(sqlite_lembed_static PRIVATE SQLITE_CORE) set_target_properties(sqlite_lembed_static PROPERTIES OUTPUT_NAME "sqlite_lembed0") + + +# sqlite-vec, for a better sqlite3 CLI +set(SQLITE_VEC_VERSION 0.1.6) +FetchContent_Declare( + sqlite_vec + URL https://github.com/asg017/sqlite-vec/releases/download/v${SQLITE_VEC_VERSION}/sqlite-vec-${SQLITE_VEC_VERSION}-amalgamation.tar.gz +) +FetchContent_MakeAvailable(sqlite_vec) + + +# sqlite3 CLI, with sqlite-lembed and sqlite-vec +add_executable( + sqlite3_cli + ${sqlite_amalgamation_SOURCE_DIR}/shell.c + ${sqlite_amalgamation_SOURCE_DIR}/sqlite3.c + ${sqlite_vec_SOURCE_DIR}/sqlite-vec.c + core_init.c +) +add_dependencies(sqlite3_cli sqlite_lembed_static) +target_link_libraries(sqlite3_cli sqlite_lembed_static) +target_include_directories( + sqlite3_cli PRIVATE + ${sqlite_amalgamation_SOURCE_DIR} + ${sqlite_vec_SOURCE_DIR} +) +target_compile_definitions( + sqlite3_cli PUBLIC + SQLITE_EXTRA_INIT=core_init + 
SQLITE_CORE +) +set_target_properties(sqlite3_cli PROPERTIES OUTPUT_NAME "sqlite3") diff --git a/Makefile b/Makefile index 94bcff9..cc09b58 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ COMMIT=$(shell git rev-parse HEAD) VERSION=$(shell cat VERSION) DATE=$(shell date +'%FT%TZ%z') -LLAMA_CMAKE_FLAGS=-DLLAMA_OPENMP=OFF +LLAMA_CMAKE_FLAGS+=-DLLAMA_OPENMP=OFF ifndef CC CC=gcc endif @@ -55,10 +55,11 @@ $(prefix): TARGET_LOADABLE=$(prefix)/lembed0.$(LOADABLE_EXTENSION) TARGET_STATIC=$(prefix)/libsqlite_lembed0.a TARGET_STATIC_H=$(prefix)/sqlite-lembed.h +TARGET_CLI=$(prefix)/sqlite3 loadable: $(TARGET_LOADABLE) static: $(TARGET_STATIC) - +cli: $(TARGET_CLI) BUILD_DIR=$(prefix)/.build @@ -90,6 +91,11 @@ $(TARGET_STATIC): sqlite-lembed.c sqlite-lembed.h $(BUILD_DIR) $(prefix) ls $(BUILD_DIR) cp $(BUILT_LOADABLE_PATH) $@ +$(TARGET_CLI): sqlite-lembed.c sqlite-lembed.h $(BUILD_DIR) $(prefix) + cmake --build $(BUILD_DIR) -t sqlite3_cli $(EXTRA_CMAKE_BUILD) + ls $(BUILD_DIR) + cp $(BUILD_DIR)/sqlite3 $@ + sqlite-lembed.h: sqlite-lembed.h.tmpl VERSION VERSION=$(shell cat VERSION) \ diff --git a/core_init.c b/core_init.c new file mode 100644 index 0000000..f8ee09f --- /dev/null +++ b/core_init.c @@ -0,0 +1,12 @@ +#include "sqlite3.h" +#include "sqlite-vec.h" +#include "sqlite-lembed.h" +#include +int core_init(const char *dummy) { + int rc; + rc = sqlite3_auto_extension((void *)sqlite3_vec_init); + if(rc == SQLITE_OK) { + rc = sqlite3_auto_extension((void *)sqlite3_lembed_init); + } + return rc; +} diff --git a/sqlite-lembed.c b/sqlite-lembed.c index 85de08c..1b5be47 100644 --- a/sqlite-lembed.c +++ b/sqlite-lembed.c @@ -449,7 +449,7 @@ static void ggml_test(sqlite3_context *context, int argc, } -void vtab_set_error(sqlite3_vtab *pVTab, const char *zFormat, ...) { +void lembed_vtab_set_error(sqlite3_vtab *pVTab, const char *zFormat, ...) { va_list args; sqlite3_free(pVTab->zErrMsg); va_start(args, zFormat); @@ -544,7 +544,7 @@ static int lembed_modelsUpdate(sqlite3_vtab *pVTab, int argc, modelPath = sqlite3_value_text(columnValues[LEMBED_MODELS_MODEL]); } if(!modelPath) { - vtab_set_error(pVTab, "Could not resolve model path"); + lembed_vtab_set_error(pVTab, "Could not resolve model path"); return SQLITE_ERROR; } @@ -766,7 +766,7 @@ struct Array { * @return SQLITE_OK on success, error code on failure. 
Only error is * SQLITE_NOMEM */ -int array_init(struct Array *array, size_t element_size, size_t init_capacity) { +int lembed_array_init(struct Array *array, size_t element_size, size_t init_capacity) { int sz = element_size * init_capacity; void *z = sqlite3_malloc(sz); if (!z) { @@ -781,7 +781,7 @@ int array_init(struct Array *array, size_t element_size, size_t init_capacity) { return SQLITE_OK; } -int array_append(struct Array *array, const void *element) { +int lembed_array_append(struct Array *array, const void *element) { if (array->length == array->capacity) { size_t new_capacity = array->capacity * 2 + 100; void *z = sqlite3_realloc64(array->z, array->element_size * new_capacity); @@ -798,7 +798,7 @@ int array_append(struct Array *array, const void *element) { return SQLITE_OK; } -void array_cleanup(struct Array *array) { +void lembed_array_cleanup(struct Array *array) { if (!array) return; array->element_size = 0; @@ -899,8 +899,8 @@ static int lembed_batchBestIndex( sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo ){ - int hasSource = 0; + for (int i = 0; i < pIdxInfo->nConstraint; i++) { const struct sqlite3_index_constraint *pCons = &pIdxInfo->aConstraint[i]; switch (pCons->iColumn) { @@ -972,8 +972,8 @@ int embed_batch( nprocessed += 1; char * zCopy = sqlite3_mprintf("%.*s", len, s); assert(zCopy); - array_append(&pCur->contentsArray, &zCopy) == SQLITE_OK;//assert(); - array_append(&pCur->contentLengthsArray, &len) == SQLITE_OK;//assert(); + lembed_array_append(&pCur->contentsArray, &zCopy) == SQLITE_OK;//assert(); + lembed_array_append(&pCur->contentLengthsArray, &len) == SQLITE_OK;//assert(); pCur->stmtRc = sqlite3_step(pCur->stmt); } if(nprocessed==0) { @@ -1029,15 +1029,15 @@ static int lembed_batchFilter( for(int i = 0; i < pCur->batchSize; i++) { sqlite3_free(((char **)pCur->contentsArray.z)[i]); } - array_cleanup(&pCur->contentsArray); - array_cleanup(&pCur->contentLengthsArray); + lembed_array_cleanup(&pCur->contentsArray); + lembed_array_cleanup(&pCur->contentLengthsArray); if(pCur->embeddings) { sqlite3_free(pCur->embeddings); pCur->embeddings = NULL; } - rc = array_init(&pCur->contentsArray, sizeof(char *), 32); + rc = lembed_array_init(&pCur->contentsArray, sizeof(char *), 32); assert(rc == SQLITE_OK); - rc = array_init(&pCur->contentLengthsArray, sizeof(int), 32); + rc = lembed_array_init(&pCur->contentLengthsArray, sizeof(int), 32); assert(rc == SQLITE_OK); pCur->iRowid = 0; pCur->eof = 0; @@ -1062,15 +1062,15 @@ static int lembed_batchNext(sqlite3_vtab_cursor *cur){ for(int i = 0; i < pCur->batchSize; i++) { sqlite3_free(((char **)pCur->contentsArray.z)[i]); } - array_cleanup(&pCur->contentsArray); - array_cleanup(&pCur->contentLengthsArray); + lembed_array_cleanup(&pCur->contentsArray); + lembed_array_cleanup(&pCur->contentLengthsArray); if(pCur->embeddings) { sqlite3_free(pCur->embeddings); pCur->embeddings = NULL; } - rc = array_init(&pCur->contentsArray, sizeof(char *), 32); + rc = lembed_array_init(&pCur->contentsArray, sizeof(char *), 32); assert(rc == SQLITE_OK); - rc = array_init(&pCur->contentLengthsArray, sizeof(int), 32); + rc = lembed_array_init(&pCur->contentLengthsArray, sizeof(int), 32); assert(rc == SQLITE_OK); rc = embed_batch(pCur); assert(rc == SQLITE_ROW || rc == SQLITE_DONE); diff --git a/test.sql b/test.sql index 88c7669..49af8ff 100644 --- a/test.sql +++ b/test.sql @@ -13,10 +13,12 @@ select lembed_version(), lembed_debug(); INSERT INTO temp.lembed_models(name, model) - select 'default', 
lembed_model_from_file('/Users/alex/projects/llama.cpp/all-MiniLM-L6-v2.F16.gguf'); + select 'default', 'dist/.models/mxbai-embed-xsmall-v1-q8_0.gguf'; create table articles as - select column1 as headline + select + column1 as headline, + random() % 100 as random from (VALUES ('Shohei Ohtani''s ex-interpreter pleads guilty to charges related to gambling and theft'), ('The jury has been selected in Hunter Biden''s gun trial'), @@ -46,6 +48,7 @@ create table articles as ); select + *, contents, vec_to_json(vec_slice(embedding, 0, 8)) from lembed_batch( @@ -53,7 +56,8 @@ from lembed_batch( select json_group_array( json_object( 'id', rowid, - 'contents', headline + 'contents', headline, + 'random', random ) ) from articles ) From 40dafb363e879ddd12cd6db1a2263baeeb69dda7 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 22 Nov 2024 12:33:07 -0800 Subject: [PATCH 05/13] try cosmo build --- .github/workflows/test.yaml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 049d4cd..e497497 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -3,6 +3,7 @@ on: push: branches: - main + - batching permissions: contents: read jobs: @@ -63,3 +64,25 @@ jobs: with: name: sqlite-lembed-windows-x86_64-extension path: dist/*.dll + build-cosmopolitan: + runs-on: macos-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + - run: | + mkdir $HOME/cosmo + curl -L -o cosmocc-$COSMO_VERSION.zip https://github.com/jart/cosmopolitan/releases/download/$COSMO_VERSION/cosmocc-$COSMO_VERSION.zip + unzip cosmocc-$COSMO_VERSION.zip -d $HOME/cosmo + env: + COSMO_VERSION: "3.5.4" + - run: make sqlite-lembed.h + - run: make cli CC=$HOME/cosmo/bin/cosmocc CXX=$HOME/cosmo/bin/cosmoc++ AR=$HOME/cosmo/bin/cosmoar OMIT_SIMD=1 + - run: tar -czvf sqlite-lembed-$(cat VERSION)-cli-cosmopolitan.tar.gz dist/sqlite3 + - run: gh release upload ${{ github.ref_name }} sqlite-lembed-$(cat VERSION)-cli-cosmopolitan.tar.gz + env: + GH_TOKEN: ${{ github.token }} + - uses: actions/upload-artifact@v4 + with: + name: sqlite-lembed-cosmopolitan + path: dist/* From 0fa5fad68895323f3f15f6b2144f5b2fea256f78 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 22 Nov 2024 12:35:23 -0800 Subject: [PATCH 06/13] bump cosmo --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index e497497..968c948 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -75,7 +75,7 @@ jobs: curl -L -o cosmocc-$COSMO_VERSION.zip https://github.com/jart/cosmopolitan/releases/download/$COSMO_VERSION/cosmocc-$COSMO_VERSION.zip unzip cosmocc-$COSMO_VERSION.zip -d $HOME/cosmo env: - COSMO_VERSION: "3.5.4" + COSMO_VERSION: "3.9.6" - run: make sqlite-lembed.h - run: make cli CC=$HOME/cosmo/bin/cosmocc CXX=$HOME/cosmo/bin/cosmoc++ AR=$HOME/cosmo/bin/cosmoar OMIT_SIMD=1 - run: tar -czvf sqlite-lembed-$(cat VERSION)-cli-cosmopolitan.tar.gz dist/sqlite3 From f4fbab6ba0a9b17d09b943c62f3d1b8f7f046d20 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 22 Nov 2024 12:35:36 -0800 Subject: [PATCH 07/13] , --- test.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test.sql b/test.sql index 49af8ff..e841be1 100644 --- a/test.sql +++ b/test.sql @@ -1,5 +1,5 @@ -.load ./dist/lembed0 -.load ../sqlite-vec/dist/vec0 +--.load ./dist/lembed0 +--.load ../sqlite-vec/dist/vec0 .mode box .header on @@ -8,7 +8,7 
@@ .timer on .echo on -select lembed_version(), lembed_debug(); +select sqlite_version(), lembed_version(), vec_version(); From b38db9effd765a0918548374aefcdbaf969f2b93 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 22 Nov 2024 14:34:13 -0800 Subject: [PATCH 08/13] skip cosmo for now --- .github/workflows/test.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 968c948..01ccc2a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -65,6 +65,7 @@ jobs: name: sqlite-lembed-windows-x86_64-extension path: dist/*.dll build-cosmopolitan: + if: false runs-on: macos-latest permissions: contents: write From a650a86f0b7ecf925fe1cced58571ebd68dd03b8 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 22 Nov 2024 14:34:38 -0800 Subject: [PATCH 09/13] llama.cpp in cmake and not submodule? --- .github/workflows/test.yaml | 8 -------- CMakeLists.txt | 8 ++++++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 01ccc2a..d93fd75 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -11,8 +11,6 @@ jobs: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 - with: - submodules: recursive - uses: actions/setup-python@v5 with: python-version: "3.12" @@ -26,8 +24,6 @@ jobs: runs-on: macos-12 steps: - uses: actions/checkout@v4 - with: - submodules: recursive - run: make loadable - run: /usr/local/opt/python@3/libexec/bin/python -m pip install --break-system-packages pytest numpy; make test-loadable python=/usr/local/opt/python@3/libexec/bin/python - uses: actions/upload-artifact@v4 @@ -38,8 +34,6 @@ jobs: runs-on: macos-14 steps: - uses: actions/checkout@v4 - with: - submodules: recursive - run: make loadable - run: /opt/homebrew/opt/python3/libexec/bin/python -m pip install pytest numpy --break-system-packages; make test-loadable python=/opt/homebrew/opt/python3/libexec/bin/python - uses: actions/upload-artifact@v4 @@ -50,8 +44,6 @@ jobs: runs-on: windows-2019 steps: - uses: actions/checkout@v4 - with: - submodules: recursive - uses: ilammy/msvc-dev-cmd@v1 - uses: actions/setup-python@v5 with: diff --git a/CMakeLists.txt b/CMakeLists.txt index ad76302..794e240 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,8 +22,12 @@ FetchContent_MakeAvailable(sqlite_amalgamation) set(LLAMA_METAL OFF) set(LLAMA_STATIC ON) set(LLAMA_OPENMP OFF) -set(LLAMA_CPP_DIR "${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp") -add_subdirectory(${LLAMA_CPP_DIR} ${CMAKE_BINARY_DIR}/llama.cpp) + +ExternalProject_Add( + "llama.cpp" + GIT_REPOSITORY git@github.com:ggerganov/llama.cpp.git + GIT_TAG b3091 +) # sqlite-lembed loadable From 2226ad9b52281c4c88e87dca4b6cc3736e1f0c07 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 22 Nov 2024 14:53:45 -0800 Subject: [PATCH 10/13] just fetchcontent? 
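ExternalProject_Add (previous patch) downloads and builds llama.cpp in a separate sub-build at build time, so the llama and ggml_static targets it defines are never visible to this project's target_link_libraries() calls. FetchContent_MakeAvailable() instead populates the source at configure time and pulls it in with add_subdirectory(), which keeps those targets linkable, as the diff below shows. As a rough illustration only (not part of the patch), FetchContent_MakeAvailable(llama_cpp) behaves like this hand-written equivalent, using CMake's lowercased-content-name variables:

    FetchContent_GetProperties(llama_cpp)
    if(NOT llama_cpp_POPULATED)
      FetchContent_Populate(llama_cpp)
      # add_subdirectory() is what exposes the llama / ggml_static targets
      # that sqlite_lembed links against.
      add_subdirectory(${llama_cpp_SOURCE_DIR} ${llama_cpp_BINARY_DIR})
    endif()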
--- CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 794e240..0b1d5ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,7 @@ set(CMAKE_C_STANDARD 99) set(CMAKE_C_STANDARD_REQUIRED ON) include(FetchContent) +include(ExternalProject) # sqlite amalgamation, for up-to-date headers and sqlite3 CLI @@ -23,12 +24,12 @@ set(LLAMA_METAL OFF) set(LLAMA_STATIC ON) set(LLAMA_OPENMP OFF) -ExternalProject_Add( - "llama.cpp" +FetchContent_Declare( + llama_cpp GIT_REPOSITORY git@github.com:ggerganov/llama.cpp.git GIT_TAG b3091 ) - +FetchContent_MakeAvailable(llama_cpp) # sqlite-lembed loadable add_library(sqlite_lembed SHARED sqlite-lembed.c) From d9137e3db815898dfce45199873f955790970037 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 22 Nov 2024 14:55:49 -0800 Subject: [PATCH 11/13] https? --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b1d5ab..97e36bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,7 @@ set(LLAMA_OPENMP OFF) FetchContent_Declare( llama_cpp - GIT_REPOSITORY git@github.com:ggerganov/llama.cpp.git + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git GIT_TAG b3091 ) FetchContent_MakeAvailable(llama_cpp) From f9b0158f1b750d3d3f44caf2c98609726c5c5803 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 22 Nov 2024 14:58:21 -0800 Subject: [PATCH 12/13] yeet out llama.cpp submodule --- .gitmodules | 3 --- vendor/llama.cpp | 1 - 2 files changed, 4 deletions(-) delete mode 160000 vendor/llama.cpp diff --git a/.gitmodules b/.gitmodules index 7edf097..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "vendor/llama.cpp"] - path = vendor/llama.cpp - url = https://github.com/ggerganov/llama.cpp.git diff --git a/vendor/llama.cpp b/vendor/llama.cpp deleted file mode 160000 index 2b33896..0000000 --- a/vendor/llama.cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2b3389677a833cee0880226533a1768b1a9508d2 From 1f45a0783491687b164966071f97d97351e73467 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Sat, 23 Nov 2024 22:10:06 -0800 Subject: [PATCH 13/13] rm model_size() stuff --- sqlite-lembed.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/sqlite-lembed.c b/sqlite-lembed.c index 1b5be47..59ac0cc 100644 --- a/sqlite-lembed.c +++ b/sqlite-lembed.c @@ -13,6 +13,8 @@ SQLITE_EXTENSION_INIT1 #define UNUSED_PARAMETER(X) (void)(X) #endif +#define SQLITE_VEC_FLOAT32_SUBTYPE 223 + void dummy_log(enum ggml_log_level level, const char *text, void *user_data) {} static void normalize(float *vec, float *out, int n) { @@ -159,15 +161,6 @@ struct lembed_model_options { static char *POINTER_NAME_MODEL = "lembed_model"; static char *POINTER_NAME_MODEL_OPTIONS = "lembed_model_options"; -static void lembed_model_size(sqlite3_context *context, int argc, - sqlite3_value **argv) { - struct llama_model *model = - sqlite3_value_pointer(argv[0], POINTER_NAME_MODEL); - if (!model) - return; - sqlite3_result_int64(context, llama_model_size(model)); -} - static void lembed_model_options_(sqlite3_context *context, int argc, sqlite3_value **argv) { @@ -353,7 +346,7 @@ static void lembed(sqlite3_context *context, int argc, sqlite3_value **argv) { return; } sqlite3_result_blob(context, embedding, sizeof(float) * dimensions, sqlite3_free); - sqlite3_result_subtype(context, 223); // TODO define + sqlite3_result_subtype(context, SQLITE_VEC_FLOAT32_SUBTYPE); } static void 
lembed_tokenize_json(sqlite3_context *context, int argc, @@ -481,12 +474,13 @@ static int lembed_modelsConnect(sqlite3 *db, void *pAux, int argc, } #define LEMBED_MODELS_NAME 0 #define LEMBED_MODELS_MODEL 1 -#define LEMBED_MODELS_DIMENSIONS 2 -#define LEMBED_MODELS_N_CTX 3 -#define LEMBED_MODELS_POOLING_TYPE 4 -#define LEMBED_MODELS_MODEL_OPTIONS 5 -#define LEMBED_MODELS_CONTEXT_OPTIONS 6 - rc = sqlite3_declare_vtab(db, "CREATE TABLE x(name, model, dimensions, n_ctx, pooling_type, model_options " +#define LEMBED_MODELS_SIZE 2 +#define LEMBED_MODELS_DIMENSIONS 3 +#define LEMBED_MODELS_N_CTX 4 +#define LEMBED_MODELS_POOLING_TYPE 5 +#define LEMBED_MODELS_MODEL_OPTIONS 6 +#define LEMBED_MODELS_CONTEXT_OPTIONS 7 + rc = sqlite3_declare_vtab(db, "CREATE TABLE x(name, model, size, dimensions, n_ctx, pooling_type, model_options " "hidden, context_options hidden)"); if (rc == SQLITE_OK) { pNew = sqlite3_malloc(sizeof(*pNew)); @@ -684,6 +678,9 @@ static int lembed_modelsColumn(sqlite3_vtab_cursor *cur, sqlite3_result_text(context, p->api->models[pCur->iRowid].name, -1, SQLITE_TRANSIENT); break; + case LEMBED_MODELS_SIZE: + sqlite3_result_int64(context, llama_model_size(p->api->models[pCur->iRowid].model)); + break; case LEMBED_MODELS_DIMENSIONS: sqlite3_result_int64(context, llama_n_embd(p->api->models[pCur->iRowid].model)); break; @@ -1107,7 +1104,7 @@ static int lembed_batchColumn( sizeof(float) * pCur->dimensions, SQLITE_TRANSIENT ); - sqlite3_result_subtype(context, 223); // TODO define + sqlite3_result_subtype(context, SQLITE_VEC_FLOAT32_SUBTYPE); break; default: sqlite3_result_null(context); @@ -1216,7 +1213,6 @@ __declspec(dllexport) {"lembed_tokenize_json", lembed_tokenize_json, 2}, {"lembed_token_score", lembed_token_score, 2}, {"lembed_token_to_piece", lembed_token_to_piece_, 2}, - {"lembed_model_size", lembed_model_size, 1}, {"lembed_model_from_file", lembed_model_from_file, 1}, {"lembed_model_options", lembed_model_options_, -1}, {"lembed_context_options", lembed_context_options_, -1},
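With lembed_model_size() removed, the same information is exposed through the new size column on the models virtual table declared above. A minimal usage sketch, assuming a model has already been registered in temp.lembed_models as in test.sql (the hidden model_options/context_options columns are omitted):

    -- model metadata now comes from the virtual table rather than a scalar function
    select name, size, dimensions, n_ctx, pooling_type
    from temp.lembed_models;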