
context : allow cache-less context for embeddings #13108

Draft: wants to merge 21 commits into base gg/llama-kv-cache-v6
Commits (21):
1a31566  kv-cache : serparate recurrent vs non-recurrent impl (wip) (ggerganov, Apr 7, 2025)
d564115  kv-cache : init -> contructor + add llama_memory_params (ggerganov, Apr 15, 2025)
709ade1  kv-cache : fix callback reference (ggerganov, Apr 15, 2025)
c55fc45  context : llama_kv_cache -> llama_memory_i (ggerganov, Apr 17, 2025)
5909d35  context : move memory creation logic to model (ggerganov, Apr 17, 2025)
e41dcac  llama : remove reference of memory during encode (ggerganov, Apr 17, 2025)
7e79438  kv-cache : hide padding details in the implementation (ggerganov, Apr 23, 2025)
733babc  kv-cache : add ubatch_next() (ggerganov, Apr 23, 2025)
b46d574  context : simplify sbatch logic (ggerganov, Apr 23, 2025)
60c138f  kv-cache : hide defrag logic in the implementation (ggerganov, Apr 23, 2025)
f2175ca  context : hide kv cache details in implementation (ggerganov, Apr 23, 2025)
21eef7d  build : fix (ggerganov, Apr 23, 2025)
5d28934  cont : another fix (ggerganov, Apr 23, 2025)
43fbc5f  kv-cache : simplify interface (wip) (ggerganov, Apr 24, 2025)
d3f22ea  kv-cache : use separate KV cell structs for unified/recurrent (ggerganov, Apr 24, 2025)
bb81bfd  kv-cache : clean-up (ggerganov, Apr 24, 2025)
56dfde4  model : better llama_model::create_model() signature (ggerganov, Apr 24, 2025)
dec80ac  kv-cache : fix recurrent seq_rm() (ggerganov, Apr 25, 2025)
5f5c3b7  context : allow cache-less context for embeddings (ggerganov, Apr 25, 2025)
2dba70d  context : enable reranking with encode() (ggerganov, Apr 25, 2025)
4f0ea9b  context : encode() clears embd_seq (ggerganov, Apr 25, 2025)
examples/embedding/embedding.cpp (2 changes: 1 addition & 1 deletion)
@@ -49,7 +49,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
}
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
// decoder-only model
- if (llama_decode(ctx, batch) < 0) {
+ if (llama_encode(ctx, batch) < 0) {
LOG_ERR("%s : failed to decode\n", __func__);
}
}
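For orientation, here is a minimal usage sketch (not taken from this PR) of what the cache-less embedding path looks like from the public API, assuming the current llama.h interface: the context is created with embeddings enabled and the batch is pushed through llama_encode(), mirroring the change to examples/embedding/embedding.cpp above. Whether the KV cache is actually omitted is decided inside the library.

#include "llama.h"

#include <cstdio>
#include <cstring>
#include <vector>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(argv[1], mparams);

    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings = true; // embeddings-only usage; per this PR the context may skip creating a KV cache

    llama_context * ctx = llama_init_from_model(model, cparams);

    // tokenize a short input
    const char * text = "hello world";
    const llama_vocab * vocab = llama_model_get_vocab(model);

    std::vector<llama_token> tokens(64);
    const int n_tokens = llama_tokenize(vocab, text, (int32_t) strlen(text),
                                        tokens.data(), (int32_t) tokens.size(),
                                        /*add_special=*/true, /*parse_special=*/false);
    if (n_tokens < 0) {
        fprintf(stderr, "tokenization failed\n");
        return 1;
    }
    tokens.resize(n_tokens);

    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());

    // encode() instead of decode(): no KV cache state is read or written
    if (llama_encode(ctx, batch) < 0) {
        fprintf(stderr, "failed to encode\n");
        return 1;
    }

    // with a pooling type other than NONE, the per-sequence embedding is available
    const float * emb = llama_get_embeddings_seq(ctx, 0);
    if (emb != nullptr) {
        printf("first embedding value: %f\n", emb[0]);
    }

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();

    return 0;
}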
examples/server/server.cpp (2 changes: 1 addition & 1 deletion)
@@ -3941,7 +3941,7 @@ int main(int argc, char ** argv) {
const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
server_task_type type,
json & data,
- std::function<bool()> is_connection_closed,
+ const std::function<bool()> & is_connection_closed,
httplib::Response & res,
oaicompat_type oaicompat) {
GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
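As an aside, the server change above only tightens how the callback is passed: taking it as const std::function<bool()> & avoids copying the callable (and any captured state) each time the handler is set up. A small standalone illustration, with hypothetical function names:

#include <cstdio>
#include <functional>

// by value: the std::function object is copied on every call
static void run_by_value(std::function<bool()> is_connection_closed) {
    if (is_connection_closed()) { printf("closed\n"); }
}

// by const reference: the caller's object is used directly, no copy
static void run_by_ref(const std::function<bool()> & is_connection_closed) {
    if (is_connection_closed()) { printf("closed\n"); }
}

int main() {
    std::function<bool()> is_connection_closed = [] { return false; };
    run_by_value(is_connection_closed); // copies the wrapper and its captures
    run_by_ref(is_connection_closed);   // binds to the existing object
    return 0;
}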
src/llama-batch.cpp (6 changes: 5 additions & 1 deletion)
@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
return ubatch;
}

- void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
GGML_ASSERT(batch.n_tokens >= 0);
this->batch = &batch;
this->n_embd = n_embd;
@@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
for (size_t i = 0; i < n_tokens; ++i) {
ids[i] = i;
}

if (simple_split) {
seq.resize(1);
llama_sbatch_seq & s = seq[0];
@@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
s.length = n_tokens;
return;
}

std::sort(ids.begin(), ids.end(),
[&batch](size_t a, size_t b) {
int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
@@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
return n_seq_a > n_seq_b;
}
);

// init seq
llama_sbatch_seq * last_seq = nullptr;

@@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
seq.push_back(new_seq);
last_seq = &seq.back();
}

// keep shared prompts first at the end, then sort by length descending.
std::sort(seq.begin(), seq.end(),
[](llama_sbatch_seq & a, llama_sbatch_seq & b) {
src/llama-batch.h (3 changes: 2 additions & 1 deletion)
@@ -70,7 +70,8 @@ struct llama_sbatch {
// sequence-wise split
llama_ubatch split_seq(size_t n_ubatch);

- void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+ llama_sbatch() = default;
+ llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
};

// temporary allocate memory for the input batch if needed
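For illustration only (not part of the diff), the interface change replaces the two-step from_batch() initialization with a constructor; a hypothetical call-site helper sketches the before/after:

#include "llama-batch.h" // internal header, shown only to illustrate the call site

// hypothetical helper: build an sbatch for a simple split
static llama_sbatch make_sbatch(const llama_batch & batch, size_t n_embd) {
    // before this PR:
    //   llama_sbatch sbatch;
    //   sbatch.from_batch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);

    // after this PR: initialization happens in the constructor,
    // and llama_sbatch() = default keeps default-constructed members working
    return llama_sbatch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);
}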