llama-att-softmax.c
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
/*
 * This example is intended to give a feel/understanding for how the attention mask
 * that is applied to the QK attention scores works in Llama.cpp's attention softmax.
 */
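/*
 * The mask is added to the (scaled) QK scores before the softmax: masked
 * positions carry -INFINITY and therefore end up with zero probability, while
 * the remaining scores are normalized among themselves.
 */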
int main(int argc, char **argv) {
    printf("GGML llama attention softmax example\n");

    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
    };
    struct ggml_context* ctx = ggml_init(params);
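    // no_alloc defaults to false, so the data for the tensors created below is
    // allocated directly in the 16 MB context buffer.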
    struct ggml_tensor* logits = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 32, 1, 1);
    ggml_set_name(logits, "logits");
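    // logits stands in for a single row of 32 raw QK attention scores
    // (ne = [32, 1, 1]).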
    float tensor_data[32] = {
         0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,
        10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0,
        20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0,
        30.0, 31.0
    };
    memcpy((char *)logits->data, tensor_data, ggml_nbytes(logits));

    for (int i = 0; i < ggml_nelements(logits); i++) {
        float value = *(float *) ((char *) logits->data + i * logits->nb[0]);
        printf("%.4f ", value);
    }
    printf("\n");
    struct ggml_tensor* mask = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 32);
    ggml_set_name(mask, "mask");
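    // The mask has one entry per score: 0.0f means the position is visible to
    // the softmax, -INFINITY means it is masked out. With the layout below only
    // indices 6..13 are unmasked.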
    float mask_data[32] = {
        -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, 0.0f, 0.0f, 0.0f, 0.0f,
        0.0f, 0.0f, 0.0f, 0.0f, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY,
        -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY,
        -INFINITY, -INFINITY,
    };
    memcpy((char *)mask->data, mask_data, ggml_nbytes(mask));

    for (int i = 0; i < ggml_nelements(mask); i++) {
        float value = *(float *) ((char *) mask->data + i * mask->nb[0]);
        printf("%.4f ", value);
    }
    printf("\n");
    struct ggml_tensor* result = ggml_soft_max_ext(ctx, logits, mask, 1.0f, 0.0f);
    ggml_set_name(result, "result");

    struct ggml_cgraph* c_graph = ggml_new_graph(ctx);
    ggml_build_forward_expand(c_graph, result);

    int n_threads = 1;
    enum ggml_status st = ggml_graph_compute_with_ctx(ctx, c_graph, n_threads);
    if (st != GGML_STATUS_SUCCESS) {
        printf("could not compute graph\n");
        return 1;
    }
printf("result tensor type: %s\n", ggml_type_name(result->type));
printf("result dim: %d\n", ggml_n_dims(result));
printf("result dim[0]: %ld\n", result->ne[0]);
    float sum = 0.0f;
    for (int i = 0; i < ggml_nelements(result); i++) {
        float value = *(float *) ((char *) result->data + i * result->nb[0]);
        printf("%.4f ", value);
        sum += value;
    }
    printf("\nsum: %.4f\n", sum);

    ggml_free(ctx);
    return 0;
}