Skip to content

Commit 1831f53

Browse files
llama-bench: add -d depth arg (#13096)
llama-bench: add -d depth arg (#13096)

* add depth param
* update llama-bench README and add depth param
* llama-bench: default params for depth arg for faster execution
* Update examples/llama-bench/README.md

Co-authored-by: Johannes Gäßler <[email protected]>

* fix buffer print ub
* use user provided args
* remove extra whitespaces

---------

Co-authored-by: Johannes Gäßler <[email protected]>
1 parent 4e87962 commit 1831f53

File tree

2 files changed

+137
-65
lines changed

2 files changed

+137
-65
lines changed

examples/llama-bench/README.md

+96-59
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ options:
2828
-p, --n-prompt <n> (default: 512)
2929
-n, --n-gen <n> (default: 128)
3030
-pg <pp,tg> (default: )
31+
-d, --n-depth <n> (default: 0)
3132
-b, --batch-size <n> (default: 2048)
3233
-ub, --ubatch-size <n> (default: 512)
3334
-ctk, --cache-type-k <t> (default: f16)
@@ -66,6 +67,8 @@ With the exception of `-r`, `-o` and `-v`, all options can be specified multiple
6667

6768
Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
6869

70+
Using the `-d <n>` option, each test can be run at a specified context depth, prefilling the KV cache with `<n>` tokens.
71+
6972
For a description of the other options, see the [main example](../main/README.md).
7073

7174
Note:
@@ -148,6 +151,19 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35
148151
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
149152
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |
150153

154+
### Different prefilled context
155+
156+
```
157+
$ ./llama-bench -d 0,512
158+
```
159+
160+
| model | size | params | backend | ngl | test | t/s |
161+
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
162+
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 |
163+
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 |
164+
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 |
165+
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 |
166+
151167
## Output formats
152168

153169
By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
@@ -170,9 +186,9 @@ $ ./llama-bench -o csv
170186
```
171187

172188
```csv
173-
build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
174-
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
175-
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
189+
build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
190+
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
191+
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
176192
```
177193

178194
### JSON
@@ -184,64 +200,78 @@ $ ./llama-bench -o json
184200
```json
185201
[
186202
{
187-
"build_commit": "3469684",
188-
"build_number": 1275,
189-
"cuda": true,
190-
"metal": false,
191-
"gpu_blas": true,
192-
"blas": true,
193-
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
194-
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
195-
"model_filename": "models/7B/ggml-model-q4_0.gguf",
196-
"model_type": "llama 7B mostly Q4_0",
197-
"model_size": 3825065984,
198-
"model_n_params": 6738415616,
199-
"n_batch": 512,
200-
"n_threads": 16,
201-
"f16_kv": true,
203+
"build_commit": "8cf427ff",
204+
"build_number": 5163,
205+
"cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
206+
"gpu_info": "NVIDIA GeForce RTX 4080",
207+
"backends": "CUDA",
208+
"model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
209+
"model_type": "qwen2 7B Q4_K - Medium",
210+
"model_size": 4677120000,
211+
"model_n_params": 7615616512,
212+
"n_batch": 2048,
213+
"n_ubatch": 512,
214+
"n_threads": 8,
215+
"cpu_mask": "0x0",
216+
"cpu_strict": false,
217+
"poll": 50,
218+
"type_k": "f16",
219+
"type_v": "f16",
202220
"n_gpu_layers": 99,
221+
"split_mode": "layer",
203222
"main_gpu": 0,
204-
"mul_mat_q": true,
223+
"no_kv_offload": false,
224+
"flash_attn": false,
205225
"tensor_split": "0.00",
226+
"use_mmap": true,
227+
"embeddings": false,
206228
"n_prompt": 512,
207229
"n_gen": 0,
208-
"test_time": "2023-09-23T12:09:57Z",
209-
"avg_ns": 212365953,
210-
"stddev_ns": 985423,
211-
"avg_ts": 2410.974041,
212-
"stddev_ts": 11.163766,
213-
"samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
214-
"samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
230+
"n_depth": 0,
231+
"test_time": "2025-04-24T11:58:50Z",
232+
"avg_ns": 72135640,
233+
"stddev_ns": 1453752,
234+
"avg_ts": 7100.002165,
235+
"stddev_ts": 140.341520,
236+
"samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ],
237+
"samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ]
215238
},
216239
{
217-
"build_commit": "3469684",
218-
"build_number": 1275,
219-
"cuda": true,
220-
"metal": false,
221-
"gpu_blas": true,
222-
"blas": true,
223-
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
224-
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
225-
"model_filename": "models/7B/ggml-model-q4_0.gguf",
226-
"model_type": "llama 7B mostly Q4_0",
227-
"model_size": 3825065984,
228-
"model_n_params": 6738415616,
229-
"n_batch": 512,
230-
"n_threads": 16,
231-
"f16_kv": true,
240+
"build_commit": "8cf427ff",
241+
"build_number": 5163,
242+
"cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
243+
"gpu_info": "NVIDIA GeForce RTX 4080",
244+
"backends": "CUDA",
245+
"model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
246+
"model_type": "qwen2 7B Q4_K - Medium",
247+
"model_size": 4677120000,
248+
"model_n_params": 7615616512,
249+
"n_batch": 2048,
250+
"n_ubatch": 512,
251+
"n_threads": 8,
252+
"cpu_mask": "0x0",
253+
"cpu_strict": false,
254+
"poll": 50,
255+
"type_k": "f16",
256+
"type_v": "f16",
232257
"n_gpu_layers": 99,
258+
"split_mode": "layer",
233259
"main_gpu": 0,
234-
"mul_mat_q": true,
260+
"no_kv_offload": false,
261+
"flash_attn": false,
235262
"tensor_split": "0.00",
263+
"use_mmap": true,
264+
"embeddings": false,
236265
"n_prompt": 0,
237266
"n_gen": 128,
238-
"test_time": "2023-09-23T12:09:59Z",
239-
"avg_ns": 977425219,
240-
"stddev_ns": 9268593,
241-
"avg_ts": 130.965708,
242-
"stddev_ts": 1.238924,
243-
"samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
244-
"samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
267+
"n_depth": 0,
268+
"test_time": "2025-04-24T11:58:51Z",
269+
"avg_ns": 1076767880,
270+
"stddev_ns": 9449585,
271+
"avg_ts": 118.881588,
272+
"stddev_ts": 1.041811,
273+
"samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ],
274+
"samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ]
245275
}
246276
]
247277
```
@@ -254,8 +284,8 @@ $ ./llama-bench -o jsonl
254284
```
255285

256286
```json lines
257-
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
258-
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
287+
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
288+
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
259289
```
260290

261291

@@ -271,32 +301,39 @@ $ ./llama-bench -o sql
271301
CREATE TABLE IF NOT EXISTS test (
272302
build_commit TEXT,
273303
build_number INTEGER,
274-
cuda INTEGER,
275-
metal INTEGER,
276-
gpu_blas INTEGER,
277-
blas INTEGER,
278304
cpu_info TEXT,
279305
gpu_info TEXT,
306+
backends TEXT,
280307
model_filename TEXT,
281308
model_type TEXT,
282309
model_size INTEGER,
283310
model_n_params INTEGER,
284311
n_batch INTEGER,
312+
n_ubatch INTEGER,
285313
n_threads INTEGER,
286-
f16_kv INTEGER,
314+
cpu_mask TEXT,
315+
cpu_strict INTEGER,
316+
poll INTEGER,
317+
type_k TEXT,
318+
type_v TEXT,
287319
n_gpu_layers INTEGER,
320+
split_mode TEXT,
288321
main_gpu INTEGER,
289-
mul_mat_q INTEGER,
322+
no_kv_offload INTEGER,
323+
flash_attn INTEGER,
290324
tensor_split TEXT,
325+
use_mmap INTEGER,
326+
embeddings INTEGER,
291327
n_prompt INTEGER,
292328
n_gen INTEGER,
329+
n_depth INTEGER,
293330
test_time TEXT,
294331
avg_ns INTEGER,
295332
stddev_ns INTEGER,
296333
avg_ts REAL,
297334
stddev_ts REAL
298335
);
299336

300-
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
301-
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
337+
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
338+
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
302339
```

0 commit comments

Comments (0)