@@ -28,6 +28,7 @@ options:
  -p, --n-prompt <n>                        (default: 512)
  -n, --n-gen <n>                           (default: 128)
  -pg <pp,tg>                               (default: )
+  -d, --n-depth <n>                        (default: 0)
  -b, --batch-size <n>                      (default: 2048)
  -ub, --ubatch-size <n>                    (default: 512)
  -ctk, --cache-type-k <t>                  (default: f16)
@@ -66,6 +67,8 @@ With the exception of `-r`, `-o` and `-v`, all options can be specified multiple

Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.

+ Using the `-d <n>` option, each test can be run at a specified context depth, prefilling the KV cache with `<n>` tokens.
+
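For example, repetitions and depth can be combined in a single run (an illustrative invocation; the model path and values are arbitrary, and a full `-d` example appears under "Different prefilled context" below):

```
$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -r 10 -d 4096
```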
For a description of the other options, see the [main example](../main/README.md).

Note:
@@ -148,6 +151,19 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |

+ ### Different prefilled context
+
+ ```
+ $ ./llama-bench -d 0,512
+ ```
+
+ | model | size | params | backend | ngl | test | t/s |
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
+ | qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 |
+ | qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 |
+ | qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 |
+ | qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 |
+
## Output formats

By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
@@ -170,9 +186,9 @@ $ ./llama-bench -o csv
```

```csv
- build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
- "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
- "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
+ build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
+ "8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
+ "8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
```

### JSON
@@ -184,64 +200,78 @@ $ ./llama-bench -o json
```json
[
  {
-   "build_commit": "3469684",
-   "build_number": 1275,
-   "cuda": true,
-   "metal": false,
-   "gpu_blas": true,
-   "blas": true,
-   "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
-   "gpu_info": "NVIDIA GeForce RTX 3090 Ti",
-   "model_filename": "models/7B/ggml-model-q4_0.gguf",
-   "model_type": "llama 7B mostly Q4_0",
-   "model_size": 3825065984,
-   "model_n_params": 6738415616,
-   "n_batch": 512,
-   "n_threads": 16,
-   "f16_kv": true,
+   "build_commit": "8cf427ff",
+   "build_number": 5163,
+   "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
+   "gpu_info": "NVIDIA GeForce RTX 4080",
+   "backends": "CUDA",
+   "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
+   "model_type": "qwen2 7B Q4_K - Medium",
+   "model_size": 4677120000,
+   "model_n_params": 7615616512,
+   "n_batch": 2048,
+   "n_ubatch": 512,
+   "n_threads": 8,
+   "cpu_mask": "0x0",
+   "cpu_strict": false,
+   "poll": 50,
+   "type_k": "f16",
+   "type_v": "f16",
    "n_gpu_layers": 99,
+   "split_mode": "layer",
    "main_gpu": 0,
-   "mul_mat_q": true,
+   "no_kv_offload": false,
+   "flash_attn": false,
    "tensor_split": "0.00",
+   "use_mmap": true,
+   "embeddings": false,
    "n_prompt": 512,
    "n_gen": 0,
-   "test_time": "2023-09-23T12:09:57Z",
-   "avg_ns": 212365953,
-   "stddev_ns": 985423,
-   "avg_ts": 2410.974041,
-   "stddev_ts": 11.163766,
-   "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
-   "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
+   "n_depth": 0,
+   "test_time": "2025-04-24T11:58:50Z",
+   "avg_ns": 72135640,
+   "stddev_ns": 1453752,
+   "avg_ts": 7100.002165,
+   "stddev_ts": 140.341520,
+   "samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ],
+   "samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ]
  },
  {
-   "build_commit": "3469684",
-   "build_number": 1275,
-   "cuda": true,
-   "metal": false,
-   "gpu_blas": true,
-   "blas": true,
-   "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
-   "gpu_info": "NVIDIA GeForce RTX 3090 Ti",
-   "model_filename": "models/7B/ggml-model-q4_0.gguf",
-   "model_type": "llama 7B mostly Q4_0",
-   "model_size": 3825065984,
-   "model_n_params": 6738415616,
-   "n_batch": 512,
-   "n_threads": 16,
-   "f16_kv": true,
+   "build_commit": "8cf427ff",
+   "build_number": 5163,
+   "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
+   "gpu_info": "NVIDIA GeForce RTX 4080",
+   "backends": "CUDA",
+   "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
+   "model_type": "qwen2 7B Q4_K - Medium",
+   "model_size": 4677120000,
+   "model_n_params": 7615616512,
+   "n_batch": 2048,
+   "n_ubatch": 512,
+   "n_threads": 8,
+   "cpu_mask": "0x0",
+   "cpu_strict": false,
+   "poll": 50,
+   "type_k": "f16",
+   "type_v": "f16",
    "n_gpu_layers": 99,
+   "split_mode": "layer",
    "main_gpu": 0,
-   "mul_mat_q": true,
+   "no_kv_offload": false,
+   "flash_attn": false,
    "tensor_split": "0.00",
+   "use_mmap": true,
+   "embeddings": false,
    "n_prompt": 0,
    "n_gen": 128,
-   "test_time": "2023-09-23T12:09:59Z",
-   "avg_ns": 977425219,
-   "stddev_ns": 9268593,
-   "avg_ts": 130.965708,
-   "stddev_ts": 1.238924,
-   "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
-   "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
+   "n_depth": 0,
+   "test_time": "2025-04-24T11:58:51Z",
+   "avg_ns": 1076767880,
+   "stddev_ns": 9449585,
+   "avg_ts": 118.881588,
+   "stddev_ts": 1.041811,
+   "samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ],
+   "samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ]
  }
]
```
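For post-processing the JSON output, a small sketch like the following (assuming `jq` is available; the file name is illustrative) pulls the headline numbers out of each test entry:

```
$ ./llama-bench -o json > results.json
$ jq '.[] | {model_type, n_prompt, n_gen, n_depth, avg_ts}' results.json
```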
@@ -254,8 +284,8 @@ $ ./llama-bench -o jsonl
```

```json lines
- {"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
- {"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
+ {"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ], "samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
+ {"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ], "samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
```

@@ -271,32 +301,39 @@ $ ./llama-bench -o sql
CREATE TABLE IF NOT EXISTS test (
  build_commit TEXT,
  build_number INTEGER,
- cuda INTEGER,
- metal INTEGER,
- gpu_blas INTEGER,
- blas INTEGER,
  cpu_info TEXT,
  gpu_info TEXT,
+ backends TEXT,
  model_filename TEXT,
  model_type TEXT,
  model_size INTEGER,
  model_n_params INTEGER,
  n_batch INTEGER,
+ n_ubatch INTEGER,
  n_threads INTEGER,
- f16_kv INTEGER,
+ cpu_mask TEXT,
+ cpu_strict INTEGER,
+ poll INTEGER,
+ type_k TEXT,
+ type_v TEXT,
  n_gpu_layers INTEGER,
+ split_mode TEXT,
  main_gpu INTEGER,
- mul_mat_q INTEGER,
+ no_kv_offload INTEGER,
+ flash_attn INTEGER,
  tensor_split TEXT,
+ use_mmap INTEGER,
+ embeddings INTEGER,
  n_prompt INTEGER,
  n_gen INTEGER,
+ n_depth INTEGER,
  test_time TEXT,
  avg_ns INTEGER,
  stddev_ns INTEGER,
  avg_ts REAL,
  stddev_ts REAL
);

- INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
- INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
+ INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
+ INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
```
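Since the SQL output is just `CREATE TABLE IF NOT EXISTS` and `INSERT` statements, it can be piped straight into SQLite and queried later. A minimal sketch, assuming the `sqlite3` CLI is installed (the database file name is illustrative):

```
$ ./llama-bench -o sql | sqlite3 llama-bench.db
$ sqlite3 llama-bench.db 'SELECT build_commit, n_prompt, n_gen, n_depth, avg_ts FROM test;'
```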