2 changes: 1 addition & 1 deletion 3rdparty/cutlass
Submodule cutlass updated 384 files
1 change: 1 addition & 0 deletions benchmarks/README.md
@@ -152,6 +152,7 @@ The output CSV will contain detailed metrics including:
| `--ep_size` | Expert-parallel world size |
| `--ep_rank` | Expert-parallel rank |
| `--gated_act` | Gated activation function: `swiglu` (default) or `geglu` |
| `--autotune` | Enable autotuning for supported operations |

### MOE Routing Method Compatibility

100 changes: 42 additions & 58 deletions benchmarks/bench_append_paged_kv_cache.py
@@ -1,9 +1,13 @@
import sys


import argparse
import dataclasses
from typing import Tuple

import numpy as np
import torch
import paddle
from flashinfer.paddle_utils import *

import flashinfer
from flashinfer.testing.utils import bench_gpu_time
@@ -17,42 +21,25 @@ class ModelConfig:


def _make_70b(tp: int) -> ModelConfig:
return ModelConfig(
num_kv_heads=8 // tp,
num_layers=80,
head_dim=128,
)
return ModelConfig(num_kv_heads=8 // tp, num_layers=80, head_dim=128)


MODELS = {
"l1b": ModelConfig(
num_kv_heads=8,
num_layers=16,
head_dim=64,
),
"l3b": ModelConfig(
num_kv_heads=8,
num_layers=28,
head_dim=128,
),
"l8b": ModelConfig(
num_kv_heads=8,
num_layers=32,
head_dim=128,
),
"l1b": ModelConfig(num_kv_heads=8, num_layers=16, head_dim=64),
"l3b": ModelConfig(num_kv_heads=8, num_layers=28, head_dim=128),
"l8b": ModelConfig(num_kv_heads=8, num_layers=32, head_dim=128),
"l70b-tp8": _make_70b(8),
}


@torch.inference_mode()
@paddle.no_grad()
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--seqlen", type=int, default=5000)
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument("--page-len", type=int, default=16)
parser.add_argument("--dtype", type=str, default="float16")
args = parser.parse_args()

seqlens_ = [
[1] * args.batch_size,
[args.seqlen - args.batch_size + 1] + [1] * (args.batch_size - 1),
@@ -62,28 +49,22 @@ def main():
seqlen_strlen = max(len(str(seqlens)) for seqlens in seqlens_)
page_len = int(args.page_len)
dtype = getattr(torch, args.dtype)
assert isinstance(dtype, torch.dtype)
device = torch.device("cuda:0")
assert isinstance(dtype, paddle.dtype)
device = device2str("cuda:0")
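# device2str comes from the flashinfer.paddle_utils shim imported above via
# wildcard; judging by its name it converts a torch-style "cuda:0" string to
# Paddle's device form (an assumption: the helper's source is not in this diff).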
total_pages = int(256000 / page_len)

torch.cuda.profiler.start()

>>>>>> torch.cuda.profiler.start()
for model_name, model in MODELS.items():
page_shape = (2, page_len, model.num_kv_heads, model.head_dim)
layer_buf = torch.empty((total_pages,) + page_shape, dtype=dtype, device=device)
page_shape = 2, page_len, model.num_kv_heads, model.head_dim
layer_buf = paddle.empty(shape=(total_pages,) + page_shape, dtype=dtype)
for seqlens in seqlens_:
k = torch.rand(
(sum(seqlens), model.num_kv_heads, model.head_dim),
dtype=dtype,
device=device,
k = paddle.rand(
shape=(sum(seqlens), model.num_kv_heads, model.head_dim), dtype=dtype
)
v = torch.rand(
(sum(seqlens), model.num_kv_heads, model.head_dim),
dtype=dtype,
device=device,
v = paddle.rand(
shape=(sum(seqlens), model.num_kv_heads, model.head_dim), dtype=dtype
)
x_indptr = torch.tensor([0] + seqlens, device=device, dtype=torch.int32)
x_indptr = torch.cumsum(x_indptr, 0, dtype=torch.int32)
x_indptr = paddle.to_tensor(data=[0] + seqlens, dtype="int32", place=device)
x_indptr = paddle.cumsum(x=x_indptr, axis=0, dtype="int32")
kv_indices_host = []
kv_indptr_host = [0]
next_page_id = 0
@@ -92,27 +73,31 @@ def main():
kv_indices_host.extend(range(next_page_id, next_page_id + npages))
next_page_id += npages
kv_indptr_host.append(len(kv_indices_host))
kv_indices = torch.tensor(kv_indices_host, device=device, dtype=torch.int32)
kv_indptr = torch.tensor(kv_indptr_host, device=device, dtype=torch.int32)
kv_last_page_len = torch.tensor(
[(seqlen - 1) % page_len + 1 for seqlen in seqlens],
device=device,
dtype=torch.int32,
kv_indices = paddle.to_tensor(
data=kv_indices_host, dtype="int32", place=device
)
kv_indptr = paddle.to_tensor(
data=kv_indptr_host, dtype="int32", place=device
)
kv_last_page_len = paddle.to_tensor(
data=[((seqlen - 1) % page_len + 1) for seqlen in seqlens],
dtype="int32",
place=device,
)
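# Paging arithmetic sketch: with page_len=16 and seqlen=5000, a sequence
# spans ceil(5000 / 16) = 313 pages and the last page holds
# (5000 - 1) % 16 + 1 = 8 entries.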

@torch.cuda.nvtx.range(f"convert model={model_name}, seqlens={seqlens}")
def fn_convert() -> Tuple[torch.Tensor, torch.Tensor]:
>>>>>> @torch.cuda.nvtx.range(f"convert model={model_name}, seqlens={seqlens}")
def fn_convert() -> Tuple[paddle.Tensor, paddle.Tensor]:
return flashinfer.get_batch_indices_positions(
x_indptr,
flashinfer.get_seq_lens(kv_indptr, kv_last_page_len, page_len),
k.shape[0],
tuple(k.shape)[0],
)
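# E.g. for seqlens=[2, 1] (cache filled from scratch, so sequence lengths
# equal append lengths) this yields one entry per appended token:
# batch_indices=[0, 0, 1], positions=[0, 1, 0].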

batch_indices, positions = fn_convert()
convert_latencies = bench_gpu_time(fn_convert)
convert_latency_ms = np.median(convert_latencies)

@torch.cuda.nvtx.range(f"append model={model_name}, seqlens={seqlens}")
>>>>>> @torch.cuda.nvtx.range(f"append model={model_name}, seqlens={seqlens}")
def fn() -> None:
flashinfer.append_paged_kv_cache(
k,
@@ -130,23 +115,22 @@ def fn() -> None:
latency_ms = np.median(latencies)
all_layers_latency_ms = convert_latency_ms + latency_ms * model.num_layers
throughput = (
k.numel()
k.size
* k.element_size()
* sum(1 for _ in ["k", "v"])
* sum(1 for _ in ["read", "write"])
/ (latency_ms * 1e-3)
/ (latency_ms * 0.001)
)
print(
f"model: {model_name:8}",
f"seqlens: {seqlens!r:{seqlen_strlen}}",
f"convert: {convert_latency_ms * 1e3:2.0f}us",
f"1layer: {latency_ms * 1e3:2.0f}us",
f"{model.num_layers}layers: {all_layers_latency_ms * 1e3:3.0f}us",
f"throughput: {throughput * 1e-9:8.3f}GB/s",
f"convert: {convert_latency_ms * 1000.0:2.0f}us",
f"1layer: {latency_ms * 1000.0:2.0f}us",
f"{model.num_layers}layers: {all_layers_latency_ms * 1000.0:3.0f}us",
f"throughput: {throughput * 1e-09:8.3f}GB/s",
)
print("---")

torch.cuda.profiler.stop()
>>>>>> torch.cuda.profiler.stop()


if __name__ == "__main__":
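The torch → paddle substitutions above repeat mechanically through both benchmark files. A minimal sketch of the mapping this diff applies (Paddle's creation ops such as `paddle.empty` and `paddle.rand` take no `device=` argument; placement follows the process-wide device, which is why those kwargs disappear):

```python
import paddle

# torch.empty(shape, dtype=dtype, device=device) -> paddle.empty(shape=..., dtype=...)
buf = paddle.empty(shape=(4, 16, 8, 128), dtype=paddle.float16)

# torch.tensor(data, dtype=torch.int32, device=device) -> paddle.to_tensor(..., place=...)
indptr = paddle.to_tensor(data=[0, 2, 3], dtype="int32", place="gpu:0")

# torch.cumsum(x, 0, dtype=torch.int32) -> paddle.cumsum(x=..., axis=0, dtype="int32")
indptr = paddle.cumsum(x=indptr, axis=0, dtype="int32")

# tensor.numel() -> tensor.size (a property, not a method, in Paddle)
n_elements = indptr.size
```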
84 changes: 38 additions & 46 deletions benchmarks/bench_append_paged_mla_kv_cache.py
@@ -1,9 +1,13 @@
import sys


import argparse
import dataclasses
from typing import Tuple

import numpy as np
import torch
import paddle
from flashinfer.paddle_utils import *

import flashinfer
from flashinfer.testing.utils import bench_gpu_time
@@ -22,15 +26,14 @@ class ModelConfig:
}


@torch.inference_mode()
@paddle.no_grad()
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--seqlen", type=int, default=5000)
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument("--page-len", type=int, default=16)
parser.add_argument("--dtype", type=str, default="float16")
args = parser.parse_args()

seqlens_ = [
[1] * args.batch_size,
[args.seqlen - args.batch_size + 1] + [1] * (args.batch_size - 1),
@@ -40,34 +43,20 @@ def main():
seqlen_strlen = max(len(str(seqlens)) for seqlens in seqlens_)
page_len = int(args.page_len)
dtype = getattr(torch, args.dtype)
assert isinstance(dtype, torch.dtype)
device = torch.device("cuda:0")
assert isinstance(dtype, paddle.dtype)
device = device2str("cuda:0")
total_pages = int(25600 / page_len)

torch.cuda.profiler.start()

>>>>>> torch.cuda.profiler.start()
for model_name, model in MODELS.items():
ckv_page_shape = (page_len, model.ckv_dim)
kpe_page_shape = (page_len, model.kpe_dim)
ckv_layer_buf = torch.empty(
(total_pages,) + ckv_page_shape, dtype=dtype, device=device
)
kpe_layer_buf = torch.empty(
(total_pages,) + kpe_page_shape, dtype=dtype, device=device
)
ckv_page_shape = page_len, model.ckv_dim
kpe_page_shape = page_len, model.kpe_dim
ckv_layer_buf = paddle.empty(shape=(total_pages,) + ckv_page_shape, dtype=dtype)
kpe_layer_buf = paddle.empty(shape=(total_pages,) + kpe_page_shape, dtype=dtype)
for seqlens in seqlens_:
ckv = torch.rand(
(sum(seqlens), model.ckv_dim),
dtype=dtype,
device=device,
)
kpe = torch.rand(
(sum(seqlens), model.kpe_dim),
dtype=dtype,
device=device,
)
x_indptr = torch.tensor([0] + seqlens, device=device, dtype=torch.int32)
x_indptr = torch.cumsum(x_indptr, 0, dtype=torch.int32)
ckv = paddle.rand(shape=(sum(seqlens), model.ckv_dim), dtype=dtype)
kpe = paddle.rand(shape=(sum(seqlens), model.kpe_dim), dtype=dtype)
x_indptr = paddle.to_tensor(data=[0] + seqlens, dtype="int32", place=device)
x_indptr = paddle.cumsum(x=x_indptr, axis=0, dtype="int32")
kv_indices_host = []
kv_indptr_host = [0]
next_page_id = 0
@@ -76,27 +65,31 @@ def main():
kv_indices_host.extend(range(next_page_id, next_page_id + npages))
next_page_id += npages
kv_indptr_host.append(len(kv_indices_host))
kv_indices = torch.tensor(kv_indices_host, device=device, dtype=torch.int32)
kv_indptr = torch.tensor(kv_indptr_host, device=device, dtype=torch.int32)
kv_last_page_len = torch.tensor(
[(seqlen - 1) % page_len + 1 for seqlen in seqlens],
device=device,
dtype=torch.int32,
kv_indices = paddle.to_tensor(
data=kv_indices_host, dtype="int32", place=device
)
kv_indptr = paddle.to_tensor(
data=kv_indptr_host, dtype="int32", place=device
)
kv_last_page_len = paddle.to_tensor(
data=[((seqlen - 1) % page_len + 1) for seqlen in seqlens],
dtype="int32",
place=device,
)

@torch.cuda.nvtx.range(f"convert model={model_name}, seqlens={seqlens}")
def fn_convert() -> Tuple[torch.Tensor, torch.Tensor]:
>>>>>> @torch.cuda.nvtx.range(f"convert model={model_name}, seqlens={seqlens}")
def fn_convert() -> Tuple[paddle.Tensor, paddle.Tensor]:
return flashinfer.get_batch_indices_positions(
x_indptr,
flashinfer.get_seq_lens(kv_indptr, kv_last_page_len, page_len),
ckv.shape[0],
tuple(ckv.shape)[0],
)

batch_indices, positions = fn_convert()
convert_latencies = bench_gpu_time(fn_convert)
convert_latency_ms = np.median(convert_latencies)

@torch.cuda.nvtx.range(f"append model={model_name}, seqlens={seqlens}")
>>>>>> @torch.cuda.nvtx.range(f"append model={model_name}, seqlens={seqlens}")
def fn() -> None:
flashinfer.append_paged_mla_kv_cache(
ckv,
@@ -114,22 +107,21 @@ def fn() -> None:
latency_ms = np.median(latencies)
all_layers_latency_ms = convert_latency_ms + latency_ms * model.num_layers
throughput = (
(ckv.numel() + kpe.numel())
(ckv.size + kpe.size)
* ckv.element_size()
* sum(1 for _ in ["read", "write"])
/ (latency_ms * 1e-3)
/ (latency_ms * 0.001)
)
print(
f"model: {model_name:8}",
f"seqlens: {seqlens!r:{seqlen_strlen}}",
f"convert: {convert_latency_ms * 1e3:2.0f}us",
f"1layer: {latency_ms * 1e3:2.0f}us",
f"{model.num_layers}layers: {all_layers_latency_ms * 1e3:3.0f}us",
f"throughput: {throughput * 1e-9:8.3f}GB/s",
f"convert: {convert_latency_ms * 1000.0:2.0f}us",
f"1layer: {latency_ms * 1000.0:2.0f}us",
f"{model.num_layers}layers: {all_layers_latency_ms * 1000.0:3.0f}us",
f"throughput: {throughput * 1e-09:8.3f}GB/s",
)
print("---")

torch.cuda.profiler.stop()
>>>>>> torch.cuda.profiler.stop()


if __name__ == "__main__":
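The throughput expressions in both files are plain bytes-over-time arithmetic: element count, times bytes per element, times a factor of two for read plus write (and another factor of two in the first file for K plus V), divided by the latency. A worked instance with illustrative numbers (float16, so 2 bytes per element):

```python
# Illustrative numbers only: 5000 tokens, 8 KV heads, head_dim 128, float16.
num_tokens, num_kv_heads, head_dim = 5000, 8, 128
element_size = 2   # bytes per float16 element
latency_s = 50e-6  # assumed 50 us kernel latency

# x2 for K and V tensors, x2 for read + write of each byte.
bytes_moved = num_tokens * num_kv_heads * head_dim * element_size * 2 * 2
print(f"{bytes_moved / latency_s * 1e-9:.1f} GB/s")  # ~819.2 GB/s
```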