
Commit

ruff format; ruff check --fix .;
abcdabcd987 committed Nov 22, 2023
1 parent eb3cd4d commit 1495038
Showing 38 changed files with 4,190 additions and 4,039 deletions.
89 changes: 44 additions & 45 deletions benchmarks/bench_backbone_vs_lora.py
@@ -1,70 +1,69 @@
 import gzip
-import itertools
 import json
 import pathlib
 from datetime import datetime
 
-import pytz
 import numpy as np
+import pytz
 import torch
 from tqdm import tqdm
 
-from .benchmark_utils import bench, gc_torch
+from .benchmark_utils import bench
 
 
 @torch.inference_mode()
 def bench_backbone_vs_lora(f):
-  torch.manual_seed(0xabcdabcd987)
-  dtype = torch.float16
-  device = torch.device("cuda:0")
-  h1 = 4096
-  h2 = 11008
-  r = 16
-  bs_list = np.arange(1, 65)
+    torch.manual_seed(0xABCDABCD987)
+    dtype = torch.float16
+    device = torch.device("cuda:0")
+    h1 = 4096
+    h2 = 11008
+    r = 16
+    bs_list = np.arange(1, 65)
 
-  res = dict(
-      backbone_avg=[],
-      backbone_std=[],
-      single_lora_avg=[],
-      single_lora_std=[],
-      multi_lora_avg=[],
-      multi_lora_std=[],
-  )
-  for bs in tqdm(bs_list):
-    w = torch.randn(h1, h2, dtype=dtype, device=device)
-    wa = torch.randn(h1, r, dtype=dtype, device=device)
-    wb = torch.randn(r, h2, dtype=dtype, device=device)
-    x = torch.randn(bs, 1, h1, dtype=dtype, device=device)
+    res = dict(
+        backbone_avg=[],
+        backbone_std=[],
+        single_lora_avg=[],
+        single_lora_std=[],
+        multi_lora_avg=[],
+        multi_lora_std=[],
+    )
+    for bs in tqdm(bs_list):
+        w = torch.randn(h1, h2, dtype=dtype, device=device)
+        wa = torch.randn(h1, r, dtype=dtype, device=device)
+        wb = torch.randn(r, h2, dtype=dtype, device=device)
+        x = torch.randn(bs, 1, h1, dtype=dtype, device=device)
 
-    def muti_lora():
-      for i in range(bs):
-        x[i] @ wa @ wb
+        def muti_lora():
+            for i in range(bs):
+                x[i] @ wa @ wb
 
-    l_backbone = bench(lambda: x @ w, warmup=200, repeat=500)
-    l_single_lora = bench(lambda: x @ wa @ wb, warmup=200, repeat=500)
-    l_multi_lora = bench(muti_lora, warmup=200, repeat=500)
+        l_backbone = bench(lambda: x @ w, warmup=200, repeat=500)
+        l_single_lora = bench(lambda: x @ wa @ wb, warmup=200, repeat=500)
+        l_multi_lora = bench(muti_lora, warmup=200, repeat=500)
 
-    res["backbone_avg"].append(l_backbone.avg())
-    res["backbone_std"].append(l_backbone.std())
-    res["single_lora_avg"].append(l_single_lora.avg())
-    res["single_lora_std"].append(l_single_lora.std())
-    res["multi_lora_avg"].append(l_multi_lora.avg())
-    res["multi_lora_std"].append(l_multi_lora.std())
+        res["backbone_avg"].append(l_backbone.avg())
+        res["backbone_std"].append(l_backbone.std())
+        res["single_lora_avg"].append(l_single_lora.avg())
+        res["single_lora_std"].append(l_single_lora.std())
+        res["multi_lora_avg"].append(l_multi_lora.avg())
+        res["multi_lora_std"].append(l_multi_lora.std())
 
-  json.dump(res, f)
+    json.dump(res, f)
 
 
 def main():
-  this_file = pathlib.Path(__file__)
-  project_root = this_file.parents[1]
-  now = datetime.now(pytz.timezone("US/Pacific"))
-  out_filename = f"{now:%Y%m%d-%H%M%S}-{this_file.stem}.json.gz"
-  out_path = project_root / "data" / out_filename
+    this_file = pathlib.Path(__file__)
+    project_root = this_file.parents[1]
+    now = datetime.now(pytz.timezone("US/Pacific"))
+    out_filename = f"{now:%Y%m%d-%H%M%S}-{this_file.stem}.json.gz"
+    out_path = project_root / "data" / out_filename
 
-  print(out_path)
-  with gzip.open(out_path, "wt") as f:
-    bench_backbone_vs_lora(f)
+    print(out_path)
+    with gzip.open(out_path, "wt") as f:
+        bench_backbone_vs_lora(f)
 
 
 if __name__ == "__main__":
-  main()
+    main()
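A note on the helper this benchmark leans on: bench (and gc_torch in the next file) come from benchmarks/benchmark_utils.py, which this commit view does not render. The diff only pins down the call shape, bench(fn, warmup=..., repeat=...) returning an object with .avg() and .std(). Below is a minimal sketch of a compatible CUDA-event-based timer, assuming the latencies are reported in seconds; the actual helper may differ.

# Hypothetical stand-in for benchmarks/benchmark_utils.py (not shown in
# this commit view). Only the call shape is taken from the diff above.
import numpy as np
import torch


class BenchResult:
    def __init__(self, latencies: np.ndarray):
        self.latencies = latencies  # per-iteration latency; seconds assumed

    def avg(self) -> float:
        return float(self.latencies.mean())

    def std(self) -> float:
        return float(self.latencies.std())


def bench(fn, warmup: int = 100, repeat: int = 500) -> BenchResult:
    # Warm up to keep compilation and allocator effects out of the timing.
    for _ in range(warmup):
        fn()
    # CUDA kernels launch asynchronously, so time with CUDA events rather
    # than host-side clocks, and synchronize once at the end.
    start = [torch.cuda.Event(enable_timing=True) for _ in range(repeat)]
    end = [torch.cuda.Event(enable_timing=True) for _ in range(repeat)]
    for i in range(repeat):
        start[i].record()
        fn()
        end[i].record()
    torch.cuda.synchronize()
    ms = np.array([s.elapsed_time(e) for s, e in zip(start, end)])
    return BenchResult(ms / 1e3)  # elapsed_time() reports milliseconds

Timing with time.perf_counter() on the host instead would mostly measure kernel-launch overhead, which is why an event-based timer is the natural fit here.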
184 changes: 99 additions & 85 deletions benchmarks/bench_batch_decode.py
@@ -9,104 +9,118 @@
 from tqdm import tqdm
 
 import punica.ops
-from punica.utils.kvcache import BatchedKvCache, KvCache, KvPool
+from punica import BatchedKvCache, KvCache, KvPool
 
 from .benchmark_utils import bench, gc_torch
 
 
 class batch_decode_Resources:
-
-  def __init__(
-      self,
-      num_heads: int,
-      head_dim: int,
-      block_len: int,
-      seqlens: list[int],
-      dtype: str,
-      device: torch.device,
-  ):
-    dtype = getattr(torch, dtype)
-    self.kvpool = KvPool(
-        num_layers=1,
-        num_heads=num_heads,
-        head_dim=head_dim,
-        capacity=sum((l + block_len - 1) // block_len for l in seqlens),
-        block_len=block_len,
-        dtype=dtype,
-        device=device,
-    )
-    self.q = torch.randn((len(seqlens), num_heads, head_dim),
-                         dtype=dtype,
-                         device=device)
-    kv_list: list[KvCache] = []
-    for seqlen in seqlens:
-      kv_list.append(KvCache(self.kvpool, seqlen))
-    self.kv_list = kv_list
-    self.kv = BatchedKvCache(kv_list)
-
-  def release(self):
-    for kvcache in self.kv_list:
-      kvcache.release()
+    def __init__(
+        self,
+        num_heads: int,
+        head_dim: int,
+        block_len: int,
+        seqlens: list[int],
+        dtype: str,
+        device: torch.device,
+    ):
+        dtype = getattr(torch, dtype)
+        self.kvpool = KvPool(
+            num_layers=1,
+            num_heads=num_heads,
+            head_dim=head_dim,
+            capacity=sum((l + block_len - 1) // block_len for l in seqlens),
+            block_len=block_len,
+            dtype=dtype,
+            device=device,
+        )
+        self.q = torch.randn(
+            (len(seqlens), num_heads, head_dim), dtype=dtype, device=device
+        )
+        kv_list: list[KvCache] = []
+        for seqlen in seqlens:
+            kv_list.append(KvCache(self.kvpool, seqlen))
+        self.kv_list = kv_list
+        self.kv = BatchedKvCache(kv_list)
+
+    def release(self):
+        for kvcache in self.kv_list:
+            kvcache.release()
 
 
 @torch.inference_mode()
 def bench_batch_decode(f):
-  num_heads_ = [32, 40]
-  batch_size_ = [
-      1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 64
-  ]
-  seqlen_ = list(reversed(range(2048, 0, -64)))
-  dtype = "float16"
-  device = torch.device("cuda:0")
-  block_len = 16
-  head_dim = 128
-
-  all_ = list(itertools.product(num_heads_, seqlen_, batch_size_))
-  for num_heads, seqlen, batch_size in (pbar := tqdm(all_)):
-    setup = dict(
-        num_heads=num_heads,
-        head_dim=head_dim,
-        block_len=block_len,
-        seqlen=seqlen,
-        batch_size=batch_size,
-    )
-    pbar.set_postfix(setup)
-    torch.manual_seed(0xabcdabcd987)
-    gc_torch()
-    res = batch_decode_Resources(
-        num_heads=num_heads,
-        head_dim=head_dim,
-        block_len=block_len,
-        seqlens=[seqlen] * batch_size,
-        dtype=dtype,
-        device=device,
-    )
-    latency = bench(
-        lambda: punica.ops.batch_decode(res.q, res.kv, layer_idx=0))
-    res.release()
-
-    result = {
-        "setup": setup,
-        "latency": {
-            "avg": latency.avg(),
-            "std": latency.std()
-        },
-    }
-    f.write(json.dumps(result) + "\n")
-    f.flush()
+    num_heads_ = [32, 40]
+    batch_size_ = [
+        1,
+        2,
+        3,
+        4,
+        5,
+        6,
+        7,
+        8,
+        10,
+        12,
+        14,
+        16,
+        20,
+        24,
+        28,
+        32,
+        40,
+        48,
+        56,
+        64,
+    ]
+    seqlen_ = list(reversed(range(2048, 0, -64)))
+    dtype = "float16"
+    device = torch.device("cuda:0")
+    block_len = 16
+    head_dim = 128
+
+    all_ = list(itertools.product(num_heads_, seqlen_, batch_size_))
+    for num_heads, seqlen, batch_size in (pbar := tqdm(all_)):
+        setup = dict(
+            num_heads=num_heads,
+            head_dim=head_dim,
+            block_len=block_len,
+            seqlen=seqlen,
+            batch_size=batch_size,
+        )
+        pbar.set_postfix(setup)
+        torch.manual_seed(0xABCDABCD987)
+        gc_torch()
+        res = batch_decode_Resources(
+            num_heads=num_heads,
+            head_dim=head_dim,
+            block_len=block_len,
+            seqlens=[seqlen] * batch_size,
+            dtype=dtype,
+            device=device,
+        )
+        latency = bench(lambda: punica.ops.batch_decode(res.q, res.kv, layer_idx=0))
+        res.release()
+
+        result = {
+            "setup": setup,
+            "latency": {"avg": latency.avg(), "std": latency.std()},
+        }
+        f.write(json.dumps(result) + "\n")
+        f.flush()
 
 
 def main():
-  this_file = pathlib.Path(__file__)
-  project_root = this_file.parents[1]
-  now = datetime.now(pytz.timezone("US/Pacific"))
-  out_filename = f"{now:%Y%m%d-%H%M%S}-{this_file.stem}.jsonl.gz"
-  out_path = project_root / "data" / out_filename
+    this_file = pathlib.Path(__file__)
+    project_root = this_file.parents[1]
+    now = datetime.now(pytz.timezone("US/Pacific"))
+    out_filename = f"{now:%Y%m%d-%H%M%S}-{this_file.stem}.jsonl.gz"
+    out_path = project_root / "data" / out_filename
 
-  print(out_path)
-  with gzip.open(out_path, "wt") as f:
-    bench_batch_decode(f)
+    print(out_path)
+    with gzip.open(out_path, "wt") as f:
+        bench_batch_decode(f)
 
 
 if __name__ == "__main__":
-  main()
+    main()
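One expression in this file is worth unpacking: capacity=sum((l + block_len - 1) // block_len for l in seqlens) sizes the paged KV-cache pool at ceil(l / block_len) blocks per sequence, so every sequence gets whole blocks even when its length is not a multiple of block_len. A quick standalone check of the ceiling-division idiom; the block_len and seqlens values here are illustrative, not taken from the benchmark sweep.

import math

# (l + block_len - 1) // block_len is integer ceiling division:
# a sequence of l tokens occupies ceil(l / block_len) fixed-size blocks.
block_len = 16
seqlens = [1, 16, 17, 2048]

capacity = sum((l + block_len - 1) // block_len for l in seqlens)
assert capacity == 1 + 1 + 2 + 128  # 17 tokens spill into a second block
assert capacity == sum(math.ceil(l / block_len) for l in seqlens)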