
Commit

ruff format; ruff check --fix .;
abcdabcd987 committed Nov 22, 2023
1 parent eb3cd4d commit 1495038
Showing 38 changed files with 4,190 additions and 4,039 deletions.
89 changes: 44 additions & 45 deletions benchmarks/bench_backbone_vs_lora.py
@@ -1,70 +1,69 @@
 import gzip
-import itertools
 import json
 import pathlib
 from datetime import datetime
 
-import pytz
 import numpy as np
+import pytz
 import torch
 from tqdm import tqdm
 
-from .benchmark_utils import bench, gc_torch
+from .benchmark_utils import bench
 
 
 @torch.inference_mode()
 def bench_backbone_vs_lora(f):
-  torch.manual_seed(0xabcdabcd987)
-  dtype = torch.float16
-  device = torch.device("cuda:0")
-  h1 = 4096
-  h2 = 11008
-  r = 16
-  bs_list = np.arange(1, 65)
+    torch.manual_seed(0xABCDABCD987)
+    dtype = torch.float16
+    device = torch.device("cuda:0")
+    h1 = 4096
+    h2 = 11008
+    r = 16
+    bs_list = np.arange(1, 65)
 
-  res = dict(
-      backbone_avg=[],
-      backbone_std=[],
-      single_lora_avg=[],
-      single_lora_std=[],
-      multi_lora_avg=[],
-      multi_lora_std=[],
-  )
-  for bs in tqdm(bs_list):
-    w = torch.randn(h1, h2, dtype=dtype, device=device)
-    wa = torch.randn(h1, r, dtype=dtype, device=device)
-    wb = torch.randn(r, h2, dtype=dtype, device=device)
-    x = torch.randn(bs, 1, h1, dtype=dtype, device=device)
+    res = dict(
+        backbone_avg=[],
+        backbone_std=[],
+        single_lora_avg=[],
+        single_lora_std=[],
+        multi_lora_avg=[],
+        multi_lora_std=[],
+    )
+    for bs in tqdm(bs_list):
+        w = torch.randn(h1, h2, dtype=dtype, device=device)
+        wa = torch.randn(h1, r, dtype=dtype, device=device)
+        wb = torch.randn(r, h2, dtype=dtype, device=device)
+        x = torch.randn(bs, 1, h1, dtype=dtype, device=device)
 
-    def muti_lora():
-      for i in range(bs):
-        x[i] @ wa @ wb
+        def muti_lora():
+            for i in range(bs):
+                x[i] @ wa @ wb
 
-    l_backbone = bench(lambda: x @ w, warmup=200, repeat=500)
-    l_single_lora = bench(lambda: x @ wa @ wb, warmup=200, repeat=500)
-    l_multi_lora = bench(muti_lora, warmup=200, repeat=500)
+        l_backbone = bench(lambda: x @ w, warmup=200, repeat=500)
+        l_single_lora = bench(lambda: x @ wa @ wb, warmup=200, repeat=500)
+        l_multi_lora = bench(muti_lora, warmup=200, repeat=500)
 
-    res["backbone_avg"].append(l_backbone.avg())
-    res["backbone_std"].append(l_backbone.std())
-    res["single_lora_avg"].append(l_single_lora.avg())
-    res["single_lora_std"].append(l_single_lora.std())
-    res["multi_lora_avg"].append(l_multi_lora.avg())
-    res["multi_lora_std"].append(l_multi_lora.std())
+        res["backbone_avg"].append(l_backbone.avg())
+        res["backbone_std"].append(l_backbone.std())
+        res["single_lora_avg"].append(l_single_lora.avg())
+        res["single_lora_std"].append(l_single_lora.std())
+        res["multi_lora_avg"].append(l_multi_lora.avg())
+        res["multi_lora_std"].append(l_multi_lora.std())
 
-  json.dump(res, f)
+    json.dump(res, f)
 
 
 def main():
-  this_file = pathlib.Path(__file__)
-  project_root = this_file.parents[1]
-  now = datetime.now(pytz.timezone("US/Pacific"))
-  out_filename = f"{now:%Y%m%d-%H%M%S}-{this_file.stem}.json.gz"
-  out_path = project_root / "data" / out_filename
+    this_file = pathlib.Path(__file__)
+    project_root = this_file.parents[1]
+    now = datetime.now(pytz.timezone("US/Pacific"))
+    out_filename = f"{now:%Y%m%d-%H%M%S}-{this_file.stem}.json.gz"
+    out_path = project_root / "data" / out_filename
 
-  print(out_path)
-  with gzip.open(out_path, "wt") as f:
-    bench_backbone_vs_lora(f)
+    print(out_path)
+    with gzip.open(out_path, "wt") as f:
+        bench_backbone_vs_lora(f)
 
 
 if __name__ == "__main__":
-  main()
+    main()
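A note on the helper this benchmark leans on: bench (and gc_torch in the next file) come from benchmarks/benchmark_utils.py, which this commit view does not render. The diff only pins down the call shape, bench(fn, warmup=..., repeat=...) returning an object with .avg() and .std(). Below is a minimal sketch of a compatible CUDA-event-based timer, assuming the latencies are reported in seconds; the actual helper may differ.

# Hypothetical stand-in for benchmarks/benchmark_utils.py (not shown in
# this commit view). Only the call shape is taken from the diff above.
import numpy as np
import torch


class BenchResult:
    def __init__(self, latencies: np.ndarray):
        self.latencies = latencies  # per-iteration latency; seconds assumed

    def avg(self) -> float:
        return float(self.latencies.mean())

    def std(self) -> float:
        return float(self.latencies.std())


def bench(fn, warmup: int = 100, repeat: int = 500) -> BenchResult:
    # Warm up to keep compilation and allocator effects out of the timing.
    for _ in range(warmup):
        fn()
    # CUDA kernels launch asynchronously, so time with CUDA events rather
    # than host-side clocks, and synchronize once at the end.
    start = [torch.cuda.Event(enable_timing=True) for _ in range(repeat)]
    end = [torch.cuda.Event(enable_timing=True) for _ in range(repeat)]
    for i in range(repeat):
        start[i].record()
        fn()
        end[i].record()
    torch.cuda.synchronize()
    ms = np.array([s.elapsed_time(e) for s, e in zip(start, end)])
    return BenchResult(ms / 1e3)  # elapsed_time() reports milliseconds

Timing with time.perf_counter() on the host instead would mostly measure kernel-launch overhead, which is why an event-based timer is the natural fit here.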
184 changes: 99 additions & 85 deletions benchmarks/bench_batch_decode.py
@@ -9,104 +9,118 @@
 from tqdm import tqdm
 
 import punica.ops
-from punica.utils.kvcache import BatchedKvCache, KvCache, KvPool
+from punica import BatchedKvCache, KvCache, KvPool
 
 from .benchmark_utils import bench, gc_torch
 
 
 class batch_decode_Resources:
-
-  def __init__(
-      self,
-      num_heads: int,
-      head_dim: int,
-      block_len: int,
-      seqlens: list[int],
-      dtype: str,
-      device: torch.device,
-  ):
-    dtype = getattr(torch, dtype)
-    self.kvpool = KvPool(
-        num_layers=1,
-        num_heads=num_heads,
-        head_dim=head_dim,
-        capacity=sum((l + block_len - 1) // block_len for l in seqlens),
-        block_len=block_len,
-        dtype=dtype,
-        device=device,
-    )
-    self.q = torch.randn((len(seqlens), num_heads, head_dim),
-                         dtype=dtype,
-                         device=device)
-    kv_list: list[KvCache] = []
-    for seqlen in seqlens:
-      kv_list.append(KvCache(self.kvpool, seqlen))
-    self.kv_list = kv_list
-    self.kv = BatchedKvCache(kv_list)
-
-  def release(self):
-    for kvcache in self.kv_list:
-      kvcache.release()
+    def __init__(
+        self,
+        num_heads: int,
+        head_dim: int,
+        block_len: int,
+        seqlens: list[int],
+        dtype: str,
+        device: torch.device,
+    ):
+        dtype = getattr(torch, dtype)
+        self.kvpool = KvPool(
+            num_layers=1,
+            num_heads=num_heads,
+            head_dim=head_dim,
+            capacity=sum((l + block_len - 1) // block_len for l in seqlens),
+            block_len=block_len,
+            dtype=dtype,
+            device=device,
+        )
+        self.q = torch.randn(
+            (len(seqlens), num_heads, head_dim), dtype=dtype, device=device
+        )
+        kv_list: list[KvCache] = []
+        for seqlen in seqlens:
+            kv_list.append(KvCache(self.kvpool, seqlen))
+        self.kv_list = kv_list
+        self.kv = BatchedKvCache(kv_list)
+
+    def release(self):
+        for kvcache in self.kv_list:
+            kvcache.release()
 
 
 @torch.inference_mode()
 def bench_batch_decode(f):
-  num_heads_ = [32, 40]
-  batch_size_ = [
-      1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 64
-  ]
-  seqlen_ = list(reversed(range(2048, 0, -64)))
-  dtype = "float16"
-  device = torch.device("cuda:0")
-  block_len = 16
-  head_dim = 128
-
-  all_ = list(itertools.product(num_heads_, seqlen_, batch_size_))
-  for num_heads, seqlen, batch_size in (pbar := tqdm(all_)):
-    setup = dict(
-        num_heads=num_heads,
-        head_dim=head_dim,
-        block_len=block_len,
-        seqlen=seqlen,
-        batch_size=batch_size,
-    )
-    pbar.set_postfix(setup)
-    torch.manual_seed(0xabcdabcd987)
-    gc_torch()
-    res = batch_decode_Resources(
-        num_heads=num_heads,
-        head_dim=head_dim,
-        block_len=block_len,
-        seqlens=[seqlen] * batch_size,
-        dtype=dtype,
-        device=device,
-    )
-    latency = bench(
-        lambda: punica.ops.batch_decode(res.q, res.kv, layer_idx=0))
-    res.release()
-
-    result = {
-        "setup": setup,
-        "latency": {
-            "avg": latency.avg(),
-            "std": latency.std()
-        },
-    }
-    f.write(json.dumps(result) + "\n")
-    f.flush()
+    num_heads_ = [32, 40]
+    batch_size_ = [
+        1,
+        2,
+        3,
+        4,
+        5,
+        6,
+        7,
+        8,
+        10,
+        12,
+        14,
+        16,
+        20,
+        24,
+        28,
+        32,
+        40,
+        48,
+        56,
+        64,
+    ]
+    seqlen_ = list(reversed(range(2048, 0, -64)))
+    dtype = "float16"
+    device = torch.device("cuda:0")
+    block_len = 16
+    head_dim = 128
+
+    all_ = list(itertools.product(num_heads_, seqlen_, batch_size_))
+    for num_heads, seqlen, batch_size in (pbar := tqdm(all_)):
+        setup = dict(
+            num_heads=num_heads,
+            head_dim=head_dim,
+            block_len=block_len,
+            seqlen=seqlen,
+            batch_size=batch_size,
+        )
+        pbar.set_postfix(setup)
+        torch.manual_seed(0xABCDABCD987)
+        gc_torch()
+        res = batch_decode_Resources(
+            num_heads=num_heads,
+            head_dim=head_dim,
+            block_len=block_len,
+            seqlens=[seqlen] * batch_size,
+            dtype=dtype,
+            device=device,
+        )
+        latency = bench(lambda: punica.ops.batch_decode(res.q, res.kv, layer_idx=0))
+        res.release()
+
+        result = {
+            "setup": setup,
+            "latency": {"avg": latency.avg(), "std": latency.std()},
+        }
+        f.write(json.dumps(result) + "\n")
+        f.flush()
 
 
 def main():
-  this_file = pathlib.Path(__file__)
-  project_root = this_file.parents[1]
-  now = datetime.now(pytz.timezone("US/Pacific"))
-  out_filename = f"{now:%Y%m%d-%H%M%S}-{this_file.stem}.jsonl.gz"
-  out_path = project_root / "data" / out_filename
+    this_file = pathlib.Path(__file__)
+    project_root = this_file.parents[1]
+    now = datetime.now(pytz.timezone("US/Pacific"))
+    out_filename = f"{now:%Y%m%d-%H%M%S}-{this_file.stem}.jsonl.gz"
+    out_path = project_root / "data" / out_filename
 
-  print(out_path)
-  with gzip.open(out_path, "wt") as f:
-    bench_batch_decode(f)
+    print(out_path)
+    with gzip.open(out_path, "wt") as f:
+        bench_batch_decode(f)
 
 
 if __name__ == "__main__":
-  main()
+    main()
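One expression in this file is worth unpacking: capacity=sum((l + block_len - 1) // block_len for l in seqlens) sizes the paged KV-cache pool at ceil(l / block_len) blocks per sequence, so every sequence gets whole blocks even when its length is not a multiple of block_len. A quick standalone check of the ceiling-division idiom; the block_len and seqlens values here are illustrative, not taken from the benchmark sweep.

import math

# (l + block_len - 1) // block_len is integer ceiling division:
# a sequence of l tokens occupies ceil(l / block_len) fixed-size blocks.
block_len = 16
seqlens = [1, 16, 17, 2048]

capacity = sum((l + block_len - 1) // block_len for l in seqlens)
assert capacity == 1 + 1 + 2 + 128  # 17 tokens spill into a second block
assert capacity == sum(math.ceil(l / block_len) for l in seqlens)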