import argparse
import math
import os
import time
from contextlib import contextmanager
from typing import Optional

import torch
import torch.distributed as dist
import torch.nn as nn
from torch._dynamo import config as dynamo_config
from torch.nn.parallel import DistributedDataParallel as DDP

from torchao.dtypes.nf4tensor import linear_nf4, to_nf4
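
# Minimal DDP training example: each LoRALinear wraps a frozen NF4-quantized base
# weight (torchao) plus trainable LoRA adapters, trained under DistributedDataParallel.
# Assumed launch command (script name and GPU count are illustrative, not from the source):
#   torchrun --nproc_per_node=2 ddp_nf4.py --compile --optimize_ddp python_reducer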


class LoRALinear(nn.Module):
    """Linear layer with a frozen NF4-quantized base weight and a trainable LoRA adapter."""

    def __init__(
        self,
        hidden_dim: int,
        lora_rank: Optional[int] = None,
        lora_alpha: float = 16,
        dtype: torch.dtype = torch.float32,
    ):
        super().__init__()
        self.hidden_dim = hidden_dim
        if lora_rank is None:
            lora_rank = hidden_dim // 2

        weight = torch.randn(hidden_dim, hidden_dim, dtype=dtype)
        self.lora_rank = lora_rank
        self.lora_alpha = lora_alpha
        # The base weight is quantized to NF4 and frozen; only the adapters are trained.
        self.register_parameter(
            "weight", nn.Parameter(to_nf4(weight), requires_grad=False)
        )
        self.lora_a = nn.Linear(
            in_features=hidden_dim, out_features=self.lora_rank, bias=False
        )
        self.lora_b = nn.Linear(
            in_features=self.lora_rank, out_features=hidden_dim, bias=False
        )
        self.initialize_parameters()

    def initialize_parameters(self):
        # Both adapter matrices are randomly initialized here; note that canonical
        # LoRA initializes lora_b to zero so the adapter starts as a no-op.
        nn.init.kaiming_uniform_(self.lora_a.weight, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.lora_b.weight, a=math.sqrt(5))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # out = x @ dequant(W_nf4).T + (alpha / rank) * lora_b(lora_a(x))
        out = linear_nf4(input=x, weight=self.weight)
        lora_out = self.lora_a(x)
        lora_out = (self.lora_alpha / self.lora_rank) * self.lora_b(lora_out)
        return out + lora_out


def _init_model(dim, num_linears, device, dtype) -> nn.Module:
    # Construct the stack of LoRALinear blocks directly on the target device.
    with torch.device(device):
        modules = []
        for _ in range(num_linears):
            modules.append(LoRALinear(hidden_dim=dim, dtype=dtype))
        seq = nn.Sequential(*modules)

    return seq


def dist_print(*args, delay=0.5):
    # Stagger prints by rank so output from different ranks does not interleave.
    rank = dist.get_rank()
    time.sleep(delay * rank)
    print(f"[rank{rank}]: ", *args, flush=True)


def make_batch(global_bs, dim, dtype, device):
    # Create a global batch and keep only this rank's shard along the batch dimension.
    batch = torch.randn((global_bs, dim), dtype=dtype, device=device)
    if dist.get_world_size() > 1:
        batch = batch.chunk(dist.get_world_size(), dim=0)[dist.get_rank()]
    return batch


def run_ddp(global_bs, dim, num_linears, device, dtype, num_steps, save_dir, compile):
    os.makedirs(save_dir, exist_ok=True)
    model = _init_model(dim, num_linears, device, dtype)
    # Wrap in DDP so gradients are averaged across ranks on every backward pass.
    model = DDP(model, device_ids=[device])

    if compile:
        model = torch.compile(model)
    optim = torch.optim.Adam(model.parameters(), lr=1e-2)

    losses = []

    for _ in range(num_steps):
        inp = make_batch(global_bs, dim, dtype, device)
        loss = model(inp).sum()
        losses.append(loss)
        loss.backward()
        optim.step()
        optim.zero_grad()

    dist.barrier()

    save_path = f"{save_dir}/ddp-{dist.get_rank()}.pt"
    torch.save(model.state_dict(), save_path)
    dist_print("Saved model to", save_path)
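    # DDP broadcasts parameters from rank 0 at construction and all-reduces gradients
    # every step, so the per-rank checkpoints saved above should contain identical
    # weights; comparing them is a quick sanity check (not performed by this script).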


def init_dist():
    dist.init_process_group(backend="nccl")
    # NOTE: assumes one process per GPU on a single node, so the global rank doubles
    # as the local device index (a multi-node launch would use LOCAL_RANK instead).
    torch.cuda.set_device(dist.get_rank())
    dist_print("Dist initialized with world size", dist.get_world_size())


def cleanup_dist():
    dist.barrier()
    if dist.get_rank() == 0:
        print("Cleaning up dist")
    dist.destroy_process_group()


@contextmanager
def distributed_context():
    init_dist()
    try:
        yield
    finally:
        # Tear down the process group even if the wrapped block raises.
        cleanup_dist()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--global_bs", type=int, default=8)
    parser.add_argument("--dim", type=int, default=128)
    parser.add_argument("--num_linears", type=int, default=1)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--dtype", type=str, default="float32")
    parser.add_argument("--num_steps", type=int, default=3)
    parser.add_argument("--save_dir", type=str, default="checkpoints")
    parser.add_argument("--compile", action="store_true")
    parser.add_argument("--optimize_ddp", type=str, default="ddp_optimizer")
    args = parser.parse_args()

    # Resolve the dtype string (e.g. "float32") to the corresponding torch.dtype.
    args.dtype = getattr(torch, args.dtype)
    # "ddp_optimizer" splits compiled graphs at DDP bucket boundaries;
    # "python_reducer" instead relies on compiled autograd for gradient reduction.
    dynamo_config.optimize_ddp = args.optimize_ddp

    if args.optimize_ddp == "python_reducer":
        dynamo_config.compiled_autograd = True

    with distributed_context():
        torch.manual_seed(args.seed)
        run_ddp(
            global_bs=args.global_bs,
            dim=args.dim,
            num_linears=args.num_linears,
            device=args.device,
            dtype=args.dtype,
            num_steps=args.num_steps,
            save_dir=args.save_dir,
            compile=args.compile,
        )