2 changes: 1 addition & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ description = "ZeroBand is a production ready codebase for decentralized trainin
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
-    "torch==2.5.1",
+    "torch==2.6.0",
     "numpy",
     "setuptools",
     "transformers>=4.44.2",
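The torch requirement is an exact pin, so the bump from 2.5.1 to 2.6.0 makes older environments fail dependency resolution rather than run against a stale wheel. As an illustrative aside (not part of the PR), a runtime guard along these lines can surface a mismatched environment early; it assumes only `torch.__version__` and the `packaging` helper that ships with pip:

```python
# Sketch: fail fast if the installed torch does not match the new exact pin.
# Assumes only torch.__version__ and the `packaging` library; not part of the PR.
from packaging.version import Version

import torch

PINNED = Version("2.6.0")
installed = Version(torch.__version__.split("+")[0])  # strip local tags like "+cu124"

if installed != PINNED:
    raise RuntimeError(f"zeroband pins torch=={PINNED}, but found {torch.__version__}")
```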
2 changes: 1 addition & 1 deletion src/zeroband/checkpoint.py
@@ -28,7 +28,7 @@
 from torch.distributed.checkpoint.stateful import Stateful
 import warnings
 import logging
-from torch.distributed._tensor.api import DTensor
+from torch.distributed.tensor import DTensor
 from zeroband.utils.state_dict_send_recv import (
     _get_sendable_state_dict,
     recv_state_dict,
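torch 2.6 graduates DTensor to the public `torch.distributed.tensor` namespace, so the private `torch.distributed._tensor.api` import is replaced here and, identically, in diloco.py, utils/__init__.py, and state_dict_send_recv.py below. A minimal sketch of the public API, assuming a CPU/gloo setup launched with torchrun and illustrative shapes; only the import path is what the PR actually touches:

```python
# Sketch of the public DTensor API behind the new import path (torch >= 2.6).
# Launch with torchrun (e.g. 2 processes); mesh size and shapes are illustrative.
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Shard, distribute_tensor

dist.init_process_group("gloo")
mesh = init_device_mesh("cpu", (dist.get_world_size(),))

# Shard a full tensor along dim 0; each rank ends up holding one slice.
full = torch.arange(16, dtype=torch.float32).reshape(4, 4)
dt = distribute_tensor(full, mesh, placements=[Shard(0)])

assert isinstance(dt, DTensor)
print(dt.to_local().shape)  # per-rank local shard

dist.destroy_process_group()
```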
2 changes: 1 addition & 1 deletion src/zeroband/diloco.py
@@ -8,7 +8,7 @@
 from zeroband.utils.logger import get_logger
 from zeroband.config import DilocoConfig
 import torch.distributed as dist
-from torch.distributed._tensor.api import DTensor
+from torch.distributed.tensor import DTensor
 from functools import lru_cache
 
 
2 changes: 1 addition & 1 deletion src/zeroband/models/llama/model.py
@@ -24,7 +24,7 @@
 from torch.nn.attention.flex_attention import create_block_mask, flex_attention, BlockMask, _DEFAULT_SPARSE_BLOCK_SIZE
 from torch.nn.attention import SDPBackend, sdpa_kernel
 
-_flex_attention_compiled = torch.compile(flex_attention, dynamic=False)
+_flex_attention_compiled = torch.compile(flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs")
 
 
 # copied from https://github.com/pytorch/torchtune/blob/f2bd4bc25b24587aef40f486087412b9da8f1d94/torchtune/modules/attention_utils.py#L27
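The model change swaps the compile mode for the pre-compiled flex_attention kernel: "max-autotune-no-cudagraphs" lets Inductor autotune the generated kernels while skipping CUDA-graph capture. A minimal sketch of the same call in isolation, assuming a CUDA device and illustrative shapes (batch 1, 8 heads, 128 tokens, head dim 64):

```python
# Sketch: compile flex_attention with the mode the PR switches to.
# Requires a CUDA device; tensor shapes are illustrative.
import torch
from torch.nn.attention.flex_attention import flex_attention

compiled_flex = torch.compile(flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs")

q, k, v = (torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16) for _ in range(3))

# The first call pays the autotuning cost; later calls with the same shapes reuse the tuned kernel.
out = compiled_flex(q, k, v)
print(out.shape)  # torch.Size([1, 8, 128, 64])
```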
23 changes: 12 additions & 11 deletions src/zeroband/train.py
@@ -5,7 +5,7 @@
 
 import torch
 import torch.distributed as dist
-from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy, CPUOffloadPolicy  # type: ignore
+from torch.distributed.fsdp import fully_shard, MixedPrecisionPolicy, CPUOffloadPolicy  # type: ignore
 from torch.autograd.profiler import record_function
 
 from zeroband.checkpoint import CkptManager, TrainingProgress
@@ -70,10 +70,9 @@ def log_hash_training_state(
         logger.debug(f"outer diloco optimizer hash {id} : {outer_optimizer_hash}")
         logger.debug(f"outer diloco model hash {id} : {outer_model_hash}")
 
-        metrics.update({
-            f"outer_optimizer_hash_{id}": outer_optimizer_hash,
-            f"outer_model_hash_{id}": outer_model_hash
-        })
+        metrics.update(
+            {f"outer_optimizer_hash_{id}": outer_optimizer_hash, f"outer_model_hash_{id}": outer_model_hash}
+        )
     if world_info.rank == 0:
         assert metric_logger is not None
         metric_logger.log(metrics)
@@ -142,13 +141,11 @@ def train(config: Config):
         apply_ac_ckpt(model, num)
 
     elastic_device_mesh = ElasticDeviceMesh(
-        enable=config.diloco is not None,
-        live_recovery_rank_src=config.ckpt.live_recovery_rank_src
+        enable=config.diloco is not None, live_recovery_rank_src=config.ckpt.live_recovery_rank_src
     )
 
     mp_policy = MixedPrecisionPolicy(
-        param_dtype=torch.bfloat16,
-        reduce_dtype=torch.float32 if config.train.reduce_fp32 else None
+        param_dtype=torch.bfloat16, reduce_dtype=torch.float32 if config.train.reduce_fp32 else None
    )
 
     offload_policy = CPUOffloadPolicy(pin_memory=True) if config.train.fsdp_cpu_offload else None
@@ -365,9 +362,13 @@ def train(config: Config):
 
         with sw.record_block("Loss allreduce()"):
             # Launch both allreduces at the same time to hide latency
-            loss_allreduce = dist.all_reduce(tensor=loss_batch, op=dist.ReduceOp.AVG, group=elastic_device_mesh.local_pg, async_op=True)
+            loss_allreduce = dist.all_reduce(
+                tensor=loss_batch, op=dist.ReduceOp.AVG, group=elastic_device_mesh.local_pg, async_op=True
+            )
             if config.optim.z_loss:
-                z_loss_allreduce = dist.all_reduce(tensor=z_loss_batch, op=dist.ReduceOp.AVG, group=elastic_device_mesh.local_pg, async_op=True)
+                z_loss_allreduce = dist.all_reduce(
+                    tensor=z_loss_batch, op=dist.ReduceOp.AVG, group=elastic_device_mesh.local_pg, async_op=True
+                )
 
             assert isinstance(loss_allreduce, torch.distributed.Work)
             loss_allreduce.wait()
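Apart from the formatter-style reflows in the later hunks, the behavioural change in train.py mirrors the DTensor move: torch 2.6 exposes the FSDP2 entry points `fully_shard`, `MixedPrecisionPolicy`, and `CPUOffloadPolicy` from the public `torch.distributed.fsdp` package rather than `torch.distributed._composable.fsdp`. A minimal sketch of that public API under assumed conditions (toy two-layer model, single 1-D mesh, NCCL backend); this is not the ZeroBand wrapping code:

```python
# Sketch: FSDP2 wrapping via the public torch.distributed.fsdp API (torch >= 2.6).
# Launch with torchrun on GPUs; the toy model and shapes are illustrative.
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.fsdp import CPUOffloadPolicy, MixedPrecisionPolicy, fully_shard

dist.init_process_group("nccl")
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
mesh = init_device_mesh("cuda", (dist.get_world_size(),))

model = nn.Sequential(nn.Linear(1024, 4096), nn.GELU(), nn.Linear(4096, 1024)).cuda()

# Parameters are communicated in bf16, gradient reduction stays in fp32,
# matching the mp_policy built in train.py.
mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)

# Shard the parameterized submodules first, then the root, as FSDP2 expects.
# An offload_policy=CPUOffloadPolicy(pin_memory=True) could also be passed when CPU offload is wanted.
for submodule in (model[0], model[2]):
    fully_shard(submodule, mesh=mesh, mp_policy=mp_policy)
fully_shard(model, mesh=mesh, mp_policy=mp_policy)

out = model(torch.randn(8, 1024, device="cuda"))
print(out.shape)

dist.destroy_process_group()
```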
4 changes: 2 additions & 2 deletions src/zeroband/utils/__init__.py
@@ -3,7 +3,7 @@
 import time
 import torch
 from torch.distributed.fsdp import ShardingStrategy
-from torch.distributed._tensor.api import DTensor
+from torch.distributed.tensor import DTensor
 from distributed_shampoo import DistributedShampoo
 
 
@@ -193,4 +193,4 @@ def __init__(self):
         self.pad_token_id = 2
 
     def __len__(self):
-        return self.vocab_size
\ No newline at end of file
+        return self.vocab_size
2 changes: 1 addition & 1 deletion src/zeroband/utils/state_dict_send_recv.py
@@ -2,7 +2,7 @@
 import pickle
 import torch
 from torch.distributed import ProcessGroup
-from torch.distributed._tensor.api import DTensor
+from torch.distributed.tensor import DTensor
 
 
 def _object_to_tensor(obj):