diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 310e7160..00000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "third_party/gloo"]
-	path = third_party/gloo
-	url = https://github.com/facebookincubator/gloo.git
diff --git a/INTELLECT_1_Technical_Report.pdf b/INTELLECT_1_Technical_Report.pdf
deleted file mode 100644
index 60bc6500..00000000
Binary files a/INTELLECT_1_Technical_Report.pdf and /dev/null differ
diff --git a/README.md b/README.md
index 776487e8..506a9ec6 100644
--- a/README.md
+++ b/README.md
@@ -1,189 +1,9 @@
 # prime - decentralized training at scale
-prime (previously called ZeroBand) is a framework for efficient, globally distributed training of AI models over the internet.
+this is a fork of prime that removes the distributed parts, just to debug the core stuff
-https://github.com/user-attachments/assets/c034d2a2-400c-4bf8-acd0-c84b6c897d69
-
-## Key Features
-
-- **`ElasticDeviceMesh` for Fault Tolerant Training:**
-  - In Prime, we've added a new distributed abstraction called `ElasticDeviceMesh` which encapsulates dynamic global process groups for fault-tolerant communication across the internet and local process groups for communication within a node or datacenter.
-  - The `ElasticDeviceMesh` manages the resizing of the global process groups when nodes join or leave, unlike the standard `DeviceMesh` in torch distributed, which will crash and require a cold restart to resize the process group.
-  - In order to know when to resize the process groups, we use a heartbeat mechanism to discover dead nodes and remove them from the process group. Crashing nodes will attempt a best-effort deathrattle to fail their own heartbeat quickly, saving their comrades the timeout.
-- **Asynchronous distributed checkpointing**
-  - Due to the size of the model, checkpointing can be an expensive operation, taking up to 20 minutes on the nodes we tested. This would reduce our compute utilisation if it blocked the main training process.
-  - In order to minimize the blocking time, we first checkpoint into `/dev/shm`, which is a RAM-backed filesystem. This operation is much faster and we can unblock the main training process once the checkpoint has been created in `/dev/shm`.
-  - We then use two subprocesses to asynchronously copy the checkpoint out of `/dev/shm` into the checkpoint directory on disk as well as upload it to the remote.
-- **Live checkpoint recovery**
-  - Nodes that wish to join the run mid-training need to be able to get the most recent state of the model and optimiser before being able to contribute to the training. They must complete this operation in the time window between two outer steps, otherwise the checkpoint they receive would be stale.
-  - In order to do this quickly, we have the joining nodes request the checkpoints from their peers, which all host a sidecar HTTP server serving the latest checkpoint out of `/dev/shm`.
-  - Once the joining node has downloaded and initialized the model, it skips the inner steps and joins the outer step with zero pseudo-gradients. This is to prevent the joining node from stalling the existing nodes. If the joining node also performed the inner steps, it would be late to the outer step by the time it took to download and load the checkpoint, reducing the cluster's compute utilisation.
-- **Custom Int8 All-Reduce Kernel**
-  - In our experiments, we found that we are able to perform int8 quantization on the pseudo-gradients without any impact on the loss curves. This means that we can reduce the payload size of each outer step all-reduce by 4x if we communicate the pseudo-gradients in int8 instead of fp32.
-  - However, we need to accumulate the reduction in fp32, dequantizing and re-quantizing intermediate results during the all-reduce. This is not supported by any collective communication libraries.
-  - We thus implemented our own fully pipelined ring-reduce kernel in C++ which is JIT compiled as a custom operator using the torch library.
-  - However, with the amount of quantization work we needed to perform, using the torch ops (`quantize_per_tensor`, `scatter_add`, `index`, etc.) was too slow, resulting in underutilisation of our target network bandwidth of 4 Gbps.
-  - We thus implemented our own multithreaded uint8 ops in C++ to perform the quantization and dequantization operations, improving the quantization speed by more than 60x.
-- **Maximising bandwidth utilization:**
-  - By sharding our DiLoCo pseudo-gradients within a node, we can maximise network bandwidth utilization by opening multiple connections at the same time when performing the all-reduce. This yielded a transfer speed improvement of 8x on some nodes.
-  - Relying on public IP forwarding resulted in poor or unstable p2p bandwidth on some compute providers. To mitigate this, we employ VPN technology to optimize peer-to-peer connections between nodes, allowing us to better utilize the available internet bandwidth between nodes by modifying the routing of packets through the internet.
-  - We've improved bandwidth utilization between nodes in similar data center settings by up to 40x compared to our OpenDiLoCo release, achieving up to 4Gb/s connections between data centers across the whole United States.
-- **PyTorch FSDP2 / DTensor ZeRO-3 implementation**
-  - In order to fit the 10B model training within our given memory resources, we had to shard the model weights, gradients and optimizer states between intra-node GPUs.
-  - We achieved this using the `fully_shard` API from PyTorch FSDP2 which wraps the model parameters as `DTensor`s and registers hooks to schedule all-gather and reduce-scatter on the tensors when they are used. FSDP2 also optimizes the collectives by bucketing the parameters into `FSDPParamGroup`s. This allows us to execute the collectives on larger tensors, improving the protocol-to-payload ratio and the overlap from pipelining. We employ the same trick for our pseudo-gradients, bucketing them by layer.
-- **CPU Off-Loading**
-  - Our DiLoCo optimizer does not add any GPU overhead. All the tensors required by the DiLoCo optimizer are offloaded to CPU memory.
-  - Since we only perform a global sync every few hundred steps, the reduced speed of copying and calculating the pseudo-gradients on the CPU is negligible relative to the time to execute the inner steps and all-reduce.
-
-A research paper about the framework and our INTELLECT-1 10B experiment can be found [here](https://arxiv.org/abs/2412.01152).
-
-## Getting Started
-
-For an easy install that also downloads the data:
-
-```
-curl -sSL https://raw.githubusercontent.com/PrimeIntellect-ai/prime/main/scripts/install/install.sh | bash
-```
-
-Step by step:
-
-
-1. Clone:
-
-```bash
-git clone git@github.com:PrimeIntellect-ai/prime.git
-```
-
-2. Install `uv`:
-
-```bash
-curl -LsSf https://astral.sh/uv/install.sh | sh
-source $HOME/.local/bin/env
 ```
-
-3. Set up the environment:
-```bash
-sudo apt install iperf -y
-uv venv
-source .venv/bin/activate
-uv sync --extra all
-git submodule update --init --recursive
-```
-
-
-4. Log into Hugging Face:
-```bash
-huggingface-cli login
+curl -sSL https://raw.githubusercontent.com/samsja/prime/main/install.sh | bash
 ```
-5. Download the data:
-```
-mkdir -p datasets
-uv run python scripts/subset_data.py --dataset_name PrimeIntellect/fineweb-edu --data_world_size 1 --data_rank 0 --max_shards 32
-mv fineweb-edu/ datasets/fineweb-edu/
-```
-
-
-### Quick Check
-Verify your setup:
-```bash
-GLOO_SOCKET_IFNAME=lo GLOBAL_ADDR=localhost GLOBAL_RANK=0 GLOBAL_UNIQUE_ID=0 GLOBAL_WORLD_SIZE=1 GLOBAL_PORT=8989 uv run torchrun --nproc_per_node=2 src/zeroband/train.py @configs/debug/diloco.toml
-```
-
-## Usage
-
-### Running DiLoCo
-
-To test DiLoCo locally, you can use the helper script `scripts/simulate_multi_node_diloco.sh`:
-
-```bash
-# Using 4 GPUs (2 diloco workers, each across 2 GPUs)
-ZERO_BAND_LOG_LEVEL=DEBUG ./scripts/simulate_multi_node_diloco.sh 2 2 src/zeroband/train.py @configs/debug/diloco.toml
-
-# Using 2 GPUs (2 diloco workers, each on a single GPU)
-ZERO_BAND_LOG_LEVEL=DEBUG ./scripts/simulate_multi_node_diloco.sh 2 1 src/zeroband/train.py @configs/debug/diloco.toml
-```
-
-### Running Tests

-Ensure you have at least two GPUs to run the full test suite:
-```bash
-uv run pytest
-```
-
-
-### Eval
-
-To evaluate, you first need to convert the checkpoint to a Hugging Face-compatible model.
-
-```bash
-uv run python scripts/export_dcp.py @configs/10B/H100.toml --ckpt.path CONVERTED_MODEL_PATH --ckpt.resume CHECKPOINT_PATH --torch_dtype bfloat16 --ckpt.interval 1
-```
-
-
-```
-uv run accelerate launch -m lm_eval --model hf --model_args pretrained=CONVERTED_MODEL_PATH,add_bos_token=True --tasks hellaswag --num_fewshot 10
-```
-
-
-## Environment variables
-### Global Store Initialization
-| Environment Variable | Description | Default Value |
-|-----------------------|--------------------------------------------------|---------------|
-| `GLOBAL_UNIQUE_ID` | Unique identifier of the worker in the global store. | `None` |
-| `GLOBAL_ADDR` | IP address of the global store. | `None` |
-| `GLOBAL_PORT` | Port number of the global store. | `None` |
-| `GLOBAL_WORLD_SIZE` | The size of the global process group. | `1` |
-| `GLOBAL_RANK` | Rank of the process in the global process group. | `0` |
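The table above describes the rendezvous contract between workers. As a hedged sketch of how these variables can be consumed, mirroring the pattern in `scripts/simple_gloo.py` (removed further down in this diff) rather than the actual `ElasticDeviceMesh` code path:

```python
import os

import torch.distributed as dist

# Illustrative only: read the global-store variables documented above.
addr = os.environ["GLOBAL_ADDR"]
port = int(os.environ["GLOBAL_PORT"])
rank = int(os.environ.get("GLOBAL_RANK", "0"))
world_size = int(os.environ.get("GLOBAL_WORLD_SIZE", "1"))

# Rank 0 hosts the TCP key-value store; every other rank connects to it.
store = dist.TCPStore(host_name=addr, port=port, is_master=(rank == 0), world_size=world_size)

# A Gloo process group (CPU collectives, internet-friendly) built on the shared store.
pg = dist.distributed_c10d.ProcessGroupGloo(store, rank, world_size)
```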
-
-### Elastic Device Mesh Configuration
-| Environment Variable | Description | Default Value |
-|-----------------------|--------------------------------------------------|---------------|
-| `ZERO_BAND_LOG_LEVEL` | Enable debug log lines | `False` |
-| `ZERO_BAND_GLOBAL_STORE_TIMEOUT_SECONDS` | Number of seconds before the global store operations time out | `300` |
-| `ZERO_BAND_GLOBAL_PG_TIMEOUT_SECONDS` | Number of seconds before the global process group operations time out | `600` |
-| `ZERO_BAND_GLOBAL_STORE_POLLING_INTERVAL_SECONDS` | Number of seconds between polls to the store when waiting for values | `0.1` |
-| `ZERO_BAND_EDM_HEARTBEAT_INTERVAL_SECONDS` | Interval in seconds between heartbeats | `2` |
-| `ZERO_BAND_EDM_HEARTBEAT_TIMEOUT_SECONDS` | Time in seconds after which a node is considered dead if no heartbeat is received | `10` |
-| `ZERO_BAND_LIVE_RECO_PORT` | Port number for the live recovery server | random |
-| `ZERO_BAND_LIVE_RECO_ADDR` | IP address for the live recovery server | `localhost` |
-
-## Troubleshooting
-
-If you encounter any dataset loading errors at the beginning of training, try setting:
-
-```bash
-export HF_HUB_ETAG_TIMEOUT=500
-```
-
-## Pre-downloading datasets
-Streaming datasets from the Hugging Face Hub can sometimes result in HTTP 443 errors which will crash the training process.
-To avoid them, you can pre-download the dataset.
-
-Here is an example that downloads all the files in `PrimeIntellect/fineweb-edu` which are used by `data_rank` 5 in a training with a `data_world_size` of 12.
-```bash
-python3 scripts/subset_data.py --dataset_name PrimeIntellect/fineweb-edu --data_world_size 12 --data_rank 5
-```
-
-For info about the arguments to the script, do:
-```bash
-python3 scripts/subset_data.py --help
-```
-
-# Exporting checkpoints to huggingface compatible model
-You can convert the checkpoints saved by the training script to a model that can be run with any huggingface-compatible inference engine (e.g. transformers, vLLM) using our export script.
-The export script takes the training config as a positional argument and two keyword arguments: `ckpt.resume`, the path to the checkpoint, and `ckpt.path`, the path where you wish to save the converted model.
-You may also set the `torch_dtype` argument to either `float32` or `bfloat16` to specify the precision of the exported model weights. The default `torch_dtype` is `float32`.
-
-Example export command:
-```bash
-python scripts/export_dcp.py @configs/10B/H100.toml --ckpt.path /path/to/save/converted_model --ckpt.resume /path/to/ckpt/step_84000 --torch_dtype bfloat16
-```
-
-You can then upload the model to huggingface using huggingface-cli:
-```bash
-# Usage: huggingface-cli upload [repo_id] [local_path] [path_in_repo]
-huggingface-cli upload username/mymodel /path/to/save/converted_model . --private
-```
-The repo will be created if `repo_id` does not exist. The `--private` flag will create the repo as a private repo and can be omitted to create a publicly accessible repo.
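As a usage note on the export flow described in the removed README above: a converted checkpoint should load like any other Hugging Face model. A minimal smoke test (the path is illustrative, matching the `--ckpt.path` used at export time) might look like:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative path: wherever --ckpt.path pointed during export.
model_path = "/path/to/save/converted_model"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)

# Generate a few tokens to sanity-check the exported weights.
inputs = tokenizer("The capital of France is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```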
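Since this fork strips the custom C++ compression ops, a pure-PyTorch reference of the uint8 scheme they implement can be handy when debugging the core training loop. The sketch below mirrors `uniform_8bit_quantize` from `src/zeroband/C/csrc/compression.cpp` (deleted later in this diff): the scale is derived from an unbiased standard-deviation estimate so that the 256 bins span roughly six sigma, and dequantization goes through a per-bin-average lookup table. The function name and round-trip check are illustrative, not part of the codebase:

```python
import torch


def uniform_8bit_quantize_reference(tensor: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    n_bins = 256
    range_in_sigmas = 6.0  # the 256 bins cover roughly +/- 3 sigma, as in compression.cpp

    # Unbiased std estimate of a (roughly zero-centered) tensor, computed
    # from the norm the same way the C++ kernel does.
    std_unbiased = tensor.norm() / (tensor.numel() - 1) ** 0.5
    scale = range_in_sigmas * std_unbiased / n_bins

    # Affine quantization with the zero point at the middle bin (128).
    quantized = torch.clamp(torch.round(tensor / scale) + n_bins // 2, 0, 255).to(torch.uint8)

    # Dequantization lookup table: the average original value falling in each bin.
    flat_idx = quantized.flatten().long()
    lookup = torch.zeros(n_bins, dtype=tensor.dtype)
    lookup.scatter_add_(0, flat_idx, tensor.flatten())
    counts = torch.bincount(flat_idx, minlength=n_bins).clamp(min=1)
    lookup /= counts.to(lookup.dtype)
    return quantized, lookup


# Round trip: dequantize by indexing the lookup table with the uint8 codes.
x = torch.randn(1024)
q, lut = uniform_8bit_quantize_reference(x)
x_hat = lut[q.long()]
```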
diff --git a/configs/10B/H100.toml b/configs/10B/H100.toml deleted file mode 100644 index d743cc8a..00000000 --- a/configs/10B/H100.toml +++ /dev/null @@ -1,41 +0,0 @@ -name_model = "10B" -project = "10B_zero_band" -wandb_resume = false - -[train] -micro_bs = 1 -ac_ckpt = true - -[optim] -sched_type = "wsd-sqrt" -batch_size = 128 #1M tokens bs -warmup_steps = 1000 -total_steps = 1_000_000_000_000 - - -z_loss = true - -[optim.optim] -lr = 7.5e-5 -betas1 = 0.9 -betas2 = 0.95 -weight_decay = 0.1 - -[data] -seq_length = 8192 -dataset_name_or_paths = "/data/datasets/fineweb-edu,/data/datasets/fineweb,/data/datasets/StackV1-popular,/data/datasets/dclm-baseline-1.0-parquet,/data/datasets/open-web-math" -dataset_ratio = "55:10:20:10:5" -num_workers = 4 -reverse_data_files = true -split_by_data_rank = false # the 10b training assume that data was already split by datarank. Keeping this for backward compatibility - - -[diloco] -inner_steps = 100 -compression = "uint8" - -[ckpt] -interval = 100 -topk = 40 -path = "/data/10B" -remote_data_path = "/data/10B_data_ckpt" diff --git a/configs/10B/H100_cooldown.toml b/configs/10B/H100_cooldown.toml deleted file mode 100644 index c443e0ed..00000000 --- a/configs/10B/H100_cooldown.toml +++ /dev/null @@ -1,40 +0,0 @@ -name_model = "10B" -project = "10B_zero_band" -wandb_resume = false - -[train] -micro_bs = 1 -ac_ckpt = true - -[optim] -sched_type = "wsd-sqrt" -batch_size = 128 #1M tokens bs -warmup_steps = 1000 -stable_steps = 74700 -total_steps = 90400 - -z_loss = true - -[optim.optim] -lr = 7.5e-5 -betas1 = 0.9 -betas2 = 0.95 -weight_decay = 0.1 - -[data] -seq_length = 8192 -dataset_name_or_paths = "/data/datasets/fineweb-edu,/data/datasets/fineweb,/data/datasets/StackV1-popular" -dataset_ratio = "80:10:10" -num_workers = 4 -reverse_data_files = false -split_by_data_rank = false # the 10b training assume that data was already split by datarank. 
Keeping this for backward compatibility - -[diloco] -inner_steps = 100 -compression = "uint8" - -[ckpt] -interval = 100 -topk = 40 -path = "/data/10B" -remote_data_path = "/data/10B_data_ckpt" diff --git a/configs/10B/H100_devel.toml b/configs/10B/H100_devel.toml deleted file mode 100644 index 45529fc0..00000000 --- a/configs/10B/H100_devel.toml +++ /dev/null @@ -1,32 +0,0 @@ -name_model = "10B" # "26B" -type_model = "llama2" - -project = "debug_I2_zero_band" -run_name = "testing :3" - -metric_logger_type = "dummy" # "wandb" -log_level = "DEBUG" - - -[train] -micro_bs = 1 -ac_ckpt = true -torch_profiler = false -torch_compile = true -fused_linear_ce = true -fsdp_cpu_offload = true - -[train.memory_profiler] -freq = 1 -snapshot_dir = "logs/" - -[optim] -sched_type = "wsd-sqrt" -batch_size = 128 -warmup_steps = 0 -total_steps = 1 # 2_000 -z_loss = true - -[data] -seq_length = 8192 -num_workers = 4 diff --git a/configs/13B/H100.toml b/configs/13B/H100.toml deleted file mode 100644 index 4bfc3e05..00000000 --- a/configs/13B/H100.toml +++ /dev/null @@ -1,17 +0,0 @@ -name_model = "13B" -project = "debug_13B_zero_band" - -[train] -micro_bs = 1 -ac_ckpt = true - -[optim] -batch_size = 1024 #2M tokens bs -warmup_steps = 1000 -total_steps = 88_000 - -[optim.optim] -lr = 3e-4 - -[data] -seq_length = 2048 \ No newline at end of file diff --git a/configs/70M/H100.toml b/configs/70M/H100.toml index 3d077a30..c7ddfaf5 100644 --- a/configs/70M/H100.toml +++ b/configs/70M/H100.toml @@ -1,16 +1,16 @@ name_model = "70M" -project = "debug_70m_zero_band" +project = "70m_prime_simple" type_model = "llama2" [train] -micro_bs = 128 # change this base on the gpu +micro_bs = 64 # change this base on the gpu reshard_after_forward = false [optim] batch_size = 512 -warmup_steps = 1000 -total_steps = 88_000 +warmup_steps = 500 +total_steps = 4000 [optim.optim] -lr = 4e-4 +lr = 1e-3 diff --git a/configs/7B_diloco/H100.toml b/configs/7B_diloco/H100.toml deleted file mode 100644 index b6a84d2c..00000000 --- a/configs/7B_diloco/H100.toml +++ /dev/null @@ -1,25 +0,0 @@ -name_model = "7B" -project = "debug_7B_zero_band" -type_model = "llama2" - -[train] -micro_bs = 1 - -[optim] -batch_size = 1024 #2M tokens bs -warmup_steps = 1000 -total_steps = 88_000 - -[optim.optim] -lr = 3e-4 - -[data] -seq_length = 2048 - -[diloco] -inner_steps = 50 - -[ckpt] -path = "/data/outputs_1b_diloco_50" -interval = 1000 - diff --git a/configs/debug/diloco.toml b/configs/debug/diloco.toml deleted file mode 100644 index c98e4603..00000000 --- a/configs/debug/diloco.toml +++ /dev/null @@ -1,19 +0,0 @@ -name_model = "debugmodel" -project = "/tmp/debug" -metric_logger_type = "dummy" -type_model = "llama2" - -[train] -micro_bs = 8 - -[optim] -batch_size = 16 -warmup_steps = 10 -total_steps = 4 - -[data] -fake = true - -[diloco] -inner_steps = 5 - diff --git a/configs/debug/normal.toml b/configs/debug/normal.toml index cd64084c..907e2e9f 100644 --- a/configs/debug/normal.toml +++ b/configs/debug/normal.toml @@ -1,6 +1,5 @@ name_model = "debugmodel" -project = "/tmp/debug" -metric_logger_type = "dummy" +wandb = false type_model = "llama2" [train] diff --git a/scripts/install/install.sh b/install.sh similarity index 94% rename from scripts/install/install.sh rename to install.sh index e96c9309..e89688f6 100755 --- a/scripts/install/install.sh +++ b/install.sh @@ -27,7 +27,7 @@ main() { sudo apt install iperf -y log_info "Cloning repository..." 
- git clone https://github.com/PrimeIntellect-ai/prime.git + git clone https://github.com/samsja/prime.git log_info "Entering project directory..." cd prime @@ -41,13 +41,13 @@ main() { fi log_info "Creating virtual environment..." - uv venv + uv venv --python 3.10 log_info "Activating virtual environment..." source .venv/bin/activate log_info "Installing dependencies..." - uv sync --extra all + uv sync log_info "Updating git submodules..." git submodule update --init --recursive @@ -60,4 +60,4 @@ main() { log_info "Installation completed! You can double check that everything is install correctly by running 'GLOO_SOCKET_IFNAME=lo GLOBAL_ADDR=localhost GLOBAL_RANK=0 GLOBAL_UNIQUE_ID=0 GLOBAL_WORLD_SIZE=1 GLOBAL_PORT=8989 uv run torchrun --nproc_per_node=2 src/zeroband/train.py @configs/debug/diloco.toml'" } -main \ No newline at end of file +main diff --git a/pyproject.toml b/pyproject.toml index d1917715..1ecc0fe1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,32 +5,17 @@ description = "ZeroBand is a production ready codebase for decentralized trainin readme = "README.md" requires-python = ">=3.10" dependencies = [ - "torch==2.5.1", + "torch==2.6.0", "numpy", "setuptools", "transformers>=4.44.2", "datasets>=3.0.0", "pydantic_config @ git+https://github.com/samsja/pydantic_config.git@b7becc3", "torchdata>=0.8.0", - "fsspec[gcs]>=2024.3.1", "ninja", "zstandard", "pyarrow", - "toposolve>=0.1.17", - "psutil", - "torch-shampoo @ git+https://github.com/facebookresearch/optimizers.git@main", - "liger-kernel-nightly>=0.5.2.dev20250122195349", -] - -[project.optional-dependencies] - - -all = [ - "wandb", - "asyncio>=3.4.3", - "aiohttp>=3.10.5", - "requests>=2.32.3", - "lm-eval" + "wandb" ] diff --git a/scripts/all_reduce.py b/scripts/all_reduce.py deleted file mode 100644 index 2d99b418..00000000 --- a/scripts/all_reduce.py +++ /dev/null @@ -1,69 +0,0 @@ -from pydantic_config import BaseConfig, parse_argv -import torch -from torch.distributed import destroy_process_group, init_process_group, ReduceOp -import torch.utils.benchmark as benchmark - -from zeroband.collectives import Compression, all_reduce -from zeroband.utils.world_info import get_world_info -from zeroband.utils.logger import get_logger - -from enum import Enum - - -class TorchDtype(str, Enum): - FLOAT32 = "float32" - FLOAT16 = "float16" - BFLOAT16 = "bfloat16" - UINT8 = "uint8" - - -TORCH_DTYPE_MAP = { - None: None, - TorchDtype.FLOAT32: torch.float32, - TorchDtype.FLOAT16: torch.float16, - TorchDtype.BFLOAT16: torch.bfloat16, - TorchDtype.UINT8: torch.uint8, -} - - -class Config(BaseConfig): - size_model: int = int(1e7) - n_iters: int = 4 - compression: Compression = Compression.NO - - -def main(config: Config): - world_info = get_world_info() - - mat = torch.rand(1, config.size_model) - - logger.info( - f"\n ======== Benchmark all reduce between {world_info.world_size} gpus over {world_info.nnodes} nodes =========\n" - ) - - t0 = benchmark.Timer( - stmt="compressed_all_reduce(compression, mat, op=op)", - globals={ - "compressed_all_reduce": all_reduce, - "mat": mat, - "compression": config.compression, - "op": ReduceOp.SUM, - }, - ) - - measured_time = t0.timeit(config.n_iters).mean - - bandwidth = config.size_model * 4 / 1e6 / measured_time - - logger.info(f"Average time per iteration: {measured_time:.2f} seconds, Average bandwidth: {bandwidth:.4f} MB/s") - - -if __name__ == "__main__": - config = Config(**parse_argv()) - - torch.set_float32_matmul_precision("high") - init_process_group(backend="gloo") - - logger = 
get_logger() - main(config) - destroy_process_group() diff --git a/scripts/bandwith/down.sh b/scripts/bandwith/down.sh deleted file mode 100755 index e8c32c37..00000000 --- a/scripts/bandwith/down.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -# Check if the script is run as root -if [ "$EUID" -ne 0 ]; then - echo "Please run as root" - exit 1 -fi - -# Define variables -INTERFACE="lo" # localhost interface -RATE="500mbit" # 500 Mbps -BURST="500k" # burst size -LATENCY="50ms" # maximum latency - -# Remove any existing traffic control rules on the interface -tc qdisc del dev $INTERFACE root 2>/dev/null - -# Add the rate limiting rule -tc qdisc add dev $INTERFACE root tbf rate $RATE burst $BURST latency $LATENCY - -echo "Bandwidth limit of $RATE has been set on $INTERFACE" - -# To remove the limit, run: -# tc qdisc del dev $INTERFACE root \ No newline at end of file diff --git a/scripts/bandwith/up.sh b/scripts/bandwith/up.sh deleted file mode 100755 index fff8a800..00000000 --- a/scripts/bandwith/up.sh +++ /dev/null @@ -1 +0,0 @@ -tc qdisc del dev lo root diff --git a/scripts/convert_dl_ckpt.sh b/scripts/convert_dl_ckpt.sh deleted file mode 100755 index efc1defc..00000000 --- a/scripts/convert_dl_ckpt.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -set -e - -# Wrapper script to run the Python command on 8 checkpoints in parallel -# Usage: ./convert_all.sh /data/10b/step_50800/diloco_0/data - -# Input path prefix -INPUT_PATH=$1 - -# Run the commands for each checkpoint in parallel -for i in {0..7}; do - CHECKPOINT_PATH="${INPUT_PATH}/_${i}.pt" - BACKUP_PATH="${INPUT_PATH}/_${i}_old.pt" - TMP_PATH="${INPUT_PATH}/_${i}_tmp.pt" - - if [ -f "$BACKUP_PATH" ]; then - echo "Checkpoint ${CHECKPOINT_PATH} has already been processed, skipping." & - else - ( - uv run python scripts/convert_dl_state.py @configs/10B/H100.toml \ - --input_path "$CHECKPOINT_PATH" \ - --output_path "$TMP_PATH" \ - --rank "$i" \ - --world_size 8 && \ - mv "$CHECKPOINT_PATH" "$BACKUP_PATH" && \ - mv "$TMP_PATH" "$CHECKPOINT_PATH" && \ - echo "Processed ${CHECKPOINT_PATH} and moved to ${BACKUP_PATH}" - ) & - fi -done - -# Wait for all background jobs to complete -wait - -echo "All checkpoints processed" diff --git a/scripts/convert_dl_state.py b/scripts/convert_dl_state.py deleted file mode 100755 index d1d1b61c..00000000 --- a/scripts/convert_dl_state.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -# Example Usage: -# python scripts/convert_dl_state.py @configs/10B/H100.toml --input_path /workspace/step_49200/diloco_0/data/_3.pt --output_path ./meow.pt --rank 3 --world_size 8 - -import torch -from zeroband.config import resolve_env_vars -from zeroband.data import get_dataloader -from transformers import AutoTokenizer -from zeroband.train import Config -from zeroband.utils.logger import get_logger -from pydantic_config import parse_argv - -COMMON_KEYS = [ - "_snapshot._main_snapshot._sampler_iter_yielded", - "_snapshot._snapshot_step", - "_snapshot._main_snapshot._index_sampler_state.samples_yielded", - "_snapshot._main_snapshot._num_workers", - "_snapshot._main_snapshot._sampler_iter_state", - "_snapshot._main_snapshot._shared_seed", - "_snapshot._last_yielded_worker_id", - "_snapshot._main_snapshot._base_seed", -] - - -def traverse_dict(d: dict, key: str): - _k = key.split(".") - for k in _k: - d = d[k] - return d - - -def transfer_states(old_state_dict: dict, new_state_dict: dict): - for k in COMMON_KEYS: - parent, _, child = k.rpartition(".") - if parent: - traverse_dict(new_state_dict, 
parent)[child] = traverse_dict(old_state_dict, parent)[child] - for worker_id in range(4): - ex_iterables = [ - ds_state["ex_iterable"] - for ds_state in traverse_dict( - old_state_dict, f"_snapshot._worker_snapshots.worker_{worker_id}.dataset_state.ex_iterable.ex_iterables" - ) - ] - num_ds = len(ex_iterables) - new_ds_state = traverse_dict( - new_state_dict, f"_snapshot._worker_snapshots.worker_{worker_id}.dataset_state.dataset" - ) - # HACK: dataset_4 is openwebmath which is not always present - if "dataset_4" not in new_ds_state.keys(): - num_ds -= 1 - new_ds_state = [ - traverse_dict( - new_state_dict, f"_snapshot._worker_snapshots.worker_{worker_id}.dataset_state.dataset.dataset_{i}" - ) - for i in range(num_ds) - ] - - for new_state, old_state in zip(new_ds_state, ex_iterables): - # HACK: We might index error because of skipping into a different sized shard for dclm - new_state["file_index"] = (old_state["shard_idx"] + 1) % len(new_state["files"]) - new_state["row_index"] = 0 # old_state["shard_example_idx"] - - -class ExportConfig(Config): - input_path: str - output_path: str - rank: int - world_size: int - - -def main(config: ExportConfig): - old_state_dict = torch.load(config.input_path)["data_loader"] - - if config.type_model == "llama2": - tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=True) - elif config.type_model == "llama3": - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", use_fast=True) - else: - raise ValueError(f"Model type {config.type_model} not supported") - - dl = get_dataloader( - tokenizer=tokenizer, - world_size=config.world_size, - rank=config.rank, - batch_size=config.train.micro_bs, - data_config=config.data, - ) - - iter_dl = iter(dl) - - # Needed to init the states because they are lazy - while True: - try: - _ = next(iter_dl) - new_state_dict = dl.state_dict() - transfer_states(old_state_dict, new_state_dict) - break - except KeyError: - print("Not inited, sampling again") - pass - - print(f"Saving to {config.output_path}") - torch.save({"data_loader": new_state_dict}, config.output_path) - - del dl - - -def test_dl(config: ExportConfig): - if config.type_model == "llama2": - tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=True) - elif config.type_model == "llama3": - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", use_fast=True) - else: - raise ValueError(f"Model type {config.type_model} not supported") - - dl = get_dataloader( - tokenizer=tokenizer, - world_size=config.world_size, - rank=config.rank, - batch_size=config.train.micro_bs, - data_config=config.data, - ) - dl.load_state_dict(torch.load(config.output_path, weights_only=True)["data_loader"]) - - iter_dl = iter(dl) - - # Needed to init the states because they are lazy - for i in range(10): - batch = next(iter_dl) - print(batch.keys(), batch["input_ids"].shape) - - -if __name__ == "__main__": - logger = get_logger() - config = ExportConfig(**parse_argv()) - resolve_env_vars(config) - logger.debug(f"config: {config.model_dump()}") - - main(config) - test_dl(config) diff --git a/scripts/export_dcp.py b/scripts/export_dcp.py deleted file mode 100644 index dd21e3d5..00000000 --- a/scripts/export_dcp.py +++ /dev/null @@ -1,228 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -# Example Usage: -# python scripts/export_dcp.py @configs/10B/H100.toml --ckpt.path /data/intellect-1-step17000 --ckpt.resume /data/10b/step_17000/diloco_0 - -import torch -from typing import Literal -import 
torch.distributed.checkpoint as dcp -from zeroband.models.llama import get_model -from zeroband.config import resolve_env_vars -from zeroband.checkpoint import ModelWrapper -from zeroband.utils import get_module_signature -from zeroband.train import Config -from zeroband.utils.logger import get_logger -from pydantic_config import parse_argv -from transformers import AutoTokenizer -import math -from pathlib import Path -from safetensors.torch import save_file -import json -from zeroband.models.llama import ModelArgs -from transformers import LlamaConfig -from transformers.generation import GenerationConfig - - -class ExportConfig(Config): - save_format: Literal["pt", "safetensors"] = "safetensors" - torch_dtype: Literal["float32", "bfloat16"] = "float32" - with_debug_automap: bool = False - - -def remap_keys_llama(k: str) -> str: - """Maps ZeroBand keys to HuggingFace keys""" - return ("model." if "output.weight" not in k else "") + k.replace("tok_embeddings", "embed_tokens").replace( - "attention.wq", "self_attn.q_proj" - ).replace("attention.wk", "self_attn.k_proj").replace("attention.wv", "self_attn.v_proj").replace( - "attention.wo", "self_attn.o_proj" - ).replace("attention_norm", "input_layernorm").replace("feed_forward.w3", "mlp.up_proj").replace( - "feed_forward.w2", "mlp.down_proj" - ).replace("feed_forward.w1", "mlp.gate_proj").replace("ffn_norm", "post_attention_layernorm").replace( - "output.weight", "lm_head.weight" - ) - - -def _get_ffn_dim(hidden_dim: int, ffn_dim_multiplier: float, multiple_of: int) -> int: - """Get the FFN dimension from ZeroBand args""" - hidden_dim = int(8 * hidden_dim / 3) - # custom dim factor multiplier - if ffn_dim_multiplier is not None: - hidden_dim = int(ffn_dim_multiplier * hidden_dim) - hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - return hidden_dim - - -def convert_config_zb_to_hf( - zb_config: ModelArgs, with_debug_automap: bool = False, type_model: str = "llama3" -) -> LlamaConfig: - """Convert ZeroBand config to HuggingFace config""" - config = LlamaConfig() - config.hidden_size = zb_config.dim - config.num_hidden_layers = zb_config.n_layers - config.num_attention_heads = zb_config.n_heads - config.num_key_value_heads = zb_config.n_kv_heads - config.vocab_size = zb_config.vocab_size - config.intermediate_size = _get_ffn_dim(zb_config.dim, zb_config.ffn_dim_multiplier, zb_config.multiple_of) - config.rms_norm_eps = zb_config.norm_eps - config.rope_theta = float(zb_config.rope_theta) - config.max_position_embeddings = zb_config.max_seq_len - - if type_model == "llama2": - config.bos_token_id = [1] - config.eos_token_id = [2] - else: - config.bos_token_id = [128000] - config.eos_token_id = [128001, 128008, 128009] - - config.architectures = ["LlamaForCausalLM"] - - # Rope scaling - config.rope_scaling = { - "original_max_position_embeddings": 8192, - "rope_type": "default", - } - - if with_debug_automap: - config.auto_map = { - "AutoConfig": "PrimeIntellect/prime-llama-debug--configuration_llama.LlamaConfig", - "AutoModelForCausalLM": "PrimeIntellect/prime-llama-debug--modeling_llama.LlamaForCausalLM", - } - - return config - - -@torch.no_grad -def convert_qk_from_complex_to_rotate_half(linear_weight: torch.FloatTensor, head_dim: int) -> torch.FloatTensor: - """Converts the Q/K weight from complex to rotate half form. - This is required because the rotary implementation in ZeroBand uses complex numbers which encodes even elements as real and odd number as complex. 
- [0, 1, 2, 3] -> [0 + 1j, 2 + 3j] - However, the HuggingFace implementation uses rotate_half which encodes top half as real and bottom half as complex. - [0, 1, 2, 3] -> [0, 1] + [2, 3]j - - We thus need to permute the QK outputs to match the HuggingFace implementation. - """ - new_weight = torch.zeros_like(linear_weight) - - num_heads = linear_weight.size(0) // head_dim - hhd = head_dim // 2 - - # This applies the riffle shuffle permutation to the outputs of the linear for each attn head - # Even numbers go to the top half, odd numbers go to the bottom half - for i in range(num_heads): - new_weight[i * head_dim : (i * head_dim + hhd), :].copy_( - linear_weight[i * head_dim + 0 : (i + 1) * head_dim : 2, :] - ) - new_weight[i * head_dim + hhd : (i + 1) * head_dim, :].copy_( - linear_weight[i * head_dim + 1 : (i + 1) * head_dim : 2, :] - ) - - return new_weight - - -def main(config: ExportConfig): - # Create save path - save_path = Path(config.ckpt.path) - save_path.mkdir(parents=True, exist_ok=True) - - # Load model - logger.info("Getting tokenizer (for vocab size)") - if config.type_model == "llama2": - tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=True) - elif config.type_model == "llama3": - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", use_fast=True) - else: - raise ValueError(f"Model type {config.type_model} not supported") - - logger.info("Getting model") - model, model_config = get_model( - config.name_model, - config.type_model, - vocab_size=len(tokenizer), - seq_length=config.data.seq_length, - attn_fn=config.train.attn_fn, - ) - - # Convert ZeroBand config to HuggingFace config - hf_config = convert_config_zb_to_hf( - model_config, with_debug_automap=config.with_debug_automap, type_model=config.type_model - ) - hf_config.to_json_file(save_path / "config.json") - - # Load checkpoint - logger.info("Before load: %s", get_module_signature(model)) - states = { - "model": ModelWrapper(model), - } - logger.info("Loading from %s", config.ckpt.resume) - dcp.load( - state_dict=states, - checkpoint_id=config.ckpt.resume, - ) - - logger.info("After load: %s", get_module_signature(model)) - - # Convert model to HuggingFace format - num_shards = int(sum(p.numel() for p in model.parameters()) / 1e9) - state_dict = model.state_dict() - - index_json = {} - total_size = 0 - state_dict = {remap_keys_llama(k): v for k, v in state_dict.items()} - if not config.with_debug_automap: # The debug uses complex rotary impl - with torch.no_grad(): - for i in range(hf_config.num_hidden_layers): - old_q = state_dict[f"model.layers.{i}.self_attn.q_proj.weight"] - old_k = state_dict[f"model.layers.{i}.self_attn.k_proj.weight"] - new_q = convert_qk_from_complex_to_rotate_half(old_q, 128) - new_k = convert_qk_from_complex_to_rotate_half(old_k, 128) - state_dict[f"model.layers.{i}.self_attn.q_proj.weight"].copy_(new_q) - state_dict[f"model.layers.{i}.self_attn.k_proj.weight"].copy_(new_k) - if "model.freqs_cis" in state_dict: # This should not be persisted - del state_dict["model.freqs_cis"] - if config.torch_dtype == "bfloat16": - state_dict = {k: v.to(torch.bfloat16) for k, v in state_dict.items()} - - # Save model - state_keys = list(state_dict.keys()) - shard_size = int(math.ceil(len(state_keys) / num_shards)) - logger.info("Saving model to %d shards", num_shards) - - for i in range(num_shards): - _file = save_path / f"model-{i:04}-of-{num_shards:04}.{config.save_format}" - start = i * shard_size - end = min((i + 1) * shard_size, len(state_keys)) - 
shard = {k: state_dict[k] for k in state_keys[start:end]} - - index_json.update({k: _file.name for k in shard.keys()}) - total_size += sum(p.numel() for p in shard.values()) - if config.save_format == "pt": - torch.save(shard, _file) - else: - save_file(shard, _file, metadata=dict(format="pt")) - - json.dump( - { - "weight_map": index_json, - "metadata": { - "total_size": total_size * (2 if config.torch_dtype == "bfloat16" else 4), - }, - }, - (save_path / "model.safetensors.index.json").open("w"), - indent=2, - ) - - # Save Tokenizer - tokenizer.save_pretrained(save_path) - - # Save Generation Config - gconfig = GenerationConfig(max_length=100, use_cache=False, temperature=0.7, top_k=None, do_sample=True) - gconfig.save_pretrained(save_path) - - -if __name__ == "__main__": - logger = get_logger() - config = ExportConfig(**parse_argv()) - resolve_env_vars(config) - logger.debug(f"config: {config.model_dump()}") - - main(config) diff --git a/scripts/simple_gloo.py b/scripts/simple_gloo.py deleted file mode 100644 index b0c45097..00000000 --- a/scripts/simple_gloo.py +++ /dev/null @@ -1,17 +0,0 @@ -import os -import torch.distributed as dist - -master_addr = os.environ["MASTER_ADDR"] -master_port = 12345 -rank = int(os.environ["RANK"]) -world_size = int(os.environ["WORLD_SIZE"]) - -print("Ho") -store = dist.TCPStore(host_name=master_addr, port=master_port, is_master=(rank == 0), world_size=2) - -store.set("j", "k") -print("Hi") -pg = dist.distributed_c10d.ProcessGroupGloo(store, rank, world_size) -print("Hi 1") - -del pg diff --git a/scripts/simulate_multi_node_diloco.sh b/scripts/simulate_multi_node_diloco.sh deleted file mode 100755 index 38212900..00000000 --- a/scripts/simulate_multi_node_diloco.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# -# simulate multi nodes on one gpu. start N torchrun on X gpu locally. -# example how to run ./scripts/simulate_multi_node.sh 2 1 src/zeroband/train.py @configs/debug/normal.toml - -# Function to get CUDA devices based on the number of GPUs and index -function get_cuda_devices() { - local num_gpu=$1 - local index=$2 - local start_gpu=$((num_gpu * index)) - local end_gpu=$((start_gpu + num_gpu - 1)) - - if [ "$num_gpu" -eq 1 ]; then - echo $start_gpu - else - echo $(seq -s ',' $start_gpu $end_gpu) - fi -} - -# Array to store PIDs of child processes -child_pids=() - -# Modified cleanup function to handle tail separately -cleanup() { - echo "Cleaning up child processes..." - local killed=0 - - # First kill the main processes - for pid in "${child_pids[@]}"; do - if kill -TERM "$pid" 2>/dev/null; then - ((killed++)) - fi - done - - # Kill the tail process if it exists - if [ -n "$tail_pid" ]; then - kill -TERM "$tail_pid" 2>/dev/null - ((killed++)) - fi - - wait - echo "All child processes terminated. Killed $killed processes." 
- exit -} - -# Check if at least three arguments were passed -if [ "$#" -lt 3 ]; then - echo "Usage: $0 [additional_python_args]" - exit 1 -fi - - -N=$1 # The number of processes -NUM_GPU=$2 # The number of GPUs used by each process -# Remove the first three arguments so $@ contains only additional Python arguments -shift 2 - -# Register the cleanup function to be called on SIGINT (Ctrl+C) -trap cleanup SIGINT - - -mkdir -p logs - -export GLOBAL_ADDR=localhost -export GLOBAL_PORT=${GLOBAL_PORT:-5565} -export GLOBAL_WORLD_SIZE=$N -export BASE_PORT=${BASE_PORT:-10001} -export GLOO_SOCKET_IFNAME=lo - -for i in $(seq 0 $(($N - 1 ))) -do - > logs/log$i.log - WANDB_MODE=$([ $i -eq 0 ] && echo "online" || echo "offline") GLOBAL_UNIQUE_ID=$i GLOBAL_RANK=$i CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) uv run torchrun --nproc_per_node=$NUM_GPU --node-rank 0 --rdzv-endpoint localhost:$((BASE_PORT + $i)) --nnodes=1 $@ --data.data_rank $i --data.data_world_size $N > logs/log$i.log 2>&1 & - child_pids+=($!) -done - -# Start tail in background and store its PID separately -tail -f logs/log0.log & -tail_pid=$! - -# Wait for the main processes only -for pid in "${child_pids[@]}"; do - wait $pid -done - -# Once main processes are done, kill the tail process -if [ -n "$tail_pid" ]; then - kill -TERM "$tail_pid" -fi diff --git a/scripts/skip_data.py b/scripts/skip_data.py deleted file mode 100644 index 2f2bc48a..00000000 --- a/scripts/skip_data.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -This script is simulating a training to exaust the datasets and recover the dataloader ckpt. - -It has the same api as the training one. The only difference is that you probably want to change the total_steps and put a data_path. - -It can load config from the config file to have the same setup as the real run. - -example. 
-``` -uv run torchrun --nproc_per_node=4 scripts/skip_data.py @configs/150M/3090.toml --optim.total_steps 100 --ckpt.data_path out_data -``` - -""" - -import os -import torch -from pydantic_config import parse_argv - - -from transformers import AutoTokenizer -from zeroband.checkpoint import CkptManager -from zeroband.config import resolve_env_vars -from zeroband.train import Config - -from zeroband.data import get_dataloader - -from zeroband.utils.world_info import get_world_info -from zeroband.utils.logger import get_logger - - -def skip_data(config: Config): - # batch_size is the total batch size for all GPUs - assert config.optim.batch_size % world_info.local_world_size == 0 - batch_size = config.optim.batch_size // world_info.local_world_size - - assert batch_size % config.train.micro_bs == 0 - gradient_accumulation_steps = batch_size // config.train.micro_bs - - if config.type_model == "llama2": - tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=True) - elif config.type_model == "llama3": - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", use_fast=True) - else: - raise ValueError(f"Model type {config.type_model} not supported") - - logger.debug("tokenizer loaded") - - train_dataloader = get_dataloader( - tokenizer=tokenizer, - world_size=world_info.world_size, - rank=world_info.rank, - batch_size=config.train.micro_bs, - data_config=config.data, - ) - - train_dataloader_iterator = iter(train_dataloader) - - logger.info("starting skipping data up to step: %d", config.optim.total_steps) - - total_steps = 0 - - while True: - num_inner_steps = config.diloco.inner_steps if config.diloco is not None else 1 - - for _inner_step in range(num_inner_steps): - for _ in range(gradient_accumulation_steps): - next(train_dataloader_iterator) - - total_steps += num_inner_steps - logger.info("total steps: %d", total_steps) - if total_steps >= config.optim.total_steps: - break - - CkptManager.save_data(os.path.join(config.ckpt.data_path, "data"), train_dataloader, world_info.local_rank) - - logger.info("skipped data up to step: %d", config.optim.total_steps) - - -if __name__ == "__main__": - torch.manual_seed(42) - - world_info = get_world_info() - logger = get_logger() - - config = Config(**parse_argv()) - resolve_env_vars(config) - - skip_data(config) diff --git a/src/zeroband/C/__init__.py b/src/zeroband/C/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/zeroband/C/collectives.py b/src/zeroband/C/collectives.py deleted file mode 100644 index 8372d121..00000000 --- a/src/zeroband/C/collectives.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from typing import Optional -import torch -import torch.distributed as dist -from torch.utils import cpp_extension -from pathlib import Path -from torch.testing._internal.distributed.fake_pg import FakeProcessGroup - - -parent = Path(__file__).parent -INCLUDES = [str(parent / "csrc"), str(parent.parent.parent.parent / "third_party/gloo")] -COLLECTIVES_CSRC_PATH = parent / "csrc" / "collectives.cpp" - -collectives_ops = cpp_extension.load( - name="collectives", - sources=[COLLECTIVES_CSRC_PATH], - extra_cflags=["-O3", "-DUSE_C10D_GLOO"], - verbose=False if os.environ.get("ZERO_BAND_LOG_LEVEL") == "DEBUG" else True, - extra_include_paths=INCLUDES, -) - - -def ring_allreduce( - tensor: torch.Tensor, - op: dist.ReduceOp = dist.ReduceOp.SUM, - group: Optional[dist.ProcessGroup] = None, -) -> None: - if group is None: - group = dist.distributed_c10d._get_default_group() - if 
isinstance(group, dist.distributed_c10d.ProcessGroupGloo): - collectives_ops.ring_allreduce_gloo(tensor, op, group) - elif isinstance(group, FakeProcessGroup): - return - else: - collectives_ops.ring_allreduce(tensor, op, group) diff --git a/src/zeroband/C/compression.py b/src/zeroband/C/compression.py deleted file mode 100644 index f2e3cc21..00000000 --- a/src/zeroband/C/compression.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import Tuple -import torch -from torch.utils.cpp_extension import load -from pathlib import Path - -COMPRESS_CSRC_PATH = Path(__file__).parent / "csrc" / "compression.cpp" - -compress_ops = load(name="compression", sources=[COMPRESS_CSRC_PATH], extra_cflags=["-O3"], verbose=False) - - -def uniform_8bit_quantize(tensor: torch.Tensor, inplace: bool = True) -> Tuple[torch.Tensor, torch.Tensor]: - """Quantize a tensor to 8-bit integers - Args: - tensor (torch.Tensor): The tensor to quantize - inplace (bool): Whether the operation is allowed to modify the input tensor - Returns: - Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the lookup table - """ - return compress_ops.uniform_8bit_quantize(tensor, inplace) - - -def average_buckets(tensor: torch.Tensor, quant_weight: torch.Tensor, n_bins: int) -> torch.Tensor: - """Return the average value in each bin - Args: - tensor (torch.Tensor): The tensor to average - quant_weight (torch.Tensor): The tensor of indices - n_bins (int): The number of bins - Returns: - torch.Tensor: The average value in each bin - """ - return compress_ops.average_buckets(tensor, quant_weight, n_bins) - - -def quantize_per_tensor_uint8(tensor: torch.Tensor, scale: float, zero_point: int) -> torch.Tensor: - """Quantize a tensor to 8-bit integers - - quantized_value = clamp((round(input / scale) + zero_point), 0, 255) - - Args: - tensor (torch.Tensor): The tensor to quantize - scale (float): The scale of the quantization - zero_point (int): The zero point of the quantization - Returns: - torch.Tensor: The quantized tensor - """ - return compress_ops.quantize_per_tensor_uint8(tensor, scale, zero_point) diff --git a/src/zeroband/C/csrc/collectives.cpp b/src/zeroband/C/csrc/collectives.cpp deleted file mode 100644 index ab7777fc..00000000 --- a/src/zeroband/C/csrc/collectives.cpp +++ /dev/null @@ -1,249 +0,0 @@ -#include -#include -#include -#include - -constexpr int BUFFER_COUNT = 2; - -template -void fast_index_add_omp(T* output, const T* lookup_table, const uint8_t* indices, int64_t n) { - #pragma omp parallel for - for (int64_t i = 0; i < n; ++i) { - output[i] += lookup_table[indices[i]]; - } -} - -template -void fast_index_set_omp(T* output, const T* lookup_table, const uint8_t* indices, int64_t n) { - #pragma omp parallel for - for (int64_t i = 0; i < n; ++i) { - output[i] = lookup_table[indices[i]]; - } -} - -inline size_t get_num_threads() { - return std::max(1u, std::thread::hardware_concurrency()); -} - -template -void fast_index_add_worker(T* output, const T* lookup_table, const uint8_t* indices, int64_t start, int64_t end) { - for (int64_t i = start; i < end; ++i) { - output[i] += lookup_table[indices[i]]; - } -} - -template -void fast_index_add(T* output, const T* lookup_table, const uint8_t* indices, int64_t n) { - size_t num_threads = get_num_threads(); - std::vector threads; - int64_t chunk_size = n / num_threads; - - for (size_t i = 0; i < num_threads; ++i) { - int64_t start = i * chunk_size; - int64_t end = (i == num_threads - 1) ? 
n : (i + 1) * chunk_size; - threads.emplace_back(fast_index_add_worker, output, lookup_table, indices, start, end); - } - - for (auto& thread : threads) { - thread.join(); - } -} - -template -void fast_index_set_worker(T* output, const T* lookup_table, const uint8_t* indices, int64_t start, int64_t end) { - for (int64_t i = start; i < end; ++i) { - output[i] = lookup_table[indices[i]]; - } -} - -template -void fast_index_set(T* output, const T* lookup_table, const uint8_t* indices, int64_t n) { - size_t num_threads = get_num_threads(); - std::vector threads; - int64_t chunk_size = n / num_threads; - - for (size_t i = 0; i < num_threads; ++i) { - int64_t start = i * chunk_size; - int64_t end = (i == num_threads - 1) ? n : (i + 1) * chunk_size; - threads.emplace_back(fast_index_set_worker, output, lookup_table, indices, start, end); - } - - for (auto& thread : threads) { - thread.join(); - } -} - -template -void ring_allreduce( - torch::Tensor& tensor, - c10d::ReduceOp op, - T* group -) { - TORCH_CHECK(group != nullptr, "Group must be provided"); - TORCH_CHECK(op == c10d::ReduceOp::SUM || op == c10d::ReduceOp::AVG, "Unsupported reduce operation. Only SUM and AVG are supported."); - - int world_size = group->getSize(); - int rank = group->getRank(); - - // Divide the tensor into chunks - auto flat_tensor = tensor.view({tensor.numel()}); - std::vector chunks = flat_tensor.chunk(world_size * BUFFER_COUNT); - - // Temporary buffers for transferring data - int num_buffers = BUFFER_COUNT * world_size; - std::vector recv_buffer; - std::vector send_buffer; - std::vector send_lookup_buffer; - std::vector recv_lookup_buffer; - std::vector> send_lookup_work(BUFFER_COUNT); - std::vector> recv_lookup_work(BUFFER_COUNT); - std::vector> send_work(BUFFER_COUNT); - std::vector> recv_work(BUFFER_COUNT); - - for (int i = 0; i < BUFFER_COUNT; ++i) { - recv_buffer.push_back(torch::empty_like(chunks[0], torch::kUInt8)); - send_buffer.push_back(torch::Tensor()); - send_lookup_buffer.push_back(torch::Tensor()); - recv_lookup_buffer.push_back(torch::empty({256}, chunks[0].options())); - } - - // Send and receive ranks - int send_rank = (rank + 1) % world_size; - int recv_rank = (rank - 1 + world_size) % world_size; - - // Reduce-scatter loop - for (int step = 1; step <= world_size * BUFFER_COUNT; ++step) { - int send_chunk = (rank * BUFFER_COUNT - step + num_buffers) % num_buffers; - - if (send_work[step % BUFFER_COUNT]) { - send_work[step % BUFFER_COUNT]->wait(); - recv_work[step % BUFFER_COUNT]->wait(); - send_lookup_work[step % BUFFER_COUNT]->wait(); - recv_lookup_work[step % BUFFER_COUNT]->wait(); - - auto& chunk = chunks[send_chunk]; - auto& lookup = recv_lookup_buffer[step % BUFFER_COUNT]; - auto& indices = recv_buffer[step % BUFFER_COUNT]; - - fast_index_add_omp( - static_cast(chunk.data_ptr()), - static_cast(lookup.data_ptr()), - static_cast(indices.data_ptr()), - chunk.numel() - ); - } - - if (step <= (world_size - 1) * BUFFER_COUNT) { - // Quantize and send - std::tie(send_buffer[step % BUFFER_COUNT], send_lookup_buffer[step % BUFFER_COUNT]) = uniform_8bit_quantize(chunks[send_chunk], false); - - std::vector send_tensors = {send_lookup_buffer[step % BUFFER_COUNT]}; - send_lookup_work[step % BUFFER_COUNT] = group->send(send_tensors, send_rank, step + 1000); - - std::vector recv_tensors = {recv_lookup_buffer[step % BUFFER_COUNT]}; - recv_lookup_work[step % BUFFER_COUNT] = group->recv(recv_tensors, recv_rank, step + 1000); - - send_tensors = {send_buffer[step % BUFFER_COUNT]}; - send_work[step % BUFFER_COUNT] 
= group->send(send_tensors, send_rank, step); - - recv_tensors = {recv_buffer[step % BUFFER_COUNT]}; - recv_work[step % BUFFER_COUNT] = group->recv(recv_tensors, recv_rank, step); - } - } - - // TODO: Interleave these with the previous loop? - if (op == c10d::ReduceOp::AVG) { - for (int i = 0; i < BUFFER_COUNT; ++i) { - chunks[i + rank * BUFFER_COUNT].div_(world_size); - } - } - - for (int i = 0; i < BUFFER_COUNT; ++i) { - std::tie(send_buffer[0], send_lookup_buffer[0]) = uniform_8bit_quantize(chunks[i + rank * BUFFER_COUNT], true); - auto& chunk = chunks[i + rank * BUFFER_COUNT]; - auto& lookup = send_lookup_buffer[0]; - auto& indices = send_buffer[0]; - - fast_index_set_omp( - static_cast(chunk.data_ptr()), - static_cast(lookup.data_ptr()), - static_cast(indices.data_ptr()), - chunk.numel() - ); - } - - // Reset buffers for the second phase - recv_buffer.clear(); - send_buffer.clear(); - send_lookup_buffer.clear(); - recv_lookup_buffer.clear(); - for (int i = 0; i < BUFFER_COUNT; ++i) { - recv_buffer.push_back(torch::empty_like(chunks[0], torch::kUInt8)); - send_buffer.push_back(torch::Tensor()); - send_lookup_buffer.push_back(torch::Tensor()); - recv_lookup_buffer.push_back(torch::empty({256}, chunks[0].options())); - } - std::fill(send_work.begin(), send_work.end(), nullptr); - std::fill(recv_work.begin(), recv_work.end(), nullptr); - std::fill(send_lookup_work.begin(), send_lookup_work.end(), nullptr); - std::fill(recv_lookup_work.begin(), recv_lookup_work.end(), nullptr); - - for (int step = 1; step <= world_size * BUFFER_COUNT; ++step) { - int send_chunk = (rank * BUFFER_COUNT + BUFFER_COUNT - step + num_buffers) % num_buffers; - - if (send_work[step % BUFFER_COUNT]) { - send_work[step % BUFFER_COUNT]->wait(); - recv_work[step % BUFFER_COUNT]->wait(); - send_lookup_work[step % BUFFER_COUNT]->wait(); - recv_lookup_work[step % BUFFER_COUNT]->wait(); - - auto& chunk = chunks[send_chunk]; - auto& lookup = recv_lookup_buffer[step % BUFFER_COUNT]; - auto& indices = recv_buffer[step % BUFFER_COUNT]; - - fast_index_set_omp( - static_cast(chunk.data_ptr()), - static_cast(lookup.data_ptr()), - static_cast(indices.data_ptr()), - chunk.numel() - ); - } - - if (step <= (world_size - 1) * BUFFER_COUNT) { - // Quantize and send - // todo(jackmin): this quantization is redundant, we should be able to reuse the quantized values we just received - std::tie(send_buffer[step % BUFFER_COUNT], send_lookup_buffer[step % BUFFER_COUNT]) = uniform_8bit_quantize(chunks[send_chunk], false); - - std::vector send_tensors = {send_lookup_buffer[step % BUFFER_COUNT]}; - send_lookup_work[step % BUFFER_COUNT] = group->send(send_tensors, send_rank, step + 1000); - - std::vector recv_tensors = {recv_lookup_buffer[step % BUFFER_COUNT]}; - recv_lookup_work[step % BUFFER_COUNT] = group->recv(recv_tensors, recv_rank, step + 1000); - - send_tensors = {send_buffer[step % BUFFER_COUNT]}; - send_work[step % BUFFER_COUNT] = group->send(send_tensors, send_rank, step); - - recv_tensors = {recv_buffer[step % BUFFER_COUNT]}; - recv_work[step % BUFFER_COUNT] = group->recv(recv_tensors, recv_rank, step); - } - } -} - -PYBIND11_MODULE(collectives, m) { - m.def( - "ring_allreduce", - &ring_allreduce, - "Ring allreduce implementation", - py::arg("tensor"), - py::arg("op"), - py::arg("pg") - ); - m.def( - "ring_allreduce_gloo", - &ring_allreduce, - "Ring allreduce implementation", - py::arg("tensor"), - py::arg("op"), - py::arg("pg") - ); -} \ No newline at end of file diff --git a/src/zeroband/C/csrc/compression.cpp 
b/src/zeroband/C/csrc/compression.cpp deleted file mode 100644 index 8bd7dcbd..00000000 --- a/src/zeroband/C/csrc/compression.cpp +++ /dev/null @@ -1,155 +0,0 @@ -#include - -namespace py = pybind11; - -constexpr int n_bins = 256; // 8-bit quantization -constexpr double RANGE_IN_SIGMAS = 6.0; -const int max_num_threads = std::thread::hardware_concurrency(); - -torch::Tensor quantize_per_tensor_multithreaded(const torch::Tensor& tensor, float scale, int32_t zero_point, int num_threads) { - torch::TensorOptions options = tensor.options().dtype(torch::kByte); - torch::Tensor quantized_tensor = torch::empty_like(tensor, options); - - float* tensor_data = tensor.data_ptr(); - uint8_t* quant_data = quantized_tensor.data_ptr(); - int64_t numel = tensor.numel(); - float inv_scale = 1.0f / scale; - - std::vector threads; - int64_t chunk_size = numel / num_threads; - - auto quantize_chunk = [&](int64_t start, int64_t end) { - for (int64_t i = start; i < end; ++i) { - int32_t quant_val = static_cast(std::round(tensor_data[i] * inv_scale)) + zero_point; - quant_data[i] = static_cast(std::clamp(quant_val, 0, 255)); - } - }; - - for (int i = 0; i < num_threads - 1; ++i) { - int64_t start = i * chunk_size; - int64_t end = (i + 1) * chunk_size; - threads.emplace_back(quantize_chunk, start, end); - } - - // Handle the last chunk (which may be slightly larger due to rounding) - threads.emplace_back(quantize_chunk, (num_threads - 1) * chunk_size, numel); - - // Wait for all threads to complete - for (auto& thread : threads) { - thread.join(); - } - - return quantized_tensor; -} - -torch::Tensor average_buckets_multithread(const torch::Tensor& tensor, const torch::Tensor& quant_weight, int64_t n_bins, int num_threads) { - torch::NoGradGuard no_grad; - auto flat_tensor = tensor.flatten().contiguous(); - auto flat_quant_weight = quant_weight.flatten().contiguous(); - auto options = flat_tensor.options(); - auto bin_sums = torch::zeros({n_bins}, options); - auto bin_counts = torch::zeros({n_bins}, options.dtype(torch::kLong)); - - // Get raw pointers - float* tensor_data = flat_tensor.data_ptr(); - uint8_t* quant_data = flat_quant_weight.data_ptr(); - float* sums_data = bin_sums.data_ptr(); - int64_t* counts_data = bin_counts.data_ptr(); - int64_t numel = flat_tensor.numel(); - - // Create a vector to hold our threads - std::vector threads; - - // Lambda function for the work each thread will do - auto worker = [&](int64_t start, int64_t end) { - std::vector local_sums(n_bins, 0.0f); - std::vector local_counts(n_bins, 0); - - for (int64_t i = start; i < end; ++i) { - uint8_t bin = quant_data[i]; - if (bin < n_bins) { // No need to check for >= 0 as uint8_t is always non-negative - local_sums[bin] += tensor_data[i]; - local_counts[bin]++; - } - } - - // Use a mutex to safely update the shared data - static std::mutex mutex; - std::lock_guard lock(mutex); - for (int64_t i = 0; i < n_bins; ++i) { - sums_data[i] += local_sums[i]; - counts_data[i] += local_counts[i]; - } - }; - - // Divide the work among threads - int64_t chunk_size = numel / num_threads; - for (unsigned int i = 0; i < num_threads; ++i) { - int64_t start = i * chunk_size; - int64_t end = (i == num_threads - 1) ? numel : (i + 1) * chunk_size; - threads.emplace_back(worker, start, end); - } - - // Wait for all threads to complete - for (auto& thread : threads) { - thread.join(); - } - - // Compute averages - for (int64_t i = 0; i < n_bins; ++i) { - sums_data[i] = counts_data[i] > 0 ? 
-torch::Tensor average_buckets_multithread(const torch::Tensor& tensor, const torch::Tensor& quant_weight, int64_t n_bins, int num_threads) {
-    torch::NoGradGuard no_grad;
-    auto flat_tensor = tensor.flatten().contiguous();
-    auto flat_quant_weight = quant_weight.flatten().contiguous();
-    auto options = flat_tensor.options();
-    auto bin_sums = torch::zeros({n_bins}, options);
-    auto bin_counts = torch::zeros({n_bins}, options.dtype(torch::kLong));
-
-    // Get raw pointers
-    float* tensor_data = flat_tensor.data_ptr<float>();
-    uint8_t* quant_data = flat_quant_weight.data_ptr<uint8_t>();
-    float* sums_data = bin_sums.data_ptr<float>();
-    int64_t* counts_data = bin_counts.data_ptr<int64_t>();
-    int64_t numel = flat_tensor.numel();
-
-    // Create a vector to hold our threads
-    std::vector<std::thread> threads;
-
-    // Lambda function for the work each thread will do
-    auto worker = [&](int64_t start, int64_t end) {
-        std::vector<float> local_sums(n_bins, 0.0f);
-        std::vector<int64_t> local_counts(n_bins, 0);
-
-        for (int64_t i = start; i < end; ++i) {
-            uint8_t bin = quant_data[i];
-            if (bin < n_bins) { // No need to check for >= 0 as uint8_t is always non-negative
-                local_sums[bin] += tensor_data[i];
-                local_counts[bin]++;
-            }
-        }
-
-        // Use a mutex to safely update the shared data
-        static std::mutex mutex;
-        std::lock_guard<std::mutex> lock(mutex);
-        for (int64_t i = 0; i < n_bins; ++i) {
-            sums_data[i] += local_sums[i];
-            counts_data[i] += local_counts[i];
-        }
-    };
-
-    // Divide the work among threads
-    int64_t chunk_size = numel / num_threads;
-    for (unsigned int i = 0; i < num_threads; ++i) {
-        int64_t start = i * chunk_size;
-        int64_t end = (i == num_threads - 1) ? numel : (i + 1) * chunk_size;
-        threads.emplace_back(worker, start, end);
-    }
-
-    // Wait for all threads to complete
-    for (auto& thread : threads) {
-        thread.join();
-    }
-
-    // Compute averages
-    for (int64_t i = 0; i < n_bins; ++i) {
-        sums_data[i] = counts_data[i] > 0 ? sums_data[i] / counts_data[i] : 0.0f;
-    }
-
-    return bin_sums;
-}
-
-std::tuple<torch::Tensor, torch::Tensor> uniform_8bit_quantize(torch::Tensor tensor, bool inplace) {
-    int offset = n_bins / 2;
-
-    // Centered tensor handling (currently commented out, so no centering)
-    torch::Tensor centered_tensor = tensor;
-
-    // Calculate unbiased standard deviation
-    double std_unbiased = centered_tensor.norm().item<double>() / std::sqrt(centered_tensor.numel() - 1);
-
-    // Calculate scale for quantization
-    double scale = RANGE_IN_SIGMAS * std_unbiased / n_bins;
-
-    // Perform quantization
-    torch::Tensor quantized_tensor = quantize_per_tensor_multithreaded(centered_tensor, scale, offset, max_num_threads);
-
-    // Call average_buckets to create the lookup table
-    torch::Tensor lookup = average_buckets_multithread(tensor, quantized_tensor, n_bins, max_num_threads);
-
-    return std::make_tuple(quantized_tensor, lookup);
-}
-
-
-// PyBind11 module
-PYBIND11_MODULE(compression, m) {
-    m.def(
-        "average_buckets",
-        &average_buckets_multithread,
-        "Average buckets for quantized values",
-        py::arg("tensor"),
-        py::arg("quant_weight"),
-        py::arg("n_bins"),
-        py::arg("num_threads") = max_num_threads
-    )
-    .def(
-        "uniform_8bit_quantize",
-        &uniform_8bit_quantize,
-        "Uniform 8-bit quantization function",
-        py::arg("tensor"),
-        py::arg("inplace") = true
-    )
-    .def(
-        "quantize_per_tensor_uint8",
-        &quantize_per_tensor_multithreaded,
-        "Faster torch::quantize_per_tensor",
-        py::arg("tensor"),
-        py::arg("scale"),
-        py::arg("zero_point"),
-        py::arg("num_threads") = max_num_threads
-    );
-}
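A sketch of how these bound ops would be consumed from Python. The `zeroband.C.compression` import path is assumed by analogy with `zeroband.C.collectives` used later in this diff, and the snippet only runs with the repo's JIT-compiled extension built:

```python
# Illustrative round-trip through the compiled compression module (assumed path).
import torch
from zeroband.C.compression import uniform_8bit_quantize  # JIT-compiled op above

pseudo_grad = torch.randn(4096)
indices, lookup = uniform_8bit_quantize(pseudo_grad)  # uint8 ids + fp32 bucket means
recovered = lookup[indices.long()]                    # dequantize via lookup table
print((pseudo_grad - recovered).abs().max())          # small quantization error
```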
diff --git a/src/zeroband/checkpoint.py b/src/zeroband/checkpoint.py
deleted file mode 100644
index bdeb4d48..00000000
--- a/src/zeroband/checkpoint.py
+++ /dev/null
@@ -1,563 +0,0 @@
-from dataclasses import dataclass
-import gc
-import multiprocessing
-import os
-import shutil
-import threading
-import time
-from typing import Any
-import uuid
-import fsspec
-from fsspec.generic import rsync as rsync_fsspec
-import torch
-from torch import nn
-from torch.optim import Optimizer
-from torch.optim.lr_scheduler import LRScheduler
-from torchdata.stateful_dataloader import StatefulDataLoader
-import torch.distributed.checkpoint as dcp
-from torch.distributed.checkpoint.state_dict import (
-    set_optimizer_state_dict,
-    set_model_state_dict,
-    get_model_state_dict,
-    get_optimizer_state_dict,
-    StateDictOptions,
-)
-import torch.distributed as dist
-
-
-from torch.distributed.checkpoint.stateful import Stateful
-import warnings
-import logging
-from torch.distributed._tensor.api import DTensor
-from zeroband.utils.state_dict_send_recv import (
-    _get_sendable_state_dict,
-    recv_state_dict,
-    send_state_dict,
-    send_tensor_and_state_dict,
-)
-from distributed_shampoo import DistributedShampoo
-from zeroband.utils.logger import get_logger
-from zeroband.config import CkptConfig
-from zeroband.utils.world_info import get_world_info
-
-## code inspired by torchtitan https://github.com/pytorch/torchtitan/blob/main/torchtitan/checkpoint.py
-
-
-@dataclass
-class TrainingProgress(Stateful):
-    total_tokens: int
-    outer_step: int
-    step: int
-
-    def state_dict(self) -> dict[str, Any]:
-        return {"total_tokens": self.total_tokens, "outer_step": self.outer_step, "step": self.step}
-
-    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
-        self.total_tokens = state_dict["total_tokens"]
-        self.outer_step = state_dict["outer_step"]
-        self.step = state_dict["step"]
-
-
-class ModelWrapper(Stateful):
-    def __init__(self, model: nn.Module) -> None:
-        self.model = model
-
-    def state_dict(self) -> dict[str, Any]:
-        return get_model_state_dict(self.model, options=StateDictOptions(strict=False))
-
-    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
-        set_model_state_dict(model=self.model, model_state_dict=state_dict, options=StateDictOptions(strict=False))
-
-
-class OptimizerWrapper(Stateful):
-    def __init__(
-        self,
-        model: nn.Module,
-        optim: torch.optim.Optimizer,
-    ) -> None:
-        self.model = model
-        self.optim = optim
-
-    def state_dict(self) -> dict[str, Any]:
-        if isinstance(self.optim, DistributedShampoo):
-            return self.optim.distributed_state_dict(key_to_param=self.model.named_parameters())
-        else:
-            return get_optimizer_state_dict(
-                model=self.model, optimizers=self.optim, options=StateDictOptions(flatten_optimizer_state_dict=True)
-            )
-
-    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
-        if isinstance(self.optim, DistributedShampoo):
-            self.optim.load_distributed_state_dict(state_dict, key_to_param=self.model.named_parameters())
-        else:
-            set_optimizer_state_dict(
-                model=self.model,
-                optimizers=self.optim,
-                optim_state_dict=state_dict,
-                options=StateDictOptions(flatten_optimizer_state_dict=True),
-            )
-
-
-def cast_dtensor_to_tensor(state_dict: dict[str, Any]) -> dict[str, Any]:
-    """
-    Traverse a state dict and cast every DTensor in it to a plain local tensor.
-    """
-    new_state_dict = {}
-
-    for key, value in state_dict.items():
-        if isinstance(value, dict):
-            new_state_dict[key] = cast_dtensor_to_tensor(value)
-        elif isinstance(value, DTensor):
-            new_state_dict[key] = value.to_local()
-        else:
-            new_state_dict[key] = value
-    return new_state_dict
-
-
-def load_dtensor_state_dict(state_src, loaded_state_dict):
-    for key, value in state_src.items():
-        if isinstance(value, dict):
-            load_dtensor_state_dict(value, loaded_state_dict[key])
-        elif isinstance(value, DTensor):
-            local_tensor = value.to_local()
-
-            local_tensor.copy_(loaded_state_dict[key])
-            loaded_state_dict[key] = value
-        else:
-            loaded_state_dict[key] = value
-
-
-class OuterOptimizerWrapper(Stateful):
-    def __init__(self, optimizer: Optimizer) -> None:
-        self.optimizer = optimizer
-
-    def state_dict(self) -> dict[str, Any]:
-        # the idea here is to cast any DTensor into a local tensor
-        state = self.optimizer.state_dict()
-        return cast_dtensor_to_tensor(state)
-
-    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
-        # we pre-init the opt buffer DTensors.
-        # !! this assumes that the model already has its grad buffers initialized
-        self.optimizer.step()  # pre init buffer
-
-        ## here the idea is, for any DTensor, to load the value from the state_dict into the local tensor
-        current_state = self.optimizer.state_dict()
-        load_dtensor_state_dict(current_state, state_dict)
-        self.optimizer.load_state_dict(state_dict)
-
-
-def non_error_barrier():
-    try:
-        dist.barrier()
-    except Exception as e:
-        from zeroband.utils.logger import get_logger
-
-        get_logger().info(f"Error in data checkpointing barrier: {e}, continuing training")
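These wrappers exist because `torch.distributed.checkpoint` (`dcp`) can snapshot anything implementing the `Stateful` protocol. A minimal sketch of that contract, assuming the `TrainingProgress` dataclass above is importable:

```python
# Sketch: the Stateful round-trip that dcp.save / dcp.load rely on.
progress = TrainingProgress(total_tokens=0, outer_step=0, step=0)
progress.total_tokens += 2048
snapshot = progress.state_dict()   # plain dict, safe to serialize

restored = TrainingProgress(total_tokens=0, outer_step=0, step=0)
restored.load_state_dict(snapshot)
assert restored.total_tokens == 2048
# CkptManager (below) registers wrappers like this in self.states so that a
# single dcp.save(self.states, checkpoint_id=path) snapshots everything at once.
```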
- """ - - states: dict[str, Stateful] - - def __init__( - self, - config: CkptConfig, - model: nn.Module, - optimizer: Optimizer, - scheduler: LRScheduler, - dataloader: StatefulDataLoader, - training_progress: TrainingProgress, - data_rank: int | None, - diloco_offloaded_param_list: list[nn.Parameter] | None, - diloco_offloaded_optimizer: Optimizer | None, - ): - self.config = config - - self.model = model - self.optimizer = optimizer - self.scheduler = scheduler - self.dataloader = dataloader - self.training_progress = training_progress - self.data_rank = data_rank - - assert (diloco_offloaded_param_list is None) == ( - diloco_offloaded_optimizer is None - ), "diloco_offloaded_model and diloco_offloaded_optimizer must be both None or both have values" - - self.diloco_offloaded_optimizer = diloco_offloaded_optimizer # he we don't use Wrapper because it failed - # which might make the ckpt less generic in term of loading from different number of device. FSDP ckpt seems to be a mess tho - self.diloco_offloaded_param_list = diloco_offloaded_param_list - - self._init_state() - - self._logger = get_logger(config) - self.world_info = get_world_info() - - self.non_blocking_process: list[multiprocessing.Process] = [] - self.blocking_process: list[multiprocessing.Process] = [] - self._live_reco_thread: threading.Thread | None = None - - if self.world_info.local_rank == 0: - if self.config.path is not None: - self.check_path_access(self.config.path) - - if self.config.remote is not None: - self.check_path_access(self.config.remote.path) - - if self.config.remote_data_path is not None: - self.check_path_access(self.config.remote_data_path) - - def check_path_access( - self, - ckpt_path: str, - ): - rank = uuid.uuid4() - dummy_file_path = os.path.join(ckpt_path, f".dummy_file_{rank}.txt") - - try: - # Create the directory if it doesn't exist - fs, _ = fsspec.core.url_to_fs(ckpt_path) - fs.makedirs(ckpt_path, exist_ok=True) - - with fsspec.open(dummy_file_path, "w") as f: - f.write("This is a dummy file for testing access.") - except Exception as e: - self._logger.error(f"Error checking path access {ckpt_path}: {e}, aborting training") - raise e - - def _init_state(self): - # states can only be stateful object, hence we need to wrap Model and Optimizer - self.states: dict[str, Stateful] = { - "model": ModelWrapper(self.model), - "optimizer": OptimizerWrapper(self.model, self.optimizer), - "scheduler": self.scheduler, - # "dataloader": self.dataloader, # ignoring dataloader for now as each rank has its own dataloader - "training_progress": self.training_progress, - } - - # if self.diloco_offloaded_optimizer is not None: - # # even if the diloco_offloaded target the cpu list model, we still use the gpu model to load and save state. - # # main reason is that we actually don't a cpu model but just a list of cpu parameters. - # self.states["diloco_optimizer"] = self.diloco_offloaded_optimizer - - @torch.no_grad() - def save(self, remote: bool = False) -> None: - """ - Each rank will save the right shard of the model and optimizer. - - Saving is done inplace. - - Save in the subfolder `step_`. 
- - """ - - step_ckpt_path = os.path.join(self.config.path, f"step_{self.training_progress.step}") - - if remote and self.config.remote is not None: - remote_ckpt_path = os.path.join(self.config.remote.path, f"step_{self.training_progress.step}") - - # if we are not in self recovery mode we save to disk - time_start = time.perf_counter() - self._save(step_ckpt_path) - self._logger.info(f"Saved checkpoint to {step_ckpt_path} in {time.perf_counter() - time_start} seconds") - - # push to remote - non_error_barrier() - if self.world_info.local_rank == 0: - if remote and self.config.remote is not None: - self._async_save_remote(step_ckpt_path, remote_ckpt_path) - - @torch.no_grad() - def _save(self, ckpt_path: str): - self.wait_for_blocking_job() - - catch_warning = self._logger.getEffectiveLevel() <= logging.INFO - - with warnings.catch_warnings(): - # pytorch has an annoying warning when saving the optimizer state https://github.com/pytorch/pytorch/issues/136907 - # we can ignore it if we are not logging in DEBUG mode - if catch_warning: - warnings.simplefilter("ignore") - - dcp.save(self.states, checkpoint_id=ckpt_path) - - if self.diloco_offloaded_optimizer: - with open(os.path.join(ckpt_path, f"__{self.world_info.local_rank}_0.pt"), "wb") as f: - state = {} - state["optimizer"] = OuterOptimizerWrapper(self.diloco_offloaded_optimizer).state_dict() - - torch.save(state, f) - - data_path = os.path.join(ckpt_path, "data") - self.save_data(data_path, self.dataloader, self.world_info.local_rank) - - non_error_barrier() - - if self.config.remote_data_path is not None: - remote_data_path = os.path.join( - self.config.remote_data_path, f"data_{self.data_rank}", f"step_{self.training_progress.step}" - ) - latest_remote_data_path = os.path.join(self.config.remote_data_path, f"data_{self.data_rank}", "latest") - - self._async_save_remote(data_path, remote_data_path, blocking=False) - self._async_save_remote(data_path, latest_remote_data_path, blocking=False) - - gc.collect() - - @staticmethod - def save_data(data_path: str, dataloader, local_rank: int): - os.makedirs(data_path, exist_ok=True) - with open(os.path.join(data_path, f"_{local_rank}.pt"), "wb") as f: - state = {"data_loader": dataloader.state_dict()} - torch.save(state, f) - - def _async_save_remote(self, ckpt_path: str, remote_ckpt_path: str, blocking: bool = True) -> None: - """asyncronously rsync a ckpt folder to a remote location. Using fsspec to handle remote cloud storage without to install - specific libraries (e.g. s3fs). 
- """ - - def rsync(): - time_start = time.perf_counter() - self._logger.info(f"start pushing {ckpt_path} to {remote_ckpt_path} asynchronously") - try: - rsync_fsspec(ckpt_path, destination=remote_ckpt_path) - except Exception as e: - self._logger.error(f"Error pushing {ckpt_path} to {remote_ckpt_path}: {e}") - self._logger.info( - f"finish pushing {ckpt_path} to {remote_ckpt_path} in {time.perf_counter() - time_start} seconds" - ) - - processes = multiprocessing.Process(target=rsync, daemon=True) - processes.start() - - if blocking: - self.blocking_process.append(processes) - else: - self.non_blocking_process.append(processes) - - def wait_for_blocking_job(self): - for process in self.blocking_process: - process.join() - - self.blocking_process = [] - - if self.world_info.local_rank == 0: - if self.config.topk is not None: - delete_topk(self.logger, self.config.path, self.config.topk) - - def _del__(self): - self.wait_for_blocking_job() - - for process in self.non_blocking_process: - process.join() - - @torch.no_grad() - def _load_data(self, resume_ckpt_path: str): - self._logger.debug(f"loading data from {resume_ckpt_path}") - world_info = get_world_info() - - data_path = os.path.join(resume_ckpt_path, "data") - - with open(os.path.join(data_path, f"_{world_info.local_rank}.pt"), "rb") as f: - state = torch.load(f) - self.dataloader.load_state_dict(state["data_loader"]) - - @torch.no_grad() - def load( - self, - resume_ckpt_path: str, - skip_dataloader: bool = False, - data_path: str | None = None, - ) -> None: - """ - loading should be done after fsdp wrap and optimizer init. - Each rank will load the right shard of the model and optimizer. - All rank will load the global states (scheduler, step, total_tokens, dataloader). - - `resume_ckpt_path` should point to a specific step and not to the base ckpt folder. Example: `ckpt_path/step_100` - - Loading is done inplace. - - """ - time_start = time.perf_counter() - - world_info = get_world_info() - - files = os.listdir(resume_ckpt_path) - - if len(files) == 1 and files[0].startswith("diloco_"): - self._logger.warning( - f"Loading diloco ckpt from {files[0]}. 
This is deprecated and will be removed in the future" - ) - resume_ckpt_path = os.path.join(resume_ckpt_path, files[0]) - - dcp.load(self.states, checkpoint_id=resume_ckpt_path) - - if self.config.token_count is not None: - self.training_progress.total_tokens = self.config.token_count - - self._logger.debug("sync inner model") - # todo(refactor): here we should rather let the diloco class handle this logic - if self.diloco_offloaded_param_list is not None: - for param_offloaded, param in zip(self.diloco_offloaded_param_list, self.model.parameters()): - param_offloaded.data.to_local().copy_(param.data.to_local()) - - if self.diloco_offloaded_optimizer: - with open(os.path.join(resume_ckpt_path, f"__{world_info.local_rank}_0.pt"), "rb") as f: - rank_state_dict = torch.load(f) - - opt_wrapper = OuterOptimizerWrapper(self.diloco_offloaded_optimizer) - opt_wrapper.load_state_dict(rank_state_dict["optimizer"]) - - if not skip_dataloader: - if self.config.remote_data_load: - self.remote_data_load() - else: - data_path = resume_ckpt_path if data_path is None else data_path - self._load_data(data_path) - - self._init_state() - - self._logger.info(f"Loaded checkpoint from {resume_ckpt_path} in {time.perf_counter() - time_start} seconds") - - def remote_data_load(self): - remote_data_path = os.path.join(self.config.remote_data_path, f"data_{self.data_rank}", "latest") - id_ = uuid.uuid4() - dest = f"/tmp/zeroband/data_{id_}" - rsync_fsspec(remote_data_path, os.path.join(dest, "data")) - data_path = dest - self._load_data(data_path) - - @torch.no_grad() - def recv_ckpt_from_peer(self, global_pg: dist.ProcessGroup): - assert self.diloco_offloaded_param_list is not None, "recv_ckpt_from_peers is only supported with diloco" - - time_start = time.perf_counter() - self._logger.debug(f"Start receiving ckpt from rank {self.config.live_recovery_rank_src}") - - jobs = [] - buffers = [] - for i, param in enumerate(self.diloco_offloaded_param_list): - data = param.data - if isinstance(param.data, DTensor): - data = param.data.to_local() - - buffer = torch.empty_like(data) - buffers.append(buffer) - jobs.append(global_pg.recv([buffer], self.config.live_recovery_rank_src, i)) - - for job in jobs: - job.wait() - - for buffer, param in zip(buffers, self.model.parameters()): - data = param.data - if isinstance(data, DTensor): - data = data.to_local() - data.copy_(buffer) - - self._logger.debug("live recovery progress: offloaded model received 1/5") - - outer_opt_state_dict = recv_state_dict( - global_pg, self.config.live_recovery_rank_src, self.diloco_offloaded_optimizer.state_dict() - ) - self.diloco_offloaded_optimizer.load_state_dict(outer_opt_state_dict) - - self._logger.debug("live recovery progress: outer optimizer state dict received 2/5") - - training_process_state_dict = recv_state_dict( - global_pg, self.config.live_recovery_rank_src, self.training_progress.state_dict() - ) - self.training_progress.load_state_dict(training_process_state_dict) - self._logger.debug("live recovery progress: training progress state dict received 3/5") - - for group in self.optimizer.param_groups: - for p in group["params"]: - p.grad = torch.randn_like(p) - - self.optimizer.step() - self.optimizer.zero_grad() - - inner_opt_state_dict = recv_state_dict( - global_pg, self.config.live_recovery_rank_src, self.optimizer.state_dict() - ) - self.optimizer.load_state_dict(inner_opt_state_dict) - - self._logger.debug("live recovery progress: inner optimizer state dict received 4/5") - - sheduler_state_dict = recv_state_dict( - global_pg, 
self.config.live_recovery_rank_src, self.scheduler.state_dict() - ) - self.scheduler.load_state_dict(sheduler_state_dict) - - self._logger.debug("live recovery progress: scheduler state dict received 5/5") - - self._logger.debug( - f"Received ckpt from rank {self.config.live_recovery_rank_src} in {time.perf_counter() - time_start} seconds" - ) - - @torch.no_grad() - def send_ckpt_to_peer(self, global_pg: dist.ProcessGroup, dest_rank: int, blocking: bool = False): - def async_send(): - assert self.diloco_offloaded_param_list is not None, "send_ckpt_to_peers is only supported with diloco" - time_start = time.perf_counter() - self._logger.debug(f"Start sending ckpt to rank {dest_rank}") - - try: - jobs = [] - for i, param in enumerate(self.diloco_offloaded_param_list): - data = param.data - if isinstance(data, DTensor): - data = data.to_local() - jobs.append(global_pg.send([data], dest_rank, i)) - - for job in jobs: - job.wait() - - send_state_dict(global_pg, self.diloco_offloaded_optimizer.state_dict(), dest_rank) - send_state_dict(global_pg, self.training_progress.state_dict(), dest_rank) - - inner_optimizer_non_tensor_state_dict, inner_optimizer_tensors = _get_sendable_state_dict( - self.optimizer.state_dict() - ) - send_tensor_and_state_dict( - global_pg, dest_rank, inner_optimizer_non_tensor_state_dict, inner_optimizer_tensors - ) - - send_state_dict(global_pg, self.scheduler.state_dict(), dest_rank) - except RuntimeError as e: - self._logger.error(f"Error sending ckpt to rank {dest_rank}: {e}") - else: - self._logger.debug(f"Sent ckpt to rank {dest_rank} in {time.perf_counter() - time_start} seconds") - - thread = threading.Thread(target=async_send) - thread.start() - self._logger.debug("Live recovery thread started") - if blocking: - thread.join() - else: - self._live_reco_thread = thread - - -def delete_topk(logger: logging.Logger, ckpt_path: str, topk: int): - checkpoints_to_delete = get_checkpoints_to_delete(ckpt_path, topk) - for ckpt_path in checkpoints_to_delete: - shutil.rmtree(ckpt_path, ignore_errors=True) - if len(checkpoints_to_delete) > 0: - logger.info(f"Deleted {checkpoints_to_delete} checkpoints") - - -def get_checkpoints_to_delete(ckpt_path: str, topk: int) -> list[str]: - checkpoints = [d for d in os.listdir(ckpt_path) if d.startswith("step_")] - sorted_checkpoints = sorted(checkpoints, key=lambda x: int(x.split("_")[1]), reverse=True) - return [os.path.join(ckpt_path, d) for d in sorted_checkpoints[topk:]] diff --git a/src/zeroband/collectives.py b/src/zeroband/collectives.py deleted file mode 100644 index f9f6d47c..00000000 --- a/src/zeroband/collectives.py +++ /dev/null @@ -1,192 +0,0 @@ -from typing import Callable, Optional, TypeAlias -import torch -import torch.distributed as dist - -from zeroband.config import Compression - -AllReduceFunc: TypeAlias = Callable[ - [torch.Tensor, dist.ReduceOp, Optional[dist.ProcessGroup], Optional[torch.dtype]], None -] - - -def gloo_all_reduce( - tensor: torch.Tensor, - op: dist.ReduceOp = dist.ReduceOp.SUM, # type: ignore (defined weird) - group: Optional[dist.ProcessGroup] = None, -) -> None: - """Wrap gloo all reduce""" - if group is None: - group = dist.distributed_c10d._get_default_group() - if op not in [dist.ReduceOp.SUM, dist.ReduceOp.AVG]: - raise ValueError(f"Unsupported reduce operation {op}. 
Only SUM and AVG are supported.") - - # group = cast(dist.ProcessGroup, group) # just type hint stuff for IDE - if op == dist.ReduceOp.AVG: - # todo check numerical stability of doing post or pre div - tensor.div_(group.size()) - - dist.all_reduce(tensor, op, group=group) - - -def all_reduce( - compression: Compression, - tensor: torch.Tensor, - op: dist.ReduceOp = dist.ReduceOp.SUM, # type: ignore - group: Optional[dist.ProcessGroup] = None, -) -> None: - if compression == Compression.UINT8: - from zeroband.C.collectives import ring_allreduce as ring_allreduce_c - - return ring_allreduce_c(tensor, op, group) - else: - return gloo_all_reduce(tensor, op, group) - - -# =============== -# Code purgatory -# --------------- -# This code is still here because it is used by tests -# ring_allreduce is used by tests/test_c/test_collectives.py to make sure the new c impl doesnt deviate too much numerically -BUFFER_COUNT = 2 - - -def ring_allreduce_py( - tensor: torch.Tensor, - op: dist.ReduceOp = dist.ReduceOp.SUM, # type: ignore - group: Optional[dist.ProcessGroup] = None, - transfer_dtype: Optional[torch.dtype] = None, - quantization_func: Optional[Callable] = None, -) -> None: - """ - Perform all-reduce on a tensor using ring algorithm. - The accumulation will be done in-place on the input tensor. - The transfers will be done using the specified transfer_dtype. - """ - if quantization_func is not None: - if transfer_dtype is not None: - raise ValueError("Quantization and transfer_dtype cannot be used together") - transfer_dtype = tensor.dtype - if transfer_dtype is None: - transfer_dtype = tensor.dtype - if group is None: - group = dist.distributed_c10d._get_default_group() - if op not in [dist.ReduceOp.SUM, dist.ReduceOp.AVG]: - raise ValueError(f"Unsupported reduce operation {op}. 
Only SUM and AVG are supported.")
-
-    world_size = group.size()
-    rank = group.rank()
-
-    # Divide the tensor into chunks
-    flat_tensor = tensor.as_strided((tensor.numel(),), (1,))
-    chunks = flat_tensor.chunk(world_size * BUFFER_COUNT)
-
-    assert flat_tensor.size(0) % (world_size * BUFFER_COUNT) == 0, "Tensor size must be divisible by world_size * BUFFER_COUNT"
-
-    # Temporary buffers for transferring data
-    num_buffers = BUFFER_COUNT * world_size
-    if quantization_func is not None:
-        recv_buffer = [torch.empty_like(chunks[0], dtype=torch.uint8) for _ in range(BUFFER_COUNT)]
-        send_buffer = [None for _ in range(BUFFER_COUNT)]
-        send_lookup_buffer = [None for _ in range(BUFFER_COUNT)]
-        recv_lookup_buffer = [torch.empty(256, dtype=chunks[0].dtype) for _ in range(BUFFER_COUNT)]
-        send_lookup_work = [None for _ in range(BUFFER_COUNT)]
-        recv_lookup_work = [None for _ in range(BUFFER_COUNT)]
-    else:
-        recv_buffer = [torch.empty_like(chunks[0], dtype=transfer_dtype) for _ in range(BUFFER_COUNT)]
-        send_buffer = [torch.empty_like(chunks[0], dtype=transfer_dtype) for _ in range(BUFFER_COUNT)]
-    send_work = [None] * BUFFER_COUNT
-    recv_work = [None] * BUFFER_COUNT
-
-    send_rank = (rank + 1) % world_size
-    recv_rank = (rank - 1) % world_size
-    for step in range(1, world_size * BUFFER_COUNT + 1):
-        send_chunk = (rank * BUFFER_COUNT - step) % num_buffers
-
-        if send_work[step % BUFFER_COUNT] is not None:
-            send_work[step % BUFFER_COUNT].wait()
-            recv_work[step % BUFFER_COUNT].wait()
-            if quantization_func is not None:
-                send_lookup_work[step % BUFFER_COUNT].wait()
-                recv_lookup_work[step % BUFFER_COUNT].wait()
-                # print(recv_lookup_buffer[step % BUFFER_COUNT][recv_buffer[step % BUFFER_COUNT].long()])
-                chunks[send_chunk].add_(
-                    recv_lookup_buffer[step % BUFFER_COUNT][recv_buffer[step % BUFFER_COUNT].long()]
-                )
-            else:
-                chunks[send_chunk].add_(recv_buffer[step % BUFFER_COUNT])
-
-        if step <= (world_size - 1) * BUFFER_COUNT:
-            # Send and receive
-            if quantization_func is not None:
-                send_buffer[step % BUFFER_COUNT], send_lookup_buffer[step % BUFFER_COUNT] = quantization_func(
-                    chunks[send_chunk]
-                )
-                send_lookup_work[step % BUFFER_COUNT] = dist.isend(
-                    send_lookup_buffer[step % BUFFER_COUNT], dst=send_rank, group=group, tag=step + 1000
-                )
-                recv_lookup_work[step % BUFFER_COUNT] = dist.irecv(
-                    recv_lookup_buffer[step % BUFFER_COUNT], src=recv_rank, group=group, tag=step + 1000
-                )
-            else:
-                send_buffer[step % BUFFER_COUNT].copy_(chunks[send_chunk])
-            send_work[step % BUFFER_COUNT] = dist.isend(
-                send_buffer[step % BUFFER_COUNT], dst=send_rank, group=group, tag=step
-            )
-            recv_work[step % BUFFER_COUNT] = dist.irecv(
-                recv_buffer[step % BUFFER_COUNT], src=recv_rank, group=group, tag=step
-            )
-
-    if op == dist.ReduceOp.AVG:
-        for i in range(BUFFER_COUNT):
-            chunks[i + rank * BUFFER_COUNT].divide_(world_size)
-    if quantization_func is not None:
-        for i in range(BUFFER_COUNT):
-            quant_weight, lookup = quantization_func(chunks[i + rank * BUFFER_COUNT])
-            chunks[i + rank * BUFFER_COUNT].copy_(lookup[quant_weight.long()])
-
-    if quantization_func is not None:
-        recv_buffer = [torch.empty_like(chunks[0], dtype=torch.uint8) for _ in range(BUFFER_COUNT)]
-        send_buffer = [None for _ in range(BUFFER_COUNT)]
-        send_lookup_buffer = [None for _ in range(BUFFER_COUNT)]
-        recv_lookup_buffer = [torch.empty(256, dtype=chunks[0].dtype) for _ in range(BUFFER_COUNT)]
-        send_lookup_work = [None for _ in range(BUFFER_COUNT)]
-        recv_lookup_work = [None for _ in range(BUFFER_COUNT)]
-    send_work = [None] * BUFFER_COUNT
-    recv_work = 
[None] * BUFFER_COUNT - - for step in range(1, world_size * BUFFER_COUNT + 1): - send_chunk = (rank * BUFFER_COUNT + BUFFER_COUNT - step) % num_buffers - - if send_work[step % BUFFER_COUNT] is not None: - send_work[step % BUFFER_COUNT].wait() - recv_work[step % BUFFER_COUNT].wait() - if quantization_func is not None: - send_lookup_work[step % BUFFER_COUNT].wait() - recv_lookup_work[step % BUFFER_COUNT].wait() - chunks[send_chunk].copy_( - recv_lookup_buffer[step % BUFFER_COUNT][recv_buffer[step % BUFFER_COUNT].long()] - ) - else: - chunks[send_chunk].copy_(recv_buffer[step % BUFFER_COUNT]) - - if step <= (world_size - 1) * BUFFER_COUNT: - # Send and receive - if quantization_func is not None: - send_buffer[step % BUFFER_COUNT], send_lookup_buffer[step % BUFFER_COUNT] = quantization_func( - chunks[send_chunk] - ) - send_lookup_work[step % BUFFER_COUNT] = dist.isend( - send_lookup_buffer[step % BUFFER_COUNT], dst=send_rank, group=group, tag=step + 1000 - ) - recv_lookup_work[step % BUFFER_COUNT] = dist.irecv( - recv_lookup_buffer[step % BUFFER_COUNT], src=recv_rank, group=group, tag=step + 1000 - ) - else: - send_buffer[step % BUFFER_COUNT].copy_(chunks[send_chunk]) - - send_work[step % BUFFER_COUNT] = dist.isend( - send_buffer[step % BUFFER_COUNT], dst=send_rank, group=group, tag=step - ) - recv_work[step % BUFFER_COUNT] = dist.irecv( - recv_buffer[step % BUFFER_COUNT], src=recv_rank, group=group, tag=step - ) diff --git a/src/zeroband/comms.py b/src/zeroband/comms.py deleted file mode 100644 index ca3d7ce6..00000000 --- a/src/zeroband/comms.py +++ /dev/null @@ -1,609 +0,0 @@ -import sys -import os -import time -import subprocess -from torch.distributed.device_mesh import init_device_mesh -from zeroband.utils.world_info import get_world_info -from zeroband.utils.logger import get_logger -import torch.distributed as dist -from datetime import timedelta -from typing import List, Tuple, Optional -from torch.testing._internal.distributed.fake_pg import FakeProcessGroup -import multiprocessing as mp -from uuid import uuid4 -import toposolve -from zeroband.utils.ip import parse_iperf_output - -TCPSTORE_TIMEOUT = timedelta(seconds=int(os.getenv("ZERO_BAND_GLOBAL_STORE_TIMEOUT_SECONDS", "300"))) -TCPSTORE_POLLING_INTERVAL = float(os.getenv("ZERO_BAND_GLOBAL_STORE_POLLING_INTERVAL_SECONDS", "0.1")) -GLOBAL_PG_TIMEOUT = timedelta(seconds=int(os.getenv("ZERO_BAND_GLOBAL_PG_TIMEOUT_SECONDS", "600"))) -MAX_JOINERS = 100 # Maximum number of nodes that can join in a single reinit -HEARTBEAT_INTERVAL = int( - os.getenv("ZERO_BAND_EDM_HEARTBEAT_INTERVAL_SECONDS", "2") -) # Interval in seconds between heartbeats -HEARTBEAT_TIMEOUT = int( - os.getenv("ZERO_BAND_EDM_HEARTBEAT_TIMEOUT_SECONDS", "10") -) # Time in seconds after which a node is considered dead if no heartbeat is received -IPERF_PORT = int(os.getenv("ZERO_BAND_IPERF_PORT", "10101")) -IPERF_IFNAME = os.getenv("GLOO_SOCKET_IFNAME", "eth0") -BENCH_TENSOR_SIZE = 1_000_000 - - -class ElasticDeviceMesh: - """A class to manage the process groups for elastic training without restarts. - - The way it works is rank 0 coordinates the joining and leaving of nodes. - Rank 0 manages the status to coordinate the creation and recreation of the process groups. - When a node wants to join, rank 0 will setup the store so that all nodes know the new world size and their respective ranks. 
- - Store keys used: - - status: "init", "running", "reinit" - - world_size: The current world size - - mesh_count: The version of the mesh - - rank_{uuid}: The rank of the node with the given uuid - - joiner_{i}: The uuid of the ith joiner. Its a KV implmentation of a queue. - """ - - local_pg: dist.ProcessGroup - global_pg: dist.ProcessGroup - - def __init__( - self, backend: str = "cpu:gloo,cuda:nccl", enable: bool = True, live_recovery_rank_src: int | None = None - ): - self._logger = get_logger() - self.world_info = get_world_info() - self.live_recovery_rank_src = live_recovery_rank_src - - # Initialize global process group - self.global_pg = FakeProcessGroup(self.world_info.rank, 1) - - self.enable = enable - if enable: - self._init_global_pg() - - # Initialize local process group - dist.init_process_group(backend=backend) - self.mesh = init_device_mesh( - "cuda", - (self.world_info.nnodes, self.world_info.local_world_size), - mesh_dim_names=("internode", "intranode"), - ) - self.local_pg = self.mesh.get_group("intranode") - - # Start heartbeat - - self.cuda_local_mesh = init_device_mesh("cuda", mesh_shape=(self.local_pg.size(),)) - self.cpu_local_mesh = init_device_mesh("cpu", mesh_shape=(self.local_pg.size(),)) - - # Logging - if self.enable: - self._optimize_ring_ranks() - if self.live_recovery_rank_src is not None: - self.live_recovery.ask_for_live_ckpt(self.live_recovery_rank_src) - self.global_pg.barrier().wait() - - self._logger.info(f"global_pg size : {self.global_pg.size()}, local_pg size: {self.local_pg.size()}") - - def __del__(self): - self._stop_heartbeat() - dist.destroy_process_group() - - def _init_global_store(self): - self._logger.info( - f"[{self.world_info.global_unique_id}](Leader: {self._global_leader}) TCPStore init: Connecting via {self.world_info.global_addr}:{self.world_info.global_port + self.world_info.rank}" - ) - self.global_store = dist.TCPStore( - host_name=self.world_info.global_addr, - port=self.world_info.global_port + self.world_info.rank, - timeout=TCPSTORE_TIMEOUT, - is_master=self._global_leader, - ) - self.god_store = dist.TCPStore( - host_name=self.world_info.global_addr, - port=self.world_info.global_port, - timeout=TCPSTORE_TIMEOUT, - is_master=False, - ) - - def _init_global_store_values(self): - """Initialize the global store with mesh_count, joiner_0, and status. 
Also sets the global status.""" - self._logger.debug("Initializing global store values") - self.global_store.set(f"gid_{self.world_info.global_rank}", self.world_info.global_unique_id) - self.global_store.set(f"rank_{self.world_info.global_unique_id}", str(self.world_info.global_rank)) - if self._global_leader: - self.global_store.set("mesh_count", "0") - self.global_store.set("world_size", str(self.world_info.global_world_size)) - self.global_store.set("joiner_0", "null") - for i in range(self.world_info.global_world_size): - self.global_store.set(f"barrier_{i}", "null") - self._global_ids = [ - self.global_store.get(f"gid_{i}").decode("utf-8") for i in range(self.world_info.global_world_size) - ] - for i in self._global_ids: - for j in self._global_ids: - self.global_store.set(f"ping_{i}_{j}", "1000_000_000") - self.global_store.set("status", "init") - self.global_status = "init" - else: - self.global_status = self._wait_for_status() - self._global_ids = [ - self.global_store.get(f"gid_{i}").decode("utf-8") for i in range(self.world_info.global_world_size) - ] - - def _create_global_pg(self): - # Delete the old global_pg - if hasattr(self, "global_pg"): - if sys.getrefcount(self.global_pg) > 2: - self._logger.warning( - f"Global PG refcount was {sys.getrefcount(self.global_pg)} when 2 is expected during deletion. This may cause a memory leak." - ) - del self.global_pg # TODO(jackmin): Where do we catch errors in teardown? - self._logger.info("Destroyed process group") - - # Get new global rank and world size - self.world_info.global_rank = int( - self.global_store.get(f"rank_{self.world_info.global_unique_id}").decode("utf-8") - ) - self.world_info.global_world_size = int(self.global_store.get("world_size").decode("utf-8")) - self.mesh_count = int(self.global_store.get("mesh_count").decode("utf-8")) - self._logger.debug( - f"New global rank: {self.world_info.global_rank}, New global world size: {self.world_info.global_world_size} New mesh count: {self.mesh_count}" - ) - - # Create prefix store - prefix_store = dist.PrefixStore(f"mesh_{self.mesh_count}", self.global_store) - self._logger.debug(f"Created prefix store with mesh_{self.mesh_count}") - - # Create process group - self._logger.debug( - f"Creating global pg with {self.world_info.global_world_size} rank {self.world_info.global_rank}" - ) - self.global_pg = dist.ProcessGroupGloo( - prefix_store, self.world_info.global_rank, self.world_info.global_world_size, GLOBAL_PG_TIMEOUT - ) - self._logger.debug("Global pg created with %d peers. 
Timeout of %s", self.global_pg.size(), GLOBAL_PG_TIMEOUT) - - def _optimize_ring_ranks(self): - self._global_ids = [ - self.global_store.get(f"gid_{i}").decode("utf-8") for i in range(self.world_info.global_world_size) - ] - if self.world_info.local_rank == 0: - self._logger.debug("Measuring bandwidths") - self._measure_connectivity() - self._logger.debug("Measuring bandwidths done") - - self.local_pg.barrier().wait() - self.global_pg.barrier().wait() - - if self._global_leader: - self._logger.debug("Calculating TSP") - pings = self.get_pings() - min_dist, path = toposolve.TSPSolver().solve_tsp(pings) - self._logger.debug(f"Min distance: {min_dist}") - self._logger.debug(f"Path: {path}") - new_gids = [self._global_ids[i] for i in path[:-1]] - assert set(new_gids) == set(self._global_ids) - - for i, gid in enumerate(new_gids): - self.global_store.set(f"rank_{gid}", str(i)) - self.global_store.set(f"gid_{i}", gid) - self.global_store.set("mesh_count", str(self.mesh_count + 1)) - - self.local_pg.barrier().wait() - self.global_pg.barrier().wait() - - self._global_ids = [ - self.global_store.get(f"gid_{i}").decode("utf-8") for i in range(self.world_info.global_world_size) - ] - self._create_global_pg() - - def _queue_join(self): - """Queue a node to join the mesh.""" - for i in range(MAX_JOINERS): - joiner_id = self.global_store.get(f"joiner_{i}").decode("utf-8") - if joiner_id == "null": - self.global_store.set(f"joiner_{i}", self.world_info.global_unique_id) - self.global_store.set(f"joiner_{i + 1}", "null") - break - else: - raise RuntimeError("Too many joiners") - - def _get_joiners(self) -> Tuple[List[str], List[str]]: - joiners = [] - for i in range(MAX_JOINERS): - joiner_id = self.global_store.get(f"joiner_{i}").decode("utf-8") - if joiner_id == "null": - break - joiners.append(joiner_id) - return joiners - - def _clear_joiners(self): - self.global_store.set("joiner_0", "null") - - def _wait_for_status(self, status: Optional[str] = None) -> str: - """Wait for status to be set in the store. - - Args: - store (dist.Store): The store to check. - status (Optional[str], optional): The status to wait for. If None, wait for any status. Defaults to None. - Returns: - status (str): The status. 
- """ - while True: - try: - ret = self.global_store.get("status").decode("utf-8") - if status is None or ret == status: - return ret - time.sleep(TCPSTORE_POLLING_INTERVAL) - except dist.DistStoreError as e: - if status is not None: - raise e - time.sleep(0.1) - - def _init_global_pg(self) -> None: - # Each rank gets its own global store with global rank 0 as the master - time_start = time.perf_counter() - - self._global_leader = self.world_info.global_rank == 0 - self._init_global_store() - - # Initialize store values - self._init_global_store_values() - - self.live_recovery = LiveRecovery(store=self.global_store) - - if self.global_status == "running": # Join path - # Ask to join and then wait for the status to be "reinit" - self._logger.info("Waiting to join") - self._queue_join() - self._wait_for_status("reinit") - - # Create global process group - self._create_global_pg() - - # Update global store values - if self._global_leader: - self.global_store.set("status", "running") - self.global_store.set("resolved_time", uuid4().hex) - self.global_status = "running" - self._last_resolved_time = self.global_store.get("resolved_time").decode("utf-8") - - self._start_heartbeat() - - self._logger.info( - f"Elastic Device mesh init done with {self.global_pg.size()} peers in {time.perf_counter() - time_start} seconds" - ) - - if self.world_info.local_rank == 0: - self._start_iperf_server() - self._evicted_nodes = [] - - def _start_heartbeat(self): - """Start sending heartbeats to the global store in a separate process.""" - self._heartbeat_stop_event = mp.Event() - self._heartbeat_process = mp.Process(target=self._heartbeat_loop, args=(self._heartbeat_stop_event,)) - self._heartbeat_process.start() - - def _stop_heartbeat(self): - """Stop the heartbeat process.""" - self._send_deathrattle() - if hasattr(self, "_heartbeat_stop_event"): - self._heartbeat_stop_event.set() - self._heartbeat_process.join() - - def _heartbeat_loop(self, stop_event): - """Continuously send heartbeats until stopped.""" - try: - while not stop_event.is_set(): - self._send_heartbeat() - time.sleep(HEARTBEAT_INTERVAL) - finally: - self._send_deathrattle() - - def _send_heartbeat(self): - """Send a heartbeat to the global store.""" - current_time = time.time() - try: - self.global_store.set(f"heartbeat_{self.world_info.global_unique_id}", str(current_time)) - except Exception: - self._logger.error("Error sending heartbeat", exc_info=True) - pass - - def _send_deathrattle(self): - """Send a deathrattle to the global store.""" - if hasattr(self, "global_store"): - self.global_store.set(f"heartbeat_{self.world_info.global_unique_id}", "-100") - else: - import warnings - - warnings.warn("global_store garbage collected. Skipping deathrattle.") - - def _check_heartbeats(self) -> List[str]: - """Check heartbeats and return a list of nodes that have missed their heartbeats.""" - dead_nodes = [] - current_time = time.time() - for gid in self._global_ids: - try: - last_heartbeat = float(self.global_store.get(f"heartbeat_{gid}").decode("utf-8")) - self._logger.debug(f"Node {gid} last heartbeat: {last_heartbeat}") - if current_time - last_heartbeat > HEARTBEAT_TIMEOUT: - dead_nodes.append(gid) - self.global_store.delete_key(f"heartbeat_{gid}") - except dist.DistStoreError: - self._logger.warning(f"Node {gid} has no heartbeat") - return dead_nodes - - def _resolve_world(self, admit_joiners: bool = False) -> bool: - """Set the new world size and ranks for all nodes if there are joiners or dead nodes. Else, do nothing. 
- - Args: - admit_joiners (bool, optional): Whether to admit joiners. Defaults to False. - Returns: - bool: True if the world was changed, False otherwise. - """ - # Find joiners - if admit_joiners: - joiners = self._get_joiners() - else: - joiners = [] - - # Check for dead nodes - dead_nodes = self._check_heartbeats() - self._logger.debug( - "Joiners (%sadmitting): %s, Dead nodes: %s, Evicting nodes: %s", - "" if admit_joiners else "not ", - joiners, - dead_nodes, - self._evicted_nodes, - ) - dead_nodes.extend(self._evicted_nodes) - - # If no joiners or dead nodes, no resolution needed - if len(joiners) == 0 and len(dead_nodes) == 0: - return False - - # Remap live ranks to smaller world_size caused by dead nodes - leaving_nodes = set(dead_nodes) - live_ranks = [i for i in self._global_ids if i not in leaving_nodes] - for i, rank in enumerate(live_ranks): - self.global_store.set(f"rank_{rank}", str(i)) - self.global_store.set(f"gid_{i}", rank) - new_world_size = len(live_ranks) - - # Give joiners new ranks - for joiner_id in joiners: - self.global_store.set(f"rank_{joiner_id}", str(new_world_size)) - self.global_store.set(f"gid_{new_world_size}", joiner_id) - live_ranks.append(joiner_id) - new_world_size += 1 - - self._global_ids = live_ranks - for i in self._global_ids: - for j in self._global_ids: - self.global_store.set(f"ping_{i}_{j}", "1000_000_000") - for i in range(1, new_world_size): - self.global_store.set(f"barrier_{i}", "null") - # Update world_size - self.global_store.set("world_size", str(new_world_size)) - self.global_store.set("mesh_count", str(self.mesh_count + 1)) - # Set status to "reinit" - self.global_store.set("status", "reinit") - return True - - def maybe_reinit_global_pg(self, admit_joiners: bool = False) -> bool: - """Reinitialize the global_pg if there are is a state change. - - Args: - admit_joiners (bool, optional): Whether to admit joiners. Defaults to False. - Returns: - bool: True if the global_pg was reinitialized, False otherwise. - """ - if not self.enable: - # no op if disabled - return - - time_start = time.perf_counter() - self._logger.debug("[%s] Resolving world", self.world_info.global_unique_id) - if self._global_leader: - self._resolve_world(admit_joiners=admit_joiners) - self.global_store.set("resolved_time", uuid4().hex) - else: - while (ans := self.global_store.get("resolved_time").decode("utf-8")) == self._last_resolved_time: - # TODO: Have a timeout here in case the leader is dead - time.sleep(TCPSTORE_POLLING_INTERVAL) - self._last_resolved_time = ans - - self._logger.debug("World resolved in %s seconds", time.perf_counter() - time_start) - - status = self.global_store.get("status").decode("utf-8") - if status == "running": # No joiners or dead nodes - return False - - # Reinit Path - try: - self._create_global_pg() - self._optimize_ring_ranks() - self.global_pg.barrier().wait() - except Exception as e: - self._logger.error(f"Error recreating process group: {e}. 
Retrying...") - return self.maybe_reinit_global_pg(admit_joiners=admit_joiners) - - if self._global_leader: - self._clear_joiners() - self.global_store.set("status", "running") - - self._logger.debug("Reinitialized global_pg done in %s seconds", time.perf_counter() - time_start) - - # TODO: We need to reset the self.world_info.global_rank reference - # Somehow the reference becomes stale and the heartbeats become wrong - # This will be fixed when heartbeats become unique id dependent which never changes - self._logger.debug("Reset Heartbet") - self._stop_heartbeat() - self._start_heartbeat() - self._logger.debug("Reset Heartbeat done") - return True - - def get_global_pg(self, maybe_reinit: bool = False) -> dist.ProcessGroup: - """Get the global process group. If maybe_reinit is True, reinitialize the global process group if needed.""" - if maybe_reinit: - self.maybe_reinit_global_pg() - return self.global_pg - - def monitored_barrier(self, flag: str): - flag = str(flag) - time_start = time.perf_counter() - self._logger.debug("[%s] Monitored Barrier %s", self.world_info.global_unique_id, flag) - if self._global_leader: - self._logger.debug("Others have %d seconds to resolve", GLOBAL_PG_TIMEOUT.total_seconds()) - while not all( - self.global_store.get(f"barrier_{i}").decode("utf-8") == flag - for i in range(1, self.world_info.global_world_size) - ): - if time.perf_counter() - time_start > GLOBAL_PG_TIMEOUT.total_seconds(): - self._logger.error("Monitored barrier failed due to timeout") - self._evicted_nodes = [ - i - for i in range(1, self.world_info.global_world_size) - if self.global_store.get(f"barrier_{i}").decode("utf-8") != flag - ] - self._logger.info("Evicting nodes: %s", self._evicted_nodes) - self.global_store.set(f"barrier_{self.world_info.global_rank}", "error") - # We neeed to evict the dead node - raise RuntimeError("Monitored barrier failed due to timeout") - time.sleep(TCPSTORE_POLLING_INTERVAL) - self.global_store.set(f"barrier_{self.world_info.global_rank}", flag) - else: - self.global_store.set(f"barrier_{self.world_info.global_rank}", flag) - while (ans := self.global_store.get("barrier_0").decode("utf-8")) != flag: - if ans == "error": - raise RuntimeError("Monitored barrier failed due to error") - # TODO: Have a timeout here in case the leader is dead - time.sleep(TCPSTORE_POLLING_INTERVAL) - - self._logger.debug("Monitored barrier resolved in %s seconds", time.perf_counter() - time_start) - - def get_pings(self) -> List[List[int]]: - pings = [[1000_000_000] * self.world_info.global_world_size for _ in range(self.world_info.global_world_size)] - for i, e1 in enumerate(self._global_ids): - for j, e2 in enumerate(self._global_ids): - if i == j: - continue - pings[i][j] = int(self.god_store.get(f"ping_{e1}_{e2}")) - - self._logger.debug("\n %s", format_grid(pings)) - return pings - - def _start_iperf_server(self) -> None: - """Start the iperf server process.""" - try: - from zeroband.utils.ip import get_ip_address - - iperf_addr = get_ip_address(IPERF_IFNAME) - iperf_port = IPERF_PORT + self.world_info.global_rank - cmd: List[str] = ["iperf", "-s", "-p", str(iperf_port)] - self.server_process = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - self.god_store.set(f"iperf_{self.world_info.global_unique_id}", f"{iperf_addr}:{iperf_port}") - self._logger.info(f"Started iperf server on {iperf_addr} with port {iperf_port}") - except Exception as e: - self._logger.error(f"Failed to start iperf server: {str(e)}") - raise - - def 
_measure_connectivity(self): - for i in self._global_ids: - if i == self.world_info.global_unique_id: - continue - target_host, target_port = self.god_store.get(f"iperf_{i}").decode("utf-8").split(":") - target_port = int(target_port) - time_taken = self.measure_bandwidth(target_host, target_port) - self.god_store.set(f"ping_{self.world_info.global_unique_id}_{i}", str(time_taken)) - - def measure_bandwidth(self, target_host: str, target_port: int) -> int: - """ - Measure bandwidth to a specific target. - - Args: - target_host: The host to measure bandwidth to - target_port: The port to measure bandwidth to - - Returns: - int: The time taken to transfer 10Tb of data in seconds - """ - try: - cmd: List[str] = [ - "iperf", - "-c", - target_host, - "-p", - str(target_port), - "-t", - "1", # 1 second test - ] - result: subprocess.CompletedProcess = subprocess.run(cmd, capture_output=True, text=True, timeout=5) - - if result.returncode != 0: - raise Exception(f"iperf error: {result.stderr}") - - time_taken: int = int(1e13 / parse_iperf_output(result.stdout)) - time_taken = min(time_taken, 1_000_000_000) - - return time_taken - except Exception as e: - self._logger.error(f"Error measuring bandwidth to {target_host}:{target_port} {str(e)}") - return int(1e9) - - -def format_grid(grid): - N = len(grid) - - # Set the main diagonal elements to 0 - for i in range(N): - grid[i][i] = 0 - - # Determine the width needed for formatting based on max possible value (99.99) and indices - cell_width = 6 - - # Create header row with column indices - header_row = " " + " | ".join(f"{j:>{cell_width-1}}" for j in range(N)) - - # Start building the formatted grid string - formatted_grid = header_row + "\n" - - for i, row in enumerate(grid): - # Format each element in the row - formatted_row = [f"{i:>2}"] # Add row index at the beginning of the row - for value in row: - # Divide by 1000 and format to 2 decimal places - formatted_value = f"{value / 1000:.2f}" - formatted_row.append(formatted_value) - - # Join the elements of the row with '|' and add it to the grid string - formatted_grid += " | ".join(formatted_row).center(cell_width * (N + 1)) + "\n" - - return formatted_grid.strip() - - -class LiveRecovery: - def __init__(self, store: dist.Store): - self.logger = get_logger() - self.world_info = get_world_info() - - self.store = dist.PrefixStore("live_recovery", store) - self.reset() - - def reset(self): - self.store.set(f"rank_{self.world_info.global_rank}", "null") - - def should_send_ckpt_to(self) -> int | None: - """use this function to check if someone is awaiting for a live ckpt""" - data = self.store.get(f"rank_{self.world_info.global_rank}").decode("utf-8") - if data == "null": - return None - try: - return int(data) - except ValueError as e: - self.logger.error(f"Error parsing live recovery data: {e}") - return None - - def ask_for_live_ckpt(self, rank: int) -> int | None: - """use this function to send a signal to a node to ask for a live ckpt""" - self.store.set(f"rank_{rank}", str(self.world_info.global_rank)) diff --git a/src/zeroband/compression.py b/src/zeroband/compression.py deleted file mode 100644 index 2fc1da75..00000000 --- a/src/zeroband/compression.py +++ /dev/null @@ -1,70 +0,0 @@ -# Code adapted from https://github.com/PrimeIntellect-ai/hivemind/blob/213bff98a62accb91f254e2afdccbf1d69ebdea9/hivemind/compression/quantization.py -# Original code is licensed under the MIT License. -# See the LICENSE file in the original repository for more information. 
- -import torch -import numpy as np -from typing import Tuple -import math -from concurrent.futures import ThreadPoolExecutor -import os - -RANGE_IN_SIGMAS: int = 6 -EXECUTOR = ThreadPoolExecutor(max_workers=int(os.environ.get("QUANTIZATION_THREADS", 128))) -n_bins = 2**8 - - -def average_buckets(tensor: torch.Tensor, quant_weight: torch.Tensor, n_bins: int) -> torch.Tensor: - """Return the average value in each bucket""" - bin_sums = torch.zeros(n_bins).scatter_add_(0, quant_weight.flatten().long(), tensor.flatten()) - bin_counts = torch.clamp_min_(torch.bincount(quant_weight.flatten(), minlength=n_bins), 1) - lookup = bin_sums / bin_counts - return lookup - - -def get_chunk_size(num_elements: int, min_chunk_size: int) -> int: - """Adjust chunk_size to minimize imbalance between chunk sizes""" - if min_chunk_size >= num_elements: - return min_chunk_size - leftover_elements = num_elements % min_chunk_size - num_chunks = num_elements // min_chunk_size - return min_chunk_size + (leftover_elements - 1) // num_chunks + 1 - - -def quantile_qq_approximation(array: np.ndarray, n_quantiles: int, min_chunk_size: int = 10**5) -> np.ndarray: - """Estimate uniform quantiles of data using quantile-of-quantiles. Runs in parallel.""" - if not array.data.c_contiguous and array.data.f_contiguous: - array = array.T - array = np.ascontiguousarray(array.reshape(-1)) - quantiles = np.linspace(0.0, 1.0, num=n_quantiles, dtype=array.dtype) - chunk_size = get_chunk_size(len(array), min_chunk_size) - num_chunks = (len(array) - 1) // chunk_size + 1 - partition_quantiles = np.empty((num_chunks, len(quantiles)), dtype=array.dtype) - - jobs = [] - for i in range(num_chunks): - chunk = slice(chunk_size * i, chunk_size * (i + 1)) - jobs.append(EXECUTOR.submit(np.quantile, array[chunk], quantiles, out=partition_quantiles[i])) - - for job in jobs: - job.result() - return np.quantile(partition_quantiles, quantiles) - - -def uniform_8bit_quantize(tensor: torch.Tensor, inplace: bool = True) -> Tuple[torch.Tensor, torch.Tensor]: - offset = n_bins // 2 - # shift = tensor.mean() - # centered_tensor = tensor.sub_(shift) if inplace else tensor - shift - centered_tensor = tensor - std_unbiased = centered_tensor.norm() / math.sqrt(centered_tensor.numel() - 1) - scale = RANGE_IN_SIGMAS * std_unbiased / n_bins - quantized = torch.quantize_per_tensor(centered_tensor, scale, offset, torch.quint8).int_repr() - lookup = average_buckets(tensor, quantized, n_bins) - return quantized, lookup - - -def quantile_8bit_quantize(tensor: torch.Tensor, inplace: bool = True) -> Tuple[torch.Tensor, torch.Tensor]: - borders = torch.as_tensor(quantile_qq_approximation(tensor.numpy(), n_bins + 1)[1:-1]) - quantized = torch.clamp_(torch.bucketize(tensor, borders), 0, n_bins - 1) - lookup = average_buckets(tensor, quantized, n_bins) - return quantized, lookup diff --git a/src/zeroband/config.py b/src/zeroband/config.py deleted file mode 100644 index 84e2b294..00000000 --- a/src/zeroband/config.py +++ /dev/null @@ -1,275 +0,0 @@ -from enum import Enum -from typing import Any, Literal, TypeAlias -import os - -from pydantic import create_model, model_validator -from pydantic_config import BaseConfig - -AttnFnType: TypeAlias = Literal["flex", "math"] - -class Compression(Enum): - NO = "no" - UINT8 = "uint8" - - -class DataConfig(BaseConfig): - dataset_name_or_paths: str = "datasets/fineweb-edu" - val_dataset_name_or_paths: str | None = None - seq_length: int = 1024 - fake: bool = False - num_workers: int = 4 - max_train_samples: int | None = None - 
max_eval_samples: int | None = None - dataset_ratio: str | None = None - data_rank: int | None = None - data_world_size: int | None = None - reverse_data_files: bool = False - split_by_data_rank: bool = True - - -class AdamConfig(BaseConfig): - type: Literal["adam"] = ( - "adam" # the literal is used to distinguish between the different optimizers configuration in the union type - ) - lr: float = 4e-4 - weight_decay: float = 0.1 - betas1: float = 0.9 - betas2: float = 0.95 - - -class SoapConfig(BaseConfig): - type: Literal["soap"] = "soap" - lr: float = 4e-4 - weight_decay: float = 1e-05 - betas1: float = 0.9 - betas2: float = 0.95 - - max_preconditioner_dim: int = 8192 - precondition_frequency: int = 100 - - -OptimizersConfig: TypeAlias = AdamConfig | SoapConfig - - -class OptimConfig(BaseConfig): - optim: OptimizersConfig = AdamConfig() - - sched_type: Literal["cosine", "linear", "wsd-sqrt"] = "cosine" - warmup_steps: int = 1000 - stable_steps: int = 80_000 - total_steps: int = 88_000 - batch_size: int = 512 - - z_loss: bool = False - z_loss_weight: float = 2e-4 - num_chunks: int | None = None - - -class DilocoConfig(BaseConfig): - outer_lr: float = 0.7 - inner_steps: int - compression: Compression = Compression.NO - - retry_all_reduce: int = 3 - - -class MemoryProfilerConfig(BaseConfig): - freq: int = 10 - snapshot_dir: str - - -class TrainConfig(BaseConfig): - micro_bs: int = 1 - - ac_ckpt: bool | int = False - reshard_after_forward: bool = True # old shard grad op True mean full shard - - reduce_fp32: bool = False # should be True if SXM. Keep to false as default for backward compatibility - - log_model_hash: bool = False - - memory_profiler: MemoryProfilerConfig | None = None - - torch_profiler: bool = False - - sequence_packing: bool = True - - torch_compile: bool = True - - fused_linear_ce: bool = False - - fsdp_cpu_offload: bool = False - - attn_fn: AttnFnType = "flex" - - -class MonitorConfig(BaseConfig): - log_flush_interval: int = 10 - base_url: str | None = None - auth_token: str | None = None - - -class RemoteConfig(BaseConfig): - path: str # could be a s3 path - interval: int - - -class CkptConfig(BaseConfig): - path: str | None = None - interval: int | None = None - topk: int | None = None - - remote: RemoteConfig | None = None - - remote_data_path: str | None = None - remote_data_load: bool = False - - resume: str | None = None - - skip_dataloader: bool = False - - live_recovery_rank_src: int | None = None - - data_path: str | None = None - - token_count: int | None = None - - @model_validator(mode="after") - def validate_path_and_interval(self): - if (self.path is None) != (self.interval is None): - raise ValueError("path and interval must be both set or both None") - if self.path is None and self.remote is not None: - raise ValueError("remote_path is set but path is not set") - - return self - - @model_validator(mode="after") - def validate_remote_data_path(self): - if self.remote_data_load and self.data_path is not None: - raise ValueError("remote_data_load and data_path are mutually exclusive") - - if self.remote_data_load and self.remote_data_path is None: - raise ValueError("remote_data_load is set but remote_data_path is not set") - return self - - -ENV_VAR_PREFIX = "ZERO_BAND_" - -class Config(BaseConfig): - # main config - name_model: Literal["debugmodel", "70M","150M", "271M", "1B", "7B", "10B", "13B", "26B", "70B"] = "150M" - type_model: Literal["llama2", "llama3"] = "llama3" - - # Project/Run - project: str = "zeroband" - run_id: str | None = None - run_name: 
str | None = None - - # Logger - metric_logger_type: Literal["wandb", "dummy"] = "wandb" - wandb_resume: bool = False - log_level: Literal["NOTSET", "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO" - log_all_rank: bool = False - - # sub config - diloco: DilocoConfig | None = None - data: DataConfig = DataConfig() - optim: OptimConfig = OptimConfig() - train: TrainConfig - monitor: MonitorConfig | None = None - - ckpt: CkptConfig = CkptConfig() - - @model_validator(mode="after") - def ckpt_diloco_step(self): - if self.ckpt is not None and self.ckpt.interval is not None and self.diloco is not None: - assert ( - self.ckpt.interval % self.diloco.inner_steps == 0 - ), "ckpt interval must be a multiple of diloco inner steps as we only save at the end of an outer step" - return self - - @model_validator(mode="after") - def validate_live_recovery_rank_src(self): - if self.ckpt is not None and self.ckpt.live_recovery_rank_src is not None and self.diloco is None: - raise ValueError("live_recovery_rank_src is only supported with diloco") - return self - - -def resolve_env_vars(config: Config) -> None: - """ - Resolve environment variables for config fields. - Modifies the config in place. - Environment variables should be prefixed with ZERO_BAND_. - """ - - def _resolve_value(env_var: str, field_name: str, config_obj: Any) -> Any: - """ - Resolve a single value from an environment variable - env_var: full environment variable name (e.g. ZERO_BAND_TRAIN_MICRO_BS) - field_name: actual field name in the config object (e.g. micro_bs) - """ - value = os.environ.get(env_var) - if value is not None: - if (field_info := config_obj.__class__.model_fields.get(field_name)) is None: - raise AttributeError(f"Config {config_obj} has no attribute {field_name}") - - try: - # Create a temporary model with just this field, then validate and rip it out. 
- py_model = create_model('TempModel', __base__ = BaseConfig, **{field_name: (field_info.annotation, ...)}) # type: ignore - validated = py_model.model_validate({field_name: value}) - return getattr(validated, field_name) - except Exception as e: - raise ValueError(f"Error setting {env_var}={value}: {e}") - return None - - def _resolve_nested(prefix: str, config_obj: Any) -> None: - if not hasattr(config_obj, 'model_fields'): - return - - for field_name, _ in config_obj.__class__.model_fields.items(): - # Build the full env var name - full_env_var = f"{ENV_VAR_PREFIX}{prefix}_{field_name}".upper() if prefix else f"{ENV_VAR_PREFIX}{field_name}".upper() - - # Try to resolve the field directly using the local field name - value = _resolve_value(full_env_var, field_name, config_obj) - if value is not None: - setattr(config_obj, field_name, value) - - # Handle nested configs - field_value = getattr(config_obj, field_name) - if field_value is not None and hasattr(field_value, 'model_fields'): - # Pass the prefix for building env var names, but use local field names for lookup - _resolve_nested(f"{prefix}_{field_name}" if prefix else field_name, field_value) - - def _get_valid_env_vars(prefix: str, config_obj: Any) -> set[str]: - """Recursively collect all valid environment variable names""" - valid_vars = set() - if not hasattr(config_obj, 'model_fields'): - return valid_vars - - for field_name, _ in config_obj.__class__.model_fields.items(): - full_env_var = f"{ENV_VAR_PREFIX}{prefix}_{field_name}".upper() if prefix else f"{ENV_VAR_PREFIX}{field_name}".upper() - valid_vars.add(full_env_var) - - field_value = getattr(config_obj, field_name) - if field_value is not None and hasattr(field_value, 'model_fields'): - nested_prefix = f"{prefix}_{field_name}" if prefix else field_name - valid_vars.update(_get_valid_env_vars(nested_prefix, field_value)) - - return valid_vars - - # Check for any invalid ZERO_BAND_ environment variables - valid_env_vars = _get_valid_env_vars("", config) - invalid_vars = [] - for env_var in os.environ: - if env_var.startswith(ENV_VAR_PREFIX) and env_var not in valid_env_vars: - invalid_vars.append(env_var) - - if invalid_vars: - raise ValueError( - f"Found invalid environment variables with {ENV_VAR_PREFIX} prefix: {', '.join(invalid_vars)}\n" - "See the full list of valid config veriables in src/zeroband/config.py." - ) - - # Now resolve the valid ones. 
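For illustration, the resolution scheme implemented above can be exercised as follows (a minimal sketch assuming `Config`, `TrainConfig`, and `resolve_env_vars` from this file are in scope):

```python
# Sketch of the ZERO_BAND_ environment override scheme (illustrative only).
import os

# Nested fields map to underscore-joined, upper-cased variable names,
# e.g. config.train.micro_bs -> ZERO_BAND_TRAIN_MICRO_BS.
os.environ["ZERO_BAND_TRAIN_MICRO_BS"] = "4"
os.environ["ZERO_BAND_OPTIM_BATCH_SIZE"] = "256"

config = Config(train=TrainConfig())
resolve_env_vars(config)
assert config.train.micro_bs == 4
assert config.optim.batch_size == 256
```

Any `ZERO_BAND_`-prefixed variable that does not map to a known field raises a `ValueError`, as the validation pass above implements.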
- _resolve_nested("", config) diff --git a/src/zeroband/data.py b/src/zeroband/data.py index 64d8be19..b8e98b77 100644 --- a/src/zeroband/data.py +++ b/src/zeroband/data.py @@ -3,8 +3,9 @@ from typing import Any, Generator, Optional, List, Dict, TypedDict, Union import functools -from zeroband.utils.logger import get_logger -from zeroband.config import DataConfig +from pydantic_config import BaseConfig + +from zeroband.logger import get_logger import torch from torch.utils.data import IterableDataset, Dataset @@ -19,6 +20,21 @@ TEST_VOCAB_SIZE = 1024 +class DataConfig(BaseConfig): + dataset_name_or_paths: str = "datasets/fineweb-edu" + val_dataset_name_or_paths: str | None = None + seq_length: int = 1024 + fake: bool = False + num_workers: int = 4 + max_train_samples: int | None = None + max_eval_samples: int | None = None + dataset_ratio: str | None = None + data_rank: int | None = None + data_world_size: int | None = None + reverse_data_files: bool = False + split_by_data_rank: bool = True + + class FakeTokenizedDataset(IterableDataset): """This is a dummy dataset that generates random sequences of length seq_len and vocab_size""" @@ -273,6 +289,7 @@ def load_state_dict(self, state_dict): dataset.load_state_dict(state_dict[f"dataset_{i}"]) self._init_random_state() + def get_dataloader( tokenizer, world_size: int, @@ -395,7 +412,6 @@ def load_all_datasets( split_rank = rank split_world_size = world_size - get_logger().info("Loading Train dataset(s)") ds = _load_datasets( diff --git a/src/zeroband/diloco.py b/src/zeroband/diloco.py deleted file mode 100644 index 630a8d88..00000000 --- a/src/zeroband/diloco.py +++ /dev/null @@ -1,215 +0,0 @@ -import re -import time -import torch -from torch import nn -from zeroband.comms import ElasticDeviceMesh -from zeroband.collectives import Compression, all_reduce -from zeroband.utils.world_info import get_world_info -from zeroband.utils.logger import get_logger -from zeroband.config import DilocoConfig -import torch.distributed as dist -from torch.distributed._tensor.api import DTensor -from functools import lru_cache - - -@lru_cache(maxsize=None) -def _find_first_number(s: str) -> int: - match = re.search(r"\d+", s) - if match: - return int(match.group()) - else: - return -1 - - -class Diloco: - """ - This class implements the diloco algorithm from https://arxiv.org/abs/2311.08105 and https://arxiv.org/abs/2407.07852. - - It handles the outer loop as well as the inter node communication. - - There is no VRAM overhead with this implementation as the model is outer optimizer is offloaded to cpu. - All reduce communication are also done on cpu using GLOO. 
- - Example usage: - - # Example usage in a training loop: - - diloco = Diloco(config.diloco, model, elastic_device_mesh) - - for outer_step in range(num_outer_steps): - for inner_step in range(config.diloco.inner_steps): - # Regular inner training loop - optimizer.zero_grad() - loss = model(batch) - loss.backward() - optimizer.step() - - diloco.step(model) - """ - - def __init__( - self, - config: DilocoConfig, - model: nn.Module, - elastic_device_mesh: ElasticDeviceMesh, - ): - self.config = config - - if config.compression == Compression.UINT8: - from zeroband.C.collectives import ring_allreduce as _ # noqa: F401 - # just force compilation - - self.elastic_device_mesh = elastic_device_mesh - - self._logger = get_logger() - self.world_info = get_world_info() - - self._init_offloaded_optimizer(model=model) - - @torch.no_grad() - def _init_offloaded_optimizer(self, model): - self.param_list_cpu = self.get_offloaded_param(model) - self.outer_optimizer = torch.optim.SGD( - self.param_list_cpu, lr=self.config.outer_lr, momentum=0.9, nesterov=True - ) - self._logger.debug("offload model to cpu") - - @torch.no_grad() - def sync_pseudo_gradient(self, model: nn.Module, fake: bool = False, flag: str = "outer"): - """ - Sync the pseudo gradient from the local process group to the global process group - """ - _start_time = time.perf_counter() - - self.elastic_device_mesh.maybe_reinit_global_pg(admit_joiners=False) - world_size_post_init = self.elastic_device_mesh.global_pg.size() - - world_size = world_size_post_init - - self._logger.debug("sync pseudo gradient %s with world size %d", " fake" if fake else "", world_size) - - global_pg = self.elastic_device_mesh.global_pg - for i in range(self.config.retry_all_reduce): - for param_offloaded, param in zip(self.param_list_cpu, model.parameters()): - assert isinstance(param_offloaded.grad, DTensor) - if fake: - param_offloaded.grad.to_local().zero_() - else: - param_offloaded.grad.to_local().copy_(param_offloaded.data.to_local()) - param_offloaded.grad.to_local().sub_(param.data.to_local().to(param_offloaded.data.device)) - try: - self.offloaded_grad_flat_tensor.div_(world_size) - _collective_start_time = time.perf_counter() - self._logger.debug("Waiting on barrier") - self.elastic_device_mesh.monitored_barrier(flag) - - self._logger.debug("Beginning all reduce") - # all_reduce(self.config.compression, self.offloaded_grad_flat_tensor, dist.ReduceOp.SUM, global_pg) - for j, tensor_group in enumerate(self._offloaded_grad_grouped_tensor): - t0 = time.perf_counter() - all_reduce(self.config.compression, tensor_group, dist.ReduceOp.SUM, global_pg) - self._logger.debug( - f"{j}/{len(self._offloaded_grad_grouped_tensor)} all reduce bucket done in {time.perf_counter() - t0:.6f} seconds, numel: {tensor_group.numel()}" - ) - - self._logger.debug( - f"All reduce takes {time.perf_counter() - _collective_start_time:.6f} seconds numels: {self.offloaded_grad_flat_tensor.numel()}" - ) - break - except Exception as e: - self._logger.error(f"Error syncing pseudo gradient: {e}, retry {i+1}/{self.config.retry_all_reduce}") - global_pg = self.elastic_device_mesh.get_global_pg(maybe_reinit=True) - else: - self._logger.error( - "Failed to sync pseudo gradient after %d retries. 
Resorting to calculating pseudo-gradient without reduce",
-                self.config.retry_all_reduce,
-            )
-            for param_offloaded, param in zip(self.param_list_cpu, model.parameters()):
-                if fake:
-                    param_offloaded.grad.to_local().zero_()
-                else:
-                    param_offloaded.grad.to_local().copy_(param_offloaded.data.to_local())
-                    param_offloaded.grad.to_local().sub_(param.data.to_local().to(param_offloaded.data.device))
-
-        self._logger.info(f"Sync pseudo-gradient in {time.perf_counter() - _start_time:.6f} seconds")
-
-    @torch.no_grad()
-    def sync_inner_model(self, model: nn.Module):
-        """
-        Sync the inner model from the CPU outer model to GPU
-        """
-
-        self._logger.debug("sync inner model")
-        for param_offloaded, param in zip(self.param_list_cpu, model.parameters()):
-            param.data.to_local().copy_(param_offloaded.data.to_local())
-
-    @torch.no_grad()
-    def get_offloaded_param(self, model: nn.Module) -> list[nn.Parameter]:
-        """
-        Offload the model parameters to cpu
-        """
-        param_items = [(name, param) for name, param in model.named_parameters() if param.requires_grad]
-        numels = sum(param.to_local().numel() for _, param in param_items)
-
-        self.offloaded_data_flat_tensor = torch.empty((numels,), device="cpu", dtype=torch.float32)
-        self.offloaded_grad_flat_tensor = torch.zeros((numels,), device="cpu", dtype=torch.float32)
-        current_offset = 0
-        offloaded_params = []
-        param_group_cutoff = []
-
-        prev_id = None
-        for name, param in param_items:
-            if _find_first_number(name) != prev_id:
-                param_group_cutoff.append(current_offset)
-                prev_id = _find_first_number(name)
-
-            # Here we copy the DTensor from GPU to CPU. The trick is that we need to recreate the DTensor with the
-            # correct CPU device mesh; otherwise we have a CPU DTensor with a CUDA device mesh, which will fail to
-            # do any communication.
-            target = param.data.to_local().detach()
-            data_tensor = self.offloaded_data_flat_tensor.as_strided(target.size(), target.stride(), current_offset)
-            grad_tensor = self.offloaded_grad_flat_tensor.as_strided(target.size(), target.stride(), current_offset)
-            current_offset += data_tensor.numel()
-            data_tensor.copy_(target)
-
-            offloaded_param = nn.Parameter(
-                DTensor.from_local(
-                    data_tensor,
-                    device_mesh=self.elastic_device_mesh.cpu_local_mesh,
-                    placements=param.data.placements,
-                )
-            )
-
-            # Here we pre-allocate the grad DTensor on CPU.
-            offloaded_param.grad = DTensor.from_local(
-                grad_tensor,
-                device_mesh=self.elastic_device_mesh.cpu_local_mesh,
-                placements=param.data.placements,
-            )
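As an aside, the pseudo-gradient machinery above reduces to the following plain-tensor sketch of one DiLoCo outer step (illustrative only; the real implementation uses DTensors, a GLOO all-reduce, and SGD with Nesterov momentum):

```python
# Plain-tensor sketch of the outer step performed by the Diloco class
# (illustrative; DTensor plumbing and process groups are omitted).
import torch

outer_lr = 0.7
master = torch.randn(8)                   # CPU copy built by get_offloaded_param
worker = master - 0.05 * torch.randn(8)   # GPU weights after the inner steps

pseudo_grad = master - worker             # what sync_pseudo_gradient stores in .grad
# Across workers: pseudo_grad = all_reduce(pseudo_grad, SUM) / world_size

master -= outer_lr * pseudo_grad          # outer SGD step (momentum omitted here)
# sync_inner_model then copies `master` back into the GPU model
```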
- offloaded_param.requires_grad = True - offloaded_params.append(offloaded_param) - - param_group_cutoff.append(current_offset) - # self._logger.debug(f"Cutoffs: {param_group_cutoff}") - - self._offloaded_grad_grouped_tensor = [ - self.offloaded_grad_flat_tensor.as_strided((j - i,), (1,), i) - for i, j in zip(param_group_cutoff, param_group_cutoff[1:]) - ] - # self._logger.debug( - # f"Grouped Tensors({len(self._offloaded_grad_grouped_tensor)}){[i.numel() for i in self._offloaded_grad_grouped_tensor]}" - # ) - return offloaded_params - - @torch.no_grad() - def step(self, model: nn.Module, fake: bool = False, flag: str = "outer"): - """ - Step the optimizer - """ - time_start = time.perf_counter() - self.sync_pseudo_gradient(model, fake=fake, flag=flag) - self._logger.info(f"all reduce pseudo gradient in: {time.perf_counter() - time_start} seconds") - - if self.outer_optimizer is not None: - self.outer_optimizer.step() - - self.sync_inner_model(model) diff --git a/src/zeroband/utils/logger.py b/src/zeroband/logger.py similarity index 51% rename from src/zeroband/utils/logger.py rename to src/zeroband/logger.py index 91050bf0..d1aa2f91 100644 --- a/src/zeroband/utils/logger.py +++ b/src/zeroband/logger.py @@ -1,17 +1,10 @@ import logging -from zeroband.config import Config -from zeroband.utils.world_info import get_world_info +from zeroband.world_info import get_world_info logger = None -""" -ZERO_BAND_LOG_LEVEL=DEBUG allow to control the log level for all ranks -ZERO_BAND_LOG_ALL_RANK=true allow to control if all ranks should log or only the local rank 0 -""" - - class CustomFormatter(logging.Formatter): def __init__(self, local_rank: int): super().__init__() @@ -24,28 +17,18 @@ def format(self, record): return formatter.format(record) -def get_logger(config: Config | None = None, name: str | None = None) -> logging.Logger: +def get_logger(config=None, name: str | None = None) -> logging.Logger: global logger # Add this line to modify the global logger variable if logger is not None: return logger + world_info = get_world_info() - try: - world_info = get_world_info() - except KeyError: - from zeroband.utils.world_info import WorldInfo - - world_info = WorldInfo.__new__(WorldInfo) - world_info.local_rank = 0 logger = logging.getLogger(name or __name__) - log_level = config.log_level if config else "DEBUG" if world_info.local_rank == 0: - logger.setLevel(level=getattr(logging, log_level, logging.INFO)) + logger.setLevel(level=logging.INFO) else: - if (not config) or config.log_all_rank: - logger.setLevel(level=getattr(logging, log_level, logging.INFO)) - else: - logger.setLevel(level=logging.CRITICAL) # Disable logging for non-zero ranks + logger.setLevel(level=logging.CRITICAL) handler = logging.StreamHandler() handler.setFormatter(CustomFormatter(world_info.local_rank)) diff --git a/src/zeroband/loss.py b/src/zeroband/loss.py deleted file mode 100644 index a7c04a43..00000000 --- a/src/zeroband/loss.py +++ /dev/null @@ -1,87 +0,0 @@ -from torch import Tensor -import torch -import torch.nn.functional as F - -def compute_cross_entropy_loss( - logits: Tensor, - labels: Tensor, - z_weight: float | None = None, - num_chunks: int | None = None, - ignore_index: int = -100, - fused_linear_weight: Tensor | None = None, - ) -> tuple[Tensor, Tensor | None]: - """ - Compute cross entropy loss in fp32, optionally chunked, and optionally with max z loss. - - Do not torch compile this function if you set num_chunks >= 1. It will unroll the chunking loop, thus removing the benefit of chunking. 
-    - Max-z loss is from the Baichuan 2 paper: https://arxiv.org/abs/2309.10305
-
-    .. math::
-        z_{loss} = weight \cdot z^{2}
-
-    where z is the max logit
-    """
-
-    if fused_linear_weight is None:
-        num_elements = (labels != ignore_index).sum().float()
-
-        if num_chunks is not None and num_chunks > 1:
-            l_labels: list[Tensor] = [target_chunk.reshape(-1) for target_chunk in labels.chunk(num_chunks, dim=0)]
-            l_logits: list[Tensor] = [logit_chunk.reshape(-1, logit_chunk.size(-1)) for logit_chunk in logits.reshape(-1, logits.size(-1)).chunk(num_chunks, dim=0)]
-        else:
-            l_labels: list[Tensor] = [labels.reshape(-1)]
-            l_logits: list[Tensor] = [logits.reshape(-1, logits.size(-1))]
-
-        loss = 0.0
-        z_loss_total = None if z_weight is None else 0.0
-        for logits_chunk, labels_chunk in zip(l_logits, l_labels):
-            if z_weight is None:
-                loss += _upcast_cross_entropy(logits_chunk, labels_chunk, ignore_index=ignore_index)
-            else:
-                ce, z = _upcast_cross_entropy_max_z(logits_chunk, labels_chunk, z_weight, ignore_index=ignore_index)
-                loss += ce
-                z_loss_total += z
-
-        return (loss / num_elements), (None if z_loss_total is None else z_loss_total / num_elements)
-
-    else:
-        # Ignore the number of chunks, since it is not configurable in liger.
-        from liger_kernel.ops.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyFunction
-        ret = LigerFusedLinearCrossEntropyFunction.apply(
-            logits,  # _input
-            fused_linear_weight,  # weight
-            labels,  # target
-            None,  # ce_weight
-            None,  # bias
-            ignore_index,  # ignore_index
-            z_weight if z_weight is not None else 0.0,  # lse_square_scale
-            0.0,  # label_smoothing
-            "mean",  # reduction
-            None,  # softcap
-            fused_linear_weight is not None,  # return_z_loss
-        )
-        if not isinstance(ret, tuple):
-            assert isinstance(ret, Tensor)
-            ret = (ret, None)
-        return ret
-
-
-# Compile the upcast into the CE calculation
-@torch.compile
-def _upcast_cross_entropy(logit_chunk, label_chunk, ignore_index) -> Tensor:
-    return F.cross_entropy(logit_chunk.float(), label_chunk, ignore_index=ignore_index, reduction="sum")
-
-
-@torch.compile
-def _upcast_cross_entropy_max_z(
-    logits: Tensor,
-    targets: Tensor,
-    z_loss_weight: float,
-    ignore_index: int = -100,
-) -> tuple[Tensor, Tensor]:
-    # max is not differentiable, but we only pick the indices of the max value here, so it is fine for backpropagation.
-    loss = F.cross_entropy(logits.float(), targets, ignore_index=ignore_index, reduction="sum")
-    max_logits = logits.max(dim=-1)[0]
-    max_logits = max_logits.where(targets != ignore_index, 0)
-    z_loss = z_loss_weight * max_logits.pow(2).mean()
-    return loss, z_loss
diff --git a/src/zeroband/models/llama/__init__.py b/src/zeroband/models/llama/__init__.py
index 55ce25e8..40cc971a 100644
--- a/src/zeroband/models/llama/__init__.py
+++ b/src/zeroband/models/llama/__init__.py
@@ -7,7 +7,7 @@
 # Llama 2 is licensed under the LLAMA 2 Community License,
 # Copyright (c) Meta Platforms, Inc. All Rights Reserved.
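For reference, the chunked loss helper deleted above can be exercised like this (hypothetical shapes; assumes `compute_cross_entropy_loss` from `src/zeroband/loss.py` is importable):

```python
# Hypothetical call into the deleted loss helper (shapes are examples only).
import torch

batch, seq, vocab = 2, 16, 1024
logits = torch.randn(batch * seq, vocab, requires_grad=True)
labels = torch.randint(0, vocab, (batch * seq,))

# Chunked fp32 cross entropy with max-z regularization.
ce_loss, z_loss = compute_cross_entropy_loss(logits, labels, z_weight=2e-4, num_chunks=4)
(ce_loss + z_loss).backward()
```

Chunking bounds the size of the fp32 upcast of the logits, which is the point of `num_chunks`.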
-from zeroband.config import Config +from typing import Literal from zeroband.models.llama.model import ModelArgs, Transformer __all__ = ["Transformer"] @@ -83,21 +83,21 @@ def get_model( - config: Config, + type_model: Literal["llama2", "llama3"], + name_model: str, + seq_length: int, vocab_size: int, ) -> tuple[Transformer, ModelArgs]: """get the transformer model""" - if config.type_model == "llama2": - model_config = llama2_configs[config.name_model] - elif config.type_model == "llama3": - model_config = llama3_configs[config.name_model] + if type_model == "llama2": + model_config = llama2_configs[name_model] + elif type_model == "llama3": + model_config = llama3_configs[name_model] else: - raise ValueError(f"Model type {config.type_model} not supported") + raise ValueError(f"Model type {type_model} not supported") model_config.vocab_size = vocab_size - model_config.max_seq_len = config.data.seq_length - model_config.attn_fn = config.train.attn_fn - model_config.fused_linear_ce = config.train.fused_linear_ce + model_config.max_seq_len = seq_length return Transformer(model_config), model_config diff --git a/src/zeroband/models/llama/model.py b/src/zeroband/models/llama/model.py index d9650358..d54a269f 100644 --- a/src/zeroband/models/llama/model.py +++ b/src/zeroband/models/llama/model.py @@ -13,13 +13,12 @@ import contextlib from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Literal, Optional, Tuple import torch import torch.nn.functional as F from torch import nn from zeroband.models.norms import build_norm -from zeroband.config import AttnFnType from torch.nn.attention.flex_attention import create_block_mask, flex_attention, BlockMask, _DEFAULT_SPARSE_BLOCK_SIZE from torch.nn.attention import SDPBackend, sdpa_kernel @@ -63,7 +62,7 @@ class ModelArgs: fused_linear_ce: bool = False - attn_fn: AttnFnType = "flex" # slow for testing + attn_fn: Literal["flex", "math"] = "flex" # slow for testing def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor: diff --git a/src/zeroband/optimizers.py b/src/zeroband/optimizers.py deleted file mode 100644 index 321fecf9..00000000 --- a/src/zeroband/optimizers.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import Iterable - -import torch -import torch.distributed.fsdp -import torch.distributed.tensor - -from distributed_shampoo import ( - DefaultEigenvalueCorrectedShampooConfig, - DistributedShampoo, - FullyShardShampooConfig, - ShampooPT2CompileConfig, -) - -from zeroband.config import Config, AdamConfig, SoapConfig, OptimizersConfig - - -def get_optimizer(config: Config, params: Iterable[torch.nn.Parameter]) -> torch.optim.Optimizer: - """ - Obtain the optimizer for the model. - """ - - _config: OptimizersConfig = config.optim.optim - - if isinstance(_config, AdamConfig): - opt = torch.optim.AdamW( - params, - lr=_config.lr, - weight_decay=_config.weight_decay, - betas=(_config.betas1, _config.betas2), - ) - elif isinstance(_config, SoapConfig): - opt = DistributedShampoo( - params, - lr=_config.lr, - betas=(_config.betas1, _config.betas2), - epsilon=1e-12, - weight_decay=_config.weight_decay, - max_preconditioner_dim=_config.max_preconditioner_dim, - precondition_frequency=_config.precondition_frequency, - use_decoupled_weight_decay=True, - # This can also be set to `DefaultSOAPConfig` which uses QR decompositions, hence is - # less expensive and might thereby allow for a smaller `precondition_frequency`. 
-            preconditioner_config=DefaultEigenvalueCorrectedShampooConfig,
-            distributed_config=FullyShardShampooConfig(),
-            shampoo_pt2_compile_config=ShampooPT2CompileConfig(
-                enable_shampoo_pt2_dynamic_shape=False
-            ),
-        )
-    else:
-        raise ValueError(f"Unknown optimizer config {_config}")
-
-    return opt
-
-
-__all__ = ["OptimizersConfig", "get_optimizer"]
diff --git a/src/zeroband/train.py b/src/zeroband/train.py
deleted file mode 100644
index ecab1829..00000000
--- a/src/zeroband/train.py
+++ /dev/null
@@ -1,588 +0,0 @@
-import os
-import time
-from typing import TYPE_CHECKING
-from multiprocessing.process import _children  # type: ignore
-
-import torch
-import torch.distributed as dist
-from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy, CPUOffloadPolicy  # type: ignore
-from torch.autograd.profiler import record_function
-
-from zeroband.checkpoint import CkptManager, TrainingProgress
-from zeroband.comms import ElasticDeviceMesh
-from zeroband.config import Config, resolve_env_vars
-from zeroband.data import TEST_VOCAB_SIZE, get_dataloader
-from zeroband.diloco import Diloco
-from zeroband.loss import compute_cross_entropy_loss
-from zeroband.lr_scheduler import get_scheduler
-from zeroband.models.llama import get_model
-from zeroband.models.llama.model import create_block_mask_from_seqlens
-from zeroband.optimizers import get_optimizer
-from zeroband.utils import (
-    FakeTokenizer,
-    PerfCounter,
-    get_module_signature,
-    get_optimizer_signature,
-    get_tensor_list_signature,
-    get_peak_flops,
-    get_num_params,
-    get_num_flop_per_token,
-)
-from zeroband.utils.metric_logger import MetricLogger, WandbMetricLogger, DummyMetricLogger
-from zeroband.utils.monitor import HttpMonitor
-from zeroband.utils.activation_ckpt import apply_ac_ckpt
-from zeroband.utils.profiler import MemoryProfiler
-from zeroband.utils.world_info import get_world_info
-from zeroband.utils.logger import get_logger
-from zeroband.utils.stopwatch import Stopwatch
-
-from transformers import AutoTokenizer
-from pydantic_config import parse_argv
-
-
-def log_hash_training_state(
-    config: Config,
-    model: torch.nn.Module,
-    inner_optimizer: torch.optim.Optimizer,
-    diloco: Diloco | None,
-    metric_logger: MetricLogger | None,
-    step: int,
-    id: str = "",
-):
-    """Log the hash of the model and optimizer.
This function is slow""" - if config.train.log_model_hash: - inner_model_hash = get_module_signature(model) - inner_optimizer_hash = get_optimizer_signature(inner_optimizer) - - logger.debug(f"inner diloco model {id} : {inner_model_hash}") - logger.debug(f"inner optimizer hash {id} : {inner_optimizer_hash}") - - metrics = { - "step": step, - f"inner_model_hash_{id}": inner_model_hash, - f"inner_optimizer_hash_{id}": inner_optimizer_hash, - } - - if config.diloco is not None and diloco is not None: - outer_optimizer_hash = get_optimizer_signature(diloco.outer_optimizer) - outer_model_hash = get_tensor_list_signature(diloco.param_list_cpu) # type: ignore - - logger.debug(f"outer diloco optimizer hash {id} : {outer_optimizer_hash}") - logger.debug(f"outer diloco model hash {id} : {outer_model_hash}") - - metrics.update({ - f"outer_optimizer_hash_{id}": outer_optimizer_hash, - f"outer_model_hash_{id}": outer_model_hash - }) - if world_info.rank == 0: - assert metric_logger is not None - metric_logger.log(metrics) - - -def train(config: Config): - # batch_size is the total batch size for all GPUs - assert config.optim.batch_size % world_info.local_world_size == 0 - batch_size = config.optim.batch_size // world_info.local_world_size - - assert ( - batch_size % config.train.micro_bs == 0 - ), f"The micro batch size ({config.train.micro_bs}) must divide the number of samples on each GPU ({batch_size})." - gradient_accumulation_steps = batch_size // config.train.micro_bs - - if config.ckpt is not None and config.ckpt.interval is not None and config.diloco is not None: - assert ( - config.ckpt.interval % config.diloco.inner_steps == 0 - ), "ckpt interval must be a multiple of diloco inner steps as we only save at the end of an outer step" - - sw = Stopwatch(config) - sw.start("train()") - - # Load tokenizer - with sw.record_block("Load Tokenizer"): - if config.data.fake and config.name_model == "debugmodel": - tokenizer = FakeTokenizer() - elif config.type_model == "llama2": - tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=True) - elif config.type_model == "llama3": - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", use_fast=True) - else: - raise ValueError(f"Model type {config.type_model} not supported") - - with sw.record_block("Get Dataloader"): - train_dataloader = get_dataloader( - tokenizer=tokenizer, - world_size=world_info.world_size, - rank=world_info.rank, - batch_size=config.train.micro_bs, - data_config=config.data, - ) - train_dataloader_iterator = iter(train_dataloader) - - with sw.record_block("Get Model"): - model, model_config = get_model( - config, - vocab_size=len(tokenizer) if config.name_model != "debugmodel" or not config.data.fake else TEST_VOCAB_SIZE, - ) - - - gpu_peak_flops = get_peak_flops(torch.cuda.get_device_name(torch.device("cuda"))) - logger.info(f"Peak FLOPS used for computing MFU: {gpu_peak_flops:.3e}") - - num_params = get_num_params(model, exclude_embedding=True) - logger.info(f"Number of parameters: {num_params}") - num_flop_per_token = get_num_flop_per_token( - num_params, - model_config, - config.data.seq_length, - ) - - with sw.record_block("Shard Model"): - if config.train.ac_ckpt: - num = 1 if isinstance(config.train.ac_ckpt, bool) else config.train.ac_ckpt - apply_ac_ckpt(model, num) - - elastic_device_mesh = ElasticDeviceMesh( - enable=config.diloco is not None, - live_recovery_rank_src=config.ckpt.live_recovery_rank_src - ) - - mp_policy = MixedPrecisionPolicy( - param_dtype=torch.bfloat16, - 
reduce_dtype=torch.float32 if config.train.reduce_fp32 else None - ) - - offload_policy = CPUOffloadPolicy(pin_memory=True) if config.train.fsdp_cpu_offload else None - - for layer_id, transformer_block in model.layers.items(): - if config.train.reshard_after_forward: - reshard_after_forward = int(layer_id) < len(model.layers) - 1 - else: - reshard_after_forward = False - fully_shard( - transformer_block, - mp_policy=mp_policy, - mesh=elastic_device_mesh.cuda_local_mesh, - reshard_after_forward=reshard_after_forward, - offload_policy=offload_policy, - ) - fully_shard( - model, - mp_policy=mp_policy, - mesh=elastic_device_mesh.cuda_local_mesh, - reshard_after_forward=config.train.reshard_after_forward, - offload_policy=offload_policy, - ) - - # Setup optimizers - with sw.record_block("Optimizer Setup"): - inner_optimizer = get_optimizer(config, model.parameters()) - - diloco = Diloco(config.diloco, model, elastic_device_mesh) if config.diloco is not None else None - - scheduler = get_scheduler( - sched_type=config.optim.sched_type, - optimizer=inner_optimizer, - num_warmup_steps=config.optim.warmup_steps, - num_stable_steps=config.optim.stable_steps, - num_training_steps=config.optim.total_steps, - ) - - training_progress = TrainingProgress(total_tokens=0, outer_step=0, step=0) - - ckpt_manager = CkptManager( - config=config.ckpt, - model=model, - optimizer=inner_optimizer, - scheduler=scheduler, - dataloader=train_dataloader, - training_progress=training_progress, - data_rank=config.data.data_rank, - diloco_offloaded_optimizer=diloco.outer_optimizer if config.diloco is not None else None, # type: ignore - diloco_offloaded_param_list=diloco.param_list_cpu if config.diloco is not None else None, # type: ignore - ) - - if world_info.rank == 0: - logger_cls = WandbMetricLogger if config.metric_logger_type == "wandb" else DummyMetricLogger - metric_logger = logger_cls( - project=config.project, - logger_config={"config": config.model_dump(), "world_info": world_info.json()}, - resume=config.wandb_resume, - ) - else: - metric_logger = None - - with sw.record_block("Compile Model"): - if config.train.torch_compile: - # we need to compile AFTER creating the CKPT manager, DON'T ASK ME WHY - model = torch.compile(model) if not TYPE_CHECKING else model - - if config.ckpt.resume is not None: - with sw.record_block("Resume Checkpoint"): - # all is inplace - ckpt_manager.load( - resume_ckpt_path=config.ckpt.resume, - skip_dataloader=config.ckpt.skip_dataloader, - data_path=config.ckpt.data_path, - ) - log_hash_training_state( - config, model, inner_optimizer, diloco, metric_logger, step=training_progress.step, id="resume" - ) - - if config.train.memory_profiler is not None: - memory_profiler = MemoryProfiler(config.train.memory_profiler.freq, config.train.memory_profiler.snapshot_dir) - - if config.monitor is not None: - monitor = HttpMonitor(config=config.model_dump(), resume=False) - monitor.set_stage("init") - - num_inner_steps = config.diloco.inner_steps if config.diloco is not None else 1 - perf_counter = PerfCounter(window_size=10) - - logger.debug("Finished setup in %f seconds", sw.elapsed()) - - need_live_recovery = config.ckpt.live_recovery_rank_src is not None - while True: - if num_inner_steps > 1: - # if we don't use diloco we don't print the outer step logs - logger.info(f"outer_step step: {training_progress.outer_step}") - - time_start_outer = time.perf_counter() - - if config.diloco is not None: - assert diloco is not None - # this is a patch for now to allow live recovery worker to not 
affect the all reduce at all - - if not need_live_recovery: - elastic_device_mesh.maybe_reinit_global_pg(admit_joiners=True) - - maybe_dest_rank = elastic_device_mesh.live_recovery.should_send_ckpt_to() - if maybe_dest_rank is not None: - logger.info(f"Start live recovery to rank {maybe_dest_rank}") - ckpt_manager.send_ckpt_to_peer(elastic_device_mesh.global_pg, maybe_dest_rank, blocking=True) - - elastic_device_mesh.live_recovery.reset() - else: - ## receiving - time_start_live_recovery = time.perf_counter() - logger.info(f"Start live recovery from rank {config.ckpt.live_recovery_rank_src}") - - ## we create grad buffer and opts stats mamnually, the value will be overwritten by the ckpt but we need the DTensor to be correctly init before loading it - - diloco.outer_optimizer.step() # need to step to init the DTensor stats - - ckpt_manager.recv_ckpt_from_peer(elastic_device_mesh.global_pg) - - log_hash_training_state( - config, - model, - inner_optimizer, - diloco, - metric_logger, - step=training_progress.step, - id="live_reco_recv", - ) - need_live_recovery = False - - if config.ckpt.remote_data_load: - ckpt_manager.remote_data_load() - - logger.info("live recovery done in %f", time.perf_counter() - time_start_live_recovery) - - # at the beginning of the inner steps we allow joiner to arrive. - # We maybe reinit before the all reduce but only to allow leaving, not to join anymore - - if world_info.rank == 0 and config.monitor is not None: - monitor.set_stage("inner_loop") - - for inner_step in range(num_inner_steps): - logger.debug("Starting inner step.") - sw.start("inner_step") - - loss_batch = 0 - z_loss_batch = 0 - - with sw.record_block("Grad Acc Steps"): - for grad_acc_step in range(gradient_accumulation_steps): - sw.start("grad_acc_step") - - is_accumulating = grad_acc_step < gradient_accumulation_steps - 1 - # no sync if we are accumulating gradients - model.set_requires_gradient_sync(not is_accumulating) - - with sw.record_block("Load batch"): - # TODO/NOTE: We could overlap sending the batch with communication - # although to be honest the perf impact is minimal - batch = next(train_dataloader_iterator) - input_ids = batch["input_ids"].to("cuda") - labels = batch["labels"].to("cuda") - if config.train.sequence_packing: - seqlens = [seqlen.to("cuda") for seqlen in batch["seqlens"]] - block_mask = create_block_mask_from_seqlens(seqlens) if seqlens is not None else None - else: - seqlens = None - block_mask = None - - with sw.record_block("Run forward()"): - logits = model(tokens=input_ids, block_mask=block_mask).contiguous() - flatten_logits = logits.reshape(-1, logits.size(-1)) # b seq vocab -> (b * seq) vocab - flatten_labels = labels.reshape(-1) # b seq -> (b * seq) - - with sw.record_block("Loss Calculation"): - ce_loss, z_loss = compute_cross_entropy_loss( - flatten_logits, - flatten_labels, - z_weight=config.optim.z_loss_weight if config.optim.z_loss else None, - num_chunks=config.optim.num_chunks, - fused_linear_weight=model.output.weight if config.train.fused_linear_ce else None, - ) - - del logits - del flatten_logits - del flatten_labels - - if config.optim.z_loss: - assert z_loss is not None - ce_loss /= gradient_accumulation_steps - z_loss /= gradient_accumulation_steps - loss = ce_loss + z_loss - else: - loss = ce_loss / gradient_accumulation_steps - - with sw.record_block("Run backward()"): - loss.backward() - - with record_function("Clone Loss"): - # No need to time, takes 0 seconds - if config.optim.z_loss: - assert z_loss is not None - loss_batch += 
ce_loss.detach().clone() - z_loss_batch += z_loss.detach().clone() - else: - loss_batch += loss.detach().clone() - - elapsed = sw.stop("grad_acc_step") - logger.debug(f"Grad acc step {grad_acc_step} completed in {elapsed:.2f} seconds") - - with sw.record_block("Loss allreduce()"): - # Launch both allreduces at the same time to hide latency - loss_allreduce = dist.all_reduce(tensor=loss_batch, op=dist.ReduceOp.AVG, group=elastic_device_mesh.local_pg, async_op=True) - if config.optim.z_loss: - z_loss_allreduce = dist.all_reduce(tensor=z_loss_batch, op=dist.ReduceOp.AVG, group=elastic_device_mesh.local_pg, async_op=True) - - assert isinstance(loss_allreduce, torch.distributed.Work) - loss_allreduce.wait() - if config.optim.z_loss: - assert isinstance(z_loss_allreduce, torch.distributed.Work) - z_loss_allreduce.wait() - - with sw.record_block("Clip Grad"): - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0).full_tensor() # type: ignore (is a dtensor) - - with sw.record_block("Optimizer Step"): - inner_optimizer.step() - scheduler.step() - - with sw.record_block("Optimizer Zero Grad"): - inner_optimizer.zero_grad() - - # logging - training_progress.step += 1 - inner_lr = [group["lr"] for group in inner_optimizer.param_groups][0] - - # syncing loss across all data parallel rank within a nodes - new_tokens = config.data.seq_length * config.optim.batch_size - perf_counter.count_tokens(new_tokens) - - if config.diloco is None: - training_progress.total_tokens += new_tokens - else: - # we count the total tokens with respect to all diloco workers - # might need to tweak this as some worker might fail to join the all reduce later - training_progress.total_tokens += new_tokens * elastic_device_mesh.global_pg.size() - - assert isinstance(loss_batch, torch.Tensor) - metrics = { - "Loss": loss_batch.item(), - "step": training_progress.step, - "inner_lr": inner_lr, - "Perplexity": torch.exp(loss_batch).item(), - "total_tokens": training_progress.total_tokens, - "time": time.time(), - "grad_norm": grad_norm.item(), - } - - if config.optim.z_loss: - assert isinstance(z_loss_batch, torch.Tensor) - metrics["z_loss"] = z_loss_batch.item() - - log = f"step: {training_progress.step}, loss: {loss_batch.item():.4f}" - - tokens_per_second = perf_counter.get_tokens_per_second() - if tokens_per_second is not None: - metrics["tokens_per_second"] = tokens_per_second - metrics["mfu"] = ( - 100 * num_flop_per_token * tokens_per_second / gpu_peak_flops / world_info.local_world_size - ) - log += f", tokens_per_second: {tokens_per_second:.2f}, mfu: {metrics['mfu']:.2f}" - - if config.diloco is not None: - metrics["num_peers"] = elastic_device_mesh.global_pg.size() - log += f", diloco_peers: {metrics['num_peers']}" - - if world_info.rank == 0: - assert metric_logger is not None - metric_logger.log(metrics) - if config.monitor is not None: - monitor.log(metrics) - - logger.info(log) - - if config.train.memory_profiler is not None: - memory_profiler.step() - - elapsed = sw.stop("inner_step") - logger.debug(f"Inner step {inner_step} completed in {elapsed:.2f} seconds") - - if config.diloco is not None: - assert diloco is not None - if world_info.rank == 0 and config.monitor is not None: - monitor.set_stage("outer_loop") - - time_start_inner = time.perf_counter() - diloco.step(model=model, flag=str(training_progress.outer_step)) - diloco_time = time.perf_counter() - time_start_inner - - log_hash_training_state( - config, model, inner_optimizer, diloco, metric_logger, step=training_progress.step, id="outer_step" 
-        )
-
-        training_progress.outer_step += 1
-
-        if (
-            config.ckpt.interval is not None
-            and training_progress.step > 0
-            and training_progress.step % config.ckpt.interval == 0
-        ):
-            # We only allow checkpointing after an outer step. For non-diloco training, outer step = 1 anyway.
-
-            do_remote = config.ckpt.remote is not None and training_progress.step % config.ckpt.remote.interval == 0
-            ckpt_manager.save(remote=do_remote)
-            log_hash_training_state(
-                config, model, inner_optimizer, diloco, metric_logger, step=training_progress.step, id="save"
-            )
-
-        if config.diloco:
-            tokens_per_second = (
-                config.optim.batch_size
-                * config.diloco.inner_steps
-                * config.data.seq_length
-                / (time.perf_counter() - time_start_outer)
-            )
-            mfu = 100 * num_flop_per_token * tokens_per_second / gpu_peak_flops / world_info.local_world_size
-            logger.info(f"effective mfu: {mfu}")
-
-            if world_info.rank == 0:
-                assert metric_logger is not None
-                metric_logger.log(
-                    {
-                        "outer_mfu": mfu,
-                        "step": training_progress.step,
-                        "outer_step": training_progress.outer_step,
-                        "outer_tokens_per_second": tokens_per_second,
-                        "all_reduce_step": diloco_time,
-                    }
-                )
-
-        if training_progress.step >= config.optim.total_steps:
-            # We only allow breaking outside of the inner loop.
-            # This avoids ending the training in the middle of an inner loop,
-            # since the checkpointing strategy and the all-reduce are done at the outer-loop level.
-            break
-
-    if world_info.rank == 0:
-        assert metric_logger is not None
-        metric_logger.finish()
-        if config.monitor is not None:
-            monitor.finish()
-
-    ckpt_manager.wait_for_blocking_job()
-
-    del elastic_device_mesh  # allow cleanup for smoother test transitions
-
-    if config.train.memory_profiler is not None:
-        logger.debug(f"Max memory used: {torch.cuda.max_memory_allocated() / 1024**2:.2f} MB")
-
-    logger.info("Training finished, exiting ...")
-
-
-if __name__ == "__main__":
-    # Allow eager fallback during production so that the training runs don't die.
-    # However, in development, we want to know when we break torch compile.
-    torch._dynamo.config.suppress_errors = "ZERO_BAND_DEV" not in os.environ  # type: ignore
-    torch.set_float32_matmul_precision("high")
-    torch.manual_seed(42)
-
-    config = Config(**parse_argv())  # type: ignore
-    resolve_env_vars(config)
-    world_info = get_world_info()
-    logger = get_logger(config)
-
-    # torch.set_default_device("cuda")
-    torch.cuda.set_device(world_info.local_rank)
-
-    def pretty_dict(d, indent=2):
-        for key, value in d.items():
-            if isinstance(value, dict):
-                logger.debug(" " * indent + f"{key}:")
-                pretty_dict(value, indent + 2)
-            else:
-                logger.debug(" " * indent + f"{key}: {value}")
-
-    logger.debug("config:")
-    pretty_dict(config.model_dump())
-
-    try:
-        if config.train.torch_profiler and world_info.rank == 0:
-            # NOTE(apaz-cli): I cannot seem to get the memory profiler to work.
-            # Running into this issue: https://github.com/pytorch/pytorch/issues/64345
-            # In the meantime, we can use the memory snapshotter.
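An aside on the effective-MFU computation in the outer loop above: with illustrative numbers it boils down to the following arithmetic (none of these values are measurements):

```python
# Worked example of the outer-loop tokens/s and MFU formulas above
# (all numbers are illustrative assumptions, not measured values).
batch_size = 512                    # config.optim.batch_size
inner_steps = 100                   # config.diloco.inner_steps
seq_length = 1024                   # config.data.seq_length
outer_elapsed = 600.0               # seconds per outer step, incl. all-reduce

tokens_per_second = batch_size * inner_steps * seq_length / outer_elapsed  # ~87k

num_flop_per_token = 6 * 10**10     # ~6N rule of thumb for a 10B-param model
gpu_peak_flops = 989e12             # e.g. H100 bf16 dense peak
local_world_size = 8

mfu = 100 * num_flop_per_token * tokens_per_second / gpu_peak_flops / local_world_size
print(f"effective mfu: {mfu:.1f}%")  # ~66% with these toy numbers
```

The actual `get_num_flop_per_token` also accounts for attention FLOPs, so the 6N figure here understates it slightly.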
- - logger.debug("Running train() with profiler.") - prof = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - record_shapes=True, - # profile_memory=True, - # with_stack=True, - ) - try: - prof.__enter__() - train(config) - finally: - logger.debug("Exiting profiler context.") - prof.__exit__(None, None, None) - - logger.info("Exporting chrome trace.") - prof.export_chrome_trace("logs/profile.json.gz") - - width = 30 - logger.info("\n" + "*" * width + " GPU TIME " + "*" * width) - logger.info(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) - - logger.info("\n" + "*" * width + " GPU MEM " + "*" * width) - logger.info(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10)) - - # logger.info("Exporting memory timeline.") - # prof.export_memory_timeline(f"logs/mem_timeline.html", device="cuda:0") - else: - train(config) - except Exception as e: - # Subprocesses can prevent the main process from exiting, so we need to terminate them - logger.info("Caught an exception, terminating children") - logger.info(e) - for p in _children: - p.terminate() - - raise e diff --git a/src/zeroband/utils/__init__.py b/src/zeroband/utils.py similarity index 61% rename from src/zeroband/utils/__init__.py rename to src/zeroband/utils.py index fafa9c7b..18bab868 100644 --- a/src/zeroband/utils/__init__.py +++ b/src/zeroband/utils.py @@ -1,13 +1,29 @@ -import hashlib +from zeroband.models.llama.model import Transformer +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper +from zeroband.logger import get_logger import socket import time import torch from torch.distributed.fsdp import ShardingStrategy -from torch.distributed._tensor.api import DTensor -from distributed_shampoo import DistributedShampoo -__all__ = ["get_sharding_strategy", "get_peak_flops", "get_num_flop_per_token", "get_num_params"] +def apply_ac_ckpt(model: Transformer, num: int): + """Apply activation checkpointing to the model. + Apply to layers multiple of `num`. + + Example if `num=2` only half of the layers are checkpointed. 
+ """ + logger = get_logger() + + layers_ckpt = 0 + + for layer_id, transformer_block in model.layers.named_children(): + if layers_ckpt % num == 0: + transformer_block = checkpoint_wrapper(transformer_block, preserve_rng_state=False) + model.layers.register_module(layer_id, transformer_block) + layers_ckpt += 1 + + logger.debug(f"Applied activation checkpointing to {layers_ckpt} layers") def get_sharding_strategy(sharding_strategy: str) -> ShardingStrategy: @@ -96,76 +112,6 @@ def get_tokens_per_second(self) -> float | None: return sum(self.tokens[1:]) / (self.times[-1] - self.times[0]) -TENSOR_SIG_SAMPLE_SIZE = 100 - - -def get_tensor_signature(a: torch.Tensor | torch.nn.Parameter) -> str: - """ - Get the tensor signature - """ - while isinstance(a, torch.nn.Parameter): - a = a.data - - if isinstance(a, DTensor): - a = a.full_tensor() - - if a.numel() < TENSOR_SIG_SAMPLE_SIZE: - b = a.as_strided(size=(a.numel(),), stride=(1,)) - else: - step_size = a.numel() // TENSOR_SIG_SAMPLE_SIZE - b = a.as_strided(size=(TENSOR_SIG_SAMPLE_SIZE,), stride=(step_size,)) - element_str = "".join([f"{x:.3e}" for x in b]) - element_hash = hashlib.md5(element_str.encode("utf-8")).hexdigest() - return f"{a.dtype}{a.shape}{a.stride()}<{element_hash}>" - - -def get_module_signature(module: torch.nn.Module, compress: bool = True) -> str: - """ - Get the module signature - """ - state_dict_sig = {name: get_tensor_signature(param) for name, param in module.named_parameters()} - if compress: - return hashlib.md5(str(state_dict_sig).encode("utf-8")).hexdigest() - else: - return "\n".join(f"{name}: {sig}" for name, sig in state_dict_sig.items()) - - -def get_dict_signature(dict: dict, compress: bool = True) -> str: - return hashlib.md5(str(dict).encode("utf-8")).hexdigest() - - -def get_optimizer_signature(optimizer: torch.optim.Optimizer, compress: bool = True) -> str: - """ - Get the optimizer signature - """ - - if isinstance(optimizer, DistributedShampoo): - return "mocked signature because shampoo does not support state_dict()" - - def unwrap_tensor(state_dict: dict) -> dict: - new_dict = {} - for key, value in state_dict.items(): - if isinstance(value, dict): - new_dict[key] = unwrap_tensor(value) - elif isinstance(value, torch.Tensor): - new_dict[key] = get_tensor_signature(value) - else: - new_dict[key] = str(value) - return new_dict - - state_dict_sig = unwrap_tensor(optimizer.state_dict()) - - if compress: - return hashlib.md5(str(state_dict_sig).encode("utf-8")).hexdigest() - else: - return "\n".join(f"{name}: {sig}" for name, sig in state_dict_sig.items()) - - -def get_tensor_list_signature(tensor_list: list[torch.Tensor]) -> str: - tensors = [get_tensor_signature(tensor) for tensor in tensor_list] - return hashlib.md5(str(tensors).encode("utf-8")).hexdigest() - - def get_random_available_port_list(num_port): # https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number ports = [] @@ -193,4 +139,4 @@ def __init__(self): self.pad_token_id = 2 def __len__(self): - return self.vocab_size \ No newline at end of file + return self.vocab_size diff --git a/src/zeroband/utils/activation_ckpt.py b/src/zeroband/utils/activation_ckpt.py deleted file mode 100644 index eea9a98d..00000000 --- a/src/zeroband/utils/activation_ckpt.py +++ /dev/null @@ -1,24 +0,0 @@ -from zeroband.models.llama.model import Transformer - -from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper - -from zeroband.utils.logger import get_logger - - -def apply_ac_ckpt(model: 
Transformer, num: int): - """Apply activation checkpointing to the model. - Apply to layers multiple of `num`. - - Example if `num=2` only half of the layers are checkpointed. - """ - logger = get_logger() - - layers_ckpt = 0 - - for layer_id, transformer_block in model.layers.named_children(): - if layers_ckpt % num == 0: - transformer_block = checkpoint_wrapper(transformer_block, preserve_rng_state=False) - model.layers.register_module(layer_id, transformer_block) - layers_ckpt += 1 - - logger.debug(f"Applied activation checkpointing to {layers_ckpt} layers") diff --git a/src/zeroband/utils/ip.py b/src/zeroband/utils/ip.py deleted file mode 100644 index 4ec30aa9..00000000 --- a/src/zeroband/utils/ip.py +++ /dev/null @@ -1,35 +0,0 @@ -from typing import Optional -import socket -import fcntl -import struct - -MULTIPLIER = {"Kbits/sec": 1e3, "Mbits/sec": 1e6, "Gbits/sec": 1e9, "Tbits/sec": 1e12} - - -def parse_iperf_output(output: str) -> Optional[int]: - try: - value, mult = output.strip().split()[-2:] - return int(float(value) * MULTIPLIER[mult]) - except Exception: - return None - - -# Taken from https://stackoverflow.com/questions/24196932/how-can-i-get-the-ip-address-from-a-nic-network-interface-controller-in-python -def get_ip_address(ifname: str) -> str: - """Get the IP address of the specified network interface. - - Args: - ifname (str): The name of the network interface. - Returns: - str: The IP address of the network interface. - """ - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - ret = socket.inet_ntoa( - fcntl.ioctl( - s.fileno(), - 0x8915, # SIOCGIFADDR - struct.pack("256s", ifname.encode("utf-8")[:15]), - )[20:24] - ) - s.close() - return ret diff --git a/src/zeroband/utils/metric_logger.py b/src/zeroband/utils/metric_logger.py deleted file mode 100644 index 73befcaf..00000000 --- a/src/zeroband/utils/metric_logger.py +++ /dev/null @@ -1,49 +0,0 @@ -import pickle -from typing import Any, Protocol -import importlib.util - - -class MetricLogger(Protocol): - def __init__(self, project, logger_config): ... - - def log(self, metrics: dict[str, Any]): ... - - def finish(self): ... - - -class WandbMetricLogger(MetricLogger): - def __init__(self, project, logger_config, resume: bool): - if importlib.util.find_spec("wandb") is None: - raise ImportError("wandb is not installed. 
Please install it to use WandbMonitor.") - - import wandb - - wandb.init( - project=project, config=logger_config, name=logger_config["config"]["run_name"], resume="auto" if resume else None - ) # make wandb reuse the same run id if possible - - def log(self, metrics: dict[str, Any]): - import wandb - - wandb.log(metrics) - - def finish(self): - import wandb - - wandb.finish() - - -class DummyMetricLogger(MetricLogger): - def __init__(self, project, logger_config, *args, **kwargs): - self.project = project - self.logger_config = logger_config - open(self.project, "a").close() # Create an empty file to append to - - self.data = [] - - def log(self, metrics: dict[str, Any]): - self.data.append(metrics) - - def finish(self): - with open(self.project, "wb") as f: - pickle.dump(self.data, f) diff --git a/src/zeroband/utils/monitor.py b/src/zeroband/utils/monitor.py deleted file mode 100644 index e7af2990..00000000 --- a/src/zeroband/utils/monitor.py +++ /dev/null @@ -1,136 +0,0 @@ -from typing import Any -from zeroband.utils.logger import get_logger -import aiohttp -from aiohttp import ClientError -import asyncio - - -async def _get_external_ip(max_retries=3, retry_delay=5): - async with aiohttp.ClientSession() as session: - for attempt in range(max_retries): - try: - async with session.get('https://api.ipify.org', timeout=10) as response: - response.raise_for_status() - return await response.text() - except ClientError: - if attempt < max_retries - 1: - await asyncio.sleep(retry_delay) - return None - - -class HttpMonitor: - """ - Logs the status of nodes, and training progress to an API - """ - - def __init__(self, config, *args, **kwargs): - self.data = [] - self.log_flush_interval = config["monitor"]["log_flush_interval"] - self.base_url = config["monitor"]["base_url"] - self.auth_token = config["monitor"]["auth_token"] - - self._logger = get_logger() - - self.run_id = config.get("run_id", None) - if self.run_id is None: - raise ValueError("run_id must be set for HttpMonitor") - - self.node_ip_address = None - self.node_ip_address_fetch_status = None - - self.loop = asyncio.new_event_loop() - asyncio.set_event_loop(self.loop) - - def __del__(self): - self.loop.close() - - def _remove_duplicates(self): - seen = set() - unique_logs = [] - for log in self.data: - log_tuple = tuple(sorted(log.items())) - if log_tuple not in seen: - unique_logs.append(log) - seen.add(log_tuple) - self.data = unique_logs - - def set_stage(self, stage: str): - import time - - # add a new log entry with the stage name - self.data.append({"stage": stage, "time": time.time()}) - self._handle_send_batch(flush=True) # it's useful to have the most up-to-date stage broadcasted - - def log(self, data: dict[str, Any]): - # Lowercase the keys in the data dictionary - lowercased_data = {k.lower(): v for k, v in data.items()} - self.data.append(lowercased_data) - - self._handle_send_batch() - - def _handle_send_batch(self, flush: bool = False): - if len(self.data) >= self.log_flush_interval or flush: - self.loop.run_until_complete(self._send_batch()) - - async def _set_node_ip_address(self): - if self.node_ip_address is None and self.node_ip_address_fetch_status != "failed": - ip_address = await _get_external_ip() - if ip_address is None: - self._logger.error("Failed to get external IP address") - # set this to "failed" so we keep trying again - self.node_ip_address_fetch_status = "failed" - else: - self.node_ip_address = ip_address - self.node_ip_address_fetch_status = "success" - - async def _send_batch(self): - import 
aiohttp - - self._remove_duplicates() - await self._set_node_ip_address() - - batch = self.data[:self.log_flush_interval] - # set node_ip_address of batch - batch = [{**log, "node_ip_address": self.node_ip_address} for log in batch] - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {self.auth_token}" - } - payload = { - "logs": batch - } - api = f"{self.base_url}/metrics/{self.run_id}/logs" - - try: - async with aiohttp.ClientSession() as session: - async with session.post(api, json=payload, headers=headers) as response: - if response is not None: - response.raise_for_status() - except Exception as e: - self._logger.error(f"Error sending batch to server: {str(e)}") - pass - - self.data = self.data[self.log_flush_interval :] - return True - - async def _finish(self): - import requests - - # Send any remaining logs - while self.data: - await self._send_batch() - - headers = {"Content-Type": "application/json"} - api = f"{self.base_url}/metrics/{self.run_id}/finish" - try: - response = requests.post(api, headers=headers) - response.raise_for_status() - return True - except requests.RequestException as e: - self._logger.debug(f"Failed to send finish signal to http monitor: {e}") - return False - - def finish(self): - self.set_stage("finishing") - - self.loop.run_until_complete(self._finish()) diff --git a/src/zeroband/utils/profiler.py b/src/zeroband/utils/profiler.py deleted file mode 100644 index e6a87b32..00000000 --- a/src/zeroband/utils/profiler.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import pickle -import torch -from zeroband.utils.logger import get_logger -from zeroband.utils.world_info import get_world_info - -_MAX_ENTRIES = 10000 - - -class MemoryProfiler: - """Pytorch Memory Profiler. - The output are pickles file that can be visualized here: https://pytorch.org/memory_viz - """ - - def __init__(self, freq: int, snapshot_dir: str): - torch.cuda.memory._record_memory_history(max_entries=_MAX_ENTRIES) - self.freq = freq - - self.world_info = get_world_info() - self.logger = get_logger() - self.step_num = 0 - - os.makedirs(snapshot_dir, exist_ok=True) - self.snapshot_dir = snapshot_dir - - def log_memory_summary(self, curr_snapshot_dir): - """Log memory summary and memory allocated""" - summary = torch.cuda.memory_summary(device=None, abbreviated=False) - allocated_memory = torch.cuda.memory_allocated() - - # Save the memory summary to a file - with open(f"{curr_snapshot_dir}/rank{self.world_info.rank}_memory_summary.txt", "w") as summary_file: - summary_file.write(summary) - - # Save the allocated memory as a text log - with open(f"{curr_snapshot_dir}/rank{self.world_info.rank}_memory_allocated.txt", "w") as alloc_file: - alloc_file.write(f"Allocated memory: {allocated_memory / 1024 ** 2:.2f} MB\n") - - # log this information using the logger - self.logger.info(f"Memory summary and allocation saved for rank {self.world_info.rank} at step {self.step_num}") - - def step(self): - self.step_num += 1 - if self.step_num % self.freq != 0: - return - - dir_name = f"iteration_{self.step_num}" - - curr_snapshot_dir = os.path.join(self.snapshot_dir, dir_name) - if not os.path.exists(curr_snapshot_dir): - os.makedirs(curr_snapshot_dir, exist_ok=True) - - # Save memory snapshot - with open(f"{curr_snapshot_dir}/rank{self.world_info.rank}_memory_snapshot.pickle", "wb") as output: - pickle.dump(torch.cuda.memory._snapshot(), output) - - # Log memory summary and allocated memory - self.log_memory_summary(curr_snapshot_dir) - - torch.distributed.barrier() diff --git 
a/src/zeroband/utils/state_dict_send_recv.py b/src/zeroband/utils/state_dict_send_recv.py deleted file mode 100644 index 66366dd9..00000000 --- a/src/zeroband/utils/state_dict_send_recv.py +++ /dev/null @@ -1,165 +0,0 @@ -import io -import pickle -import torch -from torch.distributed import ProcessGroup -from torch.distributed._tensor.api import DTensor - - -def _object_to_tensor(obj): - f = io.BytesIO() - pickle.Pickler(f).dump(obj) - byte_storage = torch.ByteStorage._from_buffer(f.getvalue()) # type: ignore[attr-defined] - # Do not replace `torch.ByteTensor` or `torch.LongTensor` with torch.tensor and specifying dtype. - # Otherwise, it will cause a 100X slowdown. - # See: https://github.com/pytorch/pytorch/issues/65696 - byte_tensor = torch.ByteTensor(byte_storage) - local_size = torch.LongTensor([byte_tensor.numel()]) - return byte_tensor, local_size - - -def _tensor_to_object(tensor, tensor_size): - tensor = tensor.cpu() - buf = tensor.numpy().tobytes()[:tensor_size] - return pickle.Unpickler(io.BytesIO(buf)).load() - - -def _tensor_to_placeholder(idx: int, tensor: torch.Tensor) -> str: - return f"zeroband_tensor_{idx}_{tensor.shape}_{tensor.dtype}" - - -def _validate_placeholder_to_tensor(placeholder: str, tensors: list[torch.Tensor]) -> torch.Tensor: - """ - Validate that the tensor is compatible with the placeholder. - """ - try: - idx, shape, dtype = placeholder.split("_")[2:] - except ValueError as e: - raise ValueError(f"Invalid tensor placeholder {placeholder}") from e - - tensor = tensors[int(idx)] - if shape != str(tensor.shape): - raise ValueError( - f"tensor {idx}: the placeholder expects shape {shape} but the tensor has shape {tensor.shape}" - ) - if dtype != str(tensor.dtype): - raise ValueError( - f"tensor {idx}: the placeholder expects dtype {dtype} but the tensor has dtype {tensor.dtype}" - ) - - return tensor - - -def _get_sendable_state_dict(state_dict: dict) -> tuple[dict, list[torch.Tensor]]: - """ - This function takes a state dict (a dict with tensors inside) and returns a torch.send/recv-able format. - - It splits the state dict into two parts: - * a list of tensors - * a dict with every tensor replaced by a placeholder - - The order is deterministic. The function can be used in pair with _load_sendable_state_dict - """ - tensors: list[torch.Tensor] = [] - - def _split(state_dict_, tensors_): - new_dict = {} - for key, value in state_dict_.items(): - if isinstance(value, dict): - new_dict[key] = _split(value, tensors_) - elif isinstance(value, torch.Tensor): - idx = len(tensors_) - tensors_.append(value) - new_dict[key] = _tensor_to_placeholder(idx, value) - else: - new_dict[key] = value - - return new_dict - - state_dict = _split(state_dict, tensors) - return state_dict, tensors - - -def _load_sendable_state_dict(tensors: list[torch.Tensor], state_dict: dict) -> dict: - """ - This function takes a list of tensors and a placeholder state dict and returns the reassembled state dict.
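- - A minimal illustrative round-trip (a sketch; the tensor name and shape are arbitrary, the placeholder format comes from _tensor_to_placeholder above): - - sendable, tensors = _get_sendable_state_dict({"w": torch.ones(2)}) - # sendable["w"] == "zeroband_tensor_0_torch.Size([2])_torch.float32" - restored = _load_sendable_state_dict(tensors, sendable)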
- - The function can be used in pair with _get_sendable_state_dict - """ - - def _load(state_dict_): - for key, value in list(state_dict_.items()): # list needed as we modify the state_dict_ as we traverse it - if isinstance(value, dict): - state_dict_[key] = _load(value) - elif isinstance(value, str) and value.startswith("zeroband_tensor_"): - state_dict_[key] = _validate_placeholder_to_tensor(value, tensors) - - return state_dict_ - - return _load(state_dict) - - -def send_state_dict(pg: ProcessGroup, state_dict: dict, dest_rank: int) -> None: - non_tensored_state_dict, tensors = _get_sendable_state_dict(state_dict) - send_tensor_and_state_dict(pg, dest_rank, non_tensored_state_dict, tensors) - - -def send_tensor_and_state_dict(pg: ProcessGroup, dest_rank: int, state_dict: dict, tensors: list[torch.Tensor]) -> None: - # logger = get_logger() - # logger.debug(f"recv tensors {get_tensor_list_signature(tensors)}") - - state_dict_tensor_buffer, size = _object_to_tensor(state_dict) - pg.send([size], dest_rank, 0).wait() - pg.send([state_dict_tensor_buffer], dest_rank, 0).wait() - - jobs = [] - for i, tensor in enumerate(tensors): - buffer = tensor - if isinstance(tensor, DTensor): - buffer = tensor.to_local() - - buffer = buffer.detach().cpu() - - jobs.append(pg.send([buffer], dest_rank, i)) - - for job in jobs: - job.wait() - - -def recv_state_dict(pg: ProcessGroup, src_rank: int, og_state_dict: dict) -> dict: - size = torch.LongTensor(1) - - # Receive object sizes - pg.recv([size], src_rank, 0).wait() - # Tensor to receive serialized objects into. - object_tensor = torch.empty(size.item(), dtype=torch.uint8) - - pg.recv([object_tensor], src_rank, 0).wait() - state_dict = _tensor_to_object(object_tensor, size) - - _, tensors = _get_sendable_state_dict(og_state_dict) - - jobs = [] - datas = [] - for i, tensor in enumerate(tensors): - buffer = tensor - if isinstance(tensor, DTensor): - buffer = tensor.to_local() - - data = torch.empty_like(buffer, device="cpu") - jobs.append(pg.recv([data], src_rank, i)) - datas.append(data) - - for job in jobs: - job.wait() - - for tensor, data in zip(tensors, datas): - if isinstance(tensor, DTensor): - tensor = tensor.to_local() - tensor.copy_(data) - - state_dict = _load_sendable_state_dict(tensors, state_dict) - - # logger = get_logger() - # logger.debug(f"recv tensors {get_tensor_list_signature(tensors)}") - - return state_dict diff --git a/src/zeroband/utils/stopwatch.py b/src/zeroband/utils/stopwatch.py deleted file mode 100644 index 2b49d4fb..00000000 --- a/src/zeroband/utils/stopwatch.py +++ /dev/null @@ -1,130 +0,0 @@ -import time - -from torch.autograd.profiler import record_function - -from zeroband.config import Config -from zeroband.utils.logger import get_logger - - -class _RecordBlockContext: - def __init__(self, sw, prof_name): - self.sw = sw - self.prof_name = prof_name - - def __enter__(self): - self.torch_context = record_function(self.prof_name) - self.torch_context.__enter__() - - if self.sw.disabled: - return self - self.sw.start_block(message=f"Starting \"{self.prof_name}\"") - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.torch_context.__exit__(exc_type, exc_val, exc_tb) - self.torch_context = None - - if self.sw.disabled: - return - self.sw.end_block(format_str=f"Finished \"{self.prof_name}\"") - - -class Stopwatch: - def __init__(self, config: Config | None = None): - self.timers: dict[str, dict[str, float]] = {} # Timer name -> {start_time, last_lap_time} - self.stack: list[str] = [] # List timer names in 
order of construction - self.logger = get_logger(config) - self.disabled = (config.log_level != "DEBUG") if config else False - - def _resolve_name(self, name: str | None) -> str: - if name is None: - if not self.stack: - raise ValueError("No active timers") - return self.stack[-1] - return name - - def start(self, name: str) -> None: - if self.disabled: - return - - current_time = time.perf_counter() - self.timers[name] = { - 'start_time': current_time, - 'last_lap_time': current_time - } - self.stack.append(name) - - def _lap(self, name: str | None = None) -> float: - if self.disabled: - return 0.0 - - name = self._resolve_name(name) - if name not in self.stack: - raise ValueError(f"Timer '{name}' is not active") - - timer = self.timers.get(name) - if not timer: - raise ValueError(f"Timer '{name}' does not exist") - - current_time = time.perf_counter() - elapsed = current_time - timer['last_lap_time'] - timer['last_lap_time'] = current_time - return elapsed - - def start_block(self, message: str | None = None, name: str | None = None) -> None: - if self.disabled: - return - - self._lap(name) - if message: - self.logger.debug(message) - - def end_block(self, format_str: str | None = None, name: str | None = None) -> None: - if self.disabled: - return - - lap_time = self._lap(name) - if not format_str: - return - elif "{" in format_str: - self.logger.debug(format_str.format(name=name, time=lap_time)) - else: - self.logger.debug(f"{format_str} in {lap_time:.2f} seconds") - - def elapsed(self, name: str | None = None) -> float: - if self.disabled: - return 0.0 - - name = self._resolve_name(name) - timer = self.timers.get(name) - if not timer: - raise ValueError(f"Timer '{name}' does not exist") - - current_time = time.perf_counter() - return current_time - timer['start_time'] - - def stop(self, name: str | None = None) -> float: - if self.disabled: - return 0.0 - - name = self._resolve_name(name) - elapsed = self.elapsed(name) - - if name in self.stack: - self.stack.remove(name) - self.timers.pop(name) - - return elapsed - - def reset(self) -> None: - self.timers.clear() - self.stack.clear() - - def record_block(self, prof_name: str) -> _RecordBlockContext: - """ - Wraps the torch profiler's record_function() and times the block with - start_block() and end_block(), using prof_name both as the profiler label - and in the start/finish debug messages. - """ - return _RecordBlockContext(self, prof_name) - diff --git a/src/zeroband/utils/wget.py b/src/zeroband/utils/wget.py deleted file mode 100644 index 849e504e..00000000 --- a/src/zeroband/utils/wget.py +++ /dev/null @@ -1,20 +0,0 @@ -import subprocess - -import shutil - -def _get_cut_dirs_from_url(url: str) -> int: - return len(url.rstrip().partition("//")[-1].split("/")) - -def wget(source: str, destination: str) -> None: - # logger = get_logger() - cmd = f"wget -r -np -nH --cut-dirs={_get_cut_dirs_from_url(source)} -P {destination} {source}" - - if shutil.which("wget") is None: - raise RuntimeError("wget is required but not found.
Please install wget and try again.") - - try: - subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True) - except subprocess.CalledProcessError as e: - # logger.error(f"Error output: {e.stderr}") - print(f"Error output: {e.stderr}") - raise e diff --git a/src/zeroband/utils/world_info.py b/src/zeroband/world_info.py similarity index 100% rename from src/zeroband/utils/world_info.py rename to src/zeroband/world_info.py diff --git a/tests/test_c/conftest.py b/tests/test_c/conftest.py deleted file mode 100644 index bb6ab323..00000000 --- a/tests/test_c/conftest.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest -import socket -from contextlib import contextmanager -import os -from unittest import mock - - -def get_random_available_port(): - # https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("", 0)) - return s.getsockname()[1] - - -@pytest.fixture() -def random_available_port(): - return get_random_available_port() - - -@pytest.fixture() -def dist_environment() -> callable: - @contextmanager - def dist_environment( - random_available_port, backend=None, rank=0, local_rank=0, world_size=1, local_world_size=1, global_unique_id="" - ): - with mock.patch.dict( - os.environ, - { - "GLOBAL_UNIQUE_ID": global_unique_id, - "RANK": str(rank), - "WORLD_SIZE": str(world_size), - "LOCAL_RANK": str(local_rank), - "LOCAL_WORLD_SIZE": str(local_world_size), - "MASTER_ADDR": "localhost", - "MASTER_PORT": str(random_available_port), - "ZERO_BAND_LOG_LEVEL": "DEBUG", - }, - ): - yield - - return dist_environment diff --git a/tests/test_c/test_collectives.py b/tests/test_c/test_collectives.py deleted file mode 100644 index 09c4b405..00000000 --- a/tests/test_c/test_collectives.py +++ /dev/null @@ -1,68 +0,0 @@ -import torch -import torch.distributed as dist -from zeroband.C.collectives import ring_allreduce -from zeroband.collectives import ring_allreduce_py -from zeroband.C.compression import uniform_8bit_quantize -import math -import pytest -import multiprocessing as mp - -N = 1_000_000 -TIME_COUNT = 2 - - -@pytest.mark.parametrize("world_size", [2, 4]) -@pytest.mark.parametrize("pg_source", ["gloo", "default"]) -def test_ring_allreduce(world_size: int, pg_source: str, random_available_port: int, dist_environment): - def all_reduce(rank: int, world_size: int): - with dist_environment(random_available_port, "gloo", rank=rank, world_size=world_size): - dist.init_process_group(backend="gloo") - rank = dist.get_rank() - world_size = dist.get_world_size() - if pg_source == "gloo": - store = dist.TCPStore( - host_name="localhost", - port=random_available_port + 1, - world_size=world_size, - is_master=(rank == 0), - ) - pg = dist.distributed_c10d.ProcessGroupGloo(store, rank, world_size) - else: - pg = dist.distributed_c10d._get_default_group() - a = torch.randn(N) * 10 - b = torch.clone(a) - c = torch.clone(a) - - ring_allreduce(a, dist.ReduceOp.SUM, pg) - ring_allreduce_py( - b, - dist.ReduceOp.SUM, - dist.distributed_c10d._get_default_group(), - quantization_func=uniform_8bit_quantize, - ) - dist.all_reduce(c, dist.ReduceOp.SUM, group=pg) - - if rank == 0: - error_new = torch.norm(a - c) - diff_new = (a - c).abs() - error_old = torch.norm(b - c) - diff_old = (b - c).abs() - print( - f"[New] norm: {error_new:.4f} diff mean: {diff_new.mean():.4f} std: {diff_new.std()} max: {diff_new.max():.4f}" - ) - print( - f"[Old] norm: {error_old:.4f} diff mean: {diff_old.mean():.4f} std: 
{diff_old.std()} max: {diff_old.max():.4f}" - ) - - assert (error_new - error_old).abs() / math.sqrt(N) < 0.5 - - dist.destroy_process_group() - - # Perform ring all-reduce - processes = [mp.Process(target=all_reduce, args=(rank, world_size)) for rank in range(world_size)] - for p in processes: - p.start() - for p in processes: - p.join() - if p.exitcode != 0: - pytest.fail(f"Process {p.pid} failed with exit code {p.exitcode}") diff --git a/tests/test_c/test_compression.py b/tests/test_c/test_compression.py deleted file mode 100644 index c713da6f..00000000 --- a/tests/test_c/test_compression.py +++ /dev/null @@ -1,78 +0,0 @@ -import torch -from torch.utils.benchmark import Timer -from zeroband.compression import uniform_8bit_quantize as uniform_8bit_quantize_old -from zeroband.compression import average_buckets as average_buckets_old - -from zeroband.C.compression import average_buckets, uniform_8bit_quantize, quantize_per_tensor_uint8 - -N = 10_000_000 -TIME_COUNT = 1 - - -def test_uniform_8bit_quantize(): - a = torch.randn(N) - - # Benchmark old function - timer_old = Timer( - stmt="uniform_8bit_quantize_old(a)", globals={"uniform_8bit_quantize_old": uniform_8bit_quantize_old, "a": a} - ) - time_old = timer_old.timeit(TIME_COUNT) - - # Benchmark new function - timer_new = Timer(stmt="uniform_8bit_quantize(a)", globals={"uniform_8bit_quantize": uniform_8bit_quantize, "a": a}) - time_new = timer_new.timeit(TIME_COUNT) - - print(f"New function time: {time_new.mean:.6f} seconds") - print(f"Old function time: {time_old.mean:.6f} seconds") - - new_result, new_lookup = uniform_8bit_quantize(a) - old_result, old_lookup = uniform_8bit_quantize_old(a) - - new_ten = new_lookup[new_result.long()] - old_ten = old_lookup[old_result.long()] - - new_err = torch.norm(new_ten - a) - old_err = torch.norm(old_ten - a) - new_diff = (new_ten - a).abs() - old_diff = (old_ten - a).abs() - print( - f"New error: {new_err:.6f} Diff mean: {new_diff.mean():.6f} Std: {new_diff.std():.6f} Max: {new_diff.max():.6f}" - ) - print( - f"Old error: {old_err:.6f} Diff mean: {old_diff.mean():.6f} Std: {old_diff.std():.6f} Max: {old_diff.max():.6f}" - ) - - -def test_quantize_per_tensor_uint8(): - a = torch.ones(N) * 10 - scale = 0.01 - print(f"Tensor size: {a.numel():,}") - - timer_new = Timer( - stmt="quantize_per_tensor(a, scale, 128)", - globals={"quantize_per_tensor": quantize_per_tensor_uint8, "a": a, "scale": scale}, - ) - time_new = timer_new.timeit(TIME_COUNT) - print(f"Custom quantize_per_tensor function time: {time_new.mean:.6f} seconds") - - timer_old = Timer( - stmt="torch.quantize_per_tensor(a, scale, 128, torch.quint8).int_repr()", - globals={"torch": torch, "a": a, "scale": scale}, - ) - time_old = timer_old.timeit(TIME_COUNT) - print(f"torch.quantize_per_tensor time: {time_old.mean:.6f} seconds") - - -def test_average_buckets(): - a = torch.randn(N) * 10 - b = torch.randint(0, 255, (N,), dtype=torch.uint8) - - timer_new = Timer(stmt="average_buckets(a, b, 256)", globals={"average_buckets": average_buckets, "a": a, "b": b}) - time_new = timer_new.timeit(TIME_COUNT) - print(f"Custom average_buckets function time: {time_new.mean:.6f} seconds") - - timer_old = Timer( - stmt="average_buckets(a, b, 256)", globals={"average_buckets": average_buckets_old, "a": a, "b": b} - ) - time_old = timer_old.timeit(TIME_COUNT) - print(f"torch.bucketize time: {time_old.mean:.6f} seconds") diff --git a/tests/test_configs.py b/tests/test_configs.py deleted file mode 100644 index eff493a6..00000000 --- a/tests/test_configs.py +++ 
/dev/null @@ -1,29 +0,0 @@ -""" -Tests all of the config files. Useful for catching mismatched keys after renaming an argument. -Needs to be run from the root folder -""" - -import os -from zeroband.train import Config -import pytest -import tomli - - -def get_all_toml_files(directory): - toml_files = [] - for root, _, files in os.walk(directory): - for file in files: - if file.endswith(".toml"): - toml_files.append(os.path.join(root, file)) - return toml_files - - -config_file_paths = get_all_toml_files("configs") - - -@pytest.mark.parametrize("config_file_path", config_file_paths) -def test_load_config(config_file_path): - with open(f"{config_file_path}", "rb") as f: - content = tomli.load(f) - config = Config(**content) - assert config is not None diff --git a/tests/test_data.py b/tests/test_data.py deleted file mode 100644 index fff9d537..00000000 --- a/tests/test_data.py +++ /dev/null @@ -1,272 +0,0 @@ -import copy -import torch -from zeroband.data import InterleaveDataset, ParquetDataset, SequencePackingDataSet, collate_fn -from torch.utils.data import DataLoader -from zeroband.data import load_all_datasets, DataConfig -from zeroband.utils.logger import get_logger -from collections import Counter -from itertools import chain -import pytest -import logging -import pyarrow as pa -import pyarrow.parquet as pq -from faker import Faker -from typing import List -import string -from torchdata.stateful_dataloader import StatefulDataLoader - - -@pytest.mark.skip(reason="not using hf for now") -@pytest.mark.parametrize( - "ratio, lower, upper", - [ - ("3:2", 1.2821, 1.7549), - ("0.5:1", 0.4247, 0.5886), - ], -) -def test_load_all_datasets_vanilla(ratio: str, lower: float, upper: float): - config = DataConfig( - dataset_name_or_paths="Jackmin108/abc-testing:A,Jackmin108/abc-testing:C", - dataset_ratio=ratio, - streaming=True, - fake=False, - ) - - ds = load_all_datasets(config, "train") - print(ds) - - dl = DataLoader(ds, batch_size=256) - batches = [i["text"] for i, _ in zip(dl, range(10))] - assert len(batches) == 10 - - # Check that the ratio is correct - letter_count = Counter(i[0] for i in chain(*batches)) - print(letter_count, letter_count["A"] / letter_count["C"]) - assert letter_count["A"] / letter_count["C"] < upper - assert letter_count["A"] / letter_count["C"] > lower - - -@pytest.mark.skip(reason="not using hf for now") -@pytest.mark.parametrize( - "ratio, lower, upper, data_rank, data_world_size", - [ - ("3:2", 1.2821, 1.7549, 1, 4), - ("0.5:1", 0.4247, 0.5886, 0, 3), - ], -) -def test_load_all_datasets_data_rank(ratio: str, lower: float, upper: float, data_rank: int, data_world_size: int): - get_logger().setLevel(logging.DEBUG) - config = DataConfig( - dataset_name_or_paths="Jackmin108/abc-testing:A,Jackmin108/abc-testing:C", - dataset_ratio=ratio, - streaming=True, - fake=False, - data_world_size=data_world_size, - data_rank=data_rank, - ) - - ds = load_all_datasets(config, "train") - print(ds) - - dl = DataLoader(ds, batch_size=256) - batches = [i["text"] for i, _ in zip(dl, range(10))] - assert len(batches) == 10 - - # Check that the ratio is correct - letter_count = Counter(i[0] for i in chain(*batches)) - print(letter_count, letter_count["A"] / letter_count["C"]) - assert letter_count["A"] / letter_count["C"] < upper - assert letter_count["A"] / letter_count["C"] > lower - - c_num_set = {int(i[1:]) for i in chain(*batches) if i[0] == "C"} - a_num_set = {int(i[1:]) for i in chain(*batches) if i[0] == "A"} - - # Check that the data is correctly sharded - first_a_shard =
set(range(data_rank * (2**12), (data_rank + 1) * (2**12))) - first_10_c_shard = set() - for i in range(data_rank, data_world_size * 10, data_world_size): - first_10_c_shard = first_10_c_shard.union(set(range(i * (2**8), (i + 1) * (2**8)))) - assert all(i in first_a_shard for i in a_num_set) - assert all(i in first_10_c_shard for i in c_num_set) - - -def test_sequence_packing(): - class FakeDataset(torch.utils.data.Dataset): - def __init__(self): - self.data = [[6, 1, 2, 3, 4], [6, 3, 3, 4, 1, 7], [3, 2], [1, 2], [1, 4, 5, 3, 4, 1, 7, 8]] - - def __len__(self): - return len(self.data) - - def __getitem__(self, index): - return {"input_ids": self.data[index]} - - MAX_SEQ_LEN = 8 - dataset = SequencePackingDataSet(FakeDataset(), max_seq_length=MAX_SEQ_LEN, eos_token=0) - - input_ids = [] - labels = [] - for data in dataset: - assert data["input_ids"].shape[0] == MAX_SEQ_LEN - assert data["labels"].shape[0] == MAX_SEQ_LEN - assert sum(data["seqlens"]) == MAX_SEQ_LEN - - input_ids.append(data["input_ids"].tolist()) - labels.append(data["labels"].tolist()) - - assert input_ids == [[6, 1, 2, 3, 4, 6, 3, 3], [3, 2, 1, 2, 1, 4, 5, 3]] - assert labels == [[1, 2, 3, 4, 0, 3, 3, 4], [2, 0, 2, 0, 4, 5, 3, 4]] - - -class SimpleTokenizer: - def __init__(self): - # Create vocabulary: a-z (0-25) and unknown token (26) - self.char_to_id = {char: idx for idx, char in enumerate(string.ascii_lowercase)} - self.unknown_token = 26 - - def encode(self, text: str) -> List[int]: - """Convert text to list of token ids""" - return [self.char_to_id.get(char.lower(), self.unknown_token) for char in text] - - -@pytest.fixture -def fake_sentences(): - """Generate 10,000 fake sentences (the parquet fixtures below consume the first 1,000)""" - fake = Faker() - return [fake.sentence() for _ in range(10_000)] - - -@pytest.fixture -def parquet_files(tmp_path, fake_sentences): - """Create 10 parquet files with 100 sentences each""" - files = [] - for i in range(10): - # Create data for this file - start_idx = i * 100 - sentences = fake_sentences[start_idx : start_idx + 100] - - # Create arrow table - table = pa.Table.from_arrays([pa.array(sentences)], names=["text"]) - - # Write to parquet file - file_path = tmp_path / f"data_{i}.parquet" - pq.write_table(table, file_path) - files.append(str(file_path)) - - return files - - -@pytest.fixture -def tokenizer(): - """Get a simple character-based tokenizer""" - return SimpleTokenizer() - - -def test_parquet_dataset_ckpt(parquet_files, tokenizer): - # Create first dataset and iterate halfway - dataset1 = ParquetDataset(parquet_files, tokenizer) - halfway_point = 100 - - for _, data in zip(range(halfway_point), dataset1): - pass - # Save state - state_dict = dataset1.state_dict() - - # Create new dataset and load state - dataset2 = ParquetDataset(parquet_files, tokenizer) - dataset2.load_state_dict(state_dict) - - max_to_yield = 200 - # Continue first dataset - - for _, data1, data2 in zip(range(max_to_yield), dataset1, dataset2): - assert data1["input_ids"] == data2["input_ids"] - - -def test_sequence_packing_dataset_ckpt(parquet_files, tokenizer): - dataset1 = SequencePackingDataSet(ParquetDataset(parquet_files, tokenizer), max_seq_length=16, eos_token=0) - - halfway_point = 100 - - for _, data in zip(range(halfway_point), dataset1): - pass - # Save state - state_dict = dataset1.state_dict() - - # Create new dataset and load state - dataset2 = SequencePackingDataSet(ParquetDataset(parquet_files, tokenizer), max_seq_length=16, eos_token=0) - dataset2.load_state_dict(state_dict) - - assert dataset1.state_dict() ==
dataset2.state_dict() - - max_to_yield = 199 - # Continue first dataset - - for _, data1, data2 in zip(range(max_to_yield), dataset1, dataset2): - assert (data1["input_ids"] == data2["input_ids"]).all() - assert (data1["labels"] == data2["labels"]).all() - assert data1["seqlens"] == data2["seqlens"] - - -def test_interleave_dataset_ckpt(parquet_files, tokenizer): - # Split parquet files into two groups to create two datasets - files1 = parquet_files[:2] # First two files - files2 = parquet_files[2:4] # Next two files - - # Create first dataset and iterate halfway - dataset1 = InterleaveDataset( - [ParquetDataset(files1, tokenizer), ParquetDataset(files2, tokenizer)], probabilities=[0.5, 0.5] - ) - - halfway_point = 100 - - for _, data in zip(range(halfway_point), dataset1): - pass - # Save state - state_dict = dataset1.state_dict() - - # Create new dataset and load state - dataset2 = InterleaveDataset( - [ParquetDataset(files1, tokenizer), ParquetDataset(files2, tokenizer)], probabilities=[0.5, 0.5] - ) - dataset2.load_state_dict(state_dict=copy.deepcopy(state_dict)) - - assert dataset1.state_dict() == dataset2.state_dict() - - max_to_yield = 250 - - for _, data1, data2 in zip(range(max_to_yield), dataset1, dataset2): - assert data1["input_ids"] == data2["input_ids"] - - -@pytest.mark.skip(reason="not working for now") -@pytest.mark.parametrize("num_workers", [0, 2, 16]) -def test_dataloader_parquet_dataset(parquet_files, tokenizer, num_workers): - dataset = SequencePackingDataSet(ParquetDataset(parquet_files, tokenizer), max_seq_length=8, eos_token=0) - - loader = StatefulDataLoader(dataset, batch_size=8, num_workers=num_workers, collate_fn=collate_fn) - - total_samples = 100 - - for _, _batch in zip(range(total_samples), loader): - ... - - # Save state - state_dict = loader.state_dict() - - # Create new loader and load state - dataset2 = SequencePackingDataSet(ParquetDataset(parquet_files, tokenizer), max_seq_length=8, eos_token=0) - - loader2 = StatefulDataLoader(dataset2, batch_size=8, num_workers=num_workers, collate_fn=collate_fn) - - print(state_dict) - - loader2.load_state_dict(state_dict) - - warmup = 10 - - for i, batch1, batch2 in zip(range(total_samples), loader, loader2): - if i > warmup: - assert (batch1["input_ids"] == batch2["input_ids"]).all() - assert (batch1["labels"] == batch2["labels"]).all() - assert (batch1["seqlens"] == batch2["seqlens"]).all() diff --git a/tests/test_dist/conftest.py b/tests/test_dist/conftest.py deleted file mode 100644 index 99361de8..00000000 --- a/tests/test_dist/conftest.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -torch distributed tests - -These tests are different from the torchrun integration tests - -They manually do the job of torchrun to start the distributed processes, making it easy to write unit tests -""" - -import torch -import pytest -from torch.distributed import destroy_process_group, init_process_group - - -import os -from unittest import mock -import socket -from contextlib import contextmanager -import gc - - -@pytest.fixture(autouse=True) -def memory_cleanup(): - # credits to : https://github.com/pytorch/pytorch/issues/82218#issuecomment-1675254117 - try: - gc.collect() - torch.cuda.empty_cache() - yield - finally: - gc.collect() - torch.cuda.empty_cache() - - -def get_random_available_port(): - # https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("", 0)) - return s.getsockname()[1] - - -@pytest.fixture() -def
random_available_port(): - return get_random_available_port() - - -@pytest.fixture() -def dist_environment() -> callable: - @contextmanager - def dist_environment( - random_available_port, backend=None, rank=0, local_rank=0, world_size=1, local_world_size=1, global_unique_id="" - ): - with mock.patch.dict( - os.environ, - { - "GLOBAL_UNIQUE_ID": global_unique_id, - "RANK": str(rank), - "WORLD_SIZE": str(world_size), - "LOCAL_RANK": str(local_rank), - "LOCAL_WORLD_SIZE": str(local_world_size), - "MASTER_ADDR": "localhost", - "MASTER_PORT": str(random_available_port), - "ZERO_BAND_LOG_LEVEL": "DEBUG", - }, - ): - try: - init_process_group(backend=backend) - torch.cuda.set_device(local_rank) - yield - finally: - destroy_process_group() - - return dist_environment - - -@pytest.fixture() -def mock_env() -> callable: - @contextmanager - def env(**kwargs): - kwargs = {k.upper(): str(v) for k, v in kwargs.items()} - with mock.patch.dict( - os.environ, - kwargs, - ): - yield - - return env diff --git a/tests/test_dist/test_comms.py b/tests/test_dist/test_comms.py deleted file mode 100644 index 28732949..00000000 --- a/tests/test_dist/test_comms.py +++ /dev/null @@ -1,237 +0,0 @@ -import time -import torch -import torch.distributed as dist -import pytest -from zeroband.comms import ElasticDeviceMesh -import multiprocessing as mp - -pytest.skip("Skipping test file", allow_module_level=True) -# skipping this test for now as they slow down the ci and we are going to remove them anyway - - -@pytest.mark.parametrize("world_size", [2, 8]) -def test_elastic_device_mesh_no_global(world_size: int, random_available_port: int, mock_env): - def foo(**kwargs): - with mock_env(**kwargs): - edm = ElasticDeviceMesh(enable=False) - - rank = int(kwargs["RANK"]) - a = torch.arange(3) * (rank + 1) - dist.all_reduce(a, op=dist.ReduceOp.SUM, group=edm.local_pg) - sum_ints = world_size * (world_size + 1) // 2 - assert torch.allclose(a, torch.tensor([0, sum_ints, 2 * sum_ints])) - - dist.all_reduce(a, op=dist.ReduceOp.SUM, group=edm.global_pg) - assert torch.allclose(a, torch.tensor([0, sum_ints, 2 * sum_ints])) - - del edm - - processes = [] - for rank in range(world_size): - processes.append( - mp.Process( - target=foo, - kwargs={ - "MASTER_ADDR": "localhost", - "MASTER_PORT": str(random_available_port), - "RANK": str(rank), - "WORLD_SIZE": str(world_size), - "LOCAL_RANK": str(rank), - "LOCAL_WORLD_SIZE": str(world_size), - "ZERO_BAND_LOG_LEVEL": "DEBUG", - }, - ) - ) - for p in processes: - p.start() - for p in processes: - p.join() - if p.exitcode != 0: - pytest.fail(f"Process {p.pid} failed with exit code {p.exitcode}") - - -@pytest.mark.parametrize("world_size", [2, 8]) -@pytest.mark.parametrize("global_world_size", [2, 8]) -def test_elastic_device_mesh(world_size: int, global_world_size: int, mock_env): - def foo(**kwargs): - with mock_env(**kwargs): - edm = ElasticDeviceMesh() - - rank = int(kwargs["RANK"]) - a = torch.arange(3) * (rank + 1) - dist.all_reduce(a, op=dist.ReduceOp.SUM, group=edm.local_pg) - sum_ints = world_size * (world_size + 1) // 2 - assert torch.allclose(a, torch.tensor([0, sum_ints, 2 * sum_ints])) - - global_rank = int(kwargs["GLOBAL_RANK"]) - a = torch.arange(3) * (global_rank + 1) + rank - dist.all_reduce(a, op=dist.ReduceOp.SUM, group=edm.global_pg) - sum_ints = global_world_size * (global_world_size + 1) // 2 - assert torch.allclose(a, torch.tensor([0, sum_ints, 2 * sum_ints]) + rank * global_world_size) - - del edm - - global_ports = [i for i in range(21970, 21970 + world_size)] - 
master_ports = [i for i in range(31000, 31000 + global_world_size)] - processes = [] - for global_rank in range(global_world_size): - for rank in range(world_size): - processes.append( - mp.Process( - target=foo, - kwargs={ - "MASTER_ADDR": "localhost", - "MASTER_PORT": str(master_ports[global_rank]), - "RANK": str(rank), - "WORLD_SIZE": str(world_size), - "LOCAL_RANK": str(rank), - "LOCAL_WORLD_SIZE": str(world_size), - "GLOBAL_UNIQUE_ID": str(global_rank), - "GLOBAL_ADDR": "localhost", - "GLOBAL_PORT": str(global_ports[0]), - "GLOBAL_RANK": str(global_rank), - "GLOBAL_WORLD_SIZE": str(global_world_size), - "ZERO_BAND_LOG_LEVEL": "DEBUG", - }, - ) - ) - for p in processes: - p.start() - for p in processes: - p.join() - if p.exitcode != 0: - pytest.fail(f"Process {p.pid} failed with exit code {p.exitcode}") - - -@pytest.mark.parametrize("world_size", [1, 2]) -@pytest.mark.parametrize("global_world_size", [2, 4]) -def test_elastic_device_mesh_on_off_ramp(world_size: int, global_world_size: int, mock_env): - ready_event = mp.Event() - - def foo(**kwargs): - with mock_env(**kwargs): - test_value = int(kwargs["TEST_VALUE"]) - - edm = ElasticDeviceMesh() - edm.maybe_reinit_global_pg() - assert edm.mesh_count == 0 - assert edm.global_pg.size() == global_world_size - - ready_event.wait() # Wait for bar to signal readiness - time.sleep(0.5) # Give time for bar to queue - - edm.maybe_reinit_global_pg() - assert edm.mesh_count == 0 - assert edm.global_pg.size() == global_world_size - - time.sleep(1) # TODO: I actually don't know why this is necessary - - edm.maybe_reinit_global_pg(admit_joiners=True) - assert edm.mesh_count == 1 - assert edm.global_pg.size() == global_world_size + 1 - - a = torch.arange(3) * (test_value + 1) - sum_ints = global_world_size * (global_world_size + 1) // 2 + 100 - dist.all_reduce(a, op=dist.ReduceOp.SUM, group=edm.global_pg) - assert torch.allclose(a, torch.tensor([0, sum_ints, 2 * sum_ints])) - - if test_value == 1: - return - time.sleep(2) - edm.maybe_reinit_global_pg() - assert edm.mesh_count == 2 - assert edm.global_pg.size() == global_world_size - - a = torch.arange(3) * (test_value + 1) - sum_ints = global_world_size * (global_world_size + 1) // 2 + 100 - 2 - dist.all_reduce(a, op=dist.ReduceOp.SUM, group=edm.global_pg) - assert torch.allclose(a, torch.tensor([0, sum_ints, 2 * sum_ints])) - - dist.barrier(edm.global_pg) - - del edm - - def bar(**kwargs): - with mock_env(**kwargs): - test_value = int(kwargs["TEST_VALUE"]) - time.sleep(1) - - ready_event.set() # Signal that we are about to queue - - edm = ElasticDeviceMesh() - assert edm.mesh_count == 1 - assert edm.global_pg.size() == global_world_size + 1 - - a = torch.arange(3) * test_value - sum_ints = global_world_size * (global_world_size + 1) // 2 + 100 - dist.all_reduce(a, op=dist.ReduceOp.SUM, group=edm.global_pg) - assert torch.allclose(a, torch.tensor([0, sum_ints, 2 * sum_ints])) - - edm.maybe_reinit_global_pg() - assert edm.mesh_count == 2 - assert edm.global_pg.size() == global_world_size - - a = torch.arange(3) * test_value - sum_ints = global_world_size * (global_world_size + 1) // 2 + 100 - 2 - dist.all_reduce(a, op=dist.ReduceOp.SUM, group=edm.global_pg) - assert torch.allclose(a, torch.tensor([0, sum_ints, 2 * sum_ints])) - - dist.barrier(edm.global_pg) - - del edm - - global_ports = [i for i in range(21970, 21970 + world_size)] - master_ports = [i for i in range(31000, 31000 + global_world_size + 1)] - processes = [] - for global_rank in range(global_world_size): - for rank in range(world_size): 
- processes.append( - mp.Process( - target=foo, - kwargs={ - "MASTER_ADDR": "localhost", - "MASTER_PORT": str(master_ports[global_rank]), - "RANK": str(rank), - "WORLD_SIZE": str(world_size), - "LOCAL_RANK": str(rank), - "LOCAL_WORLD_SIZE": str(world_size), - "GLOBAL_UNIQUE_ID": str(global_rank), - "GLOBAL_ADDR": "localhost", - "GLOBAL_PORT": str(global_ports[0]), - "GLOBAL_RANK": str(global_rank), - "GLOBAL_WORLD_SIZE": str(global_world_size), - "ZERO_BAND_LOG_LEVEL": "DEBUG", - "ZERO_BAND_LOG_ALL_RANK": "true", - "TEST_VALUE": str(global_rank), - }, - ) - ) - - for rank in range(world_size): - processes.append( - mp.Process( - target=bar, - kwargs={ - "MASTER_ADDR": "localhost", - "MASTER_PORT": str(master_ports[global_world_size]), - "RANK": str(rank), - "WORLD_SIZE": str(world_size), - "LOCAL_RANK": str(rank), - "LOCAL_WORLD_SIZE": str(world_size), - "GLOBAL_UNIQUE_ID": "A", - "GLOBAL_ADDR": "localhost", - "GLOBAL_PORT": str(global_ports[0]), - "GLOBAL_RANK": "100", - "GLOBAL_WORLD_SIZE": str(global_world_size), - "ZERO_BAND_LOG_LEVEL": "DEBUG", - "TEST_VALUE": "100", - }, - ) - ) - - for p in processes: - p.start() - for p in processes: - p.join() - if p.exitcode != 0: - pytest.fail(f"Process {p.pid} failed with exit code {p.exitcode}") diff --git a/tests/test_dist/test_diloco.py b/tests/test_dist/test_diloco.py deleted file mode 100644 index ba71f107..00000000 --- a/tests/test_dist/test_diloco.py +++ /dev/null @@ -1,64 +0,0 @@ -"""test Diloco.""" - -import multiprocessing -import pytest - -import torch -import torch.distributed as dist -from torch.distributed.fsdp import ShardingStrategy - -from zeroband.diloco import Diloco, DilocoConfig - - -@pytest.mark.skip("test failing since the introduction of the custom all-reduce") -@pytest.mark.parametrize("world_size", [2]) # [1, 2]) -def test_diloco_all_reduce(world_size, random_available_port, dist_environment): - """ - In this test we manually create an inner model and an outer model whose weights we control: - the inner model has weights: (rank + 1) / 2 - the outer model has weights: (rank + 1) - - Since we know the world_size, we can predict the result of the all-reduce of the pseudo-gradients and therefore test - that it is done correctly. - """ - - class FakeElasticDeviceMesh: - def __init__(self): - self.global_pg = dist.new_group(backend="gloo") - - def maybe_reinit_global_pg(self, *args, **kwargs) -> None: ...
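- - # Worked expectation (a sketch of the arithmetic checked below): each rank's - # pseudo-gradient is outer - inner = (rank + 1) - (rank + 1) / 2 = (rank + 1) / 2, - # so after the averaged all-reduce every entry should equal the sum over ranks - # of (rank + 1) / (2 * world_size), e.g. (0.5 + 1.0) / 2 = 0.75 for world_size = 2.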
- - def all_reduce(rank: int, world_size: int): - with dist_environment(random_available_port, rank=rank, world_size=world_size, global_unique_id=str(rank)): - diloco_config = DilocoConfig(inner_steps=10) - - model = torch.nn.Linear(10, 10) - - # init param to rank + 1 - for param in model.parameters(): - param.data = (rank + 1) * torch.ones_like(param.data).to("cuda") - - diloco = Diloco(diloco_config, model, ShardingStrategy.FULL_SHARD, FakeElasticDeviceMesh()) - - # simulate inner model updates - for param in model.parameters(): - param.data = (rank + 1) / 2 * torch.ones_like(param.data).to("cuda") - - diloco.sync_pseudo_gradient(model) - - for param in diloco.param_list_cpu: - print(f"param.grad.mean() {param.grad.mean()}") - target = ( - torch.ones_like(param.grad) - * sum([(rank + 1) - (rank + 1) / 2 for rank in range(world_size)]) - / world_size - ) - assert param.grad.mean() == target.mean() - - processes = [multiprocessing.Process(target=all_reduce, args=(rank, world_size)) for rank in range(world_size)] - for p in processes: - p.start() - for p in processes: - p.join() - if p.exitcode != 0: - pytest.fail(f"Process {p.pid} failed with exit code {p.exitcode}") diff --git a/tests/test_dist/test_send_state_dict.py b/tests/test_dist/test_send_state_dict.py deleted file mode 100644 index e4e1f22f..00000000 --- a/tests/test_dist/test_send_state_dict.py +++ /dev/null @@ -1,110 +0,0 @@ -import os -import pytest -import torch -from zeroband.comms import ElasticDeviceMesh -from zeroband.utils.state_dict_send_recv import ( - _get_sendable_state_dict, - _load_sendable_state_dict, - recv_state_dict, - send_state_dict, -) -import multiprocessing as mp - - -def test_load_state_dict(): - state_dict_to_send = { - "step": 0, - "world": "karl is having his best life", - "optim_sates": torch.ones(10), - "nested_data": {"foo": "bar", "tensor": torch.ones(10)}, - } - - state_dict_copy = { - "step": 0, - "world": "karl is having his best life", - "optim_sates": torch.ones(10), - "nested_data": {"foo": "bar", "tensor": torch.ones(10)}, - } - - non_tensored_state_send, tensors_send = _get_sendable_state_dict(state_dict_to_send) - - assert isinstance(non_tensored_state_send["optim_sates"], str) - assert non_tensored_state_send["optim_sates"].startswith("zeroband_tensor") - - print(len(tensors_send)) - print(non_tensored_state_send) - _load_sendable_state_dict(tensors_send, non_tensored_state_send) - - assert (state_dict_to_send["optim_sates"] == state_dict_copy["optim_sates"]).all() - assert id(state_dict_to_send["optim_sates"]) != id(state_dict_copy["optim_sates"]) - - assert (state_dict_to_send["nested_data"]["tensor"] == state_dict_copy["nested_data"]["tensor"]).all() - assert id(state_dict_to_send["nested_data"]["tensor"]) != id(state_dict_copy["nested_data"]["tensor"]) - - assert state_dict_to_send["step"] == state_dict_copy["step"] - assert state_dict_to_send["world"] == state_dict_copy["world"] - assert state_dict_to_send["nested_data"]["foo"] == state_dict_copy["nested_data"]["foo"] - - -@pytest.mark.skip(reason="hang") -@pytest.mark.parametrize("world_size", [2]) -def test_send_recv_state_dict(world_size: int, random_available_port: int, mock_env): - def foo(**kwargs): - with mock_env(**kwargs): - edm = ElasticDeviceMesh() - - state_dict_to_send = { - "step": 0, - "world": "karl is having his best life", - "optim_sates": torch.ones(10), - "nested_data": {"foo": "bar", "tensor": torch.ones(10)}, - } - - state_dict_to_recv = { - "step": 10, - "world": "karl is in holiday", - "optim_sates": 
torch.zeros(10), - "nested_data": {"foo": "barman", "tensor": torch.zeros(10)}, - } - - rank = int(os.environ.get("RANK")) - - if rank == 0: - send_state_dict(state_dict_to_send, 1, world_size) - else: - state_dict = recv_state_dict(pg=edm.global_pg, rank=0, world_size=world_size) - - assert (state_dict["optim_sates"] == state_dict_to_recv["optim_sates"]).all() - assert id(state_dict["optim_sates"]) != id(state_dict_to_recv["optim_sates"]) - - assert (state_dict["nested_data"]["tensor"] == state_dict_to_recv["nested_data"]["tensor"]).all() - assert id(state_dict["nested_data"]["tensor"]) != id(state_dict_to_recv["nested_data"]["tensor"]) - - assert state_dict["step"] == state_dict_to_recv["step"] - assert state_dict["world"] == state_dict_to_recv["world"] - assert state_dict["nested_data"]["foo"] == state_dict_to_recv["nested_data"]["foo"] - - del edm - - processes = [] - for rank in range(world_size): - processes.append( - mp.Process( - target=foo, - kwargs={ - "MASTER_ADDR": "localhost", - "MASTER_PORT": str(random_available_port), - "RANK": str(rank), - "WORLD_SIZE": str(world_size), - "LOCAL_RANK": str(rank), - "LOCAL_WORLD_SIZE": str(world_size), - "ZERO_BAND_LOG_LEVEL": "DEBUG", - }, - ) - ) - for p in processes: - p.start() - for p in processes: - p.join() - if p.exitcode != 0: - pytest.fail(f"Process {p.pid} failed with exit code {p.exitcode}") diff --git a/tests/test_model.py b/tests/test_model.py deleted file mode 100644 index 7853cb22..00000000 --- a/tests/test_model.py +++ /dev/null @@ -1,227 +0,0 @@ -import random -import pytest -import torch -from zeroband.models.llama import Transformer, llama2_configs -from zeroband.models.llama.model import Attention, ModelArgs, create_block_mask_from_seqlens - - -VOCAB_SIZE = 1024 - -ERROR_ATOL = { - torch.float: 3e-4, - torch.half: 4e-3, - torch.bfloat16: 2e-2, -} -ERROR_RTOL = { - torch.float: 2e-5, - torch.half: 4e-4, - torch.bfloat16: 5e-3, -} - - -@pytest.fixture -def llama_config() -> ModelArgs: - config = llama2_configs["debugmodel"] - config.vocab_size = VOCAB_SIZE - return config - - -def test_llama(llama_config: ModelArgs): - seq_len = 512 - bs = 8 - model = Transformer(llama_config).to("cuda") - input_ = torch.randint(0, llama_config.vocab_size, (bs, seq_len)).to("cuda") - with torch.autocast(device_type="cuda", dtype=torch.bfloat16): - output = model(input_) - - assert output.shape == (bs, seq_len, llama_config.vocab_size) - - -def get_freqs_cis(llama_config: ModelArgs): - model = Transformer(llama_config).to("cuda") - return model.freqs_cis - - -def test_attn(llama_config: ModelArgs): - seq_len = 512 - bs = 8 - - freqs_cis = get_freqs_cis(llama_config) - input_ = torch.rand(bs, seq_len, llama_config.dim).to("cuda") - seqlens = [torch.Tensor([seq_len]).int().to("cuda") for _ in range(bs)] - block_mask = create_block_mask_from_seqlens(seqlens) - - attn = Attention(llama_config).to("cuda") - - with torch.autocast(device_type="cuda", dtype=torch.bfloat16): - output_sdpa = attn(input_, freqs_cis) - - with torch.autocast(device_type="cuda", dtype=torch.bfloat16): - output_flex = attn(input_, freqs_cis, block_mask=block_mask) - - rtol = ERROR_RTOL[torch.bfloat16] - atol = ERROR_ATOL[torch.bfloat16] - assert output_sdpa.shape == output_flex.shape - torch.testing.assert_close(output_sdpa, output_flex, rtol=rtol, atol=atol) - - -def test_packing_simple(llama_config: ModelArgs): - seq_len = 512 - bs = 8 - - freqs_cis = get_freqs_cis(llama_config) - input_ = torch.rand(bs, seq_len, llama_config.dim).to("cuda") - seqlens = 
[torch.Tensor([seq_len // 4] * 4).int().to("cuda") for _ in range(bs)] - block_mask = create_block_mask_from_seqlens(seqlens) - - attn = Attention(llama_config).to("cuda") - - with torch.autocast(device_type="cuda", dtype=torch.bfloat16): - output = attn(input_, freqs_cis, block_mask=block_mask) - - assert output.shape == (bs, seq_len, llama_config.dim) - - -def test_sequence_packing_two_time_same_sequence(llama_config: ModelArgs): - """ - In this test we take a sequence and pack it with itself along the seqlen dimension. - We then pass the packed sequence to the attention layer and check that the output for each sequence is the same. - """ - - model = Attention(llama_config).to("cuda") - - emb = torch.nn.Embedding(10, llama_config.dim).to("cuda") - - seq = [2, 1, 4, 8] - input_stuff_raw = torch.Tensor([seq + seq]).long().to("cuda") - seqlens = [torch.Tensor([len(seq), len(seq)]).int().to("cuda")] - block_mask = create_block_mask_from_seqlens(seqlens) - - input_stuff = emb(input_stuff_raw) - - freqs_cis = get_freqs_cis(llama_config) - - with torch.autocast(device_type="cuda", dtype=torch.bfloat16): - output = model(input_stuff, freqs_cis, block_mask=block_mask) - - output_left = output[:, :4, :] - output_right = output[:, 4:, :] - - ### TESTING - assert output_left.shape == output_right.shape - - rtol = ERROR_RTOL[torch.bfloat16] - atol = ERROR_ATOL[torch.bfloat16] - torch.testing.assert_close(output_left, output_right, atol=atol, rtol=rtol) - - -def test_sequence_packing_vs_normal(llama_config: ModelArgs): - """ - Take two sequences and compare the output of attention on the individual sequences vs the output of attention on the packed sequence - """ - - model = Attention(llama_config).to("cuda") - emb = torch.nn.Embedding(10, llama_config.dim).to("cuda") - - freqs_cis = get_freqs_cis(llama_config) - - seq_1 = [2, 1, 4, 8] - seq_2 = [3, 7, 5, 6] - - input_packed_raw = torch.Tensor([seq_1 + seq_2]).long().to("cuda") - seqlens = [torch.Tensor([len(seq_1), len(seq_2)]).int().to("cuda")] - block_mask = create_block_mask_from_seqlens(seqlens) - - input_packed = emb(input_packed_raw) - - with torch.autocast(device_type="cuda", dtype=torch.bfloat16): - output = model(input_packed, freqs_cis, block_mask=block_mask) - - output_packed_1 = output[:, :4, :] - output_packed_2 = output[:, 4:, :] - - input_raw_1 = torch.Tensor([seq_1]).long().to("cuda") - input_raw_2 = torch.Tensor([seq_2]).long().to("cuda") - - emb_1 = emb(input_raw_1) - emb_2 = emb(input_raw_2) - - with torch.autocast(device_type="cuda", dtype=torch.bfloat16): - output_1 = model(emb_1, freqs_cis) - output_2 = model(emb_2, freqs_cis) - - rtol = ERROR_RTOL[torch.bfloat16] - atol = ERROR_ATOL[torch.bfloat16] - - ### TESTING - assert output_1.shape == output_packed_1.shape - assert output_2.shape == output_packed_2.shape - - torch.testing.assert_close(output_1, output_packed_1, atol=atol, rtol=rtol) - torch.testing.assert_close(output_2, output_packed_2, atol=atol, rtol=rtol) - - -def test_sequence_packing_vs_normal_random(llama_config: ModelArgs): - """ - Take two sequences and compare the output of attention on the individual sequences vs the output of attention on the packed sequence - """ - - model = Attention(llama_config).to("cuda") - - freqs_cis = get_freqs_cis(llama_config) - - MAX_SEQ_LEN = 256 - - for _ in range(10): - seq_len_cutoff = random.randint(1, MAX_SEQ_LEN) - - seq1 = seq_len_cutoff - seq2 = MAX_SEQ_LEN - seq_len_cutoff - input_1 = torch.rand(1, seq1, llama_config.dim).to("cuda") - input_2 = torch.rand(1, seq2,
llama_config.dim).to("cuda") - - seqlens = [torch.Tensor([seq1, seq2]).int().to("cuda")] - block_mask = create_block_mask_from_seqlens(seqlens) - - packed_input = torch.cat([input_1, input_2], dim=1) - - # packed output - with torch.autocast(device_type="cuda", dtype=torch.bfloat16): - output = model(packed_input, freqs_cis, block_mask=block_mask) - - output_packed_1 = output[:, :seq_len_cutoff, :] - output_packed_2 = output[:, seq_len_cutoff:, :] - - # normal output - with torch.autocast(device_type="cuda", dtype=torch.bfloat16): - output_1 = model(input_1, freqs_cis) - output_2 = model(input_2, freqs_cis) - - rtol = ERROR_RTOL[torch.bfloat16] - atol = ERROR_ATOL[torch.bfloat16] - - ### TESTING - assert output_1.shape == output_packed_1.shape - assert output_2.shape == output_packed_2.shape - - torch.testing.assert_close(output_1, output_packed_1, atol=atol, rtol=rtol) - torch.testing.assert_close(output_2, output_packed_2, atol=atol, rtol=rtol) - - -def test_end_to_end_packing(llama_config: ModelArgs): - model = Transformer(llama_config).to("cuda") - - BS = 8 - SEQ_LEN = 128 - - input_ = torch.randint(1, llama_config.vocab_size, (BS, SEQ_LEN)).to("cuda") - - seqlens = [torch.Tensor([SEQ_LEN // 4, SEQ_LEN // 4, SEQ_LEN // 2]).int().to("cuda") for _ in range(BS)] - block_mask = create_block_mask_from_seqlens(seqlens) - with torch.autocast(device_type="cuda", dtype=torch.bfloat16): - output = model(input_, block_mask=block_mask) - - assert output.shape == (BS, SEQ_LEN, llama_config.vocab_size) - - loss = output.mean() - loss.backward() # test that the backward for fa2 diff --git a/tests/test_torchrun/test_train.py b/tests/test_torchrun/test_train.py deleted file mode 100644 index 7c45b07c..00000000 --- a/tests/test_torchrun/test_train.py +++ /dev/null @@ -1,269 +0,0 @@ -import copy -import os -from pathlib import Path -import pickle -import subprocess -import pytest -import socket - -from zeroband.diloco import Compression - -import torch - -num_gpu = torch.cuda.device_count() - - -def get_random_available_port_list(num_port): - # https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number - ports = [] - - while len(ports) < num_port: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("", 0)) - new_port = s.getsockname()[1] - - if new_port not in ports: - ports.append(new_port) - - return ports - - -def get_random_available_port(num_port): - return get_random_available_port_list(num_port)[0] - - -def gpus_to_use(num_nodes, num_gpu, rank): - return ",".join(map(str, range(rank * num_gpu, (rank + 1) * num_gpu))) - - -def _test_multi_gpu(num_gpus, config, extra_args=[], diloco=False): - num_nodes, num_gpu = num_gpus[0], num_gpus[1] - - processes = [] - ports = get_random_available_port_list(num_nodes) - new_port = get_random_available_port(1) - for i in range(num_nodes): - cmd = [ - "torchrun", - f"--nproc_per_node={num_gpu}", - "--rdzv-endpoint", - f"localhost:{ports[i]}", - "src/zeroband/train.py", - f"@configs/{config}", - *extra_args, - ] - - env = copy.deepcopy(os.environ) - - if diloco: - new_env = { - "GLOBAL_RANK": str(i), - "GLOBAL_UNIQUE_ID": str(i), - "GLOBAL_ADDR": "localhost", - "GLOBAL_WORLD_SIZE": str(num_nodes), - "GLOBAL_PORT": str(new_port), - "GLOO_SOCKET_IFNAME": "lo", - } - env.update(new_env) - - env["CUDA_VISIBLE_DEVICES"] = gpus_to_use(num_nodes, num_gpu, i) - env["ZERO_BAND_LOG_LEVEL"] = "DEBUG" - - process1 = subprocess.Popen(cmd, env=env) - processes.append(process1) - - for process in processes: - result = 
process.wait() - if result != 0: - pytest.fail(f"Process {result} failed {result}") - - -@pytest.mark.parametrize("num_gpus", [[1, 1], [2, 1], [1, 2]]) -def test_multi_gpu(num_gpus): - _test_multi_gpu(num_gpus, "debug/normal.toml") - - -@pytest.mark.parametrize("num_gpus", [[2, 1], [2, 2]] if num_gpu >= 4 else [[2, 1]]) -def test_multi_gpu_diloco(num_gpus): - _test_multi_gpu(num_gpus, "debug/diloco.toml", diloco=True) - - -def test_act_ckpt(): - num_gpus = [1, 2] - _test_multi_gpu(num_gpus, "debug/normal.toml", extra_args=["--train.ac_ckpt"]) - - -def test_act_ckpt_num(): - num_gpus = [1, 2] - _test_multi_gpu(num_gpus, "debug/normal.toml", extra_args=["--train.ac_ckpt", "2"]) - - -@pytest.mark.parametrize("backend", [Compression.NO, Compression.UINT8]) -def test_all_reduce_diloco(backend: Compression): - num_gpus = [2, 1] - _test_multi_gpu(num_gpus, "debug/diloco.toml", extra_args=["--diloco.compression", backend.value], diloco=True) - - -def test_z_loss(): - num_gpus = [1, 1] - _test_multi_gpu(num_gpus, "debug/normal.toml", extra_args=["--optim.z_loss"]) - - -@pytest.mark.parametrize("packing", [True, False]) -def test_packing(packing: bool): - num_gpus = [2, 1] - packing_arg = "--train.sequence_packing" if packing else "--no-train.sequence_packing" - _test_multi_gpu(num_gpus, "debug/normal.toml", extra_args=[packing_arg]) - - -@pytest.mark.parametrize("diloco", [False, True]) -def test_soap(diloco: bool): - num_gpus = [1, 2] if diloco else [2, 1] - _test_multi_gpu( - num_gpus, - "debug/diloco.toml" if diloco else "debug/normal.toml", - extra_args=["--optim.optim.precondition_frequency", "1"], - diloco=diloco, - ) - - -@pytest.mark.parametrize("soap", [False, True]) -def test_ckpt(tmp_path: Path, soap: bool): - num_gpus = [1, 2] - v1_file = tmp_path / "v1.log" - v2_file = tmp_path / "v2.log" - # v3_file = tmp_path / "v3.log" - - v1_ckpt = tmp_path / "v1_ckpt" - v2_ckpt = tmp_path / "v2_ckpt" - # v3_ckpt = tmp_path / "v3_ckpt" - - os.mkdir(v1_ckpt) - os.mkdir(v2_ckpt) - # os.mkdir(v3_ckpt) - - _test_multi_gpu( - num_gpus, - "debug/diloco.toml", - extra_args=[ - "--project", - str(v1_file), - "--ckpt.path", - str(v1_ckpt), - "--ckpt.interval", - "5", - "--optim.total_steps", - "20", - "--train.log_model_hash", - "--no-train.sequence_packing", - "--train.attn_fn", - "math", - ] - + (["--optim.optim.precondition_frequency", "1"] if soap else []), - diloco=True, - ) - _test_multi_gpu( - num_gpus, - "debug/diloco.toml", - extra_args=[ - "--project", - str(v2_file), - "--ckpt.path", - str(v2_ckpt), - "--ckpt.interval", - "5", - "--ckpt.resume", - str(v1_ckpt / "step_5"), - "--optim.total_steps", - "20", - "--train.log_model_hash", - "--no-train.sequence_packing", - "--train.attn_fn", - "math", - ] - + (["--optim.optim.precondition_frequency", "1"] if soap else []), - diloco=True, - ) - # _test_multi_gpu( - # num_gpus, - # "debug/diloco.toml", - # extra_args=[ - # "--project", - # str(v3_file), - # "--ckpt.path", - # str(v3_ckpt), - # "--ckpt.interval", - # "5", - # "--ckpt.resume", - # str(v2_ckpt / "step_10"), - # "--optim.total_steps", - # "20", - # "--train.log_model_hash", - # "--no-train.sequence_packing", - # "--train.attn_fn", - # "math", - # ], - # diloco=True, - # ) - - key_to_round = ["Perplexity", "Loss"] - digit_to_round = [0, 3] - - def read_logs(path: Path): - with path.open("rb") as f: - data = pickle.load(f) - - filtered_data = {} - for entry in data: - step = entry.pop("step") - - # Round perplexity and loss - for key, digit in zip(key_to_round, digit_to_round): - if key in 
entry: - entry[key] = round(entry[key], digit) - - if step in filtered_data: - filtered_data[step].update(entry) - else: - filtered_data[step] = entry - - return filtered_data - - v1_data = read_logs(v1_file) - v2_data = read_logs(v2_file) - # v3_data = read_logs(v3_file) - - ## check that loading from v1 to v2 worked - - # first check that the hash of saving is the same as the hash of loading - assert v1_data[5]["inner_model_hash_save"] == v2_data[5]["inner_model_hash_resume"] - assert v1_data[5]["inner_optimizer_hash_save"] == v2_data[5]["inner_optimizer_hash_resume"] - assert v1_data[5]["outer_optimizer_hash_save"] == v2_data[5]["outer_optimizer_hash_resume"] - assert v1_data[5]["outer_model_hash_save"] == v2_data[5]["outer_model_hash_resume"] - - # then we check that the loss and lr value are the same after loading the ckpt - for step, data_v2 in v2_data.items(): - if step == 5: - continue # not testing step 5 as it's the one we restarted from - - data_v1 = v1_data[step] - assert abs(data_v1["Loss"] - data_v2["Loss"]) < 0.1 - assert data_v1["inner_lr"] == data_v2["inner_lr"] - assert data_v1["total_tokens"] == data_v2["total_tokens"] - - # ## check that the second loading is working - # ## why? We had bugs where checkpointing worked but resuming training did not - - # assert v2_data[10]["inner_model_hash_save"] == v3_data[10]["inner_model_hash_resume"] - # assert v2_data[10]["inner_optimizer_hash_save"] == v3_data[10]["inner_optimizer_hash_resume"] - # assert v2_data[10]["outer_optimizer_hash_save"] == v3_data[10]["outer_optimizer_hash_resume"] - # assert v2_data[10]["outer_model_hash_save"] == v3_data[10]["outer_model_hash_resume"] - - # for step, data_v3 in v3_data.items(): - # if step == 10: - # continue # not testing step 10 as it's the one we restarted from - - # data_v2 = v2_data[step] - # assert data_v2["Loss"] == data_v3["Loss"] - # assert data_v2["inner_lr"] == data_v3["inner_lr"] - # assert data_v2["total_tokens"] == data_v3["total_tokens"] diff --git a/third_party/gloo b/third_party/gloo deleted file mode 160000 index 5354032e..00000000 --- a/third_party/gloo +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5354032ea08eadd7fc4456477f7f7c6308818509 diff --git a/train_ddp.py b/train_ddp.py new file mode 100644 index 00000000..cffdbb8b --- /dev/null +++ b/train_ddp.py @@ -0,0 +1,385 @@ +from dataclasses import dataclass +import os +import time +from typing import TYPE_CHECKING, Literal + +import torch +import torch.distributed as dist +import wandb + +from zeroband.data import TEST_VOCAB_SIZE, DataConfig, get_dataloader +from zeroband.lr_scheduler import get_scheduler +from zeroband.models.llama import get_model +from zeroband.models.llama.model import create_block_mask_from_seqlens +from zeroband.utils import ( + FakeTokenizer, + PerfCounter, + get_peak_flops, + get_num_params, + get_num_flop_per_token, + apply_ac_ckpt, +) +from zeroband.logger import get_logger + +from transformers import AutoTokenizer +from pydantic_config import BaseConfig, parse_argv +import torch.nn.functional as F +from torch.nn.parallel import DistributedDataParallel as DDP + +from zeroband.world_info import get_world_info + +from torch import Tensor + + +class MuonConfig(BaseConfig): + type: Literal["muon"] = "muon" + lr: float = 2e-2 + weight_decay: float = 0.01 + momentum: float = 0.95 + nesterov: bool = True + ns_steps: int = 5 + + +class OptimConfig(BaseConfig): + optim: MuonConfig = MuonConfig() + sched_type: Literal["cosine", "linear", "wsd-sqrt"] = "cosine" + warmup_steps: int = 1000 +
+    stable_steps: int = 80_000
+    total_steps: int = 88_000
+    batch_size: int = 512
+
+
+class TrainConfig(BaseConfig):
+    micro_bs: int = 1
+    ac_ckpt: bool | int = False
+    reshard_after_forward: bool = True  # True means full shard (ZeRO-3); False keeps params gathered after forward, like the old SHARD_GRAD_OP (ZeRO-2)
+    torch_compile: bool = True
+
+
+class Config(BaseConfig):
+    name_model: Literal["debugmodel", "70M", "150M", "271M", "1B", "7B", "10B", "13B", "26B", "70B"] = "150M"
+    type_model: Literal["llama2", "llama3"] = "llama3"
+
+    project: str = "prime_simple"
+    wandb: bool = True
+
+    data: DataConfig = DataConfig()
+    optim: OptimConfig = OptimConfig()
+    train: TrainConfig
+
+
+@dataclass
+class TrainingProgress:
+    total_tokens: int
+    outer_step: int
+    step: int
+
+
+def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor:
+    """
+    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
+    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
+    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
+    zero even beyond the point where the iteration no longer converges all the way to one everywhere
+    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
+    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
+    performance at all relative to UV^T, where USV^T = G is the SVD.
+    """
+    assert (
+        G.ndim >= 2
+    )  # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
+    a, b, c = (3.4445, -4.7750, 2.0315)
+    X = G.bfloat16()
+    if G.size(-2) > G.size(-1):
+        X = X.mT
+
+    # Ensure spectral norm is at most 1
+    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
+    # Perform the NS iterations
+    for _ in range(steps):
+        A = X @ X.mT
+        B = (
+            b * A + c * A @ A
+        )  # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
+        X = a * X + B @ X
+
+    if G.size(-2) > G.size(-1):
+        X = X.mT
+    return X
+
+
+class Muon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+
+    https://kellerjordan.github.io/posts/muon/
+
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+
+    Some warnings:
+    - This optimizer should not be used for the embedding layer, the final fully connected layer,
+    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
+    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
+
+    Arguments:
+        lr: The learning rate used by the internal SGD.
+        momentum: The momentum used by the internal SGD.
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
+        ns_steps: The number of Newton-Schulz iteration steps to use.
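+
+    Example (illustrative sketch, not part of the training entry point below;
+    `hidden_weights` is a hypothetical list of 2D CUDA parameters):
+
+        dist.init_process_group("nccl")  # step() all-gathers even when world_size == 1
+        muon = Muon(hidden_weights, lr=0.02, rank=0, world_size=1)
+        loss.backward()
+        muon.step()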
+ """ + + def __init__( + self, params, lr=0.02, weight_decay=0.01, momentum=0.95, nesterov=True, ns_steps=5, rank=0, world_size=1 + ): + self.rank = rank + self.world_size = world_size + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps) + params: list[Tensor] = [*params] + param_groups = [] + for size in {p.numel() for p in params}: + b = torch.empty(world_size, size, dtype=torch.bfloat16, device="cuda") + group = dict( + params=[p for p in params if p.numel() == size], + update_buffer=b, + update_buffer_views=[b[i] for i in range(world_size)], + ) + param_groups.append(group) + super().__init__(param_groups, defaults) + + @torch.no_grad() + def step(self): + for group in self.param_groups: + update_buffer: Tensor = group["update_buffer"] + update_buffer_views: list[Tensor] = group["update_buffer_views"] + # generate weight updates in distributed fashion + params: list[Tensor] = group["params"] + handle = None + params_world = None + + def update_prev(): # optimized Muon implementation contributed by @YouJiacheng + handle.wait() + for p_world, g_world in zip(params_world, update_buffer_views): + p_world.mul_(1 - group["lr"] * group["weight_decay"]) + p_world.add_( + g_world.view_as(p_world), + alpha=-group["lr"] * max(1, p_world.size(-2) / p_world.size(-1)) ** 0.5, + ) + + for base_i in range(len(params))[:: self.world_size]: + if base_i + self.rank < len(params): + p = params[base_i + self.rank] + g = p.grad + assert g is not None + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf: Tensor = state["momentum_buffer"] + buf.lerp_(g, 1 - group["momentum"]) + g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf + if g.ndim == 4: # for the case of conv filters + g = g.view(len(g), -1) + g = zeropower_via_newtonschulz5(g, steps=group["ns_steps"]).flatten() + else: + g = update_buffer_views[self.rank] + if base_i > 0: + update_prev() # async all_gather instead of sync all_reduce by @YouJiacheng + handle = dist.all_gather_into_tensor(update_buffer, g, async_op=True) + params_world = params[base_i : base_i + self.world_size] + update_prev() + + +def train(config: Config): + # batch_size is the total batch size for all GPUs + assert config.optim.batch_size % world_info.local_world_size == 0 + batch_size = config.optim.batch_size // world_info.local_world_size + + assert batch_size % config.train.micro_bs == 0, ( + f"The micro batch size ({config.train.micro_bs}) must divide the number of samples on each GPU ({batch_size})." 
+ ) + gradient_accumulation_steps = batch_size // config.train.micro_bs + + # Load tokenizer + if config.data.fake and config.name_model == "debugmodel": + tokenizer = FakeTokenizer() + elif config.type_model == "llama2": + tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=True) + elif config.type_model == "llama3": + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", use_fast=True) + else: + raise ValueError(f"Model type {config.type_model} not supported") + + train_dataloader = get_dataloader( + tokenizer=tokenizer, + world_size=world_info.world_size, + rank=world_info.rank, + batch_size=config.train.micro_bs, + data_config=config.data, + ) + train_dataloader_iterator = iter(train_dataloader) + + model, model_config = get_model( + type_model=config.type_model, + name_model=config.name_model, + seq_length=config.data.seq_length, + vocab_size=len(tokenizer) if config.name_model != "debugmodel" or not config.data.fake else TEST_VOCAB_SIZE, + ) + model = model.to(world_info.local_rank) + + gpu_peak_flops = get_peak_flops(torch.cuda.get_device_name(torch.device("cuda"))) + logger.info(f"Peak FLOPS used for computing MFU: {gpu_peak_flops:.3e}") + + num_params = get_num_params(model, exclude_embedding=True) + logger.info(f"Number of parameters: {num_params}") + num_flop_per_token = get_num_flop_per_token( + num_params, + model_config, + config.data.seq_length, + ) + + if config.train.ac_ckpt: + num = 1 if isinstance(config.train.ac_ckpt, bool) else config.train.ac_ckpt + apply_ac_ckpt(model, num) + + logger.info(f"Initializing DDP model on device {world_info.local_rank}") + model = DDP(model, device_ids=[world_info.local_rank], broadcast_buffers=False, gradient_as_bucket_view=True) + + hidden_matrix_params = [p for n, p in model.module.layers.named_parameters() if p.ndim >= 2 and "embed" not in n] + embed_params = [p for n, p in model.module.named_parameters() if "embed" in n] + scalar_params = [p for p in model.module.parameters() if p.ndim < 2] + head_params = [model.module.output.weight] + + # init the optimizer(s) + adam_params = [ + dict(params=head_params, lr=0.008), + dict(params=embed_params, lr=0.6), + dict(params=scalar_params, lr=0.04), + ] + optimizer1 = torch.optim.Adam(adam_params, betas=(0.8, 0.95), eps=1e-10, fused=True) + optimizer2 = Muon( + hidden_matrix_params, + lr=config.optim.optim.lr, + momentum=config.optim.optim.momentum, + nesterov=config.optim.optim.nesterov, + ns_steps=config.optim.optim.ns_steps, + rank=world_info.rank, + world_size=world_info.world_size, + ) + + optimizers = [optimizer2, optimizer1] + + schedulers = [ + get_scheduler( + sched_type=config.optim.sched_type, + optimizer=optimizer, + num_warmup_steps=config.optim.warmup_steps, + num_stable_steps=config.optim.stable_steps, + num_training_steps=config.optim.total_steps, + ) + for optimizer in optimizers + ] + + training_progress = TrainingProgress(total_tokens=0, outer_step=0, step=0) + + if world_info.rank == 0 and config.wandb: + wandb.init(project=config.project, config=config.model_dump()) + + if config.train.torch_compile: + model = torch.compile(model) if not TYPE_CHECKING else model + + perf_counter = PerfCounter(window_size=10) + + while True: + loss_batch = 0 + + for grad_acc_step in range(gradient_accumulation_steps): + is_accumulating = grad_acc_step < gradient_accumulation_steps - 1 + # no sync if we are accumulating gradients + model.require_backward_grad_sync = not is_accumulating + + batch = next(train_dataloader_iterator) + input_ids = 
batch["input_ids"].to("cuda") + labels = batch["labels"].to("cuda") + seqlens = [seqlen.to("cuda") for seqlen in batch["seqlens"]] + block_mask = create_block_mask_from_seqlens(seqlens) if seqlens is not None else None + + logits = model(tokens=input_ids, block_mask=block_mask).contiguous() + flatten_logits = logits.reshape(-1, logits.size(-1)) # b seq vocab -> (b * seq) vocab + flatten_labels = labels.reshape(-1) # b seq -> (b * seq) + + ce_loss = F.cross_entropy(flatten_logits, flatten_labels) + + del logits + del flatten_logits + del flatten_labels + + loss = ce_loss / gradient_accumulation_steps + loss.backward() + loss_batch += loss.detach().clone() + + # Launch both allreduces at the same time to hide latency + dist.all_reduce(tensor=loss_batch, op=dist.ReduceOp.AVG) + + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # type: ignore (is a dtensor) + + for optimizer, scheduler in zip(optimizers, schedulers): + optimizer.step() + scheduler.step() + + optimizer.zero_grad() + + # logging + training_progress.step += 1 + inner_lr = [group["lr"] for group in optimizers[0].param_groups][0] + + # syncing loss across all data parallel rank within a nodes + new_tokens = config.data.seq_length * config.optim.batch_size + perf_counter.count_tokens(new_tokens) + training_progress.total_tokens += new_tokens + + metrics = { + "Loss": loss_batch.item(), + "step": training_progress.step, + "inner_lr": inner_lr, + "Perplexity": torch.exp(loss_batch).item(), + "total_tokens": training_progress.total_tokens, + "time": time.time(), + "grad_norm": grad_norm.item(), + } + + log = f"step: {training_progress.step}, loss: {loss_batch.item():.4f}" + + tokens_per_second = perf_counter.get_tokens_per_second() + if tokens_per_second is not None: + metrics["tokens_per_second"] = tokens_per_second + metrics["mfu"] = 100 * num_flop_per_token * tokens_per_second / gpu_peak_flops / world_info.local_world_size + log += f", tokens_per_second: {tokens_per_second:.2f}, mfu: {metrics['mfu']:.2f}" + + if world_info.rank == 0 and config.wandb: + wandb.log(metrics) + + logger.info(log) + + if training_progress.step > config.optim.total_steps: + break + + logger.info("Training finished, exiting ...") + + +if __name__ == "__main__": + # Allow eager fallback during production so that that the training runs dont die + # However, in development, we want to know that we broke torch compile + torch._dynamo.config.suppress_errors = "ZERO_BAND_DEV" not in os.environ # type: ignore + torch.set_float32_matmul_precision("high") + torch.manual_seed(42) + + config = Config(**parse_argv()) # type: ignore + world_info = get_world_info() + logger = get_logger() + + torch.cuda.set_device(world_info.local_rank) + dist.init_process_group(backend="nccl") + + train(config) diff --git a/train_ddp_manual.py b/train_ddp_manual.py new file mode 100644 index 00000000..bf87c732 --- /dev/null +++ b/train_ddp_manual.py @@ -0,0 +1,323 @@ +from dataclasses import dataclass +import os +import time +from typing import TYPE_CHECKING, Literal + +import torch +import torch.distributed as dist +import wandb + +from zeroband.data import TEST_VOCAB_SIZE, DataConfig, get_dataloader +from zeroband.lr_scheduler import get_scheduler +from zeroband.models.llama import get_model +from zeroband.models.llama.model import create_block_mask_from_seqlens +from zeroband.utils import ( + FakeTokenizer, + PerfCounter, + get_peak_flops, + get_num_params, + get_num_flop_per_token, + apply_ac_ckpt, +) +from zeroband.logger import get_logger + +from transformers 
+from pydantic_config import BaseConfig, parse_argv
+import torch.nn.functional as F
+
+from zeroband.world_info import get_world_info
+
+
+class AdamConfig(BaseConfig):
+    type: Literal["adam"] = "adam"
+    lr: float = 4e-4
+    weight_decay: float = 0.1
+    betas1: float = 0.9
+    betas2: float = 0.95
+
+
+class OptimConfig(BaseConfig):
+    optim: AdamConfig = AdamConfig()
+    sched_type: Literal["cosine", "linear", "wsd-sqrt"] = "cosine"
+    warmup_steps: int = 1000
+    stable_steps: int = 80_000
+    total_steps: int = 88_000
+    batch_size: int = 512
+
+
+class TrainConfig(BaseConfig):
+    micro_bs: int = 1
+    ac_ckpt: bool | int = False
+    reshard_after_forward: bool = True  # True means full shard (ZeRO-3); False keeps params gathered after forward, like the old SHARD_GRAD_OP (ZeRO-2)
+    torch_compile: bool = True
+
+
+class Config(BaseConfig):
+    name_model: Literal["debugmodel", "70M", "150M", "271M", "1B", "7B", "10B", "13B", "26B", "70B"] = "150M"
+    type_model: Literal["llama2", "llama3"] = "llama3"
+
+    project: str = "prime_simple"
+    wandb: bool = True
+
+    data: DataConfig = DataConfig()
+    optim: OptimConfig = OptimConfig()
+    train: TrainConfig
+
+
+@dataclass
+class TrainingProgress:
+    total_tokens: int
+    outer_step: int
+    step: int
+
+
+def zeropower_via_svd(G, steps=None):
+    U, S, V = G.svd()
+    return U @ V.T
+
+
+class Muon(torch.optim.Optimizer):
+    """
+    Muon: MomentUm Orthogonalized by Newton-schulz
+
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+
+    Some warnings:
+    - This optimizer assumes that all parameters passed in are 2D.
+    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
+    parameters; those should all be optimized by a standard method (e.g., AdamW).
+    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
+    - We believe it is unlikely to work well for training with small batch size.
+    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
+    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).
+
+    Arguments:
+        lr: The learning rate used by the internal SGD.
+        momentum: The momentum used by the internal SGD.
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
+
+    Note: this variant orthogonalizes with an exact SVD (see zeropower_via_svd
+    above) rather than an iterative Newton-Schulz backend, so there are no
+    backend/backend_steps arguments.
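+
+    Worked example (illustrative): if G = U diag(3.0, 0.5) V^T, then
+    zeropower_via_svd(G) returns U V^T -- the singular values are snapped to 1
+    while the singular vectors, i.e. the direction of the update, are preserved.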
+ """ + + def __init__(self, params, lr=3e-4, momentum=0.95, nesterov=True): + defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov) + super().__init__(params, defaults) + + def step(self): + for group in self.param_groups: + lr = group["lr"] + momentum = group["momentum"] + for p in group["params"]: + g = p.grad + if g is None: + continue + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if group["nesterov"]: + g = g.add(buf, alpha=momentum) + if g.size(0) == 3 * g.size(1): # split grouped QKV parameters + g = torch.cat([zeropower_via_svd(g1, steps=group["backend_steps"]) for g1 in g.split(g.size(1))]) + scale = g.size(1) ** 0.5 + else: + g = zeropower_via_svd(g) + scale = max(g.size(0), g.size(1)) ** 0.5 # scale to have update.square().mean() == 1 + p.data.add_(g, alpha=-lr * scale) + + +def train(config: Config): + # batch_size is the total batch size for all GPUs + assert config.optim.batch_size % world_info.local_world_size == 0 + batch_size = config.optim.batch_size // world_info.local_world_size + + assert batch_size % config.train.micro_bs == 0, ( + f"The micro batch size ({config.train.micro_bs}) must divide the number of samples on each GPU ({batch_size})." + ) + gradient_accumulation_steps = batch_size // config.train.micro_bs + + # Load tokenizer + if config.data.fake and config.name_model == "debugmodel": + tokenizer = FakeTokenizer() + elif config.type_model == "llama2": + tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=True) + elif config.type_model == "llama3": + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", use_fast=True) + else: + raise ValueError(f"Model type {config.type_model} not supported") + + train_dataloader = get_dataloader( + tokenizer=tokenizer, + world_size=world_info.world_size, + rank=world_info.rank, + batch_size=config.train.micro_bs, + data_config=config.data, + ) + train_dataloader_iterator = iter(train_dataloader) + + model, model_config = get_model( + type_model=config.type_model, + name_model=config.name_model, + seq_length=config.data.seq_length, + vocab_size=len(tokenizer) if config.name_model != "debugmodel" or not config.data.fake else TEST_VOCAB_SIZE, + ) + model = model.to(world_info.local_rank) + + gpu_peak_flops = get_peak_flops(torch.cuda.get_device_name(torch.device("cuda"))) + logger.info(f"Peak FLOPS used for computing MFU: {gpu_peak_flops:.3e}") + + num_params = get_num_params(model, exclude_embedding=True) + logger.info(f"Number of parameters: {num_params}") + num_flop_per_token = get_num_flop_per_token( + num_params, + model_config, + config.data.seq_length, + ) + + if config.train.ac_ckpt: + num = 1 if isinstance(config.train.ac_ckpt, bool) else config.train.ac_ckpt + apply_ac_ckpt(model, num) + + hidden_matrix_params = [p for n, p in model.layers.named_parameters() if p.ndim >= 2 and "embed" not in n] + embed_params = [p for n, p in model.named_parameters() if "embed" in n] + scalar_params = [p for p in model.parameters() if p.ndim < 2] + head_params = [model.output.weight] + + # init the optimizer(s) + adam_params = [ + dict(params=head_params, lr=0.008), + dict(params=embed_params, lr=0.6), + dict(params=scalar_params, lr=0.04), + ] + optimizer1 = torch.optim.Adam(adam_params, betas=(0.8, 0.95), eps=1e-10, fused=True) + optimizer2 = Muon( + hidden_matrix_params, + lr=config.optim.optim.lr, + momentum=0.95, + nesterov=True, + ) + + optimizers = 
+
+    schedulers = [
+        get_scheduler(
+            sched_type=config.optim.sched_type,
+            optimizer=optimizer,
+            num_warmup_steps=config.optim.warmup_steps,
+            num_stable_steps=config.optim.stable_steps,
+            num_training_steps=config.optim.total_steps,
+        )
+        for optimizer in optimizers
+    ]
+
+    training_progress = TrainingProgress(total_tokens=0, outer_step=0, step=0)
+
+    if world_info.rank == 0 and config.wandb:
+        wandb.init(project=config.project, config=config.model_dump())
+
+    if config.train.torch_compile:
+        model = torch.compile(model) if not TYPE_CHECKING else model
+
+    perf_counter = PerfCounter(window_size=10)
+
+    while True:
+        loss_batch = 0
+
+        for grad_acc_step in range(gradient_accumulation_steps):
+            # is_accumulating = grad_acc_step < gradient_accumulation_steps - 1
+
+            batch = next(train_dataloader_iterator)
+            input_ids = batch["input_ids"].to("cuda")
+            labels = batch["labels"].to("cuda")
+            seqlens = [seqlen.to("cuda") for seqlen in batch["seqlens"]]
+            block_mask = create_block_mask_from_seqlens(seqlens) if seqlens is not None else None
+
+            logits = model(tokens=input_ids, block_mask=block_mask).contiguous()
+            flatten_logits = logits.reshape(-1, logits.size(-1))  # b seq vocab -> (b * seq) vocab
+            flatten_labels = labels.reshape(-1)  # b seq -> (b * seq)
+
+            ce_loss = F.cross_entropy(flatten_logits, flatten_labels)
+
+            del logits
+            del flatten_logits
+            del flatten_labels
+
+            loss = ce_loss / gradient_accumulation_steps
+            loss.backward()
+            loss_batch += loss.detach().clone()
+
+        # all-reduce the loss, then launch one async all-reduce per gradient tensor
+        # so the per-parameter transfers overlap each other
+        dist.all_reduce(tensor=loss_batch, op=dist.ReduceOp.AVG)
+
+        jobs = []
+        for param in model.parameters():
+            jobs.append(dist.all_reduce(tensor=param.grad, op=dist.ReduceOp.AVG, async_op=True))
+
+        for job in jobs:
+            job.wait()
+
+        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # type: ignore
+
+        for optimizer, scheduler in zip(optimizers, schedulers):
+            optimizer.step()
+            scheduler.step()
+
+            optimizer.zero_grad()
+
+        # logging
+        training_progress.step += 1
+        inner_lr = [group["lr"] for group in optimizers[0].param_groups][0]
+
+        # count the tokens processed this step across all data-parallel ranks
+        new_tokens = config.data.seq_length * config.optim.batch_size
+        perf_counter.count_tokens(new_tokens)
+        training_progress.total_tokens += new_tokens
+
+        metrics = {
+            "Loss": loss_batch.item(),
+            "step": training_progress.step,
+            "inner_lr": inner_lr,
+            "Perplexity": torch.exp(loss_batch).item(),
+            "total_tokens": training_progress.total_tokens,
+            "time": time.time(),
+            "grad_norm": grad_norm.item(),
+        }
+
+        log = f"step: {training_progress.step}, loss: {loss_batch.item():.4f}"
+
+        tokens_per_second = perf_counter.get_tokens_per_second()
+        if tokens_per_second is not None:
+            metrics["tokens_per_second"] = tokens_per_second
+            metrics["mfu"] = 100 * num_flop_per_token * tokens_per_second / gpu_peak_flops / world_info.local_world_size
+            log += f", tokens_per_second: {tokens_per_second:.2f}, mfu: {metrics['mfu']:.2f}"
+
+        if world_info.rank == 0 and config.wandb:
+            wandb.log(metrics)
+
+        logger.info(log)
+
+        if training_progress.step > config.optim.total_steps:
+            break
+
+    logger.info("Training finished, exiting ...")
+
+
+if __name__ == "__main__":
+    # Allow eager fallback in production so that the training runs don't die
+    # However, in development, we want to know that we broke torch compile
+    torch._dynamo.config.suppress_errors = "ZERO_BAND_DEV" not in os.environ  # type: ignore
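+    # (Illustrative: launching with ZERO_BAND_DEV=1 set in the environment makes
+    # compile errors fatal instead of silently falling back to eager mode.)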
+    torch.set_float32_matmul_precision("high")
+    torch.manual_seed(42)
+
+    config = Config(**parse_argv())  # type: ignore
+    world_info = get_world_info()
+    logger = get_logger()
+
+    torch.cuda.set_device(world_info.local_rank)
+    dist.init_process_group(backend="nccl")
+
+    train(config)
diff --git a/train_fsdp.py b/train_fsdp.py
new file mode 100644
index 00000000..15dc07e7
--- /dev/null
+++ b/train_fsdp.py
@@ -0,0 +1,285 @@
+from dataclasses import dataclass
+import os
+import time
+from typing import TYPE_CHECKING, Literal
+
+import torch
+import torch.distributed as dist
+from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy  # type: ignore
+import wandb
+
+from zeroband.data import TEST_VOCAB_SIZE, DataConfig, get_dataloader
+from zeroband.lr_scheduler import get_scheduler
+from zeroband.models.llama import get_model
+from zeroband.models.llama.model import create_block_mask_from_seqlens
+from zeroband.utils import (
+    FakeTokenizer,
+    PerfCounter,
+    get_peak_flops,
+    get_num_params,
+    get_num_flop_per_token,
+    apply_ac_ckpt,
+)
+from zeroband.logger import get_logger
+
+from transformers import AutoTokenizer
+from pydantic_config import BaseConfig, parse_argv
+import torch.nn.functional as F
+
+from zeroband.world_info import get_world_info
+
+
+class AdamConfig(BaseConfig):
+    type: Literal["adam"] = "adam"
+    lr: float = 4e-4
+    weight_decay: float = 0.1
+    betas1: float = 0.9
+    betas2: float = 0.95
+
+
+class OptimConfig(BaseConfig):
+    optim: AdamConfig = AdamConfig()
+    sched_type: Literal["cosine", "linear", "wsd-sqrt"] = "cosine"
+    warmup_steps: int = 1000
+    stable_steps: int = 80_000
+    total_steps: int = 88_000
+    batch_size: int = 512
+
+
+class TrainConfig(BaseConfig):
+    micro_bs: int = 1
+    ac_ckpt: bool | int = False
+    reshard_after_forward: bool = True  # True means full shard (ZeRO-3); False keeps params gathered after forward, like the old SHARD_GRAD_OP (ZeRO-2)
+    torch_compile: bool = True
+
+
+class Config(BaseConfig):
+    name_model: Literal["debugmodel", "70M", "150M", "271M", "1B", "7B", "10B", "13B", "26B", "70B"] = "150M"
+    type_model: Literal["llama2", "llama3"] = "llama3"
+
+    project: str = "prime_simple"
+    wandb: bool = True
+
+    data: DataConfig = DataConfig()
+    optim: OptimConfig = OptimConfig()
+    train: TrainConfig
+
+
+@dataclass
+class TrainingProgress:
+    total_tokens: int
+    outer_step: int
+    step: int
+
+
+def train(config: Config):
+    # batch_size is the total batch size for all GPUs
+    assert config.optim.batch_size % world_info.local_world_size == 0
+    batch_size = config.optim.batch_size // world_info.local_world_size
+
+    assert batch_size % config.train.micro_bs == 0, (
+        f"The micro batch size ({config.train.micro_bs}) must divide the number of samples on each GPU ({batch_size})."
+ ) + gradient_accumulation_steps = batch_size // config.train.micro_bs + + # Load tokenizer + if config.data.fake and config.name_model == "debugmodel": + tokenizer = FakeTokenizer() + elif config.type_model == "llama2": + tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=True) + elif config.type_model == "llama3": + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", use_fast=True) + else: + raise ValueError(f"Model type {config.type_model} not supported") + + train_dataloader = get_dataloader( + tokenizer=tokenizer, + world_size=world_info.world_size, + rank=world_info.rank, + batch_size=config.train.micro_bs, + data_config=config.data, + ) + train_dataloader_iterator = iter(train_dataloader) + + model, model_config = get_model( + type_model=config.type_model, + name_model=config.name_model, + seq_length=config.data.seq_length, + vocab_size=len(tokenizer) if config.name_model != "debugmodel" or not config.data.fake else TEST_VOCAB_SIZE, + ) + + gpu_peak_flops = get_peak_flops(torch.cuda.get_device_name(torch.device("cuda"))) + logger.info(f"Peak FLOPS used for computing MFU: {gpu_peak_flops:.3e}") + + num_params = get_num_params(model, exclude_embedding=True) + logger.info(f"Number of parameters: {num_params}") + num_flop_per_token = get_num_flop_per_token( + num_params, + model_config, + config.data.seq_length, + ) + + if config.train.ac_ckpt: + num = 1 if isinstance(config.train.ac_ckpt, bool) else config.train.ac_ckpt + apply_ac_ckpt(model, num) + + mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=None) + + for layer_id, transformer_block in model.layers.items(): + if config.train.reshard_after_forward: + reshard_after_forward = int(layer_id) < len(model.layers) - 1 + else: + reshard_after_forward = False + fully_shard(transformer_block, mp_policy=mp_policy, reshard_after_forward=reshard_after_forward) + fully_shard(model, mp_policy=mp_policy, reshard_after_forward=config.train.reshard_after_forward) + + hidden_matrix_params = [p for n, p in model.layers.named_parameters() if p.ndim >= 2 and "embed" not in n] + embed_params = [p for n, p in model.named_parameters() if "embed" in n] + scalar_params = [p for p in model.parameters() if p.ndim < 2] + head_params = [model.output.weight] + + # init the optimizer(s) + adam_params = [ + dict(params=head_params, lr=0.008), + dict(params=embed_params, lr=0.6), + dict(params=scalar_params, lr=0.04), + ] + optimizer1 = torch.optim.Adam(adam_params, betas=(0.8, 0.95), eps=1e-10, fused=True) + optimizer2 = torch.optim.Adam( + hidden_matrix_params, lr=config.optim.optim.lr, betas=(0.8, 0.95), eps=1e-10, fused=True + ) + # optimizer2 = Muon( + # hidden_matrix_params, + # lr=config.optim.optim.lr, + # momentum=config.optim.optim.momentum, + # nesterov=config.optim.optim.nesterov, + # ns_steps=config.optim.optim.ns_steps, + # rank=world_info.rank, + # world_size=world_info.world_size, + # ) + + optimizers = [optimizer2, optimizer1] + + schedulers = [ + get_scheduler( + sched_type=config.optim.sched_type, + optimizer=optimizer, + num_warmup_steps=config.optim.warmup_steps, + num_stable_steps=config.optim.stable_steps, + num_training_steps=config.optim.total_steps, + ) + for optimizer in optimizers + ] + + schedulers = [ + get_scheduler( + sched_type=config.optim.sched_type, + optimizer=optimizer, + num_warmup_steps=config.optim.warmup_steps, + num_stable_steps=config.optim.stable_steps, + num_training_steps=config.optim.total_steps, + ) + for optimizer in optimizers + ] + + 
+    training_progress = TrainingProgress(total_tokens=0, outer_step=0, step=0)
+
+    if world_info.rank == 0 and config.wandb:
+        wandb.init(project=config.project, config=config.model_dump())
+
+    if config.train.torch_compile:
+        model = torch.compile(model) if not TYPE_CHECKING else model
+
+    perf_counter = PerfCounter(window_size=10)
+
+    while True:
+        loss_batch = 0
+
+        for grad_acc_step in range(gradient_accumulation_steps):
+            is_accumulating = grad_acc_step < gradient_accumulation_steps - 1
+            # no sync if we are accumulating gradients
+            model.set_requires_gradient_sync(not is_accumulating)
+
+            batch = next(train_dataloader_iterator)
+            input_ids = batch["input_ids"].to("cuda")
+            labels = batch["labels"].to("cuda")
+            seqlens = [seqlen.to("cuda") for seqlen in batch["seqlens"]]
+            block_mask = create_block_mask_from_seqlens(seqlens) if seqlens is not None else None
+
+            logits = model(tokens=input_ids, block_mask=block_mask).contiguous()
+            flatten_logits = logits.reshape(-1, logits.size(-1))  # b seq vocab -> (b * seq) vocab
+            flatten_labels = labels.reshape(-1)  # b seq -> (b * seq)
+
+            ce_loss = F.cross_entropy(flatten_logits, flatten_labels)
+
+            del logits
+            del flatten_logits
+            del flatten_labels
+
+            loss = ce_loss / gradient_accumulation_steps
+            loss.backward()
+            loss_batch += loss.detach().clone()
+
+        # average the accumulated loss across ranks (FSDP already reduce-scattered the gradients during backward)
+        dist.all_reduce(tensor=loss_batch, op=dist.ReduceOp.AVG)
+
+        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0).full_tensor()  # type: ignore (is a dtensor)
+
+        for optimizer, scheduler in zip(optimizers, schedulers):
+            optimizer.step()
+            scheduler.step()
+
+            optimizer.zero_grad()
+
+        # logging
+        training_progress.step += 1
+        inner_lr = [group["lr"] for group in optimizers[0].param_groups][0]
+
+        # count the tokens processed this step across all data-parallel ranks
+        new_tokens = config.data.seq_length * config.optim.batch_size
+        perf_counter.count_tokens(new_tokens)
+        training_progress.total_tokens += new_tokens
+
+        metrics = {
+            "Loss": loss_batch.item(),
+            "step": training_progress.step,
+            "inner_lr": inner_lr,
+            "Perplexity": torch.exp(loss_batch).item(),
+            "total_tokens": training_progress.total_tokens,
+            "time": time.time(),
+            "grad_norm": grad_norm.item(),
+        }
+
+        log = f"step: {training_progress.step}, loss: {loss_batch.item():.4f}"
+
+        tokens_per_second = perf_counter.get_tokens_per_second()
+        if tokens_per_second is not None:
+            metrics["tokens_per_second"] = tokens_per_second
+            metrics["mfu"] = 100 * num_flop_per_token * tokens_per_second / gpu_peak_flops / world_info.local_world_size
+            log += f", tokens_per_second: {tokens_per_second:.2f}, mfu: {metrics['mfu']:.2f}"
+
+        if world_info.rank == 0 and config.wandb:
+            wandb.log(metrics)
+
+        logger.info(log)
+
+        if training_progress.step > config.optim.total_steps:
+            break
+
+    logger.info("Training finished, exiting ...")
+
+
+if __name__ == "__main__":
+    # Allow eager fallback in production so that the training runs don't die
+    # However, in development, we want to know that we broke torch compile
+    torch._dynamo.config.suppress_errors = "ZERO_BAND_DEV" not in os.environ  # type: ignore
+    torch.set_float32_matmul_precision("high")
+    torch.manual_seed(42)
+
+    config = Config(**parse_argv())  # type: ignore
+    world_info = get_world_info()
+    logger = get_logger()
+
+    torch.cuda.set_device(world_info.local_rank)
+    dist.init_process_group(backend="nccl")
+
+    train(config)
diff --git a/uv.lock b/uv.lock
index e5d6fdcf..fadca86c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,55 +1,26 @@
 version = 1
 requires-python =
">=3.10" resolution-markers = [ - "python_full_version >= '3.13' and sys_platform == 'linux'", - "python_full_version == '3.12.*' and sys_platform == 'linux'", - "python_full_version >= '3.13' and sys_platform != 'linux'", - "python_full_version == '3.12.*' and sys_platform != 'linux'", + "python_full_version >= '3.12' and sys_platform == 'linux'", + "python_full_version >= '3.12' and sys_platform != 'linux'", "python_full_version == '3.11.*' and sys_platform == 'linux'", "python_full_version < '3.11' and sys_platform == 'linux'", "python_full_version == '3.11.*' and sys_platform != 'linux'", "python_full_version < '3.11' and sys_platform != 'linux'", ] -[[package]] -name = "absl-py" -version = "2.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7a/8f/fc001b92ecc467cc32ab38398bd0bfb45df46e7523bf33c2ad22a505f06e/absl-py-2.1.0.tar.gz", hash = "sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff", size = 118055 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a2/ad/e0d3c824784ff121c03cc031f944bc7e139a8f1870ffd2845cc2dd76f6c4/absl_py-2.1.0-py3-none-any.whl", hash = "sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308", size = 133706 }, -] - -[[package]] -name = "accelerate" -version = "1.3.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "huggingface-hub" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "psutil" }, - { name = "pyyaml" }, - { name = "safetensors" }, - { name = "torch" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/85/15/0fab0260ab4069e5224e637d2e400538bb27b0dfc36f17daf68db9770d78/accelerate-1.3.0.tar.gz", hash = "sha256:518631c0adb80bd3d42fb29e7e2dc2256bcd7c786b0ba9119bbaa08611b36d9c", size = 342758 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/73/de/64508cb91af013aaba214752309c0967568a4219d50a4ea30e822af3c976/accelerate-1.3.0-py3-none-any.whl", hash = "sha256:5788d9e6a7a9f80fed665cf09681c4dddd9dc056bea656db4140ffc285ce423e", size = 336647 }, -] - [[package]] name = "aiohappyeyeballs" -version = "2.4.4" +version = "2.4.6" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7f/55/e4373e888fdacb15563ef6fa9fa8c8252476ea071e96fb46defac9f18bf2/aiohappyeyeballs-2.4.4.tar.gz", hash = "sha256:5fdd7d87889c63183afc18ce9271f9b0a7d32c2303e394468dd45d514a757745", size = 21977 } +sdist = { url = "https://files.pythonhosted.org/packages/08/07/508f9ebba367fc3370162e53a3cfd12f5652ad79f0e0bfdf9f9847c6f159/aiohappyeyeballs-2.4.6.tar.gz", hash = "sha256:9b05052f9042985d32ecbe4b59a77ae19c006a78f1344d7fdad69d28ded3d0b0", size = 21726 } wheels = [ - { url = "https://files.pythonhosted.org/packages/b9/74/fbb6559de3607b3300b9be3cc64e97548d55678e44623db17820dbd20002/aiohappyeyeballs-2.4.4-py3-none-any.whl", hash = "sha256:a980909d50efcd44795c4afeca523296716d50cd756ddca6af8c65b996e27de8", size = 14756 }, + { url = "https://files.pythonhosted.org/packages/44/4c/03fb05f56551828ec67ceb3665e5dc51638042d204983a03b0a1541475b6/aiohappyeyeballs-2.4.6-py3-none-any.whl", hash = "sha256:147ec992cf873d74f5062644332c539fcd42956dc69453fe5204195e560517e1", size = 14543 }, ] [[package]] name = "aiohttp" -version = "3.11.11" +version = "3.11.12" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohappyeyeballs" }, @@ -61,68 +32,72 @@ dependencies = [ { name = "propcache" }, { name = "yarl" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/fe/ed/f26db39d29cd3cb2f5a3374304c713fe5ab5a0e4c8ee25a0c45cc6adf844/aiohttp-3.11.11.tar.gz", hash = "sha256:bb49c7f1e6ebf3821a42d81d494f538107610c3a705987f53068546b0e90303e", size = 7669618 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/75/7d/ff2e314b8f9e0b1df833e2d4778eaf23eae6b8cc8f922495d110ddcbf9e1/aiohttp-3.11.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a60804bff28662cbcf340a4d61598891f12eea3a66af48ecfdc975ceec21e3c8", size = 708550 }, - { url = "https://files.pythonhosted.org/packages/09/b8/aeb4975d5bba233d6f246941f5957a5ad4e3def8b0855a72742e391925f2/aiohttp-3.11.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b4fa1cb5f270fb3eab079536b764ad740bb749ce69a94d4ec30ceee1b5940d5", size = 468430 }, - { url = "https://files.pythonhosted.org/packages/9c/5b/5b620279b3df46e597008b09fa1e10027a39467387c2332657288e25811a/aiohttp-3.11.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:731468f555656767cda219ab42e033355fe48c85fbe3ba83a349631541715ba2", size = 455593 }, - { url = "https://files.pythonhosted.org/packages/d8/75/0cdf014b816867d86c0bc26f3d3e3f194198dbf33037890beed629cd4f8f/aiohttp-3.11.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb23d8bb86282b342481cad4370ea0853a39e4a32a0042bb52ca6bdde132df43", size = 1584635 }, - { url = "https://files.pythonhosted.org/packages/df/2f/95b8f4e4dfeb57c1d9ad9fa911ede35a0249d75aa339edd2c2270dc539da/aiohttp-3.11.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f047569d655f81cb70ea5be942ee5d4421b6219c3f05d131f64088c73bb0917f", size = 1632363 }, - { url = "https://files.pythonhosted.org/packages/39/cb/70cf69ea7c50f5b0021a84f4c59c3622b2b3b81695f48a2f0e42ef7eba6e/aiohttp-3.11.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd7659baae9ccf94ae5fe8bfaa2c7bc2e94d24611528395ce88d009107e00c6d", size = 1668315 }, - { url = "https://files.pythonhosted.org/packages/2f/cc/3a3fc7a290eabc59839a7e15289cd48f33dd9337d06e301064e1e7fb26c5/aiohttp-3.11.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af01e42ad87ae24932138f154105e88da13ce7d202a6de93fafdafb2883a00ef", size = 1589546 }, - { url = "https://files.pythonhosted.org/packages/15/b4/0f7b0ed41ac6000e283e7332f0f608d734b675a8509763ca78e93714cfb0/aiohttp-3.11.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5854be2f3e5a729800bac57a8d76af464e160f19676ab6aea74bde18ad19d438", size = 1544581 }, - { url = "https://files.pythonhosted.org/packages/58/b9/4d06470fd85c687b6b0e31935ef73dde6e31767c9576d617309a2206556f/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6526e5fb4e14f4bbf30411216780c9967c20c5a55f2f51d3abd6de68320cc2f3", size = 1529256 }, - { url = "https://files.pythonhosted.org/packages/61/a2/6958b1b880fc017fd35f5dfb2c26a9a50c755b75fd9ae001dc2236a4fb79/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:85992ee30a31835fc482468637b3e5bd085fa8fe9392ba0bdcbdc1ef5e9e3c55", size = 1536592 }, - { url = "https://files.pythonhosted.org/packages/0f/dd/b974012a9551fd654f5bb95a6dd3f03d6e6472a17e1a8216dd42e9638d6c/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:88a12ad8ccf325a8a5ed80e6d7c3bdc247d66175afedbe104ee2aaca72960d8e", size = 1607446 }, - { url = 
"https://files.pythonhosted.org/packages/e0/d3/6c98fd87e638e51f074a3f2061e81fcb92123bcaf1439ac1b4a896446e40/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:0a6d3fbf2232e3a08c41eca81ae4f1dff3d8f1a30bae415ebe0af2d2458b8a33", size = 1628809 }, - { url = "https://files.pythonhosted.org/packages/a8/2e/86e6f85cbca02be042c268c3d93e7f35977a0e127de56e319bdd1569eaa8/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:84a585799c58b795573c7fa9b84c455adf3e1d72f19a2bf498b54a95ae0d194c", size = 1564291 }, - { url = "https://files.pythonhosted.org/packages/0b/8d/1f4ef3503b767717f65e1f5178b0173ab03cba1a19997ebf7b052161189f/aiohttp-3.11.11-cp310-cp310-win32.whl", hash = "sha256:bfde76a8f430cf5c5584553adf9926534352251d379dcb266ad2b93c54a29745", size = 416601 }, - { url = "https://files.pythonhosted.org/packages/ad/86/81cb83691b5ace3d9aa148dc42bacc3450d749fc88c5ec1973573c1c1779/aiohttp-3.11.11-cp310-cp310-win_amd64.whl", hash = "sha256:0fd82b8e9c383af11d2b26f27a478640b6b83d669440c0a71481f7c865a51da9", size = 442007 }, - { url = "https://files.pythonhosted.org/packages/34/ae/e8806a9f054e15f1d18b04db75c23ec38ec954a10c0a68d3bd275d7e8be3/aiohttp-3.11.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ba74ec819177af1ef7f59063c6d35a214a8fde6f987f7661f4f0eecc468a8f76", size = 708624 }, - { url = "https://files.pythonhosted.org/packages/c7/e0/313ef1a333fb4d58d0c55a6acb3cd772f5d7756604b455181049e222c020/aiohttp-3.11.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4af57160800b7a815f3fe0eba9b46bf28aafc195555f1824555fa2cfab6c1538", size = 468507 }, - { url = "https://files.pythonhosted.org/packages/a9/60/03455476bf1f467e5b4a32a465c450548b2ce724eec39d69f737191f936a/aiohttp-3.11.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ffa336210cf9cd8ed117011085817d00abe4c08f99968deef0013ea283547204", size = 455571 }, - { url = "https://files.pythonhosted.org/packages/be/f9/469588603bd75bf02c8ffb8c8a0d4b217eed446b49d4a767684685aa33fd/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81b8fe282183e4a3c7a1b72f5ade1094ed1c6345a8f153506d114af5bf8accd9", size = 1685694 }, - { url = "https://files.pythonhosted.org/packages/88/b9/1b7fa43faf6c8616fa94c568dc1309ffee2b6b68b04ac268e5d64b738688/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3af41686ccec6a0f2bdc66686dc0f403c41ac2089f80e2214a0f82d001052c03", size = 1743660 }, - { url = "https://files.pythonhosted.org/packages/2a/8b/0248d19dbb16b67222e75f6aecedd014656225733157e5afaf6a6a07e2e8/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70d1f9dde0e5dd9e292a6d4d00058737052b01f3532f69c0c65818dac26dc287", size = 1785421 }, - { url = "https://files.pythonhosted.org/packages/c4/11/f478e071815a46ca0a5ae974651ff0c7a35898c55063305a896e58aa1247/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:249cc6912405917344192b9f9ea5cd5b139d49e0d2f5c7f70bdfaf6b4dbf3a2e", size = 1675145 }, - { url = "https://files.pythonhosted.org/packages/26/5d/284d182fecbb5075ae10153ff7374f57314c93a8681666600e3a9e09c505/aiohttp-3.11.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0eb98d90b6690827dcc84c246811feeb4e1eea683c0eac6caed7549be9c84665", size = 1619804 }, - { url = "https://files.pythonhosted.org/packages/1b/78/980064c2ad685c64ce0e8aeeb7ef1e53f43c5b005edcd7d32e60809c4992/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:ec82bf1fda6cecce7f7b915f9196601a1bd1a3079796b76d16ae4cce6d0ef89b", size = 1654007 }, - { url = "https://files.pythonhosted.org/packages/21/8d/9e658d63b1438ad42b96f94da227f2e2c1d5c6001c9e8ffcc0bfb22e9105/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9fd46ce0845cfe28f108888b3ab17abff84ff695e01e73657eec3f96d72eef34", size = 1650022 }, - { url = "https://files.pythonhosted.org/packages/85/fd/a032bf7f2755c2df4f87f9effa34ccc1ef5cea465377dbaeef93bb56bbd6/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:bd176afcf8f5d2aed50c3647d4925d0db0579d96f75a31e77cbaf67d8a87742d", size = 1732899 }, - { url = "https://files.pythonhosted.org/packages/c5/0c/c2b85fde167dd440c7ba50af2aac20b5a5666392b174df54c00f888c5a75/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:ec2aa89305006fba9ffb98970db6c8221541be7bee4c1d027421d6f6df7d1ce2", size = 1755142 }, - { url = "https://files.pythonhosted.org/packages/bc/78/91ae1a3b3b3bed8b893c5d69c07023e151b1c95d79544ad04cf68f596c2f/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:92cde43018a2e17d48bb09c79e4d4cb0e236de5063ce897a5e40ac7cb4878773", size = 1692736 }, - { url = "https://files.pythonhosted.org/packages/77/89/a7ef9c4b4cdb546fcc650ca7f7395aaffbd267f0e1f648a436bec33c9b95/aiohttp-3.11.11-cp311-cp311-win32.whl", hash = "sha256:aba807f9569455cba566882c8938f1a549f205ee43c27b126e5450dc9f83cc62", size = 416418 }, - { url = "https://files.pythonhosted.org/packages/fc/db/2192489a8a51b52e06627506f8ac8df69ee221de88ab9bdea77aa793aa6a/aiohttp-3.11.11-cp311-cp311-win_amd64.whl", hash = "sha256:ae545f31489548c87b0cced5755cfe5a5308d00407000e72c4fa30b19c3220ac", size = 442509 }, - { url = "https://files.pythonhosted.org/packages/69/cf/4bda538c502f9738d6b95ada11603c05ec260807246e15e869fc3ec5de97/aiohttp-3.11.11-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e595c591a48bbc295ebf47cb91aebf9bd32f3ff76749ecf282ea7f9f6bb73886", size = 704666 }, - { url = "https://files.pythonhosted.org/packages/46/7b/87fcef2cad2fad420ca77bef981e815df6904047d0a1bd6aeded1b0d1d66/aiohttp-3.11.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3ea1b59dc06396b0b424740a10a0a63974c725b1c64736ff788a3689d36c02d2", size = 464057 }, - { url = "https://files.pythonhosted.org/packages/5a/a6/789e1f17a1b6f4a38939fbc39d29e1d960d5f89f73d0629a939410171bc0/aiohttp-3.11.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8811f3f098a78ffa16e0ea36dffd577eb031aea797cbdba81be039a4169e242c", size = 455996 }, - { url = "https://files.pythonhosted.org/packages/b7/dd/485061fbfef33165ce7320db36e530cd7116ee1098e9c3774d15a732b3fd/aiohttp-3.11.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7227b87a355ce1f4bf83bfae4399b1f5bb42e0259cb9405824bd03d2f4336a", size = 1682367 }, - { url = "https://files.pythonhosted.org/packages/e9/d7/9ec5b3ea9ae215c311d88b2093e8da17e67b8856673e4166c994e117ee3e/aiohttp-3.11.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d40f9da8cabbf295d3a9dae1295c69975b86d941bc20f0a087f0477fa0a66231", size = 1736989 }, - { url = "https://files.pythonhosted.org/packages/d6/fb/ea94927f7bfe1d86178c9d3e0a8c54f651a0a655214cce930b3c679b8f64/aiohttp-3.11.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ffb3dc385f6bb1568aa974fe65da84723210e5d9707e360e9ecb51f59406cd2e", size = 1793265 }, - { url = 
"https://files.pythonhosted.org/packages/40/7f/6de218084f9b653026bd7063cd8045123a7ba90c25176465f266976d8c82/aiohttp-3.11.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8f5f7515f3552d899c61202d99dcb17d6e3b0de777900405611cd747cecd1b8", size = 1691841 }, - { url = "https://files.pythonhosted.org/packages/77/e2/992f43d87831cbddb6b09c57ab55499332f60ad6fdbf438ff4419c2925fc/aiohttp-3.11.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3499c7ffbfd9c6a3d8d6a2b01c26639da7e43d47c7b4f788016226b1e711caa8", size = 1619317 }, - { url = "https://files.pythonhosted.org/packages/96/74/879b23cdd816db4133325a201287c95bef4ce669acde37f8f1b8669e1755/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8e2bf8029dbf0810c7bfbc3e594b51c4cc9101fbffb583a3923aea184724203c", size = 1641416 }, - { url = "https://files.pythonhosted.org/packages/30/98/b123f6b15d87c54e58fd7ae3558ff594f898d7f30a90899718f3215ad328/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b6212a60e5c482ef90f2d788835387070a88d52cf6241d3916733c9176d39eab", size = 1646514 }, - { url = "https://files.pythonhosted.org/packages/d7/38/257fda3dc99d6978ab943141d5165ec74fd4b4164baa15e9c66fa21da86b/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:d119fafe7b634dbfa25a8c597718e69a930e4847f0b88e172744be24515140da", size = 1702095 }, - { url = "https://files.pythonhosted.org/packages/0c/f4/ddab089053f9fb96654df5505c0a69bde093214b3c3454f6bfdb1845f558/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:6fba278063559acc730abf49845d0e9a9e1ba74f85f0ee6efd5803f08b285853", size = 1734611 }, - { url = "https://files.pythonhosted.org/packages/c3/d6/f30b2bc520c38c8aa4657ed953186e535ae84abe55c08d0f70acd72ff577/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92fc484e34b733704ad77210c7957679c5c3877bd1e6b6d74b185e9320cc716e", size = 1694576 }, - { url = "https://files.pythonhosted.org/packages/bc/97/b0a88c3f4c6d0020b34045ee6d954058abc870814f6e310c4c9b74254116/aiohttp-3.11.11-cp312-cp312-win32.whl", hash = "sha256:9f5b3c1ed63c8fa937a920b6c1bec78b74ee09593b3f5b979ab2ae5ef60d7600", size = 411363 }, - { url = "https://files.pythonhosted.org/packages/7f/23/cc36d9c398980acaeeb443100f0216f50a7cfe20c67a9fd0a2f1a5a846de/aiohttp-3.11.11-cp312-cp312-win_amd64.whl", hash = "sha256:1e69966ea6ef0c14ee53ef7a3d68b564cc408121ea56c0caa2dc918c1b2f553d", size = 437666 }, - { url = "https://files.pythonhosted.org/packages/49/d1/d8af164f400bad432b63e1ac857d74a09311a8334b0481f2f64b158b50eb/aiohttp-3.11.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:541d823548ab69d13d23730a06f97460f4238ad2e5ed966aaf850d7c369782d9", size = 697982 }, - { url = "https://files.pythonhosted.org/packages/92/d1/faad3bf9fa4bfd26b95c69fc2e98937d52b1ff44f7e28131855a98d23a17/aiohttp-3.11.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:929f3ed33743a49ab127c58c3e0a827de0664bfcda566108989a14068f820194", size = 460662 }, - { url = "https://files.pythonhosted.org/packages/db/61/0d71cc66d63909dabc4590f74eba71f91873a77ea52424401c2498d47536/aiohttp-3.11.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0882c2820fd0132240edbb4a51eb8ceb6eef8181db9ad5291ab3332e0d71df5f", size = 452950 }, - { url = "https://files.pythonhosted.org/packages/07/db/6d04bc7fd92784900704e16b745484ef45b77bd04e25f58f6febaadf7983/aiohttp-3.11.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:b63de12e44935d5aca7ed7ed98a255a11e5cb47f83a9fded7a5e41c40277d104", size = 1665178 }, - { url = "https://files.pythonhosted.org/packages/54/5c/e95ade9ae29f375411884d9fd98e50535bf9fe316c9feb0f30cd2ac8f508/aiohttp-3.11.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa54f8ef31d23c506910c21163f22b124facb573bff73930735cf9fe38bf7dff", size = 1717939 }, - { url = "https://files.pythonhosted.org/packages/6f/1c/1e7d5c5daea9e409ed70f7986001b8c9e3a49a50b28404498d30860edab6/aiohttp-3.11.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a344d5dc18074e3872777b62f5f7d584ae4344cd6006c17ba12103759d407af3", size = 1775125 }, - { url = "https://files.pythonhosted.org/packages/5d/66/890987e44f7d2f33a130e37e01a164168e6aff06fce15217b6eaf14df4f6/aiohttp-3.11.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7fb429ab1aafa1f48578eb315ca45bd46e9c37de11fe45c7f5f4138091e2f1", size = 1677176 }, - { url = "https://files.pythonhosted.org/packages/8f/dc/e2ba57d7a52df6cdf1072fd5fa9c6301a68e1cd67415f189805d3eeb031d/aiohttp-3.11.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c341c7d868750e31961d6d8e60ff040fb9d3d3a46d77fd85e1ab8e76c3e9a5c4", size = 1603192 }, - { url = "https://files.pythonhosted.org/packages/6c/9e/8d08a57de79ca3a358da449405555e668f2c8871a7777ecd2f0e3912c272/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ed9ee95614a71e87f1a70bc81603f6c6760128b140bc4030abe6abaa988f1c3d", size = 1618296 }, - { url = "https://files.pythonhosted.org/packages/56/51/89822e3ec72db352c32e7fc1c690370e24e231837d9abd056490f3a49886/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:de8d38f1c2810fa2a4f1d995a2e9c70bb8737b18da04ac2afbf3971f65781d87", size = 1616524 }, - { url = "https://files.pythonhosted.org/packages/2c/fa/e2e6d9398f462ffaa095e84717c1732916a57f1814502929ed67dd7568ef/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:a9b7371665d4f00deb8f32208c7c5e652059b0fda41cf6dbcac6114a041f1cc2", size = 1685471 }, - { url = "https://files.pythonhosted.org/packages/ae/5f/6bb976e619ca28a052e2c0ca7b0251ccd893f93d7c24a96abea38e332bf6/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:620598717fce1b3bd14dd09947ea53e1ad510317c85dda2c9c65b622edc96b12", size = 1715312 }, - { url = "https://files.pythonhosted.org/packages/79/c1/756a7e65aa087c7fac724d6c4c038f2faaa2a42fe56dbc1dd62a33ca7213/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:bf8d9bfee991d8acc72d060d53860f356e07a50f0e0d09a8dfedea1c554dd0d5", size = 1672783 }, - { url = "https://files.pythonhosted.org/packages/73/ba/a6190ebb02176c7f75e6308da31f5d49f6477b651a3dcfaaaca865a298e2/aiohttp-3.11.11-cp313-cp313-win32.whl", hash = "sha256:9d73ee3725b7a737ad86c2eac5c57a4a97793d9f442599bea5ec67ac9f4bdc3d", size = 410229 }, - { url = "https://files.pythonhosted.org/packages/b8/62/c9fa5bafe03186a0e4699150a7fed9b1e73240996d0d2f0e5f70f3fdf471/aiohttp-3.11.11-cp313-cp313-win_amd64.whl", hash = "sha256:c7a06301c2fb096bdb0bd25fe2011531c1453b9f2c163c8031600ec73af1cc99", size = 436081 }, +sdist = { url = "https://files.pythonhosted.org/packages/37/4b/952d49c73084fb790cb5c6ead50848c8e96b4980ad806cf4d2ad341eaa03/aiohttp-3.11.12.tar.gz", hash = "sha256:7603ca26d75b1b86160ce1bbe2787a0b706e592af5b2504e12caa88a217767b0", size = 7673175 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/65/42/3880e133590820aa7bc6d068eb7d8e0ad9fdce9b4663f92b821d3f6b5601/aiohttp-3.11.12-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:aa8a8caca81c0a3e765f19c6953416c58e2f4cc1b84829af01dd1c771bb2f91f", size = 708721 }, + { url = "https://files.pythonhosted.org/packages/d8/8c/04869803bed108b25afad75f94c651b287851843caacbec6677d8f2d572b/aiohttp-3.11.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:84ede78acde96ca57f6cf8ccb8a13fbaf569f6011b9a52f870c662d4dc8cd854", size = 468596 }, + { url = "https://files.pythonhosted.org/packages/4f/f4/9074011f0d1335b161c953fb32545b6667cf24465e1932b9767874995c7e/aiohttp-3.11.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:584096938a001378484aa4ee54e05dc79c7b9dd933e271c744a97b3b6f644957", size = 455758 }, + { url = "https://files.pythonhosted.org/packages/fd/68/06298c57ef8f534065930b805e6dbd83613f0534447922782fb9920fce28/aiohttp-3.11.12-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:392432a2dde22b86f70dd4a0e9671a349446c93965f261dbaecfaf28813e5c42", size = 1584797 }, + { url = "https://files.pythonhosted.org/packages/bd/1e/cee6b51fcb3b1c4185a7dc62b3113bc136fae07f39386c88c90b7f79f199/aiohttp-3.11.12-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:88d385b8e7f3a870146bf5ea31786ef7463e99eb59e31db56e2315535d811f55", size = 1632535 }, + { url = "https://files.pythonhosted.org/packages/71/1f/42424462b7a09da362e1711090db9f8d68a37a33f0aab51307335517c599/aiohttp-3.11.12-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b10a47e5390c4b30a0d58ee12581003be52eedd506862ab7f97da7a66805befb", size = 1668484 }, + { url = "https://files.pythonhosted.org/packages/f6/79/0e25542bbe3c2bfd7a12c7a49c7bce73b09a836f65079e4b77bc2bafc89e/aiohttp-3.11.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b5263dcede17b6b0c41ef0c3ccce847d82a7da98709e75cf7efde3e9e3b5cae", size = 1589708 }, + { url = "https://files.pythonhosted.org/packages/d1/13/93ae26b75e23f7d3a613872e472fae836ca100dc5bde5936ebc93ada8890/aiohttp-3.11.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50c5c7b8aa5443304c55c262c5693b108c35a3b61ef961f1e782dd52a2f559c7", size = 1544752 }, + { url = "https://files.pythonhosted.org/packages/cf/5e/48847fad1b014ef92ef18ea1339a3b58eb81d3bc717b94c3627f5d2a42c5/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d1c031a7572f62f66f1257db37ddab4cb98bfaf9b9434a3b4840bf3560f5e788", size = 1529417 }, + { url = "https://files.pythonhosted.org/packages/ae/56/fbd4ea019303f4877f0e0b8c9de92e9db24338e7545570d3f275f3c74c53/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:7e44eba534381dd2687be50cbd5f2daded21575242ecfdaf86bbeecbc38dae8e", size = 1557808 }, + { url = "https://files.pythonhosted.org/packages/f1/43/112189cf6b3c482ecdd6819b420eaa0c2033426f28d741bb7f19db5dd2bb/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:145a73850926018ec1681e734cedcf2716d6a8697d90da11284043b745c286d5", size = 1536765 }, + { url = "https://files.pythonhosted.org/packages/30/12/59986547de8306e06c7b30e547ccda02d29636e152366caba2dd8627bfe1/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:2c311e2f63e42c1bf86361d11e2c4a59f25d9e7aabdbdf53dc38b885c5435cdb", size = 1607621 }, + { url = 
"https://files.pythonhosted.org/packages/aa/9b/af3b323b20df3318ed20d701d8242e523d59c842ca93f23134b05c9d5054/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ea756b5a7bac046d202a9a3889b9a92219f885481d78cd318db85b15cc0b7bcf", size = 1628977 }, + { url = "https://files.pythonhosted.org/packages/36/62/adf5a331a7bda475cc326dde393fa2bc5849060b1b37ac3d1bee1953f2cd/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:526c900397f3bbc2db9cb360ce9c35134c908961cdd0ac25b1ae6ffcaa2507ff", size = 1564455 }, + { url = "https://files.pythonhosted.org/packages/90/c4/4a24291f22f111a854dfdb54dc94d4e0a5229ccbb7bc7f0bed972aa50410/aiohttp-3.11.12-cp310-cp310-win32.whl", hash = "sha256:b8d3bb96c147b39c02d3db086899679f31958c5d81c494ef0fc9ef5bb1359b3d", size = 416768 }, + { url = "https://files.pythonhosted.org/packages/51/69/5221c8006acb7bb10d9e8e2238fb216571bddc2e00a8d95bcfbe2f579c57/aiohttp-3.11.12-cp310-cp310-win_amd64.whl", hash = "sha256:7fe3d65279bfbee8de0fb4f8c17fc4e893eed2dba21b2f680e930cc2b09075c5", size = 442170 }, + { url = "https://files.pythonhosted.org/packages/9c/38/35311e70196b6a63cfa033a7f741f800aa8a93f57442991cbe51da2394e7/aiohttp-3.11.12-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87a2e00bf17da098d90d4145375f1d985a81605267e7f9377ff94e55c5d769eb", size = 708797 }, + { url = "https://files.pythonhosted.org/packages/44/3e/46c656e68cbfc4f3fc7cb5d2ba4da6e91607fe83428208028156688f6201/aiohttp-3.11.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b34508f1cd928ce915ed09682d11307ba4b37d0708d1f28e5774c07a7674cac9", size = 468669 }, + { url = "https://files.pythonhosted.org/packages/a0/d6/2088fb4fd1e3ac2bfb24bc172223babaa7cdbb2784d33c75ec09e66f62f8/aiohttp-3.11.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:936d8a4f0f7081327014742cd51d320296b56aa6d324461a13724ab05f4b2933", size = 455739 }, + { url = "https://files.pythonhosted.org/packages/e7/dc/c443a6954a56f4a58b5efbfdf23cc6f3f0235e3424faf5a0c56264d5c7bb/aiohttp-3.11.12-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de1378f72def7dfb5dbd73d86c19eda0ea7b0a6873910cc37d57e80f10d64e1", size = 1685858 }, + { url = "https://files.pythonhosted.org/packages/25/67/2d5b3aaade1d5d01c3b109aa76e3aa9630531252cda10aa02fb99b0b11a1/aiohttp-3.11.12-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b9d45dbb3aaec05cf01525ee1a7ac72de46a8c425cb75c003acd29f76b1ffe94", size = 1743829 }, + { url = "https://files.pythonhosted.org/packages/90/9b/9728fe9a3e1b8521198455d027b0b4035522be18f504b24c5d38d59e7278/aiohttp-3.11.12-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:930ffa1925393381e1e0a9b82137fa7b34c92a019b521cf9f41263976666a0d6", size = 1785587 }, + { url = "https://files.pythonhosted.org/packages/ce/cf/28fbb43d4ebc1b4458374a3c7b6db3b556a90e358e9bbcfe6d9339c1e2b6/aiohttp-3.11.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8340def6737118f5429a5df4e88f440746b791f8f1c4ce4ad8a595f42c980bd5", size = 1675319 }, + { url = "https://files.pythonhosted.org/packages/e5/d2/006c459c11218cabaa7bca401f965c9cc828efbdea7e1615d4644eaf23f7/aiohttp-3.11.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4016e383f91f2814e48ed61e6bda7d24c4d7f2402c75dd28f7e1027ae44ea204", size = 1619982 }, + { url = "https://files.pythonhosted.org/packages/9d/83/ca425891ebd37bee5d837110f7fddc4d808a7c6c126a7d1b5c3ad72fc6ba/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:3c0600bcc1adfaaac321422d615939ef300df81e165f6522ad096b73439c0f58", size = 1654176 }, + { url = "https://files.pythonhosted.org/packages/25/df/047b1ce88514a1b4915d252513640184b63624e7914e41d846668b8edbda/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:0450ada317a65383b7cce9576096150fdb97396dcfe559109b403c7242faffef", size = 1660198 }, + { url = "https://files.pythonhosted.org/packages/d3/cc/6ecb8e343f0902528620b9dbd567028a936d5489bebd7dbb0dd0914f4fdb/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:850ff6155371fd802a280f8d369d4e15d69434651b844bde566ce97ee2277420", size = 1650186 }, + { url = "https://files.pythonhosted.org/packages/f8/f8/453df6dd69256ca8c06c53fc8803c9056e2b0b16509b070f9a3b4bdefd6c/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8fd12d0f989c6099e7b0f30dc6e0d1e05499f3337461f0b2b0dadea6c64b89df", size = 1733063 }, + { url = "https://files.pythonhosted.org/packages/55/f8/540160787ff3000391de0e5d0d1d33be4c7972f933c21991e2ea105b2d5e/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:76719dd521c20a58a6c256d058547b3a9595d1d885b830013366e27011ffe804", size = 1755306 }, + { url = "https://files.pythonhosted.org/packages/30/7d/49f3bfdfefd741576157f8f91caa9ff61a6f3d620ca6339268327518221b/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:97fe431f2ed646a3b56142fc81d238abcbaff08548d6912acb0b19a0cadc146b", size = 1692909 }, + { url = "https://files.pythonhosted.org/packages/40/9c/8ce00afd6f6112ce9a2309dc490fea376ae824708b94b7b5ea9cba979d1d/aiohttp-3.11.12-cp311-cp311-win32.whl", hash = "sha256:e10c440d142fa8b32cfdb194caf60ceeceb3e49807072e0dc3a8887ea80e8c16", size = 416584 }, + { url = "https://files.pythonhosted.org/packages/35/97/4d3c5f562f15830de472eb10a7a222655d750839943e0e6d915ef7e26114/aiohttp-3.11.12-cp311-cp311-win_amd64.whl", hash = "sha256:246067ba0cf5560cf42e775069c5d80a8989d14a7ded21af529a4e10e3e0f0e6", size = 442674 }, + { url = "https://files.pythonhosted.org/packages/4d/d0/94346961acb476569fca9a644cc6f9a02f97ef75961a6b8d2b35279b8d1f/aiohttp-3.11.12-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e392804a38353900c3fd8b7cacbea5132888f7129f8e241915e90b85f00e3250", size = 704837 }, + { url = "https://files.pythonhosted.org/packages/a9/af/05c503f1cc8f97621f199ef4b8db65fb88b8bc74a26ab2adb74789507ad3/aiohttp-3.11.12-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8fa1510b96c08aaad49303ab11f8803787c99222288f310a62f493faf883ede1", size = 464218 }, + { url = "https://files.pythonhosted.org/packages/f2/48/b9949eb645b9bd699153a2ec48751b985e352ab3fed9d98c8115de305508/aiohttp-3.11.12-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dc065a4285307607df3f3686363e7f8bdd0d8ab35f12226362a847731516e42c", size = 456166 }, + { url = "https://files.pythonhosted.org/packages/14/fb/980981807baecb6f54bdd38beb1bd271d9a3a786e19a978871584d026dcf/aiohttp-3.11.12-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddb31f8474695cd61fc9455c644fc1606c164b93bff2490390d90464b4655df", size = 1682528 }, + { url = "https://files.pythonhosted.org/packages/90/cb/77b1445e0a716914e6197b0698b7a3640590da6c692437920c586764d05b/aiohttp-3.11.12-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9dec0000d2d8621d8015c293e24589d46fa218637d820894cb7356c77eca3259", size = 1737154 }, + { url = 
"https://files.pythonhosted.org/packages/ff/24/d6fb1f4cede9ccbe98e4def6f3ed1e1efcb658871bbf29f4863ec646bf38/aiohttp-3.11.12-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e3552fe98e90fdf5918c04769f338a87fa4f00f3b28830ea9b78b1bdc6140e0d", size = 1793435 }, + { url = "https://files.pythonhosted.org/packages/17/e2/9f744cee0861af673dc271a3351f59ebd5415928e20080ab85be25641471/aiohttp-3.11.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dfe7f984f28a8ae94ff3a7953cd9678550dbd2a1f9bda5dd9c5ae627744c78e", size = 1692010 }, + { url = "https://files.pythonhosted.org/packages/90/c4/4a1235c1df544223eb57ba553ce03bc706bdd065e53918767f7fa1ff99e0/aiohttp-3.11.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a481a574af914b6e84624412666cbfbe531a05667ca197804ecc19c97b8ab1b0", size = 1619481 }, + { url = "https://files.pythonhosted.org/packages/60/70/cf12d402a94a33abda86dd136eb749b14c8eb9fec1e16adc310e25b20033/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1987770fb4887560363b0e1a9b75aa303e447433c41284d3af2840a2f226d6e0", size = 1641578 }, + { url = "https://files.pythonhosted.org/packages/1b/25/7211973fda1f5e833fcfd98ccb7f9ce4fbfc0074e3e70c0157a751d00db8/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:a4ac6a0f0f6402854adca4e3259a623f5c82ec3f0c049374133bcb243132baf9", size = 1684463 }, + { url = "https://files.pythonhosted.org/packages/93/60/b5905b4d0693f6018b26afa9f2221fefc0dcbd3773fe2dff1a20fb5727f1/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c96a43822f1f9f69cc5c3706af33239489a6294be486a0447fb71380070d4d5f", size = 1646691 }, + { url = "https://files.pythonhosted.org/packages/b4/fc/ba1b14d6fdcd38df0b7c04640794b3683e949ea10937c8a58c14d697e93f/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a5e69046f83c0d3cb8f0d5bd9b8838271b1bc898e01562a04398e160953e8eb9", size = 1702269 }, + { url = "https://files.pythonhosted.org/packages/5e/39/18c13c6f658b2ba9cc1e0c6fb2d02f98fd653ad2addcdf938193d51a9c53/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:68d54234c8d76d8ef74744f9f9fc6324f1508129e23da8883771cdbb5818cbef", size = 1734782 }, + { url = "https://files.pythonhosted.org/packages/9f/d2/ccc190023020e342419b265861877cd8ffb75bec37b7ddd8521dd2c6deb8/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c9fd9dcf9c91affe71654ef77426f5cf8489305e1c66ed4816f5a21874b094b9", size = 1694740 }, + { url = "https://files.pythonhosted.org/packages/3f/54/186805bcada64ea90ea909311ffedcd74369bfc6e880d39d2473314daa36/aiohttp-3.11.12-cp312-cp312-win32.whl", hash = "sha256:0ed49efcd0dc1611378beadbd97beb5d9ca8fe48579fc04a6ed0844072261b6a", size = 411530 }, + { url = "https://files.pythonhosted.org/packages/3d/63/5eca549d34d141bcd9de50d4e59b913f3641559460c739d5e215693cb54a/aiohttp-3.11.12-cp312-cp312-win_amd64.whl", hash = "sha256:54775858c7f2f214476773ce785a19ee81d1294a6bedc5cc17225355aab74802", size = 437860 }, + { url = "https://files.pythonhosted.org/packages/c3/9b/cea185d4b543ae08ee478373e16653722c19fcda10d2d0646f300ce10791/aiohttp-3.11.12-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:413ad794dccb19453e2b97c2375f2ca3cdf34dc50d18cc2693bd5aed7d16f4b9", size = 698148 }, + { url = "https://files.pythonhosted.org/packages/91/5c/80d47fe7749fde584d1404a68ade29bcd7e58db8fa11fa38e8d90d77e447/aiohttp-3.11.12-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:4a93d28ed4b4b39e6f46fd240896c29b686b75e39cc6992692e3922ff6982b4c", size = 460831 }, + { url = "https://files.pythonhosted.org/packages/8e/f9/de568f8a8ca6b061d157c50272620c53168d6e3eeddae78dbb0f7db981eb/aiohttp-3.11.12-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d589264dbba3b16e8951b6f145d1e6b883094075283dafcab4cdd564a9e353a0", size = 453122 }, + { url = "https://files.pythonhosted.org/packages/8b/fd/b775970a047543bbc1d0f66725ba72acef788028fce215dc959fd15a8200/aiohttp-3.11.12-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5148ca8955affdfeb864aca158ecae11030e952b25b3ae15d4e2b5ba299bad2", size = 1665336 }, + { url = "https://files.pythonhosted.org/packages/82/9b/aff01d4f9716245a1b2965f02044e4474fadd2bcfe63cf249ca788541886/aiohttp-3.11.12-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:525410e0790aab036492eeea913858989c4cb070ff373ec3bc322d700bdf47c1", size = 1718111 }, + { url = "https://files.pythonhosted.org/packages/e0/a9/166fd2d8b2cc64f08104aa614fad30eee506b563154081bf88ce729bc665/aiohttp-3.11.12-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bd8695be2c80b665ae3f05cb584093a1e59c35ecb7d794d1edd96e8cc9201d7", size = 1775293 }, + { url = "https://files.pythonhosted.org/packages/13/c5/0d3c89bd9e36288f10dc246f42518ce8e1c333f27636ac78df091c86bb4a/aiohttp-3.11.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0203433121484b32646a5f5ea93ae86f3d9559d7243f07e8c0eab5ff8e3f70e", size = 1677338 }, + { url = "https://files.pythonhosted.org/packages/72/b2/017db2833ef537be284f64ead78725984db8a39276c1a9a07c5c7526e238/aiohttp-3.11.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40cd36749a1035c34ba8d8aaf221b91ca3d111532e5ccb5fa8c3703ab1b967ed", size = 1603365 }, + { url = "https://files.pythonhosted.org/packages/fc/72/b66c96a106ec7e791e29988c222141dd1219d7793ffb01e72245399e08d2/aiohttp-3.11.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a7442662afebbf7b4c6d28cb7aab9e9ce3a5df055fc4116cc7228192ad6cb484", size = 1618464 }, + { url = "https://files.pythonhosted.org/packages/3f/50/e68a40f267b46a603bab569d48d57f23508801614e05b3369898c5b2910a/aiohttp-3.11.12-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:8a2fb742ef378284a50766e985804bd6adb5adb5aa781100b09befdbfa757b65", size = 1657827 }, + { url = "https://files.pythonhosted.org/packages/c5/1d/aafbcdb1773d0ba7c20793ebeedfaba1f3f7462f6fc251f24983ed738aa7/aiohttp-3.11.12-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2cee3b117a8d13ab98b38d5b6bdcd040cfb4181068d05ce0c474ec9db5f3c5bb", size = 1616700 }, + { url = "https://files.pythonhosted.org/packages/b0/5e/6cd9724a2932f36e2a6b742436a36d64784322cfb3406ca773f903bb9a70/aiohttp-3.11.12-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f6a19bcab7fbd8f8649d6595624856635159a6527861b9cdc3447af288a00c00", size = 1685643 }, + { url = "https://files.pythonhosted.org/packages/8b/38/ea6c91d5c767fd45a18151675a07c710ca018b30aa876a9f35b32fa59761/aiohttp-3.11.12-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e4cecdb52aaa9994fbed6b81d4568427b6002f0a91c322697a4bfcc2b2363f5a", size = 1715487 }, + { url = "https://files.pythonhosted.org/packages/8e/24/e9edbcb7d1d93c02e055490348df6f955d675e85a028c33babdcaeda0853/aiohttp-3.11.12-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:30f546358dfa0953db92ba620101fefc81574f87b2346556b90b5f3ef16e55ce", size = 1672948 }, + { url = 
"https://files.pythonhosted.org/packages/25/be/0b1fb737268e003198f25c3a68c2135e76e4754bf399a879b27bd508a003/aiohttp-3.11.12-cp313-cp313-win32.whl", hash = "sha256:ce1bb21fc7d753b5f8a5d5a4bae99566386b15e716ebdb410154c16c91494d7f", size = 410396 }, + { url = "https://files.pythonhosted.org/packages/68/fd/677def96a75057b0a26446b62f8fbb084435b20a7d270c99539c26573bfd/aiohttp-3.11.12-cp313-cp313-win_amd64.whl", hash = "sha256:f7914ab70d2ee8ab91c13e5402122edbc77821c66d2758abb53aabe87f013287", size = 436234 }, ] [[package]] @@ -155,15 +130,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233 }, ] -[[package]] -name = "asyncio" -version = "3.4.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/da/54/054bafaf2c0fb8473d423743e191fcdf49b2c1fd5e9af3524efbe097bafd/asyncio-3.4.3.tar.gz", hash = "sha256:83360ff8bc97980e4ff25c964c7bd3923d333d177aa4f7fb736b019f26c7cb41", size = 204411 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/22/74/07679c5b9f98a7cb0fc147b1ef1cc1853bc07a4eb9cb5731e24732c5f773/asyncio-3.4.3-py3-none-any.whl", hash = "sha256:c4d18b22701821de07bd6aea8b53d21449ec0ec5680645e5317062ea21817d2d", size = 101767 }, -] - [[package]] name = "attrs" version = "25.1.0" @@ -173,22 +139,13 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fc/30/d4986a882011f9df997a55e6becd864812ccfcd821d64aac8570ee39f719/attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a", size = 63152 }, ] -[[package]] -name = "cachetools" -version = "5.5.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d9/74/57df1ab0ce6bc5f6fa868e08de20df8ac58f9c44330c7671ad922d2bbeae/cachetools-5.5.1.tar.gz", hash = "sha256:70f238fbba50383ef62e55c6aff6d9673175fe59f7c6782c7a0b9e38f4a9df95", size = 28044 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ec/4e/de4ff18bcf55857ba18d3a4bd48c8a9fde6bb0980c9d20b263f05387fd88/cachetools-5.5.1-py3-none-any.whl", hash = "sha256:b76651fdc3b24ead3c648bbdeeb940c1b04d365b38b4af66788f9ec4a81d42bb", size = 9530 }, -] - [[package]] name = "certifi" -version = "2024.12.14" +version = "2025.1.31" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0f/bd/1d41ee578ce09523c81a15426705dd20969f5abf006d1afe8aeff0dd776a/certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db", size = 166010 } +sdist = { url = "https://files.pythonhosted.org/packages/1c/ab/c9f1e32b7b1bf505bf26f0ef697775960db7932abeb7b516de930ba2705f/certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651", size = 167577 } wheels = [ - { url = "https://files.pythonhosted.org/packages/a5/32/8f6669fc4798494966bf446c8c4a162e0b5d893dff088afddf76414f70e1/certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56", size = 164927 }, + { url = "https://files.pythonhosted.org/packages/38/fc/bce832fd4fd99766c04d1ee0eead6b0ec6486fb100ae5e74c1d91292b982/certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe", size = 166393 }, ] [[package]] @@ -257,15 +214,6 @@ 
wheels = [ { url = "https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9", size = 7249 }, ] -[[package]] -name = "chardet" -version = "5.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385 }, -] - [[package]] name = "charset-normalizer" version = "3.4.1" @@ -348,19 +296,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, ] -[[package]] -name = "dataproperty" -version = "1.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mbstrdecoder" }, - { name = "typepy", extra = ["datetime"] }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0b/81/8c8b64ae873cb9014815214c07b63b12e3b18835780fb342223cfe3fe7d8/dataproperty-1.1.0.tar.gz", hash = "sha256:b038437a4097d1a1c497695c3586ea34bea67fdd35372b9a50f30bf044d77d04", size = 42574 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/21/c2/e12e95e289e6081a40454199ab213139ef16a528c7c86432de545b05a23a/DataProperty-1.1.0-py3-none-any.whl", hash = "sha256:c61fcb2e2deca35e6d1eb1f251a7f22f0dcde63e80e61f0cc18c19f42abfd25b", size = 27581 }, -] - [[package]] name = "datasets" version = "3.2.0" @@ -386,15 +321,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d7/84/0df6c5981f5fc722381662ff8cfbdf8aad64bec875f75d80b55bfef394ce/datasets-3.2.0-py3-none-any.whl", hash = "sha256:f3d2ba2698b7284a4518019658596a6a8bc79f31e51516524249d6c59cf0fe2a", size = 480647 }, ] -[[package]] -name = "decorator" -version = "5.1.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/66/0c/8d907af351aa16b42caae42f9d6aa37b900c67308052d10fdce809f8d952/decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330", size = 35016 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186", size = 9073 }, -] - [[package]] name = "dill" version = "0.3.8" @@ -425,28 +351,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f5/e8/f6bd1eee09314e7e6dee49cbe2c5e22314ccdb38db16c9fc72d2fa80d054/docker_pycreds-0.4.0-py2.py3-none-any.whl", hash = "sha256:7266112468627868005106ec19cd0d722702d2b7d5912a28e19b826c3d37af49", size = 8982 }, ] -[[package]] -name = "evaluate" -version = "0.4.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "datasets" }, - { name = "dill" }, - { name = "fsspec", extra = ["http"] }, - { name = "huggingface-hub" }, - { name = "multiprocess" }, - { name = "numpy" }, - { name = "packaging" }, - { name = 
"pandas" }, - { name = "requests" }, - { name = "tqdm" }, - { name = "xxhash" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5a/a0/10a56e0939ece94c54276e81459cb4101f46f0e9a6f54fc31a35f64e8854/evaluate-0.4.3.tar.gz", hash = "sha256:3a5700cf83aabee9549264e1e5666f116367c61dbd4d38352015e859a5e2098d", size = 65679 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a2/e7/cbca9e2d2590eb9b5aa8f7ebabe1beb1498f9462d2ecede5c9fd9735faaf/evaluate-0.4.3-py3-none-any.whl", hash = "sha256:47d8770bdea76e2c2ed0d40189273027d1a41ccea861bcc7ba12d30ec5d1e517", size = 84010 }, -] - [[package]] name = "exceptiongroup" version = "1.2.2" @@ -458,15 +362,14 @@ wheels = [ [[package]] name = "faker" -version = "35.0.0" +version = "36.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "python-dateutil" }, - { name = "typing-extensions" }, + { name = "tzdata" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d5/18/86fe668976308d09e0178041c3756e646a1f5ddc676aa7fb0cf3cd52f5b9/faker-35.0.0.tar.gz", hash = "sha256:42f2da8cf561e38c72b25e9891168b1e25fec42b6b0b5b0b6cd6041da54af885", size = 1855098 } +sdist = { url = "https://files.pythonhosted.org/packages/50/00/43012033ef13ecd4bf00d8d936508a49abaf1a66f8d420db3b7aaafd42b3/faker-36.1.0.tar.gz", hash = "sha256:f40510350aecfe006f45cb3f8879b35e861367cf347f51a7f2ca2c0571fdcc0b", size = 1874804 } wheels = [ - { url = "https://files.pythonhosted.org/packages/b8/fe/40452fb1730b10afa34dfe016097b28baa070ad74a1c1a3512ebed438c08/Faker-35.0.0-py3-none-any.whl", hash = "sha256:926d2301787220e0554c2e39afc4dc535ce4b0a8d0a089657137999f66334ef4", size = 1894841 }, + { url = "https://files.pythonhosted.org/packages/eb/02/72c93c1df2eff2502bfb94d6c3937192c1090c8095c389bc24533a55d327/Faker-36.1.0-py3-none-any.whl", hash = "sha256:aa0b93487d3adf7cd89953d172e3df896cb7b35d8a5222c0da873edbe2f7adf5", size = 1917678 }, ] [[package]] @@ -557,31 +460,10 @@ wheels = [ ] [package.optional-dependencies] -gcs = [ - { name = "gcsfs" }, -] http = [ { name = "aiohttp" }, ] -[[package]] -name = "gcsfs" -version = "2024.9.0.post1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "aiohttp" }, - { name = "decorator" }, - { name = "fsspec" }, - { name = "google-auth" }, - { name = "google-auth-oauthlib" }, - { name = "google-cloud-storage" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d9/9d/64fa09b9c392ee79ffaa4b26d0481d2d775ffe03969b38a4ade77bd72d15/gcsfs-2024.9.0.post1.tar.gz", hash = "sha256:7ca70ee9d7c7dbce1a3e36b4883e14102c2d7b4284f49e242843a437bc684684", size = 79460 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/72/1d/37ab60da39d3b782b0cf7770ba8c9071be8bb2aee8bc01b6d350c28b51b3/gcsfs-2024.9.0.post1-py2.py3-none-any.whl", hash = "sha256:f3ab9d3bedc45da8cf40baed7c3a1e1694e8f599160d9138d78f0ef25e4a3ca1", size = 34977 }, -] - [[package]] name = "gitdb" version = "4.0.12" @@ -606,132 +488,9 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1d/9a/4114a9057db2f1462d5c8f8390ab7383925fe1ac012eaa42402ad65c2963/GitPython-3.1.44-py3-none-any.whl", hash = "sha256:9e0e10cda9bed1ee64bc9a6de50e7e38a9c9943241cd7f585f6df3ed28011110", size = 207599 }, ] -[[package]] -name = "google-api-core" -version = "2.24.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-auth" }, - { name = "googleapis-common-protos" }, - { name = "proto-plus" }, - { name = "protobuf" }, - { name = "requests" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/b8/b7/481c83223d7b4f02c7651713fceca648fa3336e1571b9804713f66bca2d8/google_api_core-2.24.1.tar.gz", hash = "sha256:f8b36f5456ab0dd99a1b693a40a31d1e7757beea380ad1b38faaf8941eae9d8a", size = 163508 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b1/a6/8e30ddfd3d39ee6d2c76d3d4f64a83f77ac86a4cab67b286ae35ce9e4369/google_api_core-2.24.1-py3-none-any.whl", hash = "sha256:bc78d608f5a5bf853b80bd70a795f703294de656c096c0968320830a4bc280f1", size = 160059 }, -] - -[[package]] -name = "google-auth" -version = "2.38.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cachetools" }, - { name = "pyasn1-modules" }, - { name = "rsa" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c6/eb/d504ba1daf190af6b204a9d4714d457462b486043744901a6eeea711f913/google_auth-2.38.0.tar.gz", hash = "sha256:8285113607d3b80a3f1543b75962447ba8a09fe85783432a784fdeef6ac094c4", size = 270866 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9d/47/603554949a37bca5b7f894d51896a9c534b9eab808e2520a748e081669d0/google_auth-2.38.0-py2.py3-none-any.whl", hash = "sha256:e7dae6694313f434a2727bf2906f27ad259bae090d7aa896590d86feec3d9d4a", size = 210770 }, -] - -[[package]] -name = "google-auth-oauthlib" -version = "1.2.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-auth" }, - { name = "requests-oauthlib" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/cc/0f/1772edb8d75ecf6280f1c7f51cbcebe274e8b17878b382f63738fd96cee5/google_auth_oauthlib-1.2.1.tar.gz", hash = "sha256:afd0cad092a2eaa53cd8e8298557d6de1034c6cb4a740500b5357b648af97263", size = 24970 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1a/8e/22a28dfbd218033e4eeaf3a0533b2b54852b6530da0c0fe934f0cc494b29/google_auth_oauthlib-1.2.1-py2.py3-none-any.whl", hash = "sha256:2d58a27262d55aa1b87678c3ba7142a080098cbc2024f903c62355deb235d91f", size = 24930 }, -] - -[[package]] -name = "google-cloud-core" -version = "2.4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-api-core" }, - { name = "google-auth" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b8/1f/9d1e0ba6919668608570418a9a51e47070ac15aeff64261fb092d8be94c0/google-cloud-core-2.4.1.tar.gz", hash = "sha256:9b7749272a812bde58fff28868d0c5e2f585b82f37e09a1f6ed2d4d10f134073", size = 35587 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5e/0f/2e2061e3fbcb9d535d5da3f58cc8de4947df1786fe6a1355960feb05a681/google_cloud_core-2.4.1-py2.py3-none-any.whl", hash = "sha256:a9e6a4422b9ac5c29f79a0ede9485473338e2ce78d91f2370c01e730eab22e61", size = 29233 }, -] - -[[package]] -name = "google-cloud-storage" -version = "3.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-api-core" }, - { name = "google-auth" }, - { name = "google-cloud-core" }, - { name = "google-crc32c" }, - { name = "google-resumable-media" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7f/d7/dfa74049c4faa3b4d68fa1a10a7eab5a76c57d0788b47c27f927bedc606d/google_cloud_storage-3.0.0.tar.gz", hash = "sha256:2accb3e828e584888beff1165e5f3ac61aa9088965eb0165794a82d8c7f95297", size = 7665253 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9a/ae/1a50f07161301e40a30b2e40744a7b85ffab7add16e044417925eccf9bbf/google_cloud_storage-3.0.0-py2.py3-none-any.whl", hash = 
"sha256:f85fd059650d2dbb0ac158a9a6b304b66143b35ed2419afec2905ca522eb2c6a", size = 173860 }, -] - -[[package]] -name = "google-crc32c" -version = "1.6.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/67/72/c3298da1a3773102359c5a78f20dae8925f5ea876e37354415f68594a6fb/google_crc32c-1.6.0.tar.gz", hash = "sha256:6eceb6ad197656a1ff49ebfbbfa870678c75be4344feb35ac1edf694309413dc", size = 14472 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1a/be/d7846cb50e17bf72a70ea2d8159478ac5de0f1170b10cac279f50079e78d/google_crc32c-1.6.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:5bcc90b34df28a4b38653c36bb5ada35671ad105c99cfe915fb5bed7ad6924aa", size = 30267 }, - { url = "https://files.pythonhosted.org/packages/84/3b/29cadae166132e4991087a49dc88906a1d3d5ec22b80f63bc4bc7b6e0431/google_crc32c-1.6.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d9e9913f7bd69e093b81da4535ce27af842e7bf371cde42d1ae9e9bd382dc0e9", size = 30113 }, - { url = "https://files.pythonhosted.org/packages/18/a9/49a7b2c4b7cc69d15778a820734f9beb647b1b4cf1a629ca43e3d3a54c70/google_crc32c-1.6.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a184243544811e4a50d345838a883733461e67578959ac59964e43cca2c791e7", size = 37702 }, - { url = "https://files.pythonhosted.org/packages/4b/aa/52538cceddefc7c2d66c6bd59dfe67a50f65a4952f441f91049e4188eb57/google_crc32c-1.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:236c87a46cdf06384f614e9092b82c05f81bd34b80248021f729396a78e55d7e", size = 32847 }, - { url = "https://files.pythonhosted.org/packages/b1/2c/1928413d3faae74ae0d7bdba648cf36ed6b03328c562b47046af016b7249/google_crc32c-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebab974b1687509e5c973b5c4b8b146683e101e102e17a86bd196ecaa4d099fc", size = 37844 }, - { url = "https://files.pythonhosted.org/packages/d6/f4/f62fa405e442b37c5676973b759dd6e56cd8d58a5c78662912456526f716/google_crc32c-1.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:50cf2a96da226dcbff8671233ecf37bf6e95de98b2a2ebadbfdf455e6d05df42", size = 33444 }, - { url = "https://files.pythonhosted.org/packages/7d/14/ab47972ac79b6e7b03c8be3a7ef44b530a60e69555668dbbf08fc5692a98/google_crc32c-1.6.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:f7a1fc29803712f80879b0806cb83ab24ce62fc8daf0569f2204a0cfd7f68ed4", size = 30267 }, - { url = "https://files.pythonhosted.org/packages/54/7d/738cb0d25ee55629e7d07da686decf03864a366e5e863091a97b7bd2b8aa/google_crc32c-1.6.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:40b05ab32a5067525670880eb5d169529089a26fe35dce8891127aeddc1950e8", size = 30112 }, - { url = "https://files.pythonhosted.org/packages/3e/6d/33ca50cbdeec09c31bb5dac277c90994edee975662a4c890bda7ffac90ef/google_crc32c-1.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e4b426c3702f3cd23b933436487eb34e01e00327fac20c9aebb68ccf34117d", size = 32861 }, - { url = "https://files.pythonhosted.org/packages/67/1e/4870896fc81ec77b1b5ebae7fdd680d5a4d40e19a4b6d724032f996ca77a/google_crc32c-1.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51c4f54dd8c6dfeb58d1df5e4f7f97df8abf17a36626a217f169893d1d7f3e9f", size = 32490 }, - { url = "https://files.pythonhosted.org/packages/00/9c/f5f5af3ddaa7a639d915f8f58b09bbb8d1db90ecd0459b62cd430eb9a4b6/google_crc32c-1.6.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:bb8b3c75bd157010459b15222c3fd30577042a7060e29d42dabce449c087f2b3", size = 33446 }, - { url = "https://files.pythonhosted.org/packages/cf/41/65a91657d6a8123c6c12f9aac72127b6ac76dda9e2ba1834026a842eb77c/google_crc32c-1.6.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ed767bf4ba90104c1216b68111613f0d5926fb3780660ea1198fc469af410e9d", size = 30268 }, - { url = "https://files.pythonhosted.org/packages/59/d0/ee743a267c7d5c4bb8bd865f7d4c039505f1c8a4b439df047fdc17be9769/google_crc32c-1.6.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:62f6d4a29fea082ac4a3c9be5e415218255cf11684ac6ef5488eea0c9132689b", size = 30113 }, - { url = "https://files.pythonhosted.org/packages/25/53/e5e449c368dd26ade5fb2bb209e046d4309ed0623be65b13f0ce026cb520/google_crc32c-1.6.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c87d98c7c4a69066fd31701c4e10d178a648c2cac3452e62c6b24dc51f9fcc00", size = 32995 }, - { url = "https://files.pythonhosted.org/packages/52/12/9bf6042d5b0ac8c25afed562fb78e51b0641474097e4139e858b45de40a5/google_crc32c-1.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd5e7d2445d1a958c266bfa5d04c39932dc54093fa391736dbfdb0f1929c1fb3", size = 32614 }, - { url = "https://files.pythonhosted.org/packages/76/29/fc20f5ec36eac1eea0d0b2de4118c774c5f59c513f2a8630d4db6991f3e0/google_crc32c-1.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7aec8e88a3583515f9e0957fe4f5f6d8d4997e36d0f61624e70469771584c760", size = 33445 }, - { url = "https://files.pythonhosted.org/packages/e7/ff/ed48d136b65ddc61f5aef6261c58cd817c8cd60640b16680e5419fb17018/google_crc32c-1.6.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48abd62ca76a2cbe034542ed1b6aee851b6f28aaca4e6551b5599b6f3ef175cc", size = 28057 }, - { url = "https://files.pythonhosted.org/packages/14/fb/54deefe679b7d1c1cc81d83396fcf28ad1a66d213bddeb275a8d28665918/google_crc32c-1.6.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18e311c64008f1f1379158158bb3f0c8d72635b9eb4f9545f8cf990c5668e59d", size = 27866 }, -] - -[[package]] -name = "google-resumable-media" -version = "2.7.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-crc32c" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/58/5a/0efdc02665dca14e0837b62c8a1a93132c264bd02054a15abb2218afe0ae/google_resumable_media-2.7.2.tar.gz", hash = "sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0", size = 2163099 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/82/35/b8d3baf8c46695858cb9d8835a53baa1eeb9906ddaf2f728a5f5b640fd1e/google_resumable_media-2.7.2-py2.py3-none-any.whl", hash = "sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa", size = 81251 }, -] - -[[package]] -name = "googleapis-common-protos" -version = "1.66.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "protobuf" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ff/a7/8e9cccdb1c49870de6faea2a2764fa23f627dd290633103540209f03524c/googleapis_common_protos-1.66.0.tar.gz", hash = "sha256:c3e7b33d15fdca5374cc0a7346dd92ffa847425cc4ea941d970f13680052ec8c", size = 114376 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a0/0f/c0713fb2b3d28af4b2fded3291df1c4d4f79a00d15c2374a9e010870016c/googleapis_common_protos-1.66.0-py2.py3-none-any.whl", hash = "sha256:d7abcd75fabb2e0ec9f74466401f6c119a0b498e27370e9be4c94cb7e382b8ed", size = 221682 }, -] - [[package]] 
name = "huggingface-hub" -version = "0.28.0" +version = "0.28.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -742,18 +501,18 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/10/fd/c8ff7693942dac1c642ec3a93a2bf7cbac36e2e920dd61a79965d9a662b7/huggingface_hub-0.28.0.tar.gz", hash = "sha256:c2b18c02a47d4384763caddb4d0ab2a8fc6c16e0800d6de4d55d0a896244aba3", size = 387079 } +sdist = { url = "https://files.pythonhosted.org/packages/e7/ce/a734204aaae6c35a22f9956ebcd8d8708ae5b842e15d6f42bd6f49e634a4/huggingface_hub-0.28.1.tar.gz", hash = "sha256:893471090c98e3b6efbdfdacafe4052b20b84d59866fb6f54c33d9af18c303ae", size = 387074 } wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/ac/07f92291add9f425f40b3fd70a1d0c7117f6e1152599abc2bd7fda5b6abe/huggingface_hub-0.28.0-py3-none-any.whl", hash = "sha256:71cff4e500efe68061d94b7f6d3114e183715088be7a90bf4dd84af83b5f5cdb", size = 464084 }, + { url = "https://files.pythonhosted.org/packages/ea/da/6c2bea5327b640920267d3bf2c9fc114cfbd0a5de234d81cda80cc9e33c8/huggingface_hub-0.28.1-py3-none-any.whl", hash = "sha256:aa6b9a3ffdae939b72c464dbb0d7f99f56e649b55c3d52406f49e0a5a620c0a7", size = 464068 }, ] [[package]] name = "identify" -version = "2.6.6" +version = "2.6.7" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/82/bf/c68c46601bacd4c6fb4dd751a42b6e7087240eaabc6487f2ef7a48e0e8fc/identify-2.6.6.tar.gz", hash = "sha256:7bec12768ed44ea4761efb47806f0a41f86e7c0a5fdf5950d4648c90eca7e251", size = 99217 } +sdist = { url = "https://files.pythonhosted.org/packages/83/d1/524aa3350f78bcd714d148ade6133d67d6b7de2cdbae7d99039c024c9a25/identify-2.6.7.tar.gz", hash = "sha256:3fa266b42eba321ee0b2bb0936a6a6b9e36a1351cbb69055b3082f4193035684", size = 99260 } wheels = [ - { url = "https://files.pythonhosted.org/packages/74/a1/68a395c17eeefb04917034bd0a1bfa765e7654fa150cca473d669aa3afb5/identify-2.6.6-py2.py3-none-any.whl", hash = "sha256:cbd1810bce79f8b671ecb20f53ee0ae8e86ae84b557de31d89709dc2a48ba881", size = 99083 }, + { url = "https://files.pythonhosted.org/packages/03/00/1fd4a117c6c93f2dcc5b7edaeaf53ea45332ef966429be566ca16c2beb94/identify-2.6.7-py2.py3-none-any.whl", hash = "sha256:155931cb617a401807b09ecec6635d6c692d180090a1cedca8ef7d58ba5b6aa0", size = 99097 }, ] [[package]] @@ -786,152 +545,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bd/0f/2ba5fbcd631e3e88689309dbe978c5769e883e4b84ebfe7da30b43275c5a/jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb", size = 134596 }, ] -[[package]] -name = "joblib" -version = "1.4.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/64/33/60135848598c076ce4b231e1b1895170f45fbcaeaa2c9d5e38b04db70c35/joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e", size = 2116621 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/91/29/df4b9b42f2be0b623cbd5e2140cafcaa2bef0759a00b7b70104dcfe2fb51/joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6", size = 301817 }, -] - -[[package]] -name = "jsonlines" -version = "4.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "attrs" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/35/87/bcda8e46c88d0e34cad2f09ee2d0c7f5957bccdb9791b0b934ec84d84be4/jsonlines-4.0.0.tar.gz", hash = "sha256:0c6d2c09117550c089995247f605ae4cf77dd1533041d366351f6f298822ea74", size = 11359 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f8/62/d9ba6323b9202dd2fe166beab8a86d29465c41a0288cbe229fac60c1ab8d/jsonlines-4.0.0-py3-none-any.whl", hash = "sha256:185b334ff2ca5a91362993f42e83588a360cf95ce4b71a73548502bda52a7c55", size = 8701 }, -] - -[[package]] -name = "liger-kernel-nightly" -version = "0.5.2.dev20250129180649" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "torch" }, - { name = "triton" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/07/23/fcab81f6e9dd018eecf112f462831f6648b9d85765fe2b70c35e73d4bdc5/liger_kernel_nightly-0.5.2.dev20250129180649.tar.gz", hash = "sha256:d11bdac72655c468ed498ca48a15bd14d2ecf2df4efd913288ab84d26bf5c3ff", size = 3460969 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2d/c7/a843f4c8289024034eeb03687326456b3ab356748d41f7d449551d4040a1/liger_kernel_nightly-0.5.2.dev20250129180649-py3-none-any.whl", hash = "sha256:2ae45799cea28e319e401217797ba7ba73cb6476db95cdd4a40497feb70e7ca6", size = 112180 }, -] - -[[package]] -name = "lm-eval" -version = "0.4.7" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "accelerate" }, - { name = "datasets" }, - { name = "dill" }, - { name = "evaluate" }, - { name = "jsonlines" }, - { name = "more-itertools" }, - { name = "numexpr" }, - { name = "peft" }, - { name = "pybind11" }, - { name = "pytablewriter" }, - { name = "rouge-score" }, - { name = "sacrebleu" }, - { name = "scikit-learn" }, - { name = "sqlitedict" }, - { name = "torch" }, - { name = "tqdm-multiprocess" }, - { name = "transformers" }, - { name = "word2number" }, - { name = "zstandard" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/53/c9/b5d03d5b2bf6819008e377844999fbd04ab00dff0c43728957f1c90a53c5/lm_eval-0.4.7.tar.gz", hash = "sha256:dcbef8722f363f58cfba36b6d783fc6bb17924b24b8da1684bf1ac835866208d", size = 1115713 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/45/b9/1b4e3268b590d9ff16e087685d9526455bb677c3e4d0caeba4451f20c586/lm_eval-0.4.7-py3-none-any.whl", hash = "sha256:d84a52580468fdc1d812e511db36e86679b69ee27f5a5e3dbd50f233d0bec69f", size = 2518804 }, -] - -[[package]] -name = "lxml" -version = "5.3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e7/6b/20c3a4b24751377aaa6307eb230b66701024012c29dd374999cc92983269/lxml-5.3.0.tar.gz", hash = "sha256:4e109ca30d1edec1ac60cdbe341905dc3b8f55b16855e03a54aaf59e51ec8c6f", size = 3679318 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a1/ce/2789e39eddf2b13fac29878bfa465f0910eb6b0096e29090e5176bc8cf43/lxml-5.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656", size = 8124570 }, - { url = "https://files.pythonhosted.org/packages/24/a8/f4010166a25d41715527129af2675981a50d3bbf7df09c5d9ab8ca24fbf9/lxml-5.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d", size = 4413042 }, - { url = "https://files.pythonhosted.org/packages/41/a4/7e45756cecdd7577ddf67a68b69c1db0f5ddbf0c9f65021ee769165ffc5a/lxml-5.3.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:501d0d7e26b4d261fca8132854d845e4988097611ba2531408ec91cf3fd9d20a", size = 5139213 }, - { url = "https://files.pythonhosted.org/packages/02/e2/ecf845b12323c92748077e1818b64e8b4dba509a4cb12920b3762ebe7552/lxml-5.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb66442c2546446944437df74379e9cf9e9db353e61301d1a0e26482f43f0dd8", size = 4838814 }, - { url = "https://files.pythonhosted.org/packages/12/91/619f9fb72cf75e9ceb8700706f7276f23995f6ad757e6d400fbe35ca4990/lxml-5.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e41506fec7a7f9405b14aa2d5c8abbb4dbbd09d88f9496958b6d00cb4d45330", size = 5425084 }, - { url = "https://files.pythonhosted.org/packages/25/3b/162a85a8f0fd2a3032ec3f936636911c6e9523a8e263fffcfd581ce98b54/lxml-5.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f7d4a670107d75dfe5ad080bed6c341d18c4442f9378c9f58e5851e86eb79965", size = 4875993 }, - { url = "https://files.pythonhosted.org/packages/43/af/dd3f58cc7d946da6ae42909629a2b1d5dd2d1b583334d4af9396697d6863/lxml-5.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41ce1f1e2c7755abfc7e759dc34d7d05fd221723ff822947132dc934d122fe22", size = 5012462 }, - { url = "https://files.pythonhosted.org/packages/69/c1/5ea46b2d4c98f5bf5c83fffab8a0ad293c9bc74df9ecfbafef10f77f7201/lxml-5.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:44264ecae91b30e5633013fb66f6ddd05c006d3e0e884f75ce0b4755b3e3847b", size = 4815288 }, - { url = "https://files.pythonhosted.org/packages/1d/51/a0acca077ad35da458f4d3f729ef98effd2b90f003440d35fc36323f8ae6/lxml-5.3.0-cp310-cp310-manylinux_2_28_ppc64le.whl", hash = "sha256:3c174dc350d3ec52deb77f2faf05c439331d6ed5e702fc247ccb4e6b62d884b7", size = 5472435 }, - { url = "https://files.pythonhosted.org/packages/4d/6b/0989c9368986961a6b0f55b46c80404c4b758417acdb6d87bfc3bd5f4967/lxml-5.3.0-cp310-cp310-manylinux_2_28_s390x.whl", hash = "sha256:2dfab5fa6a28a0b60a20638dc48e6343c02ea9933e3279ccb132f555a62323d8", size = 4976354 }, - { url = "https://files.pythonhosted.org/packages/05/9e/87492d03ff604fbf656ed2bf3e2e8d28f5d58ea1f00ff27ac27b06509079/lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b1c8c20847b9f34e98080da785bb2336ea982e7f913eed5809e5a3c872900f32", size = 5029973 }, - { url = "https://files.pythonhosted.org/packages/f9/cc/9ae1baf5472af88e19e2c454b3710c1be9ecafb20eb474eeabcd88a055d2/lxml-5.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2c86bf781b12ba417f64f3422cfc302523ac9cd1d8ae8c0f92a1c66e56ef2e86", size = 4888837 }, - { url = "https://files.pythonhosted.org/packages/d2/10/5594ffaec8c120d75b17e3ad23439b740a51549a9b5fd7484b2179adfe8f/lxml-5.3.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:c162b216070f280fa7da844531169be0baf9ccb17263cf5a8bf876fcd3117fa5", size = 5530555 }, - { url = "https://files.pythonhosted.org/packages/ea/9b/de17f05377c8833343b629905571fb06cff2028f15a6f58ae2267662e341/lxml-5.3.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:36aef61a1678cb778097b4a6eeae96a69875d51d1e8f4d4b491ab3cfb54b5a03", size = 5405314 }, - { url = "https://files.pythonhosted.org/packages/8a/b4/227be0f1f3cca8255925985164c3838b8b36e441ff0cc10c1d3c6bdba031/lxml-5.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f65e5120863c2b266dbcc927b306c5b78e502c71edf3295dfcb9501ec96e5fc7", size = 5079303 }, - { url = 
"https://files.pythonhosted.org/packages/5c/ee/19abcebb7fc40319bb71cd6adefa1ad94d09b5660228715854d6cc420713/lxml-5.3.0-cp310-cp310-win32.whl", hash = "sha256:ef0c1fe22171dd7c7c27147f2e9c3e86f8bdf473fed75f16b0c2e84a5030ce80", size = 3475126 }, - { url = "https://files.pythonhosted.org/packages/a1/35/183d32551447e280032b2331738cd850da435a42f850b71ebeaab42c1313/lxml-5.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:052d99051e77a4f3e8482c65014cf6372e61b0a6f4fe9edb98503bb5364cfee3", size = 3805065 }, - { url = "https://files.pythonhosted.org/packages/5c/a8/449faa2a3cbe6a99f8d38dcd51a3ee8844c17862841a6f769ea7c2a9cd0f/lxml-5.3.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:74bcb423462233bc5d6066e4e98b0264e7c1bed7541fff2f4e34fe6b21563c8b", size = 8141056 }, - { url = "https://files.pythonhosted.org/packages/ac/8a/ae6325e994e2052de92f894363b038351c50ee38749d30cc6b6d96aaf90f/lxml-5.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a3d819eb6f9b8677f57f9664265d0a10dd6551d227afb4af2b9cd7bdc2ccbf18", size = 4425238 }, - { url = "https://files.pythonhosted.org/packages/f8/fb/128dddb7f9086236bce0eeae2bfb316d138b49b159f50bc681d56c1bdd19/lxml-5.3.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b8f5db71b28b8c404956ddf79575ea77aa8b1538e8b2ef9ec877945b3f46442", size = 5095197 }, - { url = "https://files.pythonhosted.org/packages/b4/f9/a181a8ef106e41e3086629c8bdb2d21a942f14c84a0e77452c22d6b22091/lxml-5.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c3406b63232fc7e9b8783ab0b765d7c59e7c59ff96759d8ef9632fca27c7ee4", size = 4809809 }, - { url = "https://files.pythonhosted.org/packages/25/2f/b20565e808f7f6868aacea48ddcdd7e9e9fb4c799287f21f1a6c7c2e8b71/lxml-5.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ecdd78ab768f844c7a1d4a03595038c166b609f6395e25af9b0f3f26ae1230f", size = 5407593 }, - { url = "https://files.pythonhosted.org/packages/23/0e/caac672ec246d3189a16c4d364ed4f7d6bf856c080215382c06764058c08/lxml-5.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:168f2dfcfdedf611eb285efac1516c8454c8c99caf271dccda8943576b67552e", size = 4866657 }, - { url = "https://files.pythonhosted.org/packages/67/a4/1f5fbd3f58d4069000522196b0b776a014f3feec1796da03e495cf23532d/lxml-5.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa617107a410245b8660028a7483b68e7914304a6d4882b5ff3d2d3eb5948d8c", size = 4967017 }, - { url = "https://files.pythonhosted.org/packages/ee/73/623ecea6ca3c530dd0a4ed0d00d9702e0e85cd5624e2d5b93b005fe00abd/lxml-5.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:69959bd3167b993e6e710b99051265654133a98f20cec1d9b493b931942e9c16", size = 4810730 }, - { url = "https://files.pythonhosted.org/packages/1d/ce/fb84fb8e3c298f3a245ae3ea6221c2426f1bbaa82d10a88787412a498145/lxml-5.3.0-cp311-cp311-manylinux_2_28_ppc64le.whl", hash = "sha256:bd96517ef76c8654446fc3db9242d019a1bb5fe8b751ba414765d59f99210b79", size = 5455154 }, - { url = "https://files.pythonhosted.org/packages/b1/72/4d1ad363748a72c7c0411c28be2b0dc7150d91e823eadad3b91a4514cbea/lxml-5.3.0-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:ab6dd83b970dc97c2d10bc71aa925b84788c7c05de30241b9e96f9b6d9ea3080", size = 4969416 }, - { url = "https://files.pythonhosted.org/packages/42/07/b29571a58a3a80681722ea8ed0ba569211d9bb8531ad49b5cacf6d409185/lxml-5.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = 
"sha256:eec1bb8cdbba2925bedc887bc0609a80e599c75b12d87ae42ac23fd199445654", size = 5013672 }, - { url = "https://files.pythonhosted.org/packages/b9/93/bde740d5a58cf04cbd38e3dd93ad1e36c2f95553bbf7d57807bc6815d926/lxml-5.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6a7095eeec6f89111d03dabfe5883a1fd54da319c94e0fb104ee8f23616b572d", size = 4878644 }, - { url = "https://files.pythonhosted.org/packages/56/b5/645c8c02721d49927c93181de4017164ec0e141413577687c3df8ff0800f/lxml-5.3.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6f651ebd0b21ec65dfca93aa629610a0dbc13dbc13554f19b0113da2e61a4763", size = 5511531 }, - { url = "https://files.pythonhosted.org/packages/85/3f/6a99a12d9438316f4fc86ef88c5d4c8fb674247b17f3173ecadd8346b671/lxml-5.3.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:f422a209d2455c56849442ae42f25dbaaba1c6c3f501d58761c619c7836642ec", size = 5402065 }, - { url = "https://files.pythonhosted.org/packages/80/8a/df47bff6ad5ac57335bf552babfb2408f9eb680c074ec1ba412a1a6af2c5/lxml-5.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:62f7fdb0d1ed2065451f086519865b4c90aa19aed51081979ecd05a21eb4d1be", size = 5069775 }, - { url = "https://files.pythonhosted.org/packages/08/ae/e7ad0f0fbe4b6368c5ee1e3ef0c3365098d806d42379c46c1ba2802a52f7/lxml-5.3.0-cp311-cp311-win32.whl", hash = "sha256:c6379f35350b655fd817cd0d6cbeef7f265f3ae5fedb1caae2eb442bbeae9ab9", size = 3474226 }, - { url = "https://files.pythonhosted.org/packages/c3/b5/91c2249bfac02ee514ab135e9304b89d55967be7e53e94a879b74eec7a5c/lxml-5.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c52100e2c2dbb0649b90467935c4b0de5528833c76a35ea1a2691ec9f1ee7a1", size = 3814971 }, - { url = "https://files.pythonhosted.org/packages/eb/6d/d1f1c5e40c64bf62afd7a3f9b34ce18a586a1cccbf71e783cd0a6d8e8971/lxml-5.3.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:e99f5507401436fdcc85036a2e7dc2e28d962550afe1cbfc07c40e454256a859", size = 8171753 }, - { url = "https://files.pythonhosted.org/packages/bd/83/26b1864921869784355459f374896dcf8b44d4af3b15d7697e9156cb2de9/lxml-5.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:384aacddf2e5813a36495233b64cb96b1949da72bef933918ba5c84e06af8f0e", size = 4441955 }, - { url = "https://files.pythonhosted.org/packages/e0/d2/e9bff9fb359226c25cda3538f664f54f2804f4b37b0d7c944639e1a51f69/lxml-5.3.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:874a216bf6afaf97c263b56371434e47e2c652d215788396f60477540298218f", size = 5050778 }, - { url = "https://files.pythonhosted.org/packages/88/69/6972bfafa8cd3ddc8562b126dd607011e218e17be313a8b1b9cc5a0ee876/lxml-5.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65ab5685d56914b9a2a34d67dd5488b83213d680b0c5d10b47f81da5a16b0b0e", size = 4748628 }, - { url = "https://files.pythonhosted.org/packages/5d/ea/a6523c7c7f6dc755a6eed3d2f6d6646617cad4d3d6d8ce4ed71bfd2362c8/lxml-5.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aac0bbd3e8dd2d9c45ceb82249e8bdd3ac99131a32b4d35c8af3cc9db1657179", size = 5322215 }, - { url = "https://files.pythonhosted.org/packages/99/37/396fbd24a70f62b31d988e4500f2068c7f3fd399d2fd45257d13eab51a6f/lxml-5.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b369d3db3c22ed14c75ccd5af429086f166a19627e84a8fdade3f8f31426e52a", size = 4813963 }, - { url = 
"https://files.pythonhosted.org/packages/09/91/e6136f17459a11ce1757df864b213efbeab7adcb2efa63efb1b846ab6723/lxml-5.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c24037349665434f375645fa9d1f5304800cec574d0310f618490c871fd902b3", size = 4923353 }, - { url = "https://files.pythonhosted.org/packages/1d/7c/2eeecf87c9a1fca4f84f991067c693e67340f2b7127fc3eca8fa29d75ee3/lxml-5.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:62d172f358f33a26d6b41b28c170c63886742f5b6772a42b59b4f0fa10526cb1", size = 4740541 }, - { url = "https://files.pythonhosted.org/packages/3b/ed/4c38ba58defca84f5f0d0ac2480fdcd99fc7ae4b28fc417c93640a6949ae/lxml-5.3.0-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:c1f794c02903c2824fccce5b20c339a1a14b114e83b306ff11b597c5f71a1c8d", size = 5346504 }, - { url = "https://files.pythonhosted.org/packages/a5/22/bbd3995437e5745cb4c2b5d89088d70ab19d4feabf8a27a24cecb9745464/lxml-5.3.0-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:5d6a6972b93c426ace71e0be9a6f4b2cfae9b1baed2eed2006076a746692288c", size = 4898077 }, - { url = "https://files.pythonhosted.org/packages/0a/6e/94537acfb5b8f18235d13186d247bca478fea5e87d224644e0fe907df976/lxml-5.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:3879cc6ce938ff4eb4900d901ed63555c778731a96365e53fadb36437a131a99", size = 4946543 }, - { url = "https://files.pythonhosted.org/packages/8d/e8/4b15df533fe8e8d53363b23a41df9be907330e1fa28c7ca36893fad338ee/lxml-5.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:74068c601baff6ff021c70f0935b0c7bc528baa8ea210c202e03757c68c5a4ff", size = 4816841 }, - { url = "https://files.pythonhosted.org/packages/1a/e7/03f390ea37d1acda50bc538feb5b2bda6745b25731e4e76ab48fae7106bf/lxml-5.3.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ecd4ad8453ac17bc7ba3868371bffb46f628161ad0eefbd0a855d2c8c32dd81a", size = 5417341 }, - { url = "https://files.pythonhosted.org/packages/ea/99/d1133ab4c250da85a883c3b60249d3d3e7c64f24faff494cf0fd23f91e80/lxml-5.3.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7e2f58095acc211eb9d8b5771bf04df9ff37d6b87618d1cbf85f92399c98dae8", size = 5327539 }, - { url = "https://files.pythonhosted.org/packages/7d/ed/e6276c8d9668028213df01f598f385b05b55a4e1b4662ee12ef05dab35aa/lxml-5.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e63601ad5cd8f860aa99d109889b5ac34de571c7ee902d6812d5d9ddcc77fa7d", size = 5012542 }, - { url = "https://files.pythonhosted.org/packages/36/88/684d4e800f5aa28df2a991a6a622783fb73cf0e46235cfa690f9776f032e/lxml-5.3.0-cp312-cp312-win32.whl", hash = "sha256:17e8d968d04a37c50ad9c456a286b525d78c4a1c15dd53aa46c1d8e06bf6fa30", size = 3486454 }, - { url = "https://files.pythonhosted.org/packages/fc/82/ace5a5676051e60355bd8fb945df7b1ba4f4fb8447f2010fb816bfd57724/lxml-5.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:c1a69e58a6bb2de65902051d57fde951febad631a20a64572677a1052690482f", size = 3816857 }, - { url = "https://files.pythonhosted.org/packages/94/6a/42141e4d373903bfea6f8e94b2f554d05506dfda522ada5343c651410dc8/lxml-5.3.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c72e9563347c7395910de6a3100a4840a75a6f60e05af5e58566868d5eb2d6a", size = 8156284 }, - { url = "https://files.pythonhosted.org/packages/91/5e/fa097f0f7d8b3d113fb7312c6308af702f2667f22644441715be961f2c7e/lxml-5.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e92ce66cd919d18d14b3856906a61d3f6b6a8500e0794142338da644260595cd", size = 4432407 }, - { url = 
"https://files.pythonhosted.org/packages/2d/a1/b901988aa6d4ff937f2e5cfc114e4ec561901ff00660c3e56713642728da/lxml-5.3.0-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d04f064bebdfef9240478f7a779e8c5dc32b8b7b0b2fc6a62e39b928d428e51", size = 5048331 }, - { url = "https://files.pythonhosted.org/packages/30/0f/b2a54f48e52de578b71bbe2a2f8160672a8a5e103df3a78da53907e8c7ed/lxml-5.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c2fb570d7823c2bbaf8b419ba6e5662137f8166e364a8b2b91051a1fb40ab8b", size = 4744835 }, - { url = "https://files.pythonhosted.org/packages/82/9d/b000c15538b60934589e83826ecbc437a1586488d7c13f8ee5ff1f79a9b8/lxml-5.3.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0c120f43553ec759f8de1fee2f4794452b0946773299d44c36bfe18e83caf002", size = 5316649 }, - { url = "https://files.pythonhosted.org/packages/e3/ee/ffbb9eaff5e541922611d2c56b175c45893d1c0b8b11e5a497708a6a3b3b/lxml-5.3.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:562e7494778a69086f0312ec9689f6b6ac1c6b65670ed7d0267e49f57ffa08c4", size = 4812046 }, - { url = "https://files.pythonhosted.org/packages/15/ff/7ff89d567485c7b943cdac316087f16b2399a8b997007ed352a1248397e5/lxml-5.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:423b121f7e6fa514ba0c7918e56955a1d4470ed35faa03e3d9f0e3baa4c7e492", size = 4918597 }, - { url = "https://files.pythonhosted.org/packages/c6/a3/535b6ed8c048412ff51268bdf4bf1cf052a37aa7e31d2e6518038a883b29/lxml-5.3.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c00f323cc00576df6165cc9d21a4c21285fa6b9989c5c39830c3903dc4303ef3", size = 4738071 }, - { url = "https://files.pythonhosted.org/packages/7a/8f/cbbfa59cb4d4fd677fe183725a76d8c956495d7a3c7f111ab8f5e13d2e83/lxml-5.3.0-cp313-cp313-manylinux_2_28_ppc64le.whl", hash = "sha256:1fdc9fae8dd4c763e8a31e7630afef517eab9f5d5d31a278df087f307bf601f4", size = 5342213 }, - { url = "https://files.pythonhosted.org/packages/5c/fb/db4c10dd9958d4b52e34d1d1f7c1f434422aeaf6ae2bbaaff2264351d944/lxml-5.3.0-cp313-cp313-manylinux_2_28_s390x.whl", hash = "sha256:658f2aa69d31e09699705949b5fc4719cbecbd4a97f9656a232e7d6c7be1a367", size = 4893749 }, - { url = "https://files.pythonhosted.org/packages/f2/38/bb4581c143957c47740de18a3281a0cab7722390a77cc6e610e8ebf2d736/lxml-5.3.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1473427aff3d66a3fa2199004c3e601e6c4500ab86696edffdbc84954c72d832", size = 4945901 }, - { url = "https://files.pythonhosted.org/packages/fc/d5/18b7de4960c731e98037bd48fa9f8e6e8f2558e6fbca4303d9b14d21ef3b/lxml-5.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a87de7dd873bf9a792bf1e58b1c3887b9264036629a5bf2d2e6579fe8e73edff", size = 4815447 }, - { url = "https://files.pythonhosted.org/packages/97/a8/cd51ceaad6eb849246559a8ef60ae55065a3df550fc5fcd27014361c1bab/lxml-5.3.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:0d7b36afa46c97875303a94e8f3ad932bf78bace9e18e603f2085b652422edcd", size = 5411186 }, - { url = "https://files.pythonhosted.org/packages/89/c3/1e3dabab519481ed7b1fdcba21dcfb8832f57000733ef0e71cf6d09a5e03/lxml-5.3.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:cf120cce539453ae086eacc0130a324e7026113510efa83ab42ef3fcfccac7fb", size = 5324481 }, - { url = "https://files.pythonhosted.org/packages/b6/17/71e9984cf0570cd202ac0a1c9ed5c1b8889b0fc8dc736f5ef0ffb181c284/lxml-5.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:df5c7333167b9674aa8ae1d4008fa4bc17a313cc490b2cca27838bbdcc6bb15b", size = 5011053 }, - { url = "https://files.pythonhosted.org/packages/69/68/9f7e6d3312a91e30829368c2b3217e750adef12a6f8eb10498249f4e8d72/lxml-5.3.0-cp313-cp313-win32.whl", hash = "sha256:c802e1c2ed9f0c06a65bc4ed0189d000ada8049312cfeab6ca635e39c9608957", size = 3485634 }, - { url = "https://files.pythonhosted.org/packages/7d/db/214290d58ad68c587bd5d6af3d34e56830438733d0d0856c0275fde43652/lxml-5.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:406246b96d552e0503e17a1006fd27edac678b3fcc9f1be71a2f94b4ff61528d", size = 3814417 }, - { url = "https://files.pythonhosted.org/packages/99/f7/b73a431c8500565aa500e99e60b448d305eaf7c0b4c893c7c5a8a69cc595/lxml-5.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7b1cd427cb0d5f7393c31b7496419da594fe600e6fdc4b105a54f82405e6626c", size = 3925431 }, - { url = "https://files.pythonhosted.org/packages/db/48/4a206623c0d093d0e3b15f415ffb4345b0bdf661a3d0b15a112948c033c7/lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51806cfe0279e06ed8500ce19479d757db42a30fd509940b1701be9c86a5ff9a", size = 4216683 }, - { url = "https://files.pythonhosted.org/packages/54/47/577820c45dd954523ae8453b632d91e76da94ca6d9ee40d8c98dd86f916b/lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee70d08fd60c9565ba8190f41a46a54096afa0eeb8f76bd66f2c25d3b1b83005", size = 4326732 }, - { url = "https://files.pythonhosted.org/packages/68/de/96cb6d3269bc994b4f5ede8ca7bf0840f5de0a278bc6e50cb317ff71cafa/lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:8dc2c0395bea8254d8daebc76dcf8eb3a95ec2a46fa6fae5eaccee366bfe02ce", size = 4218377 }, - { url = "https://files.pythonhosted.org/packages/a5/43/19b1ef6cbffa4244a217f95cc5f41a6cb4720fed33510a49670b03c5f1a0/lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6ba0d3dcac281aad8a0e5b14c7ed6f9fa89c8612b47939fc94f80b16e2e9bc83", size = 4351237 }, - { url = "https://files.pythonhosted.org/packages/ba/b2/6a22fb5c0885da3b00e116aee81f0b829ec9ac8f736cd414b4a09413fc7d/lxml-5.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba", size = 3487557 }, -] - [[package]] name = "markdown-it-py" version = "3.0.0" @@ -1002,18 +615,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739 }, ] -[[package]] -name = "mbstrdecoder" -version = "1.1.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "chardet" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/31/ab/05ae008357c8bdb6245ebf8a101d99f26c096e0ea20800b318153da23796/mbstrdecoder-1.1.4.tar.gz", hash = "sha256:8105ef9cf6b7d7d69fe7fd6b68a2d8f281ca9b365d7a9b670be376b2e6c81b21", size = 14527 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/30/ac/5ce64a1d4cce00390beab88622a290420401f1cabf05caf2fc0995157c21/mbstrdecoder-1.1.4-py3-none-any.whl", hash = "sha256:03dae4ec50ec0d2ff4743e63fdbd5e0022815857494d35224b60775d3d934a8c", size = 7933 }, -] - [[package]] name = "mdurl" version = "0.1.2" @@ -1023,15 +624,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = 
"sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 }, ] -[[package]] -name = "more-itertools" -version = "10.6.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/88/3b/7fa1fe835e2e93fd6d7b52b2f95ae810cf5ba133e1845f726f5a992d62c2/more-itertools-10.6.0.tar.gz", hash = "sha256:2cd7fad1009c31cc9fb6a035108509e6547547a7a738374f10bd49a09eb3ee3b", size = 125009 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/23/62/0fe302c6d1be1c777cab0616e6302478251dfbf9055ad426f5d0def75c89/more_itertools-10.6.0-py3-none-any.whl", hash = "sha256:6eb054cb4b6db1473f6e15fcc676a08e4732548acd47c708f0e179c2c7c01e89", size = 63038 }, -] - [[package]] name = "mpmath" version = "1.3.0" @@ -1164,21 +756,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d9/9d/0cc1e82849070ff3cbee69f326cb48a839407bcd15d8844443c30a5e7509/ninja-1.11.1.3-py3-none-win_arm64.whl", hash = "sha256:17978ad611d8ead578d83637f5ae80c2261b033db0b493a7ce94f88623f29e1b", size = 270571 }, ] -[[package]] -name = "nltk" -version = "3.9.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "joblib" }, - { name = "regex" }, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3c/87/db8be88ad32c2d042420b6fd9ffd4a149f9a0d7f0e86b3f543be2eeeedd2/nltk-3.9.1.tar.gz", hash = "sha256:87d127bd3de4bd89a4f81265e5fa59cb1b199b27440175370f7417d2bc7ae868", size = 2904691 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4d/66/7d9e26593edda06e8cb531874633f7c2372279c3b0f46235539fe546df8b/nltk-3.9.1-py3-none-any.whl", hash = "sha256:4fa26829c5b00715afe3061398a8989dc643b92ce7dd93fb4585a70930d168a1", size = 1505442 }, -] - [[package]] name = "nodeenv" version = "1.9.1" @@ -1188,45 +765,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314 }, ] -[[package]] -name = "numexpr" -version = "2.10.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/21/67/c7415cf04ebe418193cfd6595ae03e3a64d76dac7b9c010098b39cc7992e/numexpr-2.10.2.tar.gz", hash = "sha256:b0aff6b48ebc99d2f54f27b5f73a58cb92fde650aeff1b397c71c8788b4fff1a", size = 106787 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fd/dc/bd84219318826d138b7e729ac3ffce3c706ab9d810ce74326a55c7252dd1/numexpr-2.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5b0e82d2109c1d9e63fcd5ea177d80a11b881157ab61178ddbdebd4c561ea46", size = 145011 }, - { url = "https://files.pythonhosted.org/packages/31/6a/b1f08141283327478a57490c0ab3f26a634d4741ff33b9e22f760a7cedb0/numexpr-2.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3fc2b8035a0c2cdc352e58c3875cb668836018065cbf5752cb531015d9a568d8", size = 134777 }, - { url = "https://files.pythonhosted.org/packages/7c/d6/6641864b0446ce472330de7644c78f90bd7e55d902046b44161f92721279/numexpr-2.10.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0db5ff5183935d1612653559c319922143e8fa3019007696571b13135f216458", size = 408893 }, - { url = "https://files.pythonhosted.org/packages/25/ab/cb5809cb1f66431632d63dc028c58cb91492725c74dddc4b97ba62e88a92/numexpr-2.10.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:15f59655458056fdb3a621b1bb8e071581ccf7e823916c7568bb7c9a3e393025", size = 397305 }, - { url = "https://files.pythonhosted.org/packages/9c/a0/29bcb31a9debb743e3dc46bacd55f4f6ee6a77d95eda5c8dca19a29c0627/numexpr-2.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ce8cccf944339051e44a49a124a06287fe3066d0acbff33d1aa5aee10a96abb7", size = 1378789 }, - { url = "https://files.pythonhosted.org/packages/cc/72/415262a7bdda706c41bf8254311a5ca13d3b8532341ab478be4583d7061a/numexpr-2.10.2-cp310-cp310-win32.whl", hash = "sha256:ba85371c9a8d03e115f4dfb6d25dfbce05387002b9bc85016af939a1da9624f0", size = 151935 }, - { url = "https://files.pythonhosted.org/packages/71/fa/0124f0c2a502a0bac4553c8a171c551f154cf80a83a15e40d30c43e48a7e/numexpr-2.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:deb64235af9eeba59fcefa67e82fa80cfc0662e1b0aa373b7118a28da124d51d", size = 144961 }, - { url = "https://files.pythonhosted.org/packages/de/b7/f25d6166f92ef23737c1c90416144492a664f0a56510d90f7c6577c2cd14/numexpr-2.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6b360eb8d392483410fe6a3d5a7144afa298c9a0aa3e9fe193e89590b47dd477", size = 145055 }, - { url = "https://files.pythonhosted.org/packages/66/64/428361ea6415826332f38ef2dd5c3abf4e7e601f033bfc9be68b680cb765/numexpr-2.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d9a42f5c24880350d88933c4efee91b857c378aaea7e8b86221fff569069841e", size = 134743 }, - { url = "https://files.pythonhosted.org/packages/3f/fb/639ec91d2ea7b4a5d66e26e8ef8e06b020c8e9b9ebaf3bab7b0a9bee472e/numexpr-2.10.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:83fcb11988b57cc25b028a36d285287d706d1f536ebf2662ea30bd990e0de8b9", size = 410397 }, - { url = "https://files.pythonhosted.org/packages/89/5a/0f5c5b8a3a6d34eeecb30d0e2f722d50b9b38c0e175937e7c6268ffab997/numexpr-2.10.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4213a92efa9770bc28e3792134e27c7e5c7e97068bdfb8ba395baebbd12f991b", size = 398902 }, - { url = "https://files.pythonhosted.org/packages/a2/d5/ec734e735eba5a753efed5be3707ee7447ebd371772f8081b65a4153fb97/numexpr-2.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ebdbef5763ca057eea0c2b5698e4439d084a0505d9d6e94f4804f26e8890c45e", size = 1380354 }, - { url = "https://files.pythonhosted.org/packages/30/51/406e572531d817480bd612ee08239a36ee82865fea02fce569f15631f4ee/numexpr-2.10.2-cp311-cp311-win32.whl", hash = "sha256:3bf01ec502d89944e49e9c1b5cc7c7085be8ca2eb9dd46a0eafd218afbdbd5f5", size = 151938 }, - { url = "https://files.pythonhosted.org/packages/04/32/5882ed1dbd96234f327a73316a481add151ff827cfaf2ea24fb4d5ad04db/numexpr-2.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:e2d0ae24b0728e4bc3f1d3f33310340d67321d36d6043f7ce26897f4f1042db0", size = 144961 }, - { url = "https://files.pythonhosted.org/packages/2b/96/d5053dea06d8298ae8052b4b049cbf8ef74998e28d57166cc27b8ae909e2/numexpr-2.10.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b5323a46e75832334f1af86da1ef6ff0add00fbacdd266250be872b438bdf2be", size = 145029 }, - { url = "https://files.pythonhosted.org/packages/3e/3c/fcd5a812ed5dda757b2d9ef2764a3e1cca6f6d1f02dbf113dc23a2c7702a/numexpr-2.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a42963bd4c62d8afa4f51e7974debfa39a048383f653544ab54f50a2f7ec6c42", size = 134851 }, - { url = "https://files.pythonhosted.org/packages/0a/52/0ed3b306d8c9944129bce97fec73a2caff13adbd7e1df148d546d7eb2d4d/numexpr-2.10.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:5191ba8f2975cb9703afc04ae845a929e193498c0e8bcd408ecb147b35978470", size = 411837 }, - { url = "https://files.pythonhosted.org/packages/7d/9c/6b671dd3fb67d7e7da93cb76b7c5277743f310a216b7856bb18776bb3371/numexpr-2.10.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:97298b14f0105a794bea06fd9fbc5c423bd3ff4d88cbc618860b83eb7a436ad6", size = 400577 }, - { url = "https://files.pythonhosted.org/packages/ea/4d/a167d1a215fe10ce58c45109f2869fd13aa0eef66f7e8c69af68be45d436/numexpr-2.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f9d7805ccb6be2d3b0f7f6fad3707a09ac537811e8e9964f4074d28cb35543db", size = 1381735 }, - { url = "https://files.pythonhosted.org/packages/c1/d4/17e4434f989e4917d31cbd88a043e1c9c16958149cf43fa622987111392b/numexpr-2.10.2-cp312-cp312-win32.whl", hash = "sha256:cb845b2d4f9f8ef0eb1c9884f2b64780a85d3b5ae4eeb26ae2b0019f489cd35e", size = 152102 }, - { url = "https://files.pythonhosted.org/packages/b8/25/9ae599994076ef2a42d35ff6b0430da002647f212567851336a6c7b132d6/numexpr-2.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:57b59cbb5dcce4edf09cd6ce0b57ff60312479930099ca8d944c2fac896a1ead", size = 145061 }, - { url = "https://files.pythonhosted.org/packages/8c/cb/2ea1848c46e4d75073c038dd75628d1aa442975303264ed230bf90f74f44/numexpr-2.10.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a37d6a51ec328c561b2ca8a2bef07025642eca995b8553a5267d0018c732976d", size = 145035 }, - { url = "https://files.pythonhosted.org/packages/ec/cf/bb2bcd81d6f3243590e19ac3e7795a1a370f3ebcd8ecec1f46dcd5333f37/numexpr-2.10.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:81d1dde7dd6166d8ff5727bb46ab42a6b0048db0e97ceb84a121334a404a800f", size = 134858 }, - { url = "https://files.pythonhosted.org/packages/48/9b/c9128ffb453205c2a4c84a3abed35447c7591c2c2812e77e34fd238cb2bb/numexpr-2.10.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5b3f814437d5a10797f8d89d2037cca2c9d9fa578520fc911f894edafed6ea3e", size = 415517 }, - { url = "https://files.pythonhosted.org/packages/7e/b0/64c04c9f8b4a563218d00daa1ec4563364961b79025162c5276ab2c7c407/numexpr-2.10.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9309f2e43fe6e4560699ef5c27d7a848b3ff38549b6b57194207cf0e88900527", size = 403846 }, - { url = "https://files.pythonhosted.org/packages/80/35/60e9041fd709fe98dd3109d73a03cdffaeb6ee2089179155f5c3754e9934/numexpr-2.10.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ebb73b93f5c4d6994f357fa5a47a9f7a5485577e633b3c46a603cb01445bbb19", size = 1381659 }, - { url = "https://files.pythonhosted.org/packages/bd/5a/955bf5b5cf8f3de7b044a999e36327e14191fa073ed0e329456ed0f8161d/numexpr-2.10.2-cp313-cp313-win32.whl", hash = "sha256:ec04c9a3c050c175348801e27c18c68d28673b7bfb865ef88ce333be523bbc01", size = 152105 }, - { url = "https://files.pythonhosted.org/packages/be/7a/8ce360a1848bb5bcc30a414493371678f43790ece397f8652d5f65757e57/numexpr-2.10.2-cp313-cp313-win_amd64.whl", hash = "sha256:d7a3fc83c959288544db3adc70612475d8ad53a66c69198105c74036182d10dd", size = 145060 }, -] - [[package]] name = "numpy" version = "2.2.2" @@ -1375,6 +913,14 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, ] +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.6.2" +source = { registry = 
"https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/a8/bcbb63b53a4b1234feeafb65544ee55495e1bb37ec31b999b963cbccfd1d/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9", size = 150057751 }, +] + [[package]] name = "nvidia-nccl-cu12" version = "2.21.5" @@ -1399,15 +945,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 }, ] -[[package]] -name = "oauthlib" -version = "3.2.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6d/fa/fbf4001037904031639e6bfbfc02badfc7e12f137a8afa254df6c4c8a670/oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918", size = 177352 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/80/cab10959dc1faead58dc8384a781dfbf93cb4d33d50988f7a69f1b7c9bbe/oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca", size = 151688 }, -] - [[package]] name = "packaging" version = "24.2" @@ -1465,36 +1002,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436 }, ] -[[package]] -name = "pathvalidate" -version = "3.2.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/92/87/c7a2f51cc62df0495acb0ed2533a7c74cc895e569a1b020ee5f6e9fa4e21/pathvalidate-3.2.3.tar.gz", hash = "sha256:59b5b9278e30382d6d213497623043ebe63f10e29055be4419a9c04c721739cb", size = 61717 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/50/14/c5a0e1a947909810fc4c043b84cac472b70e438148d34f5393be1bac663f/pathvalidate-3.2.3-py3-none-any.whl", hash = "sha256:5eaf0562e345d4b6d0c0239d0f690c3bd84d2a9a3c4c73b99ea667401b27bee1", size = 24130 }, -] - -[[package]] -name = "peft" -version = "0.14.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "accelerate" }, - { name = "huggingface-hub" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "psutil" }, - { name = "pyyaml" }, - { name = "safetensors" }, - { name = "torch" }, - { name = "tqdm" }, - { name = "transformers" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/21/33/fb0c31eaa8162c01e9250b21aa65d46a5339f17a818a97c68391db2ff44b/peft-0.14.0.tar.gz", hash = "sha256:546d69af7b42f5ef715a3d3261ed818bc917ae6055e5d7e187ed3f2c76ad72dc", size = 411902 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/05/e58e3aaa36544d30a917814e336fc65a746f708e5874945e92999bc22fa3/peft-0.14.0-py3-none-any.whl", hash = "sha256:2f04f3a870c3baf30f15e7dcaa5dd70d3e54cfdd146d3c6c187735d3ae0a0700", size = 374831 }, -] - [[package]] name = "platformdirs" version = "4.3.6" @@ -1513,18 +1020,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, ] -[[package]] -name = "portalocker" 
-version = "3.1.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pywin32", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ac/91/8bfe23e1f7f630f2061ef38b5225d9fda9068d6a30fcbc187951e678e630/portalocker-3.1.1.tar.gz", hash = "sha256:ec20f6dda2ad9ce89fa399a5f31f4f1495f515958f0cb7ca6543cef7bb5a749e", size = 43708 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f7/60/1974cfdd5bb770568ddc6f89f3e0df4cfdd1acffd5a609dff5e95f48c6e2/portalocker-3.1.1-py3-none-any.whl", hash = "sha256:80e984e24de292ff258a5bea0e4f3f778fff84c0ae1275dbaebc4658de4aacb3", size = 19661 }, -] - [[package]] name = "pre-commit" version = "4.1.0" @@ -1614,18 +1109,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/b6/c5319caea262f4821995dca2107483b94a3345d4607ad797c76cb9c36bcc/propcache-0.2.1-py3-none-any.whl", hash = "sha256:52277518d6aae65536e9cea52d4e7fd2f7a66f4aa2d30ed3f2fcea620ace3c54", size = 11818 }, ] -[[package]] -name = "proto-plus" -version = "1.26.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "protobuf" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/26/79/a5c6cbb42268cfd3ddc652dc526889044a8798c688a03ff58e5e92b743c8/proto_plus-1.26.0.tar.gz", hash = "sha256:6e93d5f5ca267b54300880fff156b6a3386b3fa3f43b1da62e680fc0c586ef22", size = 56136 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/42/c3/59308ccc07b34980f9d532f7afc718a9f32b40e52cde7a740df8d55632fb/proto_plus-1.26.0-py3-none-any.whl", hash = "sha256:bf2dfaa3da281fc3187d12d224c707cb57214fb2c22ba854eb0c105a3fb2d4d7", size = 50166 }, -] - [[package]] name = "protobuf" version = "5.29.3" @@ -1697,36 +1180,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/ef/1d7975053af9d106da973bac142d0d4da71b7550a3576cc3e0b3f444d21a/pyarrow-19.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:29cd86c8001a94f768f79440bf83fee23963af5e7bc68ce3a7e5f120e17edf89", size = 42077618 }, ] -[[package]] -name = "pyasn1" -version = "0.6.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ba/e9/01f1a64245b89f039897cb0130016d79f77d52669aae6ee7b159a6c4c018/pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034", size = 145322 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629", size = 83135 }, -] - -[[package]] -name = "pyasn1-modules" -version = "0.4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyasn1" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1d/67/6afbf0d507f73c32d21084a79946bfcfca5fbc62a72057e9c23797a737c9/pyasn1_modules-0.4.1.tar.gz", hash = "sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c", size = 310028 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/77/89/bc88a6711935ba795a679ea6ebee07e128050d6382eaa35a0a47c8032bdc/pyasn1_modules-0.4.1-py3-none-any.whl", hash = "sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd", size = 181537 }, -] - -[[package]] -name = "pybind11" -version = "2.13.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/d2/c1/72b9622fcb32ff98b054f724e213c7f70d6898baa714f4516288456ceaba/pybind11-2.13.6.tar.gz", hash = "sha256:ba6af10348c12b24e92fa086b39cfba0eff619b61ac77c406167d813b096d39a", size = 218403 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/13/2f/0f24b288e2ce56f51c920137620b4434a38fd80583dbbe24fc2a1656c388/pybind11-2.13.6-py3-none-any.whl", hash = "sha256:237c41e29157b962835d356b370ededd57594a26d5894a795960f0047cb5caf5", size = 243282 }, -] - [[package]] name = "pycparser" version = "2.22" @@ -1843,24 +1296,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, ] -[[package]] -name = "pytablewriter" -version = "1.2.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "dataproperty" }, - { name = "mbstrdecoder" }, - { name = "pathvalidate" }, - { name = "setuptools" }, - { name = "tabledata" }, - { name = "tcolorpy" }, - { name = "typepy", extra = ["datetime"] }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f6/a1/617730f290f04d347103ab40bf67d317df6691b14746f6e1ea039fb57062/pytablewriter-1.2.1.tar.gz", hash = "sha256:7bd0f4f397e070e3b8a34edcf1b9257ccbb18305493d8350a5dbc9957fced959", size = 619241 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/21/4c/c199512f01c845dfe5a7840ab3aae6c60463b5dc2a775be72502dfd9170a/pytablewriter-1.2.1-py3-none-any.whl", hash = "sha256:e906ff7ff5151d70a5f66e0f7b75642a7f2dce8d893c265b79cc9cf6bc04ddb4", size = 91083 }, -] - [[package]] name = "pytest" version = "8.3.4" @@ -1892,30 +1327,11 @@ wheels = [ [[package]] name = "pytz" -version = "2024.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/3a/31/3c70bf7603cc2dca0f19bdc53b4537a797747a58875b552c8c413d963a3f/pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a", size = 319692 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/11/c3/005fcca25ce078d2cc29fd559379817424e94885510568bc1bc53d7d5846/pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725", size = 508002 }, -] - -[[package]] -name = "pywin32" -version = "308" +version = "2025.1" source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5f/57/df1c9157c8d5a05117e455d66fd7cf6dbc46974f832b1058ed4856785d8a/pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e", size = 319617 } wheels = [ - { url = "https://files.pythonhosted.org/packages/72/a6/3e9f2c474895c1bb61b11fa9640be00067b5c5b363c501ee9c3fa53aec01/pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e", size = 5927028 }, - { url = "https://files.pythonhosted.org/packages/d9/b4/84e2463422f869b4b718f79eb7530a4c1693e96b8a4e5e968de38be4d2ba/pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e", size = 6558484 }, - { url = "https://files.pythonhosted.org/packages/9f/8f/fb84ab789713f7c6feacaa08dad3ec8105b88ade8d1c4f0f0dfcaaa017d6/pywin32-308-cp310-cp310-win_arm64.whl", hash = "sha256:a5ab5381813b40f264fa3495b98af850098f814a25a63589a8e9eb12560f450c", size = 7971454 }, - { url = 
"https://files.pythonhosted.org/packages/eb/e2/02652007469263fe1466e98439831d65d4ca80ea1a2df29abecedf7e47b7/pywin32-308-cp311-cp311-win32.whl", hash = "sha256:5d8c8015b24a7d6855b1550d8e660d8daa09983c80e5daf89a273e5c6fb5095a", size = 5928156 }, - { url = "https://files.pythonhosted.org/packages/48/ef/f4fb45e2196bc7ffe09cad0542d9aff66b0e33f6c0954b43e49c33cad7bd/pywin32-308-cp311-cp311-win_amd64.whl", hash = "sha256:575621b90f0dc2695fec346b2d6302faebd4f0f45c05ea29404cefe35d89442b", size = 6559559 }, - { url = "https://files.pythonhosted.org/packages/79/ef/68bb6aa865c5c9b11a35771329e95917b5559845bd75b65549407f9fc6b4/pywin32-308-cp311-cp311-win_arm64.whl", hash = "sha256:100a5442b7332070983c4cd03f2e906a5648a5104b8a7f50175f7906efd16bb6", size = 7972495 }, - { url = "https://files.pythonhosted.org/packages/00/7c/d00d6bdd96de4344e06c4afbf218bc86b54436a94c01c71a8701f613aa56/pywin32-308-cp312-cp312-win32.whl", hash = "sha256:587f3e19696f4bf96fde9d8a57cec74a57021ad5f204c9e627e15c33ff568897", size = 5939729 }, - { url = "https://files.pythonhosted.org/packages/21/27/0c8811fbc3ca188f93b5354e7c286eb91f80a53afa4e11007ef661afa746/pywin32-308-cp312-cp312-win_amd64.whl", hash = "sha256:00b3e11ef09ede56c6a43c71f2d31857cf7c54b0ab6e78ac659497abd2834f47", size = 6543015 }, - { url = "https://files.pythonhosted.org/packages/9d/0f/d40f8373608caed2255781a3ad9a51d03a594a1248cd632d6a298daca693/pywin32-308-cp312-cp312-win_arm64.whl", hash = "sha256:9b4de86c8d909aed15b7011182c8cab38c8850de36e6afb1f0db22b8959e3091", size = 7976033 }, - { url = "https://files.pythonhosted.org/packages/a9/a4/aa562d8935e3df5e49c161b427a3a2efad2ed4e9cf81c3de636f1fdddfd0/pywin32-308-cp313-cp313-win32.whl", hash = "sha256:1c44539a37a5b7b21d02ab34e6a4d314e0788f1690d65b48e9b0b89f31abbbed", size = 5938579 }, - { url = "https://files.pythonhosted.org/packages/c7/50/b0efb8bb66210da67a53ab95fd7a98826a97ee21f1d22949863e6d588b22/pywin32-308-cp313-cp313-win_amd64.whl", hash = "sha256:fd380990e792eaf6827fcb7e187b2b4b1cede0585e3d0c9e84201ec27b9905e4", size = 6542056 }, - { url = "https://files.pythonhosted.org/packages/26/df/2b63e3e4f2df0224f8aaf6d131f54fe4e8c96400eb9df563e2aae2e1a1f9/pywin32-308-cp313-cp313-win_arm64.whl", hash = "sha256:ef313c46d4c18dfb82a2431e3051ac8f112ccee1a34f29c263c583c568db63cd", size = 7974986 }, + { url = "https://files.pythonhosted.org/packages/eb/38/ac33370d784287baa1c3d538978b5e2ea064d4c1b93ffbd12826c190dd10/pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57", size = 507930 }, ] [[package]] @@ -2046,19 +1462,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, ] -[[package]] -name = "requests-oauthlib" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "oauthlib" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", size = 55650 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", size = 
24179 }, -] - [[package]] name = "rich" version = "13.9.4" @@ -2073,70 +1476,29 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90", size = 242424 }, ] -[[package]] -name = "rouge-score" -version = "0.1.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "absl-py" }, - { name = "nltk" }, - { name = "numpy" }, - { name = "six" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e2/c5/9136736c37022a6ad27fea38f3111eb8f02fe75d067f9a985cc358653102/rouge_score-0.1.2.tar.gz", hash = "sha256:c7d4da2683e68c9abf0135ef915d63a46643666f848e558a1b9f7ead17ff0f04", size = 17400 } - -[[package]] -name = "rsa" -version = "4.9" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyasn1" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/aa/65/7d973b89c4d2351d7fb232c2e452547ddfa243e93131e7cfa766da627b52/rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21", size = 29711 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/49/97/fa78e3d2f65c02c8e1268b9aba606569fe97f6c8f7c2d74394553347c145/rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7", size = 34315 }, -] - [[package]] name = "ruff" -version = "0.9.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1e/7f/60fda2eec81f23f8aa7cbbfdf6ec2ca11eb11c273827933fb2541c2ce9d8/ruff-0.9.3.tar.gz", hash = "sha256:8293f89985a090ebc3ed1064df31f3b4b56320cdfcec8b60d3295bddb955c22a", size = 3586740 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f9/77/4fb790596d5d52c87fd55b7160c557c400e90f6116a56d82d76e95d9374a/ruff-0.9.3-py3-none-linux_armv6l.whl", hash = "sha256:7f39b879064c7d9670197d91124a75d118d00b0990586549949aae80cdc16624", size = 11656815 }, - { url = "https://files.pythonhosted.org/packages/a2/a8/3338ecb97573eafe74505f28431df3842c1933c5f8eae615427c1de32858/ruff-0.9.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:a187171e7c09efa4b4cc30ee5d0d55a8d6c5311b3e1b74ac5cb96cc89bafc43c", size = 11594821 }, - { url = "https://files.pythonhosted.org/packages/8e/89/320223c3421962762531a6b2dd58579b858ca9916fb2674874df5e97d628/ruff-0.9.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:c59ab92f8e92d6725b7ded9d4a31be3ef42688a115c6d3da9457a5bda140e2b4", size = 11040475 }, - { url = "https://files.pythonhosted.org/packages/b2/bd/1d775eac5e51409535804a3a888a9623e87a8f4b53e2491580858a083692/ruff-0.9.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dc153c25e715be41bb228bc651c1e9b1a88d5c6e5ed0194fa0dfea02b026439", size = 11856207 }, - { url = "https://files.pythonhosted.org/packages/7f/c6/3e14e09be29587393d188454064a4aa85174910d16644051a80444e4fd88/ruff-0.9.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:646909a1e25e0dc28fbc529eab8eb7bb583079628e8cbe738192853dbbe43af5", size = 11420460 }, - { url = "https://files.pythonhosted.org/packages/ef/42/b7ca38ffd568ae9b128a2fa76353e9a9a3c80ef19746408d4ce99217ecc1/ruff-0.9.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a5a46e09355695fbdbb30ed9889d6cf1c61b77b700a9fafc21b41f097bfbba4", size = 12605472 }, - { url = 
"https://files.pythonhosted.org/packages/a6/a1/3167023f23e3530fde899497ccfe239e4523854cb874458ac082992d206c/ruff-0.9.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c4bb09d2bbb394e3730d0918c00276e79b2de70ec2a5231cd4ebb51a57df9ba1", size = 13243123 }, - { url = "https://files.pythonhosted.org/packages/d0/b4/3c600758e320f5bf7de16858502e849f4216cb0151f819fa0d1154874802/ruff-0.9.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96a87ec31dc1044d8c2da2ebbed1c456d9b561e7d087734336518181b26b3aa5", size = 12744650 }, - { url = "https://files.pythonhosted.org/packages/be/38/266fbcbb3d0088862c9bafa8b1b99486691d2945a90b9a7316336a0d9a1b/ruff-0.9.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bb7554aca6f842645022fe2d301c264e6925baa708b392867b7a62645304df4", size = 14458585 }, - { url = "https://files.pythonhosted.org/packages/63/a6/47fd0e96990ee9b7a4abda62de26d291bd3f7647218d05b7d6d38af47c30/ruff-0.9.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cabc332b7075a914ecea912cd1f3d4370489c8018f2c945a30bcc934e3bc06a6", size = 12419624 }, - { url = "https://files.pythonhosted.org/packages/84/5d/de0b7652e09f7dda49e1a3825a164a65f4998175b6486603c7601279baad/ruff-0.9.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:33866c3cc2a575cbd546f2cd02bdd466fed65118e4365ee538a3deffd6fcb730", size = 11843238 }, - { url = "https://files.pythonhosted.org/packages/9e/be/3f341ceb1c62b565ec1fb6fd2139cc40b60ae6eff4b6fb8f94b1bb37c7a9/ruff-0.9.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:006e5de2621304c8810bcd2ee101587712fa93b4f955ed0985907a36c427e0c2", size = 11484012 }, - { url = "https://files.pythonhosted.org/packages/a3/c8/ff8acbd33addc7e797e702cf00bfde352ab469723720c5607b964491d5cf/ruff-0.9.3-py3-none-musllinux_1_2_i686.whl", hash = "sha256:ba6eea4459dbd6b1be4e6bfc766079fb9b8dd2e5a35aff6baee4d9b1514ea519", size = 12038494 }, - { url = "https://files.pythonhosted.org/packages/73/b1/8d9a2c0efbbabe848b55f877bc10c5001a37ab10aca13c711431673414e5/ruff-0.9.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:90230a6b8055ad47d3325e9ee8f8a9ae7e273078a66401ac66df68943ced029b", size = 12473639 }, - { url = "https://files.pythonhosted.org/packages/cb/44/a673647105b1ba6da9824a928634fe23186ab19f9d526d7bdf278cd27bc3/ruff-0.9.3-py3-none-win32.whl", hash = "sha256:eabe5eb2c19a42f4808c03b82bd313fc84d4e395133fb3fc1b1516170a31213c", size = 9834353 }, - { url = "https://files.pythonhosted.org/packages/c3/01/65cadb59bf8d4fbe33d1a750103e6883d9ef302f60c28b73b773092fbde5/ruff-0.9.3-py3-none-win_amd64.whl", hash = "sha256:040ceb7f20791dfa0e78b4230ee9dce23da3b64dd5848e40e3bf3ab76468dcf4", size = 10821444 }, - { url = "https://files.pythonhosted.org/packages/69/cb/b3fe58a136a27d981911cba2f18e4b29f15010623b79f0f2510fd0d31fd3/ruff-0.9.3-py3-none-win_arm64.whl", hash = "sha256:800d773f6d4d33b0a3c60e2c6ae8f4c202ea2de056365acfa519aa48acf28e0b", size = 10038168 }, -] - -[[package]] -name = "sacrebleu" -version = "2.5.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama" }, - { name = "lxml" }, - { name = "numpy" }, - { name = "portalocker" }, - { name = "regex" }, - { name = "tabulate" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/01/14/8526cf8a5b912b618e7d6ed319a5b1876788bebba1f9a660e1291832c1cc/sacrebleu-2.5.1.tar.gz", hash = "sha256:1a088cc1c74ffaff0759c3191a85db09eecfa7a52e09be244e319d8d64e2fb11", size = 1896900 } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/cd/45/7b55a7bd7e5c5b573b40ad58ba43fa09962dc5c8d71b1f573d4aeaa54a7e/sacrebleu-2.5.1-py3-none-any.whl", hash = "sha256:7c9f7ee75bec3a5bf19dd87112dfd654952130e403ad30c48298fb7da3212d5d", size = 104107 }, +version = "0.9.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2a/e1/e265aba384343dd8ddd3083f5e33536cd17e1566c41453a5517b5dd443be/ruff-0.9.6.tar.gz", hash = "sha256:81761592f72b620ec8fa1068a6fd00e98a5ebee342a3642efd84454f3031dca9", size = 3639454 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/e3/3d2c022e687e18cf5d93d6bfa2722d46afc64eaa438c7fbbdd603b3597be/ruff-0.9.6-py3-none-linux_armv6l.whl", hash = "sha256:2f218f356dd2d995839f1941322ff021c72a492c470f0b26a34f844c29cdf5ba", size = 11714128 }, + { url = "https://files.pythonhosted.org/packages/e1/22/aff073b70f95c052e5c58153cba735748c9e70107a77d03420d7850710a0/ruff-0.9.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b908ff4df65dad7b251c9968a2e4560836d8f5487c2f0cc238321ed951ea0504", size = 11682539 }, + { url = "https://files.pythonhosted.org/packages/75/a7/f5b7390afd98a7918582a3d256cd3e78ba0a26165a467c1820084587cbf9/ruff-0.9.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b109c0ad2ececf42e75fa99dc4043ff72a357436bb171900714a9ea581ddef83", size = 11132512 }, + { url = "https://files.pythonhosted.org/packages/a6/e3/45de13ef65047fea2e33f7e573d848206e15c715e5cd56095589a7733d04/ruff-0.9.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1de4367cca3dac99bcbd15c161404e849bb0bfd543664db39232648dc00112dc", size = 11929275 }, + { url = "https://files.pythonhosted.org/packages/7d/f2/23d04cd6c43b2e641ab961ade8d0b5edb212ecebd112506188c91f2a6e6c/ruff-0.9.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac3ee4d7c2c92ddfdaedf0bf31b2b176fa7aa8950efc454628d477394d35638b", size = 11466502 }, + { url = "https://files.pythonhosted.org/packages/b5/6f/3a8cf166f2d7f1627dd2201e6cbc4cb81f8b7d58099348f0c1ff7b733792/ruff-0.9.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5dc1edd1775270e6aa2386119aea692039781429f0be1e0949ea5884e011aa8e", size = 12676364 }, + { url = "https://files.pythonhosted.org/packages/f5/c4/db52e2189983c70114ff2b7e3997e48c8318af44fe83e1ce9517570a50c6/ruff-0.9.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4a091729086dffa4bd070aa5dab7e39cc6b9d62eb2bef8f3d91172d30d599666", size = 13335518 }, + { url = "https://files.pythonhosted.org/packages/66/44/545f8a4d136830f08f4d24324e7db957c5374bf3a3f7a6c0bc7be4623a37/ruff-0.9.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d1bbc6808bf7b15796cef0815e1dfb796fbd383e7dbd4334709642649625e7c5", size = 12823287 }, + { url = "https://files.pythonhosted.org/packages/c5/26/8208ef9ee7431032c143649a9967c3ae1aae4257d95e6f8519f07309aa66/ruff-0.9.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:589d1d9f25b5754ff230dce914a174a7c951a85a4e9270613a2b74231fdac2f5", size = 14592374 }, + { url = "https://files.pythonhosted.org/packages/31/70/e917781e55ff39c5b5208bda384fd397ffd76605e68544d71a7e40944945/ruff-0.9.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc61dd5131742e21103fbbdcad683a8813be0e3c204472d520d9a5021ca8b217", size = 12500173 }, + { url = "https://files.pythonhosted.org/packages/84/f5/e4ddee07660f5a9622a9c2b639afd8f3104988dc4f6ba0b73ffacffa9a8c/ruff-0.9.6-py3-none-musllinux_1_2_aarch64.whl", hash = 
"sha256:5e2d9126161d0357e5c8f30b0bd6168d2c3872372f14481136d13de9937f79b6", size = 11906555 }, + { url = "https://files.pythonhosted.org/packages/f1/2b/6ff2fe383667075eef8656b9892e73dd9b119b5e3add51298628b87f6429/ruff-0.9.6-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:68660eab1a8e65babb5229a1f97b46e3120923757a68b5413d8561f8a85d4897", size = 11538958 }, + { url = "https://files.pythonhosted.org/packages/3c/db/98e59e90de45d1eb46649151c10a062d5707b5b7f76f64eb1e29edf6ebb1/ruff-0.9.6-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c4cae6c4cc7b9b4017c71114115db0445b00a16de3bcde0946273e8392856f08", size = 12117247 }, + { url = "https://files.pythonhosted.org/packages/ec/bc/54e38f6d219013a9204a5a2015c09e7a8c36cedcd50a4b01ac69a550b9d9/ruff-0.9.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:19f505b643228b417c1111a2a536424ddde0db4ef9023b9e04a46ed8a1cb4656", size = 12554647 }, + { url = "https://files.pythonhosted.org/packages/a5/7d/7b461ab0e2404293c0627125bb70ac642c2e8d55bf590f6fce85f508f1b2/ruff-0.9.6-py3-none-win32.whl", hash = "sha256:194d8402bceef1b31164909540a597e0d913c0e4952015a5b40e28c146121b5d", size = 9949214 }, + { url = "https://files.pythonhosted.org/packages/ee/30/c3cee10f915ed75a5c29c1e57311282d1a15855551a64795c1b2bbe5cf37/ruff-0.9.6-py3-none-win_amd64.whl", hash = "sha256:03482d5c09d90d4ee3f40d97578423698ad895c87314c4de39ed2af945633caa", size = 10999914 }, + { url = "https://files.pythonhosted.org/packages/e8/a8/d71f44b93e3aa86ae232af1f2126ca7b95c0f515ec135462b3e1f351441c/ruff-0.9.6-py3-none-win_arm64.whl", hash = "sha256:0e2bb706a2be7ddfea4a4af918562fdc1bcb16df255e5fa595bbd800ce322a5a", size = 10177499 }, ] [[package]] @@ -2161,105 +1523,17 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/ca/aa489392ec6fb59223ffce825461e1f811a3affd417121a2088be7a5758b/safetensors-0.5.2-cp38-abi3-win_amd64.whl", hash = "sha256:78abdddd03a406646107f973c7843276e7b64e5e32623529dc17f3d94a20f589", size = 303756 }, ] -[[package]] -name = "scikit-learn" -version = "1.6.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "joblib" }, - { name = "numpy" }, - { name = "scipy" }, - { name = "threadpoolctl" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9e/a5/4ae3b3a0755f7b35a280ac90b28817d1f380318973cff14075ab41ef50d9/scikit_learn-1.6.1.tar.gz", hash = "sha256:b4fc2525eca2c69a59260f583c56a7557c6ccdf8deafdba6e060f94c1c59738e", size = 7068312 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2e/3a/f4597eb41049110b21ebcbb0bcb43e4035017545daa5eedcfeb45c08b9c5/scikit_learn-1.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d056391530ccd1e501056160e3c9673b4da4805eb67eb2bdf4e983e1f9c9204e", size = 12067702 }, - { url = "https://files.pythonhosted.org/packages/37/19/0423e5e1fd1c6ec5be2352ba05a537a473c1677f8188b9306097d684b327/scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0c8d036eb937dbb568c6242fa598d551d88fb4399c0344d95c001980ec1c7d36", size = 11112765 }, - { url = "https://files.pythonhosted.org/packages/70/95/d5cb2297a835b0f5fc9a77042b0a2d029866379091ab8b3f52cc62277808/scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8634c4bd21a2a813e0a7e3900464e6d593162a29dd35d25bdf0103b3fce60ed5", size = 12643991 }, - { url = "https://files.pythonhosted.org/packages/b7/91/ab3c697188f224d658969f678be86b0968ccc52774c8ab4a86a07be13c25/scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:775da975a471c4f6f467725dff0ced5c7ac7bda5e9316b260225b48475279a1b", size = 13497182 }, - { url = "https://files.pythonhosted.org/packages/17/04/d5d556b6c88886c092cc989433b2bab62488e0f0dafe616a1d5c9cb0efb1/scikit_learn-1.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:8a600c31592bd7dab31e1c61b9bbd6dea1b3433e67d264d17ce1017dbdce8002", size = 11125517 }, - { url = "https://files.pythonhosted.org/packages/6c/2a/e291c29670795406a824567d1dfc91db7b699799a002fdaa452bceea8f6e/scikit_learn-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:72abc587c75234935e97d09aa4913a82f7b03ee0b74111dcc2881cba3c5a7b33", size = 12102620 }, - { url = "https://files.pythonhosted.org/packages/25/92/ee1d7a00bb6b8c55755d4984fd82608603a3cc59959245068ce32e7fb808/scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b3b00cdc8f1317b5f33191df1386c0befd16625f49d979fe77a8d44cae82410d", size = 11116234 }, - { url = "https://files.pythonhosted.org/packages/30/cd/ed4399485ef364bb25f388ab438e3724e60dc218c547a407b6e90ccccaef/scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc4765af3386811c3ca21638f63b9cf5ecf66261cc4815c1db3f1e7dc7b79db2", size = 12592155 }, - { url = "https://files.pythonhosted.org/packages/a8/f3/62fc9a5a659bb58a03cdd7e258956a5824bdc9b4bb3c5d932f55880be569/scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25fc636bdaf1cc2f4a124a116312d837148b5e10872147bdaf4887926b8c03d8", size = 13497069 }, - { url = "https://files.pythonhosted.org/packages/a1/a6/c5b78606743a1f28eae8f11973de6613a5ee87366796583fb74c67d54939/scikit_learn-1.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:fa909b1a36e000a03c382aade0bd2063fd5680ff8b8e501660c0f59f021a6415", size = 11139809 }, - { url = "https://files.pythonhosted.org/packages/0a/18/c797c9b8c10380d05616db3bfb48e2a3358c767affd0857d56c2eb501caa/scikit_learn-1.6.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:926f207c804104677af4857b2c609940b743d04c4c35ce0ddc8ff4f053cddc1b", size = 12104516 }, - { url = "https://files.pythonhosted.org/packages/c4/b7/2e35f8e289ab70108f8cbb2e7a2208f0575dc704749721286519dcf35f6f/scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2c2cae262064e6a9b77eee1c8e768fc46aa0b8338c6a8297b9b6759720ec0ff2", size = 11167837 }, - { url = "https://files.pythonhosted.org/packages/a4/f6/ff7beaeb644bcad72bcfd5a03ff36d32ee4e53a8b29a639f11bcb65d06cd/scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1061b7c028a8663fb9a1a1baf9317b64a257fcb036dae5c8752b2abef31d136f", size = 12253728 }, - { url = "https://files.pythonhosted.org/packages/29/7a/8bce8968883e9465de20be15542f4c7e221952441727c4dad24d534c6d99/scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e69fab4ebfc9c9b580a7a80111b43d214ab06250f8a7ef590a4edf72464dd86", size = 13147700 }, - { url = "https://files.pythonhosted.org/packages/62/27/585859e72e117fe861c2079bcba35591a84f801e21bc1ab85bce6ce60305/scikit_learn-1.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:70b1d7e85b1c96383f872a519b3375f92f14731e279a7b4c6cfd650cf5dffc52", size = 11110613 }, - { url = "https://files.pythonhosted.org/packages/2e/59/8eb1872ca87009bdcdb7f3cdc679ad557b992c12f4b61f9250659e592c63/scikit_learn-1.6.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2ffa1e9e25b3d93990e74a4be2c2fc61ee5af85811562f1288d5d055880c4322", size = 12010001 }, - { url = 
"https://files.pythonhosted.org/packages/9d/05/f2fc4effc5b32e525408524c982c468c29d22f828834f0625c5ef3d601be/scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:dc5cf3d68c5a20ad6d571584c0750ec641cc46aeef1c1507be51300e6003a7e1", size = 11096360 }, - { url = "https://files.pythonhosted.org/packages/c8/e4/4195d52cf4f113573fb8ebc44ed5a81bd511a92c0228889125fac2f4c3d1/scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c06beb2e839ecc641366000ca84f3cf6fa9faa1777e29cf0c04be6e4d096a348", size = 12209004 }, - { url = "https://files.pythonhosted.org/packages/94/be/47e16cdd1e7fcf97d95b3cb08bde1abb13e627861af427a3651fcb80b517/scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8ca8cb270fee8f1f76fa9bfd5c3507d60c6438bbee5687f81042e2bb98e5a97", size = 13171776 }, - { url = "https://files.pythonhosted.org/packages/34/b0/ca92b90859070a1487827dbc672f998da95ce83edce1270fc23f96f1f61a/scikit_learn-1.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:7a1c43c8ec9fde528d664d947dc4c0789be4077a3647f232869f41d9bf50e0fb", size = 11071865 }, - { url = "https://files.pythonhosted.org/packages/12/ae/993b0fb24a356e71e9a894e42b8a9eec528d4c70217353a1cd7a48bc25d4/scikit_learn-1.6.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a17c1dea1d56dcda2fac315712f3651a1fea86565b64b48fa1bc090249cbf236", size = 11955804 }, - { url = "https://files.pythonhosted.org/packages/d6/54/32fa2ee591af44507eac86406fa6bba968d1eb22831494470d0a2e4a1eb1/scikit_learn-1.6.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:6a7aa5f9908f0f28f4edaa6963c0a6183f1911e63a69aa03782f0d924c830a35", size = 11100530 }, - { url = "https://files.pythonhosted.org/packages/3f/58/55856da1adec655bdce77b502e94a267bf40a8c0b89f8622837f89503b5a/scikit_learn-1.6.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0650e730afb87402baa88afbf31c07b84c98272622aaba002559b614600ca691", size = 12433852 }, - { url = "https://files.pythonhosted.org/packages/ff/4f/c83853af13901a574f8f13b645467285a48940f185b690936bb700a50863/scikit_learn-1.6.1-cp313-cp313t-win_amd64.whl", hash = "sha256:3f59fe08dc03ea158605170eb52b22a105f238a5d512c4470ddeca71feae8e5f", size = 11337256 }, -] - -[[package]] -name = "scipy" -version = "1.15.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/76/c6/8eb0654ba0c7d0bb1bf67bf8fbace101a8e4f250f7722371105e8b6f68fc/scipy-1.15.1.tar.gz", hash = "sha256:033a75ddad1463970c96a88063a1df87ccfddd526437136b6ee81ff0312ebdf6", size = 59407493 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/86/53/b204ce5a4433f1864001b9d16f103b9c25f5002a602ae83585d0ea5f9c4a/scipy-1.15.1-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:c64ded12dcab08afff9e805a67ff4480f5e69993310e093434b10e85dc9d43e1", size = 41414518 }, - { url = "https://files.pythonhosted.org/packages/c7/fc/54ffa7a8847f7f303197a6ba65a66104724beba2e38f328135a78f0dc480/scipy-1.15.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:5b190b935e7db569960b48840e5bef71dc513314cc4e79a1b7d14664f57fd4ff", size = 32519265 }, - { url = "https://files.pythonhosted.org/packages/f1/77/a98b8ba03d6f371dc31a38719affd53426d4665729dcffbed4afe296784a/scipy-1.15.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:4b17d4220df99bacb63065c76b0d1126d82bbf00167d1730019d2a30d6ae01ea", size = 24792859 }, - { url = 
"https://files.pythonhosted.org/packages/a7/78/70bb9f0df7444b18b108580934bfef774822e28fd34a68e5c263c7d2828a/scipy-1.15.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:63b9b6cd0333d0eb1a49de6f834e8aeaefe438df8f6372352084535ad095219e", size = 27886506 }, - { url = "https://files.pythonhosted.org/packages/14/a7/f40f6033e06de4176ddd6cc8c3ae9f10a226c3bca5d6b4ab883bc9914a14/scipy-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f151e9fb60fbf8e52426132f473221a49362091ce7a5e72f8aa41f8e0da4f25", size = 38375041 }, - { url = "https://files.pythonhosted.org/packages/17/03/390a1c5c61fd76b0fa4b3c5aa3bdd7e60f6c46f712924f1a9df5705ec046/scipy-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21e10b1dd56ce92fba3e786007322542361984f8463c6d37f6f25935a5a6ef52", size = 40597556 }, - { url = "https://files.pythonhosted.org/packages/4e/70/fa95b3ae026b97eeca58204a90868802e5155ac71b9d7bdee92b68115dd3/scipy-1.15.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5dff14e75cdbcf07cdaa1c7707db6017d130f0af9ac41f6ce443a93318d6c6e0", size = 42938505 }, - { url = "https://files.pythonhosted.org/packages/d6/07/427859116bdd71847c898180f01802691f203c3e2455a1eb496130ff07c5/scipy-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:f82fcf4e5b377f819542fbc8541f7b5fbcf1c0017d0df0bc22c781bf60abc4d8", size = 43909663 }, - { url = "https://files.pythonhosted.org/packages/8e/2e/7b71312da9c2dabff53e7c9a9d08231bc34d9d8fdabe88a6f1155b44591c/scipy-1.15.1-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:5bd8d27d44e2c13d0c1124e6a556454f52cd3f704742985f6b09e75e163d20d2", size = 41424362 }, - { url = "https://files.pythonhosted.org/packages/81/8c/ab85f1aa1cc200c796532a385b6ebf6a81089747adc1da7482a062acc46c/scipy-1.15.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:be3deeb32844c27599347faa077b359584ba96664c5c79d71a354b80a0ad0ce0", size = 32535910 }, - { url = "https://files.pythonhosted.org/packages/3b/9c/6f4b787058daa8d8da21ddff881b4320e28de4704a65ec147adb50cb2230/scipy-1.15.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:5eb0ca35d4b08e95da99a9f9c400dc9f6c21c424298a0ba876fdc69c7afacedf", size = 24809398 }, - { url = "https://files.pythonhosted.org/packages/16/2b/949460a796df75fc7a1ee1becea202cf072edbe325ebe29f6d2029947aa7/scipy-1.15.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:74bb864ff7640dea310a1377d8567dc2cb7599c26a79ca852fc184cc851954ac", size = 27918045 }, - { url = "https://files.pythonhosted.org/packages/5f/36/67fe249dd7ccfcd2a38b25a640e3af7e59d9169c802478b6035ba91dfd6d/scipy-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:667f950bf8b7c3a23b4199db24cb9bf7512e27e86d0e3813f015b74ec2c6e3df", size = 38332074 }, - { url = "https://files.pythonhosted.org/packages/fc/da/452e1119e6f720df3feb588cce3c42c5e3d628d4bfd4aec097bd30b7de0c/scipy-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:395be70220d1189756068b3173853029a013d8c8dd5fd3d1361d505b2aa58fa7", size = 40588469 }, - { url = "https://files.pythonhosted.org/packages/7f/71/5f94aceeac99a4941478af94fe9f459c6752d497035b6b0761a700f5f9ff/scipy-1.15.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ce3a000cd28b4430426db2ca44d96636f701ed12e2b3ca1f2b1dd7abdd84b39a", size = 42965214 }, - { url = "https://files.pythonhosted.org/packages/af/25/caa430865749d504271757cafd24066d596217e83326155993980bc22f97/scipy-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:3fe1d95944f9cf6ba77aa28b82dd6bb2a5b52f2026beb39ecf05304b8392864b", 
size = 43896034 }, - { url = "https://files.pythonhosted.org/packages/d8/6e/a9c42d0d39e09ed7fd203d0ac17adfea759cba61ab457671fe66e523dbec/scipy-1.15.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c09aa9d90f3500ea4c9b393ee96f96b0ccb27f2f350d09a47f533293c78ea776", size = 41478318 }, - { url = "https://files.pythonhosted.org/packages/04/ee/e3e535c81828618878a7433992fecc92fa4df79393f31a8fea1d05615091/scipy-1.15.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:0ac102ce99934b162914b1e4a6b94ca7da0f4058b6d6fd65b0cef330c0f3346f", size = 32596696 }, - { url = "https://files.pythonhosted.org/packages/c4/5e/b1b0124be8e76f87115f16b8915003eec4b7060298117715baf13f51942c/scipy-1.15.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:09c52320c42d7f5c7748b69e9f0389266fd4f82cf34c38485c14ee976cb8cb04", size = 24870366 }, - { url = "https://files.pythonhosted.org/packages/14/36/c00cb73eefda85946172c27913ab995c6ad4eee00fa4f007572e8c50cd51/scipy-1.15.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:cdde8414154054763b42b74fe8ce89d7f3d17a7ac5dd77204f0e142cdc9239e9", size = 28007461 }, - { url = "https://files.pythonhosted.org/packages/68/94/aff5c51b3799349a9d1e67a056772a0f8a47db371e83b498d43467806557/scipy-1.15.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4c9d8fc81d6a3b6844235e6fd175ee1d4c060163905a2becce8e74cb0d7554ce", size = 38068174 }, - { url = "https://files.pythonhosted.org/packages/b0/3c/0de11ca154e24a57b579fb648151d901326d3102115bc4f9a7a86526ce54/scipy-1.15.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fb57b30f0017d4afa5fe5f5b150b8f807618819287c21cbe51130de7ccdaed2", size = 40249869 }, - { url = "https://files.pythonhosted.org/packages/15/09/472e8d0a6b33199d1bb95e49bedcabc0976c3724edd9b0ef7602ccacf41e/scipy-1.15.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:491d57fe89927fa1aafbe260f4cfa5ffa20ab9f1435025045a5315006a91b8f5", size = 42629068 }, - { url = "https://files.pythonhosted.org/packages/ff/ba/31c7a8131152822b3a2cdeba76398ffb404d81d640de98287d236da90c49/scipy-1.15.1-cp312-cp312-win_amd64.whl", hash = "sha256:900f3fa3db87257510f011c292a5779eb627043dd89731b9c461cd16ef76ab3d", size = 43621992 }, - { url = "https://files.pythonhosted.org/packages/2b/bf/dd68965a4c5138a630eeed0baec9ae96e5d598887835bdde96cdd2fe4780/scipy-1.15.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:100193bb72fbff37dbd0bf14322314fc7cbe08b7ff3137f11a34d06dc0ee6b85", size = 41441136 }, - { url = "https://files.pythonhosted.org/packages/ef/5e/4928581312922d7e4d416d74c416a660addec4dd5ea185401df2269ba5a0/scipy-1.15.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:2114a08daec64980e4b4cbdf5bee90935af66d750146b1d2feb0d3ac30613692", size = 32533699 }, - { url = "https://files.pythonhosted.org/packages/32/90/03f99c43041852837686898c66767787cd41c5843d7a1509c39ffef683e9/scipy-1.15.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:6b3e71893c6687fc5e29208d518900c24ea372a862854c9888368c0b267387ab", size = 24807289 }, - { url = "https://files.pythonhosted.org/packages/9d/52/bfe82b42ae112eaba1af2f3e556275b8727d55ac6e4932e7aef337a9d9d4/scipy-1.15.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:837299eec3d19b7e042923448d17d95a86e43941104d33f00da7e31a0f715d3c", size = 27929844 }, - { url = "https://files.pythonhosted.org/packages/f6/77/54ff610bad600462c313326acdb035783accc6a3d5f566d22757ad297564/scipy-1.15.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:82add84e8a9fb12af5c2c1a3a3f1cb51849d27a580cb9e6bd66226195142be6e", size = 38031272 }, - { url = "https://files.pythonhosted.org/packages/f1/26/98585cbf04c7cf503d7eb0a1966df8a268154b5d923c5fe0c1ed13154c49/scipy-1.15.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:070d10654f0cb6abd295bc96c12656f948e623ec5f9a4eab0ddb1466c000716e", size = 40210217 }, - { url = "https://files.pythonhosted.org/packages/fd/3f/3d2285eb6fece8bc5dbb2f9f94d61157d61d155e854fd5fea825b8218f12/scipy-1.15.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:55cc79ce4085c702ac31e49b1e69b27ef41111f22beafb9b49fea67142b696c4", size = 42587785 }, - { url = "https://files.pythonhosted.org/packages/48/7d/5b5251984bf0160d6533695a74a5fddb1fa36edd6f26ffa8c871fbd4782a/scipy-1.15.1-cp313-cp313-win_amd64.whl", hash = "sha256:c352c1b6d7cac452534517e022f8f7b8d139cd9f27e6fbd9f3cbd0bfd39f5bef", size = 43640439 }, - { url = "https://files.pythonhosted.org/packages/e7/b8/0e092f592d280496de52e152582030f8a270b194f87f890e1a97c5599b81/scipy-1.15.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0458839c9f873062db69a03de9a9765ae2e694352c76a16be44f93ea45c28d2b", size = 41619862 }, - { url = "https://files.pythonhosted.org/packages/f6/19/0b6e1173aba4db9e0b7aa27fe45019857fb90d6904038b83927cbe0a6c1d/scipy-1.15.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:af0b61c1de46d0565b4b39c6417373304c1d4f5220004058bdad3061c9fa8a95", size = 32610387 }, - { url = "https://files.pythonhosted.org/packages/e7/02/754aae3bd1fa0f2479ade3cfdf1732ecd6b05853f63eee6066a32684563a/scipy-1.15.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:71ba9a76c2390eca6e359be81a3e879614af3a71dfdabb96d1d7ab33da6f2364", size = 24883814 }, - { url = "https://files.pythonhosted.org/packages/1f/ac/d7906201604a2ea3b143bb0de51b3966f66441ba50b7dc182c4505b3edf9/scipy-1.15.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:14eaa373c89eaf553be73c3affb11ec6c37493b7eaaf31cf9ac5dffae700c2e0", size = 27944865 }, - { url = "https://files.pythonhosted.org/packages/84/9d/8f539002b5e203723af6a6f513a45e0a7671e9dabeedb08f417ac17e4edc/scipy-1.15.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f735bc41bd1c792c96bc426dece66c8723283695f02df61dcc4d0a707a42fc54", size = 39883261 }, - { url = "https://files.pythonhosted.org/packages/97/c0/62fd3bab828bcccc9b864c5997645a3b86372a35941cdaf677565c25c98d/scipy-1.15.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2722a021a7929d21168830790202a75dbb20b468a8133c74a2c0230c72626b6c", size = 42093299 }, - { url = "https://files.pythonhosted.org/packages/e4/1f/5d46a8d94e9f6d2c913cbb109e57e7eed914de38ea99e2c4d69a9fc93140/scipy-1.15.1-cp313-cp313t-win_amd64.whl", hash = "sha256:bc7136626261ac1ed988dca56cfc4ab5180f75e0ee52e58f1e6aa74b5f3eacd5", size = 43181730 }, -] - [[package]] name = "sentry-sdk" -version = "2.20.0" +version = "2.21.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "certifi" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/68/e8/6a366c0cd5e129dda6ecb20ff097f70b18182c248d4c27e813c21f98992a/sentry_sdk-2.20.0.tar.gz", hash = "sha256:afa82713a92facf847df3c6f63cec71eb488d826a50965def3d7722aa6f0fdab", size = 300125 } +sdist = { url = "https://files.pythonhosted.org/packages/08/63/3f0e88709cf4af992e2813c27d8ba628a891db0805e3fcc6dc834e142c5b/sentry_sdk-2.21.0.tar.gz", hash = "sha256:a6d38e0fb35edda191acf80b188ec713c863aaa5ad8d5798decb8671d02077b6", size = 301965 } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/e6/0f/6f7e6cd0f4a141752caef3f79300148422fdf2b8b68b531f30b2b0c0cbda/sentry_sdk-2.20.0-py2.py3-none-any.whl", hash = "sha256:c359a1edf950eb5e80cffd7d9111f3dbeef57994cb4415df37d39fda2cf22364", size = 322576 }, + { url = "https://files.pythonhosted.org/packages/a4/18/7587660cb5e4d07134913d8e74137efcd4903fda873bf612c30eb34c7ab4/sentry_sdk-2.21.0-py2.py3-none-any.whl", hash = "sha256:7623cfa9e2c8150948a81ca253b8e2bfe4ce0b96ab12f8cd78e3ac9c490fd92f", size = 324096 }, ] [[package]] @@ -2349,12 +1623,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303 }, ] -[[package]] -name = "sqlitedict" -version = "2.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/12/9a/7620d1e9dcb02839ed6d4b14064e609cdd7a8ae1e47289aa0456796dd9ca/sqlitedict-2.1.0.tar.gz", hash = "sha256:03d9cfb96d602996f1d4c2db2856f1224b96a9c431bdd16e78032a72940f9e8c", size = 21846 } - [[package]] name = "sympy" version = "1.13.1" @@ -2367,46 +1635,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", size = 6189177 }, ] -[[package]] -name = "tabledata" -version = "1.3.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "dataproperty" }, - { name = "typepy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b2/35/171c8977162f1163368406deddde4c59673b62bd0cb2f34948a02effb075/tabledata-1.3.4.tar.gz", hash = "sha256:e9649cab129d718f3bff4150083b77f8a78c30f6634a30caf692b10fdc60cb97", size = 25074 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/08/64/fa4160151976ee4b2cf0c1217a99443ffaeb991956feddfeac9eee9952f8/tabledata-1.3.4-py3-none-any.whl", hash = "sha256:1f56e433bfdeb89f4487abfa48c4603a3b07c5d3a3c7e05ff73dd018c24bd0d4", size = 11820 }, -] - -[[package]] -name = "tabulate" -version = "0.9.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252 }, -] - -[[package]] -name = "tcolorpy" -version = "0.1.7" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/80/cc/44f2d81d8f9093aad81c3467a5bf5718d2b5f786e887b6e4adcfc17ec6b9/tcolorpy-0.1.7.tar.gz", hash = "sha256:0fbf6bf238890bbc2e32662aa25736769a29bf6d880328f310c910a327632614", size = 299437 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/05/a2/ed023f2edd1e011b4d99b6727bce8253842d66c3fbf9ed0a26fc09a92571/tcolorpy-0.1.7-py3-none-any.whl", hash = "sha256:26a59d52027e175a37e0aba72efc99dda43f074db71f55b316d3de37d3251378", size = 8096 }, -] - -[[package]] -name = "threadpoolctl" -version = "3.5.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/bd/55/b5148dcbf72f5cde221f8bfe3b6a540da7aa1842f6b491ad979a6c8b84af/threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107", size = 41936 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/2c/ffbf7a134b9ab11a67b0cf0726453cedd9c5043a4fe7a35d1cefa9a1bcfb/threadpoolctl-3.5.0-py3-none-any.whl", hash = "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467", size = 18414 }, -] - [[package]] name = "tokenizers" version = "0.21.0" @@ -2471,29 +1699,9 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257 }, ] -[[package]] -name = "toposolve" -version = "0.1.17" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pybind11" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/51/5d/e24dd0bbbf9f508d9aa11120fdcc7b0e4caf1c1d401359495636470e0431/toposolve-0.1.17.tar.gz", hash = "sha256:539a1301ed36df5e2fbd0d3e1806f2c6cd7840c3527938647a61b0a7b53689f9", size = 5437 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c4/02/91db85f3ab2822377e90357b23783c5a1408cdb7c36f9f94a7d3db6783cc/toposolve-0.1.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cc1a33e2438c29a29ccac4c957c6ffaf035b239b34f0e369f4665ed255b413c9", size = 125471 }, - { url = "https://files.pythonhosted.org/packages/ed/4c/90f3b00b1f381ead4394cb1a1391b8dcd2a043490b2f77ebf71609e24b91/toposolve-0.1.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dda111813910145f97df62e34d30ad38c39df45539f129b57df0e6c51e52902b", size = 125548 }, - { url = "https://files.pythonhosted.org/packages/61/46/062d43764ac1cf6fff3edde81295846d6191c43a103550ddf27472da49ca/toposolve-0.1.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:141ba41c36e49ecd2280517031f4ef8c2c131b4fb93b85f38c10fead17974b7f", size = 95148 }, - { url = "https://files.pythonhosted.org/packages/58/c0/854d8b5cc5cb23d99720c28908e5eff587e1ec25c797ceac219bf5f0f3a7/toposolve-0.1.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8b13cdf80823dd86dd75bda7f94418b5b093cbb4989a4a6f0956f2595b8a7be1", size = 128253 }, - { url = "https://files.pythonhosted.org/packages/b9/ab/9e062fe4e58729594eda886ed464e4839232162e1ddbd0e36b5bcb61d664/toposolve-0.1.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:baa9a76f54f295d8beb06559bd90c117c9a0c2f84aa4c9c661fb283a7529fd68", size = 128431 }, - { url = "https://files.pythonhosted.org/packages/4f/3b/b6e6c863d06f8cb533068224f954b3b49722ba88cb6a0861fd62aefbc151/toposolve-0.1.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac94551501ca671d428c7ad435a50da5cd33f832c432c87e676f3ab8310bf19d", size = 96749 }, - { url = "https://files.pythonhosted.org/packages/70/1e/7028dbd313ba931c086ecb24856566fb34cd159d3bd305cf05fa6b66bb3b/toposolve-0.1.17-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e158c666e1492832dba077eaea9c4bcd88fbb3f45b8bfde8a8f545d5de8212e2", size = 127421 }, - { url = "https://files.pythonhosted.org/packages/44/3b/a51dd6d756076853f1a6d18cc20761126b613a28a5ff9e37a113a546477b/toposolve-0.1.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:17666bdb0078cddcffc89cb9e7c0f0ce35007597b8a309f7b29c6a39a0fb6d8a", size = 127540 }, - { url = 
"https://files.pythonhosted.org/packages/1e/8e/0d7b43f5c751490745bd25cf2cf1f1285627749e3ebfd2da131a9bbdac8c/toposolve-0.1.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b238dadaced7fe98638fcb595596619104724a66ba1205914d0faebb2a2081b", size = 95545 }, -] - [[package]] name = "torch" -version = "2.5.1" +version = "2.6.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -2509,36 +1717,32 @@ dependencies = [ { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "setuptools", marker = "python_full_version >= '3.12'" }, { name = "sympy" }, - { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "typing-extensions" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/ef/834af4a885b31a0b32fff2d80e1e40f771e1566ea8ded55347502440786a/torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:71328e1bbe39d213b8721678f9dcac30dfc452a46d586f1d514a6aa0a99d4744", size = 906446312 }, - { url = "https://files.pythonhosted.org/packages/69/f0/46e74e0d145f43fa506cb336eaefb2d240547e4ce1f496e442711093ab25/torch-2.5.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:34bfa1a852e5714cbfa17f27c49d8ce35e1b7af5608c4bc6e81392c352dbc601", size = 91919522 }, - { url = "https://files.pythonhosted.org/packages/a5/13/1eb674c8efbd04d71e4a157ceba991904f633e009a584dd65dccbafbb648/torch-2.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:32a037bd98a241df6c93e4c789b683335da76a2ac142c0973675b715102dc5fa", size = 203088048 }, - { url = "https://files.pythonhosted.org/packages/a9/9d/e0860474ee0ff8f6ef2c50ec8f71a250f38d78a9b9df9fd241ad3397a65b/torch-2.5.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:23d062bf70776a3d04dbe74db950db2a5245e1ba4f27208a87f0d743b0d06e86", size = 63877046 }, - { url = "https://files.pythonhosted.org/packages/d1/35/e8b2daf02ce933e4518e6f5682c72fd0ed66c15910ea1fb4168f442b71c4/torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:de5b7d6740c4b636ef4db92be922f0edc425b65ed78c5076c43c42d362a45457", size = 906474467 }, - { url = "https://files.pythonhosted.org/packages/40/04/bd91593a4ca178ece93ca55f27e2783aa524aaccbfda66831d59a054c31e/torch-2.5.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:340ce0432cad0d37f5a31be666896e16788f1adf8ad7be481196b503dad675b9", size = 91919450 }, - { url = "https://files.pythonhosted.org/packages/0d/4a/e51420d46cfc90562e85af2fee912237c662ab31140ab179e49bd69401d6/torch-2.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:603c52d2fe06433c18b747d25f5c333f9c1d58615620578c326d66f258686f9a", size = 203098237 }, - { url = 
"https://files.pythonhosted.org/packages/d0/db/5d9cbfbc7968d79c5c09a0bc0bc3735da079f2fd07cc10498a62b320a480/torch-2.5.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:31f8c39660962f9ae4eeec995e3049b5492eb7360dd4f07377658ef4d728fa4c", size = 63884466 }, - { url = "https://files.pythonhosted.org/packages/8b/5c/36c114d120bfe10f9323ed35061bc5878cc74f3f594003854b0ea298942f/torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:ed231a4b3a5952177fafb661213d690a72caaad97d5824dd4fc17ab9e15cec03", size = 906389343 }, - { url = "https://files.pythonhosted.org/packages/6d/69/d8ada8b6e0a4257556d5b4ddeb4345ea8eeaaef3c98b60d1cca197c7ad8e/torch-2.5.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:3f4b7f10a247e0dcd7ea97dc2d3bfbfc90302ed36d7f3952b0008d0df264e697", size = 91811673 }, - { url = "https://files.pythonhosted.org/packages/5f/ba/607d013b55b9fd805db2a5c2662ec7551f1910b4eef39653eeaba182c5b2/torch-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:73e58e78f7d220917c5dbfad1a40e09df9929d3b95d25e57d9f8558f84c9a11c", size = 203046841 }, - { url = "https://files.pythonhosted.org/packages/57/6c/bf52ff061da33deb9f94f4121fde7ff3058812cb7d2036c97bc167793bd1/torch-2.5.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:8c712df61101964eb11910a846514011f0b6f5920c55dbf567bff8a34163d5b1", size = 63858109 }, - { url = "https://files.pythonhosted.org/packages/69/72/20cb30f3b39a9face296491a86adb6ff8f1a47a897e4d14667e6cf89d5c3/torch-2.5.1-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:9b61edf3b4f6e3b0e0adda8b3960266b9009d02b37555971f4d1c8f7a05afed7", size = 906393265 }, -] - -[[package]] -name = "torch-shampoo" -version = "1.0.0" -source = { git = "https://github.com/facebookresearch/optimizers.git?rev=main#9c5700ad5ee81c28dc565c1a49c4b940da28eb8d" } -dependencies = [ - { name = "torch" }, + { url = "https://files.pythonhosted.org/packages/37/81/aa9ab58ec10264c1abe62c8b73f5086c3c558885d6beecebf699f0dbeaeb/torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6860df13d9911ac158f4c44031609700e1eba07916fff62e21e6ffa0a9e01961", size = 766685561 }, + { url = "https://files.pythonhosted.org/packages/86/86/e661e229df2f5bfc6eab4c97deb1286d598bbeff31ab0cdb99b3c0d53c6f/torch-2.6.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c4f103a49830ce4c7561ef4434cc7926e5a5fe4e5eb100c19ab36ea1e2b634ab", size = 95751887 }, + { url = "https://files.pythonhosted.org/packages/20/e0/5cb2f8493571f0a5a7273cd7078f191ac252a402b5fb9cb6091f14879109/torch-2.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:56eeaf2ecac90da5d9e35f7f35eb286da82673ec3c582e310a8d1631a1c02341", size = 204165139 }, + { url = "https://files.pythonhosted.org/packages/e5/16/ea1b7842413a7b8a5aaa5e99e8eaf3da3183cc3ab345ad025a07ff636301/torch-2.6.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:09e06f9949e1a0518c5b09fe95295bc9661f219d9ecb6f9893e5123e10696628", size = 66520221 }, + { url = "https://files.pythonhosted.org/packages/78/a9/97cbbc97002fff0de394a2da2cdfa859481fdca36996d7bd845d50aa9d8d/torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7979834102cd5b7a43cc64e87f2f3b14bd0e1458f06e9f88ffa386d07c7446e1", size = 766715424 }, + { url = "https://files.pythonhosted.org/packages/6d/fa/134ce8f8a7ea07f09588c9cc2cea0d69249efab977707cf67669431dcf5c/torch-2.6.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ccbd0320411fe1a3b3fec7b4d3185aa7d0c52adac94480ab024b5c8f74a0bf1d", size = 95759416 }, + { url = 
"https://files.pythonhosted.org/packages/11/c5/2370d96b31eb1841c3a0883a492c15278a6718ccad61bb6a649c80d1d9eb/torch-2.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:46763dcb051180ce1ed23d1891d9b1598e07d051ce4c9d14307029809c4d64f7", size = 204164970 }, + { url = "https://files.pythonhosted.org/packages/0b/fa/f33a4148c6fb46ca2a3f8de39c24d473822d5774d652b66ed9b1214da5f7/torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21", size = 66530713 }, + { url = "https://files.pythonhosted.org/packages/e5/35/0c52d708144c2deb595cd22819a609f78fdd699b95ff6f0ebcd456e3c7c1/torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9", size = 766624563 }, + { url = "https://files.pythonhosted.org/packages/01/d6/455ab3fbb2c61c71c8842753b566012e1ed111e7a4c82e0e1c20d0c76b62/torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb", size = 95607867 }, + { url = "https://files.pythonhosted.org/packages/18/cf/ae99bd066571656185be0d88ee70abc58467b76f2f7c8bfeb48735a71fe6/torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239", size = 204120469 }, + { url = "https://files.pythonhosted.org/packages/81/b4/605ae4173aa37fb5aa14605d100ff31f4f5d49f617928c9f486bb3aaec08/torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989", size = 66532538 }, + { url = "https://files.pythonhosted.org/packages/24/85/ead1349fc30fe5a32cadd947c91bda4a62fbfd7f8c34ee61f6398d38fb48/torch-2.6.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:4874a73507a300a5d089ceaff616a569e7bb7c613c56f37f63ec3ffac65259cf", size = 766626191 }, + { url = "https://files.pythonhosted.org/packages/dd/b0/26f06f9428b250d856f6d512413e9e800b78625f63801cbba13957432036/torch-2.6.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a0d5e1b9874c1a6c25556840ab8920569a7a4137afa8a63a32cee0bc7d89bd4b", size = 95611439 }, + { url = "https://files.pythonhosted.org/packages/c2/9c/fc5224e9770c83faed3a087112d73147cd7c7bfb7557dcf9ad87e1dda163/torch-2.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:510c73251bee9ba02ae1cb6c9d4ee0907b3ce6020e62784e2d7598e0cfa4d6cc", size = 204126475 }, + { url = "https://files.pythonhosted.org/packages/88/8b/d60c0491ab63634763be1537ad488694d316ddc4a20eaadd639cedc53971/torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2", size = 66536783 }, ] [[package]] @@ -2566,22 +1770,9 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540 }, ] -[[package]] -name = "tqdm-multiprocess" -version = "0.0.11" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama" }, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b4/1e/de81bd0f6cb2b61d6ee7ccbf304d99a42a0f53879481536dfb3288ee9a87/tqdm-multiprocess-0.0.11.tar.gz", hash = "sha256:a74002a1222ea9cbe8cdc9bd460108c6009be359621fbee9b92d0515d4d180f7", size = 8082 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/25/7e/0d889fc6c84e3df6b69aaafe893fc77f69b3d968ac9ce574d1c62c688050/tqdm_multiprocess-0.0.11-py3-none-any.whl", 
hash = "sha256:3ebdf03e7a675150fa0bbceaa9c3c64b8cb556e9ffafa4fe6c078e51820524aa", size = 9817 }, -] - [[package]] name = "transformers" -version = "4.48.1" +version = "4.48.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -2595,41 +1786,20 @@ dependencies = [ { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/21/6b/caf620fae7fbf35947c81e7dd0834493b9ad9b71bb9e433025ac7a07e79a/transformers-4.48.1.tar.gz", hash = "sha256:7c1931facc3ee8adcbf86fc7a87461d54c1e40eca3bb57fef1ee9f3ecd32187e", size = 8365872 } +sdist = { url = "https://files.pythonhosted.org/packages/e3/82/cebeb7af5e64440f1638f18c4ed0f89156d0eeaa6290d98da8ca93ac3872/transformers-4.48.3.tar.gz", hash = "sha256:a5e8f1e9a6430aa78215836be70cecd3f872d99eeda300f41ad6cc841724afdb", size = 8373458 } wheels = [ - { url = "https://files.pythonhosted.org/packages/7b/9f/92d3091c44cb19add044064af1bf1345cd35fbb84d32a3690f912800a295/transformers-4.48.1-py3-none-any.whl", hash = "sha256:24be0564b0a36d9e433d9a65de248f1545b6f6edce1737669605eb6a8141bbbb", size = 9665001 }, + { url = "https://files.pythonhosted.org/packages/b6/1a/efeecb8d83705f2f4beac98d46f2148c95ecd7babfb31b5c0f1e7017e83d/transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36", size = 9669412 }, ] [[package]] name = "triton" -version = "3.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "filelock" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/98/29/69aa56dc0b2eb2602b553881e34243475ea2afd9699be042316842788ff5/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8", size = 209460013 }, - { url = "https://files.pythonhosted.org/packages/86/17/d9a5cf4fcf46291856d1e90762e36cbabd2a56c7265da0d1d9508c8e3943/triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c", size = 209506424 }, - { url = "https://files.pythonhosted.org/packages/78/eb/65f5ba83c2a123f6498a3097746607e5b2f16add29e36765305e4ac7fdd8/triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc", size = 209551444 }, -] - -[[package]] -name = "typepy" -version = "1.3.4" +version = "3.2.0" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mbstrdecoder" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/79/59/4c39942077d7de285f762a91024dbda731be693591732977358f77d120fb/typepy-1.3.4.tar.gz", hash = "sha256:89c1f66de6c6133209c43a94d23431d320ba03ef5db18f241091ea594035d9de", size = 39558 } wheels = [ - { url = "https://files.pythonhosted.org/packages/ee/31/e393c3830bdedd01735bd195c85ac3034b6bcaf6c18142bab60a4047ca36/typepy-1.3.4-py3-none-any.whl", hash = "sha256:d5ed3e0c7f49521bff0603dd08cf8d453371cf68d65a29d3d0038552ccc46e2e", size = 31449 }, -] - -[package.optional-dependencies] -datetime = [ - { name = "packaging" }, - { name = "python-dateutil" }, - { name = "pytz" }, + { url = "https://files.pythonhosted.org/packages/01/65/3ffa90e158a2c82f0716eee8d26a725d241549b7d7aaf7e4f44ac03ebd89/triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3e54983cd51875855da7c68ec05c05cf8bb08df361b1d5b69e05e40b0c9bd62", size = 253090354 }, + { url = 
"https://files.pythonhosted.org/packages/a7/2e/757d2280d4fefe7d33af7615124e7e298ae7b8e3bc4446cdb8e88b0f9bab/triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220", size = 253157636 }, + { url = "https://files.pythonhosted.org/packages/06/00/59500052cb1cf8cf5316be93598946bc451f14072c6ff256904428eaf03c/triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c", size = 253159365 }, + { url = "https://files.pythonhosted.org/packages/c7/30/37a3384d1e2e9320331baca41e835e90a3767303642c7a80d4510152cbcf/triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0", size = 253154278 }, ] [[package]] @@ -2661,21 +1831,21 @@ wheels = [ [[package]] name = "virtualenv" -version = "20.29.1" +version = "20.29.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "distlib" }, { name = "filelock" }, { name = "platformdirs" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a7/ca/f23dcb02e161a9bba141b1c08aa50e8da6ea25e6d780528f1d385a3efe25/virtualenv-20.29.1.tar.gz", hash = "sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35", size = 7658028 } +sdist = { url = "https://files.pythonhosted.org/packages/f1/88/dacc875dd54a8acadb4bcbfd4e3e86df8be75527116c91d8f9784f5e9cab/virtualenv-20.29.2.tar.gz", hash = "sha256:fdaabebf6d03b5ba83ae0a02cfe96f48a716f4fae556461d180825866f75b728", size = 4320272 } wheels = [ - { url = "https://files.pythonhosted.org/packages/89/9b/599bcfc7064fbe5740919e78c5df18e5dceb0887e676256a1061bb5ae232/virtualenv-20.29.1-py3-none-any.whl", hash = "sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779", size = 4282379 }, + { url = "https://files.pythonhosted.org/packages/93/fa/849483d56773ae29740ae70043ad88e068f98a6401aa819b5d6bee604683/virtualenv-20.29.2-py3-none-any.whl", hash = "sha256:febddfc3d1ea571bdb1dc0f98d7b45d24def7428214d4fb73cc486c9568cce6a", size = 4301478 }, ] [[package]] name = "wandb" -version = "0.19.5" +version = "0.19.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, @@ -2692,26 +1862,20 @@ dependencies = [ { name = "setuptools" }, { name = "typing-extensions", marker = "python_full_version < '3.12'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/87/a4/5611250c729da145803ce8da8fc38c7e1f545dd015a457cbebb5f2a48506/wandb-0.19.5.tar.gz", hash = "sha256:a2cbb5932b2970d277663515123610a4e10da01d5cbb321f94ccf36d3782c14c", size = 38909291 } +sdist = { url = "https://files.pythonhosted.org/packages/41/a2/63fbebc6ed670a7d834ca76552b8c6382211874b23ee8a718ba26a342a4a/wandb-0.19.6.tar.gz", hash = "sha256:4661856ee070fe8a123caece5b372d495d3cf9f58176a8f981bd716830eefc49", size = 39203528 } wheels = [ - { url = "https://files.pythonhosted.org/packages/32/5f/ed184377bfe76f2652f8acf51e90474504787a308c304169c9456ca48fc9/wandb-0.19.5-py3-none-any.whl", hash = "sha256:5f846fd1908b9b12182de33f5293e7031df6ae52acf05c7912429363f689f60f", size = 6383456 }, - { url = "https://files.pythonhosted.org/packages/de/5b/e2e6915f156201d840901f1a0a5b7ae15e07aa356931d2b4e49c0f449091/wandb-0.19.5-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:d18e21a7cba2602dfe4512aa834d7eb9a725e785492e5c953656d0c7753bdee4", size = 20261782 }, - { url = 
"https://files.pythonhosted.org/packages/09/16/e059494e4ae69edf1dfde8a525007b659f77fae980518c8bb47038d5b63e/wandb-0.19.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a9f972096eef097948f067e9bd92146d7af081122880c28e88ecdd9bd7d11b08", size = 19439432 }, - { url = "https://files.pythonhosted.org/packages/a2/9d/46485a1b53589e8fe164dc660931797a5bddfb7c0d07c71e5b7aee1e0522/wandb-0.19.5-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:eecd605e7580d9bc98b7e983d6ea787922869495dcd4e3dd7a859c5fc299764c", size = 20270486 }, - { url = "https://files.pythonhosted.org/packages/f3/de/d01f45442695350cc1351966ee95eb59cec6ab039285603738c13959a1e3/wandb-0.19.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:967d3ab3569322c546349cea7b301f69e24b97ad1cc50162612926a864127306", size = 19068620 }, - { url = "https://files.pythonhosted.org/packages/8a/30/8c495234e584ebcea92ec1d178897beeaf9798835bbb4f2b9a31c6533985/wandb-0.19.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f8be456cbe819e8202009cf4ac10a5a28141c4c6370f34b3f8cbd640c2dc8f9", size = 20349290 }, - { url = "https://files.pythonhosted.org/packages/4c/29/558b896d754f4ebd30b62910adb58bd6fcd33113f8a21919b5866d3ea35c/wandb-0.19.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:63bb83011194137c9385a01476216dbfc0920fec0e660d87b7ca0dc1373c556f", size = 19077834 }, - { url = "https://files.pythonhosted.org/packages/ca/b3/306ba19dc3faac29f0d0ce2367a83130a65978f7c9557c3dd377ded4346b/wandb-0.19.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:ac54f7d949eee335de349e80e4c3babf0cc89d52f77e49131e258c4b97673dc1", size = 20428642 }, - { url = "https://files.pythonhosted.org/packages/27/6d/35b8455aee00f0b61d008977f8fca55e4307d9d514b3b9669ec3f74525a6/wandb-0.19.5-py3-none-win32.whl", hash = "sha256:1603dacb43093645419e8bb098ded5850ad8a7f3ab5ad50caf356f6792479a03", size = 19730874 }, - { url = "https://files.pythonhosted.org/packages/48/20/945de2a62fbbb9193725ebbbb82887aea7ea9817a3c9d1bd4206e0d9c011/wandb-0.19.5-py3-none-win_amd64.whl", hash = "sha256:503b575c1c469e2f00caa1b2a02211bad8401c2376b43ef976b7fc88f385fb77", size = 19730876 }, + { url = "https://files.pythonhosted.org/packages/bd/4f/5b77e20f10e643404df871557610a6618383e036de65e9c34b3a8354f2ac/wandb-0.19.6-py3-none-any.whl", hash = "sha256:0b174b5f190999a8238961c63c134622bf2173147a1301ea298a9ec58abbd7d4", size = 6387720 }, + { url = "https://files.pythonhosted.org/packages/25/aa/824a171586f3fa1549f9f946d32187362c8d06ff67540d9f1be694ee9094/wandb-0.19.6-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:ad2887dd916207ead5a9f36e4aebc1b6624265f29033e4e883bb6fbd5b674080", size = 20776552 }, + { url = "https://files.pythonhosted.org/packages/ad/3b/222e2a27ee3df3a973d8f165fa47f3e3bb25dc6d9ac1d3ec79b083c5ee09/wandb-0.19.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:ca90dd5519de1a48963536f02d6e14c150475807173b7af1d8ebe3e2f9e3afba", size = 19933524 }, + { url = "https://files.pythonhosted.org/packages/65/76/1d69145ac3c9c6b63545e684c39b95711c3632c34d452626fd831227089d/wandb-0.19.6-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:3cb10bd1e1c0b568464a017c88eb95e0c8c3e9c1283d9ad4ee717c8977d491c1", size = 20791479 }, + { url = "https://files.pythonhosted.org/packages/88/96/4411c4aa29cfb0bc8e310480181d79779b423231420bbcf5e61ff8c44ff7/wandb-0.19.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fe6e7bedd396b2b5f92c7fab3d364f7e0e8cb9f645d0f0c27ba7be94e720931", size = 19539263 }, + { url = 
"https://files.pythonhosted.org/packages/bc/89/2e414951d35e55caf6d8ac5758a82c61c1b8330f77852fbc733c833196eb/wandb-0.19.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd9ae9a7f08e4d3972ba341c42af787e951689e0d1a76c111aa66d09bcdadafd", size = 20861187 }, + { url = "https://files.pythonhosted.org/packages/3a/5e/7517c9fa9aa0075160c04e467f6d0e5d1b9bb6b91c4ffd6dd6fa23dd3dd0/wandb-0.19.6-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ff0973ca26cd06bc5451ae7ba469ad98f74024f5678dfa0d6dc78ca36eb950b6", size = 19549095 }, + { url = "https://files.pythonhosted.org/packages/bd/be/ef3c78ab14a631558f639ab3a8379efee6f7d529e3bbf9efb0e17472495b/wandb-0.19.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:2e8dc997eb3ae5f22f5a1c3d4f3b30c28398dda45b9dbada9ff20b8d3984d3e2", size = 20938943 }, + { url = "https://files.pythonhosted.org/packages/b6/43/2f9c71a1fe77a97e9d32b4828f1dd685ac545442f8dfbf703eac8128056f/wandb-0.19.6-py3-none-win32.whl", hash = "sha256:c0127d99e98202dc2471d44b920129c2c9242fb3a6b52a7aa8bbf9ffa35173e7", size = 20230403 }, + { url = "https://files.pythonhosted.org/packages/fd/b2/a9ffa91c43dbe2a6687467f3aa196947b7532592879738665be5c0db17c3/wandb-0.19.6-py3-none-win_amd64.whl", hash = "sha256:8688a4f724d37a90075312e8dccffd948adbe8b6bcb82f9d2b38b764b53269fb", size = 20230407 }, ] -[[package]] -name = "word2number" -version = "1.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/4a/29/a31940c848521f0725f0df6b25dca8917f13a2025b0e8fcbe5d0457e45e6/word2number-1.1.zip", hash = "sha256:70e27a5d387f67b04c71fbb7621c05930b19bfd26efd6851e6e0f9969dcde7d0", size = 9723 } - [[package]] name = "xxhash" version = "3.5.0" @@ -2869,29 +2033,16 @@ version = "0.1.0" source = { editable = "." 
} dependencies = [ { name = "datasets" }, - { name = "fsspec", extra = ["gcs"] }, - { name = "liger-kernel-nightly" }, { name = "ninja" }, { name = "numpy" }, - { name = "psutil" }, { name = "pyarrow" }, { name = "pydantic-config" }, { name = "setuptools" }, - { name = "toposolve" }, { name = "torch" }, - { name = "torch-shampoo" }, { name = "torchdata" }, { name = "transformers" }, - { name = "zstandard" }, -] - -[package.optional-dependencies] -all = [ - { name = "aiohttp" }, - { name = "asyncio" }, - { name = "lm-eval" }, - { name = "requests" }, { name = "wandb" }, + { name = "zstandard" }, ] [package.dev-dependencies] @@ -2904,25 +2055,16 @@ dev = [ [package.metadata] requires-dist = [ - { name = "aiohttp", marker = "extra == 'all'", specifier = ">=3.10.5" }, - { name = "asyncio", marker = "extra == 'all'", specifier = ">=3.4.3" }, { name = "datasets", specifier = ">=3.0.0" }, - { name = "fsspec", extras = ["gcs"], specifier = ">=2024.3.1" }, - { name = "liger-kernel-nightly", specifier = ">=0.5.2.dev20250122195349" }, - { name = "lm-eval", marker = "extra == 'all'" }, { name = "ninja" }, { name = "numpy" }, - { name = "psutil" }, { name = "pyarrow" }, { name = "pydantic-config", git = "https://github.com/samsja/pydantic_config.git?rev=b7becc3" }, - { name = "requests", marker = "extra == 'all'", specifier = ">=2.32.3" }, { name = "setuptools" }, - { name = "toposolve", specifier = ">=0.1.17" }, - { name = "torch", specifier = "==2.5.1" }, - { name = "torch-shampoo", git = "https://github.com/facebookresearch/optimizers.git?rev=main" }, + { name = "torch", specifier = "==2.6.0" }, { name = "torchdata", specifier = ">=0.8.0" }, { name = "transformers", specifier = ">=4.44.2" }, - { name = "wandb", marker = "extra == 'all'" }, + { name = "wandb" }, { name = "zstandard" }, ]
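Taken together, the `requires-dist` changes above are the fork's trimmed direct dependency set: the distributed-training extras (`toposolve`, `torch-shampoo`, `lm-eval`, the `all` optional group, etc.) are gone, `wandb` becomes a core dependency, and `torch` is pinned exactly to 2.6.0, which is what pulls in the `triton` 3.2.0 and `nvidia-cusparselt-cu12` lock entries seen earlier in the diff. For orientation, here is a minimal sketch of the `[project]` table in `pyproject.toml` that this lock metadata implies; it is reconstructed from the diff rather than copied from the repository, and the project `name` is an assumption since that line falls outside the shown hunks:

```toml
# Hypothetical pyproject.toml fragment, reconstructed from the
# [package.metadata] requires-dist entries in the uv.lock diff above.
# The actual file in the repo may order or format these differently.
[project]
name = "zeroband"  # assumed; the lock only shows version = "0.1.0"
version = "0.1.0"
dependencies = [
    "datasets>=3.0.0",
    "ninja",
    "numpy",
    "pyarrow",
    "pydantic-config",
    "setuptools",
    "torch==2.6.0",
    "torchdata>=0.8.0",
    "transformers>=4.44.2",
    "wandb",
    "zstandard",
]

# pydantic-config resolves from git, matching the revision pinned in the lock.
[tool.uv.sources]
pydantic-config = { git = "https://github.com/samsja/pydantic_config.git", rev = "b7becc3" }
```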