Changes from all commits (37 commits)
768e4ec  silence stopwatch log (samsja, Mar 21, 2025)
d7228d6  remove future to be useless code becase pccl rocks (samsja, Mar 21, 2025)
a0137b6  remove log hash (samsja, Mar 26, 2025)
3f9e980  remove profiler stuff (samsja, Mar 26, 2025)
8866d8c  rmove stuff (samsja, Mar 26, 2025)
da199b6  rmove stuff (samsja, Mar 26, 2025)
47b5317  remove stuff (samsja, Mar 26, 2025)
c4d7ab3  remove gloo (samsja, Mar 26, 2025)
c81d2b4  refactor metric logger (samsja, Mar 26, 2025)
689cc19  refactor perf counter (samsja, Mar 26, 2025)
3b81752  fix configs (samsja, Mar 26, 2025)
d89fd44  fix reduce scatter (samsja, Mar 26, 2025)
906ef2b  remove ce and z loss (samsja, Mar 26, 2025)
ff3a31d  remove shanpoo (samsja, Mar 26, 2025)
2c4b4cc  update depenendcies (samsja, Mar 26, 2025)
fb20315  delete retry_all_reduce config variable (mikex86, Mar 26, 2025)
098b046  bump to torch 2.6 (samsja, Mar 26, 2025)
f8ae249  use new fdsp2 public api (samsja, Mar 26, 2025)
384a172  silence tests flex (samsja, Mar 26, 2025)
7b2fbd1  Remove hf lrscheds (#234) (mikex86, Mar 27, 2025)
695771a  minor cleanup (mikex86, Mar 27, 2025)
7ccc584  delete csrc (mikex86, Mar 27, 2025)
51d582c  more cleanup (mikex86, Mar 27, 2025)
3f7b516  fix imports (mikex86, Mar 27, 2025)
1aa5ba8  move FakeTokenizer (mikex86, Mar 27, 2025)
f4d96b8  fix imports (mikex86, Mar 27, 2025)
5008d10  make memory_profiler optional and not rely on branchy definition (mikex86, Mar 27, 2025)
90a187b  use better profiler (mikex86, Mar 27, 2025)
59d55d8  remove unused varaible (mikex86, Mar 27, 2025)
48e985f  new mfu tracker (mikex86, Mar 28, 2025)
3f261f0  fix unused (mikex86, Mar 28, 2025)
2ead8a4  fix unused (mikex86, Mar 28, 2025)
0759c49  preserve default value semantics for fwd (mikex86, Mar 28, 2025)
39865bf  preserve default value semantics for fwd (mikex86, Mar 28, 2025)
e533f4a  accept hw config in apply_sharding (mikex86, Mar 28, 2025)
3134287  accept hw config in apply_sharding (mikex86, Mar 28, 2025)
2c1d841  add FlopCounter docstring (mikex86, Mar 28, 2025)
3 changes: 0 additions & 3 deletions .gitmodules

This file was deleted.

4 changes: 2 additions & 2 deletions README.md
@@ -118,7 +118,7 @@ uv run pytest
To run evals, you first need to convert the checkpoint to a Hugging Face compatible model.

```bash
uv run python scripts/export_dcp.py @configs/10B/H100.toml --ckpt.path CONVERTED_MODEL_PATH --ckpt.resume CHECKPOINT_PATH --torch_dtype bfloat16 --ckpt.interval 1
uv run python scripts/export_dcp.py @configs/10B/H100_simple.toml --ckpt.path CONVERTED_MODEL_PATH --ckpt.resume CHECKPOINT_PATH --torch_dtype bfloat16 --ckpt.interval 1
```


@@ -178,7 +178,7 @@ You may also pass the `torch_dtype` argument to either `float32` or `bfloat16` t

Example export command:
```bash
python scripts/export_dcp.py @configs/10B/H100.toml --ckpt.path /path/to/save/converted_model --ckpt.resume /path/to/ckpt/step_84000 --torch_dtype bfloat16
python scripts/export_dcp.py @configs/10B/H100_simple.toml --ckpt.path /path/to/save/converted_model --ckpt.resume /path/to/ckpt/step_84000 --torch_dtype bfloat16
```

You can then upload the model to Hugging Face using huggingface-cli:
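The diff collapses the rest of this README section, so the upload command itself is not shown. As a hedged sketch only (the repository id and local path below are placeholders, not taken from this PR):

```bash
# Hypothetical example: push the converted model directory to a Hugging Face repo.
# <org>/<model-name> and CONVERTED_MODEL_PATH are placeholders.
huggingface-cli login
huggingface-cli upload <org>/<model-name> CONVERTED_MODEL_PATH
```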
40 changes: 0 additions & 40 deletions configs/10B/H100_cooldown.toml

This file was deleted.

34 changes: 0 additions & 34 deletions configs/10B/H100_devel.toml

This file was deleted.

29 changes: 15 additions & 14 deletions configs/10B/H100.toml → configs/10B/H100_intellect1.toml
@@ -1,22 +1,25 @@
name_model = "10B"
project = "10B_zero_band"
model_name = "10B"
model_type = "llama3"

wandb_resume = false

[train]
micro_bs = 1
ac_ckpt = true
[hardware]
micro_batch_size = 1
act_ckpt = true

[optim]
sched_type = "wsd-sqrt"
[train]
batch_size = 128 #1M tokens bs
warmup_steps = 1000
total_steps = 1_000_000_000_000


z_loss = true

[optim.optim]
[train.lr_scheduler]
decay_type = "sqrt"
lr = 7.5e-5
end_lr = 0.0
num_warmup_steps = 1000
num_stable_steps = 70_000
num_decay_steps = 30_000

[train.optimizer]
betas1 = 0.9
betas2 = 0.95
weight_decay = 0.1
@@ -36,6 +39,4 @@ compression = "uint8"

[ckpt]
interval = 100
topk = 40
path = "/data/10B"
remote_data_path = "/data/10B_data_ckpt"
27 changes: 15 additions & 12 deletions configs/10B/H100_simple.toml
@@ -1,20 +1,23 @@
name_model = "10B"
project = "debug_10B_zero_band"
model_name = "10B"
model_type = "llama3"

[train]
micro_bs = 1
ac_ckpt = true

[optim]
sched_type = "wsd-sqrt"
batch_size = 128 #1M tokens bs
warmup_steps = 1000
total_steps = 1_000_000_000_000
[hardware]
micro_batch_size = 1
act_ckpt = true

z_loss = true
[train]
batch_size = 128 #1M tokens bs

[optim.optim]
[train.lr_scheduler]
decay_type = "sqrt"
lr = 7.5e-5
end_lr = 0.0
num_warmup_steps = 1000
num_decay_steps = 1_000_000_000_000

[train.optimizer]
type = 'adamw'
betas1 = 0.9
betas2 = 0.95
weight_decay = 0.1
20 changes: 9 additions & 11 deletions configs/13B/H100.toml
@@ -1,17 +1,15 @@
name_model = "13B"
project = "debug_13B_zero_band"

[train]
micro_bs = 1
ac_ckpt = true
model_name = "13B"
model_type = "llama2"

[optim]
batch_size = 1024 #2M tokens bs
warmup_steps = 1000
total_steps = 88_000
[hardware]
micro_batch_size = 64
reshard_after_forward = false

[optim.optim]
lr = 3e-4
[train]
batch_size = 512

[data]
seq_length = 2048
seq_length = 1024
dataset_name_or_paths = "datasets/fineweb-edu"
17 changes: 0 additions & 17 deletions configs/150M/3090.toml

This file was deleted.

21 changes: 21 additions & 0 deletions configs/150M/A100_debug.toml
@@ -0,0 +1,21 @@
project = "debug_150m_zero_band"

model_name = "150M"
model_type = "llama2"

wandb = false

[hardware]
micro_batch_size = 64
torch_compile = true

[train]
batch_size = 512

[train.lr_scheduler]
num_warmup_steps = 10
num_decay_steps = 1000

[data]
fake = true

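Since the reworked configs nest settings under `[hardware]`, `[train]`, `[train.lr_scheduler]`, and `[train.optimizer]`, a quick way to sanity-check the new structure is to parse the file with the standard library. A minimal sketch, assuming Python 3.11+ (for `tomllib`) and the file path shown in this diff; this one-liner is not part of the repo:

```bash
# Hypothetical sanity check: parse the new-style TOML config and dump it as JSON.
uv run python -c 'import tomllib, json; print(json.dumps(tomllib.load(open("configs/150M/A100_debug.toml", "rb")), indent=2))'
```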
16 changes: 0 additions & 16 deletions configs/150M/A40.toml

This file was deleted.

19 changes: 9 additions & 10 deletions configs/150M/H100.toml
@@ -1,16 +1,15 @@
name_model = "150M"
project = "debug_150m_zero_band"
type_model = "llama2"

[train]
micro_bs = 64 # change this base on the gpu
model_name = "150M"
model_type = "llama2"

[hardware]
micro_batch_size = 64
reshard_after_forward = false

[optim]
[train]
batch_size = 512
warmup_steps = 1000
total_steps = 88_000

[optim.optim]
lr = 4e-4

[data]
seq_length = 1024
dataset_name_or_paths = "datasets/fineweb-edu"
20 changes: 12 additions & 8 deletions configs/150M/H100-fast.toml → configs/150M/H100_best.toml
@@ -1,18 +1,22 @@
name_model = "150M"
project = "debug_150m_zero_band"
type_model = "llama2"
model_name = "150M"
model_type = "llama2"

[train]
micro_bs = 64 # change this base on the gpu
[hardware]
micro_batch_size = 64
reshard_after_forward = false

[optim]
[train]
batch_size = 512
warmup_steps = 278
total_steps = 8192

[optim.optim]
[train.lr_scheduler]
decay_type = 'cosine'
num_warmup_steps = 278
num_decay_steps = 7914 # 278 + 7914 = 8192
lr = 0.003551730141097694

[train.optimizer]
type = 'adamw'
betas1 = 0.9454835470717078
betas2 = 0.9190488086654895
weight_decay = 0.24530252977858977
16 changes: 0 additions & 16 deletions configs/150M_short/3090.toml

This file was deleted.

17 changes: 0 additions & 17 deletions configs/150M_short/A40.toml

This file was deleted.

16 changes: 0 additions & 16 deletions configs/150M_short/H100.toml

This file was deleted.

22 changes: 11 additions & 11 deletions configs/1B/H100.toml
@@ -1,15 +1,15 @@
name_model = "1B"
project = "debug_1B_zero_band"
type_model = "llama2"

[train]
micro_bs = 32
reshard_after_forward = true
model_name = "1B"
model_type = "llama2"

[hardware]
micro_batch_size = 64
reshard_after_forward = false

[optim]
batch_size = 1024
warmup_steps = 1000
total_steps = 8192
[train]
batch_size = 512

[optim.optim]
lr = 7e-4
[data]
seq_length = 1024
dataset_name_or_paths = "datasets/fineweb-edu"