diff --git a/configs/150M_big_bs/H100.toml b/configs/150M_big_bs/H100.toml new file mode 100644 index 00000000..46f4ae71 --- /dev/null +++ b/configs/150M_big_bs/H100.toml @@ -0,0 +1,16 @@ +name_model = "150M" +project = "150M_big_bs" +type_model = "llama2" + +[train] +micro_bs = 64 # change this based on the gpu +reshard_after_forward = false + +[optim] +batch_size = 2048 +warmup_steps = 500 +total_steps = 4096 + + +[optim.optim] +lr = 8e-4 diff --git a/configs/150M_short/3090.toml b/configs/150M_short/3090.toml index a468b64c..bbd5b421 100644 --- a/configs/150M_short/3090.toml +++ b/configs/150M_short/3090.toml @@ -4,7 +4,7 @@ type_model = "llama2" [train] micro_bs = 16 # change this base on the gpu -reshard_after_forward = true +reshard_after_forward = false [optim] batch_size = 512 diff --git a/configs/150M_short/A40.toml b/configs/150M_short/A40.toml index 80756de5..94844480 100644 --- a/configs/150M_short/A40.toml +++ b/configs/150M_short/A40.toml @@ -4,7 +4,7 @@ type_model = "llama2" [train] micro_bs = 32 # change this base on the gpu -reshard_after_forward = true +reshard_after_forward = false [optim] diff --git a/configs/150M_short/H100.toml b/configs/150M_short/H100.toml index f7a7223d..a106460b 100644 --- a/configs/150M_short/H100.toml +++ b/configs/150M_short/H100.toml @@ -4,7 +4,7 @@ type_model = "llama2" [train] micro_bs = 64 # change this base on the gpu -reshard_after_forward = true +reshard_after_forward = false [optim] batch_size = 512 diff --git a/pyproject.toml b/pyproject.toml index 02a88e3c..d1917715 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ dependencies = [ "setuptools", "transformers>=4.44.2", "datasets>=3.0.0", - "pydantic_config @ git+https://github.com/samsja/pydantic_config.git@74c94ee", + "pydantic_config @ git+https://github.com/samsja/pydantic_config.git@b7becc3", "torchdata>=0.8.0", "fsspec[gcs]>=2024.3.1", "ninja", diff --git a/sweeps/adam.yaml b/sweeps/adam.yaml new file mode 100644 index 00000000..e6becf97 
--- /dev/null +++ b/sweeps/adam.yaml @@ -0,0 +1,39 @@ +command: + - bash + - scripts/simulate_multi_node_diloco.sh + - "1" + - "8" + - "src/zeroband/train.py" + - "@configs/150M_big_bs/H100.toml" + - ${args} +name: adam-sweep +method: bayes +early_terminate: + type: hyperband + max_iter: 27 + eta: 3 + s: 2 +metric: + goal: minimize + name: Perplexity +parameters: + optim.optim.lr: + distribution: log_uniform_values + min: 1e-4 + max: 1e-2 + # optim.optim.betas1: + # distribution: uniform + # min: 0.8 + # max: 0.999 + optim.optim.betas2: + distribution: uniform + min: 0.8 + max: 0.9999 + optim.optim.weight_decay: + distribution: uniform + min: 0.0 + max: 0.4 + optim.warmup_steps: + distribution: q_uniform + min: 0 + max: 2000 \ No newline at end of file diff --git a/sweeps/diloco.yaml b/sweeps/diloco.yaml new file mode 100644 index 00000000..1a6739d8 --- /dev/null +++ b/sweeps/diloco.yaml @@ -0,0 +1,43 @@ +command: + - bash + - scripts/simulate_multi_node_diloco.sh + - "2" + - "4" + - "src/zeroband/train.py" + - "@configs/150M_big_bs/H100.toml" + - "--optim.batch_size" + - "1024" + - "--diloco.inner_steps" + - "25" + - ${args} +name: diloco-sweep +method: bayes +early_terminate: + type: hyperband + max_iter: 27 + eta: 3 + s: 2 +metric: + goal: minimize + name: Perplexity +parameters: + optim.optim.lr: + distribution: log_uniform_values + min: 1e-4 + max: 1e-2 + # optim.optim.betas1: + # distribution: uniform + # min: 0.8 + # max: 0.999 + optim.optim.betas2: + distribution: uniform + min: 0.8 + max: 0.9999 + optim.optim.weight_decay: + distribution: uniform + min: 0.0 + max: 0.4 + optim.warmup_steps: + distribution: q_uniform + min: 0 + max: 2000 \ No newline at end of file diff --git a/uv.lock b/uv.lock index 3972c782..e5d6fdcf 100644 --- a/uv.lock +++ b/uv.lock @@ -1326,7 +1326,7 @@ name = "nvidia-cudnn-cu12" version = "9.1.0.70" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, + { name = "nvidia-cublas-cu12", 
marker = "sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741 }, @@ -1337,7 +1337,7 @@ name = "nvidia-cufft-cu12" version = "11.2.1.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 }, @@ -1356,9 +1356,9 @@ name = "nvidia-cusolver-cu12" version = "11.6.1.9" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, - { name = "nvidia-cusparse-cu12" }, - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 }, @@ -1369,7 +1369,7 @@ name = "nvidia-cusparse-cu12" version = "12.3.1.170" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ { url = 
"https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, @@ -1753,7 +1753,7 @@ wheels = [ [[package]] name = "pydantic-config" version = "0.2.0" -source = { git = "https://github.com/samsja/pydantic_config.git?rev=74c94ee#74c94eed8b05b02f67027181399ead009e4b71a0" } +source = { git = "https://github.com/samsja/pydantic_config.git?rev=b7becc3#b7becc34a9aecf09e4de67e61c246c2dc928ca08" } dependencies = [ { name = "pydantic" }, { name = "rich" }, @@ -2914,7 +2914,7 @@ requires-dist = [ { name = "numpy" }, { name = "psutil" }, { name = "pyarrow" }, - { name = "pydantic-config", git = "https://github.com/samsja/pydantic_config.git?rev=74c94ee" }, + { name = "pydantic-config", git = "https://github.com/samsja/pydantic_config.git?rev=b7becc3" }, { name = "requests", marker = "extra == 'all'", specifier = ">=2.32.3" }, { name = "setuptools" }, { name = "toposolve", specifier = ">=0.1.17" },