Draft · 51 commits
b5e6f9c
Add R1 Zero 7B
lewtun Mar 29, 2025
8a4af61
Fix chat template
lewtun Mar 29, 2025
9e0e478
Add new difficulty levels
lewtun Mar 29, 2025
b35213c
Add medium, hard, ultra hard recipes
lewtun Mar 31, 2025
1d6c0bb
Fix accuracy rewards
lewtun Mar 31, 2025
5747cfc
Return None for invalid samples
lewtun Mar 31, 2025
1078b73
Fix order of inputs
lewtun Apr 1, 2025
d9c8cd8
Use None for unverified
lewtun Apr 1, 2025
8f26046
Merge branch 'main' into r1-zero
lewtun Apr 1, 2025
5fe41f0
Pin trl
lewtun Apr 1, 2025
f22657b
Set defaults
lewtun Apr 1, 2025
82a1167
Log unique only
lewtun Apr 1, 2025
2897519
Revert config
lewtun Apr 1, 2025
d51de45
Use proper dataset
lewtun Apr 2, 2025
f1832c5
Pin TRL
lewtun Apr 3, 2025
995beb8
Clean up
lewtun Apr 4, 2025
1d7d66a
Merge branch 'main' into r1-zero
lewtun Apr 4, 2025
10a555b
Add soft format reward
lewtun Apr 7, 2025
0f98a5a
Fix soft reward to be really soft
lewtun Apr 7, 2025
23b7b69
Merge branch 'main' into r1-zero
lewtun Apr 8, 2025
f62e42a
Pin TRL for overlong masking
lewtun Apr 8, 2025
939c74c
Fix liger
lewtun Apr 9, 2025
9bed487
Add v01
lewtun Apr 9, 2025
b29e672
Add level configs and DAPO
lewtun Apr 10, 2025
7a8dead
Fix
lewtun Apr 11, 2025
2d74588
Merge branch 'main' into r1-zero
lewtun Apr 11, 2025
c1d2352
Add q3
lewtun Apr 11, 2025
8500f41
Parse GAS
lewtun Apr 12, 2025
3c312f8
Add hack for lighteval
lewtun Apr 14, 2025
b6a73c0
Merge branch 'main' into r1-zero
lewtun Apr 16, 2025
a5f3baa
Merge branch 'main' into r1-zero
lewtun Apr 17, 2025
f3920f8
Pin TRL
lewtun Apr 17, 2025
06bdd50
Merge branch 'main' into r1-zero
lewtun Apr 17, 2025
2f0b983
Add 32B recipe
lewtun Apr 22, 2025
be72ce6
Fix sharding in Slurm
lewtun Apr 23, 2025
0df1654
Tune recipe
lewtun Apr 23, 2025
c24ffd7
Fix attempt on Slurm
lewtun Apr 23, 2025
2715d31
Hack
lewtun Apr 23, 2025
cebaad5
Wait
lewtun Apr 23, 2025
2f4b0da
Revert slurm
lewtun Apr 23, 2025
f27c732
Fix
lewtun Apr 23, 2025
5f0b8f8
Remove hf-transfer in favour of hf-xet
lewtun Apr 24, 2025
46c1656
Pin transformers
lewtun Apr 26, 2025
2c0cac5
Merge branch 'main' into r1-zero
lewtun Apr 26, 2025
8d993d5
add gen batch exp config
edbeeching May 5, 2025
a82c1fd
adds weighted code reward
edbeeching May 7, 2025
d9a6c08
add latest configs
edbeeching May 7, 2025
464d951
Merge branch 'main' into r1-zero
lewtun May 8, 2025
b430693
Merge branch 'main' into r1-zero
edbeeching May 9, 2025
0ed9ea3
Merge branch 'main' into r1-zero
edbeeching May 10, 2025
a401d64
Merge branch 'main' into r1-zero
lewtun May 25, 2025
68 changes: 68 additions & 0 deletions recipes/OpenR1-Zero-32B-Math/grpo/config_v00.00.yaml
@@ -0,0 +1,68 @@
# Config for 4 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-32B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/DAPO-Math-17k-Processed
dataset_config: all

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
# gradient_checkpointing_kwargs:
# use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-32B-Math
hub_model_revision: v00.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1
output_dir: data/R1-Zero-Qwen-32B-Math-v00.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
vllm_server_timeout: 1200
warmup_ratio: 0.1
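The reward_funcs / reward_weights pairing above means the final per-completion reward is a weighted sum: 1.0 × accuracy + 0.25 × format + 0.25 × soft_format. A minimal sketch of that combination, with illustrative stand-in regexes for the strict and soft format checks (the actual reward functions live in open-r1's rewards module and differ in detail):

```python
import re

# Illustrative stand-ins for the reward functions named in the config above;
# the real open-r1 rewards differ in detail.
THINK_ANSWER_STRICT = re.compile(
    r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>$", re.DOTALL
)
THINK_ANSWER_SOFT = re.compile(r"<think>.*?</think>.*?<answer>.*?</answer>", re.DOTALL)

def format_reward(completion: str) -> float:
    """Strict check: the whole completion must match the tag layout exactly."""
    return 1.0 if THINK_ANSWER_STRICT.match(completion) else 0.0

def soft_format_reward(completion: str) -> float:
    """Loose check: the tags just have to appear somewhere, in order."""
    return 1.0 if THINK_ANSWER_SOFT.search(completion) else 0.0

def combined_reward(rewards: list[float], weights: list[float]) -> float:
    """Linear combination, mirroring reward_funcs / reward_weights above."""
    return sum(w * r for w, r in zip(weights, rewards))

completion = "<think>\nsome reasoning\n</think>\n<answer>\n42\n</answer>"
rewards = [1.0, format_reward(completion), soft_format_reward(completion)]  # accuracy assumed correct
print(combined_reward(rewards, [1.0, 0.25, 0.25]))  # 1.5
```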
70 changes: 70 additions & 0 deletions recipes/OpenR1-Zero-7B-Code/grpo/config_v00.00.yaml
@@ -0,0 +1,70 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
generation_batch_size: 512
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Code
hub_model_revision: v00.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1
output_dir: data/R1-Zero-Qwen-7B-Code-v00.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- weighted_binary_code_reward
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
e2b_router_url: ip-10-53-83-71:8000
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.0
epsilon: 0.2
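The batch-size knobs in this recipe are easiest to read together. Assuming the "1 + 1 nodes" comment means one 8-GPU training node plus one vLLM node (an assumption, not stated in the config), the numbers line up as follows:

```python
# Rough sanity check of the batch-size arithmetic in the config above.
# ASSUMPTION: 8 training GPUs on the single training node; the vLLM node
# only serves generations and does not contribute to the optimizer batch.
num_training_gpus = 8
per_device_train_batch_size = 4
gradient_accumulation_steps = 16
num_generations = 16
generation_batch_size = 512

# Completions processed per optimizer step across all GPUs and accumulation steps.
completions_per_optim_step = (
    per_device_train_batch_size * gradient_accumulation_steps * num_training_gpus
)
print(completions_per_optim_step)  # 512, matching generation_batch_size

# Each prompt is sampled num_generations times, so one generation batch
# covers this many unique prompts:
print(generation_batch_size // num_generations)  # 32
```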
64 changes: 64 additions & 0 deletions recipes/OpenR1-Zero-7B-Math/grpo/config_v00.00.yaml
@@ -0,0 +1,64 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/Big-Math-RL-Verified-Processed
dataset_config: all

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
beta: 0.001
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v00.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_train_epochs: 0.1 # 21.6k prompts
output_dir: data/R1-Zero-Qwen-7B-Math-v00.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 0.2
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: true
ref_model_sync_steps: 100
ref_model_mixup_alpha: 1.0
seed: 42
temperature: 1.0
warmup_ratio: 0.1
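Unlike the later revisions, this v00.00 recipe keeps a small KL penalty (beta: 0.001) and periodically refreshes the reference model: every ref_model_sync_steps optimizer steps the reference weights are mixed towards the policy with ref_model_mixup_alpha, and with alpha = 1.0 this amounts to a hard copy. A schematic sketch of that update rule (TRL performs the equivalent via a trainer callback; this is not its actual implementation):

```python
import torch

def sync_ref_model(policy: torch.nn.Module, ref: torch.nn.Module, alpha: float) -> None:
    """Mixup update: ref <- alpha * policy + (1 - alpha) * ref.

    With alpha = 1.0 (as in the config above) this is a plain weight copy.
    Schematic only -- not TRL's actual callback code.
    """
    with torch.no_grad():
        for p_ref, p_pol in zip(ref.parameters(), policy.parameters()):
            p_ref.mul_(1.0 - alpha).add_(p_pol, alpha=alpha)

# Hypothetical usage inside a training loop:
# if step % ref_model_sync_steps == 0:   # ref_model_sync_steps = 100 above
#     sync_ref_model(policy_model, ref_model, alpha=1.0)
```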
66 changes: 66 additions & 0 deletions recipes/OpenR1-Zero-7B-Math/grpo/config_v01.00.yaml
@@ -0,0 +1,66 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/Big-Math-RL-Verified-Processed
dataset_config: all

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v01.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_train_epochs: 0.1 # 21.6k prompts
output_dir: data/R1-Zero-Qwen-7B-Math-v01.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.1
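All of these recipes share the same chat_template: a Jinja2 string that injects the R1-Zero style system prompt when the conversation has no system turn and frames turns as "User:" / "Assistant:". A quick way to preview what it renders, assuming a local checkout of the repo plus pyyaml and jinja2 (both assumptions; transformers' apply_chat_template should give essentially the same output):

```python
import yaml    # pip install pyyaml
import jinja2  # pip install jinja2

# ASSUMPTION: run from the repository root so the recipe path resolves.
config_path = "recipes/OpenR1-Zero-7B-Math/grpo/config_v01.00.yaml"
with open(config_path) as f:
    chat_template = yaml.safe_load(f)["chat_template"]

messages = [{"role": "user", "content": "What is 2 + 2?"}]

# The template only uses plain Jinja2 features, so rendering it directly
# approximates what the tokenizer would produce with this chat_template.
rendered = jinja2.Template(chat_template).render(
    messages=messages, add_generation_prompt=True
)
print(rendered)
```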
67 changes: 67 additions & 0 deletions recipes/OpenR1-Zero-7B-Math/grpo/config_v02.00.yaml
@@ -0,0 +1,67 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/Big-Math-RL-Verified-Processed
dataset_config: level_2_3_4_5

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v02.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 0.12 # 19.9k prompts
output_dir: data/R1-Zero-Qwen-7B-Math-v02.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.1
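Compared with v01.00, this revision restricts the dataset to the harder difficulty levels (level_2_3_4_5) and sets num_iterations: 1, i.e. each batch of sampled completions is used for a single optimization pass before new generations are drawn, keeping training on-policy. A schematic sketch of what that knob controls in a GRPO-style loop (not TRL's actual trainer code):

```python
import random

def grpo_outer_loop(prompts, generate, optimize, num_generations=16, num_iterations=1):
    """Schematic GRPO-style loop: sample a group of completions per prompt,
    then reuse that generation batch for num_iterations optimization passes.
    With num_iterations = 1 (as in the config above) every pass sees fresh samples.
    """
    for prompt in prompts:
        completions = [generate(prompt) for _ in range(num_generations)]
        for _ in range(num_iterations):
            optimize(prompt, completions)

# Toy usage with stand-in generate/optimize functions:
grpo_outer_loop(
    prompts=["1 + 1 = ?"],
    generate=lambda p: f"<think>\n...\n</think>\n<answer>\n{random.randint(0, 3)}\n</answer>",
    optimize=lambda p, cs: None,
)
```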