Draft · 51 commits
b5e6f9c
Add R1 Zero 7B
lewtun Mar 29, 2025
8a4af61
Fix chat template
lewtun Mar 29, 2025
9e0e478
Add new difficulty levels
lewtun Mar 29, 2025
b35213c
Add medium, hard, ultra hard recipes
lewtun Mar 31, 2025
1d6c0bb
Fix accuracy rewards
lewtun Mar 31, 2025
5747cfc
Return None for invalid samples
lewtun Mar 31, 2025
1078b73
Fix order of inputs
lewtun Apr 1, 2025
d9c8cd8
Use None for unverified
lewtun Apr 1, 2025
8f26046
Merge branch 'main' into r1-zero
lewtun Apr 1, 2025
5fe41f0
Pin trl
lewtun Apr 1, 2025
f22657b
Set defaults
lewtun Apr 1, 2025
82a1167
Log unique only
lewtun Apr 1, 2025
2897519
Revert config
lewtun Apr 1, 2025
d51de45
Use proper dataset
lewtun Apr 2, 2025
f1832c5
Pin TRL
lewtun Apr 3, 2025
995beb8
Clean up
lewtun Apr 4, 2025
1d7d66a
Merge branch 'main' into r1-zero
lewtun Apr 4, 2025
10a555b
Add soft format reward
lewtun Apr 7, 2025
0f98a5a
Fix soft reward to be really soft
lewtun Apr 7, 2025
23b7b69
Merge branch 'main' into r1-zero
lewtun Apr 8, 2025
f62e42a
Pin TRL for overlong masking
lewtun Apr 8, 2025
939c74c
Fix liger
lewtun Apr 9, 2025
9bed487
Add v01
lewtun Apr 9, 2025
b29e672
Add level configs and DAPO
lewtun Apr 10, 2025
7a8dead
Fix
lewtun Apr 11, 2025
2d74588
Merge branch 'main' into r1-zero
lewtun Apr 11, 2025
c1d2352
Add q3
lewtun Apr 11, 2025
8500f41
Parse GAS
lewtun Apr 12, 2025
3c312f8
Add hack for lighteval
lewtun Apr 14, 2025
b6a73c0
Merge branch 'main' into r1-zero
lewtun Apr 16, 2025
a5f3baa
Merge branch 'main' into r1-zero
lewtun Apr 17, 2025
f3920f8
Pin TRL
lewtun Apr 17, 2025
06bdd50
Merge branch 'main' into r1-zero
lewtun Apr 17, 2025
2f0b983
Add 32B recipe
lewtun Apr 22, 2025
be72ce6
Fix sharding in Slurm
lewtun Apr 23, 2025
0df1654
Tune recipe
lewtun Apr 23, 2025
c24ffd7
Fix attempt on Slurm
lewtun Apr 23, 2025
2715d31
Hack
lewtun Apr 23, 2025
cebaad5
Wait
lewtun Apr 23, 2025
2f4b0da
Revert slurm
lewtun Apr 23, 2025
f27c732
Fix
lewtun Apr 23, 2025
5f0b8f8
Remove hf-transfer in favour of hf-xet
lewtun Apr 24, 2025
46c1656
Pin transformers
lewtun Apr 26, 2025
2c0cac5
Merge branch 'main' into r1-zero
lewtun Apr 26, 2025
8d993d5
add gen batch exp config
edbeeching May 5, 2025
a82c1fd
adds weighted code reward
edbeeching May 7, 2025
d9a6c08
add latest configs
edbeeching May 7, 2025
464d951
Merge branch 'main' into r1-zero
lewtun May 8, 2025
b430693
Merge branch 'main' into r1-zero
edbeeching May 9, 2025
0ed9ea3
Merge branch 'main' into r1-zero
edbeeching May 10, 2025
a401d64
Merge branch 'main' into r1-zero
lewtun May 25, 2025
68 changes: 68 additions & 0 deletions recipes/OpenR1-Zero-32B-Math/grpo/config_v00.00.yaml
@@ -0,0 +1,68 @@
# Config for 4 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-32B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/DAPO-Math-17k-Processed
dataset_config: all

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
# gradient_checkpointing_kwargs:
# use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-32B-Math
hub_model_revision: v00.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1
output_dir: data/R1-Zero-Qwen-32B-Math-v00.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
vllm_server_timeout: 1200
warmup_ratio: 0.1
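The reward_funcs / reward_weights pairing above means the final per-completion reward is a weighted sum: 1.0 × accuracy + 0.25 × format + 0.25 × soft_format. A minimal sketch of that combination, with illustrative stand-in regexes for the strict and soft format checks (the actual reward functions live in open-r1's rewards module and differ in detail):

```python
import re

# Illustrative stand-ins for the reward functions named in the config above;
# the real open-r1 rewards differ in detail.
THINK_ANSWER_STRICT = re.compile(
    r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>$", re.DOTALL
)
THINK_ANSWER_SOFT = re.compile(r"<think>.*?</think>.*?<answer>.*?</answer>", re.DOTALL)

def format_reward(completion: str) -> float:
    """Strict check: the whole completion must match the tag layout exactly."""
    return 1.0 if THINK_ANSWER_STRICT.match(completion) else 0.0

def soft_format_reward(completion: str) -> float:
    """Loose check: the tags just have to appear somewhere, in order."""
    return 1.0 if THINK_ANSWER_SOFT.search(completion) else 0.0

def combined_reward(rewards: list[float], weights: list[float]) -> float:
    """Linear combination, mirroring reward_funcs / reward_weights above."""
    return sum(w * r for w, r in zip(weights, rewards))

completion = "<think>\nsome reasoning\n</think>\n<answer>\n42\n</answer>"
rewards = [1.0, format_reward(completion), soft_format_reward(completion)]  # accuracy assumed correct
print(combined_reward(rewards, [1.0, 0.25, 0.25]))  # 1.5
```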
70 changes: 70 additions & 0 deletions recipes/OpenR1-Zero-7B-Code/grpo/config_v00.00.yaml
@@ -0,0 +1,70 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
generation_batch_size: 512
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Code
hub_model_revision: v00.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1
output_dir: data/R1-Zero-Qwen-7B-Code-v00.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- weighted_binary_code_reward
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
e2b_router_url: ip-10-53-83-71:8000
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.0
epsilon: 0.2
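The batch-size knobs in this recipe are easiest to read together. Assuming the "1 + 1 nodes" comment means one 8-GPU training node plus one vLLM node (an assumption, not stated in the config), the numbers line up as follows:

```python
# Rough sanity check of the batch-size arithmetic in the config above.
# ASSUMPTION: 8 training GPUs on the single training node; the vLLM node
# only serves generations and does not contribute to the optimizer batch.
num_training_gpus = 8
per_device_train_batch_size = 4
gradient_accumulation_steps = 16
num_generations = 16
generation_batch_size = 512

# Completions processed per optimizer step across all GPUs and accumulation steps.
completions_per_optim_step = (
    per_device_train_batch_size * gradient_accumulation_steps * num_training_gpus
)
print(completions_per_optim_step)  # 512, matching generation_batch_size

# Each prompt is sampled num_generations times, so one generation batch
# covers this many unique prompts:
print(generation_batch_size // num_generations)  # 32
```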
64 changes: 64 additions & 0 deletions recipes/OpenR1-Zero-7B-Math/grpo/config_v00.00.yaml
@@ -0,0 +1,64 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/Big-Math-RL-Verified-Processed
dataset_config: all

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
beta: 0.001
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v00.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_train_epochs: 0.1 # 21.6k prompts
output_dir: data/R1-Zero-Qwen-7B-Math-v00.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 0.2
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: true
ref_model_sync_steps: 100
ref_model_mixup_alpha: 1.0
seed: 42
temperature: 1.0
warmup_ratio: 0.1
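Unlike the later revisions, this v00.00 recipe keeps a small KL penalty (beta: 0.001) and periodically refreshes the reference model: every ref_model_sync_steps optimizer steps the reference weights are mixed towards the policy with ref_model_mixup_alpha, and with alpha = 1.0 this amounts to a hard copy. A schematic sketch of that update rule (TRL performs the equivalent via a trainer callback; this is not its actual implementation):

```python
import torch

def sync_ref_model(policy: torch.nn.Module, ref: torch.nn.Module, alpha: float) -> None:
    """Mixup update: ref <- alpha * policy + (1 - alpha) * ref.

    With alpha = 1.0 (as in the config above) this is a plain weight copy.
    Schematic only -- not TRL's actual callback code.
    """
    with torch.no_grad():
        for p_ref, p_pol in zip(ref.parameters(), policy.parameters()):
            p_ref.mul_(1.0 - alpha).add_(p_pol, alpha=alpha)

# Hypothetical usage inside a training loop:
# if step % ref_model_sync_steps == 0:   # ref_model_sync_steps = 100 above
#     sync_ref_model(policy_model, ref_model, alpha=1.0)
```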
66 changes: 66 additions & 0 deletions recipes/OpenR1-Zero-7B-Math/grpo/config_v01.00.yaml
@@ -0,0 +1,66 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/Big-Math-RL-Verified-Processed
dataset_config: all

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v01.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_train_epochs: 0.1 # 21.6k prompts
output_dir: data/R1-Zero-Qwen-7B-Math-v01.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.1
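All of these recipes share the same chat_template: a Jinja2 string that injects the R1-Zero style system prompt when the conversation has no system turn and frames turns as "User:" / "Assistant:". A quick way to preview what it renders, assuming a local checkout of the repo plus pyyaml and jinja2 (both assumptions; transformers' apply_chat_template should give essentially the same output):

```python
import yaml    # pip install pyyaml
import jinja2  # pip install jinja2

# ASSUMPTION: run from the repository root so the recipe path resolves.
config_path = "recipes/OpenR1-Zero-7B-Math/grpo/config_v01.00.yaml"
with open(config_path) as f:
    chat_template = yaml.safe_load(f)["chat_template"]

messages = [{"role": "user", "content": "What is 2 + 2?"}]

# The template only uses plain Jinja2 features, so rendering it directly
# approximates what the tokenizer would produce with this chat_template.
rendered = jinja2.Template(chat_template).render(
    messages=messages, add_generation_prompt=True
)
print(rendered)
```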
67 changes: 67 additions & 0 deletions recipes/OpenR1-Zero-7B-Math/grpo/config_v02.00.yaml
@@ -0,0 +1,67 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/Big-Math-RL-Verified-Processed
dataset_config: level_2_3_4_5

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v02.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 0.12 # 19.9k prompts
output_dir: data/R1-Zero-Qwen-7B-Math-v02.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.1
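Compared with v01.00, this revision restricts the dataset to the harder difficulty levels (level_2_3_4_5) and sets num_iterations: 1, i.e. each batch of sampled completions is used for a single optimization pass before new generations are drawn, keeping training on-policy. A schematic sketch of what that knob controls in a GRPO-style loop (not TRL's actual trainer code):

```python
import random

def grpo_outer_loop(prompts, generate, optimize, num_generations=16, num_iterations=1):
    """Schematic GRPO-style loop: sample a group of completions per prompt,
    then reuse that generation batch for num_iterations optimization passes.
    With num_iterations = 1 (as in the config above) every pass sees fresh samples.
    """
    for prompt in prompts:
        completions = [generate(prompt) for _ in range(num_generations)]
        for _ in range(num_iterations):
            optimize(prompt, completions)

# Toy usage with stand-in generate/optimize functions:
grpo_outer_loop(
    prompts=["1 + 1 = ?"],
    generate=lambda p: f"<think>\n...\n</think>\n<answer>\n{random.randint(0, 3)}\n</answer>",
    optimize=lambda p, cs: None,
)
```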