diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml
new file mode 100644
index 000000000..90351b619
--- /dev/null
+++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml
@@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 64
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
+hub_model_revision: v03.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 24000
+max_steps: -1
+num_generations: 8
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v03.00
+overwrite_output_dir: true
+per_device_train_batch_size: 1
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+e2b_router_url: "ip-10-53-85-124:8000"
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
\ No newline at end of file
diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml
new file mode 100644
index 000000000..b44fabe06
--- /dev/null
+++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml
@@ -0,0 +1,66 @@
+# Model arguments
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+
+# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 128
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
+hub_model_revision: v04.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 24000
+max_steps: -1
+num_generations: 8
+num_iterations: 1
+num_train_epochs: 1.0
+output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v04.00
+overwrite_output_dir: true
+per_device_train_batch_size: 1
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+e2b_router_url: ip-10-53-86-47:8000
+reward_weights:
+- 1.0
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+mask_truncated_completions: true
+loss_type: dr_grpo
\ No newline at end of file
diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml
new file mode 100644
index 000000000..6b775eae3
--- /dev/null
+++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml
@@ -0,0 +1,66 @@
+# Model arguments
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+
+# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
+hub_model_revision: v05.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 24000
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 1.0
+output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v05.00
+overwrite_output_dir: true
+per_device_train_batch_size: 1
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+e2b_router_url: ip-10-53-86-47:8000
+reward_weights:
+- 1.0
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+mask_truncated_completions: true
+loss_type: dr_grpo
\ No newline at end of file
diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml
new file mode 100644
index 000000000..167df6138
--- /dev/null
+++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml
@@ -0,0 +1,66 @@
+# Model arguments
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+
+# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
+hub_model_revision: v06.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 24000
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 1.0
+output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v06.00
+overwrite_output_dir: true
+per_device_train_batch_size: 1
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- weighted_binary_code_reward
+e2b_router_url: ip-10-53-86-47:8000
+reward_weights:
+- 1.0
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+mask_truncated_completions: true
+loss_type: dr_grpo
\ No newline at end of file
diff --git a/recipes/OlympicCoder-7B/sft/config_v00.00.yaml b/recipes/OlympicCoder-7B/sft/config_v00.00.yaml
index dd0be5d96..f5b8385da 100644
--- a/recipes/OlympicCoder-7B/sft/config_v00.00.yaml
+++ b/recipes/OlympicCoder-7B/sft/config_v00.00.yaml
@@ -14,7 +14,7 @@ dataset_num_proc: 48
bf16: true
do_eval: false
eval_strategy: 'no'
-gradient_accumulation_steps: 8
+gradient_accumulation_steps: 2
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
@@ -27,20 +27,20 @@ logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
-packing: false
+packing: true
max_grad_norm: 0.2
-max_length: 32768
+max_length: 16000
max_steps: -1
num_train_epochs: 10
output_dir: data/OlympicCoder-7B
overwrite_output_dir: true
per_device_eval_batch_size: 1
-per_device_train_batch_size: 2
+per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
save_strategy: epoch
save_total_limit: 1
seed: 42
-use_liger_kernel: true
+use_liger_kernel: false
warmup_ratio: 0.03
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml
new file mode 100644
index 000000000..174c816b9
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml
@@ -0,0 +1,58 @@
+
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_name: open-r1/OpenR1-Math-cn_k12-86k
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+beta: 0.001
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+
+do_eval: false
+gradient_accumulation_steps: 64
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v01.05
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 8
+num_train_epochs: 1
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.05
+overwrite_output_dir: true
+per_device_train_batch_size: 1
+push_to_hub: true
+use_liger_kernel: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+reward_weights:
+- 1.0
+- 0.2
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+warmup_ratio: 0.1
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml
new file mode 100644
index 000000000..8a456c9f5
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml
@@ -0,0 +1,60 @@
+
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_name: open-r1/OpenR1-Math-cn_k12-86k
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+beta: 0.001
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v01.06
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_train_epochs: 1
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.06
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+use_liger_kernel: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+reward_weights:
+- 1.0
+- 0.2
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+warmup_ratio: 0.1
+wandb_entity: huggingface
+wandb_project: open-r1
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml
new file mode 100644
index 000000000..c13458da3
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml
@@ -0,0 +1,61 @@
+
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_name: open-r1/OpenR1-Math-cn_k12-86k
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+beta: 0.001
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v01.07
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_train_epochs: 1
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.07
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+use_liger_kernel: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+reward_weights:
+- 1.0
+- 0.2
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+warmup_ratio: 0.1
+wandb_entity: huggingface
+wandb_project: open-r1
+scale_rewards: false
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml
new file mode 100644
index 000000000..5a422d214
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml
@@ -0,0 +1,62 @@
+
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_name: open-r1/OpenR1-Math-cn_k12-86k
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+beta: 0.0
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v01.08
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_train_epochs: 1
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.08
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+use_liger_kernel: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+reward_weights:
+- 1.0
+- 0.2
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+warmup_ratio: 0.1
+wandb_entity: huggingface
+wandb_project: open-r1
+num_iterations: 4
+scale_rewards: false
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml
new file mode 100644
index 000000000..e162a16d6
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml
@@ -0,0 +1,62 @@
+
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_name: open-r1/OpenR1-Math-cn_k12-86k
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+beta: 0.0
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v01.09
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_train_epochs: 1
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.09
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+use_liger_kernel: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+reward_weights:
+- 1.0
+- 0.2
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+warmup_ratio: 0.1
+wandb_entity: huggingface
+wandb_project: open-r1
+num_iterations: 4
+scale_rewards: true
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml
new file mode 100644
index 000000000..5e9156c89
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml
@@ -0,0 +1,65 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.00
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml
new file mode 100644
index 000000000..5d1092b85
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml
@@ -0,0 +1,65 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.02
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.02
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml
new file mode 100644
index 000000000..6053143c9
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml
@@ -0,0 +1,65 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.03
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.03
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml
new file mode 100644
index 000000000..081ef05c8
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml
@@ -0,0 +1,65 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 64
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.04
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.04
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml
new file mode 100644
index 000000000..cbb5c5276
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml
@@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.05
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+ min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.05
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml
new file mode 100644
index 000000000..87d47e8b4
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml
@@ -0,0 +1,65 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 64
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.06
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 64
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.06
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml
new file mode 100644
index 000000000..cb0e44266
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml
@@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 64
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.07
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+ min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 64
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.07
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml
new file mode 100644
index 000000000..d21020068
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml
@@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.08
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+ min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.08
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 4
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml
new file mode 100644
index 000000000..b8d5eb696
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml
@@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.09
+hub_strategy: every_save
+learning_rate: 5.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+ min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.09
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 4
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml
new file mode 100644
index 000000000..3ea1630b3
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml
@@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.10
+hub_strategy: every_save
+learning_rate: 1.0e-05
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+ min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.10
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 4
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml
new file mode 100644
index 000000000..50997275d
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml
@@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.11
+hub_strategy: every_save
+learning_rate: 4.0e-05
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+ min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.11
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 4
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml
new file mode 100644
index 000000000..0628f4822
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml
@@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.12
+hub_strategy: every_save
+learning_rate: 5.0e-07
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+ min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.12
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 4
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml
new file mode 100644
index 000000000..aa4c0f763
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml
@@ -0,0 +1,66 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.13
+hub_strategy: every_save
+learning_rate: 5.0e-07
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.13
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml
new file mode 100644
index 000000000..4891fb2c7
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml
@@ -0,0 +1,66 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.14
+hub_strategy: every_save
+learning_rate: 5.0e-07
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.14
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.01
+parallel_code_exec_per_proc: 10
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml
new file mode 100644
index 000000000..97fd1c3f6
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml
@@ -0,0 +1,65 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.15
+hub_strategy: every_save
+learning_rate: 5.0e-07
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 1.0
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.15
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.01
+parallel_code_exec_per_proc: 10
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml
new file mode 100644
index 000000000..361c8d898
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml
@@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.16
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.16
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+e2b_router_url: "ip-10-53-85-124:8000"
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml
new file mode 100644
index 000000000..072b61c4b
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml
@@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.17
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.17
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+e2b_router_url: "ip-10-53-85-124:8000"
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.4
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml
new file mode 100644
index 000000000..585b7155b
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml
@@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.18
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.18
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+e2b_router_url: "ip-10-53-85-124:8000"
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 1.0
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml
new file mode 100644
index 000000000..4b8aa6b31
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml
@@ -0,0 +1,69 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+do_eval: false
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.20
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.20
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+e2b_router_url: ip-10-53-86-47:8000
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
+
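+# Completions truncated at max_completion_length are excluded from the loss (TRL GRPOConfig option).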
+mask_truncated_completions: true
\ No newline at end of file
diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml
new file mode 100644
index 000000000..08244af8a
--- /dev/null
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml
@@ -0,0 +1,70 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.30
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.30
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+e2b_router_url: ip-10-53-86-47:8000
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
+
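+# Drop truncated completions from the loss and use the Dr. GRPO loss variant (length-bias-free normalization).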
+mask_truncated_completions: true
+loss_type: dr_grpo
\ No newline at end of file
diff --git a/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml
new file mode 100644
index 000000000..0c4cf5fe3
--- /dev/null
+++ b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml
@@ -0,0 +1,63 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.001
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 14
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO
+hub_model_revision: v02.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 14
+num_train_epochs: 0.1
+output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v02.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
\ No newline at end of file
diff --git a/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml
new file mode 100644
index 000000000..962e66190
--- /dev/null
+++ b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml
@@ -0,0 +1,63 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.001
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 14
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO
+hub_model_revision: v03.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 14
+num_train_epochs: 0.1
+output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v03.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
\ No newline at end of file
diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py
index daa2f3252..a436bedc9 100644
--- a/src/open_r1/rewards.py
+++ b/src/open_r1/rewards.py
@@ -387,14 +387,33 @@ def extract_code(completion: str, language: str = "python") -> str:
 def binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]:
     rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs)
     BINARY_THRESHOLD = 0.99
-
+
     output = []
     for reward in rewards:
         if reward is None:
             output.append(None)
         else:
             output.append(1.0 if reward > BINARY_THRESHOLD else 0.0)
+
+    return output
+
+
+def weighted_binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]:
+    # Combines the binary pass/fail code reward with a down-weighted copy of the fractional code reward.
+    rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs)
+    BINARY_THRESHOLD = 0.99
+    NON_BINARY_WEIGHT = 0.1
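+    # Fully passing solutions score 1.0 + NON_BINARY_WEIGHT * reward; partial solutions keep a small graded signal.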
+
+    output = []
+    for reward in rewards:
+        if reward is None:
+            output.append(None)
+        else:
+            binary_reward = 1.0 if reward > BINARY_THRESHOLD else 0.0
+            output.append(binary_reward + NON_BINARY_WEIGHT * reward)
+
     return output
@@ -596,6 +615,14 @@ def get_reward_funcs(script_args) -> list[Callable]:
             ),
             binary_code_reward,
         ),
+        "weighted_binary_code_reward": update_wrapper(
+            partial(
+                weighted_binary_code_reward,
+                num_parallel=script_args.parallel_code_exec_per_proc,
+                e2b_router_url=script_args.e2b_router_url,
+            ),
+            weighted_binary_code_reward,
+        ),
         "ioi_code": update_wrapper(
             partial(ioi_code_reward, test_batch_size=script_args.code_eval_test_batch_size), ioi_code_reward
         ),