diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml new file mode 100644 index 000000000..90351b619 --- /dev/null +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO +hub_model_revision: v03.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 24000 +max_steps: -1 +num_generations: 8 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v03.00 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: "ip-10-53-85-124:8000" +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml new file mode 100644 index 000000000..b44fabe06 --- /dev/null +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer.
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 128 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO +hub_model_revision: v04.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 24000 +max_steps: -1 +num_generations: 8 +num_iterations: 1 +num_train_epochs: 1.0 +output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v04.00 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +e2b_router_url: ip-10-53-86-47:8000 +reward_weights: +- 1.0 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +mask_truncated_completions: true +loss_type: dr_grpo \ No newline at end of file diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml new file mode 100644 index 000000000..6b775eae3 --- /dev/null +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer.
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO +hub_model_revision: v05.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 24000 +max_steps: -1 +num_generations: 16 +num_iterations: 1 +num_train_epochs: 1.0 +output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v05.00 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +e2b_router_url: ip-10-53-86-47:8000 +reward_weights: +- 1.0 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +mask_truncated_completions: true +loss_type: dr_grpo \ No newline at end of file diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml new file mode 100644 index 000000000..167df6138 --- /dev/null +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer.
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO +hub_model_revision: v06.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 24000 +max_steps: -1 +num_generations: 16 +num_iterations: 1 +num_train_epochs: 1.0 +output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v06.00 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- weighted_binary_code_reward +e2b_router_url: ip-10-53-86-47:8000 +reward_weights: +- 1.0 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +mask_truncated_completions: true +loss_type: dr_grpo \ No newline at end of file diff --git a/recipes/OlympicCoder-7B/sft/config_v00.00.yaml b/recipes/OlympicCoder-7B/sft/config_v00.00.yaml index dd0be5d96..f5b8385da 100644 --- a/recipes/OlympicCoder-7B/sft/config_v00.00.yaml +++ b/recipes/OlympicCoder-7B/sft/config_v00.00.yaml @@ -14,7 +14,7 @@ dataset_num_proc: 48 bf16: true do_eval: false eval_strategy: 'no' -gradient_accumulation_steps: 8 +gradient_accumulation_steps: 2 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false @@ -27,20 +27,20 @@ logging_strategy: steps lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 -packing: false +packing: true max_grad_norm: 0.2 -max_length: 32768 +max_length: 16000 max_steps: -1 num_train_epochs: 10 output_dir: data/OlympicCoder-7B overwrite_output_dir: true per_device_eval_batch_size: 1 -per_device_train_batch_size: 2 +per_device_train_batch_size: 1 push_to_hub: true report_to: - wandb save_strategy: epoch save_total_limit: 1 seed: 42 -use_liger_kernel: true +use_liger_kernel: false warmup_ratio: 0.03 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml new file mode 100644 index 000000000..174c816b9 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml @@ -0,0 +1,58 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.05 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 8 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.05 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml new file mode 100644 index 000000000..8a456c9f5 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml @@ -0,0 +1,60 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.06 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.06 +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml new file mode 100644 index 000000000..c13458da3 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml @@ -0,0 +1,61 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. 
You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.07 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.07 +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 +scale_rewards: false \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml new file mode 100644 index 000000000..5a422d214 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml @@ -0,0 +1,62 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.0 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.08 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.08 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 +num_iterations: 4 +scale_rewards: false \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml new file mode 100644 index 000000000..e162a16d6 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml @@ -0,0 +1,62 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.0 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.09 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.09 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 +num_iterations: 4 +scale_rewards: true \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml new file mode 100644 index 000000000..5e9156c89 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- math_500 +- aime24 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.00 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml new file mode 100644 index 000000000..5d1092b85 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.02 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.02 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml new file mode 100644 index 000000000..6053143c9 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.03 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.03 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml new file mode 100644 index 000000000..081ef05c8 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.04 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.04 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml new file mode 100644 index 000000000..cbb5c5276 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.05 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.05 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml new file mode 100644 index 000000000..87d47e8b4 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.06 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 64 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.06 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml new file mode 100644 index 000000000..cb0e44266 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.07 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 64 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.07 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml new file mode 100644 index 000000000..d21020068 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.08 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.08 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml new file mode 100644 index 000000000..b8d5eb696 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.09 +hub_strategy: every_save +learning_rate: 5.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.09 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml new file mode 100644 index 000000000..3ea1630b3 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.10 +hub_strategy: every_save +learning_rate: 1.0e-05 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.10 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml new file mode 100644 index 000000000..50997275d --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.11 +hub_strategy: every_save +learning_rate: 4.0e-05 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.11 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml new file mode 100644 index 000000000..0628f4822 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.12 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.12 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml new file mode 100644 index 000000000..aa4c0f763 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.13 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.13 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml new file mode 100644 index 000000000..4891fb2c7 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.14 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.14 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.01 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml new file mode 100644 index 000000000..97fd1c3f6 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.15 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 1.0 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.15 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.01 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml new file mode 100644 index 000000000..361c8d898 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.16 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.16 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: "ip-10-53-85-124:8000" +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml new file mode 100644 index 000000000..072b61c4b --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.17 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.17 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: "ip-10-53-85-124:8000" +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.4 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml new file mode 100644 index 000000000..585b7155b --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.18 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.18 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: "ip-10-53-85-124:8000" +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 1.0 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml new file mode 100644 index 000000000..4b8aa6b31 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml @@ -0,0 +1,69 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.20 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.20 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: ip-10-53-86-47:8000 +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 + +mask_truncated_completions: true \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml new file mode 100644 index 000000000..08244af8a --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml @@ -0,0 +1,70 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.30 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.30 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: ip-10-53-86-47:8000 +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 + +mask_truncated_completions: true +loss_type: dr_grpo \ No newline at end of file diff --git a/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml new file mode 100644 index 000000000..0c4cf5fe3 --- /dev/null +++ b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml @@ -0,0 +1,64 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 

Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- math_500 +- aime24 +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 14 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO +hub_model_revision: v02.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 14 +num_train_epochs: 0.1 +output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v02.00 +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml new file mode 100644 index 000000000..962e66190 --- /dev/null +++ b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml @@ -0,0 +1,64 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 

Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- math_500 +- aime24 +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 14 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO +hub_model_revision: v03.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 14 +num_train_epochs: 0.1 +output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v03.00 +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index daa2f3252..a436bedc9 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -387,14 +387,30 @@ def extract_code(completion: str, language: str = "python") -> str: def binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs) BINARY_THRESHOLD = 0.99 - + output = [] for reward in rewards: if reward is None: output.append(None) else: output.append(1.0 if reward > BINARY_THRESHOLD else 0.0) + + return output +def weighted_binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: + # combines the binary code reward with a down-weighted fractional code reward + rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs) + BINARY_THRESHOLD = 0.99 + NON_BINARY_WEIGHT = 0.1 + + output = [] + for reward in rewards: + if reward is None: + output.append(None) + else: + binary_reward = 1.0 if reward > BINARY_THRESHOLD else 0.0 + output.append(binary_reward + NON_BINARY_WEIGHT * reward) + return output @@ -596,6 +612,14 @@ def get_reward_funcs(script_args) -> list[Callable]: ), binary_code_reward, ), + "weighted_binary_code_reward": update_wrapper( + partial( + weighted_binary_code_reward, + num_parallel=script_args.parallel_code_exec_per_proc, + e2b_router_url=script_args.e2b_router_url, + ), + weighted_binary_code_reward, + ), "ioi_code": update_wrapper( partial(ioi_code_reward, test_batch_size=script_args.code_eval_test_batch_size), ioi_code_reward ),
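
Note on the new reward function: the sketch below is a minimal, self-contained illustration (not part of the patch) of the per-completion mapping that weighted_binary_code_reward implements. It reuses the BINARY_THRESHOLD and NON_BINARY_WEIGHT constants from the diff, but the fractional pass rates are hypothetical stand-ins for what code_reward would return from the E2B sandbox; it does not call code_reward, which needs a router URL.

# Illustrative only: replicates the mapping from the diff above without executing any code.
BINARY_THRESHOLD = 0.99
NON_BINARY_WEIGHT = 0.1

def map_reward(fraction_passed):
    # None means code execution failed or was skipped; it is propagated unchanged.
    if fraction_passed is None:
        return None
    binary = 1.0 if fraction_passed > BINARY_THRESHOLD else 0.0
    # The binary term dominates; the small fractional term still ranks partial solutions.
    return binary + NON_BINARY_WEIGHT * fraction_passed

# Hypothetical per-completion pass rates from the test harness:
print([map_reward(r) for r in [None, 0.0, 0.5, 1.0]])
# -> [None, 0.0, 0.05, 1.1]

The apparent intent is that fully passing solutions stay clearly separated from partial ones (reward above 1.0), while the 0.1-weighted fractional term gives GRPO a non-zero preference between completions that would otherwise both score 0.0 under the purely binary reward.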